tokenize.mst
misty module tokenizer
# Shared tokenizer state. Each tokenator function reads and advances this
# state and fills in the current token record.
var at              # index into source of the current character
var character       # the current character (null at end of source)
var column_nr       # zero-based column number of the current character
var line_nr         # zero-based line number of the current character
var source          # the text being tokenized
var token           # the token record currently being built
var tokenator       # the function selected by the current character
var tokens          # the array of finished tokens
# Mark the current token as an error token, recording the text seen so
# far, the position of the offending character, and the reason.
def error: ƒ (reason) {
    set token.type: "error"
# NOTE(review): 'at + 1' includes the offending character in the text;
# the other tokenators slice with 'at' — confirm this is intentional.
    set token.text: text(source, token.at, at + 1)
    set token.where: at
    set token.reason: reason
}
# Advance to the next character of source, maintaining the line and
# column counters and selecting the tokenator for the new character.
def next: ƒ () {
    if character = "\n"
        set column_nr: 0
        set line_nr: line_nr + 1
    else
        set column_nr: column_nr + 1
    fi
    set at: at + 1
    set character: source[at]    # null once 'at' is past the end of source
    set tokenator: tokenators[character]
}
# Tokenize a one-character token: the character is both the type and
# the text of the token.
def special: ƒ () {
    set token.type: character
    set token.text: character
    return next()
}
# A carriage return produces a line-break token. A "\r\n" pair is taken
# together as a single "\n" token.
def carriage_return: ƒ () {
    call next()
    if character ≠ "\n"
        set token.type: "\n"
        set token.text: "\n"
        return
    fi
    return special()
}
# Tokenize a '#' comment, which runs to the end of the line or the end
# of the source.
def comment: ƒ () {
    do
        call next()
        if character = "\n" \/ character = null
            break
        fi
    od
    set token.type: "comment"
# Fixed: slice with the text(...) function like the other tokenators;
# 'source.slice(...)' is a JavaScript idiom, not part of this module.
    set token.text: text(source, token.at, at)
}
# Tokenize a run of spaces. If a comment follows the run, the spaces are
# absorbed into the comment token (comment slices from token.at).
def space: ƒ () {
    do
        call next()
        if character ≠ " "
            break
        fi
    od
    if character ≠ "#"
        set token.type: "space"
        set token.text: text(source, token.at, at)
        return
    fi
    return comment()
}
# Tokenize '/', which may be followed by a backslash to form the
# two-character '/\' token.
def slash: ƒ () {
    call next()
    if character = "\b"
        set token.type: "/\b"
        set token.text: "/\b"
        return next()
    fi
    set token.type: "/"
    set token.text: "/"
}
# Tokenize '\', which may be followed by a slash to form the
# two-character '\/' token. Mirror of slash.
def backslash: ƒ () {
    call next()
    if character = "/"
        set token.type: "\b/"
        set token.text: "\b/"
        return next()
    fi
    set token.type: "\b"
    set token.text: "\b"
}
# Tokenize a number: digits, optionally followed by a fraction part
# ('.' then digits) and an exponent part ('e', optional '-', digits).
def digit: ƒ () {
    var e_seen: false        # an exponent has already been consumed
    var period_seen: false   # a decimal point has already been consumed
    do loop
        call next()
        if tokenator ≠ digit
            if character = "."
                if period_seen
                    return error("Unexpected '.' in number")
                fi
                set period_seen: true
                call next()
                if tokenator ≠ digit
                    return error("Expected digits after '.'")
                fi
            else if character = "e"
                if e_seen
                    return error("Unexpected 'e' in number")
                fi
                set e_seen: true
# Seeing 'e' also bars '.', so a fraction may not follow the exponent.
                set period_seen: true
                call next()
                if character = "-"
                    call next()
                fi
                if tokenator ≠ digit
                    return error("Expected digits after 'e'")
                fi
            else
                break loop
            fi
        fi
    od loop
    set token.type: "number"
    set token.text: text(source, token.at, at)
}
# Tokenize '-', which is either the minus punctuator or the sign of a
# number literal.
def minus: ƒ () {
    call next()
    if tokenator ≠ digit
        set token.type: "-"
        set token.text: "-"
        return
    fi
    call digit()
# Fixed: text concatenation in this module is '~' (as used throughout
# quote); '+' is numeric addition.
    set token.text: "-" ~ token.text
}
# Consume the rest of a name: letters and digits, with '_' or '$'
# allowed only as separators between letters or digits.
def middle: ƒ () {
    call next()
    if tokenator = letter \/ tokenator = digit
        return middle()
    fi
    if character = "_" \/ character = "$"
        call next()
# A separator must be followed by another letter or digit.
        if tokenator ≠ letter /\ tokenator ≠ digit
            return error("Misplaced separator")
        fi
        return middle()
    fi
}
# Tokenize a name: a letter, then the rest of the name via middle(),
# with an optional trailing '?'.
def letter: ƒ () {
    call middle()
    if token.type ≠ "error"
        if character = "?"
            call next()
        fi
        set token.type: "name"
# Fixed: slice with the text(...) function like the other tokenators;
# 'source.slice(...)' is a JavaScript idiom, not part of this module.
        set token.text: text(source, token.at, at)
    fi
}
# The characters that may follow '\' in a text literal, mapped to the
# characters they produce.
def escape: {
    b: "\b" # backslash
    d: "»" # droite
    g: "«" # gauche
    n: "\n" # linefeed
    q: "\q" # double quote
    r: "\r" # carriage return
    t: "\t" # tab
}
# The characters allowed as digits in a '\u{...}' codepoint escape.
# NOTE(review): only uppercase A-F are accepted, so lowercase hex is
# rejected — confirm against the language specification.
def hex: {
    "0": true
    "1": true
    "2": true
    "3": true
    "4": true
    "5": true
    "6": true
    "7": true
    "8": true
    "9": true
    "A": true
    "B": true
    "C": true
    "D": true
    "E": true
    "F": true
}
# Tokenize a "\q"-quoted text literal, processing '\' escapes including
# the '\u{...}' codepoint escape. The token's text is the decoded value,
# not the raw source slice.
def quote: ƒ () {
    var codepoint
    var escapee
    var value: ""
    do
        call next()
        if character = "\q"
            break
        fi
        if character = "\n" \/ character = null
            return error("Unclosed text literal")
        fi
        if character = "\b"
            call next()
# Fixed: 'escapee' is declared with 'var' above, so assignment uses
# 'set'; 'let' is not used anywhere else in this module.
            set escapee: escape[character]
            if text?(escapee)
                set value: value ~ escapee
            else if character = "u"
                call next()
                if character ≠ "{"
                    return error("Missing '{'")
                fi
                call next()
                if not(hex[character])
                    return error("Missing hex codepoint")
                fi
                set escapee: character
# Fixed: the digit-gathering loop was missing its 'od' and the '}'
# check was missing its 'fi', which left the '}' check and the
# codepoint conversion trapped inside the loop.
                do
                    call next()
                    if not(hex[character])
                        break
                    fi
                    set escapee: escapee ~ character
                od
                if character ≠ "}"
                    return error("Missing '}'")
                fi
                set codepoint: number(escapee, 16)
# Fixed: reject codepoints past the Unicode maximum (0x10FFFF = 1114111);
# the old bound of 2^32 let invalid codepoints through. The null test
# comes first so a failed conversion is never compared numerically.
                if codepoint = null \/ codepoint ≥ 1114112
                    return error("Bad codepoint")
                fi
# NOTE(review): 'String.fromCodePoint' looks like a JavaScript leftover —
# confirm the host supplies it, or replace it with this language's
# codepoint-to-text function.
                set value: value ~ String.fromCodePoint(codepoint)
            else
                return error("Bad escapement")
            fi
        else
            set value: value ~ character
        fi
    od
    set token.type: "text"
    set token.text: value
    set token.quote: "\q"
    return next()
}
# Tokenize a «...» text literal. Chevrons nest; the token's text is the
# raw source between the outermost pair.
def chevron: ƒ () {
    var nesting: 0
    do
        call next()
        if character = "»"
            if nesting = 0
                break
            fi
            set nesting: nesting - 1
        else if character = "«"
            set nesting: nesting + 1
# Fixed: end of source is null (as tested everywhere else in this
# module), not the empty text, so the old test could loop forever on
# an unclosed literal.
        else if character = null
            return error("Unclosed text literal")
        fi
    od
    call next()
    set token.type: "text"
    set token.text: text(source, token.at + 1, at - 1)
    set token.quote: "«"
}
# Map from each significant leading character to the tokenator function
# that consumes a token starting with that character.
def tokenators: {
# Whitespace and comments.
    "\n": special
    "\r": carriage_return
    " ": space
    "#": comment
# Punctuators and operators.
    ".": special
# NOTE(review): an empty-text key can never match, because 'character'
# is always one character or null — a character was probably lost here;
# confirm which punctuator was intended.
    "": special
    ":": special
    "/": slash
    "\b": backslash
    "|": special
    "ƒ": special
    "¶": special
    "\q": quote
    "«": chevron
    "-": minus
    "+": special
    "*": special
    "@": special
    "(": special
    ")": special
    "[": special
    "]": special
    "{": special
    "}": special
    "=": special
    "<": special
    ">": special
    "≠": special
    "≤": special
    "≥": special
    "÷": special
    "~": special
    "≈": special
# A digit starts a number token.
    "0": digit
    "1": digit
    "2": digit
    "3": digit
    "4": digit
    "5": digit
    "6": digit
    "7": digit
    "8": digit
    "9": digit
# A letter starts a name token.
    "A": letter
    "B": letter
    "C": letter
    "D": letter
    "E": letter
    "F": letter
    "G": letter
    "H": letter
    "I": letter
    "J": letter
    "K": letter
    "L": letter
    "M": letter
    "N": letter
    "O": letter
    "P": letter
    "Q": letter
    "R": letter
    "S": letter
    "T": letter
    "U": letter
    "V": letter
    "W": letter
    "X": letter
    "Y": letter
    "Z": letter
    "a": letter
    "b": letter
    "c": letter
    "d": letter
    "e": letter
    "f": letter
    "g": letter
    "h": letter
    "i": letter
    "j": letter
    "k": letter
    "l": letter
    "m": letter
    "n": letter
    "o": letter
    "p": letter
    "q": letter
    "r": letter
    "s": letter
    "t": letter
    "u": letter
    "v": letter
    "w": letter
    "x": letter
    "y": letter
    "z": letter
}
# The module's product: given a source text, return the array of its
# tokens, or the first error token if tokenizing fails. Each token
# records its type, text, and starting position (at, column_nr, line_nr).
return ƒ tokenize(source_text) {
    set source: source_text
    set at: 0
    set line_nr: 0
    set column_nr: 0
    set tokens: []
    set character: source[0]
    set tokenator: tokenators[character]
    do
        if character = null
            break
        fi
# Start a token record at the current position; the tokenator fills in
# its type and text.
        set token: {
            at
            column_nr
            line_nr
        }
        if function?(tokenator)
            call tokenator()
        else
            call error("Bad token")
        fi
# Fixed: append with the empty-subscript 'set' form; 'tokens.push(...)'
# is a JavaScript idiom and was not even invoked with 'call'.
        set tokens[]: token
        if token.type = "error"
            return token
        fi
    od
    return tokens
}
end tokenizer