// tokenize.js
// The Misty tokenizer: turns source text into an array of token objects.
const tokenize = (function () {

    // Scanner state. It is shared by every tokenator below and is reset at
    // the start of each call to tokenize.

    let at;             // Index in 'source' of the current character.
    let character;      // source[at], or undefined once past the end.
    let column_nr;      // Zero-based column of the current character.
    let line_nr;        // Zero-based line of the current character.
    let source;         // The text being tokenized.
    let token;          // The token object currently being filled in.
    let tokenator;      // tokenators[character], or undefined if there is none.
    let tokens;         // The tokens produced so far.

    // Turn the current token into an error token. Its text is everything from
    // the start of the token to the end of the source, and 'where' is the
    // position at which the problem was detected.

    function error(reason) {
        token.type = "error";
        token.text = source.slice(token.at);
        token.where = at;
        token.reason = reason;
    }

    // Advance to the next character, keeping the line and column counters and
    // the current tokenator in step with it.

    function next() {
        if (character === "\n") {
            column_nr = 0;
            line_nr += 1;
        } else {
            column_nr += 1;
        }
        at += 1;
        character = source[at];
        tokenator = tokenators[character];
    }

    // A single character token whose type is the character itself.

    function special() {
        token.type = character;
        token.text = character;
        return next();
    }

    // Both "\r\n" and a lone "\r" produce a single "\n" token.

    function carriage_return() {
        next();
        if (character === "\n") {
            return special();
        }
        token.type = "\n";
        token.text = "\n";
    }

    // A run of one or more spaces.

    function space() {
        while (true) {
            next();
            if (character !== " ") {
                break;
            }
        }
        token.type = "space";
        token.text = source.slice(token.at, at);
    }

    // A comment runs from "#" to the end of the line. The line ending itself
    // is not part of the comment. "\r" counts as a line ending here because
    // carriage_return tokenizes it as one; otherwise comments in CRLF sources
    // would carry a trailing "\r".

    function comment() {
        while (true) {
            next();
            if (
                character === "\n"
                || character === "\r"
                || character === undefined
            ) {
                break;
            }
        }
        token.type = "comment";
        token.text = source.slice(token.at, at);
    }

    // "." or "...". Exactly two dots is an error.

    function period() {
        next();
        if (character !== ".") {
            token.type = ".";
            token.text = ".";
            return;
        }
        next();
        if (character !== ".") {
            return error("Two dots");
        }
        token.type = "...";
        token.text = "...";
        return next();
    }

    // "/" or "/\".

    function slash() {
        next();
        if (character !== "\\") {
            token.type = "/";
            token.text = "/";
            return;
        }
        token.type = "/\\";
        token.text = "/\\";
        return next();
    }

    // "\" or "\/".

    function backslash() {
        next();
        if (character !== "/") {
            token.type = "\\";
            token.text = "\\";
            return;
        }
        token.type = "\\/";
        token.text = "\\/";
        return next();
    }

    // A number: digits, optionally a "." followed by digits, optionally an
    // "e" with an optional "-" followed by digits. On entry 'character' is a
    // digit that is already known to belong to the number.

    function digit() {
        let e_seen = false;
        let period_seen = false;
        while (true) {
            next();
            if (tokenator !== digit) {
                if (character === ".") {
                    if (period_seen) {
                        return error("Unexpected '.' in number");
                    }
                    period_seen = true;
                    next();
                    if (tokenator !== digit) {
                        return error("Expected digits after '.'");
                    }
                } else if (character === "e") {
                    if (e_seen) {
                        return error("Unexpected 'e' in number");
                    }
                    e_seen = true;

                    // A fraction may not appear in the exponent.

                    period_seen = true;
                    next();
                    if (character === "-") {
                        next();
                    }
                    if (tokenator !== digit) {
                        return error("Expected digits after 'e'");
                    }
                } else {
                    break;
                }
            }
        }
        token.type = "number";
        token.text = source.slice(token.at, at);
    }

    // "-" is either the minus token or the sign of a number literal. The
    // number's text is sliced from token.at, which is the position of the
    // "-", so the sign is already included and must not be prepended again.

    function minus() {
        next();
        if (tokenator !== digit) {
            token.type = "-";
            token.text = "-";
            return;
        }
        digit();
    }

    // Consume the remainder of a name: letters and digits, with single "_" or
    // "$" separators that must each be followed by a letter or digit. This is
    // a loop rather than a recursion so that the length of a name is not
    // limited by the depth of the call stack.

    function middle() {
        while (true) {
            next();
            if (tokenator !== letter && tokenator !== digit) {
                if (character !== "_" && character !== "$") {
                    return;
                }
                next();
                if (tokenator !== letter && tokenator !== digit) {
                    return error("Misplaced separator");
                }
            }
        }
    }

    // A name. It starts with a letter and may end with "?".

    function letter() {
        middle();
        if (token.type !== "error") {
            if (character === "?") {
                next();
            }
            token.type = "name";
            token.text = source.slice(token.at, at);
        }
    }

    // The single letter escapes that may follow "\" in a quoted text literal.

    const escapes = {
        b: "\\",
        d: "»",
        g: "«",
        n: "\n",
        q: "\"",
        r: "\r",
        t: "\t"
    };

    // Only upper case hex digits are accepted in \u{...} escapes.

    const hex = new Set("0123456789ABCDEF");

    // The largest Unicode code point. String.fromCodePoint throws a
    // RangeError for anything larger.

    const max_codepoint = 0x10FFFF;

    // A text literal in double quotes, with "\" escapes. It must be closed on
    // the line on which it was opened. token.text is the decoded value.

    function quote() {
        let value = "";
        let escapee;
        let codepoint = 0;
        while (true) {
            next();
            if (character === "\"") {
                break;
            }
            if (character === "\n" || character === undefined) {
                return error("Unclosed text literal");
            }
            if (character === "\\") {
                next();
                escapee = escapes[character];
                if (typeof escapee === "string") {
                    value += escapee;
                } else if (character === "u") {
                    next();
                    if (character !== "{") {
                        return error("Missing '{'");
                    }
                    next();
                    if (!hex.has(character)) {
                        return error("Missing hex codepoint");
                    }
                    escapee = character;
                    while (true) {
                        next();
                        if (!hex.has(character)) {
                            break;
                        }
                        escapee += character;
                    }
                    if (character !== "}") {
                        return error("Missing '}'");
                    }
                    codepoint = Number.parseInt(escapee, 16);

                    // This comparison also rejects Infinity, which parseInt
                    // yields for absurdly long digit strings.

                    if (codepoint > max_codepoint) {
                        return error("Bad codepoint");
                    }
                    value += String.fromCodePoint(codepoint);
                } else {
                    return error("Bad escapement");
                }
            } else {
                value += character;
            }
        }
        token.type = "text";
        token.text = value;
        token.quote = "\"";
        return next();
    }

    // A text literal in chevrons. Chevrons nest, there are no escapes, and
    // the literal may span lines. token.text is the raw text between the
    // outermost chevrons.

    function chevron() {
        let nesting = 0;
        while (true) {
            next();
            if (character === "»") {
                if (nesting === 0) {
                    break;
                }
                nesting -= 1;
            } else if (character === "«") {
                nesting += 1;
            } else if (character === undefined) {

                // The end of the source is signalled by undefined, not by
                // the empty string. Testing for "" would never match and the
                // loop would never end.

                return error("Unclosed text literal");
            }
        }
        next();
        token.type = "text";
        token.text = source.slice(token.at + 1, at - 1);
        token.quote = "«";
    }

    // The dispatch table. It maps the first character of a token to the
    // function that consumes that token. A character that is not in this
    // table can not start a token.

    const tokenators = {
        "\n": special,
        "\r": carriage_return,
        " ": space,
        "#": comment,
        ".": period,
        ",": special,
        ":": special,
        "/": slash,
        "\\": backslash,
        "|": special,
        "ƒ": special,
        "¶": special,
        "\"": quote,
        "«": chevron,
        "-": minus,
        "+": special,
        "*": special,
        "@": special,
        "(": special,
        ")": special,
        "[": special,
        "]": special,
        "{": special,
        "}": special,
        "=": special,
        "<": special,
        ">": special,
        "≠": special,
        "≤": special,
        "≥": special,
        "÷": special,
        "~": special,
        "≈": special
    };
    for (const ch of "0123456789") {
        tokenators[ch] = digit;
    }
    for (const ch of "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") {
        tokenators[ch] = letter;
    }

    // tokenize(source_text) returns an array of token objects. Every token has
    //      at          the index in source_text at which the token starts
    //      column_nr   the zero-based column at which the token starts
    //      line_nr     the zero-based line on which the token starts
    //      type        "name", "number", "text", "space", "comment", "error",
    //                  or the punctuator itself
    //      text        the text or decoded value of the token
    // Text tokens also have a 'quote' property, either "\"" or "«". An error
    // token also has 'where' and 'reason' properties. Tokenizing stops at the
    // first error, so an error token can only be the last token in the array.

    return function tokenize(source_text) {
        source = source_text;
        at = 0;
        line_nr = 0;
        column_nr = 0;
        tokens = [];
        character = source[0];
        tokenator = tokenators[character];
        while (true) {
            if (character === undefined) {
                break;
            }
            token = {
                at,
                column_nr,
                line_nr
            };
            if (typeof tokenator !== "function") {
                error("Bad token");
            } else {
                tokenator();
            }
            tokens.push(token);
            if (token.type === "error") {
                break;
            }
        }
        return tokens;
    };
}());