// Misty Programming Language:
//
// tokenize.js

// tokenize.js  # Misty tokenizer
// 2023-12-05

const tokenize = (function () {

    // Working state shared by every tokenator below. 'tokenize' resets all
    // of it at the start of each call.

    let at;             // index into 'source' of the current character
    let character;      // source[at], or undefined once past the end
    let column_nr;      // zero based column of the current character
    let line_nr;        // zero based line of the current character
    let source;         // the text being tokenized
    let token;          // the token object currently being filled in
    let tokenator;      // tokenators[character], or undefined if there is none
    let tokens;         // the tokens produced so far

    // The dispatch table. It maps the first character of a token to the
    // function that scans the rest of that token. A character that is not in
    // this table can not begin a token.

    const tokenators = {
        "\n": newline,
        "\r": carriage_return,
        " ": space,
        "#": comment,
        ".": special,
        ",": special,
        ":": special,
        "/": slash,
        "\\": backslash,
        "|": special,
        "ƒ": special,
        "¶": special,
        "!": special,
        "\"": quote,
        "«": chevron,
        "-": minus,
        "+": special,
        "*": special,
        "@": special,
        "(": special,
        ")": special,
        "[": special,
        "]": special,
        "{": special,
        "}": special,
        "=": special,
        "<": less,
        ">": greater,
        "≠": special,
        "≤": special,
        "≥": special,
        "÷": special,
        "~": special,
        "≈": special,
        "%": special,
        "^": special,
        "&": special,
        "'": special,
        "`": special,
        "0": digit,
        "1": digit,
        "2": digit,
        "3": digit,
        "4": digit,
        "5": digit,
        "6": digit,
        "7": digit,
        "8": digit,
        "9": digit,
        "A": letter,
        "B": letter,
        "C": letter,
        "D": letter,
        "E": letter,
        "F": letter,
        "G": letter,
        "H": letter,
        "I": letter,
        "J": letter,
        "K": letter,
        "L": letter,
        "M": letter,
        "N": letter,
        "O": letter,
        "P": letter,
        "Q": letter,
        "R": letter,
        "S": letter,
        "T": letter,
        "U": letter,
        "V": letter,
        "W": letter,
        "X": letter,
        "Y": letter,
        "Z": letter,
        "a": letter,
        "b": letter,
        "c": letter,
        "d": letter,
        "e": letter,
        "f": letter,
        "g": letter,
        "h": letter,
        "i": letter,
        "j": letter,
        "k": letter,
        "l": letter,
        "m": letter,
        "n": letter,
        "o": letter,
        "p": letter,
        "q": letter,
        "r": letter,
        "s": letter,
        "t": letter,
        "u": letter,
        "v": letter,
        "w": letter,
        "x": letter,
        "y": letter,
        "z": letter
    };

    // Turn the current token into an error token. 'text' is the source from
    // the start of the token through the current character, 'where' is the
    // index of the current character, and 'reason' is the given message.

    function error(reason) {
        token.kind = "error";
        token.text = source.slice(token.at, at + 1);
        token.where = at;
        token.reason = reason;
    }

    // Advance to the next character and look up its tokenator. Past the end
    // of the source, 'character' and 'tokenator' are both undefined.

    function next() {
        at += 1;
        column_nr += 1;
        character = source[at];
        tokenator = tokenators[character];
    }

    // A single character token whose kind is the character itself.

    function special() {
        token.kind = character;
        token.text = character;
        return next();
    }

    // A line feed. The column is set to -1 so that the 'next' call leaves it
    // at 0 for the first character of the following line.

    function newline() {
        token.kind = "newline";
        token.text = "\n";
        column_nr = -1;
        line_nr += 1;
        return next();
    }

    // A carriage return, alone or followed by a line feed. Either way it
    // produces one "newline" token and counts as one line.

    function carriage_return() {
        token.kind = "newline";
        line_nr += 1;
        next();
        if (character === "\n") {
            token.text = "\r\n";
            column_nr = -1;
            return next();
        } else {
            token.text = "\n";

            // We are already on the first character of the following line.

            column_nr = 0;
            return undefined;
        }
    }

    // A comment runs from '#' to the end of the line. The line break is not
    // part of the comment. A carriage return ends the line just as a line
    // feed does, so that sources with CR or CRLF line endings work.

    function comment() {
        while (true) {
            next();
            if (
                character === "\n"
                || character === "\r"
                || character === undefined
            ) {
                break;
            }
        }
        token.kind = "comment";
        token.text = source.slice(token.at, at);
    }

    // A run of one or more spaces.

    function space() {
        while (true) {
            next();
            if (character !== " ") {
                break;
            }
        }
        token.kind = "space";
        token.text = source.slice(token.at, at);
    }

    // Either "/" or the two character token "/\".

    function slash() {
        next();
        if (character !== "\\") {
            token.kind = "/";
            token.text = "/";
            return;
        }
        token.kind = "/\\";
        token.text = "/\\";
        return next();
    }

    // Either "\" or the two character token "\/".

    function backslash() {
        next();
        if (character !== "/") {
            token.kind = "\\";
            token.text = "\\";
            return;
        }
        token.kind = "\\/";
        token.text = "\\/";
        return next();
    }

    // One of "<", "<=", or "<>".

    function less() {
        next();
        if (character === "=") {
            token.kind = "<=";
            token.text = "<=";
            return next();
        }
        if (character === ">") {
            token.kind = "<>";
            token.text = "<>";
            return next();
        }
        token.kind = "<";
        token.text = "<";
        return;
    }

    // Either ">" or ">=".

    function greater() {
        next();
        if (character === "=") {
            token.kind = ">=";
            token.text = ">=";
            return next();
        }
        token.kind = ">";
        token.text = ">";
        return;
    }

    // A number: digits, then optionally one '.' followed by digits, then
    // optionally one 'e' followed by an optional '-' and digits. The token
    // text is the source from the start of the token, so when this is called
    // from 'minus' the text includes the leading '-'.

    function digit() {
        let e_seen = false;
        let period_seen = false;
        while (true) {
            next();
            if (tokenator !== digit) {
                if (character === ".") {
                    if (period_seen) {
                        return error("Unexpected '.' in number");
                    }
                    period_seen = true;
                    next();
                    if (tokenator !== digit) {
                        return error("Expected digits after '.'");
                    }
                } else if (character === "e") {
                    if (e_seen) {
                        return error("Unexpected 'e' in number");
                    }
                    e_seen = true;

                    // A fraction may not come after the exponent.

                    period_seen = true;
                    next();
                    if (character === "-") {
                        next();
                    }
                    if (tokenator !== digit) {
                        return error("Expected digits after 'e'");
                    }
                } else {
                    break;
                }
            }
        }
        token.kind = "number";
        token.text = source.slice(token.at, at);
    }

    // A '-' that is immediately followed by a digit begins a negative number.
    // Otherwise it is the "-" token.

    function minus() {
        next();
        if (tokenator !== digit) {
            token.kind = "-";
            token.text = "-";
            return;
        }

        // 'digit' slices the text from token.at, which is the position of the
        // '-', so the sign is already in the text. Nothing is prepended.

        return digit();
    }

    // Scan the rest of a name after its first letter. Letters and digits may
    // be separated by single '_' or '$' characters; a separator must be
    // followed by a letter or digit. This is a loop and not a recursion so
    // that the length of a name is not limited by the depth of the stack.

    function middle() {
        while (true) {
            next();
            if (tokenator !== letter && tokenator !== digit) {
                if (character !== "_" && character !== "$") {
                    return;
                }
                next();
                if (tokenator !== letter && tokenator !== digit) {
                    return error("Misplaced separator");
                }
            }
        }
    }

    // A name. It may end with a single '?'.

    function letter() {
        middle();
        if (token.kind !== "error") {
            if (character === "?") {
                next();
            }
            token.kind = "name";
            token.text = source.slice(token.at, at);
        }
    }

    // The single character escapes that may follow '\' in a quoted text.

    const escape = {
        b: "\\",
        d: "»",
        g: "«",
        n: "\n",
        q: "\"",
        r: "\r",
        t: "\t"
    };

    // The hexadecimal digits accepted in \u{...}. Only upper case is accepted.

    const hex = {
        "0": true,
        "1": true,
        "2": true,
        "3": true,
        "4": true,
        "5": true,
        "6": true,
        "7": true,
        "8": true,
        "9": true,
        "A": true,
        "B": true,
        "C": true,
        "D": true,
        "E": true,
        "F": true
    };

    // A text literal in double quotes. It must close on the line on which it
    // opened. The token text is the value of the literal with the escapes
    // replaced, not the source text.

    function quote() {
        let value = "";
        let escapee;
        let codepoint = 0;
        while (true) {
            next();
            if (character === "\"") {
                break;
            }
            if (
                character === "\n"
                || character === "\r"
                || character === undefined
            ) {
                return error("Unclosed text literal");
            }
            if (character === "\\") {
                next();
                escapee = escape[character];
                if (typeof escapee === "string") {
                    value += escapee;
                } else if (character === "u") {
                    next();
                    if (character !== "{") {
                        return error("Missing '{'");
                    }
                    next();
                    if (!hex[character]) {
                        return error("Missing hex codepoint");
                    }
                    escapee = character;
                    while (true) {
                        next();
                        if (!hex[character]) {
                            break;
                        }
                        escapee += character;
                    }
                    if (character !== "}") {
                        return error("Missing '}'");
                    }
                    codepoint = Number.parseInt(escapee, 16);

                    // String.fromCodePoint throws a RangeError for anything
                    // above 0x10FFFF, so reject those here as a token error.

                    if (!Number.isFinite(codepoint) || codepoint > 0x10FFFF) {
                        return error("Bad codepoint");
                    }
                    value += String.fromCodePoint(codepoint);
                } else {
                    return error("Bad escapement");
                }
            } else {
                value += character;
            }
        }
        token.kind = "text";
        token.text = value;
        token.quote = "\"";
        return next();
    }

    // A text literal in chevrons. Chevrons nest, no escapes are processed,
    // and the literal may span lines. The token text is the source between
    // the outermost chevrons.

    function chevron() {
        let nesting = 0;
        while (true) {
            next();
            if (character === undefined) {

                // The source ended before the closing chevron.

                return error("Unclosed text literal");
            }
            if (character === "»") {
                if (nesting === 0) {
                    break;
                }
                nesting -= 1;
            } else if (character === "«") {
                nesting += 1;
            } else if (
                character === "\n"
                || (character === "\r" && source[at + 1] !== "\n")
            ) {

                // Keep the position of later tokens right. A CRLF pair is
                // counted once, on its line feed.

                line_nr += 1;
                column_nr = -1;
            }
        }
        next();
        token.kind = "text";
        token.text = source.slice(token.at + 1, at - 1);
        token.quote = "«";
    }

    // Tokenize a source text. On success the result is an array of token
    // objects, each having 'at', 'column_nr', 'line_nr', 'kind', and 'text'
    // properties (text tokens also have 'quote'). At the first error the
    // result is instead the single error token, with 'kind' "error" and with
    // 'where' and 'reason' properties.

    return function tokenize(source_text) {
        source = source_text;
        at = 0;
        line_nr = 0;
        column_nr = 0;
        tokens = [];
        character = source[0];
        tokenator = tokenators[character];
        while (true) {
            if (character === undefined) {
                break;
            }
            token = {
                at,
                column_nr,
                line_nr
            };
            if (typeof tokenator !== "function") {
                error("Bad token");
            } else {
                tokenator();
            }
            tokens.push(token);
            if (token.kind === "error") {
                return token;
            }
        }
        return tokens;
    };
}());