Misty Programming Language:

tokenize.mst

misty module tokenizer

var at              # index into source of the current character
var character       # the current character, source[at]
var column_nr       # column counter, reset to 0 by next() after a linefeed
var line_nr         # line counter, incremented by next() after a linefeed
var source          # the text being tokenized
var token           # the token record currently being filled in
var tokenator       # tokenators[character], the handler for the current character
var tokens          # the array of tokens produced so far

def error: ƒ (reason) {
    # Convert the token under construction into an error token.
    # reason: a text explaining what went wrong.
    # The token's text runs from the token's start through the current
    # character, and where records the position at which the problem was seen.
    set token.reason: reason
    set token.where: at
    set token.text: text(source, token.at, at + 1)
    set token.type: "error"
}

def next: ƒ () {
    # Advance to the following character of source, keeping the line and
    # column counters, the current character, and its tokenator in step.
    # The counters are adjusted according to the character being left behind.
    if character ≠ "\n"
        set column_nr: column_nr + 1
    else
        set line_nr: line_nr + 1
        set column_nr: 0
    fi
    set at: at + 1
    set character: source[at]
    set tokenator: tokenators[character]
}

def special: ƒ () {
    # A single character token: the character is both the text and the type.
    set token.text: character
    set token.type: token.text
    return next()
}

def carriage_return: ƒ () {
    # A carriage return, alone or followed by a linefeed, yields one "\n" token.
    call next()
    if character ≠ "\n"
        # Lone carriage return: report it as a linefeed token.
        set token.type: "\n"
        set token.text: "\n"
        return
    fi
    # Carriage return + linefeed: let special() consume the linefeed.
    return special()
}

def comment: ƒ () {
    # Tokenize a comment. Characters are consumed up to, but not including,
    # the next linefeed or the end of the source.
    # The token text runs from the token's start, so when this is reached
    # through space() the leading spaces are part of the comment text.
    do
        call next()
        if character = "\n" \/ character = null
            break
        fi
    od
    set token.type: "comment"
    # Use the text function, as the other tokenators in this module do,
    # rather than a slice method on source.
    set token.text: text(source, token.at, at)
}

def space: ƒ () {
    # Tokenize a run of spaces. If the run is immediately followed by "#",
    # the whole thing is handed to comment() and becomes a comment token.
    call next()
    do
        if character ≠ " "
            break
        fi
        call next()
    od
    if character = "#"
        return comment()
    else
        set token.type: "space"
        set token.text: text(source, token.at, at)
    fi
}

def slash: ƒ () {
    # A slash is either the "/" token or, when a backslash follows,
    # the two character "/\b" token.
    call next()
    if character = "\b"
        set token.type: "/\b"
        set token.text: "/\b"
        return next()
    fi
    set token.type: "/"
    set token.text: "/"
}

def backslash: ƒ () {
    # A backslash is either the "\b" token or, when a slash follows,
    # the two character "\b/" token.
    call next()
    if character = "/"
        set token.type: "\b/"
        set token.text: "\b/"
        return next()
    fi
    set token.type: "\b"
    set token.text: "\b"
}

def digit: ƒ () {
    # Tokenize a number: digits, at most one "." followed by a digit, and at
    # most one "e" exponent with an optional "-" followed by a digit.
    # On a malformed number the token becomes an error token.
    var exponent_seen: false
    var period_closed: false
    do scan
        call next()
        if tokenator ≠ digit
            if character = "e"
                if exponent_seen
                    return error("Unexpected 'e' in number")
                fi
                # Once the exponent starts, a "." is no longer permitted.
                set period_closed: true
                set exponent_seen: true
                call next()
                if character = "-"
                    call next()
                fi
                if tokenator ≠ digit
                    return error("Expected digits after 'e'")
                fi
            else if character = "."
                if period_closed
                    return error("Unexpected '.' in number")
                fi
                set period_closed: true
                call next()
                if tokenator ≠ digit
                    return error("Expected digits after '.'")
                fi
            else
                break scan
            fi
        fi
    od scan
    set token.type: "number"
    set token.text: text(source, token.at, at)
}

def minus: ƒ () {
    # A "-" is either the "-" token or, when a digit follows immediately,
    # the start of a negative number.
    call next()
    if tokenator ≠ digit
        set token.type: "-"
        set token.text: "-"
        return
    fi
    # digit() takes the token text from token.at, which is the position of
    # the "-", so the sign is already part of the text. Nothing is prepended:
    # doing so would double the sign and would also alter an error token.
    return digit()
}

def middle: ƒ () {
    # Consume the remainder of a name: letters and digits, with single "_" or
    # "$" separators that must each be followed by a letter or a digit.
    # Stops at the first character that cannot continue the name.
    do
        call next()
        if tokenator ≠ letter /\ tokenator ≠ digit
            if character ≠ "_" /\ character ≠ "$"
                return
            fi
            # A separator: the character after it must continue the name.
            call next()
            if tokenator ≠ letter /\ tokenator ≠ digit
                return error("Misplaced separator")
            fi
        fi
    od
}

def letter: ƒ () {
    # Tokenize a name. A name starts with a letter, continues as accepted by
    # middle(), and may end with a single "?".
    # If middle() turned the token into an error, it is left untouched.
    call middle()
    if token.type ≠ "error"
        if character = "?"
            call next()
        fi
        set token.type: "name"
        # Use the text function, as the other tokenators in this module do,
        # rather than a slice method on source.
        set token.text: text(source, token.at, at)
    fi
}

# The single character escapes recognized after a backslash in a quoted
# text literal, mapped to the text each one stands for.

def escape: {
    "b": "\b"   # backslash
    "d": "»"    # droite (right chevron)
    "g": "«"    # gauche (left chevron)
    "n": "\n"   # linefeed
    "q": "\q"   # double quote
    "r": "\r"   # carriage return
    "t": "\t"   # tab
}

# The set of characters accepted as hexadecimal digits inside a \u{...}
# escape. Only the digits and the uppercase letters A through F are listed.

def hex: {
    "0": true
    "1": true
    "2": true
    "3": true
    "4": true
    "5": true
    "6": true
    "7": true
    "8": true
    "9": true
    "A": true
    "B": true
    "C": true
    "D": true
    "E": true
    "F": true
}

def quote: ƒ () {
    # Tokenize a text literal delimited by double quotes.
    # The token's text is the decoded value: single character escapes are
    # looked up in escape, and \u{HEX} escapes are converted to the character
    # with that codepoint. The literal must close before the end of the line.
    # On any malformation the token becomes an error token.
    var codepoint
    var escapee
    var value: ""
    do
        call next()
        if character = "\q"
            break
        fi
        if character = "\n" \/ character = null
            return error("Unclosed text literal")
        fi
        if character = "\b"
            call next()
            set escapee: escape[character]
            if text?(escapee)
                set value: value ~ escapee
            else if character = "u"
                call next()
                if character ≠ "{"
                    return error("Missing '{'")
                fi
                call next()
                if not(hex[character])
                    return error("Missing hex codepoint")
                fi
                # Gather the run of hex digits.
                set escapee: character
                do
                    call next()
                    if not(hex[character])
                        break
                    fi
                    set escapee: escapee ~ character
                od
                if character ≠ "}"
                    return error("Missing '}'")
                fi
                set codepoint: number(escapee, 16)
                # Test for null first so that the comparison is only made
                # on a number.
                if codepoint = null \/ codepoint ≥ 4294967296
                    return error("Bad codepoint")
                fi
                # NOTE(review): String.fromCodePoint is a JavaScript facility;
                # confirm what this module should use to make a character
                # from a codepoint.
                set value: value ~ String.fromCodePoint(codepoint)
            else
                return error("Bad escapement")
            fi
        else
            set value: value ~ character
        fi
    od
    set token.type: "text"
    set token.text: value
    set token.quote: "\q"
    # Step past the closing double quote.
    return next()
}

def chevron: ƒ () {
    # Tokenize a text literal delimited by « and ». Chevrons may nest; the
    # literal ends at the » that matches the opening «. The token's text is
    # the raw source between the outermost chevrons.
    # nesting counts the « seen inside the literal that are not yet closed.
    var nesting: 0
    do
        call next()
        if character = "»"
            if nesting = 0
                break
            fi
            set nesting: nesting - 1
        else if character = "«"
            set nesting: nesting + 1
        else if character = null
            # End of source: the rest of this module tests for it as null.
            # Without this test an unclosed literal would loop forever.
            return error("Unclosed text literal")
        fi
    od
    # Step past the closing chevron.
    call next()
    set token.type: "text"
    set token.text: text(source, token.at + 1, at - 1)
    set token.quote: "«"
}

# The dispatch table: each character that may begin a token is mapped to the
# function that tokenizes it. A character with no entry has no tokenator,
# which tokenize reports as a "Bad token" error.

def tokenators: {
    "\n": special
    "\r": carriage_return
    " ": space
    "#": comment
    ".": special
    # This key was the empty text, which can never match a character.
    # NOTE(review): restored as the comma, which had no entry; confirm.
    ",": special
    ":": special
    "/": slash
    "\b": backslash
    "|": special
    "ƒ": special
    "¶": special
    "\q": quote
    "«": chevron
    "-": minus
    "+": special
    "*": special
    "@": special
    "(": special
    ")": special
    "[": special
    "]": special
    "{": special
    "}": special
    "=": special
    "<": special
    ">": special
    "≠": special
    "≤": special
    "≥": special
    "÷": special
    "~": special
    "≈": special
    "0": digit
    "1": digit
    "2": digit
    "3": digit
    "4": digit
    "5": digit
    "6": digit
    "7": digit
    "8": digit
    "9": digit
    "A": letter
    "B": letter
    "C": letter
    "D": letter
    "E": letter
    "F": letter
    "G": letter
    "H": letter
    "I": letter
    "J": letter
    "K": letter
    "L": letter
    "M": letter
    "N": letter
    "O": letter
    "P": letter
    "Q": letter
    "R": letter
    "S": letter
    "T": letter
    "U": letter
    "V": letter
    "W": letter
    "X": letter
    "Y": letter
    "Z": letter
    "a": letter
    "b": letter
    "c": letter
    "d": letter
    "e": letter
    "f": letter
    "g": letter
    "h": letter
    "i": letter
    "j": letter
    "k": letter
    "l": letter
    "m": letter
    "n": letter
    "o": letter
    "p": letter
    "q": letter
    "r": letter
    "s": letter
    "t": letter
    "u": letter
    "v": letter
    "w": letter
    "x": letter
    "y": letter
    "z": letter
}

return ƒ tokenize(source_text) {
    # Tokenize source_text.
    # Returns the array of tokens when the whole text was tokenized.
    # If a token is in error, that single error token is returned instead.
    # Each token starts as a record of at, column_nr, and line_nr, to which
    # the tokenator adds type and text (and, for errors, where and reason).
    set source: source_text
    set at: 0
    set line_nr: 0
    set column_nr: 0
    set tokens: []
    set character: source[0]
    set tokenator: tokenators[character]
    do
        # A null character marks the end of the source.
        if character = null
            break
        fi
        set token: {
            at
            column_nr
            line_nr
        }
        if function?(tokenator)
            call tokenator()
        else
            # The current character cannot begin any token.
            call error("Bad token")
        fi
        # Append with the set statement's [] form rather than a push method.
        set tokens[]: token
        if token.type = "error"
            return token
        fi
    od
    return tokens
}

end tokenizer