2024-07-26
This post presents a Pratt parser for a subset of JavaScript implemented in Python.
This type of parser has become fairly popular due to the simplicity with which it handles operator precedence. In fact I'm part of a long lineage of bloggers - it's the "monad is a burrito" of parsing posts. I thought I'd bother writing up my attempt as I had a fun time writing it and was particularly happy with the brevity of the resulting code. Thank you thank you Andy C for bringing it into my life.
The parser will turn the following source:
{
"a": {"b": {"c": "d"}},
"foo": (1 === 2)
? "bar"
: 3 * 4 + 5
}
Into the following AST:
({
(: "a" ({ (: "b" ({ (: "c" "d")))))
(: "foo" (? (=== 1 2)
"bar"
(+ (* 3 4) 5))))
Some more examples:
1 |
1 |
1 + 2 |
(+ 1 2) |
1 + 2 * 3 |
(+ 1 (* 2 3)) |
1 * 2 + 3 |
(+ (* 1 2) 3) |
1 * (2 + 3) |
(* 1 (+ 2 3)) |
{1, 2, 3} |
({ 1 2 3) |
{"a": 1, "c": 1 + 2} |
({ (: "a" 1) (: "c" (+ 1 2))) |
from __future__ import annotations
from dataclasses import dataclass
import string
from typing import Iterator
EOF = "\x1a"
@dataclass
class PeekableIterator:
iter: Iterator[str]
peek: str = EOF
def __post_init__(self) -> None:
next(self)
def __next__(self) -> None:
self.peek = next(self.iter, EOF)
def tokenize(chars: PeekableIterator) -> Iterator[str]:
while chars.peek != EOF:
if chars.peek in string.whitespace:
next(chars)
elif chars.peek in "{}():?+*,":
yield chars.peek
next(chars)
elif chars.peek == "=":
next(chars)
assert chars.peek == "="
next(chars)
assert chars.peek == "="
next(chars)
yield "==="
elif chars.peek == '"':
current = ""
current += chars.peek
next(chars)
while chars.peek != '"':
current += chars.peek
next(chars)
current += chars.peek
next(chars)
yield current
else:
current = ""
while chars.peek in "0123456789":
current += chars.peek
next(chars)
yield current
With that defined, given the example source
from above, list(tokenize(PeekableIterator(iter(source))))
should give:
['{', '"a"', ':', '{', '"b"', ':', '{', '"c"', ':', '"d"', '}', '}', ',', '"foo"', ':', '(', '1', '===', '2', ')', '?', '"bar"', ':', '3', '*', '4', '+', '5', '}']
Now we define the AST Node
type and a small helper:
@dataclass
class Node:
token: str
children: list[Node]
def next_assert(tokens: PeekableIterator, to: str) -> None:
assert tokens.peek == to
next(tokens)
Then the relative precedence of each of the infix operators:
INFIX_PRECEDENCES = {
"===": 2,
":": 11,
"?": 12,
"+": 14,
"*": 15,
}
The parser itself is then pretty simple!
def parse(tokens: PeekableIterator, precedence: int) -> Node:
token = tokens.peek
next(tokens)
if token[0] in '"0123456789':
node = token # atom
elif token == "{":
node = Node(token, [])
while tokens.peek != "}":
if tokens.peek == ",":
next(tokens)
else:
node.children.append(parse(tokens, 2))
next_assert(tokens, "}")
elif token == "(":
node = parse(tokens, -1)
next_assert(tokens, ")")
else:
raise ValueError(f"Unrecognised token: {token}")
return infix(tokens, precedence, node)
def infix(tokens: PeekableIterator, precedence: int, left: Node) -> Node:
token = tokens.peek
next_precedence = INFIX_PRECEDENCES.get(token)
if next_precedence is None or precedence >= next_precedence:
return left
next(tokens)
node = Node(token, [left, parse(tokens, next_precedence)])
if token == "?": # ? is a ternary expression, so we parse one more child
next_assert(tokens, ":")
node.children.append(parse(tokens, next_precedence))
return infix(tokens, precedence, node)
If that was fun, here's a more complicated parser for an old hobby project of mine.