From 988f4bd077b6b239bcc85306fba7b21f76b33939 Mon Sep 17 00:00:00 2001 From: early Date: Thu, 26 Dec 2024 21:01:11 -0700 Subject: [PATCH] new parser checkpoint --- docs/dev/planning.md | 21 ++ docs/dev/templates.md | 31 +++ go.mod | 2 +- web/htmlt/parse/parse.go | 316 +++++++++++++++++++++++++++++ web/htmlt/parse/parse_test.go | 80 ++++++++ web/htmlt/parse/text.go | 29 +++ web/htmlt/parse/tokens.go | 360 +++++++++++++++++++++++++++++++++ web/htmlt/parse/tokens_test.go | 23 +++ 8 files changed, 861 insertions(+), 1 deletion(-) create mode 100644 docs/dev/planning.md create mode 100644 docs/dev/templates.md create mode 100644 web/htmlt/parse/parse.go create mode 100644 web/htmlt/parse/parse_test.go create mode 100644 web/htmlt/parse/text.go create mode 100644 web/htmlt/parse/tokens.go create mode 100644 web/htmlt/parse/tokens_test.go diff --git a/docs/dev/planning.md b/docs/dev/planning.md new file mode 100644 index 0000000..7e2448f --- /dev/null +++ b/docs/dev/planning.md @@ -0,0 +1,21 @@ +# What's the plan, anyways + +I want to make templates and custom elements process concurrently. That means +that `template.Execute` and the data loaders execute simultaneously, waiting on +each other at critical points to yield a value or set a variable. This is going +to require a significant refactor of everything, but frankly I think that is +overdue outside of any feature goals, so that's just fine. I'm going to lay out +a plan of attack for getting this feature live. + +## Write a parser + +`x/net/html` is a really good library! It's also not what this project needs. +Go HTML is focused on being a spec-compliant parser and renderer, and templates +are not compliant with the HTML specification in some circumstances. A parser +that recognizes templates as separate from regular text would also be really +handy! + +The first iteration of this parser is focused on restoring functionality to +where it is now, not adding anything new. 
To this end, it will mostly leave the
+semantics of templates alone, aside from recognizing structures such as
+template blocks as having children. diff --git a/docs/dev/templates.md b/docs/dev/templates.md new file mode 100644 index 0000000..904058a --- /dev/null +++ b/docs/dev/templates.md @@ -0,0 +1,31 @@ +# Templates + +Templates are a mix of HTML and Go's standard templates. This document describes +how template documents are tokenized and parsed before being processed. + +## Tokenization + +Documents are processed first by a tokenizer that splits them up into sequential +tokens for processing. While this tokenizer isn't super concerned with syntax or +semantics, and shouldn't throw errors for either on its own, it does have states +that govern how tokens are processed. The main states are *content*, *template*, +and *tag*, though each may have sub-states depending on what's being processed. + +### Content + +Content is a mix of plain text, templates, and tags. It's the simplest to +tokenize, usually consisting of gathering text up until a template or tag starts +and then outputting that as a TEXT token. TEXT tokens are literal text and +whitespace making up the content body of the document. + +### Templates + +Templates are Go template expressions. Sometimes, these expressions have +keywords like `if` or `range` or assignment characters `:=`. At the time of +writing, pipelines are not processed into tokens, just left as plain text. + +### Tags + +Tags are HTML tags. These tags have a name and attributes, consisting of value +attributes `name="value"` and boolean attributes `name`. Because data attributes +are written `:name`, colons may be included in attribute names. 
diff --git a/go.mod b/go.mod index 707489b..cc7e651 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module git.earlybird.gay/today -go 1.22.4 +go 1.23.4 require ( golang.org/x/text v0.19.0 diff --git a/web/htmlt/parse/parse.go b/web/htmlt/parse/parse.go new file mode 100644 index 0000000..e3786bd --- /dev/null +++ b/web/htmlt/parse/parse.go @@ -0,0 +1,316 @@ +package parse + +import ( + "fmt" + "io" + "iter" + "slices" + "strings" +) + +type tokens iter.Seq[*Token] + +func (toks tokens) nextOf(tts ...TokenType) *Token { + var out *Token + for tok := range toks.until(tts...) { + out = tok + } + if slices.Contains(tts, out.Type) { + return out + } else { + return nil + } +} + +// untilInclusive gets the next N tokens in p, until one is in types, or is EOF. +func (toks tokens) until(tts ...TokenType) tokens { + return func(yield func(*Token) bool) { + for tok := range toks { + if !yield(tok) || slices.Contains(tts, tok.Type) || tok.Type == EOF { + return + } + } + } +} + +func (toks tokens) discardUntil(tts ...TokenType) { + for range toks.until(tts...) { + } +} + +func (toks tokens) seq() iter.Seq[*Token] { + return iter.Seq[*Token](toks) +} + +// readContent returns a list of "content tokens", wherever they're acceptable. 
+// readContent reads content tokens (text, templates, and elements) into a
+// list of nodes until the token stream is exhausted.
+func readContent(toks tokens) ([]any, error) { + out := make([]any, 0) + var text *TextNode + resolveText := func() { + if text != nil { + out = append(out, text) + } + text = nil + } + for tok := range toks { + switch tok.Type { + case TEXT, WHITESPACE: + if text == nil { + text = new(TextNode) + } + text.Value += tok.Literal + case TEMPLATE_OPEN: + resolveText() + node, err := readTemplate(toks) + if err != nil { + return nil, err + } + out = append(out, node) + case TAG_OPEN: + resolveText() + node, err := readElement(toks) + if err != nil { + return nil, err + } + out = append(out, node) + case TAG_END_OPEN: + toks.discardUntil(TAG_CLOSE) + default: + } + } + resolveText() + return out, nil +} + +type Document struct { + Children []any +} + +func (d *Document) String() string { + out := "" + for _, child := range d.Children { + out += fmt.Sprint(child) + } + return out +} + +func (d *Document) read(toks tokens) (err error) { + d.Children, err = readContent(toks) + return err +} + +type TextNode struct { + Value string +} + +func (node *TextNode) String() string { + return node.Value +} + +type TemplateNode struct { + Value string +} + +func (node *TemplateNode) String() string { + return fmt.Sprintf("{{ %s }}", node.Value) +} + +// readTemplate reads tokens into a TemplateNode, dispatching to keyword +// handlers (if/else/end) when one is encountered. 
+func readTemplate(toks tokens) (any, error) { + node := new(TemplateNode) + for tok := range toks.until(TEMPLATE_CLOSE) { + switch tok.Type { + case TEMPLATE_KEYWORD: + switch tok.Literal { + case "if": + return readTemplateIf(toks) + case "else": + next := toks.nextOf(TEMPLATE_KEYWORD, TEMPLATE_CLOSE) + if next.Literal == "if" { + return readTemplateIf(toks) + } else if next.Type == TEXT { + toks.discardUntil(TEMPLATE_CLOSE) + } + return &TemplateElse{}, nil + case "end": + toks.discardUntil(TEMPLATE_CLOSE) + return &TemplateEnd{}, nil + default: + return nil, fmt.Errorf("unrecognized template keyword %s", tok.Literal) + } + case TEXT: + node.Value += " " + tok.Literal + } + } + node.Value = strings.TrimSpace(node.Value) + return node, nil +} + +type TemplateElse struct{} +type TemplateEnd struct{} + +type TemplateIf struct { + Condition string + Then []any + Elif []*TemplateIf + Else []any +} + +func (node *TemplateIf) String() string { + elifStr := "" + for _, elif := range node.Elif { + elifStr += fmt.Sprintf("{{ else if %s }}%s", elif.Condition, JoinAny(elif.Then, "")) + } + elseStr := "" + if len(node.Else) > 0 { + elseStr = fmt.Sprintf("{{ else }}%s", JoinAny(node.Else, "")) + } + return fmt.Sprintf("{{ if %s }}%s%s%s{{ end }}", node.Condition, JoinAny(node.Then, ""), elifStr, elseStr) +} + +func readTemplateIf(toks tokens) (*TemplateIf, error) { + node := new(TemplateIf) + for tok := range toks.until(TEMPLATE_CLOSE) { + switch tok.Type { + case TEXT, WHITESPACE: + node.Condition += tok.Literal + } + } + node.Condition = strings.TrimSpace(node.Condition) + isElse := false + for { + then, err := readContent(toks.until(TEMPLATE_OPEN)) + if err != nil { + return nil, err + } + if len(then) == 0 { + break + } + term := then[len(then)-1] + body := then[:len(then)-1] + switch v := term.(type) { + case *TemplateIf: + node.Then = append(node.Then, body...) + // Elifs steal the else. Take it back. + node.Else = append(node.Else, v.Else...) 
+ v.Else = nil + node.Elif = append(node.Elif, v) + case *TemplateElse: + node.Then = append(node.Then, body...) + isElse = true + case *TemplateEnd: + if !isElse { + node.Then = body + } else { + node.Else = body + isElse = false + } + default: + } + } + return node, nil +} + +var voidElems = []string{ + "area", "base", "br", "col", + "embed", "hr", "img", "input", + "link", "meta", "param", "source", + "track", "wbr", +} + +type Attribute struct { + Name string + Value string + // Boolean is true if attr is a "boolean attribute"; it has no string value + // in the source HTML, so it just represents true-if-present. Boolean is + // *not* the value of the boolean attribute. + Boolean bool +} + +func (attr Attribute) String() string { + if attr.Boolean { + return attr.Name + } else { + return fmt.Sprintf(`%s="%s"`, attr.Name, attr.Value) + } +} + +type ElementNode struct { + Name string + Attributes []Attribute + Children []any + + void bool +} + +func (node *ElementNode) String() string { + attrsRaw := make([]string, len(node.Attributes)) + for i, attr := range node.Attributes { + attrsRaw[i] = attr.String() + } + attrs := "" + if len(attrsRaw) > 0 { + attrs = strings.Join(attrsRaw, " ") + } + if node.void { + return fmt.Sprintf("<%s %s />", node.Name, attrs) + } else { + inner := "" + for _, child := range node.Children { + inner += fmt.Sprint(child) + } + return fmt.Sprintf("<%s %s >%s", node.Name, attrs, inner, node.Name) + } +} + +func readElement(toks tokens) (*ElementNode, error) { + node := new(ElementNode) + node.Name = toks.nextOf(TEXT).Literal + node.void = slices.Contains(voidElems, node.Name) + + tagToks := toks.until(TAG_CLOSE, TAG_VOID_CLOSE) + next := tagToks.nextOf(TEXT) + for next != nil { + name := next.Literal + next = tagToks.nextOf(TEXT, TAG_EQ) + // If it's text, this is a boolean attribute. Otherwise, it has a value. 
+ if next == nil || next.Type == TEXT { + node.Attributes = append(node.Attributes, Attribute{ + Name: name, + Boolean: true, + }) + } else { + value := "" + // Advance to the start of the value... + tagToks.discardUntil(TAG_QUOTE) + // Then read until the end. + for tok := range tagToks.until(TAG_QUOTE) { + switch tok.Type { + case TEXT: + value += tok.Literal + } + } + node.Attributes = append(node.Attributes, Attribute{ + Name: name, + Value: value, + }) + next = tagToks.nextOf(TEXT) + } + } + + if node.void { + return node, nil + } + children, err := readContent(toks.until(TAG_END_OPEN)) + if err != nil { + return nil, err + } + node.Children = children + return node, nil +} + +func Parse(r io.Reader) (any, error) { + doc := new(Document) + err := doc.read(Tokenize(r)) + return doc, err +} diff --git a/web/htmlt/parse/parse_test.go b/web/htmlt/parse/parse_test.go new file mode 100644 index 0000000..b9eae1e --- /dev/null +++ b/web/htmlt/parse/parse_test.go @@ -0,0 +1,80 @@ +package parse + +import ( + "fmt" + "slices" + "strings" + "testing" +) + +func TestParseBasic(t *testing.T) { + testStrings := map[string]string{ + "hello": "Hello, World!", + "template": "Hello, {{ `template` }}!", + "html": "

Hello, HTML!


", + "html+template": "{{ if .condition }}

{{- .text -}}

{{ end }}", + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} + +func TestParseHTML(t *testing.T) { + testStrings := map[string]string{ + "void": "

Hello


World

", + "attrs": `
`, + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} +func TestParseTemplate(t *testing.T) { + testStrings := map[string]string{ + "if": "{{ if .condition }}Hello{{ end }}", + "if-else": "{{ if .condition }}Hello{{ else }}World{{ end }}", + "if-elif": "{{ if .condition }}Hello{{ else if .other.condition }}World{{ end }}", + "if-elif-else": "{{ if .condition }}One{{ else if .other.condition }}Two{{ else }}Three{{ end }}", + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} + +func TestParseComplex(t *testing.T) { + testStrings := map[string]string{ + "template-attr": ``, + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + toks := slices.Collect(Tokenize(strings.NewReader(val)).seq()) + fmt.Println(toks) + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} diff --git a/web/htmlt/parse/text.go b/web/htmlt/parse/text.go new file mode 100644 index 0000000..697cf29 --- /dev/null +++ b/web/htmlt/parse/text.go @@ -0,0 +1,29 @@ +package parse + +import ( + "fmt" + "regexp" + "strings" +) + +var ( + HtmlNameRegexp = regexp.MustCompile("^[a-zA-Z][a-zA-Z0-9]*(?:-[a-zA-Z][a-zA-Z0-9]*)*$") +) + +func IsHtmlName(text string) bool { + return HtmlNameRegexp.MatchString(text) +} + +func CutQuotes(text string) (string, bool) { + preQuote, preOk := strings.CutPrefix(text, `"`) + postQuote, postOk := strings.CutSuffix(preQuote, `"`) + return postQuote, preOk && postOk +} + +func JoinAny(vals []any, sep string) string { + valsStr := make([]string, len(vals)) + for i, val := range vals { + valsStr[i] = fmt.Sprintf("%s", val) + } + return strings.Join(valsStr, sep) +} diff --git a/web/htmlt/parse/tokens.go 
b/web/htmlt/parse/tokens.go new file mode 100644 index 0000000..791f696 --- /dev/null +++ b/web/htmlt/parse/tokens.go @@ -0,0 +1,360 @@ +package parse + +import ( + "bufio" + "errors" + "fmt" + "io" + "strings" +) + +type TokenState int + +const ( + STATE_CONTENT = TokenState(iota) + STATE_TEMPLATE + STATE_TAG +) + +func (ts TokenState) String() string { + switch ts { + case STATE_CONTENT: + return "STATE_CONTENT" + case STATE_TEMPLATE: + return "STATE_TEMPLATE" + case STATE_TAG: + return "STATE_TAG" + default: + return "STATE_UNKNOWN" + } +} + +type TokenType int + +const ( + ERROR = TokenType(iota) + EOF + TEXT + WHITESPACE + TEMPLATE_OPEN // {{ + TEMPLATE_CLOSE // }} + TEMPLATE_KEYWORD // range, with, else, if, end + TAG_OPEN // < + TAG_END_OPEN // + TAG_VOID_CLOSE // /> + TAG_EQ // = + TAG_QUOTE // " +) + +func (tt TokenType) String() string { + switch tt { + case ERROR: + return "ERROR" + case EOF: + return "EOF" + case TEXT: + return "TEXT" + case WHITESPACE: + return "S" + case TEMPLATE_OPEN: + return "TEMPLATE_OPEN" + case TEMPLATE_CLOSE: + return "TEMPLATE_CLOSE" + case TEMPLATE_KEYWORD: + return "TEMPLATE_KEYWORD" + case TAG_OPEN: + return "TAG_OPEN" + case TAG_END_OPEN: + return "TAG_END_OPEN" + case TAG_CLOSE: + return "TAG_CLOSE" + case TAG_EQ: + return "TAG_EQ" + case TAG_QUOTE: + return "TAG_QUOTE" + default: + return "UNKNOWN" + } +} + +const ( + whitespace = " \n\t\r\f\b" +) + +type Token struct { + Type TokenType + Literal string +} + +func (t *Token) String() string { + return fmt.Sprintf("%s(%s)", t.Type, t.Literal) +} + +type tokenizer struct { + r *bufio.Reader + state TokenState + nextToken *Token + err error +} + +// peek returns the next i characters, and "true" if i characters were found. 
+func (t *tokenizer) peek(i int) (string, bool) { + x, err := t.r.Peek(i) + if err != nil { + return string(x), false + } + return string(x), true +} + +func (t *tokenizer) advance(i int) { + t.r.Discard(i) +} + +func (t *tokenizer) nextContent() (*Token, error) { + acc := make([]rune, 100) + cursor := 0 + accumulate := func(r rune) { + if cursor >= len(acc) { + acc = append(acc, make([]rune, 100)...) + } + acc[cursor] = r + cursor++ + } + for { + if token, ok := t.peek(3); ok { + switch token { + case "{{-": + out := &Token{TEXT, string(acc[:cursor])} + t.state = STATE_TEMPLATE + return out, nil + } + } + if token, ok := t.peek(2); ok { + switch token { + case "{{": + out := &Token{TEXT, string(acc[:cursor])} + t.state = STATE_TEMPLATE + return out, nil + case " 0 { + return &Token{TEXT, string(acc[:cursor])}, nil + } else { + return &Token{EOF, ""}, nil + } + } else if err != nil { + return nil, err + } + accumulate(r) + } +} +func (t *tokenizer) nextTemplate() (*Token, error) { + acc := make([]rune, 10) + cursor := 0 + accumulate := func(r rune) { + if cursor >= len(acc) { + acc = append(acc, make([]rune, 10)...) 
+ } + acc[cursor] = r + cursor++ + } + for { + if token, ok := t.peek(5); ok { + switch token { + case "range": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(5) + return out, nil + } + } + if token, ok := t.peek(4); ok { + switch token { + case "with": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(4) + return out, nil + case "else": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(4) + return out, nil + } + } + if token, ok := t.peek(3); ok { + switch token { + case "end": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(3) + return out, nil + case "{{-": + out := &Token{TEMPLATE_OPEN, token} + t.advance(3) + return out, nil + case "-}}": + out := &Token{TEMPLATE_CLOSE, token} + t.advance(3) + t.state = STATE_CONTENT + return out, nil + } + } + if token, ok := t.peek(2); ok { + switch token { + case "if": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(2) + return out, nil + case "{{": + out := &Token{TEMPLATE_OPEN, token} + t.advance(2) + return out, nil + case "}}": + out := &Token{TEMPLATE_CLOSE, token} + t.advance(2) + t.state = STATE_CONTENT + return out, nil + } + } + r, _, err := t.r.ReadRune() + if errors.Is(err, io.EOF) { + return &Token{EOF, ""}, nil + } else if err != nil { + return nil, err + } + if strings.ContainsRune(whitespace, r) { + return &Token{TEXT, string(acc[:cursor])}, nil + } else { + accumulate(r) + } + } +} +func (t *tokenizer) nextTag() (*Token, error) { + acc := make([]rune, 10) + cursor := 0 + accumulate := func(r rune) { + if cursor >= len(acc) { + acc = append(acc, make([]rune, 10)...) 
+ } + acc[cursor] = r + cursor++ + } + for { + if token, ok := t.peek(2); ok { + switch token { + case "": + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_VOID_CLOSE, token} + t.advance(2) + t.state = STATE_CONTENT + return out, nil + } + } + if token, ok := t.peek(1); ok { + switch token { + case "<": + out := &Token{TAG_OPEN, token} + t.advance(1) + return out, nil + case "=": + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_EQ, token} + t.advance(1) + return out, nil + case `"`: + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_QUOTE, token} + t.advance(1) + return out, nil + case ">": + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_CLOSE, token} + t.advance(1) + t.state = STATE_CONTENT + return out, nil + } + } + r, _, err := t.r.ReadRune() + if errors.Is(err, io.EOF) { + return &Token{EOF, ""}, nil + } else if err != nil { + return nil, err + } + if strings.ContainsRune(whitespace, r) { + return &Token{TEXT, string(acc[:cursor])}, nil + } else { + accumulate(r) + } + } +} + +func (t *tokenizer) next2() (*Token, error) { + var next *Token + var err error + for next == nil && err == nil { + if t.nextToken != nil { + next := t.nextToken + t.nextToken = nil + return next, nil + } + switch t.state { + case STATE_CONTENT: + next, err = t.nextContent() + case STATE_TEMPLATE: + next, err = t.nextTemplate() + case STATE_TAG: + next, err = t.nextTag() + default: + return nil, fmt.Errorf("unknown state %s", t.state) + } + if next.Type == TEXT && next.Literal == "" { + next = nil + } + } + return next, err +} + +// all returns an iterator over all tokens produced by the tokenizer. +// Stops iterating on EOF or error. 
+func (t *tokenizer) all() tokens { + return func(yield func(*Token) bool) { + for { + if t.err != nil { + return + } + tok, err := t.next2() + if err != nil { + yield(&Token{ERROR, t.err.Error()}) + break + } + if tok.Type == EOF { + t.err = io.EOF + } + if !yield(tok) { + break + } + } + } +} + +func Tokenize(r io.Reader) tokens { + tkns := new(tokenizer) + tkns.r = bufio.NewReader(r) + return tkns.all() +} diff --git a/web/htmlt/parse/tokens_test.go b/web/htmlt/parse/tokens_test.go new file mode 100644 index 0000000..f29e6ee --- /dev/null +++ b/web/htmlt/parse/tokens_test.go @@ -0,0 +1,23 @@ +package parse + +import ( + "slices" + "strings" + "testing" +) + +func TestTokenize(t *testing.T) { + testStrings := map[string]string{ + "hello": "Hello, World!", + "template": "Hello, {{ `template` }}!", + "html": "

Hello, HTML!


", + "html+template": "{{ if .condition }}

{{- .text -}}

{{ end }}", + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + toks := slices.Collect(Tokenize(strings.NewReader(val)).seq()) + t.Log(val) + t.Log(toks) + }) + } +} -- 2.39.5