git.earlybird.gay Git - today/commitdiff
new parser checkpoint
author    early <me@earlybird.gay>
          Fri, 27 Dec 2024 04:01:11 +0000 (21:01 -0700)
committer early <me@earlybird.gay>
          Fri, 27 Dec 2024 04:01:11 +0000 (21:01 -0700)
docs/dev/planning.md [new file with mode: 0644]
docs/dev/templates.md [new file with mode: 0644]
go.mod
web/htmlt/parse/parse.go [new file with mode: 0644]
web/htmlt/parse/parse_test.go [new file with mode: 0644]
web/htmlt/parse/text.go [new file with mode: 0644]
web/htmlt/parse/tokens.go [new file with mode: 0644]
web/htmlt/parse/tokens_test.go [new file with mode: 0644]

diff --git a/docs/dev/planning.md b/docs/dev/planning.md
new file mode 100644 (file)
index 0000000..7e2448f
--- /dev/null
@@ -0,0 +1,21 @@
+# What's the plan, anyways
+
+I want to make templates and custom elements process concurrently. That means
+that `template.Execute` and the data loaders execute simultaneously, waiting on
+each other at critical points to yield a value or set a variable. This is going
+to require a significant refactor of everything, but frankly I think that is
+overdue outside of any feature goals, so that's just fine. I'm going to lay out
+a plan of attack for getting this feature live.
+
+## Write a parser
+
+`x/net/html` is a really good library! It's also not what this project needs.
+Go HTML is focused on being a spec-compliant parser and renderer, and templates
+are not compliant with the HTML specification in some circumstances. A parser
+that recognizes templates as separate from regular text would also be really
+handy!
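+
+For example, the parser tests in this package mix the two syntaxes freely,
+with pipelines inside attribute values and template blocks wrapping markup:
+
+    <img src="{{ .img }}">
+    {{ if .condition }}<p>{{- .text -}}</p>{{ end }}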
+
+The first iteration of this parser is focused on restoring functionality to
+where it is now, not adding anything new. To this end, it'll mostly leave the
+semantics of templates alone, outside of recognizing things like template blocks
+as having children.
diff --git a/docs/dev/templates.md b/docs/dev/templates.md
new file mode 100644 (file)
index 0000000..904058a
--- /dev/null
@@ -0,0 +1,31 @@
+# Templates
+
+Templates are a mix of HTML and Go's standard templates. This document describes
+how template documents are tokenized and parsed before being processed.
+
+## Tokenization
+
+Documents are processed first by a tokenizer that splits them up into sequential
+tokens for processing. While this tokenizer isn't super concerned with syntax or
+semantics, and shouldn't throw errors for either on its own, it does have states
+that govern how tokens are processed. The main states are *content*, *template*,
+and *tag*, though each may have sub-states depending on what's being processed.
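+
+As a rough sketch, `<p>{{ .name }}</p>` lexes to:
+
+    TAG_OPEN(<) TEXT(p) TAG_CLOSE(>)
+    TEMPLATE_OPEN({{) TEXT(.name) TEMPLATE_CLOSE(}})
+    TAG_END_OPEN(</) TEXT(p) TAG_CLOSE(>)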
+
+### Content
+
+Content is a mix of plain text, templates, and tags. It's the simplest to
+tokenize, usually consisting of gathering text up until a template or tag starts
+and then outputting that as a TEXT token. TEXT tokens are literal text and
+whitespace making up the content body of the document.
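+
+For instance, `Hello, World!` is one TEXT token, while in `Hello, {{ .name }}!`
+the text is gathered into TEXT(Hello, ) and TEXT(!) around the template tokens.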
+
+### Templates
+
+Templates are Go template expressions. Sometimes these expressions contain
+keywords like `if` or `range`, or the assignment operator `:=`. At the time of
+writing, pipelines are not processed into tokens; they're left as plain text.
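+
+So `{{ if .condition }}` lexes to TEMPLATE_OPEN, a TEMPLATE_KEYWORD for `if`,
+a plain TEXT token for `.condition`, and TEMPLATE_CLOSE.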
+
+### Tags
+
+Tags are HTML tags. These tags have a name and attributes, consisting of value
+attributes `name="value"` and boolean attributes `name`. Because data attributes
+are written `:name`, colons may be included in attribute names.
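+
+As an example, `<div class="inner" my-boolean>` lexes to TAG_OPEN, TEXT(div),
+TEXT(class), TAG_EQ, TAG_QUOTE, TEXT(inner), TAG_QUOTE, TEXT(my-boolean), and
+TAG_CLOSE.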
diff --git a/go.mod b/go.mod
index 707489ba90b714c63af98ebe940f946a0b57ea9d..cc7e651fb0eae3b222e476c8d5977a8b3895d458 100644 (file)
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module git.earlybird.gay/today
 
-go 1.22.4
+go 1.23.4
 
 require (
        golang.org/x/text v0.19.0
diff --git a/web/htmlt/parse/parse.go b/web/htmlt/parse/parse.go
new file mode 100644 (file)
index 0000000..e3786bd
--- /dev/null
@@ -0,0 +1,316 @@
+package parse
+
+import (
+       "fmt"
+       "io"
+       "iter"
+       "slices"
+       "strings"
+)
+
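+// tokens is a sequence of lexed tokens. A sequence backed by a tokenizer is
+// single-use: consuming it advances the underlying reader.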
+type tokens iter.Seq[*Token]
+
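+// nextOf discards tokens until one whose type is in tts, returning it, or nil
+// if the stream ends first.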
+func (toks tokens) nextOf(tts ...TokenType) *Token {
+       var out *Token
+       for tok := range toks.until(tts...) {
+               out = tok
+       }
+       // out is nil for an empty stream; EOF may also arrive before any tts.
+       if out == nil || !slices.Contains(tts, out.Type) {
+               return nil
+       }
+       return out
+}
+
+// until yields tokens from toks up to and including the first one whose type
+// is in tts, or EOF.
+func (toks tokens) until(tts ...TokenType) tokens {
+       return func(yield func(*Token) bool) {
+               for tok := range toks {
+                       if !yield(tok) || slices.Contains(tts, tok.Type) || tok.Type == EOF {
+                               return
+                       }
+               }
+       }
+}
+
+func (toks tokens) discardUntil(tts ...TokenType) {
+       for range toks.until(tts...) {
+       }
+}
+
+func (toks tokens) seq() iter.Seq[*Token] {
+       return iter.Seq[*Token](toks)
+}
+
+// readContent parses tokens into a list of content nodes (text runs,
+// templates, and elements), wherever content is acceptable.
+func readContent(toks tokens) ([]any, error) {
+       out := make([]any, 0)
+       var text *TextNode
+       resolveText := func() {
+               if text != nil {
+                       out = append(out, text)
+               }
+               text = nil
+       }
+       for tok := range toks {
+               switch tok.Type {
+               case TEXT, WHITESPACE:
+                       if text == nil {
+                               text = new(TextNode)
+                       }
+                       text.Value += tok.Literal
+               case TEMPLATE_OPEN:
+                       resolveText()
+                       node, err := readTemplate(toks)
+                       if err != nil {
+                               return nil, err
+                       }
+                       out = append(out, node)
+               case TAG_OPEN:
+                       resolveText()
+                       node, err := readElement(toks)
+                       if err != nil {
+                               return nil, err
+                       }
+                       out = append(out, node)
+               case TAG_END_OPEN:
+                       toks.discardUntil(TAG_CLOSE)
+               default:
+               }
+       }
+       resolveText()
+       return out, nil
+}
+
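+// Document is the root of a parsed template document.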
+type Document struct {
+       Children []any
+}
+
+func (d *Document) String() string {
+       out := ""
+       for _, child := range d.Children {
+               out += fmt.Sprint(child)
+       }
+       return out
+}
+
+func (d *Document) read(toks tokens) (err error) {
+       d.Children, err = readContent(toks)
+       return err
+}
+
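+// TextNode is a run of literal text and whitespace.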
+type TextNode struct {
+       Value string
+}
+
+func (node *TextNode) String() string {
+       return node.Value
+}
+
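+// TemplateNode is a plain template expression, kept as unparsed text.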
+type TemplateNode struct {
+       Value string
+}
+
+func (node *TemplateNode) String() string {
+       return fmt.Sprintf("{{ %s }}", node.Value)
+}
+
+// readTemplate reads one template expression, dispatching to keyword handling
+// for if/else/end.
+func readTemplate(toks tokens) (any, error) {
+       node := new(TemplateNode)
+       for tok := range toks.until(TEMPLATE_CLOSE) {
+               switch tok.Type {
+               case TEMPLATE_KEYWORD:
+                       switch tok.Literal {
+                       case "if":
+                               return readTemplateIf(toks)
+                       case "else":
+                               next := toks.nextOf(TEMPLATE_KEYWORD, TEMPLATE_CLOSE)
+                               // nextOf returns nil if neither type appears before EOF.
+                               if next != nil && next.Literal == "if" {
+                                       return readTemplateIf(toks)
+                               }
+                               return &TemplateElse{}, nil
+                       case "end":
+                               toks.discardUntil(TEMPLATE_CLOSE)
+                               return &TemplateEnd{}, nil
+                       default:
+                               return nil, fmt.Errorf("unrecognized template keyword %s", tok.Literal)
+                       }
+               case TEXT:
+                       node.Value += " " + tok.Literal
+               }
+       }
+       node.Value = strings.TrimSpace(node.Value)
+       return node, nil
+}
+
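+// TemplateElse and TemplateEnd mark {{ else }} and {{ end }}; readTemplateIf
+// consumes them while assembling an if block.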
+type TemplateElse struct{}
+type TemplateEnd struct{}
+
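+// TemplateIf is an {{ if }} block, with optional else-if and else branches.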
+type TemplateIf struct {
+       Condition string
+       Then      []any
+       Elif      []*TemplateIf
+       Else      []any
+}
+
+func (node *TemplateIf) String() string {
+       elifStr := ""
+       for _, elif := range node.Elif {
+               elifStr += fmt.Sprintf("{{ else if %s }}%s", elif.Condition, JoinAny(elif.Then, ""))
+       }
+       elseStr := ""
+       if len(node.Else) > 0 {
+               elseStr = fmt.Sprintf("{{ else }}%s", JoinAny(node.Else, ""))
+       }
+       return fmt.Sprintf("{{ if %s }}%s%s%s{{ end }}", node.Condition, JoinAny(node.Then, ""), elifStr, elseStr)
+}
+
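+// readTemplateIf reads the condition, then accumulates content into Then,
+// Elif, and Else until the matching {{ end }}.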
+func readTemplateIf(toks tokens) (*TemplateIf, error) {
+       node := new(TemplateIf)
+       for tok := range toks.until(TEMPLATE_CLOSE) {
+               switch tok.Type {
+               case TEXT, WHITESPACE:
+                       node.Condition += tok.Literal
+               }
+       }
+       node.Condition = strings.TrimSpace(node.Condition)
+       isElse := false
+       for {
+               then, err := readContent(toks.until(TEMPLATE_OPEN))
+               if err != nil {
+                       return nil, err
+               }
+               if len(then) == 0 {
+                       break
+               }
+               term := then[len(then)-1]
+               body := then[:len(then)-1]
+               switch v := term.(type) {
+               case *TemplateIf:
+                       node.Then = append(node.Then, body...)
+                       // Elifs steal the else. Take it back.
+                       node.Else = append(node.Else, v.Else...)
+                       v.Else = nil
+                       node.Elif = append(node.Elif, v)
+               case *TemplateElse:
+                       node.Then = append(node.Then, body...)
+                       isElse = true
+               case *TemplateEnd:
+                       if !isElse {
+                               node.Then = body
+                       } else {
+                               node.Else = body
+                               isElse = false
+                       }
+               default:
+               }
+       }
+       return node, nil
+}
+
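+// voidElems are the HTML void elements, which have no children or end tag.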
+var voidElems = []string{
+       "area", "base", "br", "col",
+       "embed", "hr", "img", "input",
+       "link", "meta", "param", "source",
+       "track", "wbr",
+}
+
+type Attribute struct {
+       Name  string
+       Value string
+       // Boolean is true if attr is a "boolean attribute"; it has no string value
+       // in the source HTML, so it just represents true-if-present. Boolean is
+       // *not* the value of the boolean attribute.
+       Boolean bool
+}
+
+func (attr Attribute) String() string {
+       if attr.Boolean {
+               return attr.Name
+       } else {
+               return fmt.Sprintf(`%s="%s"`, attr.Name, attr.Value)
+       }
+}
+
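+// ElementNode is an HTML element: a name, attributes, and (unless void)
+// children.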
+type ElementNode struct {
+       Name       string
+       Attributes []Attribute
+       Children   []any
+
+       void bool
+}
+
+func (node *ElementNode) String() string {
+       attrsRaw := make([]string, len(node.Attributes))
+       for i, attr := range node.Attributes {
+               attrsRaw[i] = attr.String()
+       }
+       attrs := ""
+       if len(attrsRaw) > 0 {
+               attrs = " " + strings.Join(attrsRaw, " ")
+       }
+       if node.void {
+               return fmt.Sprintf("<%s%s />", node.Name, attrs)
+       }
+       inner := ""
+       for _, child := range node.Children {
+               inner += fmt.Sprint(child)
+       }
+       return fmt.Sprintf("<%s%s>%s</%s>", node.Name, attrs, inner, node.Name)
+}
+
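+// readElement reads a tag name and attributes, then, for non-void elements,
+// child content up to the closing tag.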
+func readElement(toks tokens) (*ElementNode, error) {
+       node := new(ElementNode)
+       name := toks.nextOf(TEXT)
+       if name == nil {
+               return nil, fmt.Errorf("expected tag name")
+       }
+       node.Name = name.Literal
+       node.void = slices.Contains(voidElems, node.Name)
+
+       tagToks := toks.until(TAG_CLOSE, TAG_VOID_CLOSE)
+       next := tagToks.nextOf(TEXT)
+       for next != nil {
+               name := next.Literal
+               next = tagToks.nextOf(TEXT, TAG_EQ)
+               // If it's text, this is a boolean attribute. Otherwise, it has a value.
+               if next == nil || next.Type == TEXT {
+                       node.Attributes = append(node.Attributes, Attribute{
+                               Name:    name,
+                               Boolean: true,
+                       })
+               } else {
+                       value := ""
+                       // Advance to the start of the value...
+                       tagToks.discardUntil(TAG_QUOTE)
+                       // Then read until the end.
+                       for tok := range tagToks.until(TAG_QUOTE) {
+                               switch tok.Type {
+                               case TEXT:
+                                       value += tok.Literal
+                               }
+                       }
+                       node.Attributes = append(node.Attributes, Attribute{
+                               Name:  name,
+                               Value: value,
+                       })
+                       next = tagToks.nextOf(TEXT)
+               }
+       }
+
+       if node.void {
+               return node, nil
+       }
+       children, err := readContent(toks.until(TAG_END_OPEN))
+       if err != nil {
+               return nil, err
+       }
+       node.Children = children
+       return node, nil
+}
+
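+// Parse tokenizes r and parses the result into a *Document.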
+func Parse(r io.Reader) (any, error) {
+       doc := new(Document)
+       err := doc.read(Tokenize(r))
+       return doc, err
+}
diff --git a/web/htmlt/parse/parse_test.go b/web/htmlt/parse/parse_test.go
new file mode 100644 (file)
index 0000000..b9eae1e
--- /dev/null
@@ -0,0 +1,80 @@
+package parse
+
+import (
+       "slices"
+       "strings"
+       "testing"
+)
+
+func TestParseBasic(t *testing.T) {
+       testStrings := map[string]string{
+               "hello":         "Hello, World!",
+               "template":      "Hello, {{ `template` }}!",
+               "html":          "<div><p>Hello, HTML!</p><br></div>",
+               "html+template": "{{ if .condition }}<p>{{- .text -}}</p>{{ end }}",
+       }
+       for name, val := range testStrings {
+               t.Run(name, func(t *testing.T) {
+                       doc, err := Parse(strings.NewReader(val))
+                       t.Log(val)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       t.Log(doc)
+               })
+       }
+}
+
+func TestParseHTML(t *testing.T) {
+       testStrings := map[string]string{
+               "void":  "<p>Hello</p><br><p>World</p>",
+               "attrs": `<div class = "outer"><div my-boolean class="inner"></div></div>`,
+       }
+       for name, val := range testStrings {
+               t.Run(name, func(t *testing.T) {
+                       doc, err := Parse(strings.NewReader(val))
+                       t.Log(val)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       t.Log(doc)
+               })
+       }
+}
+
+func TestParseTemplate(t *testing.T) {
+       testStrings := map[string]string{
+               "if":           "{{ if .condition }}Hello{{ end }}",
+               "if-else":      "{{ if .condition }}Hello{{ else }}World{{ end }}",
+               "if-elif":      "{{ if .condition }}Hello{{ else if .other.condition }}World{{ end }}",
+               "if-elif-else": "{{ if .condition }}One{{ else if .other.condition }}Two{{ else }}Three{{ end }}",
+       }
+       for name, val := range testStrings {
+               t.Run(name, func(t *testing.T) {
+                       doc, err := Parse(strings.NewReader(val))
+                       t.Log(val)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       t.Log(doc)
+               })
+       }
+}
+
+func TestParseComplex(t *testing.T) {
+       testStrings := map[string]string{
+               "template-attr": `<img src="{{ .img }}">`,
+       }
+       for name, val := range testStrings {
+               t.Run(name, func(t *testing.T) {
+                       toks := slices.Collect(Tokenize(strings.NewReader(val)).seq())
+                       t.Log(toks)
+                       doc, err := Parse(strings.NewReader(val))
+                       t.Log(val)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       t.Log(doc)
+               })
+       }
+}
diff --git a/web/htmlt/parse/text.go b/web/htmlt/parse/text.go
new file mode 100644 (file)
index 0000000..697cf29
--- /dev/null
@@ -0,0 +1,29 @@
+package parse
+
+import (
+       "fmt"
+       "regexp"
+       "strings"
+)
+
+var (
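+       // HtmlNameRegexp matches dash-separated alphanumeric names that each
+       // start with a letter, e.g. "div" or "my-element".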
+       HtmlNameRegexp = regexp.MustCompile("^[a-zA-Z][a-zA-Z0-9]*(?:-[a-zA-Z][a-zA-Z0-9]*)*$")
+)
+
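+// IsHtmlName reports whether text is a valid HTML-style name.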
+func IsHtmlName(text string) bool {
+       return HtmlNameRegexp.MatchString(text)
+}
+
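+// CutQuotes removes a leading and trailing double quote from text if present,
+// reporting whether both were found.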
+func CutQuotes(text string) (string, bool) {
+       preQuote, preOk := strings.CutPrefix(text, `"`)
+       postQuote, postOk := strings.CutSuffix(preQuote, `"`)
+       return postQuote, preOk && postOk
+}
+
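+// JoinAny formats each value and joins them with sep.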
+func JoinAny(vals []any, sep string) string {
+       valsStr := make([]string, len(vals))
+       for i, val := range vals {
+               valsStr[i] = fmt.Sprintf("%s", val)
+       }
+       return strings.Join(valsStr, sep)
+}
diff --git a/web/htmlt/parse/tokens.go b/web/htmlt/parse/tokens.go
new file mode 100644 (file)
index 0000000..791f696
--- /dev/null
@@ -0,0 +1,360 @@
+package parse
+
+import (
+       "bufio"
+       "errors"
+       "fmt"
+       "io"
+       "strings"
+)
+
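+// TokenState is the tokenizer's lexing mode.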
+type TokenState int
+
+const (
+       STATE_CONTENT = TokenState(iota)
+       STATE_TEMPLATE
+       STATE_TAG
+)
+
+func (ts TokenState) String() string {
+       switch ts {
+       case STATE_CONTENT:
+               return "STATE_CONTENT"
+       case STATE_TEMPLATE:
+               return "STATE_TEMPLATE"
+       case STATE_TAG:
+               return "STATE_TAG"
+       default:
+               return "STATE_UNKNOWN"
+       }
+}
+
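+// TokenType identifies what kind of token was lexed.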
+type TokenType int
+
+const (
+       ERROR = TokenType(iota)
+       EOF
+       TEXT
+       WHITESPACE
+       TEMPLATE_OPEN    // {{
+       TEMPLATE_CLOSE   // }}
+       TEMPLATE_KEYWORD // range, with, else, if, end
+       TAG_OPEN         // <
+       TAG_END_OPEN     // </
+       TAG_CLOSE        // >
+       TAG_VOID_CLOSE   // />
+       TAG_EQ           // =
+       TAG_QUOTE        // "
+)
+
+func (tt TokenType) String() string {
+       switch tt {
+       case ERROR:
+               return "ERROR"
+       case EOF:
+               return "EOF"
+       case TEXT:
+               return "TEXT"
+       case WHITESPACE:
+               return "S"
+       case TEMPLATE_OPEN:
+               return "TEMPLATE_OPEN"
+       case TEMPLATE_CLOSE:
+               return "TEMPLATE_CLOSE"
+       case TEMPLATE_KEYWORD:
+               return "TEMPLATE_KEYWORD"
+       case TAG_OPEN:
+               return "TAG_OPEN"
+       case TAG_END_OPEN:
+               return "TAG_END_OPEN"
+       case TAG_CLOSE:
+               return "TAG_CLOSE"
+       case TAG_VOID_CLOSE:
+               return "TAG_VOID_CLOSE"
+       case TAG_EQ:
+               return "TAG_EQ"
+       case TAG_QUOTE:
+               return "TAG_QUOTE"
+       default:
+               return "UNKNOWN"
+       }
+}
+
+const (
+       // whitespace contains the runes the tokenizer treats as whitespace.
+       whitespace = " \n\t\r\f\v"
+)
+
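+// Token is a single lexed token: a type plus the literal text it covers.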
+type Token struct {
+       Type    TokenType
+       Literal string
+}
+
+func (t *Token) String() string {
+       return fmt.Sprintf("%s(%s)", t.Type, t.Literal)
+}
+
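+// tokenizer lexes a document, switching between content, template, and tag
+// states. nextToken holds a token pushed back by the tag lexer.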
+type tokenizer struct {
+       r         *bufio.Reader
+       state     TokenState
+       nextToken *Token
+       err       error
+}
+
+// peek returns the next i bytes as a string, and true if all i bytes were available.
+func (t *tokenizer) peek(i int) (string, bool) {
+       x, err := t.r.Peek(i)
+       if err != nil {
+               return string(x), false
+       }
+       return string(x), true
+}
+
+func (t *tokenizer) advance(i int) {
+       t.r.Discard(i)
+}
+
+func (t *tokenizer) nextContent() (*Token, error) {
+       acc := make([]rune, 100)
+       cursor := 0
+       accumulate := func(r rune) {
+               if cursor >= len(acc) {
+                       acc = append(acc, make([]rune, 100)...)
+               }
+               acc[cursor] = r
+               cursor++
+       }
+       for {
+               if token, ok := t.peek(3); ok {
+                       switch token {
+                       case "{{-":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.state = STATE_TEMPLATE
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(2); ok {
+                       switch token {
+                       case "{{":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.state = STATE_TEMPLATE
+                               return out, nil
+                       case "</":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.state = STATE_TAG
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(1); ok {
+                       switch token {
+                       case "<":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.state = STATE_TAG
+                               return out, nil
+                       }
+               }
+               r, _, err := t.r.ReadRune()
+               if errors.Is(err, io.EOF) {
+                       if cursor > 0 {
+                               return &Token{TEXT, string(acc[:cursor])}, nil
+                       } else {
+                               return &Token{EOF, ""}, nil
+                       }
+               } else if err != nil {
+                       return nil, err
+               }
+               accumulate(r)
+       }
+}
+
+func (t *tokenizer) nextTemplate() (*Token, error) {
+       acc := make([]rune, 10)
+       cursor := 0
+       accumulate := func(r rune) {
+               if cursor >= len(acc) {
+                       acc = append(acc, make([]rune, 10)...)
+               }
+               acc[cursor] = r
+               cursor++
+       }
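+       // Keywords are matched by prefix peeks alone, so a pipeline that merely
+       // starts with a keyword (e.g. "ranges") still lexes as a keyword token.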
+       for {
+               if token, ok := t.peek(5); ok {
+                       switch token {
+                       case "range":
+                               out := &Token{TEMPLATE_KEYWORD, token}
+                               t.advance(5)
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(4); ok {
+                       switch token {
+                       case "with":
+                               out := &Token{TEMPLATE_KEYWORD, token}
+                               t.advance(4)
+                               return out, nil
+                       case "else":
+                               out := &Token{TEMPLATE_KEYWORD, token}
+                               t.advance(4)
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(3); ok {
+                       switch token {
+                       case "end":
+                               out := &Token{TEMPLATE_KEYWORD, token}
+                               t.advance(3)
+                               return out, nil
+                       case "{{-":
+                               out := &Token{TEMPLATE_OPEN, token}
+                               t.advance(3)
+                               return out, nil
+                       case "-}}":
+                               out := &Token{TEMPLATE_CLOSE, token}
+                               t.advance(3)
+                               t.state = STATE_CONTENT
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(2); ok {
+                       switch token {
+                       case "if":
+                               out := &Token{TEMPLATE_KEYWORD, token}
+                               t.advance(2)
+                               return out, nil
+                       case "{{":
+                               out := &Token{TEMPLATE_OPEN, token}
+                               t.advance(2)
+                               return out, nil
+                       case "}}":
+                               out := &Token{TEMPLATE_CLOSE, token}
+                               t.advance(2)
+                               t.state = STATE_CONTENT
+                               return out, nil
+                       }
+               }
+               r, _, err := t.r.ReadRune()
+               if errors.Is(err, io.EOF) {
+                       return &Token{EOF, ""}, nil
+               } else if err != nil {
+                       return nil, err
+               }
+               if strings.ContainsRune(whitespace, r) {
+                       return &Token{TEXT, string(acc[:cursor])}, nil
+               } else {
+                       accumulate(r)
+               }
+       }
+}
+
+func (t *tokenizer) nextTag() (*Token, error) {
+       acc := make([]rune, 10)
+       cursor := 0
+       accumulate := func(r rune) {
+               if cursor >= len(acc) {
+                       acc = append(acc, make([]rune, 10)...)
+               }
+               acc[cursor] = r
+               cursor++
+       }
+       for {
+               if token, ok := t.peek(2); ok {
+                       switch token {
+                       case "</":
+                               out := &Token{TAG_END_OPEN, token}
+                               t.advance(2)
+                               return out, nil
+                       case "/>":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TAG_VOID_CLOSE, token}
+                               t.advance(2)
+                               t.state = STATE_CONTENT
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(1); ok {
+                       switch token {
+                       case "<":
+                               out := &Token{TAG_OPEN, token}
+                               t.advance(1)
+                               return out, nil
+                       case "=":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TAG_EQ, token}
+                               t.advance(1)
+                               return out, nil
+                       case `"`:
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TAG_QUOTE, token}
+                               t.advance(1)
+                               return out, nil
+                       case ">":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TAG_CLOSE, token}
+                               t.advance(1)
+                               t.state = STATE_CONTENT
+                               return out, nil
+                       }
+               }
+               r, _, err := t.r.ReadRune()
+               if errors.Is(err, io.EOF) {
+                       return &Token{EOF, ""}, nil
+               } else if err != nil {
+                       return nil, err
+               }
+               if strings.ContainsRune(whitespace, r) {
+                       return &Token{TEXT, string(acc[:cursor])}, nil
+               } else {
+                       accumulate(r)
+               }
+       }
+}
+
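+// next2 returns the next non-empty token, draining any pushed-back token first.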
+func (t *tokenizer) next2() (*Token, error) {
+       var next *Token
+       var err error
+       for next == nil && err == nil {
+               if t.nextToken != nil {
+                       next := t.nextToken
+                       t.nextToken = nil
+                       return next, nil
+               }
+               switch t.state {
+               case STATE_CONTENT:
+                       next, err = t.nextContent()
+               case STATE_TEMPLATE:
+                       next, err = t.nextTemplate()
+               case STATE_TAG:
+                       next, err = t.nextTag()
+               default:
+                       return nil, fmt.Errorf("unknown state %s", t.state)
+               }
+               if next != nil && next.Type == TEXT && next.Literal == "" {
+                       next = nil
+               }
+       }
+       return next, err
+}
+
+// all returns an iterator over all tokens produced by the tokenizer.
+// Stops iterating on EOF or error.
+func (t *tokenizer) all() tokens {
+       return func(yield func(*Token) bool) {
+               for {
+                       if t.err != nil {
+                               return
+                       }
+                       tok, err := t.next2()
+                       if err != nil {
+                               t.err = err
+                               yield(&Token{ERROR, err.Error()})
+                               break
+                       }
+                       if tok.Type == EOF {
+                               t.err = io.EOF
+                       }
+                       if !yield(tok) {
+                               break
+                       }
+               }
+       }
+}
+
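+// Tokenize returns the single-use token stream for r.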
+func Tokenize(r io.Reader) tokens {
+       tkns := new(tokenizer)
+       tkns.r = bufio.NewReader(r)
+       return tkns.all()
+}
diff --git a/web/htmlt/parse/tokens_test.go b/web/htmlt/parse/tokens_test.go
new file mode 100644 (file)
index 0000000..f29e6ee
--- /dev/null
@@ -0,0 +1,23 @@
+package parse
+
+import (
+       "slices"
+       "strings"
+       "testing"
+)
+
+func TestTokenize(t *testing.T) {
+       testStrings := map[string]string{
+               "hello":         "Hello, World!",
+               "template":      "Hello, {{ `template` }}!",
+               "html":          "<div><p>Hello, HTML!</p><br></div>",
+               "html+template": "{{ if .condition }}<p>{{- .text -}}</p>{{ end }}",
+       }
+       for name, val := range testStrings {
+               t.Run(name, func(t *testing.T) {
+                       toks := slices.Collect(Tokenize(strings.NewReader(val)).seq())
+                       t.Log(val)
+                       t.Log(toks)
+               })
+       }
+}