git.earlybird.gay Git - today/commitdiff
improve testing/debugging; fix discrepancy between with/if parsing
author	early <me@earlybird.gay>
Tue, 31 Dec 2024 05:54:36 +0000 (22:54 -0700)
committer	early <me@earlybird.gay>
Tue, 31 Dec 2024 05:54:36 +0000 (22:54 -0700)
.local/.gitignore [new file with mode: 0644]
.local/README [new file with mode: 0644]
web/htmlt/parse/parse.go
web/htmlt/parse/parse_test.go
web/htmlt/parse/tokens.go
web/htmlt/parse/tokens_test.go

diff --git a/.local/.gitignore b/.local/.gitignore
new file mode 100644 (file)
index 0000000..4a16a23
--- /dev/null
@@ -0,0 +1,3 @@
+*
+!README
+!.gitignore
\ No newline at end of file
diff --git a/.local/README b/.local/README
new file mode 100644 (file)
index 0000000..f72f32b
--- /dev/null
@@ -0,0 +1 @@
+This folder is safe to save your local artifacts in!
\ No newline at end of file
index e3786bd9c2760a738497e2896d08400eacc63ff0..6c1e630206b41d75a82221a7e29c1a0259684df5 100644 (file)
@@ -3,56 +3,65 @@ package parse
 import (
        "fmt"
        "io"
-       "iter"
+       "log"
        "slices"
        "strings"
 )
 
-type tokens iter.Seq[*Token]
+type ParseState int
 
-func (toks tokens) nextOf(tts ...TokenType) *Token {
-       var out *Token
-       for tok := range toks.until(tts...) {
-               out = tok
-       }
-       if slices.Contains(tts, out.Type) {
-               return out
-       } else {
-               return nil
-       }
-}
+const (
+       PSTATE_CONTENT = ParseState(iota)
+       PSTATE_ATTRS
+)
 
-// untilInclusive gets the next N tokens in p, until one is in types, or is EOF.
-func (toks tokens) until(tts ...TokenType) tokens {
-       return func(yield func(*Token) bool) {
-               for tok := range toks {
-                       if !yield(tok) || slices.Contains(tts, tok.Type) || tok.Type == EOF {
-                               return
-                       }
-               }
-       }
-}
+// contentFunc is a reader over tokens that may return any number of
+// arbitrary nodes, like readContent and readAttributes. This is used to provide
+// callbacks to nodes where reading of mixed content is interrupted, like with
+// templates. This usage typically means a contentFunc passing itself as
+// a callback to readTemplate.
+type contentFunc func(until ...TokenType) ([]any, error)
 
-func (toks tokens) discardUntil(tts ...TokenType) {
-       for range toks.until(tts...) {
-       }
+// parser is a parser over a token iterator.
+// Parsers have two types of readers: content and node. Content readers have a
+// signature that conforms to contentFunc, and may return any number of nodes of
+// types depending on the reader. Node readers return a single node of defined
+// type. Node readers may also accept a callback to a content reader, if they
+// may contain variable types of content.
+type parser struct {
+       tokens tokens
+
+       debug *log.Logger
 }
 
-func (toks tokens) seq() iter.Seq[*Token] {
-       return iter.Seq[*Token](toks)
+// newParser returns a new parser with optional debug logging.
+// If debug is nil, log output is discarded. Debug logging is also passed to the
+// underlying tokenizer.
+func newParser(r io.Reader, debug *log.Logger) *parser {
+       if debug == nil {
+               debug = log.New(io.Discard, "", 0)
+       }
+       tokenizer := newTokenizer(r, debug)
+       return &parser{
+               tokens: tokenizer.all(),
+               debug:  debug,
+       }
 }
 
-// readContent returns a list of "content tokens", wherever they're acceptable.
-func readContent(toks tokens) ([]any, error) {
+// anyContent is a content reader for the base types of content in an HTML
+// template file; text, HTML, and templates.
+func (p *parser) anyContent(until ...TokenType) ([]any, error) {
+       p.debug.Printf("reading content until any %v", until)
        out := make([]any, 0)
        var text *TextNode
        resolveText := func() {
                if text != nil {
+                       p.debug.Printf("content resolving text %s", text.Value)
                        out = append(out, text)
                }
                text = nil
        }
-       for tok := range toks {
+       for tok := range p.tokens.until(until...) {
                switch tok.Type {
                case TEXT, WHITESPACE:
                        if text == nil {
@@ -61,24 +70,25 @@ func readContent(toks tokens) ([]any, error) {
                        text.Value += tok.Literal
                case TEMPLATE_OPEN:
                        resolveText()
-                       node, err := readTemplate(toks)
+                       node, err := p.templateNode(p.anyContent)
                        if err != nil {
                                return nil, err
                        }
                        out = append(out, node)
                case TAG_OPEN:
                        resolveText()
-                       node, err := readElement(toks)
+                       node, err := p.elementNode()
                        if err != nil {
                                return nil, err
                        }
                        out = append(out, node)
                case TAG_END_OPEN:
-                       toks.discardUntil(TAG_CLOSE)
+                       p.tokens.discardUntil(TAG_CLOSE)
                default:
                }
        }
        resolveText()
+       p.debug.Printf("read content %v", out)
        return out, nil
 }
 
@@ -94,9 +104,11 @@ func (d *Document) String() string {
        return out
 }
 
-func (d *Document) read(toks tokens) (err error) {
-       d.Children, err = readContent(toks)
-       return err
+func (p *parser) docNode() (d *Document, err error) {
+       p.debug.Println("reading document")
+       d = new(Document)
+       d.Children, err = p.anyContent()
+       return d, err
 }
 
 type TextNode struct {
@@ -115,40 +127,64 @@ func (node *TemplateNode) String() string {
        return fmt.Sprintf("{{ %s }}", node.Value)
 }
 
-// read reads tokens into the TemplateNode.
-func readTemplate(toks tokens) (any, error) {
+// templateNode is a node reader, and it may return a TemplateNode or any of
+// the template "block" nodes (TemplateIf, etc.) if the template contains the
+// corresponding keyword.
+func (p *parser) templateNode(callback contentFunc) (any, error) {
+       p.debug.Println("reading template")
        node := new(TemplateNode)
-       for tok := range toks.until(TEMPLATE_CLOSE) {
-               switch tok.Type {
-               case TEMPLATE_KEYWORD:
+       first := true
+
+readTemplate:
+       for tok := range p.tokens.until(TEMPLATE_CLOSE) {
+               if first && tok.Type == TEMPLATE_KEYWORD {
+                       p.debug.Printf("template starts with keyword '%s'; deferring", tok.Literal)
                        switch tok.Literal {
                        case "if":
-                               return readTemplateIf(toks)
+                               return p.templateIfNode(callback)
                        case "else":
-                               next := toks.nextOf(TEMPLATE_KEYWORD, TEMPLATE_CLOSE)
+                               next := p.tokens.nextOf(TEMPLATE_KEYWORD, TEMPLATE_CLOSE)
                                if next.Literal == "if" {
-                                       return readTemplateIf(toks)
+                                       return p.templateIfNode(callback)
                                } else if next.Type == TEXT {
-                                       toks.discardUntil(TEMPLATE_CLOSE)
+                                       p.tokens.discardUntil(TEMPLATE_CLOSE)
                                }
                                return &TemplateElse{}, nil
+                       case "with":
+                               return p.templateWithNode(callback)
                        case "end":
-                               toks.discardUntil(TEMPLATE_CLOSE)
+                               p.tokens.discardUntil(TEMPLATE_CLOSE)
                                return &TemplateEnd{}, nil
                        default:
                                return nil, fmt.Errorf("unrecognized template keyword %s", tok.Literal)
                        }
+               }
+               switch tok.Type {
                case TEXT:
                        node.Value += " " + tok.Literal
+               case TEMPLATE_CLOSE:
+                       break readTemplate
+               default:
+                       return nil, fmt.Errorf("unexpected token %s in templateNode", tok)
                }
        }
        node.Value = strings.TrimSpace(node.Value)
+       p.debug.Printf("read template expression %s", node)
        return node, nil
 }
 
 type TemplateElse struct{}
+
+func (node *TemplateElse) String() string {
+       return "TemplateElse"
+}
+
 type TemplateEnd struct{}
 
+func (node *TemplateEnd) String() string {
+       return "TemplateEnd"
+}
+
 type TemplateIf struct {
        Condition string
        Then      []any
@@ -168,46 +204,93 @@ func (node *TemplateIf) String() string {
        return fmt.Sprintf("{{ if %s }}%s%s%s{{ end }}", node.Condition, JoinAny(node.Then, ""), elifStr, elseStr)
 }
 
-func readTemplateIf(toks tokens) (*TemplateIf, error) {
+func (p *parser) templateIfNode(callback contentFunc) (*TemplateIf, error) {
+       p.debug.Println("reading template if")
        node := new(TemplateIf)
-       for tok := range toks.until(TEMPLATE_CLOSE) {
+       for tok := range p.tokens.until(TEMPLATE_CLOSE) {
                switch tok.Type {
                case TEXT, WHITESPACE:
                        node.Condition += tok.Literal
                }
        }
        node.Condition = strings.TrimSpace(node.Condition)
+
+       acc := make([]any, 0)
        isElse := false
+readBlock:
        for {
-               then, err := readContent(toks.until(TEMPLATE_OPEN))
+               then, err := callback(TEMPLATE_OPEN)
                if err != nil {
                        return nil, err
                }
                if len(then) == 0 {
                        break
                }
-               term := then[len(then)-1]
-               body := then[:len(then)-1]
-               switch v := term.(type) {
+               acc = append(acc, then...)
+               last := acc[len(acc)-1]
+               switch v := last.(type) {
                case *TemplateIf:
-                       node.Then = append(node.Then, body...)
+                       node.Then = acc[:len(acc)-1]
+                       acc = make([]any, 0)
                        // Elifs steal the else. Take it back.
-                       node.Else = append(node.Else, v.Else...)
+                       node.Else = v.Else
                        v.Else = nil
                        node.Elif = append(node.Elif, v)
                case *TemplateElse:
-                       node.Then = append(node.Then, body...)
+                       node.Then = acc[:len(acc)-1]
+                       acc = make([]any, 0)
                        isElse = true
                case *TemplateEnd:
                        if !isElse {
-                               node.Then = body
+                               node.Then = acc[:len(acc)-1]
                        } else {
-                               node.Else = body
+                               node.Else = acc[:len(acc)-1]
                                isElse = false
                        }
-               default:
+                       break readBlock
+               }
+       }
+       p.debug.Printf("read template if %s", node)
+       return node, nil
+}
+
+type TemplateWith struct {
+       Expression string
+       Content    []any
+}
+
+func (node *TemplateWith) String() string {
+       return fmt.Sprintf("{{ with %s }}%s{{ end }}", node.Expression, fmt.Sprint(node.Content...))
+}
+
+func (p *parser) templateWithNode(callback contentFunc) (*TemplateWith, error) {
+       p.debug.Println("reading template with")
+       node := new(TemplateWith)
+       for tok := range p.tokens.until(TEMPLATE_CLOSE) {
+               switch tok.Type {
+               case TEXT, WHITESPACE:
+                       node.Expression += tok.Literal
                }
        }
+       node.Expression = strings.TrimSpace(node.Expression)
+
+       acc := make([]any, 0)
+       for {
+               then, err := callback(TEMPLATE_OPEN)
+               if err != nil {
+                       return nil, err
+               }
+               if len(then) == 0 {
+                       break
+               }
+               acc = append(acc, then...)
+               last := acc[len(acc)-1]
+               if _, ok := last.(*TemplateEnd); ok {
+                       node.Content = acc[:len(acc)-1]
+                       break
+               }
+       }
+       p.debug.Printf("read template with %s", node)
        return node, nil
 }
 
@@ -220,7 +303,7 @@ var voidElems = []string{
 
 type Attribute struct {
        Name  string
-       Value string
+       Value []any
        // Boolean is true if attr is a "boolean attribute"; it has no string value
        // in the source HTML, so it just represents true-if-present. Boolean is
        // *not* the value of the boolean attribute.
@@ -231,77 +314,82 @@ func (attr Attribute) String() string {
        if attr.Boolean {
                return attr.Name
        } else {
-               return fmt.Sprintf(`%s="%s"`, attr.Name, attr.Value)
+               valStrs := make([]string, len(attr.Value))
+               for i, val := range attr.Value {
+                       valStrs[i] = fmt.Sprint(val)
+               }
+               return fmt.Sprintf(`%s="%s"`, attr.Name, strings.Join(valStrs, " "))
+       }
+}
+
+func (p *parser) attrContent(until ...TokenType) ([]any, error) {
+       p.debug.Println("reading tag attributes")
+       attrs := make([]any, 0)
+       attr := Attribute{}
+       for tok := range p.tokens.until(until...) {
+               switch tok.Type {
+               case TEXT:
+                       attr.Name = tok.Literal
+               case TAG_EQ:
+                       p.tokens.discardUntil(TAG_QUOTE)
+                       values, err := p.anyContent()
+                       if err != nil {
+                               return nil, err
+                       }
+                       attr.Value = append(attr.Value, values...)
+                       attrs = append(attrs, attr)
+                       attr = Attribute{}
+               case TEMPLATE_OPEN:
+                       tmpl, err := p.templateNode(p.attrContent)
+                       if err != nil {
+                               return nil, err
+                       }
+                       attrs = append(attrs, tmpl)
+               }
        }
+       return attrs, nil
 }
 
 type ElementNode struct {
        Name       string
-       Attributes []Attribute
+       Attributes []any
        Children   []any
 
        void bool
 }
 
 func (node *ElementNode) String() string {
-       attrsRaw := make([]string, len(node.Attributes))
-       for i, attr := range node.Attributes {
-               attrsRaw[i] = attr.String()
-       }
-       attrs := ""
-       if len(attrsRaw) > 0 {
-               attrs = strings.Join(attrsRaw, " ")
+       attrs := fmt.Sprint(node.Attributes...)
+       if attrs != "" {
+               attrs = " " + attrs
        }
        if node.void {
-               return fmt.Sprintf("<%s %s />", node.Name, attrs)
+               return fmt.Sprintf("<%s%s />", node.Name, attrs)
        } else {
                inner := ""
                for _, child := range node.Children {
                        inner += fmt.Sprint(child)
                }
-               return fmt.Sprintf("<%s %s >%s</%s>", node.Name, attrs, inner, node.Name)
+               return fmt.Sprintf("<%s%s>%s</%s>", node.Name, attrs, inner, node.Name)
        }
 }
 
-func readElement(toks tokens) (*ElementNode, error) {
+func (p *parser) elementNode() (*ElementNode, error) {
+       p.debug.Println("reading element")
        node := new(ElementNode)
-       node.Name = toks.nextOf(TEXT).Literal
+       node.Name = p.tokens.nextOf(TEXT).Literal
        node.void = slices.Contains(voidElems, node.Name)
 
-       tagToks := toks.until(TAG_CLOSE, TAG_VOID_CLOSE)
-       next := tagToks.nextOf(TEXT)
-       for next != nil {
-               name := next.Literal
-               next = tagToks.nextOf(TEXT, TAG_EQ)
-               // If it's text, this is a boolean attribute. Otherwise, it has a value.
-               if next == nil || next.Type == TEXT {
-                       node.Attributes = append(node.Attributes, Attribute{
-                               Name:    name,
-                               Boolean: true,
-                       })
-               } else {
-                       value := ""
-                       // Advance to the start of the value...
-                       tagToks.discardUntil(TAG_QUOTE)
-                       // Then read until the end.
-                       for tok := range tagToks.until(TAG_QUOTE) {
-                               switch tok.Type {
-                               case TEXT:
-                                       value += tok.Literal
-                               }
-                       }
-                       node.Attributes = append(node.Attributes, Attribute{
-                               Name:  name,
-                               Value: value,
-                       })
-                       next = tagToks.nextOf(TEXT)
-               }
+       attrs, err := p.attrContent(TAG_CLOSE, TAG_VOID_CLOSE)
+       if err != nil {
+               return nil, err
        }
+       node.Attributes = attrs
 
        if node.void {
                return node, nil
        }
-       children, err := readContent(toks.until(TAG_END_OPEN))
+       children, err := p.anyContent()
        if err != nil {
                return nil, err
        }
@@ -310,7 +398,6 @@ func readElement(toks tokens) (*ElementNode, error) {
 }
 
 func Parse(r io.Reader) (any, error) {
-       doc := new(Document)
-       err := doc.read(Tokenize(r))
-       return doc, err
+       parser := newParser(r, nil)
+       return parser.docNode()
 }
index b9eae1eb2b5ce3d34c820c9b39b2030482f03549..44c68fae539186cf6cef5c0eac3e3c7b874cb828 100644 (file)
@@ -1,27 +1,62 @@
 package parse
 
 import (
-       "fmt"
-       "slices"
+       "log"
+       "os"
        "strings"
        "testing"
 )
 
+// basicParseTest is a parser test.
+// if expected is given, the test result for input, stringified, is compared
+// to it; otherwise, input is expected to pass a roundtrip.
+type basicParseTest struct {
+       name  string
+       input string
+
+       expected string
+}
+
 func TestParseBasic(t *testing.T) {
-       testStrings := map[string]string{
-               "hello":         "Hello, World!",
-               "template":      "Hello, {{ `template` }}!",
-               "html":          "<div><p>Hello, HTML!</p><br></div>",
-               "html+template": "{{ if .condition }}<p>{{- .text -}}</p>{{ end }}",
+       tests := []basicParseTest{
+               {name: "expression", input: "Hello, {{ `template` }}!"},
+               {name: "if", input: "{{ if .condition }}hello{{ end }}"},
+               {name: "if-nested", input: "{{ if .message }}{{ .message }}{{ end }}"},
+               {name: "if-else", input: "{{ if .condition }}foo{{ else }}bar{{ end }}"},
+               {name: "if-else-nested", input: "{{ if .m1 }}{{ .m1 }}{{ else }}{{ .m2 }}{{ end }}"},
+               {name: "if-elif", input: "{{ if .c1 }}foo{{ else if .c2 }}bar{{ end }}"},
+               {name: "if-elif-nested", input: "{{ if .m1 }}{{ .m1 }}{{ else if .m2 }}{{ .m2 }}{{ end }}"},
+               {name: "if-elif-else", input: "{{ if .c1 }}foo{{ else if .c2 }}bar{{ else }}baz{{ end }}"},
+               {name: "if-elif-else-nested", input: "{{ if .m1 }}{{ .m1 }}{{ else if m2 }}{{ .m2 }}{{ else }}{{ .m3 }}{{ end }}"},
+               {name: "with", input: "{{ with .message }}hello{{ end }}"},
+               {name: "with-nested", input: "{{ with .message }}{{ . }}{{ end }}"},
        }
-       for name, val := range testStrings {
-               t.Run(name, func(t *testing.T) {
-                       doc, err := Parse(strings.NewReader(val))
-                       t.Log(val)
+       /*
+               testStrings := map[string]string{
+                       //"if":           "{{ if .condition }}hello{{ end }}",
+                       //"if-else": "{{ if .condition }}hello{{ else }}{{ .else }}{{ end }}",
+                       //"if-elif":      "{{ if .condition }}hello{{ else if .other }}world{{ end }}",
+                       //"if-elif-else": "{{ if .condition }}hello{{ else if .other }}foo{{ else }}bar{{ end }}",
+                       //"with":         "{{ with .value }}abc{{ . }}{{ end }}",
+                       //"html":                  "<div><p>Hello, HTML!</p><br></div>",
+                       //"html+template":         "{{ if .condition }}<p>{{- .text -}}</p>{{ end }}",
+               }
+       */
+       for _, test := range tests {
+               debug := log.New(os.Stderr, "", log.Lshortfile)
+               t.Run(test.name, func(t *testing.T) {
+                       parser := newParser(strings.NewReader(test.input), debug)
+                       doc, err := parser.docNode()
+                       t.Log(test.input)
                        if err != nil {
                                t.Fatal(err)
                        }
                        t.Log(doc)
+                       if test.expected != "" && doc.String() != test.expected {
+                               t.Fatal("result document doesn't match")
+                       } else if doc.String() != test.input {
+                               t.Fatal("result document didn't pass roundtrip")
+                       }
                })
        }
 }
@@ -42,33 +77,15 @@ func TestParseHTML(t *testing.T) {
                })
        }
 }
-func TestParseTemplate(t *testing.T) {
-       testStrings := map[string]string{
-               "if":           "{{ if .condition }}Hello{{ end }}",
-               "if-else":      "{{ if .condition }}Hello{{ else }}World{{ end }}",
-               "if-elif":      "{{ if .condition }}Hello{{ else if .other.condition }}World{{ end }}",
-               "if-elif-else": "{{ if .condition }}One{{ else if .other.condition }}Two{{ else }}Three{{ end }}",
-       }
-       for name, val := range testStrings {
-               t.Run(name, func(t *testing.T) {
-                       doc, err := Parse(strings.NewReader(val))
-                       t.Log(val)
-                       if err != nil {
-                               t.Fatal(err)
-                       }
-                       t.Log(doc)
-               })
-       }
-}
 
 func TestParseComplex(t *testing.T) {
        testStrings := map[string]string{
-               "template-attr": `<img src="{{ .img }}">`,
+               "attr-template-value":   `<img src="{{ .img }}">`,
+               "attr-if-template":      `<div {{ if .red }}class="red"{{ end }}></div>`,
+               "attr-if-else-template": `<div {{ if .red }}class="red"{{ else }}class="blue"{{ end }}></div>`,
        }
        for name, val := range testStrings {
                t.Run(name, func(t *testing.T) {
-                       toks := slices.Collect(Tokenize(strings.NewReader(val)).seq())
-                       fmt.Println(toks)
                        doc, err := Parse(strings.NewReader(val))
                        t.Log(val)
                        if err != nil {
index 791f6968b8a8729287c0723519f3ed395376f877..14d7e90fd54fa13b2dbfe5527fc94dcb265a1906 100644 (file)
@@ -5,25 +5,31 @@ import (
        "errors"
        "fmt"
        "io"
+       "iter"
+       "log"
+       "slices"
        "strings"
 )
 
 type TokenState int
 
 const (
-       STATE_CONTENT = TokenState(iota)
-       STATE_TEMPLATE
-       STATE_TAG
+       TSTATE_CONTENT = TokenState(iota)
+       TSTATE_TEMPLATE
+       TSTATE_TAG
+       TSTATE_ATTRS
 )
 
 func (ts TokenState) String() string {
        switch ts {
-       case STATE_CONTENT:
+       case TSTATE_CONTENT:
                return "STATE_CONTENT"
-       case STATE_TEMPLATE:
+       case TSTATE_TEMPLATE:
                return "STATE_TEMPLATE"
-       case STATE_TAG:
+       case TSTATE_TAG:
                return "STATE_TAG"
+       case TSTATE_ATTRS:
+               return "STATE_ATTRS"
        default:
                return "STATE_UNKNOWN"
        }
@@ -92,10 +98,23 @@ func (t *Token) String() string {
 }
 
 type tokenizer struct {
-       r         *bufio.Reader
-       state     TokenState
-       nextToken *Token
-       err       error
+       r          *bufio.Reader
+       state      TokenState
+       prevStates []TokenState
+       nextToken  *Token
+       err        error
+
+       debug *log.Logger
+}
+
+func newTokenizer(r io.Reader, debug *log.Logger) *tokenizer {
+       if debug == nil {
+               debug = log.New(io.Discard, "", 0)
+       }
+       return &tokenizer{
+               r:     bufio.NewReader(r),
+               debug: debug,
+       }
 }
 
 // peek returns the next i characters, and "true" if i characters were found.
@@ -111,6 +130,20 @@ func (t *tokenizer) advance(i int) {
        t.r.Discard(i)
 }
 
+func (t *tokenizer) setState(ts TokenState) {
+       t.debug.Printf("pushing tokenizer state %s", ts)
+       t.prevStates = append(t.prevStates, t.state)
+       t.state = ts
+}
+
+func (t *tokenizer) revertState() {
+       if len(t.prevStates) > 0 {
+               t.state = t.prevStates[len(t.prevStates)-1]
+               t.prevStates = t.prevStates[:len(t.prevStates)-1]
+               t.debug.Printf("popping tokenizer state %s", t.state)
+       }
+}
+
 func (t *tokenizer) nextContent() (*Token, error) {
        acc := make([]rune, 100)
        cursor := 0
@@ -126,7 +159,7 @@ func (t *tokenizer) nextContent() (*Token, error) {
                        switch token {
                        case "{{-":
                                out := &Token{TEXT, string(acc[:cursor])}
-                               t.state = STATE_TEMPLATE
+                               t.setState(TSTATE_TEMPLATE)
                                return out, nil
                        }
                }
@@ -134,11 +167,11 @@ func (t *tokenizer) nextContent() (*Token, error) {
                        switch token {
                        case "{{":
                                out := &Token{TEXT, string(acc[:cursor])}
-                               t.state = STATE_TEMPLATE
+                               t.setState(TSTATE_TEMPLATE)
                                return out, nil
                        case "</":
                                out := &Token{TEXT, string(acc[:cursor])}
-                               t.state = STATE_TAG
+                               t.setState(TSTATE_TAG)
                                return out, nil
                        }
                }
@@ -146,7 +179,7 @@ func (t *tokenizer) nextContent() (*Token, error) {
                        switch token {
                        case "<":
                                out := &Token{TEXT, string(acc[:cursor])}
-                               t.state = STATE_TAG
+                               t.setState(TSTATE_TAG)
                                return out, nil
                        }
                }
@@ -174,57 +207,31 @@ func (t *tokenizer) nextTemplate() (*Token, error) {
                cursor++
        }
        for {
-               if token, ok := t.peek(5); ok {
-                       switch token {
-                       case "range":
-                               out := &Token{TEMPLATE_KEYWORD, token}
-                               t.advance(5)
-                               return out, nil
-                       }
-               }
-               if token, ok := t.peek(4); ok {
-                       switch token {
-                       case "with":
-                               out := &Token{TEMPLATE_KEYWORD, token}
-                               t.advance(4)
-                               return out, nil
-                       case "else":
-                               out := &Token{TEMPLATE_KEYWORD, token}
-                               t.advance(4)
-                               return out, nil
-                       }
-               }
                if token, ok := t.peek(3); ok {
                        switch token {
-                       case "end":
-                               out := &Token{TEMPLATE_KEYWORD, token}
-                               t.advance(3)
-                               return out, nil
                        case "{{-":
                                out := &Token{TEMPLATE_OPEN, token}
                                t.advance(3)
                                return out, nil
                        case "-}}":
-                               out := &Token{TEMPLATE_CLOSE, token}
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TEMPLATE_CLOSE, token}
                                t.advance(3)
-                               t.state = STATE_CONTENT
+                               t.revertState()
                                return out, nil
                        }
                }
                if token, ok := t.peek(2); ok {
                        switch token {
-                       case "if":
-                               out := &Token{TEMPLATE_KEYWORD, token}
-                               t.advance(2)
-                               return out, nil
                        case "{{":
                                out := &Token{TEMPLATE_OPEN, token}
                                t.advance(2)
                                return out, nil
                        case "}}":
-                               out := &Token{TEMPLATE_CLOSE, token}
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TEMPLATE_CLOSE, token}
                                t.advance(2)
-                               t.state = STATE_CONTENT
+                               t.revertState()
                                return out, nil
                        }
                }
@@ -235,7 +242,13 @@ func (t *tokenizer) nextTemplate() (*Token, error) {
                        return nil, err
                }
                if strings.ContainsRune(whitespace, r) {
-                       return &Token{TEXT, string(acc[:cursor])}, nil
+                       token := string(acc[:cursor])
+                       switch token {
+                       case "if", "else", "with", "range", "end":
+                               return &Token{TEMPLATE_KEYWORD, token}, nil
+                       default:
+                               return &Token{TEXT, token}, nil
+                       }
                } else {
                        accumulate(r)
                }
@@ -262,7 +275,7 @@ func (t *tokenizer) nextTag() (*Token, error) {
                                out := &Token{TEXT, string(acc[:cursor])}
                                t.nextToken = &Token{TAG_VOID_CLOSE, token}
                                t.advance(2)
-                               t.state = STATE_CONTENT
+                               t.setState(TSTATE_CONTENT)
                                return out, nil
                        }
                }
@@ -286,7 +299,69 @@ func (t *tokenizer) nextTag() (*Token, error) {
                                out := &Token{TEXT, string(acc[:cursor])}
                                t.nextToken = &Token{TAG_CLOSE, token}
                                t.advance(1)
-                               t.state = STATE_CONTENT
+                               t.setState(TSTATE_CONTENT)
+                               return out, nil
+                       }
+               }
+               r, _, err := t.r.ReadRune()
+               if errors.Is(err, io.EOF) {
+                       return &Token{EOF, ""}, nil
+               } else if err != nil {
+                       return nil, err
+               }
+               if strings.ContainsRune(whitespace, r) {
+                       if cursor > 0 {
+                               t.setState(TSTATE_ATTRS)
+                       }
+                       return &Token{TEXT, string(acc[:cursor])}, nil
+               } else {
+                       accumulate(r)
+               }
+       }
+}
+
+func (t *tokenizer) nextAttrs() (*Token, error) {
+       acc := make([]rune, 10)
+       cursor := 0
+       accumulate := func(r rune) {
+               if cursor >= len(acc) {
+                       acc = append(acc, make([]rune, 10)...)
+               }
+               acc[cursor] = r
+               cursor++
+       }
+       for {
+               if token, ok := t.peek(2); ok {
+                       switch token {
+                       case "{{":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.setState(TSTATE_TEMPLATE)
+                               return out, nil
+                       case "/>":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.setState(TSTATE_TAG)
+                               return out, nil
+                       }
+               }
+               if token, ok := t.peek(1); ok {
+                       switch token {
+                       case "<":
+                               out := &Token{TAG_OPEN, token}
+                               t.advance(1)
+                               return out, nil
+                       case "=":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TAG_EQ, token}
+                               t.advance(1)
+                               return out, nil
+                       case `"`:
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.nextToken = &Token{TAG_QUOTE, token}
+                               t.advance(1)
+                               return out, nil
+                       case ">":
+                               out := &Token{TEXT, string(acc[:cursor])}
+                               t.setState(TSTATE_TAG)
                                return out, nil
                        }
                }
@@ -304,7 +379,7 @@ func (t *tokenizer) nextTag() (*Token, error) {
        }
 }
 
-func (t *tokenizer) next2() (*Token, error) {
+func (t *tokenizer) next() (*Token, error) {
        var next *Token
        var err error
        for next == nil && err == nil {
@@ -314,12 +389,14 @@ func (t *tokenizer) next2() (*Token, error) {
                        return next, nil
                }
                switch t.state {
-               case STATE_CONTENT:
+               case TSTATE_CONTENT:
                        next, err = t.nextContent()
-               case STATE_TEMPLATE:
+               case TSTATE_TEMPLATE:
                        next, err = t.nextTemplate()
-               case STATE_TAG:
+               case TSTATE_TAG:
                        next, err = t.nextTag()
+               case TSTATE_ATTRS:
+                       next, err = t.nextAttrs()
                default:
                        return nil, fmt.Errorf("unknown state %s", t.state)
                }
@@ -327,9 +404,12 @@ func (t *tokenizer) next2() (*Token, error) {
                        next = nil
                }
        }
+       t.debug.Printf("got %s", next)
        return next, err
 }
 
+type tokens iter.Seq[*Token]
+
 // all returns an iterator over all tokens produced by the tokenizer.
 // Stops iterating on EOF or error.
 func (t *tokenizer) all() tokens {
@@ -338,7 +418,7 @@ func (t *tokenizer) all() tokens {
                        if t.err != nil {
                                return
                        }
-                       tok, err := t.next2()
+                       tok, err := t.next()
                        if err != nil {
                                yield(&Token{ERROR, t.err.Error()})
                                break
@@ -353,8 +433,34 @@ func (t *tokenizer) all() tokens {
        }
 }
 
-func Tokenize(r io.Reader) tokens {
-       tkns := new(tokenizer)
-       tkns.r = bufio.NewReader(r)
-       return tkns.all()
+func (toks tokens) nextOf(tts ...TokenType) *Token {
+       var out *Token
+       for tok := range toks.until(tts...) {
+               out = tok
+       }
+       if out != nil && slices.Contains(tts, out.Type) {
+               return out
+       } else {
+               return nil
+       }
+}
+
+// until yields tokens from toks until one whose type is in tts, or EOF, is reached (inclusive).
+func (toks tokens) until(tts ...TokenType) tokens {
+       return func(yield func(*Token) bool) {
+               for tok := range toks {
+                       if !yield(tok) || slices.Contains(tts, tok.Type) || tok.Type == EOF {
+                               return
+                       }
+               }
+       }
+}
+
+func (toks tokens) discardUntil(tts ...TokenType) {
+       for range toks.until(tts...) {
+       }
+}
+
+func (toks tokens) seq() iter.Seq[*Token] {
+       return iter.Seq[*Token](toks)
 }
index f29e6ee2c77e1af39efaad33829317026e5a165e..b943f93c0698ab0b5df99da7315f48dfdbae4cfe 100644 (file)
@@ -1,6 +1,8 @@
 package parse
 
 import (
+       "log"
+       "os"
        "slices"
        "strings"
        "testing"
@@ -14,8 +16,10 @@ func TestTokenize(t *testing.T) {
                "html+template": "{{ if .condition }}<p>{{- .text -}}</p>{{ end }}",
        }
        for name, val := range testStrings {
+               debug := log.New(os.Stderr, "", log.Lshortfile)
                t.Run(name, func(t *testing.T) {
-                       toks := slices.Collect(Tokenize(strings.NewReader(val)).seq())
+                       tokenizer := newTokenizer(strings.NewReader(val), debug)
+                       toks := slices.Collect(tokenizer.all().seq())
                        t.Log(val)
                        t.Log(toks)
                })