From 988f4bd077b6b239bcc85306fba7b21f76b33939 Mon Sep 17 00:00:00 2001 From: early Date: Thu, 26 Dec 2024 21:01:11 -0700 Subject: [PATCH] new parser checkpoint --- docs/dev/planning.md | 21 ++ docs/dev/templates.md | 31 +++ go.mod | 2 +- web/htmlt/parse/parse.go | 316 +++++++++++++++++++++++++++++ web/htmlt/parse/parse_test.go | 80 ++++++++ web/htmlt/parse/text.go | 29 +++ web/htmlt/parse/tokens.go | 360 +++++++++++++++++++++++++++++++++ web/htmlt/parse/tokens_test.go | 23 +++ 8 files changed, 861 insertions(+), 1 deletion(-) create mode 100644 docs/dev/planning.md create mode 100644 docs/dev/templates.md create mode 100644 web/htmlt/parse/parse.go create mode 100644 web/htmlt/parse/parse_test.go create mode 100644 web/htmlt/parse/text.go create mode 100644 web/htmlt/parse/tokens.go create mode 100644 web/htmlt/parse/tokens_test.go diff --git a/docs/dev/planning.md b/docs/dev/planning.md new file mode 100644 index 0000000..7e2448f --- /dev/null +++ b/docs/dev/planning.md @@ -0,0 +1,21 @@ +# What's the plan, anyways + +I want to make templates and custom elements process concurrently. That means +that `template.Execute` and the data loaders execute simultaneously, waiting on +each other at critical points to yield a value or set a variable. This is going +to require a significant refactor of everything, but frankly I think that is +overdue outside of any feature goals, so that's just fine. I'm going to lay out +a plan of attack for getting this feature live. + +## Write a parser + +`x/net/html` is a really good library! It's also not what this project needs. +Go HTML is focused on being a spec-compliant parser and renderer, and templates +are not compliant with the HTML specification in some circumstances. A parser +that recognizes templates as separate from regular text would also be really +handy! + +The first iteration of this parser is focused on restoring functionality to +where it is now, not adding anything new. 
To this end, it will mostly leave the
+semantics of templates alone, aside from recognizing structures such as
+template blocks as having children. diff --git a/docs/dev/templates.md b/docs/dev/templates.md new file mode 100644 index 0000000..904058a --- /dev/null +++ b/docs/dev/templates.md @@ -0,0 +1,31 @@ +# Templates + +Templates are a mix of HTML and Go's standard templates. This document describes +how template documents are tokenized and parsed before being processed. + +## Tokenization + +Documents are processed first by a tokenizer that splits them up into sequential +tokens for processing. While this tokenizer isn't super concerned with syntax or +semantics, and shouldn't throw errors for either on its own, it does have states +that govern how tokens are processed. The main states are *content*, *template*, +and *tag*, though each may have sub-states depending on what's being processed. + +### Content + +Content is a mix of plain text, templates, and tags. It's the simplest to +tokenize, usually consisting of gathering text up until a template or tag starts +and then outputting that as a TEXT token. TEXT tokens are literal text and +whitespace making up the content body of the document. + +### Templates + +Templates are Go template expressions. Sometimes, these expressions have +keywords like `if` or `range` or assignment characters `:=`. At the time of +writing, pipelines are not processed into tokens, just left as plain text. + +### Tags + +Tags are HTML tags. These tags have a name and attributes, consisting of value +attributes `name="value"` and boolean attributes `name`. Because data attributes +are written `:name`, colons may be included in attribute names. 
diff --git a/go.mod b/go.mod index 707489b..cc7e651 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module git.earlybird.gay/today -go 1.22.4 +go 1.23.4 require ( golang.org/x/text v0.19.0 diff --git a/web/htmlt/parse/parse.go b/web/htmlt/parse/parse.go new file mode 100644 index 0000000..e3786bd --- /dev/null +++ b/web/htmlt/parse/parse.go @@ -0,0 +1,316 @@ +package parse + +import ( + "fmt" + "io" + "iter" + "slices" + "strings" +) + +type tokens iter.Seq[*Token] + +func (toks tokens) nextOf(tts ...TokenType) *Token { + var out *Token + for tok := range toks.until(tts...) { + out = tok + } + if slices.Contains(tts, out.Type) { + return out + } else { + return nil + } +} + +// untilInclusive gets the next N tokens in p, until one is in types, or is EOF. +func (toks tokens) until(tts ...TokenType) tokens { + return func(yield func(*Token) bool) { + for tok := range toks { + if !yield(tok) || slices.Contains(tts, tok.Type) || tok.Type == EOF { + return + } + } + } +} + +func (toks tokens) discardUntil(tts ...TokenType) { + for range toks.until(tts...) { + } +} + +func (toks tokens) seq() iter.Seq[*Token] { + return iter.Seq[*Token](toks) +} + +// readContent returns a list of "content tokens", wherever they're acceptable. 
+// readContent reads content tokens (text, templates, and elements) into a
+// list of nodes until the token stream is exhausted.
+func readContent(toks tokens) ([]any, error) { + out := make([]any, 0) + var text *TextNode + resolveText := func() { + if text != nil { + out = append(out, text) + } + text = nil + } + for tok := range toks { + switch tok.Type { + case TEXT, WHITESPACE: + if text == nil { + text = new(TextNode) + } + text.Value += tok.Literal + case TEMPLATE_OPEN: + resolveText() + node, err := readTemplate(toks) + if err != nil { + return nil, err + } + out = append(out, node) + case TAG_OPEN: + resolveText() + node, err := readElement(toks) + if err != nil { + return nil, err + } + out = append(out, node) + case TAG_END_OPEN: + toks.discardUntil(TAG_CLOSE) + default: + } + } + resolveText() + return out, nil +} + +type Document struct { + Children []any +} + +func (d *Document) String() string { + out := "" + for _, child := range d.Children { + out += fmt.Sprint(child) + } + return out +} + +func (d *Document) read(toks tokens) (err error) { + d.Children, err = readContent(toks) + return err +} + +type TextNode struct { + Value string +} + +func (node *TextNode) String() string { + return node.Value +} + +type TemplateNode struct { + Value string +} + +func (node *TemplateNode) String() string { + return fmt.Sprintf("{{ %s }}", node.Value) +} + +// readTemplate reads tokens into a TemplateNode, dispatching to keyword +// handlers (if/else/end) when one is encountered. 
+func readTemplate(toks tokens) (any, error) { + node := new(TemplateNode) + for tok := range toks.until(TEMPLATE_CLOSE) { + switch tok.Type { + case TEMPLATE_KEYWORD: + switch tok.Literal { + case "if": + return readTemplateIf(toks) + case "else": + next := toks.nextOf(TEMPLATE_KEYWORD, TEMPLATE_CLOSE) + if next.Literal == "if" { + return readTemplateIf(toks) + } else if next.Type == TEXT { + toks.discardUntil(TEMPLATE_CLOSE) + } + return &TemplateElse{}, nil + case "end": + toks.discardUntil(TEMPLATE_CLOSE) + return &TemplateEnd{}, nil + default: + return nil, fmt.Errorf("unrecognized template keyword %s", tok.Literal) + } + case TEXT: + node.Value += " " + tok.Literal + } + } + node.Value = strings.TrimSpace(node.Value) + return node, nil +} + +type TemplateElse struct{} +type TemplateEnd struct{} + +type TemplateIf struct { + Condition string + Then []any + Elif []*TemplateIf + Else []any +} + +func (node *TemplateIf) String() string { + elifStr := "" + for _, elif := range node.Elif { + elifStr += fmt.Sprintf("{{ else if %s }}%s", elif.Condition, JoinAny(elif.Then, "")) + } + elseStr := "" + if len(node.Else) > 0 { + elseStr = fmt.Sprintf("{{ else }}%s", JoinAny(node.Else, "")) + } + return fmt.Sprintf("{{ if %s }}%s%s%s{{ end }}", node.Condition, JoinAny(node.Then, ""), elifStr, elseStr) +} + +func readTemplateIf(toks tokens) (*TemplateIf, error) { + node := new(TemplateIf) + for tok := range toks.until(TEMPLATE_CLOSE) { + switch tok.Type { + case TEXT, WHITESPACE: + node.Condition += tok.Literal + } + } + node.Condition = strings.TrimSpace(node.Condition) + isElse := false + for { + then, err := readContent(toks.until(TEMPLATE_OPEN)) + if err != nil { + return nil, err + } + if len(then) == 0 { + break + } + term := then[len(then)-1] + body := then[:len(then)-1] + switch v := term.(type) { + case *TemplateIf: + node.Then = append(node.Then, body...) + // Elifs steal the else. Take it back. + node.Else = append(node.Else, v.Else...) 
+ v.Else = nil + node.Elif = append(node.Elif, v) + case *TemplateElse: + node.Then = append(node.Then, body...) + isElse = true + case *TemplateEnd: + if !isElse { + node.Then = body + } else { + node.Else = body + isElse = false + } + default: + } + } + return node, nil +} + +var voidElems = []string{ + "area", "base", "br", "col", + "embed", "hr", "img", "input", + "link", "meta", "param", "source", + "track", "wbr", +} + +type Attribute struct { + Name string + Value string + // Boolean is true if attr is a "boolean attribute"; it has no string value + // in the source HTML, so it just represents true-if-present. Boolean is + // *not* the value of the boolean attribute. + Boolean bool +} + +func (attr Attribute) String() string { + if attr.Boolean { + return attr.Name + } else { + return fmt.Sprintf(`%s="%s"`, attr.Name, attr.Value) + } +} + +type ElementNode struct { + Name string + Attributes []Attribute + Children []any + + void bool +} + +func (node *ElementNode) String() string { + attrsRaw := make([]string, len(node.Attributes)) + for i, attr := range node.Attributes { + attrsRaw[i] = attr.String() + } + attrs := "" + if len(attrsRaw) > 0 { + attrs = strings.Join(attrsRaw, " ") + } + if node.void { + return fmt.Sprintf("<%s %s />", node.Name, attrs) + } else { + inner := "" + for _, child := range node.Children { + inner += fmt.Sprint(child) + } + return fmt.Sprintf("<%s %s >%s", node.Name, attrs, inner, node.Name) + } +} + +func readElement(toks tokens) (*ElementNode, error) { + node := new(ElementNode) + node.Name = toks.nextOf(TEXT).Literal + node.void = slices.Contains(voidElems, node.Name) + + tagToks := toks.until(TAG_CLOSE, TAG_VOID_CLOSE) + next := tagToks.nextOf(TEXT) + for next != nil { + name := next.Literal + next = tagToks.nextOf(TEXT, TAG_EQ) + // If it's text, this is a boolean attribute. Otherwise, it has a value. 
+ if next == nil || next.Type == TEXT { + node.Attributes = append(node.Attributes, Attribute{ + Name: name, + Boolean: true, + }) + } else { + value := "" + // Advance to the start of the value... + tagToks.discardUntil(TAG_QUOTE) + // Then read until the end. + for tok := range tagToks.until(TAG_QUOTE) { + switch tok.Type { + case TEXT: + value += tok.Literal + } + } + node.Attributes = append(node.Attributes, Attribute{ + Name: name, + Value: value, + }) + next = tagToks.nextOf(TEXT) + } + } + + if node.void { + return node, nil + } + children, err := readContent(toks.until(TAG_END_OPEN)) + if err != nil { + return nil, err + } + node.Children = children + return node, nil +} + +func Parse(r io.Reader) (any, error) { + doc := new(Document) + err := doc.read(Tokenize(r)) + return doc, err +} diff --git a/web/htmlt/parse/parse_test.go b/web/htmlt/parse/parse_test.go new file mode 100644 index 0000000..b9eae1e --- /dev/null +++ b/web/htmlt/parse/parse_test.go @@ -0,0 +1,80 @@ +package parse + +import ( + "fmt" + "slices" + "strings" + "testing" +) + +func TestParseBasic(t *testing.T) { + testStrings := map[string]string{ + "hello": "Hello, World!", + "template": "Hello, {{ `template` }}!", + "html": "

Hello, HTML!


", + "html+template": "{{ if .condition }}

{{- .text -}}

{{ end }}", + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} + +func TestParseHTML(t *testing.T) { + testStrings := map[string]string{ + "void": "

Hello


World

", + "attrs": `
`, + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} +func TestParseTemplate(t *testing.T) { + testStrings := map[string]string{ + "if": "{{ if .condition }}Hello{{ end }}", + "if-else": "{{ if .condition }}Hello{{ else }}World{{ end }}", + "if-elif": "{{ if .condition }}Hello{{ else if .other.condition }}World{{ end }}", + "if-elif-else": "{{ if .condition }}One{{ else if .other.condition }}Two{{ else }}Three{{ end }}", + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} + +func TestParseComplex(t *testing.T) { + testStrings := map[string]string{ + "template-attr": ``, + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + toks := slices.Collect(Tokenize(strings.NewReader(val)).seq()) + fmt.Println(toks) + doc, err := Parse(strings.NewReader(val)) + t.Log(val) + if err != nil { + t.Fatal(err) + } + t.Log(doc) + }) + } +} diff --git a/web/htmlt/parse/text.go b/web/htmlt/parse/text.go new file mode 100644 index 0000000..697cf29 --- /dev/null +++ b/web/htmlt/parse/text.go @@ -0,0 +1,29 @@ +package parse + +import ( + "fmt" + "regexp" + "strings" +) + +var ( + HtmlNameRegexp = regexp.MustCompile("^[a-zA-Z][a-zA-Z0-9]*(?:-[a-zA-Z][a-zA-Z0-9]*)*$") +) + +func IsHtmlName(text string) bool { + return HtmlNameRegexp.MatchString(text) +} + +func CutQuotes(text string) (string, bool) { + preQuote, preOk := strings.CutPrefix(text, `"`) + postQuote, postOk := strings.CutSuffix(preQuote, `"`) + return postQuote, preOk && postOk +} + +func JoinAny(vals []any, sep string) string { + valsStr := make([]string, len(vals)) + for i, val := range vals { + valsStr[i] = fmt.Sprintf("%s", val) + } + return strings.Join(valsStr, sep) +} diff --git a/web/htmlt/parse/tokens.go 
b/web/htmlt/parse/tokens.go new file mode 100644 index 0000000..791f696 --- /dev/null +++ b/web/htmlt/parse/tokens.go @@ -0,0 +1,360 @@ +package parse + +import ( + "bufio" + "errors" + "fmt" + "io" + "strings" +) + +type TokenState int + +const ( + STATE_CONTENT = TokenState(iota) + STATE_TEMPLATE + STATE_TAG +) + +func (ts TokenState) String() string { + switch ts { + case STATE_CONTENT: + return "STATE_CONTENT" + case STATE_TEMPLATE: + return "STATE_TEMPLATE" + case STATE_TAG: + return "STATE_TAG" + default: + return "STATE_UNKNOWN" + } +} + +type TokenType int + +const ( + ERROR = TokenType(iota) + EOF + TEXT + WHITESPACE + TEMPLATE_OPEN // {{ + TEMPLATE_CLOSE // }} + TEMPLATE_KEYWORD // range, with, else, if, end + TAG_OPEN // < + TAG_END_OPEN // + TAG_VOID_CLOSE // /> + TAG_EQ // = + TAG_QUOTE // " +) + +func (tt TokenType) String() string { + switch tt { + case ERROR: + return "ERROR" + case EOF: + return "EOF" + case TEXT: + return "TEXT" + case WHITESPACE: + return "S" + case TEMPLATE_OPEN: + return "TEMPLATE_OPEN" + case TEMPLATE_CLOSE: + return "TEMPLATE_CLOSE" + case TEMPLATE_KEYWORD: + return "TEMPLATE_KEYWORD" + case TAG_OPEN: + return "TAG_OPEN" + case TAG_END_OPEN: + return "TAG_END_OPEN" + case TAG_CLOSE: + return "TAG_CLOSE" + case TAG_EQ: + return "TAG_EQ" + case TAG_QUOTE: + return "TAG_QUOTE" + default: + return "UNKNOWN" + } +} + +const ( + whitespace = " \n\t\r\f\b" +) + +type Token struct { + Type TokenType + Literal string +} + +func (t *Token) String() string { + return fmt.Sprintf("%s(%s)", t.Type, t.Literal) +} + +type tokenizer struct { + r *bufio.Reader + state TokenState + nextToken *Token + err error +} + +// peek returns the next i characters, and "true" if i characters were found. 
+func (t *tokenizer) peek(i int) (string, bool) { + x, err := t.r.Peek(i) + if err != nil { + return string(x), false + } + return string(x), true +} + +func (t *tokenizer) advance(i int) { + t.r.Discard(i) +} + +func (t *tokenizer) nextContent() (*Token, error) { + acc := make([]rune, 100) + cursor := 0 + accumulate := func(r rune) { + if cursor >= len(acc) { + acc = append(acc, make([]rune, 100)...) + } + acc[cursor] = r + cursor++ + } + for { + if token, ok := t.peek(3); ok { + switch token { + case "{{-": + out := &Token{TEXT, string(acc[:cursor])} + t.state = STATE_TEMPLATE + return out, nil + } + } + if token, ok := t.peek(2); ok { + switch token { + case "{{": + out := &Token{TEXT, string(acc[:cursor])} + t.state = STATE_TEMPLATE + return out, nil + case " 0 { + return &Token{TEXT, string(acc[:cursor])}, nil + } else { + return &Token{EOF, ""}, nil + } + } else if err != nil { + return nil, err + } + accumulate(r) + } +} +func (t *tokenizer) nextTemplate() (*Token, error) { + acc := make([]rune, 10) + cursor := 0 + accumulate := func(r rune) { + if cursor >= len(acc) { + acc = append(acc, make([]rune, 10)...) 
+ } + acc[cursor] = r + cursor++ + } + for { + if token, ok := t.peek(5); ok { + switch token { + case "range": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(5) + return out, nil + } + } + if token, ok := t.peek(4); ok { + switch token { + case "with": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(4) + return out, nil + case "else": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(4) + return out, nil + } + } + if token, ok := t.peek(3); ok { + switch token { + case "end": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(3) + return out, nil + case "{{-": + out := &Token{TEMPLATE_OPEN, token} + t.advance(3) + return out, nil + case "-}}": + out := &Token{TEMPLATE_CLOSE, token} + t.advance(3) + t.state = STATE_CONTENT + return out, nil + } + } + if token, ok := t.peek(2); ok { + switch token { + case "if": + out := &Token{TEMPLATE_KEYWORD, token} + t.advance(2) + return out, nil + case "{{": + out := &Token{TEMPLATE_OPEN, token} + t.advance(2) + return out, nil + case "}}": + out := &Token{TEMPLATE_CLOSE, token} + t.advance(2) + t.state = STATE_CONTENT + return out, nil + } + } + r, _, err := t.r.ReadRune() + if errors.Is(err, io.EOF) { + return &Token{EOF, ""}, nil + } else if err != nil { + return nil, err + } + if strings.ContainsRune(whitespace, r) { + return &Token{TEXT, string(acc[:cursor])}, nil + } else { + accumulate(r) + } + } +} +func (t *tokenizer) nextTag() (*Token, error) { + acc := make([]rune, 10) + cursor := 0 + accumulate := func(r rune) { + if cursor >= len(acc) { + acc = append(acc, make([]rune, 10)...) 
+ } + acc[cursor] = r + cursor++ + } + for { + if token, ok := t.peek(2); ok { + switch token { + case "": + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_VOID_CLOSE, token} + t.advance(2) + t.state = STATE_CONTENT + return out, nil + } + } + if token, ok := t.peek(1); ok { + switch token { + case "<": + out := &Token{TAG_OPEN, token} + t.advance(1) + return out, nil + case "=": + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_EQ, token} + t.advance(1) + return out, nil + case `"`: + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_QUOTE, token} + t.advance(1) + return out, nil + case ">": + out := &Token{TEXT, string(acc[:cursor])} + t.nextToken = &Token{TAG_CLOSE, token} + t.advance(1) + t.state = STATE_CONTENT + return out, nil + } + } + r, _, err := t.r.ReadRune() + if errors.Is(err, io.EOF) { + return &Token{EOF, ""}, nil + } else if err != nil { + return nil, err + } + if strings.ContainsRune(whitespace, r) { + return &Token{TEXT, string(acc[:cursor])}, nil + } else { + accumulate(r) + } + } +} + +func (t *tokenizer) next2() (*Token, error) { + var next *Token + var err error + for next == nil && err == nil { + if t.nextToken != nil { + next := t.nextToken + t.nextToken = nil + return next, nil + } + switch t.state { + case STATE_CONTENT: + next, err = t.nextContent() + case STATE_TEMPLATE: + next, err = t.nextTemplate() + case STATE_TAG: + next, err = t.nextTag() + default: + return nil, fmt.Errorf("unknown state %s", t.state) + } + if next.Type == TEXT && next.Literal == "" { + next = nil + } + } + return next, err +} + +// all returns an iterator over all tokens produced by the tokenizer. +// Stops iterating on EOF or error. 
+func (t *tokenizer) all() tokens { + return func(yield func(*Token) bool) { + for { + if t.err != nil { + return + } + tok, err := t.next2() + if err != nil { + yield(&Token{ERROR, t.err.Error()}) + break + } + if tok.Type == EOF { + t.err = io.EOF + } + if !yield(tok) { + break + } + } + } +} + +func Tokenize(r io.Reader) tokens { + tkns := new(tokenizer) + tkns.r = bufio.NewReader(r) + return tkns.all() +} diff --git a/web/htmlt/parse/tokens_test.go b/web/htmlt/parse/tokens_test.go new file mode 100644 index 0000000..f29e6ee --- /dev/null +++ b/web/htmlt/parse/tokens_test.go @@ -0,0 +1,23 @@ +package parse + +import ( + "slices" + "strings" + "testing" +) + +func TestTokenize(t *testing.T) { + testStrings := map[string]string{ + "hello": "Hello, World!", + "template": "Hello, {{ `template` }}!", + "html": "

Hello, HTML!


", + "html+template": "{{ if .condition }}

{{- .text -}}

{{ end }}", + } + for name, val := range testStrings { + t.Run(name, func(t *testing.T) { + toks := slices.Collect(Tokenize(strings.NewReader(val)).seq()) + t.Log(val) + t.Log(toks) + }) + } +} -- 2.39.5