From 5d3bb896cec0f1d284182e15b5efdbf309059f1d Mon Sep 17 00:00:00 2001 From: early Date: Wed, 28 Aug 2024 23:09:53 -0600 Subject: [PATCH] modules are being a pain in the ass --- README.md | 9 +- go.mod | 9 +- go.sum | 4 +- html/LICENSE | 27 + html/PATENTS | 22 + html/README.md | 14 + html/atom/atom.go | 78 + html/atom/atom_test.go | 109 + html/atom/gen.go | 711 +++++ html/atom/table.go | 783 +++++ html/atom/table_test.go | 376 +++ html/charset/charset.go | 257 ++ html/charset/charset_test.go | 237 ++ html/charset/testdata/HTTP-charset.html | 48 + html/charset/testdata/HTTP-vs-UTF-8-BOM.html | 48 + .../testdata/HTTP-vs-meta-charset.html | 49 + .../testdata/HTTP-vs-meta-content.html | 49 + .../testdata/No-encoding-declaration.html | 47 + html/charset/testdata/README | 9 + html/charset/testdata/UTF-16BE-BOM.html | Bin 0 -> 2670 bytes html/charset/testdata/UTF-16LE-BOM.html | Bin 0 -> 2682 bytes .../testdata/UTF-8-BOM-vs-meta-charset.html | 49 + .../testdata/UTF-8-BOM-vs-meta-content.html | 48 + .../testdata/meta-charset-attribute.html | 48 + .../testdata/meta-content-attribute.html | 48 + html/comment_test.go | 291 ++ html/const.go | 111 + html/doc.go | 127 + html/doctype.go | 156 + html/entity.go | 2253 ++++++++++++++ html/entity_test.go | 29 + html/escape.go | 339 +++ html/escape_test.go | 97 + html/example_test.go | 40 + html/foreign.go | 222 ++ html/node.go | 225 ++ html/node_test.go | 146 + html/parse.go | 2454 ++++++++++++++++ html/parse_test.go | 490 ++++ html/render.go | 293 ++ html/render_test.go | 207 ++ .../issue_30600_parse_panics_in_cell_mode.dat | 12 + ...e_30961_error_nested_unknown_tag_types.dat | 11 + html/testdata/go/raw_tags_to_be_ignored.dat | 97 + html/testdata/go/select.dat | 12 + html/testdata/go/template.dat | 64 + html/testdata/go1.html | 2237 ++++++++++++++ html/testdata/webkit/README | 28 + html/testdata/webkit/adoption01.dat | 354 +++ html/testdata/webkit/adoption02.dat | 39 + html/testdata/webkit/blocks.dat | 719 +++++ html/testdata/webkit/comments01.dat | 224 ++ html/testdata/webkit/doctype01.dat | 470 +++ html/testdata/webkit/domjs-unsafe.dat | Bin 0 -> 10356 bytes html/testdata/webkit/entities01.dat | 943 ++++++ html/testdata/webkit/entities02.dat | 309 ++ html/testdata/webkit/foreign-fragment.dat | 559 ++++ html/testdata/webkit/html5test-com.dat | 302 ++ html/testdata/webkit/inbody01.dat | 54 + html/testdata/webkit/isindex.dat | 49 + html/testdata/webkit/main-element.dat | 46 + html/testdata/webkit/math.dat | 81 + html/testdata/webkit/menuitem-element.dat | 257 ++ .../testdata/webkit/namespace-sensitivity.dat | 16 + html/testdata/webkit/noscript01.dat | 237 ++ ...pending-spec-changes-plain-text-unsafe.dat | Bin 0 -> 927 bytes html/testdata/webkit/pending-spec-changes.dat | 46 + html/testdata/webkit/plain-text-unsafe.dat | Bin 0 -> 9388 bytes html/testdata/webkit/ruby.dat | 301 ++ html/testdata/webkit/scriptdata01.dat | 385 +++ html/testdata/webkit/scripted/adoption01.dat | 16 + html/testdata/webkit/scripted/ark.dat | 27 + html/testdata/webkit/scripted/webkit01.dat | 30 + html/testdata/webkit/svg.dat | 81 + html/testdata/webkit/tables01.dat | 286 ++ html/testdata/webkit/template.dat | 1604 ++++++++++ html/testdata/webkit/tests1.dat | 1988 +++++++++++++ html/testdata/webkit/tests10.dat | 849 ++++++ html/testdata/webkit/tests11.dat | 523 ++++ html/testdata/webkit/tests12.dat | 62 + html/testdata/webkit/tests14.dat | 75 + html/testdata/webkit/tests15.dat | 216 ++ html/testdata/webkit/tests16.dat | 2604 +++++++++++++++++ html/testdata/webkit/tests17.dat | 179 ++ html/testdata/webkit/tests18.dat | 534 ++++ html/testdata/webkit/tests19.dat | 1454 +++++++++ html/testdata/webkit/tests2.dat | 821 ++++++ html/testdata/webkit/tests20.dat | 582 ++++ html/testdata/webkit/tests21.dat | 325 ++ html/testdata/webkit/tests22.dat | 190 ++ html/testdata/webkit/tests23.dat | 168 ++ html/testdata/webkit/tests24.dat | 79 + html/testdata/webkit/tests25.dat | 288 ++ html/testdata/webkit/tests26.dat | 393 +++ html/testdata/webkit/tests3.dat | 305 ++ html/testdata/webkit/tests4.dat | 58 + html/testdata/webkit/tests5.dat | 210 ++ html/testdata/webkit/tests6.dat | 663 +++++ html/testdata/webkit/tests7.dat | 418 +++ html/testdata/webkit/tests8.dat | 162 + html/testdata/webkit/tests9.dat | 472 +++ html/testdata/webkit/tests_innerHTML_1.dat | 887 ++++++ html/testdata/webkit/tricky01.dat | 336 +++ html/testdata/webkit/webkit01.dat | 755 +++++ html/testdata/webkit/webkit02.dat | 303 ++ html/token.go | 1272 ++++++++ html/token_test.go | 917 ++++++ htmltree/attrs.go | 2 +- htmltree/prettify.go | 2 +- internal/compile/compile.go | 2 +- internal/compile/component.go | 4 +- internal/compile/template.go | 2 +- 112 files changed, 38541 insertions(+), 14 deletions(-) create mode 100644 html/LICENSE create mode 100644 html/PATENTS create mode 100644 html/README.md create mode 100644 html/atom/atom.go create mode 100644 html/atom/atom_test.go create mode 100644 html/atom/gen.go create mode 100644 html/atom/table.go create mode 100644 html/atom/table_test.go create mode 100644 html/charset/charset.go create mode 100644 html/charset/charset_test.go create mode 100644 html/charset/testdata/HTTP-charset.html create mode 100644 html/charset/testdata/HTTP-vs-UTF-8-BOM.html create mode 100644 html/charset/testdata/HTTP-vs-meta-charset.html create mode 100644 html/charset/testdata/HTTP-vs-meta-content.html create mode 100644 html/charset/testdata/No-encoding-declaration.html create mode 100644 html/charset/testdata/README create mode 100644 html/charset/testdata/UTF-16BE-BOM.html create mode 100644 html/charset/testdata/UTF-16LE-BOM.html create mode 100644 html/charset/testdata/UTF-8-BOM-vs-meta-charset.html create mode 100644 html/charset/testdata/UTF-8-BOM-vs-meta-content.html create mode 100644 html/charset/testdata/meta-charset-attribute.html create mode 100644 html/charset/testdata/meta-content-attribute.html create mode 100644 html/comment_test.go create mode 100644 html/const.go create mode 100644 html/doc.go create mode 100644 html/doctype.go create mode 100644 html/entity.go create mode 100644 html/entity_test.go create mode 100644 html/escape.go create mode 100644 html/escape_test.go create mode 100644 html/example_test.go create mode 100644 html/foreign.go create mode 100644 html/node.go create mode 100644 html/node_test.go create mode 100644 html/parse.go create mode 100644 html/parse_test.go create mode 100644 html/render.go create mode 100644 html/render_test.go create mode 100644 html/testdata/go/issue_30600_parse_panics_in_cell_mode.dat create mode 100644 html/testdata/go/issue_30961_error_nested_unknown_tag_types.dat create mode 100644 html/testdata/go/raw_tags_to_be_ignored.dat create mode 100644 html/testdata/go/select.dat create mode 100644 html/testdata/go/template.dat create mode 100644 html/testdata/go1.html create mode 100644 html/testdata/webkit/README create mode 100644 html/testdata/webkit/adoption01.dat create mode 100644 html/testdata/webkit/adoption02.dat create mode 100644 html/testdata/webkit/blocks.dat create mode 100644 html/testdata/webkit/comments01.dat create mode 100644 html/testdata/webkit/doctype01.dat create mode 100644 html/testdata/webkit/domjs-unsafe.dat create mode 100644 html/testdata/webkit/entities01.dat create mode 100644 html/testdata/webkit/entities02.dat create mode 100644 html/testdata/webkit/foreign-fragment.dat create mode 100644 html/testdata/webkit/html5test-com.dat create mode 100644 html/testdata/webkit/inbody01.dat create mode 100644 html/testdata/webkit/isindex.dat create mode 100644 html/testdata/webkit/main-element.dat create mode 100644 html/testdata/webkit/math.dat create mode 100644 html/testdata/webkit/menuitem-element.dat create mode 100644 html/testdata/webkit/namespace-sensitivity.dat create mode 100644 html/testdata/webkit/noscript01.dat create mode 100644 html/testdata/webkit/pending-spec-changes-plain-text-unsafe.dat create mode 100644 html/testdata/webkit/pending-spec-changes.dat create mode 100644 html/testdata/webkit/plain-text-unsafe.dat create mode 100644 html/testdata/webkit/ruby.dat create mode 100644 html/testdata/webkit/scriptdata01.dat create mode 100644 html/testdata/webkit/scripted/adoption01.dat create mode 100644 html/testdata/webkit/scripted/ark.dat create mode 100644 html/testdata/webkit/scripted/webkit01.dat create mode 100644 html/testdata/webkit/svg.dat create mode 100644 html/testdata/webkit/tables01.dat create mode 100644 html/testdata/webkit/template.dat create mode 100644 html/testdata/webkit/tests1.dat create mode 100644 html/testdata/webkit/tests10.dat create mode 100644 html/testdata/webkit/tests11.dat create mode 100644 html/testdata/webkit/tests12.dat create mode 100644 html/testdata/webkit/tests14.dat create mode 100644 html/testdata/webkit/tests15.dat create mode 100644 html/testdata/webkit/tests16.dat create mode 100644 html/testdata/webkit/tests17.dat create mode 100644 html/testdata/webkit/tests18.dat create mode 100644 html/testdata/webkit/tests19.dat create mode 100644 html/testdata/webkit/tests2.dat create mode 100644 html/testdata/webkit/tests20.dat create mode 100644 html/testdata/webkit/tests21.dat create mode 100644 html/testdata/webkit/tests22.dat create mode 100644 html/testdata/webkit/tests23.dat create mode 100644 html/testdata/webkit/tests24.dat create mode 100644 html/testdata/webkit/tests25.dat create mode 100644 html/testdata/webkit/tests26.dat create mode 100644 html/testdata/webkit/tests3.dat create mode 100644 html/testdata/webkit/tests4.dat create mode 100644 html/testdata/webkit/tests5.dat create mode 100644 html/testdata/webkit/tests6.dat create mode 100644 html/testdata/webkit/tests7.dat create mode 100644 html/testdata/webkit/tests8.dat create mode 100644 html/testdata/webkit/tests9.dat create mode 100644 html/testdata/webkit/tests_innerHTML_1.dat create mode 100644 html/testdata/webkit/tricky01.dat create mode 100644 html/testdata/webkit/webkit01.dat create mode 100644 html/testdata/webkit/webkit02.dat create mode 100644 html/token.go create mode 100644 html/token_test.go diff --git a/README.md b/README.md index 7cd3473..df9e1df 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ the Web Components standard. It aims to be usable for anything from static HTML websites to complex applications with a mix of server and client-side behavior. Author: Early N. -License: MIT +License: MIT, BSD ## Installation @@ -22,3 +22,10 @@ not promote foundational knowledge of the web platform. Today focuses on: - Expanding on Go's robust template system with reusable chunks of HTML ## How? + +## License + +The portions of `today-engine` owned by Early N. are under the MIT license. Some +code sections have a different license: + +- `package html` is under a BSD-3-Clause style license from Google. diff --git a/go.mod b/go.mod index b1c251b..fb5c1b4 100644 --- a/go.mod +++ b/go.mod @@ -2,8 +2,7 @@ module git.earlybird.gay/today-engine go 1.22.4 -require golang.org/x/net v0.27.0 - -require git.earlybird.gay/today-app v0.0.0-20240813010007-675f08ae35b1 - -replace golang.org/x/net => git.earlybird.gay/x/net v0.28.1 +require ( + git.earlybird.gay/today-app v0.0.0-20240813010007-675f08ae35b1 + golang.org/x/text v0.17.0 +) diff --git a/go.sum b/go.sum index 1da235d..ca1ee0b 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,4 @@ git.earlybird.gay/today-app v0.0.0-20240813010007-675f08ae35b1 h1:Fbz2uRIWK6CR+jcNN0KQrna4u/kYZccn2obcla7dPfQ= git.earlybird.gay/today-app v0.0.0-20240813010007-675f08ae35b1/go.mod h1:AxsoC2ERffYriW60C1vdV34ew6JPKxllG9mHOTs128I= -git.earlybird.gay/x/net v0.28.1 h1:KgeNTRFt7fJfuJcwoyeNf3g/sLQfi/+647DHslFkrv4= -git.earlybird.gay/x/net v0.28.1/go.mod h1:K1yf3vIKjz6W4AnxgoJ67k6p0Aa5x9BcCuE/GdiZ0FA= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= diff --git a/html/LICENSE b/html/LICENSE new file mode 100644 index 0000000..2a7cf70 --- /dev/null +++ b/html/LICENSE @@ -0,0 +1,27 @@ +Copyright 2009 The Go Authors. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google LLC nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/html/PATENTS b/html/PATENTS new file mode 100644 index 0000000..7330990 --- /dev/null +++ b/html/PATENTS @@ -0,0 +1,22 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the Go project. + +Google hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) +patent license to make, have made, use, offer to sell, sell, import, +transfer and otherwise run, modify and propagate the contents of this +implementation of Go, where such license applies only to those patent +claims, both currently owned or controlled by Google and acquired in +the future, licensable by Google that are necessarily infringed by this +implementation of Go. This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/html/README.md b/html/README.md new file mode 100644 index 0000000..ad72cd7 --- /dev/null +++ b/html/README.md @@ -0,0 +1,14 @@ +# HTML + +## This is a fork. + +This repo forks `golang.org/x/net/html` and makes limited changes: + +- Foster parenting is disabled. +- Attribute keys are not automatically set to lowercase. + +These changes are made to support Today's use of `x/net/html` to parse and +re-render Go templates. They have the intended side effect of allowing invalid +HTML to be rendered, which is almost definitely not what you want. + +Please see the LICENSE and PATENTS file for this directory. diff --git a/html/atom/atom.go b/html/atom/atom.go new file mode 100644 index 0000000..b4439bc --- /dev/null +++ b/html/atom/atom.go @@ -0,0 +1,78 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package atom provides integer codes (also known as atoms) for a fixed set of +// frequently occurring HTML strings: tag names and attribute keys such as "p" +// and "id". +// +// Sharing an atom's name between all elements with the same tag can result in +// fewer string allocations when tokenizing and parsing HTML. Integer +// comparisons are also generally faster than string comparisons. +// +// The value of an atom's particular code is not guaranteed to stay the same +// between versions of this package. Neither is any ordering guaranteed: +// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to +// be dense. The only guarantees are that e.g. looking up "div" will yield +// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0. +package atom // import "git.earlybird.gay/today-engine/html/atom" + +// Atom is an integer code for a string. The zero value maps to "". +type Atom uint32 + +// String returns the atom's name. +func (a Atom) String() string { + start := uint32(a >> 8) + n := uint32(a & 0xff) + if start+n > uint32(len(atomText)) { + return "" + } + return atomText[start : start+n] +} + +func (a Atom) string() string { + return atomText[a>>8 : a>>8+a&0xff] +} + +// fnv computes the FNV hash with an arbitrary starting value h. +func fnv(h uint32, s []byte) uint32 { + for i := range s { + h ^= uint32(s[i]) + h *= 16777619 + } + return h +} + +func match(s string, t []byte) bool { + for i, c := range t { + if s[i] != c { + return false + } + } + return true +} + +// Lookup returns the atom whose name is s. It returns zero if there is no +// such atom. The lookup is case sensitive. +func Lookup(s []byte) Atom { + if len(s) == 0 || len(s) > maxAtomLen { + return 0 + } + h := fnv(hash0, s) + if a := table[h&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) { + return a + } + if a := table[(h>>16)&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) { + return a + } + return 0 +} + +// String returns a string whose contents are equal to s. In that sense, it is +// equivalent to string(s) but may be more efficient. +func String(s []byte) string { + if a := Lookup(s); a != 0 { + return a.String() + } + return string(s) +} diff --git a/html/atom/atom_test.go b/html/atom/atom_test.go new file mode 100644 index 0000000..6e33704 --- /dev/null +++ b/html/atom/atom_test.go @@ -0,0 +1,109 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package atom + +import ( + "sort" + "testing" +) + +func TestKnown(t *testing.T) { + for _, s := range testAtomList { + if atom := Lookup([]byte(s)); atom.String() != s { + t.Errorf("Lookup(%q) = %#x (%q)", s, uint32(atom), atom.String()) + } + } +} + +func TestHits(t *testing.T) { + for _, a := range table { + if a == 0 { + continue + } + got := Lookup([]byte(a.String())) + if got != a { + t.Errorf("Lookup(%q) = %#x, want %#x", a.String(), uint32(got), uint32(a)) + } + } +} + +func TestMisses(t *testing.T) { + testCases := []string{ + "", + "\x00", + "\xff", + "A", + "DIV", + "Div", + "dIV", + "aa", + "a\x00", + "ab", + "abb", + "abbr0", + "abbr ", + " abbr", + " a", + "acceptcharset", + "acceptCharset", + "accept_charset", + "h0", + "h1h2", + "h7", + "onClick", + "λ", + // The following string has the same hash (0xa1d7fab7) as "onmouseover". + "\x00\x00\x00\x00\x00\x50\x18\xae\x38\xd0\xb7", + } + for _, tc := range testCases { + got := Lookup([]byte(tc)) + if got != 0 { + t.Errorf("Lookup(%q): got %d, want 0", tc, got) + } + } +} + +func TestForeignObject(t *testing.T) { + const ( + afo = Foreignobject + afO = ForeignObject + sfo = "foreignobject" + sfO = "foreignObject" + ) + if got := Lookup([]byte(sfo)); got != afo { + t.Errorf("Lookup(%q): got %#v, want %#v", sfo, got, afo) + } + if got := Lookup([]byte(sfO)); got != afO { + t.Errorf("Lookup(%q): got %#v, want %#v", sfO, got, afO) + } + if got := afo.String(); got != sfo { + t.Errorf("Atom(%#v).String(): got %q, want %q", afo, got, sfo) + } + if got := afO.String(); got != sfO { + t.Errorf("Atom(%#v).String(): got %q, want %q", afO, got, sfO) + } +} + +func BenchmarkLookup(b *testing.B) { + sortedTable := make([]string, 0, len(table)) + for _, a := range table { + if a != 0 { + sortedTable = append(sortedTable, a.String()) + } + } + sort.Strings(sortedTable) + + x := make([][]byte, 1000) + for i := range x { + x[i] = []byte(sortedTable[i%len(sortedTable)]) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, s := range x { + Lookup(s) + } + } +} diff --git a/html/atom/gen.go b/html/atom/gen.go new file mode 100644 index 0000000..1e249d1 --- /dev/null +++ b/html/atom/gen.go @@ -0,0 +1,711 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ignore + +//go:generate go run gen.go +//go:generate go run gen.go -test + +package main + +import ( + "bytes" + "flag" + "fmt" + "go/format" + "math/rand" + "os" + "sort" + "strings" +) + +// identifier converts s to a Go exported identifier. +// It converts "div" to "Div" and "accept-charset" to "AcceptCharset". +func identifier(s string) string { + b := make([]byte, 0, len(s)) + cap := true + for _, c := range s { + if c == '-' { + cap = true + continue + } + if cap && 'a' <= c && c <= 'z' { + c -= 'a' - 'A' + } + cap = false + b = append(b, byte(c)) + } + return string(b) +} + +var test = flag.Bool("test", false, "generate table_test.go") + +func genFile(name string, buf *bytes.Buffer) { + b, err := format.Source(buf.Bytes()) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + if err := os.WriteFile(name, b, 0644); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} + +func main() { + flag.Parse() + + var all []string + all = append(all, elements...) + all = append(all, attributes...) + all = append(all, eventHandlers...) + all = append(all, extra...) + sort.Strings(all) + + // uniq - lists have dups + w := 0 + for _, s := range all { + if w == 0 || all[w-1] != s { + all[w] = s + w++ + } + } + all = all[:w] + + if *test { + var buf bytes.Buffer + fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n") + fmt.Fprintln(&buf, "//go:generate go run gen.go -test\n") + fmt.Fprintln(&buf, "package atom\n") + fmt.Fprintln(&buf, "var testAtomList = []string{") + for _, s := range all { + fmt.Fprintf(&buf, "\t%q,\n", s) + } + fmt.Fprintln(&buf, "}") + + genFile("table_test.go", &buf) + return + } + + // Find hash that minimizes table size. + var best *table + for i := 0; i < 1000000; i++ { + if best != nil && 1<<(best.k-1) < len(all) { + break + } + h := rand.Uint32() + for k := uint(0); k <= 16; k++ { + if best != nil && k >= best.k { + break + } + var t table + if t.init(h, k, all) { + best = &t + break + } + } + } + if best == nil { + fmt.Fprintf(os.Stderr, "failed to construct string table\n") + os.Exit(1) + } + + // Lay out strings, using overlaps when possible. + layout := append([]string{}, all...) + + // Remove strings that are substrings of other strings + for changed := true; changed; { + changed = false + for i, s := range layout { + if s == "" { + continue + } + for j, t := range layout { + if i != j && t != "" && strings.Contains(s, t) { + changed = true + layout[j] = "" + } + } + } + } + + // Join strings where one suffix matches another prefix. + for { + // Find best i, j, k such that layout[i][len-k:] == layout[j][:k], + // maximizing overlap length k. + besti := -1 + bestj := -1 + bestk := 0 + for i, s := range layout { + if s == "" { + continue + } + for j, t := range layout { + if i == j { + continue + } + for k := bestk + 1; k <= len(s) && k <= len(t); k++ { + if s[len(s)-k:] == t[:k] { + besti = i + bestj = j + bestk = k + } + } + } + } + if bestk > 0 { + layout[besti] += layout[bestj][bestk:] + layout[bestj] = "" + continue + } + break + } + + text := strings.Join(layout, "") + + atom := map[string]uint32{} + for _, s := range all { + off := strings.Index(text, s) + if off < 0 { + panic("lost string " + s) + } + atom[s] = uint32(off<<8 | len(s)) + } + + var buf bytes.Buffer + // Generate the Go code. + fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n") + fmt.Fprintln(&buf, "//go:generate go run gen.go\n") + fmt.Fprintln(&buf, "package atom\n\nconst (") + + // compute max len + maxLen := 0 + for _, s := range all { + if maxLen < len(s) { + maxLen = len(s) + } + fmt.Fprintf(&buf, "\t%s Atom = %#x\n", identifier(s), atom[s]) + } + fmt.Fprintln(&buf, ")\n") + + fmt.Fprintf(&buf, "const hash0 = %#x\n\n", best.h0) + fmt.Fprintf(&buf, "const maxAtomLen = %d\n\n", maxLen) + + fmt.Fprintf(&buf, "var table = [1<<%d]Atom{\n", best.k) + for i, s := range best.tab { + if s == "" { + continue + } + fmt.Fprintf(&buf, "\t%#x: %#x, // %s\n", i, atom[s], s) + } + fmt.Fprintf(&buf, "}\n") + datasize := (1 << best.k) * 4 + + fmt.Fprintln(&buf, "const atomText =") + textsize := len(text) + for len(text) > 60 { + fmt.Fprintf(&buf, "\t%q +\n", text[:60]) + text = text[60:] + } + fmt.Fprintf(&buf, "\t%q\n\n", text) + + genFile("table.go", &buf) + + fmt.Fprintf(os.Stdout, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize) +} + +type byLen []string + +func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) } +func (x byLen) Swap(i, j int) { x[i], x[j] = x[j], x[i] } +func (x byLen) Len() int { return len(x) } + +// fnv computes the FNV hash with an arbitrary starting value h. +func fnv(h uint32, s string) uint32 { + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + return h +} + +// A table represents an attempt at constructing the lookup table. +// The lookup table uses cuckoo hashing, meaning that each string +// can be found in one of two positions. +type table struct { + h0 uint32 + k uint + mask uint32 + tab []string +} + +// hash returns the two hashes for s. +func (t *table) hash(s string) (h1, h2 uint32) { + h := fnv(t.h0, s) + h1 = h & t.mask + h2 = (h >> 16) & t.mask + return +} + +// init initializes the table with the given parameters. +// h0 is the initial hash value, +// k is the number of bits of hash value to use, and +// x is the list of strings to store in the table. +// init returns false if the table cannot be constructed. +func (t *table) init(h0 uint32, k uint, x []string) bool { + t.h0 = h0 + t.k = k + t.tab = make([]string, 1< len(t.tab) { + return false + } + s := t.tab[i] + h1, h2 := t.hash(s) + j := h1 + h2 - i + if t.tab[j] != "" && !t.push(j, depth+1) { + return false + } + t.tab[j] = s + return true +} + +// The lists of element names and attribute keys were taken from +// https://html.spec.whatwg.org/multipage/indices.html#index +// as of the "HTML Living Standard - Last Updated 16 April 2018" version. + +// "command", "keygen" and "menuitem" have been removed from the spec, +// but are kept here for backwards compatibility. +var elements = []string{ + "a", + "abbr", + "address", + "area", + "article", + "aside", + "audio", + "b", + "base", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "command", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "embed", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hgroup", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "kbd", + "keygen", + "label", + "legend", + "li", + "link", + "main", + "map", + "mark", + "menu", + "menuitem", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "slot", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", +} + +// https://html.spec.whatwg.org/multipage/indices.html#attributes-3 +// +// "challenge", "command", "contextmenu", "dropzone", "icon", "keytype", "mediagroup", +// "radiogroup", "spellcheck", "scoped", "seamless", "sortable" and "sorted" have been removed from the spec, +// but are kept here for backwards compatibility. +var attributes = []string{ + "abbr", + "accept", + "accept-charset", + "accesskey", + "action", + "allowfullscreen", + "allowpaymentrequest", + "allowusermedia", + "alt", + "as", + "async", + "autocomplete", + "autofocus", + "autoplay", + "challenge", + "charset", + "checked", + "cite", + "class", + "color", + "cols", + "colspan", + "command", + "content", + "contenteditable", + "contextmenu", + "controls", + "coords", + "crossorigin", + "data", + "datetime", + "default", + "defer", + "dir", + "dirname", + "disabled", + "download", + "draggable", + "dropzone", + "enctype", + "for", + "form", + "formaction", + "formenctype", + "formmethod", + "formnovalidate", + "formtarget", + "headers", + "height", + "hidden", + "high", + "href", + "hreflang", + "http-equiv", + "icon", + "id", + "inputmode", + "integrity", + "is", + "ismap", + "itemid", + "itemprop", + "itemref", + "itemscope", + "itemtype", + "keytype", + "kind", + "label", + "lang", + "list", + "loop", + "low", + "manifest", + "max", + "maxlength", + "media", + "mediagroup", + "method", + "min", + "minlength", + "multiple", + "muted", + "name", + "nomodule", + "nonce", + "novalidate", + "open", + "optimum", + "pattern", + "ping", + "placeholder", + "playsinline", + "poster", + "preload", + "radiogroup", + "readonly", + "referrerpolicy", + "rel", + "required", + "reversed", + "rows", + "rowspan", + "sandbox", + "spellcheck", + "scope", + "scoped", + "seamless", + "selected", + "shape", + "size", + "sizes", + "sortable", + "sorted", + "slot", + "span", + "spellcheck", + "src", + "srcdoc", + "srclang", + "srcset", + "start", + "step", + "style", + "tabindex", + "target", + "title", + "translate", + "type", + "typemustmatch", + "updateviacache", + "usemap", + "value", + "width", + "workertype", + "wrap", +} + +// "onautocomplete", "onautocompleteerror", "onmousewheel", +// "onshow" and "onsort" have been removed from the spec, +// but are kept here for backwards compatibility. +var eventHandlers = []string{ + "onabort", + "onautocomplete", + "onautocompleteerror", + "onauxclick", + "onafterprint", + "onbeforeprint", + "onbeforeunload", + "onblur", + "oncancel", + "oncanplay", + "oncanplaythrough", + "onchange", + "onclick", + "onclose", + "oncontextmenu", + "oncopy", + "oncuechange", + "oncut", + "ondblclick", + "ondrag", + "ondragend", + "ondragenter", + "ondragexit", + "ondragleave", + "ondragover", + "ondragstart", + "ondrop", + "ondurationchange", + "onemptied", + "onended", + "onerror", + "onfocus", + "onhashchange", + "oninput", + "oninvalid", + "onkeydown", + "onkeypress", + "onkeyup", + "onlanguagechange", + "onload", + "onloadeddata", + "onloadedmetadata", + "onloadend", + "onloadstart", + "onmessage", + "onmessageerror", + "onmousedown", + "onmouseenter", + "onmouseleave", + "onmousemove", + "onmouseout", + "onmouseover", + "onmouseup", + "onmousewheel", + "onwheel", + "onoffline", + "ononline", + "onpagehide", + "onpageshow", + "onpaste", + "onpause", + "onplay", + "onplaying", + "onpopstate", + "onprogress", + "onratechange", + "onreset", + "onresize", + "onrejectionhandled", + "onscroll", + "onsecuritypolicyviolation", + "onseeked", + "onseeking", + "onselect", + "onshow", + "onsort", + "onstalled", + "onstorage", + "onsubmit", + "onsuspend", + "ontimeupdate", + "ontoggle", + "onunhandledrejection", + "onunload", + "onvolumechange", + "onwaiting", +} + +// extra are ad-hoc values not covered by any of the lists above. +var extra = []string{ + "acronym", + "align", + "annotation", + "annotation-xml", + "applet", + "basefont", + "bgsound", + "big", + "blink", + "center", + "color", + "desc", + "face", + "font", + "foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive. + "foreignobject", + "frame", + "frameset", + "image", + "isindex", // "isindex" has been removed from the spec, but are kept here for backwards compatibility. + "listing", + "malignmark", + "marquee", + "math", + "mglyph", + "mi", + "mn", + "mo", + "ms", + "mtext", + "nobr", + "noembed", + "noframes", + "plaintext", + "prompt", + "public", + "rb", + "rtc", + "spacer", + "strike", + "svg", + "system", + "tt", + "xmp", +} diff --git a/html/atom/table.go b/html/atom/table.go new file mode 100644 index 0000000..2a93886 --- /dev/null +++ b/html/atom/table.go @@ -0,0 +1,783 @@ +// Code generated by go generate gen.go; DO NOT EDIT. + +//go:generate go run gen.go + +package atom + +const ( + A Atom = 0x1 + Abbr Atom = 0x4 + Accept Atom = 0x1a06 + AcceptCharset Atom = 0x1a0e + Accesskey Atom = 0x2c09 + Acronym Atom = 0xaa07 + Action Atom = 0x27206 + Address Atom = 0x6f307 + Align Atom = 0xb105 + Allowfullscreen Atom = 0x2080f + Allowpaymentrequest Atom = 0xc113 + Allowusermedia Atom = 0xdd0e + Alt Atom = 0xf303 + Annotation Atom = 0x1c90a + AnnotationXml Atom = 0x1c90e + Applet Atom = 0x31906 + Area Atom = 0x35604 + Article Atom = 0x3fc07 + As Atom = 0x3c02 + Aside Atom = 0x10705 + Async Atom = 0xff05 + Audio Atom = 0x11505 + Autocomplete Atom = 0x2780c + Autofocus Atom = 0x12109 + Autoplay Atom = 0x13c08 + B Atom = 0x101 + Base Atom = 0x3b04 + Basefont Atom = 0x3b08 + Bdi Atom = 0xba03 + Bdo Atom = 0x14b03 + Bgsound Atom = 0x15e07 + Big Atom = 0x17003 + Blink Atom = 0x17305 + Blockquote Atom = 0x1870a + Body Atom = 0x2804 + Br Atom = 0x202 + Button Atom = 0x19106 + Canvas Atom = 0x10306 + Caption Atom = 0x23107 + Center Atom = 0x22006 + Challenge Atom = 0x29b09 + Charset Atom = 0x2107 + Checked Atom = 0x47907 + Cite Atom = 0x19c04 + Class Atom = 0x56405 + Code Atom = 0x5c504 + Col Atom = 0x1ab03 + Colgroup Atom = 0x1ab08 + Color Atom = 0x1bf05 + Cols Atom = 0x1c404 + Colspan Atom = 0x1c407 + Command Atom = 0x1d707 + Content Atom = 0x58b07 + Contenteditable Atom = 0x58b0f + Contextmenu Atom = 0x3800b + Controls Atom = 0x1de08 + Coords Atom = 0x1ea06 + Crossorigin Atom = 0x1fb0b + Data Atom = 0x4a504 + Datalist Atom = 0x4a508 + Datetime Atom = 0x2b808 + Dd Atom = 0x2d702 + Default Atom = 0x10a07 + Defer Atom = 0x5c705 + Del Atom = 0x45203 + Desc Atom = 0x56104 + Details Atom = 0x7207 + Dfn Atom = 0x8703 + Dialog Atom = 0xbb06 + Dir Atom = 0x9303 + Dirname Atom = 0x9307 + Disabled Atom = 0x16408 + Div Atom = 0x16b03 + Dl Atom = 0x5e602 + Download Atom = 0x46308 + Draggable Atom = 0x17a09 + Dropzone Atom = 0x40508 + Dt Atom = 0x64b02 + Em Atom = 0x6e02 + Embed Atom = 0x6e05 + Enctype Atom = 0x28d07 + Face Atom = 0x21e04 + Fieldset Atom = 0x22608 + Figcaption Atom = 0x22e0a + Figure Atom = 0x24806 + Font Atom = 0x3f04 + Footer Atom = 0xf606 + For Atom = 0x25403 + ForeignObject Atom = 0x2540d + Foreignobject Atom = 0x2610d + Form Atom = 0x26e04 + Formaction Atom = 0x26e0a + Formenctype Atom = 0x2890b + Formmethod Atom = 0x2a40a + Formnovalidate Atom = 0x2ae0e + Formtarget Atom = 0x2c00a + Frame Atom = 0x8b05 + Frameset Atom = 0x8b08 + H1 Atom = 0x15c02 + H2 Atom = 0x2de02 + H3 Atom = 0x30d02 + H4 Atom = 0x34502 + H5 Atom = 0x34f02 + H6 Atom = 0x64d02 + Head Atom = 0x33104 + Header Atom = 0x33106 + Headers Atom = 0x33107 + Height Atom = 0x5206 + Hgroup Atom = 0x2ca06 + Hidden Atom = 0x2d506 + High Atom = 0x2db04 + Hr Atom = 0x15702 + Href Atom = 0x2e004 + Hreflang Atom = 0x2e008 + Html Atom = 0x5604 + HttpEquiv Atom = 0x2e80a + I Atom = 0x601 + Icon Atom = 0x58a04 + Id Atom = 0x10902 + Iframe Atom = 0x2fc06 + Image Atom = 0x30205 + Img Atom = 0x30703 + Input Atom = 0x44b05 + Inputmode Atom = 0x44b09 + Ins Atom = 0x20403 + Integrity Atom = 0x23f09 + Is Atom = 0x16502 + Isindex Atom = 0x30f07 + Ismap Atom = 0x31605 + Itemid Atom = 0x38b06 + Itemprop Atom = 0x19d08 + Itemref Atom = 0x3cd07 + Itemscope Atom = 0x67109 + Itemtype Atom = 0x31f08 + Kbd Atom = 0xb903 + Keygen Atom = 0x3206 + Keytype Atom = 0xd607 + Kind Atom = 0x17704 + Label Atom = 0x5905 + Lang Atom = 0x2e404 + Legend Atom = 0x18106 + Li Atom = 0xb202 + Link Atom = 0x17404 + List Atom = 0x4a904 + Listing Atom = 0x4a907 + Loop Atom = 0x5d04 + Low Atom = 0xc303 + Main Atom = 0x1004 + Malignmark Atom = 0xb00a + Manifest Atom = 0x6d708 + Map Atom = 0x31803 + Mark Atom = 0xb604 + Marquee Atom = 0x32707 + Math Atom = 0x32e04 + Max Atom = 0x33d03 + Maxlength Atom = 0x33d09 + Media Atom = 0xe605 + Mediagroup Atom = 0xe60a + Menu Atom = 0x38704 + Menuitem Atom = 0x38708 + Meta Atom = 0x4b804 + Meter Atom = 0x9805 + Method Atom = 0x2a806 + Mglyph Atom = 0x30806 + Mi Atom = 0x34702 + Min Atom = 0x34703 + Minlength Atom = 0x34709 + Mn Atom = 0x2b102 + Mo Atom = 0xa402 + Ms Atom = 0x67402 + Mtext Atom = 0x35105 + Multiple Atom = 0x35f08 + Muted Atom = 0x36705 + Name Atom = 0x9604 + Nav Atom = 0x1303 + Nobr Atom = 0x3704 + Noembed Atom = 0x6c07 + Noframes Atom = 0x8908 + Nomodule Atom = 0xa208 + Nonce Atom = 0x1a605 + Noscript Atom = 0x21608 + Novalidate Atom = 0x2b20a + Object Atom = 0x26806 + Ol Atom = 0x13702 + Onabort Atom = 0x19507 + Onafterprint Atom = 0x2360c + Onautocomplete Atom = 0x2760e + Onautocompleteerror Atom = 0x27613 + Onauxclick Atom = 0x61f0a + Onbeforeprint Atom = 0x69e0d + Onbeforeunload Atom = 0x6e70e + Onblur Atom = 0x56d06 + Oncancel Atom = 0x11908 + Oncanplay Atom = 0x14d09 + Oncanplaythrough Atom = 0x14d10 + Onchange Atom = 0x41b08 + Onclick Atom = 0x2f507 + Onclose Atom = 0x36c07 + Oncontextmenu Atom = 0x37e0d + Oncopy Atom = 0x39106 + Oncuechange Atom = 0x3970b + Oncut Atom = 0x3a205 + Ondblclick Atom = 0x3a70a + Ondrag Atom = 0x3b106 + Ondragend Atom = 0x3b109 + Ondragenter Atom = 0x3ba0b + Ondragexit Atom = 0x3c50a + Ondragleave Atom = 0x3df0b + Ondragover Atom = 0x3ea0a + Ondragstart Atom = 0x3f40b + Ondrop Atom = 0x40306 + Ondurationchange Atom = 0x41310 + Onemptied Atom = 0x40a09 + Onended Atom = 0x42307 + Onerror Atom = 0x42a07 + Onfocus Atom = 0x43107 + Onhashchange Atom = 0x43d0c + Oninput Atom = 0x44907 + Oninvalid Atom = 0x45509 + Onkeydown Atom = 0x45e09 + Onkeypress Atom = 0x46b0a + Onkeyup Atom = 0x48007 + Onlanguagechange Atom = 0x48d10 + Onload Atom = 0x49d06 + Onloadeddata Atom = 0x49d0c + Onloadedmetadata Atom = 0x4b010 + Onloadend Atom = 0x4c609 + Onloadstart Atom = 0x4cf0b + Onmessage Atom = 0x4da09 + Onmessageerror Atom = 0x4da0e + Onmousedown Atom = 0x4e80b + Onmouseenter Atom = 0x4f30c + Onmouseleave Atom = 0x4ff0c + Onmousemove Atom = 0x50b0b + Onmouseout Atom = 0x5160a + Onmouseover Atom = 0x5230b + Onmouseup Atom = 0x52e09 + Onmousewheel Atom = 0x53c0c + Onoffline Atom = 0x54809 + Ononline Atom = 0x55108 + Onpagehide Atom = 0x5590a + Onpageshow Atom = 0x5730a + Onpaste Atom = 0x57f07 + Onpause Atom = 0x59a07 + Onplay Atom = 0x5a406 + Onplaying Atom = 0x5a409 + Onpopstate Atom = 0x5ad0a + Onprogress Atom = 0x5b70a + Onratechange Atom = 0x5cc0c + Onrejectionhandled Atom = 0x5d812 + Onreset Atom = 0x5ea07 + Onresize Atom = 0x5f108 + Onscroll Atom = 0x60008 + Onsecuritypolicyviolation Atom = 0x60819 + Onseeked Atom = 0x62908 + Onseeking Atom = 0x63109 + Onselect Atom = 0x63a08 + Onshow Atom = 0x64406 + Onsort Atom = 0x64f06 + Onstalled Atom = 0x65909 + Onstorage Atom = 0x66209 + Onsubmit Atom = 0x66b08 + Onsuspend Atom = 0x67b09 + Ontimeupdate Atom = 0x400c + Ontoggle Atom = 0x68408 + Onunhandledrejection Atom = 0x68c14 + Onunload Atom = 0x6ab08 + Onvolumechange Atom = 0x6b30e + Onwaiting Atom = 0x6c109 + Onwheel Atom = 0x6ca07 + Open Atom = 0x1a304 + Optgroup Atom = 0x5f08 + Optimum Atom = 0x6d107 + Option Atom = 0x6e306 + Output Atom = 0x51d06 + P Atom = 0xc01 + Param Atom = 0xc05 + Pattern Atom = 0x6607 + Picture Atom = 0x7b07 + Ping Atom = 0xef04 + Placeholder Atom = 0x1310b + Plaintext Atom = 0x1b209 + Playsinline Atom = 0x1400b + Poster Atom = 0x2cf06 + Pre Atom = 0x47003 + Preload Atom = 0x48607 + Progress Atom = 0x5b908 + Prompt Atom = 0x53606 + Public Atom = 0x58606 + Q Atom = 0xcf01 + Radiogroup Atom = 0x30a + Rb Atom = 0x3a02 + Readonly Atom = 0x35708 + Referrerpolicy Atom = 0x3d10e + Rel Atom = 0x48703 + Required Atom = 0x24c08 + Reversed Atom = 0x8008 + Rows Atom = 0x9c04 + Rowspan Atom = 0x9c07 + Rp Atom = 0x23c02 + Rt Atom = 0x19a02 + Rtc Atom = 0x19a03 + Ruby Atom = 0xfb04 + S Atom = 0x2501 + Samp Atom = 0x7804 + Sandbox Atom = 0x12907 + Scope Atom = 0x67505 + Scoped Atom = 0x67506 + Script Atom = 0x21806 + Seamless Atom = 0x37108 + Section Atom = 0x56807 + Select Atom = 0x63c06 + Selected Atom = 0x63c08 + Shape Atom = 0x1e505 + Size Atom = 0x5f504 + Sizes Atom = 0x5f505 + Slot Atom = 0x1ef04 + Small Atom = 0x20605 + Sortable Atom = 0x65108 + Sorted Atom = 0x33706 + Source Atom = 0x37806 + Spacer Atom = 0x43706 + Span Atom = 0x9f04 + Spellcheck Atom = 0x4740a + Src Atom = 0x5c003 + Srcdoc Atom = 0x5c006 + Srclang Atom = 0x5f907 + Srcset Atom = 0x6f906 + Start Atom = 0x3fa05 + Step Atom = 0x58304 + Strike Atom = 0xd206 + Strong Atom = 0x6dd06 + Style Atom = 0x6ff05 + Sub Atom = 0x66d03 + Summary Atom = 0x70407 + Sup Atom = 0x70b03 + Svg Atom = 0x70e03 + System Atom = 0x71106 + Tabindex Atom = 0x4be08 + Table Atom = 0x59505 + Target Atom = 0x2c406 + Tbody Atom = 0x2705 + Td Atom = 0x9202 + Template Atom = 0x71408 + Textarea Atom = 0x35208 + Tfoot Atom = 0xf505 + Th Atom = 0x15602 + Thead Atom = 0x33005 + Time Atom = 0x4204 + Title Atom = 0x11005 + Tr Atom = 0xcc02 + Track Atom = 0x1ba05 + Translate Atom = 0x1f209 + Tt Atom = 0x6802 + Type Atom = 0xd904 + Typemustmatch Atom = 0x2900d + U Atom = 0xb01 + Ul Atom = 0xa702 + Updateviacache Atom = 0x460e + Usemap Atom = 0x59e06 + Value Atom = 0x1505 + Var Atom = 0x16d03 + Video Atom = 0x2f105 + Wbr Atom = 0x57c03 + Width Atom = 0x64905 + Workertype Atom = 0x71c0a + Wrap Atom = 0x72604 + Xmp Atom = 0x12f03 +) + +const hash0 = 0x81cdf10e + +const maxAtomLen = 25 + +var table = [1 << 9]Atom{ + 0x1: 0xe60a, // mediagroup + 0x2: 0x2e404, // lang + 0x4: 0x2c09, // accesskey + 0x5: 0x8b08, // frameset + 0x7: 0x63a08, // onselect + 0x8: 0x71106, // system + 0xa: 0x64905, // width + 0xc: 0x2890b, // formenctype + 0xd: 0x13702, // ol + 0xe: 0x3970b, // oncuechange + 0x10: 0x14b03, // bdo + 0x11: 0x11505, // audio + 0x12: 0x17a09, // draggable + 0x14: 0x2f105, // video + 0x15: 0x2b102, // mn + 0x16: 0x38704, // menu + 0x17: 0x2cf06, // poster + 0x19: 0xf606, // footer + 0x1a: 0x2a806, // method + 0x1b: 0x2b808, // datetime + 0x1c: 0x19507, // onabort + 0x1d: 0x460e, // updateviacache + 0x1e: 0xff05, // async + 0x1f: 0x49d06, // onload + 0x21: 0x11908, // oncancel + 0x22: 0x62908, // onseeked + 0x23: 0x30205, // image + 0x24: 0x5d812, // onrejectionhandled + 0x26: 0x17404, // link + 0x27: 0x51d06, // output + 0x28: 0x33104, // head + 0x29: 0x4ff0c, // onmouseleave + 0x2a: 0x57f07, // onpaste + 0x2b: 0x5a409, // onplaying + 0x2c: 0x1c407, // colspan + 0x2f: 0x1bf05, // color + 0x30: 0x5f504, // size + 0x31: 0x2e80a, // http-equiv + 0x33: 0x601, // i + 0x34: 0x5590a, // onpagehide + 0x35: 0x68c14, // onunhandledrejection + 0x37: 0x42a07, // onerror + 0x3a: 0x3b08, // basefont + 0x3f: 0x1303, // nav + 0x40: 0x17704, // kind + 0x41: 0x35708, // readonly + 0x42: 0x30806, // mglyph + 0x44: 0xb202, // li + 0x46: 0x2d506, // hidden + 0x47: 0x70e03, // svg + 0x48: 0x58304, // step + 0x49: 0x23f09, // integrity + 0x4a: 0x58606, // public + 0x4c: 0x1ab03, // col + 0x4d: 0x1870a, // blockquote + 0x4e: 0x34f02, // h5 + 0x50: 0x5b908, // progress + 0x51: 0x5f505, // sizes + 0x52: 0x34502, // h4 + 0x56: 0x33005, // thead + 0x57: 0xd607, // keytype + 0x58: 0x5b70a, // onprogress + 0x59: 0x44b09, // inputmode + 0x5a: 0x3b109, // ondragend + 0x5d: 0x3a205, // oncut + 0x5e: 0x43706, // spacer + 0x5f: 0x1ab08, // colgroup + 0x62: 0x16502, // is + 0x65: 0x3c02, // as + 0x66: 0x54809, // onoffline + 0x67: 0x33706, // sorted + 0x69: 0x48d10, // onlanguagechange + 0x6c: 0x43d0c, // onhashchange + 0x6d: 0x9604, // name + 0x6e: 0xf505, // tfoot + 0x6f: 0x56104, // desc + 0x70: 0x33d03, // max + 0x72: 0x1ea06, // coords + 0x73: 0x30d02, // h3 + 0x74: 0x6e70e, // onbeforeunload + 0x75: 0x9c04, // rows + 0x76: 0x63c06, // select + 0x77: 0x9805, // meter + 0x78: 0x38b06, // itemid + 0x79: 0x53c0c, // onmousewheel + 0x7a: 0x5c006, // srcdoc + 0x7d: 0x1ba05, // track + 0x7f: 0x31f08, // itemtype + 0x82: 0xa402, // mo + 0x83: 0x41b08, // onchange + 0x84: 0x33107, // headers + 0x85: 0x5cc0c, // onratechange + 0x86: 0x60819, // onsecuritypolicyviolation + 0x88: 0x4a508, // datalist + 0x89: 0x4e80b, // onmousedown + 0x8a: 0x1ef04, // slot + 0x8b: 0x4b010, // onloadedmetadata + 0x8c: 0x1a06, // accept + 0x8d: 0x26806, // object + 0x91: 0x6b30e, // onvolumechange + 0x92: 0x2107, // charset + 0x93: 0x27613, // onautocompleteerror + 0x94: 0xc113, // allowpaymentrequest + 0x95: 0x2804, // body + 0x96: 0x10a07, // default + 0x97: 0x63c08, // selected + 0x98: 0x21e04, // face + 0x99: 0x1e505, // shape + 0x9b: 0x68408, // ontoggle + 0x9e: 0x64b02, // dt + 0x9f: 0xb604, // mark + 0xa1: 0xb01, // u + 0xa4: 0x6ab08, // onunload + 0xa5: 0x5d04, // loop + 0xa6: 0x16408, // disabled + 0xaa: 0x42307, // onended + 0xab: 0xb00a, // malignmark + 0xad: 0x67b09, // onsuspend + 0xae: 0x35105, // mtext + 0xaf: 0x64f06, // onsort + 0xb0: 0x19d08, // itemprop + 0xb3: 0x67109, // itemscope + 0xb4: 0x17305, // blink + 0xb6: 0x3b106, // ondrag + 0xb7: 0xa702, // ul + 0xb8: 0x26e04, // form + 0xb9: 0x12907, // sandbox + 0xba: 0x8b05, // frame + 0xbb: 0x1505, // value + 0xbc: 0x66209, // onstorage + 0xbf: 0xaa07, // acronym + 0xc0: 0x19a02, // rt + 0xc2: 0x202, // br + 0xc3: 0x22608, // fieldset + 0xc4: 0x2900d, // typemustmatch + 0xc5: 0xa208, // nomodule + 0xc6: 0x6c07, // noembed + 0xc7: 0x69e0d, // onbeforeprint + 0xc8: 0x19106, // button + 0xc9: 0x2f507, // onclick + 0xca: 0x70407, // summary + 0xcd: 0xfb04, // ruby + 0xce: 0x56405, // class + 0xcf: 0x3f40b, // ondragstart + 0xd0: 0x23107, // caption + 0xd4: 0xdd0e, // allowusermedia + 0xd5: 0x4cf0b, // onloadstart + 0xd9: 0x16b03, // div + 0xda: 0x4a904, // list + 0xdb: 0x32e04, // math + 0xdc: 0x44b05, // input + 0xdf: 0x3ea0a, // ondragover + 0xe0: 0x2de02, // h2 + 0xe2: 0x1b209, // plaintext + 0xe4: 0x4f30c, // onmouseenter + 0xe7: 0x47907, // checked + 0xe8: 0x47003, // pre + 0xea: 0x35f08, // multiple + 0xeb: 0xba03, // bdi + 0xec: 0x33d09, // maxlength + 0xed: 0xcf01, // q + 0xee: 0x61f0a, // onauxclick + 0xf0: 0x57c03, // wbr + 0xf2: 0x3b04, // base + 0xf3: 0x6e306, // option + 0xf5: 0x41310, // ondurationchange + 0xf7: 0x8908, // noframes + 0xf9: 0x40508, // dropzone + 0xfb: 0x67505, // scope + 0xfc: 0x8008, // reversed + 0xfd: 0x3ba0b, // ondragenter + 0xfe: 0x3fa05, // start + 0xff: 0x12f03, // xmp + 0x100: 0x5f907, // srclang + 0x101: 0x30703, // img + 0x104: 0x101, // b + 0x105: 0x25403, // for + 0x106: 0x10705, // aside + 0x107: 0x44907, // oninput + 0x108: 0x35604, // area + 0x109: 0x2a40a, // formmethod + 0x10a: 0x72604, // wrap + 0x10c: 0x23c02, // rp + 0x10d: 0x46b0a, // onkeypress + 0x10e: 0x6802, // tt + 0x110: 0x34702, // mi + 0x111: 0x36705, // muted + 0x112: 0xf303, // alt + 0x113: 0x5c504, // code + 0x114: 0x6e02, // em + 0x115: 0x3c50a, // ondragexit + 0x117: 0x9f04, // span + 0x119: 0x6d708, // manifest + 0x11a: 0x38708, // menuitem + 0x11b: 0x58b07, // content + 0x11d: 0x6c109, // onwaiting + 0x11f: 0x4c609, // onloadend + 0x121: 0x37e0d, // oncontextmenu + 0x123: 0x56d06, // onblur + 0x124: 0x3fc07, // article + 0x125: 0x9303, // dir + 0x126: 0xef04, // ping + 0x127: 0x24c08, // required + 0x128: 0x45509, // oninvalid + 0x129: 0xb105, // align + 0x12b: 0x58a04, // icon + 0x12c: 0x64d02, // h6 + 0x12d: 0x1c404, // cols + 0x12e: 0x22e0a, // figcaption + 0x12f: 0x45e09, // onkeydown + 0x130: 0x66b08, // onsubmit + 0x131: 0x14d09, // oncanplay + 0x132: 0x70b03, // sup + 0x133: 0xc01, // p + 0x135: 0x40a09, // onemptied + 0x136: 0x39106, // oncopy + 0x137: 0x19c04, // cite + 0x138: 0x3a70a, // ondblclick + 0x13a: 0x50b0b, // onmousemove + 0x13c: 0x66d03, // sub + 0x13d: 0x48703, // rel + 0x13e: 0x5f08, // optgroup + 0x142: 0x9c07, // rowspan + 0x143: 0x37806, // source + 0x144: 0x21608, // noscript + 0x145: 0x1a304, // open + 0x146: 0x20403, // ins + 0x147: 0x2540d, // foreignObject + 0x148: 0x5ad0a, // onpopstate + 0x14a: 0x28d07, // enctype + 0x14b: 0x2760e, // onautocomplete + 0x14c: 0x35208, // textarea + 0x14e: 0x2780c, // autocomplete + 0x14f: 0x15702, // hr + 0x150: 0x1de08, // controls + 0x151: 0x10902, // id + 0x153: 0x2360c, // onafterprint + 0x155: 0x2610d, // foreignobject + 0x156: 0x32707, // marquee + 0x157: 0x59a07, // onpause + 0x158: 0x5e602, // dl + 0x159: 0x5206, // height + 0x15a: 0x34703, // min + 0x15b: 0x9307, // dirname + 0x15c: 0x1f209, // translate + 0x15d: 0x5604, // html + 0x15e: 0x34709, // minlength + 0x15f: 0x48607, // preload + 0x160: 0x71408, // template + 0x161: 0x3df0b, // ondragleave + 0x162: 0x3a02, // rb + 0x164: 0x5c003, // src + 0x165: 0x6dd06, // strong + 0x167: 0x7804, // samp + 0x168: 0x6f307, // address + 0x169: 0x55108, // ononline + 0x16b: 0x1310b, // placeholder + 0x16c: 0x2c406, // target + 0x16d: 0x20605, // small + 0x16e: 0x6ca07, // onwheel + 0x16f: 0x1c90a, // annotation + 0x170: 0x4740a, // spellcheck + 0x171: 0x7207, // details + 0x172: 0x10306, // canvas + 0x173: 0x12109, // autofocus + 0x174: 0xc05, // param + 0x176: 0x46308, // download + 0x177: 0x45203, // del + 0x178: 0x36c07, // onclose + 0x179: 0xb903, // kbd + 0x17a: 0x31906, // applet + 0x17b: 0x2e004, // href + 0x17c: 0x5f108, // onresize + 0x17e: 0x49d0c, // onloadeddata + 0x180: 0xcc02, // tr + 0x181: 0x2c00a, // formtarget + 0x182: 0x11005, // title + 0x183: 0x6ff05, // style + 0x184: 0xd206, // strike + 0x185: 0x59e06, // usemap + 0x186: 0x2fc06, // iframe + 0x187: 0x1004, // main + 0x189: 0x7b07, // picture + 0x18c: 0x31605, // ismap + 0x18e: 0x4a504, // data + 0x18f: 0x5905, // label + 0x191: 0x3d10e, // referrerpolicy + 0x192: 0x15602, // th + 0x194: 0x53606, // prompt + 0x195: 0x56807, // section + 0x197: 0x6d107, // optimum + 0x198: 0x2db04, // high + 0x199: 0x15c02, // h1 + 0x19a: 0x65909, // onstalled + 0x19b: 0x16d03, // var + 0x19c: 0x4204, // time + 0x19e: 0x67402, // ms + 0x19f: 0x33106, // header + 0x1a0: 0x4da09, // onmessage + 0x1a1: 0x1a605, // nonce + 0x1a2: 0x26e0a, // formaction + 0x1a3: 0x22006, // center + 0x1a4: 0x3704, // nobr + 0x1a5: 0x59505, // table + 0x1a6: 0x4a907, // listing + 0x1a7: 0x18106, // legend + 0x1a9: 0x29b09, // challenge + 0x1aa: 0x24806, // figure + 0x1ab: 0xe605, // media + 0x1ae: 0xd904, // type + 0x1af: 0x3f04, // font + 0x1b0: 0x4da0e, // onmessageerror + 0x1b1: 0x37108, // seamless + 0x1b2: 0x8703, // dfn + 0x1b3: 0x5c705, // defer + 0x1b4: 0xc303, // low + 0x1b5: 0x19a03, // rtc + 0x1b6: 0x5230b, // onmouseover + 0x1b7: 0x2b20a, // novalidate + 0x1b8: 0x71c0a, // workertype + 0x1ba: 0x3cd07, // itemref + 0x1bd: 0x1, // a + 0x1be: 0x31803, // map + 0x1bf: 0x400c, // ontimeupdate + 0x1c0: 0x15e07, // bgsound + 0x1c1: 0x3206, // keygen + 0x1c2: 0x2705, // tbody + 0x1c5: 0x64406, // onshow + 0x1c7: 0x2501, // s + 0x1c8: 0x6607, // pattern + 0x1cc: 0x14d10, // oncanplaythrough + 0x1ce: 0x2d702, // dd + 0x1cf: 0x6f906, // srcset + 0x1d0: 0x17003, // big + 0x1d2: 0x65108, // sortable + 0x1d3: 0x48007, // onkeyup + 0x1d5: 0x5a406, // onplay + 0x1d7: 0x4b804, // meta + 0x1d8: 0x40306, // ondrop + 0x1da: 0x60008, // onscroll + 0x1db: 0x1fb0b, // crossorigin + 0x1dc: 0x5730a, // onpageshow + 0x1dd: 0x4, // abbr + 0x1de: 0x9202, // td + 0x1df: 0x58b0f, // contenteditable + 0x1e0: 0x27206, // action + 0x1e1: 0x1400b, // playsinline + 0x1e2: 0x43107, // onfocus + 0x1e3: 0x2e008, // hreflang + 0x1e5: 0x5160a, // onmouseout + 0x1e6: 0x5ea07, // onreset + 0x1e7: 0x13c08, // autoplay + 0x1e8: 0x63109, // onseeking + 0x1ea: 0x67506, // scoped + 0x1ec: 0x30a, // radiogroup + 0x1ee: 0x3800b, // contextmenu + 0x1ef: 0x52e09, // onmouseup + 0x1f1: 0x2ca06, // hgroup + 0x1f2: 0x2080f, // allowfullscreen + 0x1f3: 0x4be08, // tabindex + 0x1f6: 0x30f07, // isindex + 0x1f7: 0x1a0e, // accept-charset + 0x1f8: 0x2ae0e, // formnovalidate + 0x1fb: 0x1c90e, // annotation-xml + 0x1fc: 0x6e05, // embed + 0x1fd: 0x21806, // script + 0x1fe: 0xbb06, // dialog + 0x1ff: 0x1d707, // command +} + +const atomText = "abbradiogrouparamainavalueaccept-charsetbodyaccesskeygenobrb" + + "asefontimeupdateviacacheightmlabelooptgroupatternoembedetail" + + "sampictureversedfnoframesetdirnameterowspanomoduleacronymali" + + "gnmarkbdialogallowpaymentrequestrikeytypeallowusermediagroup" + + "ingaltfooterubyasyncanvasidefaultitleaudioncancelautofocusan" + + "dboxmplaceholderautoplaysinlinebdoncanplaythrough1bgsoundisa" + + "bledivarbigblinkindraggablegendblockquotebuttonabortcitempro" + + "penoncecolgrouplaintextrackcolorcolspannotation-xmlcommandco" + + "ntrolshapecoordslotranslatecrossoriginsmallowfullscreenoscri" + + "ptfacenterfieldsetfigcaptionafterprintegrityfigurequiredfore" + + "ignObjectforeignobjectformactionautocompleteerrorformenctype" + + "mustmatchallengeformmethodformnovalidatetimeformtargethgroup" + + "osterhiddenhigh2hreflanghttp-equivideonclickiframeimageimgly" + + "ph3isindexismappletitemtypemarqueematheadersortedmaxlength4m" + + "inlength5mtextareadonlymultiplemutedoncloseamlessourceoncont" + + "extmenuitemidoncopyoncuechangeoncutondblclickondragendondrag" + + "enterondragexitemreferrerpolicyondragleaveondragoverondragst" + + "articleondropzonemptiedondurationchangeonendedonerroronfocus" + + "paceronhashchangeoninputmodeloninvalidonkeydownloadonkeypres" + + "spellcheckedonkeyupreloadonlanguagechangeonloadeddatalisting" + + "onloadedmetadatabindexonloadendonloadstartonmessageerroronmo" + + "usedownonmouseenteronmouseleaveonmousemoveonmouseoutputonmou" + + "seoveronmouseupromptonmousewheelonofflineononlineonpagehides" + + "classectionbluronpageshowbronpastepublicontenteditableonpaus" + + "emaponplayingonpopstateonprogressrcdocodeferonratechangeonre" + + "jectionhandledonresetonresizesrclangonscrollonsecuritypolicy" + + "violationauxclickonseekedonseekingonselectedonshowidth6onsor" + + "tableonstalledonstorageonsubmitemscopedonsuspendontoggleonun" + + "handledrejectionbeforeprintonunloadonvolumechangeonwaitingon" + + "wheeloptimumanifestrongoptionbeforeunloaddressrcsetstylesumm" + + "arysupsvgsystemplateworkertypewrap" diff --git a/html/atom/table_test.go b/html/atom/table_test.go new file mode 100644 index 0000000..8a30762 --- /dev/null +++ b/html/atom/table_test.go @@ -0,0 +1,376 @@ +// Code generated by go generate gen.go; DO NOT EDIT. + +//go:generate go run gen.go -test + +package atom + +var testAtomList = []string{ + "a", + "abbr", + "accept", + "accept-charset", + "accesskey", + "acronym", + "action", + "address", + "align", + "allowfullscreen", + "allowpaymentrequest", + "allowusermedia", + "alt", + "annotation", + "annotation-xml", + "applet", + "area", + "article", + "as", + "aside", + "async", + "audio", + "autocomplete", + "autofocus", + "autoplay", + "b", + "base", + "basefont", + "bdi", + "bdo", + "bgsound", + "big", + "blink", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "center", + "challenge", + "charset", + "checked", + "cite", + "class", + "code", + "col", + "colgroup", + "color", + "cols", + "colspan", + "command", + "content", + "contenteditable", + "contextmenu", + "controls", + "coords", + "crossorigin", + "data", + "datalist", + "datetime", + "dd", + "default", + "defer", + "del", + "desc", + "details", + "dfn", + "dialog", + "dir", + "dirname", + "disabled", + "div", + "dl", + "download", + "draggable", + "dropzone", + "dt", + "em", + "embed", + "enctype", + "face", + "fieldset", + "figcaption", + "figure", + "font", + "footer", + "for", + "foreignObject", + "foreignobject", + "form", + "formaction", + "formenctype", + "formmethod", + "formnovalidate", + "formtarget", + "frame", + "frameset", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "headers", + "height", + "hgroup", + "hidden", + "high", + "hr", + "href", + "hreflang", + "html", + "http-equiv", + "i", + "icon", + "id", + "iframe", + "image", + "img", + "input", + "inputmode", + "ins", + "integrity", + "is", + "isindex", + "ismap", + "itemid", + "itemprop", + "itemref", + "itemscope", + "itemtype", + "kbd", + "keygen", + "keytype", + "kind", + "label", + "lang", + "legend", + "li", + "link", + "list", + "listing", + "loop", + "low", + "main", + "malignmark", + "manifest", + "map", + "mark", + "marquee", + "math", + "max", + "maxlength", + "media", + "mediagroup", + "menu", + "menuitem", + "meta", + "meter", + "method", + "mglyph", + "mi", + "min", + "minlength", + "mn", + "mo", + "ms", + "mtext", + "multiple", + "muted", + "name", + "nav", + "nobr", + "noembed", + "noframes", + "nomodule", + "nonce", + "noscript", + "novalidate", + "object", + "ol", + "onabort", + "onafterprint", + "onautocomplete", + "onautocompleteerror", + "onauxclick", + "onbeforeprint", + "onbeforeunload", + "onblur", + "oncancel", + "oncanplay", + "oncanplaythrough", + "onchange", + "onclick", + "onclose", + "oncontextmenu", + "oncopy", + "oncuechange", + "oncut", + "ondblclick", + "ondrag", + "ondragend", + "ondragenter", + "ondragexit", + "ondragleave", + "ondragover", + "ondragstart", + "ondrop", + "ondurationchange", + "onemptied", + "onended", + "onerror", + "onfocus", + "onhashchange", + "oninput", + "oninvalid", + "onkeydown", + "onkeypress", + "onkeyup", + "onlanguagechange", + "onload", + "onloadeddata", + "onloadedmetadata", + "onloadend", + "onloadstart", + "onmessage", + "onmessageerror", + "onmousedown", + "onmouseenter", + "onmouseleave", + "onmousemove", + "onmouseout", + "onmouseover", + "onmouseup", + "onmousewheel", + "onoffline", + "ononline", + "onpagehide", + "onpageshow", + "onpaste", + "onpause", + "onplay", + "onplaying", + "onpopstate", + "onprogress", + "onratechange", + "onrejectionhandled", + "onreset", + "onresize", + "onscroll", + "onsecuritypolicyviolation", + "onseeked", + "onseeking", + "onselect", + "onshow", + "onsort", + "onstalled", + "onstorage", + "onsubmit", + "onsuspend", + "ontimeupdate", + "ontoggle", + "onunhandledrejection", + "onunload", + "onvolumechange", + "onwaiting", + "onwheel", + "open", + "optgroup", + "optimum", + "option", + "output", + "p", + "param", + "pattern", + "picture", + "ping", + "placeholder", + "plaintext", + "playsinline", + "poster", + "pre", + "preload", + "progress", + "prompt", + "public", + "q", + "radiogroup", + "rb", + "readonly", + "referrerpolicy", + "rel", + "required", + "reversed", + "rows", + "rowspan", + "rp", + "rt", + "rtc", + "ruby", + "s", + "samp", + "sandbox", + "scope", + "scoped", + "script", + "seamless", + "section", + "select", + "selected", + "shape", + "size", + "sizes", + "slot", + "small", + "sortable", + "sorted", + "source", + "spacer", + "span", + "spellcheck", + "src", + "srcdoc", + "srclang", + "srcset", + "start", + "step", + "strike", + "strong", + "style", + "sub", + "summary", + "sup", + "svg", + "system", + "tabindex", + "table", + "target", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "translate", + "tt", + "type", + "typemustmatch", + "u", + "ul", + "updateviacache", + "usemap", + "value", + "var", + "video", + "wbr", + "width", + "workertype", + "wrap", + "xmp", +} diff --git a/html/charset/charset.go b/html/charset/charset.go new file mode 100644 index 0000000..00062a7 --- /dev/null +++ b/html/charset/charset.go @@ -0,0 +1,257 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package charset provides common text encodings for HTML documents. +// +// The mapping from encoding labels to encodings is defined at +// https://encoding.spec.whatwg.org/. +package charset // import "git.earlybird.gay/today-engine/html/charset" + +import ( + "bytes" + "fmt" + "io" + "mime" + "strings" + "unicode/utf8" + + "git.earlybird.gay/today-engine/html" + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/charmap" + "golang.org/x/text/encoding/htmlindex" + "golang.org/x/text/transform" +) + +// Lookup returns the encoding with the specified label, and its canonical +// name. It returns nil and the empty string if label is not one of the +// standard encodings for HTML. Matching is case-insensitive and ignores +// leading and trailing whitespace. Encoders will use HTML escape sequences for +// runes that are not supported by the character set. +func Lookup(label string) (e encoding.Encoding, name string) { + e, err := htmlindex.Get(label) + if err != nil { + return nil, "" + } + name, _ = htmlindex.Name(e) + return &htmlEncoding{e}, name +} + +type htmlEncoding struct{ encoding.Encoding } + +func (h *htmlEncoding) NewEncoder() *encoding.Encoder { + // HTML requires a non-terminating legacy encoder. We use HTML escapes to + // substitute unsupported code points. + return encoding.HTMLEscapeUnsupported(h.Encoding.NewEncoder()) +} + +// DetermineEncoding determines the encoding of an HTML document by examining +// up to the first 1024 bytes of content and the declared Content-Type. +// +// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding +func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) { + if len(content) > 1024 { + content = content[:1024] + } + + for _, b := range boms { + if bytes.HasPrefix(content, b.bom) { + e, name = Lookup(b.enc) + return e, name, true + } + } + + if _, params, err := mime.ParseMediaType(contentType); err == nil { + if cs, ok := params["charset"]; ok { + if e, name = Lookup(cs); e != nil { + return e, name, true + } + } + } + + if len(content) > 0 { + e, name = prescan(content) + if e != nil { + return e, name, false + } + } + + // Try to detect UTF-8. + // First eliminate any partial rune at the end. + for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- { + b := content[i] + if b < 0x80 { + break + } + if utf8.RuneStart(b) { + content = content[:i] + break + } + } + hasHighBit := false + for _, c := range content { + if c >= 0x80 { + hasHighBit = true + break + } + } + if hasHighBit && utf8.Valid(content) { + return encoding.Nop, "utf-8", false + } + + // TODO: change default depending on user's locale? + return charmap.Windows1252, "windows-1252", false +} + +// NewReader returns an io.Reader that converts the content of r to UTF-8. +// It calls DetermineEncoding to find out what r's encoding is. +func NewReader(r io.Reader, contentType string) (io.Reader, error) { + preview := make([]byte, 1024) + n, err := io.ReadFull(r, preview) + switch { + case err == io.ErrUnexpectedEOF: + preview = preview[:n] + r = bytes.NewReader(preview) + case err != nil: + return nil, err + default: + r = io.MultiReader(bytes.NewReader(preview), r) + } + + if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop { + r = transform.NewReader(r, e.NewDecoder()) + } + return r, nil +} + +// NewReaderLabel returns a reader that converts from the specified charset to +// UTF-8. It uses Lookup to find the encoding that corresponds to label, and +// returns an error if Lookup returns nil. It is suitable for use as +// encoding/xml.Decoder's CharsetReader function. +func NewReaderLabel(label string, input io.Reader) (io.Reader, error) { + e, _ := Lookup(label) + if e == nil { + return nil, fmt.Errorf("unsupported charset: %q", label) + } + return transform.NewReader(input, e.NewDecoder()), nil +} + +func prescan(content []byte) (e encoding.Encoding, name string) { + z := html.NewTokenizer(bytes.NewReader(content)) + for { + switch z.Next() { + case html.ErrorToken: + return nil, "" + + case html.StartTagToken, html.SelfClosingTagToken: + tagName, hasAttr := z.TagName() + if !bytes.Equal(tagName, []byte("meta")) { + continue + } + attrList := make(map[string]bool) + gotPragma := false + + const ( + dontKnow = iota + doNeedPragma + doNotNeedPragma + ) + needPragma := dontKnow + + name = "" + e = nil + for hasAttr { + var key, val []byte + key, val, hasAttr = z.TagAttr() + ks := string(key) + if attrList[ks] { + continue + } + attrList[ks] = true + for i, c := range val { + if 'A' <= c && c <= 'Z' { + val[i] = c + 0x20 + } + } + + switch ks { + case "http-equiv": + if bytes.Equal(val, []byte("content-type")) { + gotPragma = true + } + + case "content": + if e == nil { + name = fromMetaElement(string(val)) + if name != "" { + e, name = Lookup(name) + if e != nil { + needPragma = doNeedPragma + } + } + } + + case "charset": + e, name = Lookup(string(val)) + needPragma = doNotNeedPragma + } + } + + if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma { + continue + } + + if strings.HasPrefix(name, "utf-16") { + name = "utf-8" + e = encoding.Nop + } + + if e != nil { + return e, name + } + } + } +} + +func fromMetaElement(s string) string { + for s != "" { + csLoc := strings.Index(s, "charset") + if csLoc == -1 { + return "" + } + s = s[csLoc+len("charset"):] + s = strings.TrimLeft(s, " \t\n\f\r") + if !strings.HasPrefix(s, "=") { + continue + } + s = s[1:] + s = strings.TrimLeft(s, " \t\n\f\r") + if s == "" { + return "" + } + if q := s[0]; q == '"' || q == '\'' { + s = s[1:] + closeQuote := strings.IndexRune(s, rune(q)) + if closeQuote == -1 { + return "" + } + return s[:closeQuote] + } + + end := strings.IndexAny(s, "; \t\n\f\r") + if end == -1 { + end = len(s) + } + return s[:end] + } + return "" +} + +var boms = []struct { + bom []byte + enc string +}{ + {[]byte{0xfe, 0xff}, "utf-16be"}, + {[]byte{0xff, 0xfe}, "utf-16le"}, + {[]byte{0xef, 0xbb, 0xbf}, "utf-8"}, +} diff --git a/html/charset/charset_test.go b/html/charset/charset_test.go new file mode 100644 index 0000000..c2f6244 --- /dev/null +++ b/html/charset/charset_test.go @@ -0,0 +1,237 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package charset + +import ( + "bytes" + "encoding/xml" + "io" + "os" + "runtime" + "strings" + "testing" + + "golang.org/x/text/transform" +) + +func transformString(t transform.Transformer, s string) (string, error) { + r := transform.NewReader(strings.NewReader(s), t) + b, err := io.ReadAll(r) + return string(b), err +} + +type testCase struct { + utf8, other, otherEncoding string +} + +// testCases for encoding and decoding. +var testCases = []testCase{ + {"Résumé", "Résumé", "utf8"}, + {"Résumé", "R\xe9sum\xe9", "latin1"}, + {"これは漢字です。", "S0\x8c0o0\"oW[g0Y0\x020", "UTF-16LE"}, + {"これは漢字です。", "0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16BE"}, + {"Hello, world", "Hello, world", "ASCII"}, + {"Gdańsk", "Gda\xf1sk", "ISO-8859-2"}, + {"Ââ Čč Đđ Ŋŋ Õõ Å Å¡ Žž Åå Ää", "\xc2\xe2 \xc8\xe8 \xa9\xb9 \xaf\xbf \xd5\xf5 \xaa\xba \xac\xbc \xc5\xe5 \xc4\xe4", "ISO-8859-10"}, + {"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "ISO-8859-11"}, + {"latvieÅ¡u", "latvie\xf0u", "ISO-8859-13"}, + {"Seònaid", "Se\xf2naid", "ISO-8859-14"}, + {"€1 is cheap", "\xa41 is cheap", "ISO-8859-15"}, + {"românește", "rom\xe2ne\xbate", "ISO-8859-16"}, + {"nutraĵo", "nutra\xbco", "ISO-8859-3"}, + {"Kalâdlit", "Kal\xe2dlit", "ISO-8859-4"}, + {"русский", "\xe0\xe3\xe1\xe1\xda\xd8\xd9", "ISO-8859-5"}, + {"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "ISO-8859-7"}, + {"Kağan", "Ka\xf0an", "ISO-8859-9"}, + {"Résumé", "R\x8esum\x8e", "macintosh"}, + {"Gdańsk", "Gda\xf1sk", "windows-1250"}, + {"русский", "\xf0\xf3\xf1\xf1\xea\xe8\xe9", "windows-1251"}, + {"Résumé", "R\xe9sum\xe9", "windows-1252"}, + {"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "windows-1253"}, + {"Kağan", "Ka\xf0an", "windows-1254"}, + {"עִבְרִית", "\xf2\xc4\xe1\xc0\xf8\xc4\xe9\xfa", "windows-1255"}, + {"العربية", "\xc7\xe1\xda\xd1\xc8\xed\xc9", "windows-1256"}, + {"latvieÅ¡u", "latvie\xf0u", "windows-1257"}, + {"Việt", "Vi\xea\xf2t", "windows-1258"}, + {"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "windows-874"}, + {"русский", "\xd2\xd5\xd3\xd3\xcb\xc9\xca", "KOI8-R"}, + {"українська", "\xd5\xcb\xd2\xc1\xa7\xce\xd3\xd8\xcb\xc1", "KOI8-U"}, + {"Hello 常用國字標準字體表", "Hello \xb1`\xa5\u03b0\xea\xa6r\xbc\u0437\u01e6r\xc5\xe9\xaa\xed", "big5"}, + {"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gbk"}, + {"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gb18030"}, + {"עִבְרִית", "\x81\x30\xfb\x30\x81\x30\xf6\x34\x81\x30\xf9\x33\x81\x30\xf6\x30\x81\x30\xfb\x36\x81\x30\xf6\x34\x81\x30\xfa\x31\x81\x30\xfb\x38", "gb18030"}, + {"㧯", "\x82\x31\x89\x38", "gb18030"}, + {"これは漢字です。", "\x82\xb1\x82\xea\x82\xcd\x8a\xbf\x8e\x9a\x82\xc5\x82\xb7\x81B", "SJIS"}, + {"Hello, 世界!", "Hello, \x90\xa2\x8aE!", "SJIS"}, + {"イウエオカ", "\xb2\xb3\xb4\xb5\xb6", "SJIS"}, + {"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "EUC-JP"}, + {"Hello, 世界!", "Hello, \x1b$B@$3&\x1b(B!", "ISO-2022-JP"}, + {"다음과 같은 조건을 따라야 합니다: 저작자표시", "\xb4\xd9\xc0\xbd\xb0\xfa \xb0\xb0\xc0\xba \xc1\xb6\xb0\xc7\xc0\xbb \xb5\xfb\xb6\xf3\xbe\xdf \xc7Õ´Ï´\xd9: \xc0\xfa\xc0\xdb\xc0\xdaÇ¥\xbd\xc3", "EUC-KR"}, +} + +func TestDecode(t *testing.T) { + testCases := append(testCases, []testCase{ + // Replace multi-byte maximum subpart of ill-formed subsequence with + // single replacement character (WhatWG requirement). + {"Rés\ufffdumé", "Rés\xe1\x80umé", "utf8"}, + }...) + for _, tc := range testCases { + e, _ := Lookup(tc.otherEncoding) + if e == nil { + t.Errorf("%s: not found", tc.otherEncoding) + continue + } + s, err := transformString(e.NewDecoder(), tc.other) + if err != nil { + t.Errorf("%s: decode %q: %v", tc.otherEncoding, tc.other, err) + continue + } + if s != tc.utf8 { + t.Errorf("%s: got %q, want %q", tc.otherEncoding, s, tc.utf8) + } + } +} + +func TestEncode(t *testing.T) { + testCases := append(testCases, []testCase{ + // Use Go-style replacement. + {"Rés\xe1\x80umé", "Rés\ufffd\ufffdumé", "utf8"}, + // U+0144 LATIN SMALL LETTER N WITH ACUTE not supported by encoding. + {"Gdańsk", "Gdańsk", "ISO-8859-11"}, + {"\ufffd", "�", "ISO-8859-11"}, + {"a\xe1\x80b", "a��b", "ISO-8859-11"}, + }...) + for _, tc := range testCases { + e, _ := Lookup(tc.otherEncoding) + if e == nil { + t.Errorf("%s: not found", tc.otherEncoding) + continue + } + s, err := transformString(e.NewEncoder(), tc.utf8) + if err != nil { + t.Errorf("%s: encode %q: %s", tc.otherEncoding, tc.utf8, err) + continue + } + if s != tc.other { + t.Errorf("%s: got %q, want %q", tc.otherEncoding, s, tc.other) + } + } +} + +var sniffTestCases = []struct { + filename, declared, want string +}{ + {"HTTP-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"}, + {"UTF-16LE-BOM.html", "", "utf-16le"}, + {"UTF-16BE-BOM.html", "", "utf-16be"}, + {"meta-content-attribute.html", "text/html", "iso-8859-15"}, + {"meta-charset-attribute.html", "text/html", "iso-8859-15"}, + {"No-encoding-declaration.html", "text/html", "utf-8"}, + {"HTTP-vs-UTF-8-BOM.html", "text/html; charset=iso-8859-15", "utf-8"}, + {"HTTP-vs-meta-content.html", "text/html; charset=iso-8859-15", "iso-8859-15"}, + {"HTTP-vs-meta-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"}, + {"UTF-8-BOM-vs-meta-content.html", "text/html", "utf-8"}, + {"UTF-8-BOM-vs-meta-charset.html", "text/html", "utf-8"}, +} + +func TestSniff(t *testing.T) { + switch runtime.GOOS { + case "nacl": // platforms that don't permit direct file system access + t.Skipf("not supported on %q", runtime.GOOS) + } + + for _, tc := range sniffTestCases { + content, err := os.ReadFile("testdata/" + tc.filename) + if err != nil { + t.Errorf("%s: error reading file: %v", tc.filename, err) + continue + } + + _, name, _ := DetermineEncoding(content, tc.declared) + if name != tc.want { + t.Errorf("%s: got %q, want %q", tc.filename, name, tc.want) + continue + } + } +} + +func TestReader(t *testing.T) { + switch runtime.GOOS { + case "nacl": // platforms that don't permit direct file system access + t.Skipf("not supported on %q", runtime.GOOS) + } + + for _, tc := range sniffTestCases { + content, err := os.ReadFile("testdata/" + tc.filename) + if err != nil { + t.Errorf("%s: error reading file: %v", tc.filename, err) + continue + } + + r, err := NewReader(bytes.NewReader(content), tc.declared) + if err != nil { + t.Errorf("%s: error creating reader: %v", tc.filename, err) + continue + } + + got, err := io.ReadAll(r) + if err != nil { + t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err) + continue + } + + e, _ := Lookup(tc.want) + want, err := io.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder())) + if err != nil { + t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err) + continue + } + + if !bytes.Equal(got, want) { + t.Errorf("%s: got %q, want %q", tc.filename, got, want) + continue + } + } +} + +var metaTestCases = []struct { + meta, want string +}{ + {"", ""}, + {"text/html", ""}, + {"text/html; charset utf-8", ""}, + {"text/html; charset=latin-2", "latin-2"}, + {"text/html; charset; charset = utf-8", "utf-8"}, + {`charset="big5"`, "big5"}, + {"charset='shift_jis'", "shift_jis"}, +} + +func TestFromMeta(t *testing.T) { + for _, tc := range metaTestCases { + got := fromMetaElement(tc.meta) + if got != tc.want { + t.Errorf("%q: got %q, want %q", tc.meta, got, tc.want) + } + } +} + +func TestXML(t *testing.T) { + const s = "r\xe9sum\xe9" + + d := xml.NewDecoder(strings.NewReader(s)) + d.CharsetReader = NewReaderLabel + + var a struct { + Word string + } + if err := d.Decode(&a); err != nil { + t.Fatalf("Decode: %v", err) + } + + want := "résumé" + if a.Word != want { + t.Errorf("got %q, want %q", a.Word, want) + } +} diff --git a/html/charset/testdata/HTTP-charset.html b/html/charset/testdata/HTTP-charset.html new file mode 100644 index 0000000..9915fa0 --- /dev/null +++ b/html/charset/testdata/HTTP-charset.html @@ -0,0 +1,48 @@ + + + + HTTP charset + + + + + + + + + + + +

HTTP charset

+ + +
+ + +
 
+ + + + + +
+

The character encoding of a page can be set using the HTTP header charset declaration.

+

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.

The only character encoding declaration for this HTML file is in the HTTP header, which sets the encoding to ISO 8859-15.

+
+
+
Next test
HTML5
+

the-input-byte-stream-001
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/HTTP-vs-UTF-8-BOM.html b/html/charset/testdata/HTTP-vs-UTF-8-BOM.html new file mode 100644 index 0000000..26e5d8b --- /dev/null +++ b/html/charset/testdata/HTTP-vs-UTF-8-BOM.html @@ -0,0 +1,48 @@ + + + + HTTP vs UTF-8 BOM + + + + + + + + + + + +

HTTP vs UTF-8 BOM

+ + +
+ + +
 
+ + + + + +
+

A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.

+

The HTTP header attempts to set the character encoding to ISO 8859-15. The page starts with a UTF-8 signature.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.

If the test is unsuccessful, the characters  should appear at the top of the page. These represent the bytes that make up the UTF-8 signature when encountered in the ISO 8859-15 encoding.

+
+
+
Next test
HTML5
+

the-input-byte-stream-034
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/HTTP-vs-meta-charset.html b/html/charset/testdata/HTTP-vs-meta-charset.html new file mode 100644 index 0000000..2f07e95 --- /dev/null +++ b/html/charset/testdata/HTTP-vs-meta-charset.html @@ -0,0 +1,49 @@ + + + + HTTP vs meta charset + + + + + + + + + + + +

HTTP vs meta charset

+ + +
+ + +
 
+ + + + + +
+

The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.

+

The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-1.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.

+
+
+
Next test
HTML5
+

the-input-byte-stream-018
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/HTTP-vs-meta-content.html b/html/charset/testdata/HTTP-vs-meta-content.html new file mode 100644 index 0000000..6853cdd --- /dev/null +++ b/html/charset/testdata/HTTP-vs-meta-content.html @@ -0,0 +1,49 @@ + + + + HTTP vs meta content + + + + + + + + + + + +

HTTP vs meta content

+ + +
+ + +
 
+ + + + + +
+

The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.

+

The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-1.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.

+
+
+
Next test
HTML5
+

the-input-byte-stream-016
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/No-encoding-declaration.html b/html/charset/testdata/No-encoding-declaration.html new file mode 100644 index 0000000..612e26c --- /dev/null +++ b/html/charset/testdata/No-encoding-declaration.html @@ -0,0 +1,47 @@ + + + + No encoding declaration + + + + + + + + + + + +

No encoding declaration

+ + +
+ + +
 
+ + + + + +
+

A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.

+

The test on this page contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.

+
+
+
Next test
HTML5
+

the-input-byte-stream-015
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/README b/html/charset/testdata/README new file mode 100644 index 0000000..38ef0f9 --- /dev/null +++ b/html/charset/testdata/README @@ -0,0 +1,9 @@ +These test cases come from +http://www.w3.org/International/tests/repository/html5/the-input-byte-stream/results-basics + +Distributed under both the W3C Test Suite License +(http://www.w3.org/Consortium/Legal/2008/04-testsuite-license) +and the W3C 3-clause BSD License +(http://www.w3.org/Consortium/Legal/2008/03-bsd-license). +To contribute to a W3C Test Suite, see the policies and contribution +forms (http://www.w3.org/2004/10/27-testcases). diff --git a/html/charset/testdata/UTF-16BE-BOM.html b/html/charset/testdata/UTF-16BE-BOM.html new file mode 100644 index 0000000000000000000000000000000000000000..3abf7a9343c20518e57dfea58b374fb0f4fb58a1 GIT binary patch literal 2670 zcmcJR?QRoS5Qc}JAoU&=BQ-(7b^;2j8i*i3RV1JlO@;VXIsPurV!WHiDdLW}i`*CO z^UnC>tih=KsVr;H&Y7?C&O3AV(?534uG?e##U9y_y|!QNi4``n+D>d{2lky^LnFNx z?9HrarH$>rwQR_$g)Hk0*&STI*EYq|47~&U9sfUB+ji})9eR{QqCUra7oDsZ5obtB zdxP%<)-$4Q;rSHJiM>U(#ZI=;?n^BC?Dp6lu=~_1-lnX3u03&2BlmQIY>L+!Uq7XoytKw^Q#oZSM?3*J?)&ojG&yzQRkC!Ml5JE?ax;lp_NYEcdUht`ZswOviB~L5hmJ|pXI71nn20w;>vG! zQGB$EE9&wC``&J#_Ym~PgRu-Bd>1!pOp0||k`kr=VJ zfH6I6rmRaeHA7U-A^OTsT+|d2a^i(>DePzZ{)ibXoCBvJnuYrd-3kkN$uy{qQK;=*Y;S87ro12aTgu^i*%f8zC3>a}9DIe4cfxOzsCw&(cqvP9{ud{N6f` z#TNDY(B6@Gpr|uN+%&x^XZjBHdc@2vsM(Tyc2=vshHQ5w+obmp>tuWT(t4BTUGAQw zxeI$UGSLUBg=WFbF;4f@4=^P2AgY@CFn8A`bcC=_&~)fiDe)#cUARRBzJ^k|%X)69 z+{Cb`wq}Rsg%B62CC_tK!AV(W{(MV?#mndR46CU#BUN<{8e?*oT+!pE5wF#O#TR#a z$9qRT)tpbw8zAI~QQJg2C3|6$I%(T(;`zOMy6SO+&;pG=c#2P|P-WZn$$DpWJlC3U z3*nvmz zwP{u~r$L?-m3uqp9I1+#3yE|3M$(s-BEtih=LQ>`qYoiktOop(wi%!;yh%+Rm z{e|xntY<{q!1F1Z6MKtngPm-p-4|H&+3m4AVE3_AyiHm6Tzlf4M(*ht*%YrezJ6kr zHGj45pc?64*$Cm%-zseWMA`x;)v*~jA=i}szqts9xmQkS`M11|(H7bTXAycsXU53+ zJ?120SRZeyiFjW7enPN`bxk$IaWV3o48oJF7D&2ysoY;6(s6%6vVfaYd&mC=erK!) zNGI^7upQgN)53OHe_VE<@J+G8*Y|p*)zB2Thdi}+YR<5QWHm!|a_*AoZXuv7)$xe| zm3Q$D7{|#}{m4X&UY!6(ZhyYi2(5JLzGE$H)W6BQklnjPMwn<Yvv7Z*TVWwD*=E3QpH37* z#lqXJA0A~J9T_<^W5smspmDg2p6ac5Bjn+~LAoow%1TCdZ*$K8`O zw_$HaCi+0N&@7la#_7KL5r$+QL{)Pi=I&aDjt~|Knht#`CEi4*3%97i_fSfASlwUz0=3V0GCxY}z81UC-nP=CGt2OqYV$ zoRCo+qM9YX*3FFORLC=E3B~S@+KROyk4r5 yX7?DaslDfIebqXgC!KKp4IYy+W~X?ddE6o=`A+x#x0AK&6MF#W&AXxbRrv+SX}PNa literal 0 HcmV?d00001 diff --git a/html/charset/testdata/UTF-8-BOM-vs-meta-charset.html b/html/charset/testdata/UTF-8-BOM-vs-meta-charset.html new file mode 100644 index 0000000..83de433 --- /dev/null +++ b/html/charset/testdata/UTF-8-BOM-vs-meta-charset.html @@ -0,0 +1,49 @@ + + + + UTF-8 BOM vs meta charset + + + + + + + + + + + +

UTF-8 BOM vs meta charset

+ + +
+ + +
 
+ + + + + +
+

A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.

+

The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.

+
+
+
HTML5
+

the-input-byte-stream-038
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/UTF-8-BOM-vs-meta-content.html b/html/charset/testdata/UTF-8-BOM-vs-meta-content.html new file mode 100644 index 0000000..501aac2 --- /dev/null +++ b/html/charset/testdata/UTF-8-BOM-vs-meta-content.html @@ -0,0 +1,48 @@ + + + + UTF-8 BOM vs meta content + + + + + + + + + + + +

UTF-8 BOM vs meta content

+ + +
+ + +
 
+ + + + + +
+

A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.

+

The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ýäè. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.

+
+
+
HTML5
+

the-input-byte-stream-037
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/meta-charset-attribute.html b/html/charset/testdata/meta-charset-attribute.html new file mode 100644 index 0000000..2d7d25a --- /dev/null +++ b/html/charset/testdata/meta-charset-attribute.html @@ -0,0 +1,48 @@ + + + + meta charset attribute + + + + + + + + + + + +

meta charset attribute

+ + +
+ + +
 
+ + + + + +
+

The character encoding of the page can be set by a meta element with charset attribute.

+

The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.

+
+
+
HTML5
+

the-input-byte-stream-009
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/charset/testdata/meta-content-attribute.html b/html/charset/testdata/meta-content-attribute.html new file mode 100644 index 0000000..1c3f228 --- /dev/null +++ b/html/charset/testdata/meta-content-attribute.html @@ -0,0 +1,48 @@ + + + + meta content attribute + + + + + + + + + + + +

meta content attribute

+ + +
+ + +
 
+ + + + + +
+

The character encoding of the page can be set by a meta element with http-equiv and content attributes.

+

The only character encoding declaration for this HTML file is in the content attribute of the meta element, which declares the encoding to be ISO 8859-15.

The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector .test div.ÜÀÚ. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.

+
+
+
HTML5
+

the-input-byte-stream-007
Result summary & related tests
Detailed results for this test
Link to spec

+
Assumptions:
  • The default encoding for the browser you are testing is not set to ISO 8859-15.
  • +
  • The test is read from a server that supports HTTP.
+
+ + + + + + diff --git a/html/comment_test.go b/html/comment_test.go new file mode 100644 index 0000000..fd47de8 --- /dev/null +++ b/html/comment_test.go @@ -0,0 +1,291 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "strings" + "testing" +) + +// TestComments exhaustively tests every 'interesting' N-byte string is +// correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen +// inclusive. 4 is the length of the "", n.Data) + case DoctypeNode: + fmt.Fprintf(w, "") + case scopeMarkerNode: + return errors.New("unexpected scopeMarkerNode") + default: + return errors.New("unknown node type") + } + io.WriteString(w, "\n") + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := dumpLevel(w, c, level); err != nil { + return err + } + } + return nil +} + +func dump(n *Node) (string, error) { + if n == nil || n.FirstChild == nil { + return "", nil + } + var b bytes.Buffer + for c := n.FirstChild; c != nil; c = c.NextSibling { + if err := dumpLevel(&b, c, 0); err != nil { + return "", err + } + } + return b.String(), nil +} + +var testDataDirs = []string{"testdata/webkit/", "testdata/go/"} + +func TestParser(t *testing.T) { + for _, testDataDir := range testDataDirs { + testFiles, err := filepath.Glob(testDataDir + "*.dat") + if err != nil { + t.Fatal(err) + } + for _, tf := range testFiles { + f, err := os.Open(tf) + if err != nil { + t.Fatal(err) + } + defer f.Close() + r := bufio.NewReader(f) + + for i := 0; ; i++ { + ta, err := readParseTest(r) + if err == io.EOF { + break + } + if err != nil { + t.Fatal(err) + } + if parseTestBlacklist[ta.text] { + continue + } + + err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting)) + + if err != nil { + t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err) + } + } + } + } +} + +// Issue 16318 +func TestParserWithoutScripting(t *testing.T) { + text := `

` + want := `| +| +|