File size: 6,635 Bytes
e36aeda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore

package main

import (
	"bytes"
	_ "embed"
	"fmt"
	"go/format"
	"io"
	"log"
	"maps"
	"os"
	"slices"
	"strconv"
	"strings"
)

// We embed this source file in the resulting code-generation program in order
// to extract the definitions of the encoding type and constants from it and
// include them in the generated file.
//
//go:embed gen_encoding_table.go
var genSource string

const filename = "encoding_table.go"

// main generates the file named by filename. It assembles the generated-code
// header, the encoding type and constants extracted from this program's own
// embedded source (genSource), and the lookup table, then formats the result
// with go/format and writes it to disk. Any failure terminates the program
// with a non-zero exit status.
func main() {
	var out bytes.Buffer
	fmt.Fprintln(&out, "// Code generated from gen_encoding_table.go using 'go generate'; DO NOT EDIT.")
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, "// Copyright 2025 The Go Authors. All rights reserved.")
	fmt.Fprintln(&out, "// Use of this source code is governed by a BSD-style")
	fmt.Fprintln(&out, "// license that can be found in the LICENSE file.")
	fmt.Fprintln(&out)
	fmt.Fprintln(&out, "package url")
	fmt.Fprintln(&out)
	generateEnc(&out, genSource)
	generateTable(&out)

	formatted, err := format.Source(out.Bytes())
	if err != nil {
		// log.Fatal formats its operands like fmt.Print, which adds no space
		// after a string operand; use Fatalf so the prefix and the error are
		// separated.
		log.Fatalf("format: %v", err)
	}

	if err := os.WriteFile(filename, formatted, 0644); err != nil {
		log.Fatalf("WriteFile: %v", err)
	}
}

func generateEnc(w io.Writer, src string) {
	var writeLine bool
	for line := range strings.Lines(src) {
		if strings.HasPrefix(line, "// START encoding") {
			writeLine = true
			continue
		}
		if strings.HasPrefix(line, "// END encoding") {
			return
		}
		if writeLine {
			fmt.Fprint(w, line)
		}
	}
}

// generateTable writes to w the Go source of the "table" array literal, which
// is indexed by byte value. Each entry is the bitwise OR of hexChar (when the
// byte is a hexadecimal digit) and the name of every encoding mode in which
// shouldEscape reports that the byte does NOT need escaping. Bytes for which
// no bit would be set get no entry at all.
func generateTable(w io.Writer) {
	fmt.Fprintln(w, "var table = [256]encoding{")

	// Map iteration order is random, so sort the modes; they are then visited
	// in decreasing order to guarantee a stable output.
	modes := slices.Sorted(maps.Keys(encNames))

	for i := 0; i < 256; i++ {
		ch := byte(i)

		// Collect the names of all bits set for this byte.
		var flags []string
		if ishex(ch) {
			flags = append(flags, "hexChar")
		}
		for _, mode := range slices.Backward(modes) {
			if shouldEscape(ch, mode) {
				continue
			}
			flags = append(flags, encNames[mode])
		}
		if len(flags) == 0 {
			// Nothing to record: omit the entry entirely.
			continue
		}

		// Emit `key:flag|flag|...,` as a single write.
		var entry bytes.Buffer
		entry.WriteString(strconv.QuoteRune(rune(ch)))
		entry.WriteByte(':')
		for j, flag := range flags {
			if j > 0 {
				entry.WriteByte('|')
			}
			entry.WriteString(flag)
		}
		entry.WriteString(",\n")
		w.Write(entry.Bytes())
	}
	fmt.Fprintln(w, "}")
}

// The declarations between the START and END marker comments are copied
// verbatim into the generated file by generateEnc, which recognizes the
// markers by their "// START encoding" and "// END encoding" prefixes. The
// marker lines themselves are not copied.
//
// START encoding (keep this marker comment in sync with generateEnc)
type encoding uint8

const (
	encodePath encoding = 1 << iota
	encodePathSegment
	encodeHost
	encodeZone
	encodeUserPassword
	encodeQueryComponent
	encodeFragment

	// hexChar is actually NOT an encoding mode, but there are only seven
	// encoding modes. We might as well abuse the otherwise unused most
	// significant bit in uint8 to indicate whether a character is
	// hexadecimal.
	hexChar
)

// END encoding (keep this marker comment in sync with generateEnc)

// encNames maps each encoding mode to the identifier that generateTable
// writes into the generated table for that mode. hexChar has no entry here;
// generateTable emits it separately, based on ishex.
//
// Keep this in sync with the definitions of encoding mode constants.
var encNames = map[encoding]string{
	encodePath:           "encodePath",
	encodePathSegment:    "encodePathSegment",
	encodeHost:           "encodeHost",
	encodeZone:           "encodeZone",
	encodeUserPassword:   "encodeUserPassword",
	encodeQueryComponent: "encodeQueryComponent",
	encodeFragment:       "encodeFragment",
}

// shouldEscape reports whether the byte c should be escaped when
// appearing in the part of a URL string identified by mode, according
// to RFC 3986.
//
// mode is compared with == against individual encoding constants, so it
// is expected to be a single encoding mode rather than a combination of
// bits.
//
// Please be informed that for now shouldEscape does not check all
// reserved characters correctly. See golang.org/issue/5684.
func shouldEscape(c byte, mode encoding) bool {
	// §2.3 Unreserved characters (alphanum)
	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
		return false
	}

	// This check must come before the reserved-character switch below:
	// several of the characters accepted here are also listed there.
	if mode == encodeHost || mode == encodeZone {
		// §3.2.2 Host allows
		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
		// as part of reg-name.
		// We add : because we include :port as part of host.
		// We add [ ] because we include [ipv6]:port as part of host.
		// We add < > because they're the only characters left that
		// we could possibly allow, and Parse will reject them if we
		// escape them (because hosts can't use %-encoding for
		// ASCII bytes).
		// NOTE(review): the double quote is also accepted below although
		// it is not mentioned above.
		switch c {
		case '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', ':', '[', ']', '<', '>', '"':
			return false
		}
	}

	switch c {
	case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
		return false

	case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
		// Different sections of the URL allow a few of
		// the reserved characters to appear unescaped.
		switch mode {
		case encodePath: // §3.3
			// The RFC allows : @ & = + $ but saves / ; , for assigning
			// meaning to individual path segments. This package
			// only manipulates the path as a whole, so we allow those
			// last three as well. That leaves only ? to escape.
			return c == '?'

		case encodePathSegment: // §3.3
			// The RFC allows : @ & = + $ but saves / ; , for assigning
			// meaning to individual path segments.
			return c == '/' || c == ';' || c == ',' || c == '?'

		case encodeUserPassword: // §3.2.1
			// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
			// userinfo, so we must escape only '@', '/', and '?'.
			// The parsing of userinfo treats ':' as special so we must escape
			// that too.
			return c == '@' || c == '/' || c == '?' || c == ':'

		case encodeQueryComponent: // §3.4
			// The RFC reserves (so we must escape) everything.
			return true

		case encodeFragment: // §4.1
			// The RFC text is silent but the grammar allows
			// everything, so escape nothing.
			return false
		}
		// Any mode without a case above (such as encodeHost and
		// encodeZone) falls through to the checks at the end of the
		// function.
	}

	if mode == encodeFragment {
		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
		// need to be escaped. To minimize potential breakage, we apply two restrictions:
		// (1) we always escape sub-delims outside of the fragment, and (2) we always
		// escape single quote to avoid breaking callers that had previously assumed that
		// single quotes would be escaped. See issue #19917.
		switch c {
		case '!', '(', ')', '*':
			return false
		}
	}

	// Everything else must be escaped.
	return true
}

// ishex reports whether c is an ASCII hexadecimal digit: '0'-'9', 'a'-'f',
// or 'A'-'F'.
func ishex(c byte) bool {
	switch {
	case c >= '0' && c <= '9':
		return true
	case c >= 'a' && c <= 'f':
		return true
	case c >= 'A' && c <= 'F':
		return true
	}
	return false
}