File size: 19,549 Bytes

e36aeda

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.jsonv2

package jsonwire

import (
	"io"
	"math"
	"slices"
	"strconv"
	"unicode/utf16"
	"unicode/utf8"
)

type ValueFlags uint

const (
	_ ValueFlags = (1 << iota) / 2 // powers of two starting with zero

	stringNonVerbatim  // string cannot be naively treated as valid UTF-8
	stringNonCanonical // string not formatted according to RFC 8785, section 3.2.2.2.
	// TODO: Track whether a number is a non-integer?
)

func (f *ValueFlags) Join(f2 ValueFlags) { *f |= f2 }
func (f ValueFlags) IsVerbatim() bool    { return f&stringNonVerbatim == 0 }
func (f ValueFlags) IsCanonical() bool   { return f&stringNonCanonical == 0 }

// ConsumeWhitespace consumes leading JSON whitespace per RFC 7159, section 2.
func ConsumeWhitespace(b []byte) (n int) {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	for len(b) > n && (b[n] == ' ' || b[n] == '\t' || b[n] == '\r' || b[n] == '\n') {
		n++
	}
	return n
}

// ConsumeNull consumes the next JSON null literal per RFC 7159, section 3.
// It returns 0 if it is invalid, in which case consumeLiteral should be used.
func ConsumeNull(b []byte) int {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	const literal = "null"
	if len(b) >= len(literal) && string(b[:len(literal)]) == literal {
		return len(literal)
	}
	return 0
}

// ConsumeFalse consumes the next JSON false literal per RFC 7159, section 3.
// It returns 0 if it is invalid, in which case consumeLiteral should be used.
func ConsumeFalse(b []byte) int {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	const literal = "false"
	if len(b) >= len(literal) && string(b[:len(literal)]) == literal {
		return len(literal)
	}
	return 0
}

// ConsumeTrue consumes the next JSON true literal per RFC 7159, section 3.
// It returns 0 if it is invalid, in which case consumeLiteral should be used.
func ConsumeTrue(b []byte) int {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	const literal = "true"
	if len(b) >= len(literal) && string(b[:len(literal)]) == literal {
		return len(literal)
	}
	return 0
}

// ConsumeLiteral consumes the next JSON literal per RFC 7159, section 3.
// If the input appears truncated, it returns io.ErrUnexpectedEOF.
func ConsumeLiteral(b []byte, lit string) (n int, err error) {
	for i := 0; i < len(b) && i < len(lit); i++ {
		if b[i] != lit[i] {
			return i, NewInvalidCharacterError(b[i:], "in literal "+lit+" (expecting "+strconv.QuoteRune(rune(lit[i]))+")")
		}
	}
	if len(b) < len(lit) {
		return len(b), io.ErrUnexpectedEOF
	}
	return len(lit), nil
}

// ConsumeSimpleString consumes the next JSON string per RFC 7159, section 7
// but is limited to the grammar for an ASCII string without escape sequences.
// It returns 0 if it is invalid or more complicated than a simple string,
// in which case consumeString should be called.
//
// It rejects '<', '>', and '&' for compatibility reasons since these were
// always escaped in the v1 implementation. Thus, if this function reports
// non-zero then we know that the string would be encoded the same way
// under both v1 or v2 escape semantics.
func ConsumeSimpleString(b []byte) (n int) {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	if len(b) > 0 && b[0] == '"' {
		n++
		for len(b) > n && b[n] < utf8.RuneSelf && escapeASCII[b[n]] == 0 {
			n++
		}
		if uint(len(b)) > uint(n) && b[n] == '"' {
			n++
			return n
		}
	}
	return 0
}

// ConsumeString consumes the next JSON string per RFC 7159, section 7.
// If validateUTF8 is false, then this allows the presence of invalid UTF-8
// characters within the string itself.
// It reports the number of bytes consumed and whether an error was encountered.
// If the input appears truncated, it returns io.ErrUnexpectedEOF.
func ConsumeString(flags *ValueFlags, b []byte, validateUTF8 bool) (n int, err error) {
	return ConsumeStringResumable(flags, b, 0, validateUTF8)
}

// ConsumeStringResumable is identical to consumeString but supports resuming
// from a previous call that returned io.ErrUnexpectedEOF.
func ConsumeStringResumable(flags *ValueFlags, b []byte, resumeOffset int, validateUTF8 bool) (n int, err error) {
	// Consume the leading double quote.
	switch {
	case resumeOffset > 0:
		n = resumeOffset // already handled the leading quote
	case uint(len(b)) == 0:
		return n, io.ErrUnexpectedEOF
	case b[0] == '"':
		n++
	default:
		return n, NewInvalidCharacterError(b[n:], `at start of string (expecting '"')`)
	}

	// Consume every character in the string.
	for uint(len(b)) > uint(n) {
		// Optimize for long sequences of unescaped characters.
		noEscape := func(c byte) bool {
			return c < utf8.RuneSelf && ' ' <= c && c != '\\' && c != '"'
		}
		for uint(len(b)) > uint(n) && noEscape(b[n]) {
			n++
		}
		if uint(len(b)) <= uint(n) {
			return n, io.ErrUnexpectedEOF
		}

		// Check for terminating double quote.
		if b[n] == '"' {
			n++
			return n, nil
		}

		switch r, rn := utf8.DecodeRune(b[n:]); {
		// Handle UTF-8 encoded byte sequence.
		// Due to specialized handling of ASCII above, we know that
		// all normal sequences at this point must be 2 bytes or larger.
		case rn > 1:
			n += rn
		// Handle escape sequence.
		case r == '\\':
			flags.Join(stringNonVerbatim)
			resumeOffset = n
			if uint(len(b)) < uint(n+2) {
				return resumeOffset, io.ErrUnexpectedEOF
			}
			switch r := b[n+1]; r {
			case '/':
				// Forward slash is the only character with 3 representations.
				// Per RFC 8785, section 3.2.2.2., this must not be escaped.
				flags.Join(stringNonCanonical)
				n += 2
			case '"', '\\', 'b', 'f', 'n', 'r', 't':
				n += 2
			case 'u':
				if uint(len(b)) < uint(n+6) {
					if hasEscapedUTF16Prefix(b[n:], false) {
						return resumeOffset, io.ErrUnexpectedEOF
					}
					flags.Join(stringNonCanonical)
					return n, NewInvalidEscapeSequenceError(b[n:])
				}
				v1, ok := parseHexUint16(b[n+2 : n+6])
				if !ok {
					flags.Join(stringNonCanonical)
					return n, NewInvalidEscapeSequenceError(b[n : n+6])
				}
				// Only certain control characters can use the \uFFFF notation
				// for canonical formatting (per RFC 8785, section 3.2.2.2.).
				switch v1 {
				// \uFFFF notation not permitted for these characters.
				case '\b', '\f', '\n', '\r', '\t':
					flags.Join(stringNonCanonical)
				default:
					// \uFFFF notation only permitted for control characters.
					if v1 >= ' ' {
						flags.Join(stringNonCanonical)
					} else {
						// \uFFFF notation must be lower case.
						for _, c := range b[n+2 : n+6] {
							if 'A' <= c && c <= 'F' {
								flags.Join(stringNonCanonical)
							}
						}
					}
				}
				n += 6

				r := rune(v1)
				if validateUTF8 && utf16.IsSurrogate(r) {
					if uint(len(b)) < uint(n+6) {
						if hasEscapedUTF16Prefix(b[n:], true) {
							return resumeOffset, io.ErrUnexpectedEOF
						}
						flags.Join(stringNonCanonical)
						return n - 6, NewInvalidEscapeSequenceError(b[n-6:])
					} else if v2, ok := parseHexUint16(b[n+2 : n+6]); b[n] != '\\' || b[n+1] != 'u' || !ok {
						flags.Join(stringNonCanonical)
						return n - 6, NewInvalidEscapeSequenceError(b[n-6 : n+6])
					} else if r = utf16.DecodeRune(rune(v1), rune(v2)); r == utf8.RuneError {
						flags.Join(stringNonCanonical)
						return n - 6, NewInvalidEscapeSequenceError(b[n-6 : n+6])
					} else {
						n += 6
					}
				}
			default:
				flags.Join(stringNonCanonical)
				return n, NewInvalidEscapeSequenceError(b[n : n+2])
			}
		// Handle invalid UTF-8.
		case r == utf8.RuneError:
			if !utf8.FullRune(b[n:]) {
				return n, io.ErrUnexpectedEOF
			}
			flags.Join(stringNonVerbatim | stringNonCanonical)
			if validateUTF8 {
				return n, ErrInvalidUTF8
			}
			n++
		// Handle invalid control characters.
		case r < ' ':
			flags.Join(stringNonVerbatim | stringNonCanonical)
			return n, NewInvalidCharacterError(b[n:], "in string (expecting non-control character)")
		default:
			panic("BUG: unhandled character " + QuoteRune(b[n:]))
		}
	}
	return n, io.ErrUnexpectedEOF
}

// AppendUnquote appends the unescaped form of a JSON string in src to dst.
// Any invalid UTF-8 within the string will be replaced with utf8.RuneError,
// but the error will be specified as having encountered such an error.
// The input must be an entire JSON string with no surrounding whitespace.
func AppendUnquote[Bytes ~[]byte | ~string](dst []byte, src Bytes) (v []byte, err error) {
	dst = slices.Grow(dst, len(src))

	// Consume the leading double quote.
	var i, n int
	switch {
	case uint(len(src)) == 0:
		return dst, io.ErrUnexpectedEOF
	case src[0] == '"':
		i, n = 1, 1
	default:
		return dst, NewInvalidCharacterError(src, `at start of string (expecting '"')`)
	}

	// Consume every character in the string.
	for uint(len(src)) > uint(n) {
		// Optimize for long sequences of unescaped characters.
		noEscape := func(c byte) bool {
			return c < utf8.RuneSelf && ' ' <= c && c != '\\' && c != '"'
		}
		for uint(len(src)) > uint(n) && noEscape(src[n]) {
			n++
		}
		if uint(len(src)) <= uint(n) {
			dst = append(dst, src[i:n]...)
			return dst, io.ErrUnexpectedEOF
		}

		// Check for terminating double quote.
		if src[n] == '"' {
			dst = append(dst, src[i:n]...)
			n++
			if n < len(src) {
				err = NewInvalidCharacterError(src[n:], "after string value")
			}
			return dst, err
		}

		switch r, rn := utf8.DecodeRuneInString(string(truncateMaxUTF8(src[n:]))); {
		// Handle UTF-8 encoded byte sequence.
		// Due to specialized handling of ASCII above, we know that
		// all normal sequences at this point must be 2 bytes or larger.
		case rn > 1:
			n += rn
		// Handle escape sequence.
		case r == '\\':
			dst = append(dst, src[i:n]...)

			// Handle escape sequence.
			if uint(len(src)) < uint(n+2) {
				return dst, io.ErrUnexpectedEOF
			}
			switch r := src[n+1]; r {
			case '"', '\\', '/':
				dst = append(dst, r)
				n += 2
			case 'b':
				dst = append(dst, '\b')
				n += 2
			case 'f':
				dst = append(dst, '\f')
				n += 2
			case 'n':
				dst = append(dst, '\n')
				n += 2
			case 'r':
				dst = append(dst, '\r')
				n += 2
			case 't':
				dst = append(dst, '\t')
				n += 2
			case 'u':
				if uint(len(src)) < uint(n+6) {
					if hasEscapedUTF16Prefix(src[n:], false) {
						return dst, io.ErrUnexpectedEOF
					}
					return dst, NewInvalidEscapeSequenceError(src[n:])
				}
				v1, ok := parseHexUint16(src[n+2 : n+6])
				if !ok {
					return dst, NewInvalidEscapeSequenceError(src[n : n+6])
				}
				n += 6

				// Check whether this is a surrogate half.
				r := rune(v1)
				if utf16.IsSurrogate(r) {
					r = utf8.RuneError // assume failure unless the following succeeds
					if uint(len(src)) < uint(n+6) {
						if hasEscapedUTF16Prefix(src[n:], true) {
							return utf8.AppendRune(dst, r), io.ErrUnexpectedEOF
						}
						err = NewInvalidEscapeSequenceError(src[n-6:])
					} else if v2, ok := parseHexUint16(src[n+2 : n+6]); src[n] != '\\' || src[n+1] != 'u' || !ok {
						err = NewInvalidEscapeSequenceError(src[n-6 : n+6])
					} else if r = utf16.DecodeRune(rune(v1), rune(v2)); r == utf8.RuneError {
						err = NewInvalidEscapeSequenceError(src[n-6 : n+6])
					} else {
						n += 6
					}
				}

				dst = utf8.AppendRune(dst, r)
			default:
				return dst, NewInvalidEscapeSequenceError(src[n : n+2])
			}
			i = n
		// Handle invalid UTF-8.
		case r == utf8.RuneError:
			dst = append(dst, src[i:n]...)
			if !utf8.FullRuneInString(string(truncateMaxUTF8(src[n:]))) {
				return dst, io.ErrUnexpectedEOF
			}
			// NOTE: An unescaped string may be longer than the escaped string
			// because invalid UTF-8 bytes are being replaced.
			dst = append(dst, "\uFFFD"...)
			n += rn
			i = n
			err = ErrInvalidUTF8
		// Handle invalid control characters.
		case r < ' ':
			dst = append(dst, src[i:n]...)
			return dst, NewInvalidCharacterError(src[n:], "in string (expecting non-control character)")
		default:
			panic("BUG: unhandled character " + QuoteRune(src[n:]))
		}
	}
	dst = append(dst, src[i:n]...)
	return dst, io.ErrUnexpectedEOF
}

// hasEscapedUTF16Prefix reports whether b is possibly
// the truncated prefix of a \uFFFF escape sequence.
func hasEscapedUTF16Prefix[Bytes ~[]byte | ~string](b Bytes, lowerSurrogateHalf bool) bool {
	for i := range len(b) {
		switch c := b[i]; {
		case i == 0 && c != '\\':
			return false
		case i == 1 && c != 'u':
			return false
		case i == 2 && lowerSurrogateHalf && c != 'd' && c != 'D':
			return false // not within ['\uDC00':'\uDFFF']
		case i == 3 && lowerSurrogateHalf && !('c' <= c && c <= 'f') && !('C' <= c && c <= 'F'):
			return false // not within ['\uDC00':'\uDFFF']
		case i >= 2 && i < 6 && !('0' <= c && c <= '9') && !('a' <= c && c <= 'f') && !('A' <= c && c <= 'F'):
			return false
		}
	}
	return true
}

// UnquoteMayCopy returns the unescaped form of b.
// If there are no escaped characters, the output is simply a subslice of
// the input with the surrounding quotes removed.
// Otherwise, a new buffer is allocated for the output.
// It assumes the input is valid.
func UnquoteMayCopy(b []byte, isVerbatim bool) []byte {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	if isVerbatim {
		return b[len(`"`) : len(b)-len(`"`)]
	}
	b, _ = AppendUnquote(nil, b)
	return b
}

// ConsumeSimpleNumber consumes the next JSON number per RFC 7159, section 6
// but is limited to the grammar for a positive integer.
// It returns 0 if it is invalid or more complicated than a simple integer,
// in which case consumeNumber should be called.
func ConsumeSimpleNumber(b []byte) (n int) {
	// NOTE: The arguments and logic are kept simple to keep this inlinable.
	if len(b) > 0 {
		if b[0] == '0' {
			n++
		} else if '1' <= b[0] && b[0] <= '9' {
			n++
			for len(b) > n && ('0' <= b[n] && b[n] <= '9') {
				n++
			}
		} else {
			return 0
		}
		if uint(len(b)) <= uint(n) || (b[n] != '.' && b[n] != 'e' && b[n] != 'E') {
			return n
		}
	}
	return 0
}

type ConsumeNumberState uint

const (
	consumeNumberInit ConsumeNumberState = iota
	beforeIntegerDigits
	withinIntegerDigits
	beforeFractionalDigits
	withinFractionalDigits
	beforeExponentDigits
	withinExponentDigits
)

// ConsumeNumber consumes the next JSON number per RFC 7159, section 6.
// It reports the number of bytes consumed and whether an error was encountered.
// If the input appears truncated, it returns io.ErrUnexpectedEOF.
//
// Note that JSON numbers are not self-terminating.
// If the entire input is consumed, then the caller needs to consider whether
// there may be subsequent unread data that may still be part of this number.
func ConsumeNumber(b []byte) (n int, err error) {
	n, _, err = ConsumeNumberResumable(b, 0, consumeNumberInit)
	return n, err
}

// ConsumeNumberResumable is identical to consumeNumber but supports resuming
// from a previous call that returned io.ErrUnexpectedEOF.
func ConsumeNumberResumable(b []byte, resumeOffset int, state ConsumeNumberState) (n int, _ ConsumeNumberState, err error) {
	// Jump to the right state when resuming from a partial consumption.
	n = resumeOffset
	if state > consumeNumberInit {
		switch state {
		case withinIntegerDigits, withinFractionalDigits, withinExponentDigits:
			// Consume leading digits.
			for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
				n++
			}
			if uint(len(b)) <= uint(n) {
				return n, state, nil // still within the same state
			}
			state++ // switches "withinX" to "beforeY" where Y is the state after X
		}
		switch state {
		case beforeIntegerDigits:
			goto beforeInteger
		case beforeFractionalDigits:
			goto beforeFractional
		case beforeExponentDigits:
			goto beforeExponent
		default:
			return n, state, nil
		}
	}

	// Consume required integer component (with optional minus sign).
beforeInteger:
	resumeOffset = n
	if uint(len(b)) > 0 && b[0] == '-' {
		n++
	}
	switch {
	case uint(len(b)) <= uint(n):
		return resumeOffset, beforeIntegerDigits, io.ErrUnexpectedEOF
	case b[n] == '0':
		n++
		state = beforeFractionalDigits
	case '1' <= b[n] && b[n] <= '9':
		n++
		for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
			n++
		}
		state = withinIntegerDigits
	default:
		return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)")
	}

	// Consume optional fractional component.
beforeFractional:
	if uint(len(b)) > uint(n) && b[n] == '.' {
		resumeOffset = n
		n++
		switch {
		case uint(len(b)) <= uint(n):
			return resumeOffset, beforeFractionalDigits, io.ErrUnexpectedEOF
		case '0' <= b[n] && b[n] <= '9':
			n++
		default:
			return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)")
		}
		for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
			n++
		}
		state = withinFractionalDigits
	}

	// Consume optional exponent component.
beforeExponent:
	if uint(len(b)) > uint(n) && (b[n] == 'e' || b[n] == 'E') {
		resumeOffset = n
		n++
		if uint(len(b)) > uint(n) && (b[n] == '-' || b[n] == '+') {
			n++
		}
		switch {
		case uint(len(b)) <= uint(n):
			return resumeOffset, beforeExponentDigits, io.ErrUnexpectedEOF
		case '0' <= b[n] && b[n] <= '9':
			n++
		default:
			return n, state, NewInvalidCharacterError(b[n:], "in number (expecting digit)")
		}
		for uint(len(b)) > uint(n) && ('0' <= b[n] && b[n] <= '9') {
			n++
		}
		state = withinExponentDigits
	}

	return n, state, nil
}

// parseHexUint16 is similar to strconv.ParseUint,
// but operates directly on []byte and is optimized for base-16.
// See https://go.dev/issue/42429.
func parseHexUint16[Bytes ~[]byte | ~string](b Bytes) (v uint16, ok bool) {
	if len(b) != 4 {
		return 0, false
	}
	for i := range 4 {
		c := b[i]
		switch {
		case '0' <= c && c <= '9':
			c = c - '0'
		case 'a' <= c && c <= 'f':
			c = 10 + c - 'a'
		case 'A' <= c && c <= 'F':
			c = 10 + c - 'A'
		default:
			return 0, false
		}
		v = v*16 + uint16(c)
	}
	return v, true
}

// ParseUint parses b as a decimal unsigned integer according to
// a strict subset of the JSON number grammar, returning the value if valid.
// It returns (0, false) if there is a syntax error and
// returns (math.MaxUint64, false) if there is an overflow.
func ParseUint(b []byte) (v uint64, ok bool) {
	const unsafeWidth = 20 // len(fmt.Sprint(uint64(math.MaxUint64)))
	var n int
	for ; len(b) > n && ('0' <= b[n] && b[n] <= '9'); n++ {
		v = 10*v + uint64(b[n]-'0')
	}
	switch {
	case n == 0 || len(b) != n || (b[0] == '0' && string(b) != "0"):
		return 0, false
	case n >= unsafeWidth && (b[0] != '1' || v < 1e19 || n > unsafeWidth):
		return math.MaxUint64, false
	}
	return v, true
}

// ParseFloat parses a floating point number according to the Go float grammar.
// Note that the JSON number grammar is a strict subset.
//
// If the number overflows the finite representation of a float,
// then we return MaxFloat since any finite value will always be infinitely
// more accurate at representing another finite value than an infinite value.
func ParseFloat(b []byte, bits int) (v float64, ok bool) {
	fv, err := strconv.ParseFloat(string(b), bits)
	if math.IsInf(fv, 0) {
		switch {
		case bits == 32 && math.IsInf(fv, +1):
			fv = +math.MaxFloat32
		case bits == 64 && math.IsInf(fv, +1):
			fv = +math.MaxFloat64
		case bits == 32 && math.IsInf(fv, -1):
			fv = -math.MaxFloat32
		case bits == 64 && math.IsInf(fv, -1):
			fv = -math.MaxFloat64
		}
	}
	return fv, err == nil
}