Upload folder using huggingface_hub

e36aeda verified about 1 month ago

13.8 kB

	// Copyright 2020 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	//go:build goexperiment.jsonv2

	package jsontext

	import (
	"bytes"
	"errors"
	"io"
	"slices"
	"sync"

	"encoding/json/internal/jsonflags"
	"encoding/json/internal/jsonwire"
	)

	// NOTE: Value is analogous to v1 json.RawMessage.

	// AppendFormat formats the JSON value in src and appends it to dst
	// according to the specified options.
	// See [Value.Format] for more details about the formatting behavior.
	//
	// The dst and src may overlap.
	// If an error is reported, then the entirety of src is appended to dst.
	func AppendFormat(dst, src []byte, opts ...Options) ([]byte, error) {
	e := getBufferedEncoder(opts...)
	defer putBufferedEncoder(e)
	e.s.Flags.Set(jsonflags.OmitTopLevelNewline \| 1)
	if err := e.s.WriteValue(src); err != nil {
	return append(dst, src...), err
	}
	return append(dst, e.s.Buf...), nil
	}

	// Value represents a single raw JSON value, which may be one of the following:
	//   - a JSON literal (i.e., null, true, or false)
	//   - a JSON string (e.g., "hello, world!")
	//   - a JSON number (e.g., 123.456)
	//   - an entire JSON object (e.g., {"fizz":"buzz"} )
	//   - an entire JSON array (e.g., [1,2,3] )
	//
	// Value can represent entire array or object values, while [Token] cannot.
	// Value may contain leading and/or trailing whitespace.
	//
	// The underlying representation is simply the raw bytes of the JSON text.
	type Value []byte

	// Clone returns an independent copy of v.
	// The copy does not share backing storage with v; a nil v yields nil.
	func (v Value) Clone() Value {
		dup := bytes.Clone([]byte(v))
		return Value(dup)
	}

	// String returns v as a string.
	// A nil Value is rendered as the JSON literal "null";
	// any other value (including an empty one) is returned verbatim.
	func (v Value) String() string {
		if v != nil {
			return string(v)
		}
		return "null"
	}

	// IsValid reports whether the raw JSON value is syntactically valid
	// according to the specified options.
	//
	// By default (if no options are specified), it validates according to RFC 7493.
	// It verifies whether the input is properly encoded as UTF-8,
	// that escape sequences within strings decode to valid Unicode codepoints, and
	// that all names in each object are unique.
	// It does not verify whether numbers are representable within the limits
	// of any common numeric type (e.g., float64, int64, or uint64).
	//
	// Relevant options include:
	//   - [AllowDuplicateNames]
	//   - [AllowInvalidUTF8]
	//
	// All other options are ignored.
	func (v Value) IsValid(opts ...Options) bool {
		// TODO: Document support for [WithByteLimit] and [WithDepthLimit].
		dec := getBufferedDecoder(v, opts...)
		defer putBufferedDecoder(dec)

		// Exactly one value must be readable, and nothing but EOF may follow it.
		_, valueErr := dec.ReadValue()
		_, trailingErr := dec.ReadToken()
		if valueErr != nil {
			return false
		}
		return trailingErr == io.EOF
	}

	// Format formats the raw JSON value in place.
	//
	// By default (if no options are specified), it validates according to RFC 7493
	// and produces the minimal JSON representation, where
	// all whitespace is elided and JSON strings use the shortest encoding.
	//
	// Relevant options include:
	//   - [AllowDuplicateNames]
	//   - [AllowInvalidUTF8]
	//   - [EscapeForHTML]
	//   - [EscapeForJS]
	//   - [PreserveRawStrings]
	//   - [CanonicalizeRawInts]
	//   - [CanonicalizeRawFloats]
	//   - [ReorderRawObjects]
	//   - [SpaceAfterColon]
	//   - [SpaceAfterComma]
	//   - [Multiline]
	//   - [WithIndent]
	//   - [WithIndentPrefix]
	//
	// All other options are ignored.
	//
	// It is guaranteed to succeed if the value is valid according to the same options.
	// If the value is already formatted, then the buffer is not mutated.
	func (v *Value) Format(opts ...Options) error {
		// TODO: Document support for [WithByteLimit] and [WithDepthLimit].
		var extra []Options // Format adds no options beyond the caller's own
		return v.format(opts, extra)
	}

	// format accepts two []Options to avoid the allocation appending them together.
	// It is equivalent to v.Format(append(opts1, opts2...)...).
	func (v *Value) format(opts1, opts2 []Options) error {
	e := getBufferedEncoder(opts1...)
	defer putBufferedEncoder(e)
	e.s.Join(opts2...)
	e.s.Flags.Set(jsonflags.OmitTopLevelNewline \| 1)
	if err := e.s.WriteValue(*v); err != nil {
	return err
	}
	if !bytes.Equal(*v, e.s.Buf) {
	v = append((v)[:0], e.s.Buf...)
	}
	return nil
	}

	// Compact removes all whitespace from the raw JSON value.
	//
	// It does not reformat JSON strings or numbers to use any other representation.
	// To maximize the set of JSON values that can be formatted,
	// this permits values with duplicate names and invalid UTF-8.
	//
	// Compact is equivalent to calling [Value.Format] with the following options:
	//   - [AllowDuplicateNames](true)
	//   - [AllowInvalidUTF8](true)
	//   - [PreserveRawStrings](true)
	//
	// Any options specified by the caller are applied after the initial set
	// and may deliberately override prior options.
	func (v *Value) Compact(opts ...Options) error {
		// Baseline options; the caller's opts are joined afterwards and win.
		baseline := []Options{
			AllowDuplicateNames(true),
			AllowInvalidUTF8(true),
			PreserveRawStrings(true),
		}
		return v.format(baseline, opts)
	}

	// Indent reformats the whitespace in the raw JSON value so that each element
	// in a JSON object or array begins on an indented line according to the nesting.
	//
	// It does not reformat JSON strings or numbers to use any other representation.
	// To maximize the set of JSON values that can be formatted,
	// this permits values with duplicate names and invalid UTF-8.
	//
	// Indent is equivalent to calling [Value.Format] with the following options:
	//   - [AllowDuplicateNames](true)
	//   - [AllowInvalidUTF8](true)
	//   - [PreserveRawStrings](true)
	//   - [Multiline](true)
	//
	// Any options specified by the caller are applied after the initial set
	// and may deliberately override prior options.
	func (v *Value) Indent(opts ...Options) error {
		// Baseline options; the caller's opts are joined afterwards and win.
		baseline := []Options{
			AllowDuplicateNames(true),
			AllowInvalidUTF8(true),
			PreserveRawStrings(true),
			Multiline(true),
		}
		return v.format(baseline, opts)
	}

	// Canonicalize canonicalizes the raw JSON value according to the
	// JSON Canonicalization Scheme (JCS) as defined by RFC 8785
	// where it produces a stable representation of a JSON value.
	//
	// JSON strings are formatted to use their minimal representation,
	// JSON numbers are formatted as double precision numbers according
	// to some stable serialization algorithm.
	// JSON object members are sorted in ascending order by name.
	// All whitespace is removed.
	//
	// The output stability is dependent on the stability of the application data
	// (see RFC 8785, Appendix E). It cannot produce stable output from
	// fundamentally unstable input. For example, if the JSON value
	// contains ephemeral data (e.g., a frequently changing timestamp),
	// then the value is still unstable regardless of whether this is called.
	//
	// Canonicalize is equivalent to calling [Value.Format] with the following options:
	//   - [CanonicalizeRawInts](true)
	//   - [CanonicalizeRawFloats](true)
	//   - [ReorderRawObjects](true)
	//
	// Any options specified by the caller are applied after the initial set
	// and may deliberately override prior options.
	//
	// Note that JCS treats all JSON numbers as IEEE 754 double precision numbers.
	// Any numbers with precision beyond what is representable by that form
	// will lose their precision when canonicalized. For example, integer values
	// beyond ±2⁵³ will lose their precision. To preserve the original representation
	// of JSON integers, additionally set [CanonicalizeRawInts] to false:
	//
	//	v.Canonicalize(jsontext.CanonicalizeRawInts(false))
	func (v *Value) Canonicalize(opts ...Options) error {
		// Baseline options; the caller's opts are joined afterwards and win.
		baseline := []Options{
			CanonicalizeRawInts(true),
			CanonicalizeRawFloats(true),
			ReorderRawObjects(true),
		}
		return v.format(baseline, opts)
	}

	// MarshalJSON returns v as the JSON encoding of v.
	// The stored bytes are returned as-is, without any validation or copying.
	// If v is nil, then this returns a JSON null.
	func (v Value) MarshalJSON() ([]byte, error) {
		// NOTE: This matches the behavior of v1 json.RawMessage.MarshalJSON.
		if v != nil {
			return v, nil
		}
		return []byte("null"), nil
	}

	// UnmarshalJSON sets *v to the JSON encoding of b.
	// It stores a copy of the provided raw JSON input without any validation,
	// reusing the existing backing array of *v when it has enough capacity.
	// It returns an error if called on a nil pointer.
	func (v *Value) UnmarshalJSON(b []byte) error {
		// NOTE: This matches the behavior of v1 json.RawMessage.UnmarshalJSON.
		if v == nil {
			return errors.New("jsontext.Value: UnmarshalJSON on nil pointer")
		}
		// Store through the pointer so the caller's Value is updated;
		// the copy keeps *v independent of later changes to b.
		*v = append((*v)[:0], b...)
		return nil
	}

	// Kind returns the starting token kind, determined from the first byte
	// that follows any leading whitespace.
	// If v is empty or consists only of whitespace, it returns invalidKind.
	// For a valid value, this will never include [KindEndObject] or [KindEndArray].
	func (v Value) Kind() Kind {
		rest := v[jsonwire.ConsumeWhitespace(v):]
		if len(rest) == 0 {
			return invalidKind
		}
		return Kind(rest[0]).normalize()
	}

	// commaAndWhitespace is the cutset (comma, space, newline, carriage return,
	// and tab) trimmed from the front of an object member's buffer in order to
	// skip the separator and whitespace that precede the member name.
	const commaAndWhitespace = ", \n\r\t"

	// objectMember records one member of a JSON object: its unquoted name,
	// which is the primary sort key, and the raw bytes that the member
	// occupies in the value being reordered.
	type objectMember struct {
	// name is the unquoted name.
	name []byte // e.g., "name"
	// buffer is the entirety of the raw JSON object member
	// starting from right after the previous member (or opening '{')
	// until right after the member value.
	buffer []byte // e.g., `, \n\r\t"name": "value"`
	}

	// Compare orders x relative to y, returning the result of
	// jsonwire.CompareUTF16 on the member names and, when the names compare
	// equal, on the raw member buffers with any leading comma and whitespace
	// trimmed off.
	func (x objectMember) Compare(y objectMember) int {
		byName := jsonwire.CompareUTF16(x.name, y.name)
		if byName != 0 {
			return byName
		}
		// With [AllowDuplicateNames] or [AllowInvalidUTF8],
		// names could be identical, so also sort using the member value.
		xRaw := bytes.TrimLeft(x.buffer, commaAndWhitespace)
		yRaw := bytes.TrimLeft(y.buffer, commaAndWhitespace)
		return jsonwire.CompareUTF16(xRaw, yRaw)
	}

	// objectMemberPool recycles *[]objectMember scratch slices.
	// Slices are obtained via getObjectMembers and returned via putObjectMembers.
	var objectMemberPool = sync.Pool{New: func() any { return new([]objectMember) }}

	// getObjectMembers obtains a member slice from objectMemberPool.
	// The returned slice is truncated to zero length but keeps whatever
	// capacity the pooled slice already had.
	// Callers should hand it back with putObjectMembers when done.
	func getObjectMembers() *[]objectMember {
		ns := objectMemberPool.Get().(*[]objectMember)
		// Truncate the pointed-to slice; ns itself is a pointer and cannot be sliced.
		*ns = (*ns)[:0]
		return ns
	}
	// putObjectMembers returns ns to objectMemberPool for reuse.
	// Slices whose capacity has reached 1<<10 entries are dropped instead of
	// being pooled.
	func putObjectMembers(ns *[]objectMember) {
		const maxPooledCap = 1 << 10
		if cap(*ns) >= maxPooledCap {
			return
		}
		clear(*ns) // avoid pinning name and buffer
		objectMemberPool.Put(ns)
	}

	// mustReorderObjects reorders in-place all object members in a JSON value,
	// which must be valid otherwise it panics.
	func mustReorderObjects(b []byte) {
	// Obtain a buffered encoder just to use its internal buffer as
	// a scratch buffer for reordering object members.
	e2 := getBufferedEncoder()
	defer putBufferedEncoder(e2)

	// Disable unnecessary checks to syntactically parse the JSON value.
	d := getBufferedDecoder(b)
	defer putBufferedDecoder(d)
	d.s.Flags.Set(jsonflags.AllowDuplicateNames \| jsonflags.AllowInvalidUTF8 \| 1)
	mustReorderObjectsFromDecoder(d, &e2.s.Buf) // per RFC 8785, section 3.2.3
	}

	// mustReorderObjectsFromDecoder recursively reorders all object members in place
	// according to the ordering specified in RFC 8785, section 3.2.3.
	//
	// The decoder d is shared (by pointer) across the recursion so that every
	// nested call advances the same read position. The scratch argument points
	// at a reusable buffer in which reordered members are assembled before being
	// copied back; it is replaced whenever a larger buffer had to be grown.
	//
	// Pre-conditions:
	//   - The value is valid (i.e., no decoder errors should ever occur).
	//   - Initial call is provided a Decoder reading from the start of v.
	//
	// Post-conditions:
	//   - Exactly one JSON value is read from the Decoder.
	//   - All fully-parsed JSON objects are reordered by directly moving
	//     the members in the value buffer.
	//
	// The runtime is approximately O(n·log(n)) + O(m·log(m)),
	// where n is len(v) and m is the total number of object members.
	func mustReorderObjectsFromDecoder(d *Decoder, scratch *[]byte) {
		switch tok, err := d.ReadToken(); tok.Kind() {
		case '{':
			// Iterate and collect the name and offsets for every object member.
			members := getObjectMembers()
			defer putObjectMembers(members)
			var prevMember objectMember
			isSorted := true

			beforeBody := d.InputOffset() // offset after '{'
			for d.PeekKind() != '}' {
				beforeName := d.InputOffset()
				var flags jsonwire.ValueFlags
				// The error is ignored because the input is valid by pre-condition.
				name, _ := d.s.ReadValue(&flags)
				name = jsonwire.UnquoteMayCopy(name, flags.IsVerbatim())
				// Reorder the member value first so nested objects are
				// already sorted when this member is compared and moved.
				mustReorderObjectsFromDecoder(d, scratch)
				afterValue := d.InputOffset()

				currMember := objectMember{name, d.s.buf[beforeName:afterValue]}
				if isSorted && len(*members) > 0 {
					isSorted = objectMember.Compare(prevMember, currMember) < 0
				}
				// Append through the pointer so the pooled slice itself grows.
				*members = append(*members, currMember)
				prevMember = currMember
			}
			afterBody := d.InputOffset() // offset before '}'
			d.ReadToken()

			// Sort the members; return early if it's already sorted.
			if isSorted {
				return
			}
			firstBufferBeforeSorting := (*members)[0].buffer
			slices.SortFunc(*members, objectMember.Compare)
			firstBufferAfterSorting := (*members)[0].buffer

			// Append the reordered members to a new buffer,
			// then copy the reordered members back over the original members.
			// Avoid swapping in place since each member may be a different size
			// where moving a member over a smaller member may corrupt the data
			// for subsequent members before they have been moved.
			//
			// The following invariant must hold:
			//	sum([m.after-m.before for m in members]) == afterBody-beforeBody
			commaAndWhitespacePrefix := func(b []byte) []byte {
				return b[:len(b)-len(bytes.TrimLeft(b, commaAndWhitespace))]
			}
			sorted := (*scratch)[:0]
			for i, member := range *members {
				switch {
				case i == 0 && &member.buffer[0] != &firstBufferBeforeSorting[0]:
					// First member after sorting is not the first member before sorting,
					// so use the prefix of the first member before sorting.
					sorted = append(sorted, commaAndWhitespacePrefix(firstBufferBeforeSorting)...)
					sorted = append(sorted, bytes.TrimLeft(member.buffer, commaAndWhitespace)...)
				case i != 0 && &member.buffer[0] == &firstBufferBeforeSorting[0]:
					// Later member after sorting is the first member before sorting,
					// so use the prefix of the first member after sorting.
					sorted = append(sorted, commaAndWhitespacePrefix(firstBufferAfterSorting)...)
					sorted = append(sorted, bytes.TrimLeft(member.buffer, commaAndWhitespace)...)
				default:
					sorted = append(sorted, member.buffer...)
				}
			}
			if int(afterBody-beforeBody) != len(sorted) {
				panic("BUG: length invariant violated")
			}
			copy(d.s.buf[beforeBody:afterBody], sorted)

			// Update scratch buffer to the largest amount ever used.
			if len(sorted) > len(*scratch) {
				*scratch = sorted
			}
		case '[':
			for d.PeekKind() != ']' {
				mustReorderObjectsFromDecoder(d, scratch)
			}
			d.ReadToken()
		default:
			if err != nil {
				panic("BUG: " + err.Error())
			}
		}
	}