Upload folder using huggingface_hub

e36aeda verified 3 months ago

13.7 kB

	// Copyright 2025 The Go Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package unify

	import (
	"fmt"
	"iter"
	"reflect"
	"strings"
	)

	// An envSet is an immutable set of environments, where each environment is a
	// mapping from [ident]s to [Value]s.
	//
	// To keep this compact, we use an algebraic representation similar to
	// relational algebra. The atoms are zero, unit, or a singular binding:
	//
	// - A singular binding {x: v} is an environment set consisting of a single
	// environment that binds a single ident x to a single value v.
	//
	// - Zero (0) is the empty set.
	//
	// - Unit (1) is an environment set consisting of a single, empty environment
	// (no bindings).
	//
	// From these, we build up more complex sets of environments using sums and
	// cross products:
	//
	// - A sum, E + F, is simply the union of the two environment sets: E ∪ F
	//
	// - A cross product, E ⨯ F, is the Cartesian product of the two environment
	// sets, followed by joining each pair of environments: {e ⊕ f \| (e, f) ∊ E ⨯ F}
	//
	// The join of two environments, e ⊕ f, is an environment that contains all of
	// the bindings in either e or f. To detect bugs, it is an error if an
	// identifier is bound in both e and f (however, see below for what we could do
	// differently).
	//
	// Environment sets form a commutative semiring and thus obey the usual
	// commutative semiring rules:
	//
	// e + 0 = e
	// e ⨯ 0 = 0
	// e ⨯ 1 = e
	// e + f = f + e
	// e ⨯ f = f ⨯ e
	//
	// Furthermore, environments sets are additively and multiplicatively idempotent
	// because + and ⨯ are themselves defined in terms of sets:
	//
	// e + e = e
	// e ⨯ e = e
	//
	// # Examples
	//
	// To represent {{x: 1, y: 1}, {x: 2, y: 2}}, we build the two environments and
	// sum them:
	//
	// ({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})
	//
	// If we add a third variable z that can be 1 or 2, independent of x and y, we
	// get four logical environments:
	//
	// {x: 1, y: 1, z: 1}
	// {x: 2, y: 2, z: 1}
	// {x: 1, y: 1, z: 2}
	// {x: 2, y: 2, z: 2}
	//
	// This could be represented as a sum of all four environments, but because z is
	// independent, we can use a more compact representation:
	//
	// (({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})) ⨯ ({z: 1} + {z: 2})
	//
	// # Generalized cross product
	//
	// While cross-product is currently restricted to disjoint environments, we
	// could generalize the definition of joining two environments to:
	//
	// {xₖ: vₖ} ⊕ {xₖ: wₖ} = {xₖ: vₖ ∩ wₖ} (where unbound idents are bound to the [Top] value, ⟙)
	//
	// where v ∩ w is the unification of v and w. This itself could be coarsened to
	//
	// v ∩ w = v if w = ⟙
	// = w if v = ⟙
	// = v if v = w
	// = 0 otherwise
	//
	// We could use this rule to implement substitution. For example, E ⨯ {x: 1}
	// narrows environment set E to only environments in which x is bound to 1. But
	// we currently don't do this.
	type envSet struct {
	root *envExpr
	}

	type envExpr struct {
	// TODO: A tree-based data structure for this may not be ideal, since it
	// involves a lot of walking to find things and we often have to do deep
	// rewrites anyway for partitioning. Would some flattened array-style
	// representation be better, possibly combined with an index of ident uses?
	// We could even combine that with an immutable array abstraction (ala
	// Clojure) that could enable more efficient construction operations.

	kind envExprKind

	// For envBinding
	id *ident
	val *Value

	// For sum or product. Len must be >= 2 and none of the elements can have
	// the same kind as this node.
	operands []*envExpr
	}

	type envExprKind byte

	const (
	envZero envExprKind = iota
	envUnit
	envProduct
	envSum
	envBinding
	)

	var (
	// topEnv is the unit value (multiplicative identity) of a [envSet].
	topEnv = envSet{envExprUnit}
	// bottomEnv is the zero value (additive identity) of a [envSet].
	bottomEnv = envSet{envExprZero}

	envExprZero = &envExpr{kind: envZero}
	envExprUnit = &envExpr{kind: envUnit}
	)

	// bind binds id to each of vals in e.
	//
	// Its panics if id is already bound in e.
	//
	// Environments are typically initially constructed by starting with [topEnv]
	// and calling bind one or more times.
	func (e envSet) bind(id ident, vals ...Value) envSet {
	if e.isEmpty() {
	return bottomEnv
	}

	// TODO: If any of vals are _, should we just drop that val? We're kind of
	// inconsistent about whether an id missing from e means id is invalid or
	// means id is _.

	// Check that id isn't present in e.
	for range e.root.bindings(id) {
	panic("id " + id.name + " already present in environment")
	}

	// Create a sum of all the values.
	bindings := make([]*envExpr, 0, 1)
	for _, val := range vals {
	bindings = append(bindings, &envExpr{kind: envBinding, id: id, val: val})
	}

	// Multiply it in.
	return envSet{newEnvExprProduct(e.root, newEnvExprSum(bindings...))}
	}

	func (e envSet) isEmpty() bool {
	return e.root.kind == envZero
	}

	// bindings yields all [envBinding] nodes in e with the given id. If id is nil,
	// it yields all binding nodes.
	func (e envExpr) bindings(id ident) iter.Seq[*envExpr] {
	// This is just a pre-order walk and it happens this is the only thing we
	// need a pre-order walk for.
	return func(yield func(*envExpr) bool) {
	var rec func(e *envExpr) bool
	rec = func(e *envExpr) bool {
	if e.kind == envBinding && (id == nil \|\| e.id == id) {
	if !yield(e) {
	return false
	}
	}
	for _, o := range e.operands {
	if !rec(o) {
	return false
	}
	}
	return true
	}
	rec(e)
	}
	}

	// newEnvExprProduct constructs a product node from exprs, performing
	// simplifications. It does NOT check that bindings are disjoint.
	func newEnvExprProduct(exprs ...envExpr) envExpr {
	factors := make([]*envExpr, 0, 2)
	for _, expr := range exprs {
	switch expr.kind {
	case envZero:
	return envExprZero
	case envUnit:
	// No effect on product
	case envProduct:
	factors = append(factors, expr.operands...)
	default:
	factors = append(factors, expr)
	}
	}

	if len(factors) == 0 {
	return envExprUnit
	} else if len(factors) == 1 {
	return factors[0]
	}
	return &envExpr{kind: envProduct, operands: factors}
	}

	// newEnvExprSum constructs a sum node from exprs, performing simplifications.
	func newEnvExprSum(exprs ...envExpr) envExpr {
	// TODO: If all of envs are products (or bindings), factor any common terms.
	// E.g., x * y + x * z ==> x * (y + z). This is easy to do for binding
	// terms, but harder to do for more general terms.

	var have smallSet[*envExpr]
	terms := make([]*envExpr, 0, 2)
	for _, expr := range exprs {
	switch expr.kind {
	case envZero:
	// No effect on sum
	case envSum:
	for _, expr1 := range expr.operands {
	if have.Add(expr1) {
	terms = append(terms, expr1)
	}
	}
	default:
	if have.Add(expr) {
	terms = append(terms, expr)
	}
	}
	}

	if len(terms) == 0 {
	return envExprZero
	} else if len(terms) == 1 {
	return terms[0]
	}
	return &envExpr{kind: envSum, operands: terms}
	}

	func crossEnvs(env1, env2 envSet) envSet {
	// Confirm that envs have disjoint idents.
	var ids1 smallSet[*ident]
	for e := range env1.root.bindings(nil) {
	ids1.Add(e.id)
	}
	for e := range env2.root.bindings(nil) {
	if ids1.Has(e.id) {
	panic(fmt.Sprintf("%s bound on both sides of cross-product", e.id.name))
	}
	}

	return envSet{newEnvExprProduct(env1.root, env2.root)}
	}

	func unionEnvs(envs ...envSet) envSet {
	exprs := make([]*envExpr, len(envs))
	for i := range envs {
	exprs[i] = envs[i].root
	}
	return envSet{newEnvExprSum(exprs...)}
	}

	// envPartition is a subset of an env where id is bound to value in all
	// deterministic environments.
	type envPartition struct {
	id *ident
	value *Value
	env envSet
	}

	// partitionBy splits e by distinct bindings of id and removes id from each
	// partition.
	//
	// If there are environments in e where id is not bound, they will not be
	// reflected in any partition.
	//
	// It panics if e is bottom, since attempting to partition an empty environment
	// set almost certainly indicates a bug.
	func (e envSet) partitionBy(id *ident) []envPartition {
	if e.isEmpty() {
	// We could return zero partitions, but getting here at all almost
	// certainly indicates a bug.
	panic("cannot partition empty environment set")
	}

	// Emit a partition for each value of id.
	var seen smallSet[*Value]
	var parts []envPartition
	for n := range e.root.bindings(id) {
	if !seen.Add(n.val) {
	// Already emitted a partition for this value.
	continue
	}

	parts = append(parts, envPartition{
	id: id,
	value: n.val,
	env: envSet{e.root.substitute(id, n.val)},
	})
	}

	return parts
	}

	// substitute replaces bindings of id to val with 1 and bindings of id to any
	// other value with 0 and simplifies the result.
	func (e envExpr) substitute(id ident, val Value) envExpr {
	switch e.kind {
	default:
	panic("bad kind")

	case envZero, envUnit:
	return e

	case envBinding:
	if e.id != id {
	return e
	} else if e.val != val {
	return envExprZero
	} else {
	return envExprUnit
	}

	case envProduct, envSum:
	// Substitute each operand. Sometimes, this won't change anything, so we
	// build the new operands list lazily.
	var nOperands []*envExpr
	for i, op := range e.operands {
	nOp := op.substitute(id, val)
	if nOperands == nil && op != nOp {
	// Operand diverged; initialize nOperands.
	nOperands = make([]*envExpr, 0, len(e.operands))
	nOperands = append(nOperands, e.operands[:i]...)
	}
	if nOperands != nil {
	nOperands = append(nOperands, nOp)
	}
	}
	if nOperands == nil {
	// Nothing changed.
	return e
	}
	if e.kind == envProduct {
	return newEnvExprProduct(nOperands...)
	} else {
	return newEnvExprSum(nOperands...)
	}
	}
	}

	// A smallSet is a set optimized for stack allocation when small.
	type smallSet[T comparable] struct {
	array [32]T
	n int

	m map[T]struct{}
	}

	// Has returns whether val is in set.
	func (s *smallSet[T]) Has(val T) bool {
	arr := s.array[:s.n]
	for i := range arr {
	if arr[i] == val {
	return true
	}
	}
	_, ok := s.m[val]
	return ok
	}

	// Add adds val to the set and returns true if it was added (not already
	// present).
	func (s *smallSet[T]) Add(val T) bool {
	// Test for presence.
	if s.Has(val) {
	return false
	}

	// Add it
	if s.n < len(s.array) {
	s.array[s.n] = val
	s.n++
	} else {
	if s.m == nil {
	s.m = make(map[T]struct{})
	}
	s.m[val] = struct{}{}
	}
	return true
	}

	type ident struct {
	_ [0]func() // Not comparable (only compare *ident)
	name string
	}

	type Var struct {
	id *ident
	}

	func (d Var) Exact() bool {
	// These can't appear in concrete Values.
	panic("Exact called on non-concrete Value")
	}

	func (d Var) WhyNotExact() string {
	// These can't appear in concrete Values.
	return "WhyNotExact called on non-concrete Value"
	}

	func (d Var) decode(rv reflect.Value) error {
	return &inexactError{"var", rv.Type().String()}
	}

	func (d Var) unify(w Value, e envSet, swap bool, uf unifier) (Domain, envSet, error) {
	// TODO: Vars from !sums in the input can have a huge number of values.
	// Unifying these could be way more efficient with some indexes over any
	// exact values we can pull out, like Def fields that are exact Strings.
	// Maybe we try to produce an array of yes/no/maybe matches and then we only
	// have to do deeper evaluation of the maybes. We could probably cache this
	// on an envTerm. It may also help to special-case Var/Var unification to
	// pick which one to index versus enumerate.

	if vd, ok := w.Domain.(Var); ok && d.id == vd.id {
	// Unifying $x with $x results in $x. If we descend into this we'll have
	// problems because we strip $x out of the environment to keep ourselves
	// honest and then can't find it on the other side.
	//
	// TODO: I'm not positive this is the right fix.
	return vd, e, nil
	}

	// We need to unify w with the value of d in each possible environment. We
	// can save some work by grouping environments by the value of d, since
	// there will be a lot of redundancy here.
	var nEnvs []envSet
	envParts := e.partitionBy(d.id)
	for i, envPart := range envParts {
	exit := uf.enterVar(d.id, i)
	// Each branch logically gets its own copy of the initial environment
	// (narrowed down to just this binding of the variable), and each branch
	// may result in different changes to that starting environment.
	res, e2, err := w.unify(envPart.value, envPart.env, swap, uf)
	exit.exit()
	if err != nil {
	return nil, envSet{}, err
	}
	if res.Domain == nil {
	// This branch entirely failed to unify, so it's gone.
	continue
	}
	nEnv := e2.bind(d.id, res)
	nEnvs = append(nEnvs, nEnv)
	}

	if len(nEnvs) == 0 {
	// All branches failed
	return nil, bottomEnv, nil
	}

	// The effect of this is entirely captured in the environment. We can return
	// back the same Bind node.
	return d, unionEnvs(nEnvs...), nil
	}

	// An identPrinter maps [ident]s to unique string names.
	type identPrinter struct {
	ids map[*ident]string
	idGen map[string]int
	}

	func (p identPrinter) unique(id ident) string {
	if p.ids == nil {
	p.ids = make(map[*ident]string)
	p.idGen = make(map[string]int)
	}

	name, ok := p.ids[id]
	if !ok {
	gen := p.idGen[id.name]
	p.idGen[id.name]++
	if gen == 0 {
	name = id.name
	} else {
	name = fmt.Sprintf("%s#%d", id.name, gen)
	}
	p.ids[id] = name
	}

	return name
	}

	func (p identPrinter) slice(ids []ident) string {
	var strs []string
	for _, id := range ids {
	strs = append(strs, p.unique(id))
	}
	return fmt.Sprintf("[%s]", strings.Join(strs, ", "))
	}