package semantic

import (
	"fmt"
	"hash/fnv"
	"math"
	"strings"
	"unicode"
)

// HashingEmbedder implements Embedder using a feature-hashing (hashing trick)
// approach. It produces fixed-dimension vectors by hashing word unigrams and
// character n-grams into a compact vector space. No vocabulary construction
// is required, making each Embed call fully independent.
//
// Properties:
//   - Fixed vector dimensionality regardless of vocabulary size
//   - Captures sub-word similarity (e.g. "btn" ↔ "button")
//   - L2-normalized output for cosine similarity compatibility
//   - Zero external dependencies — pure Go
//
// Construct instances with NewHashingEmbedder; the zero value has a zero
// dimension and Embed rejects it with an error.
type HashingEmbedder struct {
	dim         int     // vector dimensionality
	ngramMin    int     // minimum character n-gram length
	ngramMax    int     // maximum character n-gram length
	wordWeight  float32 // weight factor for word-level features
	ngramWeight float32 // weight factor for n-gram features
}

// NewHashingEmbedder creates a HashingEmbedder with the given dimension.
// Higher dimensions reduce hash collisions but use more memory.
// Recommended: 128 for speed, 256 for accuracy.
//
// A non-positive dim falls back to 128. N-grams of length 2 through 4 are
// used, with word features weighted 1.0 and n-gram features weighted 0.5.
func NewHashingEmbedder(dim int) *HashingEmbedder {
	if dim <= 0 {
		dim = 128
	}
	return &HashingEmbedder{
		dim:         dim,
		ngramMin:    2,
		ngramMax:    4,
		wordWeight:  1.0,
		ngramWeight: 0.5,
	}
}

// Strategy returns "hashing".
func (h *HashingEmbedder) Strategy() string { return "hashing" }

// Embed converts a batch of texts into hashed feature vectors, one vector of
// length dim per input text, in input order.
//
// It returns an error when the embedder has a non-positive dimension (for
// example a zero-value HashingEmbedder), because hashing into zero buckets
// would otherwise panic with a division by zero.
func (h *HashingEmbedder) Embed(texts []string) ([][]float32, error) {
	if h.dim <= 0 {
		return nil, fmt.Errorf("hashing embedder: invalid dimension %d", h.dim)
	}
	result := make([][]float32, len(texts))
	for i, text := range texts {
		result[i] = h.vectorize(text)
	}
	return result, nil
}

// vectorize converts a single text into a hashed feature vector combining
// word-level, character n-gram, role-aware, and synonym features. The
// returned vector has length h.dim and is L2-normalized (or all zeros when
// the text yields no features). h.dim must be positive.
func (h *HashingEmbedder) vectorize(text string) []float32 {
	vec := make([]float32, h.dim)

	// Normalize case before tokenizing; the tokenizer itself does not fold case.
	words := tokenizeForEmbedding(strings.ToLower(text))

	// 1. Word-level features (captures exact word overlap).
	for _, word := range words {
		idx, sign := h.hashFeature("w:" + word)
		vec[idx] += sign * h.wordWeight
	}

	// 2. Character n-gram features (captures sub-word similarity).
	// Each word is wrapped in boundary markers first, e.g. "button" becomes
	// "^button$", whose 2-grams are "^b", "bu", "ut", "tt", "to", "on", "n$".
	//
	// N-grams are cut on rune boundaries, not byte offsets: the tokenizer
	// accepts any Unicode letter, and byte slicing would split multi-byte
	// characters into invalid fragments. For ASCII words both are identical.
	var starts []int // byte offset of every rune in padded, plus len(padded)
	for _, word := range words {
		padded := "^" + word + "$" // boundary markers
		starts = starts[:0]
		for i := range padded {
			starts = append(starts, i)
		}
		starts = append(starts, len(padded))
		runeCount := len(starts) - 1

		for n := h.ngramMin; n <= h.ngramMax; n++ {
			for i := 0; i+n <= runeCount; i++ {
				ngram := padded[starts[i]:starts[i+n]]
				idx, sign := h.hashFeature("n:" + ngram)
				vec[idx] += sign * h.ngramWeight
			}
		}
	}

	// 3. Role-aware features: if a word is a known UI role, add an
	// extra feature to boost role-based matching.
	for _, word := range words {
		if roleKeywords[word] {
			idx, sign := h.hashFeature("role:" + word)
			vec[idx] += sign * 0.8
		}
	}

	// 4. Synonym features: inject word-level features for known synonyms
	// at a reduced weight so "sign in" and "log in" share vector space.
	for _, word := range words {
		h.addSynonymFeatures(vec, word)
	}

	// 5. Multi-word synonym phrases: check consecutive word pairs/triples
	// so "look up" → "search" gets injected at the embedding level.
	for n := 2; n <= 3 && n <= len(words); n++ {
		for i := 0; i+n <= len(words); i++ {
			h.addSynonymFeatures(vec, strings.Join(words[i:i+n], " "))
		}
	}

	// L2-normalize for cosine similarity.
	h.normalize(vec)
	return vec
}

// addSynonymFeatures looks key up in synonymIndex and, for every synonym
// found, adds each whitespace-separated token of that synonym to vec as a
// word-level feature at 0.3 times the word weight. Unknown keys are a no-op.
func (h *HashingEmbedder) addSynonymFeatures(vec []float32, key string) {
	syns, ok := synonymIndex[key]
	if !ok {
		return
	}
	// NOTE(review): if the synonym set is a map, its iteration order is
	// unspecified, so float32 accumulation order (and possibly the last bit
	// of a component) can vary between runs — confirm whether that matters.
	for syn := range syns {
		for _, tok := range strings.Fields(syn) {
			idx, sign := h.hashFeature("w:" + tok)
			vec[idx] += sign * h.wordWeight * 0.3
		}
	}
}

// hashFeature hashes a feature string into an index [0, dim) and a sign
// (+1 or -1). The sign hash preserves inner-product properties (the
// "signed hashing trick" per Weinberger et al. 2009).
func (h *HashingEmbedder) hashFeature(feature string) (int, float32) { // Index hash hasher := fnv.New32a() hasher.Write([]byte(feature)) idx := int(hasher.Sum32()) % h.dim if idx < 0 { idx = -idx } // Sign hash (use different seed by prepending marker) signHasher := fnv.New32() signHasher.Write([]byte("s:" + feature)) sign := float32(1.0) if signHasher.Sum32()%2 == 1 { sign = -1.0 } return idx, sign } // normalize L2-normalizes a vector in-place. func (h *HashingEmbedder) normalize(vec []float32) { var norm float64 for _, v := range vec { norm += float64(v) * float64(v) } if norm > 0 { invNorm := float32(1.0 / math.Sqrt(norm)) for i := range vec { vec[i] *= invNorm } } } // tokenizeForEmbedding splits text into lowercase tokens for embedding. func tokenizeForEmbedding(s string) []string { return strings.FieldsFunc(s, func(r rune) bool { return !unicode.IsLetter(r) && !unicode.IsDigit(r) }) }