package semantic
import (
"hash/fnv"
"math"
"strings"
"unicode"
)
// HashingEmbedder implements Embedder using a feature-hashing (hashing trick)
// approach. It produces fixed-dimension vectors by hashing word unigrams and
// character n-grams into a compact vector space. No vocabulary construction
// is required, making each Embed call fully independent.
//
// Properties:
//   - Fixed vector dimensionality regardless of vocabulary size
//   - Captures sub-word similarity (e.g. "btn" ↔ "button")
//   - L2-normalized output for cosine similarity compatibility
//   - Zero external dependencies — pure Go
type HashingEmbedder struct {
	dim         int     // vector dimensionality; every output vector has this length
	ngramMin    int     // minimum character n-gram length (inclusive)
	ngramMax    int     // maximum character n-gram length (inclusive)
	wordWeight  float32 // weight factor for word-level (unigram) features
	ngramWeight float32 // weight factor for character n-gram features
}
// NewHashingEmbedder creates a HashingEmbedder with the given dimension.
// Higher dimensions reduce hash collisions but use more memory.
// Recommended: 128 for speed, 256 for accuracy.
func NewHashingEmbedder(dim int) *HashingEmbedder {
	// A non-positive dimension falls back to a sensible default.
	const defaultDim = 128
	if dim <= 0 {
		dim = defaultDim
	}
	embedder := &HashingEmbedder{
		dim:         dim,
		ngramMin:    2,
		ngramMax:    4,
		wordWeight:  1.0,
		ngramWeight: 0.5,
	}
	return embedder
}
// Strategy returns the identifier of this embedding strategy: "hashing".
func (h *HashingEmbedder) Strategy() string {
	return "hashing"
}
// Embed converts a batch of texts into hashed feature vectors.
// Each input text maps to one vector; the error is always nil because
// vectorization cannot fail.
func (h *HashingEmbedder) Embed(texts []string) ([][]float32, error) {
	vectors := make([][]float32, 0, len(texts))
	for _, text := range texts {
		vectors = append(vectors, h.vectorize(text))
	}
	return vectors, nil
}
// vectorize converts a single text into a hashed feature vector combining
// word-level, character n-gram, role-aware, and synonym features. The
// result is L2-normalized, so dot products between vectors behave as
// cosine similarity.
func (h *HashingEmbedder) vectorize(text string) []float32 {
	vec := make([]float32, h.dim)

	// Normalize case so all features are case-insensitive.
	text = strings.ToLower(text)

	// 1. Word-level features (captures exact word overlap)
	words := tokenizeForEmbedding(text)
	for _, word := range words {
		idx, sign := h.hashFeature("w:" + word)
		vec[idx] += sign * h.wordWeight
	}

	// 2. Character n-gram features (captures sub-word similarity)
	// e.g. "button" → "bu", "ut", "tt", "to", "on", "but", "utt", "tto", "ton"
	// Slice over runes rather than bytes: byte slicing would split
	// multi-byte UTF-8 characters mid-sequence and hash garbage n-grams
	// for non-ASCII words. (For pure-ASCII input this is identical.)
	for _, word := range words {
		padded := []rune("^" + word + "$") // boundary markers
		for n := h.ngramMin; n <= h.ngramMax; n++ {
			for i := 0; i <= len(padded)-n; i++ {
				ngram := string(padded[i : i+n])
				idx, sign := h.hashFeature("n:" + ngram)
				vec[idx] += sign * h.ngramWeight
			}
		}
	}

	// 3. Role-aware features: if a word is a known UI role, add an
	// extra feature to boost role-based matching
	for _, word := range words {
		if roleKeywords[word] {
			idx, sign := h.hashFeature("role:" + word)
			vec[idx] += sign * 0.8
		}
	}

	// 4. Synonym features: inject word-level features for known synonyms
	// at a reduced weight so "sign in" and "log in" share vector space.
	for _, word := range words {
		if syns, ok := synonymIndex[word]; ok {
			for syn := range syns {
				synTokens := strings.Fields(syn)
				for _, st := range synTokens {
					idx, sign := h.hashFeature("w:" + st)
					vec[idx] += sign * h.wordWeight * 0.3
				}
			}
		}
	}

	// 5. Multi-word synonym phrases: check consecutive word pairs/triples
	// so "look up" → "search" gets injected at the embedding level.
	for n := 2; n <= 3 && n <= len(words); n++ {
		for i := 0; i <= len(words)-n; i++ {
			phrase := strings.Join(words[i:i+n], " ")
			if syns, ok := synonymIndex[phrase]; ok {
				for syn := range syns {
					synTokens := strings.Fields(syn)
					for _, st := range synTokens {
						idx, sign := h.hashFeature("w:" + st)
						vec[idx] += sign * h.wordWeight * 0.3
					}
				}
			}
		}
	}

	// L2-normalize for cosine similarity
	h.normalize(vec)
	return vec
}
// hashFeature hashes a feature string into an index in [0, dim) and a sign
// (+1 or -1). The sign hash preserves inner-product properties in
// expectation (the "signed hashing trick" per Weinberger et al. 2009).
func (h *HashingEmbedder) hashFeature(feature string) (int, float32) {
	// Index hash. Reduce in uint32 space: the previous int conversion
	// before the modulo could go negative on 32-bit platforms (where int
	// is 32 bits), and taking the absolute value after a signed modulo
	// folds pairs of buckets together, doubling collisions for half the
	// index space. Unsigned reduction is uniform and branch-free.
	hasher := fnv.New32a()
	hasher.Write([]byte(feature))
	idx := int(hasher.Sum32() % uint32(h.dim))

	// Sign hash: an independent hash function (FNV-1 rather than FNV-1a)
	// with a distinct prefix decides the feature's sign from its parity.
	signHasher := fnv.New32()
	signHasher.Write([]byte("s:" + feature))
	sign := float32(1.0)
	if signHasher.Sum32()%2 == 1 {
		sign = -1.0
	}
	return idx, sign
}
// normalize L2-normalizes a vector in-place. A zero vector is left
// unchanged so we never divide by zero.
func (h *HashingEmbedder) normalize(vec []float32) {
	// Accumulate the squared magnitude in float64 for precision.
	var sumSquares float64
	for _, component := range vec {
		sumSquares += float64(component) * float64(component)
	}
	if sumSquares > 0 {
		scale := float32(1.0 / math.Sqrt(sumSquares))
		for i := range vec {
			vec[i] *= scale
		}
	}
}
// tokenizeForEmbedding splits text into tokens for embedding. Any run of
// runes that are neither letters nor digits acts as a delimiter, so
// punctuation, whitespace, and symbols all separate tokens. Case is not
// changed here; callers lowercase the input beforehand.
func tokenizeForEmbedding(s string) []string {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	return strings.FieldsFunc(s, isSeparator)
}