File size: 5,035 Bytes
6a7089a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
package semantic

import (
	"hash/fnv"
	"math"
	"strings"
	"unicode"
)

// HashingEmbedder is an Embedder implementation based on the hashing trick
// (feature hashing). Word unigrams and character n-grams are hashed straight
// into a vector of fixed length, so no vocabulary has to be built and every
// Embed call is independent of the others.
//
// Characteristics:
//   - Output dimensionality is fixed and does not grow with the vocabulary
//   - Character n-grams give sub-word similarity (e.g. "btn" ↔ "button")
//   - Vectors are L2-normalized, so they can be compared by cosine similarity
//   - Pure Go, no external dependencies
type HashingEmbedder struct {
	// dim is the length of every vector produced.
	dim int
	// ngramMin and ngramMax are the inclusive bounds on character n-gram
	// length.
	ngramMin, ngramMax int
	// wordWeight and ngramWeight scale the contribution of word-level and
	// n-gram features respectively.
	wordWeight, ngramWeight float32
}

// NewHashingEmbedder returns a HashingEmbedder that produces vectors of
// length dim. A non-positive dim falls back to 128. Larger dimensions mean
// fewer hash collisions at the cost of more memory; 128 favors speed and 256
// favors accuracy.
//
// The remaining settings are fixed: character n-grams of length 2 through 4,
// word features weighted 1.0 and n-gram features weighted 0.5.
func NewHashingEmbedder(dim int) *HashingEmbedder {
	embedder := &HashingEmbedder{
		dim:         128,
		ngramMin:    2,
		ngramMax:    4,
		wordWeight:  1.0,
		ngramWeight: 0.5,
	}
	if dim > 0 {
		embedder.dim = dim
	}
	return embedder
}

// Strategy reports the name of this embedding strategy, which is always
// "hashing".
func (h *HashingEmbedder) Strategy() string {
	const strategyName = "hashing"
	return strategyName
}

// Embed vectorizes every text in the batch and returns the vectors in input
// order, one per text. The returned error is always nil.
func (h *HashingEmbedder) Embed(texts []string) ([][]float32, error) {
	vectors := make([][]float32, 0, len(texts))
	for _, t := range texts {
		vectors = append(vectors, h.vectorize(t))
	}
	return vectors, nil
}

// vectorize converts a single text into a hashed feature vector of length
// h.dim. The text is lowercased and tokenized, then five groups of features
// are accumulated into the vector before it is L2-normalized:
//
//  1. one "w:" feature per token, weighted by wordWeight
//  2. "n:" character n-grams of each token, weighted by ngramWeight
//  3. a "role:" feature for tokens present in roleKeywords
//  4. reduced-weight "w:" features for synonyms of single tokens
//  5. reduced-weight "w:" features for synonyms of 2- and 3-token phrases
//
// N-grams are taken over runes, not bytes, so multi-byte characters (accented
// letters, CJK, ...) are never split into invalid UTF-8 fragments. For pure
// ASCII tokens this is identical to byte slicing.
func (h *HashingEmbedder) vectorize(text string) []float32 {
	const (
		roleWeight   = 0.8 // extra weight for tokens found in roleKeywords
		synonymScale = 0.3 // synonyms count for a fraction of a literal word
	)

	vec := make([]float32, h.dim)

	// injectSynonyms adds a reduced-weight word feature for every token of
	// every synonym registered under key in synonymIndex.
	// NOTE(review): if the synonym set is a map, its iteration order is
	// unspecified, so float accumulation order may vary between calls —
	// confirm whether bit-exact reproducibility matters.
	injectSynonyms := func(key string) {
		syns, ok := synonymIndex[key]
		if !ok {
			return
		}
		for syn := range syns {
			for _, tok := range strings.Fields(syn) {
				idx, sign := h.hashFeature("w:" + tok)
				vec[idx] += sign * h.wordWeight * synonymScale
			}
		}
	}

	// Normalize case so that features are case-insensitive.
	words := tokenizeForEmbedding(strings.ToLower(text))

	// 1. Word-level features (captures exact word overlap).
	for _, word := range words {
		idx, sign := h.hashFeature("w:" + word)
		vec[idx] += sign * h.wordWeight
	}

	// 2. Character n-gram features (captures sub-word similarity).
	//    "^" and "$" mark the token boundaries, e.g. with n = 2
	//    "button" → "^b", "bu", "ut", "tt", "to", "on", "n$".
	for _, word := range words {
		padded := []rune("^" + word + "$")
		for n := h.ngramMin; n <= h.ngramMax; n++ {
			for i := 0; i+n <= len(padded); i++ {
				idx, sign := h.hashFeature("n:" + string(padded[i:i+n]))
				vec[idx] += sign * h.ngramWeight
			}
		}
	}

	// 3. Role-aware features: a token that is a known UI role gets an extra
	//    feature to boost role-based matching.
	for _, word := range words {
		if roleKeywords[word] {
			idx, sign := h.hashFeature("role:" + word)
			vec[idx] += sign * roleWeight
		}
	}

	// 4. Synonym features for single tokens, so that e.g. "sign in" and
	//    "log in" share vector space.
	for _, word := range words {
		injectSynonyms(word)
	}

	// 5. Synonym features for consecutive 2- and 3-token phrases, so that
	//    e.g. "look up" → "search" is injected at the embedding level.
	for n := 2; n <= 3 && n <= len(words); n++ {
		for i := 0; i+n <= len(words); i++ {
			injectSynonyms(strings.Join(words[i:i+n], " "))
		}
	}

	// L2-normalize for cosine similarity.
	h.normalize(vec)
	return vec
}

// hashFeature maps a feature string to a bucket index in [0, dim) and a sign
// of +1 or -1. The sign hash preserves inner-product properties (the
// "signed hashing trick" per Weinberger et al. 2009).
//
// The index is the 32-bit FNV-1a hash of the feature reduced modulo dim; the
// sign comes from the low bit of the 32-bit FNV-1 hash of "s:"+feature.
// h.dim must be positive (NewHashingEmbedder guarantees this); a zero dim
// panics with a division by zero.
func (h *HashingEmbedder) hashFeature(feature string) (int, float32) {
	// Index hash. The reduction is done on unsigned 64-bit values so the
	// bucket is the same on every platform: converting the 32-bit sum to int
	// first would wrap negative where int is 32 bits wide and select a
	// different bucket than a 64-bit build.
	idxHasher := fnv.New32a()
	_, _ = idxHasher.Write([]byte(feature)) // hash.Hash.Write never returns an error
	idx := int(uint64(idxHasher.Sum32()) % uint64(h.dim))

	// Sign hash. A different FNV variant plus the "s:" prefix keeps the sign
	// independent of the bucket index.
	signHasher := fnv.New32()
	_, _ = signHasher.Write([]byte("s:" + feature)) // never returns an error
	if signHasher.Sum32()%2 == 1 {
		return idx, -1.0
	}
	return idx, 1.0
}

// normalize scales vec in place to unit Euclidean (L2) length. The squared
// length is accumulated in float64. A vector whose squared length is not
// strictly positive (all zeros, empty, or NaN) is left untouched.
func (h *HashingEmbedder) normalize(vec []float32) {
	sumSquares := 0.0
	for i := 0; i < len(vec); i++ {
		sumSquares += float64(vec[i]) * float64(vec[i])
	}

	// Written as a negated "> 0" on purpose: a NaN sum must take the early
	// return as well.
	if !(sumSquares > 0) {
		return
	}

	scale := float32(1.0 / math.Sqrt(sumSquares))
	for i := 0; i < len(vec); i++ {
		vec[i] *= scale
	}
}

// tokenizeForEmbedding splits s into tokens made of consecutive Unicode
// letters and digits; every other rune acts as a separator and empty tokens
// are dropped. Case is left as-is, so callers that want lowercase tokens must
// lowercase s first.
func tokenizeForEmbedding(s string) []string {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	return strings.FieldsFunc(s, isSeparator)
}