| package semantic |
|
|
| import ( |
| "hash/fnv" |
| "math" |
| "strings" |
| "unicode" |
| ) |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
// HashingEmbedder embeds text into fixed-size vectors using feature
// hashing: words and character n-grams are hashed into buckets with a
// hash-derived sign, so no vocabulary has to be stored or trained.
type HashingEmbedder struct {
	dim         int     // dimensionality of the output vectors
	ngramMin    int     // smallest character n-gram length
	ngramMax    int     // largest character n-gram length
	wordWeight  float32 // weight added per whole-word feature
	ngramWeight float32 // weight added per character n-gram feature
}
|
|
| |
| |
| |
| func NewHashingEmbedder(dim int) *HashingEmbedder { |
| if dim <= 0 { |
| dim = 128 |
| } |
| return &HashingEmbedder{ |
| dim: dim, |
| ngramMin: 2, |
| ngramMax: 4, |
| wordWeight: 1.0, |
| ngramWeight: 0.5, |
| } |
| } |
|
|
| |
| func (h *HashingEmbedder) Strategy() string { return "hashing" } |
|
|
| |
| func (h *HashingEmbedder) Embed(texts []string) ([][]float32, error) { |
| result := make([][]float32, len(texts)) |
| for i, text := range texts { |
| result[i] = h.vectorize(text) |
| } |
| return result, nil |
| } |
|
|
| |
| |
| func (h *HashingEmbedder) vectorize(text string) []float32 { |
| vec := make([]float32, h.dim) |
|
|
| |
| text = strings.ToLower(text) |
|
|
| |
| words := tokenizeForEmbedding(text) |
| for _, word := range words { |
| idx, sign := h.hashFeature("w:" + word) |
| vec[idx] += sign * h.wordWeight |
| } |
|
|
| |
| |
| for _, word := range words { |
| padded := "^" + word + "$" |
| for n := h.ngramMin; n <= h.ngramMax; n++ { |
| for i := 0; i <= len(padded)-n; i++ { |
| ngram := padded[i : i+n] |
| idx, sign := h.hashFeature("n:" + ngram) |
| vec[idx] += sign * h.ngramWeight |
| } |
| } |
| } |
|
|
| |
| |
| for _, word := range words { |
| if roleKeywords[word] { |
| idx, sign := h.hashFeature("role:" + word) |
| vec[idx] += sign * 0.8 |
| } |
| } |
|
|
| |
| |
| for _, word := range words { |
| if syns, ok := synonymIndex[word]; ok { |
| for syn := range syns { |
| synTokens := strings.Fields(syn) |
| for _, st := range synTokens { |
| idx, sign := h.hashFeature("w:" + st) |
| vec[idx] += sign * h.wordWeight * 0.3 |
| } |
| } |
| } |
| } |
|
|
| |
| |
| for n := 2; n <= 3 && n <= len(words); n++ { |
| for i := 0; i <= len(words)-n; i++ { |
| phrase := strings.Join(words[i:i+n], " ") |
| if syns, ok := synonymIndex[phrase]; ok { |
| for syn := range syns { |
| synTokens := strings.Fields(syn) |
| for _, st := range synTokens { |
| idx, sign := h.hashFeature("w:" + st) |
| vec[idx] += sign * h.wordWeight * 0.3 |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| h.normalize(vec) |
| return vec |
| } |
|
|
| |
| |
| |
| func (h *HashingEmbedder) hashFeature(feature string) (int, float32) { |
| |
| hasher := fnv.New32a() |
| hasher.Write([]byte(feature)) |
| idx := int(hasher.Sum32()) % h.dim |
| if idx < 0 { |
| idx = -idx |
| } |
|
|
| |
| signHasher := fnv.New32() |
| signHasher.Write([]byte("s:" + feature)) |
| sign := float32(1.0) |
| if signHasher.Sum32()%2 == 1 { |
| sign = -1.0 |
| } |
|
|
| return idx, sign |
| } |
|
|
| |
| func (h *HashingEmbedder) normalize(vec []float32) { |
| var norm float64 |
| for _, v := range vec { |
| norm += float64(v) * float64(v) |
| } |
| if norm > 0 { |
| invNorm := float32(1.0 / math.Sqrt(norm)) |
| for i := range vec { |
| vec[i] *= invNorm |
| } |
| } |
| } |
|
|
| |
// tokenizeForEmbedding splits s into maximal runs of letters and digits;
// every other rune acts as a separator and is discarded.
func tokenizeForEmbedding(s string) []string {
	isWordRune := func(r rune) bool {
		return unicode.IsLetter(r) || unicode.IsDigit(r)
	}
	return strings.FieldsFunc(s, func(r rune) bool {
		return !isWordRune(r)
	})
}
|
|