WitNote / internal /semantic /embedder.go
AUXteam's picture
Upload folder using huggingface_hub
6a7089a verified
package semantic
// Embedder turns text into dense vector representations.
//
// Implementations are expected to range from the lightweight, deterministic
// DummyEmbedder to embedders backed by real ML models (a LocalEmbedder and an
// OpenAIEmbedder are anticipated for the future).
type Embedder interface {
	// Embed converts a batch of strings into float32 vectors. Every vector
	// in the returned batch must have the same dimensionality.
	Embed(texts []string) ([][]float32, error)

	// Strategy reports the name of the embedding strategy, for example
	// "dummy", "local" or "openai".
	Strategy() string
}
// DummyEmbedder derives fixed-dimension vectors from a simple, deterministic
// hash of each input string. It exists so the surrounding architecture can be
// exercised in tests without pulling in real ML dependencies.
type DummyEmbedder struct {
	// Dim is the length of every vector produced. NewDummyEmbedder
	// substitutes 64 when it is asked for a non-positive dimensionality.
	Dim int
}
// NewDummyEmbedder returns a DummyEmbedder that produces vectors of length
// dim. A zero or negative dim selects the default dimensionality of 64.
func NewDummyEmbedder(dim int) *DummyEmbedder {
	embedder := &DummyEmbedder{Dim: 64}
	if dim > 0 {
		embedder.Dim = dim
	}
	return embedder
}
// Strategy identifies this embedder; it always reports "dummy".
func (d *DummyEmbedder) Strategy() string {
	const name = "dummy"
	return name
}
// Embed returns one deterministic pseudo-vector per entry of texts, in input
// order, each computed by hashVec. Identical strings therefore always yield
// identical vectors. The returned slice is never nil and the returned error
// is always nil.
func (d *DummyEmbedder) Embed(texts []string) ([][]float32, error) {
	vectors := make([][]float32, 0, len(texts))
	for _, text := range texts {
		vectors = append(vectors, d.hashVec(text))
	}
	return vectors, nil
}
// hashVec produces a deterministic float32 vector from s.
//
// Each rune c found at byte offset i adds c/128 to bucket (i*31 + c) mod dim,
// so the result depends on both the characters and where they occur. The
// vector is then scaled by 1/sqrt64(norm) so it can be used directly for
// cosine similarity. An empty string yields the all-zero vector.
//
// A non-positive d.Dim (for example on a zero-value DummyEmbedder) falls back
// to 64, the same default NewDummyEmbedder applies, instead of panicking on
// a modulo by zero or a negative slice length.
func (d *DummyEmbedder) hashVec(s string) []float32 {
	dim := d.Dim
	if dim <= 0 {
		dim = 64
	}

	vec := make([]float32, dim)
	for i, c := range s {
		idx := (i*31 + int(c)) % dim
		if idx < 0 {
			// Only reachable if i*31 + c overflows int; fold the negative
			// remainder back into [0, dim).
			idx = -idx
		}
		vec[idx] += float32(c) / 128.0
	}

	// Sum the squared components in float64 to limit rounding error before
	// normalizing.
	var norm float64
	for _, v := range vec {
		norm += float64(v) * float64(v)
	}
	// Skip the division for the all-zero vector (empty input).
	if norm > 0 {
		invNorm := float32(1.0 / sqrt64(norm))
		for j := range vec {
			vec[j] *= invNorm
		}
	}
	return vec
}
// sqrt64 returns the square root of x, implemented locally to avoid importing
// math in this file.
//
// Zero and negative inputs yield 0; NaN is returned unchanged.
//
// The root is found with Newton's method, started from max(x, 1), which is
// never below the true root. From such a start the iterates only decrease, so
// the loop runs until an iteration stops making progress. A fixed iteration
// count is not enough: each early step merely halves the estimate, so very
// large or very small inputs need far more than a couple of dozen steps.
func sqrt64(x float64) float64 {
	if x != x { // NaN is the only value that differs from itself.
		return x
	}
	if x <= 0 {
		return 0
	}

	z := x
	if z < 1 {
		z = 1
	}
	for {
		next := (z + x/z) / 2
		// Written as !(next < z) rather than next >= z so that a NaN step
		// (x = +Inf gives Inf/Inf) also ends the loop.
		if !(next < z) {
			return z
		}
		z = next
	}
}