File size: 2,399 Bytes
6a7089a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
package semantic

// Embedder is the interface for converting text into dense vector
// representations.
//
// Implementations range from lightweight deterministic hashing
// (DummyEmbedder) to real ML models (future LocalEmbedder,
// OpenAIEmbedder).
type Embedder interface {
	// Embed converts a batch of text strings into float32 vectors.
	// All returned vectors must have the same dimensionality.
	// The DummyEmbedder implementation returns one vector per input,
	// in input order.
	Embed(texts []string) ([][]float32, error)

	// Strategy returns the name of the embedding strategy
	// (e.g. "dummy", "local", "openai").
	Strategy() string
}

// DummyEmbedder generates deterministic fixed-dimension vectors using a
// simple hash of each input string. Useful for architecture testing
// without real ML dependencies.
//
// Construct it with NewDummyEmbedder, which replaces a non-positive
// dimensionality with the default of 64.
type DummyEmbedder struct {
	Dim int // vector dimensionality; NewDummyEmbedder uses 64 when given dim <= 0
}

// NewDummyEmbedder returns a DummyEmbedder that produces dim-dimensional
// vectors. A non-positive dim selects the default dimensionality of 64.
func NewDummyEmbedder(dim int) *DummyEmbedder {
	const defaultDim = 64

	e := &DummyEmbedder{Dim: defaultDim}
	if dim > 0 {
		e.Dim = dim
	}
	return e
}

// Strategy reports the name of this embedding strategy, which is always
// "dummy".
func (d *DummyEmbedder) Strategy() string {
	const name = "dummy"
	return name
}

// Embed maps every input string to a deterministic pseudo-vector built by
// hashing its characters into the vector dimensions (see hashVec).
//
// The result holds one vector per input, in input order, and the returned
// error is always nil. Identical strings always produce identical vectors.
// Every component is non-negative, so two strings whose characters land in
// at least one common dimension have positive cosine similarity.
func (d *DummyEmbedder) Embed(texts []string) ([][]float32, error) {
	vectors := make([][]float32, 0, len(texts))
	for _, t := range texts {
		vectors = append(vectors, d.hashVec(t))
	}
	return vectors, nil
}

// hashVec produces a deterministic float32 vector from s.
//
// Each rune c found at byte offset i adds c/128 to the bucket
// (i*31 + c) mod dim, so the result depends on both the characters and
// their positions. The vector is then scaled to unit length. An input that
// adds nothing to any bucket (e.g. the empty string) stays all-zero.
//
// A non-positive d.Dim (for example on a zero-value DummyEmbedder) falls
// back to 64, the same default NewDummyEmbedder applies, instead of
// panicking on a modulo by zero or a negative slice length.
func (d *DummyEmbedder) hashVec(s string) []float32 {
	dim := d.Dim
	if dim <= 0 {
		dim = 64
	}

	vec := make([]float32, dim)
	for i, c := range s {
		idx := (i*31 + int(c)) % dim
		if idx < 0 {
			// Only reachable if i*31 overflows int; the negated remainder
			// is still strictly smaller than dim, so it is a valid index.
			idx = -idx
		}
		vec[idx] += float32(c) / 128.0
	}

	// Normalize to unit length so dot products equal cosine similarity.
	var norm float64
	for _, v := range vec {
		norm += float64(v) * float64(v)
	}
	if norm > 0 {
		invNorm := float32(1.0 / sqrt64(norm))
		for j := range vec {
			vec[j] *= invNorm
		}
	}
	return vec
}

// sqrt64 returns the square root of x, implemented locally to avoid
// importing math in this file.
//
// Non-positive inputs return 0. NaN and +Inf are returned unchanged.
// For every other input the result is Newton's method run to convergence,
// so accuracy does not degrade for very large or very small x the way a
// fixed iteration count does.
func sqrt64(x float64) float64 {
	if x <= 0 {
		return 0
	}

	// Start at or above the true root: from there Newton's iterates decrease
	// monotonically towards sqrt(x), which gives a simple stopping rule.
	z := x
	if z < 1 {
		z = 1
	}
	for {
		next := (z + x/z) / 2
		if next < z {
			z = next
			continue
		}
		// No further decrease: converged. NaN and +Inf also exit here
		// because every comparison involving NaN is false.
		return z
	}
}