Fabuilds committed (verified)
Commit 74ef753 · Parent(s): 780d252

Delete semantic_embedder.py

Files changed (1):
  semantic_embedder.py  +0 -241
semantic_embedder.py DELETED
@@ -1,241 +0,0 @@
- """
- SEMANTIC EMBEDDER
- Lightweight embedding engine for manifold pathfinding.
-
- Uses sentence-transformers (all-MiniLM-L6-v2) for 384-dim vectors.
- Falls back to a simple hashing-based bag-of-words scheme if
- sentence-transformers is unavailable.
- """
- import os
- import json
- import math
- import hashlib
- from typing import List
-
- # Try to import sentence-transformers
- try:
-     from sentence_transformers import SentenceTransformer
-     HAS_TRANSFORMERS = True
- except ImportError:
-     HAS_TRANSFORMERS = False
-     print("[EMBEDDER]: sentence-transformers not available, using fallback")
-
- class SemanticEmbedder:
-     """
-     Generates semantic embeddings for text.
-     Caches results to avoid recomputation.
-     """
-
-     def __init__(self):
-         self.cache_path = os.path.join(
-             os.path.dirname(os.path.abspath(__file__)),
-             "..",
-             "Lattice_DB",
-             "embedding_cache.json"
-         )
-         self.cache = self.load_cache()
-
-         # Initialize model
-         if HAS_TRANSFORMERS:
-             print("[EMBEDDER]: Loading sentence-transformers model...")
-             self.model = SentenceTransformer('all-MiniLM-L6-v2')
-             self.embed_dim = 384
-             self.mode = "transformers"
-             print("[EMBEDDER]: Loaded (384-dim vectors)")
-         else:
-             self.model = None
-             self.embed_dim = 128  # Fallback dimension
-             self.mode = "fallback"
-             print("[EMBEDDER]: Using fallback embeddings (128-dim)")
-
-     def load_cache(self):
-         """Load embedding cache from disk."""
-         if os.path.exists(self.cache_path):
-             try:
-                 with open(self.cache_path, 'r', encoding='utf-8') as f:
-                     return json.load(f)
-             except (OSError, json.JSONDecodeError):
-                 # Unreadable or corrupt cache file: start fresh
-                 return {}
-         return {}
-
-     def save_cache(self):
-         """Save embedding cache to disk."""
-         os.makedirs(os.path.dirname(self.cache_path), exist_ok=True)
-         with open(self.cache_path, 'w', encoding='utf-8') as f:
-             json.dump(self.cache, f)
-
-     def embed_text(self, text: str) -> List[float]:
-         """
-         Generate semantic embedding for text.
-
-         Args:
-             text: Input text to embed
-
-         Returns:
-             Vector of dimension self.embed_dim
-         """
-         # Check cache first
-         cache_key = hashlib.md5(text.encode()).hexdigest()
-
-         if cache_key in self.cache:
-             return self.cache[cache_key]
-
-         # Generate embedding
-         if self.mode == "transformers":
-             embedding = self._embed_transformers(text)
-         else:
-             embedding = self._embed_fallback(text)
-
-         # Cache result
-         self.cache[cache_key] = embedding
-
-         # Persist every 10 embeddings
-         if len(self.cache) % 10 == 0:
-             self.save_cache()
-
-         return embedding
-
-     def _embed_transformers(self, text: str) -> List[float]:
-         """Use sentence-transformers to generate embedding."""
-         embedding = self.model.encode(text, convert_to_numpy=True)
-         return embedding.tolist()
-
-     def _embed_fallback(self, text: str) -> List[float]:
-         """
-         Fallback embedding using a simple term-frequency hashing approach.
-         Not as good as transformers, but better than a bare hash function.
-         """
-         # Tokenize
-         tokens = text.lower().split()
-
-         # Character n-grams for robustness
-         char_ngrams = []
-         for i in range(len(text) - 2):
-             char_ngrams.append(text[i:i+3].lower())
-
-         # Create sparse vector
-         vector = [0.0] * self.embed_dim
-
-         def stable_index(s: str) -> int:
-             # Use hashlib rather than hash(): Python salts hash() per process,
-             # so hash()-based indices would differ between runs and cached
-             # vectors would disagree with freshly computed ones.
-             return int(hashlib.md5(s.encode()).hexdigest(), 16) % self.embed_dim
-
-         # Hash tokens into vector dimensions
-         for token in tokens:
-             vector[stable_index(token)] += 1.0
-
-         # Hash character n-grams (half weight)
-         for ngram in char_ngrams:
-             vector[stable_index(ngram)] += 0.5
-
-         # Normalize to unit length
-         magnitude = math.sqrt(sum(x * x for x in vector))
-         if magnitude > 0:
-             vector = [x / magnitude for x in vector]
-
-         return vector
-
-     def cosine_similarity(self, vec_a: List[float], vec_b: List[float]) -> float:
-         """
-         Calculate cosine similarity between two vectors.
-
-         Returns:
-             Similarity score in [0, 1] (higher = more similar;
-             negative cosines are clamped to 0.0)
-         """
-         if len(vec_a) != len(vec_b):
-             raise ValueError(f"Vector dimension mismatch: {len(vec_a)} vs {len(vec_b)}")
-
-         # Dot product
-         dot_product = sum(a * b for a, b in zip(vec_a, vec_b))
-
-         # Magnitudes
-         mag_a = math.sqrt(sum(a * a for a in vec_a))
-         mag_b = math.sqrt(sum(b * b for b in vec_b))
-
-         if mag_a == 0 or mag_b == 0:
-             return 0.0
-
-         similarity = dot_product / (mag_a * mag_b)
-
-         # Clamp to [0, 1]
-         return max(0.0, min(1.0, similarity))
-
-     def get_cached_embedding(self, text: str) -> List[float]:
-         """
-         Get embedding from cache if available, otherwise generate.
-         Same as embed_text(), but explicit about caching.
-         """
-         return self.embed_text(text)
-
-     def clear_cache(self):
-         """Clear the embedding cache, in memory and on disk."""
-         self.cache = {}
-         if os.path.exists(self.cache_path):
-             os.remove(self.cache_path)
-         print("[EMBEDDER]: Cache cleared")
-
-
- if __name__ == "__main__":
-     print("=" * 60)
-     print("SEMANTIC EMBEDDER - Test Suite")
-     print("=" * 60 + "\n")
-
-     embedder = SemanticEmbedder()
-
-     # Test 1: Basic embedding
-     print("Test 1: Basic Embedding")
-     text = "React hooks allow functional components to use state"
-     embedding = embedder.embed_text(text)
-     print(f"  Text: '{text}'")
-     print(f"  Embedding dim: {len(embedding)}")
-     print(f"  First 5 values: {embedding[:5]}")
-
-     # Test 2: Similarity between related concepts
-     print("\nTest 2: Semantic Similarity")
-     concepts = [
-         "React hooks and useEffect",
-         "Functional components with state management",
-         "Database connection pooling",
-         "Singleton design pattern"
-     ]
-
-     embeddings = [embedder.embed_text(c) for c in concepts]
-
-     print("\nSimilarity Matrix:")
-     for i in range(len(concepts)):
-         for j in range(i, len(concepts)):  # Upper triangle only
-             sim = embedder.cosine_similarity(embeddings[i], embeddings[j])
-             print(f"  [{i}] ↔ [{j}]: {sim:.3f}")
-
-     print("\nConcept Labels:")
-     for i, c in enumerate(concepts):
-         print(f"  [{i}]: {c}")
-
-     # Test 3: Cache performance
-     print("\nTest 3: Cache Performance")
-     import time
-
-     test_text = "This is a test string for cache performance"
-
-     # First call (no cache)
-     start = time.time()
-     _ = embedder.embed_text(test_text)
-     first_time = time.time() - start
-
-     # Second call (cached)
-     start = time.time()
-     _ = embedder.embed_text(test_text)
-     second_time = time.time() - start
-
-     print(f"  First call: {first_time * 1000:.2f}ms")
-     print(f"  Cached call: {second_time * 1000:.2f}ms")
-     if second_time > 0:
-         print(f"  Speedup: {first_time / second_time:.1f}x")
-     else:
-         print("  Speedup: >100x (instant cache)")
-
-     # Save cache
-     embedder.save_cache()
-     print("\n✅ Embedder operational")
-     print(f"  Mode: {embedder.mode}")
-     print(f"  Dimension: {embedder.embed_dim}")
-     print(f"  Cached embeddings: {len(embedder.cache)}")