Spaces:
Sleeping
Sleeping
Delete semantic_embedder.py
Browse files- semantic_embedder.py +0 -241
semantic_embedder.py
DELETED
|
@@ -1,241 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
SEMANTIC EMBEDDER
|
| 3 |
-
Lightweight embedding engine for manifold pathfinding.
|
| 4 |
-
|
| 5 |
-
Uses sentence-transformers (all-MiniLM-L6-v2) for 384-dim vectors.
|
| 6 |
-
Falls back to simple TF-IDF if transformers unavailable.
|
| 7 |
-
"""
|
| 8 |
-
import sys
|
| 9 |
-
import os
|
| 10 |
-
import json
|
| 11 |
-
import math
|
| 12 |
-
import hashlib
|
| 13 |
-
from typing import List, Dict
|
| 14 |
-
|
| 15 |
-
# Try to import sentence-transformers
|
| 16 |
-
try:
|
| 17 |
-
from sentence_transformers import SentenceTransformer
|
| 18 |
-
HAS_TRANSFORMERS = True
|
| 19 |
-
except ImportError:
|
| 20 |
-
HAS_TRANSFORMERS = False
|
| 21 |
-
print("[EMBEDDER]: sentence-transformers not available, using fallback")
|
| 22 |
-
|
| 23 |
-
class SemanticEmbedder:
    """
    Generates semantic embeddings for text.

    Uses sentence-transformers (all-MiniLM-L6-v2, 384-dim) when available,
    otherwise a deterministic hashed bag-of-tokens / char-trigram fallback
    (128-dim). Embeddings are cached in a JSON file on disk, keyed by the
    md5 of the input text, so repeated texts are never recomputed.
    """

    def __init__(self):
        # Cache file lives in ../Lattice_DB relative to this module.
        self.cache_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..",
            "Lattice_DB",
            "embedding_cache.json"
        )
        self.cache = self.load_cache()

        # Initialize model (optional dependency detected at import time).
        if HAS_TRANSFORMERS:
            print("[EMBEDDER]: Loading sentence-transformers model...")
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embed_dim = 384
            self.mode = "transformers"
            print(f"[EMBEDDER]: Loaded (384-dim vectors)")
        else:
            self.model = None
            self.embed_dim = 128  # Fallback dimension
            self.mode = "fallback"
            print(f"[EMBEDDER]: Using fallback embeddings (128-dim)")

    def load_cache(self) -> Dict[str, List[float]]:
        """Load the embedding cache from disk.

        Returns:
            The cached {md5(text): vector} mapping, or {} when the cache
            file is missing, unreadable, or corrupt.
        """
        if os.path.exists(self.cache_path):
            try:
                with open(self.cache_path, 'r', encoding='utf-8') as f:
                    return json.load(f)
            # BUGFIX: was a bare `except:` that also swallowed SystemExit
            # and KeyboardInterrupt; only I/O and JSON-decode failures
            # (json.JSONDecodeError is a ValueError) mean "start fresh".
            except (OSError, ValueError):
                return {}
        return {}

    def save_cache(self):
        """Persist the embedding cache to disk, creating the directory."""
        os.makedirs(os.path.dirname(self.cache_path), exist_ok=True)
        with open(self.cache_path, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f)

    def embed_text(self, text: str) -> List[float]:
        """
        Generate semantic embedding for text.

        Args:
            text: Input text to embed

        Returns:
            Vector of dimension self.embed_dim
        """
        # Check cache first (md5 is used as a cheap key, not for security).
        cache_key = hashlib.md5(text.encode()).hexdigest()

        if cache_key in self.cache:
            return self.cache[cache_key]

        # Generate embedding
        if self.mode == "transformers":
            embedding = self._embed_transformers(text)
        else:
            embedding = self._embed_fallback(text)

        # Cache result
        self.cache[cache_key] = embedding

        # Persist periodically so a crash loses at most ~10 embeddings.
        if len(self.cache) % 10 == 0:
            self.save_cache()

        return embedding

    def _embed_transformers(self, text: str) -> List[float]:
        """Use sentence-transformers to generate embedding."""
        embedding = self.model.encode(text, convert_to_numpy=True)
        return embedding.tolist()

    def _stable_bucket(self, token: str) -> int:
        """Map a token to a deterministic index in [0, embed_dim).

        BUGFIX: the original used builtin hash(), which is salted per
        process (PYTHONHASHSEED), so fallback vectors — and the on-disk
        cache built from them — were inconsistent across runs.
        """
        digest = hashlib.md5(token.encode('utf-8')).digest()
        return int.from_bytes(digest[:8], 'big') % self.embed_dim

    def _embed_fallback(self, text: str) -> List[float]:
        """
        Fallback embedding using simple TF-IDF-like approach.
        Not as good as transformers, but better than hash functions.

        Returns an L2-normalized vector; the all-zero vector for text
        with no tokens and fewer than 3 characters.
        """
        # Tokenize
        tokens = text.lower().split()

        # Character n-grams for robustness
        char_ngrams = []
        for i in range(len(text) - 2):
            char_ngrams.append(text[i:i + 3].lower())

        # Create sparse vector
        vector = [0.0] * self.embed_dim

        # Hash tokens into vector dimensions (full weight).
        for token in tokens:
            vector[self._stable_bucket(token)] += 1.0

        # Hash character n-grams (half weight).
        for ngram in char_ngrams:
            vector[self._stable_bucket(ngram)] += 0.5

        # L2-normalize so cosine similarity reduces to a dot product.
        magnitude = math.sqrt(sum(x * x for x in vector))
        if magnitude > 0:
            vector = [x / magnitude for x in vector]

        return vector

    def cosine_similarity(self, vec_a: List[float], vec_b: List[float]) -> float:
        """
        Calculate cosine similarity between two vectors.

        Returns:
            Similarity score in [0, 1] (higher = more similar)

        Raises:
            ValueError: If the vectors have different dimensions.
        """
        if len(vec_a) != len(vec_b):
            raise ValueError(f"Vector dimension mismatch: {len(vec_a)} vs {len(vec_b)}")

        # Dot product
        dot_product = sum(a * b for a, b in zip(vec_a, vec_b))

        # Magnitudes
        mag_a = math.sqrt(sum(a * a for a in vec_a))
        mag_b = math.sqrt(sum(b * b for b in vec_b))

        if mag_a == 0 or mag_b == 0:
            return 0.0

        similarity = dot_product / (mag_a * mag_b)

        # Clamp to [0, 1]: anti-correlated vectors report 0, not negative.
        return max(0.0, min(1.0, similarity))

    def get_cached_embedding(self, text: str) -> List[float]:
        """
        Get embedding from cache if available, otherwise generate.
        Same as embed_text() but explicit about caching.
        """
        return self.embed_text(text)

    def clear_cache(self):
        """Clear embedding cache (both in memory and on disk)."""
        self.cache = {}
        if os.path.exists(self.cache_path):
            os.remove(self.cache_path)
        print("[EMBEDDER]: Cache cleared")
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
if __name__ == "__main__":
    # Smoke-test script: exercises embedding, similarity, and the cache.
    print("="*60)
    print("SEMANTIC EMBEDDER - Test Suite")
    print("="*60 + "\n")

    embedder = SemanticEmbedder()

    # Test 1: Basic embedding
    print("Test 1: Basic Embedding")
    text = "React hooks allow functional components to use state"
    embedding = embedder.embed_text(text)
    print(f" Text: '{text}'")
    print(f" Embedding dim: {len(embedding)}")
    print(f" First 5 values: {embedding[:5]}")

    # Test 2: Similarity between related concepts
    print("\nTest 2: Semantic Similarity")
    concepts = [
        "React hooks and useEffect",
        "Functional components with state management",
        "Database connection pooling",
        "Singleton design pattern"
    ]

    embeddings = [embedder.embed_text(c) for c in concepts]

    print("\nSimilarity Matrix:")
    for i, concept_i in enumerate(concepts):
        for j, concept_j in enumerate(concepts):
            if j >= i:  # Only upper triangle
                sim = embedder.cosine_similarity(embeddings[i], embeddings[j])
                print(f" [{i}] ↔ [{j}]: {sim:.3f}")

    print("\nConcept Labels:")
    for i, c in enumerate(concepts):
        print(f" [{i}]: {c}")

    # Test 3: Cache performance
    print("\nTest 3: Cache Performance")
    import time

    test_text = "This is a test string for cache performance"

    # First call (no cache).  BUGFIX: time.perf_counter() instead of
    # time.time() — monotonic and high-resolution, so the sub-millisecond
    # cached call does not measure as zero or negative.
    start = time.perf_counter()
    _ = embedder.embed_text(test_text)
    first_time = time.perf_counter() - start

    # Second call (cached)
    start = time.perf_counter()
    _ = embedder.embed_text(test_text)
    second_time = time.perf_counter() - start

    print(f" First call: {first_time*1000:.2f}ms")
    print(f" Cached call: {second_time*1000:.2f}ms")
    if second_time > 0:
        print(f" Speedup: {first_time/second_time:.1f}x")
    else:
        print(f" Speedup: >100x (instant cache)")

    # Save cache
    embedder.save_cache()
    print(f"\n✅ Embedder operational")
    print(f" Mode: {embedder.mode}")
    print(f" Dimension: {embedder.embed_dim}")
    print(f" Cached embeddings: {len(embedder.cache)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|