Spaces:
Sleeping
Sleeping
File size: 7,832 Bytes
"""
SEMANTIC EMBEDDER
Lightweight embedding engine for manifold pathfinding.
Uses sentence-transformers (all-MiniLM-L6-v2) for 384-dim vectors.
Falls back to simple TF-IDF if transformers unavailable.
"""
import sys
import os
import json
import math
import hashlib
from typing import List, Dict
# Optional dependency: sentence-transformers. HAS_TRANSFORMERS records whether
# the import succeeded so the embedder can choose its backend at construction.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    HAS_TRANSFORMERS = False
    print("[EMBEDDER]: sentence-transformers not available, using fallback")
else:
    HAS_TRANSFORMERS = True
class SemanticEmbedder:
    """
    Generates semantic embeddings for text.
    Caches results to avoid recomputation.

    When HAS_TRANSFORMERS is true the 'all-MiniLM-L6-v2' sentence-transformers
    model is used (384-dim vectors); otherwise a hashed token / character
    trigram vector (128-dim) is produced. Embeddings are kept in an in-memory
    dict keyed by the MD5 hex digest of the text and persisted as JSON at
    ``../Lattice_DB/embedding_cache.json`` relative to this file.
    """

    def __init__(self):
        self.cache_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..",
            "Lattice_DB",
            "embedding_cache.json"
        )
        self.cache = self.load_cache()
        # Initialize model
        if HAS_TRANSFORMERS:
            print("[EMBEDDER]: Loading sentence-transformers model...")
            self.model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embed_dim = 384
            self.mode = "transformers"
            print("[EMBEDDER]: Loaded (384-dim vectors)")
        else:
            self.model = None
            self.embed_dim = 128  # Fallback dimension
            self.mode = "fallback"
            print("[EMBEDDER]: Using fallback embeddings (128-dim)")

    def load_cache(self):
        """
        Load embedding cache from disk.

        Returns:
            The cached mapping, or an empty dict when the file is missing,
            unreadable, not valid JSON, or does not hold a JSON object.
        """
        try:
            with open(self.cache_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            return {}
        # A non-dict payload would break the key lookups in embed_text().
        return data if isinstance(data, dict) else {}

    def save_cache(self):
        """
        Save embedding cache to disk.

        The JSON is written to a sibling temporary file and then moved over
        the real path, so an interrupted write cannot leave a truncated cache
        file (which load_cache() would discard entirely).
        """
        os.makedirs(os.path.dirname(self.cache_path), exist_ok=True)
        tmp_path = self.cache_path + ".tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f)
        os.replace(tmp_path, self.cache_path)

    def embed_text(self, text: str) -> List[float]:
        """
        Generate semantic embedding for text.

        Args:
            text: Input text to embed

        Returns:
            Vector of dimension self.embed_dim
        """
        # Check cache first
        cache_key = hashlib.md5(text.encode()).hexdigest()
        cached = self.cache.get(cache_key)
        # Only trust entries whose dimension matches the active backend; a
        # cache written in the other mode would otherwise hand back vectors
        # that cosine_similarity() rejects as mismatched.
        if isinstance(cached, list) and len(cached) == self.embed_dim:
            return cached
        # Generate embedding
        if self.mode == "transformers":
            embedding = self._embed_transformers(text)
        else:
            embedding = self._embed_fallback(text)
        # Cache result
        self.cache[cache_key] = embedding
        # Save every 10 embeddings
        if len(self.cache) % 10 == 0:
            self.save_cache()
        return embedding

    def _embed_transformers(self, text: str) -> List[float]:
        """Use sentence-transformers to generate embedding."""
        embedding = self.model.encode(text, convert_to_numpy=True)
        return embedding.tolist()

    @staticmethod
    def _stable_bucket(feature: str, dim: int) -> int:
        """Map a string feature to an index in [0, dim) identically in every process."""
        # Built-in hash() is salted per interpreter run for str, which would
        # make persisted fallback vectors incomparable with fresh ones.
        return int(hashlib.md5(feature.encode("utf-8")).hexdigest(), 16) % dim

    def _embed_fallback(self, text: str) -> List[float]:
        """
        Fallback embedding using simple TF-IDF-like approach.
        Not as good as transformers, but better than hash functions.

        Whitespace tokens (weight 1.0) and lower-cased character trigrams
        (weight 0.5) are hashed into self.embed_dim buckets, then the vector
        is L2-normalised. Empty or featureless text yields an all-zero vector.
        """
        # Tokenize
        tokens = text.lower().split()
        # Character n-grams for robustness
        char_ngrams = [text[i:i + 3].lower() for i in range(len(text) - 2)]
        # Create sparse vector
        vector = [0.0] * self.embed_dim
        # Hash tokens into vector dimensions
        for token in tokens:
            vector[self._stable_bucket(token, self.embed_dim)] += 1.0
        # Hash character n-grams
        for ngram in char_ngrams:
            vector[self._stable_bucket(ngram, self.embed_dim)] += 0.5
        # Normalize
        magnitude = math.sqrt(sum(x * x for x in vector))
        if magnitude > 0:
            vector = [x / magnitude for x in vector]
        return vector

    def cosine_similarity(self, vec_a: List[float], vec_b: List[float]) -> float:
        """
        Calculate cosine similarity between two vectors.

        Returns:
            Similarity score in [0, 1] (higher = more similar). Negative
            cosines are clamped to 0.0, and 0.0 is returned when either
            vector has zero magnitude.

        Raises:
            ValueError: If the vectors have different lengths.
        """
        if len(vec_a) != len(vec_b):
            raise ValueError(f"Vector dimension mismatch: {len(vec_a)} vs {len(vec_b)}")
        # Dot product
        dot_product = sum(a * b for a, b in zip(vec_a, vec_b))
        # Magnitudes
        mag_a = math.sqrt(sum(a * a for a in vec_a))
        mag_b = math.sqrt(sum(b * b for b in vec_b))
        if mag_a == 0 or mag_b == 0:
            return 0.0
        similarity = dot_product / (mag_a * mag_b)
        # Clamp to [0, 1]
        return max(0.0, min(1.0, similarity))

    def get_cached_embedding(self, text: str) -> List[float]:
        """
        Get embedding from cache if available, otherwise generate.
        Same as embed_text() but explicit about caching.
        """
        return self.embed_text(text)

    def clear_cache(self):
        """Clear embedding cache, both in memory and on disk."""
        self.cache = {}
        try:
            os.remove(self.cache_path)
        except FileNotFoundError:
            # Nothing persisted yet (or already removed) - nothing to do.
            pass
        print("[EMBEDDER]: Cache cleared")
if __name__ == "__main__":
    # Manual smoke test: builds an embedder, prints a sample embedding, a
    # pairwise similarity table, and a cached-vs-uncached timing comparison.
    import time

    print("=" * 60)
    print("SEMANTIC EMBEDDER - Test Suite")
    print("=" * 60 + "\n")
    embedder = SemanticEmbedder()

    # Test 1: Basic embedding
    print("Test 1: Basic Embedding")
    text = "React hooks allow functional components to use state"
    embedding = embedder.embed_text(text)
    print(f" Text: '{text}'")
    print(f" Embedding dim: {len(embedding)}")
    print(f" First 5 values: {embedding[:5]}")

    # Test 2: Similarity between related concepts
    print("\nTest 2: Semantic Similarity")
    concepts = [
        "React hooks and useEffect",
        "Functional components with state management",
        "Database connection pooling",
        "Singleton design pattern",
    ]
    embeddings = [embedder.embed_text(c) for c in concepts]
    print("\nSimilarity Matrix:")
    for i in range(len(concepts)):
        # Only upper triangle (including the diagonal)
        for j in range(i, len(concepts)):
            sim = embedder.cosine_similarity(embeddings[i], embeddings[j])
            print(f" [{i}] ↔ [{j}]: {sim:.3f}")
    print("\nConcept Labels:")
    for i, c in enumerate(concepts):
        print(f" [{i}]: {c}")

    # Test 3: Cache performance
    print("\nTest 3: Cache Performance")
    test_text = "This is a test string for cache performance"
    # perf_counter() is the high-resolution clock meant for short intervals;
    # time.time() is often too coarse to register a cached lookup.
    # First call (no cache, unless a previous run persisted this text)
    start = time.perf_counter()
    _ = embedder.embed_text(test_text)
    first_time = time.perf_counter() - start
    # Second call (cached)
    start = time.perf_counter()
    _ = embedder.embed_text(test_text)
    second_time = time.perf_counter() - start
    print(f" First call: {first_time*1000:.2f}ms")
    print(f" Cached call: {second_time*1000:.2f}ms")
    if second_time > 0:
        print(f" Speedup: {first_time/second_time:.1f}x")
    else:
        print(" Speedup: >100x (instant cache)")

    # Save cache
    embedder.save_cache()
    print("\n✅ Embedder operational")
    print(f" Mode: {embedder.mode}")
    print(f" Dimension: {embedder.embed_dim}")
    print(f" Cached embeddings: {len(embedder.cache)}")
|