mozzic commited on
Commit
a4eccb6
·
verified ·
1 Parent(s): 509df1f

Upload src/indexing.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/indexing.py +77 -0
src/indexing.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FAISS-based indexing for context units
3
+ """
4
+
5
+ import numpy as np
6
+ import os
7
+ from typing import List, Dict, Tuple, Optional
8
+ from pathlib import Path
9
+ from src.models import ContextUnit
10
+
11
+
12
class FAISSIndexer:
    """FAISS-based vector indexer for context units.

    Maintains a FAISS inner-product index alongside a parallel list of
    unit ids (`_row_ids`) so each FAISS row can be mapped back to its
    ContextUnit even if a cell_id is re-added later. Degrades to a
    no-op indexer when FAISS is not installed.
    """

    def __init__(self, dimension: int = 1536):
        """Initialize the indexer.

        Args:
            dimension: Embedding vector dimension (default matches
                OpenAI text-embedding-ada-002).
        """
        self.dimension = dimension
        self.index = None
        # cell_id -> unit; _row_ids[i] is the cell_id stored at FAISS row i.
        self.context_units: Dict[str, "ContextUnit"] = {}
        self._row_ids: List[str] = []
        self._initialize_index()

    def _initialize_index(self):
        """Create the FAISS index, or leave it as None when FAISS is absent."""
        try:
            import faiss
            # Inner product over L2-normalized vectors equals cosine
            # similarity; _get_embedding_from_text normalizes accordingly.
            self.index = faiss.IndexFlatIP(self.dimension)
        except ImportError:
            print("FAISS not installed, using dummy index")
            self.index = None

    def add_context_unit(self, unit: "ContextUnit"):
        """Add a single context unit to the index (no-op without FAISS)."""
        if self.index is None:
            return

        embedding = self._get_embedding(unit)
        self.index.add(np.array([embedding], dtype=np.float32))
        # Record row -> id explicitly: relying on dict key order breaks
        # when the same cell_id is added twice (dict entry is overwritten
        # while the FAISS index keeps growing).
        self._row_ids.append(unit.cell.cell_id)
        self.context_units[unit.cell.cell_id] = unit

    def add_multiple(self, units: List["ContextUnit"]):
        """Add multiple context units."""
        for unit in units:
            self.add_context_unit(unit)

    def search_units(self, query: str, k: int = 5) -> List[Tuple["ContextUnit", float]]:
        """Search for the `k` units most similar to `query`.

        Returns:
            (unit, score) pairs in descending-similarity order; empty
            list when the index is unavailable or holds no units.
        """
        if self.index is None or not self.context_units:
            return []

        query_embedding = self._get_embedding_from_text(query)
        scores, indices = self.index.search(
            np.array([query_embedding], dtype=np.float32),
            min(k, self.index.ntotal),
        )

        results = []
        for score, row in zip(scores[0], indices[0]):
            # FAISS pads missing results with -1; the original
            # `idx < len(...)` guard let -1 through and silently
            # returned the *last* unit. Require a valid row.
            if 0 <= row < len(self._row_ids):
                unit = self.context_units[self._row_ids[row]]
                results.append((unit, float(score)))

        return results

    def _get_embedding(self, unit: "ContextUnit") -> np.ndarray:
        """Get the embedding for a unit (intent text + cell source)."""
        text = f"{unit.intent} {unit.cell.source}"
        return self._get_embedding_from_text(text)

    def _get_embedding_from_text(self, text: str) -> np.ndarray:
        """Deterministic placeholder embedding for `text`.

        A real implementation would call an embedding API. This stub
        derives the RNG seed from the text bytes — stable across
        processes, unlike built-in hash(), which is salted per run —
        and uses a local Generator so global NumPy RNG state is not
        disturbed. The vector is L2-normalized so IndexFlatIP scores
        are true cosine similarities.
        """
        seed = int.from_bytes(text.encode("utf-8"), "little") % (2**32)
        rng = np.random.default_rng(seed)
        vec = rng.normal(0.0, 1.0, self.dimension).astype(np.float32)
        norm = float(np.linalg.norm(vec))
        return vec / norm if norm > 0.0 else vec