Spaces:
Sleeping
Sleeping
| """ | |
| FAISS-based indexing for context units | |
| """ | |
| import numpy as np | |
| import os | |
| from typing import List, Dict, Tuple, Optional | |
| from pathlib import Path | |
| from src.models import ContextUnit | |
class FAISSIndexer:
    """FAISS-based vector indexer for context units.

    Vectors are stored in a flat inner-product index. Every vector is
    L2-normalised before it is added or used as a query, so the scores
    returned by :meth:`search_units` are cosine similarities.

    Attributes:
        dimension: Length of every embedding vector.
        index: The FAISS index, or ``None`` when FAISS is not installed
            (the indexer then silently ignores additions and returns no
            search results).
        context_units: Most recently added unit for each ``cell_id``.
    """

    def __init__(self, dimension: int = 1536):
        """Create an empty indexer for vectors of length ``dimension``."""
        self.dimension = dimension
        self.index = None
        self.context_units: Dict[str, "ContextUnit"] = {}
        # cell_id of every row in the FAISS index, in insertion order. The
        # dict above cannot serve as the row map: re-adding an existing
        # cell_id appends a new row without adding a new key.
        self._row_ids: List[str] = []
        self._initialize_index()

    def _initialize_index(self):
        """Initialize FAISS index, falling back to ``None`` without FAISS."""
        try:
            import faiss
        except ImportError:
            print("FAISS not installed, using dummy index")
            self.index = None
        else:
            # Inner product equals cosine similarity on unit-length vectors.
            self.index = faiss.IndexFlatIP(self.dimension)

    def add_context_unit(self, unit: "ContextUnit"):
        """Add a single context unit."""
        self.add_multiple([unit])

    def add_multiple(self, units: List["ContextUnit"]):
        """Add multiple context units with a single call to the index.

        Does nothing when no index is available or ``units`` is empty. A
        unit whose ``cell_id`` was added before replaces the stored unit
        for that id; its earlier vector stays in the index and resolves to
        the newest unit.
        """
        if self.index is None:
            return
        batch = list(units)
        if not batch:
            return

        rows = self._unit_rows([self._get_embedding(unit) for unit in batch])
        self.index.add(rows)
        for unit in batch:
            cell_id = unit.cell.cell_id
            self._row_ids.append(cell_id)
            self.context_units[cell_id] = unit

    def search_units(self, query: str, k: int = 5) -> List[Tuple["ContextUnit", float]]:
        """Search for similar units.

        Args:
            query: Free text to embed and compare against the index.
            k: Maximum number of index rows to retrieve.

        Returns:
            ``(unit, score)`` pairs in the order the index returned them.
            Each unit appears at most once, so fewer than ``k`` pairs may
            come back. Empty when there is no index, nothing has been
            added, or ``k`` is not positive.
        """
        if self.index is None or not self.context_units or k <= 0:
            return []
        top_k = min(k, self.index.ntotal)
        if top_k <= 0:
            return []

        query_rows = self._unit_rows([self._get_embedding_from_text(query)])
        scores, indices = self.index.search(query_rows, top_k)

        # If something populated the index without going through
        # add_multiple, the row map is out of step; fall back to dict order.
        if len(self._row_ids) == self.index.ntotal:
            row_ids = self._row_ids
        else:
            row_ids = list(self.context_units)

        results = []
        seen = set()
        for score, row in zip(scores[0], indices[0]):
            row = int(row)
            # Negative rows mark "no result" and must not wrap around to
            # the end of the list.
            if not 0 <= row < len(row_ids):
                continue
            unit_id = row_ids[row]
            if unit_id in seen:
                continue
            unit = self.context_units.get(unit_id)
            if unit is None:
                continue
            seen.add(unit_id)
            results.append((unit, float(score)))
        return results

    @staticmethod
    def _unit_rows(vectors) -> np.ndarray:
        """Return ``vectors`` as a contiguous float32 matrix of unit-length rows."""
        matrix = np.atleast_2d(np.asarray(vectors, dtype=np.float32))
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0  # leave all-zero rows untouched
        return np.ascontiguousarray(matrix / norms, dtype=np.float32)

    def _get_embedding(self, unit: "ContextUnit") -> np.ndarray:
        """Get embedding for a context unit from its intent and cell source."""
        text = f"{unit.intent} {unit.cell.source}"
        return self._get_embedding_from_text(text)

    def _get_embedding_from_text(self, text: str) -> np.ndarray:
        """Get embedding from text (simplified).

        Placeholder for a real embedding API: returns a pseudo-random
        float32 vector of length ``self.dimension`` that depends only on
        ``text``.
        """
        import hashlib

        # The built-in hash() of a str is salted per process, so it cannot
        # give reproducible vectors; derive the seed from a digest instead.
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        seed = int.from_bytes(digest[:8], "big")
        # A private generator leaves NumPy's global random state untouched.
        rng = np.random.default_rng(seed)
        return rng.standard_normal(self.dimension).astype(np.float32)