File size: 2,838 Bytes
a4eccb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""FAISS-based vector indexing and similarity search for context units."""

import numpy as np
import os
from typing import List, Dict, Tuple, Optional
from pathlib import Path
from src.models import ContextUnit


class FAISSIndexer:
    """FAISS-based vector indexer for context units.

    Vectors live in a flat inner-product index. Every vector is
    L2-normalised before it is added or used as a query, so the scores
    returned by :meth:`search_units` are cosine similarities.

    When FAISS is not installed the indexer degrades to a no-op: nothing is
    stored and every search returns an empty list.

    Attributes:
        dimension: Length of every embedding vector.
        index: The FAISS index, or ``None`` when FAISS is unavailable.
        context_units: Indexed units keyed by ``unit.cell.cell_id``.
    """

    def __init__(self, dimension: int = 1536):
        """Create an empty indexer for vectors of length *dimension*."""
        self.dimension = dimension
        self.index = None
        self.context_units: Dict[str, "ContextUnit"] = {}
        # FAISS row number -> id of the unit whose vector is stored there.
        self._row_to_id: Dict[int, str] = {}
        # Unit id -> its most recent row. Re-adding an id leaves the older
        # row in the index; such rows are treated as stale when searching.
        self._latest_row: Dict[str, int] = {}
        self._initialize_index()

    def _initialize_index(self):
        """Initialize FAISS index, or leave it as ``None`` without FAISS."""
        try:
            import faiss
        except ImportError:
            print("FAISS not installed, using dummy index")
            self.index = None
        else:
            # Inner product equals cosine similarity because every vector is
            # normalised to unit length before it reaches the index.
            self.index = faiss.IndexFlatIP(self.dimension)

    def add_context_unit(self, unit: "ContextUnit"):
        """Add a single context unit.

        Adding a unit whose ``cell.cell_id`` is already present replaces the
        stored unit; only its newest vector is considered by later searches.
        Does nothing when no index is available.
        """
        if self.index is None:
            return

        # Generate embedding (simplified - in real implementation use actual embeddings)
        embedding = self._get_embedding(unit)

        # The new vector lands in the row equal to the current row count.
        row = int(self.index.ntotal)
        self.index.add(self._to_row(embedding))

        unit_id = unit.cell.cell_id
        self._row_to_id[row] = unit_id
        self._latest_row[unit_id] = row
        self.context_units[unit_id] = unit

    def add_multiple(self, units: List["ContextUnit"]):
        """Add multiple context units, in order."""
        for unit in units:
            self.add_context_unit(unit)

    def search_units(self, query: str, k: int = 5) -> List[Tuple["ContextUnit", float]]:
        """Search for the units most similar to *query*.

        Args:
            query: Free text to embed and compare against indexed units.
            k: Maximum number of results to return.

        Returns:
            Up to *k* ``(unit, score)`` pairs, best match first, where score
            is the cosine similarity. Empty when there is no index, nothing
            has been indexed, or ``k`` is not positive.
        """
        if self.index is None or not self.context_units or k <= 0:
            return []

        total = int(self.index.ntotal)
        # Rows that are not the live vector of any unit can occupy result
        # slots, so over-fetch by that many and trim back to k below.
        stale_rows = max(0, total - len(self._latest_row))
        fetch = min(k + stale_rows, total)
        if fetch <= 0:
            return []

        query_row = self._to_row(self._get_embedding_from_text(query))
        scores, indices = self.index.search(query_row, fetch)

        results: List[Tuple["ContextUnit", float]] = []
        for score, idx in zip(scores[0], indices[0]):
            row = int(idx)
            # Unknown rows (including the -1 FAISS uses for "no result")
            # have no entry in the row map and are skipped.
            unit_id = self._row_to_id.get(row)
            if unit_id is None or self._latest_row.get(unit_id) != row:
                continue
            unit = self.context_units.get(unit_id)
            if unit is None:
                continue
            results.append((unit, float(score)))
            if len(results) == k:
                break

        return results

    def _to_row(self, vector: np.ndarray) -> np.ndarray:
        """Return *vector* as a unit-length, single-row float32 matrix.

        A zero vector is returned unchanged to avoid dividing by zero.
        """
        row = np.asarray(vector, dtype=np.float32).reshape(1, -1)
        norm = float(np.linalg.norm(row))
        if norm > 0.0:
            row = row / norm
        return np.ascontiguousarray(row, dtype=np.float32)

    def _get_embedding(self, unit: "ContextUnit") -> np.ndarray:
        """Get embedding for a context unit from its intent and cell source."""
        text = f"{unit.intent} {unit.cell.source}"
        return self._get_embedding_from_text(text)

    def _get_embedding_from_text(self, text: str) -> np.ndarray:
        """Get embedding from text (simplified placeholder).

        Returns a pseudo-random float32 vector of length ``self.dimension``
        that depends only on *text*, so the same text always maps to the
        same vector, including across separate interpreter runs.
        """
        # In real implementation, use OpenAI or other embedding API
        import hashlib

        # The built-in hash() of a str is salted per process, so derive the
        # seed from a content digest instead. A private Generator keeps
        # NumPy's global random state untouched.
        digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
        seed = int.from_bytes(digest[:8], "big")
        rng = np.random.default_rng(seed)
        return rng.standard_normal(self.dimension).astype(np.float32)