File size: 7,983 Bytes
fadb000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""
FAISS index management for fast vector similarity search.
"""
import os
import pickle
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np

try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False
    faiss = None

from django.conf import settings


# Default directory where serialized FAISS index files (and their sibling
# .mappings.pkl files) are stored, under the Django project's BASE_DIR.
INDEX_DIR = Path(settings.BASE_DIR) / "artifacts" / "faiss_indexes"
INDEX_DIR.mkdir(parents=True, exist_ok=True)  # module-level side effect: ensure dir exists at import time


class FAISSIndex:
    """FAISS index wrapper for vector similarity search.

    Maintains a bidirectional mapping between application object IDs and
    FAISS's internal sequential row indices so that search results can be
    translated back into object IDs.
    """

    def __init__(self, dimension: int, index_type: str = "IVF"):
        """
        Initialize FAISS index.

        Args:
            dimension: Embedding dimension.
            index_type: Type of index ('IVF', 'HNSW', 'Flat').

        Raises:
            ImportError: If faiss is not installed.
            ValueError: If index_type is unknown.
        """
        if not FAISS_AVAILABLE:
            raise ImportError("FAISS not available. Install with: pip install faiss-cpu")

        self.dimension = dimension
        self.index_type = index_type
        self.index = None
        self.id_to_index = {}  # object ID -> FAISS sequential row index
        self.index_to_id = {}  # FAISS sequential row index -> object ID
        self._build_index()

    def _build_index(self):
        """Build the underlying FAISS index based on ``self.index_type``."""
        if self.index_type == "Flat":
            # Brute-force exact search.
            self.index = faiss.IndexFlatL2(self.dimension)
        elif self.index_type == "IVF":
            # Inverted file index (approximate, faster); requires training.
            nlist = 100  # number of clusters
            quantizer = faiss.IndexFlatL2(self.dimension)
            self.index = faiss.IndexIVFFlat(quantizer, self.dimension, nlist)
        elif self.index_type == "HNSW":
            # Hierarchical Navigable Small World (fast approximate).
            M = 32  # number of graph connections per node
            self.index = faiss.IndexHNSWFlat(self.dimension, M)
        else:
            raise ValueError(f"Unknown index type: {self.index_type}")

    def train(self, vectors: np.ndarray):
        """Train the index if it requires training (IVF) and is untrained."""
        if hasattr(self.index, 'train') and not self.index.is_trained:
            self.index.train(vectors)

    def add(self, vectors: np.ndarray, ids: List[int]):
        """
        Add vectors to the index.

        Args:
            vectors: Numpy array of shape (n, dimension).
            ids: List of object IDs corresponding to vectors (same order).
        """
        if len(vectors) == 0:
            return

        # Normalize a float32 copy: FAISS requires float32 input, and
        # normalize_L2 works in place -- copying avoids silently mutating
        # the caller's array (the original implementation did mutate it).
        vecs = np.array(vectors, dtype='float32')
        faiss.normalize_L2(vecs)

        # Train first if this index type needs it (IVF).
        if hasattr(self.index, 'train') and not self.index.is_trained:
            self.train(vecs)

        # FAISS assigns sequential row indices starting at the current size.
        start_idx = len(self.id_to_index)

        self.index.add(vecs)

        # Record both directions of the ID <-> row-index mapping.
        for i, obj_id in enumerate(ids):
            faiss_idx = start_idx + i
            self.id_to_index[obj_id] = faiss_idx
            self.index_to_id[faiss_idx] = obj_id

    def search(self, query_vector: np.ndarray, k: int = 10) -> List[Tuple[int, float]]:
        """
        Search for similar vectors.

        Args:
            query_vector: Query vector of shape (dimension,).
            k: Number of results to return.

        Returns:
            List of (object_id, similarity) tuples, where similarity is
            ``1 / (1 + L2_distance)`` in (0, 1]; higher means more similar.
        """
        if self.index.ntotal == 0:
            return []

        # Reshape into a (1, dimension) float32 batch and L2-normalize so
        # the query matches how stored vectors were normalized in add().
        query_vector = query_vector.reshape(1, -1).astype('float32')
        faiss.normalize_L2(query_vector)

        distances, indices = self.index.search(query_vector, k)

        results = []
        for idx, dist in zip(indices[0], distances[0]):
            if idx < 0:  # FAISS pads with -1 when fewer than k hits exist
                continue
            obj_id = self.index_to_id.get(int(idx))
            if obj_id is not None:
                # Map L2 distance to a bounded similarity score.
                similarity = 1.0 / (1.0 + float(dist))
                results.append((obj_id, similarity))

        return results

    def save(self, filepath: Path):
        """Save the FAISS index plus a sibling pickle with the ID mappings."""
        filepath.parent.mkdir(parents=True, exist_ok=True)

        faiss.write_index(self.index, str(filepath))

        # Mappings and index metadata go in <name>.mappings.pkl next to it.
        mappings_file = filepath.with_suffix('.mappings.pkl')
        with open(mappings_file, 'wb') as f:
            pickle.dump({
                'id_to_index': self.id_to_index,
                'index_to_id': self.index_to_id,
                'dimension': self.dimension,
                'index_type': self.index_type
            }, f)

    @classmethod
    def load(cls, filepath: Path) -> 'FAISSIndex':
        """
        Load an index previously written by save().

        Raises:
            ImportError: If faiss is not installed.
            FileNotFoundError: If the index file does not exist.
        """
        # Guard explicitly: without this, faiss.read_index on the None
        # placeholder module raises a confusing AttributeError instead of
        # the ImportError the rest of this class uses.
        if not FAISS_AVAILABLE:
            raise ImportError("FAISS not available. Install with: pip install faiss-cpu")
        if not filepath.exists():
            raise FileNotFoundError(f"Index file not found: {filepath}")

        index = faiss.read_index(str(filepath))

        # NOTE: pickle is acceptable here because we only load files this
        # class itself wrote; never point this at untrusted input.
        mappings_file = filepath.with_suffix('.mappings.pkl')
        with open(mappings_file, 'rb') as f:
            mappings = pickle.load(f)

        # Bypass __init__ (which would rebuild an empty index) and restore
        # the saved state directly.
        instance = cls.__new__(cls)
        instance.index = index
        instance.id_to_index = mappings['id_to_index']
        instance.index_to_id = mappings['index_to_id']
        instance.dimension = mappings['dimension']
        instance.index_type = mappings['index_type']

        return instance


def build_faiss_index_for_model(model_class, model_name: str, index_type: str = "IVF") -> Optional[FAISSIndex]:
    """
    Build and persist a FAISS index for a Django model.

    Args:
        model_class: Django model class with an ``embedding`` field.
        model_name: Name of model (used for the index file name).
        index_type: Type of FAISS index ('IVF', 'HNSW', 'Flat').

    Returns:
        FAISSIndex instance, or None if FAISS is unavailable, the embedding
        dimension cannot be determined, or there is nothing to index.
    """
    if not FAISS_AVAILABLE:
        print("FAISS not available. Skipping index build.")
        return None

    from hue_portal.core.embeddings import get_embedding_dimension
    from hue_portal.core.embedding_utils import load_embedding

    # Determine embedding dimension; 0 signals it cannot be determined.
    dim = get_embedding_dimension()
    if dim == 0:
        print("Cannot determine embedding dimension. Skipping index build.")
        return None

    instances = list(model_class.objects.exclude(embedding__isnull=True))
    if not instances:
        print(f"No instances with embeddings found for {model_name}.")
        return None

    # Collect vectors BEFORE choosing the index type: load_embedding can
    # return None for some instances, and the IVF-vs-Flat decision must be
    # based on the number of vectors actually added, not the raw instance
    # count (the original checked len(instances), so >=100 instances with
    # <100 loadable embeddings would build an under-trained IVF index).
    vectors = []
    ids = []
    for instance in instances:
        embedding = load_embedding(instance)
        if embedding is not None:
            vectors.append(embedding)
            ids.append(instance.id)

    if not vectors:
        print(f"No valid embeddings found for {model_name}.")
        return None

    # IVF trains 100 clusters and needs at least that many vectors; fall
    # back to exact Flat search for small collections.
    if index_type == "IVF" and len(vectors) < 100:
        print(f"⚠️ Only {len(vectors)} instances found. Switching from IVF to Flat index (IVF requires >= 100 vectors).")
        index_type = "Flat"

    faiss_index = FAISSIndex(dimension=dim, index_type=index_type)

    print(f"Building FAISS index for {model_name} ({len(instances)} instances, type: {index_type})...")

    vectors_array = np.array(vectors, dtype='float32')
    faiss_index.add(vectors_array, ids)

    # Persist next to other indexes, e.g. artifacts/faiss_indexes/user_ivf.faiss
    index_file = INDEX_DIR / f"{model_name.lower()}_{index_type.lower()}.faiss"
    faiss_index.save(index_file)

    print(f"✅ Built and saved FAISS index: {index_file}")
    return faiss_index