Spaces:
Sleeping
Sleeping
File size: 7,983 Bytes
fadb000 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
"""
FAISS index management for fast vector similarity search.
"""
import os
import pickle
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np
try:
import faiss
FAISS_AVAILABLE = True
except ImportError:
FAISS_AVAILABLE = False
faiss = None
from django.conf import settings
# Directory under the Django project's BASE_DIR where FAISS index files are
# written; it is created at import time if it does not already exist.
INDEX_DIR = Path(settings.BASE_DIR).joinpath("artifacts", "faiss_indexes")
INDEX_DIR.mkdir(parents=True, exist_ok=True)
class FAISSIndex:
    """FAISS index wrapper for vector similarity search.

    Every vector is L2-normalised before it is added or used as a query, and
    each row stored in the underlying FAISS index is associated with a
    caller-supplied object ID through two dictionaries kept on the instance.
    """

    def __init__(self, dimension: int, index_type: str = "IVF"):
        """
        Initialize FAISS index.

        Args:
            dimension: Embedding dimension.
            index_type: Type of index ('IVF', 'HNSW', 'Flat').

        Raises:
            ImportError: If the ``faiss`` package could not be imported.
            ValueError: If ``index_type`` is not one of the supported types.
        """
        if not FAISS_AVAILABLE:
            raise ImportError("FAISS not available. Install with: pip install faiss-cpu")
        self.dimension = dimension
        self.index_type = index_type
        self.index = None
        self.id_to_index = {}  # Map object ID to row position in the FAISS index
        self.index_to_id = {}  # Reverse mapping: row position to object ID
        self._build_index()

    def _build_index(self):
        """Build the underlying FAISS index according to ``self.index_type``.

        Raises:
            ValueError: If ``self.index_type`` is not 'Flat', 'IVF' or 'HNSW'.
        """
        if self.index_type == "Flat":
            # Brute-force exact search
            self.index = faiss.IndexFlatL2(self.dimension)
        elif self.index_type == "IVF":
            # Inverted file index (approximate, faster)
            nlist = 100  # Number of clusters
            quantizer = faiss.IndexFlatL2(self.dimension)
            self.index = faiss.IndexIVFFlat(quantizer, self.dimension, nlist)
        elif self.index_type == "HNSW":
            # Hierarchical Navigable Small World (fast approximate)
            M = 32  # Number of connections
            self.index = faiss.IndexHNSWFlat(self.dimension, M)
        else:
            raise ValueError(f"Unknown index type: {self.index_type}")

    def train(self, vectors: np.ndarray):
        """Train the index on ``vectors`` if it is not trained yet (required for IVF).

        The vectors are passed to FAISS as given; ``add`` normalises its input
        before calling this method.
        """
        if hasattr(self.index, 'train') and not self.index.is_trained:
            self.index.train(vectors)

    def add(self, vectors: np.ndarray, ids: List[int]):
        """
        Add vectors to index.

        The input is copied before normalisation, so the caller's array is
        never modified.

        Args:
            vectors: Array-like of shape (n, dimension).
            ids: List of object IDs corresponding to vectors (same length n).

        Raises:
            ValueError: If ``vectors`` is not 2-D with ``dimension`` columns,
                or if ``ids`` and ``vectors`` differ in length.
        """
        if len(vectors) == 0:
            return

        # normalize_L2 works in place on a C-contiguous float32 matrix; take a
        # private copy so the caller's data is left untouched.
        matrix = np.array(vectors, dtype='float32', order='C')
        if matrix.ndim != 2 or matrix.shape[1] != self.dimension:
            raise ValueError(
                f"Expected vectors of shape (n, {self.dimension}), got {matrix.shape}"
            )
        if len(ids) != matrix.shape[0]:
            raise ValueError(
                f"Got {len(ids)} ids for {matrix.shape[0]} vectors; lengths must match"
            )

        faiss.normalize_L2(matrix)

        # Train if needed (for IVF); train() is a no-op on a trained index.
        self.train(matrix)

        # The next free row is the index's own vector count. The size of
        # id_to_index cannot be used: it shrinks relative to the index as soon
        # as the same object ID is added twice.
        start_idx = self.index.ntotal
        self.index.add(matrix)

        for offset, obj_id in enumerate(ids):
            faiss_idx = start_idx + offset
            self.id_to_index[obj_id] = faiss_idx
            self.index_to_id[faiss_idx] = obj_id

    def search(self, query_vector: np.ndarray, k: int = 10) -> List[Tuple[int, float]]:
        """
        Search for similar vectors.

        Args:
            query_vector: Query vector of shape (dimension,).
            k: Number of results to return.

        Returns:
            List of (object_id, similarity) tuples in the order returned by
            FAISS. ``similarity`` is ``1 / (1 + d)`` where ``d`` is the
            distance reported by the index, so larger means more similar.
            Rows without a known object ID are skipped, so fewer than ``k``
            results may be returned.
        """
        if self.index.ntotal == 0:
            return []

        # Private contiguous float32 copy: the caller's vector is not modified.
        query = np.array(query_vector, dtype='float32', order='C').reshape(1, -1)
        faiss.normalize_L2(query)

        distances, indices = self.index.search(query, k)

        results = []
        for idx, dist in zip(indices[0], distances[0]):
            if idx < 0:  # FAISS pads missing neighbours with -1
                continue
            obj_id = self.index_to_id.get(int(idx))
            if obj_id is not None:
                # Map the distance to a similarity score in (0, 1].
                # NOTE(review): FAISS L2 indexes report squared distances —
                # confirm this scoring is what callers expect.
                similarity = 1.0 / (1.0 + float(dist))
                results.append((obj_id, similarity))
        return results

    def save(self, filepath: Path):
        """Save the index to ``filepath`` and its ID mappings to a sidecar file.

        The mappings are pickled next to the index, using the same name with
        the suffix replaced by ``.mappings.pkl``. Parent directories are
        created if needed.

        Args:
            filepath: Destination of the FAISS index file (``Path`` or ``str``).
        """
        filepath = Path(filepath)
        filepath.parent.mkdir(parents=True, exist_ok=True)

        faiss.write_index(self.index, str(filepath))

        mappings_file = filepath.with_suffix('.mappings.pkl')
        with open(mappings_file, 'wb') as f:
            pickle.dump({
                'id_to_index': self.id_to_index,
                'index_to_id': self.index_to_id,
                'dimension': self.dimension,
                'index_type': self.index_type
            }, f)

    @classmethod
    def load(cls, filepath: Path) -> 'FAISSIndex':
        """Load an index previously written by ``save``.

        Args:
            filepath: Location of the FAISS index file (``Path`` or ``str``).
                The mappings are read from the sibling ``.mappings.pkl`` file.

        Returns:
            A ``FAISSIndex`` restored from the two files.

        Raises:
            FileNotFoundError: If the index file (or its mappings file) is missing.
            ImportError: If the ``faiss`` package could not be imported.
        """
        filepath = Path(filepath)
        if not filepath.exists():
            raise FileNotFoundError(f"Index file not found: {filepath}")
        if not FAISS_AVAILABLE:
            raise ImportError("FAISS not available. Install with: pip install faiss-cpu")

        index = faiss.read_index(str(filepath))

        # SECURITY: pickle.load executes arbitrary code from the file. Only
        # load mappings files that this application wrote itself.
        mappings_file = filepath.with_suffix('.mappings.pkl')
        with open(mappings_file, 'rb') as f:
            mappings = pickle.load(f)

        # Bypass __init__ so no fresh (empty) index is built.
        instance = cls.__new__(cls)
        instance.index = index
        instance.id_to_index = mappings['id_to_index']
        instance.index_to_id = mappings['index_to_id']
        instance.dimension = mappings['dimension']
        instance.index_type = mappings['index_type']
        return instance
def build_faiss_index_for_model(model_class, model_name: str, index_type: str = "IVF") -> Optional[FAISSIndex]:
    """
    Build and save a FAISS index for a Django model.

    Rows whose ``embedding`` field is NULL are excluded, and rows for which
    ``load_embedding`` returns ``None`` are skipped. The resulting index is
    written to ``INDEX_DIR`` as ``<model_name>_<index_type>.faiss`` (both
    lower-cased).

    Args:
        model_class: Django model class.
        model_name: Name of model (for file naming).
        index_type: Type of FAISS index. 'IVF' is replaced by 'Flat' when
            fewer than 100 usable vectors are available.

    Returns:
        FAISSIndex instance, or None if FAISS is unavailable, the embedding
        dimension is unknown, or no usable embeddings were found.
    """
    if not FAISS_AVAILABLE:
        print("FAISS not available. Skipping index build.")
        return None

    from hue_portal.core.embeddings import get_embedding_dimension
    from hue_portal.core.embedding_utils import load_embedding

    dim = get_embedding_dimension()
    if dim == 0:
        print("Cannot determine embedding dimension. Skipping index build.")
        return None

    instances = list(model_class.objects.exclude(embedding__isnull=True))
    if not instances:
        print(f"No instances with embeddings found for {model_name}.")
        return None

    # Collect vectors and IDs before choosing the index type: some instances
    # may yield no usable embedding, so the instance count can overstate the
    # number of vectors actually available for training.
    vectors = []
    ids = []
    for instance in instances:
        embedding = load_embedding(instance)
        if embedding is not None:
            vectors.append(embedding)
            ids.append(instance.id)

    if not vectors:
        print(f"No valid embeddings found for {model_name}.")
        return None

    # The IVF index is built with 100 clusters and needs at least that many
    # training vectors; with fewer, fall back to an exact Flat index.
    min_ivf_vectors = 100
    if index_type == "IVF" and len(vectors) < min_ivf_vectors:
        print(f"⚠️ Only {len(vectors)} valid embeddings found. Switching from IVF to Flat index (IVF requires >= {min_ivf_vectors} vectors).")
        index_type = "Flat"

    faiss_index = FAISSIndex(dimension=dim, index_type=index_type)
    print(f"Building FAISS index for {model_name} ({len(vectors)} vectors, type: {index_type})...")

    vectors_array = np.array(vectors, dtype='float32')
    faiss_index.add(vectors_array, ids)

    index_file = INDEX_DIR / f"{model_name.lower()}_{index_type.lower()}.faiss"
    faiss_index.save(index_file)
    print(f"✅ Built and saved FAISS index: {index_file}")
    return faiss_index
|