""" Embeddings + FAISS index build/save/load. """ from typing import List from pathlib import Path import os import platform # On macOS, FAISS and PyTorch both ship libomp and loading both copies without # telling LibOMP they're duplicates aborts the interpreter. Setting this flag # before importing either library prevents the crash when building embeddings. if platform.system() == "Darwin": os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") import numpy as np # Import FAISS before torch/sentence-transformers so libomp loads in a safe order on macOS. import faiss from sentence_transformers import SentenceTransformer import pandas as pd def embed_texts(texts: List[str], model_name: str): """ Return matrix of embeddings for texts. # TODO hints: # - Load SentenceTransformer by name; encode with normalize_embeddings=True if available. # - Batch encode; return numpy array (n, d). # Acceptance: # - Returns embeddings and model reference (if needed). """ model = SentenceTransformer(model_name) embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True) # Ensure numpy array and float32 for FAISS compatibility embeddings = np.array(embeddings, dtype=np.float32) return embeddings, model def build_faiss_index(embeddings): """ Build a FAISS index and return it. # TODO hints: # - Use IndexFlatIP or L2; ensure vectors are normalized if using IP. # Acceptance: # - Returns a FAISS index ready for add/search. """ # Ensure embeddings are numpy array and float32 if not isinstance(embeddings, np.ndarray): embeddings = np.array(embeddings, dtype=np.float32) if embeddings.dtype != np.float32: embeddings = embeddings.astype(np.float32) # Make a copy before normalizing to avoid in-place modification issues # (normalize_L2 modifies the array in-place) embeddings = embeddings.copy() # Ensure embeddings are normalized for IndexFlatIP (inner product = cosine similarity) # Note: embeddings should already be normalized from embed_texts, but normalize_L2 is idempotent faiss.normalize_L2(embeddings) # Create IndexFlatIP (Inner Product) for normalized vectors dimension = embeddings.shape[1] index = faiss.IndexFlatIP(dimension) index.add(embeddings) return index def save_index(index, meta_rows, out_dir: str): """ Persist FAISS index + metadata (CSV/Parquet) to data/index/. Args: index: FAISS index to save meta_rows: List of dicts or DataFrame with metadata (chunk IDs, source info) out_dir: Output directory path # TODO hints: # - Write index to .faiss and metadata to .parquet with chunk IDs and source info. # Acceptance: # - Files exist in data/index/. """ out_path = Path(out_dir) out_path.mkdir(parents=True, exist_ok=True) # Save FAISS index index_path = out_path / 'index.faiss' faiss.write_index(index, str(index_path)) # Convert meta_rows to DataFrame if it's a list if isinstance(meta_rows, list): meta_df = pd.DataFrame(meta_rows) elif isinstance(meta_rows, pd.DataFrame): meta_df = meta_rows else: raise ValueError("meta_rows must be a list of dicts or a pandas DataFrame") # Save metadata metadata_path = out_path / 'metadata.parquet' meta_df.to_parquet(metadata_path, index=False) print(f"✅ Saved index to: {index_path}") print(f"✅ Saved metadata to: {metadata_path}") print(f" Index size: {index.ntotal} vectors") print(f" Metadata rows: {len(meta_df)}") def load_index(in_dir: str): """ Load FAISS index + metadata. Args: in_dir: Input directory path containing index.faiss and metadata.parquet # TODO hints: # - Read index and matching metadata frame; sanity-check row counts. # Acceptance: # - Returns (index, metadata_df). """ in_path = Path(in_dir) # Load FAISS index index_path = in_path / 'index.faiss' if not index_path.exists(): raise FileNotFoundError(f"Index file not found: {index_path}") index = faiss.read_index(str(index_path)) # Load metadata metadata_path = in_path / 'metadata.parquet' if not metadata_path.exists(): raise FileNotFoundError(f"Metadata file not found: {metadata_path}") meta_df = pd.read_parquet(metadata_path) # Sanity check: row counts should match if index.ntotal != len(meta_df): raise ValueError( f"Mismatch: index has {index.ntotal} vectors but metadata has {len(meta_df)} rows" ) print(f"✅ Loaded index: {index.ntotal} vectors, dimension {index.d}") print(f"✅ Loaded metadata: {len(meta_df)} rows") return index, meta_df