import os
import sys
import shutil
import numpy as np
import faiss
from pathlib import Path

# Make the directory two levels above this file importable so that the
# project-local ``utils`` package resolves when this file is run as a script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from utils import base_utils as bu


def build_faiss_index(config_path: str = "configs/config.json") -> None:
    """Build a flat inner-product FAISS index from saved embeddings.

    Reads ``embeddings.npy`` and ``metadata.jsonl`` from the configured
    embeddings directory, L2-normalizes the vectors, adds them to a
    ``faiss.IndexFlatIP`` and writes ``faiss.index`` plus a copy of
    ``metadata.jsonl`` into the configured index directory.

    Args:
        config_path: Path handed to ``bu.load_config``. The loaded config
            must expose a ``"paths"`` mapping; its ``"embeddings_dir"`` and
            ``"index_dir"`` entries default to ``data/embeddings`` and
            ``data/index`` when absent.

    Raises:
        FileNotFoundError: If ``embeddings.npy`` or ``metadata.jsonl`` is
            missing from the embeddings directory.
        ValueError: If the loaded embeddings array is not 2-dimensional.
    """
    config = bu.load_config(config_path)
    paths_cfg = config["paths"]
    embeddings_dir = paths_cfg.get("embeddings_dir", "data/embeddings")
    index_dir = paths_cfg.get("index_dir", "data/index")

    embeddings_path = os.path.join(embeddings_dir, "embeddings.npy")
    metadata_path = os.path.join(embeddings_dir, "metadata.jsonl")

    if not os.path.exists(embeddings_path) or not os.path.exists(metadata_path):
        raise FileNotFoundError(
            f"embeddings.npy or metadata.jsonl not found in {embeddings_dir}. "
            "Run scripts/generate_embeddings.py first."
        )

    Path(index_dir).mkdir(parents=True, exist_ok=True)

    # FAISS needs row-major (C-contiguous) float32 input. ``astype`` keeps
    # the stored memory order, so a Fortran-ordered .npy would be rejected by
    # ``faiss.normalize_L2``; ``ascontiguousarray`` fixes both dtype and layout
    # and avoids a copy when the data already conforms.
    embs = np.ascontiguousarray(np.load(embeddings_path), dtype=np.float32)
    if embs.ndim != 2:
        raise ValueError("embeddings.npy must be a 2D array [N, D]")

    num_vectors, dim = embs.shape
    print(f"Loaded embeddings: {num_vectors} vectors of dimension {dim}")

    # Normalizing in place makes inner-product search equivalent to cosine
    # similarity over the original vectors.
    faiss.normalize_L2(embs)
    index = faiss.IndexFlatIP(dim)
    index.add(embs)

    index_file = os.path.join(index_dir, "faiss.index")
    faiss.write_index(index, index_file)
    print("Index written to", index_file)

    # Keep the metadata next to the index so both can be loaded from one
    # directory.
    target_metadata_path = os.path.join(index_dir, "metadata.jsonl")
    shutil.copyfile(metadata_path, target_metadata_path)
    print("Metadata copied to", target_metadata_path)


if __name__ == "__main__":
    build_faiss_index()