| import os | |
| import sys | |
| import shutil | |
| import numpy as np | |
| import faiss | |
| from pathlib import Path | |
# Project root: the directory one level above the folder containing this file.
ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

# Put the project root on the import path so that the project-local import
# below resolves when this file is executed directly as a script.
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
| from utils import base_utils as bu | |
def _count_jsonl_records(path: str) -> int:
    """Return the number of non-blank lines in the file at *path*."""
    # Binary mode: only line boundaries matter, so no text decoding is needed.
    with open(path, "rb") as fh:
        return sum(1 for line in fh if line.strip())


def build_faiss_index(config_path: str = "configs/config.json") -> None:
    """Build a flat inner-product FAISS index from saved embeddings.

    Reads ``embeddings.npy`` and ``metadata.jsonl`` from the configured
    embeddings directory, L2-normalizes the vectors, adds them to a
    ``faiss.IndexFlatIP`` and writes ``faiss.index`` plus a copy of
    ``metadata.jsonl`` into the configured index directory.

    Args:
        config_path: Path handed to ``bu.load_config``. The loaded config may
            carry a ``"paths"`` mapping with ``"embeddings_dir"`` and
            ``"index_dir"``; missing entries fall back to ``data/embeddings``
            and ``data/index``.

    Raises:
        FileNotFoundError: If ``embeddings.npy`` or ``metadata.jsonl`` is
            missing from the embeddings directory.
        ValueError: If the embeddings array is not 2-D or holds no data.
    """
    config = bu.load_config(config_path)
    # Every key under "paths" has a default, so a config without the section
    # at all should behave the same as one with an empty section.
    try:
        paths = config["paths"]
    except KeyError:
        paths = {}
    embeddings_dir = paths.get("embeddings_dir", "data/embeddings")
    index_dir = paths.get("index_dir", "data/index")

    embeddings_path = os.path.join(embeddings_dir, "embeddings.npy")
    metadata_path = os.path.join(embeddings_dir, "metadata.jsonl")
    if not (os.path.exists(embeddings_path) and os.path.exists(metadata_path)):
        raise FileNotFoundError(
            f"embeddings.npy or metadata.jsonl not found in {embeddings_dir}. "
            "Run scripts/generate_embeddings.py first."
        )

    # FAISS operates on C-contiguous float32 buffers. ascontiguousarray
    # converts only when needed (no extra copy for data that is already
    # float32 and C-ordered) and also fixes Fortran-ordered arrays.
    embs = np.ascontiguousarray(np.load(embeddings_path), dtype=np.float32)
    if embs.ndim != 2:
        raise ValueError("embeddings.npy must be a 2D array [N, D]")
    num_vectors, dim = embs.shape
    if num_vectors == 0 or dim == 0:
        raise ValueError(
            f"embeddings.npy is empty (shape {embs.shape}); nothing to index"
        )
    print(f"Loaded embeddings: {num_vectors} vectors of dimension {dim}")

    # NOTE(review): presumably metadata.jsonl holds one record per vector, in
    # the same order - confirm against the embedding generator. A mismatch is
    # reported but not fatal, since that layout is not established here.
    num_records = _count_jsonl_records(metadata_path)
    if num_records != num_vectors:
        print(
            f"Warning: metadata.jsonl has {num_records} records but "
            f"embeddings.npy has {num_vectors} vectors",
            file=sys.stderr,
        )

    # Create the output directory only after the input has been validated, so
    # a failed run does not leave an empty index directory behind.
    Path(index_dir).mkdir(parents=True, exist_ok=True)

    # Unit-length vectors make the inner product equal to cosine similarity.
    faiss.normalize_L2(embs)
    index = faiss.IndexFlatIP(dim)
    index.add(embs)

    index_file = os.path.join(index_dir, "faiss.index")
    faiss.write_index(index, index_file)
    print("Index written to", index_file)

    # Keep the metadata next to the index file.
    target_metadata_path = os.path.join(index_dir, "metadata.jsonl")
    shutil.copyfile(metadata_path, target_metadata_path)
    print("Metadata copied to", target_metadata_path)
if __name__ == "__main__":
    # Script entry point: build the index using the default config path.
    build_faiss_index()