import logging
import os

import faiss
import numpy as np

from src.configs.config import EMBEDDINGS_FILE, FAISS_INDEX_FILE, LOG_DIR
from src.utils.helpers import load_embeddings

LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

# logging.basicConfig(filename=...) opens the log file immediately, so the
# directory has to exist before the call or the import itself fails.
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def create_faiss_index():
    """Create a flat L2 FAISS index from the stored embeddings and save it.

    Embeddings are read from ``EMBEDDINGS_FILE`` via ``load_embeddings``,
    added to a ``faiss.IndexFlatL2`` (exact, brute-force search) and the
    resulting index is written to ``FAISS_INDEX_FILE``.

    Returns:
        The populated ``faiss.IndexFlatL2``, or ``None`` when there are no
        embeddings to index (a warning is logged in that case).

    Note:
        The second axis of the loaded array is used as the vector dimension,
        i.e. the embeddings are assumed to be 2-D ``(num_vectors, dimension)``
        -- TODO confirm against ``load_embeddings``.
    """
    embeddings = load_embeddings(EMBEDDINGS_FILE)

    if embeddings is None or embeddings.size == 0:
        logging.warning("No embeddings to index")
        return None

    # FAISS only accepts C-contiguous float32 matrices; NumPy defaults to
    # float64, so normalise here instead of relying on the loader.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = vectors.shape[1]
    num_vectors = len(vectors)

    logging.info("Using IndexFlatL2 for %s vectors", num_vectors)
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    # write_index expects a plain string path, hence the str() conversion.
    faiss.write_index(index, str(FAISS_INDEX_FILE))
    logging.info(
        "FAISS index created and saved to %s with %s vectors",
        FAISS_INDEX_FILE,
        index.ntotal,
    )

    return index