Spaces:
Sleeping
Sleeping
| import faiss | |
| import numpy as np | |
| from src.utils.helpers import load_embeddings | |
| from src.configs.config import EMBEDDINGS_FILE, FAISS_INDEX_FILE, LOG_DIR | |
| import logging | |
| import os | |
# Log file for this pipeline step; it lives under the project-configured LOG_DIR.
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

# basicConfig(filename=...) opens the log file as soon as it installs the
# handler, so a missing directory would make importing this module fail with
# FileNotFoundError. Create it up front; exist_ok keeps this a no-op otherwise.
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
def create_faiss_index():
    """Create a flat L2 FAISS index from the stored embeddings and save it.

    Embeddings are loaded from ``EMBEDDINGS_FILE`` with ``load_embeddings``,
    added to a ``faiss.IndexFlatL2`` and the resulting index is written to
    ``FAISS_INDEX_FILE``.

    The loaded array is assumed to be 2-D, ``(num_vectors, dimension)``:
    ``shape[1]`` is used as the index dimension.

    Returns:
        The populated index, or ``None`` (after logging a warning) when the
        loaded embeddings are empty.
    """
    embeddings = load_embeddings(EMBEDDINGS_FILE)
    if embeddings.size == 0:
        logging.warning("No embeddings to index")
        return None

    # FAISS operates on C-contiguous float32 matrices. Normalising here keeps
    # float64 or non-contiguous input from being rejected by wrappers that do
    # not convert on their own; it is a no-op when the array already conforms.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = vectors.shape[1]
    num_vectors = len(vectors)

    logging.info("Using IndexFlatL2 for %s vectors", num_vectors)
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    # Make sure the target directory exists before writing the index file.
    index_path = str(FAISS_INDEX_FILE)
    parent_dir = os.path.dirname(index_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, index_path)

    logging.info(
        "FAISS index created and saved to %s with %s vectors",
        FAISS_INDEX_FILE,
        index.ntotal,
    )
    return index