# RAG_APP/src/docs_embd/index.py
# Repository-page metadata (not code): uploader "sxid003", commit "Upload 83 files" (3107242, verified).
import faiss
import numpy as np
from src.utils.helpers import load_embeddings
from src.configs.config import EMBEDDINGS_FILE, FAISS_INDEX_FILE, LOG_DIR
import logging
import os
# Make sure the log directory exists before configuring logging:
# logging.basicConfig(filename=...) opens the file right away and raises
# FileNotFoundError when the parent directory is missing.
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

# Route root-logger output (INFO and above) to the preprocessing log file.
# NOTE: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
def create_faiss_index():
    """Create a flat L2 FAISS index from the stored embeddings and save it.

    Loads the embeddings from ``EMBEDDINGS_FILE``, adds them to a
    ``faiss.IndexFlatL2`` and writes the resulting index to
    ``FAISS_INDEX_FILE``.

    Returns:
        The populated index, or ``None`` when there are no embeddings to
        index (nothing is written in that case).

    Raises:
        ValueError: If the loaded embeddings are not a 2-D array.
    """
    raw_embeddings = load_embeddings(EMBEDDINGS_FILE)
    if raw_embeddings is None:
        logging.warning("No embeddings to index")
        return None

    # FAISS only accepts C-contiguous float32 matrices; embeddings persisted
    # as float64 (NumPy's default) would otherwise be rejected by index.add().
    # This is a no-copy pass-through when the array already has that layout.
    embeddings = np.ascontiguousarray(raw_embeddings, dtype=np.float32)
    if embeddings.size == 0:
        logging.warning("No embeddings to index")
        return None
    if embeddings.ndim != 2:
        raise ValueError(
            f"Expected a 2-D embeddings array, got shape {embeddings.shape}"
        )

    num_vectors, dimension = embeddings.shape

    # NOTE: an exact (brute-force) flat index is used for every dataset size.
    # A previously disabled variant switched to IndexIVFFlat with
    # nlist = min(100, num_vectors) once there were >= 1000 vectors; reinstate
    # it from version control if approximate search becomes necessary.
    logging.info("Using IndexFlatL2 for %s vectors", num_vectors)
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # write_index does not create missing directories, so do it up front.
    index_path = str(FAISS_INDEX_FILE)
    parent_dir = os.path.dirname(index_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, index_path)

    logging.info(
        "FAISS index created and saved to %s with %s vectors",
        FAISS_INDEX_FILE,
        index.ntotal,
    )
    return index