"""Build and persist a FAISS index from precomputed embeddings."""
# Listing metadata from the original file view: file size 1,684 bytes, revision 3107242.
import faiss
import numpy as np
from src.utils.helpers import load_embeddings
from src.configs.config import EMBEDDINGS_FILE, FAISS_INDEX_FILE, LOG_DIR
import logging
import os
# Send this module's log records to a dedicated file under LOG_DIR.
# logging.basicConfig(filename=...) opens the file right away and raises
# FileNotFoundError when the parent directory is missing, so create it first.
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")
# NOTE: basicConfig does nothing if the root logger already has handlers
# (e.g. another module configured logging earlier in the process).
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
def create_faiss_index() -> "faiss.Index | None":
    """Create a flat L2 FAISS index from the stored embeddings and save it.

    Loads the embeddings from ``EMBEDDINGS_FILE`` via ``load_embeddings``,
    adds every row to a ``faiss.IndexFlatL2`` and writes the index to
    ``FAISS_INDEX_FILE``.

    Returns:
        The populated index, or ``None`` when there is nothing to index (a
        warning is logged and no file is written).
    """
    # NOTE(review): assumes load_embeddings returns a 2-D NumPy array of
    # shape (num_vectors, dimension) — confirm against the helper.
    embeddings = load_embeddings(EMBEDDINGS_FILE)
    if embeddings is None or embeddings.size == 0:
        logging.warning("No embeddings to index")
        return None

    # FAISS works on C-contiguous float32 matrices; normalise here so that
    # float64 or non-contiguous input does not fail inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]
    num_vectors = len(vectors)

    # A flat index does exact brute-force search and needs no training.
    # An IVF index was previously sketched here for large collections; if it
    # is reintroduced, nlist must stay well below the number of vectors.
    logging.info(f"Using IndexFlatL2 for {num_vectors} vectors")
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    # write_index does not create missing directories, so make sure the
    # destination folder exists before saving.
    index_path = str(FAISS_INDEX_FILE)
    parent_dir = os.path.dirname(index_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, index_path)
    logging.info(f"FAISS index created and saved to {FAISS_INDEX_FILE} with {index.ntotal} vectors")
    return index