File size: 1,684 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import faiss
import numpy as np
from src.utils.helpers import load_embeddings
from src.configs.config import EMBEDDINGS_FILE, FAISS_INDEX_FILE, LOG_DIR
import logging
import os

# Destination for this module's log output: a fixed file name under LOG_DIR.
LOG_FILE = os.path.join(LOG_DIR, "docs_preprocessing.log")

# Configure the root logger at import time so INFO-and-above records are
# written to LOG_FILE. Note that basicConfig() does nothing if the root
# logger already has handlers configured.
logging.basicConfig(
    level=logging.INFO,
    filename=LOG_FILE,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
)

def create_faiss_index():
    """Build a flat L2 FAISS index from the stored embeddings and save it.

    Embeddings are read from ``EMBEDDINGS_FILE`` via ``load_embeddings``,
    added to a ``faiss.IndexFlatL2`` and the resulting index is written to
    ``FAISS_INDEX_FILE``.

    Returns:
        The populated index, or ``None`` when there are no embeddings to
        index (in that case nothing is written to disk).
    """
    embeddings = load_embeddings(EMBEDDINGS_FILE)
    if embeddings.size == 0:
        logging.warning("No embeddings to index")
        return None

    # FAISS's Python bindings only accept C-contiguous float32 matrices.
    # This is a no-op (no copy) when the array already has that layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = vectors.shape[1]
    num_vectors = len(vectors)

    # IndexFlatL2 performs exact brute-force search and needs no training
    # step, so vectors can be added directly.
    logging.info("Using IndexFlatL2 for %s vectors", num_vectors)
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    # str() because FAISS_INDEX_FILE may be a path object rather than a str.
    faiss.write_index(index, str(FAISS_INDEX_FILE))
    logging.info(
        "FAISS index created and saved to %s with %s vectors",
        FAISS_INDEX_FILE,
        index.ntotal,
    )
    return index