File size: 1,322 Bytes
7964128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np
import faiss
from pathlib import Path
import time
import sys

# Root directory for the script's inputs and outputs (relative path, so it
# is resolved against the current working directory at use time).
DATA_DIR = Path("data")
# Cached embedding matrix that main() loads.
EMBEDDINGS_PATH = DATA_DIR.joinpath("embeddings_cache.npy")
# Destination file for the serialized FAISS index.
OUTPUT_PATH = DATA_DIR.joinpath("index", "optimized.index")

def main():
    """Build an IVF-PQ FAISS index from cached embeddings and save it.

    Loads the embedding matrix stored at ``EMBEDDINGS_PATH``, trains a
    ``faiss.IndexIVFPQ`` on it, adds every vector, and writes the index to
    ``OUTPUT_PATH`` (creating the parent directory if needed). Progress and
    a rough before/after size comparison are printed to stdout.

    Exits with status 1 (after printing a message) when:
      * the embeddings file does not exist,
      * the loaded array is not a non-empty 2-D matrix,
      * the vector dimensionality is not a multiple of ``m``,
      * there are too few vectors to train the quantizers.
    """
    if not EMBEDDINGS_PATH.exists():
        print("No embeddings found. Run scripts/1b... first.")
        sys.exit(1)

    print(f"Loading embeddings from {EMBEDDINGS_PATH}...")
    # FAISS wants a C-contiguous float32 matrix; ascontiguousarray only
    # copies when the dtype or memory layout actually has to change.
    embeddings = np.ascontiguousarray(np.load(EMBEDDINGS_PATH), dtype=np.float32)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        print(
            "Expected a non-empty 2-D embedding matrix, "
            f"got shape {embeddings.shape}."
        )
        sys.exit(1)
    nb, d = embeddings.shape

    print(f"Dataset: {nb} items, {d} dimensions.")

    nlist = 100  # number of inverted lists (coarse k-means centroids)
    m = 32       # number of PQ sub-quantizers each vector is split into
    nbits = 8    # bits per sub-quantizer code (2**nbits centroids each)

    # Product quantization splits each vector into m equal-width slices,
    # so the dimensionality has to be a multiple of m.
    if d % m != 0:
        print(
            f"Embedding dimension {d} is not divisible by m={m}; "
            "cannot build a PQ index."
        )
        sys.exit(1)

    # FAISS k-means refuses to train with fewer points than centroids; the
    # coarse quantizer needs nlist points and each PQ codebook 2**nbits.
    min_train = max(nlist, 1 << nbits)
    if nb < min_train:
        print(
            f"Need at least {min_train} vectors to train "
            f"IVF{nlist}, PQ{m}; only {nb} available."
        )
        sys.exit(1)

    print(f"Training IVF{nlist}, PQ{m} index...")
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)

    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the system clock is adjusted during training.
    start_t = time.perf_counter()
    index.train(embeddings)
    print(f"Training time: {time.perf_counter() - start_t:.2f}s")

    print("Adding vectors to index...")
    index.add(embeddings)

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(OUTPUT_PATH))

    # Bytes per stored PQ code; equals m when nbits == 8. The estimate
    # ignores per-vector ids and the centroid tables, hence "Approx".
    code_bytes = (m * nbits + 7) // 8
    print(f"Optimized index saved to {OUTPUT_PATH}")
    print(f"Original Size: {nb * d * 4 / 1024 / 1024:.2f} MB")
    print(f"Optimized Size: {nb * code_bytes / 1024 / 1024:.2f} MB (Approx)")

# Build the index only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()