| import numpy as np | |
| import faiss | |
| from pathlib import Path | |
| import time | |
| import sys | |
# Root directory for all pipeline artifacts, relative to the working directory.
DATA_DIR = Path("data")
# Cached embedding matrix (.npy) that main() loads as the index input.
EMBEDDINGS_PATH = DATA_DIR.joinpath("embeddings_cache.npy")
# Destination file for the trained and populated FAISS index.
OUTPUT_PATH = DATA_DIR.joinpath("index", "optimized.index")
def main():
    """Build a FAISS IVF-PQ index from the cached embeddings and save it.

    Loads the embedding matrix from ``EMBEDDINGS_PATH``, trains an
    ``IndexIVFPQ`` (100 inverted lists, 32 sub-quantizers of 8 bits each)
    on it, adds every vector, writes the index to ``OUTPUT_PATH`` and
    prints a size comparison.

    Exits with status 1, after printing an explanation, when:
      * the embeddings cache file does not exist;
      * the loaded array is not a non-empty 2-D matrix;
      * the vector dimension is not a multiple of the number of
        sub-quantizers;
      * there are fewer vectors than the k-means trainings need.
    """
    if not EMBEDDINGS_PATH.exists():
        print("No embeddings found. Run scripts/1b... first.")
        sys.exit(1)

    print(f"Loading embeddings from {EMBEDDINGS_PATH}...")
    # FAISS works on C-contiguous float32 matrices; ascontiguousarray only
    # copies when the stored array is not already in that layout/dtype.
    embeddings = np.ascontiguousarray(np.load(EMBEDDINGS_PATH), dtype=np.float32)
    if embeddings.ndim != 2 or embeddings.size == 0:
        print(
            "Expected a non-empty 2-D embedding matrix, "
            f"got shape {embeddings.shape}."
        )
        sys.exit(1)

    nb, d = embeddings.shape
    print(f"Dataset: {nb} items, {d} dimensions.")

    nlist = 100  # number of inverted lists (coarse k-means centroids)
    m = 32  # number of PQ sub-quantizers
    nbits = 8  # bits per sub-quantizer code

    # Product quantization splits each vector into m equal-sized slices, so
    # the dimension has to be divisible by m; fail early with a clear message
    # instead of an opaque FAISS error at construction time.
    if d % m != 0:
        print(
            f"Embedding dimension {d} is not a multiple of m={m}; "
            "cannot build the PQ index."
        )
        sys.exit(1)

    # The k-means trainings (nlist coarse centroids, 2**nbits centroids per
    # sub-quantizer) need at least as many training points as centroids.
    min_points = max(nlist, 2 ** nbits)
    if nb < min_points:
        print(
            f"Need at least {min_points} vectors to train this index, "
            f"got {nb}."
        )
        sys.exit(1)

    print(f"Training IVF{nlist}, PQ{m} index...")
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_t = time.perf_counter()
    index.train(embeddings)
    print(f"Training time: {time.perf_counter() - start_t:.2f}s")

    print("Adding vectors to index...")
    index.add(embeddings)

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(OUTPUT_PATH))
    print(f"Optimized index saved to {OUTPUT_PATH}")

    print(f"Original Size: {nb * d * 4 / 1024 / 1024:.2f} MB")
    # Estimate counts only the PQ codes (m * nbits / 8 bytes per vector); it
    # leaves out vector ids, codebooks and coarse centroids.
    pq_code_bytes = nb * m * nbits / 8
    print(f"Optimized Size: {pq_code_bytes / 1024 / 1024:.2f} MB (Approx)")
    # Also report the real footprint of the file that was just written.
    on_disk_bytes = OUTPUT_PATH.stat().st_size
    print(f"Index file on disk: {on_disk_bytes / 1024 / 1024:.2f} MB")
# Build the index only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()