File size: 1,573 Bytes
6f54a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import sys
import shutil
import numpy as np
import faiss
from pathlib import Path

# ROOT is the directory two levels above this file, i.e. the parent of the
# directory that contains this script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put ROOT at the front of sys.path (once) so the `utils` import below can be
# resolved when this file is run directly as a script.
# NOTE(review): presumably the `utils` package lives directly under ROOT — confirm.
if ROOT not in sys.path:
	sys.path.insert(0, ROOT)
from utils import base_utils as bu

def build_faiss_index(config_path: str = "configs/config.json") -> None:
	"""Build a flat inner-product FAISS index from saved embeddings.

	Reads ``embeddings.npy`` and ``metadata.jsonl`` from the configured
	embeddings directory, L2-normalizes the vectors, adds them to a
	``faiss.IndexFlatIP`` and writes ``faiss.index`` into the configured index
	directory. ``metadata.jsonl`` is copied next to the index unchanged.

	Because the stored vectors are unit length, the inner product returned by
	the index equals cosine similarity for queries that are also L2-normalized.

	Args:
		config_path: Path handed to ``bu.load_config``. The loaded config must
			contain a ``"paths"`` mapping; ``embeddings_dir`` and ``index_dir``
			inside it fall back to ``data/embeddings`` and ``data/index``.

	Raises:
		FileNotFoundError: If ``embeddings.npy`` or ``metadata.jsonl`` is
			missing from the embeddings directory.
		ValueError: If the embeddings array is not 2D or contains no data.
	"""
	config = bu.load_config(config_path)

	embeddings_dir = config["paths"].get("embeddings_dir", "data/embeddings")
	index_dir = config["paths"].get("index_dir", "data/index")

	embeddings_path = os.path.join(embeddings_dir, "embeddings.npy")
	metadata_path = os.path.join(embeddings_dir, "metadata.jsonl")

	if not os.path.exists(embeddings_path) or not os.path.exists(metadata_path):
		raise FileNotFoundError(
			f"embeddings.npy or metadata.jsonl not found in {embeddings_dir}. "
			"Run scripts/generate_embeddings.py first."
		)

	# faiss.normalize_L2 works in place on a C-contiguous float32 buffer.
	# ascontiguousarray guarantees that layout (astype would keep a
	# Fortran-ordered array as-is) and skips the copy when the file already
	# holds C-ordered float32 data.
	embs = np.ascontiguousarray(np.load(embeddings_path), dtype=np.float32)
	if embs.ndim != 2:
		raise ValueError("embeddings.npy must be a 2D array [N, D]")

	num_vectors, dim = embs.shape
	if num_vectors == 0 or dim == 0:
		raise ValueError(
			f"embeddings.npy is empty (shape {embs.shape}); nothing to index"
		)
	print(f"Loaded embeddings: {num_vectors} vectors of dimension {dim}")

	# Count non-blank metadata lines in binary mode so an unexpected text
	# encoding cannot make this sanity check fail on its own.
	# NOTE(review): assumes one JSONL record per embedding row, in the same
	# order — confirm against scripts/generate_embeddings.py. Only a warning
	# is emitted so existing pipelines keep running.
	with open(metadata_path, "rb") as metadata_file:
		num_records = sum(1 for line in metadata_file if line.strip())
	if num_records != num_vectors:
		print(
			f"WARNING: metadata.jsonl has {num_records} records but "
			f"embeddings.npy has {num_vectors} vectors; index ids may not "
			"line up with metadata rows."
		)

	# Create the output directory only after the inputs are validated, so a
	# failed run does not leave an empty index directory behind.
	Path(index_dir).mkdir(parents=True, exist_ok=True)

	faiss.normalize_L2(embs)

	index = faiss.IndexFlatIP(dim)
	index.add(embs)

	index_file = os.path.join(index_dir, "faiss.index")
	faiss.write_index(index, index_file)
	print("Index written to", index_file)

	target_metadata_path = os.path.join(index_dir, "metadata.jsonl")
	shutil.copyfile(metadata_path, target_metadata_path)
	print("Metadata copied to", target_metadata_path)


# Script entry point: build the index using the default config path.
if __name__ == "__main__":
	build_faiss_index()