# beta-NORM / scripts/build_index.py
# GitHub Actions
# Snapshot from GitHub master for HF Space
# 6f54a86
import os
import sys
import shutil
import numpy as np
import faiss
from pathlib import Path
# Put the repository root (the parent of this script's directory) on the
# import path so that the project-local `utils` package resolves when this
# file is executed directly as a script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(_SCRIPT_DIR)
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
from utils import base_utils as bu
def build_faiss_index(config_path: str = "configs/config.json") -> None:
    """Build a flat inner-product FAISS index from saved embeddings.

    Reads ``embeddings.npy`` and ``metadata.jsonl`` from the configured
    embeddings directory, L2-normalizes the vectors, adds them to a
    ``faiss.IndexFlatIP`` and writes ``faiss.index`` plus a copy of
    ``metadata.jsonl`` into the configured index directory (created if it
    does not exist).

    Args:
        config_path: Path passed to ``bu.load_config``. The loaded config
            must contain a ``"paths"`` mapping; its ``"embeddings_dir"`` and
            ``"index_dir"`` entries are optional and default to
            ``data/embeddings`` and ``data/index``.

    Raises:
        FileNotFoundError: If ``embeddings.npy`` or ``metadata.jsonl`` is
            missing from the embeddings directory.
        ValueError: If the loaded embeddings array is not two-dimensional.
    """
    config = bu.load_config(config_path)
    paths = config["paths"]
    embeddings_dir = paths.get("embeddings_dir", "data/embeddings")
    index_dir = paths.get("index_dir", "data/index")

    embeddings_path = os.path.join(embeddings_dir, "embeddings.npy")
    metadata_path = os.path.join(embeddings_dir, "metadata.jsonl")
    if not os.path.exists(embeddings_path) or not os.path.exists(metadata_path):
        raise FileNotFoundError(
            f"embeddings.npy or metadata.jsonl not found in {embeddings_dir}. "
            "Run scripts/generate_embeddings.py first."
        )

    Path(index_dir).mkdir(parents=True, exist_ok=True)

    # faiss.normalize_L2 works in place on the raw buffer and needs a
    # C-contiguous float32 array. `astype` alone keeps the stored memory
    # layout (e.g. Fortran order), so request the layout explicitly; this
    # also skips a copy when the file already holds contiguous float32 data.
    embs = np.ascontiguousarray(np.load(embeddings_path), dtype=np.float32)
    if embs.ndim != 2:
        raise ValueError("embeddings.npy must be a 2D array [N, D]")

    num_vectors, dim = embs.shape
    print(f"Loaded embeddings: {num_vectors} vectors of dimension {dim}")

    # With unit-length vectors, the inner product used by IndexFlatIP is the
    # cosine similarity.
    faiss.normalize_L2(embs)
    index = faiss.IndexFlatIP(dim)
    index.add(embs)

    index_file = os.path.join(index_dir, "faiss.index")
    faiss.write_index(index, index_file)
    print("Index written to", index_file)

    # Keep the metadata next to the index so the two can be loaded together.
    target_metadata_path = os.path.join(index_dir, "metadata.jsonl")
    shutil.copyfile(metadata_path, target_metadata_path)
    print("Metadata copied to", target_metadata_path)
# Script entry point: build the index using the default config path.
if __name__ == "__main__":
    build_faiss_index()