""" One-off script: clone the FreeCAD docs repo, chunk, embed, and build indices. Usage: git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs python build_index.py --repo freecad-docs Outputs written to data/: chunks.parquet — all chunk metadata + text index.faiss — FAISS IndexFlatIP of bge-small-en-v1.5 embeddings bm25.pkl — serialised bm25s index """ import argparse import os import pickle import bm25s import faiss import numpy as np import pandas as pd from sentence_transformers import SentenceTransformer from tqdm import tqdm from src.chunk import chunk_pages from src.config import BM25_FILE, CHUNKS_FILE, EMBED_MODEL, FAISS_FILE from src.ingest import load_freecad_docs def _embed_batched(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray: all_vecs = [] for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"): batch = texts[i : i + batch_size] vecs = model.encode(batch, normalize_embeddings=True, show_progress_bar=False) all_vecs.append(vecs) return np.vstack(all_vecs).astype("float32") def build(repo_root: str, data_dir: str = "data") -> None: os.makedirs(data_dir, exist_ok=True) print("Loading FreeCAD docs...") pages = load_freecad_docs(repo_root) print(f" {len(pages)} pages loaded") print("Chunking...") chunks = chunk_pages(pages) print(f" {len(chunks)} chunks produced") df = pd.DataFrame(chunks).set_index("chunk_id") df.to_parquet(CHUNKS_FILE) print(f" Saved {CHUNKS_FILE}") texts = df["text"].tolist() # ── BM25 index ──────────────────────────────────────────────────────────── print("Building BM25 index...") from src.retrieve import _tokenize # noqa: PLC0415 tokenized = bm25s.tokenize([" ".join(_tokenize(t)) for t in texts]) bm25_index = bm25s.BM25(method="bm25+") bm25_index.index(tokenized) with open(BM25_FILE, "wb") as f: pickle.dump(bm25_index, f) print(f" Saved {BM25_FILE}") # ── Dense index ─────────────────────────────────────────────────────────── print(f"Loading embedding model: {EMBED_MODEL}") model = SentenceTransformer(EMBED_MODEL) print("Embedding chunks (this may take a few minutes on CPU)...") vecs = _embed_batched(model, texts) dim = vecs.shape[1] index = faiss.IndexFlatIP(dim) index.add(vecs) faiss.write_index(index, FAISS_FILE) print(f" Saved {FAISS_FILE} ({index.ntotal} vectors, dim={dim})") print("\nDone. Commit the data/ directory to your Spaces repo.") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--repo", default="freecad-docs", help="Path to the cloned FreeCAD-documentation repository") parser.add_argument("--data-dir", default="data") args = parser.parse_args() build(args.repo, args.data_dir)