| """ |
| One-off script: clone the FreeCAD docs repo, chunk, embed, and build indices. |
| |
| Usage: |
| git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs |
| python build_index.py --repo freecad-docs |
| |
| Outputs written to data/: |
| chunks.parquet — all chunk metadata + text |
| index.faiss — FAISS IndexFlatIP of bge-small-en-v1.5 embeddings |
| bm25.pkl — serialised bm25s index |
| """ |
| import argparse |
| import os |
| import pickle |
|
|
| import bm25s |
| import faiss |
| import numpy as np |
| import pandas as pd |
| from sentence_transformers import SentenceTransformer |
| from tqdm import tqdm |
|
|
| from src.chunk import chunk_pages |
| from src.config import BM25_FILE, CHUNKS_FILE, EMBED_MODEL, FAISS_FILE |
| from src.ingest import load_freecad_docs |
|
|
|
|
def _embed_batched(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Encode ``texts`` in fixed-size batches and stack them into one matrix.

    Args:
        model: Loaded sentence-transformers model used for encoding.
        texts: Strings to embed; row ``i`` of the result belongs to ``texts[i]``.
        batch_size: Number of texts handed to ``model.encode`` per call.

    Returns:
        A 2-D ``float32`` array with one embedding per input text, encoded
        with ``normalize_embeddings=True``. When ``texts`` is empty the
        result has zero rows, shape ``(0, dim)``.
    """
    if len(texts) == 0:
        # np.vstack([]) fails with "need at least one array to concatenate",
        # so return an empty matrix of the right width instead. The model may
        # report no dimension (None); fall back to a zero-width matrix then.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    # The outer tqdm bar tracks batches, so the model's own bar is disabled.
    batch_vectors = [
        model.encode(
            texts[start : start + batch_size],
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        for start in tqdm(range(0, len(texts), batch_size), desc="Embedding")
    ]

    # vstack already allocates a fresh array; copy=False avoids a second copy
    # when the encoder output is float32 to begin with.
    return np.vstack(batch_vectors).astype(np.float32, copy=False)
|
|
|
|
def build(repo_root: str, data_dir: str = "data") -> None:
    """Build all retrieval artefacts from a local FreeCAD docs checkout.

    Steps, in order: load the pages, chunk them, save the chunk table as
    parquet, build and pickle the BM25 index, embed the chunk texts, and
    write the FAISS inner-product index.

    Args:
        repo_root: Path to the cloned FreeCAD-documentation repository;
            passed unchanged to ``load_freecad_docs``.
        data_dir: Directory that is created before building.
            NOTE(review): the artefacts are written to ``CHUNKS_FILE``,
            ``BM25_FILE`` and ``FAISS_FILE`` from ``src.config``; they are
            not joined onto ``data_dir``. Confirm whether this argument is
            meant to relocate the outputs.

    Raises:
        ValueError: If chunking yields no chunks, so there is nothing to
            index (typically a wrong ``repo_root``).
    """
    os.makedirs(data_dir, exist_ok=True)

    # The output paths come from config and need not live under data_dir, so
    # make sure each one's parent exists before any expensive work starts.
    for target in (CHUNKS_FILE, BM25_FILE, FAISS_FILE):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    print("Loading FreeCAD docs...")
    pages = load_freecad_docs(repo_root)
    print(f" {len(pages)} pages loaded")

    print("Chunking...")
    chunks = chunk_pages(pages)
    print(f" {len(chunks)} chunks produced")

    if len(chunks) == 0:
        # Without this guard, set_index("chunk_id") on an empty frame raises
        # a pandas KeyError that says nothing about the actual cause.
        raise ValueError(
            f"No chunks were produced from {repo_root!r}; "
            "check that the path points at a FreeCAD-documentation checkout."
        )

    df = pd.DataFrame(chunks).set_index("chunk_id")
    df.to_parquet(CHUNKS_FILE)
    print(f" Saved {CHUNKS_FILE}")

    # Both indices are built from this list, so BM25 document ids and FAISS
    # vector ids follow the row order of the saved chunk table.
    texts = df["text"].tolist()

    print("Building BM25 index...")
    # Kept as a function-local import, as in the original; presumably this
    # avoids an import cycle or start-up cost — confirm before hoisting.
    from src.retrieve import _tokenize

    # Documents are pre-tokenised with the project's tokenizer, re-joined with
    # spaces, and then passed through bm25s's own tokenizer.
    tokenized = bm25s.tokenize([" ".join(_tokenize(t)) for t in texts])
    bm25_index = bm25s.BM25(method="bm25+")
    bm25_index.index(tokenized)
    with open(BM25_FILE, "wb") as f:
        pickle.dump(bm25_index, f)
    print(f" Saved {BM25_FILE}")

    print(f"Loading embedding model: {EMBED_MODEL}")
    model = SentenceTransformer(EMBED_MODEL)

    print("Embedding chunks (this may take a few minutes on CPU)...")
    vecs = _embed_batched(model, texts)

    # Inner product over normalised embeddings; the index width is taken from
    # the embedding matrix itself.
    dim = vecs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vecs)
    faiss.write_index(index, FAISS_FILE)
    print(f" Saved {FAISS_FILE} ({index.ntotal} vectors, dim={dim})")

    print("\nDone. Commit the data/ directory to your Spaces repo.")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the two path options and run the build.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--repo",
        default="freecad-docs",
        help="Path to the cloned FreeCAD-documentation repository",
    )
    cli.add_argument("--data-dir", default="data")
    options = cli.parse_args()
    build(repo_root=options.repo, data_dir=options.data_dir)
|
|