"""
One-off script: clone the FreeCAD docs repo, chunk, embed, and build indices.
Usage:
git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs
python build_index.py --repo freecad-docs
Outputs written to data/:
chunks.parquet — all chunk metadata + text
index.faiss — FAISS IndexFlatIP of bge-small-en-v1.5 embeddings
bm25.pkl — serialised bm25s index
"""
import argparse
import os
import pickle
import bm25s
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from src.chunk import chunk_pages
from src.config import BM25_FILE, CHUNKS_FILE, EMBED_MODEL, FAISS_FILE
from src.ingest import load_freecad_docs
def _embed_batched(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray:
    """Encode *texts* with *model* in fixed-size batches and stack the results.

    Each batch is passed to ``model.encode`` with ``normalize_embeddings=True``.
    The encoder's own progress bar is disabled because a single tqdm bar
    already tracks progress across batches.

    Args:
        model: Model whose ``encode`` method is called once per batch.
        texts: Strings to embed; output rows follow this order.
        batch_size: Number of texts handed to each ``encode`` call.

    Returns:
        A float32 array with one row per input text. For empty *texts* the
        result has shape ``(0, dim)``, where ``dim`` is the value reported by
        ``model.get_sentence_embedding_dimension()`` (0 if it reports none).
    """
    if not texts:
        # np.vstack raises ValueError on an empty sequence, so build the
        # empty result explicitly instead of failing with an opaque message.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype="float32")

    batches = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
        batch = texts[start : start + batch_size]
        batches.append(
            model.encode(batch, normalize_embeddings=True, show_progress_bar=False)
        )
    # copy=False skips a redundant copy when the stacked array is already float32.
    return np.vstack(batches).astype("float32", copy=False)
def build(repo_root: str, data_dir: str = "data") -> None:
    """Build the chunk table, BM25 index and FAISS index from a docs checkout.

    Steps, in order:
      1. Load pages with ``load_freecad_docs(repo_root)`` and split them with
         ``chunk_pages``.
      2. Store the chunks as a DataFrame indexed by ``chunk_id`` at
         ``CHUNKS_FILE`` (parquet).
      3. Tokenize every chunk text, build a ``bm25s`` index (method
         ``"bm25+"``) and pickle it to ``BM25_FILE``.
      4. Embed every chunk text with ``EMBED_MODEL``, add the vectors to a
         ``faiss.IndexFlatIP`` and write it to ``FAISS_FILE``.

    Args:
        repo_root: Path to the cloned documentation repository.
        data_dir: Directory that is created if missing.
            NOTE(review): the output locations come from ``src.config``
            (``CHUNKS_FILE``, ``BM25_FILE``, ``FAISS_FILE``), not from this
            argument -- confirm whether ``--data-dir`` is meant to steer them.

    Raises:
        ValueError: If chunking produced no chunks, so there is nothing to index.
    """
    os.makedirs(data_dir, exist_ok=True)
    # The artefacts are written to the config paths, which need not live
    # inside data_dir; make sure their parent directories exist as well.
    for out_path in (CHUNKS_FILE, BM25_FILE, FAISS_FILE):
        parent = os.path.dirname(os.fspath(out_path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    print("Loading FreeCAD docs...")
    pages = load_freecad_docs(repo_root)
    print(f" {len(pages)} pages loaded")

    print("Chunking...")
    chunks = chunk_pages(pages)
    print(f" {len(chunks)} chunks produced")
    if not chunks:
        # Without this guard set_index("chunk_id") fails with a KeyError that
        # says nothing about the real cause (an empty or wrong repo path).
        raise ValueError(
            f"No chunks produced from {repo_root!r}; "
            "check that it points at the cloned documentation repository."
        )

    df = pd.DataFrame(chunks).set_index("chunk_id")
    df.to_parquet(CHUNKS_FILE)
    print(f" Saved {CHUNKS_FILE}")
    texts = df["text"].tolist()

    # --- BM25 index -------------------------------------------------------
    print("Building BM25 index...")
    # Imported inside the function rather than at module top; the reason is
    # not visible here -- presumably to avoid an import cycle or import cost.
    from src.retrieve import _tokenize  # noqa: PLC0415

    # Run the project tokenizer first, then re-join so bm25s.tokenize
    # receives plain strings.
    tokenized = bm25s.tokenize([" ".join(_tokenize(t)) for t in texts])
    bm25_index = bm25s.BM25(method="bm25+")
    bm25_index.index(tokenized)
    with open(BM25_FILE, "wb") as f:
        pickle.dump(bm25_index, f)
    print(f" Saved {BM25_FILE}")

    # --- Dense index ------------------------------------------------------
    print(f"Loading embedding model: {EMBED_MODEL}")
    model = SentenceTransformer(EMBED_MODEL)
    print("Embedding chunks (this may take a few minutes on CPU)...")
    vecs = _embed_batched(model, texts)
    dim = vecs.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vecs)
    # str() keeps this working if the config path is a pathlib.Path, which
    # the faiss binding does not accept.
    faiss.write_index(index, str(FAISS_FILE))
    print(f" Saved {FAISS_FILE} ({index.ntotal} vectors, dim={dim})")

    print("\nDone. Commit the data/ directory to your Spaces repo.")
if __name__ == "__main__":
    # Command-line entry point: read the repository path and data directory
    # from the arguments, then run the full index build.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--repo",
        default="freecad-docs",
        help="Path to the cloned FreeCAD-documentation repository",
    )
    cli.add_argument("--data-dir", default="data")
    options = cli.parse_args()
    build(options.repo, options.data_dir)
|