Spaces:

Undrick
/

NLP_Lab

Sleeping

NLP_Lab / build_index.py

apytel

Redesigns UI for FreeCAD RAG Python script generator

11ba2bd about 1 month ago

3.2 kB

	"""
	One-off script: load an already-cloned FreeCAD docs repo, chunk it, embed the chunks, and build the indices.

	Usage:
	git clone --depth 1 https://github.com/FreeCAD/FreeCAD-documentation freecad-docs
	python build_index.py --repo freecad-docs

	Outputs written to data/:
	chunks.parquet — all chunk metadata + text
	index.faiss — FAISS IndexFlatIP of bge-small-en-v1.5 embeddings
	bm25.pkl — serialised bm25s index
	"""
	import argparse
	import os
	import pickle

	import bm25s
	import faiss
	import numpy as np
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	from tqdm import tqdm

	from src.chunk import chunk_pages
	from src.config import BM25_FILE, CHUNKS_FILE, EMBED_MODEL, FAISS_FILE
	from src.ingest import load_freecad_docs


	def _embed_batched(model: SentenceTransformer, texts: list[str], batch_size: int = 64) -> np.ndarray:
	    """Encode *texts* batch by batch and stack the results into one matrix.

	    Each slice of ``batch_size`` texts is passed to ``model.encode`` with
	    ``normalize_embeddings=True``; a tqdm bar labelled "Embedding" tracks
	    the batches (the model's own progress bar is switched off).

	    Args:
	        model: Embedding model used to encode the texts.
	        texts: Strings to embed, in the order the rows should appear.
	        batch_size: Number of texts sent to the model per call; must be >= 1.

	    Returns:
	        A float32 array with one row per input text. For empty input an
	        array of shape ``(0, dim)`` is returned, where ``dim`` is the
	        dimension reported by the model (0 if the model reports none).

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        # A zero step makes range() fail and a negative one yields no batches,
	        # so reject both up front with a message that names the real problem.
	        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

	    if len(texts) == 0:
	        # np.vstack([]) raises "need at least one array to concatenate";
	        # hand back a well-formed empty matrix instead.
	        dim = model.get_sentence_embedding_dimension() or 0
	        return np.empty((0, dim), dtype="float32")

	    all_vecs = [
	        model.encode(texts[i : i + batch_size], normalize_embeddings=True, show_progress_bar=False)
	        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding")
	    ]
	    return np.vstack(all_vecs).astype("float32")


	def build(repo_root: str, data_dir: str = "data") -> None:
	    """Build and persist the chunk table, the BM25 index and the FAISS index.

	    Steps, in order: load pages with ``load_freecad_docs``, split them with
	    ``chunk_pages``, save the chunks as parquet, build and pickle a bm25s
	    index over the chunk texts, embed the texts and write a
	    ``faiss.IndexFlatIP`` index.

	    Args:
	        repo_root: Path passed to ``load_freecad_docs``.
	        data_dir: Directory that is created if it does not exist.
	            NOTE(review): the artefacts are written to ``CHUNKS_FILE``,
	            ``BM25_FILE`` and ``FAISS_FILE`` from ``src.config``, not to
	            paths derived from this argument -- confirm that the configured
	            paths are meant to live under ``data_dir``.

	    Raises:
	        ValueError: If chunking produced no chunks, so there is nothing to index.
	    """
	    os.makedirs(data_dir, exist_ok=True)
	    # The outputs go to the configured paths, which need not be inside
	    # data_dir; create their directories too so the writes cannot fail on a
	    # missing folder.
	    for target in (CHUNKS_FILE, BM25_FILE, FAISS_FILE):
	        parent = os.path.dirname(target)
	        if parent:
	            os.makedirs(parent, exist_ok=True)

	    print("Loading FreeCAD docs...")
	    pages = load_freecad_docs(repo_root)
	    print(f" {len(pages)} pages loaded")

	    print("Chunking...")
	    chunks = chunk_pages(pages)
	    print(f" {len(chunks)} chunks produced")
	    if len(chunks) == 0:
	        # Without this guard the failure shows up later as a KeyError from
	        # set_index("chunk_id") on a column-less DataFrame.
	        raise ValueError(f"No chunks were produced from {repo_root!r}; nothing to index.")

	    df = pd.DataFrame(chunks).set_index("chunk_id")
	    df.to_parquet(CHUNKS_FILE)
	    print(f" Saved {CHUNKS_FILE}")

	    # Row order of df fixes the position of every chunk in both indices.
	    texts = df["text"].tolist()

	    # ── BM25 index ────────────────────────────────────────────────────────────
	    print("Building BM25 index...")
	    # Imported here rather than at module top (lint suppression kept from the original).
	    from src.retrieve import _tokenize  # noqa: PLC0415
	    # Pre-tokenise with the project tokenizer, then re-join so bm25s receives plain strings.
	    tokenized = bm25s.tokenize([" ".join(_tokenize(t)) for t in texts])
	    bm25_index = bm25s.BM25(method="bm25+")
	    bm25_index.index(tokenized)
	    with open(BM25_FILE, "wb") as f:
	        pickle.dump(bm25_index, f)
	    print(f" Saved {BM25_FILE}")

	    # ── Dense index ───────────────────────────────────────────────────────────
	    print(f"Loading embedding model: {EMBED_MODEL}")
	    model = SentenceTransformer(EMBED_MODEL)

	    print("Embedding chunks (this may take a few minutes on CPU)...")
	    vecs = _embed_batched(model, texts)

	    dim = vecs.shape[1]
	    # Inner-product index over the embeddings returned by _embed_batched.
	    index = faiss.IndexFlatIP(dim)
	    index.add(vecs)
	    # faiss expects a plain string filename; str() is a no-op when the
	    # configured path already is one.
	    faiss.write_index(index, str(FAISS_FILE))
	    print(f" Saved {FAISS_FILE} ({index.ntotal} vectors, dim={dim})")

	    print("\nDone. Commit the data/ directory to your Spaces repo.")


	if __name__ == "__main__":
	    # Command-line entry point: read the two path options, then run the build.
	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "--repo",
	        default="freecad-docs",
	        help="Path to the cloned FreeCAD-documentation repository",
	    )
	    cli.add_argument("--data-dir", default="data")
	    options = cli.parse_args()
	    build(repo_root=options.repo, data_dir=options.data_dir)