"""data/scripts/build_guidelines_index.py — index the UNHCR guidelines for RAG. Extracts text from every PDF in specs/data/guidelines/, splits it into overlapping chunks, embeds each chunk with a local Ollama embedding model, and writes a single JSON index that the ``guideline_search`` tool queries at runtime (no network). This grounds the assessment in the *actual* UNHCR Handbook + Guidelines so the agent cites real guidance instead of inventing law. Run: python data/scripts/build_guidelines_index.py """ from __future__ import annotations import json import os import re import sys from pathlib import Path import ollama from pypdf import PdfReader REPO_ROOT = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(REPO_ROOT)) from app.config import load_env # noqa: E402 GUIDELINES_DIR = REPO_ROOT / "specs" / "data" / "guidelines" OUTPUT = REPO_ROOT / "specs" / "data" / "guidelines_index.json" EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text") CHUNK_CHARS = 1100 OVERLAP = 150 BATCH = 32 def _title(path: Path) -> str: name = path.stem if name.lower() == "handbook": return "UNHCR Handbook on Procedures and Criteria for Determining Refugee Status" return "UNHCR Guideline " + name def _chunks(text: str) -> list[str]: text = re.sub(r"[ \t]+", " ", text) text = re.sub(r"\n{2,}", "\n", text).strip() out, i = [], 0 while i < len(text): out.append(text[i:i + CHUNK_CHARS].strip()) i += CHUNK_CHARS - OVERLAP return [c for c in out if len(c) > 120] def main() -> int: load_env() client = ollama.Client(host=os.getenv("OLLAMA_HOST")) records: list[dict] = [] pdfs = sorted(GUIDELINES_DIR.glob("*.pdf")) print(f"Indexing {len(pdfs)} PDFs with {EMBED_MODEL} …") for pdf in pdfs: title = _title(pdf) try: reader = PdfReader(str(pdf)) text = "\n".join((pg.extract_text() or "") for pg in reader.pages) except Exception as exc: # noqa: BLE001 print(f" ! {pdf.name}: {exc}") continue chunks = _chunks(text) print(f" {pdf.name}: {len(chunks)} chunks") for start in range(0, len(chunks), BATCH): batch = chunks[start:start + BATCH] emb = client.embed(model=EMBED_MODEL, input=batch)["embeddings"] for chunk, vec in zip(batch, emb): records.append({"guideline": title, "source": pdf.name, "text": chunk, "embedding": vec}) OUTPUT.write_text(json.dumps({"model": EMBED_MODEL, "dim": len(records[0]["embedding"]) if records else 0, "chunks": records}, ensure_ascii=False)) print(f"Wrote {len(records)} chunks -> {OUTPUT} ({OUTPUT.stat().st_size // 1024} KB)") return 0 if __name__ == "__main__": raise SystemExit(main())