Spaces:
Running
Running
"""data/scripts/build_guidelines_index.py — index the UNHCR guidelines for RAG.

Extracts text from every PDF in specs/data/guidelines/, splits it into overlapping
chunks, embeds each chunk with a local Ollama embedding model, and writes a single
JSON index that the ``guideline_search`` tool queries at runtime (no network).

This grounds the assessment in the *actual* UNHCR Handbook + Guidelines so the
agent cites real guidance instead of inventing law.

Run: python data/scripts/build_guidelines_index.py
"""
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| import ollama | |
| from pypdf import PdfReader | |
# Three directory levels above this file; put on sys.path so the ``app``
# package import below resolves when this file is run directly as a script.
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(REPO_ROOT))

from app.config import load_env  # noqa: E402

# Input directory scanned for ``*.pdf`` files and the JSON index written by main().
GUIDELINES_DIR = REPO_ROOT.joinpath("specs", "data", "guidelines")
OUTPUT = REPO_ROOT.joinpath("specs", "data", "guidelines_index.json")

# Embedding model name; the environment variable is read once, at import time.
EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")

CHUNK_CHARS = 1100  # characters per chunk window
OVERLAP = 150  # characters shared by two consecutive windows
BATCH = 32  # chunks sent per embedding request
def _title(path: Path) -> str:
    """Return the display title stored alongside chunks of the PDF at *path*.

    A file whose stem is ``handbook`` (compared case-insensitively) gets the
    full Handbook title; any other stem is appended verbatim to
    ``"UNHCR Guideline "``.
    """
    stem = path.stem
    if stem.lower() != "handbook":
        return f"UNHCR Guideline {stem}"
    return "UNHCR Handbook on Procedures and Criteria for Determining Refugee Status"
def _chunks(text: str, chunk_chars: int | None = None, overlap: int | None = None) -> list[str]:
    """Split *text* into overlapping character windows after normalising whitespace.

    Runs of spaces/tabs collapse to a single space and runs of newlines collapse
    to a single newline before windowing. Each window is stripped, and windows of
    120 characters or fewer are discarded.

    Args:
        text: Raw extracted text.
        chunk_chars: Window size in characters; defaults to ``CHUNK_CHARS``.
        overlap: Characters shared by consecutive windows; defaults to ``OVERLAP``.

    Returns:
        The retained windows, in document order.

    Raises:
        ValueError: If the window size is not positive, or the overlap is
            negative or not smaller than the window size (the window would
            never advance).
    """
    size = CHUNK_CHARS if chunk_chars is None else chunk_chars
    shared = OVERLAP if overlap is None else overlap
    if size <= 0 or not 0 <= shared < size:
        raise ValueError(
            f"need chunk_chars > 0 and 0 <= overlap < chunk_chars, got {size} and {shared}"
        )

    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n", text).strip()

    windows: list[str] = []
    for start in range(0, len(text), size - shared):
        windows.append(text[start:start + size].strip())
        # Once a window reaches the end of the text, any later window would be a
        # pure suffix of it (at most `overlap` characters) and would only add a
        # duplicate entry to the index.
        if start + size >= len(text):
            break
    return [w for w in windows if len(w) > 120]
def main() -> int:
    """Build the guidelines embedding index and write it to ``OUTPUT``.

    For every ``*.pdf`` in ``GUIDELINES_DIR`` (sorted by path) the page texts
    are joined, split with ``_chunks`` and embedded in batches of ``BATCH``
    through the Ollama client. A PDF that fails to open or extract is reported
    and skipped. The result is one JSON document with the keys ``model``,
    ``dim`` and ``chunks``.

    Returns:
        ``0`` once the index file has been written.

    Raises:
        RuntimeError: If the embedding call returns a different number of
            vectors than the number of chunks sent.
    """
    load_env()
    # EMBED_MODEL was resolved at import time, before load_env() ran. Re-read the
    # variable so a value supplied only via load_env() is honoured.
    # NOTE(review): presumably load_env() populates os.environ — confirm.
    model = os.getenv("EMBED_MODEL", EMBED_MODEL)
    client = ollama.Client(host=os.getenv("OLLAMA_HOST"))
    records: list[dict] = []
    pdfs = sorted(GUIDELINES_DIR.glob("*.pdf"))
    print(f"Indexing {len(pdfs)} PDFs with {model} …")
    for pdf in pdfs:
        title = _title(pdf)
        try:
            reader = PdfReader(str(pdf))
            # extract_text() may yield nothing for a page; treat that as empty text.
            text = "\n".join((pg.extract_text() or "") for pg in reader.pages)
        except Exception as exc:  # noqa: BLE001
            # Best effort: one unreadable PDF must not abort the whole build.
            print(f" ! {pdf.name}: {exc}")
            continue
        chunks = _chunks(text)
        print(f" {pdf.name}: {len(chunks)} chunks")
        for start in range(0, len(chunks), BATCH):
            batch = chunks[start:start + BATCH]
            emb = client.embed(model=model, input=batch)["embeddings"]
            # zip() would silently drop chunks on a short response; fail loudly instead.
            if len(emb) != len(batch):
                raise RuntimeError(
                    f"{pdf.name}: expected {len(batch)} embeddings, got {len(emb)}"
                )
            for chunk, vec in zip(batch, emb):
                records.append({"guideline": title, "source": pdf.name, "text": chunk, "embedding": vec})
    if not records:
        print(" ! no chunks were indexed; writing an empty index")
    payload = {
        "model": model,
        "dim": len(records[0]["embedding"]) if records else 0,
        "chunks": records,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding must
    # be pinned rather than left to the platform's locale default.
    OUTPUT.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
    print(f"Wrote {len(records)} chunks -> {OUTPUT} ({OUTPUT.stat().st_size // 1024} KB)")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())