fugee / data /scripts /build_guidelines_index.py
helmo's picture
[agentic] Ground assessment in UNHCR guidelines via a RAG tool
f963859
Raw
History Blame Contribute Delete
2.85 kB
"""data/scripts/build_guidelines_index.py — index the UNHCR guidelines for RAG.
Extracts text from every PDF in specs/data/guidelines/, splits it into overlapping
chunks, embeds each chunk with a local Ollama embedding model, and writes a single
JSON index that the ``guideline_search`` tool queries at runtime (no network).
This grounds the assessment in the *actual* UNHCR Handbook + Guidelines so the
agent cites real guidance instead of inventing law.
Run: python data/scripts/build_guidelines_index.py
"""
from __future__ import annotations
import json
import os
import re
import sys
from pathlib import Path
import ollama
from pypdf import PdfReader
# Repository root: three directory levels above this file's resolved path.
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
# Put the repository root at the front of sys.path before importing from the
# `app` package below (presumably so the import resolves when this file is run
# directly as a script — confirm). The late import is why E402 is suppressed.
sys.path.insert(0, str(REPO_ROOT))
from app.config import load_env  # noqa: E402
# Directory scanned for source PDFs, and the JSON file the index is written to.
GUIDELINES_DIR = REPO_ROOT / "specs" / "data" / "guidelines"
OUTPUT = REPO_ROOT / "specs" / "data" / "guidelines_index.json"
# Embedding model name; overridable through the EMBED_MODEL environment variable.
# NOTE(review): evaluated at import time, before main() calls load_env() — a
# value that load_env() adds to the environment would not be reflected in this
# constant; confirm what load_env() does.
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
CHUNK_CHARS = 1100  # length, in characters, of each text window cut by _chunks
OVERLAP = 150  # characters shared between two consecutive windows
BATCH = 32  # number of chunks sent per embedding request in main()
def _title(path: Path) -> str:
name = path.stem
if name.lower() == "handbook":
return "UNHCR Handbook on Procedures and Criteria for Determining Refugee Status"
return "UNHCR Guideline " + name
def _chunks(text: str) -> list[str]:
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{2,}", "\n", text).strip()
out, i = [], 0
while i < len(text):
out.append(text[i:i + CHUNK_CHARS].strip())
i += CHUNK_CHARS - OVERLAP
return [c for c in out if len(c) > 120]
def main() -> int:
    """Build the guidelines index and write it to ``OUTPUT`` as JSON.

    For every ``*.pdf`` in ``GUIDELINES_DIR`` (in sorted order): extract the
    text of all pages, split it with ``_chunks``, embed the chunks in batches
    of ``BATCH`` through the Ollama client, and collect one record per chunk
    (``guideline``, ``source``, ``text``, ``embedding``). A PDF whose text
    cannot be read is reported on stdout and skipped. The output document
    holds the model name, the embedding dimension (0 when there are no
    records) and the list of records.

    Returns:
        ``0``; the caller uses it as the process exit status.

    Raises:
        RuntimeError: if an embedding call returns a different number of
            vectors than chunks sent. Errors raised by the embedding client
            itself are not caught and propagate.
    """
    load_env()
    # EMBED_MODEL was resolved at import time, i.e. before load_env() ran.
    # Re-read the variable so that a value load_env() may have placed in
    # os.environ is honoured; otherwise this yields the same value as before.
    model = os.getenv("EMBED_MODEL", EMBED_MODEL)
    client = ollama.Client(host=os.getenv("OLLAMA_HOST"))
    records: list[dict] = []
    pdfs = sorted(GUIDELINES_DIR.glob("*.pdf"))
    print(f"Indexing {len(pdfs)} PDFs with {model} …")
    for pdf in pdfs:
        title = _title(pdf)
        try:
            reader = PdfReader(str(pdf))
            # extract_text() may yield nothing for a page; treat that as "".
            text = "\n".join((pg.extract_text() or "") for pg in reader.pages)
        except Exception as exc:  # noqa: BLE001
            # Best effort: one unreadable PDF must not abort the whole build.
            print(f" ! {pdf.name}: {exc}")
            continue
        chunks = _chunks(text)
        print(f" {pdf.name}: {len(chunks)} chunks")
        for start in range(0, len(chunks), BATCH):
            batch = chunks[start:start + BATCH]
            emb = client.embed(model=model, input=batch)["embeddings"]
            if len(emb) != len(batch):
                # zip() would silently drop the unmatched chunks.
                raise RuntimeError(
                    f"{pdf.name}: expected {len(batch)} embeddings, got {len(emb)}"
                )
            for chunk, vec in zip(batch, emb):
                records.append(
                    {"guideline": title, "source": pdf.name, "text": chunk, "embedding": vec}
                )
    index = {
        "model": model,
        "dim": len(records[0]["embedding"]) if records else 0,
        "chunks": records,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned instead of depending on the platform default.
    OUTPUT.write_text(json.dumps(index, ensure_ascii=False), encoding="utf-8")
    print(f"Wrote {len(records)} chunks -> {OUTPUT} ({OUTPUT.stat().st_size // 1024} KB)")
    return 0
if __name__ == "__main__":
    # Script entry point: run the build and exit with main()'s return value.
    sys.exit(main())