Spaces:

build-small-hackathon
/

fugee

Running

App Files Files Community

fugee / data /scripts /build_guidelines_index.py

helmo

[agentic] Ground assessment in UNHCR guidelines via a RAG tool

f963859 27 days ago

Raw

History Blame Contribute Delete

2.85 kB

	"""data/scripts/build_guidelines_index.py — index the UNHCR guidelines for RAG.

	Extracts text from every PDF in specs/data/guidelines/, splits it into overlapping
	chunks, embeds each chunk with a local Ollama embedding model, and writes a single
	JSON index that the ``guideline_search`` tool queries at runtime (no network).

	This grounds the assessment in the actual UNHCR Handbook + Guidelines so the
	agent cites real guidance instead of inventing law.

	Run: python data/scripts/build_guidelines_index.py
	"""

	from __future__ import annotations

	import json
	import os
	import re
	import sys
	from pathlib import Path

	import ollama
	from pypdf import PdfReader

	REPO_ROOT = Path(__file__).resolve().parent.parent.parent
	sys.path.insert(0, str(REPO_ROOT))
	from app.config import load_env # noqa: E402

	GUIDELINES_DIR = REPO_ROOT / "specs" / "data" / "guidelines"
	OUTPUT = REPO_ROOT / "specs" / "data" / "guidelines_index.json"
	EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
	CHUNK_CHARS = 1100
	OVERLAP = 150
	BATCH = 32


	def _title(path: Path) -> str:
	name = path.stem
	if name.lower() == "handbook":
	return "UNHCR Handbook on Procedures and Criteria for Determining Refugee Status"
	return "UNHCR Guideline " + name


	def _chunks(text: str) -> list[str]:
	text = re.sub(r"[ \t]+", " ", text)
	text = re.sub(r"\n{2,}", "\n", text).strip()
	out, i = [], 0
	while i < len(text):
	out.append(text[i:i + CHUNK_CHARS].strip())
	i += CHUNK_CHARS - OVERLAP
	return [c for c in out if len(c) > 120]


	def main() -> int:
	load_env()
	client = ollama.Client(host=os.getenv("OLLAMA_HOST"))
	records: list[dict] = []

	pdfs = sorted(GUIDELINES_DIR.glob("*.pdf"))
	print(f"Indexing {len(pdfs)} PDFs with {EMBED_MODEL} …")
	for pdf in pdfs:
	title = _title(pdf)
	try:
	reader = PdfReader(str(pdf))
	text = "\n".join((pg.extract_text() or "") for pg in reader.pages)
	except Exception as exc: # noqa: BLE001
	print(f" ! {pdf.name}: {exc}")
	continue
	chunks = _chunks(text)
	print(f" {pdf.name}: {len(chunks)} chunks")
	for start in range(0, len(chunks), BATCH):
	batch = chunks[start:start + BATCH]
	emb = client.embed(model=EMBED_MODEL, input=batch)["embeddings"]
	for chunk, vec in zip(batch, emb):
	records.append({"guideline": title, "source": pdf.name, "text": chunk, "embedding": vec})

	OUTPUT.write_text(json.dumps({"model": EMBED_MODEL, "dim": len(records[0]["embedding"]) if records else 0,
	"chunks": records}, ensure_ascii=False))
	print(f"Wrote {len(records)} chunks -> {OUTPUT} ({OUTPUT.stat().st_size // 1024} KB)")
	return 0


	if __name__ == "__main__":
	raise SystemExit(main())