Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / tools /_pdf_text_cache.py

rohitsar567

recovery: integrate stalled-session work + de-stale/cleanup (pytest 215 green) [build-fix]

b87bd2d about 2 months ago

Raw

History Blame Contribute Delete

2.73 kB

	#!/usr/bin/env python3
	"""Extract + cache full text of every source PDF referenced by policy_facts.

	Writes plaintext to tools/.pdf_text_cache/<sha-ish>.txt keyed by pdf path.
	Idempotent. This is internal-data mining (the insurer's own IRDAI-filed
	wording), NOT web fabrication.
	"""
	from __future__ import annotations

	import hashlib
	import json
	import os
	import sys
	from pathlib import Path

	import pdfplumber

	# Directory one level above the directory that contains this (resolved) file.
	ROOT = Path(__file__).resolve().parents[1]
	# Directory holding the cached plaintext files, one per PDF path.
	CACHE = ROOT / "tools" / ".pdf_text_cache"
	# Created at import time. Only the leaf directory is created (no parents=True),
	# so ROOT / "tools" must already exist.
	CACHE.mkdir(exist_ok=True)

	# Known path fixes (typo'd source_pdf_path in policy_facts).
	# Key: the path as recorded; value: the path to use instead. text_for() applies
	# this mapping before touching the filesystem or computing the cache key.
	PATH_FIX = {
	"rag/corpus/bajaj-allianz/group-health-guard-gold__wordings.pdf":
	"rag/corpus/bajaj-allianz/group-health-guard-silver__wordings.pdf",
	}


	def cache_path(pdf_rel: str) -> Path:
	    """Return the cache file location for the PDF path *pdf_rel*.

	    The file name is the first 16 hex characters of the SHA-1 digest of the
	    path string, plus a ``.txt`` suffix, placed directly under ``CACHE``.
	    """
	    digest = hashlib.sha1(pdf_rel.encode())
	    stem = digest.hexdigest()[:16]
	    return CACHE / (stem + ".txt")


	def text_for(pdf_rel: str) -> str | None:
	    """Return the extracted text of the PDF at *pdf_rel*, using the disk cache.

	    ``pdf_rel`` is a path relative to ``ROOT``; it is remapped through
	    ``PATH_FIX`` first. For each page the page text is collected, followed by
	    every table row rendered as its cells joined with " | " (missing cells
	    become empty strings). All pieces are joined with newlines.

	    Returns ``None`` when the PDF does not exist, or when extraction raises
	    (the error is written to stderr). A non-empty cache file is returned
	    as-is without reopening the PDF.
	    """
	    pdf_rel = PATH_FIX.get(pdf_rel, pdf_rel)
	    p = ROOT / pdf_rel
	    if not p.exists():
	        return None
	    cp = cache_path(pdf_rel)
	    # A zero-byte cache file is treated as missing and re-extracted.
	    if cp.exists() and cp.stat().st_size > 0:
	        # Explicit UTF-8 so the cache reads the same regardless of locale.
	        return cp.read_text(encoding="utf-8", errors="replace")
	    parts = []
	    try:
	        with pdfplumber.open(p) as pdf:
	            for pg in pdf.pages:
	                parts.append(pg.extract_text() or "")
	                for tbl in (pg.extract_tables() or []):
	                    parts.extend(
	                        " | ".join(c or "" for c in row) for row in tbl
	                    )
	    except Exception as e:  # noqa: BLE001
	        # Deliberately broad: one unreadable PDF is reported and skipped
	        # instead of aborting the caller's batch.
	        sys.stderr.write(f"!! {pdf_rel}: {e}\n")
	        return None
	    txt = "\n".join(parts)
	    # Write to a sibling temp file and rename it into place, so an interrupted
	    # run cannot leave a truncated (but non-empty) cache file that later calls
	    # would accept as complete.
	    tmp = cp.with_name(cp.name + ".tmp")
	    tmp.write_text(txt, encoding="utf-8", errors="replace")
	    tmp.replace(cp)
	    return txt


	def resolve_source_pdf(facts: dict) -> str \| None:
	sp = facts.get("source_pdf_path")
	if isinstance(sp, str):
	return sp
	meta = facts.get("_meta", {}) or {}
	if isinstance(meta.get("primary_source_pdf"), str):
	return meta["primary_source_pdf"]
	for v in facts.values():
	if isinstance(v, dict) and isinstance(v.get("source_pdf_path"), str):
	return v["source_pdf_path"]
	return None


	if __name__ == "__main__":
	    # Script entry point: for every policy_facts JSON file (names starting
	    # with "_" are skipped), resolve its source PDF and extract/cache the
	    # text, then print a one-line summary of successes and failures.
	    import glob
	    n_ok = n_fail = 0
	    for f in sorted(glob.glob(str(ROOT / "40-data/policy_facts/*.json"))):
	        name = os.path.basename(f)
	        if name.startswith("_"):
	            continue
	        try:
	            facts = json.loads(Path(f).read_text(encoding="utf-8"))
	        except (OSError, ValueError) as e:
	            # Unreadable, undecodable or malformed JSON: report it and count
	            # it as a failure rather than aborting the whole batch.
	            sys.stderr.write(f"!! {name}: {e}\n")
	            n_fail += 1
	            continue
	        # A JSON document that is not an object carries no source path.
	        sp = resolve_source_pdf(facts) if isinstance(facts, dict) else None
	        if not sp:
	            n_fail += 1
	            continue
	        t = text_for(sp)
	        if t:
	            n_ok += 1
	        else:
	            n_fail += 1
	            print("FAIL", name, sp)
	    print(f"cached text for {n_ok} policies, {n_fail} failed")