Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / rag /source_map.py

rohitsar567

refactor: KI-047 — bucket reorg (Option A safe subset)

e2d09f9 about 2 months ago

Raw

History Blame Contribute Delete

10.5 kB

	"""Generate the Information Source Map — a detailed catalog of what knowledge
	the corpus contains, used for both reviewer-facing explainability and
	faithfulness verification at runtime.

	Outputs two artifacts after ingestion + extraction have run:

	1. 70-docs/information_source_map.md
	Human-readable per-policy catalog: insurer, policy, doc type, chunk count,
	pages covered, extracted-field summary, source URL. The "what does the bot
	know" reference.

	2. rag/source_map.json
	Machine-readable per-chunk index: {chunk_id, policy_id, insurer_slug,
	policy_name, doc_type, page_start, page_end, topics}. Used by the
	faithfulness verifier to look up whether a claim could plausibly trace
	to a chunk.

	Run:
	python -m rag.source_map
	"""

	from __future__ import annotations

	import json
	import re
	import time
	from collections import defaultdict
	from pathlib import Path

	import chromadb
	import duckdb
	from chromadb.config import Settings as ChromaSettings

	from backend.config import settings

	# Output locations, all derived from backend settings.
	# ROOT is two directory levels above the corpus directory.
	ROOT = settings.CORPUS_DIR.parent.parent
	# Human-readable catalog (markdown) written by main().
	MD_OUTPUT = ROOT / "docs" / "information_source_map.md"
	# Machine-readable per-chunk index, placed next to the vectors directory.
	JSON_OUTPUT = settings.VECTORS_DIR.parent / "source_map.json"


	# Topic keywords used to tag chunks with the high-level concepts they cover.
	# Used for the JSON catalog + chunk-routing in retrieval.
	# Keys are the topic tags emitted per chunk; values are the phrases whose
	# presence in a chunk's text marks that topic. A phrase may appear under more
	# than one topic (e.g. "cashless" is listed for both claim_process and
	# network), so a single chunk can receive several tags.
	TOPIC_KEYWORDS: dict[str, list[str]] = {
	"waiting_period": ["waiting period", "pre-existing", "PED", "specific waiting", "initial waiting"],
	"coverage_scope": ["covered", "covers", "inpatient", "outpatient", "OPD", "domiciliary"],
	"exclusions": ["exclusion", "excluded", "not covered", "shall not pay", "permanent exclusion"],
	"claim_process": ["claim", "settlement", "TAT", "turnaround time", "reimbursement", "cashless"],
	"sum_insured": ["sum insured", "sum assured", "policy limit", "annual limit"],
	"room_rent": ["room rent", "ICU", "private room", "single room"],
	"copayment": ["co-payment", "copay", "deductible", "patient share"],
	"maternity": ["maternity", "pregnancy", "delivery", "newborn"],
	"ayush": ["AYUSH", "Ayurveda", "Yoga", "Unani", "Siddha", "Homeopathy"],
	"critical_illness": ["critical illness", "cancer", "stroke", "heart attack", "kidney failure"],
	"network": ["network hospital", "cashless", "network of hospitals", "empanelled"],
	"ncb": ["no claim bonus", "NCB", "cumulative bonus", "renewal bonus"],
	"restoration": ["restoration", "refill", "recharge"],
	"geography": ["pan-india", "worldwide", "overseas", "geographic"],
	"tax_section_80d": ["80D", "tax benefit", "tax deduction", "income tax"],
	"renewal": ["renewal", "renewability", "lifelong", "guaranteed renewal"],
	}


	def chroma_collection():
	    """Open the persistent Chroma store and return its ``policies`` collection.

	    The client is rooted at ``settings.VECTORS_DIR`` with anonymized telemetry
	    disabled. The collection is fetched, or created if absent, with the
	    ``hnsw:space`` metadata set to ``cosine``.
	    """
	    store_path = str(settings.VECTORS_DIR)
	    client_settings = ChromaSettings(anonymized_telemetry=False)
	    store = chromadb.PersistentClient(path=store_path, settings=client_settings)

	    collection_metadata = {"hnsw:space": "cosine"}
	    return store.get_or_create_collection(name="policies", metadata=collection_metadata)


	def load_extracted_policies() -> dict[str, dict]:
	    """Map policy_id -> extracted policy record read from DuckDB.

	    Reads every ``(policy_id, data_json)`` row of the ``policies`` table in
	    ``settings.STRUCTURED_DB`` (opened read-only) and decodes ``data_json``.

	    Returns:
	        A dict keyed by policy_id. Empty when the database file does not
	        exist. Rows whose payload cannot be decoded, or that decode to
	        something other than a JSON object, are skipped with a logged
	        warning instead of aborting the run.
	    """
	    # Function-scope import so this block brings its own dependency with it.
	    import logging

	    log = logging.getLogger(__name__)

	    db = settings.STRUCTURED_DB
	    if not db.exists():
	        return {}

	    con = duckdb.connect(str(db), read_only=True)
	    try:
	        rows = con.execute("SELECT policy_id, data_json FROM policies").fetchall()
	    finally:
	        # Release the file handle before decoding; nothing below needs the DB.
	        con.close()

	    out: dict[str, dict] = {}
	    for pid, data in rows:
	        try:
	            record = json.loads(data)
	        except (TypeError, ValueError, RecursionError) as exc:
	            # TypeError: payload is not str/bytes (e.g. NULL);
	            # ValueError: malformed JSON (JSONDecodeError subclasses it);
	            # RecursionError: pathologically nested document.
	            log.warning("Skipping policy %r: data_json could not be decoded (%s)", pid, exc)
	            continue
	        if not isinstance(record, dict):
	            # Downstream code calls .get() on each record, so only JSON
	            # objects are usable.
	            log.warning(
	                "Skipping policy %r: data_json decoded to %s, expected an object",
	                pid,
	                type(record).__name__,
	            )
	            continue
	        out[pid] = record
	    return out


	# Compiled keyword patterns, keyed by the keyword string, so each regex is
	# built once no matter how many chunks are tagged.
	_KEYWORD_PATTERN_CACHE = {}


	def _keyword_pattern(keyword: str):
	    """Return the compiled, case-insensitive pattern used to detect *keyword*."""
	    pattern = _KEYWORD_PATTERN_CACHE.get(keyword)
	    if pattern is None:
	        body = re.escape(keyword)
	        if keyword.isupper():
	            # Acronyms ("ICU", "TAT", "PED", "80D") must stand alone, apart
	            # from an optional plural "s"; otherwise they match inside
	            # ordinary words such as "difficult", "status" or "stopped".
	            source = rf"(?<!\w){body}s?(?!\w)"
	        else:
	            # Ordinary phrases must start at a word boundary but may carry a
	            # suffix, so "claim" still matches "claims".
	            source = rf"(?<!\w){body}"
	        pattern = re.compile(source, re.IGNORECASE)
	        _KEYWORD_PATTERN_CACHE[keyword] = pattern
	    return pattern


	def tag_topics(text: str, keywords: dict[str, list[str]] | None = None) -> list[str]:
	    """Return the topics this chunk text covers.

	    Args:
	        text: Chunk text to scan. ``None`` or an empty string yields no topics.
	        keywords: Optional topic -> keyword-list mapping. Defaults to the
	            module-level ``TOPIC_KEYWORDS`` when omitted.

	    Returns:
	        Topic names, in the mapping's iteration order, for which at least one
	        keyword occurs in *text*. Matching is case-insensitive and anchored
	        at word boundaries rather than being a raw substring test.
	    """
	    if not text:
	        return []
	    table = TOPIC_KEYWORDS if keywords is None else keywords
	    return [
	        topic
	        for topic, kws in table.items()
	        if any(_keyword_pattern(kw).search(text) for kw in kws)
	    ]


	def summarize_fields(p: dict) -> dict:
	    """Select the headline fields of an extracted policy for the markdown summary.

	    Every listed field is always present in the result. A field that is
	    missing from *p*, or whose value is ``None``, an empty string or an empty
	    list, is reported as the placeholder "—". Other values (including ``0``)
	    pass through unchanged.
	    """
	    placeholder = "—"
	    headline_fields = (
	        "policy_name",
	        "insurer_name",
	        "policy_type",
	        "min_entry_age",
	        "max_entry_age",
	        "sum_insured_options",
	        "pre_existing_disease_waiting_months",
	        "maternity_waiting_months",
	        "ayush_coverage",
	        "room_rent_capping",
	        "copayment_pct",
	        "no_claim_bonus_pct",
	        "network_hospital_count",
	        "extraction_confidence_pct",
	    )

	    summary = {}
	    for name in headline_fields:
	        value = p.get(name, placeholder)
	        # Blank values are shown the same way as absent ones.
	        if value is None or value == "" or value == []:
	            value = placeholder
	        summary[name] = value
	    return summary


	def build_machine_index() -> dict:
	    """Per-chunk index used by faithfulness verifier.

	    Pages through the whole Chroma collection and records, for each chunk,
	    its id, the identifying metadata (policy_id, insurer_slug, policy_name,
	    doc_type, page_start, page_end) and the topics tagged from its text.

	    Returns:
	        ``{"total_chunks": 0, "chunks": []}`` for an empty collection,
	        otherwise a dict with ``generated_at`` (UTC timestamp),
	        ``total_chunks`` (the collection's count) and ``chunks``.
	    """
	    coll = chroma_collection()
	    total = coll.count()
	    if total == 0:
	        return {"total_chunks": 0, "chunks": []}

	    page_size = 500  # chunks fetched per request, to bound memory per call
	    chunks_out: list[dict] = []
	    for offset in range(0, total, page_size):
	        res = coll.get(limit=page_size, offset=offset, include=["documents", "metadatas"])
	        for cid, doc, meta in zip(res["ids"], res["documents"], res["metadatas"]):
	            # A record stored without metadata or document text comes back
	            # as None; treat it as empty rather than crashing the whole run.
	            meta = meta or {}
	            chunks_out.append({
	                "chunk_id": cid,
	                "policy_id": meta.get("policy_id", ""),
	                "insurer_slug": meta.get("insurer_slug", ""),
	                "policy_name": meta.get("policy_name", ""),
	                "doc_type": meta.get("doc_type", ""),
	                "page_start": meta.get("page_start"),
	                "page_end": meta.get("page_end"),
	                "topics": tag_topics(doc or ""),
	            })
	    return {
	        "generated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
	        "total_chunks": total,
	        "chunks": chunks_out,
	    }


	def build_markdown(machine: dict, extracted: dict[str, dict]) -> str:
	    """Human-readable per-policy + per-topic catalog.

	    Args:
	        machine: Index as returned by ``build_machine_index``. Its ``chunks``
	            list drives both catalog sections; ``generated_at`` and
	            ``total_chunks`` fill the header table. All three are optional.
	        extracted: policy_id -> extracted policy record. Policies without an
	            entry are listed with an "extraction not yet run" placeholder.

	    Returns:
	        The complete markdown document as a single string ending in a newline.
	    """
	    # Single pass over the chunks builds both groupings.
	    by_policy: dict[str, list[dict]] = defaultdict(list)
	    topic_to_policies: dict[str, set[str]] = defaultdict(set)
	    for c in machine.get("chunks", []):
	        by_policy[c["policy_id"]].append(c)
	        for t in c.get("topics", []):
	            topic_to_policies[t].add(c["policy_id"])

	    # Per-policy summary
	    policies_md = []
	    for pid in sorted(by_policy):
	        chunks = by_policy[pid]
	        meta = chunks[0]  # identifying fields are taken from the first chunk

	        # Page span: earliest start page to latest end page. The end falls
	        # back to a chunk's start page when page_end is absent, so the range
	        # is never narrower than the start pages alone would give.
	        first_pages = [c["page_start"] for c in chunks if c.get("page_start")]
	        last_pages = [
	            c.get("page_end") or c.get("page_start")
	            for c in chunks
	            if c.get("page_end") or c.get("page_start")
	        ]
	        first_page = min(first_pages) if first_pages else "?"
	        last_page = max(last_pages) if last_pages else "?"

	        topic_counts: dict[str, int] = defaultdict(int)
	        for c in chunks:
	            for t in c.get("topics", []):
	                topic_counts[t] += 1
	        # Eight most frequent topics; ties keep first-seen order.
	        top_topics = sorted(topic_counts.items(), key=lambda kv: -kv[1])[:8]
	        topic_summary = ", ".join(f"{t}({n})" for t, n in top_topics)

	        # Extracted-field summary
	        ext = extracted.get(pid, {})
	        fields = summarize_fields(ext) if ext else {}
	        field_lines = [
	            f" - {k}: {v}" for k, v in fields.items() if v not in ("—", None, "")
	        ]
	        field_block = (
	            "\n".join(field_lines)
	            if field_lines
	            else " - (extraction not yet run for this policy)"
	        )

	        policies_md.append(
	            f"### {meta['policy_name']} \n"
	            f"_{meta['insurer_slug']} · {meta['doc_type']} · {len(chunks)} chunks · pages {first_page}-{last_page}_\n\n"
	            f"Topics covered: {topic_summary or '(none auto-tagged)'}\n\n"
	            f"Extracted fields:\n{field_block}\n\n"
	            f"`policy_id`: `{pid}`\n"
	        )

	    # Per-topic inverted index
	    topic_md = []
	    for topic in sorted(topic_to_policies):
	        pols = sorted(topic_to_policies[topic])
	        topic_md.append(
	            f"- {topic} — covered in {len(pols)} policies: {', '.join(pols[:8])}{', …' if len(pols) > 8 else ''}"
	        )

	    topic_section = (
	        "\n".join(topic_md)
	        if topic_md
	        else "_(no topics indexed yet — has ingestion run?)_"
	    )
	    policy_section = (
	        "\n\n".join(policies_md) if policies_md else "_(no policies indexed yet)_"
	    )

	    # The document is assembled line by line (not as one triple-quoted
	    # literal) so source-code indentation can never leak into the output,
	    # and the table rows use plain pipes so they render as a markdown table.
	    lines = [
	        "# Information Source Map",
	        "",
	        "| Field | Value |",
	        "| --- | --- |",
	        f"| Generated | {machine.get('generated_at', 'never')} |",
	        f"| Total chunks in vector store | {machine.get('total_chunks', 0)} |",
	        f"| Policies indexed | {len(by_policy)} |",
	        f"| Topics auto-tagged | {len(TOPIC_KEYWORDS)} |",
	        "",
	        "## 0. Purpose",
	        "",
	        "This document is the authoritative catalog of what the bot can answer. Every chunk in the Chroma vector store is summarized here, grouped by policy. For each policy, the high-value extracted fields are listed alongside.",
	        "",
	        "A reviewer can use this file to answer two questions:",
	        "",
	        '1. "Could the bot know this?" → look up the policy + topic.',
	        "2. \"Is the bot's answer plausibly grounded?\" → cross-reference the policy_id and field in the runtime audit log.",
	        "",
	        "This artifact is regenerated after every ingestion or extraction run via `python -m rag.source_map`.",
	        "",
	        "## 1. Topic inverted index — what is covered, where",
	        "",
	        topic_section,
	        "",
	        "## 2. Per-policy catalog",
	        "",
	        policy_section,
	        "",
	        "---",
	        "",
	        "## 3. Machine-readable index",
	        "",
	        "A JSON form of this catalog is at `rag/source_map.json` — used by the faithfulness verifier to look up whether a claim could plausibly trace to a chunk before allowing it through.",
	        "",
	        "## 4. Coverage gaps (transparent)",
	        "",
	        "These are areas where the corpus is thin. Bot questions on these should refuse:",
	        "",
	        "- Regulatory documents (IRDAI): Deferred — see `decisions.md` D-017. The bot's faithfulness Gate 1 (retrieval floor) refuses these correctly.",
	        "- Premium pricing: Out of scope (advisor, not broker). See `decisions.md` D-007.",
	        "- Categories beyond Health (Life, Motor, Travel): Out of scope v1.",
	        "- Star Health policies (11 PDFs): Star Health's CDN actively blocks scripted downloads. Mitigation pending in v2.",
	    ]
	    return "\n".join(lines) + "\n"


	def main():
	    """Regenerate both source-map artifacts and print a short report.

	    Loads the extracted policies, builds the per-chunk index, writes it as
	    JSON to ``JSON_OUTPUT``, renders the markdown catalog to ``MD_OUTPUT``
	    (creating parent directories as needed), then prints what was written.
	    """
	    extracted = load_extracted_policies()
	    machine = build_machine_index()

	    # Explicit UTF-8: the markdown contains non-ASCII punctuation ("—", "·",
	    # "→", "…") that a platform-default encoding may be unable to represent.
	    JSON_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
	    JSON_OUTPUT.write_text(json.dumps(machine, indent=2), encoding="utf-8")

	    md = build_markdown(machine, extracted)
	    MD_OUTPUT.parent.mkdir(parents=True, exist_ok=True)
	    MD_OUTPUT.write_text(md, encoding="utf-8")

	    def shown(path):
	        """Path relative to ROOT when possible, otherwise the path as-is."""
	        try:
	            return path.relative_to(ROOT)
	        except ValueError:
	            # Output lives outside ROOT; reporting must not fail after the
	            # files have already been written.
	            return path

	    md_size = len(md.encode("utf-8"))  # real byte count, not character count
	    policy_ids = {c["policy_id"] for c in machine.get("chunks", [])}

	    print("Wrote:")
	    print(f" {shown(MD_OUTPUT)} ({md_size} bytes)")
	    print(f" {shown(JSON_OUTPUT)} ({machine.get('total_chunks', 0)} chunks)")
	    print(f"Policies indexed: {len(policy_ids)}")


	if __name__ == "__main__":
	    main()