Spaces:

rohitsar567
/

InsuranceBot

Sleeping

App Files Files Community

InsuranceBot / tools /curate_batch2.py

rohitsar567

refactor: KI-050 — complete data/ → 40-data/ rename across all Python refs

52c6351 about 2 months ago

Raw

History Blame Contribute Delete

35.4 kB

	"""Curate batch 2 policy_facts JSONs from extracted text cache.

	Pattern-based field extraction matched to the schema used by batch 1.
	Writes one JSON per policy into 40-data/policy_facts/.
	"""
	import os
	import re
	import json
	import sys

	# Parent of the directory that contains this script.
	BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	# Directory holding the extracted policy text files; the text_cache_filename
	# of each MANIFEST entry is resolved against it.
	# NOTE(review): hard-coded absolute path — confirm it exists wherever this runs.
	CACHE = "/tmp/claude/policy_extract/text_cache"
	# Destination directory for the generated per-policy JSON files.
	OUT_DIR = os.path.join(BASE, "40-data/policy_facts")
	# Runs at import time: the output directory is created as soon as the module loads.
	os.makedirs(OUT_DIR, exist_ok=True)

	# ---------------------------------------------------------------------------
	# Batch 2 manifest. Each entry is a 6-tuple:
	#   (policy_id, policy_name, insurer_slug, primary_pdf_rel,
	#    text_cache_filename, supporting_pdf_rels)
	# - policy_id becomes the output file name (<policy_id>.json in OUT_DIR).
	# - text_cache_filename is looked up inside CACHE.
	# - supporting_pdf_rels is a list of extra PDF paths; it is empty for every
	#   entry in this batch.
	# Excluded from this batch (already curated):
	# aditya-birla activ-assure-diamond + activ-one
	# bajaj-allianz health-guard-gold + extra-care-plus
	# care-health care-supreme + care-classic + care-senior
	# hdfc-ergo my-optima-secure + optima-restore
	# icici-lombard elevate + health-shield-360 + complete-health
	# manipalcigna prohealth-prime + prohealth-protect (both from all-variants)
	# new-india new-india-floater-mediclaim
	# niva-bupa reassure-2 + senior-first + health-companion
	# star-health family-health-optima + star-comprehensive
	# tata-aig medicare + medicare-premier
	# ---------------------------------------------------------------------------
	MANIFEST = [
	# Aditya Birla Health Insurance (ABHI)
	("aditya-birla__activ-health", "Aditya Birla Activ Health (Platinum Enhanced / Essential)", "aditya-birla",
	"rag/corpus/aditya-birla/activ-health-individual__wordings.pdf",
	"aditya-birla__activ-health-individual__wordings.txt", []),
	# Bajaj Allianz
	("bajaj-allianz__comprehensive-care-plan", "Bajaj Allianz Comprehensive Care Plan", "bajaj-allianz",
	"rag/corpus/bajaj-allianz/comprehensive-care-plan__wordings.pdf",
	"bajaj-allianz__comprehensive-care-plan__wordings.txt", []),
	("bajaj-allianz__global-health-care", "Bajaj Allianz Global Health Care", "bajaj-allianz",
	"rag/corpus/bajaj-allianz/global-health-care__wordings.pdf",
	"bajaj-allianz__global-health-care__wordings.txt", []),
	("bajaj-allianz__health-guard", "Bajaj Allianz Health Guard (Silver / Gold / Platinum)", "bajaj-allianz",
	"rag/corpus/bajaj-allianz/health-guard__wordings.pdf",
	"bajaj-allianz__health-guard__wordings.txt", []),
	("bajaj-allianz__silver-health", "Bajaj Allianz Silver Health (Senior Citizen)", "bajaj-allianz",
	"rag/corpus/bajaj-allianz/silver-health__cis.pdf",
	"bajaj-allianz__silver-health__cis.txt", []),
	("bajaj-allianz__tax-gain", "Bajaj Allianz Tax Gain", "bajaj-allianz",
	"rag/corpus/bajaj-allianz/tax-gain__cis.pdf",
	"bajaj-allianz__tax-gain__cis.txt", []),
	# Care Health
	("care-health__care-advantage", "Care Health Care Advantage", "care-health",
	"rag/corpus/care-health/care-advantage__brochure.pdf",
	"care-health__care-advantage__brochure.txt", []),
	("care-health__care-supreme-enhance", "Care Health Care Supreme Enhance (Top-up)", "care-health",
	"rag/corpus/care-health/care-supreme-enhance__wordings.pdf",
	"care-health__care-supreme-enhance__wordings.txt", []),
	("care-health__ultimate-care", "Care Health Ultimate Care", "care-health",
	"rag/corpus/care-health/ultimate-care__wordings.pdf",
	"care-health__ultimate-care__wordings.txt", []),
	# HDFC ERGO
	("hdfc-ergo__energy", "HDFC ERGO Energy (Diabetes / Hypertension)", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/energy-diabetes-hypertension__wordings.pdf",
	"hdfc-ergo__energy-diabetes-hypertension__wordings.txt", []),
	("hdfc-ergo__my-health-medisure-prime", "HDFC ERGO my:health Medisure Prime", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/my-health-medisure-prime__wordings.pdf",
	"hdfc-ergo__my-health-medisure-prime__wordings.txt", []),
	("hdfc-ergo__my-health-sampoorna-suraksha", "HDFC ERGO my:health Sampoorna Suraksha", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/my-health-sampoorna-suraksha__brochure.pdf",
	"hdfc-ergo__my-health-sampoorna-suraksha__brochure.txt", []),
	("hdfc-ergo__my-health-suraksha", "HDFC ERGO my:health Suraksha", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/my-health-suraksha__brochure.pdf",
	"hdfc-ergo__my-health-suraksha__brochure.txt", []),
	("hdfc-ergo__my-health-women-suraksha", "HDFC ERGO my:health Women Suraksha", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/my-health-women-suraksha__brochure.pdf",
	"hdfc-ergo__my-health-women-suraksha__brochure.txt", []),
	("hdfc-ergo__optima-secure-older-variant", "HDFC ERGO Optima Secure (Older / Legacy Variant)", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/my-optima-secure-older-variant__wordings.pdf",
	"hdfc-ergo__my-optima-secure-older-variant__wordings.txt", []),
	("hdfc-ergo__optima-enhance", "HDFC ERGO Optima Enhance (Top-up)", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/optima-enhance__wordings.pdf",
	"hdfc-ergo__optima-enhance__wordings.txt", []),
	("hdfc-ergo__optima-plus", "HDFC ERGO Optima Plus", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/optima-plus__wordings.pdf",
	"hdfc-ergo__optima-plus__wordings.txt", []),
	("hdfc-ergo__total-health-plan", "HDFC ERGO Total Health Plan", "hdfc-ergo",
	"rag/corpus/hdfc-ergo/total-health-plan__wordings.pdf",
	"hdfc-ergo__total-health-plan__wordings.txt", []),
	# ICICI Lombard
	("icici-lombard__arogya-sanjeevani", "ICICI Lombard Arogya Sanjeevani (Standard)", "icici-lombard",
	"rag/corpus/icici-lombard/arogya-sanjeevani__wordings.pdf",
	"icici-lombard__arogya-sanjeevani__wordings.txt", []),
	("icici-lombard__complete-health-umbrella", "ICICI Lombard Complete Health Insurance — Umbrella", "icici-lombard",
	"rag/corpus/icici-lombard/complete-health-insurance-umbrella__wordings.pdf",
	"icici-lombard__complete-health-insurance-umbrella__wordings.txt", []),
	("icici-lombard__health-advantedge", "ICICI Lombard Health Advantedge", "icici-lombard",
	"rag/corpus/icici-lombard/health-advantedge__wordings.pdf",
	"icici-lombard__health-advantedge__wordings.txt", []),
	("icici-lombard__health-booster", "ICICI Lombard Health Booster (Top-up)", "icici-lombard",
	"rag/corpus/icici-lombard/health-booster-top-up__wordings.pdf",
	"icici-lombard__health-booster-top-up__wordings.txt", []),
	("icici-lombard__health-elite-plus", "ICICI Lombard Health Elite Plus", "icici-lombard",
	"rag/corpus/icici-lombard/health-elite-plus__wordings.pdf",
	"icici-lombard__health-elite-plus__wordings.txt", []),
	# ManipalCigna
	("manipalcigna__prohealth-select", "ManipalCigna ProHealth Select", "manipalcigna",
	"rag/corpus/manipalcigna/prohealth-select__wordings.pdf",
	"manipalcigna__prohealth-select__wordings.txt", []),
	("manipalcigna__sarvah-param", "ManipalCigna Sarvah Param", "manipalcigna",
	"rag/corpus/manipalcigna/sarvah-param__wordings.pdf",
	"manipalcigna__sarvah-param__wordings.txt", []),
	# New India
	("new-india__asha-kiran", "New India Asha Kiran (Girl Child Family Floater)", "new-india",
	"rag/corpus/new-india/asha-kiran-policy__brochure.pdf",
	"new-india__asha-kiran-policy__brochure.txt", []),
	("new-india__janata-mediclaim", "New India Janata Mediclaim", "new-india",
	"rag/corpus/new-india/janata-mediclaim-policy__wordings.pdf",
	"new-india__janata-mediclaim-policy__wordings.txt", []),
	("new-india__mediclaim-policy", "New India Mediclaim Policy (Individual)", "new-india",
	"rag/corpus/new-india/new-india-mediclaim-policy__wordings.pdf",
	"new-india__new-india-mediclaim-policy__wordings.txt", []),
	("new-india__universal-health", "New India Universal Health Insurance", "new-india",
	"rag/corpus/new-india/universal-health-insurance__wordings.pdf",
	"new-india__universal-health-insurance__wordings.txt", []),
	("new-india__yuva-bharat", "New India Yuva Bharat Health Policy", "new-india",
	"rag/corpus/new-india/yuva-bharat-health-policy__wordings.pdf",
	"new-india__yuva-bharat-health-policy__wordings.txt", []),
	# Niva Bupa
	("niva-bupa__aspire", "Niva Bupa Aspire", "niva-bupa",
	"rag/corpus/niva-bupa/aspire__wordings.pdf",
	"niva-bupa__aspire__wordings.txt", []),
	("niva-bupa__health-plus-top-up", "Niva Bupa Health Plus (Top-up)", "niva-bupa",
	"rag/corpus/niva-bupa/health-plus-top-up__wordings.pdf",
	"niva-bupa__health-plus-top-up__wordings.txt", []),
	("niva-bupa__health-premia", "Niva Bupa Health Premia", "niva-bupa",
	"rag/corpus/niva-bupa/health-premia__wordings.pdf",
	"niva-bupa__health-premia__wordings.txt", []),
	("niva-bupa__reassure-3", "Niva Bupa ReAssure 3.0", "niva-bupa",
	"rag/corpus/niva-bupa/reassure-3-0__wordings.pdf",
	"niva-bupa__reassure-3-0__wordings.txt", []),
	("niva-bupa__rise", "Niva Bupa Rise", "niva-bupa",
	"rag/corpus/niva-bupa/rise__wordings.pdf",
	"niva-bupa__rise__wordings.txt", []),
	("niva-bupa__saral-suraksha", "Niva Bupa Saral Suraksha Bima (Standard)", "niva-bupa",
	"rag/corpus/niva-bupa/saral-suraksha-bima__wordings.pdf",
	"niva-bupa__saral-suraksha-bima__wordings.txt", []),
	# Star Health
	("star-health__health-premier", "Star Health Premier", "star-health",
	"rag/corpus/star-health/health-premier__wordings.pdf",
	"star-health__health-premier__wordings.txt", []),
	("star-health__senior-citizens-red-carpet", "Star Senior Citizens Red Carpet", "star-health",
	"rag/corpus/star-health/senior-citizens-red-carpet__brochure.pdf",
	"star-health__senior-citizens-red-carpet__brochure.txt", []),
	("star-health__star-assure", "Star Assure Insurance Policy", "star-health",
	"rag/corpus/star-health/star-assure__wordings.pdf",
	"star-health__star-assure__wordings.txt", []),
	("star-health__star-cardiac-care", "Star Cardiac Care Insurance", "star-health",
	"rag/corpus/star-health/star-cardiac-care__wordings.pdf",
	"star-health__star-cardiac-care__wordings.txt", []),
	("star-health__star-cardiac-care-platinum", "Star Cardiac Care Platinum", "star-health",
	"rag/corpus/star-health/star-cardiac-care-platinum__wordings.pdf",
	"star-health__star-cardiac-care-platinum__wordings.txt", []),
	# Tata AIG
	("tata-aig__medicare-lite", "Tata AIG MediCare Lite", "tata-aig",
	"rag/corpus/tata-aig/medicare-lite__cis.pdf",
	"tata-aig__medicare-lite__cis.txt", []),
	("tata-aig__medicare-select", "Tata AIG MediCare Select", "tata-aig",
	"rag/corpus/tata-aig/medicare-select__brochure.pdf",
	"tata-aig__medicare-select__brochure.txt", []),
	]

	# ---------------------------------------------------------------------------
	# Pattern-based field extractors
	# Each returns (value, quote) or (None, quote_with_explanation) on miss
	# ---------------------------------------------------------------------------

	def find_context(text, pattern, max_len=200, flags=re.IGNORECASE):
	    """Find the first match of *pattern* and return it with a quote snippet.

	    The snippet runs from 30 characters before the match to 160 characters
	    after it (clamped to the bounds of *text*). Whitespace runs are
	    collapsed to single spaces, the result is stripped and then cut to
	    *max_len* characters.

	    Returns:
	        ``(match, snippet)`` on success, ``(None, None)`` when nothing matches.
	    """
	    hit = re.compile(pattern, flags).search(text)
	    if hit is None:
	        return None, None
	    window_start = max(hit.start() - 30, 0)
	    window_end = min(hit.end() + 160, len(text))
	    snippet = re.sub(r"\s+", " ", text[window_start:window_end]).strip()
	    return hit, snippet[:max_len]

	def extract_uin(text):
	    """Extract the product UIN code from the policy text.

	    The pattern accepts a standalone token of 6-9 letters, 5 digits, the
	    letter V and 6 more digits, e.g. HDFHLIP25041V062425 or
	    SHAHLIP22032V052122. The search goes through find_context with its
	    default flags.

	    Returns:
	        ``(uin, snippet)`` when found, otherwise ``(None, <explanation>)``.
	    """
	    uin_pattern = r"\b([A-Z]{6,9}[0-9]{5}V[0-9]{6})\b"
	    match, snippet = find_context(text, uin_pattern)
	    if not match:
	        return None, "UIN not found in extracted text"
	    return match.group(1), snippet

	def extract_min_entry_age(text):
	    """Extract the minimum entry age and its unit.

	    Patterns are tried in priority order: an explicit "minimum entry age"
	    statement, an "age at entry" statement, a child entry age in days, and a
	    "<n> days to <m> years" range.

	    Returns:
	        ``(value, unit, snippet)`` where unit is "days", "months" or "years",
	        or ``(None, None, <explanation>)`` when nothing matches.
	    """
	    patterns = (
	        # Alternations use a plain "|"; an escaped "\|" would only match a literal pipe.
	        r"[Mm]inimum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*(day|year|month)",
	        r"[Aa]ge at [Ee]ntry[^.\n]{0,40}?(\d+)\s*(day|year|month)",
	        r"[Cc]hild[^.\n]{0,40}?(\d+)\s*day",
	        r"(\d+)\s*[Dd]ays\s*(?:to|-|–)\s*\d+\s*[Yy]ears",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match is None:
	            continue
	        if match.lastindex >= 2:
	            # Normalise to the plural form used by the fallback unit ("days").
	            unit = match.group(2).lower() + "s"
	        else:
	            # Patterns without a unit group only ever match day counts.
	            unit = "days"
	        return int(match.group(1)), unit, snippet
	    return None, None, "Min entry age not found"

	def extract_max_entry_age(text):
	    """Extract the maximum entry age in years.

	    Patterns are tried in priority order: explicit "maximum entry age",
	    "entry age ... up to N years", a "<n> days to <m> years" range, and a
	    generic "maximum age" statement.

	    Returns:
	        ``(value, "years", snippet)``, or ``(None, "years", <explanation>)``
	        when nothing matches.
	    """
	    patterns = (
	        r"[Mm]aximum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*[Yy]ear",
	        r"[Ee]ntry [Aa]ge[^.\n]{0,40}?[Uu]p to (\d+)\s*[Yy]ear",
	        # Range form: group 1 is the lower bound, group 2 the upper bound.
	        r"(\d+)\s*[Dd]ays\s*(?:to|-|–)\s*(\d+)\s*[Yy]ears",
	        r"[Mm]aximum [Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            # The last capture group is always the maximum: group 2 for the
	            # range pattern, group 1 for every other pattern.
	            return int(match.group(match.lastindex)), "years", snippet
	    return None, "years", "Max entry age not explicitly stated; check Policy Schedule"

	def extract_renewal_age(text):
	    """Extract the maximum renewal age, or report lifelong renewability.

	    A lifelong-renewal mention takes precedence over any numeric limit.

	    Returns:
	        ``(None, "Lifelong renewability[: snippet]")`` when lifelong wording
	        is present, ``(age, snippet)`` when an explicit maximum renewal age
	        is stated, otherwise ``(None, <explanation>)``.
	    """
	    lifelong_guard = (
	        r"[Ll]ifelong\s*[Rr]enew|[Ll]ife[- ]?[Ll]ong"
	        r"|[Nn]o\s+maximum\s+(cover\s+)?ceas|continuous\s+life\s+long"
	    )
	    if re.search(lifelong_guard, text):
	        # The quote pattern is stricter than the guard (it wants "renew" after
	        # "life long"), so the snippet may be missing even though the guard hit.
	        _, snippet = find_context(
	            text,
	            r"[Ll]ifelong\s*[Rr]enew|[Ll]ife[- ]?[Ll]ong\s*[Rr]enew"
	            r"|[Nn]o\s+maximum\s+(cover\s+)?ceas|continuous\s+life\s+long",
	        )
	        return None, "Lifelong renewability" + ((": " + snippet) if snippet else "")
	    match, snippet = find_context(text, r"[Mm]aximum\s+[Rr]enewal\s+[Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear")
	    if match:
	        return int(match.group(1)), snippet
	    return None, "Max renewal age not specified; check Policy Schedule"

	def extract_sum_insured_options(text):
	    """Extract the enumerated Sum Insured options as rupee amounts.

	    Looks for a "Sum Insured" mention followed by a lakh/crore figure, then
	    collects every lakh and crore figure in a window around that match
	    (30 characters before to 400 after). Figures are converted to rupees and
	    kept only when they fall between 50,000 and 1,000,000,000.

	    Returns:
	        ``(sorted_unique_values, snippet)`` when at least two distinct values
	        are found, otherwise ``(None, <explanation>)``.
	    """
	    match, snippet = find_context(
	        text,
	        r"[Ss]um\s+[Ii]nsured[^.\n]{0,300}?(\d+[\d,. ]{0,20}(?:Lakhs?|Lacs?|Crores?|L\b|Cr\b))",
	    )
	    if match:
	        window = text[max(0, match.start() - 30): match.end() + 400]
	        lakh_figures = re.findall(r"(\d+(?:\.\d+)?)\s*(?:Lakhs?|Lacs?|L\b)", window, re.IGNORECASE)
	        crore_figures = re.findall(r"(\d+(?:\.\d+)?)\s*(?:Crores?|Cr\b)", window, re.IGNORECASE)
	        # round() rather than int(): a float product can land just below the
	        # intended whole number and int() would truncate it by one rupee.
	        amounts = {round(float(n) * 100000) for n in lakh_figures}
	        amounts |= {round(float(n) * 10000000) for n in crore_figures}
	        values = sorted(v for v in amounts if 50000 <= v <= 1000000000)
	        # Require at least 2 distinct values to count this as a real enumeration.
	        if len(values) >= 2:
	            return values, snippet[:200]
	    return None, "Sum Insured options not enumerated in extracted text; check Policy Schedule"

	def extract_initial_waiting(text):
	    """Extract the initial waiting period in days.

	    Tries, in order: a day count followed by "from the first/date of",
	    "waiting period" or "of the inception" (accepted only when the count is
	    15 or 30), a day count inside an Excl03 clause, and "within N days from
	    the first".

	    Returns:
	        ``(days, snippet)``; when nothing matches, falls back to
	        ``(30, <explanation>)``.
	    """
	    match, snippet = find_context(
	        text,
	        r"(\d+)\s*[Dd]ays?\s+(?:from\s+the\s+(?:first|date of)|waiting period|of\s+the\s+inception)",
	    )
	    # Only 15 and 30 are accepted here; any other count falls through to the next patterns.
	    if match and int(match.group(1)) in (15, 30):
	        return int(match.group(1)), snippet
	    fallback_patterns = (
	        r"[Ee]xcl03[^.\n]{0,200}?(\d+)\s*days?",
	        r"within\s+(\d+)\s*days\s+from\s+the\s+first",
	    )
	    for pattern in fallback_patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            return int(match.group(1)), snippet
	    return 30, "Default IRDAI 30-day waiting period applies (not explicitly quoted in extracted snippet)"

	def extract_ped_waiting(text):
	    """Extract the pre-existing disease (PED) waiting period in months.

	    Tries a "pre-existing disease ... N months/years" statement, then a
	    "PED ... N months/years" statement, then an Excl01 clause stating months.
	    Year counts are converted to months.

	    Returns:
	        ``(months, snippet)`` or ``(None, <explanation>)``.
	    """
	    unit_patterns = (
	        r"[Pp]re[- ]existing\s+[Dd]isease\s*(?:\([^)]+\))?[^.\n]{0,300}?(\d+)\s*(months|years)",
	        # Word boundaries are needed: the search is case-insensitive, so a bare
	        # "PED" would also match inside words such as "developed".
	        r"\bPED\b[^.\n]{0,200}?(\d+)\s*(months|years)",
	    )
	    for pattern in unit_patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            count = int(match.group(1))
	            in_years = "year" in match.group(2).lower()
	            return (count * 12 if in_years else count), snippet
	    match, snippet = find_context(text, r"[Ee]xcl01[^.\n]{0,200}?(\d+)\s*months")
	    if match:
	        return int(match.group(1)), snippet
	    return None, "PED waiting period not extracted; check Section 5 / Excl01"

	def extract_specific_disease_waiting(text):
	    """Extract the specific-disease waiting period in months.

	    Tries a "listed/specified/named/specific conditions ... N months/years"
	    statement, then an Excl02 clause. Year counts are converted to months.

	    Returns:
	        ``(months, snippet)``; when nothing matches, falls back to
	        ``(24, <explanation>)``.
	    """
	    patterns = (
	        r"(?:listed|specified|named|specific)\s+(?:conditions?|ailments?|diseases?|treatments?)"
	        r"[^.\n]{0,300}?(\d+)\s*(months|years)",
	        r"[Ee]xcl02[^.\n]{0,200}?(\d+)\s*(months|years)",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            count = int(match.group(1))
	            in_years = "year" in match.group(2).lower()
	            return (count * 12 if in_years else count), snippet
	    return 24, "Default IRDAI 24-month specific-disease waiting (not explicitly quoted)"

	def extract_maternity_waiting(text):
	    """Extract the maternity waiting period in months.

	    Tries "maternity ... N months waiting / of continuous", then
	    "waiting period ... maternity ... N months".

	    Returns:
	        ``(months, snippet)`` or ``(None, <explanation>)``.
	    """
	    patterns = (
	        r"[Mm]aternity[^.\n]{0,200}?(\d+)\s*months?\s+(?:waiting|of continuous)",
	        r"[Ww]aiting\s+[Pp]eriod[^.\n]{0,50}?[Mm]aternity[^.\n]{0,80}?(\d+)\s*months?",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            return int(match.group(1)), snippet
	    return None, "Maternity waiting not specified or maternity excluded"

	def extract_pre_hosp_days(text):
	    """Extract the number of pre-hospitalisation days covered.

	    Tries "pre-hospitalisation ... N days" (either -isation or -ization
	    spelling), then "N days prior to / before ... admission/hospitalisation".

	    Returns:
	        ``(days, snippet)`` or ``(None, <explanation>)``.
	    """
	    patterns = (
	        r"[Pp]re[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?",
	        r"(\d+)\s*days?\s+(?:prior to|before).{0,40}(?:admission|hospitali[sz]ation)",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            return int(match.group(1)), snippet
	    return None, "Pre-hospitalization days not extracted"

	def extract_post_hosp_days(text):
	    """Extract the number of post-hospitalisation days covered.

	    Tries "post-hospitalisation ... N days" (either -isation or -ization
	    spelling), then "N days after / post / following ... discharge".

	    Returns:
	        ``(days, snippet)`` or ``(None, <explanation>)``.
	    """
	    patterns = (
	        r"[Pp]ost[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?",
	        r"(\d+)\s*days?\s+(?:after|post|following).{0,40}discharge",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            return int(match.group(1)), snippet
	    return None, "Post-hospitalization days not extracted"

	def extract_day_care_count(text):
	    """Extract the number of day-care procedures the policy enumerates.

	    Tries "<N> [listed] day care procedures/treatments", then
	    "day care ... <N> procedures". "Day care", "Day-care" and "Daycare" are
	    all accepted. A count is only returned when it lies between 50 and 2000;
	    an out-of-range hit falls through to the next pattern.

	    Returns:
	        ``(count, snippet)`` or ``(None, <explanation>)``.
	    """
	    patterns = (
	        r"(\d{2,4})\s*(?:listed\s+)?[Dd]ay\s*[- ]?[Cc]are\s*(?:[Pp]rocedures?|[Tt]reatments?)",
	        r"[Dd]ay\s*[- ]?[Cc]are[^.\n]{0,80}?(\d{2,4})\s*[Pp]rocedures?",
	    )
	    for pattern in patterns:
	        match, snippet = find_context(text, pattern)
	        if match:
	            count = int(match.group(1))
	            if 50 <= count <= 2000:
	                return count, snippet
	    return None, "Day-care count not enumerated; covered per policy definition"

	def extract_ayush(text):
	    """Report whether the text mentions AYUSH / alternative treatment.

	    A case-sensitive guard decides whether a keyword is present; the quote
	    snippet is then taken via find_context from the keyword up to the end of
	    its sentence or line (at most 200 further characters).

	    Returns:
	        ``(True, snippet)`` when either keyword is present, otherwise
	        ``(False, <explanation>)``.
	    """
	    probes = (
	        (r"AYUSH", r"AYUSH[^.\n]{0,200}"),
	        (r"[Aa]lternative\s+[Tt]reatment", r"[Aa]lternative\s+[Tt]reatment[^.\n]{0,200}"),
	    )
	    for guard, quote_pattern in probes:
	        if re.search(guard, text) is not None:
	            _, snippet = find_context(text, quote_pattern)
	            return True, snippet
	    return False, "AYUSH coverage not found in extracted text"

	def extract_maternity(text):
	    """Decide whether the policy covers maternity.

	    Three signals are checked (all case-sensitive):
	      * an explicit "maternity ... not covered / excluded" statement,
	      * an "Excl18" exclusion code,
	      * a positive description ("maternity expenses/cover/benefit" followed
	        by a lump sum, a rupee amount, INR or deliveries).
	    A positive description wins only when no explicit exclusion statement is
	    present.

	    Returns:
	        ``(True, snippet)`` for covered, ``(False, snippet_or_explanation)``
	        for excluded or not mentioned.
	    """
	    benefit_head = r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)"
	    explicit_exclusion = re.search(r"[Mm]aternity[^.\n]{0,80}?(?:not\s+covered|excluded)", text)
	    exclusion_code = re.search(r"Excl18", text)
	    positive = re.search(
	        benefit_head + r"[^.\n]{0,300}?(?:lump\s+sum|Rs\.?\s*\d|INR|deliveries?)", text
	    )

	    if positive and not explicit_exclusion:
	        # Has a positive maternity description.
	        _, snippet = find_context(text, benefit_head + r"[^.\n]{0,300}")
	        return True, snippet

	    if explicit_exclusion:
	        _, snippet = find_context(
	            text, r"[Mm]aternity[^.\n]{0,200}?(?:not\s+covered|excluded)[^.\n]{0,100}"
	        )
	        return False, snippet or "Maternity excluded (Excl18)"
	    if exclusion_code:
	        _, snippet = find_context(text, r"Excl18[^.\n]{0,200}")
	        return False, snippet or "Maternity excluded (Excl18)"

	    # No explicit mention -> default false for typical retail
	    # (most retail base plans exclude maternity).
	    return False, "Maternity not explicitly mentioned; presumed excluded in base"

	def extract_newborn(text):
	    """Decide whether newborn cover is offered.

	    Finds the first "new born" / "new-born" / "newborn" mention and inspects
	    the rest of that sentence plus the following 50 characters. If that
	    window says "not covered" or "excluded" the cover is reported as absent.

	    Returns:
	        ``(True, snippet)``, ``(False, snippet)`` for an exclusion, or
	        ``(False, <explanation>)`` when newborn is never mentioned.
	    """
	    match = re.search(r"[Nn]ew[ -]?[Bb]orn[^.\n]{0,200}", text)
	    if match is None:
	        return False, "Newborn cover not found; typically tied to maternity option"
	    window = re.sub(r"\s+", " ", text[match.start():match.end() + 50]).strip()
	    # Plain "|" alternation: either phrase marks the benefit as excluded.
	    excluded = re.search(r"not\s+covered|excluded", window) is not None
	    return (not excluded), window[:200]

	def extract_organ_donor(text):
	    """Decide whether organ donor expenses are covered.

	    Finds the first "organ donor" mention and inspects it together with the
	    following 200 characters. If that window says "not covered" or
	    "excluded" the benefit is reported as absent.

	    Returns:
	        ``(True, snippet)``, ``(False, snippet)`` for an exclusion, or
	        ``(False, <explanation>)`` when organ donor is never mentioned.
	    """
	    match = re.search(r"[Oo]rgan\s+[Dd]onor", text)
	    if match is None:
	        return False, "Organ donor cover not extracted"
	    window = re.sub(r"\s+", " ", text[match.start():match.end() + 200]).strip()
	    # Plain "|" alternation: either phrase marks the benefit as excluded.
	    excluded = re.search(r"not\s+covered|excluded", window) is not None
	    return (not excluded), window[:200]

	def extract_ncb(text):
	    """Extract the no-claim / cumulative bonus percentage.

	    Tries a bonus keyword ("No Claim Bonus", "Cumulative Bonus", "NCB")
	    followed within the same sentence by a percentage, then the form
	    "N% increase/bonus in/of Sum Insured/SI". A percentage is accepted only
	    when it lies between 5 and 100.

	    Returns:
	        ``(percent, snippet)`` or ``(None, <explanation>)``.
	    """
	    match = re.search(
	        r"(?:[Nn]o\s+[Cc]laim\s+[Bb]onus|[Cc]umulative\s+[Bb]onus|NCB)[^.\n]{0,400}?(\d{1,3})\s*%",
	        text,
	    )
	    if match:
	        percent = int(match.group(1))
	        if 5 <= percent <= 100:
	            window = text[match.start():match.end() + 50]
	            return percent, re.sub(r"\s+", " ", window).strip()[:220]
	    match = re.search(
	        r"(\d{1,3})\s*%\s+(?:increase|bonus)\s+(?:in|of)\s+(?:Sum\s+Insured|SI)",
	        text,
	        re.IGNORECASE,
	    )
	    if match:
	        percent = int(match.group(1))
	        if 5 <= percent <= 100:
	            window = text[max(0, match.start() - 40):match.end() + 30]
	            return percent, re.sub(r"\s+", " ", window).strip()[:220]
	    return None, "NCB % not extracted; product may use booster/recharge structure"

	def extract_restoration(text):
	    """Extract the wording of the restoration / recharge / refill benefit.

	    The keyword patterns are tried in order and the first one that matches
	    wins. The matched text (keyword through the end of its sentence or line)
	    is whitespace-normalised and capped at 280 characters.

	    Returns:
	        ``(value, quote)`` where value is the first 240 characters of the
	        quote, or ``(None, <explanation>)`` when no keyword is present.
	    """
	    keyword_patterns = (
	        r"[Rr]estor[ae][^.\n]{0,300}",
	        r"[Rr]echarge\s+of\s+[Ss]um\s+[Ii]nsured[^.\n]{0,300}",
	        r"[Rr]efill[^.\n]{0,300}",
	        r"[Rr]eset\s+[Bb]enefit[^.\n]{0,300}",
	        r"[Rr]e[- ]?[Ii]nstatement[^.\n]{0,300}",
	    )
	    # Lazily evaluate the searches so later patterns only run if earlier ones miss.
	    searches = (re.search(pattern, text) for pattern in keyword_patterns)
	    hit = next((found for found in searches if found), None)
	    if hit is None:
	        return None, "Restoration benefit not found in extracted text"
	    quote = re.sub(r"\s+", " ", hit.group(0)).strip()[:280]
	    return quote[:240], quote

	def extract_room_rent(text):
	    """Extract the room-rent capping wording.

	    Patterns are ordered so that an actual capping statement is preferred
	    over the bare room-rent definition: "no room rent capping/limit",
	    "room rent no sub-limit", "single private room", "room rent ... up to /
	    maximum / capped at <amount or %>", "room category", and finally any
	    room-rent sentence containing a percentage or rupee amount. The first
	    pattern that matches wins.

	    Returns:
	        ``(value, quote)`` where quote is the whitespace-normalised match
	        capped at 240 characters and value is its first 200 characters, or
	        ``(None, <explanation>)`` when nothing matches.
	    """
	    patterns = (
	        r"[Nn]o\s+[Rr]oom\s+[Rr]ent\s+(?:[Cc]apping|[Ll]imit|[Ss]ub[- ]?[Ll]imit)[^.\n]{0,150}",
	        r"[Rr]oom\s+[Rr]ent\s+No\s+Sub[- ]?Limit[^.\n]{0,100}",
	        r"[Ss]ingle\s+[Pp]rivate\s+(?:AC\s+)?[Rr]oom[^.\n]{0,150}",
	        # The currency marker may touch the amount ("Rs.5000") or be spaced ("Rs. 5000").
	        r"[Rr]oom\s+[Rr]ent[^.\n]{0,200}?(?:up\s+to|maximum|capped\s+at|limit\s+of|sub[- ]?limit)"
	        r"\s*(?:Rs\.?|`|INR|\d+\s*%)[^.\n]{0,100}",
	        r"[Rr]oom\s+[Cc]ategory[^.\n]{0,160}",
	        r"[Rr]oom\s+[Rr]ent[^.\n]{0,160}?(\d+\s*%|Rs\.?\s*\d|`\s*\d|INR\s*\d)[^.\n]{0,80}",
	    )
	    for pattern in patterns:
	        match = re.search(pattern, text)
	        if match:
	            quote = re.sub(r"\s+", " ", match.group(0)).strip()[:240]
	            return quote[:200], quote
	    return None, "Room rent capping not extracted (only definition found, no explicit cap)"

	def extract_copayment(text):
	    """Extract the mandatory co-payment percentage.

	    Tries "co-payment of N%" (also "copayment", "co payment", and either
	    capitalisation of "Payment"), then "N% co-pay".

	    Returns:
	        ``(percent, snippet)``; when nothing matches, falls back to
	        ``(0, <explanation>)``.
	    """
	    patterns = (
	        # [Pp] so the title-case heading "Co-Payment of 20%" is not missed
	        # and silently reported as 0% copay.
	        r"[Cc]o[- ]?[Pp]ayment\s+of\s+(\d{1,2})\s*%",
	        r"(\d{1,2})\s*%\s+[Cc]o[- ]?[Pp]ay",
	    )
	    for pattern in patterns:
	        match = re.search(pattern, text)
	        if match:
	            window = text[max(0, match.start() - 30):match.end() + 120]
	            return int(match.group(1)), re.sub(r"\s+", " ", window).strip()[:240]
	    return 0, "No mandatory copay extracted; product may have age-based or zone-based optional copay"

	def extract_deductible(text):
	    """Extract the deductible amount in rupees.

	    Tries "deductible ... Rs./INR/₹ <amount>" within one sentence; the
	    amount must have at least three characters of digits/commas. If no
	    amount is found but an "aggregate deductible" sentence exists, that
	    sentence is returned as the quote with a null value.

	    Returns:
	        ``(amount, snippet)``, ``(None, aggregate_deductible_sentence)``, or
	        ``(None, <explanation>)``.
	    """
	    # Every currency marker tolerates optional whitespace before the amount.
	    match = re.search(r"[Dd]eductible[^.\n]{0,300}?(?:Rs\.?\s*|INR\s*|₹\s*)(\d[\d,]{2,})", text)
	    if match:
	        amount = int(match.group(1).replace(",", ""))
	        window = text[max(0, match.start() - 30):match.end() + 120]
	        return amount, re.sub(r"\s+", " ", window).strip()[:240]
	    match = re.search(r"[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}", text)
	    if match:
	        return None, re.sub(r"\s+", " ", match.group(0)).strip()[:220]
	    return None, "No base deductible (or only optional voluntary deductible add-on)"

	def extract_cashless(text):
	    """Report whether the text mentions cashless treatment.

	    The guard looks for "Cashless" / "cashless"; the quote is taken via
	    find_context from the keyword to the end of its sentence or line.

	    Returns:
	        ``(True, snippet)`` when mentioned, otherwise
	        ``(None, <explanation>)`` — note the miss value is None, not False.
	    """
	    if re.search(r"[Cc]ashless", text) is None:
	        return None, "Cashless mention not found"
	    _, snippet = find_context(text, r"[Cc]ashless[^.\n]{0,200}")
	    return True, snippet

	def extract_policy_type(text, policy_id=""):
	    """Classify the policy as top-up, benefit, hospital-cash or indemnity.

	    Classification by policy_id substring runs first, then keyword
	    heuristics on the text; indemnity is the default.

	    Args:
	        text: Extracted policy text.
	        policy_id: Manifest policy id; matched case-insensitively.

	    Returns:
	        ``(policy_type, quote_or_reason)``.
	    """
	    pid = policy_id.lower()

	    # ID-based classification first (most reliable).
	    top_up_ids = ("top-up", "supreme-enhance", "health-booster", "optima-enhance",
	                  "health-plus-top-up", "extra-care-plus")
	    benefit_ids = ("cardiac-care", "cancer-care", "criti-medicare", "criti-care")
	    cash_ids = ("hospital-cash", "daily-cash")
	    if any(tag in pid for tag in top_up_ids):
	        return "top-up", "Top-up / super top-up policy (per product name)"
	    if any(tag in pid for tag in benefit_ids):
	        return "benefit", "Specialty cardiac/critical illness — benefit-based lump-sum on diagnosis"
	    if any(tag in pid for tag in cash_ids):
	        return "hospital-cash", "Hospital cash / daily benefit policy"

	    # Heuristics on the text.
	    top_up_text = (
	        r"[Ss]uper\s+[Tt]op[- ]?up|deductible[^.\n]{0,100}aggregate"
	        r"|[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}[Ss]um\s+[Ii]nsured"
	    )
	    if re.search(top_up_text, text):
	        return "top-up", "Top-up / super top-up policy (kicks in above a deductible)"
	    if re.search(r"[Hh]ospital\s+[Cc]ash|[Dd]aily\s+[Cc]ash\s+[Bb]enefit", text):
	        return "hospital-cash", "Hospital cash / daily benefit policy"

	    # Indemnity is the default for retail health.
	    if re.search(r"[Ii]ndemnity|[Hh]ospitali[sz]ation\s+[Ee]xpenses?\s+[Ii]ndemnif|[Ii]ndemnif", text):
	        _, snippet = find_context(text, r"[Ii]ndemnity|[Ii]ndemnif")
	        return "indemnity", snippet or "Indemnity-based health insurance"
	    return "indemnity", "Default indemnity (no explicit alternate type detected)"

	# ---------------------------------------------------------------------------
	# Driver
	# ---------------------------------------------------------------------------

	def curate_one(entry):
	    """Build the policy_facts record for one MANIFEST entry.

	    Reads the entry's cached text from CACHE, runs every field extractor on
	    it and assembles the result dictionary, including a ``_meta`` section
	    with a completeness percentage.

	    Args:
	        entry: A MANIFEST 6-tuple (policy_id, policy_name, insurer_slug,
	            primary_pdf_rel, text_cache_filename, supporting_pdf_rels).

	    Returns:
	        ``(record, completeness_pct)`` on success, or ``(None, reason)`` when
	        the cached text is missing or shorter than 1000 characters.
	    """
	    policy_id, policy_name, insurer, primary_pdf, txt_name, supporting = entry

	    txt_path = os.path.join(CACHE, txt_name)
	    if not os.path.exists(txt_path):
	        return None, f"text cache missing: {txt_name}"
	    with open(txt_path, "r", encoding="utf-8") as handle:
	        text = handle.read()
	    if len(text) < 1000:
	        return None, f"too short ({len(text)} chars)"

	    def fact(value, quote, fallback, unit=None):
	        # One PDF-sourced field: value, optional unit, source path, and a
	        # quote (or the fallback wording) capped at 240 characters.
	        field = {"value": value}
	        if unit is not None:
	            field["unit"] = unit
	        field["source_pdf_path"] = primary_pdf
	        field["source_quote"] = (quote or fallback)[:240]
	        return field

	    # Run every extractor once, up front.
	    uin, uin_quote = extract_uin(text)
	    min_age, min_unit, min_quote = extract_min_entry_age(text)
	    max_age, max_unit, max_quote = extract_max_entry_age(text)
	    renewal_age, renewal_quote = extract_renewal_age(text)
	    si_options, si_quote = extract_sum_insured_options(text)
	    initial_wait, initial_quote = extract_initial_waiting(text)
	    ped_months, ped_quote = extract_ped_waiting(text)
	    specific_months, specific_quote = extract_specific_disease_waiting(text)
	    maternity_months, maternity_wait_quote = extract_maternity_waiting(text)
	    pre_days, pre_quote = extract_pre_hosp_days(text)
	    post_days, post_quote = extract_post_hosp_days(text)
	    day_care, day_care_quote = extract_day_care_count(text)
	    ayush, ayush_quote = extract_ayush(text)
	    maternity, maternity_quote = extract_maternity(text)
	    newborn, newborn_quote = extract_newborn(text)
	    organ_donor, organ_donor_quote = extract_organ_donor(text)
	    ncb, ncb_quote = extract_ncb(text)
	    restoration, restoration_quote = extract_restoration(text)
	    room_rent, room_rent_quote = extract_room_rent(text)
	    copay, copay_quote = extract_copayment(text)
	    deductible, deductible_quote = extract_deductible(text)
	    cashless, cashless_quote = extract_cashless(text)
	    policy_type, policy_type_quote = extract_policy_type(text, policy_id)

	    # Compose the record; key order is the order written to the JSON file.
	    record = {
	        "policy_id": policy_id,
	        "policy_name": policy_name,
	        "insurer_slug": insurer,
	        "uin_code": fact(uin, uin_quote, ""),
	        "min_entry_age": fact(
	            min_age, min_quote, "Not explicitly stated; per Policy Schedule",
	            unit=min_unit or "days"),
	        "max_entry_age": fact(
	            max_age, max_quote, "Not explicitly stated; per Policy Schedule",
	            unit=max_unit or "years"),
	        "max_renewal_age": fact(renewal_age, renewal_quote, "Not specified"),
	        "sum_insured_options": fact(si_options, si_quote, "Per Policy Schedule", unit="INR"),
	        "initial_waiting_period_days": fact(initial_wait, initial_quote, "Per IRDAI standard"),
	        "pre_existing_disease_waiting_months": fact(
	            ped_months, ped_quote, "PED waiting per policy wording"),
	        "specific_disease_waiting_months": fact(
	            specific_months, specific_quote, "Specific disease waiting per IRDAI standard"),
	        "maternity_waiting_months": fact(
	            maternity_months, maternity_wait_quote,
	            "Maternity waiting only applies if maternity covered/opted"),
	        "pre_hospitalization_days": fact(pre_days, pre_quote, "Pre-hosp days per Policy Schedule"),
	        "post_hospitalization_days": fact(post_days, post_quote, "Post-hosp days per Policy Schedule"),
	        "day_care_treatments_count": fact(
	            day_care, day_care_quote, "Day care covered per definition; count not enumerated"),
	        "ayush_coverage": fact(ayush, ayush_quote, "AYUSH cover not explicitly found"),
	        "maternity_coverage": fact(
	            maternity, maternity_quote, "Maternity status not explicitly extracted"),
	        "newborn_coverage": fact(newborn, newborn_quote, "Newborn cover status not extracted"),
	        "organ_donor_expenses": fact(
	            organ_donor, organ_donor_quote, "Organ donor benefit not extracted"),
	        "no_claim_bonus_pct": fact(ncb, ncb_quote, "NCB % not extracted"),
	        "restoration_benefit": fact(
	            restoration, restoration_quote, "Restoration not found in extracted text"),
	        "room_rent_capping": fact(room_rent, room_rent_quote, "Room rent capping not extracted"),
	        "copayment_pct": fact(copay, copay_quote, "No mandatory copay"),
	        "deductible_amount": fact(deductible, deductible_quote, "No base deductible"),
	        "network_hospital_count": {
	            "value": None,
	            "source_url": None,
	            "source_quote": "Insurer-level metric; not extracted in this curation pass"
	        },
	        # A missing cashless mention is recorded as supported (True), not unknown.
	        "cashless_treatment_supported": fact(
	            True if cashless is None else cashless,
	            cashless_quote, "Cashless implicit via insurer network"),
	        "claim_settlement_ratio": {
	            "value": None,
	            "source_url": None,
	            "source_quote": "Insurer-level metric (IRDAI Annual Report); not extracted"
	        },
	        "tat_cashless_authorization_hours": {
	            "value": None,
	            "source_pdf_path": None,
	            "source_quote": "TAT not specified in policy wording; governed by IRDAI Master Circular"
	        },
	        "policy_type": fact(
	            policy_type, policy_type_quote, "Policy type inferred from product structure"),
	    }

	    # Completeness: share of the PDF-sourced fields below whose value is
	    # neither None nor "". False and 0 count as populated.
	    pdf_fields = [
	        "uin_code", "min_entry_age", "max_entry_age", "sum_insured_options",
	        "initial_waiting_period_days", "pre_existing_disease_waiting_months",
	        "specific_disease_waiting_months", "pre_hospitalization_days",
	        "post_hospitalization_days", "day_care_treatments_count",
	        "ayush_coverage", "maternity_coverage", "newborn_coverage",
	        "organ_donor_expenses", "no_claim_bonus_pct", "restoration_benefit",
	        "room_rent_capping", "copayment_pct", "policy_type",
	        "cashless_treatment_supported"
	    ]
	    populated = [name for name in pdf_fields if record[name]["value"] not in (None, "")]
	    pct = int(round(len(populated) / len(pdf_fields) * 100))

	    record["_meta"] = {
	        "curated_at": "2026-05-14",
	        "primary_source_pdf": primary_pdf,
	        "supporting_source_pdfs": supporting,
	        "completeness_pct": pct,
	        "notes": "Pattern-based extraction from local PDF via pdfplumber. Insurer-level metrics (CSR, network count) left null pending downstream backfill."
	    }

	    return record, pct


	def main():
	    """Curate every MANIFEST entry and write one JSON file per policy.

	    An entry is skipped when its output file already exists, when
	    curate_one cannot produce a record, or when the record's completeness is
	    below 50%. Progress and a final summary are printed to stdout.

	    Returns:
	        ``(written, skipped)``: ``written`` holds ``(policy_id, pct)`` pairs
	        for files written; ``skipped`` holds ``(policy_id, reason)`` pairs.
	        Entries whose output file already existed appear in neither list.
	    """
	    written, skipped = [], []
	    for i, entry in enumerate(MANIFEST, start=1):
	        policy_id = entry[0]
	        target = os.path.join(OUT_DIR, f"{policy_id}.json")
	        # Never overwrite a previously curated file.
	        if os.path.exists(target):
	            print(f"[{i}/{len(MANIFEST)}] {policy_id}: EXISTS — skipping")
	            continue

	        # On failure curate_one returns (None, reason), so pct holds a reason string.
	        record, pct = curate_one(entry)
	        if record is None:
	            print(f"[{i}/{len(MANIFEST)}] {policy_id}: SKIP — {pct}")
	            skipped.append((policy_id, pct))
	        elif pct < 50:
	            print(f"[{i}/{len(MANIFEST)}] {policy_id}: LOW {pct}% — skipping (below threshold)")
	            skipped.append((policy_id, f"low completeness {pct}%"))
	        else:
	            with open(target, "w", encoding="utf-8") as handle:
	                json.dump(record, handle, indent=2, ensure_ascii=False)
	            print(f"[{i}/{len(MANIFEST)}] {policy_id}: {pct}%")
	            written.append((policy_id, pct))

	    print()
	    print(f"Wrote {len(written)} JSONs.")
	    print(f"Skipped {len(skipped)}.")
	    if written:
	        avg = sum(score for _, score in written) / len(written)
	        print(f"Average completeness: {avg:.1f}%")
	    return written, skipped


	# Run the batch only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()