"""Curate the batch 2 policy_facts JSON files from the extracted text cache.

Fields are pulled out with regular-expression patterns and arranged in the
same schema that batch 1 used. One JSON file per policy is written into
40-data/policy_facts/.
"""
import os
import re
import json
import sys

# BASE is the directory one level above the directory holding this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE = os.path.dirname(_SCRIPT_DIR)
# Directory holding the pre-extracted plain-text version of each PDF.
CACHE = "/tmp/claude/policy_extract/text_cache"
# Destination for the curated JSON files; created at import time if missing.
OUT_DIR = os.path.join(BASE, "40-data/policy_facts")
os.makedirs(OUT_DIR, exist_ok=True)

# ---------------------------------------------------------------------------
# Batch 2 manifest. Each entry is a 6-tuple:
#   (policy_id, policy_name, insurer_slug, primary_pdf_rel,
#    text_cache_filename, [supporting_pdf_rel ... optional])
#
# Excluded from this batch (already curated):
#   aditya-birla   activ-assure-diamond + activ-one
#   bajaj-allianz  health-guard-gold + extra-care-plus
#   care-health    care-supreme + care-classic + care-senior
#   hdfc-ergo      my-optima-secure + optima-restore
#   icici-lombard  elevate + health-shield-360 + complete-health
#   manipalcigna   prohealth-prime + prohealth-protect (both from all-variants)
#   new-india      new-india-floater-mediclaim
#   niva-bupa      reassure-2 + senior-first + health-companion
#   star-health    family-health-optima + star-comprehensive
#   tata-aig       medicare + medicare-premier
# ---------------------------------------------------------------------------
MANIFEST = [
    # ABHI
    (
        "aditya-birla__activ-health",
        "Aditya Birla Activ Health (Platinum Enhanced / Essential)",
        "aditya-birla",
        "rag/corpus/aditya-birla/activ-health-individual__wordings.pdf",
        "aditya-birla__activ-health-individual__wordings.txt",
        [],
    ),
    # Bajaj
    (
        "bajaj-allianz__comprehensive-care-plan",
        "Bajaj Allianz Comprehensive Care Plan",
        "bajaj-allianz",
        "rag/corpus/bajaj-allianz/comprehensive-care-plan__wordings.pdf",
        "bajaj-allianz__comprehensive-care-plan__wordings.txt",
        [],
    ),
    (
        "bajaj-allianz__global-health-care",
        "Bajaj Allianz Global Health Care",
        "bajaj-allianz",
        "rag/corpus/bajaj-allianz/global-health-care__wordings.pdf",
        "bajaj-allianz__global-health-care__wordings.txt",
        [],
    ),
    (
        "bajaj-allianz__health-guard",
        "Bajaj Allianz Health Guard (Silver / Gold / Platinum)",
        "bajaj-allianz",
        "rag/corpus/bajaj-allianz/health-guard__wordings.pdf",
        "bajaj-allianz__health-guard__wordings.txt",
        [],
    ),
    (
        "bajaj-allianz__silver-health",
        "Bajaj Allianz Silver Health (Senior Citizen)",
        "bajaj-allianz",
        "rag/corpus/bajaj-allianz/silver-health__cis.pdf",
        "bajaj-allianz__silver-health__cis.txt",
        [],
    ),
    (
        "bajaj-allianz__tax-gain",
        "Bajaj Allianz Tax Gain",
        "bajaj-allianz",
        "rag/corpus/bajaj-allianz/tax-gain__cis.pdf",
        "bajaj-allianz__tax-gain__cis.txt",
        [],
    ),
    # Care
    (
        "care-health__care-advantage",
        "Care Health Care Advantage",
        "care-health",
        "rag/corpus/care-health/care-advantage__brochure.pdf",
        "care-health__care-advantage__brochure.txt",
        [],
    ),
    (
        "care-health__care-supreme-enhance",
        "Care Health Care Supreme Enhance (Top-up)",
        "care-health",
        "rag/corpus/care-health/care-supreme-enhance__wordings.pdf",
        "care-health__care-supreme-enhance__wordings.txt",
        [],
    ),
    (
        "care-health__ultimate-care",
        "Care Health Ultimate Care",
        "care-health",
        "rag/corpus/care-health/ultimate-care__wordings.pdf",
        "care-health__ultimate-care__wordings.txt",
        [],
    ),
    # HDFC ERGO
    (
        "hdfc-ergo__energy",
        "HDFC ERGO Energy (Diabetes / Hypertension)",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/energy-diabetes-hypertension__wordings.pdf",
        "hdfc-ergo__energy-diabetes-hypertension__wordings.txt",
        [],
    ),
    (
        "hdfc-ergo__my-health-medisure-prime",
        "HDFC ERGO my:health Medisure Prime",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/my-health-medisure-prime__wordings.pdf",
        "hdfc-ergo__my-health-medisure-prime__wordings.txt",
        [],
    ),
    (
        "hdfc-ergo__my-health-sampoorna-suraksha",
        "HDFC ERGO my:health Sampoorna Suraksha",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/my-health-sampoorna-suraksha__brochure.pdf",
        "hdfc-ergo__my-health-sampoorna-suraksha__brochure.txt",
        [],
    ),
    (
        "hdfc-ergo__my-health-suraksha",
        "HDFC ERGO my:health Suraksha",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/my-health-suraksha__brochure.pdf",
        "hdfc-ergo__my-health-suraksha__brochure.txt",
        [],
    ),
    (
        "hdfc-ergo__my-health-women-suraksha",
        "HDFC ERGO my:health Women Suraksha",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/my-health-women-suraksha__brochure.pdf",
        "hdfc-ergo__my-health-women-suraksha__brochure.txt",
        [],
    ),
    (
        "hdfc-ergo__optima-secure-older-variant",
        "HDFC ERGO Optima Secure (Older / Legacy Variant)",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/my-optima-secure-older-variant__wordings.pdf",
        "hdfc-ergo__my-optima-secure-older-variant__wordings.txt",
        [],
    ),
    (
        "hdfc-ergo__optima-enhance",
        "HDFC ERGO Optima Enhance (Top-up)",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/optima-enhance__wordings.pdf",
        "hdfc-ergo__optima-enhance__wordings.txt",
        [],
    ),
    (
        "hdfc-ergo__optima-plus",
        "HDFC ERGO Optima Plus",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/optima-plus__wordings.pdf",
        "hdfc-ergo__optima-plus__wordings.txt",
        [],
    ),
    (
        "hdfc-ergo__total-health-plan",
        "HDFC ERGO Total Health Plan",
        "hdfc-ergo",
        "rag/corpus/hdfc-ergo/total-health-plan__wordings.pdf",
        "hdfc-ergo__total-health-plan__wordings.txt",
        [],
    ),
    # ICICI Lombard
    (
        "icici-lombard__arogya-sanjeevani",
        "ICICI Lombard Arogya Sanjeevani (Standard)",
        "icici-lombard",
        "rag/corpus/icici-lombard/arogya-sanjeevani__wordings.pdf",
        "icici-lombard__arogya-sanjeevani__wordings.txt",
        [],
    ),
    (
        "icici-lombard__complete-health-umbrella",
        "ICICI Lombard Complete Health Insurance — Umbrella",
        "icici-lombard",
        "rag/corpus/icici-lombard/complete-health-insurance-umbrella__wordings.pdf",
        "icici-lombard__complete-health-insurance-umbrella__wordings.txt",
        [],
    ),
    (
        "icici-lombard__health-advantedge",
        "ICICI Lombard Health Advantedge",
        "icici-lombard",
        "rag/corpus/icici-lombard/health-advantedge__wordings.pdf",
        "icici-lombard__health-advantedge__wordings.txt",
        [],
    ),
    (
        "icici-lombard__health-booster",
        "ICICI Lombard Health Booster (Top-up)",
        "icici-lombard",
        "rag/corpus/icici-lombard/health-booster-top-up__wordings.pdf",
        "icici-lombard__health-booster-top-up__wordings.txt",
        [],
    ),
    (
        "icici-lombard__health-elite-plus",
        "ICICI Lombard Health Elite Plus",
        "icici-lombard",
        "rag/corpus/icici-lombard/health-elite-plus__wordings.pdf",
        "icici-lombard__health-elite-plus__wordings.txt",
        [],
    ),
    # ManipalCigna
    (
        "manipalcigna__prohealth-select",
        "ManipalCigna ProHealth Select",
        "manipalcigna",
        "rag/corpus/manipalcigna/prohealth-select__wordings.pdf",
        "manipalcigna__prohealth-select__wordings.txt",
        [],
    ),
    (
        "manipalcigna__sarvah-param",
        "ManipalCigna Sarvah Param",
        "manipalcigna",
        "rag/corpus/manipalcigna/sarvah-param__wordings.pdf",
        "manipalcigna__sarvah-param__wordings.txt",
        [],
    ),
    # New India
    (
        "new-india__asha-kiran",
        "New India Asha Kiran (Girl Child Family Floater)",
        "new-india",
        "rag/corpus/new-india/asha-kiran-policy__brochure.pdf",
        "new-india__asha-kiran-policy__brochure.txt",
        [],
    ),
    (
        "new-india__janata-mediclaim",
        "New India Janata Mediclaim",
        "new-india",
        "rag/corpus/new-india/janata-mediclaim-policy__wordings.pdf",
        "new-india__janata-mediclaim-policy__wordings.txt",
        [],
    ),
    (
        "new-india__mediclaim-policy",
        "New India Mediclaim Policy (Individual)",
        "new-india",
        "rag/corpus/new-india/new-india-mediclaim-policy__wordings.pdf",
        "new-india__new-india-mediclaim-policy__wordings.txt",
        [],
    ),
    (
        "new-india__universal-health",
        "New India Universal Health Insurance",
        "new-india",
        "rag/corpus/new-india/universal-health-insurance__wordings.pdf",
        "new-india__universal-health-insurance__wordings.txt",
        [],
    ),
    (
        "new-india__yuva-bharat",
        "New India Yuva Bharat Health Policy",
        "new-india",
        "rag/corpus/new-india/yuva-bharat-health-policy__wordings.pdf",
        "new-india__yuva-bharat-health-policy__wordings.txt",
        [],
    ),
    # Niva Bupa
    (
        "niva-bupa__aspire",
        "Niva Bupa Aspire",
        "niva-bupa",
        "rag/corpus/niva-bupa/aspire__wordings.pdf",
        "niva-bupa__aspire__wordings.txt",
        [],
    ),
    (
        "niva-bupa__health-plus-top-up",
        "Niva Bupa Health Plus (Top-up)",
        "niva-bupa",
        "rag/corpus/niva-bupa/health-plus-top-up__wordings.pdf",
        "niva-bupa__health-plus-top-up__wordings.txt",
        [],
    ),
    (
        "niva-bupa__health-premia",
        "Niva Bupa Health Premia",
        "niva-bupa",
        "rag/corpus/niva-bupa/health-premia__wordings.pdf",
        "niva-bupa__health-premia__wordings.txt",
        [],
    ),
    (
        "niva-bupa__reassure-3",
        "Niva Bupa ReAssure 3.0",
        "niva-bupa",
        "rag/corpus/niva-bupa/reassure-3-0__wordings.pdf",
        "niva-bupa__reassure-3-0__wordings.txt",
        [],
    ),
    (
        "niva-bupa__rise",
        "Niva Bupa Rise",
        "niva-bupa",
        "rag/corpus/niva-bupa/rise__wordings.pdf",
        "niva-bupa__rise__wordings.txt",
        [],
    ),
    (
        "niva-bupa__saral-suraksha",
        "Niva Bupa Saral Suraksha Bima (Standard)",
        "niva-bupa",
        "rag/corpus/niva-bupa/saral-suraksha-bima__wordings.pdf",
        "niva-bupa__saral-suraksha-bima__wordings.txt",
        [],
    ),
    # Star
    (
        "star-health__health-premier",
        "Star Health Premier",
        "star-health",
        "rag/corpus/star-health/health-premier__wordings.pdf",
        "star-health__health-premier__wordings.txt",
        [],
    ),
    (
        "star-health__senior-citizens-red-carpet",
        "Star Senior Citizens Red Carpet",
        "star-health",
        "rag/corpus/star-health/senior-citizens-red-carpet__brochure.pdf",
        "star-health__senior-citizens-red-carpet__brochure.txt",
        [],
    ),
    (
        "star-health__star-assure",
        "Star Assure Insurance Policy",
        "star-health",
        "rag/corpus/star-health/star-assure__wordings.pdf",
        "star-health__star-assure__wordings.txt",
        [],
    ),
    (
        "star-health__star-cardiac-care",
        "Star Cardiac Care Insurance",
        "star-health",
        "rag/corpus/star-health/star-cardiac-care__wordings.pdf",
        "star-health__star-cardiac-care__wordings.txt",
        [],
    ),
    (
        "star-health__star-cardiac-care-platinum",
        "Star Cardiac Care Platinum",
        "star-health",
        "rag/corpus/star-health/star-cardiac-care-platinum__wordings.pdf",
        "star-health__star-cardiac-care-platinum__wordings.txt",
        [],
    ),
    # Tata AIG
    (
        "tata-aig__medicare-lite",
        "Tata AIG MediCare Lite",
        "tata-aig",
        "rag/corpus/tata-aig/medicare-lite__cis.pdf",
        "tata-aig__medicare-lite__cis.txt",
        [],
    ),
    (
        "tata-aig__medicare-select",
        "Tata AIG MediCare Select",
        "tata-aig",
        "rag/corpus/tata-aig/medicare-select__brochure.pdf",
        "tata-aig__medicare-select__brochure.txt",
        [],
    ),
]

# ---------------------------------------------------------------------------
# Pattern-based field extractors
# Each returns (value,
# quote) or (None, quote_with_explanation) on miss
# (the two entry-age extractors return (value, unit, quote) instead)
# ---------------------------------------------------------------------------


def _squash(fragment):
    """Collapse every whitespace run in *fragment* to one space and strip it."""
    return re.sub(r"\s+", " ", fragment).strip()


def _to_months(count, unit):
    """Convert *count* expressed in months or years (per *unit*) to months."""
    return count * 12 if "year" in unit.lower() else count


def find_context(text, pattern, max_len=200, flags=re.IGNORECASE):
    """Search *text* for *pattern* and return the match plus a short quote.

    The quote runs from 30 characters before the match to 160 characters
    after it, with whitespace runs collapsed to single spaces, truncated to
    *max_len* characters.

    Returns ``(match, quote)`` on a hit and ``(None, None)`` on a miss.
    """
    m = re.search(pattern, text, flags)
    if not m:
        return None, None
    start = max(0, m.start() - 30)
    end = min(len(text), m.end() + 160)
    return m, _squash(text[start:end])[:max_len]


def extract_uin(text):
    """Return ``(uin, quote)`` for the first UIN-shaped token, else ``(None, msg)``."""
    # IRDAI UIN: 3-letter insurer + 3-5 letter product code + 5 digits + V + 6 digits
    # Examples: HDFHLIP25041V062425 (HDF + HLIP), SHAHLIP22032V052122 (SHA + HLIP),
    #           CHIHLIP23128V012223 (CHI + HLIP), NBHHLIP26042V022526 (NBH + HLIP)
    pat = r"\b([A-Z]{6,9}[0-9]{5}V[0-9]{6})\b"
    m, ctx = find_context(text, pat)
    if m:
        return m.group(1), ctx
    return None, "UIN not found in extracted text"


def extract_min_entry_age(text):
    """Return ``(value, unit, quote)`` for the minimum entry age.

    *unit* is always plural ("days", "years" or "months") so it agrees with
    the "days" used by the range/child patterns and by the driver's default.
    On a miss returns ``(None, None, message)``.
    """
    # Look for "minimum entry age" / "min age" / "91 days"
    pats = [
        (r"[Mm]inimum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*(day|year|month)", "explicit min"),
        (r"[Aa]ge at [Ee]ntry[^.\n]{0,40}?(\d+)\s*(day|year|month)", "age at entry"),
        (r"[Cc]hild[^.\n]{0,40}?(\d+)\s*day", "child entry"),
        (r"(\d+)\s*[Dd]ays\s*(?:to|-|–)\s*\d+\s*[Yy]ears", "range form"),
    ]
    for pat, _ in pats:
        m, ctx = find_context(text, pat)
        if not m:
            continue
        val = int(m.group(1))
        if m.lastindex and m.lastindex >= 2:
            # The pattern captures the singular stem even when the text says
            # "days"; pluralise so every code path reports the same spelling.
            unit = m.group(2).lower() + "s"
        else:
            unit = "days"
        return val, unit, ctx
    return None, None, "Min entry age not found"


def extract_max_entry_age(text):
    """Return ``(value, "years", quote)`` for the maximum entry age.

    On a miss *value* is ``None`` and the third element is an explanation.
    """
    pats = [
        (r"[Mm]aximum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*[Yy]ear", "explicit max"),
        (r"[Ee]ntry [Aa]ge[^.\n]{0,40}?[Uu]p to (\d+)\s*[Yy]ear", "entry age up to"),
        (r"(\d+)\s*[Dd]ays\s*(?:to|-|–)\s*(\d+)\s*[Yy]ears", "range"),
        (r"[Mm]aximum [Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear", "max age"),
    ]
    for pat, _ in pats:
        m, ctx = find_context(text, pat)
        if not m:
            continue
        # Range pattern -> group 2 is max; every other pattern has one group.
        if m.lastindex and m.lastindex >= 2 and m.group(2).isdigit():
            val = int(m.group(2))
        else:
            val = int(m.group(1))
        return val, "years", ctx
    return None, "years", "Max entry age not explicitly stated; check Policy Schedule"


def extract_renewal_age(text):
    """Return ``(max_renewal_age, quote)``.

    A lifelong-renewal phrase yields ``(None, "Lifelong renewability: ...")``.
    The same case-sensitive search that detects the phrase also supplies the
    quote, so a detection always comes with its supporting text.
    """
    m, ctx = find_context(
        text,
        r"[Ll]ifelong\s*[Rr]enew|[Ll]ife[- ]?[Ll]ong|[Nn]o\s+maximum\s+(cover\s+)?ceas|continuous\s+life\s+long",
        flags=0,
    )
    if m:
        return None, "Lifelong renewability" + ((": " + ctx) if ctx else "")
    m, ctx = find_context(text, r"[Mm]aximum\s+[Rr]enewal\s+[Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear")
    if m:
        return int(m.group(1)), ctx
    return None, "Max renewal age not specified; check Policy Schedule"


def extract_sum_insured_options(text):
    """Return ``(sorted_inr_values, quote)`` or ``(None, message)``.

    Amounts written in lakhs/crores near the first "Sum Insured" mention are
    converted to rupees; at least two distinct values are required.
    """
    # Look for currency lists e.g. "3 Lacs, 5 Lacs, 10 Lacs"
    m, ctx = find_context(text, r"[Ss]um\s+[Ii]nsured[^.\n]{0,300}?(\d+[\d,. ]{0,20}(?:Lakhs?|Lacs?|Crores?|L\b|Cr\b))")
    if m:
        # Try to gather numeric values from window
        window = text[max(0, m.start() - 30): m.end() + 400]
        lakh_hits = re.findall(r"(\d+(?:\.\d+)?)\s*(?:Lakhs?|Lacs?|L\b)", window, re.IGNORECASE)
        crore_hits = re.findall(r"(\d+(?:\.\d+)?)\s*(?:Crores?|Cr\b)", window, re.IGNORECASE)
        found = set()
        for hits, rupees_per_unit in ((lakh_hits, 100000), (crore_hits, 10000000)):
            for n in hits:
                v = int(float(n) * rupees_per_unit)
                if 50000 <= v <= 1000000000:
                    found.add(v)
        vals = sorted(found)
        # Require at least 2 distinct values to count this as a real enumeration
        if len(vals) >= 2:
            return vals, ctx[:200]
    return None, "Sum Insured options not enumerated in extracted text; check Policy Schedule"


def extract_initial_waiting(text):
    """Return ``(days, quote)`` for the initial waiting period (default 30)."""
    m, ctx = find_context(text, r"(\d+)\s*[Dd]ays?\s+(?:from\s+the\s+(?:first|date of)|waiting period|of\s+the\s+inception)")
    # Only 15 or 30 is accepted from this loose pattern; anything else falls
    # through to the more specific patterns below.
    if m and int(m.group(1)) in (15, 30):
        return int(m.group(1)), ctx
    m, ctx = find_context(text, r"[Ee]xcl03[^.\n]{0,200}?(\d+)\s*days?")
    if m:
        return int(m.group(1)), ctx
    m, ctx = find_context(text, r"within\s+(\d+)\s*days\s+from\s+the\s+first")
    if m:
        return int(m.group(1)), ctx
    return 30, "Default IRDAI 30-day waiting period applies (not explicitly quoted in extracted snippet)"


def extract_ped_waiting(text):
    """Return ``(months, quote)`` for the pre-existing-disease waiting period."""
    # PED in months
    for pat in (
        r"[Pp]re[- ]existing\s+[Dd]isease\s*(?:\([^)]+\))?[^.\n]{0,300}?(\d+)\s*(months|years)",
        r"PED[^.\n]{0,200}?(\d+)\s*(months|years)",
    ):
        m, ctx = find_context(text, pat)
        if m:
            return _to_months(int(m.group(1)), m.group(2)), ctx
    m, ctx = find_context(text, r"[Ee]xcl01[^.\n]{0,200}?(\d+)\s*months")
    if m:
        return int(m.group(1)), ctx
    return None, "PED waiting period not extracted; check Section 5 / Excl01"


def extract_specific_disease_waiting(text):
    """Return ``(months, quote)`` for the specific-disease waiting (default 24)."""
    for pat in (
        r"(?:listed|specified|named|specific)\s+(?:conditions?|ailments?|diseases?|treatments?)[^.\n]{0,300}?(\d+)\s*(months|years)",
        r"[Ee]xcl02[^.\n]{0,200}?(\d+)\s*(months|years)",
    ):
        m, ctx = find_context(text, pat)
        if m:
            return _to_months(int(m.group(1)), m.group(2)), ctx
    return 24, "Default IRDAI 24-month specific-disease waiting (not explicitly quoted)"


def extract_maternity_waiting(text):
    """Return ``(months, quote)`` for the maternity waiting period, else ``(None, msg)``."""
    for pat in (
        r"[Mm]aternity[^.\n]{0,200}?(\d+)\s*months?\s+(?:waiting|of continuous)",
        r"[Ww]aiting\s+[Pp]eriod[^.\n]{0,50}?[Mm]aternity[^.\n]{0,80}?(\d+)\s*months?",
    ):
        m, ctx = find_context(text, pat)
        if m:
            return int(m.group(1)), ctx
    return None, "Maternity waiting not specified or maternity excluded"


def extract_pre_hosp_days(text):
    """Return ``(days, quote)`` for pre-hospitalisation cover, else ``(None, msg)``.

    Both the "hospitalisation" and "hospitalization" spellings are accepted.
    """
    for pat in (
        r"[Pp]re[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?",
        r"(\d+)\s*days?\s+(?:prior to|before).{0,40}(?:admission|hospitali[sz]ation)",
    ):
        m, ctx = find_context(text, pat)
        if m:
            return int(m.group(1)), ctx
    return None, "Pre-hospitalization days not extracted"


def extract_post_hosp_days(text):
    """Return ``(days, quote)`` for post-hospitalisation cover, else ``(None, msg)``.

    Both the "hospitalisation" and "hospitalization" spellings are accepted.
    """
    for pat in (
        r"[Pp]ost[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?",
        r"(\d+)\s*days?\s+(?:after|post|following).{0,40}discharge",
    ):
        m, ctx = find_context(text, pat)
        if m:
            return int(m.group(1)), ctx
    return None, "Post-hospitalization days not extracted"


def extract_day_care_count(text):
    """Return ``(count, quote)`` for listed day-care procedures (50..2000 only)."""
    for pat in (
        r"(\d{2,4})\s*(?:listed\s+)?[Dd]ay\s*[- ]?[Cc]are\s*(?:[Pp]rocedures?|[Tt]reatments?)",
        r"[Dd]ay\s*[- ]?[Cc]are[^.\n]{0,80}?(\d{2,4})\s*[Pp]rocedures?",
    ):
        m, ctx = find_context(text, pat)
        if m:
            v = int(m.group(1))
            # Values outside this band are treated as noise, not a count.
            if 50 <= v <= 2000:
                return v, ctx
    return None, "Day-care count not enumerated; covered per policy definition"


def extract_ayush(text):
    """Return ``(True, quote)`` if AYUSH / alternative treatment is mentioned."""
    if re.search(r"AYUSH", text):
        _, ctx = find_context(text, r"AYUSH[^.\n]{0,200}")
        return True, ctx
    if re.search(r"[Aa]lternative\s+[Tt]reatment", text):
        _, ctx = find_context(text, r"[Aa]lternative\s+[Tt]reatment[^.\n]{0,200}")
        return True, ctx
    return False, "AYUSH coverage not found in extracted text"


def extract_maternity(text):
    """Return ``(covered, quote)`` for maternity cover.

    A positive benefit description counts only when no explicit
    "not covered / excluded" wording is found; with neither, the result is
    ``False`` with a "presumed excluded" note.
    """
    # Check explicit "maternity not covered" or "Excl18"
    negative = re.search(r"[Mm]aternity[^.\n]{0,80}?(?:not\s+covered|excluded)", text)
    excl_code = re.search(r"Excl18", text)
    positive = re.search(r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)[^.\n]{0,300}?(?:lump\s+sum|Rs\.?\s*\d|INR|deliveries?)", text)
    if positive and not negative:
        # Has positive maternity description
        _, ctx = find_context(text, r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)[^.\n]{0,300}")
        return True, ctx
    if negative or excl_code:
        if negative:
            _, ctx = find_context(text, r"[Mm]aternity[^.\n]{0,200}?(?:not\s+covered|excluded)[^.\n]{0,100}")
        else:
            _, ctx = find_context(text, r"Excl18[^.\n]{0,200}")
        return False, ctx or "Maternity excluded (Excl18)"
    # No explicit mention -> default false for typical retail (most retail base excludes maternity)
    return False, "Maternity not explicitly mentioned; presumed excluded in base"


def extract_newborn(text):
    """Return ``(covered, quote)`` based on the first new-born mention."""
    m = re.search(r"[Nn]ew[ -]?[Bb]orn[^.\n]{0,200}", text)
    if not m:
        return False, "Newborn cover not found; typically tied to maternity option"
    ctx = _squash(text[m.start():m.end() + 50])
    excluded = re.search(r"not\s+covered|excluded", ctx)
    return (not excluded), ctx[:200]


def extract_organ_donor(text):
    """Return ``(covered, quote)`` based on the first "organ donor" mention."""
    m = re.search(r"[Oo]rgan\s+[Dd]onor", text)
    if not m:
        return False, "Organ donor cover not extracted"
    ctx = _squash(text[m.start():m.end() + 200])
    excluded = re.search(r"not\s+covered|excluded", ctx)
    return (not excluded), ctx[:200]


def extract_ncb(text):
    """Return ``(percent, quote)`` for the no-claim / cumulative bonus (5..100)."""
    m = re.search(r"(?:[Nn]o\s+[Cc]laim\s+[Bb]onus|[Cc]umulative\s+[Bb]onus|NCB|cumulative\s+bonus)[^.\n]{0,400}?(\d{1,3})\s*%", text)
    if m:
        v = int(m.group(1))
        if 5 <= v <= 100:
            return v, _squash(text[max(0, m.start()):m.end() + 50])[:220]
    m = re.search(r"(\d{1,3})\s*%\s+(?:increase|bonus)\s+(?:in|of)\s+(?:Sum\s+Insured|SI)", text, re.IGNORECASE)
    if m:
        v = int(m.group(1))
        if 5 <= v <= 100:
            return v, _squash(text[max(0, m.start() - 40):m.end() + 30])[:220]
    return None, "NCB % not extracted; product may use booster/recharge structure"


def extract_restoration(text):
    """Return ``(description, quote)`` for the first restoration-style phrase."""
    # Patterns
    pats = [
        r"[Rr]estor[ae][^.\n]{0,300}",
        r"[Rr]echarge\s+of\s+[Ss]um\s+[Ii]nsured[^.\n]{0,300}",
        r"[Rr]efill[^.\n]{0,300}",
        r"[Rr]eset\s+[Bb]enefit[^.\n]{0,300}",
        r"[Rr]e[- ]?[Ii]nstatement[^.\n]{0,300}",
    ]
    for p in pats:
        m = re.search(p, text)
        if m:
            ctx = _squash(text[m.start():m.end()])[:280]
            return ctx[:240], ctx
    return None, "Restoration benefit not found in extracted text"


def extract_room_rent(text):
    """Return ``(description, quote)`` for the first room-rent cap wording."""
    # Prefer wording that's a capping description, not the room-rent definition
    pats = [
        r"[Nn]o\s+[Rr]oom\s+[Rr]ent\s+(?:[Cc]apping|[Ll]imit|[Ss]ub[- ]?[Ll]imit)[^.\n]{0,150}",
        r"[Rr]oom\s+[Rr]ent\s+No\s+Sub[- ]?Limit[^.\n]{0,100}",
        r"[Ss]ingle\s+[Pp]rivate\s+(?:AC\s+)?[Rr]oom[^.\n]{0,150}",
        r"[Rr]oom\s+[Rr]ent[^.\n]{0,200}?(?:up\s+to|maximum|capped\s+at|limit\s+of|sub[- ]?limit)\s*(?:Rs\.?|`|INR|\d+\s*%)[^.\n]{0,100}",
        r"[Rr]oom\s+[Cc]ategory[^.\n]{0,160}",
        r"[Rr]oom\s+[Rr]ent[^.\n]{0,160}?(\d+\s*%|Rs\.?\s*\d|`\s*\d|INR\s*\d)[^.\n]{0,80}",
    ]
    for p in pats:
        m = re.search(p, text)
        if m:
            ctx = _squash(text[m.start():m.end()])[:240]
            return ctx[:200], ctx
    return None, "Room rent capping not extracted (only definition found, no explicit cap)"


def extract_copayment(text):
    """Return ``(percent, quote)`` for a stated co-payment, else ``(0, msg)``."""
    for pat in (
        r"[Cc]o[- ]?payment\s+of\s+(\d{1,2})\s*%",
        r"(\d{1,2})\s*%\s+[Cc]o[- ]?[Pp]ay",
    ):
        m = re.search(pat, text)
        if m:
            ctx = _squash(text[max(0, m.start() - 30):m.end() + 120])[:240]
            return int(m.group(1)), ctx
    return 0, "No mandatory copay extracted; product may have age-based or zone-based optional copay"


def extract_deductible(text):
    """Return ``(amount_in_rupees, quote)``; amount is ``None`` when not stated."""
    m = re.search(r"[Dd]eductible[^.\n]{0,300}?(?:Rs\.?\s*|INR\s*|₹\s*)(\d[\d,]{2,})", text)
    if m:
        amt = int(m.group(1).replace(",", ""))
        return amt, _squash(text[max(0, m.start() - 30):m.end() + 120])[:240]
    m = re.search(r"[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}", text)
    if m:
        # Deductible is mentioned but no rupee figure was captured.
        return None, _squash(text[m.start():m.end()])[:220]
    return None, "No base deductible (or only optional voluntary deductible add-on)"


def extract_cashless(text):
    """Return ``(True, quote)`` if "cashless" is mentioned, else ``(None, msg)``."""
    if re.search(r"[Cc]ashless", text):
        _, ctx = find_context(text, r"[Cc]ashless[^.\n]{0,200}")
        return True, ctx
    return None, "Cashless mention not found"


def extract_policy_type(text, policy_id=""):
    """Classify the policy as top-up, benefit, hospital-cash or indemnity.

    The policy id is checked first, then text heuristics; indemnity is the
    fallback. Returns ``(policy_type, quote_or_reason)``.
    """
    pid = policy_id.lower()
    # ID-based classification first (most reliable)
    top_up_tags = ("top-up", "supreme-enhance", "health-booster", "optima-enhance", "health-plus-top-up", "extra-care-plus")
    benefit_tags = ("cardiac-care", "cancer-care", "criti-medicare", "criti-care")
    cash_tags = ("hospital-cash", "daily-cash")
    if any(tag in pid for tag in top_up_tags):
        return "top-up", "Top-up / super top-up policy (per product name)"
    if any(tag in pid for tag in benefit_tags):
        return "benefit", "Specialty cardiac/critical illness — benefit-based lump-sum on diagnosis"
    if any(tag in pid for tag in cash_tags):
        return "hospital-cash", "Hospital cash / daily benefit policy"
    # Heuristic on text
    if re.search(r"[Ss]uper\s+[Tt]op[- ]?up|deductible[^.\n]{0,100}aggregate|[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}[Ss]um\s+[Ii]nsured", text):
        return "top-up", "Top-up / super top-up policy (kicks in above a deductible)"
    if re.search(r"[Hh]ospital\s+[Cc]ash|[Dd]aily\s+[Cc]ash\s+[Bb]enefit", text):
        return "hospital-cash", "Hospital cash / daily benefit policy"
    # Indemnity is the default for retail health
    if re.search(r"[Ii]ndemnity|[Hh]ospitali[sz]ation\s+[Ee]xpenses?\s+[Ii]ndemnif|[Ii]ndemnif", text):
        _, ctx = find_context(text, r"[Ii]ndemnity|[Ii]ndemnif")
        return "indemnity", ctx or "Indemnity-based health insurance"
    return "indemnity", "Default indemnity (no explicit alternate type detected)"


# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------
def _fact(source_pdf, value, quote, fallback, unit=None):
    """Build one schema entry: value, optional unit, source path, capped quote."""
    fact = {"value": value}
    if unit is not None:
        fact["unit"] = unit
    fact["source_pdf_path"] = source_pdf
    fact["source_quote"] = (quote or fallback)[:240]
    return fact


def curate_one(entry):
    """Run every extractor over one manifest entry's cached text.

    Returns ``(facts_dict, completeness_pct)`` on success, or
    ``(None, reason)`` when the cached text file is missing or shorter than
    1000 characters.
    """
    policy_id, policy_name, insurer, primary_pdf, txt_name, supporting = entry
    txt_path = os.path.join(CACHE, txt_name)
    if not os.path.exists(txt_path):
        return None, f"text cache missing: {txt_name}"
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    if len(text) < 1000:
        return None, f"too short ({len(text)} chars)"

    uin_val, uin_quote = extract_uin(text)
    min_age, min_unit, min_q = extract_min_entry_age(text)
    max_age, max_unit, max_q = extract_max_entry_age(text)
    renewal_age, renewal_q = extract_renewal_age(text)
    si_vals, si_q = extract_sum_insured_options(text)
    init_wait, init_q = extract_initial_waiting(text)
    ped_m, ped_q = extract_ped_waiting(text)
    sd_m, sd_q = extract_specific_disease_waiting(text)
    mat_m, mat_q = extract_maternity_waiting(text)
    pre_d, pre_q = extract_pre_hosp_days(text)
    post_d, post_q = extract_post_hosp_days(text)
    dc_n, dc_q = extract_day_care_count(text)
    ayush_b, ayush_q = extract_ayush(text)
    mat_b, mat_bq = extract_maternity(text)
    nb_b, nb_q = extract_newborn(text)
    od_b, od_q = extract_organ_donor(text)
    ncb_v, ncb_q = extract_ncb(text)
    restore_v, restore_q = extract_restoration(text)
    rr_v, rr_q = extract_room_rent(text)
    copay_v, copay_q = extract_copayment(text)
    ded_v, ded_q = extract_deductible(text)
    cash_v, cash_q = extract_cashless(text)
    ptype_v, ptype_q = extract_policy_type(text, policy_id)

    def fact(value, quote, fallback, unit=None):
        return _fact(primary_pdf, value, quote, fallback, unit)

    # Compose JSON (key order is the output order of the written file)
    j = {
        "policy_id": policy_id,
        "policy_name": policy_name,
        "insurer_slug": insurer,
        "uin_code": fact(uin_val, uin_quote, ""),
        "min_entry_age": fact(min_age, min_q, "Not explicitly stated; per Policy Schedule", unit=min_unit or "days"),
        "max_entry_age": fact(max_age, max_q, "Not explicitly stated; per Policy Schedule", unit=max_unit or "years"),
        "max_renewal_age": fact(renewal_age, renewal_q, "Not specified"),
        "sum_insured_options": fact(si_vals, si_q, "Per Policy Schedule", unit="INR"),
        "initial_waiting_period_days": fact(init_wait, init_q, "Per IRDAI standard"),
        "pre_existing_disease_waiting_months": fact(ped_m, ped_q, "PED waiting per policy wording"),
        "specific_disease_waiting_months": fact(sd_m, sd_q, "Specific disease waiting per IRDAI standard"),
        "maternity_waiting_months": fact(mat_m, mat_q, "Maternity waiting only applies if maternity covered/opted"),
        "pre_hospitalization_days": fact(pre_d, pre_q, "Pre-hosp days per Policy Schedule"),
        "post_hospitalization_days": fact(post_d, post_q, "Post-hosp days per Policy Schedule"),
        "day_care_treatments_count": fact(dc_n, dc_q, "Day care covered per definition; count not enumerated"),
        "ayush_coverage": fact(ayush_b, ayush_q, "AYUSH cover not explicitly found"),
        "maternity_coverage": fact(mat_b, mat_bq, "Maternity status not explicitly extracted"),
        "newborn_coverage": fact(nb_b, nb_q, "Newborn cover status not extracted"),
        "organ_donor_expenses": fact(od_b, od_q, "Organ donor benefit not extracted"),
        "no_claim_bonus_pct": fact(ncb_v, ncb_q, "NCB % not extracted"),
        "restoration_benefit": fact(restore_v, restore_q, "Restoration not found in extracted text"),
        "room_rent_capping": fact(rr_v, rr_q, "Room rent capping not extracted"),
        "copayment_pct": fact(copay_v, copay_q, "No mandatory copay"),
        "deductible_amount": fact(ded_v, ded_q, "No base deductible"),
        "network_hospital_count": {
            "value": None,
            "source_url": None,
            "source_quote": "Insurer-level metric; not extracted in this curation pass"
        },
        # A missing "cashless" mention is recorded as True, not as unknown.
        "cashless_treatment_supported": fact(
            cash_v if cash_v is not None else True,
            cash_q,
            "Cashless implicit via insurer network",
        ),
        "claim_settlement_ratio": {
            "value": None,
            "source_url": None,
            "source_quote": "Insurer-level metric (IRDAI Annual Report); not extracted"
        },
        "tat_cashless_authorization_hours": {
            "value": None,
            "source_pdf_path": None,
            "source_quote": "TAT not specified in policy wording; governed by IRDAI Master Circular"
        },
        "policy_type": fact(ptype_v, ptype_q, "Policy type inferred from product structure"),
    }

    # Completeness: count populated fields (value != None & not insurer-level)
    pdf_fields = [
        "uin_code", "min_entry_age", "max_entry_age", "sum_insured_options",
        "initial_waiting_period_days", "pre_existing_disease_waiting_months",
        "specific_disease_waiting_months", "pre_hospitalization_days",
        "post_hospitalization_days", "day_care_treatments_count",
        "ayush_coverage", "maternity_coverage", "newborn_coverage",
        "organ_donor_expenses", "no_claim_bonus_pct", "restoration_benefit",
        "room_rent_capping", "copayment_pct", "policy_type",
        "cashless_treatment_supported"
    ]
    filled = sum(1 for f in pdf_fields if j[f]["value"] not in (None, ""))
    pct = int(round(filled / len(pdf_fields) * 100))

    j["_meta"] = {
        "curated_at": "2026-05-14",
        "primary_source_pdf": primary_pdf,
        "supporting_source_pdfs": supporting,
        "completeness_pct": pct,
        "notes": "Pattern-based extraction from local PDF via pdfplumber. Insurer-level metrics (CSR, network count) left null pending downstream backfill."
    }
    return j, pct


def main():
    """Curate every manifest entry and write one JSON file per policy.

    Entries whose output file already exists are left untouched; entries that
    cannot be curated or score below 50% completeness are reported and
    skipped. Returns ``(results, skipped)`` as lists of ``(policy_id, info)``.
    """
    results = []
    skipped = []
    total = len(MANIFEST)
    for i, entry in enumerate(MANIFEST, 1):
        policy_id = entry[0]
        out_path = os.path.join(OUT_DIR, f"{policy_id}.json")
        # Skip if already exists
        if os.path.exists(out_path):
            print(f"[{i}/{total}] {policy_id}: EXISTS — skipping")
            continue
        # On failure curate_one returns (None, reason), so pct holds the reason.
        j, pct = curate_one(entry)
        if j is None:
            print(f"[{i}/{total}] {policy_id}: SKIP — {pct}")
            skipped.append((policy_id, pct))
            continue
        if pct < 50:
            print(f"[{i}/{total}] {policy_id}: LOW {pct}% — skipping (below threshold)")
            skipped.append((policy_id, f"low completeness {pct}%"))
            continue
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(j, f, indent=2, ensure_ascii=False)
        print(f"[{i}/{total}] {policy_id}: {pct}%")
        results.append((policy_id, pct))

    print()
    print(f"Wrote {len(results)} JSONs.")
    print(f"Skipped {len(skipped)}.")
    if results:
        avg = sum(p for _, p in results) / len(results)
        print(f"Average completeness: {avg:.1f}%")
    return results, skipped


if __name__ == "__main__":
    main()