Spaces:
Sleeping
Sleeping
| """Curate batch 2 policy_facts JSONs from extracted text cache. | |
| Pattern-based field extraction matched to the schema used by batch 1. | |
| Writes one JSON per policy into 40-data/policy_facts/. | |
| """ | |
| import os | |
| import re | |
| import json | |
| import sys | |
# Repository root, taken as two directory levels above this script's location.
BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Directory holding the extracted-text files named in MANIFEST.
CACHE = "/tmp/claude/policy_extract/text_cache"
# Destination directory for the curated per-policy JSON files.
OUT_DIR = os.path.join(BASE, "40-data/policy_facts")
# Import-time side effect: make sure the output directory exists.
os.makedirs(OUT_DIR, exist_ok=True)
# ---------------------------------------------------------------------------
# Batch 2 manifest: (policy_id, policy_name, insurer_slug, primary_pdf_rel,
#                    text_cache_filename, [supporting_pdf_rel ... optional])
# Excluded from batch (already curated):
#   aditya-birla  activ-assure-diamond + activ-one
#   bajaj-allianz health-guard-gold + extra-care-plus
#   care-health   care-supreme + care-classic + care-senior
#   hdfc-ergo     my-optima-secure + optima-restore
#   icici-lombard elevate + health-shield-360 + complete-health
#   manipalcigna  prohealth-prime + prohealth-protect (both from all-variants)
#   new-india     new-india-floater-mediclaim
#   niva-bupa     reassure-2 + senior-first + health-companion
#   star-health   family-health-optima + star-comprehensive
#   tata-aig      medicare + medicare-premier
# ---------------------------------------------------------------------------
MANIFEST = [
    # ABHI
    ("aditya-birla__activ-health", "Aditya Birla Activ Health (Platinum Enhanced / Essential)", "aditya-birla",
     "rag/corpus/aditya-birla/activ-health-individual__wordings.pdf",
     "aditya-birla__activ-health-individual__wordings.txt", []),
    # Bajaj
    ("bajaj-allianz__comprehensive-care-plan", "Bajaj Allianz Comprehensive Care Plan", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/comprehensive-care-plan__wordings.pdf",
     "bajaj-allianz__comprehensive-care-plan__wordings.txt", []),
    ("bajaj-allianz__global-health-care", "Bajaj Allianz Global Health Care", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/global-health-care__wordings.pdf",
     "bajaj-allianz__global-health-care__wordings.txt", []),
    ("bajaj-allianz__health-guard", "Bajaj Allianz Health Guard (Silver / Gold / Platinum)", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/health-guard__wordings.pdf",
     "bajaj-allianz__health-guard__wordings.txt", []),
    ("bajaj-allianz__silver-health", "Bajaj Allianz Silver Health (Senior Citizen)", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/silver-health__cis.pdf",
     "bajaj-allianz__silver-health__cis.txt", []),
    ("bajaj-allianz__tax-gain", "Bajaj Allianz Tax Gain", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/tax-gain__cis.pdf",
     "bajaj-allianz__tax-gain__cis.txt", []),
    # Care
    ("care-health__care-advantage", "Care Health Care Advantage", "care-health",
     "rag/corpus/care-health/care-advantage__brochure.pdf",
     "care-health__care-advantage__brochure.txt", []),
    ("care-health__care-supreme-enhance", "Care Health Care Supreme Enhance (Top-up)", "care-health",
     "rag/corpus/care-health/care-supreme-enhance__wordings.pdf",
     "care-health__care-supreme-enhance__wordings.txt", []),
    ("care-health__ultimate-care", "Care Health Ultimate Care", "care-health",
     "rag/corpus/care-health/ultimate-care__wordings.pdf",
     "care-health__ultimate-care__wordings.txt", []),
    # HDFC ERGO
    ("hdfc-ergo__energy", "HDFC ERGO Energy (Diabetes / Hypertension)", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/energy-diabetes-hypertension__wordings.pdf",
     "hdfc-ergo__energy-diabetes-hypertension__wordings.txt", []),
    ("hdfc-ergo__my-health-medisure-prime", "HDFC ERGO my:health Medisure Prime", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-medisure-prime__wordings.pdf",
     "hdfc-ergo__my-health-medisure-prime__wordings.txt", []),
    ("hdfc-ergo__my-health-sampoorna-suraksha", "HDFC ERGO my:health Sampoorna Suraksha", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-sampoorna-suraksha__brochure.pdf",
     "hdfc-ergo__my-health-sampoorna-suraksha__brochure.txt", []),
    ("hdfc-ergo__my-health-suraksha", "HDFC ERGO my:health Suraksha", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-suraksha__brochure.pdf",
     "hdfc-ergo__my-health-suraksha__brochure.txt", []),
    ("hdfc-ergo__my-health-women-suraksha", "HDFC ERGO my:health Women Suraksha", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-women-suraksha__brochure.pdf",
     "hdfc-ergo__my-health-women-suraksha__brochure.txt", []),
    ("hdfc-ergo__optima-secure-older-variant", "HDFC ERGO Optima Secure (Older / Legacy Variant)", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-optima-secure-older-variant__wordings.pdf",
     "hdfc-ergo__my-optima-secure-older-variant__wordings.txt", []),
    ("hdfc-ergo__optima-enhance", "HDFC ERGO Optima Enhance (Top-up)", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/optima-enhance__wordings.pdf",
     "hdfc-ergo__optima-enhance__wordings.txt", []),
    ("hdfc-ergo__optima-plus", "HDFC ERGO Optima Plus", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/optima-plus__wordings.pdf",
     "hdfc-ergo__optima-plus__wordings.txt", []),
    ("hdfc-ergo__total-health-plan", "HDFC ERGO Total Health Plan", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/total-health-plan__wordings.pdf",
     "hdfc-ergo__total-health-plan__wordings.txt", []),
    # ICICI Lombard
    ("icici-lombard__arogya-sanjeevani", "ICICI Lombard Arogya Sanjeevani (Standard)", "icici-lombard",
     "rag/corpus/icici-lombard/arogya-sanjeevani__wordings.pdf",
     "icici-lombard__arogya-sanjeevani__wordings.txt", []),
    # NOTE(review): the dash in this name was mis-encoded in the source;
    # an em dash is assumed here -- confirm against the batch 1 naming.
    ("icici-lombard__complete-health-umbrella", "ICICI Lombard Complete Health Insurance — Umbrella", "icici-lombard",
     "rag/corpus/icici-lombard/complete-health-insurance-umbrella__wordings.pdf",
     "icici-lombard__complete-health-insurance-umbrella__wordings.txt", []),
    ("icici-lombard__health-advantedge", "ICICI Lombard Health Advantedge", "icici-lombard",
     "rag/corpus/icici-lombard/health-advantedge__wordings.pdf",
     "icici-lombard__health-advantedge__wordings.txt", []),
    ("icici-lombard__health-booster", "ICICI Lombard Health Booster (Top-up)", "icici-lombard",
     "rag/corpus/icici-lombard/health-booster-top-up__wordings.pdf",
     "icici-lombard__health-booster-top-up__wordings.txt", []),
    ("icici-lombard__health-elite-plus", "ICICI Lombard Health Elite Plus", "icici-lombard",
     "rag/corpus/icici-lombard/health-elite-plus__wordings.pdf",
     "icici-lombard__health-elite-plus__wordings.txt", []),
    # ManipalCigna
    ("manipalcigna__prohealth-select", "ManipalCigna ProHealth Select", "manipalcigna",
     "rag/corpus/manipalcigna/prohealth-select__wordings.pdf",
     "manipalcigna__prohealth-select__wordings.txt", []),
    ("manipalcigna__sarvah-param", "ManipalCigna Sarvah Param", "manipalcigna",
     "rag/corpus/manipalcigna/sarvah-param__wordings.pdf",
     "manipalcigna__sarvah-param__wordings.txt", []),
    # New India
    ("new-india__asha-kiran", "New India Asha Kiran (Girl Child Family Floater)", "new-india",
     "rag/corpus/new-india/asha-kiran-policy__brochure.pdf",
     "new-india__asha-kiran-policy__brochure.txt", []),
    ("new-india__janata-mediclaim", "New India Janata Mediclaim", "new-india",
     "rag/corpus/new-india/janata-mediclaim-policy__wordings.pdf",
     "new-india__janata-mediclaim-policy__wordings.txt", []),
    ("new-india__mediclaim-policy", "New India Mediclaim Policy (Individual)", "new-india",
     "rag/corpus/new-india/new-india-mediclaim-policy__wordings.pdf",
     "new-india__new-india-mediclaim-policy__wordings.txt", []),
    ("new-india__universal-health", "New India Universal Health Insurance", "new-india",
     "rag/corpus/new-india/universal-health-insurance__wordings.pdf",
     "new-india__universal-health-insurance__wordings.txt", []),
    ("new-india__yuva-bharat", "New India Yuva Bharat Health Policy", "new-india",
     "rag/corpus/new-india/yuva-bharat-health-policy__wordings.pdf",
     "new-india__yuva-bharat-health-policy__wordings.txt", []),
    # Niva Bupa
    ("niva-bupa__aspire", "Niva Bupa Aspire", "niva-bupa",
     "rag/corpus/niva-bupa/aspire__wordings.pdf",
     "niva-bupa__aspire__wordings.txt", []),
    ("niva-bupa__health-plus-top-up", "Niva Bupa Health Plus (Top-up)", "niva-bupa",
     "rag/corpus/niva-bupa/health-plus-top-up__wordings.pdf",
     "niva-bupa__health-plus-top-up__wordings.txt", []),
    ("niva-bupa__health-premia", "Niva Bupa Health Premia", "niva-bupa",
     "rag/corpus/niva-bupa/health-premia__wordings.pdf",
     "niva-bupa__health-premia__wordings.txt", []),
    ("niva-bupa__reassure-3", "Niva Bupa ReAssure 3.0", "niva-bupa",
     "rag/corpus/niva-bupa/reassure-3-0__wordings.pdf",
     "niva-bupa__reassure-3-0__wordings.txt", []),
    ("niva-bupa__rise", "Niva Bupa Rise", "niva-bupa",
     "rag/corpus/niva-bupa/rise__wordings.pdf",
     "niva-bupa__rise__wordings.txt", []),
    ("niva-bupa__saral-suraksha", "Niva Bupa Saral Suraksha Bima (Standard)", "niva-bupa",
     "rag/corpus/niva-bupa/saral-suraksha-bima__wordings.pdf",
     "niva-bupa__saral-suraksha-bima__wordings.txt", []),
    # Star
    ("star-health__health-premier", "Star Health Premier", "star-health",
     "rag/corpus/star-health/health-premier__wordings.pdf",
     "star-health__health-premier__wordings.txt", []),
    ("star-health__senior-citizens-red-carpet", "Star Senior Citizens Red Carpet", "star-health",
     "rag/corpus/star-health/senior-citizens-red-carpet__brochure.pdf",
     "star-health__senior-citizens-red-carpet__brochure.txt", []),
    ("star-health__star-assure", "Star Assure Insurance Policy", "star-health",
     "rag/corpus/star-health/star-assure__wordings.pdf",
     "star-health__star-assure__wordings.txt", []),
    ("star-health__star-cardiac-care", "Star Cardiac Care Insurance", "star-health",
     "rag/corpus/star-health/star-cardiac-care__wordings.pdf",
     "star-health__star-cardiac-care__wordings.txt", []),
    ("star-health__star-cardiac-care-platinum", "Star Cardiac Care Platinum", "star-health",
     "rag/corpus/star-health/star-cardiac-care-platinum__wordings.pdf",
     "star-health__star-cardiac-care-platinum__wordings.txt", []),
    # Tata AIG
    ("tata-aig__medicare-lite", "Tata AIG MediCare Lite", "tata-aig",
     "rag/corpus/tata-aig/medicare-lite__cis.pdf",
     "tata-aig__medicare-lite__cis.txt", []),
    ("tata-aig__medicare-select", "Tata AIG MediCare Select", "tata-aig",
     "rag/corpus/tata-aig/medicare-select__brochure.pdf",
     "tata-aig__medicare-select__brochure.txt", []),
]
| # --------------------------------------------------------------------------- | |
| # Pattern-based field extractors | |
| # Each returns (value, quote) or (None, quote_with_explanation) on miss | |
| # --------------------------------------------------------------------------- | |
def find_context(text, pattern, max_len=200, flags=re.IGNORECASE):
    """Search *text* for *pattern* and return the match plus a quotable snippet.

    Args:
        text: Full extracted document text.
        pattern: Regular expression to search for.
        max_len: Maximum length of the returned snippet.
        flags: ``re`` flags for the search (case-insensitive by default).

    Returns:
        ``(match, snippet)`` on a hit, where the snippet spans 30 characters
        before the match to 160 after it, with whitespace runs collapsed to
        single spaces. ``(None, None)`` when the pattern does not match.
    """
    match = re.search(pattern, text, flags)
    if not match:
        return None, None
    lo = max(0, match.start() - 30)
    hi = min(len(text), match.end() + 160)
    snippet = re.sub(r"\s+", " ", text[lo:hi]).strip()
    return match, snippet[:max_len]
def extract_uin(text):
    """Return ``(uin, quote)`` for the first IRDAI UIN-shaped token in *text*.

    The pattern accepts 6-9 letters, 5 digits, the letter V and 6 digits.
    Examples: HDFHLIP25041V062425 (HDF + HLIP), SHAHLIP22032V052122 (SHA + HLIP),
    CHIHLIP23128V012223 (CHI + HLIP), NBHHLIP26042V022526 (NBH + HLIP).
    Matching goes through find_context and is therefore case-insensitive.
    On a miss the value is None and the quote explains why.
    """
    uin_pattern = r"\b([A-Z]{6,9}[0-9]{5}V[0-9]{6})\b"
    m, ctx = find_context(text, uin_pattern)
    if m:
        return m.group(1), ctx
    return None, "UIN not found in extracted text"
def extract_min_entry_age(text):
    """Return ``(value, unit, quote)`` for the minimum entry age.

    Patterns are tried in order and the first hit wins. The unit is always
    plural ("days", "months" or "years") so it agrees with the "days" default
    used for patterns that capture no unit and with extract_max_entry_age.
    On a miss the result is ``(None, None, <explanation>)``.
    """
    patterns = [
        # Explicit "minimum entry age ... N day/year/month".
        r"[Mm]inimum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*(day|year|month)",
        # "Age at entry ... N day/year/month".
        r"[Aa]ge at [Ee]ntry[^.\n]{0,40}?(\d+)\s*(day|year|month)",
        # Child entry expressed in days (e.g. "91 days").
        r"[Cc]hild[^.\n]{0,40}?(\d+)\s*day",
        # Range form "N days to M years"; hyphen, en dash and em dash accepted.
        r"(\d+)\s*[Dd]ays\s*(?:to|-|–|—)\s*\d+\s*[Yy]ears",
    ]
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if not m:
            continue
        value = int(m.group(1))
        if m.lastindex and m.lastindex >= 2:
            # The capture is singular ("day"); normalise to the plural form.
            unit = m.group(2).lower() + "s"
        else:
            unit = "days"
        return value, unit, ctx
    return None, None, "Min entry age not found"
def extract_max_entry_age(text):
    """Return ``(value, "years", quote)`` for the maximum entry age.

    Patterns are tried in order and the first hit wins. For the range form
    ("N days to M years") the second captured number is the maximum; every
    other pattern captures the maximum directly. On a miss the value is None.
    """
    patterns = [
        # Explicit "maximum entry age ... N years".
        r"[Mm]aximum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*[Yy]ear",
        # "Entry age ... up to N years".
        r"[Ee]ntry [Aa]ge[^.\n]{0,40}?[Uu]p to (\d+)\s*[Yy]ear",
        # Range form; hyphen, en dash and em dash accepted as separators.
        r"(\d+)\s*[Dd]ays\s*(?:to|-|–|—)\s*(\d+)\s*[Yy]ears",
        # Generic "maximum age ... N years".
        r"[Mm]aximum [Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear",
    ]
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if not m:
            continue
        # Only the range pattern has a second group, and it holds the maximum.
        if m.lastindex and m.lastindex >= 2:
            value = int(m.group(2))
        else:
            value = int(m.group(1))
        return value, "years", ctx
    return None, "years", "Max entry age not explicitly stated; check Policy Schedule"
def extract_renewal_age(text):
    """Return ``(max_renewal_age, quote)``.

    Lifelong renewability is reported as value None with a quote starting
    "Lifelong renewability". Otherwise an explicit "maximum renewal age N
    years" is returned, or None with an explanation when nothing is found.
    """
    # Same case-sensitive pattern for detection and for the quote, so the
    # quote always shows the wording that triggered the lifelong verdict.
    lifelong = (
        r"[Ll]ifelong\s*[Rr]enew|[Ll]ife[- ]?[Ll]ong"
        r"|[Nn]o\s+maximum\s+(cover\s+)?ceas|continuous\s+life\s+long"
    )
    m, ctx = find_context(text, lifelong, flags=0)
    if m:
        return None, "Lifelong renewability" + ((": " + ctx) if ctx else "")
    m, ctx = find_context(text, r"[Mm]aximum\s+[Rr]enewal\s+[Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear")
    if m:
        return int(m.group(1)), ctx
    return None, "Max renewal age not specified; check Policy Schedule"
def extract_sum_insured_options(text):
    """Return ``(sorted_amounts_in_inr, quote)`` for enumerated Sum Insured options.

    Looks for currency lists such as "3 Lacs, 5 Lacs, 10 Lacs" near the first
    "Sum Insured" mention that is followed by a Lakh/Crore figure. Amounts
    outside 50,000..1,000,000,000 are discarded, and at least two distinct
    amounts are required; otherwise the value is None with an explanation.
    """
    m, ctx = find_context(
        text,
        r"[Ss]um\s+[Ii]nsured[^.\n]{0,300}?(\d+[\d,. ]{0,20}(?:Lakhs?|Lacs?|Crores?|L\b|Cr\b))",
    )
    if m:
        # Gather every figure from a window around the hit, not just the first.
        window = text[max(0, m.start() - 30): m.end() + 400]
        unit_patterns = (
            (r"(\d+(?:\.\d+)?)\s*(?:Lakhs?|Lacs?|L\b)", 100000),
            (r"(\d+(?:\.\d+)?)\s*(?:Crores?|Cr\b)", 10000000),
        )
        amounts = set()
        for pattern, multiplier in unit_patterns:
            for number in re.findall(pattern, window, re.IGNORECASE):
                try:
                    amount = int(float(number) * multiplier)
                except (ValueError, OverflowError):
                    # Unparseable or absurdly long digit run; ignore it.
                    continue
                if 50000 <= amount <= 1000000000:
                    amounts.add(amount)
        vals = sorted(amounts)
        # Require at least 2 distinct values to count this as a real enumeration.
        if len(vals) >= 2:
            return vals, ctx[:200]
    return None, "Sum Insured options not enumerated in extracted text; check Policy Schedule"
def extract_initial_waiting(text):
    """Return ``(days, quote)`` for the initial waiting period.

    The generic "N days from the first / waiting period / of the inception"
    pattern is trusted only when N is 15 or 30. Failing that, the number
    following an "Excl03" reference or a "within N days from the first"
    phrase is used. When nothing matches, 30 is returned with a quote saying
    the default was applied.
    """
    m, ctx = find_context(
        text,
        r"(\d+)\s*[Dd]ays?\s+(?:from\s+the\s+(?:first|date of)|waiting period|of\s+the\s+inception)",
    )
    if m and int(m.group(1)) in (15, 30):
        return int(m.group(1)), ctx
    fallbacks = (
        r"[Ee]xcl03[^.\n]{0,200}?(\d+)\s*days?",
        r"within\s+(\d+)\s*days\s+from\s+the\s+first",
    )
    for pattern in fallbacks:
        m, ctx = find_context(text, pattern)
        if m:
            return int(m.group(1)), ctx
    return 30, "Default IRDAI 30-day waiting period applies (not explicitly quoted in extracted snippet)"
def extract_ped_waiting(text):
    """Return ``(months, quote)`` for the pre-existing disease waiting period.

    Figures stated in years are converted to months. Patterns are tried in
    order: "pre-existing disease ...", "PED ...", then an "Excl01" reference
    (months only). On a miss the value is None with an explanation.
    """
    patterns_with_unit = (
        r"[Pp]re[- ]existing\s+[Dd]isease\s*(?:\([^)]+\))?[^.\n]{0,300}?(\d+)\s*(months|years)",
        r"PED[^.\n]{0,200}?(\d+)\s*(months|years)",
    )
    for pattern in patterns_with_unit:
        m, ctx = find_context(text, pattern)
        if m:
            count = int(m.group(1))
            months = count * 12 if "year" in m.group(2).lower() else count
            return months, ctx
    m, ctx = find_context(text, r"[Ee]xcl01[^.\n]{0,200}?(\d+)\s*months")
    if m:
        return int(m.group(1)), ctx
    return None, "PED waiting period not extracted; check Section 5 / Excl01"
def extract_specific_disease_waiting(text):
    """Return ``(months, quote)`` for the specific-disease waiting period.

    Tries a "listed/specified/named/specific conditions ... N months|years"
    phrase, then an "Excl02" reference; years are converted to months. When
    neither matches, 24 is returned with a quote saying the default was used.
    """
    patterns = (
        r"(?:listed|specified|named|specific)\s+(?:conditions?|ailments?|diseases?|treatments?)[^.\n]{0,300}?(\d+)\s*(months|years)",
        r"[Ee]xcl02[^.\n]{0,200}?(\d+)\s*(months|years)",
    )
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if m:
            count = int(m.group(1))
            months = count * 12 if "year" in m.group(2).lower() else count
            return months, ctx
    return 24, "Default IRDAI 24-month specific-disease waiting (not explicitly quoted)"
def extract_maternity_waiting(text):
    """Return ``(months, quote)`` for the maternity waiting period.

    Matches either "maternity ... N months waiting / of continuous" or
    "waiting period ... maternity ... N months". On a miss the value is None
    with an explanation.
    """
    patterns = (
        r"[Mm]aternity[^.\n]{0,200}?(\d+)\s*months?\s+(?:waiting|of continuous)",
        r"[Ww]aiting\s+[Pp]eriod[^.\n]{0,50}?[Mm]aternity[^.\n]{0,80}?(\d+)\s*months?",
    )
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if m:
            return int(m.group(1)), ctx
    return None, "Maternity waiting not specified or maternity excluded"
def extract_pre_hosp_days(text):
    """Return ``(days, quote)`` for pre-hospitalisation cover.

    Both the "-isation" and "-ization" spellings are accepted in the primary
    pattern, matching what the fallback pattern already allowed. On a miss
    the value is None with an explanation.
    """
    patterns = (
        r"[Pp]re[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?",
        r"(\d+)\s*days?\s+(?:prior to|before).{0,40}(?:admission|hospitali[sz]ation)",
    )
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if m:
            return int(m.group(1)), ctx
    return None, "Pre-hospitalization days not extracted"
def extract_post_hosp_days(text):
    """Return ``(days, quote)`` for post-hospitalisation cover.

    The primary pattern accepts both the "-isation" and "-ization" spellings;
    the fallback looks for "N days after/post/following ... discharge". On a
    miss the value is None with an explanation.
    """
    patterns = (
        r"[Pp]ost[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?",
        r"(\d+)\s*days?\s+(?:after|post|following).{0,40}discharge",
    )
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if m:
            return int(m.group(1)), ctx
    return None, "Post-hospitalization days not extracted"
def extract_day_care_count(text):
    """Return ``(count, quote)`` for the number of listed day-care procedures.

    Accepts "N day care procedures/treatments" or "day care ... N procedures".
    A count is used only when it lies in 50..2000; anything else is treated
    as a miss for that pattern. On a miss the value is None.
    """
    patterns = (
        r"(\d{2,4})\s*(?:listed\s+)?[Dd]ay\s*[- ]?[Cc]are\s*(?:[Pp]rocedures?|[Tt]reatments?)",
        r"[Dd]ay\s*[- ]?[Cc]are[^.\n]{0,80}?(\d{2,4})\s*[Pp]rocedures?",
    )
    for pattern in patterns:
        m, ctx = find_context(text, pattern)
        if m:
            count = int(m.group(1))
            if 50 <= count <= 2000:
                return count, ctx
    return None, "Day-care count not enumerated; covered per policy definition"
def extract_ayush(text):
    """Return ``(covered, quote)`` for AYUSH / alternative-treatment cover.

    Any upper-case "AYUSH" mention, or an "alternative treatment" mention,
    counts as covered. Otherwise the value is False with an explanation.
    """
    if re.search(r"AYUSH", text):
        _, ctx = find_context(text, r"AYUSH[^.\n]{0,200}")
        return True, ctx
    if re.search(r"[Aa]lternative\s+[Tt]reatment", text):
        _, ctx = find_context(text, r"[Aa]lternative\s+[Tt]reatment[^.\n]{0,200}")
        return True, ctx
    return False, "AYUSH coverage not found in extracted text"
def extract_maternity(text):
    """Return ``(covered, quote)`` for maternity cover.

    A positive description (maternity expenses/cover/benefit followed by a
    lump sum, rupee amount or deliveries) counts as covered unless an explicit
    "maternity ... not covered/excluded" phrase is also present. An explicit
    exclusion phrase or an "Excl18" reference yields False. With no mention
    at all, maternity is presumed excluded.
    """
    explicit_exclusion = re.search(r"[Mm]aternity[^.\n]{0,80}?(?:not\s+covered|excluded)", text)
    exclusion_code = re.search(r"Excl18", text)
    positive = re.search(
        r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)[^.\n]{0,300}?(?:lump\s+sum|Rs\.?\s*\d|INR|deliveries?)",
        text,
    )
    if positive and not explicit_exclusion:
        # Has positive maternity description.
        _, ctx = find_context(text, r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)[^.\n]{0,300}")
        return True, ctx
    if explicit_exclusion:
        _, ctx = find_context(text, r"[Mm]aternity[^.\n]{0,200}?(?:not\s+covered|excluded)[^.\n]{0,100}")
        return False, ctx or "Maternity excluded (Excl18)"
    if exclusion_code:
        _, ctx = find_context(text, r"Excl18[^.\n]{0,200}")
        return False, ctx or "Maternity excluded (Excl18)"
    # No explicit mention -> default false for typical retail
    # (most retail base plans exclude maternity).
    return False, "Maternity not explicitly mentioned; presumed excluded in base"
def extract_newborn(text):
    """Return ``(covered, quote)`` for newborn cover.

    The first "new born"/"newborn"/"new-born" mention is quoted (up to the
    end of its sentence or line, plus 50 further characters). If that quote
    contains "not covered" or "excluded" the value is False, otherwise True.
    With no mention at all the value is False with an explanation.
    """
    m = re.search(r"[Nn]ew[ -]?[Bb]orn[^.\n]{0,200}", text)
    if not m:
        return False, "Newborn cover not found; typically tied to maternity option"
    snippet = re.sub(r"\s+", " ", text[m.start():m.end() + 50]).strip()
    covered = re.search(r"not\s+covered|excluded", snippet) is None
    return covered, snippet[:200]
def extract_organ_donor(text):
    """Return ``(covered, quote)`` for organ donor expenses.

    The first "organ donor" mention and the following 200 characters form the
    quote. If the quote contains "not covered" or "excluded" the value is
    False, otherwise True. With no mention the value is False.
    """
    m = re.search(r"[Oo]rgan\s+[Dd]onor", text)
    if not m:
        return False, "Organ donor cover not extracted"
    snippet = re.sub(r"\s+", " ", text[m.start():m.end() + 200]).strip()
    covered = re.search(r"not\s+covered|excluded", snippet) is None
    return covered, snippet[:200]
def extract_ncb(text):
    """Return ``(percent, quote)`` for the no-claim / cumulative bonus.

    First looks for a percentage following "No Claim Bonus", "Cumulative
    Bonus" or "NCB"; then for "N% increase/bonus in/of Sum Insured/SI".
    Only percentages in 5..100 are accepted. On a miss the value is None.
    """
    m = re.search(
        r"(?:[Nn]o\s+[Cc]laim\s+[Bb]onus|[Cc]umulative\s+[Bb]onus|NCB)[^.\n]{0,400}?(\d{1,3})\s*%",
        text,
    )
    if m:
        percent = int(m.group(1))
        if 5 <= percent <= 100:
            quote = re.sub(r"\s+", " ", text[m.start():m.end() + 50]).strip()[:220]
            return percent, quote
    m = re.search(
        r"(\d{1,3})\s*%\s+(?:increase|bonus)\s+(?:in|of)\s+(?:Sum\s+Insured|SI)",
        text,
        re.IGNORECASE,
    )
    if m:
        percent = int(m.group(1))
        if 5 <= percent <= 100:
            quote = re.sub(r"\s+", " ", text[max(0, m.start() - 40):m.end() + 30]).strip()[:220]
            return percent, quote
    return None, "NCB % not extracted; product may use booster/recharge structure"
def extract_restoration(text):
    """Return ``(description, quote)`` for the restoration-type benefit.

    Tries restore/restoration, recharge of sum insured, refill, reset benefit
    and re-instatement wording in that order. The description is the first
    240 characters of the matched sentence fragment and the quote is up to
    280. On a miss the value is None with an explanation.
    """
    patterns = (
        r"[Rr]estor[ae][^.\n]{0,300}",
        r"[Rr]echarge\s+of\s+[Ss]um\s+[Ii]nsured[^.\n]{0,300}",
        r"[Rr]efill[^.\n]{0,300}",
        r"[Rr]eset\s+[Bb]enefit[^.\n]{0,300}",
        r"[Rr]e[- ]?[Ii]nstatement[^.\n]{0,300}",
    )
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            quote = re.sub(r"\s+", " ", m.group(0)).strip()[:280]
            return quote[:240], quote
    return None, "Restoration benefit not found in extracted text"
def extract_room_rent(text):
    """Return ``(description, quote)`` for room-rent capping.

    Patterns are ordered so that wording describing a cap (or its absence) is
    preferred over the bare room-rent definition. The description is the
    first 200 characters of the matched fragment and the quote is up to 240.
    On a miss the value is None with an explanation.
    """
    patterns = (
        r"[Nn]o\s+[Rr]oom\s+[Rr]ent\s+(?:[Cc]apping|[Ll]imit|[Ss]ub[- ]?[Ll]imit)[^.\n]{0,150}",
        r"[Rr]oom\s+[Rr]ent\s+No\s+Sub[- ]?Limit[^.\n]{0,100}",
        r"[Ss]ingle\s+[Pp]rivate\s+(?:AC\s+)?[Rr]oom[^.\n]{0,150}",
        r"[Rr]oom\s+[Rr]ent[^.\n]{0,200}?(?:up\s+to|maximum|capped\s+at|limit\s+of|sub[- ]?limit)\s*(?:Rs\.?|`|INR|\d+\s*%)[^.\n]{0,100}",
        r"[Rr]oom\s+[Cc]ategory[^.\n]{0,160}",
        r"[Rr]oom\s+[Rr]ent[^.\n]{0,160}?(\d+\s*%|Rs\.?\s*\d|`\s*\d|INR\s*\d)[^.\n]{0,80}",
    )
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            quote = re.sub(r"\s+", " ", m.group(0)).strip()[:240]
            return quote[:200], quote
    return None, "Room rent capping not extracted (only definition found, no explicit cap)"
def extract_copayment(text):
    """Return ``(percent, quote)`` for the co-payment.

    Matches "co-payment of N%" first, then "N% co-pay". When neither is
    present the value is 0 with a quote noting that no mandatory co-pay was
    extracted.
    """
    patterns = (
        r"[Cc]o[- ]?payment\s+of\s+(\d{1,2})\s*%",
        r"(\d{1,2})\s*%\s+[Cc]o[- ]?[Pp]ay",
    )
    for pattern in patterns:
        m = re.search(pattern, text)
        if m:
            quote = re.sub(r"\s+", " ", text[max(0, m.start() - 30):m.end() + 120]).strip()[:240]
            return int(m.group(1)), quote
    return 0, "No mandatory copay extracted; product may have age-based or zone-based optional copay"
def extract_deductible(text):
    """Return ``(amount_in_rupees, quote)`` for the deductible.

    An amount is taken from the first "deductible ... Rs./INR/₹ <digits>"
    phrase, with thousands separators removed. If only an "aggregate
    deductible" mention without an amount exists, the value is None and the
    mention is quoted. Otherwise the value is None with an explanation.
    """
    # The rupee sign was mis-encoded in the source; with the real "₹"
    # character, amounts written with the symbol are matched.
    m = re.search(r"[Dd]eductible[^.\n]{0,300}?(?:Rs\.?\s*|INR\s*|₹\s*)(\d[\d,]{2,})", text)
    if m:
        amount = int(m.group(1).replace(",", ""))
        quote = re.sub(r"\s+", " ", text[max(0, m.start() - 30):m.end() + 120]).strip()[:240]
        return amount, quote
    m = re.search(r"[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}", text)
    if m:
        quote = re.sub(r"\s+", " ", m.group(0)).strip()[:220]
        return None, quote
    return None, "No base deductible (or only optional voluntary deductible add-on)"
def extract_cashless(text):
    """Return ``(True, quote)`` when *text* mentions cashless treatment.

    When there is no mention the value is None (unknown, not False) with an
    explanation, leaving the caller to decide on a default.
    """
    if not re.search(r"[Cc]ashless", text):
        return None, "Cashless mention not found"
    _, ctx = find_context(text, r"[Cc]ashless[^.\n]{0,200}")
    return True, ctx
def extract_policy_type(text, policy_id=""):
    """Classify a policy as top-up, benefit, hospital-cash or indemnity.

    Args:
        text: Extracted policy text.
        policy_id: Optional policy identifier; markers in it take precedence
            over anything found in the text.

    Returns:
        ``(policy_type, quote)``. Indemnity is the fallback when no other
        type is detected.
    """
    pid = policy_id.lower()
    # ID-based classification first (most reliable).
    top_up_markers = (
        "top-up", "supreme-enhance", "health-booster",
        "optima-enhance", "health-plus-top-up", "extra-care-plus",
    )
    if any(marker in pid for marker in top_up_markers):
        return "top-up", "Top-up / super top-up policy (per product name)"
    benefit_markers = ("cardiac-care", "cancer-care", "criti-medicare", "criti-care")
    if any(marker in pid for marker in benefit_markers):
        return "benefit", "Specialty cardiac/critical illness — benefit-based lump-sum on diagnosis"
    if "hospital-cash" in pid or "daily-cash" in pid:
        return "hospital-cash", "Hospital cash / daily benefit policy"
    # Heuristic on text.
    if re.search(
        r"[Ss]uper\s+[Tt]op[- ]?up|deductible[^.\n]{0,100}aggregate|[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}[Ss]um\s+[Ii]nsured",
        text,
    ):
        return "top-up", "Top-up / super top-up policy (kicks in above a deductible)"
    if re.search(r"[Hh]ospital\s+[Cc]ash|[Dd]aily\s+[Cc]ash\s+[Bb]enefit", text):
        return "hospital-cash", "Hospital cash / daily benefit policy"
    # Indemnity is the default for retail health.
    if re.search(r"[Ii]ndemnity|[Hh]ospitali[sz]ation\s+[Ee]xpenses?\s+[Ii]ndemnif|[Ii]ndemnif", text):
        _, ctx = find_context(text, r"[Ii]ndemnity|[Ii]ndemnif")
        return "indemnity", ctx or "Indemnity-based health insurance"
    return "indemnity", "Default indemnity (no explicit alternate type detected)"
| # --------------------------------------------------------------------------- | |
| # Driver | |
| # --------------------------------------------------------------------------- | |
def curate_one(entry):
    """Build the policy_facts record for one MANIFEST entry.

    Args:
        entry: ``(policy_id, policy_name, insurer_slug, primary_pdf_rel,
            text_cache_filename, supporting_pdf_rels)``.

    Returns:
        ``(record, completeness_pct)`` on success, or ``(None, reason)`` when
        the cached text file is missing or shorter than 1000 characters.
    """
    policy_id, policy_name, insurer, primary_pdf, txt_name, supporting = entry
    txt_path = os.path.join(CACHE, txt_name)
    if not os.path.exists(txt_path):
        return None, f"text cache missing: {txt_name}"
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    if len(text) < 1000:
        return None, f"too short ({len(text)} chars)"

    def fact(value, quote, fallback, unit=None):
        """Build one PDF-sourced field; the quote is capped at 240 chars."""
        field = {"value": value}
        if unit is not None:
            field["unit"] = unit
        field["source_pdf_path"] = primary_pdf
        field["source_quote"] = (quote or fallback)[:240]
        return field

    min_age, min_unit, min_q = extract_min_entry_age(text)
    max_age, max_unit, max_q = extract_max_entry_age(text)
    si_vals, si_q = extract_sum_insured_options(text)
    cash_v, cash_q = extract_cashless(text)
    if cash_v is None:
        # No explicit mention: cashless is presumed available, and the quote
        # must say so rather than report "not found" next to a True value.
        cash_v, cash_q = True, "Cashless implicit via insurer network"

    # Compose JSON (key order is the output order).
    j = {
        "policy_id": policy_id,
        "policy_name": policy_name,
        "insurer_slug": insurer,
        "uin_code": fact(*extract_uin(text), ""),
        "min_entry_age": fact(
            min_age, min_q, "Not explicitly stated; per Policy Schedule",
            unit=min_unit or "days"),
        "max_entry_age": fact(
            max_age, max_q, "Not explicitly stated; per Policy Schedule",
            unit=max_unit or "years"),
        "max_renewal_age": fact(*extract_renewal_age(text), "Not specified"),
        "sum_insured_options": fact(si_vals, si_q, "Per Policy Schedule", unit="INR"),
        "initial_waiting_period_days": fact(
            *extract_initial_waiting(text), "Per IRDAI standard"),
        "pre_existing_disease_waiting_months": fact(
            *extract_ped_waiting(text), "PED waiting per policy wording"),
        "specific_disease_waiting_months": fact(
            *extract_specific_disease_waiting(text),
            "Specific disease waiting per IRDAI standard"),
        "maternity_waiting_months": fact(
            *extract_maternity_waiting(text),
            "Maternity waiting only applies if maternity covered/opted"),
        "pre_hospitalization_days": fact(
            *extract_pre_hosp_days(text), "Pre-hosp days per Policy Schedule"),
        "post_hospitalization_days": fact(
            *extract_post_hosp_days(text), "Post-hosp days per Policy Schedule"),
        "day_care_treatments_count": fact(
            *extract_day_care_count(text),
            "Day care covered per definition; count not enumerated"),
        "ayush_coverage": fact(
            *extract_ayush(text), "AYUSH cover not explicitly found"),
        "maternity_coverage": fact(
            *extract_maternity(text), "Maternity status not explicitly extracted"),
        "newborn_coverage": fact(
            *extract_newborn(text), "Newborn cover status not extracted"),
        "organ_donor_expenses": fact(
            *extract_organ_donor(text), "Organ donor benefit not extracted"),
        "no_claim_bonus_pct": fact(*extract_ncb(text), "NCB % not extracted"),
        "restoration_benefit": fact(
            *extract_restoration(text), "Restoration not found in extracted text"),
        "room_rent_capping": fact(
            *extract_room_rent(text), "Room rent capping not extracted"),
        "copayment_pct": fact(*extract_copayment(text), "No mandatory copay"),
        "deductible_amount": fact(*extract_deductible(text), "No base deductible"),
        "network_hospital_count": {
            "value": None,
            "source_url": None,
            "source_quote": "Insurer-level metric; not extracted in this curation pass"
        },
        "cashless_treatment_supported": fact(
            cash_v, cash_q, "Cashless implicit via insurer network"),
        "claim_settlement_ratio": {
            "value": None,
            "source_url": None,
            "source_quote": "Insurer-level metric (IRDAI Annual Report); not extracted"
        },
        "tat_cashless_authorization_hours": {
            "value": None,
            "source_pdf_path": None,
            "source_quote": "TAT not specified in policy wording; governed by IRDAI Master Circular"
        },
        "policy_type": fact(
            *extract_policy_type(text, policy_id),
            "Policy type inferred from product structure"),
    }

    # Completeness: share of PDF-sourced fields whose value is neither None
    # nor "". Note that False and 0 count as populated.
    pdf_fields = [
        "uin_code", "min_entry_age", "max_entry_age", "sum_insured_options",
        "initial_waiting_period_days", "pre_existing_disease_waiting_months",
        "specific_disease_waiting_months", "pre_hospitalization_days",
        "post_hospitalization_days", "day_care_treatments_count",
        "ayush_coverage", "maternity_coverage", "newborn_coverage",
        "organ_donor_expenses", "no_claim_bonus_pct", "restoration_benefit",
        "room_rent_capping", "copayment_pct", "policy_type",
        "cashless_treatment_supported"
    ]
    filled = sum(1 for name in pdf_fields if j[name]["value"] not in (None, ""))
    pct = int(round(filled / len(pdf_fields) * 100))
    j["_meta"] = {
        "curated_at": "2026-05-14",
        "primary_source_pdf": primary_pdf,
        "supporting_source_pdfs": supporting,
        "completeness_pct": pct,
        "notes": "Pattern-based extraction from local PDF via pdfplumber. Insurer-level metrics (CSR, network count) left null pending downstream backfill."
    }
    return j, pct
def main():
    """Curate every MANIFEST entry and write one JSON file per policy.

    Entries whose output file already exists are left untouched. Entries with
    missing or too-short text, or with completeness below 50%, are recorded
    as skipped and not written.

    Returns:
        ``(results, skipped)``: ``results`` is a list of
        ``(policy_id, completeness_pct)`` for files written; ``skipped`` is a
        list of ``(policy_id, reason)``.
    """
    results = []
    skipped = []
    total = len(MANIFEST)
    for i, entry in enumerate(MANIFEST, 1):
        policy_id = entry[0]
        out_path = os.path.join(OUT_DIR, f"{policy_id}.json")
        # Skip if already exists (not counted in `skipped`).
        if os.path.exists(out_path):
            print(f"[{i}/{total}] {policy_id}: EXISTS — skipping")
            continue
        # `outcome` is the completeness percentage on success and a reason
        # string when no record could be built.
        record, outcome = curate_one(entry)
        if record is None:
            print(f"[{i}/{total}] {policy_id}: SKIP — {outcome}")
            skipped.append((policy_id, outcome))
            continue
        if outcome < 50:
            print(f"[{i}/{total}] {policy_id}: LOW {outcome}% — skipping (below threshold)")
            skipped.append((policy_id, f"low completeness {outcome}%"))
            continue
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(record, f, indent=2, ensure_ascii=False)
        print(f"[{i}/{total}] {policy_id}: {outcome}%")
        results.append((policy_id, outcome))
    print()
    print(f"Wrote {len(results)} JSONs.")
    print(f"Skipped {len(skipped)}.")
    if results:
        avg = sum(p for _, p in results) / len(results)
        print(f"Average completeness: {avg:.1f}%")
    return results, skipped


if __name__ == "__main__":
    main()