InsuranceBot / tools /curate_batch2.py
rohitsar567's picture
refactor: KI-050 — complete data/ → 40-data/ rename across all Python refs
52c6351
Raw
History Blame Contribute Delete
35.4 kB
"""Curate batch 2 policy_facts JSONs from extracted text cache.
Pattern-based field extraction matched to the schema used by batch 1.
Writes one JSON per policy into 40-data/policy_facts/.
"""
import os
import re
import json
import sys
# Repository root: the parent of the directory that contains this script.
BASE = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)
# Directory holding the extracted-text files that curate_one() reads.
CACHE = "/tmp/claude/policy_extract/text_cache"
# Destination directory for the curated per-policy JSON files.
OUT_DIR = os.path.join(BASE, "40-data/policy_facts")
# Created at import time so later writes never hit a missing directory.
os.makedirs(OUT_DIR, exist_ok=True)
# ---------------------------------------------------------------------------
# Batch 2 manifest: (policy_id, policy_name, insurer_slug, primary_pdf_rel,
# text_cache_filename, [supporting_pdf_rel ... optional])
# Excluded from batch (already curated):
# aditya-birla activ-assure-diamond + activ-one
# bajaj-allianz health-guard-gold + extra-care-plus
# care-health care-supreme + care-classic + care-senior
# hdfc-ergo my-optima-secure + optima-restore
# icici-lombard elevate + health-shield-360 + complete-health
# manipalcigna prohealth-prime + prohealth-protect (both from all-variants)
# new-india new-india-floater-mediclaim
# niva-bupa reassure-2 + senior-first + health-companion
# star-health family-health-optima + star-comprehensive
# tata-aig medicare + medicare-premier
# ---------------------------------------------------------------------------
# One 6-tuple per policy:
#   (policy_id, policy_name, insurer_slug, primary_pdf_rel,
#    text_cache_filename, supporting_pdf_rels)
MANIFEST = [
    # ABHI
    ("aditya-birla__activ-health", "Aditya Birla Activ Health (Platinum Enhanced / Essential)", "aditya-birla",
     "rag/corpus/aditya-birla/activ-health-individual__wordings.pdf",
     "aditya-birla__activ-health-individual__wordings.txt", []),
    # Bajaj
    ("bajaj-allianz__comprehensive-care-plan", "Bajaj Allianz Comprehensive Care Plan", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/comprehensive-care-plan__wordings.pdf",
     "bajaj-allianz__comprehensive-care-plan__wordings.txt", []),
    ("bajaj-allianz__global-health-care", "Bajaj Allianz Global Health Care", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/global-health-care__wordings.pdf",
     "bajaj-allianz__global-health-care__wordings.txt", []),
    ("bajaj-allianz__health-guard", "Bajaj Allianz Health Guard (Silver / Gold / Platinum)", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/health-guard__wordings.pdf",
     "bajaj-allianz__health-guard__wordings.txt", []),
    ("bajaj-allianz__silver-health", "Bajaj Allianz Silver Health (Senior Citizen)", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/silver-health__cis.pdf",
     "bajaj-allianz__silver-health__cis.txt", []),
    ("bajaj-allianz__tax-gain", "Bajaj Allianz Tax Gain", "bajaj-allianz",
     "rag/corpus/bajaj-allianz/tax-gain__cis.pdf",
     "bajaj-allianz__tax-gain__cis.txt", []),
    # Care
    ("care-health__care-advantage", "Care Health Care Advantage", "care-health",
     "rag/corpus/care-health/care-advantage__brochure.pdf",
     "care-health__care-advantage__brochure.txt", []),
    ("care-health__care-supreme-enhance", "Care Health Care Supreme Enhance (Top-up)", "care-health",
     "rag/corpus/care-health/care-supreme-enhance__wordings.pdf",
     "care-health__care-supreme-enhance__wordings.txt", []),
    ("care-health__ultimate-care", "Care Health Ultimate Care", "care-health",
     "rag/corpus/care-health/ultimate-care__wordings.pdf",
     "care-health__ultimate-care__wordings.txt", []),
    # HDFC ERGO
    ("hdfc-ergo__energy", "HDFC ERGO Energy (Diabetes / Hypertension)", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/energy-diabetes-hypertension__wordings.pdf",
     "hdfc-ergo__energy-diabetes-hypertension__wordings.txt", []),
    ("hdfc-ergo__my-health-medisure-prime", "HDFC ERGO my:health Medisure Prime", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-medisure-prime__wordings.pdf",
     "hdfc-ergo__my-health-medisure-prime__wordings.txt", []),
    ("hdfc-ergo__my-health-sampoorna-suraksha", "HDFC ERGO my:health Sampoorna Suraksha", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-sampoorna-suraksha__brochure.pdf",
     "hdfc-ergo__my-health-sampoorna-suraksha__brochure.txt", []),
    ("hdfc-ergo__my-health-suraksha", "HDFC ERGO my:health Suraksha", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-suraksha__brochure.pdf",
     "hdfc-ergo__my-health-suraksha__brochure.txt", []),
    ("hdfc-ergo__my-health-women-suraksha", "HDFC ERGO my:health Women Suraksha", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-health-women-suraksha__brochure.pdf",
     "hdfc-ergo__my-health-women-suraksha__brochure.txt", []),
    ("hdfc-ergo__optima-secure-older-variant", "HDFC ERGO Optima Secure (Older / Legacy Variant)", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/my-optima-secure-older-variant__wordings.pdf",
     "hdfc-ergo__my-optima-secure-older-variant__wordings.txt", []),
    ("hdfc-ergo__optima-enhance", "HDFC ERGO Optima Enhance (Top-up)", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/optima-enhance__wordings.pdf",
     "hdfc-ergo__optima-enhance__wordings.txt", []),
    ("hdfc-ergo__optima-plus", "HDFC ERGO Optima Plus", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/optima-plus__wordings.pdf",
     "hdfc-ergo__optima-plus__wordings.txt", []),
    ("hdfc-ergo__total-health-plan", "HDFC ERGO Total Health Plan", "hdfc-ergo",
     "rag/corpus/hdfc-ergo/total-health-plan__wordings.pdf",
     "hdfc-ergo__total-health-plan__wordings.txt", []),
    # ICICI Lombard
    ("icici-lombard__arogya-sanjeevani", "ICICI Lombard Arogya Sanjeevani (Standard)", "icici-lombard",
     "rag/corpus/icici-lombard/arogya-sanjeevani__wordings.pdf",
     "icici-lombard__arogya-sanjeevani__wordings.txt", []),
    # The name below uses a real em dash; it was previously mis-encoded.
    ("icici-lombard__complete-health-umbrella", "ICICI Lombard Complete Health Insurance — Umbrella", "icici-lombard",
     "rag/corpus/icici-lombard/complete-health-insurance-umbrella__wordings.pdf",
     "icici-lombard__complete-health-insurance-umbrella__wordings.txt", []),
    ("icici-lombard__health-advantedge", "ICICI Lombard Health Advantedge", "icici-lombard",
     "rag/corpus/icici-lombard/health-advantedge__wordings.pdf",
     "icici-lombard__health-advantedge__wordings.txt", []),
    ("icici-lombard__health-booster", "ICICI Lombard Health Booster (Top-up)", "icici-lombard",
     "rag/corpus/icici-lombard/health-booster-top-up__wordings.pdf",
     "icici-lombard__health-booster-top-up__wordings.txt", []),
    ("icici-lombard__health-elite-plus", "ICICI Lombard Health Elite Plus", "icici-lombard",
     "rag/corpus/icici-lombard/health-elite-plus__wordings.pdf",
     "icici-lombard__health-elite-plus__wordings.txt", []),
    # ManipalCigna
    ("manipalcigna__prohealth-select", "ManipalCigna ProHealth Select", "manipalcigna",
     "rag/corpus/manipalcigna/prohealth-select__wordings.pdf",
     "manipalcigna__prohealth-select__wordings.txt", []),
    ("manipalcigna__sarvah-param", "ManipalCigna Sarvah Param", "manipalcigna",
     "rag/corpus/manipalcigna/sarvah-param__wordings.pdf",
     "manipalcigna__sarvah-param__wordings.txt", []),
    # New India
    ("new-india__asha-kiran", "New India Asha Kiran (Girl Child Family Floater)", "new-india",
     "rag/corpus/new-india/asha-kiran-policy__brochure.pdf",
     "new-india__asha-kiran-policy__brochure.txt", []),
    ("new-india__janata-mediclaim", "New India Janata Mediclaim", "new-india",
     "rag/corpus/new-india/janata-mediclaim-policy__wordings.pdf",
     "new-india__janata-mediclaim-policy__wordings.txt", []),
    ("new-india__mediclaim-policy", "New India Mediclaim Policy (Individual)", "new-india",
     "rag/corpus/new-india/new-india-mediclaim-policy__wordings.pdf",
     "new-india__new-india-mediclaim-policy__wordings.txt", []),
    ("new-india__universal-health", "New India Universal Health Insurance", "new-india",
     "rag/corpus/new-india/universal-health-insurance__wordings.pdf",
     "new-india__universal-health-insurance__wordings.txt", []),
    ("new-india__yuva-bharat", "New India Yuva Bharat Health Policy", "new-india",
     "rag/corpus/new-india/yuva-bharat-health-policy__wordings.pdf",
     "new-india__yuva-bharat-health-policy__wordings.txt", []),
    # Niva Bupa
    ("niva-bupa__aspire", "Niva Bupa Aspire", "niva-bupa",
     "rag/corpus/niva-bupa/aspire__wordings.pdf",
     "niva-bupa__aspire__wordings.txt", []),
    ("niva-bupa__health-plus-top-up", "Niva Bupa Health Plus (Top-up)", "niva-bupa",
     "rag/corpus/niva-bupa/health-plus-top-up__wordings.pdf",
     "niva-bupa__health-plus-top-up__wordings.txt", []),
    ("niva-bupa__health-premia", "Niva Bupa Health Premia", "niva-bupa",
     "rag/corpus/niva-bupa/health-premia__wordings.pdf",
     "niva-bupa__health-premia__wordings.txt", []),
    ("niva-bupa__reassure-3", "Niva Bupa ReAssure 3.0", "niva-bupa",
     "rag/corpus/niva-bupa/reassure-3-0__wordings.pdf",
     "niva-bupa__reassure-3-0__wordings.txt", []),
    ("niva-bupa__rise", "Niva Bupa Rise", "niva-bupa",
     "rag/corpus/niva-bupa/rise__wordings.pdf",
     "niva-bupa__rise__wordings.txt", []),
    ("niva-bupa__saral-suraksha", "Niva Bupa Saral Suraksha Bima (Standard)", "niva-bupa",
     "rag/corpus/niva-bupa/saral-suraksha-bima__wordings.pdf",
     "niva-bupa__saral-suraksha-bima__wordings.txt", []),
    # Star
    ("star-health__health-premier", "Star Health Premier", "star-health",
     "rag/corpus/star-health/health-premier__wordings.pdf",
     "star-health__health-premier__wordings.txt", []),
    ("star-health__senior-citizens-red-carpet", "Star Senior Citizens Red Carpet", "star-health",
     "rag/corpus/star-health/senior-citizens-red-carpet__brochure.pdf",
     "star-health__senior-citizens-red-carpet__brochure.txt", []),
    ("star-health__star-assure", "Star Assure Insurance Policy", "star-health",
     "rag/corpus/star-health/star-assure__wordings.pdf",
     "star-health__star-assure__wordings.txt", []),
    ("star-health__star-cardiac-care", "Star Cardiac Care Insurance", "star-health",
     "rag/corpus/star-health/star-cardiac-care__wordings.pdf",
     "star-health__star-cardiac-care__wordings.txt", []),
    ("star-health__star-cardiac-care-platinum", "Star Cardiac Care Platinum", "star-health",
     "rag/corpus/star-health/star-cardiac-care-platinum__wordings.pdf",
     "star-health__star-cardiac-care-platinum__wordings.txt", []),
    # Tata AIG
    ("tata-aig__medicare-lite", "Tata AIG MediCare Lite", "tata-aig",
     "rag/corpus/tata-aig/medicare-lite__cis.pdf",
     "tata-aig__medicare-lite__cis.txt", []),
    ("tata-aig__medicare-select", "Tata AIG MediCare Select", "tata-aig",
     "rag/corpus/tata-aig/medicare-select__brochure.pdf",
     "tata-aig__medicare-select__brochure.txt", []),
]
# ---------------------------------------------------------------------------
# Pattern-based field extractors
# Each returns (value, quote), or (value, unit, quote) for the entry-age
# extractors; on a miss the value is None or a stated default and the quote
# explains why
# ---------------------------------------------------------------------------
def find_context(text, pattern, max_len=200, flags=re.IGNORECASE):
    """Search ``text`` for ``pattern`` and return the match with a snippet.

    Returns:
        ``(match, snippet)`` for the first match, where ``snippet`` is the
        text from 30 characters before the match to 160 characters after
        it, with whitespace runs collapsed to single spaces and the result
        cut to ``max_len`` characters. ``(None, None)`` when nothing matches.
    """
    hit = re.search(pattern, text, flags)
    if hit is None:
        return None, None
    lo = max(0, hit.start() - 30)
    hi = min(len(text), hit.end() + 160)
    snippet = re.sub(r"\s+", " ", text[lo:hi]).strip()
    return hit, snippet[:max_len]
def extract_uin(text):
    """Find the first UIN-shaped token in ``text``.

    The pattern is 6-9 letters, 5 digits, the letter V, then 6 digits.
    Examples: HDFHLIP25041V062425 (HDF + HLIP), SHAHLIP22032V052122
    (SHA + HLIP), CHIHLIP23128V012223 (CHI + HLIP), NBHHLIP26042V022526
    (NBH + HLIP).

    Returns:
        ``(uin, snippet)`` on a hit, otherwise ``(None, explanation)``.
    """
    uin_pattern = r"\b([A-Z]{6,9}[0-9]{5}V[0-9]{6})\b"
    match, snippet = find_context(text, uin_pattern)
    if not match:
        return None, "UIN not found in extracted text"
    return match.group(1), snippet
def extract_min_entry_age(text):
    """Extract the minimum entry age as ``(value, unit, snippet)``.

    Patterns are tried in order and the first hit wins. When the matching
    pattern has no unit group, the unit is reported as ``"days"``.
    Returns ``(None, None, explanation)`` when no pattern matches.
    """
    patterns = (
        # explicit "minimum entry age"
        r"[Mm]inimum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*(day|year|month)",
        # "age at entry"
        r"[Aa]ge at [Ee]ntry[^.\n]{0,40}?(\d+)\s*(day|year|month)",
        # child entry expressed in days
        r"[Cc]hild[^.\n]{0,40}?(\d+)\s*day",
        # range form, e.g. "91 days to 65 years"
        r"(\d+)\s*[Dd]ays\s*(?:to|-|–)\s*\d+\s*[Yy]ears",
    )
    for pattern in patterns:
        match, snippet = find_context(text, pattern)
        if not match:
            continue
        has_unit_group = bool(match.lastindex) and match.lastindex >= 2
        unit = match.group(2).lower() if has_unit_group else "days"
        return int(match.group(1)), unit, snippet
    return None, None, "Min entry age not found"
def extract_max_entry_age(text):
    """Extract the maximum entry age as ``(value, "years", snippet)``.

    Patterns are tried in order and the first hit wins. For the range
    pattern the second captured number is the maximum; for every other
    pattern the single captured number is used.
    Returns ``(None, "years", explanation)`` when no pattern matches.
    """
    patterns = (
        # explicit "maximum entry age"
        r"[Mm]aximum [Ee]ntry [Aa]ge[^.\n]{0,80}?(\d+)\s*[Yy]ear",
        # "entry age ... up to N years"
        r"[Ee]ntry [Aa]ge[^.\n]{0,40}?[Uu]p to (\d+)\s*[Yy]ear",
        # range form, e.g. "91 days to 65 years"
        r"(\d+)\s*[Dd]ays\s*(?:to|-|–)\s*(\d+)\s*[Yy]ears",
        # generic "maximum age"
        r"[Mm]aximum [Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear",
    )
    for pattern in patterns:
        match, snippet = find_context(text, pattern)
        if not match:
            continue
        upper = match.group(1)
        if match.lastindex and match.lastindex >= 2 and match.group(2).isdigit():
            upper = match.group(2)
        return int(upper), "years", snippet
    return None, "years", "Max entry age not explicitly stated; check Policy Schedule"
def extract_renewal_age(text):
    """Extract the maximum renewal age, or note lifelong renewability.

    Returns:
        ``(None, "Lifelong renewability: <snippet>")`` when lifelong wording
        is detected, ``(age, snippet)`` when an explicit maximum renewal age
        is found, otherwise ``(None, explanation)``.
    """
    detect = (
        r"[Ll]ifelong\s*[Rr]enew|[Ll]ife[- ]?[Ll]ong"
        r"|[Nn]o\s+maximum\s+(cover\s+)?ceas|continuous\s+life\s+long"
    )
    # Stricter variant: "life long" must be followed by "renew". Preferred
    # for the quote because it anchors on the renewability wording itself.
    preferred = (
        r"[Ll]ifelong\s*[Rr]enew|[Ll]ife[- ]?[Ll]ong\s*[Rr]enew"
        r"|[Nn]o\s+maximum\s+(cover\s+)?ceas|continuous\s+life\s+long"
    )
    if re.search(detect, text):
        _, ctx = find_context(text, preferred)
        if ctx is None:
            # Detection fired on a bare "life long"; quote that occurrence
            # so the claim is never returned without supporting text.
            _, ctx = find_context(text, detect)
        return None, "Lifelong renewability" + ((": " + ctx) if ctx else "")
    m, ctx = find_context(
        text, r"[Mm]aximum\s+[Rr]enewal\s+[Aa]ge[^.\n]{0,40}?(\d+)\s*[Yy]ear"
    )
    if m:
        return int(m.group(1)), ctx
    return None, "Max renewal age not specified; check Policy Schedule"
def extract_sum_insured_options(text):
    """Extract the enumerated Sum Insured options, in rupees.

    Looks for currency lists such as "3 Lacs, 5 Lacs, 10 Lacs" near the
    first "Sum Insured" mention that is followed by a lakh/crore amount.

    Returns:
        ``(sorted_amounts, snippet)`` when at least two distinct amounts
        between 50,000 and 1,000,000,000 are found, otherwise
        ``(None, explanation)``.
    """
    match, snippet = find_context(
        text,
        r"[Ss]um\s+[Ii]nsured[^.\n]{0,300}?(\d+[\d,. ]{0,20}(?:Lakhs?|Lacs?|Crores?|L\b|Cr\b))",
    )
    if match:
        # Gather every lakh/crore figure from a window around the match.
        window = text[max(0, match.start() - 30): match.end() + 400]
        unit_patterns = (
            (r"(\d+(?:\.\d+)?)\s*(?:Lakhs?|Lacs?|L\b)", 100000),
            (r"(\d+(?:\.\d+)?)\s*(?:Crores?|Cr\b)", 10000000),
        )
        amounts = set()
        for unit_pattern, multiplier in unit_patterns:
            for figure in re.findall(unit_pattern, window, re.IGNORECASE):
                try:
                    rupees = int(float(figure) * multiplier)
                except (ValueError, OverflowError):
                    continue
                if 50000 <= rupees <= 1000000000:
                    amounts.add(rupees)
        # A single value is not treated as a real enumeration.
        if len(amounts) >= 2:
            return sorted(amounts), snippet[:200]
    return None, "Sum Insured options not enumerated in extracted text; check Policy Schedule"
def extract_initial_waiting(text):
    """Extract the initial waiting period in days.

    The first pattern is accepted only when it yields 15 or 30; the two
    fallback patterns accept any number. When nothing matches, 30 is
    returned with a note that it is a default rather than a quote.
    """
    match, snippet = find_context(
        text,
        r"(\d+)\s*[Dd]ays?\s+(?:from\s+the\s+(?:first|date of)|waiting period|of\s+the\s+inception)",
    )
    if match:
        days = int(match.group(1))
        if days in (15, 30):
            return days, snippet
    fallbacks = (
        r"[Ee]xcl03[^.\n]{0,200}?(\d+)\s*days?",
        r"within\s+(\d+)\s*days\s+from\s+the\s+first",
    )
    for pattern in fallbacks:
        match, snippet = find_context(text, pattern)
        if match:
            return int(match.group(1)), snippet
    return 30, "Default IRDAI 30-day waiting period applies (not explicitly quoted in extracted snippet)"
def extract_ped_waiting(text):
    """Extract the pre-existing-disease waiting period, in months.

    Durations captured in years are converted to months (x12).
    Returns ``(months, snippet)`` or ``(None, explanation)``.
    """
    with_unit = (
        r"[Pp]re[- ]existing\s+[Dd]isease\s*(?:\([^)]+\))?[^.\n]{0,300}?(\d+)\s*(months|years)",
        r"PED[^.\n]{0,200}?(\d+)\s*(months|years)",
    )
    for pattern in with_unit:
        match, snippet = find_context(text, pattern)
        if match:
            duration = int(match.group(1))
            if "year" in match.group(2).lower():
                duration *= 12
            return duration, snippet
    # Last resort: the Excl01 clause, which is matched in months only.
    match, snippet = find_context(text, r"[Ee]xcl01[^.\n]{0,200}?(\d+)\s*months")
    if match:
        return int(match.group(1)), snippet
    return None, "PED waiting period not extracted; check Section 5 / Excl01"
def extract_specific_disease_waiting(text):
    """Extract the specific-disease waiting period, in months.

    Durations captured in years are converted to months (x12). When no
    pattern matches, 24 is returned with a note that it is a default.
    """
    patterns = (
        r"(?:listed|specified|named|specific)\s+(?:conditions?|ailments?|diseases?|treatments?)[^.\n]{0,300}?(\d+)\s*(months|years)",
        r"[Ee]xcl02[^.\n]{0,200}?(\d+)\s*(months|years)",
    )
    for pattern in patterns:
        match, snippet = find_context(text, pattern)
        if not match:
            continue
        duration = int(match.group(1))
        in_years = "year" in match.group(2).lower()
        return (duration * 12 if in_years else duration), snippet
    return 24, "Default IRDAI 24-month specific-disease waiting (not explicitly quoted)"
def extract_maternity_waiting(text):
    """Extract the maternity waiting period in months.

    Returns ``(months, snippet)`` from the first matching pattern, or
    ``(None, explanation)`` when neither pattern matches.
    """
    patterns = (
        r"[Mm]aternity[^.\n]{0,200}?(\d+)\s*months?\s+(?:waiting|of continuous)",
        r"[Ww]aiting\s+[Pp]eriod[^.\n]{0,50}?[Mm]aternity[^.\n]{0,80}?(\d+)\s*months?",
    )
    for pattern in patterns:
        match, snippet = find_context(text, pattern)
        if match:
            return int(match.group(1)), snippet
    return None, "Maternity waiting not specified or maternity excluded"
def extract_pre_hosp_days(text):
    """Extract the number of pre-hospitalisation days covered.

    Returns ``(days, snippet)`` from the first matching pattern, or
    ``(None, explanation)`` when neither pattern matches.
    """
    # Accept both "hospitalisation" and "hospitalization", as the fallback
    # pattern below already does; the s-only form missed z-spelt headings.
    m, ctx = find_context(
        text, r"[Pp]re[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?"
    )
    if m:
        return int(m.group(1)), ctx
    m, ctx = find_context(
        text,
        r"(\d+)\s*days?\s+(?:prior to|before).{0,40}(?:admission|hospitali[sz]ation)",
    )
    if m:
        return int(m.group(1)), ctx
    return None, "Pre-hospitalization days not extracted"
def extract_post_hosp_days(text):
    """Extract the number of post-hospitalisation days covered.

    Returns ``(days, snippet)`` from the first matching pattern, or
    ``(None, explanation)`` when neither pattern matches.
    """
    # Accept both "hospitalisation" and "hospitalization"; the s-only form
    # missed documents that use the z spelling.
    m, ctx = find_context(
        text, r"[Pp]ost[- ]?[Hh]ospitali[sz]ation[^.\n]{0,200}?(\d+)\s*days?"
    )
    if m:
        return int(m.group(1)), ctx
    m, ctx = find_context(
        text, r"(\d+)\s*days?\s+(?:after|post|following).{0,40}discharge"
    )
    if m:
        return int(m.group(1)), ctx
    return None, "Post-hospitalization days not extracted"
def extract_day_care_count(text):
    """Extract the number of listed day-care procedures.

    A captured number is accepted only when it lies between 50 and 2000;
    otherwise the next pattern is tried.
    Returns ``(count, snippet)`` or ``(None, explanation)``.
    """
    patterns = (
        r"(\d{2,4})\s*(?:listed\s+)?[Dd]ay\s*[- ]?[Cc]are\s*(?:[Pp]rocedures?|[Tt]reatments?)",
        r"[Dd]ay\s*[- ]?[Cc]are[^.\n]{0,80}?(\d{2,4})\s*[Pp]rocedures?",
    )
    for pattern in patterns:
        match, snippet = find_context(text, pattern)
        if not match:
            continue
        count = int(match.group(1))
        if 50 <= count <= 2000:
            return count, snippet
    return None, "Day-care count not enumerated; covered per policy definition"
def extract_ayush(text):
    """Report whether the text mentions AYUSH / alternative treatment.

    Returns ``(True, snippet)`` when "AYUSH" (upper case) or "alternative
    treatment" appears, otherwise ``(False, explanation)``.
    """
    probes = (
        (r"AYUSH", r"AYUSH[^.\n]{0,200}"),
        (r"[Aa]lternative\s+[Tt]reatment", r"[Aa]lternative\s+[Tt]reatment[^.\n]{0,200}"),
    )
    for probe, quote_pattern in probes:
        if re.search(probe, text):
            _, snippet = find_context(text, quote_pattern)
            return True, snippet
    return False, "AYUSH coverage not found in extracted text"
def extract_maternity(text):
    """Classify maternity cover as included or excluded.

    A positive benefit description counts as covered only when there is no
    explicit "not covered"/"excluded" wording. An explicit exclusion or an
    "Excl18" reference yields ``False`` with a snippet; with no signal at
    all the result is ``False`` with a presumption note.
    """
    negated = re.search(r"[Mm]aternity[^.\n]{0,80}?(?:not\s+covered|excluded)", text)
    excl_code = re.search(r"Excl18", text)
    described = re.search(
        r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)[^.\n]{0,300}?(?:lump\s+sum|Rs\.?\s*\d|INR|deliveries?)",
        text,
    )
    if described and not negated:
        # Positive maternity description with no exclusion wording.
        _, snippet = find_context(
            text, r"[Mm]aternity\s+(?:[Ee]xpenses?|[Cc]over|[Bb]enefit)[^.\n]{0,300}"
        )
        return True, snippet
    if negated:
        _, snippet = find_context(
            text, r"[Mm]aternity[^.\n]{0,200}?(?:not\s+covered|excluded)[^.\n]{0,100}"
        )
        return False, snippet or "Maternity excluded (Excl18)"
    if excl_code:
        _, snippet = find_context(text, r"Excl18[^.\n]{0,200}")
        return False, snippet or "Maternity excluded (Excl18)"
    # No explicit mention: treated as excluded in the base cover.
    return False, "Maternity not explicitly mentioned; presumed excluded in base"
def extract_newborn(text):
    """Report whether newborn cover is mentioned and not excluded.

    Returns ``(False, snippet)`` when the snippet around the first
    "new born" mention contains "not covered" or "excluded",
    ``(True, snippet)`` for any other mention, and
    ``(False, explanation)`` when there is no mention at all.
    """
    hit = re.search(r"[Nn]ew[ -]?[Bb]orn[^.\n]{0,200}", text)
    if hit is None:
        return False, "Newborn cover not found; typically tied to maternity option"
    # Look 50 characters past the match so trailing exclusion wording counts.
    snippet = re.sub(r"\s+", " ", text[hit.start():hit.end() + 50]).strip()
    covered = re.search(r"not\s+covered|excluded", snippet) is None
    return covered, snippet[:200]
def extract_organ_donor(text):
    """Report whether organ-donor cover is mentioned and not excluded.

    Returns ``(False, snippet)`` when the 200 characters after the first
    "organ donor" mention contain "not covered" or "excluded",
    ``(True, snippet)`` for any other mention, and
    ``(False, explanation)`` when there is no mention at all.
    """
    hit = re.search(r"[Oo]rgan\s+[Dd]onor", text)
    if hit is None:
        return False, "Organ donor cover not extracted"
    snippet = re.sub(r"\s+", " ", text[hit.start():hit.end() + 200]).strip()
    covered = re.search(r"not\s+covered|excluded", snippet) is None
    return covered, snippet[:200]
def extract_ncb(text):
    """Extract the no-claim / cumulative bonus percentage.

    Each pattern's first match is accepted only when the percentage lies
    between 5 and 100; otherwise the next pattern is tried.
    Returns ``(pct, snippet)`` or ``(None, explanation)``.
    """
    # (pattern, regex flags, chars of context before match, chars after)
    attempts = (
        (
            r"(?:[Nn]o\s+[Cc]laim\s+[Bb]onus|[Cc]umulative\s+[Bb]onus|NCB|cumulative\s+bonus)[^.\n]{0,400}?(\d{1,3})\s*%",
            0, 0, 50,
        ),
        (
            r"(\d{1,3})\s*%\s+(?:increase|bonus)\s+(?:in|of)\s+(?:Sum\s+Insured|SI)",
            re.IGNORECASE, 40, 30,
        ),
    )
    for pattern, flags, lead, tail in attempts:
        hit = re.search(pattern, text, flags)
        if hit is None:
            continue
        pct = int(hit.group(1))
        if not 5 <= pct <= 100:
            continue
        window = text[max(0, hit.start() - lead):hit.end() + tail]
        return pct, re.sub(r"\s+", " ", window).strip()[:220]
    return None, "NCB % not extracted; product may use booster/recharge structure"
def extract_restoration(text):
    """Extract the wording of a restoration-style benefit.

    Patterns are tried in order. On the first hit the matched text is
    whitespace-collapsed and returned twice: as the value (first 240
    characters) and as the quote (first 280 characters).
    Returns ``(None, explanation)`` when nothing matches.
    """
    patterns = (
        r"[Rr]estor[ae][^.\n]{0,300}",
        r"[Rr]echarge\s+of\s+[Ss]um\s+[Ii]nsured[^.\n]{0,300}",
        r"[Rr]efill[^.\n]{0,300}",
        r"[Rr]eset\s+[Bb]enefit[^.\n]{0,300}",
        r"[Rr]e[- ]?[Ii]nstatement[^.\n]{0,300}",
    )
    candidates = (re.search(pattern, text) for pattern in patterns)
    hit = next((found for found in candidates if found), None)
    if hit is None:
        return None, "Restoration benefit not found in extracted text"
    quote = re.sub(r"\s+", " ", hit.group(0)).strip()[:280]
    return quote[:240], quote
def extract_room_rent(text):
    """Extract the wording that describes room-rent capping.

    Patterns are ordered so that capping descriptions are tried before the
    generic room-rent match. On the first hit the matched text is
    whitespace-collapsed and returned as the value (first 200 characters)
    and the quote (first 240 characters).
    Returns ``(None, explanation)`` when nothing matches.
    """
    patterns = (
        r"[Nn]o\s+[Rr]oom\s+[Rr]ent\s+(?:[Cc]apping|[Ll]imit|[Ss]ub[- ]?[Ll]imit)[^.\n]{0,150}",
        r"[Rr]oom\s+[Rr]ent\s+No\s+Sub[- ]?Limit[^.\n]{0,100}",
        r"[Ss]ingle\s+[Pp]rivate\s+(?:AC\s+)?[Rr]oom[^.\n]{0,150}",
        r"[Rr]oom\s+[Rr]ent[^.\n]{0,200}?(?:up\s+to|maximum|capped\s+at|limit\s+of|sub[- ]?limit)\s*(?:Rs\.?|`|INR|\d+\s*%)[^.\n]{0,100}",
        r"[Rr]oom\s+[Cc]ategory[^.\n]{0,160}",
        r"[Rr]oom\s+[Rr]ent[^.\n]{0,160}?(\d+\s*%|Rs\.?\s*\d|`\s*\d|INR\s*\d)[^.\n]{0,80}",
    )
    candidates = (re.search(pattern, text) for pattern in patterns)
    hit = next((found for found in candidates if found), None)
    if hit is None:
        return None, "Room rent capping not extracted (only definition found, no explicit cap)"
    quote = re.sub(r"\s+", " ", hit.group(0)).strip()[:240]
    return quote[:200], quote
def extract_copayment(text):
    """Extract the co-payment percentage.

    Returns ``(pct, snippet)`` from the first matching pattern, where the
    snippet spans 30 characters before to 120 after the match. When no
    pattern matches, returns ``(0, explanation)``.
    """
    patterns = (
        r"[Cc]o[- ]?payment\s+of\s+(\d{1,2})\s*%",
        r"(\d{1,2})\s*%\s+[Cc]o[- ]?[Pp]ay",
    )
    for pattern in patterns:
        hit = re.search(pattern, text)
        if hit is None:
            continue
        window = text[max(0, hit.start() - 30):hit.end() + 120]
        snippet = re.sub(r"\s+", " ", window).strip()[:240]
        return int(hit.group(1)), snippet
    return 0, "No mandatory copay extracted; product may have age-based or zone-based optional copay"
def extract_deductible(text):
    """Extract the deductible amount in rupees.

    Returns:
        ``(amount, snippet)`` when a "deductible" mention is followed by an
        amount prefixed with "Rs", "INR" or the rupee sign;
        ``(None, snippet)`` when only an "aggregate deductible" mention is
        found; otherwise ``(None, explanation)``.
    """
    # The rupee sign is written as \u20B9 so the pattern cannot be broken
    # by a mis-decoded source file (it previously held mojibake and so
    # never matched amounts written with the real sign).
    m = re.search(
        r"[Dd]eductible[^.\n]{0,300}?(?:Rs\.?\s*|INR\s*|\u20B9\s*)(\d[\d,]{2,})",
        text,
    )
    if m:
        amt = int(m.group(1).replace(",", ""))
        ctx = re.sub(r"\s+", " ", text[max(0, m.start() - 30):m.end() + 120]).strip()[:240]
        return amt, ctx
    m = re.search(r"[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}", text)
    if m:
        ctx = re.sub(r"\s+", " ", text[m.start():m.end()]).strip()[:220]
        return None, ctx
    return None, "No base deductible (or only optional voluntary deductible add-on)"
def extract_cashless(text):
    """Report whether the text mentions cashless treatment.

    Returns ``(True, snippet)`` when "cashless" appears, otherwise
    ``(None, explanation)``.
    """
    if not re.search(r"[Cc]ashless", text):
        return None, "Cashless mention not found"
    _, snippet = find_context(text, r"[Cc]ashless[^.\n]{0,200}")
    return True, snippet
def extract_policy_type(text, policy_id=""):
    """Classify the policy as top-up, benefit, hospital-cash or indemnity.

    Markers in ``policy_id`` are checked first, then text heuristics, and
    finally indemnity is used as the default.

    Returns:
        ``(policy_type, quote_or_reason)``.
    """
    pid = policy_id.lower()
    # ID-based classification first (most reliable).
    top_up_markers = (
        "top-up", "supreme-enhance", "health-booster",
        "optima-enhance", "health-plus-top-up", "extra-care-plus",
    )
    if any(marker in pid for marker in top_up_markers):
        return "top-up", "Top-up / super top-up policy (per product name)"
    benefit_markers = ("cardiac-care", "cancer-care", "criti-medicare", "criti-care")
    if any(marker in pid for marker in benefit_markers):
        # The em dash here was previously mis-encoded in the source.
        return "benefit", "Specialty cardiac/critical illness — benefit-based lump-sum on diagnosis"
    if "hospital-cash" in pid or "daily-cash" in pid:
        return "hospital-cash", "Hospital cash / daily benefit policy"
    # Heuristics on the policy text.
    if re.search(
        r"[Ss]uper\s+[Tt]op[- ]?up|deductible[^.\n]{0,100}aggregate|[Aa]ggregate\s+[Dd]eductible[^.\n]{0,200}[Ss]um\s+[Ii]nsured",
        text,
    ):
        return "top-up", "Top-up / super top-up policy (kicks in above a deductible)"
    if re.search(r"[Hh]ospital\s+[Cc]ash|[Dd]aily\s+[Cc]ash\s+[Bb]enefit", text):
        return "hospital-cash", "Hospital cash / daily benefit policy"
    # Indemnity is the default; quote the wording when it is present.
    if re.search(
        r"[Ii]ndemnity|[Hh]ospitali[sz]ation\s+[Ee]xpenses?\s+[Ii]ndemnif|[Ii]ndemnif",
        text,
    ):
        m, ctx = find_context(text, r"[Ii]ndemnity|[Ii]ndemnif")
        return "indemnity", ctx or "Indemnity-based health insurance"
    return "indemnity", "Default indemnity (no explicit alternate type detected)"
# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------
def curate_one(entry):
    """Build the policy_facts record for one MANIFEST entry.

    Reads the entry's cached text file from CACHE, runs every field
    extractor on it and assembles the result.

    Returns:
        ``(record, completeness_pct)`` on success, or ``(None, reason)``
        when the cached text is missing or shorter than 1000 characters.
    """
    policy_id, policy_name, insurer, primary_pdf, txt_name, supporting = entry
    cache_path = os.path.join(CACHE, txt_name)
    if not os.path.exists(cache_path):
        return None, f"text cache missing: {txt_name}"
    with open(cache_path, "r", encoding="utf-8") as handle:
        text = handle.read()
    if len(text) < 1000:
        return None, f"too short ({len(text)} chars)"

    def sourced(value, quote, fallback, unit=None):
        # One PDF-sourced field: value, optional unit, source path and a
        # quote capped at 240 characters (fallback text when quote is empty).
        field = {"value": value}
        if unit is not None:
            field["unit"] = unit
        field["source_pdf_path"] = primary_pdf
        field["source_quote"] = (quote or fallback)[:240]
        return field

    uin, uin_quote = extract_uin(text)
    min_age, min_unit, min_quote = extract_min_entry_age(text)
    max_age, max_unit, max_quote = extract_max_entry_age(text)
    renewal_age, renewal_quote = extract_renewal_age(text)
    si_options, si_quote = extract_sum_insured_options(text)
    initial_days, initial_quote = extract_initial_waiting(text)
    ped_months, ped_quote = extract_ped_waiting(text)
    specific_months, specific_quote = extract_specific_disease_waiting(text)
    maternity_months, maternity_wait_quote = extract_maternity_waiting(text)
    pre_days, pre_quote = extract_pre_hosp_days(text)
    post_days, post_quote = extract_post_hosp_days(text)
    day_care, day_care_quote = extract_day_care_count(text)
    ayush, ayush_quote = extract_ayush(text)
    maternity, maternity_quote = extract_maternity(text)
    newborn, newborn_quote = extract_newborn(text)
    organ_donor, organ_donor_quote = extract_organ_donor(text)
    ncb, ncb_quote = extract_ncb(text)
    restoration, restoration_quote = extract_restoration(text)
    room_rent, room_rent_quote = extract_room_rent(text)
    copay, copay_quote = extract_copayment(text)
    deductible, deductible_quote = extract_deductible(text)
    cashless, cashless_quote = extract_cashless(text)
    policy_type, policy_type_quote = extract_policy_type(text, policy_id)

    # Key order below is the order in which fields are written to the JSON.
    record = {
        "policy_id": policy_id,
        "policy_name": policy_name,
        "insurer_slug": insurer,
        "uin_code": sourced(uin, uin_quote, ""),
        "min_entry_age": sourced(
            min_age, min_quote, "Not explicitly stated; per Policy Schedule",
            unit=min_unit or "days"),
        "max_entry_age": sourced(
            max_age, max_quote, "Not explicitly stated; per Policy Schedule",
            unit=max_unit or "years"),
        "max_renewal_age": sourced(renewal_age, renewal_quote, "Not specified"),
        "sum_insured_options": sourced(
            si_options, si_quote, "Per Policy Schedule", unit="INR"),
        "initial_waiting_period_days": sourced(
            initial_days, initial_quote, "Per IRDAI standard"),
        "pre_existing_disease_waiting_months": sourced(
            ped_months, ped_quote, "PED waiting per policy wording"),
        "specific_disease_waiting_months": sourced(
            specific_months, specific_quote,
            "Specific disease waiting per IRDAI standard"),
        "maternity_waiting_months": sourced(
            maternity_months, maternity_wait_quote,
            "Maternity waiting only applies if maternity covered/opted"),
        "pre_hospitalization_days": sourced(
            pre_days, pre_quote, "Pre-hosp days per Policy Schedule"),
        "post_hospitalization_days": sourced(
            post_days, post_quote, "Post-hosp days per Policy Schedule"),
        "day_care_treatments_count": sourced(
            day_care, day_care_quote,
            "Day care covered per definition; count not enumerated"),
        "ayush_coverage": sourced(
            ayush, ayush_quote, "AYUSH cover not explicitly found"),
        "maternity_coverage": sourced(
            maternity, maternity_quote,
            "Maternity status not explicitly extracted"),
        "newborn_coverage": sourced(
            newborn, newborn_quote, "Newborn cover status not extracted"),
        "organ_donor_expenses": sourced(
            organ_donor, organ_donor_quote, "Organ donor benefit not extracted"),
        "no_claim_bonus_pct": sourced(ncb, ncb_quote, "NCB % not extracted"),
        "restoration_benefit": sourced(
            restoration, restoration_quote,
            "Restoration not found in extracted text"),
        "room_rent_capping": sourced(
            room_rent, room_rent_quote, "Room rent capping not extracted"),
        "copayment_pct": sourced(copay, copay_quote, "No mandatory copay"),
        "deductible_amount": sourced(
            deductible, deductible_quote, "No base deductible"),
        "network_hospital_count": {
            "value": None,
            "source_url": None,
            "source_quote": "Insurer-level metric; not extracted in this curation pass",
        },
        # A missing cashless mention is recorded as True.
        "cashless_treatment_supported": sourced(
            cashless if cashless is not None else True,
            cashless_quote, "Cashless implicit via insurer network"),
        "claim_settlement_ratio": {
            "value": None,
            "source_url": None,
            "source_quote": "Insurer-level metric (IRDAI Annual Report); not extracted",
        },
        "tat_cashless_authorization_hours": {
            "value": None,
            "source_pdf_path": None,
            "source_quote": "TAT not specified in policy wording; governed by IRDAI Master Circular",
        },
        "policy_type": sourced(
            policy_type, policy_type_quote,
            "Policy type inferred from product structure"),
    }

    # Completeness: share of these fields whose value is neither None nor "".
    scored_fields = (
        "uin_code", "min_entry_age", "max_entry_age", "sum_insured_options",
        "initial_waiting_period_days", "pre_existing_disease_waiting_months",
        "specific_disease_waiting_months", "pre_hospitalization_days",
        "post_hospitalization_days", "day_care_treatments_count",
        "ayush_coverage", "maternity_coverage", "newborn_coverage",
        "organ_donor_expenses", "no_claim_bonus_pct", "restoration_benefit",
        "room_rent_capping", "copayment_pct", "policy_type",
        "cashless_treatment_supported",
    )
    populated = [
        name for name in scored_fields
        if record[name]["value"] not in (None, "")
    ]
    completeness = int(round(len(populated) / len(scored_fields) * 100))
    record["_meta"] = {
        "curated_at": "2026-05-14",
        "primary_source_pdf": primary_pdf,
        "supporting_source_pdfs": supporting,
        "completeness_pct": completeness,
        "notes": "Pattern-based extraction from local PDF via pdfplumber. Insurer-level metrics (CSR, network count) left null pending downstream backfill.",
    }
    return record, completeness
def main():
    """Curate every MANIFEST entry and write one JSON file per policy.

    Entries whose output file already exists are left untouched. Entries
    that cannot be curated, or whose completeness is below 50%, are
    recorded as skipped and not written.

    Returns:
        ``(results, skipped)``: ``results`` is a list of
        ``(policy_id, completeness_pct)`` for files written; ``skipped`` is
        a list of ``(policy_id, reason)``.
    """
    results = []
    skipped = []
    total = len(MANIFEST)
    for i, entry in enumerate(MANIFEST, 1):
        policy_id = entry[0]
        tag = f"[{i}/{total}] {policy_id}"
        out_path = os.path.join(OUT_DIR, f"{policy_id}.json")
        # Never overwrite a file that has already been curated.
        if os.path.exists(out_path):
            print(f"{tag}: EXISTS — skipping")
            continue
        # ``outcome`` is a reason string when ``record`` is None, otherwise
        # the completeness percentage.
        record, outcome = curate_one(entry)
        if record is None:
            print(f"{tag}: SKIP — {outcome}")
            skipped.append((policy_id, outcome))
            continue
        if outcome < 50:
            print(f"{tag}: LOW {outcome}% — skipping (below threshold)")
            skipped.append((policy_id, f"low completeness {outcome}%"))
            continue
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(record, f, indent=2, ensure_ascii=False)
        print(f"{tag}: {outcome}%")
        results.append((policy_id, outcome))
    print()
    print(f"Wrote {len(results)} JSONs.")
    print(f"Skipped {len(skipped)}.")
    if results:
        avg = sum(p for _, p in results) / len(results)
        print(f"Average completeness: {avg:.1f}%")
    return results, skipped
if __name__ == "__main__":
main()