Spaces:
Sleeping
Sleeping
| """Canonical Pydantic schema for an Indian health insurance policy. | |
| This schema is grounded in the IRDAI Customer Information Sheet (CIS) format | |
| mandated by the "Health Insurance Standardisation" guidelines and IRDAI master | |
| circular on health insurance products. Field choices mirror the disclosures | |
| insurers are legally required to publish, supplemented by the comparison | |
| dimensions used by aggregators (PolicyBazaar, InsuranceDekho, Acko) so that the | |
| extracted records support both regulator-grade comparison and consumer-grade | |
| filtering. | |
| Design principles | |
| ----------------- | |
| 1. Most fields are Optional[...] because PDF extraction is lossy; the | |
| `extraction_confidence_pct` field captures uncertainty. | |
| 2. Enums are used only where the value set is bounded by regulation | |
| (policy type, geographic scope, premium mode). Free-text fields stay as | |
| `str` to avoid over-constraining the extractor. | |
| 3. The schema is forward-compatible: `Config.extra = "allow"` keeps unknown | |
| keys; v2 additions for Life / Motor / Travel can layer category-specific | |
| optional fields without breaking existing consumers. | |
| 4. Money is stored as INR integers (paise precision is not needed for | |
| policy-level comparison). Percentages are stored as floats in 0..100, not | |
| 0..1, to match how policy documents present them. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from datetime import date | |
| from enum import Enum | |
| from typing import Any, Dict, List, Optional, Union | |
| from pydantic import BaseModel, Field, field_validator | |
| # --------------------------------------------------------------------------- | |
| # Normalisation helpers — let the LLM emit natural-language variants while | |
| # we still store a clean enum value. The extractor frequently produces | |
| # "family floater" (with space) or "self+spouse+children" (kids spelled as | |
| # children) — these are semantically correct, just lexically off from our | |
| # canonical enum string. Without these normalisers the strict Pydantic | |
| # validator rejects ~40% of otherwise-good NIM V4-Pro extractions. | |
| # --------------------------------------------------------------------------- | |
| def _norm_token(s: str) -> str: | |
| """Lower, trim, collapse whitespace, replace spaces+hyphens with underscore.""" | |
| s = (s or "").strip().lower() | |
| s = re.sub(r"[\s\-]+", "_", s) | |
| return s | |
| _FAMILY_SYNONYMS = { | |
| "children": "kids", | |
| "child": "kids", | |
| "kid": "kids", | |
| "spouse_and_children": "self+spouse+kids", | |
| "spouse_and_kids": "self+spouse+kids", | |
| } | |
| def _norm_family(v: str) -> str: | |
| """Map common LLM variants to canonical FamilyComposition values.""" | |
| s = _norm_token(v).replace("_", "+") | |
| parts = [] | |
| for p in s.split("+"): | |
| parts.append(_FAMILY_SYNONYMS.get(p, p)) | |
| return "+".join(parts) | |
| _GEO_SYNONYMS = { | |
| "india": "pan_india", | |
| "india_only": "pan_india", | |
| "pan_india_only": "pan_india", | |
| "domestic": "pan_india", | |
| "domestic_india": "pan_india", | |
| "global": "worldwide", | |
| "global_ex_usa_canada": "worldwide_ex_usa_canada", | |
| "worldwide_excluding_usa_canada": "worldwide_ex_usa_canada", | |
| } | |
| def _norm_geo(v: str) -> str: | |
| """Map common LLM variants to canonical GeographicScope values. Falls back | |
| to pan_india if the value isn't recognised β most Indian policies are | |
| pan-India by default; honestly recording unknown as 'pan_india' is safer | |
| than rejecting the whole extraction.""" | |
| s = _norm_token(v) | |
| s = _GEO_SYNONYMS.get(s, s) | |
| return s if s in _VALID_GEO_SCOPES else "pan_india" | |
# Non-canonical policy-type spellings (in `_norm_token` form) mapped to the
# canonical PolicyType value they should be stored as.
_POLICY_TYPE_SYNONYMS = {
    "individual_family_floater": "family_floater",
    "individual_or_family_floater": "family_floater",
    "indemnity": "individual",
    "fixed_benefit": "other",
    "hospital_cash": "other",
    "personal_accident_insurance": "personal_accident",
    "pa": "personal_accident",
    "ci": "critical_illness",
    "cancer": "critical_illness",
    "diabetes": "disease_specific",
    "specific_disease": "disease_specific",
}

# String values accepted by the PolicyType enum.
_VALID_POLICY_TYPES = {
    "individual", "family_floater", "senior_citizen", "critical_illness",
    "top_up", "super_top_up", "disease_specific", "group",
    "personal_accident", "other",
}

# Separators the extractor uses when it emits a compound value such as
# 'individual / family floater'.
_POLICY_TYPE_SEPARATOR_RE = re.compile(r"[/,|;]")


def _norm_policy_type_val(v: str) -> str:
    """Map common LLM variants to canonical PolicyType values.

    Compound values separated by ``/``, ``,``, ``|`` or ``;`` are reduced to
    their first non-empty segment, whichever separator comes first in the
    string. The segment is normalised with `_norm_token`, passed through
    `_POLICY_TYPE_SYNONYMS`, and finally checked against
    `_VALID_POLICY_TYPES`.

    Args:
        v: Raw policy-type string from the extractor.

    Returns:
        A member of `_VALID_POLICY_TYPES`; ``"other"`` for anything that
        cannot be pinned to a known type (better to record 'other' than to
        reject the whole policy).
    """
    segments = _POLICY_TYPE_SEPARATOR_RE.split(v)
    # Skip blank leading segments so '/ individual' still resolves.
    first_segment = next((seg for seg in segments if seg.strip()), "")
    token = _norm_token(first_segment)
    token = _POLICY_TYPE_SYNONYMS.get(token, token)
    return token if token in _VALID_POLICY_TYPES else "other"
| _VALID_GEO_SCOPES = {"pan_india", "regional", "worldwide", "worldwide_ex_usa_canada"} | |
| # --------------------------------------------------------------------------- | |
| # Enumerations (bounded value sets) | |
| # --------------------------------------------------------------------------- | |
class PolicyType(str, Enum):
    """Health-insurance product category, following IRDAI product filings.

    Members are also `str` instances, so they compare equal to their raw
    string values.
    """

    # Base indemnity products.
    INDIVIDUAL = "individual"
    FAMILY_FLOATER = "family_floater"
    SENIOR_CITIZEN = "senior_citizen"
    CRITICAL_ILLNESS = "critical_illness"
    # Deductible-based products.
    TOP_UP = "top_up"
    SUPER_TOP_UP = "super_top_up"
    # Everything else.
    DISEASE_SPECIFIC = "disease_specific"
    GROUP = "group"
    PERSONAL_ACCIDENT = "personal_accident"
    OTHER = "other"
class GeographicScope(str, Enum):
    """Territory within which hospitalization claims are covered.

    Members are also `str` instances, so they compare equal to their raw
    string values.
    """

    PAN_INDIA = "pan_india"
    REGIONAL = "regional"
    WORLDWIDE = "worldwide"
    WORLDWIDE_EX_USA_CANADA = "worldwide_ex_usa_canada"
class PremiumMode(str, Enum):
    """How often the insurer allows the premium to be paid.

    Members are also `str` instances, so they compare equal to their raw
    string values.
    """

    MONTHLY = "monthly"
    QUARTERLY = "quarterly"
    HALF_YEARLY = "half_yearly"
    ANNUAL = "annual"
    SINGLE = "single"
class FamilyComposition(str, Enum):
    """Family make-up that a single policy contract may cover.

    Explicit member lists are joined with ``+``; the last two values are
    whole-word labels rather than member lists.
    """

    # Explicit member lists.
    SELF = "self"
    SELF_SPOUSE = "self+spouse"
    SELF_SPOUSE_KIDS = "self+spouse+kids"
    SELF_SPOUSE_KIDS_PARENTS = "self+spouse+kids+parents"
    # Whole-word labels.
    MULTI_GENERATION = "multi_generation"
    CUSTOM = "custom"
| # --------------------------------------------------------------------------- | |
| # Composite value objects | |
| # --------------------------------------------------------------------------- | |
class CoverageItem(BaseModel):
    """Reusable shape for a boolean-with-detail benefit.

    Many CIS rows are 'Yes, with limit X subject to Y waiting period'. Storing
    this as `{covered, limit_inr, limit_text, notes}` preserves comparability
    while keeping the original CIS wording for citation in the voice answer.

    All four fields are optional and default to ``None``.
    """

    covered: Optional[bool] = Field(None, description="True if the benefit is included.")
    limit_inr: Optional[int] = Field(None, description="Numeric monetary limit in INR, if any.")
    # The rupee sign below was mis-encoded in the description; restored to U+20B9.
    limit_text: Optional[str] = Field(
        None,
        description="Verbatim limit text from the policy (e.g. '1% of SI per day' or 'up to ₹50,000').",
    )
    notes: Optional[str] = Field(None, description="Conditions, sub-clauses, or carve-outs.")
| # --------------------------------------------------------------------------- | |
| # Main schema | |
| # --------------------------------------------------------------------------- | |
class HealthPolicy(BaseModel):
    """Canonical record for one Indian health insurance policy variant.

    One row == one (insurer, policy_name, variant) tuple. Different sum-insured
    options live in `sum_insured_options_inr`, not as separate rows, because
    the underlying contract is the same.

    Extensibility: adding Life / Motor / Travel in v2 is done by introducing
    sibling models (`LifePolicy`, `MotorPolicy`) that share Identity + Source
    fields. Do NOT mutate or remove existing fields here; downstream extractors
    and the RAG store depend on stable keys.

    Four fields carry a ``mode="before"`` validator that cleans up raw
    extractor output before the strict type check runs: `policy_type`,
    `family_composition_allowed`, `premium_payment_term_years` and
    `geographic_coverage`.
    """

    # === 1. Identity & metadata =============================================
    policy_id: str = Field(
        ...,
        description="Slug we mint, e.g. 'niva-bupa-reassure-2'. Stable primary key.",
    )
    insurer_name: str = Field(..., description="Legal insurer name, e.g. 'Niva Bupa Health Insurance Co. Ltd.'")
    insurer_slug: str = Field(..., description="Slug for insurer, e.g. 'niva-bupa'.")
    policy_name: str = Field(..., description="Marketing name of the policy, e.g. 'Reassure 2.0'.")
    policy_type: Optional[PolicyType] = Field(None, description="Product category per IRDAI filing.")

    @field_validator("policy_type", mode="before")
    @classmethod
    def _norm_policy_type(cls, v: Any) -> Any:
        """Normalise a raw policy-type string; pass non-strings through unchanged."""
        if isinstance(v, str):
            return _norm_policy_type_val(v)
        return v

    uin_code: Optional[str] = Field(
        None,
        description="IRDAI Unique Identification Number (UIN) — regulator-issued, "
        "e.g. 'NBHHLIP23068V012223'. Required for any IRDAI cross-reference.",
    )

    # === 2. Eligibility =====================================================
    min_entry_age_years: Optional[int] = Field(None, description="Minimum entry age for adult insured.")
    max_entry_age_years: Optional[int] = Field(None, description="Maximum entry age (often 65).")
    max_renewal_age_years: Optional[int] = Field(
        None,
        description="Maximum renewal age. IRDAI now mandates lifelong renewability; encode as 999 for 'lifelong'.",
    )
    min_child_entry_age_days: Optional[int] = Field(
        None,
        description="Minimum entry age for dependent children in days (e.g. 91 for '91 days').",
    )
    family_composition_allowed: Optional[List[FamilyComposition]] = Field(
        None, description="Composition options the contract supports."
    )

    @field_validator("family_composition_allowed", mode="before")
    @classmethod
    def _norm_family_list(cls, v: Any) -> Any:
        """Normalise every string entry of a list; leave other input untouched."""
        if isinstance(v, list):
            return [_norm_family(x) if isinstance(x, str) else x for x in v]
        return v

    residency_requirement: Optional[str] = Field(
        None, description="e.g. 'Indian resident only', 'NRI eligible with conditions'."
    )

    # === 3. Sum insured & premium structure =================================
    sum_insured_options_inr: Optional[List[int]] = Field(
        None,
        description="All sum insured tiers offered, in INR. e.g. [500000, 1000000, 2500000, 5000000, 10000000].",
    )
    premium_payment_modes: Optional[List[PremiumMode]] = Field(
        None, description="Allowed billing frequencies."
    )
    premium_range_indicative_inr: Optional[Dict[str, int]] = Field(
        None,
        description="Illustrative annual premium for a benchmark profile, keyed by age band. "
        "e.g. {'30-35_SI_10L': 12000, '50-55_SI_10L': 28000}. Filled from public quote pages, not the PDF.",
    )
    premium_payment_term_years: Optional[int] = Field(
        None, description="Years over which premium must be paid (usually 1 for non-life)."
    )

    @field_validator("premium_payment_term_years", mode="before")
    @classmethod
    def _coerce_premium_term(cls, v: Any) -> Any:
        """Collapse a list of term options to a single term.

        A list such as ``[1, 2, 3]`` is reduced to its first entry, which is
        treated as the default term for downstream consumers. An empty list,
        or a first entry that cannot be converted to ``int``, becomes
        ``None``. Non-list input is returned unchanged.
        """
        if isinstance(v, list):
            if not v:
                return None
            try:
                return int(v[0])
            except (TypeError, ValueError):
                return None
        return v

    grace_period_days: Optional[int] = Field(
        None, description="Days past renewal date during which the policy stays continuously covered."
    )
    free_look_period_days: Optional[int] = Field(
        None, description="Days the buyer can cancel for a full refund (IRDAI mandates 30 for digital sales)."
    )

    # === 4. Waiting periods (CRITICAL for comparison) =======================
    initial_waiting_period_days: Optional[int] = Field(
        None, description="Days from inception before any non-accident claim is payable (typically 30)."
    )
    pre_existing_disease_waiting_months: Optional[int] = Field(
        None,
        description="Months before pre-existing conditions are covered. "
        "Industry range: 12 / 24 / 36 / 48. Lower is better for buyers.",
    )
    specific_disease_waiting_months: Optional[int] = Field(
        None,
        description="Months before listed conditions (cataract, hernia, joint replacement etc.) are covered.",
    )
    specific_diseases_listed: Optional[List[str]] = Field(
        None, description="The actual list of named conditions under the specific-disease waiting period."
    )
    maternity_waiting_months: Optional[int] = Field(
        None, description="Months before maternity benefit kicks in (commonly 24-48 if maternity is included)."
    )
    sub_limits_waiting_notes: Optional[str] = Field(
        None, description="Any other waiting-period sub-clauses not captured above."
    )

    # === 5. Coverage scope ==================================================
    inpatient_hospitalization: Optional[CoverageItem] = Field(
        None, description="Core in-hospital cover. Almost always covered; details capture room/ICU caps."
    )
    pre_hospitalization_days: Optional[int] = Field(
        None, description="Days of pre-admission expenses covered (commonly 30 / 60)."
    )
    post_hospitalization_days: Optional[int] = Field(
        None, description="Days of post-discharge expenses covered (commonly 60 / 90 / 180)."
    )
    day_care_treatments: Optional[CoverageItem] = Field(
        None, description="Procedures under 24 hrs. Encode count or 'all listed' in limit_text."
    )
    domiciliary_treatment: Optional[CoverageItem] = Field(
        None, description="Home treatment when hospitalization is not possible."
    )
    ayush_coverage: Optional[CoverageItem] = Field(
        None, description="Ayurveda / Yoga / Unani / Siddha / Homeopathy in-patient cover."
    )
    maternity_coverage: Optional[CoverageItem] = Field(
        None, description="Normal and C-section delivery cover, with limit and waiting period."
    )
    newborn_coverage: Optional[CoverageItem] = Field(
        None, description="Newborn from day 1, sometimes contingent on maternity benefit."
    )
    organ_donor_expenses: Optional[CoverageItem] = Field(
        None, description="Donor's hospitalization expenses for a covered organ transplant."
    )
    ambulance_cover: Optional[CoverageItem] = Field(
        None, description="Road / air ambulance, with per-claim limit."
    )
    critical_illness_cover: Optional[CoverageItem] = Field(
        None,
        description="Lump-sum on diagnosis of listed CIs. Use limit_text to record the count "
        "(e.g. 'Covers 32 critical illnesses').",
    )
    restoration_benefit: Optional[CoverageItem] = Field(
        None,
        description="Sum-insured refill once exhausted. Capture 'unlimited / once / unrelated illness only' in notes.",
    )
    no_claim_bonus_pct: Optional[float] = Field(
        None,
        description="Annual SI step-up percentage on claim-free renewal (e.g. 50.0 for 50%). "
        "Cap is recorded in `no_claim_bonus_cap_pct`.",
    )
    no_claim_bonus_cap_pct: Optional[float] = Field(
        None, description="Maximum cumulative NCB as % of base SI (e.g. 100 means up to 2x base SI)."
    )
    preventive_health_checkup: Optional[CoverageItem] = Field(
        None, description="Free check-up frequency and limit (commonly annual or every 2 years)."
    )

    # === 6. Sub-limits & caps (what's NOT fully covered) ====================
    room_rent_capping: Optional[str] = Field(
        None,
        description="Room-rent restriction text, e.g. '1% of SI per day', 'Single private AC room', 'No cap'.",
    )
    icu_capping: Optional[str] = Field(
        None, description="ICU rent restriction text. 'No cap' for premium plans."
    )
    copayment_pct: Optional[float] = Field(
        None,
        description="Mandatory % the insured pays from each claim. "
        "Often age-triggered (e.g. 20% for entry age > 60). Use 0 if none.",
    )
    copayment_trigger_notes: Optional[str] = Field(
        None, description="When the copay applies: age band, zone, voluntary discount, etc."
    )
    disease_wise_sub_limits: Optional[Dict[str, str]] = Field(
        None,
        description="Per-procedure caps, e.g. {'cataract': '₹25,000 per eye', 'knee_replacement': '₹1,60,000'}.",
    )
    deductible_amount_inr: Optional[int] = Field(
        None,
        description="Aggregate deductible in INR before the policy pays. "
        "Non-zero for top-up / super top-up plans; usually 0 for base indemnity.",
    )

    # === 7. Geography & network =============================================
    geographic_coverage: Optional[GeographicScope] = Field(
        None, description="Territorial scope for non-emergency claims."
    )

    @field_validator("geographic_coverage", mode="before")
    @classmethod
    def _norm_geo_coverage(cls, v: Any) -> Any:
        """Unwrap and normalise a raw geographic-scope value.

        V4-Pro sometimes emits ``{'scope': 'India'}`` (a dict) instead of a
        plain string. For a dict, the first truthy value among the keys
        ``scope``, ``value``, ``name``, ``type`` is used; if none is present
        the field becomes ``None``. A resulting string is run through
        `_norm_geo`; anything else is returned unchanged.
        """
        if isinstance(v, dict):
            for key in ("scope", "value", "name", "type"):
                if v.get(key):
                    v = v[key]
                    break
            else:
                return None
        if isinstance(v, str):
            return _norm_geo(v)
        return v

    worldwide_emergency_cover: Optional[CoverageItem] = Field(
        None, description="Cover for emergencies abroad — distinct from full international cover."
    )
    network_hospital_count: Optional[int] = Field(
        None, description="Approximate empanelled cashless hospital count published by the insurer."
    )
    cashless_treatment_supported: Optional[bool] = Field(
        None, description="Whether cashless is offered (essentially always True for indemnity products)."
    )

    # === 8. Exclusions ======================================================
    permanent_exclusions: Optional[List[str]] = Field(
        None,
        description="Never-covered items (cosmetic surgery, self-inflicted injury, war, etc.). "
        "IRDAI mandates a standardised list since 2020.",
    )
    temporary_exclusions: Optional[List[str]] = Field(
        None,
        description="Time-bound exclusions; usually mirrors specific_diseases_listed but kept separate "
        "because some policies list explicit time-bound carve-outs (e.g. 'bariatric surgery: 36 months').",
    )
    notable_exclusions_summary: Optional[str] = Field(
        None,
        description="One-paragraph human summary of the most consumer-relevant exclusions. "
        "Used directly in the voice answer when a buyer asks 'what's NOT covered'.",
    )

    # === 9. Claim & service =================================================
    claim_settlement_ratio_pct: Optional[float] = Field(
        None,
        description="Last published IRDAI claim settlement ratio (%) for the INSURER (not the policy). "
        "Sourced from IRDAI's annual report, NOT the policy PDF.",
    )
    claim_process_summary: Optional[str] = Field(
        None,
        description="One paragraph: how to file cashless and reimbursement, including helpline / portal.",
    )
    tat_cashless_authorization_hours: Optional[float] = Field(
        None,
        description="Turn-around time for pre-authorization decisions, in hours. IRDAI 2024 mandate: 1 hour.",
    )

    # === 10. Riders / optional add-ons ======================================
    available_riders: Optional[List[str]] = Field(
        None,
        description="Names of optional add-on covers, e.g. ['Personal Accident', 'Critical Illness', "
        "'Hospital Cash', 'OPD Cover'].",
    )
    top_rider_examples: Optional[List[str]] = Field(
        None, description="Subset of riders most relevant for a typical buyer — used in the voice pitch."
    )
    rider_premium_indicative_inr: Optional[Dict[str, int]] = Field(
        None,
        description="Indicative annual rider cost, keyed by rider name. Sourced from public quotes.",
    )

    # === 11. Source metadata ================================================
    source_pdf_path: Optional[str] = Field(
        None, description="Local filesystem path to the policy wordings PDF used for extraction."
    )
    source_pdf_url: Optional[str] = Field(
        None, description="Canonical insurer-hosted URL for the policy wordings PDF."
    )
    last_updated_date: Optional[date] = Field(
        None, description="Date this record was last extracted or human-reviewed."
    )
    extraction_confidence_pct: Optional[float] = Field(
        None,
        ge=0,
        le=100,
        description="Self-reported confidence (0-100) from the extraction pipeline. "
        "Records below ~70 should be flagged for human review before being served to users.",
    )

    # -----------------------------------------------------------------------
    # NOTE(review): Pydantic v2 prefers `model_config` over a nested `Config`
    # class; kept as-is here so the configuration keys stay where consumers
    # and the module docstring expect them.
    class Config:
        extra = "allow"  # Forward-compatibility: keep unknown keys for v2 expansion.
        use_enum_values = True  # Serialize enums as their string values for JSON store.
        validate_assignment = True