# InsuranceBot / backend/sum_insured.py
# Last commit b87bd2d — recovery: integrate stalled-session work + de-stale/cleanup (pytest 215 green) [build-fix]
"""Deterministic Sum-Insured rationalisation — SINGLE SOURCE OF TRUTH.
The curated `40-data/policy_facts/<id>.json` layer stores every Sum Insured
as a wrapped fact: `{"value": [..], "source_quote": "...", ...}`. The raw
`value` list is the LLM/curation extraction; it can include numbers that the
field's own `source_quote` does NOT actually support (extraction bleed — a
co-pay table figure, a valuables-cover sub-limit, a daily-cash amount, a
premium column read as an SI tier).
This module is the deterministic, no-LLM, no-fabrication gate:
1. corroborated_values(quote) — the set of INR amounts the source_quote
genuinely states, interpreted at the quote's own stated scale
(lakh|lac|L → ×1e5, crore|cr → ×1e7, Indian digit grouping collapsed).
2. corroborate(values, quote) — keep ONLY the policy's listed values that
appear (verbatim, at the quote's scale) in that quote. Nothing is
invented: tokens can only CONFIRM a value the policy already lists.
3. classify(kept, quote) — decide if the corroborated set is a
continuous BAND ("₹X – ₹Y") or discrete TIERS ("₹25 L / ₹50 L / …").
It is intentionally dependency-free (no FastAPI / Chroma import) so the
marketplace serializer in main.py can import it without an import cycle, the
same pattern as backend/policy_identity.py.
Public API:
corroborated_values(source_quote: str) -> set[int]
corroborate(values: list[int], source_quote: str) -> list[int]
classify(kept: list[int], source_quote: str) -> ("band"|"tiers"|"none")
rationalise(values, source_quote) -> SumInsuredView (one-call helper)
"""
from __future__ import annotations
import re
from dataclasses import dataclass
# Names exported by a star-import of this module. The underscore-prefixed
# helper and tuning constants defined below are deliberately not listed.
__all__ = [
    "corroborated_values",
    "corroborate",
    "classify",
    "rationalise",
    "SumInsuredView",
]
# Range/band language in the field's own source_quote. classify() tests each
# term as a plain substring of the lower-cased quote, so " to " only matches
# when it is surrounded by single spaces. Presence of any of these (AND a wide
# corroborated min→max) is the heuristic for a genuine continuous band rather
# than a discrete plan ladder.
_BAND_TERMS: tuple[str, ...] = (
    "ranging from",
    " to ",
    "range",
    "minimum",
    "maximum",
    "in multiples of",
    "in multiple of",
)
# A corroborated set is treated as a continuous band only when it also spans
# a materially wide spread — guards against e.g. "₹3 L to ₹5 L room rent"
# language on a 2-tier plan being rendered as a continuous band. Both
# thresholds below must hold at the same time.
_BAND_MIN_ABS_SPREAD = 900_000  # max − min ≥ ₹9 L
_BAND_MIN_RATIO = 3  # max / min ≥ 3× (min is floored at 1 before dividing)
# Smallest plausible standalone full-rupee SI figure in a quote. Below this a
# bare integer is never recorded at face value (page numbers, counts); it can
# still be recorded scaled by ₹1 L / ₹1 Cr when the quote invokes that unit.
_MIN_RUPEE_TOKEN = 10_000
def _collapse_indian_grouping(s: str) -> str:
"""Join Indian-format digit grouping (1,00,000 / 5,00,000 / ₹5, 00,000)
into a single integer so the value matches its policy-list form.
A list separator (`0.5, 1, 1.5`) is NOT joined: the left operand of a
grouping comma never carries a decimal point, and a grouping tail is
exactly 2 or 3 digits terminated by a non-digit. Comma-then-space is a
grouping form too (real quotes contain `Rs5, 00,000`), so we accept an
optional single space after the comma.
"""
pat = re.compile(r"(?<![\d.])(\d{1,3})\s*,\s?(\d{2,3})(?=\D|$)")
prev = None
while prev != s:
prev = s
s = pat.sub(lambda mo: mo.group(1) + mo.group(2), s)
# Explicit full 3-group form (handled by the iterative pass above too,
# kept for clarity / belt-and-braces).
s = re.sub(
r"(?<![\d.])(\d{1,2}),\s?(\d{2}),\s?(\d{3})(?=\D|$)",
lambda mo: mo.group(1) + mo.group(2) + mo.group(3),
s,
)
return s
def corroborated_values(source_quote: str | None) -> set[int]:
    """Return the set of INR Sum-Insured amounts the source_quote states.

    Deterministic. Per the D3 normalisation rule: lakh|lac|L → ×1e5,
    crore|cr → ×1e7, commas/spaces stripped from digit groups. A bare number
    is interpreted at every scale the quote explicitly invokes:
      • full rupee value when it is a large standalone integer (≥ ₹10k);
      • × ₹1 L when the quote anywhere says lakh/lac/L (covers both inline
        "₹5 Lakh" AND a "(Rs. in Lakhs) 3.00 5.00 10.00" header list);
      • × ₹1 Cr when the quote anywhere says crore/cr.

    No fabrication: this only enumerates numbers that physically appear in
    the quote; corroborate() then intersects with the policy's own list.

    Args:
        source_quote: The field's supporting quote; None or "" yields an
            empty set.

    Returns:
        Every candidate rupee amount derived from the numbers in the quote.
    """
    if not source_quote:
        return set()

    # Normalise: lower-case and blank out currency markers so they never sit
    # between a number and its unit. (The backtick is presumably an OCR
    # stand-in for the rupee sign — TODO confirm.)
    raw = source_quote.lower()
    for ch in ("₹", "`"):
        raw = raw.replace(ch, " ")
    raw = re.sub(r"\brs\.?\b", " ", raw)
    raw = raw.replace("inr", " ")

    glued_lakh: set[int] = set()
    glued_crore: set[int] = set()
    # Glued unit notation: 3l / 10l / 7.5l / 1cr / 50lacs / ₹5 Lakh
    for mo in re.finditer(
        r"(?<![\d.])(\d+(?:\.\d+)?)\s*(lakhs?|lacs?|l|crores?|cr)(?![a-z])", raw
    ):
        amount = float(mo.group(1))
        # "cr" and "crore(s)" both start with "cr"; every other unit is lakh.
        if mo.group(2).startswith("cr"):
            glued_crore.add(int(round(amount * 1e7)))
        else:
            glued_lakh.add(int(round(amount * 1e5)))

    # Does the quote invoke the lakh / crore scale anywhere? "lac(s)" must
    # stand on its own (not inside a longer alphabetic word): a bare `lacs?`
    # also matches "place", "replacement", "lack", which would wrongly scale
    # every bare number by ₹1 L in a quote that never mentions lakh.
    has_lakh = bool(glued_lakh) or re.search(
        r"lakhs?|(?<![a-z])lacs?(?![a-z])|\(in l|in lakh", raw
    ) is not None
    has_crore = bool(glued_crore) or re.search(r"crores?|\bcr\b", raw) is not None

    collapsed = _collapse_indian_grouping(raw)
    out: set[int] = set(glued_lakh) | set(glued_crore)
    for mo in re.finditer(r"(?<![\d.])(\d+(?:\.\d+)?)(?![\d.])", collapsed):
        f = float(mo.group(1))
        # Face value only for whole numbers big enough to be an SI figure.
        if f >= _MIN_RUPEE_TOKEN and f == int(f):
            out.add(int(f))
        if has_lakh:
            out.add(int(round(f * 1e5)))
        if has_crore:
            out.add(int(round(f * 1e7)))
    return out
def corroborate(values, source_quote: str | None) -> list[int]:
    """Keep ONLY the policy-listed `values` that the `source_quote` states.

    Args:
        values: The policy's listed Sum-Insured amounts. Entries that cannot
            be converted with int() are skipped.
        source_quote: The field's own supporting quote (may be None).

    Returns:
        The corroborated amounts as sorted, de-duplicated ints. Empty list
        when nothing corroborates (→ caller renders "As per policy
        schedule").
    """
    if not values:
        return []
    stated = corroborated_values(source_quote)
    if not stated:
        return []

    listed: set[int] = set()
    for v in values:
        try:
            listed.add(int(v))
        except (TypeError, ValueError, OverflowError):
            # None / non-numeric text / NaN raise TypeError or ValueError;
            # an infinite float raises OverflowError. None of them is a
            # usable amount, so skip the entry instead of failing the gate.
            continue
    return sorted(listed & stated)
def classify(kept, source_quote: str | None) -> str:
    """Label an already-corroborated value set as 'band', 'tiers' or 'none'.

    'none'  — `kept` is empty or None.
    'band'  — the quote contains one of the _BAND_TERMS, there are at least
              two distinct amounts, and max − min and max / min both reach
              their thresholds.
    'tiers' — every other non-empty set.
    """
    amounts = sorted({int(v) for v in (kept or [])})
    if not amounts:
        return "none"

    quote_lc = (source_quote or "").lower()
    range_worded = any(term in quote_lc for term in _BAND_TERMS)

    smallest = amounts[0]
    largest = amounts[-1]
    spread_ok = (largest - smallest) >= _BAND_MIN_ABS_SPREAD
    # Floor the divisor at 1 so a zero amount cannot divide by zero.
    wide = spread_ok and (largest / max(smallest, 1)) >= _BAND_MIN_RATIO

    is_band = range_worded and wide and len(amounts) >= 2
    return "band" if is_band else "tiers"
@dataclass
class SumInsuredView:
    """Rationalised Sum-Insured view for a single policy field.

    `kind` selects the display form:
      band  → "₹{min} – ₹{max}" (min == sum_insured_min, max == sum_insured_max)
      tiers → list the tiers ("₹25 L / ₹50 L / ₹1 Cr"; ">4 → min … max · N")
      none  → "As per policy schedule"
    """

    # "band" | "tiers" | "none"
    kind: str
    # Corroborated amounts, sorted; empty when kind == "none".
    tiers: list[int]
    # Lowest / highest corroborated amount; rationalise() sets both to None
    # when kind == "none".
    min_inr: int | None
    max_inr: int | None

    @property
    def is_band(self) -> bool:
        """Whether this view is a continuous band."""
        return self.kind == "band"
def rationalise(values, source_quote: str | None) -> SumInsuredView:
    """Corroborate `values` against the quote, classify, and package the result.

    Returns an empty "none" view when classify() reports nothing
    corroborated; otherwise the corroborated list with its first and last
    entries as min/max.
    """
    confirmed = corroborate(values, source_quote)
    verdict = classify(confirmed, source_quote)
    nothing = verdict == "none"
    return SumInsuredView(
        kind="none" if nothing else verdict,
        tiers=[] if nothing else confirmed,
        min_inr=None if nothing else confirmed[0],
        max_inr=None if nothing else confirmed[-1],
    )