Sathvik-kota commited on
Commit
310da4b
·
verified ·
1 Parent(s): 8803a3c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +514 -648
app.py CHANGED
@@ -1,27 +1,60 @@
 
 
 
 
 
1
  import os
2
  import re
3
  import json
4
  from io import BytesIO
5
  from typing import List, Dict, Any, Optional, Tuple
6
 
 
 
 
7
  from PIL import Image
8
- import numpy as np
9
- import cv2
10
  import pytesseract
11
  from pytesseract import Output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# ---------------- Config / Keywords ----------------
# Number token: optional sign, 1-3 leading digits, optional comma/digit run,
# optional decimal part (matches "1,234.50", "-12", "300").
NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")

# Lines that carry a bill total/subtotal rather than a line item.
TOTAL_KEYWORDS = re.compile(
    r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
    re.I,
)

# Footer/boilerplate markers (page numbers, print timestamps, etc.).
FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)

# Individual words that commonly appear in table header rows.
HEADER_KEYWORDS = [
    "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
    "consultation", "qty/hrs", "qty / hrs", "qty /", "qty/",
]
 
25
  HEADER_PHRASES = [
26
  "description qty / hrs consultation rate discount net amt",
27
  "description qty / hrs rate discount net amt",
@@ -31,7 +64,14 @@ HEADER_PHRASES = [
31
  ]
32
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
33
 
34
- # ---------------- Small utilities ----------------
 
 
 
 
 
 
 
35
  def sanitize_ocr_text(s: str) -> str:
36
  if not s:
37
  return ""
@@ -40,220 +80,237 @@ def sanitize_ocr_text(s: str) -> str:
40
  s = s.replace("\r\n", "\n").replace("\r", "\n")
41
  s = re.sub(r"[ \t]+", " ", s)
42
  s = s.strip()
43
- return s[:4000]
 
44
 
45
def normalize_num_str(s: Optional[str]) -> Optional[float]:
    """Parse an OCR numeric token into a float.

    Strips non-numeric garbage, drops thousands separators, and treats
    accountant-style parentheses ("(123)") as a negative value. Returns
    None when no usable number remains.
    """
    if s is None:
        return None
    text = str(s).strip()
    if not text:
        return None
    # Keep only characters that can legitimately be part of a number.
    text = re.sub(r"[^\d\-\+\,\.\(\)]", "", text)
    # "(123.45)" means -123.45 in accounting notation.
    negative = text.startswith("(") and text.endswith(")")
    if negative:
        text = text[1:-1]
    text = text.replace(",", "")
    if text in ("", "-", "+"):
        return None
    try:
        value = float(text)
    except Exception:
        # Last-ditch retry without embedded spaces; sign is not applied here,
        # matching the original fallback path.
        try:
            return float(text.replace(" ", ""))
        except Exception:
            return None
    return -value if negative else value
66
 
67
def is_numeric_token(t: Optional[str]) -> bool:
    """True when the (truthy) token contains at least one NUM_RE match."""
    if not t:
        return False
    return NUM_RE.search(str(t)) is not None
69
 
 
70
def clean_name_text(s: str) -> str:
    """Normalize an OCR'd item/doctor name.

    Collapses whitespace, strips stray punctuation, normalizes SG codes and
    "RR-2", and maps an OCR-misread "OR" token to "DR" only when the tokens
    that follow look like a person's name.
    """
    if not s:
        return s
    # Unify unicode dashes before any regex work.
    s = s.replace("—", "-").replace("–", "-")
    s = re.sub(r"\s+", " ", s)
    s = s.strip(" -:,.")
    # SG code normalization: drop a single padding zero after "SG".
    s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
    s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)

    def _or_to_dr(text: str) -> str:
        # Replace an "OR" token with "DR" only when the next few tokens are
        # purely alphabetic-ish and at least one is longer than 2 chars
        # (e.g. "OR S SALIL KUMAR" -> "DR S SALIL KUMAR").
        words = text.split()
        fixed = []
        idx = 0
        total = len(words)
        while idx < total:
            word = words[idx]
            if word.upper() == "OR" and idx + 1 < total:
                window = words[idx + 1:idx + 5]  # look ahead up to 4 tokens
                alpha_ok = all(re.match(r"^[A-Za-z\-\.\']+$", w) for w in window if w)
                if alpha_ok and any(len(w) > 2 for w in window):
                    fixed.append("DR")
                    idx += 1
                    continue
            fixed.append(word)
            idx += 1
        return " ".join(fixed)

    return _or_to_dr(s).strip()
 
 
107
 
108
- # ---------------- image preprocessing ----------------
109
def pil_to_cv2(img: Image.Image) -> Any:
    """Convert a PIL image to an OpenCV array (BGR; 2-D grayscale passes through)."""
    data = np.array(img)
    # A 2-D array is already single-channel; only RGB needs reordering to BGR.
    if data.ndim != 2:
        return cv2.cvtColor(data, cv2.COLOR_RGB2BGR)
    return data
114
 
115
def preprocess_image(pil_img: Image.Image) -> Any:
    """Upscale, denoise and binarize a page image for OCR.

    Narrow pages are upscaled to ~1500px width so glyphs survive
    thresholding; adaptive Gaussian thresholding is preferred, with an
    Otsu fallback when it fails.
    """
    pil_img = pil_img.convert("RGB")
    w, h = pil_img.size
    target_w = 1500
    if w < target_w:
        # LANCZOS keeps glyph edges sharp when upscaling.
        scale = target_w / float(w)
        pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
    gray = cv2.cvtColor(pil_to_cv2(pil_img), cv2.COLOR_BGR2GRAY)
    try:
        gray = cv2.fastNlMeansDenoising(gray, h=10)
    except Exception:
        # Denoising is best-effort; continue with the raw grayscale.
        pass
    try:
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 41, 15)
    except Exception:
        _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # NOTE(review): a (1, 1) kernel makes this opening a no-op; kept for parity.
    kernel = np.ones((1, 1), np.uint8)
    return cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
137
 
138
# ---------------- OCR TSV helpers ----------------
def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
    """Run Tesseract and return one dict per non-empty word.

    Each cell carries text, confidence and its bounding box, plus the
    center_x/center_y coordinates used later for row/column clustering.
    """
    try:
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
    except Exception:
        # Some builds reject the config string; retry with defaults.
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT)

    cells: List[Dict[str, Any]] = []
    texts = data.get("text", [])
    for i in range(len(texts)):
        raw = texts[i]
        if raw is None:
            continue
        txt = str(raw).strip()
        if not txt:
            continue
        # Tesseract reports -1 (or blanks) for non-word rows.
        try:
            conf = float(data["conf"][i]) if data["conf"][i] not in (None, "", "-1") else -1.0
        except Exception:
            conf = -1.0
        left = int(data.get("left", [0])[i])
        top = int(data.get("top", [0])[i])
        width = int(data.get("width", [0])[i])
        height = int(data.get("height", [0])[i])
        cells.append({
            "text": txt,
            "conf": conf,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "center_y": top + height / 2.0,
            "center_x": left + width / 2.0,
        })
    return cells
174
 
175
# ---------------- grouping into rows ----------------
def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
    """Cluster word cells into visual rows by vertical proximity.

    Cells are scanned in (center_y, center_x) order; a cell joins the
    current row when its center is within ``y_tolerance`` px of the row's
    running mean y. Each emitted row is sorted left-to-right.
    """
    if not cells:
        return []
    ordered = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
    rows: List[List[Dict[str, Any]]] = []
    bucket = [ordered[0]]
    mean_y = ordered[0]["center_y"]
    for cell in ordered[1:]:
        if abs(cell["center_y"] - mean_y) <= y_tolerance:
            bucket.append(cell)
            # Incrementally update the row's mean center_y.
            mean_y = (mean_y * (len(bucket) - 1) + cell["center_y"]) / len(bucket)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            mean_y = cell["center_y"]
    # The trailing bucket always holds at least one cell.
    rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
194
 
195
# ---------------- merge multiline names (doctor merge added) ----------------
def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
    """Merge rows whose item/doctor name was split across two OCR lines.

    Two merge rules, both only applied to rows without numeric tokens:
      1. A "consultation/charge/dr/|" row followed by a short, purely
         alphabetic row (a doctor's name on its own line).
      2. Two consecutive short text-only rows (a name wrapped mid-phrase).
    """
    if not rows:
        return rows
    merged: List[List[Dict[str, Any]]] = []
    i = 0
    while i < len(rows):
        row = rows[i]
        tokens = [c["text"] for c in row]
        joined = " ".join(tokens)
        has_num = any(is_numeric_token(t) for t in tokens)

        # Rule 1: doctor-name continuation line.
        if not has_num and i + 1 < len(rows):
            nxt = rows[i + 1]
            nxt_text = " ".join(c["text"] for c in nxt).strip()
            nxt_tokens = [t for t in re.split(r"\s+", nxt_text) if t]
            nxt_alpha = all(re.match(r"^[A-Za-z\-\.\']+$", t) for t in nxt_tokens if t)
            nxt_has_num = any(is_numeric_token(t) for t in nxt_tokens)
            if nxt_alpha and not nxt_has_num and len(nxt_tokens) <= 6:
                # Current row must hint at a consultation/charge/doctor line.
                if re.search(r"\b(consultation|charge|charges|\|)\b", joined, re.I) or re.search(r"\bdr\b", joined, re.I):
                    merged.append(sorted(row + nxt, key=lambda c: c["left"]))
                    i += 2
                    continue

        # Rule 2: two short pure-text lines (likely a split name).
        if not has_num and i + 1 < len(rows):
            nxt = rows[i + 1]
            nxt_tokens = [c["text"] for c in nxt]
            if not any(is_numeric_token(t) for t in nxt_tokens) and len(tokens) <= 3 and len(nxt_tokens) <= 4:
                merged.append(sorted(row + nxt, key=lambda c: c["left"]))
                i += 2
                continue

        # Default: keep the row as-is.
        merged.append(row)
        i += 1
    return merged
248
 
249
- # ---------------- Strong header detection (PATCH 1) ----------------
250
- def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  if not txt:
252
  return False
 
253
  t = re.sub(r"\s+", " ", txt.strip().lower())
254
 
255
- # universal blocklist patterns
256
- header_patterns = [
257
  r"description.*qty",
258
  r"qty.*rate",
259
  r"rate.*amount",
@@ -262,603 +319,429 @@ def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
262
  r"hrs\s*/\s*qty",
263
  r"qty\s*/\s*hrs",
264
  ]
265
- for p in header_patterns:
266
  if re.search(p, t):
267
  return True
268
 
269
- # blacklisted exact headers
270
  if any(h == t for h in HEADER_PHRASES):
271
  return True
272
 
273
- # generic: if ≥3 header words → header
274
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
275
  if hits >= 3:
276
  return True
277
 
278
- # numeric structure: if line contains ≥3 numbers in tokenized order → header
279
  tokens = re.split(r"[ \|,/]+", t)
280
- numeric_count = sum(1 for tok in tokens if NUM_RE.search(tok))
281
- if numeric_count >= 3:
282
  return True
283
 
284
- # top-of-page slightly looser
285
  if top_of_page and hits >= 2:
286
  return True
287
 
288
  return False
289
- # ---------------- parsing rows into items (Part 2) ----------------
290
-
291
def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
    """Cluster x-centers of numeric tokens into up to ``max_columns`` columns.

    A gap larger than an adaptive threshold (mean gap + 0.6*std, floored at
    30px) starts a new cluster; each column center is the median x of its
    cluster. Returns centers sorted left-to-right.
    """
    xs = sorted(c["center_x"] for c in cells if is_numeric_token(c["text"]))
    if not xs:
        return []
    if len(xs) == 1:
        return [xs[0]]
    gaps = [b - a for a, b in zip(xs, xs[1:])]
    mean_gap = float(np.mean(gaps))
    std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
    gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
    clusters: List[List[float]] = []
    current = [xs[0]]
    for i, gap in enumerate(gaps):
        # Only split while the final cluster count can stay <= max_columns.
        if gap > gap_thresh and len(clusters) < (max_columns - 1):
            clusters.append(current)
            current = [xs[i + 1]]
        else:
            current.append(xs[i + 1])
    clusters.append(current)
    centers = [float(np.median(c)) for c in clusters]
    if len(centers) > max_columns:
        # Keep the rightmost columns (amount-side of the table).
        centers = centers[-max_columns:]
    return sorted(centers)
318
 
319
def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
    """Return the index of the column center nearest ``token_x`` (None if empty)."""
    if not column_centers:
        return None
    offsets = [abs(token_x - center) for center in column_centers]
    return int(np.argmin(offsets))
324
-
325
# Short lab-test codes used to flag lab-like line items so candidate
# rate/qty rules can be tightened for them.
# NOTE(review): the original built this via "hbsaG".lower(); written literally.
LAB_TEST_KEYWORDS = {"ct", "et", "hiv", "hcv", "pt", "rbs", "rft", "ts", "tsh", "hb", "hbsag"}
328
 
329
def looks_like_lab_test(name: str) -> bool:
    """Heuristic: does this item name look like a laboratory test?

    Matches common short test codes as whole words, then falls back to
    generic lab vocabulary ("test", "lab", ...).
    """
    if not name:
        return False
    lowered = name.lower()
    codes = ["ct", "et", "hiv", "hcv", "pt", "rbs", "rft", "tsh", "hbsag", "hb", "pus", "group", "rh"]
    if any(re.search(r"\b" + re.escape(code) + r"\b", lowered) for code in codes):
        return True
    return bool(re.search(r"\b(test|lab|laborat|cmia|cima|cs)\b", lowered))
341
 
342
def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Parse merged OCR rows into bill line items.

    Conservative by design: prefers not to invent rate/qty. Uses the page's
    numeric column centers when available, otherwise falls back to
    right-to-left parsing of each row. Lab-test rows get tighter limits so
    tiny OCR numbers do not explode quantities.
    """
    parsed_items: List[Dict[str, Any]] = []
    rows = merge_multiline_names(rows)
    column_centers = detect_numeric_columns(page_cells, max_columns=4)

    for row in rows:
        tokens = [c["text"] for c in row]
        if not tokens:
            continue
        joined_lower = " ".join(tokens).lower()
        # Skip footer-like lines unless they also carry numbers.
        if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
            continue
        # Skip lines with no numeric tokens (headers / pure text).
        if all(not is_numeric_token(t) for t in tokens):
            continue

        # Unique numeric values in the row, largest first.
        numeric_values = []
        for t in tokens:
            if is_numeric_token(t):
                v = normalize_num_str(t)
                if v is not None:
                    numeric_values.append(float(v))
        numeric_values = sorted({float(x) for x in numeric_values}, reverse=True)

        if column_centers:
            # ---- column-mapped path ----
            left_text_parts = []
            numeric_bucket_map = {i: [] for i in range(len(column_centers))}
            for c in row:
                t = c["text"]
                if is_numeric_token(t):
                    col_idx = assign_token_to_column(c["center_x"], column_centers)
                    if col_idx is None:
                        numeric_bucket_map[len(column_centers) - 1].append(t)
                    else:
                        numeric_bucket_map[col_idx].append(t)
                else:
                    left_text_parts.append(t)
            raw_name = " ".join(left_text_parts).strip()
            name = clean_name_text(raw_name) if raw_name else ""

            num_cols = len(column_centers)

            def get_bucket(idx):
                # Last token assigned to a column wins.
                vals = numeric_bucket_map.get(idx, [])
                return vals[-1] if vals else None

            # Rightmost column = amount, then rate, then qty.
            amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
            rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
            qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None

            # Fallback: treat the last numeric token as the amount.
            if amount is None:
                for t in reversed(tokens):
                    if is_numeric_token(t):
                        amount = normalize_num_str(t)
                        if amount is not None:
                            break

            # Drop tiny tokens that cause qty explosion, except on small bills.
            numeric_candidates = numeric_values.copy()
            if amount is not None:
                numeric_candidates = [v for v in numeric_candidates if (v >= 5 or amount <= 100)]
            else:
                numeric_candidates = [v for v in numeric_candidates if v >= 5]

            lab_like = looks_like_lab_test(name)

            # Conservative rate/qty inference from amount = rate * qty.
            inferred_rate = rate
            inferred_qty = qty
            if amount is not None and numeric_candidates:
                for cand in numeric_candidates:
                    if cand <= 1:
                        continue
                    if cand >= amount:
                        continue
                    ratio = amount / cand if cand else None
                    if ratio is None:
                        continue
                    r = round(ratio)
                    if r < 1 or r > 200:
                        continue
                    # Stricter for lab tests: reject implied qty > 10.
                    if lab_like and r > 10:
                        continue
                    if abs(ratio - r) <= max(0.03 * r, 0.15):
                        inferred_rate = float(cand)
                        inferred_qty = float(r)
                        break

            # If qty known but rate missing, derive rate from the amount.
            if (inferred_rate is None or inferred_rate == 0) and inferred_qty and inferred_qty != 0 and amount is not None:
                try:
                    candidate_rate = amount / inferred_qty
                    if candidate_rate >= 1:
                        inferred_rate = candidate_rate
                except Exception:
                    pass

            # If amount missing/zero but rate & qty known, compute it.
            if (amount is None or amount == 0) and inferred_rate and inferred_qty:
                amount = round(inferred_rate * inferred_qty, 2)

            # Defaults.
            if inferred_qty is None:
                inferred_qty = 1.0
            if inferred_rate is None:
                inferred_rate = 0.0

            # Final sanity coercion.
            try:
                amount = float(round(amount, 2)) if amount is not None else None
            except Exception:
                amount = None
            try:
                inferred_rate = float(round(inferred_rate, 2)) if inferred_rate is not None else 0.0
            except Exception:
                inferred_rate = 0.0
            try:
                inferred_qty = float(inferred_qty)
            except Exception:
                inferred_qty = 1.0

            if amount is None or amount == 0:
                if inferred_rate and inferred_qty:
                    amount = round(inferred_rate * inferred_qty, 2)
            if amount is None or amount == 0:
                # Give up on this row rather than invent an amount.
                continue

            parsed_items.append({
                "item_name": name if name else "UNKNOWN",
                "item_amount": float(round(amount, 2)),
                "item_rate": float(round(inferred_rate, 2)) if inferred_rate else 0.0,
                "item_quantity": float(inferred_qty) if inferred_qty else 1.0,
            })

        else:
            # ---- no clear numeric columns: conservative right-to-left parse ----
            numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
            if not numeric_idxs:
                continue
            last = numeric_idxs[-1]
            amt = normalize_num_str(tokens[last])
            if amt is None:
                continue
            name = " ".join(tokens[:last]).strip()
            if not name:
                continue
            # Unique numeric values on the row, largest first.
            right_nums = []
            for i in numeric_idxs:
                v = normalize_num_str(tokens[i])
                if v is not None:
                    right_nums.append(float(v))
            right_nums = sorted({float(x) for x in right_nums}, reverse=True)

            rate = None
            qty = None

            # First try the second-largest value as the rate.
            if len(right_nums) >= 2:
                cand = right_nums[1]
                if float(cand) > 1 and float(cand) < float(amt):
                    ratio = float(amt) / float(cand) if cand else None
                    if ratio:
                        r = round(ratio)
                        if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
                            rate = float(cand)
                            qty = float(r)

            # Otherwise scan all candidates for a clean divisor.
            if rate is None and right_nums:
                for cand in right_nums:
                    if cand <= 1.0 or cand >= float(amt):
                        continue
                    ratio = float(amt) / float(cand)
                    r = round(ratio)
                    if 1 <= r <= 100 and abs(ratio - r) <= max(0.03 * r, 0.15):
                        rate = float(cand)
                        qty = float(r)
                        break

            if qty is None:
                qty = 1.0
            if rate is None:
                rate = 0.0

            # Lab-test protection: a tiny rate against a large amount is noise.
            if looks_like_lab_test(name):
                if rate < 5 and amt > 100:
                    rate = 0.0
                    qty = 1.0

            if amt == 0 and rate and qty:
                amt = round(rate * qty, 2)

            parsed_items.append({
                "item_name": clean_name_text(name),
                "item_amount": float(round(amt, 2)),
                "item_rate": float(round(rate, 2)),
                "item_quantity": float(qty),
            })

    return parsed_items
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
# ---------------- dedupe & totals ----------------
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop repeated items, keyed on normalized name (first 120 chars) + amount."""
    seen = set()
    unique: List[Dict[str, Any]] = []
    for item in items:
        norm_name = re.sub(r"\s+", " ", (item.get("item_name") or "").lower()).strip()
        key = (norm_name[:120], round(float(item.get("item_amount", 0.0)), 2))
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique
576
 
577
def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
    """Scan row texts bottom-up for subtotal / final-total lines.

    The bottom-most matching line wins for each slot; lines containing
    "sub" fill the subtotal, everything else the final total.
    """
    subtotal: Optional[float] = None
    final: Optional[float] = None
    for line in reversed(rows_texts):
        if not line or not line.strip():
            continue
        if not TOTAL_KEYWORDS.search(line):
            continue
        match = NUM_RE.search(line)
        if not match:
            continue
        value = normalize_num_str(match.group(0))
        if value is None:
            continue
        if re.search(r"sub", line, re.I):
            if subtotal is None:
                subtotal = float(round(value, 2))
        elif final is None:
            final = float(round(value, 2))
    return {"subtotal": subtotal, "final_total": final}
593
-
594
# ---------------- Gemini refinement (improved prompt per PATCH 7) ----------------
def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
    """Attempt deterministic Gemini refinement of parsed bill items.

    Returns ``(items, token_usage)``. If Gemini is not configured/available,
    or any step fails, the input items are returned unchanged with zero
    token usage.
    """
    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    if not GEMINI_API_KEY or genai is None:
        return page_items, zero_usage
    try:
        safe_text = sanitize_ocr_text(page_text)
        system_prompt = (
            "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
            "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
            "Do NOT include subtotal or total lines as items. Do NOT invent items; only clean/fix/normalize the given items. "
            "Prefer exact names from the bill. If names are broken across lines, merge them. Do not rename items unless it's obvious OCR noise."
        )
        user_prompt = f"""
Extract ONLY line items from this hospital bill.

### RULES (MUST FOLLOW)
- Do NOT invent items.
- Do NOT return section headers (Room Charges, Lab Services, Radiology).
- Merge broken multi-line names.
- Reconstruct missing rate/qty using amt=rate*qty if visible in text.
- Prefer exact names as shown in bill.
- If a doctor name appears across lines, merge to full name.
- Ignore totals / subtotals.
- Ignore page numbers.
- Avoid changing 'OR' unless it is clearly a doctor prefix.
- Ignore final bill summaries.

### OCR TEXT:
{safe_text}

### INITIAL ITEMS:
{json.dumps(page_items, ensure_ascii=False, indent=2)}

Return ONLY a JSON array of cleaned items, e.g.:
[
  {{ "item_name": "Consultation Charge | DR PREETHI MARY JOSEPH", "item_amount": 300.0, "item_rate": 300.0, "item_quantity": 1.0 }},
  ...
]
"""
        # BUG FIX: generate_content() takes no temperature/max_output_tokens
        # kwargs, and "system" is not a valid content role — the old call
        # always raised, so refinement was silently disabled. The system
        # prompt belongs on the model and sampling params in generation_config.
        model = genai.GenerativeModel(GEMINI_MODEL_NAME, system_instruction=system_prompt)
        response = model.generate_content(
            user_prompt,
            generation_config={"temperature": 0.0, "max_output_tokens": 1000},
        )
        raw = response.text.strip()
        # Strip an optional ```json ... ``` fence around the reply.
        if raw.startswith("```"):
            raw = re.sub(r"^```[a-zA-Z]*", "", raw)
            raw = re.sub(r"```$", "", raw).strip()
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            cleaned = []
            for obj in parsed:
                try:
                    cleaned.append({
                        "item_name": str(obj.get("item_name", "")).strip(),
                        "item_amount": float(obj.get("item_amount", 0.0)),
                        "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
                        "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
                    })
                except Exception:
                    # Skip malformed entries rather than failing the page.
                    continue
            # Token usage not reliably available here; return zeros.
            return cleaned, zero_usage
        return page_items, zero_usage
    except Exception:
        # Any failure falls back to the locally parsed items.
        return page_items, zero_usage
668
-
669
# ---------------- Post-validation engine (PATCH 5) ----------------
def post_validate_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Rule engine to fix common LLM/OCR inference errors in items (mutates in place).

    - amount == 0 with rate & qty present  -> amount = rate * qty
    - rate == 0 with amount & qty present  -> rate = amount / qty
    - amount far from rate*qty             -> recompute qty (or reset to 1)
    - lab tests                            -> clamp qty to <= 10
    """
    validated = []
    for it in items:
        name = it.get("item_name", "") or ""
        amt = float(it.get("item_amount", 0.0) or 0.0)
        rate = float(it.get("item_rate", 0.0) or 0.0)
        qty = float(it.get("item_quantity", 1.0) or 1.0)

        lab_like = looks_like_lab_test(name)

        # Amount missing but rate & qty known -> derive it.
        if (amt == 0 or amt is None) and rate > 0 and qty > 0:
            amt = round(rate * qty, 2)

        # Rate missing but amount & qty known -> derive it.
        if (rate == 0 or rate is None) and qty and qty != 0:
            try:
                candidate_rate = amt / qty
                if candidate_rate > 0:
                    rate = round(candidate_rate, 2)
            except Exception:
                pass

        # Quantity looks wrong when amount is far from rate*qty.
        if rate > 0:
            ideal = rate * qty
            if abs(ideal - amt) > max(2.0, 0.1 * ideal):
                try:
                    q = amt / rate if rate else qty
                    if 1 <= round(q) <= (10 if lab_like else 100):
                        qty = float(round(q))
                    else:
                        qty = 1.0  # implausible -> fall back to a single unit
                except Exception:
                    qty = 1.0

        # Lab-test quantities above 10 are almost certainly OCR noise.
        if lab_like and qty > 10:
            qty = 1.0

        # Reconcile amount with the adjusted rate/qty.
        if rate > 0:
            recomputed = round(rate * qty, 2)
            if abs(recomputed - amt) <= max(2.0, 0.05 * recomputed):
                # Close enough: trust the recomputed value.
                amt = recomputed
            else:
                # Far off: keep the stated amount, reset qty, re-derive rate.
                if abs(amt - recomputed) / max(1.0, recomputed) > 0.5:
                    qty = 1.0
                    rate = round(amt / qty, 2) if qty else rate

        it["item_amount"] = round(float(amt or 0.0), 2)
        it["item_rate"] = round(float(rate or 0.0), 2)
        it["item_quantity"] = float(qty or 1.0)
        validated.append(it)
    return validated
736
- # ---------------- main endpoint ----------------
737
  @app.post("/extract-bill-data")
738
  async def extract_bill_data(payload: BillRequest):
739
- doc_url = payload.document
740
 
741
- # ---------- download ----------
 
 
742
  try:
743
- headers = {"User-Agent": "Mozilla/5.0"}
744
- resp = requests.get(doc_url, headers=headers, timeout=30)
745
- if resp.status_code != 200:
746
- raise RuntimeError(f"download failed status={resp.status_code}")
747
- file_bytes = resp.content
748
- except Exception:
749
  return {
750
  "is_success": False,
751
- "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
752
- "data": {
753
- "pagewise_line_items": [],
754
- "total_item_count": 0,
755
- "final_total": 0.0
756
- }
757
  }
758
 
759
- # ---------- convert to images ----------
760
- images = []
761
- clean_url = doc_url.split("?", 1)[0].lower()
762
  try:
763
- if clean_url.endswith(".pdf"):
764
- images = convert_from_bytes(file_bytes)
765
- elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
766
- images = [Image.open(BytesIO(file_bytes))]
767
  else:
768
- try:
769
- images = convert_from_bytes(file_bytes)
770
- except:
771
- images = []
772
- except Exception:
773
- images = []
774
 
775
  pagewise = []
776
- cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
777
 
778
- # ---------- per page ----------
779
- for idx, page_img in enumerate(images, start=1):
780
- try:
781
- proc = preprocess_image(page_img)
782
 
783
- # TSV
 
784
  cells = image_to_tsv_cells(proc)
785
- rows = group_cells_into_rows(cells, y_tolerance=12)
786
 
787
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
788
-
789
- # ---------------- HEADER PREFILTER ----------------
790
- rows_filtered = []
791
- for i, (r, rt) in enumerate(zip(rows, rows_texts)):
792
- top_flag = (i < 6)
793
- rt_norm = sanitize_ocr_text(rt).lower()
794
-
795
- # strong header detector (from patched Part 1)
796
- if looks_like_header_text(rt_norm, top_of_page=top_flag):
797
- continue
798
 
799
- # legacy blacklist
800
- if any(h in rt_norm for h in HEADER_PHRASES):
 
 
801
  continue
 
802
 
803
- rows_filtered.append(r)
 
 
804
 
805
- rows = rows_filtered
806
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
807
- page_text = sanitize_ocr_text(" ".join(rows_texts))
808
-
809
- # detect headers at top of page
810
  top_headers = []
811
- for i, rt in enumerate(rows_texts[:6]):
812
- if looks_like_header_text(rt.lower(), top_of_page=(i < 4)):
813
- top_headers.append(rt.strip().lower())
814
 
815
- # ---------------- PARSE ITEMS ----------------
816
  parsed_items = parse_rows_with_columns(rows, cells)
817
 
818
- # ---------------- GEMINI REFINEMENT ----------------
819
- refined_items, token_u = refine_with_gemini(parsed_items, page_text)
820
- for k in cumulative_token_usage:
821
- cumulative_token_usage[k] += token_u.get(k, 0)
822
-
823
- # ---------------- CONTEXT-AWARE SECTION FILTER ----------------
824
- other_item_names = [it.get("item_name", "") for it in refined_items]
825
 
826
- cleaned = []
827
- for p in refined_items:
828
- if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names):
829
- cleaned.append(p)
830
 
831
- cleaned = dedupe_items(cleaned)
832
 
833
- # drop any leftover header noise
834
- cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
 
 
835
 
836
- # ---------------- RULE ENGINE POST-VALIDATION ----------------
837
  cleaned = post_validate_items(cleaned)
838
 
839
- # ---------------- PAGE TYPE ----------------
 
840
  page_type = "Bill Detail"
841
- page_txt = page_text.lower()
842
- if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
843
  page_type = "Pharmacy"
844
- if "final bill" in page_txt or "grand total" in page_txt:
845
  page_type = "Final Bill"
846
 
847
- # ---------------- PER-PAGE SUBTOTAL/TOTAL ----------------
848
- detected = detect_subtotals_and_totals(rows_texts)
849
- page_subtotal = detected.get("subtotal")
850
- page_final = detected.get("final_total")
851
-
852
- # ---------------- STORE PAGE ----------------
853
  pagewise.append({
854
  "page_no": str(idx),
855
  "page_type": page_type,
856
  "bill_items": cleaned,
857
- "subtotal": page_subtotal,
858
- "final_page_total": page_final
859
  })
860
 
861
- except Exception:
862
  pagewise.append({
863
  "page_no": str(idx),
864
  "page_type": "Bill Detail",
@@ -866,66 +749,49 @@ async def extract_bill_data(payload: BillRequest):
866
  "subtotal": None,
867
  "final_page_total": None
868
  })
869
- continue
870
-
871
- # ---------------- GLOBAL FINAL TOTAL ----------------
872
- total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
873
 
874
- # Sum items across all pages (no double counting)
875
- grand_total = 0.0
876
  for p in pagewise:
877
- for it in p.get("bill_items", []):
878
- try:
879
- grand_total += float(it.get("item_amount", 0.0) or 0.0)
880
- except:
881
- pass
882
 
883
- if not GEMINI_API_KEY or genai is None:
884
- cumulative_token_usage["warning_no_gemini"] = 1
885
 
886
  return {
887
  "is_success": True,
888
- "token_usage": cumulative_token_usage,
889
  "data": {
890
  "pagewise_line_items": pagewise,
891
  "total_item_count": total_item_count,
892
- "final_total": round(grand_total, 2)
893
  }
894
  }
895
 
896
 
897
- # ---------------- debug TSV ----------------
 
 
 
898
  @app.post("/debug-tsv")
899
  async def debug_tsv(payload: BillRequest):
900
- doc_url = payload.document
901
  try:
902
- resp = requests.get(doc_url, timeout=20)
903
- if resp.status_code != 200:
904
- return {"error": "Download failed"}
905
- file_bytes = resp.content
906
- except Exception:
907
- return {"error": "Download failed"}
908
-
909
- clean_url = doc_url.split("?", 1)[0].lower()
910
- if clean_url.endswith(".pdf"):
911
- imgs = convert_from_bytes(file_bytes)
912
- img = imgs[0]
913
- else:
914
- img = Image.open(BytesIO(file_bytes))
915
-
916
- proc = preprocess_image(img)
917
- cells = image_to_tsv_cells(proc)
918
- return {"cells": cells}
919
-
920
-
921
- # ---------------- health check ----------------
922
  @app.get("/")
923
- def health_check():
924
- msg = "Bill extraction API (patched v3) live."
925
- if not GEMINI_API_KEY or genai is None:
926
- msg += " (No Gemini → LLM refinement disabled)"
927
- return {
928
- "status": "ok",
929
- "message": msg,
930
- "hint": "POST /extract-bill-data with {'document':'<url>'}"
931
- }
 
1
+ ###############################################
2
+ # Bajaj Datathon - FINAL PATCHED BILL EXTRACTOR
3
+ # High Accuracy | Robust OCR | Gemini Refinement
4
+ ###############################################
5
+
6
  import os
7
  import re
8
  import json
9
  from io import BytesIO
10
  from typing import List, Dict, Any, Optional, Tuple
11
 
12
+ from fastapi import FastAPI
13
+ from pydantic import BaseModel
14
+ import requests
15
  from PIL import Image
16
+ from pdf2image import convert_from_bytes
 
17
  import pytesseract
18
  from pytesseract import Output
19
+ import numpy as np
20
+ import cv2
21
+
22
+ # Optional Gemini SDK
23
+ try:
24
+ import google.generativeai as genai
25
+ except:
26
+ genai = None
27
+
28
+ # ---------------- LLM CONFIG ----------------
29
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
+ GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
31
+
32
+ if GEMINI_API_KEY and genai is not None:
33
+ try:
34
+ genai.configure(api_key=GEMINI_API_KEY)
35
+ except:
36
+ pass
37
+
38
+
39
+ # ---------------- FASTAPI APP ----------------
40
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (patched v3)")
41
+
42
class BillRequest(BaseModel):
    """Request body shared by /extract-bill-data and /debug-tsv."""
    # URL of the bill document (PDF or image) to download and parse.
    document: str
44
+
45
+
46
+ ###############################################
47
+ # COMMON REGEX AND UTILITY FUNCTIONS
48
+ ###############################################
49
 
 
50
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
 
 
 
 
 
51
 
52
  HEADER_KEYWORDS = [
53
+ "description", "qty", "hrs", "rate",
54
+ "discount", "net", "amt", "amount",
55
+ "qty/hrs", "qty / hrs"
56
  ]
57
+
58
  HEADER_PHRASES = [
59
  "description qty / hrs consultation rate discount net amt",
60
  "description qty / hrs rate discount net amt",
 
64
  ]
65
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
66
 
67
+ TOTAL_KEYWORDS = re.compile(
68
+ r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
69
+ re.I,
70
+ )
71
+
72
+ FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
73
+
74
+
75
  def sanitize_ocr_text(s: str) -> str:
76
  if not s:
77
  return ""
 
80
  s = s.replace("\r\n", "\n").replace("\r", "\n")
81
  s = re.sub(r"[ \t]+", " ", s)
82
  s = s.strip()
83
+ return s[:5000]
84
+
85
 
86
def normalize_num_str(s: Optional[str]) -> Optional[float]:
    """Parse an OCR'd amount token into a float.

    Strips currency symbols and other junk characters, removes thousands
    commas, and treats an accounting-style "(123)" as negative.

    Args:
        s: Raw token text (may be None or empty).

    Returns:
        The parsed value, or None when nothing numeric remains.
    """
    if s is None:
        return None
    s = str(s).strip()
    # Keep only characters that can appear in a number; drops currency
    # symbols, stray letters, OCR noise, etc.
    s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
    if s == "":
        return None
    negative = False
    # Accounting convention: parentheses mean a negative amount.
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1]
    s = s.replace(",", "")  # thousands separators
    try:
        v = float(s)
    except ValueError:
        # Leftover punctuation like "--" or "." is not a valid number.
        # (Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.)
        return None
    return -v if negative else v
103
+
 
104
 
105
def is_numeric_token(t: Optional[str]) -> bool:
    """True when the token contains something that looks like a number."""
    if not t:
        return False
    return NUM_RE.search(str(t)) is not None
107
 
108
+
109
def clean_name_text(s: str) -> str:
    """Normalise an OCR'd item / doctor name for output.

    Normalises dashes, collapses whitespace, trims stray punctuation and
    fixes the common OCR confusion of "OR" for "DR " before a
    capitalised surname.
    """
    # OCR often renders dashes as en/em dashes; normalise them to ASCII "-".
    # (The original source had a mojibake-dropped character here, leaving
    # `s.replace("", "-")`, which inserts "-" between EVERY character.)
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    s = re.sub(r"\s+", " ", s)
    s = s.strip(" -:,.")
    # Fix doctor prefix only if followed by a capitalised name.
    s = re.sub(r"\bOR (?=[A-Z][a-z])", "DR ", s)
    return s.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
 
117
 
118
+ ###############################################
119
+ # IMAGE PREPROCESSING
120
+ ###############################################
121
 
122
def pil_to_cv2(img: Image.Image):
    """Convert a PIL image to an OpenCV array (BGR; 2-D if grayscale)."""
    data = np.array(img)
    # Single-channel arrays are already in a form cv2 accepts.
    if data.ndim == 2:
        return data
    return cv2.cvtColor(data, cv2.COLOR_RGB2BGR)
127
 
128
+
129
def preprocess_image(pil_img: Image.Image):
    """Upscale, denoise and binarise a page image for OCR.

    Returns a binary 2-D numpy array suitable for pytesseract.
    NOTE(review): the tuned constants (1500px minimum width, 41/15
    adaptive-threshold window) come from the original author — confirm
    before changing.
    """
    pil_img = pil_img.convert("RGB")
    w, h = pil_img.size

    # Upscale narrow scans so Tesseract sees enough pixels per glyph.
    if w < 1500:
        scale = 1500 / float(w)
        pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)

    img = pil_to_cv2(pil_img)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Non-local means denoising before thresholding.
    gray = cv2.fastNlMeansDenoising(gray, h=10)

    try:
        bw = cv2.adaptiveThreshold(gray, 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 41, 15)
    except:
        # Fallback: global Otsu threshold if the adaptive call fails.
        _, bw = cv2.threshold(gray, 127, 255,
                              cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 1x1 opening kernel — effectively a no-op; kept from the original.
    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
    return bw
152
 
153
+
154
+ ###############################################
155
+ # OCR TSV EXTRACTION
156
+ ###############################################
157
+
158
def image_to_tsv_cells(cv_img):
    """Run Tesseract TSV OCR and return one dict per non-empty word.

    Each cell carries the text, confidence and bounding box, plus the
    box centre coordinates used later for row/column clustering.
    """
    try:
        ocr = pytesseract.image_to_data(
            cv_img,
            output_type=Output.DICT,
            # --psm 6: assume a single uniform block of text.
            config="--psm 6"
        )
    except:
        # Fall back to Tesseract's default page segmentation.
        ocr = pytesseract.image_to_data(cv_img, output_type=Output.DICT)

    cells = []
    n = len(ocr.get("text", []))

    for i in range(n):
        t = (ocr["text"][i] or "").strip()
        if not t:
            continue
        try:
            conf = float(ocr["conf"][i])
        except:
            # Tesseract emits "-1" / non-numeric conf for structural rows.
            conf = -1.0

        left = int(ocr.get("left", [0])[i])
        top = int(ocr.get("top", [0])[i])
        width = int(ocr.get("width", [0])[i])
        height = int(ocr.get("height", [0])[i])

        cells.append({
            "text": t,
            "conf": conf,
            "left": left,
            "top": top,
            "width": width,
            "height": height,
            "center_x": left + width / 2,
            "center_y": top + height / 2,
        })
    return cells
196
 
197
+
198
+ ###############################################
199
+ # GROUPING INTO TEXT LINES
200
+ ###############################################
201
+
202
def group_cells_into_rows(cells, y_tol=12):
    """Cluster OCR word cells into visual text lines.

    Cells whose vertical centres lie within ``y_tol`` pixels of the
    running row centre join the current row; each returned row is
    sorted left-to-right by its cells' ``left`` coordinate.
    """
    if not cells:
        return []

    ordered = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))

    rows = []
    bucket = [ordered[0]]
    row_y = ordered[0]["center_y"]

    for cell in ordered[1:]:
        if abs(cell["center_y"] - row_y) <= y_tol:
            bucket.append(cell)
            # Running average of the row's vertical centre.
            row_y = (row_y * (len(bucket) - 1) + cell["center_y"]) / len(bucket)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            row_y = cell["center_y"]

    rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
224
 
225
+
226
+ ###############################################
227
+ # DOCTOR-NAME MERGING (PATCH)
228
+ ###############################################
229
+
230
def merge_multiline_names(rows):
    """Join a text-only "DR ..." row with the row below it.

    Doctor names often wrap onto a second OCR line before the numeric
    columns start. When a row contains no numeric tokens, mentions
    "dr", and the following row is also number-free, the two rows are
    merged into one and re-sorted left-to-right.
    """
    if not rows:
        return rows

    merged = []
    i = 0
    while i < len(rows):
        row = rows[i]
        tokens = [c["text"] for c in row]
        joined = " ".join(tokens)

        has_num = any(is_numeric_token(t) for t in tokens)

        # --- Doctor Name Merge Fix ---
        if (not has_num and
                re.search(r"\bdr\b", joined.lower()) and
                i + 1 < len(rows)):

            next_tokens = " ".join([c["text"] for c in rows[i + 1]])
            if not any(is_numeric_token(x) for x in next_tokens.split()):
                merged_row = row + rows[i + 1]
                merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                i += 2  # consumed two rows
                continue

        merged.append(row)
        i += 1

    return merged
259
 
260
+
261
+ ###############################################
262
+ # DETECT NUMERIC COLUMNS
263
+ ###############################################
264
+
265
def detect_numeric_columns(cells, max_cols=4):
    """Estimate the x-centres of the numeric columns on the page.

    Collects x positions of all numeric tokens, splits the sorted list
    at unusually large gaps (simple 1-D clustering), and returns up to
    ``max_cols`` cluster medians sorted left-to-right.
    """
    xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
    if not xs:
        return []

    xs = sorted(xs)
    if len(xs) == 1:
        return [xs[0]]

    gaps = [xs[i + 1] - xs[i] for i in range(len(xs) - 1)]
    mean_gap = float(np.mean(gaps))
    std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
    # A gap marks a column boundary when it clearly exceeds the typical gap.
    thresh = max(30.0, mean_gap + 0.6 * std_gap)

    clusters = []
    curr = [xs[0]]

    for i, g in enumerate(gaps):
        # Cap the number of splits so at most max_cols clusters form.
        if g > thresh and len(clusters) < (max_cols - 1):
            clusters.append(curr)
            curr = [xs[i + 1]]
        else:
            curr.append(xs[i + 1])

    clusters.append(curr)

    centers = [float(np.median(c)) for c in clusters]
    centers = centers[-max_cols:]  # keep the right-most columns
    return sorted(centers)
294
+
295
+
296
def assign_token_to_column(x, centers):
    """Index of the column centre nearest to x, or None if no centres."""
    if not centers:
        return None
    return min(range(len(centers)), key=lambda i: abs(x - centers[i]))
301
+
302
+
303
+ ###############################################
304
+ # STRONG HEADER DETECTION (PATCHED)
305
+ ###############################################
306
+
307
+ def looks_like_header_text(txt: str, top_of_page=False):
308
  if not txt:
309
  return False
310
+
311
  t = re.sub(r"\s+", " ", txt.strip().lower())
312
 
313
+ patterns = [
 
314
  r"description.*qty",
315
  r"qty.*rate",
316
  r"rate.*amount",
 
319
  r"hrs\s*/\s*qty",
320
  r"qty\s*/\s*hrs",
321
  ]
322
+ for p in patterns:
323
  if re.search(p, t):
324
  return True
325
 
 
326
  if any(h == t for h in HEADER_PHRASES):
327
  return True
328
 
 
329
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
330
  if hits >= 3:
331
  return True
332
 
 
333
  tokens = re.split(r"[ \|,/]+", t)
334
+ num = sum(1 for tok in tokens if NUM_RE.search(tok))
335
+ if num >= 3:
336
  return True
337
 
 
338
  if top_of_page and hits >= 2:
339
  return True
340
 
341
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
 
 
 
 
 
 
 
 
 
343
 
344
+ ###############################################
345
+ # PARSE ROWS INTO ITEMS
346
+ ###############################################
 
 
 
 
 
 
 
 
 
347
 
348
def parse_rows_with_columns(rows, cells):
    """Turn OCR rows into bill line-item dicts.

    Uses detected numeric column centres to split each row into a name
    (left text) and amount / rate / qty (right columns). Falls back to
    positional parsing when no columns were detected. Returns dicts with
    keys item_name, item_amount, item_rate, item_quantity.
    """
    rows = merge_multiline_names(rows)
    col_centers = detect_numeric_columns(cells)

    parsed = []

    for row in rows:
        texts = [c["text"] for c in row]
        joined = " ".join(texts).lower()

        # Skip pure footer rows and rows with no numbers at all.
        if FOOTER_KEYWORDS.search(joined) and not any(is_numeric_token(t) for t in texts):
            continue
        if all(not is_numeric_token(t) for t in texts):
            continue

        numeric_values = []
        for t in texts:
            if is_numeric_token(t):
                v = normalize_num_str(t)
                if v is not None:
                    numeric_values.append(float(v))

        # De-duplicate & sort largest first
        numeric_values = sorted(list({float(v) for v in numeric_values}), reverse=True)

        # Drop tiny noise (values < 5) unless it's the row's only number.
        numeric_values = [v for v in numeric_values if v >= 5 or (v < 5 and len(numeric_values) == 1)]

        if col_centers:
            left_text = []
            bucket = {i: [] for i in range(len(col_centers))}

            # Route numeric tokens to their nearest column; everything
            # else contributes to the item name.
            for c in row:
                t = c["text"]
                x = c["center_x"]
                if is_numeric_token(t):
                    idx = assign_token_to_column(x, col_centers)
                    if idx is not None:
                        bucket[idx].append(t)
                else:
                    left_text.append(t)

            name_raw = " ".join(left_text).strip()
            name = clean_name_text(name_raw)

            N = len(col_centers)

            def pick(k):
                # Last token assigned to column k (right-most wins).
                vals = bucket.get(k, [])
                return vals[-1] if vals else None

            # Column convention: right-most = amount, then rate, then qty.
            amount = normalize_num_str(pick(N - 1)) if N >= 1 else None
            rate = normalize_num_str(pick(N - 2)) if N >= 2 else None
            qty = normalize_num_str(pick(N - 3)) if N >= 3 else None

            # fallback amount: right-most numeric token in the row
            if amount is None:
                for t in reversed(texts):
                    if is_numeric_token(t):
                        amount = normalize_num_str(t)
                        if amount is not None:
                            break

            # strong qty/rate inference: qty = round(amount / rate)
            if amount is not None and rate is not None:
                ratio = amount / rate if rate else None
                if ratio and 1 <= round(ratio) <= 10:
                    qty = float(round(ratio))

            if qty is None:
                qty = 1.0

            if amount == 0 and rate and qty:
                amount = rate * qty

            # amount may still be None here; TypeError -> skip the row
            try: amount = float(round(amount, 2))
            except: continue

            try: rate = float(round(rate or 0.0, 2))
            except: rate = 0.0

            try: qty = float(qty)
            except: qty = 1.0

            parsed.append({
                "item_name": name if name else "UNKNOWN",
                "item_amount": amount,
                "item_rate": rate,
                "item_quantity": qty
            })

        else:
            # No column layout: last numeric token is the amount, text
            # before it is the name.
            idxs = [i for i, t in enumerate(texts) if is_numeric_token(t)]
            if not idxs:
                continue

            amt = normalize_num_str(texts[idxs[-1]])
            if amt is None:
                continue

            name = " ".join(texts[: idxs[-1]]).strip()
            if not name:
                continue

            rate = 0.0
            qty = 1.0

            possible = []
            for i in idxs:
                v = normalize_num_str(texts[i])
                if v is not None:
                    possible.append(float(v))

            possible = sorted(list({v for v in possible}), reverse=True)

            # Infer (rate, qty) from the first value that divides the
            # amount into a small integer multiple.
            for p in possible:
                if p <= 1 or p >= amt:
                    continue
                ratio = amt / p
                r = round(ratio)
                if 1 <= r <= 10:
                    rate = p
                    qty = r
                    break

            parsed.append({
                "item_name": clean_name_text(name),
                "item_amount": float(round(amt, 2)),
                "item_rate": float(round(rate, 2)),
                "item_quantity": float(qty)
            })

    return parsed
481
+
482
+
483
+ ###############################################
484
+ # FINAL ITEM FILTER
485
+ ###############################################
486
+
487
def final_item_filter(item, headers, all_names):
    """Decide whether a parsed item is a genuine billable line.

    Rejects empty names, rows echoing a detected page header, footer
    noise, non-positive amounts, short section-heading-style names when
    concrete sub-items exist elsewhere on the page, and rows whose rate
    is implausibly larger than the amount.
    """
    name = item["item_name"].strip()
    ln = name.lower()

    if not name:
        return False

    # Anything containing a detected column-header phrase is not an item.
    if any(h in ln for h in headers):
        return False

    if FOOTER_KEYWORDS.search(ln):
        return False

    if item["item_amount"] <= 0:
        return False

    # Short "section heading" names (e.g. "Room Charges") are dropped
    # when the page also names concrete sub-items.
    is_short = len(ln.split()) <= 3
    if is_short and any(k in ln for k in ["charges", "services", "room", "radiology", "surgery"]):
        context = " ".join(all_names).lower()
        if any(z in context for z in [
            "rent", "ward", "nursing", "surgeon", "anaes", "ot", "procedure"
        ]):
            return False

    # OCR glitch guard: rate wildly larger than the amount.
    rate = item["item_rate"]
    amt = item["item_amount"]
    if rate and rate > amt * 10 and amt < 10000:
        return False

    return True
520
+
521
+
522
+ ###############################################
523
+ # POST VALIDATION (PATCH)
524
+ ###############################################
525
+
526
def post_validate_items(items):
    """Rule-engine pass reconciling amount = rate * quantity per item.

    Each item dict is updated in place (amount reconstructed from
    rate*qty when zero; quantity re-derived when rate*qty disagrees
    with the amount by more than 15%) and the same dicts are returned
    in a new list.
    """
    validated = []
    for entry in items:
        amount = entry["item_amount"]
        rate = entry["item_rate"]
        quantity = entry["item_quantity"]

        # Zero amount but a known rate: reconstruct the amount.
        if amount == 0 and rate > 0:
            amount = rate * quantity

        # If rate*qty is far from the amount, re-derive the quantity.
        if rate > 0:
            expected = rate * quantity
            if abs(expected - amount) > max(2, 0.15 * expected):
                inferred = amount / rate
                if 1 <= round(inferred) <= 10:
                    quantity = round(inferred)

        entry["item_amount"] = round(amount, 2)
        entry["item_rate"] = round(rate, 2)
        entry["item_quantity"] = float(quantity)

        validated.append(entry)
    return validated
549
 
550
+
551
+ ###############################################
552
+ # SUBTOTAL / FINAL TOTAL DETECTION
553
+ ###############################################
554
+
555
def detect_subtotals_and_totals(rows):
    """Scan row texts bottom-up for subtotal / final-total lines.

    For each slot the lowest matching line on the page wins: lines
    whose text contains "sub" fill the subtotal, all other
    total-keyword lines fill the final total.
    """
    subtotal = None
    final_total = None

    for line in reversed(rows):
        if not line.strip():
            continue
        if not TOTAL_KEYWORDS.search(line):
            continue
        match = NUM_RE.search(line)
        if not match:
            continue
        value = normalize_num_str(match.group(0))
        if value is None:
            continue

        if "sub" in line.lower():
            if subtotal is None:
                subtotal = round(value, 2)
        elif final_total is None:
            final_total = round(value, 2)

    return {"subtotal": subtotal, "final_total": final_total}
578
+
579
+
580
+ ###############################################
581
+ # GEMINI REFINER (PATCHED PROMPT)
582
+ ###############################################
583
+
584
def refine_with_gemini(items, page_text=""):
    """Optionally ask Gemini to clean the parsed line items.

    Returns (items, token_usage). This is a no-op when no API key or
    SDK is available, and falls back to the input items on any failure.
    NOTE(review): token_usage is always zeros — actual usage is never
    read from the response; confirm before relying on it.
    """
    if not GEMINI_API_KEY or genai is None:
        return items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    safe = sanitize_ocr_text(page_text)

    system_prompt = (
        "You are a strict hospital bill item cleaner.\n"
        "Return ONLY a JSON array of cleaned line items.\n"
        "Do NOT include section headers, totals, subtotals, page numbers.\n"
        "Do NOT invent items.\n"
    )

    user_prompt = f"""
Extract ONLY valid line items from the bill.

RULES YOU MUST FOLLOW:
- Do NOT create new items.
- Do NOT output section headers (Room Charges, Lab Services, Radiology).
- Merge broken names (doctor names on multiple lines).
- Use exact item names from OCR text.
- Recompute rate/qty if amount = rate×qty is clear.
- Ignore totals or summary lines.
- Ignore page numbers.
- Always output: item_name, item_amount, item_rate, item_quantity.

OCR TEXT:
{safe}

INITIAL ITEMS:
{json.dumps(items, ensure_ascii=False)}

Return ONLY a JSON array:
[
{{"item_name":"...","item_amount":float,"item_rate":float,"item_quantity":float}}
]
"""

    try:
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        # NOTE(review): passing temperature / max_output_tokens directly
        # (rather than via generation_config) may raise on some SDK
        # versions; the except below then returns the unrefined items.
        resp = model.generate_content(
            [
                {"role": "system", "parts": [system_prompt]},
                {"role": "user", "parts": [user_prompt]},
            ],
            temperature=0.0,
            max_output_tokens=1200,
        )

        raw = resp.text.strip()
        # Strip the markdown code fences the model often adds.
        raw = raw.replace("```json", "").replace("```", "").strip()

        parsed = json.loads(raw)

        cleaned = []
        for obj in parsed:
            cleaned.append({
                "item_name": str(obj.get("item_name", "")).strip(),
                "item_amount": float(obj.get("item_amount", 0.0)),
                "item_rate": float(obj.get("item_rate", 0.0)),
                "item_quantity": float(obj.get("item_quantity", 1.0)),
            })

        return cleaned, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    except:
        return items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
650
+
651
+
652
+ ###############################################
653
+ # MAIN EXTRACTION ENDPOINT
654
+ ###############################################
655
+
656
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """Main endpoint: download a bill (PDF or image), OCR each page and
    return structured line items, per-page totals and a grand total.
    """
    url = payload.document

    # download
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla"}, timeout=30)
        if r.status_code != 200:
            raise RuntimeError("Download failed")
        data = r.content
    except:
        return {
            "is_success": False,
            "token_usage": {},
            "data": {"pagewise_line_items": [], "total_item_count": 0}
        }

    # load image(s): rasterise PDFs, open everything else directly
    try:
        if url.lower().split("?")[0].endswith(".pdf"):
            imgs = convert_from_bytes(data)
        else:
            imgs = [Image.open(BytesIO(data))]
    except:
        imgs = []  # unparseable document -> empty result below

    pagewise = []
    total_tokens = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    for idx, img in enumerate(imgs, 1):

        try:
            # OCR pipeline: binarise -> word cells -> visual rows
            proc = preprocess_image(img)
            cells = image_to_tsv_cells(proc)
            rows = group_cells_into_rows(cells)

            row_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]

            # remove headers (more lenient within the first five rows)
            filtered = []
            for i, (r, t) in enumerate(zip(rows, row_texts)):
                if looks_like_header_text(t, top_of_page=(i < 5)):
                    continue
                filtered.append(r)

            rows = filtered
            row_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
            page_text = " ".join(row_texts)

            # header phrases still near the top feed the item filter
            top_headers = []
            for t in row_texts[:5]:
                if looks_like_header_text(t, top_of_page=True):
                    top_headers.append(t.lower())

            parsed_items = parse_rows_with_columns(rows, cells)

            # optional LLM clean-up (no-op without a Gemini key)
            refined, usage = refine_with_gemini(parsed_items, page_text)

            for k in total_tokens:
                total_tokens[k] += usage.get(k, 0)

            all_names = [x["item_name"] for x in refined]

            cleaned = [
                x for x in refined
                if final_item_filter(x, top_headers, all_names)
            ]

            cleaned = post_validate_items(cleaned)

            totals = detect_subtotals_and_totals(row_texts)

            # crude page classification from keywords
            page_type = "Bill Detail"
            low = page_text.lower()
            if "pharmacy" in low:
                page_type = "Pharmacy"
            if "final bill" in low or "grand total" in low:
                page_type = "Final Bill"

            pagewise.append({
                "page_no": str(idx),
                "page_type": page_type,
                "bill_items": cleaned,
                "subtotal": totals["subtotal"],
                "final_page_total": totals["final_total"]
            })

        except:
            # a failing page yields an empty entry rather than a 500
            pagewise.append({
                "page_no": str(idx),
                "page_type": "Bill Detail",
                "bill_items": [],
                "subtotal": None,
                "final_page_total": None
            })

    # global final total = sum of all item amounts
    final_sum = 0.0
    for p in pagewise:
        for it in p["bill_items"]:
            final_sum += it["item_amount"]

    total_item_count = sum(len(p["bill_items"]) for p in pagewise)

    return {
        "is_success": True,
        "token_usage": total_tokens,
        "data": {
            "pagewise_line_items": pagewise,
            "total_item_count": total_item_count,
            "final_total": round(final_sum, 2)
        }
    }
770
 
771
 
772
+ ###############################################
773
+ # DEBUG ENDPOINT
774
+ ###############################################
775
+
776
@app.post("/debug-tsv")
async def debug_tsv(payload: BillRequest):
    """Debug endpoint: return raw Tesseract word cells for one page.

    Mirrors the document-loading logic of /extract-bill-data — in
    particular the PDF branch, which Image.open() alone cannot handle —
    and OCRs only the first page.
    """
    try:
        r = requests.get(payload.document, timeout=20)
        if r.status_code != 200:
            return {"error": "debug failed"}
        data = r.content
        # PDFs must be rasterised first; Image.open() cannot read them.
        if payload.document.lower().split("?")[0].endswith(".pdf"):
            img = convert_from_bytes(data)[0]
        else:
            img = Image.open(BytesIO(data))
        proc = preprocess_image(img)
        cells = image_to_tsv_cells(proc)
        return {"cells": cells}
    except Exception:
        # Narrowed from a bare `except:`; any parse/OCR failure reports
        # the same generic error payload.
        return {"error": "debug failed"}
786
+
787
+
788
+ ###############################################
789
+ # HEALTH CHECK
790
+ ###############################################
791
+
 
 
 
 
 
 
 
792
@app.get("/")
def ping():
    """Health check: reports liveness and whether Gemini is configured."""
    message = "Bill extractor live."
    if not GEMINI_API_KEY:
        message += " (Gemini missing)"
    return {"status": "ok", "message": message}