Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Files Community

Sathvik-kota committed on Nov 29, 2025

Commit

8803a3c

verified ·

1 Parent(s): 84d69f8

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +441 -224

app.py CHANGED Viewed

@@ -1,46 +1,16 @@
-# app_bill_extractor_final_v2.py
-# Humanized, high-accuracy bill extraction API.
-# Robust OCR preprocessing, TSV layout parsing, numeric-column inference,
-# header prefiltering, deterministic Gemini refinement (if configured).
 import os
 import re
 import json
 from io import BytesIO
 from typing import List, Dict, Any, Optional, Tuple
-from fastapi import FastAPI
-from pydantic import BaseModel
-import requests
 from PIL import Image
-from pdf2image import convert_from_bytes
-import pytesseract
-from pytesseract import Output
 import numpy as np
 import cv2
-# Optional: Google Gemini SDK (if available)
-try:
-    import google.generativeai as genai
-except Exception:
-    genai = None
-# ---------------- LLM CONFIG ----------------
-GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
-if GEMINI_API_KEY and genai is not None:
-    try:
-        genai.configure(api_key=GEMINI_API_KEY)
-    except Exception:
-        pass
-# ---------------- FastAPI app ----------------
-app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, humanized)")
-class BillRequest(BaseModel):
-    document: str
-# ---------------- Regex and keywords ----------------
 NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
 TOTAL_KEYWORDS = re.compile(
     r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
@@ -48,8 +18,10 @@ TOTAL_KEYWORDS = re.compile(
 )
 FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
-# generalized header-related tokens & exact header phrase blacklist (common variants)
-HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
 HEADER_PHRASES = [
     "description qty / hrs consultation rate discount net amt",
     "description qty / hrs rate discount net amt",
@@ -59,7 +31,7 @@ HEADER_PHRASES = [
 ]
 HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
-# ---------------- small utilities ----------------
 def sanitize_ocr_text(s: str) -> str:
     if not s:
         return ""
@@ -96,13 +68,41 @@ def is_numeric_token(t: Optional[str]) -> bool:
     return bool(t and NUM_RE.search(str(t)))
 def clean_name_text(s: str) -> str:
-    s = s.replace("—", "-")
     s = re.sub(r"\s+", " ", s)
     s = s.strip(" -:,.")
     s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
     s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
-    # fix common OCR mistakes for doctor prefixes
-    s = re.sub(r"\bOR\b", "DR", s)  # sometimes OCR turns 'DR' -> 'OR'
     return s.strip()
 # ---------------- image preprocessing ----------------
@@ -121,22 +121,27 @@ def preprocess_image(pil_img: Image.Image) -> Any:
         pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
     cv_img = pil_to_cv2(pil_img)
     gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
-    gray = cv2.fastNlMeansDenoising(gray, h=10)
     try:
-        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 41, 15)
     except Exception:
         _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     kernel = np.ones((1,1), np.uint8)
     bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
     return bw
-# ---------------- OCR TSV ----------------
 def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
     try:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
     except Exception:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
-    cells = []
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
@@ -155,15 +160,24 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
         height = int(o.get("height", [0])[i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
-        cells.append({"text": txt, "conf": conf, "left": left, "top": top, "width": width, "height": height, "center_y": center_y, "center_x": center_x})
     return cells
-# ---------------- grouping & merge helpers ----------------
 def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
     if not cells:
         return []
     sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
-    rows = []
     current = [sorted_cells[0]]
     last_y = sorted_cells[0]["center_y"]
     for c in sorted_cells[1:]:
@@ -178,63 +192,106 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
         rows.append(sorted(current, key=lambda cc: cc["left"]))
     return rows
 def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
     if not rows:
         return rows
-    merged = []
     i = 0
     while i < len(rows):
         row = rows[i]
         tokens = [c["text"] for c in row]
         has_num = any(is_numeric_token(t) for t in tokens)
-        # if row looks pure text and next row contains numbers but short left text tokens, merge
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
-            next_tokens = [c["text"] for c in next_row]
             next_has_num = any(is_numeric_token(t) for t in next_tokens)
-            if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
-                merged_row = []
-                min_left = min((c["left"] for c in next_row), default=0)
-                offset = 10
-                for c in row:
-                    newc = c.copy()
-                    newc["left"] = min_left - offset
-                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
-                    merged_row.append(newc)
-                    offset += 10
-                merged_row.extend(next_row)
-                merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
-                i += 2
-                continue
-        # Additional merge: If a row ends with a trailing token like a doctor's name line with single token and next row also text, merge (helps names split across 2+ lines)
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             next_has_num = any(is_numeric_token(t) for t in next_tokens)
-            if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
-                # merge both textual lines into one (keeps relative left ordering by shifting)
-                merged_row = []
-                min_left = min((c["left"] for c in next_row + row), default=0)
-                offset = 10
-                for c in row + next_row:
-                    newc = c.copy()
-                    if newc["left"] > min_left:
-                        newc["left"] = newc["left"]
-                    else:
-                        newc["left"] = min_left - offset
-                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
-                    merged_row.append(newc)
-                    offset += 5
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
         merged.append(row)
         i += 1
     return merged
-# ---------------- numeric column detection ----------------
-# >>> CHANGE: adaptive clustering (restored to conservative adaptive threshold)
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
     xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
     if not xs:
         return []
@@ -265,9 +322,29 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
     distances = [abs(token_x - cx) for cx in column_centers]
     return int(np.argmin(distances))
-# ---------------- parsing rows into items ----------------
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    parsed_items = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
@@ -276,8 +353,10 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
         if not tokens:
             continue
         joined_lower = " ".join(tokens).lower()
         if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
             continue
         if all(not is_numeric_token(t) for t in tokens):
             continue
@@ -288,10 +367,14 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                 v = normalize_num_str(t)
                 if v is not None:
                     numeric_values.append(float(v))
-        # de-duplicate and sort descending (larger candidates first)
-        numeric_values = sorted(list({int(x) if float(x).is_integer() else x for x in numeric_values}), reverse=True)
         if column_centers:
             left_text_parts = []
             numeric_bucket_map = {i: [] for i in range(len(column_centers))}
             for c in row:
@@ -300,7 +383,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                 if is_numeric_token(t):
                     col_idx = assign_token_to_column(cx, column_centers)
                     if col_idx is None:
-                        numeric_bucket_map[len(column_centers) - 1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
                 else:
@@ -317,6 +400,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             rate   = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
             qty    = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t):
@@ -324,70 +408,91 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                         if amount is not None:
                             break
-            # >>> CHANGE: safer inference — skip tiny candidates like 1, enforce qty bounds, require close ratio
-            if amount is not None and numeric_values:
-                # Only accept candidate as rate if candidate >= 2 (or amount is tiny) and candidate < amount
-                for cand in numeric_values:
-                    try:
-                        cand_float = float(cand)
-                    except:
-                        continue
-                    if cand_float <= 1.0:
                         continue
-                    if amount <= 5 and cand_float < 1.0:
                         continue
-                    if cand_float >= amount:
-                        continue
-                    ratio = amount / cand_float if cand_float else None
                     if ratio is None:
                         continue
                     r = round(ratio)
                     if r < 1 or r > 200:
                         continue
-                    # require relative closeness threshold (adaptive)
                     if abs(ratio - r) <= max(0.03 * r, 0.15):
-                        # Accept only if qty reasonable (<=100)
-                        if r <= 100:
-                            rate = cand_float
-                            qty = float(r)
-                            break
             # fallback compute rate if qty found but rate missing
-            if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
                 try:
-                    candidate_rate = amount / qty
-                    # require candidate_rate > 1 (avoid tiny rates) and reasonable
-                    if candidate_rate >= 2:
-                        rate = candidate_rate
                 except Exception:
                     pass
             # final defaults
-            if qty is None:
-                qty = 1.0
-            # normalize and sanity-check
             try:
-                amount = float(round(amount, 2))
             except Exception:
-                continue
             try:
-                rate = float(round(rate, 2)) if rate is not None else 0.0
             except Exception:
-                rate = 0.0
             try:
-                qty = float(qty)
             except Exception:
-                qty = 1.0
             parsed_items.append({
                 "item_name": name if name else "UNKNOWN",
-                "item_amount": amount,
-                "item_rate": rate if rate is not None else 0.0,
-                "item_quantity": qty if qty is not None else 1.0,
             })
         else:
             numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
             if not numeric_idxs:
                 continue
@@ -398,29 +503,28 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             name = " ".join(tokens[:last]).strip()
             if not name:
                 continue
-            rate = None; qty = None
-            # try to pick rate/qty from previous numeric tokens (right-to-left)
-            # and use the safer inference logic (ignore candidate == 1)
             right_nums = []
             for i in numeric_idxs:
                 v = normalize_num_str(tokens[i])
                 if v is not None:
                     right_nums.append(float(v))
-            right_nums = sorted(list({int(x) if float(x).is_integer() else x for x in right_nums}), reverse=True)
-            # attempt direct mapping: last numeric = amount, previous maybe rate / qty
             if len(right_nums) >= 2:
                 cand = right_nums[1]
                 if float(cand) > 1 and float(cand) < float(amt):
-                    # check ratio
                     ratio = float(amt) / float(cand) if cand else None
                     if ratio:
                         r = round(ratio)
                         if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
                             rate = float(cand)
                             qty = float(r)
-            # fallback: conservative search like above
             if rate is None and right_nums:
                 for cand in right_nums:
                     if cand <= 1.0 or cand >= float(amt):
@@ -437,6 +541,17 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             if rate is None:
                 rate = 0.0
             parsed_items.append({
                 "item_name": clean_name_text(name),
                 "item_amount": float(round(amt, 2)),
@@ -449,10 +564,10 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
 # ---------------- dedupe & totals ----------------
 def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     seen = set()
-    out = []
     for it in items:
-        nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
-        key = (nm[:120], round(float(it["item_amount"]), 2))
         if key in seen:
             continue
         seen.add(key)
@@ -476,8 +591,11 @@ def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[flo
                     if final is None: final = float(round(v, 2))
     return {"subtotal": subtotal, "final_total": final}
-# ---------------- Gemini refinement (deterministic) ----------------
 def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
     if not GEMINI_API_KEY or genai is None:
         return page_items, zero_usage
@@ -486,18 +604,36 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
         system_prompt = (
             "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
             "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
-            "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
-        )
-        user_prompt = (
-            f"page_text='''{safe_text}'''\n"
-            f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
-            "Example:\n"
-            "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
-            "         {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
-            "=>\n"
-            "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
-            "Return only the cleaned JSON array of items."
         )
         model = genai.GenerativeModel(GEMINI_MODEL_NAME)
         response = model.generate_content(
             [
@@ -524,79 +660,85 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
                     })
                 except Exception:
                     continue
             return cleaned, zero_usage
         return page_items, zero_usage
     except Exception:
         return page_items, zero_usage
-# ---------------- header heuristics & final filter ----------------
def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
    """Return True when a row's text looks like a table header, not an item.

    The text is lower-cased and whitespace-normalized, then checked against
    the exact-phrase blacklist (``HEADER_PHRASES``), keyword counts from
    ``HEADER_KEYWORDS`` (as substrings and as whole tokens), a digit-free
    "rate/net ... amt" pattern, and a "description"/"qty" prefix.

    Args:
        txt: Row text to classify.
        top_of_page: When True, two whole-token keyword hits in a row of at
            most ten tokens are enough to call it a header.
    """
    if not txt:
        return False
    normalized = re.sub(r"\s+", " ", txt.strip().lower())

    # Exact match against the known header phrases.
    if normalized in HEADER_PHRASES:
        return True

    # Two or more keywords appearing anywhere in the text.
    substring_hits = sum(keyword in normalized for keyword in HEADER_KEYWORDS)
    if substring_hits >= 2:
        return True

    # Keywords appearing as whole tokens; the threshold is laxer near the top.
    pieces = re.split(r"[\s\|,/:]+", normalized)
    whole_token_hits = sum(piece in HEADER_KEYWORDS for piece in pieces)
    if whole_token_hits >= 3:
        return True
    if top_of_page and len(pieces) <= 10 and whole_token_hits >= 2:
        return True

    # "rate"/"net" together with "amt" and no digits at all.
    mentions_rate_or_net = "rate" in normalized or "net" in normalized
    contains_digit = any(ch.isdigit() for ch in normalized)
    if mentions_rate_or_net and "amt" in normalized and not contains_digit:
        return True

    # The "qty /" prefix is already covered by "qty".
    return normalized.startswith(("description", "qty"))
def final_item_filter(item: Dict[str, Any], known_page_headers: Optional[List[str]] = None, other_item_names: Optional[List[str]] = None) -> bool:
    """Decide whether a parsed line item should be kept in the final output.

    An item is rejected (returns False) when any of the following holds:
      * its name is empty, or contains one of ``known_page_headers``;
      * its name matches ``FOOTER_KEYWORDS``;
      * its amount is above 1,000,000 or not positive;
      * its name is at most two characters and contains no letter;
      * it is a short, generic header/summary label and the *other* items on
        the page contain more specific entries (room, rent, nursing, ...);
      * its rate exceeds ten times its amount while the amount is < 10000.

    Args:
        item: Dict with ``item_name``, ``item_amount`` and ``item_rate``.
            Missing, ``None`` or non-numeric amounts/rates are treated as 0.
        known_page_headers: Lower-cased header texts detected on the page.
        other_item_names: Names of the items on the page, used as context.
            One entry equal to this item's own name is ignored, so an item
            cannot act as its own "more specific" evidence.

    Returns:
        True to keep the item, False to drop it.
    """
    # None defaults instead of shared mutable list defaults.
    known_page_headers = known_page_headers or []
    context_names = list(other_item_names or [])

    name = (item.get("item_name") or "").strip()
    if not name:
        return False
    ln = name.lower()

    def _as_float(key: str) -> float:
        # Coerce once; None or garbage counts as 0 instead of raising.
        try:
            return float(item.get(key) or 0)
        except (TypeError, ValueError):
            return 0.0

    amt = _as_float("item_amount")
    rate = _as_float("item_rate")

    # Drop a single occurrence of this item's own name from the context.
    for pos, other in enumerate(context_names):
        if (other or "").strip() == name:
            del context_names[pos]
            break
    lower_other = " ".join(n or "" for n in context_names).lower()

    # Name contains a detected page header.
    for h in known_page_headers:
        if h and h.strip() and h.strip().lower() in ln:
            return False
    # NOTE(review): as visible in this file, FOOTER_KEYWORDS contains bare
    # "am"/"pm" alternatives, so this check is a substring match - confirm.
    if FOOTER_KEYWORDS.search(ln):
        return False
    if amt > 1_000_000:
        return False
    if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
        return False

    # Short label made of header words: treat it as a section header when
    # other items are more detailed, or when it is a bare generic word.
    words = ln.split()
    header_word_hits = sum(1 for k in HEADER_KEYWORDS if k in ln)
    if header_word_hits >= 1 and len(words) <= 3:
        # NOTE(review): "ot" is matched as a substring, so it also hits words
        # such as "other" or "total" - confirm whether that is intended.
        if any(k in lower_other for k in ("room", "rent", "nursing", "ward", "surgeon", "anaes", "ot", "charges", "procedure", "radiology")):
            return False
        if ln in ("charge", "charges", "services", "consultation", "room", "radiology", "surgery"):
            return False

    # Short summary-like labels are dropped when specific items exist.
    if len(words) <= 4 and re.search(r"\b(charges|services|room|radiolog|laborat|surgery|procedure|rent|nursing)\b", ln):
        if any(tok in lower_other for tok in ("rent", "room", "ward", "nursing", "surgeon", "anaes", "ot")):
            return False

    if amt <= 0.0:
        return False
    # Sanity check: a rate far above the amount suggests mis-assigned columns.
    if rate and rate > amt * 10 and amt < 10000:
        return False
    return True
 # ---------------- main endpoint ----------------
 @app.post("/extract-bill-data")
 async def extract_bill_data(payload: BillRequest):
     doc_url = payload.document
     try:
         headers = {"User-Agent": "Mozilla/5.0"}
         resp = requests.get(doc_url, headers=headers, timeout=30)
@@ -604,8 +746,17 @@ async def extract_bill_data(payload: BillRequest):
             raise RuntimeError(f"download failed status={resp.status_code}")
         file_bytes = resp.content
     except Exception:
-        return {"is_success": False, "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}, "data": {"pagewise_line_items": [], "total_item_count": 0}}
     images = []
     clean_url = doc_url.split("?", 1)[0].lower()
     try:
@@ -616,7 +767,7 @@ async def extract_bill_data(payload: BillRequest):
         else:
             try:
                 images = convert_from_bytes(file_bytes)
-            except Exception:
                 images = []
     except Exception:
         images = []
@@ -624,49 +775,68 @@ async def extract_bill_data(payload: BillRequest):
     pagewise = []
     cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
     for idx, page_img in enumerate(images, start=1):
         try:
             proc = preprocess_image(page_img)
             cells = image_to_tsv_cells(proc)
             rows = group_cells_into_rows(cells, y_tolerance=12)
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
-            # === HEADER PREFILTER: remove header-like rows anywhere on page ===
             rows_filtered = []
             for i, (r, rt) in enumerate(zip(rows, rows_texts)):
                 top_flag = (i < 6)
                 rt_norm = sanitize_ocr_text(rt).lower()
                 if looks_like_header_text(rt_norm, top_of_page=top_flag):
                     continue
                 if any(h in rt_norm for h in HEADER_PHRASES):
                     continue
                 rows_filtered.append(r)
-            # recompute row texts and a simple page_text
             rows = rows_filtered
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
             page_text = sanitize_ocr_text(" ".join(rows_texts))
-            # detect page-level top headers (for final filtering)
             top_headers = []
             for i, rt in enumerate(rows_texts[:6]):
-                if looks_like_header_text(rt, top_of_page=(i < 4)):
                     top_headers.append(rt.strip().lower())
             parsed_items = parse_rows_with_columns(rows, cells)
-            # ALWAYS attempt Gemini refinement if available (deterministic settings)
             refined_items, token_u = refine_with_gemini(parsed_items, page_text)
             for k in cumulative_token_usage:
                 cumulative_token_usage[k] += token_u.get(k, 0)
-            # Prepare other_item_names for contextual filtering (helps remove generic section headers)
-            other_item_names = [it.get("item_name","") for it in refined_items]
-            # final cleaning & dedupe
-            cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names)]
             cleaned = dedupe_items(cleaned)
             cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
             page_type = "Bill Detail"
             page_txt = page_text.lower()
             if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
@@ -674,16 +844,55 @@ async def extract_bill_data(payload: BillRequest):
             if "final bill" in page_txt or "grand total" in page_txt:
                 page_type = "Final Bill"
-            pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
         except Exception:
-            pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
             continue
     total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
     if not GEMINI_API_KEY or genai is None:
         cumulative_token_usage["warning_no_gemini"] = 1
-    return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
 # ---------------- debug TSV ----------------
 @app.post("/debug-tsv")
@@ -696,19 +905,27 @@ async def debug_tsv(payload: BillRequest):
         file_bytes = resp.content
     except Exception:
         return {"error": "Download failed"}
     clean_url = doc_url.split("?", 1)[0].lower()
     if clean_url.endswith(".pdf"):
         imgs = convert_from_bytes(file_bytes)
         img = imgs[0]
     else:
         img = Image.open(BytesIO(file_bytes))
     proc = preprocess_image(img)
     cells = image_to_tsv_cells(proc)
     return {"cells": cells}
 @app.get("/")
 def health_check():
-    msg = "Bill extraction API (final) live."
     if not GEMINI_API_KEY or genai is None:
-        msg += " (No GEMINI_API_KEY/configured SDK — LLM refinement skipped.)"
-    return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url>'}"}

 import os
 import re
 import json
 from io import BytesIO
 from typing import List, Dict, Any, Optional, Tuple
 from PIL import Image
 import numpy as np
 import cv2
+import pytesseract
+from pytesseract import Output
+# ---------------- Config / Keywords ----------------
 NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
 TOTAL_KEYWORDS = re.compile(
     r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
 )
 FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
+HEADER_KEYWORDS = [
+    "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
+    "consultation", "qty/hrs", "qty / hrs", "qty /", "qty/"
+]
 HEADER_PHRASES = [
     "description qty / hrs consultation rate discount net amt",
     "description qty / hrs rate discount net amt",
 ]
 HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
+# ---------------- Small utilities ----------------
 def sanitize_ocr_text(s: str) -> str:
     if not s:
         return ""
     return bool(t and NUM_RE.search(str(t)))
 def clean_name_text(s: str) -> str:
+    """
+    Normalize OCR names: remove odd punctuation, normalize SG codes, RR-2, and
+    safely map OR->DR only when it looks like a doctor's name.
+    """
+    if not s:
+        return s
+    s = s.replace("—", "-").replace("–", "-")
     s = re.sub(r"\s+", " ", s)
     s = s.strip(" -:,.")
+    # SG code normalization
     s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
     s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
+    # Safer OR -> DR: only when pattern looks like a doctor name (e.g. "OR S SALIL KUMAR")
+    # Heuristic: 'OR' token followed by one or more tokens that are all alphabetic
+    # and at least one seems like a personal name (length > 2).
+    def safe_or_to_dr(text: str) -> str:
+        toks = text.split()
+        out = []
+        i = 0
+        while i < len(toks):
+            tok = toks[i]
+            if tok.upper() == "OR" and i + 1 < len(toks):
+                lookahead = toks[i+1:i+5]  # check up to 4 following tokens
+                # all lookahead tokens are alphabetic-ish and at least one token length>2
+                if all(re.match(r"^[A-Za-z\-\.\']+$", la) for la in lookahead if la) and any(len(la) > 2 for la in lookahead):
+                    out.append("DR")
+                    i += 1
+                    continue
+            out.append(tok)
+            i += 1
+        return " ".join(out)
+    s = safe_or_to_dr(s)
     return s.strip()
 # ---------------- image preprocessing ----------------
         pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
     cv_img = pil_to_cv2(pil_img)
     gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
+    # denoise
     try:
+        gray = cv2.fastNlMeansDenoising(gray, h=10)
+    except Exception:
+        pass
+    try:
+        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                   cv2.THRESH_BINARY, 41, 15)
     except Exception:
         _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
     kernel = np.ones((1,1), np.uint8)
     bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
     return bw
+# ---------------- OCR TSV helpers ----------------
 def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
     try:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
     except Exception:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
+    cells: List[Dict[str, Any]] = []
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
         height = int(o.get("height", [0])[i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
+        cells.append({
+            "text": txt,
+            "conf": conf,
+            "left": left,
+            "top": top,
+            "width": width,
+            "height": height,
+            "center_y": center_y,
+            "center_x": center_x
+        })
     return cells
+# ---------------- grouping into rows ----------------
 def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
     if not cells:
         return []
     sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
+    rows: List[List[Dict[str, Any]]] = []
     current = [sorted_cells[0]]
     last_y = sorted_cells[0]["center_y"]
     for c in sorted_cells[1:]:
         rows.append(sorted(current, key=lambda cc: cc["left"]))
     return rows
# ---------------- merge multiline names (doctor merge added) ----------------
# A "name-like" word: letters plus the punctuation that occurs in personal names.
_NAME_WORD_RE = re.compile(r"^[A-Za-z\-\.\']+$")
# Hints that a text-only row is a label whose value continues on the next row.
# The pipe is matched OUTSIDE the \b...\b group: "|" is not a word character, so
# wrapping it in word boundaries only matches when it is glued between two word
# characters ("a|b") and misses the usual "Consultation Charge | ..." layout.
_NAME_LABEL_HINT_RE = re.compile(r"\b(?:consultation|charges?|dr)\b|\|", re.I)


def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
    """
    Merge item/name text that OCR split across two consecutive rows.

    A row is only considered as the start of a merge when none of its cell
    texts is numeric (per ``is_numeric_token``) and it has a following row.
    It is then merged with that following row when either rule holds:

    - Label + name: the current row contains 'consultation', 'charge(s)',
      'dr' or a '|' character, and the next row is at most 6 whitespace-separated
      words, all made of letters / '-' / '.' / "'", none of them numeric.
    - Short text pair: the current row has at most 3 cells, the next row has at
      most 4 cells, and the next row has no numeric cell.

    Args:
        rows: Rows of cell dicts; each cell must provide "text" and "left".

    Returns:
        A new list of rows. Merged rows contain the cells of both source rows
        ordered by their "left" value; all other rows are passed through as-is.
        An empty/falsy ``rows`` is returned unchanged.
    """
    if not rows:
        return rows

    merged: List[List[Dict[str, Any]]] = []
    i = 0
    while i < len(rows):
        row = rows[i]
        tokens = [c["text"] for c in row]
        has_num = any(is_numeric_token(t) for t in tokens)

        if not has_num and i + 1 < len(rows):
            next_row = rows[i + 1]
            next_texts = [c["text"] for c in next_row]

            # Rule 1: label row followed by a name-like continuation line.
            next_words = " ".join(next_texts).split()
            next_is_name_line = (
                len(next_words) <= 6
                and all(_NAME_WORD_RE.match(w) for w in next_words)
                and not any(is_numeric_token(w) for w in next_words)
            )
            continues_label = (
                next_is_name_line
                and _NAME_LABEL_HINT_RE.search(" ".join(tokens)) is not None
            )

            # Rule 2: two short, purely textual rows (likely one split name).
            short_text_pair = (
                len(tokens) <= 3
                and len(next_texts) <= 4
                and not any(is_numeric_token(t) for t in next_texts)
            )

            if continues_label or short_text_pair:
                # NOTE(review): ordering by "left" mixes the cells of both lines
                # when they overlap horizontally - confirm this is intended.
                merged.append(sorted(row + next_row, key=lambda cc: cc["left"]))
                i += 2  # the next row has been consumed by the merge
                continue

        # Default: keep the row untouched.
        merged.append(row)
        i += 1
    return merged
+# ---------------- Strong header detection (PATCH 1) ----------------
def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
    """
    Heuristically decide whether a line of OCR text is a table header.

    The text is stripped, lower-cased and whitespace-collapsed, then checked
    against these rules (any hit returns True):

    1. It matches one of the fixed column-layout patterns below.
    2. It is exactly equal to an entry of HEADER_PHRASES.
    3. It contains at least 3 entries of HEADER_KEYWORDS as substrings
       (at least 2 when ``top_of_page`` is set).
    4. At least 3 of its tokens (split on space, '|', ',' and '/') contain a
       match for NUM_RE.

    Args:
        txt: Raw line text; empty/None yields False.
        top_of_page: Lowers the keyword threshold from 3 to 2.

    Returns:
        True when the line is classified as a header, otherwise False.
    """
    if not txt:
        return False

    lowered = txt.strip().lower()
    line = re.sub(r"\s+", " ", lowered)

    # Rule 1: typical column-title sequences.
    layout_patterns = (
        r"description.*qty",
        r"qty.*rate",
        r"rate.*amount",
        r"net\s*amt",
        r"discount",
        r"hrs\s*/\s*qty",
        r"qty\s*/\s*hrs",
    )
    if any(re.search(pattern, line) for pattern in layout_patterns):
        return True

    # Rule 2: the whole line is a blacklisted header phrase.
    for phrase in HEADER_PHRASES:
        if phrase == line:
            return True

    # Rule 3: enough header keywords; the bar is lower near the top of a page.
    keyword_hits = len([kw for kw in HEADER_KEYWORDS if kw in line])
    required_hits = 2 if top_of_page else 3
    if keyword_hits >= required_hits:
        return True

    # Rule 4: three or more number-bearing tokens.
    # NOTE(review): this also fires on ordinary item rows that carry
    # qty / rate / amount, and a comma-grouped value such as "1,200.00" is split
    # into two tokens here - confirm this rule is intended.
    pieces = re.split(r"[ \|,/]+", line)
    numeric_pieces = [piece for piece in pieces if NUM_RE.search(piece)]
    return len(numeric_pieces) >= 3
+# ---------------- parsing rows into items (Part 2) ----------------
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
+    """
+    Adaptive clustering of numeric tokens into column centers (restores conservative adaptive threshold).
+    """
     xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
     if not xs:
         return []
     distances = [abs(token_x - cx) for cx in column_centers]
     return int(np.argmin(distances))
# helper: quick check if item name looks like a lab/test (so we can adjust candidate rules)
LAB_TEST_KEYWORDS = {"ct", "et", "hiv", "hcv", "pt", "rbs", "rft", "ts", "tsh", "hb", "hbsag"}

# Short test codes, matched as whole words against the lower-cased name.
_LAB_CODE_RE = re.compile(r"\b(?:ct|et|hiv|hcv|pt|rbs|rft|tsh|hbsag|hb|pus|group|rh)\b")
# Generic lab vocabulary. "laborat" is a stem, so it must be allowed to run on to
# the end of the word; as a whole word it could never match "laboratory".
_LAB_TERM_RE = re.compile(r"\b(?:test|lab|laborat\w*|cmia|cima|cs)\b")


def looks_like_lab_test(name: str) -> bool:
    """
    Return True when an item name looks like a laboratory test.

    The name is lower-cased and searched for either a known short test code
    (e.g. "hiv", "tsh", "hb") or a generic lab term ("test", "lab",
    "laboratory"/"laboratories", "cmia", "cima", "cs"). All matches are
    whole-word matches, so "collaborate" does not count as "lab".

    Args:
        name: Item name; empty/None yields False.

    Returns:
        True if any code or term is found, otherwise False.
    """
    if not name:
        return False
    lowered = name.lower()
    return bool(_LAB_CODE_RE.search(lowered) or _LAB_TERM_RE.search(lowered))
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Conservative parse: prefer not to invent rate/qty. Uses numeric column mapping, safer inference,
+    and special handling for lab tests to avoid exploding qty.
+    """
+    parsed_items: List[Dict[str, Any]] = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
         if not tokens:
             continue
         joined_lower = " ".join(tokens).lower()
+        # skip footer-like lines unless numeric
         if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
             continue
+        # skip lines with no numeric tokens (likely headers or pure text)
         if all(not is_numeric_token(t) for t in tokens):
             continue
                 v = normalize_num_str(t)
                 if v is not None:
                     numeric_values.append(float(v))
+        # de-duplicate
+        numeric_values = sorted(list({float(x) for x in numeric_values}), reverse=True)
+        # Heuristic: remove tiny tokens that cause qty explosion except when amount < 100
+        # We'll apply this later when we know amount. For now keep them but mark.
         if column_centers:
+            # map numeric tokens to nearest columns
             left_text_parts = []
             numeric_bucket_map = {i: [] for i in range(len(column_centers))}
             for c in row:
                 if is_numeric_token(t):
                     col_idx = assign_token_to_column(cx, column_centers)
                     if col_idx is None:
+                        numeric_bucket_map[len(column_centers)-1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
                 else:
             rate   = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
             qty    = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
+            # fallback: last numeric token as amount
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t):
                         if amount is not None:
                             break
+            # Clean numeric_values now that we may know amount
+            numeric_candidates = numeric_values.copy()
+            if amount is not None:
+                numeric_candidates = [v for v in numeric_candidates if (v >= 5 or amount <= 100)]
+            else:
+                numeric_candidates = [v for v in numeric_candidates if v >= 5]
+            # special handling for lab tests: avoid tiny rates / large qty
+            lab_like = looks_like_lab_test(name)
+            # Try to infer rate & qty from numeric_candidates conservatively
+            inferred_rate = rate
+            inferred_qty = qty
+            if amount is not None and numeric_candidates:
+                # try candidates as rate
+                for cand in numeric_candidates:
+                    if cand <= 1:
                         continue
+                    if cand >= amount:
                         continue
+                    ratio = amount / cand if cand else None
                     if ratio is None:
                         continue
                     r = round(ratio)
                     if r < 1 or r > 200:
                         continue
+                    # stricter for lab tests: reject qty > 10 and candidate < 5
+                    if lab_like and r > 10:
+                        continue
                     if abs(ratio - r) <= max(0.03 * r, 0.15):
+                        inferred_rate = float(cand)
+                        inferred_qty = float(r)
+                        break
             # fallback compute rate if qty found but rate missing
+            if (inferred_rate is None or inferred_rate == 0) and inferred_qty and inferred_qty != 0 and amount is not None:
                 try:
+                    candidate_rate = amount / inferred_qty
+                    if candidate_rate >= 1:
+                        inferred_rate = candidate_rate
                 except Exception:
                     pass
+            # If amount is zero but rate exists and qty exists, compute amount
+            if (amount is None or amount == 0) and inferred_rate and inferred_qty:
+                amount = round(inferred_rate * inferred_qty, 2)
             # final defaults
+            if inferred_qty is None:
+                inferred_qty = 1.0
+            if inferred_rate is None:
+                inferred_rate = 0.0
+            # final sanity checks
             try:
+                amount = float(round(amount, 2)) if amount is not None else None
             except Exception:
+                amount = None
             try:
+                inferred_rate = float(round(inferred_rate, 2)) if inferred_rate is not None else 0.0
             except Exception:
+                inferred_rate = 0.0
             try:
+                inferred_qty = float(inferred_qty)
             except Exception:
+                inferred_qty = 1.0
+            if amount is None or amount == 0:
+                # if amount still zero but we have rate>0 and qty present, compute
+                if inferred_rate and inferred_qty:
+                    amount = round(inferred_rate * inferred_qty, 2)
+            if amount is None or amount == 0:
+                # give up - skip this row (avoid inventing)
+                continue
             parsed_items.append({
                 "item_name": name if name else "UNKNOWN",
+                "item_amount": float(round(amount, 2)),
+                "item_rate": float(round(inferred_rate, 2)) if inferred_rate else 0.0,
+                "item_quantity": float(inferred_qty) if inferred_qty else 1.0,
             })
         else:
+            # no clear numeric columns — conservative right-to-left parsing
             numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
             if not numeric_idxs:
                 continue
             name = " ".join(tokens[:last]).strip()
             if not name:
                 continue
+            # collect numeric tokens on RHS to attempt inference
             right_nums = []
             for i in numeric_idxs:
                 v = normalize_num_str(tokens[i])
                 if v is not None:
                     right_nums.append(float(v))
+            right_nums = sorted(list({float(x) for x in right_nums}), reverse=True)
+            rate = None
+            qty = None
+            # conservative mapping
             if len(right_nums) >= 2:
                 cand = right_nums[1]
                 if float(cand) > 1 and float(cand) < float(amt):
                     ratio = float(amt) / float(cand) if cand else None
                     if ratio:
                         r = round(ratio)
                         if 1 <= r <= 200 and abs(ratio - r) <= max(0.03 * r, 0.15) and r <= 100:
                             rate = float(cand)
                             qty = float(r)
             if rate is None and right_nums:
                 for cand in right_nums:
                     if cand <= 1.0 or cand >= float(amt):
             if rate is None:
                 rate = 0.0
+            # special lab test protections
+            if looks_like_lab_test(name):
+                # if rate <5 and amt>100 -> treat rate as 0 (avoid cand like 12 causing qty 25)
+                if rate < 5 and amt > 100:
+                    rate = 0.0
+                    qty = 1.0
+            # if amount==0 but rate>0, update
+            if amt == 0 and rate and qty:
+                amt = round(rate * qty, 2)
             parsed_items.append({
                 "item_name": clean_name_text(name),
                 "item_amount": float(round(amt, 2)),
 # ---------------- dedupe & totals ----------------
 def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     seen = set()
+    out: List[Dict[str, Any]] = []
     for it in items:
+        nm = re.sub(r"\s+", " ", (it.get("item_name") or "").lower()).strip()
+        key = (nm[:120], round(float(it.get("item_amount", 0.0)), 2))
         if key in seen:
             continue
         seen.add(key)
                     if final is None: final = float(round(v, 2))
     return {"subtotal": subtotal, "final_total": final}
+# ---------------- Gemini refinement (improved prompt per PATCH 7) ----------------
 def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
+    """
+    Attempt deterministic Gemini refinement. If Gemini not configured/available, return page_items as-is.
+    """
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
     if not GEMINI_API_KEY or genai is None:
         return page_items, zero_usage
         system_prompt = (
             "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
             "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
+            "Do NOT include subtotal or total lines as items. Do NOT invent items; only clean/fix/normalize the given items. "
+            "Prefer exact names from the bill. If names are broken across lines, merge them. Do not rename items unless it's obvious OCR noise."
         )
+        user_prompt = f"""
+Extract ONLY line items from this hospital bill.
+### RULES (MUST FOLLOW)
+- Do NOT invent items.
+- Do NOT return section headers (Room Charges, Lab Services, Radiology).
+- Merge broken multi-line names.
+- Reconstruct missing rate/qty using amt=rate*qty if visible in text.
+- Prefer exact names as shown in bill.
+- If a doctor name appears across lines, merge to full name.
+- Ignore totals / subtotals.
+- Ignore page numbers.
+- Avoid changing 'OR' unless it is clearly a doctor prefix.
+- Ignore final bill summaries.
+### OCR TEXT:
+{safe_text}
+### INITIAL ITEMS:
+{json.dumps(page_items, ensure_ascii=False, indent=2)}
+Return ONLY a JSON array of cleaned items, e.g.:
+[
+  {{ "item_name": "Consultation Charge | DR PREETHI MARY JOSEPH", "item_amount": 300.0, "item_rate": 300.0, "item_quantity": 1.0 }},
+  ...
+]
+"""
         model = genai.GenerativeModel(GEMINI_MODEL_NAME)
         response = model.generate_content(
             [
                     })
                 except Exception:
                     continue
+            # token usage not reliably available here; return zeros
             return cleaned, zero_usage
         return page_items, zero_usage
     except Exception:
         return page_items, zero_usage
+# ---------------- Post-validation engine (PATCH 5) ----------------
def post_validate_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Reconcile amount, rate and quantity of each extracted item.

    Steps applied per item, in order:
    1. amount == 0 with positive rate and quantity -> amount = rate * quantity.
    2. rate == 0 with non-zero quantity -> rate = amount / quantity (if positive).
    3. If rate * quantity is off from amount by more than max(2.0, 10%),
       quantity becomes round(amount / rate) when that lies in 1..100
       (1..10 for lab-like names), otherwise 1.
    4. Lab-like items with quantity > 10 are reset to quantity 1.
    5. If rate * quantity is within max(2.0, 5%) of amount, amount is replaced
       by the product; if it is off by more than 50%, quantity becomes 1 and
       rate becomes the amount.

    Args:
        items: Item dicts with "item_name", "item_amount", "item_rate",
            "item_quantity"; missing/falsy values default to "", 0.0, 0.0, 1.0.

    Returns:
        A new list holding the same dict objects, updated in place with the
        rounded "item_amount", "item_rate" and the float "item_quantity".
    """
    validated = []
    for item in items:
        label = item.get("item_name", "") or ""
        amount = float(item.get("item_amount", 0.0) or 0.0)
        unit_rate = float(item.get("item_rate", 0.0) or 0.0)
        quantity = float(item.get("item_quantity", 1.0) or 1.0)
        is_lab = looks_like_lab_test(label)
        max_quantity = 10 if is_lab else 100

        # Step 1: fill in a missing amount.
        if amount == 0 and unit_rate > 0 and quantity > 0:
            amount = round(unit_rate * quantity, 2)

        # Step 2: fill in a missing rate.
        if unit_rate == 0 and quantity:
            try:
                implied_rate = amount / quantity
                if implied_rate > 0:
                    unit_rate = round(implied_rate, 2)
            except Exception:
                pass

        # Step 3: quantity disagrees with amount / rate -> re-derive it.
        if unit_rate > 0:
            expected = unit_rate * quantity
            if abs(expected - amount) > max(2.0, 0.1 * expected):
                try:
                    implied_quantity = round(amount / unit_rate)
                    if 1 <= implied_quantity <= max_quantity:
                        quantity = float(implied_quantity)
                    else:
                        quantity = 1.0
                except Exception:
                    # round() rejects non-finite ratios; fall back to one unit.
                    quantity = 1.0

        # Step 4: lab tests never keep a quantity above 10.
        if is_lab and quantity > 10:
            quantity = 1.0

        # Step 5: final agreement check between amount and rate * quantity.
        if unit_rate > 0:
            product = round(unit_rate * quantity, 2)
            gap = abs(product - amount)
            if gap <= max(2.0, 0.05 * product):
                amount = product
            elif gap / max(1.0, product) > 0.5:
                # Amount is trusted over the pair: treat it as a single unit.
                quantity = 1.0
                unit_rate = round(amount / quantity, 2)

        item["item_amount"] = round(float(amount or 0.0), 2)
        item["item_rate"] = round(float(unit_rate or 0.0), 2)
        item["item_quantity"] = float(quantity or 1.0)
        validated.append(item)
    return validated
 # ---------------- main endpoint ----------------
 @app.post("/extract-bill-data")
 async def extract_bill_data(payload: BillRequest):
     doc_url = payload.document
+    # ---------- download ----------
     try:
         headers = {"User-Agent": "Mozilla/5.0"}
         resp = requests.get(doc_url, headers=headers, timeout=30)
             raise RuntimeError(f"download failed status={resp.status_code}")
         file_bytes = resp.content
     except Exception:
+        return {
+            "is_success": False,
+            "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
+            "data": {
+                "pagewise_line_items": [],
+                "total_item_count": 0,
+                "final_total": 0.0
+            }
+        }
+    # ---------- convert to images ----------
     images = []
     clean_url = doc_url.split("?", 1)[0].lower()
     try:
         else:
             try:
                 images = convert_from_bytes(file_bytes)
+            except:
                 images = []
     except Exception:
         images = []
     pagewise = []
     cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
+    # ---------- per page ----------
     for idx, page_img in enumerate(images, start=1):
         try:
             proc = preprocess_image(page_img)
+            # TSV
             cells = image_to_tsv_cells(proc)
             rows = group_cells_into_rows(cells, y_tolerance=12)
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
+            # ---------------- HEADER PREFILTER ----------------
             rows_filtered = []
             for i, (r, rt) in enumerate(zip(rows, rows_texts)):
                 top_flag = (i < 6)
                 rt_norm = sanitize_ocr_text(rt).lower()
+                # strong header detector (from patched Part 1)
                 if looks_like_header_text(rt_norm, top_of_page=top_flag):
                     continue
+                # legacy blacklist
                 if any(h in rt_norm for h in HEADER_PHRASES):
                     continue
                 rows_filtered.append(r)
             rows = rows_filtered
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
             page_text = sanitize_ocr_text(" ".join(rows_texts))
+            # detect headers at top of page
             top_headers = []
             for i, rt in enumerate(rows_texts[:6]):
+                if looks_like_header_text(rt.lower(), top_of_page=(i < 4)):
                     top_headers.append(rt.strip().lower())
+            # ---------------- PARSE ITEMS ----------------
             parsed_items = parse_rows_with_columns(rows, cells)
+            # ---------------- GEMINI REFINEMENT ----------------
             refined_items, token_u = refine_with_gemini(parsed_items, page_text)
             for k in cumulative_token_usage:
                 cumulative_token_usage[k] += token_u.get(k, 0)
+            # ---------------- CONTEXT-AWARE SECTION FILTER ----------------
+            other_item_names = [it.get("item_name", "") for it in refined_items]
+            cleaned = []
+            for p in refined_items:
+                if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names):
+                    cleaned.append(p)
             cleaned = dedupe_items(cleaned)
+            # drop any leftover header noise
             cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
+            # ---------------- RULE ENGINE POST-VALIDATION ----------------
+            cleaned = post_validate_items(cleaned)
+            # ---------------- PAGE TYPE ----------------
             page_type = "Bill Detail"
             page_txt = page_text.lower()
             if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
             if "final bill" in page_txt or "grand total" in page_txt:
                 page_type = "Final Bill"
+            # ---------------- PER-PAGE SUBTOTAL/TOTAL ----------------
+            detected = detect_subtotals_and_totals(rows_texts)
+            page_subtotal = detected.get("subtotal")
+            page_final = detected.get("final_total")
+            # ---------------- STORE PAGE ----------------
+            pagewise.append({
+                "page_no": str(idx),
+                "page_type": page_type,
+                "bill_items": cleaned,
+                "subtotal": page_subtotal,
+                "final_page_total": page_final
+            })
         except Exception:
+            pagewise.append({
+                "page_no": str(idx),
+                "page_type": "Bill Detail",
+                "bill_items": [],
+                "subtotal": None,
+                "final_page_total": None
+            })
             continue
+    # ---------------- GLOBAL FINAL TOTAL ----------------
     total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
+    # Sum items across all pages (no double counting)
+    grand_total = 0.0
+    for p in pagewise:
+        for it in p.get("bill_items", []):
+            try:
+                grand_total += float(it.get("item_amount", 0.0) or 0.0)
+            except:
+                pass
     if not GEMINI_API_KEY or genai is None:
         cumulative_token_usage["warning_no_gemini"] = 1
+    return {
+        "is_success": True,
+        "token_usage": cumulative_token_usage,
+        "data": {
+            "pagewise_line_items": pagewise,
+            "total_item_count": total_item_count,
+            "final_total": round(grand_total, 2)
+        }
+    }
 # ---------------- debug TSV ----------------
 @app.post("/debug-tsv")
         file_bytes = resp.content
     except Exception:
         return {"error": "Download failed"}
     clean_url = doc_url.split("?", 1)[0].lower()
     if clean_url.endswith(".pdf"):
         imgs = convert_from_bytes(file_bytes)
         img = imgs[0]
     else:
         img = Image.open(BytesIO(file_bytes))
     proc = preprocess_image(img)
     cells = image_to_tsv_cells(proc)
     return {"cells": cells}
# ---------------- health check ----------------
@app.get("/")
def health_check():
    """
    Liveness endpoint.

    Returns a small JSON payload with a status, a human-readable message and a
    usage hint. The message gets a suffix when GEMINI_API_KEY is unset or the
    Gemini SDK (``genai``) is unavailable.
    """
    gemini_ready = bool(GEMINI_API_KEY) and genai is not None
    suffix = "" if gemini_ready else " (No Gemini → LLM refinement disabled)"
    payload = {
        "status": "ok",
        "message": "Bill extraction API (patched v3) live." + suffix,
        "hint": "POST /extract-bill-data with {'document':'<url>'}",
    }
    return payload