Sathvik-kota committed on
Commit
5ec4a93
·
verified ·
1 Parent(s): 1404047

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +451 -445
app.py CHANGED
@@ -1,8 +1,7 @@
1
- """
2
- Bajaj Finserv Datathon Bill Extraction Service
3
- Clean, modular and human-written version (Option A)
4
- Maintains your exact logic but reorganized for readability and robustness.
5
- """
6
 
7
  import os
8
  import re
@@ -10,542 +9,549 @@ import json
10
  from io import BytesIO
11
  from typing import List, Dict, Any, Optional, Tuple
12
 
13
- import cv2
14
- import numpy as np
15
  import requests
16
  from PIL import Image
17
  from pdf2image import convert_from_bytes
18
- from fastapi import FastAPI
19
- from pydantic import BaseModel
20
  import pytesseract
21
  from pytesseract import Output
22
- import google.generativeai as genai
 
23
 
 
 
 
 
 
24
 
25
- # -------------------------------------------------------
26
- # GEMINI CONFIG
27
- # -------------------------------------------------------
28
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
29
- GEMINI_MODEL = "gemini-2.5-flash"
30
-
31
- if GEMINI_API_KEY:
32
- genai.configure(api_key=GEMINI_API_KEY)
33
-
34
-
35
- # -------------------------------------------------------
36
- # FASTAPI APP
37
- # -------------------------------------------------------
38
- app = FastAPI(title="Bajaj Datathon - Bill Extractor (Clean vA)")
39
 
 
 
40
 
41
  class BillRequest(BaseModel):
42
  document: str
43
 
44
-
45
- # -------------------------------------------------------
46
- # REGEX + CONSTANTS
47
- # -------------------------------------------------------
48
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
49
-
50
- TOTAL_KEYS = re.compile(
51
- r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|"
52
- r"final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
53
- re.I
54
  )
55
-
56
- HEADER_HINT = re.compile(
57
- r"^(consultation|room|nursing|surgery|radiology|laboratory|charges|services|investigation|package|section)$",
58
- re.I
59
- )
60
-
61
- FOOTER_HINT = re.compile(r"(page|printed|date|time|am|pm|printed on)", re.I)
62
-
63
-
64
- # =======================================================
65
- # UTILITY HELPERS
66
- # =======================================================
67
-
68
- def normalize_number(raw: Optional[str]) -> Optional[float]:
69
- """Convert OCR number-like text into a clean float."""
70
- if not raw:
 
71
  return None
72
-
73
- text = re.sub(r"[^\d\-\+\,\.\(\)]", "", str(raw)).strip()
74
- if not text:
75
  return None
76
-
77
- # Handle negative (accounting) format: (150.00)
78
- is_negative = text.startswith("(") and text.endswith(")")
79
- if is_negative:
80
- text = text[1:-1]
81
-
82
- try:
83
- val = float(text.replace(",", ""))
84
- return -val if is_negative else val
85
- except:
86
  return None
87
-
88
-
89
- def is_numeric(text: str) -> bool:
90
- return bool(NUM_RE.search(str(text)))
91
-
92
-
93
- def clean_item_name(text: str) -> str:
94
- """Normalizes the left-side description of an item."""
95
- t = text.replace("—", "-")
96
- t = re.sub(r"\s+", " ", t)
97
- t = t.strip(" -:,.")
98
- t = re.sub(r"\bSG0?(\d+)\b", r"SG\1", t, flags=re.I)
99
- t = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", t, flags=re.I)
100
- return t.strip()
101
-
102
-
103
- # =======================================================
104
- # IMAGE PROCESSING
105
- # =======================================================
106
-
107
- def pil_to_cv(pil: Image.Image) -> np.ndarray:
108
- np_img = np.array(pil)
109
- return np_img if np_img.ndim == 2 else cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
110
-
111
-
112
- def preprocess_image(pil_img: Image.Image) -> np.ndarray:
113
- """Resize, denoise & binarize image to improve OCR accuracy."""
 
114
  pil_img = pil_img.convert("RGB")
115
  w, h = pil_img.size
116
-
117
- # Upscale if very small
118
- if w < 1500:
119
- scale = 1500 / float(w)
120
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
121
-
122
- img = pil_to_cv(pil_img)
123
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
124
  gray = cv2.fastNlMeansDenoising(gray, h=10)
125
-
126
  try:
127
- bw = cv2.adaptiveThreshold(
128
- gray, 255,
129
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
130
- cv2.THRESH_BINARY,
131
- 41, 15
132
- )
133
  except Exception:
134
  _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
135
-
136
- bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
137
  return bw
138
 
139
-
140
- # =======================================================
141
- # OCR TSV PARSING
142
- # =======================================================
143
-
144
- def run_tesseract(cv_img: np.ndarray) -> List[Dict[str, Any]]:
145
- """Extracts word-level bounding boxes and confidence from image."""
146
  try:
147
- data = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
148
- except:
149
- data = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
150
-
151
  cells = []
152
- n = len(data["text"])
153
-
154
  for i in range(n):
155
- txt = str(data["text"][i]).strip()
 
 
 
156
  if not txt:
157
  continue
158
-
159
- conf = float(data["conf"][i]) if data["conf"][i] not in ("", "-1") else -1.0
160
-
161
- left = int(data["left"][i])
162
- top = int(data["top"][i])
163
- w = int(data["width"][i])
164
- h = int(data["height"][i])
165
-
166
- cells.append({
167
- "text": txt,
168
- "conf": conf,
169
- "left": left,
170
- "top": top,
171
- "width": w,
172
- "height": h,
173
- "center_x": left + w / 2,
174
- "center_y": top + h / 2,
175
- })
176
-
177
  return cells
178
 
179
-
180
- # =======================================================
181
- # ROW GROUPING + MERGING
182
- # =======================================================
183
-
184
- def group_cells(cells: List[Dict[str, Any]], tol: int = 12) -> List[List[Dict[str, Any]]]:
185
- """Groups words into horizontal text rows."""
186
  if not cells:
187
  return []
188
-
189
- cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
190
- rows, current = [], [cells[0]]
191
- last = cells[0]["center_y"]
192
-
193
- for c in cells[1:]:
194
- if abs(c["center_y"] - last) <= tol:
195
  current.append(c)
 
196
  else:
197
- rows.append(sorted(current, key=lambda x: x["left"]))
198
  current = [c]
199
- last = c["center_y"]
200
-
201
- rows.append(sorted(current, key=lambda x: x["left"]))
202
  return rows
203
 
204
-
205
- def merge_multiline_descriptions(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
206
- """
207
- Some items have description on one line and numbers on the next.
208
- This merges them into a single row.
209
- """
210
  if not rows:
211
  return rows
212
-
213
  merged = []
214
  i = 0
215
-
216
  while i < len(rows):
217
  row = rows[i]
218
  tokens = [c["text"] for c in row]
219
- row_has_num = any(is_numeric(t) for t in tokens)
220
-
221
- # If row is only text and next row is numeric: merge
222
- if not row_has_num and i + 1 < len(rows):
223
- next_row = rows[i + 1]
224
  next_tokens = [c["text"] for c in next_row]
225
-
226
- if any(is_numeric(t) for t in next_tokens):
227
- # prepend text row to numeric row
228
- new_row = []
229
-
230
- # push all text cells slightly left of next row
231
- base_left = min([c["left"] for c in next_row]) - 50
232
-
233
- offset = 0
234
- for cell in row:
235
- c = dict(cell)
236
- c["left"] = base_left + offset
237
- c["center_x"] = c["left"] + c["width"] / 2
238
- new_row.append(c)
239
- offset += 15
240
-
241
- new_row.extend(next_row)
242
- merged.append(sorted(new_row, key=lambda x: x["left"]))
243
  i += 2
244
  continue
245
-
246
  merged.append(row)
247
  i += 1
248
-
249
  return merged
250
 
251
-
252
- # =======================================================
253
- # COLUMN DETECTION
254
- # =======================================================
255
-
256
- def detect_column_centers(cells: List[Dict[str, Any]], max_cols=4) -> List[float]:
257
- xs = sorted([c["center_x"] for c in cells if is_numeric(c["text"])])
258
-
259
  if not xs:
260
  return []
261
-
262
  if len(xs) == 1:
263
- return xs
264
-
265
- gaps = [xs[i + 1] - xs[i] for i in range(len(xs) - 1)]
266
- gap_thresh = max(30, np.mean(gaps) + 0.6 * np.std(gaps))
267
-
268
  clusters = []
269
  curr = [xs[0]]
270
-
271
  for i, g in enumerate(gaps):
272
- if g > gap_thresh and len(clusters) < max_cols - 1:
273
  clusters.append(curr)
274
- curr = [xs[i + 1]]
275
  else:
276
- curr.append(xs[i + 1])
277
-
278
  clusters.append(curr)
279
- centers = sorted([np.median(c) for c in clusters])[:max_cols]
280
- return centers
 
 
281
 
282
-
283
- def nearest_column(x: float, centers: List[float]) -> int:
284
- distances = [abs(x - c) for c in centers]
 
285
  return int(np.argmin(distances))
286
 
287
-
288
- # =======================================================
289
- # ROW PARSER (MAIN LOGIC)
290
- # =======================================================
291
-
292
- def parse_rows(rows: List[List[Dict[str, Any]]], cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
293
- """Extract structured line items using detected columns."""
294
- items = []
295
-
296
- rows = merge_multiline_descriptions(rows)
297
- col_centers = detect_column_centers(cells, max_cols=4)
298
-
299
  for row in rows:
300
  tokens = [c["text"] for c in row]
301
-
302
  if not tokens:
303
  continue
304
-
305
- joined = " ".join(tokens).lower()
306
-
307
- # Skip footer lines like "Page 1/4"
308
- if FOOTER_HINT.search(joined) and not any(is_numeric(t) for t in tokens):
309
- continue
310
-
311
- # Skip headings that do not contain numbers
312
- if not any(is_numeric(t) for t in tokens):
313
- continue
314
-
315
- # --- Parse row using detected columns ---
316
- left_parts = []
317
- numeric_buckets = {i: [] for i in range(len(col_centers))}
318
-
319
- for c in row:
320
- t = c["text"]
321
- if is_numeric(t):
322
- col = nearest_column(c["center_x"], col_centers) if col_centers else len(col_centers) - 1
323
- numeric_buckets[col].append(t)
324
- else:
325
- left_parts.append(t)
326
-
327
- name = clean_item_name(" ".join(left_parts))
328
- num_cols = len(col_centers)
329
-
330
- # Extract numeric fields by column order (qty, rate, amount)
331
- def bucket(idx): return numeric_buckets.get(idx, [])[-1] if numeric_buckets.get(idx) else None
332
-
333
- amount = normalize_number(bucket(num_cols - 1))
334
- rate = normalize_number(bucket(num_cols - 2)) if num_cols >= 2 else None
335
- qty = normalize_number(bucket(num_cols - 3)) if num_cols >= 3 else None
336
-
337
- # Fallbacks
338
- if amount is None:
339
- for t in reversed(tokens):
340
- if is_numeric(t):
341
- amount = normalize_number(t)
342
- break
343
-
344
- if qty is None and amount and rate:
345
- q_est = amount / rate
346
- rounded = round(q_est)
347
- if abs(q_est - rounded) <= 0.2:
348
- qty = rounded
349
-
350
- if qty is None:
351
- qty = 1.0
352
-
353
- if (rate is None or rate == 0) and qty and amount:
354
- rate = round(amount / qty, 2)
355
-
356
- if amount is None or amount <= 0:
357
  continue
358
-
359
- if HEADER_HINT.search(name):
360
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
- items.append({
363
- "item_name": name or "UNKNOWN",
364
- "item_amount": float(round(amount, 2)),
365
- "item_rate": float(round(rate or 0.0, 2)),
366
- "item_quantity": float(qty)
367
- })
368
-
369
- return items
370
-
371
-
372
- # =======================================================
373
- # DEDUPE ITEMS + DETECT TOTALS
374
- # =======================================================
375
-
376
- def dedupe(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
377
  seen = set()
378
  out = []
379
-
380
  for it in items:
381
- key = (it["item_name"].lower()[:120], round(it["item_amount"], 2))
382
- if key not in seen:
383
- seen.add(key)
384
- out.append(it)
385
-
 
386
  return out
387
 
388
-
389
- # =======================================================
390
- # OPTIONAL: GEMINI REFINEMENT
391
- # =======================================================
392
-
393
- def refine_with_llm(items: List[Dict[str, Any]], text: str):
394
- """Uses Gemini only when inconsistencies are high."""
395
- if not GEMINI_API_KEY:
396
- return items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
397
-
 
 
 
 
 
 
 
 
 
 
 
 
398
  try:
399
- prompt = (
400
- "You are a precise bill-item cleaner. Fix broken names, validate qty = amount/rate, "
401
- "and remove any invalid rows. Return JSON array only.\n\n"
402
- f"Full text: '''{text[:3000]}'''\n"
403
- f"Detected items: {json.dumps(items)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
  )
405
-
406
- model = genai.GenerativeModel(GEMINI_MODEL)
407
- response = model.generate_content(prompt)
408
-
409
  raw = response.text.strip()
410
- raw = raw.replace("```json", "").replace("```", "")
 
 
411
  parsed = json.loads(raw)
412
-
413
- final_items = []
414
- for obj in parsed:
415
- final_items.append({
416
- "item_name": str(obj.get("item_name", "")).strip(),
417
- "item_amount": float(obj.get("item_amount", 0)),
418
- "item_rate": float(obj.get("item_rate", 0)),
419
- "item_quantity": float(obj.get("item_quantity", 1)),
420
- })
421
-
422
- return final_items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
423
-
424
- except:
425
- return items, {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
426
-
427
-
428
- # =======================================================
429
- # MAIN API ENDPOINT
430
- # =======================================================
431
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  @app.post("/extract-bill-data")
433
  async def extract_bill_data(payload: BillRequest):
434
-
435
- # ---------------------------------------------------
436
- # 1. DOWNLOAD FILE
437
- # ---------------------------------------------------
438
  try:
439
- resp = requests.get(payload.document, headers={"User-Agent": "Mozilla"}, timeout=30)
440
- resp.raise_for_status()
441
- data_bytes = resp.content
442
- except:
443
- return {
444
- "is_success": False,
445
- "token_usage": {},
446
- "data": {"pagewise_line_items": [], "total_item_count": 0}
447
- }
448
-
449
- # ---------------------------------------------------
450
- # 2. LOAD PAGES (PDF / IMAGE)
451
- # ---------------------------------------------------
452
- pages = []
453
-
454
- url_no_query = payload.document.split("?", 1)[0].lower()
455
  try:
456
- if url_no_query.endswith(".pdf"):
457
- pages = convert_from_bytes(data_bytes)
 
 
458
  else:
459
- pages = [Image.open(BytesIO(data_bytes))]
460
- except:
461
- return {
462
- "is_success": False,
463
- "token_usage": {},
464
- "data": {"pagewise_line_items": [], "total_item_count": 0}
465
- }
466
-
467
- # ---------------------------------------------------
468
- # 3. PROCESS EACH PAGE
469
- # ---------------------------------------------------
470
- results = []
471
- gemini_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
472
-
473
- for idx, page in enumerate(pages, start=1):
474
- try:
475
- proc = preprocess_image(page)
476
- cells = run_tesseract(proc)
477
- rows = group_cells(cells)
478
-
479
- page_text = " ".join(" ".join(c["text"] for c in r) for r in rows).lower()
480
-
481
- items = parse_rows(rows, cells)
482
- items = dedupe(items)
483
-
484
- # decide whether to refine with LLM
485
- use_llm = False
486
- if GEMINI_API_KEY and len(items) > 0:
487
- inconsistent = sum(
488
- 1 for it in items
489
- if abs(it["item_quantity"] * it["item_rate"] - it["item_amount"]) > max(2, 0.03 * it["item_amount"])
490
- )
491
- if inconsistent > max(1, len(items) // 6):
492
- use_llm = True
493
-
494
- if use_llm:
495
- items, usage = refine_with_llm(items, page_text)
496
- for k in gemini_usage:
497
- gemini_usage[k] += usage[k]
498
-
499
- results.append({
500
- "page_no": str(idx),
501
- "page_type": "Bill Detail",
502
- "bill_items": items,
503
- })
504
-
505
- except Exception:
506
- results.append({
507
- "page_no": str(idx),
508
- "page_type": "Bill Detail",
509
- "bill_items": []
510
- })
511
 
512
- total_count = sum(len(p["bill_items"]) for p in results)
 
513
 
514
- return {
515
- "is_success": True,
516
- "token_usage": gemini_usage,
517
- "data": {
518
- "pagewise_line_items": results,
519
- "total_item_count": total_count
520
- }
521
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
 
 
 
 
523
 
524
- # -------------------------------------------------------
525
- # RAW TSV DEBUG ENDPOINT
526
- # -------------------------------------------------------
527
  @app.post("/debug-tsv")
528
  async def debug_tsv(payload: BillRequest):
 
529
  try:
530
- resp = requests.get(payload.document, timeout=20)
531
- resp.raise_for_status()
532
- data = resp.content
533
- except:
534
- return {"error": "Unable to download"}
535
-
536
- url = payload.document.split("?", 1)[0].lower()
537
-
538
- if url.endswith(".pdf"):
539
- img = convert_from_bytes(data)[0]
540
  else:
541
- img = Image.open(BytesIO(data))
542
-
543
  proc = preprocess_image(img)
544
- return {"cells": run_tesseract(proc)}
545
-
546
 
547
  @app.get("/")
548
- def root():
549
- return {"status": "ok", "message": "Bill extraction API running"}
550
-
551
-
 
 
1
+ # app_bill_extractor_final.py
2
+ # Humanized, high-accuracy bill extraction API.
3
+ # Combines robust OCR preprocessing, TSV-based layout parsing, numeric-column inference,
4
+ # and ALWAYS attempts Gemini refinement (if GEMINI_API_KEY set). Made compact & readable.
 
5
 
6
  import os
7
  import re
 
9
  from io import BytesIO
10
  from typing import List, Dict, Any, Optional, Tuple
11
 
12
+ from fastapi import FastAPI
13
+ from pydantic import BaseModel
14
  import requests
15
  from PIL import Image
16
  from pdf2image import convert_from_bytes
 
 
17
  import pytesseract
18
  from pytesseract import Output
19
+ import numpy as np
20
+ import cv2
21
 
22
+ # Optional: Google Gemini SDK (if you use it). Code will gracefully work without it.
23
+ try:
24
+ import google.generativeai as genai
25
+ except Exception:
26
+ genai = None
27
 
28
+ # ---------------- LLM CONFIG ----------------
 
 
29
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
30
+ GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
31
+ if GEMINI_API_KEY and genai is not None:
32
+ try:
33
+ genai.configure(api_key=GEMINI_API_KEY)
34
+ except Exception:
35
+ pass
 
 
 
 
36
 
37
+ # ---------------- FastAPI app ----------------
38
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, humanized)")
39
 
40
  class BillRequest(BaseModel):
41
  document: str
42
 
43
+ # ---------------- Regex, small utils ----------------
 
 
 
44
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
45
+ TOTAL_KEYWORDS = re.compile(
46
+ r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
47
+ re.I,
 
 
48
  )
49
+ FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
50
+ HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
51
+
52
+ # sanitize OCR text before ever sending to an LLM or using it for heuristics
53
def sanitize_ocr_text(s: str) -> str:
    """Clean raw OCR text before handing it to heuristics or an LLM.

    Unifies unicode dashes, blanks non-printable/non-ASCII characters,
    normalises line endings, collapses runs of spaces/tabs (newlines are
    kept), and truncates to 4000 characters.
    """
    if not s:
        return ""
    # em dash / en dash -> plain hyphen
    cleaned = s.replace("\u2014", "-").replace("\u2013", "-")
    # drop anything outside tab/newline/CR/printable-ASCII
    cleaned = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", cleaned)
    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = re.sub(r"[ \t]+", " ", cleaned).strip()
    return cleaned[:4000]
63
+
64
def normalize_num_str(s: Optional[str]) -> Optional[float]:
    """Parse an OCR token into a float, or return None when it is not numeric.

    Strips currency symbols and other non-numeric characters, treats the
    accounting form "(123.45)" as negative, and removes thousands commas.
    """
    if s is None:
        return None
    text = str(s).strip()
    if not text:
        return None
    # keep only digits, signs, commas, dots and parentheses
    text = re.sub(r"[^\d\-\+\,\.\(\)]", "", text)
    negative = text.startswith("(") and text.endswith(")")
    if negative:
        text = text[1:-1]
    text = text.replace(",", "")
    if text in ("", "-", "+"):
        return None
    try:
        value = float(text)
    except Exception:
        # NOTE(review): spaces were already stripped by the regex above, so
        # this fallback appears unreachable; kept to preserve behaviour.
        # As in the original, the sign is NOT applied on this path.
        try:
            return float(text.replace(" ", ""))
        except Exception:
            return None
    return -value if negative else value
85
+
86
def is_numeric_token(t: Optional[str]) -> bool:
    """Return True when *t* is non-empty and contains a number-like substring."""
    if not t:
        return False
    return NUM_RE.search(str(t)) is not None
88
+
89
+ def clean_name_text(s: str) -> str:
90
+ s = s.replace("", "-")
91
+ s = re.sub(r"\s+", " ", s)
92
+ s = s.strip(" -:,.")
93
+ s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
94
+ s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
95
+ return s.strip()
96
+
97
+ # ---------------- image preprocessing ----------------
98
def pil_to_cv2(img: Image.Image) -> Any:
    """Convert a PIL image to an OpenCV array.

    Grayscale arrays pass through unchanged; RGB images are converted to
    OpenCV's BGR channel order.
    """
    arr = np.array(img)
    return arr if arr.ndim == 2 else cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
103
+
104
def preprocess_image(pil_img: Image.Image) -> Any:
    """Prepare a page image for OCR: upscale small scans, grayscale, denoise, binarise."""
    pil_img = pil_img.convert("RGB")
    width, height = pil_img.size
    min_width = 1500
    if width < min_width:
        factor = min_width / float(width)
        pil_img = pil_img.resize((int(width * factor), int(height * factor)), Image.LANCZOS)
    gray = cv2.cvtColor(pil_to_cv2(pil_img), cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray, h=10)
    try:
        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 41, 15)
    except Exception:
        # adaptive threshold can reject degenerate inputs; Otsu is the fallback
        _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # NOTE(review): a 1x1 opening kernel looks like a no-op — confirm intent
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
    return binary
122
 
123
+ # ---------------- OCR TSV helpers ----------------
124
def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
    """Run Tesseract in TSV mode and return one cell dict per recognised word.

    Each cell carries the word text, its confidence (-1.0 when unknown), the
    bounding box, and the precomputed box center.
    """
    try:
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
    except Exception:
        # --psm 6 can fail on some inputs; retry with tesseract defaults
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT)

    cells: List[Dict[str, Any]] = []
    for i in range(len(data.get("text", []))):
        word = data["text"][i]
        if word is None:
            continue
        word = str(word).strip()
        if not word:
            continue
        try:
            confidence = float(data["conf"][i]) if data["conf"][i] not in (None, "", "-1") else -1.0
        except Exception:
            confidence = -1.0
        x = int(data.get("left", [0])[i])
        y = int(data.get("top", [0])[i])
        w = int(data.get("width", [0])[i])
        h = int(data.get("height", [0])[i])
        cells.append({
            "text": word,
            "conf": confidence,
            "left": x,
            "top": y,
            "width": w,
            "height": h,
            "center_y": y + h / 2.0,
            "center_x": x + w / 2.0,
        })
    return cells
150
 
151
+ # ---------------- grouping & merging ----------------
152
def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
    """Cluster OCR word cells into visual rows by vertical proximity.

    A cell joins the current row when its center_y is within *y_tolerance*
    of the row's running mean center_y; each finished row is sorted
    left-to-right by its "left" coordinate.
    """
    if not cells:
        return []
    ordered = sorted(cells, key=lambda cell: (cell["center_y"], cell["center_x"]))
    rows: List[List[Dict[str, Any]]] = []
    bucket = [ordered[0]]
    mean_y = ordered[0]["center_y"]
    for cell in ordered[1:]:
        if abs(cell["center_y"] - mean_y) <= y_tolerance:
            bucket.append(cell)
            # keep a running mean of the row's vertical centers
            mean_y = (mean_y * (len(bucket) - 1) + cell["center_y"]) / len(bucket)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            mean_y = cell["center_y"]
    rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
170
 
171
+ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
 
 
 
 
 
172
  if not rows:
173
  return rows
 
174
  merged = []
175
  i = 0
 
176
  while i < len(rows):
177
  row = rows[i]
178
  tokens = [c["text"] for c in row]
179
+ has_num = any(is_numeric_token(t) for t in tokens)
180
+ if not has_num and i + 1 < len(rows):
181
+ next_row = rows[i+1]
 
 
182
  next_tokens = [c["text"] for c in next_row]
183
+ next_has_num = any(is_numeric_token(t) for t in next_tokens)
184
+ if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
185
+ merged_row = []
186
+ min_left = min((c["left"] for c in next_row), default=0)
187
+ offset = 10
188
+ for c in row:
189
+ newc = c.copy()
190
+ newc["left"] = min_left - offset
191
+ newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
192
+ merged_row.append(newc)
193
+ offset += 10
194
+ merged_row.extend(next_row)
195
+ merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
 
 
 
 
 
196
  i += 2
197
  continue
 
198
  merged.append(row)
199
  i += 1
 
200
  return merged
201
 
202
+ # ---------------- numeric column detection ----------------
203
+ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
204
+ xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
 
 
 
 
 
205
  if not xs:
206
  return []
207
+ xs = sorted(xs)
208
  if len(xs) == 1:
209
+ return [xs[0]]
210
+ gaps = [xs[i+1] - xs[i] for i in range(len(xs) - 1)]
211
+ mean_gap = float(np.mean(gaps))
212
+ std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
213
+ gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
214
  clusters = []
215
  curr = [xs[0]]
 
216
  for i, g in enumerate(gaps):
217
+ if g > gap_thresh and len(clusters) < (max_columns - 1):
218
  clusters.append(curr)
219
+ curr = [xs[i+1]]
220
  else:
221
+ curr.append(xs[i+1])
 
222
  clusters.append(curr)
223
+ centers = [float(np.median(c)) for c in clusters]
224
+ if len(centers) > max_columns:
225
+ centers = centers[-max_columns:]
226
+ return sorted(centers)
227
 
228
def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
    """Return the index of the column center closest to *token_x* (None if no columns).

    Ties go to the leftmost (first) column, matching argmin semantics.
    """
    if not column_centers:
        return None
    return min(range(len(column_centers)), key=lambda i: abs(token_x - column_centers[i]))
233
 
234
+ # ---------------- parse rows into items ----------------
235
def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn OCR rows into structured line items using page-level numeric columns.

    When column centers can be detected, numeric tokens are bucketed by
    nearest column and read right-to-left as (..., qty, rate, amount);
    otherwise the rightmost numeric tokens of each row are used positionally.
    Rows with no numbers, and footer rows, are skipped.
    """
    parsed_items: List[Dict[str, Any]] = []
    rows = merge_multiline_names(rows)
    column_centers = detect_numeric_columns(page_cells, max_columns=4)

    for row in rows:
        tokens = [cell["text"] for cell in row]
        if not tokens:
            continue
        joined_lower = " ".join(tokens).lower()
        row_has_number = any(is_numeric_token(t) for t in tokens)
        # footer lines ("page 1/4", timestamps) with no numbers are noise
        if FOOTER_KEYWORDS.search(joined_lower) and not row_has_number:
            continue
        if not row_has_number:
            continue

        if column_centers:
            name_parts: List[str] = []
            buckets: Dict[int, List[str]] = {i: [] for i in range(len(column_centers))}
            for cell in row:
                token = cell["text"]
                if is_numeric_token(token):
                    idx = assign_token_to_column(cell["center_x"], column_centers)
                    if idx is None:
                        idx = len(column_centers) - 1
                    buckets[idx].append(token)
                else:
                    name_parts.append(token)

            raw_name = " ".join(name_parts).strip()
            name = clean_name_text(raw_name) if raw_name else ""
            num_cols = len(column_centers)

            def last_in(idx: int) -> Optional[str]:
                # last token that fell into the bucket wins
                vals = buckets.get(idx, [])
                return vals[-1] if vals else None

            amount = normalize_num_str(last_in(num_cols - 1)) if num_cols >= 1 else None
            rate = normalize_num_str(last_in(num_cols - 2)) if num_cols >= 2 else None
            qty = normalize_num_str(last_in(num_cols - 3)) if num_cols >= 3 else None

            # fallback: take the rightmost numeric token in the row as amount
            if amount is None:
                for token in reversed(tokens):
                    if is_numeric_token(token):
                        amount = normalize_num_str(token)
                        break

            # infer quantity from amount/rate when the ratio is near an integer
            if (qty is None or qty == 0) and amount is not None and rate:
                ratio = amount / rate
                rounded = round(ratio)
                if rounded >= 1 and abs(ratio - rounded) <= max(0.04 * rounded, 0.2):
                    qty = float(rounded)
            # or from a trailing "3x"-style token in the description
            if qty is None:
                for part in reversed(name_parts):
                    m = re.match(r"^(\d+)(?:[xX])?$", part)
                    if m:
                        qty = float(m.group(1))
                        break
            if qty is None:
                qty = 1.0
            if (rate is None or rate == 0) and qty and amount is not None:
                rate = round(amount / qty, 2)

            try:
                amount = float(round(amount, 2)) if amount is not None else None
            except Exception:
                amount = None
            try:
                rate = float(round(rate, 2)) if rate is not None else 0.0
            except Exception:
                rate = 0.0
            try:
                qty = float(qty) if qty is not None else 1.0
            except Exception:
                qty = 1.0
            if amount is None or amount == 0:
                continue
            parsed_items.append({
                "item_name": name if name else "UNKNOWN",
                "item_amount": float(round(amount, 2)),
                "item_rate": float(round(rate, 2)) if rate else 0.0,
                "item_quantity": float(qty) if qty else 1.0,
            })
        else:
            # no columns detected: read the row's numbers positionally
            numeric_positions = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
            if not numeric_positions:
                continue
            last = numeric_positions[-1]
            amt = normalize_num_str(tokens[last])
            if amt is None:
                continue
            name = " ".join(tokens[:last]).strip()
            if not name:
                continue
            rate, qty = 0.0, 1.0
            if len(numeric_positions) >= 2:
                r = normalize_num_str(tokens[numeric_positions[-2]])
                rate = r if r is not None else 0.0
            if len(numeric_positions) >= 3:
                q = normalize_num_str(tokens[numeric_positions[-3]])
                qty = q if q is not None else 1.0
            parsed_items.append({
                "item_name": clean_name_text(name),
                "item_amount": float(round(amt, 2)),
                "item_rate": float(round(rate, 2)),
                "item_quantity": float(qty),
            })
    return parsed_items
341
 
342
+ # ---------------- dedupe & totals ----------------
343
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop repeated line items, keyed on normalised name + rounded amount.

    The first occurrence wins; order is otherwise preserved.
    """
    unique: List[Dict[str, Any]] = []
    seen_keys = set()
    for item in items:
        normalised = re.sub(r"\s+", " ", item["item_name"].lower()).strip()
        fingerprint = (normalised[:120], round(float(item["item_amount"]), 2))
        if fingerprint not in seen_keys:
            seen_keys.add(fingerprint)
            unique.append(item)
    return unique
354
 
355
def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
    """Scan row texts bottom-up for subtotal / final-total lines.

    The first match from the bottom wins for each slot ("sub..." keywords go
    to subtotal, anything else matching TOTAL_KEYWORDS goes to final_total).
    """
    subtotal: Optional[float] = None
    final: Optional[float] = None
    for row_text in reversed(rows_texts):
        if not row_text or not row_text.strip():
            continue
        if not TOTAL_KEYWORDS.search(row_text):
            continue
        match = NUM_RE.search(row_text)
        if not match:
            continue
        value = normalize_num_str(match.group(0))
        if value is None:
            continue
        if re.search(r"sub", row_text, re.I):
            if subtotal is None:
                subtotal = float(round(value, 2))
        elif final is None:
            final = float(round(value, 2))
    return {"subtotal": subtotal, "final_total": final}
371
+
372
+ # ---------------- Gemini refinement (always attempted) ----------------
373
def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
    """Ask Gemini to clean up the heuristically parsed line items.

    Sends the page text plus the parsed items and expects back a JSON array of
    {item_name, item_amount, item_rate, item_quantity} objects. On any failure
    (no API key, SDK error, unparseable response) the original items are
    returned unchanged with zero token usage — refinement is best-effort.

    Fixes over the previous version:
    - `temperature`/`max_output_tokens` were passed as bare kwargs to
      generate_content(), which the SDK rejects with TypeError; they now go
      through `generation_config`, so refinement actually runs.
    - The "system" role is not accepted in the contents list; the system
      prompt now goes via `system_instruction` on the model.
    - Token usage is now reported from `response.usage_metadata` instead of
      always returning zeros.
    """
    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    if not GEMINI_API_KEY or genai is None:
        return page_items, zero_usage
    try:
        safe_text = sanitize_ocr_text(page_text)
        system = (
            "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no text) of objects with keys "
            "item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
            "Do NOT return totals or subtotals as items. Do not invent items. Fix broken names and numeric mismatches."
        )
        # Small few-shot example to anchor the model's output format.
        few_shot = (
            "# EXAMPLE\nitems = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0}]\n"
            "=> [{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n"
        )
        prompt = f"page_text='''{safe_text}'''\nitems = {json.dumps(page_items, ensure_ascii=False)}\n\n{few_shot}\nReturn only a JSON array."
        # System prompts must go via system_instruction; the contents list
        # only accepts "user"/"model" roles in this SDK.
        model = genai.GenerativeModel(GEMINI_MODEL_NAME, system_instruction=system)
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                temperature=0.0,
                max_output_tokens=1000,
            ),
        )
        raw = response.text.strip()
        # Strip an optional ```json ... ``` fence around the payload.
        if raw.startswith("```"):
            raw = re.sub(r"^```[a-zA-Z]*", "", raw)
            raw = re.sub(r"```$", "", raw).strip()
        parsed = json.loads(raw)
        # Report real token usage when the SDK provides it.
        usage = dict(zero_usage)
        meta = getattr(response, "usage_metadata", None)
        if meta is not None:
            usage = {
                "total_tokens": int(getattr(meta, "total_token_count", 0) or 0),
                "input_tokens": int(getattr(meta, "prompt_token_count", 0) or 0),
                "output_tokens": int(getattr(meta, "candidates_token_count", 0) or 0),
            }
        if isinstance(parsed, list):
            cleaned = []
            for obj in parsed:
                try:
                    cleaned.append({
                        "item_name": str(obj.get("item_name", "")).strip(),
                        "item_amount": float(obj.get("item_amount", 0.0)),
                        "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
                        "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
                    })
                except Exception:
                    # Skip malformed objects rather than failing the page.
                    continue
            return cleaned, usage
        return page_items, zero_usage
    except Exception:
        # Best-effort: any SDK/parse failure falls back to the raw items.
        return page_items, zero_usage
420
+
421
+ # ---------------- header heuristics & final filter ----------------
422
def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
    """Heuristically decide whether an OCR row is a table/section header.

    Combines several signals: header-keyword substring hits, exact token
    hits, position near the top of the page, digit-free "rate/net ... amt"
    column captions, and common leading words like "description"/"qty".
    """
    if not txt:
        return False
    flat = re.sub(r"\s+", " ", txt.strip().lower())
    # Two or more keyword substrings anywhere in the row is a strong signal.
    substring_hits = sum(1 for kw in HEADER_KEYWORDS if kw in flat)
    if substring_hits >= 2:
        return True
    pieces = re.split(r"[\s\|,/:]+", flat)
    exact_hits = sum(1 for piece in pieces if piece in HEADER_KEYWORDS)
    if exact_hits >= 3:
        return True
    # Short rows near the top of the page need fewer exact hits.
    if top_of_page and len(pieces) <= 10 and exact_hits >= 2:
        return True
    # Column captions like "rate ... amt" carry no digits.
    has_digit = any(ch.isdigit() for ch in flat)
    if ("rate" in flat or "net" in flat) and "amt" in flat and not has_digit:
        return True
    return flat.startswith(("description", "qty", "qty /"))
440
+
441
+
442
def final_item_filter(item: Dict[str, Any], known_page_headers: Optional[List[str]] = None) -> bool:
    """Return True when *item* looks like a real billable line item.

    Rejects items whose name matches a known page header or footer keyword,
    items with implausible amounts (non-positive, > 1,000,000, or a rate more
    than 10x a small amount), names that are just generic section words, and
    near-empty non-alphabetic names.

    Fix: the previous signature used a mutable default (`= []`), a classic
    Python pitfall; a None sentinel is backward compatible for all callers.
    """
    if known_page_headers is None:
        known_page_headers = []
    name = (item.get("item_name") or "").strip()
    if not name:
        return False
    ln = name.lower()
    # Drop anything that embeds a header row detected on this page.
    for h in known_page_headers:
        if h and h.strip() and h.strip().lower() in ln:
            return False
    if FOOTER_KEYWORDS.search(ln):
        return False
    # Sanity cap: no single line item should exceed a million.
    if item.get("item_amount", 0) > 1_000_000:
        return False
    # Very short names with no letters are OCR noise.
    if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
        return False
    # A bare section word is a header leftover, not an item.
    if re.fullmatch(r"(charge|charges|services|laboratory|lab|consultation)", ln.strip(), re.I):
        return False
    if float(item.get("item_amount", 0)) <= 0.0:
        return False
    # A rate wildly larger than a small amount suggests a column mix-up.
    rate = float(item.get("item_rate", 0) or 0)
    amt = float(item.get("item_amount", 0) or 0)
    if rate and rate > amt * 10 and amt < 10000:
        return False
    return True
465
+
466
+ # ---------------- main endpoint ----------------
467
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """Download a bill document, OCR each page, and return structured line items.

    Response shape: {"is_success": bool, "token_usage": {...},
    "data": {"pagewise_line_items": [...], "total_item_count": int}}.
    A failed download returns is_success=False; OCR/parse failures on a page
    yield an empty item list for that page rather than failing the request.
    """
    doc_url = payload.document
    # Fetch the document; a UA header avoids some hosts rejecting bare clients.
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        resp = requests.get(doc_url, headers=headers, timeout=30)
        if resp.status_code != 200:
            raise RuntimeError(f"download failed status={resp.status_code}")
        file_bytes = resp.content
    except Exception:
        # Any download problem maps to an is_success=False envelope.
        return {"is_success": False, "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}, "data": {"pagewise_line_items": [], "total_item_count": 0}}

    images = []
    # Strip the query string so the extension check sees the real suffix.
    clean_url = doc_url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            images = convert_from_bytes(file_bytes)
        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
            images = [Image.open(BytesIO(file_bytes))]
        else:
            # Unknown extension: try PDF rasterization as a best-effort fallback.
            try:
                images = convert_from_bytes(file_bytes)
            except Exception:
                images = []
    except Exception:
        images = []

    pagewise = []
    cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    for idx, page_img in enumerate(images, start=1):
        try:
            # OCR pipeline: preprocess -> TSV cells -> rows -> row texts.
            proc = preprocess_image(page_img)
            cells = image_to_tsv_cells(proc)
            rows = group_cells_into_rows(cells, y_tolerance=12)
            rows_texts = [" ".join([c["text"] for c in r]) for r in rows]
            # Collect header-looking rows from the top of the page so they can
            # be filtered back out of the parsed items.
            top_headers = []
            for i, rt in enumerate(rows_texts[:6]):
                if looks_like_header_text(rt, top_of_page=(i < 4)):
                    top_headers.append(rt.strip().lower())
            parsed_items = parse_rows_with_columns(rows, cells)
            page_text = sanitize_ocr_text(" ".join(rows_texts))
            # Optional LLM cleanup; accumulates token usage per page.
            refined_items, token_u = refine_with_gemini(parsed_items, page_text)
            for k in cumulative_token_usage:
                cumulative_token_usage[k] += token_u.get(k, 0)
            # Filter headers/footers/implausible rows, then dedupe, then drop
            # anything that still reads like a header after refinement.
            cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers)]
            cleaned = dedupe_items(cleaned)
            cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
            # Coarse page classification from keyword presence; "Final Bill"
            # takes precedence over "Pharmacy" because it is checked last.
            page_type = "Bill Detail"
            page_txt = page_text.lower()
            if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
                page_type = "Pharmacy"
            if "final bill" in page_txt or "grand total" in page_txt:
                page_type = "Final Bill"
            pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
        except Exception:
            # Per-page failure is non-fatal: emit an empty page and move on.
            pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
            continue

    total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
    if not GEMINI_API_KEY or genai is None:
        # Flag in the usage block that LLM refinement was skipped entirely.
        cumulative_token_usage["warning_no_gemini"] = 1
    return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
530
 
531
+ # ---------------- debug TSV ----------------
 
 
532
@app.post("/debug-tsv")
async def debug_tsv(payload: BillRequest):
    """Debug endpoint: OCR the first page of a document and return raw TSV cells.

    Returns {"cells": [...]} on success, or {"error": "..."} on any download
    or decode failure.

    Fix: image/PDF decoding is now guarded — previously a corrupt image, a
    non-PDF payload, or a zero-page PDF (imgs[0] -> IndexError) escaped as an
    unhandled 500 instead of the JSON error shape used elsewhere.
    """
    doc_url = payload.document
    try:
        resp = requests.get(doc_url, timeout=20)
        if resp.status_code != 200:
            return {"error": "Download failed"}
        file_bytes = resp.content
    except Exception:
        return {"error": "Download failed"}
    # Strip the query string so the extension check sees the real suffix.
    clean_url = doc_url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            imgs = convert_from_bytes(file_bytes)
            if not imgs:
                return {"error": "Empty PDF"}
            img = imgs[0]
        else:
            img = Image.open(BytesIO(file_bytes))
    except Exception:
        return {"error": "Could not decode document"}

    proc = preprocess_image(img)
    cells = image_to_tsv_cells(proc)
    return {"cells": cells}
551
 
552
@app.get("/")
def health_check():
    """Liveness probe; also reports whether Gemini refinement is available."""
    llm_ready = bool(GEMINI_API_KEY) and genai is not None
    message = "Bill extraction API (final) live."
    if not llm_ready:
        message += " (No GEMINI_API_KEY/configured SDK — LLM refinement skipped.)"
    return {"status": "ok", "message": message, "hint": "POST /extract-bill-data with {'document':'<url>'}"}