Sathvik-kota committed on
Commit
1404047
·
verified ·
1 Parent(s): 5ca19e8

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +440 -411
app.py CHANGED
@@ -1,522 +1,551 @@
1
- # app.py (Final v2 — added multiline-name merging + qty inference)
 
 
 
 
 
2
  import os
3
  import re
4
  import json
5
  from io import BytesIO
6
  from typing import List, Dict, Any, Optional, Tuple
7
 
8
- from fastapi import FastAPI
9
- from pydantic import BaseModel
10
  import requests
11
  from PIL import Image
12
  from pdf2image import convert_from_bytes
 
 
13
  import pytesseract
14
  from pytesseract import Output
15
- import numpy as np
16
- import cv2
17
  import google.generativeai as genai
18
 
19
- # ---------------- LLM CONFIG (Gemini) ----------------
 
 
 
20
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
21
- GEMINI_MODEL_NAME = "gemini-2.5-flash"
 
22
  if GEMINI_API_KEY:
23
  genai.configure(api_key=GEMINI_API_KEY)
24
 
25
- # ---------------- FASTAPI APP ----------------
26
- app = FastAPI(title="Bajaj Datathon - Bill Extractor (v2)")
 
 
 
 
27
 
28
class BillRequest(BaseModel):
    """Request payload: URL of the bill document to extract."""
    document: str
30
 
31
# ---------------- Globals & regex ----------------
NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
TOTAL_KEYWORDS = re.compile(
    r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
    re.I
)
HEADER_LIKE = re.compile(r"^(consultation|room|nursing|surgery|radiology|laborat|laboratory|charges|services|investigation|package|section)$", re.I)
FOOTER_KEYWORDS = re.compile(r"(page|printed|printed on|page\s*\d+|printed:|date:|time:|am|pm)", re.I)

# ---------------- Utilities ----------------
def normalize_num_str(s: Optional[str]) -> Optional[float]:
    """Parse an OCR token like '1,234.50' or '(150)' into a float.

    Returns None when the token holds no parseable number.  A value wrapped
    in parentheses is treated as negative (accounting notation).

    Fixes: removed the dead retry that stripped spaces (the character filter
    below already removes them) and narrowed the broad exception handler to
    ValueError, the only error float() can raise on a str.
    """
    if s is None:
        return None
    # Keep only digits, sign, comma, dot and parentheses; drops currency
    # symbols, spaces and stray OCR characters.
    s = re.sub(r"[^\d\-\+\,\.\(\)]", "", str(s).strip())
    if s == "":
        return None
    negative = False
    if s.startswith("(") and s.endswith(")"):
        negative = True
        s = s[1:-1]
    s = s.replace(",", "")
    if s == "" or s in ("-", "+"):
        return None
    try:
        return -float(s) if negative else float(s)
    except ValueError:
        # Leftover punctuation (e.g. "1.2.3") cannot be parsed.
        return None

def is_numeric_token(t: Optional[str]) -> bool:
    """True when the token contains anything that looks like a number."""
    if not t:
        return False
    return bool(NUM_RE.search(str(t)))

def clean_name_text(s: str) -> str:
    """Normalise an OCR'd item description (dashes, whitespace, codes).

    Fix: dropped the no-op ``s.replace("|", "|")`` present in the original.
    """
    s = s.replace("—", "-")        # em-dash -> ASCII hyphen
    s = re.sub(r"\s+", " ", s)     # collapse whitespace runs
    s = s.strip(" -:,.")           # trim stray punctuation
    # Repair common OCR slips in hospital service codes, e.g. "SG01" -> "SG1".
    s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
    s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
    return s.strip()
78
-
79
# ---------------- preprocessing ----------------
def pil_to_cv2(img: Image.Image) -> Any:
    """Convert a PIL image to an OpenCV array (BGR, or unchanged grayscale)."""
    pixels = np.array(img)
    if pixels.ndim == 2:
        return pixels
    return cv2.cvtColor(pixels, cv2.COLOR_RGB2BGR)

def preprocess_image(pil_img: Image.Image) -> Any:
    """Upscale, denoise and binarize a page image before OCR."""
    pil_img = pil_img.convert("RGB")
    w, h = pil_img.size
    # Guarantee a minimum width so Tesseract sees enough pixels per glyph.
    if w < 1500:
        factor = 1500 / float(w)
        pil_img = pil_img.resize((int(w * factor), int(h * factor)), Image.LANCZOS)
    gray = cv2.cvtColor(pil_to_cv2(pil_img), cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray, h=10)
    # Adaptive threshold copes with uneven lighting; Otsu is the fallback.
    try:
        bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 41, 15)
    except Exception:
        _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # 1x1 opening kernel: kept for parity with the original pipeline.
    return cv2.morphologyEx(bw, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
104
 
105
# ---------------- OCR TSV ----------------
def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
    """Run Tesseract word-level OCR and return one dict per detected word.

    Each cell carries the text, confidence and bounding-box geometry
    (left/top/width/height plus derived centre coordinates).
    """
    try:
        ocr = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
    except Exception:
        # --psm 6 not accepted: retry with the default segmentation mode.
        ocr = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
    cells = []
    for i in range(len(ocr.get("text", []))):
        word = ocr["text"][i]
        if word is None:
            continue
        word = str(word).strip()
        if not word:
            continue
        try:
            confidence = float(ocr["conf"][i]) if ocr["conf"][i] not in (None, "", "-1") else -1.0
        except Exception:
            confidence = -1.0
        x = int(ocr.get("left", [0])[i])
        y = int(ocr.get("top", [0])[i])
        w = int(ocr.get("width", [0])[i])
        h = int(ocr.get("height", [0])[i])
        cells.append({
            "text": word,
            "conf": confidence,
            "left": x,
            "top": y,
            "width": w,
            "height": h,
            "center_y": y + h / 2.0,
            "center_x": x + w / 2.0,
        })
    return cells
132
 
133
# ---------------- group rows ----------------
def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
    """Cluster word cells into visual rows by vertical proximity.

    A word joins the current row while its centre-y is within ``y_tolerance``
    of the row's running mean centre line; each finished row is sorted
    left-to-right.
    """
    if not cells:
        return []
    ordered = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
    rows = []
    bucket = [ordered[0]]
    anchor_y = ordered[0]["center_y"]
    for cell in ordered[1:]:
        if abs(cell["center_y"] - anchor_y) <= y_tolerance:
            bucket.append(cell)
            # Running mean of the row's centre line.
            anchor_y = (anchor_y * (len(bucket) - 1) + cell["center_y"]) / len(bucket)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            anchor_y = cell["center_y"]
    if bucket:
        rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
152
 
153
# ---------------- merge multiline names ----------------
def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
    """
    If a textual row (no numbers) is immediately followed by a numeric row with
    a short left-text, merge the text tokens into the numeric row so the item
    name spans both lines.

    Bug fix: synthetic left positions are now assigned in INCREASING order.
    Previously the offset grew leftwards (``min_left - offset`` with an
    increasing offset), so the later sort by ``left`` reversed the word order
    of every merged name.
    """
    if not rows:
        return rows
    merged = []
    i = 0
    while i < len(rows):
        row = rows[i]
        tokens = [c["text"] for c in row]
        if not any(is_numeric_token(t) for t in tokens) and i + 1 < len(rows):
            next_row = rows[i + 1]
            next_tokens = [c["text"] for c in next_row]
            next_has_num = any(is_numeric_token(t) for t in next_tokens)
            few_words_next = len([t for t in next_tokens if not is_numeric_token(t)]) <= 2
            # Merge only when the next row clearly holds the numbers and has
            # little text of its own, and this row looks like a real name.
            if next_has_num and len(tokens) >= 2 and few_words_next:
                min_left = min((c["left"] for c in next_row), default=0)
                # Re-home the textual cells strictly left of the numeric row,
                # preserving their original order.
                base = min_left - 10 * len(row) - 5
                merged_row = []
                for k, c in enumerate(row):
                    newc = c.copy()
                    newc["left"] = base + 10 * k
                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
                    merged_row.append(newc)
                merged_row.extend(next_row)
                merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                i += 2
                continue
        merged.append(row)
        i += 1
    return merged
198
 
199
# ---------------- numeric column detection ----------------
def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
    """Estimate the x-centres of numeric columns via 1-D gap clustering.

    Sorts the x-centres of all numeric tokens and starts a new cluster
    wherever the gap clearly exceeds the typical spacing, capped at
    ``max_columns`` clusters (keeping the rightmost ones).
    """
    xs = sorted(c["center_x"] for c in cells if is_numeric_token(c["text"]))
    if not xs:
        return []
    if len(xs) == 1:
        return [xs[0]]
    gaps = [b - a for a, b in zip(xs, xs[1:])]
    mean_gap = float(np.mean(gaps))
    std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
    threshold = max(30.0, mean_gap + 0.6 * std_gap)
    clusters = [[xs[0]]]
    for gap, x in zip(gaps, xs[1:]):
        if gap > threshold and len(clusters) < max_columns:
            clusters.append([x])
        else:
            clusters[-1].append(x)
    centers = [float(np.median(c)) for c in clusters]
    if len(centers) > max_columns:
        centers = centers[-max_columns:]
    return sorted(centers)

def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optional[int]:
    """Index of the nearest column centre, or None when no columns exist."""
    if not column_centers:
        return None
    return int(np.argmin([abs(token_x - cx) for cx in column_centers]))
230
 
231
# ---------------- main row parser with qty inference ----------------
def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn OCR rows into structured line items.

    Uses the page-wide numeric column centres to split each row into a textual
    item name (left) and up to three numeric fields (qty, rate, amount, read
    right to left).  Quantity and rate are inferred from the amount when
    missing.  Falls back to "rightmost number is the amount" when no columns
    were detected.

    Fix: removed the dead header-name check — rows with a zero or missing
    amount have already been skipped by the time it ran, so its amount
    condition could never be true.
    """
    parsed_items = []
    # Merge descriptions that wrapped onto the line above their numbers.
    rows = merge_multiline_names(rows)
    column_centers = detect_numeric_columns(page_cells, max_columns=4)
    for row in rows:
        tokens = [c["text"] for c in row]
        if not tokens:
            continue
        joined_lower = " ".join(tokens).lower()
        # Footer lines ("Page 2", "Printed on ...") carry no numbers.
        if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
            continue
        # Section headings carry no numbers either.
        if all(not is_numeric_token(t) for t in tokens):
            continue
        if column_centers:
            left_text_parts = []
            numeric_bucket_map = {i: [] for i in range(len(column_centers))}
            for c in row:
                t = c["text"]
                if is_numeric_token(t):
                    col_idx = assign_token_to_column(c["center_x"], column_centers)
                    if col_idx is None:
                        col_idx = len(column_centers) - 1
                    numeric_bucket_map[col_idx].append(t)
                else:
                    left_text_parts.append(t)
            raw_name = " ".join(left_text_parts).strip()
            name = clean_name_text(raw_name) if raw_name else ""
            num_cols = len(column_centers)

            def get_bucket(idx):
                # Last token wins when a column holds several numbers.
                vals = numeric_bucket_map.get(idx, [])
                return vals[-1] if vals else None

            amount = rate = qty = None
            if num_cols >= 1:
                amount = normalize_num_str(get_bucket(num_cols - 1))
            if num_cols >= 2:
                rate = normalize_num_str(get_bucket(num_cols - 2))
            if num_cols >= 3:
                qty = normalize_num_str(get_bucket(num_cols - 3))
            # Fallback: rightmost number on the row is the amount.
            if amount is None:
                for t in reversed(tokens):
                    if is_numeric_token(t):
                        amount = normalize_num_str(t)
                        break
            # Infer qty = amount / rate when the ratio is close to an integer.
            if (qty is None or qty == 0) and amount is not None and rate:
                ratio = amount / rate
                rounded = round(ratio)
                if rounded >= 1 and abs(ratio - rounded) <= max(0.04 * rounded, 0.2):
                    qty = float(rounded)
            # Last resort: a trailing "3" / "3x" token inside the name text.
            if qty is None:
                for pt in reversed(left_text_parts):
                    m = re.match(r"^(\d+)(?:[xX])?$", pt)
                    if m:
                        qty = float(m.group(1))
                        break
            if qty is None:
                qty = 1.0
            # Infer rate from amount and qty when missing.
            if (rate is None or rate == 0) and qty and amount is not None:
                rate = round(amount / qty, 2)
            # Normalise types defensively; OCR can still produce oddities.
            try:
                amount = float(round(amount, 2)) if amount is not None else None
            except Exception:
                amount = None
            try:
                rate = float(round(rate, 2)) if rate is not None else 0.0
            except Exception:
                rate = 0.0
            try:
                qty = float(qty) if qty is not None else 1.0
            except Exception:
                qty = 1.0
            # Zero / missing amount almost always means a header row.
            if amount is None or amount == 0:
                continue
            parsed_items.append({
                "item_name": name if name else "UNKNOWN",
                "item_amount": float(round(amount, 2)),
                "item_rate": float(round(rate, 2)) if rate else 0.0,
                "item_quantity": float(qty) if qty else 1.0,
            })
        else:
            # No reliable columns: last number is the amount, everything
            # before it is the name.
            numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
            if not numeric_idxs:
                continue
            amt = normalize_num_str(tokens[numeric_idxs[-1]])
            if amt is None:
                continue
            name = " ".join(tokens[:numeric_idxs[-1]]).strip()
            if not name:
                continue
            rate, qty = 0.0, 1.0
            if len(numeric_idxs) >= 2:
                r = normalize_num_str(tokens[numeric_idxs[-2]])
                rate = r if r is not None else 0.0
            if len(numeric_idxs) >= 3:
                q = normalize_num_str(tokens[numeric_idxs[-3]])
                qty = q if q is not None else 1.0
            parsed_items.append({
                "item_name": clean_name_text(name),
                "item_amount": float(round(amt, 2)),
                "item_rate": float(round(rate, 2)),
                "item_quantity": float(qty),
            })
    return parsed_items
350
 
351
# ---------------- dedupe & totals ----------------
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop repeated items, keyed on (whitespace-normalised name, amount)."""
    seen = set()
    unique = []
    for item in items:
        norm_name = re.sub(r"\s+", " ", item["item_name"].lower()).strip()
        fingerprint = (norm_name[:120], round(float(item["item_amount"]), 2))
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique.append(item)
    return unique
363
 
364
def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
    """Scan row texts bottom-up for subtotal / grand-total figures.

    The first matching figure of each kind (seen from the bottom of the page)
    wins; absent figures come back as None.
    """
    subtotal = None
    final = None
    for line in reversed(rows_texts):
        if not line or not line.strip():
            continue
        if not TOTAL_KEYWORDS.search(line):
            continue
        match = NUM_RE.search(line)
        if not match:
            continue
        value = normalize_num_str(match.group(0))
        if value is None:
            continue
        if re.search(r"sub", line, re.I):
            if subtotal is None:
                subtotal = float(round(value, 2))
        elif final is None:
            final = float(round(value, 2))
    return {"subtotal": subtotal, "final_total": final}
380
-
381
# ---------------- Gemini refinement (optional) ----------------
def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
    """Optionally clean extracted items with Gemini; fail open on any error.

    Fix: the previous call passed a contents list containing a "system" role,
    which the google-generativeai SDK rejects (only "user"/"model" roles are
    valid in contents), so every call raised and the refinement silently never
    ran.  The system instruction is now folded into a single user prompt.
    """
    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    if not GEMINI_API_KEY:
        return page_items, zero_usage
    try:
        prompt = (
            "Return only valid JSON array.\n"
            "You are a precise bill extraction cleaner. Given items with item_name, item_quantity, item_rate, item_amount, "
            "fix broken names, infer quantity if qty missing by checking amount and rate, and remove header/footer rows. "
            "Return only a JSON array of cleaned items.\n\n"
            f"page_text='''{page_text[:4000]}'''\nitems = {json.dumps(page_items, ensure_ascii=False)}"
        )
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        response = model.generate_content(prompt)
        raw = response.text.strip()
        # Strip markdown code fences if the model wrapped its answer in them.
        if raw.startswith("```"):
            raw = re.sub(r"^```[a-zA-Z]*", "", raw)
            raw = re.sub(r"```$", "", raw).strip()
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            cleaned = []
            for obj in parsed:
                try:
                    cleaned.append({
                        "item_name": str(obj.get("item_name", "")).strip(),
                        "item_amount": float(obj.get("item_amount", 0.0)),
                        "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
                        "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
                    })
                except Exception:
                    continue
            return cleaned, zero_usage
        return page_items, zero_usage
    except Exception:
        # Best-effort: any API/parse failure falls back to the heuristic items.
        return page_items, zero_usage
 
419
# ---------------- main endpoint ----------------
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """Download a bill (PDF or image), OCR every page and return line items."""
    url = payload.document
    # Step 1: fetch the document bytes.
    try:
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        if resp.status_code != 200:
            raise RuntimeError(f"download failed status={resp.status_code}")
        file_bytes = resp.content
    except Exception:
        return {
            "is_success": False,
            "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
            "data": {"pagewise_line_items": [], "total_item_count": 0},
        }
    # Step 2: rasterise to one PIL image per page.
    images = []
    clean_url = url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            images = convert_from_bytes(file_bytes)
        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
            images = [Image.open(BytesIO(file_bytes))]
        else:
            # Unknown extension: assume PDF, otherwise give up quietly.
            try:
                images = convert_from_bytes(file_bytes)
            except Exception:
                images = []
    except Exception:
        images = []
    # Step 3: OCR + parse each page independently.
    pagewise = []
    cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    for page_no, page_img in enumerate(images, start=1):
        try:
            binarized = preprocess_image(page_img)
            cells = image_to_tsv_cells(binarized)
            rows = group_cells_into_rows(cells, y_tolerance=12)
            rows_texts = [" ".join(c["text"] for c in r) for r in rows]
            totals = detect_subtotals_and_totals(rows_texts)
            parsed_items = parse_rows_with_columns(rows, cells)
            parsed_items = [p for p in parsed_items if not TOTAL_KEYWORDS.search(p.get("item_name", ""))]
            parsed_items = dedupe_items(parsed_items)
            # Step 4: escalate to Gemini when the heuristics look unreliable.
            call_llm = False
            if GEMINI_API_KEY and parsed_items:
                inconsistent = sum(
                    1 for it in parsed_items
                    if abs(it["item_quantity"] * it["item_rate"] - it["item_amount"])
                    > max(2.0, 0.03 * (it["item_amount"] or 1.0))
                )
                if inconsistent > max(1, len(parsed_items) // 6) or len(parsed_items) > 18:
                    call_llm = True
            if call_llm:
                refined, token_u = refine_with_gemini(parsed_items, " ".join(rows_texts))
                parsed_items = refined
                for k in cumulative_token_usage:
                    cumulative_token_usage[k] += token_u.get(k, 0)
            # Step 5: final scrub of header/footer rows and zero amounts.
            final = []
            for it in parsed_items:
                nm = it.get("item_name", "")
                if not nm or HEADER_LIKE.search(nm) or FOOTER_KEYWORDS.search(nm):
                    continue
                if re.search(r"page\s+of|printed\s+on|printed:", nm, re.I):
                    continue
                if float(it.get("item_amount", 0)) <= 0:
                    continue
                final.append(it)
            # Step 6: classify the page from its full text.
            page_type = "Bill Detail"
            page_txt = " ".join(rows_texts).lower()
            if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
                page_type = "Pharmacy"
            if "final bill" in page_txt or "grand total" in page_txt:
                page_type = "Final Bill"
            pagewise.append({"page_no": str(page_no), "page_type": page_type, "bill_items": final})
        except Exception:
            # Never fail the whole request because one page blew up.
            pagewise.append({"page_no": str(page_no), "page_type": "Bill Detail", "bill_items": []})
            continue
    total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
    return {
        "is_success": True,
        "token_usage": cumulative_token_usage,
        "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count},
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
493
 
 
 
 
494
@app.post("/debug-tsv")
async def debug_tsv(payload: BillRequest):
    """Debug helper: OCR the first page of a document and return raw word cells.

    Fixes: the bare ``except:`` is narrowed to ``Exception`` (it previously
    swallowed ``KeyboardInterrupt``/``SystemExit`` too), and the PDF/image
    decoding step is now guarded so a corrupt file yields a JSON error instead
    of an unhandled 500.
    """
    doc_url = payload.document
    try:
        resp = requests.get(doc_url, timeout=20)
        if resp.status_code != 200:
            return {"error": "Download failed"}
        file_bytes = resp.content
    except Exception:
        return {"error": "Download failed"}

    # Rasterise only the first page.
    clean_url = doc_url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            img = convert_from_bytes(file_bytes)[0]
        else:
            img = Image.open(BytesIO(file_bytes))
    except Exception:
        return {"error": "Could not decode document"}

    proc = preprocess_image(img)
    # Return raw OCR cells for debugging.
    return {"cells": image_to_tsv_cells(proc)}
518
 
519
 
520
@app.get("/")
def health_check():
    """Liveness probe that also documents how to call the extractor."""
    return {
        "status": "ok",
        "message": "Bill extraction API (v2) live.",
        "hint": "POST /extract-bill-data with {'document':'<url>'}",
    }
 
 
 
1
+ """
2
+ Bajaj Finserv Datathon – Bill Extraction Service
3
+ Clean, modular and human-written version (Option A)
4
+ Maintains your exact logic but reorganized for readability and robustness.
5
+ """
6
+
7
  import os
8
  import re
9
  import json
10
  from io import BytesIO
11
  from typing import List, Dict, Any, Optional, Tuple
12
 
13
+ import cv2
14
+ import numpy as np
15
  import requests
16
  from PIL import Image
17
  from pdf2image import convert_from_bytes
18
+ from fastapi import FastAPI
19
+ from pydantic import BaseModel
20
  import pytesseract
21
  from pytesseract import Output
 
 
22
  import google.generativeai as genai
23
 
24
+
25
+ # -------------------------------------------------------
26
+ # GEMINI CONFIG
27
+ # -------------------------------------------------------
28
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
29
+ GEMINI_MODEL = "gemini-2.5-flash"
30
+
31
  if GEMINI_API_KEY:
32
  genai.configure(api_key=GEMINI_API_KEY)
33
 
34
+
35
+ # -------------------------------------------------------
36
+ # FASTAPI APP
37
+ # -------------------------------------------------------
38
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (Clean vA)")
39
+
40
 
41
  class BillRequest(BaseModel):
42
  document: str
43
 
44
+
45
+ # -------------------------------------------------------
46
+ # REGEX + CONSTANTS
47
+ # -------------------------------------------------------
48
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
49
+
50
+ TOTAL_KEYS = re.compile(
51
+ r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|"
52
+ r"final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
53
  re.I
54
  )
 
 
55
 
56
+ HEADER_HINT = re.compile(
57
+ r"^(consultation|room|nursing|surgery|radiology|laboratory|charges|services|investigation|package|section)$",
58
+ re.I
59
+ )
60
+
61
+ FOOTER_HINT = re.compile(r"(page|printed|date|time|am|pm|printed on)", re.I)
62
+
63
+
64
+ # =======================================================
65
+ # UTILITY HELPERS
66
+ # =======================================================
67
+
68
def normalize_number(raw: Optional[str]) -> Optional[float]:
    """Convert OCR number-like text (e.g. '1,250.00', '(150)') into a float.

    Parenthesised values use accounting notation and come back negative.
    Returns None for empty or unparseable input.

    Fix: narrowed the bare ``except:`` (which also swallowed
    ``KeyboardInterrupt``/``SystemExit``) to ``ValueError``, the only error
    ``float()`` raises on a str.
    """
    if not raw:
        return None

    # Keep digits, signs, separators and parentheses only.
    text = re.sub(r"[^\d\-\+\,\.\(\)]", "", str(raw)).strip()
    if not text:
        return None

    # Handle negative (accounting) format: (150.00)
    is_negative = text.startswith("(") and text.endswith(")")
    if is_negative:
        text = text[1:-1]

    try:
        value = float(text.replace(",", ""))
    except ValueError:
        return None
    return -value if is_negative else value
87
+
88
+
89
def is_numeric(text: str) -> bool:
    """True when the string contains anything resembling a number."""
    return bool(NUM_RE.search(str(text)))


def clean_item_name(text: str) -> str:
    """Normalizes the left-side description of an item."""
    cleaned = text.replace("—", "-")          # em-dash -> ASCII hyphen
    cleaned = re.sub(r"\s+", " ", cleaned).strip(" -:,.")
    # Repair common OCR slips in service codes ("SG01" -> "SG1", "RR 2" -> "RR-2").
    cleaned = re.sub(r"\bSG0?(\d+)\b", r"SG\1", cleaned, flags=re.I)
    cleaned = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", cleaned, flags=re.I)
    return cleaned.strip()
101
+
102
+
103
+ # =======================================================
104
+ # IMAGE PROCESSING
105
+ # =======================================================
106
+
107
def pil_to_cv(pil: Image.Image) -> np.ndarray:
    """Convert a PIL image to an OpenCV BGR (or unchanged grayscale) array."""
    arr = np.array(pil)
    if arr.ndim == 2:
        return arr
    return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)


def preprocess_image(pil_img: Image.Image) -> np.ndarray:
    """Resize, denoise & binarize image to improve OCR accuracy."""
    pil_img = pil_img.convert("RGB")
    width, height = pil_img.size

    # Upscale small scans so Tesseract has enough pixels per glyph.
    if width < 1500:
        factor = 1500 / float(width)
        pil_img = pil_img.resize((int(width * factor), int(height * factor)), Image.LANCZOS)

    gray = cv2.cvtColor(pil_to_cv(pil_img), cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray, h=10)

    # Adaptive threshold handles uneven lighting; Otsu is the fallback.
    try:
        binary = cv2.adaptiveThreshold(
            gray, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            41, 15
        )
    except Exception:
        _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    return cv2.morphologyEx(binary, cv2.MORPH_OPEN, np.ones((1, 1), np.uint8))
138
 
139
+
140
+ # =======================================================
141
+ # OCR TSV PARSING
142
+ # =======================================================
143
+
144
def run_tesseract(cv_img: np.ndarray) -> List[Dict[str, Any]]:
    """Extracts word-level bounding boxes and confidence from image.

    Fixes: confidence parsing is wrapped in try/except and also recognises the
    integer ``-1`` that newer pytesseract versions emit (the old check only
    matched the string "-1", so an odd value could make ``float()`` raise and
    kill the whole page); the bare ``except`` on the OCR call is narrowed to
    ``Exception``.
    """
    try:
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
    except Exception:
        # --psm 6 not accepted: retry with the default page segmentation.
        data = pytesseract.image_to_data(cv_img, output_type=Output.DICT)

    cells = []
    for i in range(len(data["text"])):
        txt = str(data["text"][i]).strip()
        if not txt:
            continue

        try:
            raw_conf = data["conf"][i]
            conf = float(raw_conf) if raw_conf not in (None, "", "-1", -1) else -1.0
        except Exception:
            conf = -1.0

        left = int(data["left"][i])
        top = int(data["top"][i])
        w = int(data["width"][i])
        h = int(data["height"][i])

        cells.append({
            "text": txt,
            "conf": conf,
            "left": left,
            "top": top,
            "width": w,
            "height": h,
            "center_x": left + w / 2,
            "center_y": top + h / 2,
        })

    return cells
178
 
179
+
180
+ # =======================================================
181
+ # ROW GROUPING + MERGING
182
+ # =======================================================
183
+
184
def group_cells(cells: List[Dict[str, Any]], tol: int = 12) -> List[List[Dict[str, Any]]]:
    """Groups words into horizontal text rows.

    A word joins the current row while its vertical centre is within ``tol``
    pixels of the row's first word; each finished row is sorted left-to-right.
    """
    if not cells:
        return []

    ordered = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
    rows: List[List[Dict[str, Any]]] = []
    bucket = [ordered[0]]
    anchor = ordered[0]["center_y"]

    for cell in ordered[1:]:
        if abs(cell["center_y"] - anchor) <= tol:
            bucket.append(cell)
        else:
            rows.append(sorted(bucket, key=lambda c: c["left"]))
            bucket = [cell]
            anchor = cell["center_y"]

    rows.append(sorted(bucket, key=lambda c: c["left"]))
    return rows
203
 
204
+
205
def merge_multiline_descriptions(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
    """
    Some items have description on one line and numbers on the next.
    This merges them into a single row.
    """
    if not rows:
        return rows

    out: List[List[Dict[str, Any]]] = []
    idx = 0

    while idx < len(rows):
        row = rows[idx]
        text_only = not any(is_numeric(c["text"]) for c in row)

        if text_only and idx + 1 < len(rows):
            follower = rows[idx + 1]
            if any(is_numeric(c["text"]) for c in follower):
                # Re-home the text cells just left of the numeric row so the
                # left-to-right sort keeps the description first.
                anchor = min(c["left"] for c in follower) - 50
                combined = []
                for shift, cell in enumerate(row):
                    moved = dict(cell)
                    moved["left"] = anchor + shift * 15
                    moved["center_x"] = moved["left"] + moved["width"] / 2
                    combined.append(moved)
                combined.extend(follower)
                out.append(sorted(combined, key=lambda c: c["left"]))
                idx += 2
                continue

        out.append(row)
        idx += 1

    return out
250
 
251
+
252
+ # =======================================================
253
+ # COLUMN DETECTION
254
+ # =======================================================
255
+
256
def detect_column_centers(cells: List[Dict[str, Any]], max_cols=4) -> List[float]:
    """Estimate the x-positions of numeric columns by clustering large gaps."""
    xs = sorted(c["center_x"] for c in cells if is_numeric(c["text"]))

    if not xs:
        return []
    if len(xs) == 1:
        return xs

    gaps = [b - a for a, b in zip(xs, xs[1:])]
    # A split happens where the gap clearly exceeds the typical spacing.
    cutoff = max(30, np.mean(gaps) + 0.6 * np.std(gaps))

    clusters = [[xs[0]]]
    for gap, x in zip(gaps, xs[1:]):
        if gap > cutoff and len(clusters) < max_cols:
            clusters.append([x])
        else:
            clusters[-1].append(x)

    return sorted(np.median(c) for c in clusters)[:max_cols]


def nearest_column(x: float, centers: List[float]) -> int:
    """Index of the column centre closest to ``x`` (centers must be non-empty)."""
    return int(np.argmin([abs(x - c) for c in centers]))
286
 
287
+
288
+ # =======================================================
289
+ # ROW PARSER (MAIN LOGIC)
290
+ # =======================================================
291
+
292
def parse_rows(rows: List[List[Dict[str, Any]]], cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Extract structured line items using detected columns.

    Fix: when no numeric columns were detected, ``numeric_buckets`` was empty
    and the else-branch column index ``len(col_centers) - 1 == -1`` made
    ``numeric_buckets[-1].append(...)`` raise ``KeyError`` on the first numeric
    row, losing the whole page.  Such rows are now parsed with a simple
    rightmost-number-is-the-amount fallback instead.
    """
    items = []

    rows = merge_multiline_descriptions(rows)
    col_centers = detect_column_centers(cells, max_cols=4)

    for row in rows:
        tokens = [c["text"] for c in row]
        if not tokens:
            continue

        joined = " ".join(tokens).lower()

        # Skip footer lines like "Page 1/4"
        if FOOTER_HINT.search(joined) and not any(is_numeric(t) for t in tokens):
            continue

        # Skip headings that do not contain numbers
        if not any(is_numeric(t) for t in tokens):
            continue

        # --- Split the row into name text and per-column numbers ---
        left_parts = []
        numeric_buckets = {i: [] for i in range(len(col_centers))}
        fallback_numbers = []
        for c in row:
            t = c["text"]
            if not is_numeric(t):
                left_parts.append(t)
            elif col_centers:
                numeric_buckets[nearest_column(c["center_x"], col_centers)].append(t)
            else:
                fallback_numbers.append(t)

        name = clean_item_name(" ".join(left_parts))
        num_cols = len(col_centers)

        def bucket(idx):
            # Last token wins when a column holds several numbers.
            vals = numeric_buckets.get(idx)
            return vals[-1] if vals else None

        # Column order, right to left: amount, rate, qty.
        if num_cols:
            amount = normalize_number(bucket(num_cols - 1))
            rate = normalize_number(bucket(num_cols - 2)) if num_cols >= 2 else None
            qty = normalize_number(bucket(num_cols - 3)) if num_cols >= 3 else None
        else:
            # No columns: last number on the row is the amount, the one
            # before it (if any) the rate.
            amount = normalize_number(fallback_numbers[-1]) if fallback_numbers else None
            rate = normalize_number(fallback_numbers[-2]) if len(fallback_numbers) >= 2 else None
            qty = None

        # Fallback: rightmost numeric token anywhere on the row.
        if amount is None:
            for t in reversed(tokens):
                if is_numeric(t):
                    amount = normalize_number(t)
                    break

        # Infer qty from amount / rate when the ratio is near an integer.
        if qty is None and amount and rate:
            q_est = amount / rate
            rounded = round(q_est)
            if abs(q_est - rounded) <= 0.2:
                qty = rounded

        if qty is None:
            qty = 1.0

        if (rate is None or rate == 0) and qty and amount:
            rate = round(amount / qty, 2)

        if amount is None or amount <= 0:
            continue

        if HEADER_HINT.search(name):
            continue

        items.append({
            "item_name": name or "UNKNOWN",
            "item_amount": float(round(amount, 2)),
            "item_rate": float(round(rate or 0.0, 2)),
            "item_quantity": float(qty)
        })

    return items
370
+
371
+
372
+ # =======================================================
373
+ # DEDUPE ITEMS + DETECT TOTALS
374
+ # =======================================================
375
+
376
def dedupe(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop repeated line items, keeping the first occurrence of each.

    Two items count as duplicates when they share the same
    case-insensitive name (truncated to 120 chars) and the same amount
    rounded to 2 decimals. Relative order of survivors is preserved.
    """
    unique: List[Dict[str, Any]] = []
    observed = set()
    for entry in items:
        fingerprint = (entry["item_name"].lower()[:120], round(entry["item_amount"], 2))
        if fingerprint in observed:
            continue
        observed.add(fingerprint)
        unique.append(entry)
    return unique
387
 
388
+
389
+ # =======================================================
390
+ # OPTIONAL: GEMINI REFINEMENT
391
+ # =======================================================
392
+
393
def refine_with_llm(items: List[Dict[str, Any]], text: str):
    """Ask Gemini to clean up OCR-extracted bill items.

    Sends the page text plus heuristically detected items to the model and
    expects a JSON array of corrected items back. On any failure (no API
    key, network error, malformed JSON response) the original items are
    returned unchanged — refinement is strictly best-effort.

    Args:
        items: Detected line items, each with item_name/amount/rate/quantity.
        text: Raw page text (truncated to 3000 chars before sending).

    Returns:
        (items, usage) tuple. `usage` is a token-count dict; token counts
        are not reported by this path today and are always zero.
    """
    empty_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    if not GEMINI_API_KEY:
        return items, empty_usage

    try:
        prompt = (
            "You are a precise bill-item cleaner. Fix broken names, validate qty = amount/rate, "
            "and remove any invalid rows. Return JSON array only.\n\n"
            f"Full text: '''{text[:3000]}'''\n"
            f"Detected items: {json.dumps(items)}"
        )

        # Fix: the config section defines GEMINI_MODEL_NAME; the previous
        # reference to an undefined GEMINI_MODEL raised a NameError that the
        # bare `except` silently swallowed, so refinement never actually ran.
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        response = model.generate_content(prompt)

        raw = response.text.strip()
        # The model sometimes wraps its JSON in a markdown code fence.
        raw = raw.replace("```json", "").replace("```", "")

        parsed = json.loads(raw)
        # Guard against the model returning a non-array payload.
        if not isinstance(parsed, list):
            return items, empty_usage

        final_items = []
        for obj in parsed:
            final_items.append({
                "item_name": str(obj.get("item_name", "")).strip(),
                "item_amount": float(obj.get("item_amount", 0)),
                "item_rate": float(obj.get("item_rate", 0)),
                "item_quantity": float(obj.get("item_quantity", 1)),
            })

        return final_items, empty_usage

    except Exception:
        # Never let LLM issues break extraction; fall back to raw items.
        return items, empty_usage
426
+
427
+
428
+ # =======================================================
429
+ # MAIN API ENDPOINT
430
+ # =======================================================
431
+
432
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """Download a bill (PDF or image), OCR it page by page, and return
    structured line items.

    Response shape:
        {
          "is_success": bool,
          "token_usage": {...},   # aggregated Gemini usage (zeros today)
          "data": {
            "pagewise_line_items": [
                {"page_no": str, "page_type": "Bill Detail", "bill_items": [...]},
                ...
            ],
            "total_item_count": int
          }
        }
    """
    failure = {
        "is_success": False,
        "token_usage": {},
        "data": {"pagewise_line_items": [], "total_item_count": 0}
    }

    # ---------------------------------------------------
    # 1. DOWNLOAD FILE
    # ---------------------------------------------------
    try:
        resp = requests.get(payload.document, headers={"User-Agent": "Mozilla"}, timeout=30)
        resp.raise_for_status()
        data_bytes = resp.content
    except Exception:  # was a bare except; any download problem means failure
        return failure

    # ---------------------------------------------------
    # 2. LOAD PAGES (PDF / IMAGE)
    # ---------------------------------------------------
    # Strip the query string so signed URLs still match on extension.
    url_no_query = payload.document.split("?", 1)[0].lower()
    try:
        if url_no_query.endswith(".pdf"):
            pages = convert_from_bytes(data_bytes)
        else:
            pages = [Image.open(BytesIO(data_bytes))]
    except Exception:  # undecodable/corrupt document
        return failure

    # ---------------------------------------------------
    # 3. PROCESS EACH PAGE
    # ---------------------------------------------------
    results = []
    gemini_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    for idx, page in enumerate(pages, start=1):
        try:
            proc = preprocess_image(page)
            cells = run_tesseract(proc)
            rows = group_cells(cells)

            page_text = " ".join(" ".join(c["text"] for c in r) for r in rows).lower()

            items = dedupe(parse_rows(rows, cells))

            # Only invoke Gemini when a meaningful fraction of rows fail the
            # qty * rate ≈ amount sanity check (tolerance: max(2, 3%)).
            use_llm = False
            if GEMINI_API_KEY and items:
                inconsistent = sum(
                    1 for it in items
                    if abs(it["item_quantity"] * it["item_rate"] - it["item_amount"])
                    > max(2, 0.03 * it["item_amount"])
                )
                if inconsistent > max(1, len(items) // 6):
                    use_llm = True

            if use_llm:
                items, usage = refine_with_llm(items, page_text)
                for k in gemini_usage:
                    # .get() keeps aggregation safe even if a usage key is missing
                    gemini_usage[k] += usage.get(k, 0)

            results.append({
                "page_no": str(idx),
                "page_type": "Bill Detail",
                "bill_items": items,
            })

        except Exception:
            # A single broken page must not sink the whole request.
            results.append({
                "page_no": str(idx),
                "page_type": "Bill Detail",
                "bill_items": []
            })

    total_count = sum(len(p["bill_items"]) for p in results)

    return {
        "is_success": True,
        "token_usage": gemini_usage,
        "data": {
            "pagewise_line_items": results,
            "total_item_count": total_count
        }
    }
522
+
523
 
524
+ # -------------------------------------------------------
525
+ # RAW TSV DEBUG ENDPOINT
526
+ # -------------------------------------------------------
527
@app.post("/debug-tsv")
async def debug_tsv(payload: BillRequest):
    """Debug helper: OCR the first page of the document and return the raw
    Tesseract cells, without any row grouping or item parsing.

    Returns {"error": ...} on download or decode failure instead of a 500.
    """
    try:
        resp = requests.get(payload.document, timeout=20)
        resp.raise_for_status()
        data = resp.content
    except Exception:  # was a bare except
        return {"error": "Unable to download"}

    url = payload.document.split("?", 1)[0].lower()

    try:
        if url.endswith(".pdf"):
            img = convert_from_bytes(data)[0]  # first page only
        else:
            img = Image.open(BytesIO(data))
    except Exception:
        # Previously an unreadable file escaped as an unhandled 500.
        return {"error": "Unable to read document"}

    proc = preprocess_image(img)
    return {"cells": run_tesseract(proc)}
 
 
 
545
 
546
 
547
@app.get("/")
def root():
    """Health-check endpoint confirming the service is up."""
    status_payload = {"status": "ok", "message": "Bill extraction API running"}
    return status_payload
550
+
551
+