Sathvik-kota commited on
Commit
56ab53e
·
verified ·
1 Parent(s): 2ad459f

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +435 -244
app.py CHANGED
@@ -1,6 +1,15 @@
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json
 
4
  from io import BytesIO
5
  from typing import List, Dict, Any, Optional, Tuple
6
 
@@ -9,10 +18,22 @@ from pydantic import BaseModel
9
  import requests
10
  from PIL import Image
11
  from pdf2image import convert_from_bytes
12
- import pytesseract
13
- from pytesseract import Output
14
  import numpy as np
15
  import cv2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Optional: Google Gemini SDK (if available)
18
  try:
@@ -20,51 +41,85 @@ try:
20
  except Exception:
21
  genai = None
22
 
23
- # ---------------- LLM CONFIG ----------------
 
 
 
24
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
25
  GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
 
 
 
 
 
 
26
  if GEMINI_API_KEY and genai is not None:
27
  try:
28
  genai.configure(api_key=GEMINI_API_KEY)
29
- except Exception:
30
- pass
31
-
32
- # ---------------- FastAPI app ----------------
33
- app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, improved)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  class BillRequest(BaseModel):
36
- document: str
37
 
38
- # ---------------- Regex and keywords ----------------
 
 
39
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
40
  TOTAL_KEYWORDS = re.compile(
41
  r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
42
  re.I,
43
  )
44
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
45
- HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
 
 
 
 
 
46
  HEADER_PHRASES = [
47
  "description qty / hrs consultation rate discount net amt",
48
  "description qty / hrs rate discount net amt",
49
- "description qty / hrs rate net amt",
50
- "description qty hrs rate discount net amt",
51
- "description qty / hrs rate discount net amt",
52
  ]
53
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
54
 
55
- # ---------------- small utilities ----------------
56
- def sanitize_ocr_text(s: str) -> str:
57
  if not s:
58
  return ""
59
  s = s.replace("\u2014", "-").replace("\u2013", "-")
60
  s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
61
  s = s.replace("\r\n", "\n").replace("\r", "\n")
62
  s = re.sub(r"[ \t]+", " ", s)
63
- s = s.strip()
64
- # Correct common OCR mis-recognitions for headers
65
- s = re.sub(r"\bqiy\b", "qty", s, flags=re.IGNORECASE)
66
- s = re.sub(r"\bdeseription\b", "description", s, flags=re.IGNORECASE)
67
- return s[:4000]
68
 
69
  def normalize_num_str(s: Optional[str]) -> Optional[float]:
70
  if s is None:
@@ -81,7 +136,8 @@ def normalize_num_str(s: Optional[str]) -> Optional[float]:
81
  if s in ("", "-", "+"):
82
  return None
83
  try:
84
- return -float(s) if negative else float(s)
 
85
  except Exception:
86
  try:
87
  return float(s.replace(" ", ""))
@@ -94,28 +150,32 @@ def is_numeric_token(t: Optional[str]) -> bool:
94
  def clean_name_text(s: str) -> str:
95
  s = s.replace("—", "-")
96
  s = re.sub(r"\s+", " ", s)
97
- s = s.strip(" -:,.")
98
- s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
99
- s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
100
- s = re.sub(r"\bOR\b", "DR", s) # correct OCR 'OR' -> 'DR'
101
  return s.strip()
102
 
103
- # ---------------- image preprocessing ----------------
 
 
104
  def pil_to_cv2(img: Image.Image) -> Any:
105
  arr = np.array(img)
106
  if arr.ndim == 2:
107
  return arr
108
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
109
 
110
- def preprocess_image(pil_img: Image.Image) -> Any:
111
  pil_img = pil_img.convert("RGB")
112
  w, h = pil_img.size
113
- target_w = 1500
114
  if w < target_w:
115
  scale = target_w / float(w)
116
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
117
  cv_img = pil_to_cv2(pil_img)
118
- gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
 
 
 
119
  gray = cv2.fastNlMeansDenoising(gray, h=10)
120
  try:
121
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
@@ -126,10 +186,10 @@ def preprocess_image(pil_img: Image.Image) -> Any:
126
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
127
  return bw
128
 
129
- # ---------------- OCR TSV ----------------
130
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
 
131
  try:
132
- o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
133
  except Exception:
134
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
135
  cells = []
@@ -142,7 +202,8 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
142
  if not txt:
143
  continue
144
  try:
145
- conf = float(o["conf"][i]) if o["conf"][i] not in (None, "", "-1") else -1.0
 
146
  except Exception:
147
  conf = -1.0
148
  left = int(o.get("left", [0])[i])
@@ -152,11 +213,9 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
152
  center_y = top + height / 2.0
153
  center_x = left + width / 2.0
154
  cells.append({"text": txt, "conf": conf, "left": left, "top": top,
155
- "width": width, "height": height,
156
- "center_y": center_y, "center_x": center_x})
157
  return cells
158
 
159
- # ---------------- grouping & merge helpers ----------------
160
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
161
  if not cells:
162
  return []
@@ -185,7 +244,6 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
185
  row = rows[i]
186
  tokens = [c["text"] for c in row]
187
  has_num = any(is_numeric_token(t) for t in tokens)
188
- # If row has no numbers but next row does, merge them into one line
189
  if not has_num and i + 1 < len(rows):
190
  next_row = rows[i+1]
191
  next_tokens = [c["text"] for c in next_row]
@@ -204,7 +262,6 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
204
  merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
205
  i += 2
206
  continue
207
- # Merge short text rows without numbers (split descriptions)
208
  if not has_num and i + 1 < len(rows):
209
  next_row = rows[i+1]
210
  next_tokens = [c["text"] for c in next_row]
@@ -215,7 +272,10 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
215
  offset = 10
216
  for c in row + next_row:
217
  newc = c.copy()
218
- newc["left"] = newc["left"] if newc["left"] > min_left else (min_left - offset)
 
 
 
219
  newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
220
  merged_row.append(newc)
221
  offset += 5
@@ -226,7 +286,6 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
226
  i += 1
227
  return merged
228
 
229
- # ---------------- numeric column detection ----------------
230
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
231
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
232
  if not xs:
@@ -237,7 +296,7 @@ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) ->
237
  gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
238
  mean_gap = float(np.mean(gaps))
239
  std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
240
- gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
241
  clusters = []
242
  curr = [xs[0]]
243
  for i, g in enumerate(gaps):
@@ -258,7 +317,9 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
258
  distances = [abs(token_x - cx) for cx in column_centers]
259
  return int(np.argmin(distances))
260
 
261
- # ---------------- parsing rows into items ----------------
 
 
262
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
263
  parsed_items = []
264
  rows = merge_multiline_names(rows)
@@ -271,10 +332,10 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
271
  joined_lower = " ".join(tokens).lower()
272
  if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
273
  continue
 
274
  if all(not is_numeric_token(t) for t in tokens):
275
  continue
276
 
277
- # Collect numeric candidates in this row
278
  numeric_values = []
279
  for t in tokens:
280
  if is_numeric_token(t):
@@ -300,11 +361,9 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
300
  raw_name = " ".join(left_text_parts).strip()
301
  name = clean_name_text(raw_name) if raw_name else ""
302
  num_cols = len(column_centers)
303
-
304
  def get_bucket(idx):
305
  vals = numeric_bucket_map.get(idx, [])
306
  return vals[-1] if vals else None
307
-
308
  amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
309
  rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
310
  qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
@@ -316,7 +375,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
316
  if amount is not None:
317
  break
318
 
319
- # Infer rate and qty if needed
320
  if amount is not None and numeric_values:
321
  for cand in numeric_values:
322
  try:
@@ -341,11 +400,10 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
341
  qty = float(r)
342
  break
343
 
344
- # Fallback compute rate if needed
345
  if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
346
  try:
347
  candidate_rate = amount / qty
348
- if candidate_rate >= 2:
349
  rate = candidate_rate
350
  except Exception:
351
  pass
@@ -353,7 +411,6 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
353
  if qty is None:
354
  qty = 1.0
355
 
356
- # Normalize values
357
  try:
358
  amount = float(round(amount, 2))
359
  except Exception:
@@ -373,7 +430,6 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
373
  "item_rate": rate if rate is not None else 0.0,
374
  "item_quantity": qty if qty is not None else 1.0,
375
  })
376
-
377
  else:
378
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
379
  if not numeric_idxs:
@@ -425,100 +481,24 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
425
  "item_rate": float(round(rate, 2)),
426
  "item_quantity": float(qty),
427
  })
428
-
429
  return parsed_items
430
 
431
- # ---------------- dedupe & totals ----------------
432
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
433
  seen = set()
434
  out = []
435
  for it in items:
436
- nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
437
- key = (nm[:120], round(float(it["item_amount"]), 2))
438
  if key in seen:
439
  continue
440
  seen.add(key)
441
  out.append(it)
442
  return out
443
 
444
- def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
445
- subtotal = None; final = None
446
- for rt in reversed(rows_texts):
447
- if not rt or rt.strip() == "":
448
- continue
449
- if TOTAL_KEYWORDS.search(rt):
450
- m = NUM_RE.search(rt)
451
- if m:
452
- v = normalize_num_str(m.group(0))
453
- if v is None:
454
- continue
455
- if re.search(r"sub", rt, re.I):
456
- if subtotal is None:
457
- subtotal = float(round(v, 2))
458
- else:
459
- if final is None:
460
- final = float(round(v, 2))
461
- return {"subtotal": subtotal, "final_total": final}
462
-
463
- # ---------------- Gemini refinement (deterministic) ----------------
464
- def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
465
- zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
466
- if not GEMINI_API_KEY or genai is None:
467
- return page_items, zero_usage
468
- try:
469
- safe_text = sanitize_ocr_text(page_text)
470
- system_prompt = (
471
- "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
472
- "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
473
- "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
474
- )
475
- user_prompt = (
476
- f"page_text='''{safe_text}'''\n"
477
- f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
478
- "Example:\n"
479
- "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
480
- " {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
481
- "=>\n"
482
- "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
483
- "Return only the cleaned JSON array of items."
484
- )
485
- model = genai.GenerativeModel(GEMINI_MODEL_NAME)
486
- response = model.generate_content(
487
- [
488
- {"role": "system", "parts": [system_prompt]},
489
- {"role": "user", "parts": [user_prompt]},
490
- ],
491
- temperature=0.0,
492
- max_output_tokens=1000,
493
- )
494
- raw = response.text.strip()
495
- if raw.startswith("```"):
496
- raw = re.sub(r"^```[a-zA-Z]*", "", raw)
497
- raw = re.sub(r"```$", "", raw).strip()
498
- parsed = json.loads(raw)
499
- if isinstance(parsed, list):
500
- cleaned = []
501
- for obj in parsed:
502
- try:
503
- cleaned.append({
504
- "item_name": str(obj.get("item_name", "")).strip(),
505
- "item_amount": float(obj.get("item_amount", 0.0)),
506
- "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
507
- "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
508
- })
509
- except Exception:
510
- continue
511
- return cleaned, zero_usage
512
- return page_items, zero_usage
513
- except Exception:
514
- return page_items, zero_usage
515
-
516
- # ---------------- header heuristics & final filter ----------------
517
  def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
518
  if not txt:
519
  return False
520
  t = re.sub(r"\s+", " ", txt.strip().lower())
521
- # exact phrase blacklist
522
  if any(h == t for h in HEADER_PHRASES):
523
  return True
524
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
@@ -536,101 +516,271 @@ def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
536
  return True
537
  return False
538
 
539
- def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [], other_item_names: List[str] = []) -> bool:
540
  name = (item.get("item_name") or "").strip()
541
  if not name:
542
  return False
543
  ln = name.lower()
544
- # Remove if this item matches any known header text
545
  for h in known_page_headers:
546
  if h and h.strip() and h.strip().lower() in ln:
547
  return False
548
  if FOOTER_KEYWORDS.search(ln):
549
  return False
550
- if item.get("item_amount", 0) > 1_000_000:
551
- return False
552
- if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
553
  return False
554
- # (Removed overly restrictive filters for generic terms to retain valid items)
555
-
556
- # Drop items with non-positive amounts
557
- if float(item.get("item_amount", 0)) <= 0.0:
558
  return False
559
- # Sanity check: discard if rate is absurdly higher than amount
560
  rate = float(item.get("item_rate", 0) or 0)
561
- amt = float(item.get("item_amount", 0) or 0)
562
- if rate and rate > amt * 10 and amt < 10000:
563
  return False
564
  return True
565
 
566
- # ---------------- main endpoint ----------------
567
- @app.post("/extract-bill-data")
568
- async def extract_bill_data(payload: BillRequest):
569
- doc_url = payload.document
570
- file_bytes = None
571
-
572
- # --------------------------- Local or remote file ---------------------------
573
- if doc_url.startswith("file://"):
574
- local_path = doc_url.replace("file://", "")
575
- try:
576
- with open(local_path, "rb") as f:
577
- file_bytes = f.read()
578
- except Exception as e:
579
- return {
580
- "is_success": False,
581
- "error": f"Local file read error: {e}",
582
- "data": {"pagewise_line_items": [], "total_item_count": 0},
583
- "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
584
- }
585
- else:
586
- try:
587
- headers = {"User-Agent": "Mozilla/5.0"}
588
- resp = requests.get(doc_url, headers=headers, timeout=30)
589
- if resp.status_code != 200:
590
- raise RuntimeError(f"Download failed status={resp.status_code}")
591
- file_bytes = resp.content
592
- except Exception as e:
593
- return {
594
- "is_success": False,
595
- "error": f"HTTP error: {e}",
596
- "data": {"pagewise_line_items": [], "total_item_count": 0},
597
- "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
598
- }
599
-
600
- if not file_bytes:
601
- return {
602
- "is_success": False,
603
- "error": "No file bytes found.",
604
- "data": {"pagewise_line_items": [], "total_item_count": 0},
605
- "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
606
- }
607
-
608
- images = []
609
- clean_url = doc_url.split("?", 1)[0].lower()
610
  try:
611
- if clean_url.endswith(".pdf"):
612
- images = convert_from_bytes(file_bytes)
613
- elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
614
- images = [Image.open(BytesIO(file_bytes))]
615
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
  try:
617
- images = convert_from_bytes(file_bytes)
 
 
 
 
 
618
  except Exception:
619
- images = []
620
- except Exception:
621
- images = []
 
 
622
 
623
- pagewise = []
624
- cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
- for idx, page_img in enumerate(images, start=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
  try:
628
- proc = preprocess_image(page_img)
629
  cells = image_to_tsv_cells(proc)
630
  rows = group_cells_into_rows(cells, y_tolerance=12)
631
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
632
-
633
- # === Header prefilter: remove header-like rows ===
634
  rows_filtered = []
635
  for i, (r, rt) in enumerate(zip(rows, rows_texts)):
636
  top_flag = (i < 6)
@@ -640,71 +790,112 @@ async def extract_bill_data(payload: BillRequest):
640
  if any(h in rt_norm for h in HEADER_PHRASES):
641
  continue
642
  rows_filtered.append(r)
643
-
644
  rows = rows_filtered
645
- rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
646
- page_text = sanitize_ocr_text(" ".join(rows_texts))
647
-
648
- # Collect detected top headers for final filtering
649
- top_headers = []
650
- for i, rt in enumerate(rows_texts[:6]):
651
- if looks_like_header_text(rt, top_of_page=(i < 4)):
652
- top_headers.append(rt.strip().lower())
653
-
654
  parsed_items = parse_rows_with_columns(rows, cells)
655
-
656
- # Gemini refinement (if enabled)
657
- refined_items, token_u = refine_with_gemini(parsed_items, page_text)
658
- for k in cumulative_token_usage:
659
- cumulative_token_usage[k] += token_u.get(k, 0)
660
-
661
- other_item_names = [it.get("item_name", "") for it in refined_items]
662
- cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names)]
663
  cleaned = dedupe_items(cleaned)
664
-
665
  page_type = "Bill Detail"
666
- page_txt = page_text.lower()
667
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
668
  page_type = "Pharmacy"
669
- if "final bill" in page_txt or "grand total" in page_txt:
670
- page_type = "Final Bill"
 
 
 
671
 
672
- pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
673
- except Exception:
674
- pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
675
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
 
677
- total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
678
  if not GEMINI_API_KEY or genai is None:
679
- cumulative_token_usage["warning_no_gemini"] = 1
680
 
681
- return {"is_success": True, "token_usage": cumulative_token_usage,
682
- "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
683
 
684
- # ---------------- debug TSV ----------------
 
 
685
  @app.post("/debug-tsv")
686
  async def debug_tsv(payload: BillRequest):
687
  doc_url = payload.document
688
  try:
689
- resp = requests.get(doc_url, timeout=20)
690
- if resp.status_code != 200:
691
- return {"error": "Download failed"}
692
- file_bytes = resp.content
693
- except Exception:
694
- return {"error": "Download failed"}
695
- clean_url = doc_url.split("?", 1)[0].lower()
696
- if clean_url.endswith(".pdf"):
 
 
 
697
  imgs = convert_from_bytes(file_bytes)
698
  img = imgs[0]
699
- else:
700
- img = Image.open(BytesIO(file_bytes))
701
- proc = preprocess_image(img)
 
 
 
702
  cells = image_to_tsv_cells(proc)
703
  return {"cells": cells}
704
 
705
  @app.get("/")
706
  def health_check():
707
- msg = "Bill extraction API (updated version) live."
708
  if not GEMINI_API_KEY or genai is None:
709
- msg += " (No GEMINI - LLM refinement skipped.)"
710
- return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url>'}"}
 
1
+ # app.py
2
+ # High-accuracy bill extraction API with optional Amazon Textract / Google Vision + robust Tesseract fallback.
3
+ # Usage:
4
+ # export OCR_ENGINE=textract # or "vision" or "tesseract"
5
+ # export AWS_REGION=us-east-1 # required for Textract
6
+ # export GEMINI_API_KEY=... # optional
7
+ # uvicorn app:app --host 0.0.0.0 --port 8080
8
+
9
  import os
10
  import re
11
  import json
12
+ import logging
13
  from io import BytesIO
14
  from typing import List, Dict, Any, Optional, Tuple
15
 
 
18
  import requests
19
  from PIL import Image
20
  from pdf2image import convert_from_bytes
 
 
21
  import numpy as np
22
  import cv2
23
+ import pytesseract
24
+ from pytesseract import Output
25
+
26
+ # Optional libs (import lazily)
27
+ try:
28
+ import boto3
29
+ from botocore.exceptions import BotoCoreError, ClientError
30
+ except Exception:
31
+ boto3 = None
32
+
33
+ try:
34
+ from google.cloud import vision
35
+ except Exception:
36
+ vision = None
37
 
38
  # Optional: Google Gemini SDK (if available)
39
  try:
 
41
  except Exception:
42
  genai = None
43
 
44
+ # -------------------------------------------------------------------------
45
+ # Configuration and logging
46
+ # -------------------------------------------------------------------------
47
+ OCR_ENGINE = os.getenv("OCR_ENGINE", "textract").lower() # 'textract' | 'vision' | 'tesseract'
48
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
49
  GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
50
+ AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
51
+ TESSERACT_PSM = os.getenv("TESSERACT_PSM", "6") # page segmentation mode default
52
+
53
+ logging.basicConfig(level=logging.INFO)
54
+ logger = logging.getLogger("bill-extractor")
55
+
56
  if GEMINI_API_KEY and genai is not None:
57
  try:
58
  genai.configure(api_key=GEMINI_API_KEY)
59
+ logger.info("Gemini configured")
60
+ except Exception as e:
61
+ logger.warning("Gemini config failed: %s", e)
62
+
63
+ # Boto3 textract client (lazy init)
64
+ _textract_client = None
65
+ def textract_client():
66
+ global _textract_client
67
+ if _textract_client is None:
68
+ if boto3 is None:
69
+ raise RuntimeError("boto3 not installed but OCR_ENGINE=textract requested")
70
+ _textract_client = boto3.client("textract", region_name=AWS_REGION)
71
+ return _textract_client
72
+
73
+ # Google Vision client (lazy)
74
+ _vision_client = None
75
+ def vision_client():
76
+ global _vision_client
77
+ if _vision_client is None:
78
+ if vision is None:
79
+ raise RuntimeError("google-cloud-vision not installed but OCR_ENGINE=vision requested")
80
+ _vision_client = vision.ImageAnnotatorClient()
81
+ return _vision_client
82
+
83
+ # -------------------------------------------------------------------------
84
+ # Request model
85
+ # -------------------------------------------------------------------------
86
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (high-accuracy)")
87
 
88
  class BillRequest(BaseModel):
89
+ document: str # file://local_path or http(s) url
90
 
91
+ # -------------------------------------------------------------------------
92
+ # Helpers (numbers, cleaning, OCR preprocessing)
93
+ # -------------------------------------------------------------------------
94
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
95
  TOTAL_KEYWORDS = re.compile(
96
  r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
97
  re.I,
98
  )
99
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
100
+
101
+ HEADER_KEYWORDS = [
102
+ "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
103
+ "consultation", "address", "sex", "age", "mobile", "patient", "category",
104
+ "doctor", "dr", "invoice", "bill", "subtotal", "total", "charges", "service"
105
+ ]
106
  HEADER_PHRASES = [
107
  "description qty / hrs consultation rate discount net amt",
108
  "description qty / hrs rate discount net amt",
 
 
 
109
  ]
110
  HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
111
 
112
+ def sanitize_ocr_text(s: Optional[str]) -> str:
 
113
  if not s:
114
  return ""
115
  s = s.replace("\u2014", "-").replace("\u2013", "-")
116
  s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
117
  s = s.replace("\r\n", "\n").replace("\r", "\n")
118
  s = re.sub(r"[ \t]+", " ", s)
119
+ # common OCR corrections
120
+ s = re.sub(r"\bqiy\b", "qty", s, flags=re.I)
121
+ s = re.sub(r"\bdeseription\b", "description", s, flags=re.I)
122
+ return s.strip()
 
123
 
124
  def normalize_num_str(s: Optional[str]) -> Optional[float]:
125
  if s is None:
 
136
  if s in ("", "-", "+"):
137
  return None
138
  try:
139
+ val = float(s)
140
+ return -val if negative else val
141
  except Exception:
142
  try:
143
  return float(s.replace(" ", ""))
 
150
  def clean_name_text(s: str) -> str:
151
  s = s.replace("—", "-")
152
  s = re.sub(r"\s+", " ", s)
153
+ s = s.strip(" -:,.=")
154
+ s = re.sub(r"\s+x$", "", s, flags=re.I)
155
+ s = re.sub(r"[\)\}\]]+$", "", s)
156
+ s = re.sub(r"\bOR\b", "DR", s) # OCR OR -> DR
157
  return s.strip()
158
 
159
+ # -------------------------------------------------------------------------
160
+ # Image preprocessing helpers (for Tesseract pipeline)
161
+ # -------------------------------------------------------------------------
162
  def pil_to_cv2(img: Image.Image) -> Any:
163
  arr = np.array(img)
164
  if arr.ndim == 2:
165
  return arr
166
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
167
 
168
+ def preprocess_image_for_tesseract(pil_img: Image.Image, target_w: int = 1500) -> Any:
169
  pil_img = pil_img.convert("RGB")
170
  w, h = pil_img.size
 
171
  if w < target_w:
172
  scale = target_w / float(w)
173
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
174
  cv_img = pil_to_cv2(pil_img)
175
+ if cv_img.ndim == 3:
176
+ gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
177
+ else:
178
+ gray = cv_img
179
  gray = cv2.fastNlMeansDenoising(gray, h=10)
180
  try:
181
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
 
186
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
187
  return bw
188
 
 
189
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
190
+ # returns list of OCR 'cells' compatible with your parsing pipeline
191
  try:
192
+ o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config=f"--psm {TESSERACT_PSM}")
193
  except Exception:
194
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
195
  cells = []
 
202
  if not txt:
203
  continue
204
  try:
205
+ conf_raw = o.get("conf", [])[i]
206
+ conf = float(conf_raw) if conf_raw not in (None, "", "-1") else -1.0
207
  except Exception:
208
  conf = -1.0
209
  left = int(o.get("left", [0])[i])
 
213
  center_y = top + height / 2.0
214
  center_x = left + width / 2.0
215
  cells.append({"text": txt, "conf": conf, "left": left, "top": top,
216
+ "width": width, "height": height, "center_y": center_y, "center_x": center_x})
 
217
  return cells
218
 
 
219
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
220
  if not cells:
221
  return []
 
244
  row = rows[i]
245
  tokens = [c["text"] for c in row]
246
  has_num = any(is_numeric_token(t) for t in tokens)
 
247
  if not has_num and i + 1 < len(rows):
248
  next_row = rows[i+1]
249
  next_tokens = [c["text"] for c in next_row]
 
262
  merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
263
  i += 2
264
  continue
 
265
  if not has_num and i + 1 < len(rows):
266
  next_row = rows[i+1]
267
  next_tokens = [c["text"] for c in next_row]
 
272
  offset = 10
273
  for c in row + next_row:
274
  newc = c.copy()
275
+ if newc["left"] > min_left:
276
+ newc["left"] = newc["left"]
277
+ else:
278
+ newc["left"] = min_left - offset
279
  newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
280
  merged_row.append(newc)
281
  offset += 5
 
286
  i += 1
287
  return merged
288
 
 
289
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
290
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
291
  if not xs:
 
296
  gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
297
  mean_gap = float(np.mean(gaps))
298
  std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
299
+ gap_thresh = max(28.0, mean_gap + 0.6 * std_gap)
300
  clusters = []
301
  curr = [xs[0]]
302
  for i, g in enumerate(gaps):
 
317
  distances = [abs(token_x - cx) for cx in column_centers]
318
  return int(np.argmin(distances))
319
 
320
+ # -------------------------------------------------------------------------
321
+ # Parsing pipeline (shared)
322
+ # -------------------------------------------------------------------------
323
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
324
  parsed_items = []
325
  rows = merge_multiline_names(rows)
 
332
  joined_lower = " ".join(tokens).lower()
333
  if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
334
  continue
335
+ # require some numeric token (date-only rows excluded later)
336
  if all(not is_numeric_token(t) for t in tokens):
337
  continue
338
 
 
339
  numeric_values = []
340
  for t in tokens:
341
  if is_numeric_token(t):
 
361
  raw_name = " ".join(left_text_parts).strip()
362
  name = clean_name_text(raw_name) if raw_name else ""
363
  num_cols = len(column_centers)
 
364
  def get_bucket(idx):
365
  vals = numeric_bucket_map.get(idx, [])
366
  return vals[-1] if vals else None
 
367
  amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
368
  rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
369
  qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
 
375
  if amount is not None:
376
  break
377
 
378
+ # infer rate and qty heuristics
379
  if amount is not None and numeric_values:
380
  for cand in numeric_values:
381
  try:
 
400
  qty = float(r)
401
  break
402
 
 
403
  if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
404
  try:
405
  candidate_rate = amount / qty
406
+ if candidate_rate >= 1.0:
407
  rate = candidate_rate
408
  except Exception:
409
  pass
 
411
  if qty is None:
412
  qty = 1.0
413
 
 
414
  try:
415
  amount = float(round(amount, 2))
416
  except Exception:
 
430
  "item_rate": rate if rate is not None else 0.0,
431
  "item_quantity": qty if qty is not None else 1.0,
432
  })
 
433
  else:
434
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
435
  if not numeric_idxs:
 
481
  "item_rate": float(round(rate, 2)),
482
  "item_quantity": float(qty),
483
  })
 
484
  return parsed_items
485
 
 
486
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop duplicate line items, keeping the first occurrence.

    Two items are considered duplicates when their whitespace-normalized,
    lowercased name (truncated to 120 chars) and their amount rounded to
    2 decimals both match.
    """
    unique: List[Dict[str, Any]] = []
    observed = set()
    for entry in items:
        raw_name = (entry.get("item_name", "") or "").lower()
        norm_name = re.sub(r"\s+", " ", raw_name).strip()
        amount_key = round(float(entry.get("item_amount", 0) or 0), 2)
        fingerprint = (norm_name[:120], amount_key)
        if fingerprint not in observed:
            observed.add(fingerprint)
            unique.append(entry)
    return unique
497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
499
  if not txt:
500
  return False
501
  t = re.sub(r"\s+", " ", txt.strip().lower())
 
502
  if any(h == t for h in HEADER_PHRASES):
503
  return True
504
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
 
516
  return True
517
  return False
518
 
519
def final_item_filter(item: Dict[str, Any], known_page_headers: Optional[List[str]] = None) -> bool:
    """Decide whether a parsed line item is a plausible bill entry.

    Rejects empty names, names containing a known table-header string,
    footer-looking text, non-positive or absurdly large amounts, and rows
    whose rate is wildly inconsistent with a small amount.

    BUGFIX: ``known_page_headers`` previously used a mutable default
    argument (``[]``); replaced with the ``None`` sentinel idiom. Callers
    that pass an explicit list are unaffected.
    """
    name = (item.get("item_name") or "").strip()
    if not name:
        return False
    ln = name.lower()
    for h in (known_page_headers or []):
        if h and h.strip() and h.strip().lower() in ln:
            return False
    if FOOTER_KEYWORDS.search(ln):
        return False
    amt = float(item.get("item_amount", 0) or 0)
    if amt <= 0:
        return False
    # sanity: weird giant amounts are likely OCR garbage
    if amt > 10_000_000:
        return False
    rate = float(item.get("item_rate", 0) or 0)
    # a rate 20x larger than a small amount is almost certainly a mis-read column
    if rate and rate > amt * 20 and amt < 10000:
        return False
    return True
539
 
540
+ # -------------------------------------------------------------------------
541
+ # Gemini refinement (deterministic, optional)
542
+ # -------------------------------------------------------------------------
543
def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
    """Optionally ask Gemini to clean/normalize parsed line items.

    Returns ``(items, token_usage)``. On any failure — or when Gemini is not
    configured — the input items are returned unchanged, so the OCR pipeline
    never depends on the LLM being reachable.
    """
    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    if not GEMINI_API_KEY or genai is None:
        return page_items, zero_usage
    try:
        safe_text = sanitize_ocr_text(page_text)[:3000]
        system_prompt = (
            "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
            "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
            "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
        )
        user_prompt = f"page_text='''{safe_text}'''\nitems={json.dumps(page_items, ensure_ascii=False)}\nReturn only the cleaned JSON array."
        # BUGFIX: the google-generativeai SDK rejects a "system" role inside the
        # contents list, and temperature/max_output_tokens are not accepted as
        # direct kwargs of generate_content() — the old call raised on every
        # invocation and silently fell back. Pass the system prompt via
        # system_instruction and tuning via generation_config instead.
        model = genai.GenerativeModel(GEMINI_MODEL_NAME, system_instruction=system_prompt)
        response = model.generate_content(
            user_prompt,
            generation_config={"temperature": 0.0, "max_output_tokens": 1000},
        )
        raw = response.text.strip()
        if raw.startswith("```"):
            # strip optional markdown code fences the model may emit
            raw = re.sub(r"^```[a-zA-Z]*", "", raw)
            raw = re.sub(r"```$", "", raw).strip()
        parsed = json.loads(raw)
        if not isinstance(parsed, list):
            # model returned a non-array payload; keep original items
            return page_items, zero_usage
        out = []
        for obj in parsed:
            try:
                out.append({
                    "item_name": str(obj.get("item_name", "")).strip(),
                    "item_amount": float(obj.get("item_amount", 0.0)),
                    "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
                    "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
                })
            except Exception:
                # skip malformed entries rather than failing the whole page
                continue
        return out, zero_usage
    except Exception as e:
        logger.warning("Gemini refine failed: %s", e)
        return page_items, zero_usage
584
 
585
+ # -------------------------------------------------------------------------
586
+ # OCR engine implementations
587
+ # -------------------------------------------------------------------------
588
def ocr_with_textract(file_bytes: bytes) -> List[Dict[str, Any]]:
    """
    Use Amazon Textract AnalyzeExpense on each page image. Returns list of pages:
    [{"page_no": "1", "page_type": "...", "bill_items": [...]}]
    Note: Textract AnalyzeExpense returns structured expense/line-item data; we map it to our output.

    Each page is rendered to a JPEG and sent to AnalyzeExpense synchronously;
    a failed Textract call produces a page with an empty bill_items list
    instead of aborting the whole document.
    """
    pages_out = []
    client = textract_client()

    # Convert bytes to images and call AnalyzeExpense for each page (synchronous).
    try:
        images = convert_from_bytes(file_bytes)
    except Exception as e:
        # assumes input is a PDF here; non-PDF bytes land in this branch too — TODO confirm
        logger.warning("Textract fallback: PDF->image conversion failed: %s", e)
        return []

    for idx, pil_img in enumerate(images, start=1):
        bio = BytesIO()
        pil_img.save(bio, format="JPEG", quality=90)
        img_bytes = bio.getvalue()
        try:
            resp = client.analyze_expense(Document={'Bytes': img_bytes})
        except (BotoCoreError, ClientError) as e:
            # keep the page in the output so page numbering stays contiguous
            logger.exception("Textract analyze_expense failed: %s", e)
            pages_out.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
            continue
        # Parse Textract response
        items = []
        line_item_groups = resp.get("ExpenseDocuments", [])
        if line_item_groups:
            for doc in line_item_groups:
                groups = doc.get("LineItemGroups", [])
                for g in groups:
                    for li in g.get("LineItems", []):
                        # Each line item has LineItemExpenseFields list
                        name_parts = []
                        amount = None
                        rate = None
                        qty = None
                        for f in li.get("LineItemExpenseFields", []):
                            # Type.Text is Textract's label for the field; ValueDetection.Text the raw value
                            tname = f.get("Type", {}).get("Text", "") or ""
                            v = f.get("ValueDetection", {}).get("Text", "") or ""
                            txt_l = tname.lower()
                            if txt_l in ("item", "description", "item description", "service"):
                                name_parts.append(v)
                            elif txt_l in ("amount", "price", "total"):
                                maybe = normalize_num_str(v)
                                if maybe is not None:
                                    amount = maybe
                            elif txt_l in ("quantity", "qty"):
                                maybe = normalize_num_str(v)
                                if maybe is not None:
                                    qty = maybe
                            elif txt_l in ("rate", "unit price", "price per unit"):
                                maybe = normalize_num_str(v)
                                if maybe is not None:
                                    rate = maybe
                            else:
                                # Heuristic: if value looks numeric and field name is empty, try assign
                                if is_numeric_token(v) and amount is None:
                                    maybe = normalize_num_str(v)
                                    if maybe is not None:
                                        amount = maybe
                                elif v and not is_numeric_token(v):
                                    name_parts.append(v)
                        name = " ".join(name_parts).strip() or "UNKNOWN"
                        # Post-process amount/rate/qty
                        if amount is None:
                            # try to find from summary fields
                            # NOTE(review): intentionally a no-op today; summary-field lookup not implemented
                            pass
                        # derive the missing one of qty/rate from the other two when possible
                        if qty is None and rate is not None and amount is not None and rate != 0:
                            try:
                                qty = round(amount / rate, 2)
                            except Exception:
                                qty = 1.0
                        if qty is None:
                            qty = 1.0
                        if rate is None and qty and qty != 0 and amount is not None:
                            try:
                                rate = round(amount / qty, 2)
                            except Exception:
                                rate = 0.0
                        if amount is None:
                            amount = 0.0
                        items.append({
                            "item_name": clean_name_text(name),
                            "item_amount": float(round(amount, 2)),
                            "item_rate": float(round(rate or 0.0, 2)),
                            "item_quantity": float(qty or 1.0),
                        })
        # Fallback: if Textract returned no structured line items, attempt to extract lines from Blocks
        if not items:
            # try to extract lines from DocumentMetadata / Blocks
            blocks = resp.get("Blocks", [])
            lines = []
            for b in blocks:
                if b.get("BlockType") == "LINE":
                    lines.append(b.get("Text", ""))
            # naive fallback: group lines that contain numbers
            for ln in lines:
                tokens = ln.split()
                numbers = [t for t in tokens if is_numeric_token(t)]
                if numbers:
                    # non-numeric tokens become the name; last parseable number becomes the amount
                    name = " ".join([t for t in tokens if not is_numeric_token(t)])
                    amount = None
                    for t in reversed(tokens):
                        if is_numeric_token(t):
                            v = normalize_num_str(t)
                            if v is not None:
                                amount = v
                                break
                    if amount:
                        items.append({
                            "item_name": clean_name_text(name or "UNKNOWN"),
                            "item_amount": float(round(amount, 2)),
                            "item_rate": 0.0,
                            "item_quantity": 1.0,
                        })
        # Filter & dedupe
        items = [it for it in items if final_item_filter(it, [])]
        items = dedupe_items(items)
        page_type = "Bill Detail"
        # crude pharmacy-page detection based on item names only
        items_text = " ".join([it["item_name"] for it in items]).lower()
        if "pharmacy" in items_text or "tablet" in items_text or "medicine" in items_text:
            page_type = "Pharmacy"
        pages_out.append({"page_no": str(idx), "page_type": page_type, "bill_items": items})
    return pages_out
715
+
716
def ocr_with_google_vision(file_bytes: bytes) -> List[Dict[str, Any]]:
    """
    Google Vision Document OCR pipeline. Returns parsed pages (same format).

    Each page is processed independently: a Vision/API failure on one page
    yields that page with an empty bill_items list instead of aborting the
    whole document (consistent with the Textract and Tesseract pipelines).
    """
    client = vision_client()
    pages_out = []
    try:
        images = convert_from_bytes(file_bytes)
    except Exception as e:
        logger.warning("Vision pipeline: PDF->image conversion failed: %s", e)
        return []
    for idx, pil_img in enumerate(images, start=1):
        try:
            bio = BytesIO()
            pil_img.save(bio, format="JPEG", quality=90)
            content = bio.getvalue()
            image = vision.Image(content=content)
            resp = client.document_text_detection(image=image)
            # BUGFIX: Vision reports API-level problems in resp.error rather
            # than raising — previously these were silently ignored.
            if getattr(resp, "error", None) and resp.error.message:
                raise RuntimeError(resp.error.message)
            text = resp.full_text_annotation.text if resp.full_text_annotation else ""
            # Build pseudo-cells from words using bounding boxes if available
            cells = []
            for page in (resp.full_text_annotation.pages or []):
                for block in page.blocks:
                    for para in block.paragraphs:
                        for word in para.words:
                            word_text = "".join([sym.text for sym in word.symbols])
                            bbox = word.bounding_box
                            # compute approximate left/top/width/height from the quad vertices
                            xs = [v.x for v in bbox.vertices]
                            ys = [v.y for v in bbox.vertices]
                            left = int(min(xs)) if xs else 0
                            top = int(min(ys)) if ys else 0
                            width = int(max(xs) - min(xs)) if xs else 0
                            height = int(max(ys) - min(ys)) if ys else 0
                            center_x = left + width / 2.0
                            center_y = top + height / 2.0
                            cells.append({"text": word_text, "conf": -1.0, "left": left, "top": top, "width": width, "height": height, "center_x": center_x, "center_y": center_y})
            # row grouping + parse using shared functions
            rows = group_cells_into_rows(cells, y_tolerance=14)
            parsed_items = parse_rows_with_columns(rows, cells)
            cleaned = [p for p in parsed_items if final_item_filter(p, [])]
            cleaned = dedupe_items(cleaned)
            page_type = "Bill Detail"
            page_txt = text.lower()
            if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
                page_type = "Pharmacy"
            pages_out.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
        except Exception as e:
            # keep page numbering contiguous even when one page fails
            logger.exception("Vision parse page failed: %s", e)
            pages_out.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
    return pages_out
763
+
764
+ def ocr_with_tesseract(file_bytes: bytes) -> List[Dict[str,Any]]:
765
+ """Tesseract pipeline using your preprocessing + TSV + parsing functions."""
766
+ pages_out = []
767
+ try:
768
+ images = convert_from_bytes(file_bytes)
769
+ except Exception as e:
770
+ # maybe it's a single image format (jpg/png)
771
+ try:
772
+ im = Image.open(BytesIO(file_bytes))
773
+ images = [im]
774
+ except Exception:
775
+ logger.exception("Tesseract pipeline can't open file: %s", e)
776
+ return []
777
+ for idx, pil_img in enumerate(images, start=1):
778
  try:
779
+ proc = preprocess_image_for_tesseract(pil_img)
780
  cells = image_to_tsv_cells(proc)
781
  rows = group_cells_into_rows(cells, y_tolerance=12)
782
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
783
+ # header prefilter
 
784
  rows_filtered = []
785
  for i, (r, rt) in enumerate(zip(rows, rows_texts)):
786
  top_flag = (i < 6)
 
790
  if any(h in rt_norm for h in HEADER_PHRASES):
791
  continue
792
  rows_filtered.append(r)
 
793
  rows = rows_filtered
 
 
 
 
 
 
 
 
 
794
  parsed_items = parse_rows_with_columns(rows, cells)
795
+ refined_items, _ = refine_with_gemini(parsed_items, sanitize_ocr_text(" ".join(rows_texts)))
796
+ cleaned = [p for p in refined_items if final_item_filter(p, [])]
 
 
 
 
 
 
797
  cleaned = dedupe_items(cleaned)
 
798
  page_type = "Bill Detail"
799
+ page_txt = " ".join(rows_texts).lower()
800
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
801
  page_type = "Pharmacy"
802
+ pages_out.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
803
+ except Exception as e:
804
+ logger.exception("Tesseract parse page failed: %s", e)
805
+ pages_out.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
806
+ return pages_out
807
 
808
+ # -------------------------------------------------------------------------
809
+ # Main endpoint
810
+ # -------------------------------------------------------------------------
811
@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """Download a bill (http(s) URL or file:// path), OCR it, return line items.

    Always answers HTTP 200 with the shape:
      {"is_success": bool, "token_usage": {...},
       "data": {"pagewise_line_items": [...], "total_item_count": int}}
    """
    def _failure(msg: str) -> Dict[str, Any]:
        # Uniform error payload so clients always see the same response shape.
        return {
            "is_success": False,
            "error": msg,
            "data": {"pagewise_line_items": [], "total_item_count": 0},
            "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
        }

    doc_url = payload.document
    file_bytes = None

    # local file support
    if doc_url.startswith("file://"):
        # BUGFIX: strip only the leading scheme; str.replace("file://", "")
        # would remove every occurrence anywhere in the string.
        local_path = doc_url[len("file://"):]
        try:
            with open(local_path, "rb") as f:
                file_bytes = f.read()
        except Exception as e:
            return _failure(f"Local file read error: {e}")
    else:
        try:
            headers = {"User-Agent": "Mozilla/5.0"}
            resp = requests.get(doc_url, headers=headers, timeout=30)
            if resp.status_code != 200:
                return _failure(f"Download failed status={resp.status_code}")
            file_bytes = resp.content
        except Exception as e:
            return _failure(f"HTTP error: {e}")

    if not file_bytes:
        return _failure("No file bytes found")

    pages = []
    token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    engine = OCR_ENGINE
    logger.info("Using OCR engine: %s", engine)

    try:
        if engine == "textract":
            pages = ocr_with_textract(file_bytes)
        elif engine == "vision":
            pages = ocr_with_google_vision(file_bytes)
        else:
            pages = ocr_with_tesseract(file_bytes)
    except Exception as e:
        logger.exception("OCR engine failed: %s", e)
        # fallback to tesseract pipeline
        try:
            pages = ocr_with_tesseract(file_bytes)
        except Exception as e:
            logger.exception("Tesseract fallback also failed: %s", e)
            pages = []

    total_item_count = sum(len(p.get("bill_items", [])) for p in pages)
    if not GEMINI_API_KEY or genai is None:
        # surfaced in token_usage so callers can tell LLM refinement was skipped
        token_usage["warning_no_gemini"] = 1

    return {"is_success": True, "token_usage": token_usage, "data": {"pagewise_line_items": pages, "total_item_count": total_item_count}}
 
866
 
867
+ # -------------------------------------------------------------------------
868
+ # Debug endpoint to return tsv cell info for inspection
869
+ # -------------------------------------------------------------------------
870
@app.post("/debug-tsv")
async def debug_tsv(payload: BillRequest):
    """Debug helper: OCR the first page and return the raw Tesseract TSV cells.

    Accepts the same payload as /extract-bill-data; failures are reported as
    {"error": "..."} rather than raising.
    """
    doc_url = payload.document
    try:
        if doc_url.startswith("file://"):
            # BUGFIX: strip only the leading scheme; str.replace("file://", "")
            # would remove every occurrence anywhere in the string.
            local_path = doc_url[len("file://"):]
            with open(local_path, "rb") as f:
                file_bytes = f.read()
        else:
            resp = requests.get(doc_url, timeout=20)
            resp.raise_for_status()
            file_bytes = resp.content
    except Exception as e:
        return {"error": f"Download failed: {e}"}
    try:
        # first try treating the bytes as a PDF; fall back to a plain image
        imgs = convert_from_bytes(file_bytes)
        img = imgs[0]
    except Exception:
        try:
            img = Image.open(BytesIO(file_bytes)).convert("RGB")
        except Exception as e:
            return {"error": f"Image conversion failed: {e}"}
    proc = preprocess_image_for_tesseract(img)
    cells = image_to_tsv_cells(proc)
    return {"cells": cells}
895
 
896
@app.get("/")
def health_check():
    """Liveness endpoint reporting the active OCR engine and Gemini status."""
    msg = f"Bill extraction API live. OCR_ENGINE={OCR_ENGINE}"
    if not GEMINI_API_KEY or genai is None:
        # fixed garbled status message (separator between clauses was lost)
        msg += " (Gemini not configured - LLM refinement skipped.)"
    return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url or file://path>'}"}