Sathvik-kota commited on
Commit
2ad459f
·
verified ·
1 Parent(s): 811fc30

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +128 -186
app.py CHANGED
@@ -1,17 +1,10 @@
1
- # app_bill_extractor_final_v2.py
2
- # Humanized, high-accuracy bill extraction API.
3
- # Robust OCR preprocessing, TSV layout parsing, numeric-column inference,
4
- # header prefiltering, deterministic Gemini refinement (if configured).
5
-
6
  import os
7
  import re
8
  import json
9
- import logging
10
  from io import BytesIO
11
  from typing import List, Dict, Any, Optional, Tuple
12
 
13
- import uvicorn
14
- from fastapi import FastAPI, BackgroundTasks
15
  from pydantic import BaseModel
16
  import requests
17
  from PIL import Image
@@ -27,40 +20,29 @@ try:
27
  except Exception:
28
  genai = None
29
 
30
- # ---------------- logging ----------------
31
- logging.basicConfig(level=logging.INFO)
32
- logger = logging.getLogger("bill-extractor")
33
-
34
- # ---------------- FastAPI app ----------------
35
- app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, humanized)")
36
-
37
- # ---------------- request model ----------------
38
- class BillRequest(BaseModel):
39
- document: str
40
-
41
  # ---------------- LLM CONFIG ----------------
42
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
43
  GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
44
  if GEMINI_API_KEY and genai is not None:
45
  try:
46
  genai.configure(api_key=GEMINI_API_KEY)
47
- logger.info("Gemini SDK configured.")
48
- except Exception as e:
49
- logger.warning("Failed to configure Gemini SDK: %s", e)
 
 
50
 
51
- # ---------------- Regex and keywords (updated) ----------------
 
 
 
52
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
53
  TOTAL_KEYWORDS = re.compile(
54
- r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|total)",
55
  re.I,
56
  )
57
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
58
-
59
- HEADER_KEYWORDS = [
60
- "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
61
- "consultation", "address", "sex", "age", "mobile", "patient", "category",
62
- "doctor", "dr", "invoice", "bill", "subtotal", "total", "charges", "service"
63
- ]
64
  HEADER_PHRASES = [
65
  "description qty / hrs consultation rate discount net amt",
66
  "description qty / hrs rate discount net amt",
@@ -79,6 +61,9 @@ def sanitize_ocr_text(s: str) -> str:
79
  s = s.replace("\r\n", "\n").replace("\r", "\n")
80
  s = re.sub(r"[ \t]+", " ", s)
81
  s = s.strip()
 
 
 
82
  return s[:4000]
83
 
84
  def normalize_num_str(s: Optional[str]) -> Optional[float]:
@@ -106,28 +91,14 @@ def normalize_num_str(s: Optional[str]) -> Optional[float]:
106
  def is_numeric_token(t: Optional[str]) -> bool:
107
  return bool(t and NUM_RE.search(str(t)))
108
 
109
- def looks_like_date_num(s: str) -> bool:
110
- s_digits = re.sub(r"[^\d]", "", s or "")
111
- if len(s_digits) >= 7:
112
- if s_digits.endswith(("2025","2024","2023","2022","2026")):
113
- return True
114
- try:
115
- if float(s_digits) > 1e6:
116
- return True
117
- except:
118
- pass
119
- return False
120
-
121
  def clean_name_text(s: str) -> str:
122
  s = s.replace("—", "-")
123
  s = re.sub(r"\s+", " ", s)
124
- s = s.strip(" -:,.=")
125
- s = re.sub(r"\s+x$", "", s, flags=re.I)
126
- s = re.sub(r"[\)\}\]]+$", "", s)
127
- s = re.sub(r"\bOR\b", "DR", s)
128
  s = s.strip(" -:,.")
129
- s = s.strip()
130
- return s
 
 
131
 
132
  # ---------------- image preprocessing ----------------
133
  def pil_to_cv2(img: Image.Image) -> Any:
@@ -137,7 +108,6 @@ def pil_to_cv2(img: Image.Image) -> Any:
137
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
138
 
139
  def preprocess_image(pil_img: Image.Image) -> Any:
140
- # convert and upscale if small
141
  pil_img = pil_img.convert("RGB")
142
  w, h = pil_img.size
143
  target_w = 1500
@@ -145,11 +115,7 @@ def preprocess_image(pil_img: Image.Image) -> Any:
145
  scale = target_w / float(w)
146
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
147
  cv_img = pil_to_cv2(pil_img)
148
- # grayscale and denoise
149
- if cv_img.ndim == 3:
150
- gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
151
- else:
152
- gray = cv_img
153
  gray = cv2.fastNlMeansDenoising(gray, h=10)
154
  try:
155
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
@@ -162,7 +128,6 @@ def preprocess_image(pil_img: Image.Image) -> Any:
162
 
163
  # ---------------- OCR TSV ----------------
164
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
165
- # pytesseract expects either a PIL image or numpy array
166
  try:
167
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
168
  except Exception:
@@ -187,10 +152,11 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
187
  center_y = top + height / 2.0
188
  center_x = left + width / 2.0
189
  cells.append({"text": txt, "conf": conf, "left": left, "top": top,
190
- "width": width, "height": height, "center_y": center_y, "center_x": center_x})
 
191
  return cells
192
 
193
- # ---------------- grouping & merging helpers ----------------
194
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
195
  if not cells:
196
  return []
@@ -219,6 +185,7 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
219
  row = rows[i]
220
  tokens = [c["text"] for c in row]
221
  has_num = any(is_numeric_token(t) for t in tokens)
 
222
  if not has_num and i + 1 < len(rows):
223
  next_row = rows[i+1]
224
  next_tokens = [c["text"] for c in next_row]
@@ -237,6 +204,7 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
237
  merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
238
  i += 2
239
  continue
 
240
  if not has_num and i + 1 < len(rows):
241
  next_row = rows[i+1]
242
  next_tokens = [c["text"] for c in next_row]
@@ -247,10 +215,7 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
247
  offset = 10
248
  for c in row + next_row:
249
  newc = c.copy()
250
- if newc["left"] > min_left:
251
- newc["left"] = newc["left"]
252
- else:
253
- newc["left"] = min_left - offset
254
  newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
255
  merged_row.append(newc)
256
  offset += 5
@@ -262,7 +227,7 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
262
  return merged
263
 
264
  # ---------------- numeric column detection ----------------
265
- def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
266
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
267
  if not xs:
268
  return []
@@ -293,60 +258,11 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
293
  distances = [abs(token_x - cx) for cx in column_centers]
294
  return int(np.argmin(distances))
295
 
296
- # ---------------- Gemini refinement (deterministic) ----------------
297
- def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
298
- zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
299
- if not GEMINI_API_KEY or genai is None:
300
- return page_items, zero_usage
301
- try:
302
- safe_text = sanitize_ocr_text(page_text)
303
- system_prompt = (
304
- "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
305
- "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
306
- "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
307
- )
308
- user_prompt = (
309
- f"page_text='''{safe_text}'''\n"
310
- f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
311
- "Return only the cleaned JSON array of items."
312
- )
313
- model = genai.GenerativeModel(GEMINI_MODEL_NAME)
314
- response = model.generate_content(
315
- [
316
- {"role": "system", "parts": [system_prompt]},
317
- {"role": "user", "parts": [user_prompt]},
318
- ],
319
- temperature=0.0,
320
- max_output_tokens=1000,
321
- )
322
- raw = response.text.strip()
323
- if raw.startswith("```"):
324
- raw = re.sub(r"^```[a-zA-Z]*", "", raw)
325
- raw = re.sub(r"```$", "", raw).strip()
326
- parsed = json.loads(raw)
327
- if isinstance(parsed, list):
328
- cleaned = []
329
- for obj in parsed:
330
- try:
331
- cleaned.append({
332
- "item_name": str(obj.get("item_name", "")).strip(),
333
- "item_amount": float(obj.get("item_amount", 0.0)),
334
- "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
335
- "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
336
- })
337
- except Exception:
338
- continue
339
- return cleaned, zero_usage
340
- return page_items, zero_usage
341
- except Exception as e:
342
- logger.warning("Gemini refinement failed: %s", e)
343
- return page_items, zero_usage
344
-
345
- # ---------------- parsing rows into items (modified) ----------------
346
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
347
  parsed_items = []
348
  rows = merge_multiline_names(rows)
349
- column_centers = detect_numeric_columns(page_cells, max_columns=4)
350
 
351
  for row in rows:
352
  tokens = [c["text"] for c in row]
@@ -358,23 +274,23 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
358
  if all(not is_numeric_token(t) for t in tokens):
359
  continue
360
 
 
361
  numeric_values = []
362
  for t in tokens:
363
  if is_numeric_token(t):
364
- if looks_like_date_num(t):
365
- continue
366
  v = normalize_num_str(t)
367
  if v is not None:
368
  numeric_values.append(float(v))
369
- numeric_values = sorted({int(x) if float(x).is_integer() else x for x in numeric_values}, reverse=True)
370
 
371
  if column_centers:
372
  left_text_parts = []
373
  numeric_bucket_map = {i: [] for i in range(len(column_centers))}
374
  for c in row:
375
  t = c["text"]
376
- if is_numeric_token(t) and not looks_like_date_num(t):
377
- col_idx = assign_token_to_column(c["center_x"], column_centers)
 
378
  if col_idx is None:
379
  numeric_bucket_map[len(column_centers) - 1].append(t)
380
  else:
@@ -383,23 +299,24 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
383
  left_text_parts.append(t)
384
  raw_name = " ".join(left_text_parts).strip()
385
  name = clean_name_text(raw_name) if raw_name else ""
386
-
387
  num_cols = len(column_centers)
 
388
  def get_bucket(idx):
389
  vals = numeric_bucket_map.get(idx, [])
390
  return vals[-1] if vals else None
391
 
392
  amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
393
- rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
394
- qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
395
 
396
  if amount is None:
397
  for t in reversed(tokens):
398
- if is_numeric_token(t) and not looks_like_date_num(t):
399
  amount = normalize_num_str(t)
400
  if amount is not None:
401
  break
402
 
 
403
  if amount is not None and numeric_values:
404
  for cand in numeric_values:
405
  try:
@@ -424,6 +341,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
424
  qty = float(r)
425
  break
426
 
 
427
  if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
428
  try:
429
  candidate_rate = amount / qty
@@ -435,17 +353,18 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
435
  if qty is None:
436
  qty = 1.0
437
 
 
438
  try:
439
  amount = float(round(amount, 2))
440
- except:
441
  continue
442
  try:
443
  rate = float(round(rate, 2)) if rate is not None else 0.0
444
- except:
445
  rate = 0.0
446
  try:
447
  qty = float(qty)
448
- except:
449
  qty = 1.0
450
 
451
  parsed_items.append({
@@ -456,7 +375,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
456
  })
457
 
458
  else:
459
- numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t) and not looks_like_date_num(t)]
460
  if not numeric_idxs:
461
  continue
462
  last = numeric_idxs[-1]
@@ -473,11 +392,11 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
473
  v = normalize_num_str(tokens[i])
474
  if v is not None:
475
  right_nums.append(float(v))
476
- right_nums = sorted({int(x) if float(x).is_integer() else x for x in right_nums}, reverse=True)
477
 
478
  if len(right_nums) >= 2:
479
  cand = right_nums[1]
480
- if 1 < cand < float(amt):
481
  ratio = float(amt) / float(cand) if cand else None
482
  if ratio:
483
  r = round(ratio)
@@ -524,7 +443,7 @@ def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
524
 
525
  def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
526
  subtotal = None; final = None
527
- for rt in rows_texts[::-1]:
528
  if not rt or rt.strip() == "":
529
  continue
530
  if TOTAL_KEYWORDS.search(rt):
@@ -534,16 +453,72 @@ def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[flo
534
  if v is None:
535
  continue
536
  if re.search(r"sub", rt, re.I):
537
- if subtotal is None: subtotal = float(round(v, 2))
 
538
  else:
539
- if final is None: final = float(round(v, 2))
 
540
  return {"subtotal": subtotal, "final_total": final}
541
 
542
- # ---------------- header heuristics & final filter (updated) ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
544
  if not txt:
545
  return False
546
  t = re.sub(r"\s+", " ", txt.strip().lower())
 
547
  if any(h == t for h in HEADER_PHRASES):
548
  return True
549
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
@@ -559,8 +534,6 @@ def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
559
  return True
560
  if t.startswith("description") or t.startswith("qty") or t.startswith("qty /"):
561
  return True
562
- if "sponsor" in t or "admission" in t or "age" in t or "sex" in t or "mobile" in t or "address" in t:
563
- return True
564
  return False
565
 
566
  def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [], other_item_names: List[str] = []) -> bool:
@@ -568,41 +541,25 @@ def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [],
568
  if not name:
569
  return False
570
  ln = name.lower()
571
- if name.upper() == "UNKNOWN" or ln == "unknown":
572
- return False
573
- if ln == "x":
574
- return False
575
  for h in known_page_headers:
576
  if h and h.strip() and h.strip().lower() in ln:
577
  return False
578
- if re.search(r"\b(total|subtotal|grand total)\b", ln):
579
- return False
580
  if FOOTER_KEYWORDS.search(ln):
581
  return False
582
  if item.get("item_amount", 0) > 1_000_000:
583
  return False
584
  if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
585
  return False
586
- words = ln.split()
587
- header_word_hits = sum(1 for k in HEADER_KEYWORDS if k in ln)
588
- if header_word_hits >= 1 and len(words) <= 3:
589
- lower_other = " ".join(other_item_names).lower()
590
- if any(k in lower_other for k in ["room", "rent", "nursing", "ward", "surgeon", "anaes", "ot", "charges", "procedure", "radiology"]):
591
- return False
592
- if ln in ("charge", "charges", "services", "consultation", "room", "radiology", "surgery"):
593
- return False
594
- if len(words) <= 4 and re.search(r"\b(charges|services|room|radiolog|laborat|surgery|procedure|rent|nursing)\b", ln):
595
- lower_other = " ".join(other_item_names).lower()
596
- if any(tok in lower_other for tok in ["rent", "room", "ward", "nursing", "surgeon", "anaes", "ot"]):
597
- return False
598
- amt = float(item.get("item_amount", 0) or 0)
599
- rate = float(item.get("item_rate", 0) or 0)
600
- qty = float(item.get("item_quantity", 0) or 0)
601
- if qty <= 0:
602
- return False
603
- if rate and rate > amt:
604
  return False
605
- if amt <= 0.0:
 
 
 
606
  return False
607
  return True
608
 
@@ -612,14 +569,13 @@ async def extract_bill_data(payload: BillRequest):
612
  doc_url = payload.document
613
  file_bytes = None
614
 
615
- # 1. local file support
616
  if doc_url.startswith("file://"):
617
  local_path = doc_url.replace("file://", "")
618
  try:
619
  with open(local_path, "rb") as f:
620
  file_bytes = f.read()
621
  except Exception as e:
622
- logger.error("Local file read error: %s", e)
623
  return {
624
  "is_success": False,
625
  "error": f"Local file read error: {e}",
@@ -634,7 +590,6 @@ async def extract_bill_data(payload: BillRequest):
634
  raise RuntimeError(f"Download failed status={resp.status_code}")
635
  file_bytes = resp.content
636
  except Exception as e:
637
- logger.error("HTTP download error: %s", e)
638
  return {
639
  "is_success": False,
640
  "error": f"HTTP error: {e}",
@@ -662,8 +617,7 @@ async def extract_bill_data(payload: BillRequest):
662
  images = convert_from_bytes(file_bytes)
663
  except Exception:
664
  images = []
665
- except Exception as e:
666
- logger.warning("Image conversion failed: %s", e)
667
  images = []
668
 
669
  pagewise = []
@@ -676,7 +630,7 @@ async def extract_bill_data(payload: BillRequest):
676
  rows = group_cells_into_rows(cells, y_tolerance=12)
677
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
678
 
679
- # header prefilter
680
  rows_filtered = []
681
  for i, (r, rt) in enumerate(zip(rows, rows_texts)):
682
  top_flag = (i < 6)
@@ -691,6 +645,7 @@ async def extract_bill_data(payload: BillRequest):
691
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
692
  page_text = sanitize_ocr_text(" ".join(rows_texts))
693
 
 
694
  top_headers = []
695
  for i, rt in enumerate(rows_texts[:6]):
696
  if looks_like_header_text(rt, top_of_page=(i < 4)):
@@ -698,26 +653,24 @@ async def extract_bill_data(payload: BillRequest):
698
 
699
  parsed_items = parse_rows_with_columns(rows, cells)
700
 
 
701
  refined_items, token_u = refine_with_gemini(parsed_items, page_text)
702
  for k in cumulative_token_usage:
703
  cumulative_token_usage[k] += token_u.get(k, 0)
704
 
705
- other_item_names = [it.get("item_name","") for it in refined_items]
706
-
707
  cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names)]
708
  cleaned = dedupe_items(cleaned)
709
- cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
710
 
711
  page_type = "Bill Detail"
712
  page_txt = page_text.lower()
713
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
714
  page_type = "Pharmacy"
715
- if "final bill" in page_txt or "grand total" in page_txt or "grandtotal" in page_txt:
716
  page_type = "Final Bill"
717
 
718
  pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
719
- except Exception as e:
720
- logger.exception("Failed to parse page %s: %s", idx, e)
721
  pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
722
  continue
723
 
@@ -725,7 +678,8 @@ async def extract_bill_data(payload: BillRequest):
725
  if not GEMINI_API_KEY or genai is None:
726
  cumulative_token_usage["warning_no_gemini"] = 1
727
 
728
- return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
 
729
 
730
  # ---------------- debug TSV ----------------
731
  @app.post("/debug-tsv")
@@ -750,19 +704,7 @@ async def debug_tsv(payload: BillRequest):
750
 
751
  @app.get("/")
752
  def health_check():
753
- msg = "Bill extraction API (final) live."
754
  if not GEMINI_API_KEY or genai is None:
755
- msg += " (No GEMINI_API_KEY/configured SDK LLM refinement skipped.)"
756
  return {"status": "ok", "message": msg, "hint": "POST /extract-bill-data with {'document':'<url>'}"}
757
-
758
- @app.get("/run-all-samples")
759
- async def run_all_samples():
760
- try:
761
- import run_all_samples
762
- run_all_samples.main()
763
- return {"status": "done", "results_ready": True}
764
- except Exception as e:
765
- logger.exception("run_all_samples failed: %s", e)
766
- return {"status": "error", "error": str(e)}
767
-
768
-
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json
 
4
  from io import BytesIO
5
  from typing import List, Dict, Any, Optional, Tuple
6
 
7
+ from fastapi import FastAPI
 
8
  from pydantic import BaseModel
9
  import requests
10
  from PIL import Image
 
20
  except Exception:
21
  genai = None
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  # ---------------- LLM CONFIG ----------------
24
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
25
  GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
26
  if GEMINI_API_KEY and genai is not None:
27
  try:
28
  genai.configure(api_key=GEMINI_API_KEY)
29
+ except Exception:
30
+ pass
31
+
32
+ # ---------------- FastAPI app ----------------
33
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, improved)")
34
 
35
+ class BillRequest(BaseModel):
36
+ document: str
37
+
38
+ # ---------------- Regex and keywords ----------------
39
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
40
  TOTAL_KEYWORDS = re.compile(
41
+ r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
42
  re.I,
43
  )
44
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
45
+ HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
 
 
 
 
 
46
  HEADER_PHRASES = [
47
  "description qty / hrs consultation rate discount net amt",
48
  "description qty / hrs rate discount net amt",
 
61
  s = s.replace("\r\n", "\n").replace("\r", "\n")
62
  s = re.sub(r"[ \t]+", " ", s)
63
  s = s.strip()
64
+ # Correct common OCR mis-recognitions for headers
65
+ s = re.sub(r"\bqiy\b", "qty", s, flags=re.IGNORECASE)
66
+ s = re.sub(r"\bdeseription\b", "description", s, flags=re.IGNORECASE)
67
  return s[:4000]
68
 
69
  def normalize_num_str(s: Optional[str]) -> Optional[float]:
 
91
  def is_numeric_token(t: Optional[str]) -> bool:
92
  return bool(t and NUM_RE.search(str(t)))
93
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def clean_name_text(s: str) -> str:
95
  s = s.replace("—", "-")
96
  s = re.sub(r"\s+", " ", s)
 
 
 
 
97
  s = s.strip(" -:,.")
98
+ s = re.sub(r"\bSG0?(\d+)\b", r"SG\1", s, flags=re.I)
99
+ s = re.sub(r"\b(RR)[\s\-]*2\b", r"RR-2", s, flags=re.I)
100
+ s = re.sub(r"\bOR\b", "DR", s) # correct OCR 'OR' -> 'DR'
101
+ return s.strip()
102
 
103
  # ---------------- image preprocessing ----------------
104
  def pil_to_cv2(img: Image.Image) -> Any:
 
108
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
109
 
110
  def preprocess_image(pil_img: Image.Image) -> Any:
 
111
  pil_img = pil_img.convert("RGB")
112
  w, h = pil_img.size
113
  target_w = 1500
 
115
  scale = target_w / float(w)
116
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
117
  cv_img = pil_to_cv2(pil_img)
118
+ gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
 
 
 
 
119
  gray = cv2.fastNlMeansDenoising(gray, h=10)
120
  try:
121
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
 
128
 
129
  # ---------------- OCR TSV ----------------
130
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
 
131
  try:
132
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
133
  except Exception:
 
152
  center_y = top + height / 2.0
153
  center_x = left + width / 2.0
154
  cells.append({"text": txt, "conf": conf, "left": left, "top": top,
155
+ "width": width, "height": height,
156
+ "center_y": center_y, "center_x": center_x})
157
  return cells
158
 
159
+ # ---------------- grouping & merge helpers ----------------
160
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
161
  if not cells:
162
  return []
 
185
  row = rows[i]
186
  tokens = [c["text"] for c in row]
187
  has_num = any(is_numeric_token(t) for t in tokens)
188
+ # If row has no numbers but next row does, merge them into one line
189
  if not has_num and i + 1 < len(rows):
190
  next_row = rows[i+1]
191
  next_tokens = [c["text"] for c in next_row]
 
204
  merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
205
  i += 2
206
  continue
207
+ # Merge short text rows without numbers (split descriptions)
208
  if not has_num and i + 1 < len(rows):
209
  next_row = rows[i+1]
210
  next_tokens = [c["text"] for c in next_row]
 
215
  offset = 10
216
  for c in row + next_row:
217
  newc = c.copy()
218
+ newc["left"] = newc["left"] if newc["left"] > min_left else (min_left - offset)
 
 
 
219
  newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
220
  merged_row.append(newc)
221
  offset += 5
 
227
  return merged
228
 
229
  # ---------------- numeric column detection ----------------
230
+ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
231
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
232
  if not xs:
233
  return []
 
258
  distances = [abs(token_x - cx) for cx in column_centers]
259
  return int(np.argmin(distances))
260
 
261
+ # ---------------- parsing rows into items ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
263
  parsed_items = []
264
  rows = merge_multiline_names(rows)
265
+ column_centers = detect_numeric_columns(page_cells, max_columns=6)
266
 
267
  for row in rows:
268
  tokens = [c["text"] for c in row]
 
274
  if all(not is_numeric_token(t) for t in tokens):
275
  continue
276
 
277
+ # Collect numeric candidates in this row
278
  numeric_values = []
279
  for t in tokens:
280
  if is_numeric_token(t):
 
 
281
  v = normalize_num_str(t)
282
  if v is not None:
283
  numeric_values.append(float(v))
284
+ numeric_values = sorted(list({int(x) if float(x).is_integer() else x for x in numeric_values}), reverse=True)
285
 
286
  if column_centers:
287
  left_text_parts = []
288
  numeric_bucket_map = {i: [] for i in range(len(column_centers))}
289
  for c in row:
290
  t = c["text"]
291
+ cx = c["center_x"]
292
+ if is_numeric_token(t):
293
+ col_idx = assign_token_to_column(cx, column_centers)
294
  if col_idx is None:
295
  numeric_bucket_map[len(column_centers) - 1].append(t)
296
  else:
 
299
  left_text_parts.append(t)
300
  raw_name = " ".join(left_text_parts).strip()
301
  name = clean_name_text(raw_name) if raw_name else ""
 
302
  num_cols = len(column_centers)
303
+
304
  def get_bucket(idx):
305
  vals = numeric_bucket_map.get(idx, [])
306
  return vals[-1] if vals else None
307
 
308
  amount = normalize_num_str(get_bucket(num_cols - 1)) if num_cols >= 1 else None
309
+ rate = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
310
+ qty = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
311
 
312
  if amount is None:
313
  for t in reversed(tokens):
314
+ if is_numeric_token(t):
315
  amount = normalize_num_str(t)
316
  if amount is not None:
317
  break
318
 
319
+ # Infer rate and qty if needed
320
  if amount is not None and numeric_values:
321
  for cand in numeric_values:
322
  try:
 
341
  qty = float(r)
342
  break
343
 
344
+ # Fallback compute rate if needed
345
  if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
346
  try:
347
  candidate_rate = amount / qty
 
353
  if qty is None:
354
  qty = 1.0
355
 
356
+ # Normalize values
357
  try:
358
  amount = float(round(amount, 2))
359
+ except Exception:
360
  continue
361
  try:
362
  rate = float(round(rate, 2)) if rate is not None else 0.0
363
+ except Exception:
364
  rate = 0.0
365
  try:
366
  qty = float(qty)
367
+ except Exception:
368
  qty = 1.0
369
 
370
  parsed_items.append({
 
375
  })
376
 
377
  else:
378
+ numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
379
  if not numeric_idxs:
380
  continue
381
  last = numeric_idxs[-1]
 
392
  v = normalize_num_str(tokens[i])
393
  if v is not None:
394
  right_nums.append(float(v))
395
+ right_nums = sorted(list({int(x) if float(x).is_integer() else x for x in right_nums}), reverse=True)
396
 
397
  if len(right_nums) >= 2:
398
  cand = right_nums[1]
399
+ if float(cand) > 1 and float(cand) < float(amt):
400
  ratio = float(amt) / float(cand) if cand else None
401
  if ratio:
402
  r = round(ratio)
 
443
 
444
  def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[float]]:
445
  subtotal = None; final = None
446
+ for rt in reversed(rows_texts):
447
  if not rt or rt.strip() == "":
448
  continue
449
  if TOTAL_KEYWORDS.search(rt):
 
453
  if v is None:
454
  continue
455
  if re.search(r"sub", rt, re.I):
456
+ if subtotal is None:
457
+ subtotal = float(round(v, 2))
458
  else:
459
+ if final is None:
460
+ final = float(round(v, 2))
461
  return {"subtotal": subtotal, "final_total": final}
462
 
463
+ # ---------------- Gemini refinement (deterministic) ----------------
464
+ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
465
+ zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
466
+ if not GEMINI_API_KEY or genai is None:
467
+ return page_items, zero_usage
468
+ try:
469
+ safe_text = sanitize_ocr_text(page_text)
470
+ system_prompt = (
471
+ "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
472
+ "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
473
+ "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
474
+ )
475
+ user_prompt = (
476
+ f"page_text='''{safe_text}'''\n"
477
+ f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
478
+ "Example:\n"
479
+ "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
480
+ " {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
481
+ "=>\n"
482
+ "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
483
+ "Return only the cleaned JSON array of items."
484
+ )
485
+ model = genai.GenerativeModel(GEMINI_MODEL_NAME)
486
+ response = model.generate_content(
487
+ [
488
+ {"role": "system", "parts": [system_prompt]},
489
+ {"role": "user", "parts": [user_prompt]},
490
+ ],
491
+ temperature=0.0,
492
+ max_output_tokens=1000,
493
+ )
494
+ raw = response.text.strip()
495
+ if raw.startswith("```"):
496
+ raw = re.sub(r"^```[a-zA-Z]*", "", raw)
497
+ raw = re.sub(r"```$", "", raw).strip()
498
+ parsed = json.loads(raw)
499
+ if isinstance(parsed, list):
500
+ cleaned = []
501
+ for obj in parsed:
502
+ try:
503
+ cleaned.append({
504
+ "item_name": str(obj.get("item_name", "")).strip(),
505
+ "item_amount": float(obj.get("item_amount", 0.0)),
506
+ "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
507
+ "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
508
+ })
509
+ except Exception:
510
+ continue
511
+ return cleaned, zero_usage
512
+ return page_items, zero_usage
513
+ except Exception:
514
+ return page_items, zero_usage
515
+
516
+ # ---------------- header heuristics & final filter ----------------
517
  def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
518
  if not txt:
519
  return False
520
  t = re.sub(r"\s+", " ", txt.strip().lower())
521
+ # exact phrase blacklist
522
  if any(h == t for h in HEADER_PHRASES):
523
  return True
524
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
 
534
  return True
535
  if t.startswith("description") or t.startswith("qty") or t.startswith("qty /"):
536
  return True
 
 
537
  return False
538
 
539
  def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [], other_item_names: List[str] = []) -> bool:
 
541
  if not name:
542
  return False
543
  ln = name.lower()
544
+ # Remove if this item matches any known header text
 
 
 
545
  for h in known_page_headers:
546
  if h and h.strip() and h.strip().lower() in ln:
547
  return False
 
 
548
  if FOOTER_KEYWORDS.search(ln):
549
  return False
550
  if item.get("item_amount", 0) > 1_000_000:
551
  return False
552
  if len(name) <= 2 and not re.search(r"[a-zA-Z]", name):
553
  return False
554
+ # (Removed overly restrictive filters for generic terms to retain valid items)
555
+
556
+ # Drop items with non-positive amounts
557
+ if float(item.get("item_amount", 0)) <= 0.0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  return False
559
+ # Sanity check: discard if rate is absurdly higher than amount
560
+ rate = float(item.get("item_rate", 0) or 0)
561
+ amt = float(item.get("item_amount", 0) or 0)
562
+ if rate and rate > amt * 10 and amt < 10000:
563
  return False
564
  return True
565
 
 
569
  doc_url = payload.document
570
  file_bytes = None
571
 
572
+ # --------------------------- Local or remote file ---------------------------
573
  if doc_url.startswith("file://"):
574
  local_path = doc_url.replace("file://", "")
575
  try:
576
  with open(local_path, "rb") as f:
577
  file_bytes = f.read()
578
  except Exception as e:
 
579
  return {
580
  "is_success": False,
581
  "error": f"Local file read error: {e}",
 
590
  raise RuntimeError(f"Download failed status={resp.status_code}")
591
  file_bytes = resp.content
592
  except Exception as e:
 
593
  return {
594
  "is_success": False,
595
  "error": f"HTTP error: {e}",
 
617
  images = convert_from_bytes(file_bytes)
618
  except Exception:
619
  images = []
620
+ except Exception:
 
621
  images = []
622
 
623
  pagewise = []
 
630
  rows = group_cells_into_rows(cells, y_tolerance=12)
631
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
632
 
633
+ # === Header prefilter: remove header-like rows ===
634
  rows_filtered = []
635
  for i, (r, rt) in enumerate(zip(rows, rows_texts)):
636
  top_flag = (i < 6)
 
645
  rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
646
  page_text = sanitize_ocr_text(" ".join(rows_texts))
647
 
648
+ # Collect detected top headers for final filtering
649
  top_headers = []
650
  for i, rt in enumerate(rows_texts[:6]):
651
  if looks_like_header_text(rt, top_of_page=(i < 4)):
 
653
 
654
  parsed_items = parse_rows_with_columns(rows, cells)
655
 
656
+ # Gemini refinement (if enabled)
657
  refined_items, token_u = refine_with_gemini(parsed_items, page_text)
658
  for k in cumulative_token_usage:
659
  cumulative_token_usage[k] += token_u.get(k, 0)
660
 
661
+ other_item_names = [it.get("item_name", "") for it in refined_items]
 
662
  cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers, other_item_names=other_item_names)]
663
  cleaned = dedupe_items(cleaned)
 
664
 
665
  page_type = "Bill Detail"
666
  page_txt = page_text.lower()
667
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
668
  page_type = "Pharmacy"
669
+ if "final bill" in page_txt or "grand total" in page_txt:
670
  page_type = "Final Bill"
671
 
672
  pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
673
+ except Exception:
 
674
  pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
675
  continue
676
 
 
678
  if not GEMINI_API_KEY or genai is None:
679
  cumulative_token_usage["warning_no_gemini"] = 1
680
 
681
+ return {"is_success": True, "token_usage": cumulative_token_usage,
682
+ "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
683
 
684
  # ---------------- debug TSV ----------------
685
  @app.post("/debug-tsv")
 
704
 
705
@app.get("/")
def health_check():
    """Liveness endpoint.

    Returns a small status payload telling callers the service is up,
    whether Gemini-based LLM refinement is active (key + SDK present),
    and how to invoke the extraction endpoint.
    """
    status_message = "Bill extraction API (updated version) live."
    # Refinement is silently skipped when either the API key or the SDK is absent;
    # surface that in the health message so operators notice degraded mode.
    gemini_ready = bool(GEMINI_API_KEY) and genai is not None
    if not gemini_ready:
        status_message += " (No GEMINI - LLM refinement skipped.)"
    return {
        "status": "ok",
        "message": status_message,
        "hint": "POST /extract-bill-data with {'document':'<url>'}",
    }