Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Files Community

Sathvik-kota committed on Nov 30, 2025

Commit

8e9daea

verified ·

1 Parent(s): 55af608

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +259 -81

app.py CHANGED Viewed

@@ -11,10 +11,10 @@ from io import BytesIO
 from typing import List, Dict, Any, Optional, Tuple
 import uvicorn
-from fastapi import FastAPI, BackgroundTasks
 from pydantic import BaseModel
 import requests
-from PIL import Image
 from pdf2image import convert_from_bytes
 import pytesseract
 from pytesseract import Output
@@ -54,7 +54,7 @@ TOTAL_KEYWORDS = re.compile(
     r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|total)",
     re.I,
 )
-FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
 HEADER_KEYWORDS = [
     "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
@@ -71,9 +71,10 @@ HEADER_PHRASES = [
 HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
 # ---------------- small utilities ----------------
-def sanitize_ocr_text(s: str) -> str:
     if not s:
         return ""
     s = s.replace("\u2014", "-").replace("\u2013", "-")
     s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
     s = s.replace("\r\n", "\n").replace("\r", "\n")
@@ -87,6 +88,8 @@ def normalize_num_str(s: Optional[str]) -> Optional[float]:
     s = str(s).strip()
     if s == "":
         return None
     s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
     negative = False
     if s.startswith("(") and s.endswith(")"):
@@ -107,6 +110,8 @@ def is_numeric_token(t: Optional[str]) -> bool:
     return bool(t and NUM_RE.search(str(t)))
 def looks_like_date_num(s: str) -> bool:
     s_digits = re.sub(r"[^\d]", "", s or "")
     if len(s_digits) >= 7:
         if s_digits.endswith(("2025","2024","2023","2022","2026")):
@@ -118,17 +123,50 @@ def looks_like_date_num(s: str) -> bool:
             pass
     return False
 def clean_name_text(s: str) -> str:
-    s = s.replace("—", "-")
     s = re.sub(r"\s+", " ", s)
     s = s.strip(" -:,.=")
-    s = re.sub(r"\s+x$", "", s, flags=re.I)
-    s = re.sub(r"[\)\}\]]+$", "", s)
     s = re.sub(r"\bOR\b", "DR", s)
-    s = s.strip(" -:,.")
     s = s.strip()
     return s
 # ---------------- image preprocessing ----------------
 def pil_to_cv2(img: Image.Image) -> Any:
     arr = np.array(img)
@@ -144,30 +182,56 @@ def preprocess_image(pil_img: Image.Image) -> Any:
     if w < target_w:
         scale = target_w / float(w)
         pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
     cv_img = pil_to_cv2(pil_img)
-    # grayscale and denoise
     if cv_img.ndim == 3:
         gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
     else:
         gray = cv_img
-    gray = cv2.fastNlMeansDenoising(gray, h=10)
     try:
         bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 41, 15)
     except Exception:
-        _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
-    kernel = np.ones((1,1), np.uint8)
-    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
     return bw
-# ---------------- OCR TSV ----------------
 def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
-    # pytesseract expects either a PIL image or numpy array
     try:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
     except Exception:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
-    cells = []
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
@@ -176,18 +240,40 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
         txt = str(raw).strip()
         if not txt:
             continue
         try:
-            conf = float(o["conf"][i]) if o["conf"][i] not in (None, "", "-1") else -1.0
         except Exception:
             conf = -1.0
-        left = int(o.get("left", [0])[i])
-        top = int(o.get("top", [0])[i])
-        width = int(o.get("width", [0])[i])
-        height = int(o.get("height", [0])[i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
-        cells.append({"text": txt, "conf": conf, "left": left, "top": top,
-                      "width": width, "height": height, "center_y": center_y, "center_x": center_x})
     return cells
 # ---------------- grouping & merging helpers ----------------
@@ -195,7 +281,7 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
     if not cells:
         return []
     sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
-    rows = []
     current = [sorted_cells[0]]
     last_y = sorted_cells[0]["center_y"]
     for c in sorted_cells[1:]:
@@ -213,12 +299,13 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
 def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
     if not rows:
         return rows
-    merged = []
     i = 0
     while i < len(rows):
         row = rows[i]
         tokens = [c["text"] for c in row]
         has_num = any(is_numeric_token(t) for t in tokens)
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
@@ -226,34 +313,34 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
             if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
                 merged_row = []
                 min_left = min((c["left"] for c in next_row), default=0)
-                offset = 10
                 for c in row:
                     newc = c.copy()
-                    newc["left"] = min_left - offset
-                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
                     merged_row.append(newc)
-                    offset += 10
                 merged_row.extend(next_row)
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             next_has_num = any(is_numeric_token(t) for t in next_tokens)
             if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
                 merged_row = []
-                min_left = min((c["left"] for c in next_row + row), default=0)
-                offset = 10
-                for c in row + next_row:
                     newc = c.copy()
-                    if newc["left"] > min_left:
-                        newc["left"] = newc["left"]
-                    else:
-                        newc["left"] = min_left - offset
-                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
                     merged_row.append(newc)
-                    offset += 5
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
@@ -263,10 +350,9 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
 # ---------------- numeric column detection ----------------
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
-    xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
     if not xs:
         return []
-    xs = sorted(xs)
     if len(xs) == 1:
         return [xs[0]]
     gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
@@ -293,6 +379,91 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
     distances = [abs(token_x - cx) for cx in column_centers]
     return int(np.argmin(distances))
 # ---------------- Gemini refinement (deterministic) ----------------
 def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
@@ -344,7 +515,7 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
 # ---------------- parsing rows into items (modified) ----------------
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    parsed_items = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
@@ -353,9 +524,15 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
         if not tokens:
             continue
         joined_lower = " ".join(tokens).lower()
         if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
             continue
         if all(not is_numeric_token(t) for t in tokens):
             continue
         numeric_values = []
@@ -366,6 +543,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                 v = normalize_num_str(t)
                 if v is not None:
                     numeric_values.append(float(v))
         numeric_values = sorted({int(x) if float(x).is_integer() else x for x in numeric_values}, reverse=True)
         if column_centers:
@@ -375,7 +553,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                 t = c["text"]
                 if is_numeric_token(t) and not looks_like_date_num(t):
                     col_idx = assign_token_to_column(c["center_x"], column_centers)
-                    if col_idx is None:
                         numeric_bucket_map[len(column_centers) - 1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
@@ -393,13 +571,16 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             rate   = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
             qty    = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t) and not looks_like_date_num(t):
-                        amount = normalize_num_str(t)
-                        if amount is not None:
                             break
             if amount is not None and numeric_values:
                 for cand in numeric_values:
                     try:
@@ -419,43 +600,33 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                     if r < 1 or r > 200:
                         continue
                     if abs(ratio - r) <= max(0.03 * r, 0.15):
-                        if r <= 100:
-                            rate = cand_float
-                            qty = float(r)
-                            break
             if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
                 try:
                     candidate_rate = amount / qty
-                    if candidate_rate >= 2:
                         rate = candidate_rate
                 except Exception:
                     pass
             if qty is None:
                 qty = 1.0
-            try:
-                amount = float(round(amount, 2))
-            except:
-                continue
-            try:
-                rate = float(round(rate, 2)) if rate is not None else 0.0
-            except:
-                rate = 0.0
-            try:
-                qty = float(qty)
-            except:
-                qty = 1.0
-            parsed_items.append({
-                "item_name": name if name else "UNKNOWN",
-                "item_amount": amount,
-                "item_rate": rate if rate is not None else 0.0,
-                "item_quantity": qty if qty is not None else 1.0,
-            })
         else:
             numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t) and not looks_like_date_num(t)]
             if not numeric_idxs:
                 continue
@@ -466,8 +637,8 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             name = " ".join(tokens[:last]).strip()
             if not name:
                 continue
-            rate = None; qty = None
             right_nums = []
             for i in numeric_idxs:
                 v = normalize_num_str(tokens[i])
@@ -475,6 +646,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                     right_nums.append(float(v))
             right_nums = sorted({int(x) if float(x).is_integer() else x for x in right_nums}, reverse=True)
             if len(right_nums) >= 2:
                 cand = right_nums[1]
                 if 1 < cand < float(amt):
@@ -500,22 +672,21 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             if rate is None:
                 rate = 0.0
-            parsed_items.append({
-                "item_name": clean_name_text(name),
-                "item_amount": float(round(amt, 2)),
-                "item_rate": float(round(rate, 2)),
-                "item_quantity": float(qty),
-            })
     return parsed_items
 # ---------------- dedupe & totals ----------------
 def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     seen = set()
-    out = []
     for it in items:
         nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
-        key = (nm[:120], round(float(it["item_amount"]), 2))
         if key in seen:
             continue
         seen.add(key)
@@ -600,10 +771,13 @@ def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [],
     qty = float(item.get("item_quantity", 0) or 0)
     if qty <= 0:
         return False
-    if rate and rate > amt:
         return False
     if amt <= 0.0:
         return False
     return True
 # ---------------- main endpoint ----------------
@@ -650,7 +824,7 @@ async def extract_bill_data(payload: BillRequest):
             "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
         }
-    images = []
     clean_url = doc_url.split("?", 1)[0].lower()
     try:
         if clean_url.endswith(".pdf"):
@@ -673,6 +847,8 @@ async def extract_bill_data(payload: BillRequest):
         try:
             proc = preprocess_image(page_img)
             cells = image_to_tsv_cells(proc)
             rows = group_cells_into_rows(cells, y_tolerance=12)
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
@@ -698,6 +874,7 @@ async def extract_bill_data(payload: BillRequest):
             parsed_items = parse_rows_with_columns(rows, cells)
             refined_items, token_u = refine_with_gemini(parsed_items, page_text)
             for k in cumulative_token_usage:
                 cumulative_token_usage[k] += token_u.get(k, 0)
@@ -765,4 +942,5 @@ async def run_all_samples():
         logger.exception("run_all_samples failed: %s", e)
         return {"status": "error", "error": str(e)}

 from typing import List, Dict, Any, Optional, Tuple
 import uvicorn
+from fastapi import FastAPI
 from pydantic import BaseModel
 import requests
+from PIL import Image, ImageOps
 from pdf2image import convert_from_bytes
 import pytesseract
 from pytesseract import Output
     r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|total)",
     re.I,
 )
+FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm|signature)", re.I)
 HEADER_KEYWORDS = [
     "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
 HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
 # ---------------- small utilities ----------------
+def sanitize_ocr_text(s: Optional[str]) -> str:
     if not s:
         return ""
+    s = str(s)
     s = s.replace("\u2014", "-").replace("\u2013", "-")
     s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
     s = s.replace("\r\n", "\n").replace("\r", "\n")
     s = str(s).strip()
     if s == "":
         return None
+    # common OCR fixes in numeric strings: letters O/o -> 0 and l -> 1
+    s = s.replace("O", "0").replace("o", "0").replace("l", "1")
     s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
     negative = False
     if s.startswith("(") and s.endswith(")"):
     return bool(t and NUM_RE.search(str(t)))
 def looks_like_date_num(s: str) -> bool:
+    if not s:
+        return False
     s_digits = re.sub(r"[^\d]", "", s or "")
     if len(s_digits) >= 7:
         if s_digits.endswith(("2025","2024","2023","2022","2026")):
             pass
     return False
def collapse_repeated_chars(s: str) -> str:
    """Collapse runs of repeated characters that are typical OCR artifacts.

    A run of three or more identical punctuation characters is reduced to a
    single character, and a run of four or more identical ASCII letters is
    reduced to two. Shorter runs are left untouched.
    """
    s = re.sub(r"([^\w\s])\1{2,}", r"\1", s)
    s = re.sub(r"([A-Za-z])\1{3,}", r"\1\1", s)
    return s


def clean_name_text(s: str) -> str:
    """Normalise an OCR-extracted item name into plain, trimmed ASCII text.

    Steps, in order: em/en dashes become hyphens, repeated-character runs are
    collapsed, runs of underscores/pipes, non-ASCII characters and brackets
    are replaced by spaces, whitespace is collapsed, a standalone uppercase
    "OR" is rewritten to "DR", and a trailing stand-alone "x" is dropped.

    Args:
        s: Raw name text; falsy values (None, "") yield "".

    Returns:
        The cleaned name with no leading/trailing separators (" -:,.=").
    """
    if not s:
        return ""
    s = str(s)
    s = s.replace("\u2014", "-").replace("\u2013", "-")
    s = collapse_repeated_chars(s)
    s = re.sub(r"[_\|]{2,}", " ", s)
    s = re.sub(r"[^\x00-\x7F]+", " ", s)  # remove non-ascii weird chars
    s = re.sub(r"[\[\]\{\}\(\)]", " ", s)
    s = re.sub(r"\s+", " ", s)
    s = s.strip(" -:,.=")
    # fix common OCR 'OR' -> 'DR' when it's standalone uppercase
    s = re.sub(r"\bOR\b", "DR", s)
    # remove trailing artifacts like 'x' placed between qty and name
    s = re.sub(r"\s+x\s*$", "", s, flags=re.I)
    # Dropping the trailing 'x' can expose a separator that sat in front of it
    # ("Name - x" -> "Name -"), so strip separators again, not just whitespace.
    return s.strip(" -:,.=")
def is_probable_garbage_name(name: str) -> bool:
    """Heuristically decide whether an item name is OCR noise.

    A name is treated as garbage when any of the following holds:
      * it is empty/None, or blank after stripping;
      * it contains no ASCII letter, or is shorter than two characters;
      * more than 45% of its characters are symbols (anything that is not a
        letter, digit or whitespace; underscores count as symbols);
      * more than 60% of its characters are digits and the text is not
        recognised by ``looks_like_date_num``.

    Args:
        name: Candidate item name.

    Returns:
        True if the name should be rejected, False otherwise.
    """
    if not name:
        return True
    n = name.strip()
    alpha_count = len(re.findall(r"[A-Za-z]", n))
    # no letters at all, or too short to be a meaningful name
    if alpha_count == 0 or len(n) < 2:
        return True
    total = len(n)
    # `\w` matches '_' as a word character, so underscores (typical residue of
    # ruled lines) are added explicitly to the symbol count.
    symbol_count = len(re.findall(r"[^\w\s]|_", n))
    if symbol_count / total > 0.45:
        return True
    # digits dominate and the text does not look like a date
    digit_count = len(re.findall(r"\d", n))
    if digit_count / total > 0.6 and not looks_like_date_num(n):
        return True
    return False
 # ---------------- image preprocessing ----------------
 def pil_to_cv2(img: Image.Image) -> Any:
     arr = np.array(img)
     if w < target_w:
         scale = target_w / float(w)
         pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
+    # convert to gray + CLAHE (adaptive contrast)
     cv_img = pil_to_cv2(pil_img)
     if cv_img.ndim == 3:
         gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
     else:
         gray = cv_img
+    # unsigned int conversion
+    gray = np.asarray(gray, dtype=np.uint8)
+    # CLAHE for contrast enhancement
+    try:
+        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
+        gray = clahe.apply(gray)
+    except Exception:
+        pass
+    # denoise
+    try:
+        gray = cv2.fastNlMeansDenoising(gray, h=10)
+    except Exception:
+        pass
+    # adaptive threshold
     try:
         bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 41, 15)
     except Exception:
+        _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    # morphological operations to remove tiny noise and thin grid lines
+    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
+    bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel_close)
+    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1,1))
+    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel_open)
     return bw
+# ---------------- OCR TSV (word-level) ----------------
+OCR_CONF_THRESHOLD = 30.0  # drop tokens with confidence less than this (if provided by tesseract)
 def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
+    # pytesseract can accept numpy arrays or PIL images
     try:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
     except Exception:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
+    cells: List[Dict[str, Any]] = []
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
         txt = str(raw).strip()
         if not txt:
             continue
+        # try to parse confidence (tesseract returns strings sometimes)
+        conf_raw = o.get("conf", [None]*n)[i]
         try:
+            conf = float(conf_raw) if conf_raw not in (None, "", "-1") else -1.0
         except Exception:
             conf = -1.0
+        # skip very-low-confidence tokens (reduce garbage)
+        if conf >= 0 and conf < OCR_CONF_THRESHOLD:
+            continue
+        left = int(o.get("left", [0]*n)[i])
+        top = int(o.get("top", [0]*n)[i])
+        width = int(o.get("width", [0]*n)[i])
+        height = int(o.get("height", [0]*n)[i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
+        # normalize numeric OCR artifacts inside token
+        if re.search(r"[0-9]", txt):
+            # quick fixes
+            txt = txt.replace("O", "0").replace("o", "0").replace("l", "1")
+            txt = re.sub(r"[^0-9\.\,\-\(\)]", lambda m: "" if m.group(0).isspace() else m.group(0), txt)
+        cells.append({
+            "text": txt,
+            "conf": conf,
+            "left": left,
+            "top": top,
+            "width": width,
+            "height": height,
+            "center_y": center_y,
+            "center_x": center_x
+        })
     return cells
 # ---------------- grouping & merging helpers ----------------
     if not cells:
         return []
     sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
+    rows: List[List[Dict[str, Any]]] = []
     current = [sorted_cells[0]]
     last_y = sorted_cells[0]["center_y"]
     for c in sorted_cells[1:]:
 def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
     if not rows:
         return rows
+    merged: List[List[Dict[str, Any]]] = []
     i = 0
     while i < len(rows):
         row = rows[i]
         tokens = [c["text"] for c in row]
         has_num = any(is_numeric_token(t) for t in tokens)
+        # Merge a full-text row with the next numeric row if appropriate
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
                 merged_row = []
                 min_left = min((c["left"] for c in next_row), default=0)
+                offset = 0
                 for c in row:
                     newc = c.copy()
+                    # Shift text to left of numeric columns
+                    newc["left"] = min_left - 20 + offset
+                    newc["center_x"] = newc["left"] + newc["width"] / 2.0
                     merged_row.append(newc)
+                    offset += 8
                 merged_row.extend(next_row)
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
+        # Merge two short text-only rows (e.g. split names)
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             next_has_num = any(is_numeric_token(t) for t in next_tokens)
             if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
+                combined = row + next_row
+                min_left = min((c["left"] for c in combined), default=0)
                 merged_row = []
+                for c in combined:
                     newc = c.copy()
+                    # move slightly to align left
+                    if newc["left"] <= min_left:
+                        newc["left"] = min_left
+                    newc["center_x"] = newc["left"] + newc["width"] / 2.0
                     merged_row.append(newc)
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
 # ---------------- numeric column detection ----------------
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
+    xs = sorted([c["center_x"] for c in cells if is_numeric_token(c["text"])])
     if not xs:
         return []
     if len(xs) == 1:
         return [xs[0]]
     gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
     distances = [abs(token_x - cx) for cx in column_centers]
     return int(np.argmin(distances))
+# ---------------- item validation & repair ----------------
+MAX_REASONABLE_QTY = 100.0
+MAX_REASONABLE_RATE = 1_000_000.0
def validate_and_fix_item(item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Ensure amount/rate/qty are reasonable. Try to fix obvious OCR-caused errors.

    Args:
        item: Dict with the keys "item_name", "item_amount", "item_rate" and
            "item_quantity"; missing or falsy values are treated as ""/0.

    Returns:
        A new dict with the cleaned name and amount/rate rounded to 2 decimals
        and quantity rounded to 3, or None if the item should be discarded as
        garbage (unusable name, non-numeric amount, or amount <= 0).
    """
    # sanitize name and reject obviously garbage names
    name = clean_name_text(item.get("item_name", "") or "")
    if is_probable_garbage_name(name):
        return None

    # Only conversion failures are expected here; anything else should surface.
    conversion_errors = (TypeError, ValueError, OverflowError)
    try:
        amt = float(item.get("item_amount", 0.0) or 0.0)
    except conversion_errors:
        return None
    try:
        rate = float(item.get("item_rate", 0.0) or 0.0)
    except conversion_errors:
        rate = 0.0
    try:
        qty = float(item.get("item_quantity", 0.0) or 0.0)
    except conversion_errors:
        qty = 1.0

    # negative/zero amounts are dropped; nothing below can repair them
    if amt <= 0.0:
        return None

    # If qty is ridiculously large -> likely OCR error. Reset to 1 and set rate=amount if rate invalid
    if qty > MAX_REASONABLE_QTY:
        logger.debug("Qty %s too large for '%s' — resetting to 1", qty, name)
        qty = 1.0
        if rate <= 0 or rate > amt * 10:
            rate = amt

    # A rate above the amount that does not satisfy rate * qty ~= amount is
    # treated as misread: recompute it as amount / qty when that is plausible.
    if rate > amt and qty > 0 and abs(rate * qty - amt) > max(0.05 * amt, 1.0):
        candidate = amt / qty
        if 0 < candidate <= MAX_REASONABLE_RATE:
            logger.debug("Adjusting rate for '%s' from %s to %s", name, rate, candidate)
            rate = candidate

    # Missing rate: derive it from amount / qty.
    # NOTE(review): the derivation is skipped when qty != 1 and amount / qty is
    # a whole number, leaving rate at 0 — confirm this is intentional.
    if rate == 0 and qty > 0:
        if qty == 1 or not (amt / qty).is_integer():
            candidate_rate = amt / qty
            if 0 < candidate_rate <= MAX_REASONABLE_RATE:
                rate = round(candidate_rate, 2)

    # unknown or non-positive quantity defaults to a single unit
    if qty <= 0:
        qty = 1.0

    return {
        "item_name": name,
        "item_amount": float(round(amt, 2)),
        "item_rate": float(round(rate, 2)),
        "item_quantity": float(round(qty, 3)),
    }
 # ---------------- Gemini refinement (deterministic) ----------------
 def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
 # ---------------- parsing rows into items (modified) ----------------
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    parsed_items: List[Dict[str, Any]] = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
         if not tokens:
             continue
         joined_lower = " ".join(tokens).lower()
+        # skip obvious footers and headers
         if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
             continue
         if all(not is_numeric_token(t) for t in tokens):
+            # if a pure-text row but looks like header -> skip
+            if looks_like_header_text(joined_lower):
+                continue
+            # otherwise we may have description-only rows (handled by merge_multiline_names)
             continue
         numeric_values = []
                 v = normalize_num_str(t)
                 if v is not None:
                     numeric_values.append(float(v))
+        # unique sorted descending
         numeric_values = sorted({int(x) if float(x).is_integer() else x for x in numeric_values}, reverse=True)
         if column_centers:
                 t = c["text"]
                 if is_numeric_token(t) and not looks_like_date_num(t):
                     col_idx = assign_token_to_column(c["center_x"], column_centers)
+                    if col_idx is None or col_idx < 0:
                         numeric_bucket_map[len(column_centers) - 1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
             rate   = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
             qty    = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
+            # fallback: last numeric token as amount
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t) and not looks_like_date_num(t):
+                        candidate = normalize_num_str(t)
+                        if candidate is not None:
+                            amount = candidate
                             break
+            # try to infer rate & qty from numeric_values
             if amount is not None and numeric_values:
                 for cand in numeric_values:
                     try:
                     if r < 1 or r > 200:
                         continue
                     if abs(ratio - r) <= max(0.03 * r, 0.15):
+                        # reasonable integer quantity
+                        qty = float(r)
+                        rate = cand_float
+                        break
+            # additional fallback if rate missing but qty exists
             if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
                 try:
                     candidate_rate = amount / qty
+                    if 0 < candidate_rate <= 1e7:
                         rate = candidate_rate
                 except Exception:
                     pass
+            # default quantity = 1 if unknown
             if qty is None:
                 qty = 1.0
+            # final rounding / validation via helper
+            raw_item = {"item_name": name if name else "UNKNOWN", "item_amount": amount or 0.0,
+                        "item_rate": rate or 0.0, "item_quantity": qty or 1.0}
+            fixed = validate_and_fix_item(raw_item)
+            if fixed:
+                parsed_items.append(fixed)
+            # else skip
         else:
+            # fallback parsing if no clear numeric columns
             numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t) and not looks_like_date_num(t)]
             if not numeric_idxs:
                 continue
             name = " ".join(tokens[:last]).strip()
             if not name:
                 continue
+            # gather numeric candidates on the right to infer rate/qty
             right_nums = []
             for i in numeric_idxs:
                 v = normalize_num_str(tokens[i])
                     right_nums.append(float(v))
             right_nums = sorted({int(x) if float(x).is_integer() else x for x in right_nums}, reverse=True)
+            rate = None; qty = None
             if len(right_nums) >= 2:
                 cand = right_nums[1]
                 if 1 < cand < float(amt):
             if rate is None:
                 rate = 0.0
+            raw_item = {"item_name": clean_name_text(name), "item_amount": float(round(amt,2)),
+                        "item_rate": float(round(rate,2)), "item_quantity": float(qty)}
+            fixed = validate_and_fix_item(raw_item)
+            if fixed:
+                parsed_items.append(fixed)
     return parsed_items
 # ---------------- dedupe & totals ----------------
 def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     seen = set()
+    out: List[Dict[str, Any]] = []
     for it in items:
         nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
+        key = (nm[:120], round(float(it.get("item_amount", 0.0) or 0.0), 2))
         if key in seen:
             continue
         seen.add(key)
     qty = float(item.get("item_quantity", 0) or 0)
     if qty <= 0:
         return False
+    if rate and rate > amt * 10 and amt < 10000:
         return False
     if amt <= 0.0:
         return False
+    # must contain at least one alphabetic char
+    if not re.search(r"[A-Za-z]", name):
+        return False
     return True
 # ---------------- main endpoint ----------------
             "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
         }
+    images: List[Image.Image] = []
     clean_url = doc_url.split("?", 1)[0].lower()
     try:
         if clean_url.endswith(".pdf"):
         try:
             proc = preprocess_image(page_img)
             cells = image_to_tsv_cells(proc)
+            if not cells:
+                logger.debug("No OCR cells extracted for page %s", idx)
             rows = group_cells_into_rows(cells, y_tolerance=12)
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
             parsed_items = parse_rows_with_columns(rows, cells)
+            # Use Gemini only if configured
             refined_items, token_u = refine_with_gemini(parsed_items, page_text)
             for k in cumulative_token_usage:
                 cumulative_token_usage[k] += token_u.get(k, 0)
         logger.exception("run_all_samples failed: %s", e)
         return {"status": "error", "error": str(e)}
if __name__ == "__main__":
    # Serve the app directly when run as a script. The port comes from the
    # PORT environment variable; the default is given as a string so int()
    # always receives the same type whether or not the variable is set.
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))