Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Files Community

Sathvik-kota committed on Nov 30, 2025

Commit

811fc30

verified ·

1 Parent(s): 8e9daea

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +81 -259

app.py CHANGED Viewed

@@ -11,10 +11,10 @@ from io import BytesIO
 from typing import List, Dict, Any, Optional, Tuple
 import uvicorn
-from fastapi import FastAPI
 from pydantic import BaseModel
 import requests
-from PIL import Image, ImageOps
 from pdf2image import convert_from_bytes
 import pytesseract
 from pytesseract import Output
@@ -54,7 +54,7 @@ TOTAL_KEYWORDS = re.compile(
     r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|total)",
     re.I,
 )
-FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm|signature)", re.I)
 HEADER_KEYWORDS = [
     "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
@@ -71,10 +71,9 @@ HEADER_PHRASES = [
 HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
 # ---------------- small utilities ----------------
-def sanitize_ocr_text(s: Optional[str]) -> str:
     if not s:
         return ""
-    s = str(s)
     s = s.replace("\u2014", "-").replace("\u2013", "-")
     s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
     s = s.replace("\r\n", "\n").replace("\r", "\n")
@@ -88,8 +87,6 @@ def normalize_num_str(s: Optional[str]) -> Optional[float]:
     s = str(s).strip()
     if s == "":
         return None
-    # common OCR fixes in numeric strings (O -> 0, , as thousands)
-    s = s.replace("O", "0").replace("o", "0").replace("l", "1")
     s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
     negative = False
     if s.startswith("(") and s.endswith(")"):
@@ -110,8 +107,6 @@ def is_numeric_token(t: Optional[str]) -> bool:
     return bool(t and NUM_RE.search(str(t)))
 def looks_like_date_num(s: str) -> bool:
-    if not s:
-        return False
     s_digits = re.sub(r"[^\d]", "", s or "")
     if len(s_digits) >= 7:
         if s_digits.endswith(("2025","2024","2023","2022","2026")):
@@ -123,50 +118,17 @@ def looks_like_date_num(s: str) -> bool:
             pass
     return False
-def collapse_repeated_chars(s: str) -> str:
-    # collapse runs of repeated punctuation/letters that are OCR artifacts
-    s = re.sub(r"([^\w\s])\1{2,}", r"\1", s)
-    s = re.sub(r"([A-Za-z])\1{3,}", r"\1\1", s)
-    return s
 def clean_name_text(s: str) -> str:
-    if not s:
-        return ""
-    s = str(s)
-    s = s.replace("—", "-").replace("–", "-")
-    s = collapse_repeated_chars(s)
-    s = re.sub(r"[_\|]{2,}", " ", s)
-    s = re.sub(r"[^\x00-\x7F]+", " ", s)  # remove non-ascii weird chars
-    s = re.sub(r"[\[\]\{\}\(\)]", " ", s)
     s = re.sub(r"\s+", " ", s)
     s = s.strip(" -:,.=")
-    # fix common OCR 'OR' -> 'DR' when it's standalone uppercase
     s = re.sub(r"\bOR\b", "DR", s)
-    # remove trailing artifacts like 'x' placed between qty and name
-    s = re.sub(r"\s+x\s*$", "", s, flags=re.I)
     s = s.strip()
     return s
-def is_probable_garbage_name(name: str) -> bool:
-    if not name:
-        return True
-    n = name.strip()
-    # too short or too many non-alpha
-    alpha_count = len(re.findall(r"[A-Za-z]", n))
-    digit_count = len(re.findall(r"\d", n))
-    non_word = len(re.findall(r"[^\w\s]", n))
-    if alpha_count == 0:
-        return True
-    if len(n) < 2:
-        return True
-    # if >50% of chars are non-alnum, garbage
-    if non_word / max(1, len(n)) > 0.45:
-        return True
-    # if digits dominate and look not like code/date
-    if digit_count / max(1, len(n)) > 0.6 and not looks_like_date_num(n):
-        return True
-    return False
 # ---------------- image preprocessing ----------------
 def pil_to_cv2(img: Image.Image) -> Any:
     arr = np.array(img)
@@ -182,56 +144,30 @@ def preprocess_image(pil_img: Image.Image) -> Any:
     if w < target_w:
         scale = target_w / float(w)
         pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
-    # convert to gray + CLAHE (adaptive contrast)
     cv_img = pil_to_cv2(pil_img)
     if cv_img.ndim == 3:
         gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
     else:
         gray = cv_img
-    # unsigned int conversion
-    gray = np.asarray(gray, dtype=np.uint8)
-    # CLAHE for contrast enhancement
-    try:
-        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
-        gray = clahe.apply(gray)
-    except Exception:
-        pass
-    # denoise
-    try:
-        gray = cv2.fastNlMeansDenoising(gray, h=10)
-    except Exception:
-        pass
-    # adaptive threshold
     try:
         bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 41, 15)
     except Exception:
-        _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-    # morphological operations to remove tiny noise and thin grid lines
-    kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
-    bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel_close)
-    kernel_open = cv2.getStructuringElement(cv2.MORPH_RECT, (1,1))
-    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel_open)
     return bw
-# ---------------- OCR TSV (word-level) ----------------
-OCR_CONF_THRESHOLD = 30.0  # drop tokens with confidence less than this (if provided by tesseract)
 def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
-    # pytesseract can accept numpy arrays or PIL images
     try:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
     except Exception:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
-    cells: List[Dict[str, Any]] = []
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
@@ -240,40 +176,18 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
         txt = str(raw).strip()
         if not txt:
             continue
-        # try to parse confidence (tesseract returns strings sometimes)
-        conf_raw = o.get("conf", [None]*n)[i]
         try:
-            conf = float(conf_raw) if conf_raw not in (None, "", "-1") else -1.0
         except Exception:
             conf = -1.0
-        # skip very-low-confidence tokens (reduce garbage)
-        if conf >= 0 and conf < OCR_CONF_THRESHOLD:
-            continue
-        left = int(o.get("left", [0]*n)[i])
-        top = int(o.get("top", [0]*n)[i])
-        width = int(o.get("width", [0]*n)[i])
-        height = int(o.get("height", [0]*n)[i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
-        # normalize numeric OCR artifacts inside token
-        if re.search(r"[0-9]", txt):
-            # quick fixes
-            txt = txt.replace("O", "0").replace("o", "0").replace("l", "1")
-            txt = re.sub(r"[^0-9\.\,\-\(\)]", lambda m: "" if m.group(0).isspace() else m.group(0), txt)
-        cells.append({
-            "text": txt,
-            "conf": conf,
-            "left": left,
-            "top": top,
-            "width": width,
-            "height": height,
-            "center_y": center_y,
-            "center_x": center_x
-        })
     return cells
 # ---------------- grouping & merging helpers ----------------
@@ -281,7 +195,7 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
     if not cells:
         return []
     sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
-    rows: List[List[Dict[str, Any]]] = []
     current = [sorted_cells[0]]
     last_y = sorted_cells[0]["center_y"]
     for c in sorted_cells[1:]:
@@ -299,13 +213,12 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
 def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
     if not rows:
         return rows
-    merged: List[List[Dict[str, Any]]] = []
     i = 0
     while i < len(rows):
         row = rows[i]
         tokens = [c["text"] for c in row]
         has_num = any(is_numeric_token(t) for t in tokens)
-        # Merge a full-text row with the next numeric row if appropriate
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
@@ -313,34 +226,34 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
             if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
                 merged_row = []
                 min_left = min((c["left"] for c in next_row), default=0)
-                offset = 0
                 for c in row:
                     newc = c.copy()
-                    # Shift text to left of numeric columns
-                    newc["left"] = min_left - 20 + offset
-                    newc["center_x"] = newc["left"] + newc["width"] / 2.0
                     merged_row.append(newc)
-                    offset += 8
                 merged_row.extend(next_row)
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
-        # Merge two short text-only rows (e.g. split names)
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             next_has_num = any(is_numeric_token(t) for t in next_tokens)
             if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
-                combined = row + next_row
-                min_left = min((c["left"] for c in combined), default=0)
                 merged_row = []
-                for c in combined:
                     newc = c.copy()
-                    # move slightly to align left
-                    if newc["left"] <= min_left:
-                        newc["left"] = min_left
-                    newc["center_x"] = newc["left"] + newc["width"] / 2.0
                     merged_row.append(newc)
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
@@ -350,9 +263,10 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
 # ---------------- numeric column detection ----------------
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
-    xs = sorted([c["center_x"] for c in cells if is_numeric_token(c["text"])])
     if not xs:
         return []
     if len(xs) == 1:
         return [xs[0]]
     gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
@@ -379,91 +293,6 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
     distances = [abs(token_x - cx) for cx in column_centers]
     return int(np.argmin(distances))
-# ---------------- item validation & repair ----------------
-MAX_REASONABLE_QTY = 100.0
-MAX_REASONABLE_RATE = 1_000_000.0
-def validate_and_fix_item(item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-    """
-    Ensure amount/rate/qty are reasonable. Try to fix obvious OCR-caused errors.
-    Return None if the item should be discarded as garbage.
-    """
-    # sanitize name
-    name = clean_name_text(item.get("item_name", "") or "")
-    if is_probable_garbage_name(name):
-        # reject obviously garbage names
-        return None
-    amt = item.get("item_amount", 0.0) or 0.0
-    rate = item.get("item_rate", 0.0) or 0.0
-    qty = item.get("item_quantity", 0.0) or 0.0
-    # sanity caps
-    try:
-        amt = float(amt)
-    except Exception:
-        return None
-    try:
-        rate = float(rate)
-    except Exception:
-        rate = 0.0
-    try:
-        qty = float(qty)
-    except Exception:
-        qty = 1.0
-    # If qty is ridiculously large -> likely OCR error. Reset to 1 and set rate=amount if rate invalid
-    if qty > MAX_REASONABLE_QTY:
-        logger.debug("Qty %s too large for '%s' — resetting to 1", qty, name)
-        qty = 1.0
-        if rate <= 0 or rate > amt * 10:
-            rate = amt
-    # If rate > amt but rate is extremely large -> swap/assume misplace: if rate*qty approximates amt, fine.
-    if rate > amt and qty > 0:
-        if abs(rate * qty - amt) > max(0.05 * amt, 1.0):
-            # If rate bigger than amount and doesn't fit, assume rate was missing -> set rate = amt/qty if meaningful
-            try:
-                candidate = amt / qty if qty else amt
-                if 0 < candidate <= MAX_REASONABLE_RATE:
-                    logger.debug("Adjusting rate for '%s' from %s to %s", name, rate, candidate)
-                    rate = candidate
-            except Exception:
-                pass
-    # If rate == 0 but qty>0 and amt>0 try infer simple integer ratio from numeric candidates already done upstream,
-    # fallback: set rate = amt (qty assumed 1)
-    if (rate == 0 or rate is None) and qty and qty > 0:
-        if qty == 1 or not (amt / qty).is_integer():
-            # simply compute rate
-            try:
-                candidate_rate = amt / qty
-                if candidate_rate > 0 and candidate_rate <= MAX_REASONABLE_RATE:
-                    rate = round(candidate_rate, 2)
-            except Exception:
-                rate = 0.0
-    # final sanity: negative/zero amounts dropped
-    if amt <= 0.0:
-        return None
-    if qty <= 0:
-        qty = 1.0
-    # clamp qty to reasonable
-    if qty > MAX_REASONABLE_QTY:
-        qty = 1.0
-    # Round sensible values
-    amt = float(round(amt, 2))
-    rate = float(round(rate, 2)) if rate is not None else 0.0
-    qty = float(round(qty, 3))
-    return {
-        "item_name": name,
-        "item_amount": amt,
-        "item_rate": rate,
-        "item_quantity": qty
-    }
 # ---------------- Gemini refinement (deterministic) ----------------
 def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
@@ -515,7 +344,7 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
 # ---------------- parsing rows into items (modified) ----------------
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    parsed_items: List[Dict[str, Any]] = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
@@ -524,15 +353,9 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
         if not tokens:
             continue
         joined_lower = " ".join(tokens).lower()
-        # skip obvious footers and headers
         if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
             continue
         if all(not is_numeric_token(t) for t in tokens):
-            # if a pure-text row but looks like header -> skip
-            if looks_like_header_text(joined_lower):
-                continue
-            # otherwise we may have description-only rows (handled by merge_multiline_names)
             continue
         numeric_values = []
@@ -543,7 +366,6 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                 v = normalize_num_str(t)
                 if v is not None:
                     numeric_values.append(float(v))
-        # unique sorted descending
         numeric_values = sorted({int(x) if float(x).is_integer() else x for x in numeric_values}, reverse=True)
         if column_centers:
@@ -553,7 +375,7 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                 t = c["text"]
                 if is_numeric_token(t) and not looks_like_date_num(t):
                     col_idx = assign_token_to_column(c["center_x"], column_centers)
-                    if col_idx is None or col_idx < 0:
                         numeric_bucket_map[len(column_centers) - 1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
@@ -571,16 +393,13 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             rate   = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
             qty    = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
-            # fallback: last numeric token as amount
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t) and not looks_like_date_num(t):
-                        candidate = normalize_num_str(t)
-                        if candidate is not None:
-                            amount = candidate
                             break
-            # try to infer rate & qty from numeric_values
             if amount is not None and numeric_values:
                 for cand in numeric_values:
                     try:
@@ -600,33 +419,43 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                     if r < 1 or r > 200:
                         continue
                     if abs(ratio - r) <= max(0.03 * r, 0.15):
-                        # reasonable integer quantity
-                        qty = float(r)
-                        rate = cand_float
-                        break
-            # additional fallback if rate missing but qty exists
             if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
                 try:
                     candidate_rate = amount / qty
-                    if 0 < candidate_rate <= 1e7:
                         rate = candidate_rate
                 except Exception:
                     pass
-            # default quantity = 1 if unknown
             if qty is None:
                 qty = 1.0
-            # final rounding / validation via helper
-            raw_item = {"item_name": name if name else "UNKNOWN", "item_amount": amount or 0.0,
-                        "item_rate": rate or 0.0, "item_quantity": qty or 1.0}
-            fixed = validate_and_fix_item(raw_item)
-            if fixed:
-                parsed_items.append(fixed)
-            # else skip
         else:
-            # fallback parsing if no clear numeric columns
             numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t) and not looks_like_date_num(t)]
             if not numeric_idxs:
                 continue
@@ -637,8 +466,8 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             name = " ".join(tokens[:last]).strip()
             if not name:
                 continue
-            # gather numeric candidates on the right to infer rate/qty
             right_nums = []
             for i in numeric_idxs:
                 v = normalize_num_str(tokens[i])
@@ -646,7 +475,6 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
                     right_nums.append(float(v))
             right_nums = sorted({int(x) if float(x).is_integer() else x for x in right_nums}, reverse=True)
-            rate = None; qty = None
             if len(right_nums) >= 2:
                 cand = right_nums[1]
                 if 1 < cand < float(amt):
@@ -672,21 +500,22 @@ def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[D
             if rate is None:
                 rate = 0.0
-            raw_item = {"item_name": clean_name_text(name), "item_amount": float(round(amt,2)),
-                        "item_rate": float(round(rate,2)), "item_quantity": float(qty)}
-            fixed = validate_and_fix_item(raw_item)
-            if fixed:
-                parsed_items.append(fixed)
     return parsed_items
 # ---------------- dedupe & totals ----------------
 def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     seen = set()
-    out: List[Dict[str, Any]] = []
     for it in items:
         nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
-        key = (nm[:120], round(float(it.get("item_amount", 0.0) or 0.0), 2))
         if key in seen:
             continue
         seen.add(key)
@@ -771,13 +600,10 @@ def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [],
     qty = float(item.get("item_quantity", 0) or 0)
     if qty <= 0:
         return False
-    if rate and rate > amt * 10 and amt < 10000:
         return False
     if amt <= 0.0:
         return False
-    # must contain at least one alphabetic char
-    if not re.search(r"[A-Za-z]", name):
-        return False
     return True
 # ---------------- main endpoint ----------------
@@ -824,7 +650,7 @@ async def extract_bill_data(payload: BillRequest):
             "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
         }
-    images: List[Image.Image] = []
     clean_url = doc_url.split("?", 1)[0].lower()
     try:
         if clean_url.endswith(".pdf"):
@@ -847,8 +673,6 @@ async def extract_bill_data(payload: BillRequest):
         try:
             proc = preprocess_image(page_img)
             cells = image_to_tsv_cells(proc)
-            if not cells:
-                logger.debug("No OCR cells extracted for page %s", idx)
             rows = group_cells_into_rows(cells, y_tolerance=12)
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
@@ -874,7 +698,6 @@ async def extract_bill_data(payload: BillRequest):
             parsed_items = parse_rows_with_columns(rows, cells)
-            # Use Gemini only if configured
             refined_items, token_u = refine_with_gemini(parsed_items, page_text)
             for k in cumulative_token_usage:
                 cumulative_token_usage[k] += token_u.get(k, 0)
@@ -942,5 +765,4 @@ async def run_all_samples():
         logger.exception("run_all_samples failed: %s", e)
         return {"status": "error", "error": str(e)}
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))

 from typing import List, Dict, Any, Optional, Tuple
 import uvicorn
+from fastapi import FastAPI, BackgroundTasks
 from pydantic import BaseModel
 import requests
+from PIL import Image
 from pdf2image import convert_from_bytes
 import pytesseract
 from pytesseract import Output
     r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|total)",
     re.I,
 )
+FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
 HEADER_KEYWORDS = [
     "description", "qty", "hrs", "rate", "discount", "net", "amt", "amount",
 HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
 # ---------------- small utilities ----------------
+def sanitize_ocr_text(s: str) -> str:
     if not s:
         return ""
     s = s.replace("\u2014", "-").replace("\u2013", "-")
     s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
     s = s.replace("\r\n", "\n").replace("\r", "\n")
     s = str(s).strip()
     if s == "":
         return None
     s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
     negative = False
     if s.startswith("(") and s.endswith(")"):
     return bool(t and NUM_RE.search(str(t)))
 def looks_like_date_num(s: str) -> bool:
     s_digits = re.sub(r"[^\d]", "", s or "")
     if len(s_digits) >= 7:
         if s_digits.endswith(("2025","2024","2023","2022","2026")):
             pass
     return False
 def clean_name_text(s: str) -> str:
+    s = s.replace("—", "-")
     s = re.sub(r"\s+", " ", s)
     s = s.strip(" -:,.=")
+    s = re.sub(r"\s+x$", "", s, flags=re.I)
+    s = re.sub(r"[\)\}\]]+$", "", s)
     s = re.sub(r"\bOR\b", "DR", s)
+    s = s.strip(" -:,.")
     s = s.strip()
     return s
 # ---------------- image preprocessing ----------------
 def pil_to_cv2(img: Image.Image) -> Any:
     arr = np.array(img)
     if w < target_w:
         scale = target_w / float(w)
         pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
     cv_img = pil_to_cv2(pil_img)
+    # grayscale and denoise
     if cv_img.ndim == 3:
         gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
     else:
         gray = cv_img
+    gray = cv2.fastNlMeansDenoising(gray, h=10)
     try:
         bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 41, 15)
     except Exception:
+        _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
+    kernel = np.ones((1,1), np.uint8)
+    bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
     return bw
+# ---------------- OCR TSV ----------------
 def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
+    # pytesseract expects either a PIL image or numpy array
     try:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
     except Exception:
         o = pytesseract.image_to_data(cv_img, output_type=Output.DICT)
+    cells = []
     n = len(o.get("text", []))
     for i in range(n):
         raw = o["text"][i]
         txt = str(raw).strip()
         if not txt:
             continue
         try:
+            conf = float(o["conf"][i]) if o["conf"][i] not in (None, "", "-1") else -1.0
         except Exception:
             conf = -1.0
+        left = int(o.get("left", [0])[i])
+        top = int(o.get("top", [0])[i])
+        width = int(o.get("width", [0])[i])
+        height = int(o.get("height", [0])[i])
         center_y = top + height / 2.0
         center_x = left + width / 2.0
+        cells.append({"text": txt, "conf": conf, "left": left, "top": top,
+                      "width": width, "height": height, "center_y": center_y, "center_x": center_x})
     return cells
 # ---------------- grouping & merging helpers ----------------
     if not cells:
         return []
     sorted_cells = sorted(cells, key=lambda c: (c["center_y"], c["center_x"]))
+    rows = []
     current = [sorted_cells[0]]
     last_y = sorted_cells[0]["center_y"]
     for c in sorted_cells[1:]:
 def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[str, Any]]]:
     if not rows:
         return rows
+    merged = []
     i = 0
     while i < len(rows):
         row = rows[i]
         tokens = [c["text"] for c in row]
         has_num = any(is_numeric_token(t) for t in tokens)
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             if next_has_num and len(tokens) >= 2 and len([t for t in next_tokens if not is_numeric_token(t)]) <= 3:
                 merged_row = []
                 min_left = min((c["left"] for c in next_row), default=0)
+                offset = 10
                 for c in row:
                     newc = c.copy()
+                    newc["left"] = min_left - offset
+                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
                     merged_row.append(newc)
+                    offset += 10
                 merged_row.extend(next_row)
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
         if not has_num and i + 1 < len(rows):
             next_row = rows[i+1]
             next_tokens = [c["text"] for c in next_row]
             next_has_num = any(is_numeric_token(t) for t in next_tokens)
             if not next_has_num and len(tokens) <= 3 and len(next_tokens) <= 3:
                 merged_row = []
+                min_left = min((c["left"] for c in next_row + row), default=0)
+                offset = 10
+                for c in row + next_row:
                     newc = c.copy()
+                    if newc["left"] > min_left:
+                        newc["left"] = newc["left"]
+                    else:
+                        newc["left"] = min_left - offset
+                    newc["center_x"] = newc["left"] + newc.get("width", 0) / 2.0
                     merged_row.append(newc)
+                    offset += 5
                 merged.append(sorted(merged_row, key=lambda cc: cc["left"]))
                 i += 2
                 continue
 # ---------------- numeric column detection ----------------
 def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
+    xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
     if not xs:
         return []
+    xs = sorted(xs)
     if len(xs) == 1:
         return [xs[0]]
     gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
     distances = [abs(token_x - cx) for cx in column_centers]
     return int(np.argmin(distances))
 # ---------------- Gemini refinement (deterministic) ----------------
 def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
     zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
 # ---------------- parsing rows into items (modified) ----------------
 def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    parsed_items = []
     rows = merge_multiline_names(rows)
     column_centers = detect_numeric_columns(page_cells, max_columns=4)
         if not tokens:
             continue
         joined_lower = " ".join(tokens).lower()
         if FOOTER_KEYWORDS.search(joined_lower) and not any(is_numeric_token(t) for t in tokens):
             continue
         if all(not is_numeric_token(t) for t in tokens):
             continue
         numeric_values = []
                 v = normalize_num_str(t)
                 if v is not None:
                     numeric_values.append(float(v))
         numeric_values = sorted({int(x) if float(x).is_integer() else x for x in numeric_values}, reverse=True)
         if column_centers:
                 t = c["text"]
                 if is_numeric_token(t) and not looks_like_date_num(t):
                     col_idx = assign_token_to_column(c["center_x"], column_centers)
+                    if col_idx is None:
                         numeric_bucket_map[len(column_centers) - 1].append(t)
                     else:
                         numeric_bucket_map[col_idx].append(t)
             rate   = normalize_num_str(get_bucket(num_cols - 2)) if num_cols >= 2 else None
             qty    = normalize_num_str(get_bucket(num_cols - 3)) if num_cols >= 3 else None
             if amount is None:
                 for t in reversed(tokens):
                     if is_numeric_token(t) and not looks_like_date_num(t):
+                        amount = normalize_num_str(t)
+                        if amount is not None:
                             break
             if amount is not None and numeric_values:
                 for cand in numeric_values:
                     try:
                     if r < 1 or r > 200:
                         continue
                     if abs(ratio - r) <= max(0.03 * r, 0.15):
+                        if r <= 100:
+                            rate = cand_float
+                            qty = float(r)
+                            break
             if (rate is None or rate == 0) and qty and qty != 0 and amount is not None:
                 try:
                     candidate_rate = amount / qty
+                    if candidate_rate >= 2:
                         rate = candidate_rate
                 except Exception:
                     pass
             if qty is None:
                 qty = 1.0
+            try:
+                amount = float(round(amount, 2))
+            except:
+                continue
+            try:
+                rate = float(round(rate, 2)) if rate is not None else 0.0
+            except:
+                rate = 0.0
+            try:
+                qty = float(qty)
+            except:
+                qty = 1.0
+            parsed_items.append({
+                "item_name": name if name else "UNKNOWN",
+                "item_amount": amount,
+                "item_rate": rate if rate is not None else 0.0,
+                "item_quantity": qty if qty is not None else 1.0,
+            })
         else:
             numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t) and not looks_like_date_num(t)]
             if not numeric_idxs:
                 continue
             name = " ".join(tokens[:last]).strip()
             if not name:
                 continue
+            rate = None; qty = None
             right_nums = []
             for i in numeric_idxs:
                 v = normalize_num_str(tokens[i])
                     right_nums.append(float(v))
             right_nums = sorted({int(x) if float(x).is_integer() else x for x in right_nums}, reverse=True)
             if len(right_nums) >= 2:
                 cand = right_nums[1]
                 if 1 < cand < float(amt):
             if rate is None:
                 rate = 0.0
+            parsed_items.append({
+                "item_name": clean_name_text(name),
+                "item_amount": float(round(amt, 2)),
+                "item_rate": float(round(rate, 2)),
+                "item_quantity": float(qty),
+            })
     return parsed_items
 # ---------------- dedupe & totals ----------------
 def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     seen = set()
+    out = []
     for it in items:
         nm = re.sub(r"\s+", " ", it["item_name"].lower()).strip()
+        key = (nm[:120], round(float(it["item_amount"]), 2))
         if key in seen:
             continue
         seen.add(key)
     qty = float(item.get("item_quantity", 0) or 0)
     if qty <= 0:
         return False
+    if rate and rate > amt:
         return False
     if amt <= 0.0:
         return False
     return True
 # ---------------- main endpoint ----------------
             "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
         }
+    images = []
     clean_url = doc_url.split("?", 1)[0].lower()
     try:
         if clean_url.endswith(".pdf"):
         try:
             proc = preprocess_image(page_img)
             cells = image_to_tsv_cells(proc)
             rows = group_cells_into_rows(cells, y_tolerance=12)
             rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
             parsed_items = parse_rows_with_columns(rows, cells)
             refined_items, token_u = refine_with_gemini(parsed_items, page_text)
             for k in cumulative_token_usage:
                 cumulative_token_usage[k] += token_u.get(k, 0)
         logger.exception("run_all_samples failed: %s", e)
         return {"status": "error", "error": str(e)}