Sathvik-kota committed on
Commit
80ab573
·
verified ·
1 Parent(s): 5ec4a93

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +78 -32
app.py CHANGED
@@ -1,7 +1,7 @@
1
- # app_bill_extractor_final.py
2
  # Humanized, high-accuracy bill extraction API.
3
- # Combines robust OCR preprocessing, TSV-based layout parsing, numeric-column inference,
4
- # and ALWAYS attempts Gemini refinement (if GEMINI_API_KEY set). Made compact & readable.
5
 
6
  import os
7
  import re
@@ -19,7 +19,7 @@ from pytesseract import Output
19
  import numpy as np
20
  import cv2
21
 
22
- # Optional: Google Gemini SDK (if you use it). Code will gracefully work without it.
23
  try:
24
  import google.generativeai as genai
25
  except Exception:
@@ -40,20 +40,29 @@ app = FastAPI(title="Bajaj Datathon - Bill Extractor (final, humanized)")
40
  class BillRequest(BaseModel):
41
  document: str
42
 
43
- # ---------------- Regex, small utils ----------------
44
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
45
  TOTAL_KEYWORDS = re.compile(
46
  r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
47
  re.I,
48
  )
49
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
50
- HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
51
 
52
- # sanitize OCR text before ever sending to an LLM or using it for heuristics
 
 
 
 
 
 
 
 
 
 
 
53
  def sanitize_ocr_text(s: str) -> str:
54
  if not s:
55
  return ""
56
- # unify dashes and remove odd control characters
57
  s = s.replace("\u2014", "-").replace("\u2013", "-")
58
  s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
59
  s = s.replace("\r\n", "\n").replace("\r", "\n")
@@ -102,7 +111,6 @@ def pil_to_cv2(img: Image.Image) -> Any:
102
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
103
 
104
  def preprocess_image(pil_img: Image.Image) -> Any:
105
- # quick, robust steps: upscale small images, grayscale, denoise, adaptive threshold
106
  pil_img = pil_img.convert("RGB")
107
  w, h = pil_img.size
108
  target_w = 1500
@@ -120,7 +128,7 @@ def preprocess_image(pil_img: Image.Image) -> Any:
120
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
121
  return bw
122
 
123
- # ---------------- OCR TSV helpers ----------------
124
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
125
  try:
126
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
@@ -148,7 +156,7 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
148
  cells.append({"text": txt, "conf": conf, "left": left, "top": top, "width": width, "height": height, "center_y": center_y, "center_x": center_x})
149
  return cells
150
 
151
- # ---------------- grouping & merging ----------------
152
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
153
  if not cells:
154
  return []
@@ -199,7 +207,7 @@ def merge_multiline_names(rows: List[List[Dict[str, Any]]]) -> List[List[Dict[st
199
  i += 1
200
  return merged
201
 
202
- # ---------------- numeric column detection ----------------
203
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
204
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
205
  if not xs:
@@ -207,19 +215,21 @@ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) ->
207
  xs = sorted(xs)
208
  if len(xs) == 1:
209
  return [xs[0]]
 
 
 
210
  gaps = [xs[i+1] - xs[i] for i in range(len(xs) - 1)]
211
- mean_gap = float(np.mean(gaps))
212
- std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
213
- gap_thresh = max(30.0, mean_gap + 0.6 * std_gap)
214
  clusters = []
215
  curr = [xs[0]]
216
  for i, g in enumerate(gaps):
217
- if g > gap_thresh and len(clusters) < (max_columns - 1):
218
  clusters.append(curr)
219
  curr = [xs[i+1]]
220
  else:
221
  curr.append(xs[i+1])
222
  clusters.append(curr)
 
223
  centers = [float(np.median(c)) for c in clusters]
224
  if len(centers) > max_columns:
225
  centers = centers[-max_columns:]
@@ -231,7 +241,7 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
231
  distances = [abs(token_x - cx) for cx in column_centers]
232
  return int(np.argmin(distances))
233
 
234
- # ---------------- parse rows into items ----------------
235
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
236
  parsed_items = []
237
  rows = merge_multiline_names(rows)
@@ -369,29 +379,33 @@ def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Optional[flo
369
  if final is None: final = float(round(v, 2))
370
  return {"subtotal": subtotal, "final_total": final}
371
 
372
- # ---------------- Gemini refinement (always attempted) ----------------
373
  def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
374
  zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
375
  if not GEMINI_API_KEY or genai is None:
376
  return page_items, zero_usage
377
  try:
378
  safe_text = sanitize_ocr_text(page_text)
379
- system = (
380
- "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no text) of objects with keys "
381
- "item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
382
- "Do NOT return totals or subtotals as items. Do not invent items. Fix broken names and numeric mismatches."
383
  )
384
- # small few-shot example to anchor the model
385
- few_shot = (
386
- "# EXAMPLE\nitems = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0}]\n"
387
- "=> [{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n"
 
 
 
 
 
388
  )
389
- prompt = f"page_text='''{safe_text}'''\nitems = {json.dumps(page_items, ensure_ascii=False)}\n\n{few_shot}\nReturn only a JSON array."
390
  model = genai.GenerativeModel(GEMINI_MODEL_NAME)
391
  response = model.generate_content(
392
  [
393
- {"role": "system", "parts": [system]},
394
- {"role": "user", "parts": [prompt]},
395
  ],
396
  temperature=0.0,
397
  max_output_tokens=1000,
@@ -413,6 +427,7 @@ def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") ->
413
  })
414
  except Exception:
415
  continue
 
416
  return cleaned, zero_usage
417
  return page_items, zero_usage
418
  except Exception:
@@ -423,6 +438,9 @@ def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
423
  if not txt:
424
  return False
425
  t = re.sub(r"\s+", " ", txt.strip().lower())
 
 
 
426
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
427
  if hits >= 2:
428
  return True
@@ -438,12 +456,12 @@ def looks_like_header_text(txt: str, top_of_page: bool = False) -> bool:
438
  return True
439
  return False
440
 
441
-
442
  def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = []) -> bool:
443
  name = (item.get("item_name") or "").strip()
444
  if not name:
445
  return False
446
  ln = name.lower()
 
447
  for h in known_page_headers:
448
  if h and h.strip() and h.strip().lower() in ln:
449
  return False
@@ -455,6 +473,10 @@ def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = [])
455
  return False
456
  if re.fullmatch(r"(charge|charges|services|laboratory|lab|consultation)", ln.strip(), re.I):
457
  return False
 
 
 
 
458
  if float(item.get("item_amount", 0)) <= 0.0:
459
  return False
460
  rate = float(item.get("item_rate", 0) or 0)
@@ -499,25 +521,48 @@ async def extract_bill_data(payload: BillRequest):
499
  proc = preprocess_image(page_img)
500
  cells = image_to_tsv_cells(proc)
501
  rows = group_cells_into_rows(cells, y_tolerance=12)
502
- rows_texts = [" ".join([c["text"] for c in r]) for r in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  top_headers = []
504
  for i, rt in enumerate(rows_texts[:6]):
505
  if looks_like_header_text(rt, top_of_page=(i < 4)):
506
  top_headers.append(rt.strip().lower())
 
507
  parsed_items = parse_rows_with_columns(rows, cells)
508
- page_text = sanitize_ocr_text(" ".join(rows_texts))
 
509
  refined_items, token_u = refine_with_gemini(parsed_items, page_text)
510
  for k in cumulative_token_usage:
511
  cumulative_token_usage[k] += token_u.get(k, 0)
 
 
512
  cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers)]
513
  cleaned = dedupe_items(cleaned)
514
  cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
 
515
  page_type = "Bill Detail"
516
  page_txt = page_text.lower()
517
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
518
  page_type = "Pharmacy"
519
  if "final bill" in page_txt or "grand total" in page_txt:
520
  page_type = "Final Bill"
 
521
  pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
522
  except Exception:
523
  pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
@@ -526,6 +571,7 @@ async def extract_bill_data(payload: BillRequest):
526
  total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
527
  if not GEMINI_API_KEY or genai is None:
528
  cumulative_token_usage["warning_no_gemini"] = 1
 
529
  return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
530
 
531
  # ---------------- debug TSV ----------------
 
1
+ # app_bill_extractor_final_v2.py
2
  # Humanized, high-accuracy bill extraction API.
3
+ # Robust OCR preprocessing, TSV layout parsing, numeric-column inference,
4
+ # header prefiltering, deterministic Gemini refinement (if configured).
5
 
6
  import os
7
  import re
 
19
  import numpy as np
20
  import cv2
21
 
22
+ # Optional: Google Gemini SDK (if available)
23
  try:
24
  import google.generativeai as genai
25
  except Exception:
 
40
  class BillRequest(BaseModel):
41
  document: str
42
 
43
+ # ---------------- Regex and keywords ----------------
44
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
45
  TOTAL_KEYWORDS = re.compile(
46
  r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal|round\s*off)",
47
  re.I,
48
  )
49
  FOOTER_KEYWORDS = re.compile(r"(page|printed on|printed:|date:|time:|am|pm)", re.I)
 
50
 
51
# generalized header-related tokens & exact header phrase blacklist (common variants)
# HEADER_KEYWORDS: loose tokens counted per-row to score "header-likeness".
HEADER_KEYWORDS = ["description", "qty", "hrs", "rate", "discount", "net", "amt", "amount", "consultation", "qty/hrs", "qty / hrs"]
# HEADER_PHRASES: exact, whitespace-normalized header lines seen across bill
# layouts; rows matching one of these are dropped outright.
# NOTE: deduplicated — "description qty / hrs rate discount net amt" was
# listed twice, which added a redundant scan to every any()/membership check.
HEADER_PHRASES = [
    "description qty / hrs consultation rate discount net amt",
    "description qty / hrs rate discount net amt",
    "description qty / hrs rate net amt",
    "description qty hrs rate discount net amt",
]
# Defensive lowercasing so later case-insensitive comparisons stay correct
# even if a mixed-case variant is added above.
HEADER_PHRASES = [h.lower() for h in HEADER_PHRASES]
62
+ # ---------------- small utilities ----------------
63
  def sanitize_ocr_text(s: str) -> str:
64
  if not s:
65
  return ""
 
66
  s = s.replace("\u2014", "-").replace("\u2013", "-")
67
  s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
68
  s = s.replace("\r\n", "\n").replace("\r", "\n")
 
111
  return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
112
 
113
  def preprocess_image(pil_img: Image.Image) -> Any:
 
114
  pil_img = pil_img.convert("RGB")
115
  w, h = pil_img.size
116
  target_w = 1500
 
128
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
129
  return bw
130
 
131
+ # ---------------- OCR TSV ----------------
132
  def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
133
  try:
134
  o = pytesseract.image_to_data(cv_img, output_type=Output.DICT, config="--psm 6")
 
156
  cells.append({"text": txt, "conf": conf, "left": left, "top": top, "width": width, "height": height, "center_y": center_y, "center_x": center_x})
157
  return cells
158
 
159
+ # ---------------- grouping & merge helpers ----------------
160
  def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) -> List[List[Dict[str, Any]]]:
161
  if not cells:
162
  return []
 
207
  i += 1
208
  return merged
209
 
210
+ # ---------------- numeric column detection (conservative) ----------------
211
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 4) -> List[float]:
212
  xs = [c["center_x"] for c in cells if is_numeric_token(c["text"])]
213
  if not xs:
 
215
  xs = sorted(xs)
216
  if len(xs) == 1:
217
  return [xs[0]]
218
+
219
+ # Conservative min gap to avoid merging separate numeric columns
220
+ min_gap_px = 50.0
221
  gaps = [xs[i+1] - xs[i] for i in range(len(xs) - 1)]
222
+
 
 
223
  clusters = []
224
  curr = [xs[0]]
225
  for i, g in enumerate(gaps):
226
+ if g >= min_gap_px:
227
  clusters.append(curr)
228
  curr = [xs[i+1]]
229
  else:
230
  curr.append(xs[i+1])
231
  clusters.append(curr)
232
+
233
  centers = [float(np.median(c)) for c in clusters]
234
  if len(centers) > max_columns:
235
  centers = centers[-max_columns:]
 
241
  distances = [abs(token_x - cx) for cx in column_centers]
242
  return int(np.argmin(distances))
243
 
244
+ # ---------------- parsing rows into items ----------------
245
  def parse_rows_with_columns(rows: List[List[Dict[str, Any]]], page_cells: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
246
  parsed_items = []
247
  rows = merge_multiline_names(rows)
 
379
  if final is None: final = float(round(v, 2))
380
  return {"subtotal": subtotal, "final_total": final}
381
 
382
+ # ---------------- Gemini refinement (deterministic) ----------------
383
  def refine_with_gemini(page_items: List[Dict[str, Any]], page_text: str = "") -> Tuple[List[Dict[str, Any]], Dict[str, int]]:
384
  zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
385
  if not GEMINI_API_KEY or genai is None:
386
  return page_items, zero_usage
387
  try:
388
  safe_text = sanitize_ocr_text(page_text)
389
+ system_prompt = (
390
+ "You are a strict bill-extraction cleaner. Return ONLY a JSON array (no explanation, no backticks). "
391
+ "Each entry must be an object with keys: item_name (string), item_amount (float), item_rate (float), item_quantity (float). "
392
+ "Do NOT include subtotal or total lines as items. Do not invent items; only clean/fix/normalize the given items."
393
  )
394
+ user_prompt = (
395
+ f"page_text='''{safe_text}'''\n"
396
+ f"items = {json.dumps(page_items, ensure_ascii=False)}\n\n"
397
+ "Example:\n"
398
+ "items = [{'item_name':'Consultation Charge | DR PREETHI','item_amount':300.0,'item_rate':0.0,'item_quantity':300.0},\n"
399
+ " {'item_name':'Description Qty / Hrs Consultation Rate Discount Net Amt','item_amount':1950.0,'item_rate':1950.0,'item_quantity':1.0}]\n"
400
+ "=>\n"
401
+ "[{'item_name':'Consultation Charge | DR PREETHI MARY JOSEPH','item_amount':300.0,'item_rate':300.0,'item_quantity':1.0}]\n\n"
402
+ "Return only the cleaned JSON array of items."
403
  )
 
404
  model = genai.GenerativeModel(GEMINI_MODEL_NAME)
405
  response = model.generate_content(
406
  [
407
+ {"role": "system", "parts": [system_prompt]},
408
+ {"role": "user", "parts": [user_prompt]},
409
  ],
410
  temperature=0.0,
411
  max_output_tokens=1000,
 
427
  })
428
  except Exception:
429
  continue
430
+ # token usage info not reliably extracted here — return zeros
431
  return cleaned, zero_usage
432
  return page_items, zero_usage
433
  except Exception:
 
438
  if not txt:
439
  return False
440
  t = re.sub(r"\s+", " ", txt.strip().lower())
441
+ # exact phrase blacklist
442
+ if any(h == t for h in HEADER_PHRASES):
443
+ return True
444
  hits = sum(1 for k in HEADER_KEYWORDS if k in t)
445
  if hits >= 2:
446
  return True
 
456
  return True
457
  return False
458
 
 
459
  def final_item_filter(item: Dict[str, Any], known_page_headers: List[str] = []) -> bool:
460
  name = (item.get("item_name") or "").strip()
461
  if not name:
462
  return False
463
  ln = name.lower()
464
+ # exact match against detected headers
465
  for h in known_page_headers:
466
  if h and h.strip() and h.strip().lower() in ln:
467
  return False
 
473
  return False
474
  if re.fullmatch(r"(charge|charges|services|laboratory|lab|consultation)", ln.strip(), re.I):
475
  return False
476
+ # drop obvious section/subtotal labels (but allow items like 'ANAES. CHARGE' which contain a dot)
477
+ if len(name.split()) <= 4 and re.search(r"\b(charges|services|room|radiology|laborat|surgery|procedure)\b", ln):
478
+ if "." not in name and not re.search(r"\b[A-Z]{2,}\b", name):
479
+ return False
480
  if float(item.get("item_amount", 0)) <= 0.0:
481
  return False
482
  rate = float(item.get("item_rate", 0) or 0)
 
521
  proc = preprocess_image(page_img)
522
  cells = image_to_tsv_cells(proc)
523
  rows = group_cells_into_rows(cells, y_tolerance=12)
524
+ rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
525
+
526
+ # === HEADER PREFILTER: remove header-like rows anywhere on page ===
527
+ rows_filtered = []
528
+ for i, (r, rt) in enumerate(zip(rows, rows_texts)):
529
+ top_flag = (i < 6)
530
+ rt_norm = sanitize_ocr_text(rt).lower()
531
+ if looks_like_header_text(rt_norm, top_of_page=top_flag):
532
+ continue
533
+ if any(h in rt_norm for h in HEADER_PHRASES):
534
+ continue
535
+ rows_filtered.append(r)
536
+ # recompute row texts and a simple page_text
537
+ rows = rows_filtered
538
+ rows_texts = [" ".join([c["text"] for c in r]).strip() for r in rows]
539
+ page_text = sanitize_ocr_text(" ".join(rows_texts))
540
+
541
+ # detect page-level top headers (for final filtering)
542
  top_headers = []
543
  for i, rt in enumerate(rows_texts[:6]):
544
  if looks_like_header_text(rt, top_of_page=(i < 4)):
545
  top_headers.append(rt.strip().lower())
546
+
547
  parsed_items = parse_rows_with_columns(rows, cells)
548
+
549
+ # ALWAYS attempt Gemini refinement if available (deterministic settings)
550
  refined_items, token_u = refine_with_gemini(parsed_items, page_text)
551
  for k in cumulative_token_usage:
552
  cumulative_token_usage[k] += token_u.get(k, 0)
553
+
554
+ # final cleaning & dedupe
555
  cleaned = [p for p in refined_items if final_item_filter(p, known_page_headers=top_headers)]
556
  cleaned = dedupe_items(cleaned)
557
  cleaned = [p for p in cleaned if not looks_like_header_text(p["item_name"].lower())]
558
+
559
  page_type = "Bill Detail"
560
  page_txt = page_text.lower()
561
  if any(x in page_txt for x in ["pharmacy", "medicine", "tablet"]):
562
  page_type = "Pharmacy"
563
  if "final bill" in page_txt or "grand total" in page_txt:
564
  page_type = "Final Bill"
565
+
566
  pagewise.append({"page_no": str(idx), "page_type": page_type, "bill_items": cleaned})
567
  except Exception:
568
  pagewise.append({"page_no": str(idx), "page_type": "Bill Detail", "bill_items": []})
 
571
  total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise)
572
  if not GEMINI_API_KEY or genai is None:
573
  cumulative_token_usage["warning_no_gemini"] = 1
574
+
575
  return {"is_success": True, "token_usage": cumulative_token_usage, "data": {"pagewise_line_items": pagewise, "total_item_count": total_item_count}}
576
 
577
  # ---------------- debug TSV ----------------