Sathvik-kota committed on
Commit
d246b8c
·
verified ·
1 Parent(s): e568983

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +41 -188
app.py CHANGED
@@ -1,22 +1,10 @@
1
- # Enhanced Bill Extraction API
2
- # Designed for Bajaj Datathon: accurate line item + subtotal + total extraction
3
- #
4
- # Key improvements:
5
- # 1. Explicit subtotal/total detection and preservation
6
- # 2. Double-count prevention via fingerprinting
7
- # 3. Item-sum vs bill-total validation
8
- # 4. Confidence scoring and anomaly detection
9
- # 5. Enhanced preprocessing for table structures
10
- # 6. Gemini-powered structural validation
11
-
12
  import os
13
  import re
14
  import json
15
  import logging
16
  from io import BytesIO
17
- from typing import List, Dict, Any, Optional, Tuple, Set
18
- from dataclasses import dataclass, asdict
19
- from collections import defaultdict
20
 
21
  from fastapi import FastAPI
22
  from pydantic import BaseModel
@@ -40,29 +28,15 @@ try:
40
  except Exception:
41
  vision = None
42
 
43
- try:
44
- import google.generativeai as genai
45
- except Exception:
46
- genai = None
47
-
48
  # -------------------------------------------------------------------------
49
  # Configuration
50
  # -------------------------------------------------------------------------
51
  OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
52
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
53
- GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.0-flash")
54
  AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
55
  TESSERACT_PSM = os.getenv("TESSERACT_PSM", "6")
56
 
57
  logging.basicConfig(level=logging.INFO)
58
- logger = logging.getLogger("bill-extractor-v2")
59
-
60
# Configure the Gemini SDK once at import time, but only when both the
# API key and the (optionally installed) library are present.  Failures
# are logged and swallowed so the rest of the service still starts.
if GEMINI_API_KEY and genai is not None:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
        logger.info("Gemini configured")
    except Exception as e:
        logger.warning("Gemini config failed: %s", e)
66
 
67
  # Lazy clients
68
  _textract_client = None
@@ -85,7 +59,7 @@ def vision_client():
85
  return _vision_client
86
 
87
  # -------------------------------------------------------------------------
88
- # Data Models
89
  # -------------------------------------------------------------------------
90
  @dataclass
91
  class BillLineItem:
@@ -94,15 +68,19 @@ class BillLineItem:
94
  item_quantity: float = 1.0
95
  item_rate: float = 0.0
96
  item_amount: float = 0.0
97
- confidence: float = 1.0 # 0-1 confidence score
98
- source_row: str = "" # raw OCR text for debugging
99
- is_description_continuation: bool = False # multi-line item flag
 
100
 
101
  def to_dict(self) -> Dict[str, Any]:
102
- d = asdict(self)
103
- d.pop("source_row", None) # exclude raw text from output
104
- d.pop("is_description_continuation", None)
105
- return d
 
 
 
106
 
107
  @dataclass
108
  class BillTotal:
@@ -119,26 +97,25 @@ class BillTotal:
119
  class ExtractedPage:
120
  """Page-level extraction result"""
121
  page_no: int
122
- page_type: str # "Bill Detail", "Header", "Footer", etc.
123
  line_items: List[BillLineItem]
124
  bill_totals: BillTotal
125
- page_confidence: float = 1.0
126
 
127
  def to_dict(self) -> Dict[str, Any]:
 
128
  return {
129
  "page_no": self.page_no,
130
  "page_type": self.page_type,
131
  "line_items": [item.to_dict() for item in self.line_items],
132
  "bill_totals": self.bill_totals.to_dict(),
133
- "page_confidence": round(self.page_confidence, 3),
134
  }
135
 
136
  # -------------------------------------------------------------------------
137
- # Regular Expressions (Enhanced)
138
  # -------------------------------------------------------------------------
139
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
140
 
141
- # Total/Subtotal keywords (improved detection)
142
  TOTAL_KEYWORDS = re.compile(
143
  r"\b(grand\s+total|net\s+payable|total\s+(?:amount|due)|amount\s+payable|bill\s+amount|"
144
  r"final\s+(?:amount|total)|balance\s+due|amount\s+due|total\s+payable|payable)\b",
@@ -164,22 +141,20 @@ FOOTER_KEYWORDS = re.compile(
164
  HEADER_KEYWORDS = [
165
  "description", "qty", "qty/hrs", "hrs", "rate", "unit price", "discount",
166
  "net", "amt", "amount", "price", "total", "sl.no", "s.no", "item", "service",
167
- "consultation", "patient", "invoice", "bill", "charges"
168
  ]
169
 
170
  # -------------------------------------------------------------------------
171
  # Text Cleaning & Normalization
172
  # -------------------------------------------------------------------------
173
def sanitize_ocr_text(s: Optional[str]) -> str:
    """Normalize raw OCR output into clean, printable ASCII text.

    Replaces unicode dashes and non-breaking spaces, strips other
    non-printable/non-ASCII characters, normalizes line endings,
    collapses runs of horizontal whitespace, and corrects a few common
    OCR misreads ("qiy" -> "qty", "deseription" -> "description").
    Returns "" for None or empty input.
    """
    if not s:
        return ""
    # Em/en dash -> ASCII hyphen; NBSP -> ordinary space.
    for src, dst in (("\u2014", "-"), ("\u2013", "-"), ("\u00A0", " ")):
        s = s.replace(src, dst)
    # Blank out anything outside tab/LF/CR and printable ASCII.
    s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
    # Normalize Windows/old-Mac line endings to "\n".
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse spaces/tabs into a single space (newlines untouched).
    s = re.sub(r"[ \t]+", " ", s)
    # Fix frequent OCR character confusions (case-insensitive, so the
    # replacement also lowercases e.g. "QTY" -> "qty" as before).
    s = re.sub(r"\b(qiy|qty|oty|gty)\b", "qty", s, flags=re.I)
    s = re.sub(r"\b(deseription|descriptin|desription)\b", "description", s, flags=re.I)
    return s.strip()
@@ -192,13 +167,11 @@ def normalize_num_str(s: Optional[str], allow_zero: bool = False) -> Optional[fl
192
  if s == "":
193
  return None
194
 
195
- # Handle parentheses (negative indicator)
196
  negative = False
197
  if s.startswith("(") and s.endswith(")"):
198
  negative = True
199
  s = s[1:-1]
200
 
201
- # Remove non-numeric chars except decimal/comma
202
  s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
203
  s = s.replace(",", "")
204
 
@@ -223,7 +196,7 @@ def clean_item_name(s: str) -> str:
223
  s = s.replace("—", "-").replace("–", "-")
224
  s = re.sub(r"\s+", " ", s)
225
  s = s.strip(" -:,.=()[]{}|\\")
226
- s = re.sub(r"\bOR\b", "DR", s) # OCR OR -> DR
227
  return s.strip()
228
 
229
  # -------------------------------------------------------------------------
@@ -236,27 +209,20 @@ def item_fingerprint(item: BillLineItem) -> Tuple[str, float]:
236
  return (name_norm, amount_rounded)
237
 
238
  def dedupe_items_advanced(items: List[BillLineItem]) -> List[BillLineItem]:
239
- """
240
- Remove duplicates while preserving highest-confidence versions.
241
- Handles multi-line descriptions by checking sequential items.
242
- """
243
  if not items:
244
  return []
245
 
246
- # Remove exact duplicates (same fingerprint)
247
  seen: Dict[Tuple, BillLineItem] = {}
248
  for item in items:
249
  fp = item_fingerprint(item)
250
  if fp not in seen or item.confidence > seen[fp].confidence:
251
  seen[fp] = item
252
 
253
- # Remove high-similarity continuation rows (likely description wrapping)
254
  final = []
255
  for item in seen.values():
256
  if item.is_description_continuation:
257
- # Check if very similar to previous item
258
  if final and abs(float(final[-1].item_amount) - float(item.item_amount)) < 0.01:
259
- # Likely continuation; merge
260
  final[-1].item_name = (final[-1].item_name + " " + item.item_name).strip()
261
  continue
262
  final.append(item)
@@ -266,27 +232,24 @@ def dedupe_items_advanced(items: List[BillLineItem]) -> List[BillLineItem]:
266
  # -------------------------------------------------------------------------
267
  # Total/Subtotal Detection
268
  # -------------------------------------------------------------------------
 
 
 
 
 
 
269
  def detect_totals_in_rows(rows: List[List[Dict[str, Any]]]) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
270
- """
271
- Scan rows for subtotal, tax, discount, final total.
272
- Returns: (subtotal, tax, discount, final_total)
273
- """
274
  subtotal = None
275
  tax = None
276
  discount = None
277
  final_total = None
278
 
279
- rows_text = []
280
  for row in rows:
281
  row_text = " ".join([c["text"] for c in row])
282
- rows_text.append((row_text, row))
283
-
284
- # Scan for keywords
285
- for row_text, row in rows_text:
286
  row_lower = row_text.lower()
287
  tokens = row_text.split()
288
 
289
- # Extract number from row
290
  amounts = []
291
  for t in tokens:
292
  if is_numeric_token(t):
@@ -297,10 +260,8 @@ def detect_totals_in_rows(rows: List[List[Dict[str, Any]]]) -> Tuple[Optional[fl
297
  if not amounts:
298
  continue
299
 
300
- # Use rightmost/largest amount typically
301
  amount = max(amounts)
302
 
303
- # Keyword matching
304
  if FINAL_TOTAL_KEYWORDS.search(row_lower):
305
  final_total = amount
306
  elif SUBTOTAL_KEYWORDS.search(row_lower):
@@ -312,17 +273,10 @@ def detect_totals_in_rows(rows: List[List[Dict[str, Any]]]) -> Tuple[Optional[fl
312
 
313
  return subtotal, tax, discount, final_total
314
 
315
- FINAL_TOTAL_KEYWORDS = re.compile(
316
- r"\b(grand\s+total|final\s+(?:total|amount)|total\s+(?:due|payable|amount)|"
317
- r"net\s+payable|amount\s+(?:due|payable)|balance\s+due|payable)\b",
318
- re.I
319
- )
320
-
321
  # -------------------------------------------------------------------------
322
  # Image Preprocessing
323
  # -------------------------------------------------------------------------
324
  def pil_to_cv2(img: Image.Image) -> Any:
325
- """Convert PIL to OpenCV"""
326
  arr = np.array(img)
327
  if arr.ndim == 2:
328
  return arr
@@ -333,30 +287,25 @@ def preprocess_image_for_tesseract(pil_img: Image.Image, target_w: int = 1500) -
333
  pil_img = pil_img.convert("RGB")
334
  w, h = pil_img.size
335
 
336
- # Upscale if too small
337
  if w < target_w:
338
  scale = target_w / float(w)
339
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
340
 
341
  cv_img = pil_to_cv2(pil_img)
342
 
343
- # Grayscale
344
  if cv_img.ndim == 3:
345
  gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
346
  else:
347
  gray = cv_img
348
 
349
- # Denoise
350
  gray = cv2.fastNlMeansDenoising(gray, h=10)
351
 
352
- # Adaptive thresholding (better for tables with shadows)
353
  try:
354
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
355
  cv2.THRESH_BINARY, 41, 15)
356
  except Exception:
357
  _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
358
 
359
- # Morphological cleanup
360
  kernel = np.ones((2, 2), np.uint8)
361
  bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
362
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
@@ -395,7 +344,7 @@ def image_to_tsv_cells(cv_img: Any) -> List[Dict[str, Any]]:
395
 
396
  cells.append({
397
  "text": txt,
398
- "conf": max(0.0, conf) / 100.0, # normalize to 0-1
399
  "left": left, "top": top, "width": width, "height": height,
400
  "center_x": center_x, "center_y": center_y
401
  })
@@ -427,7 +376,7 @@ def group_cells_into_rows(cells: List[Dict[str, Any]], y_tolerance: int = 12) ->
427
  return rows
428
 
429
  # -------------------------------------------------------------------------
430
- # Column Detection (Enhanced)
431
  # -------------------------------------------------------------------------
432
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
433
  """Detect x-positions of numeric columns"""
@@ -439,7 +388,6 @@ def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) ->
439
  if len(xs) == 1:
440
  return xs
441
 
442
- # Cluster columns by gap analysis
443
  gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
444
  mean_gap = float(np.mean(gaps))
445
  std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
@@ -469,34 +417,28 @@ def assign_token_to_column(token_x: float, column_centers: List[float]) -> Optio
469
  return int(np.argmin(distances))
470
 
471
  # -------------------------------------------------------------------------
472
- # Row Parsing (Enhanced for accuracy)
473
  # -------------------------------------------------------------------------
474
  def parse_rows_with_columns(
475
  rows: List[List[Dict[str, Any]]],
476
  page_cells: List[Dict[str, Any]],
477
  page_text: str = ""
478
  ) -> List[BillLineItem]:
479
- """
480
- Parse rows into line items with improved accuracy.
481
- Handles multi-line descriptions and uncertain quantities.
482
- """
483
  items = []
484
  column_centers = detect_numeric_columns(page_cells, max_columns=6)
485
 
486
- for row_idx, row in enumerate(rows):
487
  tokens = [c["text"] for c in row]
488
  row_text = " ".join(tokens)
489
  row_lower = row_text.lower()
490
 
491
- # Skip footers/headers
492
  if FOOTER_KEYWORDS.search(row_lower) and not any(is_numeric_token(t) for t in tokens):
493
  continue
494
 
495
- # Require at least one numeric token
496
  if not any(is_numeric_token(t) for t in tokens):
497
  continue
498
 
499
- # Extract amounts
500
  numeric_values = []
501
  for t in tokens:
502
  if is_numeric_token(t):
@@ -509,7 +451,6 @@ def parse_rows_with_columns(
509
 
510
  numeric_values = sorted(list(set(numeric_values)), reverse=True)
511
 
512
- # Column-based parsing
513
  if column_centers:
514
  left_text_parts = []
515
  numeric_buckets = {i: [] for i in range(len(column_centers))}
@@ -530,13 +471,11 @@ def parse_rows_with_columns(
530
  item_name = " ".join(left_text_parts).strip()
531
  item_name = clean_item_name(item_name) if item_name else "UNKNOWN"
532
 
533
- # Extract from columns (right-most is typically amount)
534
  num_cols = len(column_centers)
535
  amount = None
536
  rate = None
537
  qty = None
538
 
539
- # Try rightmost column first (usually total amount)
540
  if num_cols >= 1:
541
  bucket = numeric_buckets.get(num_cols - 1, [])
542
  if bucket:
@@ -544,25 +483,21 @@ def parse_rows_with_columns(
544
  amount = normalize_num_str(amt_str, allow_zero=False)
545
 
546
  if amount is None:
547
- # Fallback: take largest numeric value
548
  for v in numeric_values:
549
  if v > 0:
550
  amount = v
551
  break
552
 
553
- # Try second-to-right for rate
554
  if num_cols >= 2:
555
  bucket = numeric_buckets.get(num_cols - 2, [])
556
  if bucket:
557
  rate = normalize_num_str(bucket[-1][0], allow_zero=False)
558
 
559
- # Try third-to-right for quantity
560
  if num_cols >= 3:
561
  bucket = numeric_buckets.get(num_cols - 3, [])
562
  if bucket:
563
  qty = normalize_num_str(bucket[-1][0], allow_zero=False)
564
 
565
- # Smart qty/rate inference
566
  if amount and not qty and not rate and numeric_values:
567
  for cand in numeric_values:
568
  if cand <= 0.1 or cand >= amount:
@@ -574,7 +509,6 @@ def parse_rows_with_columns(
574
  rate = cand
575
  break
576
 
577
- # Derive missing values
578
  if qty and rate is None and amount and amount != 0:
579
  rate = amount / qty
580
  elif rate and qty is None and amount and amount != 0:
@@ -582,7 +516,6 @@ def parse_rows_with_columns(
582
  elif amount and qty and rate is None:
583
  rate = amount / qty if qty != 0 else 0.0
584
 
585
- # Defaults
586
  if qty is None:
587
  qty = 1.0
588
  if rate is None:
@@ -590,7 +523,6 @@ def parse_rows_with_columns(
590
  if amount is None:
591
  amount = qty * rate if qty and rate else 0.0
592
 
593
- # Finalize
594
  if amount > 0:
595
  confidence = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85
596
  items.append(BillLineItem(
@@ -602,7 +534,6 @@ def parse_rows_with_columns(
602
  source_row=row_text,
603
  ))
604
  else:
605
- # Fallback: simple parsing without columns
606
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
607
  if not numeric_idxs:
608
  continue
@@ -628,45 +559,10 @@ def parse_rows_with_columns(
628
  return items
629
 
630
  # -------------------------------------------------------------------------
631
- # Accuracy Validation
632
- # -------------------------------------------------------------------------
633
def validate_totals(
    line_items: List[BillLineItem],
    bill_totals: BillTotal,
    tolerance_pct: float = 2.0
) -> Tuple[float, str]:
    """
    Validate extracted items sum vs bill total.

    Compares the sum of extracted line-item amounts against the detected
    final total (when present) and scores how closely they agree.

    Args:
        line_items: extracted bill line items (item_amount is summed).
        bill_totals: detected totals; only final_total_amount is used here.
        tolerance_pct: max percentage difference still counted as a match.

    Returns: (accuracy_score 0-100, validation_msg)
    """
    if not line_items:
        return 0.0, "No line items extracted"

    items_sum = sum(item.item_amount for item in line_items)

    # If we detected a final total, compare
    if bill_totals.final_total_amount is not None:
        final_total = bill_totals.final_total_amount
        diff = abs(items_sum - final_total)
        # Bug fix: previously a zero (or negative) final_total forced
        # diff_pct to 0, reporting a perfect match even when items_sum
        # was nonzero.  Use abs() in the denominator and treat a zero
        # stated total with nonzero items as a full mismatch.
        if final_total != 0:
            diff_pct = diff / abs(final_total) * 100
        elif diff == 0:
            diff_pct = 0.0
        else:
            diff_pct = 100.0

        if diff_pct <= tolerance_pct:
            score = 100.0
            msg = f"✓ Extracted total ({items_sum:.2f}) matches bill total ({final_total:.2f})"
        else:
            # Scale score based on how close
            score = max(0.0, 100.0 - (diff_pct * 5))
            msg = f"⚠ Mismatch: items_sum={items_sum:.2f}, bill_total={final_total:.2f}, diff={diff_pct:.1f}%"

        return score, msg

    return 85.0, f"No bill total detected; items_sum={items_sum:.2f}"
664
-
665
- # -------------------------------------------------------------------------
666
- # Main OCR Pipelines (Tesseract)
667
  # -------------------------------------------------------------------------
668
  def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
669
- """Enhanced Tesseract pipeline"""
670
  pages_out = []
671
 
672
  try:
@@ -681,36 +577,28 @@ def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
681
 
682
  for idx, pil_img in enumerate(images, start=1):
683
  try:
684
- # Preprocess & extract
685
  proc = preprocess_image_for_tesseract(pil_img)
686
  cells = image_to_tsv_cells(proc)
687
  rows = group_cells_into_rows(cells, y_tolerance=12)
688
 
689
- # Get page text
690
  page_text = " ".join([" ".join([c["text"] for c in r]) for r in rows])
691
 
692
- # Detect totals early
693
  subtotal, tax, discount, final_total = detect_totals_in_rows(rows)
694
 
695
- # Parse line items
696
  items = parse_rows_with_columns(rows, cells, page_text)
697
 
698
- # Deduplicate
699
  items = dedupe_items_advanced(items)
700
 
701
- # Filter (exclude totals/subtotals)
702
  filtered_items = []
703
  for item in items:
704
  name_lower = item.item_name.lower()
705
 
706
- # Skip if name matches total keywords
707
  if TOTAL_KEYWORDS.search(name_lower) or SUBTOTAL_KEYWORDS.search(name_lower):
708
  continue
709
 
710
  if item.item_amount > 0:
711
  filtered_items.append(item)
712
 
713
- # Create bill totals object
714
  bill_totals = BillTotal(
715
  subtotal_amount=subtotal,
716
  tax_amount=tax,
@@ -718,10 +606,6 @@ def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
718
  final_total_amount=final_total,
719
  )
720
 
721
- # Validate
722
- accuracy, val_msg = validate_totals(filtered_items, bill_totals)
723
- logger.info(f"Page {idx}: {val_msg}")
724
-
725
  page_conf = np.mean([item.confidence for item in filtered_items]) if filtered_items else 0.8
726
 
727
  pages_out.append(ExtractedPage(
@@ -747,26 +631,23 @@ def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
747
  # -------------------------------------------------------------------------
748
  # FastAPI App
749
  # -------------------------------------------------------------------------
750
- app = FastAPI(title="Enhanced Bill Extractor (Datathon v2)")
751
 
752
class BillRequest(BaseModel):
    """Request payload for /extract-bill-data: a single document locator."""
    document: str  # file://path or http(s) URL
754
 
755
class BillResponse(BaseModel):
    """Response envelope for /extract-bill-data."""
    is_success: bool  # False when file load, download, or OCR failed
    error: Optional[str] = None  # human-readable failure reason, if any
    data: Dict[str, Any]  # {"pagewise_line_items": [...], "total_item_count": N}
    accuracy_score: float  # 0-100, from validate_totals
    validation_message: str  # summary message from validate_totals
    token_usage: Dict[str, int]  # LLM token accounting (zeroed in all visible paths)
762
 
763
  @app.post("/extract-bill-data", response_model=BillResponse)
764
  async def extract_bill_data(payload: BillRequest):
765
- """Main extraction endpoint"""
766
  doc_url = payload.document
767
  file_bytes = None
768
 
769
- # Load file
770
  if doc_url.startswith("file://"):
771
  local_path = doc_url.replace("file://", "")
772
  try:
@@ -777,8 +658,6 @@ async def extract_bill_data(payload: BillRequest):
777
  is_success=False,
778
  error=f"Local file read failed: {e}",
779
  data={"pagewise_line_items": [], "total_item_count": 0},
780
- accuracy_score=0.0,
781
- validation_message="File load failed",
782
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
783
  )
784
  else:
@@ -790,8 +669,6 @@ async def extract_bill_data(payload: BillRequest):
790
  is_success=False,
791
  error=f"Download failed (status={resp.status_code})",
792
  data={"pagewise_line_items": [], "total_item_count": 0},
793
- accuracy_score=0.0,
794
- validation_message="HTTP error",
795
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
796
  )
797
  file_bytes = resp.content
@@ -800,8 +677,6 @@ async def extract_bill_data(payload: BillRequest):
800
  is_success=False,
801
  error=f"HTTP error: {e}",
802
  data={"pagewise_line_items": [], "total_item_count": 0},
803
- accuracy_score=0.0,
804
- validation_message="Network error",
805
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
806
  )
807
 
@@ -810,46 +685,28 @@ async def extract_bill_data(payload: BillRequest):
810
  is_success=False,
811
  error="No file bytes",
812
  data={"pagewise_line_items": [], "total_item_count": 0},
813
- accuracy_score=0.0,
814
- validation_message="Empty file",
815
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
816
  )
817
 
818
- # Extract
819
  logger.info(f"Processing with engine: {OCR_ENGINE}")
820
  try:
821
  if OCR_ENGINE == "tesseract":
822
  pages = ocr_with_tesseract(file_bytes)
823
  else:
824
- # Fallback to tesseract
825
  pages = ocr_with_tesseract(file_bytes)
826
  except Exception as e:
827
  logger.exception("OCR failed: %s", e)
828
  pages = []
829
 
830
- # Prepare response
831
  total_items = sum(len(p.line_items) for p in pages)
832
  pages_dict = [p.to_dict() for p in pages]
833
 
834
- # Calculate overall accuracy
835
- all_items = [item for p in pages for item in p.line_items]
836
- all_totals = BillTotal(
837
- subtotal_amount=sum(p.bill_totals.subtotal_amount or 0 for p in pages) or None,
838
- tax_amount=sum(p.bill_totals.tax_amount or 0 for p in pages) or None,
839
- discount_amount=sum(p.bill_totals.discount_amount or 0 for p in pages) or None,
840
- final_total_amount=sum(p.bill_totals.final_total_amount or 0 for p in pages) or None,
841
- )
842
-
843
- overall_acc, msg = validate_totals(all_items, all_totals)
844
-
845
  return BillResponse(
846
  is_success=True,
847
  data={
848
  "pagewise_line_items": pages_dict,
849
  "total_item_count": total_items,
850
  },
851
- accuracy_score=overall_acc,
852
- validation_message=msg,
853
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
854
  )
855
 
@@ -858,10 +715,6 @@ def health():
858
  return {
859
  "status": "ok",
860
  "engine": OCR_ENGINE,
861
- "message": "Enhanced Bill Extractor (Datathon v2 - High Accuracy Mode)",
862
  "hint": "POST /extract-bill-data with {'document': '<url or file://path>'}",
863
  }
864
-
865
# Dev entry point: serve the FastAPI app with uvicorn when run directly.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8080)
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json
4
  import logging
5
  from io import BytesIO
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ from dataclasses import dataclass, asdict, field
 
8
 
9
  from fastapi import FastAPI
10
  from pydantic import BaseModel
 
28
  except Exception:
29
  vision = None
30
 
 
 
 
 
 
31
  # -------------------------------------------------------------------------
32
  # Configuration
33
  # -------------------------------------------------------------------------
34
  OCR_ENGINE = os.getenv("OCR_ENGINE", "tesseract").lower()
 
 
35
  AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
36
  TESSERACT_PSM = os.getenv("TESSERACT_PSM", "6")
37
 
38
  logging.basicConfig(level=logging.INFO)
39
+ logger = logging.getLogger("bill-extractor")
 
 
 
 
 
 
 
40
 
41
  # Lazy clients
42
  _textract_client = None
 
59
  return _vision_client
60
 
61
  # -------------------------------------------------------------------------
62
+ # Data Models (Clean Output)
63
  # -------------------------------------------------------------------------
64
  @dataclass
65
  class BillLineItem:
 
68
  item_quantity: float = 1.0
69
  item_rate: float = 0.0
70
  item_amount: float = 0.0
71
+ # Internal fields (not exported)
72
+ confidence: float = field(default=1.0, repr=False)
73
+ source_row: str = field(default="", repr=False)
74
+ is_description_continuation: bool = field(default=False, repr=False)
75
 
76
  def to_dict(self) -> Dict[str, Any]:
77
+ """Export only public fields"""
78
+ return {
79
+ "item_name": self.item_name,
80
+ "item_quantity": self.item_quantity,
81
+ "item_rate": self.item_rate,
82
+ "item_amount": self.item_amount,
83
+ }
84
 
85
  @dataclass
86
  class BillTotal:
 
97
  class ExtractedPage:
98
  """Page-level extraction result"""
99
  page_no: int
100
+ page_type: str
101
  line_items: List[BillLineItem]
102
  bill_totals: BillTotal
103
+ page_confidence: float = field(default=1.0, repr=False) # Internal
104
 
105
  def to_dict(self) -> Dict[str, Any]:
106
+ """Export clean output (no confidence/validation)"""
107
  return {
108
  "page_no": self.page_no,
109
  "page_type": self.page_type,
110
  "line_items": [item.to_dict() for item in self.line_items],
111
  "bill_totals": self.bill_totals.to_dict(),
 
112
  }
113
 
114
  # -------------------------------------------------------------------------
115
+ # Regular Expressions
116
  # -------------------------------------------------------------------------
117
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?")
118
 
 
119
  TOTAL_KEYWORDS = re.compile(
120
  r"\b(grand\s+total|net\s+payable|total\s+(?:amount|due)|amount\s+payable|bill\s+amount|"
121
  r"final\s+(?:amount|total)|balance\s+due|amount\s+due|total\s+payable|payable)\b",
 
141
  HEADER_KEYWORDS = [
142
  "description", "qty", "qty/hrs", "hrs", "rate", "unit price", "discount",
143
  "net", "amt", "amount", "price", "total", "sl.no", "s.no", "item", "service",
 
144
  ]
145
 
146
  # -------------------------------------------------------------------------
147
  # Text Cleaning & Normalization
148
  # -------------------------------------------------------------------------
149
  def sanitize_ocr_text(s: Optional[str]) -> str:
150
+ """Clean OCR text"""
151
  if not s:
152
  return ""
153
  s = s.replace("\u2014", "-").replace("\u2013", "-")
154
+ s = s.replace("\u00A0", " ")
155
  s = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", s)
156
  s = s.replace("\r\n", "\n").replace("\r", "\n")
157
  s = re.sub(r"[ \t]+", " ", s)
 
158
  s = re.sub(r"\b(qiy|qty|oty|gty)\b", "qty", s, flags=re.I)
159
  s = re.sub(r"\b(deseription|descriptin|desription)\b", "description", s, flags=re.I)
160
  return s.strip()
 
167
  if s == "":
168
  return None
169
 
 
170
  negative = False
171
  if s.startswith("(") and s.endswith(")"):
172
  negative = True
173
  s = s[1:-1]
174
 
 
175
  s = re.sub(r"[^\d\-\+\,\.\(\)]", "", s)
176
  s = s.replace(",", "")
177
 
 
196
  s = s.replace("—", "-").replace("–", "-")
197
  s = re.sub(r"\s+", " ", s)
198
  s = s.strip(" -:,.=()[]{}|\\")
199
+ s = re.sub(r"\bOR\b", "DR", s)
200
  return s.strip()
201
 
202
  # -------------------------------------------------------------------------
 
209
  return (name_norm, amount_rounded)
210
 
211
  def dedupe_items_advanced(items: List[BillLineItem]) -> List[BillLineItem]:
212
+ """Remove duplicates while preserving highest-confidence versions"""
 
 
 
213
  if not items:
214
  return []
215
 
 
216
  seen: Dict[Tuple, BillLineItem] = {}
217
  for item in items:
218
  fp = item_fingerprint(item)
219
  if fp not in seen or item.confidence > seen[fp].confidence:
220
  seen[fp] = item
221
 
 
222
  final = []
223
  for item in seen.values():
224
  if item.is_description_continuation:
 
225
  if final and abs(float(final[-1].item_amount) - float(item.item_amount)) < 0.01:
 
226
  final[-1].item_name = (final[-1].item_name + " " + item.item_name).strip()
227
  continue
228
  final.append(item)
 
232
  # -------------------------------------------------------------------------
233
  # Total/Subtotal Detection
234
  # -------------------------------------------------------------------------
235
+ FINAL_TOTAL_KEYWORDS = re.compile(
236
+ r"\b(grand\s+total|final\s+(?:total|amount)|total\s+(?:due|payable|amount)|"
237
+ r"net\s+payable|amount\s+(?:due|payable)|balance\s+due|payable)\b",
238
+ re.I
239
+ )
240
+
241
  def detect_totals_in_rows(rows: List[List[Dict[str, Any]]]) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
242
+ """Scan rows for subtotal, tax, discount, final total"""
 
 
 
243
  subtotal = None
244
  tax = None
245
  discount = None
246
  final_total = None
247
 
 
248
  for row in rows:
249
  row_text = " ".join([c["text"] for c in row])
 
 
 
 
250
  row_lower = row_text.lower()
251
  tokens = row_text.split()
252
 
 
253
  amounts = []
254
  for t in tokens:
255
  if is_numeric_token(t):
 
260
  if not amounts:
261
  continue
262
 
 
263
  amount = max(amounts)
264
 
 
265
  if FINAL_TOTAL_KEYWORDS.search(row_lower):
266
  final_total = amount
267
  elif SUBTOTAL_KEYWORDS.search(row_lower):
 
273
 
274
  return subtotal, tax, discount, final_total
275
 
 
 
 
 
 
 
276
  # -------------------------------------------------------------------------
277
  # Image Preprocessing
278
  # -------------------------------------------------------------------------
279
  def pil_to_cv2(img: Image.Image) -> Any:
 
280
  arr = np.array(img)
281
  if arr.ndim == 2:
282
  return arr
 
287
  pil_img = pil_img.convert("RGB")
288
  w, h = pil_img.size
289
 
 
290
  if w < target_w:
291
  scale = target_w / float(w)
292
  pil_img = pil_img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)
293
 
294
  cv_img = pil_to_cv2(pil_img)
295
 
 
296
  if cv_img.ndim == 3:
297
  gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
298
  else:
299
  gray = cv_img
300
 
 
301
  gray = cv2.fastNlMeansDenoising(gray, h=10)
302
 
 
303
  try:
304
  bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
305
  cv2.THRESH_BINARY, 41, 15)
306
  except Exception:
307
  _, bw = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
308
 
 
309
  kernel = np.ones((2, 2), np.uint8)
310
  bw = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
311
  bw = cv2.morphologyEx(bw, cv2.MORPH_OPEN, kernel)
 
344
 
345
  cells.append({
346
  "text": txt,
347
+ "conf": max(0.0, conf) / 100.0,
348
  "left": left, "top": top, "width": width, "height": height,
349
  "center_x": center_x, "center_y": center_y
350
  })
 
376
  return rows
377
 
378
  # -------------------------------------------------------------------------
379
+ # Column Detection
380
  # -------------------------------------------------------------------------
381
  def detect_numeric_columns(cells: List[Dict[str, Any]], max_columns: int = 6) -> List[float]:
382
  """Detect x-positions of numeric columns"""
 
388
  if len(xs) == 1:
389
  return xs
390
 
 
391
  gaps = [xs[i+1] - xs[i] for i in range(len(xs)-1)]
392
  mean_gap = float(np.mean(gaps))
393
  std_gap = float(np.std(gaps)) if len(gaps) > 1 else 0.0
 
417
  return int(np.argmin(distances))
418
 
419
  # -------------------------------------------------------------------------
420
+ # Row Parsing
421
  # -------------------------------------------------------------------------
422
  def parse_rows_with_columns(
423
  rows: List[List[Dict[str, Any]]],
424
  page_cells: List[Dict[str, Any]],
425
  page_text: str = ""
426
  ) -> List[BillLineItem]:
427
+ """Parse rows into line items"""
 
 
 
428
  items = []
429
  column_centers = detect_numeric_columns(page_cells, max_columns=6)
430
 
431
+ for row in rows:
432
  tokens = [c["text"] for c in row]
433
  row_text = " ".join(tokens)
434
  row_lower = row_text.lower()
435
 
 
436
  if FOOTER_KEYWORDS.search(row_lower) and not any(is_numeric_token(t) for t in tokens):
437
  continue
438
 
 
439
  if not any(is_numeric_token(t) for t in tokens):
440
  continue
441
 
 
442
  numeric_values = []
443
  for t in tokens:
444
  if is_numeric_token(t):
 
451
 
452
  numeric_values = sorted(list(set(numeric_values)), reverse=True)
453
 
 
454
  if column_centers:
455
  left_text_parts = []
456
  numeric_buckets = {i: [] for i in range(len(column_centers))}
 
471
  item_name = " ".join(left_text_parts).strip()
472
  item_name = clean_item_name(item_name) if item_name else "UNKNOWN"
473
 
 
474
  num_cols = len(column_centers)
475
  amount = None
476
  rate = None
477
  qty = None
478
 
 
479
  if num_cols >= 1:
480
  bucket = numeric_buckets.get(num_cols - 1, [])
481
  if bucket:
 
483
  amount = normalize_num_str(amt_str, allow_zero=False)
484
 
485
  if amount is None:
 
486
  for v in numeric_values:
487
  if v > 0:
488
  amount = v
489
  break
490
 
 
491
  if num_cols >= 2:
492
  bucket = numeric_buckets.get(num_cols - 2, [])
493
  if bucket:
494
  rate = normalize_num_str(bucket[-1][0], allow_zero=False)
495
 
 
496
  if num_cols >= 3:
497
  bucket = numeric_buckets.get(num_cols - 3, [])
498
  if bucket:
499
  qty = normalize_num_str(bucket[-1][0], allow_zero=False)
500
 
 
501
  if amount and not qty and not rate and numeric_values:
502
  for cand in numeric_values:
503
  if cand <= 0.1 or cand >= amount:
 
509
  rate = cand
510
  break
511
 
 
512
  if qty and rate is None and amount and amount != 0:
513
  rate = amount / qty
514
  elif rate and qty is None and amount and amount != 0:
 
516
  elif amount and qty and rate is None:
517
  rate = amount / qty if qty != 0 else 0.0
518
 
 
519
  if qty is None:
520
  qty = 1.0
521
  if rate is None:
 
523
  if amount is None:
524
  amount = qty * rate if qty and rate else 0.0
525
 
 
526
  if amount > 0:
527
  confidence = np.mean([c.get("conf", 0.85) for c in row]) if row else 0.85
528
  items.append(BillLineItem(
 
534
  source_row=row_text,
535
  ))
536
  else:
 
537
  numeric_idxs = [i for i, t in enumerate(tokens) if is_numeric_token(t)]
538
  if not numeric_idxs:
539
  continue
 
559
  return items
560
 
561
  # -------------------------------------------------------------------------
562
+ # Tesseract OCR Pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  # -------------------------------------------------------------------------
564
  def ocr_with_tesseract(file_bytes: bytes) -> List[ExtractedPage]:
565
+ """Tesseract pipeline"""
566
  pages_out = []
567
 
568
  try:
 
577
 
578
  for idx, pil_img in enumerate(images, start=1):
579
  try:
 
580
  proc = preprocess_image_for_tesseract(pil_img)
581
  cells = image_to_tsv_cells(proc)
582
  rows = group_cells_into_rows(cells, y_tolerance=12)
583
 
 
584
  page_text = " ".join([" ".join([c["text"] for c in r]) for r in rows])
585
 
 
586
  subtotal, tax, discount, final_total = detect_totals_in_rows(rows)
587
 
 
588
  items = parse_rows_with_columns(rows, cells, page_text)
589
 
 
590
  items = dedupe_items_advanced(items)
591
 
 
592
  filtered_items = []
593
  for item in items:
594
  name_lower = item.item_name.lower()
595
 
 
596
  if TOTAL_KEYWORDS.search(name_lower) or SUBTOTAL_KEYWORDS.search(name_lower):
597
  continue
598
 
599
  if item.item_amount > 0:
600
  filtered_items.append(item)
601
 
 
602
  bill_totals = BillTotal(
603
  subtotal_amount=subtotal,
604
  tax_amount=tax,
 
606
  final_total_amount=final_total,
607
  )
608
 
 
 
 
 
609
  page_conf = np.mean([item.confidence for item in filtered_items]) if filtered_items else 0.8
610
 
611
  pages_out.append(ExtractedPage(
 
631
  # -------------------------------------------------------------------------
632
  # FastAPI App
633
  # -------------------------------------------------------------------------
634
+ app = FastAPI(title="Enhanced Bill Extractor (Clean Output)")
635
 
636
  class BillRequest(BaseModel):
637
+ document: str
638
 
639
  class BillResponse(BaseModel):
640
  is_success: bool
641
  error: Optional[str] = None
642
  data: Dict[str, Any]
 
 
643
  token_usage: Dict[str, int]
644
 
645
  @app.post("/extract-bill-data", response_model=BillResponse)
646
  async def extract_bill_data(payload: BillRequest):
647
+ """Main extraction endpoint (clean output)"""
648
  doc_url = payload.document
649
  file_bytes = None
650
 
 
651
  if doc_url.startswith("file://"):
652
  local_path = doc_url.replace("file://", "")
653
  try:
 
658
  is_success=False,
659
  error=f"Local file read failed: {e}",
660
  data={"pagewise_line_items": [], "total_item_count": 0},
 
 
661
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
662
  )
663
  else:
 
669
  is_success=False,
670
  error=f"Download failed (status={resp.status_code})",
671
  data={"pagewise_line_items": [], "total_item_count": 0},
 
 
672
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
673
  )
674
  file_bytes = resp.content
 
677
  is_success=False,
678
  error=f"HTTP error: {e}",
679
  data={"pagewise_line_items": [], "total_item_count": 0},
 
 
680
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
681
  )
682
 
 
685
  is_success=False,
686
  error="No file bytes",
687
  data={"pagewise_line_items": [], "total_item_count": 0},
 
 
688
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
689
  )
690
 
 
691
  logger.info(f"Processing with engine: {OCR_ENGINE}")
692
  try:
693
  if OCR_ENGINE == "tesseract":
694
  pages = ocr_with_tesseract(file_bytes)
695
  else:
 
696
  pages = ocr_with_tesseract(file_bytes)
697
  except Exception as e:
698
  logger.exception("OCR failed: %s", e)
699
  pages = []
700
 
 
701
  total_items = sum(len(p.line_items) for p in pages)
702
  pages_dict = [p.to_dict() for p in pages]
703
 
 
 
 
 
 
 
 
 
 
 
 
704
  return BillResponse(
705
  is_success=True,
706
  data={
707
  "pagewise_line_items": pages_dict,
708
  "total_item_count": total_items,
709
  },
 
 
710
  token_usage={"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
711
  )
712
 
 
715
  return {
716
  "status": "ok",
717
  "engine": OCR_ENGINE,
718
+ "message": "Enhanced Bill Extractor (Clean Output Mode)",
719
  "hint": "POST /extract-bill-data with {'document': '<url or file://path>'}",
720
  }