Sathvik-kota committed on
Commit
4de63d1
·
verified ·
1 Parent(s): b64719f

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +123 -290
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py
2
  import os
3
  import re
4
  import json
@@ -24,10 +24,11 @@ if GEMINI_API_KEY:
24
  genai.configure(api_key=GEMINI_API_KEY)
25
 
26
  # ---------------- FASTAPI APP ----------------
27
- app = FastAPI(title="Bajaj Datathon - Bill Extractor")
28
 
29
class BillRequest(BaseModel):
    """Request body for POST /extract-bill-data."""
    document: str  # URL of the bill document (PDF or image) to download and process
 
31
  # ---------------- Helpers: number normalization & detection ----------------
32
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?") # matches numbers with commas, decimals
33
  TOTAL_KEYWORDS = re.compile(r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal)", re.I)
@@ -244,6 +245,7 @@ def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any
244
  "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
245
  "item_quantity": float(qty_val)
246
  }
 
247
  # ---------------- Duplicate suppression & subtotal detection ----------------
248
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
249
  """
@@ -351,322 +353,153 @@ def refine_with_gemini(page_items: List[Dict[str, Any]]) -> (List[Dict[str, Any]
351
  except Exception:
352
  return page_items, zero_usage
353
 
354
-
355
# ---------------- FALLBACK REGEX EXTRACTOR ----------------

# Amount tokens: "123", "45.67", and comma-grouped forms like "1,234.50".
# The previous pattern (\d+(\.\d+)?) silently dropped comma-grouped amounts,
# which are common on bills; this keeps it consistent with NUM_RE above.
_FALLBACK_AMOUNT_RE = re.compile(r"(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?")


def extract_items_from_text(text: str):
    """
    Very simple rule-based extractor used as a fallback
    when the LLM is not available or fails.

    Logic:
    - Split OCR text into lines
    - Skip lines that look like totals
    - For each remaining line, treat the last numeric token as item_amount
    - Everything before that numeric token is item_name

    Returns:
        list of dicts with keys item_name (str), item_amount (float),
        item_rate (0.0 placeholder), item_quantity (0.0 placeholder).
    """
    bill_items = []

    for line in (ln.strip() for ln in text.splitlines()):
        if not line:
            continue

        # Skip obvious total lines so they are not reported as items.
        if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
            continue

        tokens = line.split()
        if not tokens:
            continue

        # Indices of tokens that are purely numeric (commas/decimals allowed).
        numeric_indices = [
            i for i, tok in enumerate(tokens)
            if _FALLBACK_AMOUNT_RE.fullmatch(tok)
        ]
        if not numeric_indices:
            continue

        last_idx = numeric_indices[-1]
        name_tokens = tokens[:last_idx]
        if not name_tokens:
            continue

        try:
            # Strip grouping commas before converting ("1,234.50" -> 1234.50).
            amount_val = float(tokens[last_idx].replace(",", ""))
        except ValueError:
            continue

        bill_items.append(
            {
                "item_name": " ".join(name_tokens),
                "item_amount": amount_val,
                "item_rate": 0.0,       # not derivable from a single text line
                "item_quantity": 0.0,   # not derivable from a single text line
            }
        )

    return bill_items
414
-
415
-
416
# ---------------- LLM CALL (GEMINI) ----------------

def call_gemini_for_items(pages_ocr):
    """
    pages_ocr: list of dicts:
        { "page_no": "1", "page_type": "Bill Detail", "text": "<ocr_text>" }

    Returns:
        (pagewise_line_items, token_usage_dict)
        or (None, zero_token_usage) if LLM is unavailable / fails.
    """
    zero_usage = {
        "total_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0
    }

    if not GEMINI_API_KEY:
        # No key configured -> skip LLM and let caller fallback
        return None, zero_usage

    # Build a concise representation of pages for the prompt
    pages_repr = [
        {
            "page_no": p["page_no"],
            "page_type": p["page_type"],
            "text": p["text"],
        }
        for p in pages_ocr
    ]

    system_instruction = (
        "You are a medical bill extraction engine. "
        "Given OCR text from each page of a bill, extract individual line items.\n\n"
        "For each page, you must return bill_items with fields:\n"
        "- item_name (string, as close as possible to bill text)\n"
        "- item_rate (float; 0.0 if not clearly present)\n"
        "- item_quantity (float; 1.0 if implicit; 0.0 if unknown)\n"
        "- item_amount (float; net amount for that line)\n\n"
        "Do NOT include grand totals, sub-totals, or net payable rows as separate items.\n"
        "Only include the per-service / per-medicine lines.\n\n"
        "Return ONLY valid JSON in this exact shape (no comments, no extra keys):\n"
        "{\n"
        "  \"pagewise_line_items\": [\n"
        "    {\n"
        "      \"page_no\": \"1\",\n"
        "      \"page_type\": \"Bill Detail\",\n"
        "      \"bill_items\": [\n"
        "        {\n"
        "          \"item_name\": \"...\",\n"
        "          \"item_amount\": 123.45,\n"
        "          \"item_rate\": 61.72,\n"
        "          \"item_quantity\": 2.0\n"
        "        }\n"
        "      ]\n"
        "    }\n"
        "  ]\n"
        "}\n"
    )

    user_prompt = (
        "Use the following OCR text per page to extract line items into the required schema.\n"
        "The data is provided as a JSON array under the key 'pages_ocr'.\n\n"
        f"pages_ocr = {json.dumps(pages_repr, ensure_ascii=False)}"
    )

    try:
        # BUGFIX: google-generativeai does not accept a "system" role inside
        # generate_content(); system prompts must be supplied via the
        # system_instruction constructor argument. The previous code raised
        # on every call, silently forcing the regex fallback.
        model = genai.GenerativeModel(
            GEMINI_MODEL_NAME,
            system_instruction=system_instruction,
        )
        response = model.generate_content(user_prompt)

        raw_text = response.text.strip()

        # Strip possible ```json ... ``` wrappers around the model output
        if raw_text.startswith("```"):
            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
            raw_text = re.sub(r"```$", "", raw_text)
            raw_text = raw_text.strip()

        parsed = json.loads(raw_text)

        pagewise = parsed.get("pagewise_line_items", [])
        if not isinstance(pagewise, list):
            return None, zero_usage

        # We are on free tier, so we keep token_usage as zeros (schema only)
        return pagewise, zero_usage

    except Exception:
        # Any LLM error -> caller will fallback to regex
        return None, zero_usage
513
-
514
-
515
# ---------------- MAIN ENDPOINT ----------------

def _failure_response():
    """Uniform failure payload: zero token usage and empty extraction data."""
    return {
        "is_success": False,
        "token_usage": {
            "total_tokens": 0,
            "input_tokens": 0,
            "output_tokens": 0
        },
        "data": {
            "pagewise_line_items": [],
            "total_item_count": 0
        }
    }


@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """
    Main Datathon endpoint.

    Flow:
    - Download document from URL
    - If PDF: convert each page to an image and run OCR
    - If image: run OCR directly
    - Build page-wise OCR text
    - Try LLM (Gemini) to extract structured line items
    - If LLM fails or key missing -> fallback to regex-only extraction
    - Return JSON in the exact schema expected by the evaluators
    """
    doc_url = payload.document

    # ---- Step 1: Download file ----
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        }
        response = requests.get(doc_url, headers=headers, timeout=20)
        if response.status_code != 200:
            return _failure_response()
        file_bytes = response.content
    except Exception:
        return _failure_response()

    # ---- Step 2: OCR (PDF + images) ----
    pagewise_ocr = []  # list of {page_no, page_type, text}

    # IMPORTANT: strip query (?sv=...) only for extension detection
    clean_url = doc_url.split("?", 1)[0].lower()

    try:
        # PDF case
        if clean_url.endswith(".pdf"):
            pages = convert_from_bytes(file_bytes)
            for idx, page_img in enumerate(pages, start=1):
                text = pytesseract.image_to_string(page_img)
                pagewise_ocr.append(
                    {
                        "page_no": str(idx),
                        "page_type": "Bill Detail",
                        "text": text,
                    }
                )

        # Image case
        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
            image = Image.open(BytesIO(file_bytes))
            text = pytesseract.image_to_string(image)
            pagewise_ocr.append(
                {
                    "page_no": "1",
                    "page_type": "Bill Detail",
                    "text": text,
                }
            )

        # Other file types -> currently not handled
        else:
            pagewise_ocr = []
    except Exception:
        # OCR failure
        return _failure_response()

    # ---- Step 3: LLM extraction + fallback ----
    pagewise_line_items = []
    token_usage = {
        "total_tokens": 0,
        "input_tokens": 0,
        "output_tokens": 0
    }

    if pagewise_ocr:
        # Try Gemini first (if key is set)
        pagewise_llm, token_usage = call_gemini_for_items(pagewise_ocr)

        if pagewise_llm:
            pagewise_line_items = pagewise_llm
        else:
            # Fallback: regex-based extraction
            for p in pagewise_ocr:
                items = extract_items_from_text(p["text"])
                if items:
                    pagewise_line_items.append(
                        {
                            "page_no": p["page_no"],
                            "page_type": p["page_type"],
                            "bill_items": items,
                        }
                    )

    # Computed unconditionally so an empty OCR result can never leave
    # total_item_count undefined (NameError) at the return below.
    total_item_count = sum(
        len(p.get("bill_items", [])) for p in pagewise_line_items
    )

    # ---- Step 4: Final response ----
    return {
        "is_success": True,
        "token_usage": token_usage,
        "data": {
            "pagewise_line_items": pagewise_line_items,
            "total_item_count": total_item_count
        }
    }
661
 
662
-
663
@app.get("/")
def health_check():
    """Liveness probe: confirms the API process is up and serving requests."""
    status_payload = {
        "status": "ok",
        "message": "Bajaj Datathon bill extraction API is live.",
        "hint": "Use POST /extract-bill-data with { 'document': '<url>' }",
    }
    return status_payload
 
1
+ # app.py (HIGH ACCURACY TSV + preprocessing + optional Gemini refinement)
2
  import os
3
  import re
4
  import json
 
24
  genai.configure(api_key=GEMINI_API_KEY)
25
 
26
  # ---------------- FASTAPI APP ----------------
27
+ app = FastAPI(title="Bajaj Datathon - Bill Extractor (High Accuracy)")
28
 
29
  class BillRequest(BaseModel):
30
  document: str
31
+
32
  # ---------------- Helpers: number normalization & detection ----------------
33
  NUM_RE = re.compile(r"[-+]?\d{1,3}(?:[,0-9]*)(?:\.\d+)?") # matches numbers with commas, decimals
34
  TOTAL_KEYWORDS = re.compile(r"(grand\s*total|net\s*payable|total\s*amount|amount\s*payable|bill\s*amount|final\s*amount|balance\s*due|sub\s*total|subtotal)", re.I)
 
245
  "item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
246
  "item_quantity": float(qty_val)
247
  }
248
+
249
  # ---------------- Duplicate suppression & subtotal detection ----------------
250
  def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
251
  """
 
353
  except Exception:
354
  return page_items, zero_usage
355
 
356
+ # ---------------- Main endpoint logic ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
def _fallback_items_from_raw_text(raw_text):
    """Plain-OCR fallback: last numeric token per line = amount, rest = name."""
    items = []
    for line in [ln.strip() for ln in raw_text.splitlines() if ln.strip()]:
        if TOTAL_KEYWORDS.search(line):
            continue
        toks = line.split()
        numeric_idxs = [i for i, t in enumerate(toks) if NUM_RE.search(t)]
        if not numeric_idxs:
            continue
        last = numeric_idxs[-1]
        amt = normalize_num_str(toks[last])
        if amt is None:
            continue
        name = " ".join(toks[:last]).strip()
        if name == "":
            continue
        items.append({
            "item_name": name,
            "item_amount": float(round(amt, 2)),
            "item_rate": 0.0,
            "item_quantity": 1.0
        })
    return dedupe_items(items)


def _infer_page_type(page_text_lower):
    """Keyword heuristic for page classification; 'Final Bill' wins over 'Pharmacy'."""
    page_type = "Bill Detail"
    if "pharmacy" in page_text_lower or "medicine" in page_text_lower or "tablet" in page_text_lower:
        page_type = "Pharmacy"
    if "final bill" in page_text_lower or "grand total" in page_text_lower:
        page_type = "Final Bill"
    return page_type


@app.post("/extract-bill-data")
async def extract_bill_data(payload: BillRequest):
    """
    High-accuracy extraction endpoint.

    Flow:
    - Download the document from the given URL.
    - Convert PDF pages (or a single image) to PIL images.
    - Per page: preprocess -> Tesseract TSV cells -> row grouping ->
      row parsing -> dedupe; fall back to plain OCR text when TSV yields nothing.
    - Optionally refine items with Gemini when an API key is configured.
    - Return the evaluator schema with cumulative token usage.
    """
    doc_url = payload.document

    # Step 1: download
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        resp = requests.get(doc_url, headers=headers, timeout=30)
        if resp.status_code != 200:
            raise RuntimeError(f"download failed status={resp.status_code}")
        file_bytes = resp.content
    except Exception:
        return {
            "is_success": False,
            "token_usage": {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0},
            "data": {"pagewise_line_items": [], "total_item_count": 0}
        }

    # Step 2: convert PDF->images or handle single image
    images = []
    clean_url = doc_url.split("?", 1)[0].lower()
    try:
        if clean_url.endswith(".pdf"):
            images = convert_from_bytes(file_bytes)
        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]):
            images = [Image.open(BytesIO(file_bytes))]
        else:
            # Unknown extension: try PDF conversion as a last resort.
            try:
                images = convert_from_bytes(file_bytes)
            except Exception:
                images = []
    except Exception:
        images = []

    pagewise_line_items = []
    cumulative_token_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}

    # Step 3: process each page independently; one bad page never aborts the run.
    for idx, page_img in enumerate(images, start=1):
        try:
            # preprocess, then get TSV / word cells and reconstruct rows
            processed_cv = preprocess_image(page_img)
            cells = image_to_tsv_cells(processed_cv)
            rows = group_cells_into_rows(cells, y_tolerance=12)
            rows_texts = [" ".join(c["text"] for c in r) for r in rows]

            # NOTE(review): the return value was never used; the call is kept
            # in case detect_subtotals_and_totals has side effects — confirm.
            detect_subtotals_and_totals(rows_texts)

            # parse each row to items, dropping obvious total-like names
            parsed_items = []
            for r in rows:
                parsed = parse_row_to_item(r)
                if parsed is None:
                    continue
                if TOTAL_KEYWORDS.search(parsed["item_name"]):
                    continue
                parsed_items.append(parsed)
            parsed_items = dedupe_items(parsed_items)

            # If TSV parsing produced nothing (e.g. OCR failed), fall back to
            # plain OCR text with the simple line extractor.
            if not parsed_items:
                try:
                    raw_text = pytesseract.image_to_string(processed_cv)
                    parsed_items = _fallback_items_from_raw_text(raw_text)
                except Exception:
                    parsed_items = []

            # Optional Gemini refinement (page-level); accumulate token usage
            # (placeholder zeros kept on the free tier).
            if GEMINI_API_KEY and parsed_items:
                parsed_items, token_u = refine_with_gemini(parsed_items)
                for k in cumulative_token_usage:
                    cumulative_token_usage[k] += token_u.get(k, 0)

            pagewise_line_items.append({
                "page_no": str(idx),
                "page_type": _infer_page_type(" ".join(rows_texts).lower()),
                "bill_items": parsed_items
            })

        except Exception:
            # on per-page failure continue with empty list
            pagewise_line_items.append({
                "page_no": str(idx),
                "page_type": "Bill Detail",
                "bill_items": []
            })
            continue

    total_item_count = sum(len(p.get("bill_items", [])) for p in pagewise_line_items)

    return {
        "is_success": True,
        "token_usage": cumulative_token_usage,
        "data": {
            "pagewise_line_items": pagewise_line_items,
            "total_item_count": total_item_count
        }
    }
498
 
 
499
@app.get("/")
def health_check():
    """Report service liveness plus a usage hint for the main endpoint."""
    status_payload = {
        "status": "ok",
        "message": "Bajaj Datathon bill extraction API (high-accuracy) is live.",
        "hint": "POST /extract-bill-data with { 'document': '<url>' }",
    }
    return status_payload