Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Files Community

Sathvik-kota committed on Nov 28, 2025

Commit

9439b9f

verified ·

1 Parent(s): 0bfaa94

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +215 -86

app.py CHANGED Viewed

@@ -1,45 +1,57 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
-from io import BytesIO
-from pdf2image import convert_from_bytes
 from PIL import Image
-import pytesseract, requests, re
 app = FastAPI()
 class BillRequest(BaseModel):
     document: str
-def parse_text(text):
-    """Extract bill items using a simple numeric line pattern."""
-    lines = [l.strip() for l in text.splitlines() if l.strip()]
-    pattern = re.compile(r"^(.*\D)?(\d+(?:\.\d+)?)$")
-    items=[]
-    for line in lines:
-        m=pattern.match(line)
-        if not m: continue
-        name=(m.group(1) or "").strip()
-        if not name: continue
-        try: amount=float(m.group(2))
-        except: continue
-        items.append({"item_name":name,"item_amount":amount,"item_rate":0.0,"item_quantity":0.0})
-    return items
 def extract_items_from_text(text: str):
     """
-    Looser heuristic:
-    - Take any line that has at least one numeric token
-    - Use the last numeric token as item_amount
-    - Everything before that token is item_name
-    - Skip obvious total/summary lines
     """
     lines = [line.strip() for line in text.splitlines() if line.strip()]
     bill_items = []
     for line in lines:
-        # Skip totals / summary lines
         if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
             continue
@@ -47,7 +59,7 @@ def extract_items_from_text(text: str):
         if not tokens:
             continue
-        # Find all purely numeric tokens (e.g. 123, 45.67)
         numeric_indices = [
             i for i, tok in enumerate(tokens)
             if re.fullmatch(r"\d+(\.\d+)?", tok)
@@ -60,7 +72,6 @@ def extract_items_from_text(text: str):
         amount_str = tokens[last_idx]
         name_tokens = tokens[:last_idx]
-        # If there's no text before the amount, skip
         if not name_tokens:
             continue
@@ -75,24 +86,128 @@ def extract_items_from_text(text: str):
             {
                 "item_name": item_name,
                 "item_amount": amount_val,
-                "item_rate": 0.0,
-                "item_quantity": 0.0,
             }
         )
     return bill_items
 @app.post("/extract-bill-data")
 async def extract_bill_data(payload: BillRequest):
     """
     Main Datathon endpoint.
-    Current flow:
-    - Download the document from the provided URL
-    - If it's a PDF, convert pages to images and run OCR per page
-    - If it's an image (png/jpg/jpeg), run OCR on the image
-    - Extract line items using a simple text heuristic
-    - Return data in the required JSON format
     """
     doc_url = payload.document
@@ -104,7 +219,6 @@ async def extract_bill_data(payload: BillRequest):
         response = requests.get(doc_url, headers=headers, timeout=20)
         if response.status_code != 200:
-            # URL not reachable → graceful failure
             return {
                 "is_success": False,
                 "token_usage": {
@@ -121,7 +235,6 @@ async def extract_bill_data(payload: BillRequest):
         file_bytes = response.content
     except Exception:
-        # Network or other error
         return {
             "is_success": False,
             "token_usage": {
@@ -135,50 +248,42 @@ async def extract_bill_data(payload: BillRequest):
             }
         }
-    pagewise_line_items = []
-    total_item_count = 0
-    # ---- Step 2: OCR + extraction ----
     try:
-        lower_url = doc_url.lower()
-        # PDF handling
         if lower_url.endswith(".pdf"):
             pages = convert_from_bytes(file_bytes)
             for idx, page_img in enumerate(pages, start=1):
-                ocr_text = pytesseract.image_to_string(page_img)
-                bill_items = extract_items_from_text(ocr_text)
-                if bill_items:
-                    pagewise_line_items.append(
-                        {
-                            "page_no": str(idx),
-                            "page_type": "Bill Detail",  # can refine later
-                            "bill_items": bill_items,
-                        }
-                    )
-                    total_item_count += len(bill_items)
-        # Image handling
-        elif any(lower_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
-            image = Image.open(BytesIO(file_bytes))
-            ocr_text = pytesseract.image_to_string(image)
-            bill_items = extract_items_from_text(ocr_text)
-            if bill_items:
-                pagewise_line_items.append(
                     {
-                        "page_no": "1",
-                        "page_type": "Bill Detail",
-                        "bill_items": bill_items,
                     }
                 )
-                total_item_count = len(bill_items)
-        # Other types (json, txt, etc.) → currently no extraction
     except Exception:
-        # OCR / parsing failure → keep schema, mark as failure
         return {
             "is_success": False,
             "token_usage": {
@@ -192,14 +297,41 @@ async def extract_bill_data(payload: BillRequest):
             }
         }
-    # ---- Step 3: Final response ----
     return {
         "is_success": True,
-        "token_usage": {
-            "total_tokens": 0,   # update when LLMs are added
-            "input_tokens": 0,
-            "output_tokens": 0
-        },
         "data": {
             "pagewise_line_items": pagewise_line_items,
             "total_item_count": total_item_count
@@ -207,16 +339,13 @@ async def extract_bill_data(payload: BillRequest):
     }
-def bad_response():
-    return {
-        "is_success":False,
-        "token_usage":{"total_tokens":0,"input_tokens":0,"output_tokens":0},
-        "data":{"pagewise_line_items":[],"total_item_count":0}
-    }
-def success(data,count):
     return {
-        "is_success":True,
-        "token_usage":{"total_tokens":0,"input_tokens":0,"output_tokens":0},
-        "data":{"pagewise_line_items":data,"total_item_count":count}
     }

+# app.py
+import os
+import re
+import json
+from io import BytesIO
 from fastapi import FastAPI
 from pydantic import BaseModel
+import requests
 from PIL import Image
+from pdf2image import convert_from_bytes
+import pytesseract
+import google.generativeai as genai
+# ---------------- LLM CONFIG (Gemini) ----------------
+# API key comes from the environment; os.getenv yields None when it is unset,
+# in which case call_gemini_for_items skips the LLM entirely.
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+# Model identifier handed to genai.GenerativeModel for line-item extraction.
+GEMINI_MODEL_NAME = "gemini-1.5-flash"
+# Configure the SDK only when a key is actually present.
+if GEMINI_API_KEY:
+    genai.configure(api_key=GEMINI_API_KEY)
+# ---------------- FASTAPI APP ----------------
 app = FastAPI()
 class BillRequest(BaseModel):
+    """
+    Request body model for POST /extract-bill-data.
+    Expects a public URL to a bill document (image/PDF).
+    """
+    # URL of the bill; the endpoint downloads it with requests.get and picks
+    # the PDF or image OCR path from the URL's file extension.
     document: str
+# ---------------- FALLBACK REGEX EXTRACTOR ----------------
 def extract_items_from_text(text: str):
     """
+    Very simple rule-based extractor used as a fallback
+    when LLM is not available or fails.
+    Logic:
+    - Split OCR text into lines
+    - For each line, if it has at least one numeric token,
+      treat the last numeric token as item_amount
+    - Everything before that is item_name
+    - Skip lines that look like totals
     """
     lines = [line.strip() for line in text.splitlines() if line.strip()]
     bill_items = []
     for line in lines:
+        # Skip obvious total lines
         if re.search(r"(total|grand total|net payable)", line, re.IGNORECASE):
             continue
         if not tokens:
             continue
+        # Numeric tokens like 123 or 45.67
         numeric_indices = [
             i for i, tok in enumerate(tokens)
             if re.fullmatch(r"\d+(\.\d+)?", tok)
         amount_str = tokens[last_idx]
         name_tokens = tokens[:last_idx]
         if not name_tokens:
             continue
             {
                 "item_name": item_name,
                 "item_amount": amount_val,
+                "item_rate": 0.0,      # to be improved later
+                "item_quantity": 0.0,  # to be improved later
             }
         )
     return bill_items
+# ---------------- LLM CALL (GEMINI) ----------------
+def call_gemini_for_items(pages_ocr):
+    """
+    pages_ocr: list of dicts:
+        { "page_no": "1", "page_type": "Bill Detail", "text": "<ocr_text>" }
+    Returns:
+        (pagewise_line_items, token_usage_dict)
+        or (None, zero_token_usage) if LLM is unavailable / fails.
+    """
+    zero_usage = {
+        "total_tokens": 0,
+        "input_tokens": 0,
+        "output_tokens": 0
+    }
+    if not GEMINI_API_KEY:
+        # No key configured → skip LLM and let caller fallback
+        return None, zero_usage
+    # Build a concise representation of pages for the prompt
+    pages_repr = [
+        {
+            "page_no": p["page_no"],
+            "page_type": p["page_type"],
+            "text": p["text"],
+        }
+        for p in pages_ocr
+    ]
+    system_instruction = (
+        "You are a medical bill extraction engine. "
+        "Given OCR text from each page of a bill, extract individual line items.\n\n"
+        "For each page, you must return bill_items with fields:\n"
+        "- item_name (string, as close as possible to bill text)\n"
+        "- item_rate (float; 0.0 if not clearly present)\n"
+        "- item_quantity (float; 1.0 if implicit; 0.0 if unknown)\n"
+        "- item_amount (float; net amount for that line)\n\n"
+        "Do NOT include grand totals, sub-totals, or net payable rows as separate items.\n"
+        "Only include the per-service / per-medicine lines.\n\n"
+        "Return ONLY valid JSON in this exact shape (no comments, no extra keys):\n"
+        "{\n"
+        "  \"pagewise_line_items\": [\n"
+        "    {\n"
+        "      \"page_no\": \"1\",\n"
+        "      \"page_type\": \"Bill Detail\",\n"
+        "      \"bill_items\": [\n"
+        "        {\n"
+        "          \"item_name\": \"...\",\n"
+        "          \"item_amount\": 123.45,\n"
+        "          \"item_rate\": 61.72,\n"
+        "          \"item_quantity\": 2.0\n"
+        "        }\n"
+        "      ]\n"
+        "    }\n"
+        "  ]\n"
+        "}\n"
+    )
+    user_prompt = (
+        "Use the following OCR text per page to extract line items into the required schema.\n"
+        "The data is provided as a JSON array under the key 'pages_ocr'.\n\n"
+        f"pages_ocr = {json.dumps(pages_repr, ensure_ascii=False)}"
+    )
+    try:
+        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
+        response = model.generate_content(
+            [
+                {"role": "system", "parts": [system_instruction]},
+                {"role": "user", "parts": [user_prompt]},
+            ]
+        )
+        raw_text = response.text.strip()
+        # Strip possible ```json ... ``` wrappers
+        if raw_text.startswith("```"):
+            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
+            raw_text = re.sub(r"```$", "", raw_text)
+            raw_text = raw_text.strip()
+        parsed = json.loads(raw_text)
+        pagewise = parsed.get("pagewise_line_items", [])
+        if not isinstance(pagewise, list):
+            return None, zero_usage
+        # We are on free tier, so we keep token_usage as zeros (schema only)
+        token_usage = zero_usage
+        return pagewise, token_usage
+    except Exception:
+        # Any LLM error → caller will fallback to regex
+        return None, zero_usage
+# ---------------- MAIN ENDPOINT ----------------
 @app.post("/extract-bill-data")
 async def extract_bill_data(payload: BillRequest):
     """
     Main Datathon endpoint.
+    Flow:
+    - Download document from URL
+    - If PDF: convert each page to an image and run OCR
+    - If image: run OCR directly
+    - Build page-wise OCR text
+    - Try LLM (Gemini) to extract structured line items
+      - If LLM fails or key missing → fallback to regex-only extraction
+    - Return JSON in the exact schema expected by the evaluators
     """
     doc_url = payload.document
         response = requests.get(doc_url, headers=headers, timeout=20)
         if response.status_code != 200:
             return {
                 "is_success": False,
                 "token_usage": {
         file_bytes = response.content
     except Exception:
         return {
             "is_success": False,
             "token_usage": {
             }
         }
+    # ---- Step 2: OCR (PDF + images) ----
+    pagewise_ocr = []  # list of {page_no, page_type, text}
+    lower_url = doc_url.lower()
     try:
+        # PDF case
         if lower_url.endswith(".pdf"):
             pages = convert_from_bytes(file_bytes)
             for idx, page_img in enumerate(pages, start=1):
+                text = pytesseract.image_to_string(page_img)
+                pagewise_ocr.append(
                     {
+                        "page_no": str(idx),
+                        "page_type": "Bill Detail",  # can refine later
+                        "text": text,
                     }
                 )
+        # Image case
+        elif any(lower_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
+            image = Image.open(BytesIO(file_bytes))
+            text = pytesseract.image_to_string(image)
+            pagewise_ocr.append(
+                {
+                    "page_no": "1",
+                    "page_type": "Bill Detail",
+                    "text": text,
+                }
+            )
+        # Other file types → currently not handled
+        else:
+            pagewise_ocr = []
     except Exception:
+        # OCR failure
         return {
             "is_success": False,
             "token_usage": {
             }
         }
+    # ---- Step 3: LLM extraction + fallback ----
+    pagewise_line_items = []
+    token_usage = {
+        "total_tokens": 0,
+        "input_tokens": 0,
+        "output_tokens": 0
+    }
+    if pagewise_ocr:
+        # Try Gemini first (if key is set)
+        pagewise_llm, token_usage = call_gemini_for_items(pagewise_ocr)
+        if pagewise_llm:
+            pagewise_line_items = pagewise_llm
+        else:
+            # Fallback: regex-based extraction
+            for p in pagewise_ocr:
+                items = extract_items_from_text(p["text"])
+                if items:
+                    pagewise_line_items.append(
+                        {
+                            "page_no": p["page_no"],
+                            "page_type": p["page_type"],
+                            "bill_items": items,
+                        }
+                    )
+    total_item_count = sum(
+        len(p.get("bill_items", [])) for p in pagewise_line_items
+    )
+    # ---- Step 4: Final response ----
     return {
         "is_success": True,
+        "token_usage": token_usage,
         "data": {
             "pagewise_line_items": pagewise_line_items,
             "total_item_count": total_item_count
     }
+@app.get("/")
+def health_check():
+    """
+    Simple health endpoint to verify that the API is running.
+    Returns a static JSON object with a status flag, a human-readable
+    message, and a hint describing how to call the extraction endpoint.
+    """
     return {
+        "status": "ok",
+        "message": "Bajaj Datathon bill extraction API is live.",
+        "hint": "Use POST /extract-bill-data with { 'document': '<url>' }"
     }