Spaces:

Sathvik-kota
/

Datathon

Sleeping

Sathvik-kota committed on Nov 29, 2025

Commit

a139240

verified ·

1 Parent(s): 29a0b82

Upload folder using huggingface_hub

Files changed (1) hide show

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ import google.generativeai as genai
 # ---------------- LLM CONFIG (Gemini) ----------------
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-GEMINI_MODEL_NAME = "gemini-1.5-flash"
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
@@ -250,24 +250,26 @@ async def extract_bill_data(payload: BillRequest):
     # ---- Step 2: OCR (PDF + images) ----
     pagewise_ocr = []  # list of {page_no, page_type, text}
-    lower_url = doc_url.lower()
     try:
         # PDF case
-        if lower_url.endswith(".pdf"):
             pages = convert_from_bytes(file_bytes)
             for idx, page_img in enumerate(pages, start=1):
                 text = pytesseract.image_to_string(page_img)
                 pagewise_ocr.append(
                     {
                         "page_no": str(idx),
-                        "page_type": "Bill Detail",  # can refine later
                         "text": text,
                     }
                 )
         # Image case
-        elif any(lower_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
             image = Image.open(BytesIO(file_bytes))
             text = pytesseract.image_to_string(image)
             pagewise_ocr.append(

 # ---------------- LLM CONFIG (Gemini) ----------------
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+GEMINI_MODEL_NAME = "gemini-2.5-flash"
 if GEMINI_API_KEY:
     genai.configure(api_key=GEMINI_API_KEY)
     # ---- Step 2: OCR (PDF + images) ----
     pagewise_ocr = []  # list of {page_no, page_type, text}
+    # IMPORTANT: strip query (?sv=...) only for extension detection
+    clean_url = doc_url.split("?", 1)[0].lower()
     try:
         # PDF case
+        if clean_url.endswith(".pdf"):
             pages = convert_from_bytes(file_bytes)
             for idx, page_img in enumerate(pages, start=1):
                 text = pytesseract.image_to_string(page_img)
                 pagewise_ocr.append(
                     {
                         "page_no": str(idx),
+                        "page_type": "Bill Detail",
                         "text": text,
                     }
                 )
         # Image case
+        elif any(clean_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg"]):
             image = Image.open(BytesIO(file_bytes))
             text = pytesseract.image_to_string(image)
             pagewise_ocr.append(