Spaces:

Sathvik-kota
/

Datathon

Sleeping

App Files Files Community

Sathvik-kota committed on Nov 29, 2025

Commit

b64719f

verified ·

1 Parent(s): b100b23

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +62 -0

app.py CHANGED Viewed

@@ -289,6 +289,68 @@ def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Any]:
                         final_total = float(round(val, 2))
     return {"subtotal": subtotal, "final_total": final_total}
 # ---------------- FALLBACK REGEX EXTRACTOR ----------------

                         final_total = float(round(val, 2))
     return {"subtotal": subtotal, "final_total": final_total}
+# ---------------- Optional: Gemini refinement ----------------
+def refine_with_gemini(page_items: List[Dict[str, Any]]) -> (List[Dict[str, Any]], Dict[str,int]):
+    """
+    Send structured extracted items to Gemini to ask it to:
+    - validate amounts and quantities
+    - remove summary rows (if any slipped)
+    - dedupe and correct obvious OCR mis-splits
+    Returns (refined_items, token_usage)
+    If GEMINI_API_KEY not set or any error occurs, returns original items and zero token_usage.
+    """
+    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
+    if not GEMINI_API_KEY:
+        return page_items, zero_usage
+    try:
+        # Build compact JSON payload for the LLM
+        prompt = (
+            "You are a precise bill extraction assistant. You are given a list of extracted line items "
+            "parsed from OCR with fields item_name, item_quantity, item_rate, item_amount. "
+            "Your job: remove any rows that are totals or duplicate summaries, fix numeric mistakes if obvious, "
+            "and return cleaned JSON array of items only. Do NOT add new items. "
+            "Return ONLY valid JSON (an array of objects with same fields).\n\n"
+            f"items = {json.dumps(page_items, ensure_ascii=False)}"
+        )
+        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
+        response = model.generate_content(
+            [
+                {"role": "system", "parts": ["You are concise and precise. Return only valid JSON."]},
+                {"role": "user", "parts": [prompt]},
+            ]
+        )
+        raw_text = response.text.strip()
+        # remove fenced codeblock markers
+        if raw_text.startswith("```"):
+            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
+            raw_text = re.sub(r"```$", "", raw_text).strip()
+        parsed = json.loads(raw_text)
+        if isinstance(parsed, list):
+            # normalize returned items
+            cleaned = []
+            for obj in parsed:
+                try:
+                    cleaned.append({
+                        "item_name": str(obj.get("item_name", "")).strip(),
+                        "item_amount": float(obj.get("item_amount", 0.0)),
+                        "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
+                        "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0)
+                    })
+                except Exception:
+                    continue
+            # Token usage: we cannot reliably extract tokens without genai usage details; keep zero schema
+            token_usage = zero_usage
+            return cleaned, token_usage
+        else:
+            return page_items, zero_usage
+    except Exception:
+        return page_items, zero_usage
 # ---------------- FALLBACK REGEX EXTRACTOR ----------------