Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -289,6 +289,68 @@ def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Any]:
|
|
| 289 |
final_total = float(round(val, 2))
|
| 290 |
return {"subtotal": subtotal, "final_total": final_total}
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
# ---------------- FALLBACK REGEX EXTRACTOR ----------------
|
| 294 |
|
|
|
|
| 289 |
final_total = float(round(val, 2))
|
| 290 |
return {"subtotal": subtotal, "final_total": final_total}
|
| 291 |
|
| 292 |
+
# ---------------- Optional: Gemini refinement ----------------
|
| 293 |
+
def _gemini_token_usage(response: Any) -> Dict[str, int]:
    """Read token counts from a Gemini response, falling back to zeros.

    Looks for a ``usage_metadata`` attribute on *response* and maps its
    ``total_token_count`` / ``prompt_token_count`` / ``candidates_token_count``
    fields onto the ``total_tokens`` / ``input_tokens`` / ``output_tokens``
    schema used by this module. Any missing or non-numeric field becomes 0.
    """
    usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    metadata = getattr(response, "usage_metadata", None)
    if metadata is None:
        return usage
    field_map = (
        ("total_tokens", "total_token_count"),
        ("input_tokens", "prompt_token_count"),
        ("output_tokens", "candidates_token_count"),
    )
    for key, attr in field_map:
        try:
            usage[key] = int(getattr(metadata, attr, 0) or 0)
        except (TypeError, ValueError):
            usage[key] = 0
    return usage


def refine_with_gemini(page_items: List[Dict[str, Any]]) -> (List[Dict[str, Any]], Dict[str, int]):
    """
    Send structured extracted items to Gemini and ask it to:
    - validate amounts and quantities
    - remove summary rows (if any slipped)
    - dedupe and correct obvious OCR mis-splits

    Args:
        page_items: Line items with the fields item_name, item_quantity,
            item_rate and item_amount. Must be JSON-serialisable.

    Returns:
        A ``(refined_items, token_usage)`` tuple. ``token_usage`` has the keys
        ``total_tokens``, ``input_tokens`` and ``output_tokens``.

        The original ``page_items`` and an all-zero ``token_usage`` are
        returned unchanged when GEMINI_API_KEY is not set, when ``page_items``
        is empty, when the call or JSON parsing fails, when the reply is not a
        JSON array, or when the reply is a non-empty array from which no item
        could be normalised.

    This function never raises; failures are logged as warnings.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import logging

    zero_usage = {"total_tokens": 0, "input_tokens": 0, "output_tokens": 0}
    # Nothing to refine (or no credentials): skip the network round trip.
    if not GEMINI_API_KEY or not page_items:
        return page_items, zero_usage

    try:
        # Build compact JSON payload for the LLM. The former "system" message
        # is folded into the prompt: generate_content contents only accept the
        # "user" and "model" roles, so a {"role": "system"} entry made the
        # request fail and the except path return the unrefined items.
        prompt = (
            "You are concise and precise. Return only valid JSON.\n\n"
            "You are a precise bill extraction assistant. You are given a list of extracted line items "
            "parsed from OCR with fields item_name, item_quantity, item_rate, item_amount. "
            "Your job: remove any rows that are totals or duplicate summaries, fix numeric mistakes if obvious, "
            "and return cleaned JSON array of items only. Do NOT add new items. "
            "Return ONLY valid JSON (an array of objects with same fields).\n\n"
            f"items = {json.dumps(page_items, ensure_ascii=False)}"
        )

        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        response = model.generate_content(prompt)

        raw_text = response.text.strip()
        # remove fenced codeblock markers
        if raw_text.startswith("```"):
            raw_text = re.sub(r"^```[a-zA-Z]*", "", raw_text)
            raw_text = re.sub(r"```$", "", raw_text).strip()

        parsed = json.loads(raw_text)
        if not isinstance(parsed, list):
            return page_items, zero_usage

        # normalize returned items
        cleaned = []
        for obj in parsed:
            try:
                cleaned.append({
                    "item_name": str(obj.get("item_name", "")).strip(),
                    # `or 0.0` treats an explicit JSON null like a missing key,
                    # matching the rate/quantity handling below, instead of
                    # dropping the whole line item on float(None).
                    "item_amount": float(obj.get("item_amount", 0.0) or 0.0),
                    "item_rate": float(obj.get("item_rate", 0.0) or 0.0),
                    "item_quantity": float(obj.get("item_quantity", 1.0) or 1.0),
                })
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Not a dict, or a field that cannot be coerced: skip the row.
                continue

        # The model answered with rows but none were usable; keep the original
        # extraction rather than silently returning nothing.
        if parsed and not cleaned:
            return page_items, zero_usage

        return cleaned, _gemini_token_usage(response)

    except Exception:
        # Deliberate best-effort boundary: refinement is optional, so any SDK,
        # network or parsing failure falls back to the unrefined items. Log it
        # so that persistent failures are visible.
        logging.getLogger(__name__).warning(
            "Gemini refinement failed; returning unrefined items", exc_info=True
        )
        return page_items, zero_usage
|
| 353 |
+
|
| 354 |
|
| 355 |
# ---------------- FALLBACK REGEX EXTRACTOR ----------------
|
| 356 |
|