Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -244,7 +244,50 @@ def parse_row_to_item(cells_row: List[Dict[str, Any]]) -> Optional[Dict[str, Any
|
|
| 244 |
"item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
|
| 245 |
"item_quantity": float(qty_val)
|
| 246 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
|
| 250 |
# ---------------- FALLBACK REGEX EXTRACTOR ----------------
|
|
|
|
| 244 |
"item_rate": float(round(rate_val, 2)) if rate_val else 0.0,
|
| 245 |
"item_quantity": float(qty_val)
|
| 246 |
}
|
| 247 |
+
# ---------------- Duplicate suppression & subtotal detection ----------------
|
| 248 |
+
def dedupe_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop obvious duplicate line items, keeping the first occurrence.

    Two items are considered duplicates when both of the following match:

    - the normalized name: lower-cased, whitespace runs collapsed to a single
      space, trimmed, and cut to the first 80 characters;
    - the amount rounded to 2 decimal places.

    Malformed items do not raise. A missing or ``None`` ``item_name`` is
    compared as ``""``, and a missing or non-numeric ``item_amount`` is
    compared as ``None``. The item dicts themselves are never modified.

    Args:
        items: Line-item dicts; only ``item_name`` and ``item_amount`` are
            read. ``None`` is treated as an empty list.

    Returns:
        A new list holding the surviving items in their original order.
    """
    seen = set()
    out = []
    for it in items or []:
        # Normalize the name so case and spacing differences do not defeat
        # the comparison.
        name = str(it.get("item_name") or "").lower()
        name = re.sub(r"\s+", " ", name).strip()
        try:
            amount = round(float(it.get("item_amount")), 2)
        except (TypeError, ValueError):
            # Unparsable amount: still dedupe on the name, using a sentinel.
            amount = None
        key = (name[:80], amount)  # use first 80 chars to be safe
        if key in seen:
            continue
        seen.add(key)
        out.append(it)
    return out
|
| 264 |
|
| 265 |
+
def detect_subtotals_and_totals(rows_texts: List[str]) -> Dict[str, Any]:
    """Scan page-level row texts bottom-up for subtotal / final-total values.

    A row is considered only if ``TOTAL_KEYWORDS`` matches it. The first
    ``NUM_RE`` match in that row is parsed with ``normalize_num_str``. If the
    row contains "sub" (case-insensitive) the value is the subtotal;
    otherwise it is the final total. For each of the two, the value found
    closest to the bottom of the page wins.

    Args:
        rows_texts: Row strings in page order. ``None`` is treated as empty.

    Returns:
        ``{"subtotal": float | None, "final_total": float | None}``, each
        rounded to 2 decimals when found.
    """
    subtotal = None
    final_total = None
    # reversed() walks bottom-up without copying the list.
    for rt in reversed(rows_texts or []):
        if subtotal is not None and final_total is not None:
            # Both values are fixed; further rows cannot change the result.
            break
        if not rt or not rt.strip():
            continue
        if not TOTAL_KEYWORDS.search(rt):
            continue
        # NOTE(review): this takes the FIRST number in the line; a row such
        # as "Total (18% GST) 1180.00" would yield 18 -- confirm intended.
        m = NUM_RE.search(rt)
        if not m:
            continue
        val = normalize_num_str(m.group(0))
        if val is None:
            continue
        # Decide whether the row is a subtotal or the final total.
        if re.search(r"sub", rt, re.I):
            if subtotal is None:
                subtotal = float(round(val, 2))
        elif final_total is None:
            # No "sub" cue: treat as the final total.
            final_total = float(round(val, 2))
    return {"subtotal": subtotal, "final_total": final_total}
|
| 291 |
|
| 292 |
|
| 293 |
# ---------------- FALLBACK REGEX EXTRACTOR ----------------
|