Spaces:
Running
Running
fix(parser): detect Co-op via tagline, BALANCE DUE as total, strip inline qty, filter CREDIT/DEBIT
Browse files
- ocr/parser.py +20 -2
ocr/parser.py
CHANGED
|
@@ -30,6 +30,10 @@ _OCR_VARIANTS = {
|
|
| 30 |
"IIDL": "LIDL",
|
| 31 |
"COOP": "CO-OP",
|
| 32 |
"CO OP": "CO-OP",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
"M & S": "M&S",
|
| 34 |
"MARKS & SPENCER": "M&S",
|
| 35 |
}
|
|
@@ -53,7 +57,7 @@ _DATE_PATTERNS = [
|
|
| 53 |
_PRICE_EXTRACT_RE = re.compile(r"(-?)[£$€¥]?\s*(\d{1,6}[.,\s]\d{2})")
|
| 54 |
|
| 55 |
_TOTALS_KEYWORDS = re.compile(
|
| 56 |
-
r"\b(total|sub[\s-]?total|subtotal|savings|promotions|tax|gst|hst)\b",
|
| 57 |
re.IGNORECASE,
|
| 58 |
)
|
| 59 |
|
|
@@ -71,7 +75,7 @@ _EACH_RE = re.compile(r"\d\s*each\b", re.IGNORECASE)
|
|
| 71 |
_INLINE_QTY_RE = re.compile(r"^(\d+)\s*x\s*[£$€¥]?(\d+[.,]\d{2})$", re.IGNORECASE)
|
| 72 |
|
| 73 |
# Payment / non-item rows in the totals section
|
| 74 |
-
_PAYMENT_SKIP = re.compile(r"\b(cash|change|card|visa|mastercard|amex|contactless|clubcard)\b", re.IGNORECASE)
|
| 75 |
|
| 76 |
# Row merging tolerance in pixels
|
| 77 |
_ROW_Y_TOLERANCE = 30
|
|
@@ -390,6 +394,11 @@ def _match_known_retailer(header_rows: list[list[dict]]) -> tuple[str | None, fl
|
|
| 390 |
return text, _top_y(block["bbox"])
|
| 391 |
if text in _OCR_VARIANTS:
|
| 392 |
return _OCR_VARIANTS[text], _top_y(block["bbox"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
return None, 0.0
|
| 394 |
|
| 395 |
|
|
@@ -489,6 +498,13 @@ def _append_desc(item: dict, desc_blocks: list[dict], receipt_width: float) -> N
|
|
| 489 |
if _is_qty_block(b, receipt_width):
|
| 490 |
item["quantity"] = int(text)
|
| 491 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
if _DISCOUNT_PREFIX_RE.match(text):
|
| 493 |
continue
|
| 494 |
if _EACH_RE.search(text):
|
|
@@ -550,6 +566,8 @@ def _extract_totals(totals_rows: list[list[dict]]) -> dict[str, str | None]:
|
|
| 550 |
result["subtotal"] = result["subtotal"] or price
|
| 551 |
elif "total" in row_text and "sub" not in row_text and "card" not in row_text:
|
| 552 |
result["total"] = result["total"] or price
|
|
|
|
|
|
|
| 553 |
elif result["total"] is None and not price.startswith("-"):
|
| 554 |
# Standalone positive price with no keyword — treat as total if
|
| 555 |
# not yet set (handles receipts where OCR misses the "TOTAL" text).
|
|
|
|
| 30 |
"IIDL": "LIDL",
|
| 31 |
"COOP": "CO-OP",
|
| 32 |
"CO OP": "CO-OP",
|
| 33 |
+
"OWNED BY YOU": "CO-OP",
|
| 34 |
+
"OWNED BY YOU.": "CO-OP",
|
| 35 |
+
"RIGHT BY YOU": "CO-OP",
|
| 36 |
+
"RIGHT BY YOU.": "CO-OP",
|
| 37 |
"M & S": "M&S",
|
| 38 |
"MARKS & SPENCER": "M&S",
|
| 39 |
}
|
|
|
|
| 57 |
_PRICE_EXTRACT_RE = re.compile(r"(-?)[£$€¥]?\s*(\d{1,6}[.,\s]\d{2})")
|
| 58 |
|
| 59 |
_TOTALS_KEYWORDS = re.compile(
|
| 60 |
+
r"\b(total|sub[\s-]?total|subtotal|savings|promotions|tax|gst|hst|balance)\b",
|
| 61 |
re.IGNORECASE,
|
| 62 |
)
|
| 63 |
|
|
|
|
| 75 |
_INLINE_QTY_RE = re.compile(r"^(\d+)\s*x\s*[£$€¥]?(\d+[.,]\d{2})$", re.IGNORECASE)
|
| 76 |
|
| 77 |
# Payment / non-item rows in the totals section
|
| 78 |
+
_PAYMENT_SKIP = re.compile(r"\b(cash|change|card|visa|mastercard|amex|contactless|clubcard|credit|debit)\b", re.IGNORECASE)
|
| 79 |
|
| 80 |
# Row merging tolerance in pixels
|
| 81 |
_ROW_Y_TOLERANCE = 30
|
|
|
|
| 394 |
return text, _top_y(block["bbox"])
|
| 395 |
if text in _OCR_VARIANTS:
|
| 396 |
return _OCR_VARIANTS[text], _top_y(block["bbox"])
|
| 397 |
+
row_text = " ".join(b["text"].strip() for b in row).strip().upper()
|
| 398 |
+
if row_text in _KNOWN_RETAILERS:
|
| 399 |
+
return row_text, min(_top_y(b["bbox"]) for b in row)
|
| 400 |
+
if row_text in _OCR_VARIANTS:
|
| 401 |
+
return _OCR_VARIANTS[row_text], min(_top_y(b["bbox"]) for b in row)
|
| 402 |
return None, 0.0
|
| 403 |
|
| 404 |
|
|
|
|
| 498 |
if _is_qty_block(b, receipt_width):
|
| 499 |
item["quantity"] = int(text)
|
| 500 |
continue
|
| 501 |
+
# Co-op style: qty embedded in description ("1 BATCH S/NOODLE B")
|
| 502 |
+
if not item["description"] and not parts:
|
| 503 |
+
m_qty = re.match(r"^(\d{1,2})\s+([A-Za-z].+)$", text)
|
| 504 |
+
if m_qty:
|
| 505 |
+
item["quantity"] = int(m_qty.group(1))
|
| 506 |
+
parts.append(m_qty.group(2).strip())
|
| 507 |
+
continue
|
| 508 |
if _DISCOUNT_PREFIX_RE.match(text):
|
| 509 |
continue
|
| 510 |
if _EACH_RE.search(text):
|
|
|
|
| 566 |
result["subtotal"] = result["subtotal"] or price
|
| 567 |
elif "total" in row_text and "sub" not in row_text and "card" not in row_text:
|
| 568 |
result["total"] = result["total"] or price
|
| 569 |
+
elif "balance" in row_text:
|
| 570 |
+
result["total"] = result["total"] or price
|
| 571 |
elif result["total"] is None and not price.startswith("-"):
|
| 572 |
# Standalone positive price with no keyword — treat as total if
|
| 573 |
# not yet set (handles receipts where OCR misses the "TOTAL" text).
|