sinful1 committed on
Commit
bb788ea
·
1 Parent(s): 3a00f27

fix(parser): detect Co-op via tagline, BALANCE DUE as total, strip inline qty, filter CREDIT/DEBIT

Browse files
Files changed (1) hide show
  1. ocr/parser.py +20 -2
ocr/parser.py CHANGED
@@ -30,6 +30,10 @@ _OCR_VARIANTS = {
30
  "IIDL": "LIDL",
31
  "COOP": "CO-OP",
32
  "CO OP": "CO-OP",
 
 
 
 
33
  "M & S": "M&S",
34
  "MARKS & SPENCER": "M&S",
35
  }
@@ -53,7 +57,7 @@ _DATE_PATTERNS = [
53
  _PRICE_EXTRACT_RE = re.compile(r"(-?)[£$€¥]?\s*(\d{1,6}[.,\s]\d{2})")
54
 
55
  _TOTALS_KEYWORDS = re.compile(
56
- r"\b(total|sub[\s-]?total|subtotal|savings|promotions|tax|gst|hst)\b",
57
  re.IGNORECASE,
58
  )
59
 
@@ -71,7 +75,7 @@ _EACH_RE = re.compile(r"\d\s*each\b", re.IGNORECASE)
71
  _INLINE_QTY_RE = re.compile(r"^(\d+)\s*x\s*[£$€¥]?(\d+[.,]\d{2})$", re.IGNORECASE)
72
 
73
  # Payment / non-item rows in the totals section
74
- _PAYMENT_SKIP = re.compile(r"\b(cash|change|card|visa|mastercard|amex|contactless|clubcard)\b", re.IGNORECASE)
75
 
76
  # Row merging tolerance in pixels
77
  _ROW_Y_TOLERANCE = 30
@@ -390,6 +394,11 @@ def _match_known_retailer(header_rows: list[list[dict]]) -> tuple[str | None, fl
390
  return text, _top_y(block["bbox"])
391
  if text in _OCR_VARIANTS:
392
  return _OCR_VARIANTS[text], _top_y(block["bbox"])
 
 
 
 
 
393
  return None, 0.0
394
 
395
 
@@ -489,6 +498,13 @@ def _append_desc(item: dict, desc_blocks: list[dict], receipt_width: float) -> N
489
  if _is_qty_block(b, receipt_width):
490
  item["quantity"] = int(text)
491
  continue
 
 
 
 
 
 
 
492
  if _DISCOUNT_PREFIX_RE.match(text):
493
  continue
494
  if _EACH_RE.search(text):
@@ -550,6 +566,8 @@ def _extract_totals(totals_rows: list[list[dict]]) -> dict[str, str | None]:
550
  result["subtotal"] = result["subtotal"] or price
551
  elif "total" in row_text and "sub" not in row_text and "card" not in row_text:
552
  result["total"] = result["total"] or price
 
 
553
  elif result["total"] is None and not price.startswith("-"):
554
  # Standalone positive price with no keyword — treat as total if
555
  # not yet set (handles receipts where OCR misses the "TOTAL" text).
 
30
  "IIDL": "LIDL",
31
  "COOP": "CO-OP",
32
  "CO OP": "CO-OP",
33
+ "OWNED BY YOU": "CO-OP",
34
+ "OWNED BY YOU.": "CO-OP",
35
+ "RIGHT BY YOU": "CO-OP",
36
+ "RIGHT BY YOU.": "CO-OP",
37
  "M & S": "M&S",
38
  "MARKS & SPENCER": "M&S",
39
  }
 
57
  _PRICE_EXTRACT_RE = re.compile(r"(-?)[£$€¥]?\s*(\d{1,6}[.,\s]\d{2})")
58
 
59
  _TOTALS_KEYWORDS = re.compile(
60
+ r"\b(total|sub[\s-]?total|subtotal|savings|promotions|tax|gst|hst|balance)\b",
61
  re.IGNORECASE,
62
  )
63
 
 
75
  _INLINE_QTY_RE = re.compile(r"^(\d+)\s*x\s*[£$€¥]?(\d+[.,]\d{2})$", re.IGNORECASE)
76
 
77
  # Payment / non-item rows in the totals section
78
+ _PAYMENT_SKIP = re.compile(r"\b(cash|change|card|visa|mastercard|amex|contactless|clubcard|credit|debit)\b", re.IGNORECASE)
79
 
80
  # Row merging tolerance in pixels
81
  _ROW_Y_TOLERANCE = 30
 
394
  return text, _top_y(block["bbox"])
395
  if text in _OCR_VARIANTS:
396
  return _OCR_VARIANTS[text], _top_y(block["bbox"])
397
+ row_text = " ".join(b["text"].strip() for b in row).strip().upper()
398
+ if row_text in _KNOWN_RETAILERS:
399
+ return row_text, min(_top_y(b["bbox"]) for b in row)
400
+ if row_text in _OCR_VARIANTS:
401
+ return _OCR_VARIANTS[row_text], min(_top_y(b["bbox"]) for b in row)
402
  return None, 0.0
403
 
404
 
 
498
  if _is_qty_block(b, receipt_width):
499
  item["quantity"] = int(text)
500
  continue
501
+ # Co-op style: qty embedded in description ("1 BATCH S/NOODLE B")
502
+ if not item["description"] and not parts:
503
+ m_qty = re.match(r"^(\d{1,2})\s+([A-Za-z].+)$", text)
504
+ if m_qty:
505
+ item["quantity"] = int(m_qty.group(1))
506
+ parts.append(m_qty.group(2).strip())
507
+ continue
508
  if _DISCOUNT_PREFIX_RE.match(text):
509
  continue
510
  if _EACH_RE.search(text):
 
566
  result["subtotal"] = result["subtotal"] or price
567
  elif "total" in row_text and "sub" not in row_text and "card" not in row_text:
568
  result["total"] = result["total"] or price
569
+ elif "balance" in row_text:
570
+ result["total"] = result["total"] or price
571
  elif result["total"] is None and not price.startswith("-"):
572
  # Standalone positive price with no keyword — treat as total if
573
  # not yet set (handles receipts where OCR misses the "TOTAL" text).