File size: 22,701 Bytes
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb788ea
 
 
 
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb788ea
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb788ea
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb788ea
 
 
 
 
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb788ea
 
 
 
 
 
 
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb788ea
 
619b919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
"""
Stage 2: Parse raw OCR text blocks into structured receipt data.

Uses spatial layout (bounding box coordinates) to reconstruct receipt rows,
rather than relying on single-line regex matching. Receipts have a consistent
column layout:
  - Far left:  quantity (single digit)
  - Middle:    item description
  - Far right: price

The parser groups blocks into rows by y-coordinate proximity, classifies
columns by x-position, then extracts structured fields.
"""

import re
from typing import Any

# Upper-cased UK retailer names; header text is compared against these
# verbatim (exact match, no fuzzy logic).
_KNOWN_RETAILERS = {
    "TESCO",
    "ASDA",
    "ALDI",
    "SAINSBURY'S",
    "SAINSBURYS",
    "MORRISONS",
    "WAITROSE",
    "COSTCO",
    "ICELAND",
    "SPAR",
    "NISA",
    "BOOTHS",
    "LIDL",
    "CO-OP",
    "M&S",
    "BUDGENS",
    "LONDIS",
    "ONE STOP",
}

# Common OCR misreads of stylized logos → canonical retailer name.
# Written grouped by the canonical name; the comprehension flattens the
# groups into a plain {misread: canonical} dict.
_OCR_VARIANTS = {
    misread: canonical
    for canonical, misreads in (
        ("LIDL", ("LODZ", "LIOL", "LDL", "IIDL")),
        (
            "CO-OP",
            (
                "COOP",
                "CO OP",
                "OWNED BY YOU",
                "OWNED BY YOU.",
                "RIGHT BY YOU",
                "RIGHT BY YOU.",
            ),
        ),
        ("M&S", ("M & S", "MARKS & SPENCER")),
    )
    for misread in misreads
}


# ---------------------------------------------------------------------------
# Compiled patterns
# ---------------------------------------------------------------------------

# Date shapes tried in order: ISO-like (YYYY-MM-DD or YYYY/MM/DD), numeric
# day-first/month-first with 2-4 digit year, and "12 Mar 2024" style.
_DATE_PATTERNS = [
    re.compile(r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b"),
    re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"),
    re.compile(
        r"\b(\d{1,2}\s+"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
]

# Group 1: optional leading minus. Group 2: the amount, whose decimal
# separator may be ".", "," or whitespace (OCR often drops the dot).
# The optional currency symbol must be one of £ $ € ¥ so that a minus sign
# written before the symbol ("-£6.70") is still captured in group 1.
_PRICE_EXTRACT_RE = re.compile(r"(-?)[£$€¥]?\s*(\d{1,6}[.,\s]\d{2})")

# Words that mark a row as belonging to the totals section.
_TOTALS_KEYWORDS = re.compile(
    r"\b(total|sub[\s-]?total|subtotal|savings|promotions|tax|gst|hst|balance)\b",
    re.IGNORECASE,
)

# Header fragments (URLs, VAT number, customer-service blurbs) that must not
# be taken as the store location.
_HEADER_SKIP = re.compile(
    r"(www\.|\.com|\.co\.uk|vat\s*(?:no\.?|number)|questions?\s*please|please\s*visit|store.locator)",
    re.IGNORECASE,
)

# Text starting with "Cc" (any case) is treated as a Clubcard marker.
_DISCOUNT_PREFIX_RE = re.compile(r"^Cc", re.IGNORECASE)

# "1.90 each" or "3.00each" — informational price-per-unit lines, not items
_EACH_RE = re.compile(r"\d\s*each\b", re.IGNORECASE)

# Inline quantity: "2x0.13", "3 x £0.22" — Lidl-style qty on the description line
_INLINE_QTY_RE = re.compile(r"^(\d+)\s*x\s*[£$€¥]?(\d+[.,]\d{2})$", re.IGNORECASE)

# Payment / non-item rows in the totals section
_PAYMENT_SKIP = re.compile(r"\b(cash|change|card|visa|mastercard|amex|contactless|clubcard|credit|debit)\b", re.IGNORECASE)

# Row merging tolerance in pixels
_ROW_Y_TOLERANCE = 30

# Minimum ratio of "readable" characters (letters, digits, spaces, common punct)
# to filter out ghost text from receipt backs
# NOTE(review): no code in this part of the module reads this constant —
# confirm it is still used elsewhere before relying on it.
_MIN_READABLE_RATIO = 0.6


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def parse_blocks(blocks: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Turn OCR blocks (sorted top-to-bottom) into a structured receipt dict.

    Each block: { "text": str, "confidence": float, "bbox": [...] }

    The result always carries the keys merchant_name, store_location, date,
    line_items, subtotal, savings and total. When ``blocks`` is empty, or no
    block passes the readability filter, every field is empty/None.
    """
    readable = [blk for blk in blocks if _is_readable(blk["text"])] if blocks else []
    if not readable:
        return _empty_result()

    rows = _build_rows(readable)
    width = _estimate_receipt_width(readable)

    receipt_date = _extract_date(readable)
    header_end, totals_start = _find_sections(rows, width)

    merchant, location = _extract_header(rows[:header_end])

    body_rows = rows[header_end:totals_start]
    # On Tesco receipts, ghost text above the first item can drag the row
    # anchor upward so that the item's qty/description land in the last
    # "header" row while its price sits in the first item row. When that
    # last header row holds a quantity block, fold it into the first item
    # row and restore left-to-right order.
    if body_rows and header_end > 0:
        orphan = rows[header_end - 1]
        if any(_is_qty_block(blk, width) for blk in orphan):
            first, *rest = body_rows
            merged = sorted(
                [*orphan, *first], key=lambda blk: _left_x(blk["bbox"])
            )
            body_rows = [merged, *rest]

    items = _extract_line_items(body_rows, width)
    totals = _extract_totals(rows[totals_start:])

    return {
        "merchant_name": merchant,
        "store_location": location,
        "date": receipt_date,
        "line_items": items,
        "subtotal": totals.get("subtotal"),
        "savings": totals.get("savings"),
        "total": totals.get("total"),
    }


def _empty_result() -> dict[str, Any]:
    """Return a fresh result dict with every field None and no line items."""
    field_names = (
        "merchant_name",
        "store_location",
        "date",
        "line_items",
        "subtotal",
        "savings",
        "total",
    )
    blank: dict[str, Any] = dict.fromkeys(field_names)
    # A new list per call so callers can never share one mutable list.
    blank["line_items"] = []
    return blank


# ---------------------------------------------------------------------------
# Text quality filter
# ---------------------------------------------------------------------------

def _is_readable(text: str) -> bool:
    """
    Decide whether an OCR string looks like genuine receipt text.

    Rejects the garbage that bleeds through from the back of a receipt
    ("ghost text"): jumbled characters, digit/letter mixes without spaces,
    and consonant-heavy strings. Accepts single digits (possible quantity),
    anything containing a price, short alphabetic or numeric tokens, and
    longer text with a plausible vowel/consonant balance.
    """
    candidate = text.strip() if text else ""
    if not candidate:
        return False

    # One character: only a digit is useful (quantity column).
    if len(candidate) == 1:
        return candidate.isdigit()

    # Anything that contains a price is kept unconditionally.
    if _PRICE_EXTRACT_RE.search(candidate):
        return True

    # Two or three characters: keep "Cc"-like letter tokens and pure digits.
    if len(candidate) <= 3:
        return re.fullmatch(r"[A-Za-z]{2,3}|[0-9]+", candidate) is not None

    letter_count = sum(ch.isalpha() for ch in candidate)
    if letter_count == 0:
        # Numbers/symbols only: keep when at least one digit is present.
        return re.search(r"\d", candidate) is not None

    ascii_words = re.findall(r"[A-Za-z]+", candidate)
    if not ascii_words:
        return False

    # Ghost text such as "68T3-uoniqAoncguaGAon": long letter runs, digits
    # mixed in, and no spaces. Pure-letter words like "Cornflakes" or
    # "BakedBeansTomSauce" do not trip this because they contain no digit.
    mean_word_len = sum(map(len, ascii_words)) / len(ascii_words)
    if (
        mean_word_len > 8
        and " " not in candidate
        and re.search(r"\d", candidate)
        and letter_count > 5
    ):
        return False

    # Consonant density: ghost text shows unusual consonant clusters.
    lowered = candidate.lower()
    consonant_count = sum(ch in "bcdfghjklmnpqrstvwxyz" for ch in lowered)
    vowel_count = sum(ch in "aeiou" for ch in lowered)

    if vowel_count == 0:
        # No vowels at all: tolerated only for very short letter content.
        return letter_count <= 3
    return consonant_count / (vowel_count + consonant_count) <= 0.80


# ---------------------------------------------------------------------------
# Row building — group blocks by y-coordinate proximity
# ---------------------------------------------------------------------------

def _top_y(bbox: list) -> float:
    """
    Return the smallest y value among the points of ``bbox``.

    ``bbox`` is read as an iterable of points where index 1 is the y
    coordinate. Falls back to 0.0 when the box is unusable: not iterable,
    points that cannot be indexed at position 1, or an empty box.
    """
    try:
        # default= covers the empty-box case, where min() would otherwise
        # raise ValueError and escape the fallback.
        return min((pt[1] for pt in bbox), default=0.0)
    except (TypeError, IndexError):
        return 0.0


def _left_x(bbox: list) -> float:
    """
    Return the smallest x value among the points of ``bbox``.

    ``bbox`` is read as an iterable of points where index 0 is the x
    coordinate. Falls back to 0.0 when the box is unusable: not iterable,
    points that cannot be indexed, or an empty box.
    """
    try:
        # default= covers the empty-box case, where min() would otherwise
        # raise ValueError and escape the fallback.
        return min((pt[0] for pt in bbox), default=0.0)
    except (TypeError, IndexError):
        return 0.0


def _build_rows(blocks: list[dict]) -> list[list[dict]]:
    """
    Cluster blocks into visual rows.

    Blocks are walked in ascending top-y order. A block joins the open row
    while its top-y stays within ``_ROW_Y_TOLERANCE`` of the row's anchor,
    which is the top-y of the first block placed in that row; otherwise the
    open row is closed and the block starts a new one. Every returned row is
    ordered left-to-right.
    """
    if not blocks:
        return []

    def by_left_edge(blk: dict) -> float:
        return _left_x(blk["bbox"])

    rows: list[list[dict]] = []
    pending: list[dict] = []
    anchor_y = None

    for blk in sorted(blocks, key=lambda b: _top_y(b["bbox"])):
        y = _top_y(blk["bbox"])
        # Too far below the anchor: close the open row.
        if pending and not abs(y - anchor_y) <= _ROW_Y_TOLERANCE:
            rows.append(sorted(pending, key=by_left_edge))
            pending = []
        if not pending:
            # The anchor is fixed by the first block and never drifts.
            anchor_y = y
        pending.append(blk)

    rows.append(sorted(pending, key=by_left_edge))
    return rows


# ---------------------------------------------------------------------------
# Receipt width estimation and column classification
# ---------------------------------------------------------------------------

def _estimate_receipt_width(blocks: list[dict]) -> float:
    """
    Estimate the receipt width as the largest x coordinate of any bbox point.

    Malformed geometry is skipped rather than raised, in line with how
    ``_top_y`` / ``_left_x`` treat bad boxes: a bbox that is not iterable
    (e.g. None) is ignored as a whole, and single points that cannot be
    indexed or converted to float are ignored individually.

    Returns 1000.0 when no positive x coordinate is found.
    """
    max_x = 0.0
    for block in blocks:
        try:
            points = list(block["bbox"])
        except TypeError:
            # bbox is None or otherwise not iterable: nothing to measure.
            continue
        for pt in points:
            try:
                max_x = max(max_x, float(pt[0]))
            except (TypeError, IndexError, ValueError):
                # Unindexable point or non-numeric x value.
                pass
    return max_x if max_x > 0 else 1000.0


def _is_price_block(block: dict, receipt_width: float) -> bool:
    """True when the block starts right of 70% of the width and holds a price."""
    in_price_column = _left_x(block["bbox"]) > receipt_width * 0.70
    if not in_price_column:
        return False
    return _PRICE_EXTRACT_RE.search(block["text"]) is not None


def _is_qty_block(block: dict, receipt_width: float) -> bool:
    """True when the block starts left of 12% of the width and is exactly one digit."""
    in_qty_column = _left_x(block["bbox"]) < receipt_width * 0.12
    if not in_qty_column:
        return False
    return bool(re.fullmatch(r"\d", block["text"].strip()))


def _row_has_right_price(row: list[dict], receipt_width: float) -> bool:
    """Report whether at least one block of the row is a right-column price."""
    for candidate in row:
        if _is_price_block(candidate, receipt_width):
            return True
    return False


# ---------------------------------------------------------------------------
# Section detection
# ---------------------------------------------------------------------------

def _find_sections(rows: list[list[dict]], receipt_width: float) -> tuple[int, int]:
    """
    Locate the header/items and items/totals boundaries.

    Returns (header_end_index, totals_start_index):
      - header_end is the index of the first row holding a right-column
        price, or 0 when no row has one.
      - totals_start is the index of the first row at or after header_end
        whose text contains a totals keyword. If none exists, the first
        unusually large vertical gap between rows is used instead; if that
        fails too, it stays at len(rows).
    """
    row_count = len(rows)

    header_end = next(
        (
            idx
            for idx, row in enumerate(rows)
            if _row_has_right_price(row, receipt_width)
        ),
        0,
    )

    # Primary: keyword detection ("Subtotal:", "TOTAL:", etc.)
    totals_start = next(
        (
            idx
            for idx in range(header_end, row_count)
            if _TOTALS_KEYWORDS.search(" ".join(b["text"] for b in rows[idx]))
        ),
        row_count,
    )

    # Fallback for receipts where OCR missed the "TOTAL" text (e.g. Lidl's
    # dashed separator): look for a vertical gap well above the typical
    # spacing between rows.
    if totals_start == row_count and row_count > header_end + 2:
        gaps = []
        for upper, lower in zip(rows[header_end:], rows[header_end + 1:]):
            upper_y = max(_top_y(b["bbox"]) for b in upper)
            lower_y = min(_top_y(b["bbox"]) for b in lower)
            gaps.append(lower_y - upper_y)

        mean_gap = sum(gaps) / len(gaps)
        threshold = max(mean_gap * 1.8, 60)
        # gaps[k] sits between rows header_end + k and header_end + k + 1,
        # so the row *below* the gap has index header_end + 1 + k.
        for below_idx, gap in enumerate(gaps, start=header_end + 1):
            if gap > threshold:
                totals_start = below_idx
                break

    return header_end, totals_start


# ---------------------------------------------------------------------------
# Header extraction
# ---------------------------------------------------------------------------

def _extract_header(header_rows: list[list[dict]]) -> tuple[str | None, str | None]:
    """
    Pull the merchant name and store location out of the header rows.

    The merchant comes from ``_match_known_retailer`` (exact retailer names
    plus OCR variants such as "LODZ" → "LIDL"). Without a recognised
    retailer the result is (None, None) so the app layer can ask the user.

    The store location is the highest-confidence qualifying block in the
    first row strictly below the retailer match that has any qualifying
    block; None when no row qualifies.
    """
    retailer, retailer_y = _match_known_retailer(header_rows)
    if not retailer:
        return None, None

    def is_location_candidate(blk: dict) -> bool:
        raw = blk["text"]
        label = raw.strip()
        if len(label) < 3:
            return False
        if _HEADER_SKIP.search(raw):
            return False
        if any(pattern.search(raw) for pattern in _DATE_PATTERNS):
            return False
        upper = label.upper()
        if upper == retailer or upper in _OCR_VARIANTS:
            return False
        return blk["confidence"] >= 0.85

    for row in header_rows:
        # Skip everything at or above the logo: a curled thermal receipt can
        # expose ghost text from its back up there, and that text often
        # survives the readability filter.
        if min(_top_y(b["bbox"]) for b in row) <= retailer_y:
            continue
        candidates = [b for b in row if is_location_candidate(b)]
        if candidates:
            most_confident = max(candidates, key=lambda b: b["confidence"])
            return retailer, most_confident["text"].strip()

    return retailer, None


def _match_known_retailer(header_rows: list[list[dict]]) -> tuple[str | None, float]:
    """
    Find the first header text that names a known retailer.

    Within each row, individual blocks are tested first; then the row's
    block texts joined with single spaces are tested as one string. Texts
    are stripped and upper-cased before lookup, and a direct retailer name
    takes precedence over an OCR variant.

    Returns (retailer_name, y_of_match), where y is the top of the matching
    block or of the whole row for a joined match, or (None, 0.0).
    """

    def canonical(label: str) -> str | None:
        if label in _KNOWN_RETAILERS:
            return label
        if label in _OCR_VARIANTS:
            return _OCR_VARIANTS[label]
        return None

    for row in header_rows:
        for blk in row:
            name = canonical(blk["text"].strip().upper())
            if name is not None:
                return name, _top_y(blk["bbox"])

        joined = " ".join(b["text"].strip() for b in row).strip().upper()
        name = canonical(joined)
        if name is not None:
            return name, min(_top_y(b["bbox"]) for b in row)

    return None, 0.0


# ---------------------------------------------------------------------------
# Date extraction
# ---------------------------------------------------------------------------

def _extract_date(blocks: list[dict]) -> str | None:
    """
    Return the first date string found, or None.

    Blocks are scanned in the given order; for each block the patterns in
    ``_DATE_PATTERNS`` are tried in list order. The matched text is returned
    as-is, without reformatting.
    """
    attempts = (
        pattern.search(block["text"])
        for block in blocks
        for pattern in _DATE_PATTERNS
    )
    hit = next((m for m in attempts if m), None)
    return hit.group(1) if hit else None


# ---------------------------------------------------------------------------
# Line item extraction (price-anchored)
# ---------------------------------------------------------------------------

def _extract_line_items(
    item_rows: list[list[dict]], receipt_width: float
) -> list[dict[str, Any]]:
    """
    Build line items from the item rows, anchored on right-column prices.

    Rows are processed top-to-bottom:
      - No price in the row: its blocks continue the item currently open
        (and are dropped if there is none).
      - Negative price (or a positive price flagged as a Clubcard discount
        by a "Cc <price>" block while the open item has no discount yet):
        the row is merged into the open item and the price becomes its
        discount unless one is already recorded.
      - Otherwise the price opens a new item whose description starts from
        the row's remaining blocks. A negative price with no open item also
        opens a new item (refund/return).

    Each item has the keys description, quantity, unit_price, total_price
    and discount.
    """
    items: list[dict[str, Any]] = []
    open_item: dict[str, Any] | None = None

    for row in item_rows:
        # Right-most price block wins when a row has more than one.
        price_block = None
        for candidate in reversed(row):
            if _is_price_block(candidate, receipt_width):
                price_block = candidate
                break
        others = [b for b in row if b is not price_block]

        if price_block is None:
            if open_item is not None:
                _append_desc(open_item, others, receipt_width)
            continue

        amount = _normalise_price(price_block["text"])
        is_negative = amount.startswith("-")

        # Clubcard discounts can be printed without a minus sign; the "Cc"
        # marker in the same row is what identifies them.
        if (
            not is_negative
            and open_item is not None
            and open_item["discount"] is None
            and any(_is_cc_discount_indicator(b["text"]) for b in others)
        ):
            amount = f"-{amount}"
            is_negative = True

        if is_negative and open_item is not None:
            _append_desc(open_item, others, receipt_width)
            if open_item["discount"] is None:
                # First discount wins.
                open_item["discount"] = amount
            continue

        open_item = {
            "description": None,
            "quantity": 1,
            "unit_price": amount,
            "total_price": amount,
            "discount": None,
        }
        _append_desc(open_item, others, receipt_width)
        items.append(open_item)

    return items


def _is_desc_block(block: dict, receipt_width: float) -> bool:
    """
    True when the block's left edge lies left of 65% of the receipt width.

    Blocks starting at or beyond that mark are rejected whatever their
    text; the block's length is not consulted.
    NOTE(review): the cut-off is deliberately or accidentally below the 70%
    boundary used by ``_is_price_block`` — confirm the 65-70% band is meant
    to be excluded entirely.
    """
    left_edge = _left_x(block["bbox"])
    cutoff = receipt_width * 0.65
    return left_edge < cutoff


def _append_desc(item: dict, desc_blocks: list[dict], receipt_width: float) -> None:
    """
    Fold a row's non-price blocks into ``item`` (mutated in place).

    Per block, in order:
      - a left-column single digit sets the quantity;
      - while the description is still empty, "N text" (1-2 digit prefix)
        sets the quantity to N and starts the description with the rest
        (Co-op style, e.g. "1 BATCH S/NOODLE B");
      - "Cc..." markers and "... each" unit-price notes are ignored;
      - an inline "2x0.13" style block sets the quantity;
      - any other block is appended to the description if it sits in the
        description column.

    Finally the description is re-joined (None when empty) and unit_price
    is recomputed from total_price and quantity.
    """
    existing = item["description"]
    fragments: list[str] = [existing] if existing else []

    for blk in desc_blocks:
        label = blk["text"].strip()

        if _is_qty_block(blk, receipt_width):
            item["quantity"] = int(label)
            continue

        # Only the very first description fragment may carry a leading qty.
        if not fragments:
            leading = re.match(r"^(\d{1,2})\s+([A-Za-z].+)$", label)
            if leading is not None:
                qty_text, remainder = leading.groups()
                item["quantity"] = int(qty_text)
                fragments.append(remainder.strip())
                continue

        if _DISCOUNT_PREFIX_RE.match(label) or _EACH_RE.search(label):
            continue

        inline = _INLINE_QTY_RE.match(label)
        if inline:
            item["quantity"] = int(inline.group(1))
        elif _is_desc_block(blk, receipt_width):
            fragments.append(label)

    item["description"] = " ".join(fragments).strip() or None
    item["unit_price"] = _calc_unit_price(item["total_price"], item["quantity"])


# ---------------------------------------------------------------------------
# Totals extraction
# ---------------------------------------------------------------------------

def _rightmost_price(row: list[dict]) -> str | None:
    """Return the normalised price of the right-most block in ``row`` that holds one, else None."""
    for block in sorted(row, key=lambda b: _left_x(b["bbox"]), reverse=True):
        if _PRICE_EXTRACT_RE.search(block["text"]):
            return _normalise_price(block["text"])
    return None


def _extract_totals(totals_rows: list[list[dict]]) -> dict[str, str | None]:
    """
    Extract subtotal, savings, and total from the totals section.

    Rows without a price and payment rows (CASH, CHANGE, CARD, ...) are
    ignored. For each remaining row the first matching rule applies, and
    the first value found for a field is kept:
      - "saving"/"promotion"  -> savings (a negative price on the next row
        is preferred when this row's price is not negative);
      - "subtotal", "sub total", "sub-total" -> subtotal;
      - "total" (without "sub"/"card") or "balance" -> total;
      - tax/GST/HST rows -> ignored (there is no tax field, and a tax
        amount must not be mistaken for the total);
      - any other positive price -> total, if none has been set yet.

    Returns a dict with the keys "subtotal", "savings" and "total"; values
    are normalised price strings or None.
    """
    result: dict[str, str | None] = {"subtotal": None, "savings": None, "total": None}

    for ri, row in enumerate(totals_rows):
        row_text = " ".join(b["text"] for b in row).strip().lower()

        price = _rightmost_price(row)
        if price is None:
            continue

        # Skip payment rows (CASH, CHANGE, CARD, etc.)
        if _PAYMENT_SKIP.search(row_text):
            continue

        if "saving" in row_text or "promotion" in row_text:
            # Sometimes OCR splits "Savings: -£6.70" into two rows; pick up
            # a standalone negative price from the following row.
            if not price.startswith("-") and ri + 1 < len(totals_rows):
                following = _rightmost_price(totals_rows[ri + 1])
                if following is not None and following.startswith("-"):
                    price = following
            result["savings"] = result["savings"] or price
        elif re.search(r"sub[\s-]?total", row_text):
            # Same spellings that _TOTALS_KEYWORDS accepts, so "Sub-total"
            # no longer slips through to the keyword-less fallback below.
            result["subtotal"] = result["subtotal"] or price
        elif "total" in row_text and "sub" not in row_text and "card" not in row_text:
            result["total"] = result["total"] or price
        elif "balance" in row_text:
            result["total"] = result["total"] or price
        elif re.search(r"\b(?:tax|gst|hst)\b", row_text):
            # A labelled tax row is not a keyword-less price; letting it
            # reach the fallback would record the tax amount as the total.
            continue
        elif result["total"] is None and not price.startswith("-"):
            # Standalone positive price with no keyword — treat as total if
            # not yet set (handles receipts where OCR misses the "TOTAL"
            # text). Negative standalone prices are savings/discounts.
            result["total"] = price

    return result


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _normalise_price(text: str) -> str:
    """
    Extract the first price in ``text`` and return it as "[-]digits.dd".

    The currency symbol is dropped and the decimal separator, which the
    price pattern allows to be ".", "," or any whitespace character, is
    rewritten to ".". For example "-£6,70" becomes "-6.70" and "1 50"
    becomes "1.50". When ``text`` contains no price, the stripped text is
    returned unchanged.
    """
    m = _PRICE_EXTRACT_RE.search(text)
    if not m:
        return text.strip()
    sign, amount = m.groups()
    # `amount` holds exactly one separator. Map every allowed form to "."
    # — including tab, newline and non-breaking space, which would otherwise
    # be deleted and turn "1<TAB>50" into "150".
    return sign + re.sub(r"[,\s]", ".", amount)


def _is_cc_discount_indicator(text: str) -> bool:
    """
    True when ``text`` is a Clubcard discount marker.

    A marker is "Cc" (any case) followed by a price and at most two other
    leftover characters. This distinguishes "Cc £2.25" (discount) from
    "Cc Any 3 For 2" (promotion label).
    """
    text = text.strip()
    if not _DISCOUNT_PREFIX_RE.match(text):
        return False
    after_cc = text[2:].strip()
    if not re.search(r"\d", after_cc):
        return False
    # Remove the price, then any stray currency symbols, digits, separators
    # and whitespace; whatever is left counts as "other words".
    cleaned = _PRICE_EXTRACT_RE.sub("", after_cc)
    cleaned = re.sub(r"[£$€¥\d.,\s]", "", cleaned)
    return len(cleaned) <= 2


def _calc_unit_price(total_price: str, quantity: int) -> str:
    """
    Derive the per-unit price string from a line total and a quantity.

    For a quantity of 1 or less the total is returned unchanged. Otherwise
    the total is divided by the quantity and formatted with exactly two
    decimals, so "3.00" / 2 gives "1.50" and keeps the same shape as the
    other price strings. If ``total_price`` is not a parseable number it is
    returned unchanged.
    """
    if quantity <= 1:
        return total_price
    try:
        return f"{float(total_price) / quantity:.2f}"
    except (ValueError, ZeroDivisionError):
        return total_price