File size: 9,334 Bytes
b0bec61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
"""
extractor.py — Regex-based field parser for the Bill/Invoice Scanner.

Responsibilities:
- extract_vendor(): find the company/vendor name from raw OCR text
- extract_date(): find the invoice date in multiple date formats
- extract_invoice_number(): find the invoice/bill reference number
- extract_amounts(): find subtotal, GST/tax, and total amounts
- parse_invoice(): master function — calls all above, returns single dict

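Every function accepts a raw text string. The single-field extractors return the extracted value or None; extract_amounts() and parse_invoice() return dicts whose values may be None.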
No imports from other project modules — this module is self-contained.
"""

from __future__ import annotations
import re


# ---------------------------------------------------------------------------
# Compiled regex patterns (compile once at module load for performance)
# ---------------------------------------------------------------------------

# Generic document-title lines, stored lower-case. A line whose lower-cased
# text equals one of these entries is a heading, not a vendor name.
_SKIP_HEADERS = {
    "bill",
    "bill of supply",
    "cash receipt",
    "duplicate",
    "gst invoice",
    "invoice",
    "original",
    "receipt",
    "restaurant",
    "restaurant bill",
    "retail invoice",
    "sale receipt",
    "tax invoice",
}

# Date patterns, listed in the order they are tried:
#   [0] DD/MM/YYYY or DD-MM-YYYY   (numeric)
#   [1] DD Mon YYYY                (e.g. 22 Feb 2024, 22 February 2024)
#   [2] Mon DD, YYYY               (e.g. Feb 22, 2024)
#   [3] DD-Mon-YYYY / DD/Mon/YYYY  (e.g. 22-Feb-2024)
#   [4] YYYY-MM-DD or YYYY/MM/DD   (ISO style)
# Every pattern exposes the matched date text as capture group 1, so callers
# can read group(1) uniformly. The ISO pattern is appended last so that the
# priority of the earlier patterns (and index 0) is unchanged.
_DATE_PATTERNS = [
    re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"),
    re.compile(
        r"\b(\d{1,2}\s+"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
    re.compile(
        r"\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"\s+\d{1,2},?\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
    # DD-Mon-YYYY e.g. 22-Feb-2024
    re.compile(
        r"\b(\d{1,2}[-/]"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"[-/]\d{2,4})\b",
        re.IGNORECASE,
    ),
    # YYYY-MM-DD e.g. 2024-02-22 (four-digit year first)
    re.compile(r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b"),
]

# Invoice / bill number pattern (case-insensitive).
# Matches a label -- "invoice no", "invoice #", "invoice number", "invoice num",
# "inv" (optionally followed by "no" or "#"), "bill no" or "bill #" -- then an
# optional ':' or '-' separator, and captures the reference that follows as
# group 1: one letter/digit plus 2 to 30 further letters, digits, '-' or '/'.
# NOTE(review): the bare `inv\.?` alternative has no trailing word boundary, so
# it also matches the first three letters of longer words such as "invoice",
# capturing the rest of the word ("oice") as group 1. Confirm that is intended
# before relying on group 1 for extraction.
_INVOICE_NO_PATTERN = re.compile(
    r"\b(?:invoice\s*(?:no\.?|#|number|num\.?)|inv\.?\s*(?:no\.?|#)?|bill\s*(?:no\.?|#))"
    r"\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})",
    re.IGNORECASE,
)

# Amount pattern: optional currency marker (₹, Rs, Rs., $) followed by a number,
# which is captured as group 1.
# The first alternative is the comma-grouped form and requires at least one
# ",ddd" group. Without that requirement it would match the first three digits
# of any longer plain number (e.g. "125" out of "1250.00") and split the amount
# into two tokens. Plain numbers are handled by the second alternative.
_AMOUNT_PATTERN = re.compile(
    r"(?:₹|Rs\.?|\$)?\s*(\d{1,3}(?:,\d{3})+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)"
)

# Keyword matchers for each amount field (case-insensitive)
# Highly flexible to handle dots, RM, and multi-line gaps
#
# Total: a total-style label, then any mix of whitespace, '.', ':', '(', ')'
# and currency markers (RM, Rs, ₹, $), then an amount with exactly two
# decimals, captured as group 1.
# The leading \b stops the bare "total" keyword from matching inside
# "Subtotal", and the two look-behinds reject "Sub Total" / "Sub-Total", so a
# subtotal line that appears before the real total is no longer reported as
# the total.
_TOTAL_KEYWORDS = re.compile(
    r"\b(?<!sub\s)(?<!sub-)"
    r"(?:round\s*d\s*total|grand\s*total|total\s*payable|total\s*due|total\s*amount|net\s*amount|total|payable)\b"
    r"(?:[\s.:()]|RM|Rs|₹|\$)*?"  # Handle : (RM) Rs. ₹ $ .... etc
    r"([\d,]+\.\d{2})\b",
    re.IGNORECASE | re.DOTALL,
)
# Subtotal: label ("subtotal", "sub total", "sub-total", "net amount",
# "amount before tax"), optional ':' or '-' separator, optional currency marker
# (₹, Rs, Rs., RM, $), then the amount captured as group 1.
_SUBTOTAL_KEYWORDS = re.compile(
    r"\b(?:sub[\s\-]*total|net\s*amount|amount\s*before\s*tax)\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|RM|\$)?\s*([\d,]+(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
# Tax: label (gst / cgst / sgst / igst / vat / tax / service tax), an optional
# rate such as "18%", "(9%)" or "2.5%", optional ':' or '-' separator, optional
# currency marker, then the tax amount captured as group 1.
#  * The look-ahead after the rate group stops it from consuming only the
#    leading digits of the amount ("GST 45.00" must yield 45.00, not 5.00).
#  * The look-ahead after the amount rejects a number that is itself followed
#    by '%', so a bare rate ("GST 18%") is not reported as the tax amount.
_GST_KEYWORDS = re.compile(
    r"\b(?:gst|cgst|sgst|igst|vat|tax|service\s*tax)\s*"
    r"(?:\(?\d+(?:\.\d+)?%?\)?(?![\d.,]))?\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|RM|\$)?\s*([\d,]+(?:\.\d{1,2})?)(?![\d,]*\s*%)",
    re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------

def _parse_amount(raw: str) -> float | None:
    """
    Parse a raw amount string (possibly with commas/currency symbols) to float.

    Args:
        raw: A string like '1,250.00', '1250', '₹ 1,250'.

    Returns:
        Float value, or None if parsing fails.
    """
    if raw is None:
        return None
    cleaned = raw.replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        return None


# ---------------------------------------------------------------------------
# Field extractors
# ---------------------------------------------------------------------------

def extract_vendor(text: str) -> str | None:
    """
    Extract the vendor/company name from raw OCR text.

    Strategy: return the first line that survives all of these filters:

    - at least 3 characters long and containing at least one letter (drops
      blank lines, pure numbers and decoration such as '*****' or '=====');
    - not a known generic header from _SKIP_HEADERS, compared without
      surrounding punctuation so 'TAX INVOICE:' is still recognised;
    - not matched by any pattern in _DATE_PATTERNS;
    - not matched by _INVOICE_NO_PATTERN.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Vendor name string (whitespace-stripped line), or None if no line
        qualifies or text is empty.
    """
    if not text:
        return None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Too short to be a name; this also covers blank lines.
        if len(line) < 3:
            continue
        # A vendor name needs at least one letter. This skips numeric lines
        # as well as separator/decoration lines.
        if not any(ch.isalpha() for ch in line):
            continue
        # Skip known generic headers, ignoring punctuation around them.
        if line.lower().strip(" \t:.-*#=_") in _SKIP_HEADERS:
            continue
        # Skip lines that look like dates (any supported format).
        if any(pattern.search(line) for pattern in _DATE_PATTERNS):
            continue
        # Skip lines that look like invoice numbers.
        if _INVOICE_NO_PATTERN.search(line):
            continue
        return line

    return None


def extract_date(text: str) -> str | None:
    """
    Locate the invoice date in raw OCR text.

    The patterns in _DATE_PATTERNS are tried one after another against the
    whole text. The first pattern that matches anywhere wins, so pattern
    order takes priority over position in the text. The date is returned
    exactly as it appears; no normalisation is performed.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        The matched date text, or None when text is empty or no pattern
        matches.
    """
    if not text:
        return None

    # Lazy generator: later patterns are only searched if earlier ones miss.
    hits = (rx.search(text) for rx in _DATE_PATTERNS)
    found = next((hit for hit in hits if hit), None)
    if found is None:
        return None

    # Use capture group 1 when the pattern defines one, else the whole match.
    if found.lastindex:
        return found.group(1)
    return found.group(0)


def extract_invoice_number(text: str) -> str | None:
    """
    Extract the invoice/bill reference number from raw OCR text.

    Matches common patterns: 'Invoice No.', 'INV#', 'Bill No:', etc.
    Avoids matching headers like 'TAX INVOICE' by checking line-by-line
    and ensuring the label is followed by a potential reference.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Invoice number string, or None if not found.
    """
    if not text:
        return None

    # Stricter pattern that avoids matching just 'INVOICE' followed by newline
    # Requires a label followed by at least 2 alphanumeric chars on the same line
    pattern = re.compile(
        r"\b(?:inv(?:oice)?|bill)\s*(?:no\.?|#|num(?:ber)?)?\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})",
        re.IGNORECASE
    )

    for line in text.splitlines():
        line = line.strip()
        # Skip generic headers entirely (failure mode fix)
        if line.lower() in _SKIP_HEADERS:
            continue
            
        match = pattern.search(line)
        if match:
            # Additional guard: don't return the match if it's just a known header substring
            val = match.group(1).strip()
            if val.lower() not in _SKIP_HEADERS:
                return val

    return None


def extract_amounts(text: str) -> dict[str, float | None]:
    """
    Extract subtotal, GST/tax, and total amounts from raw OCR text.

    Each field is located with its keyword pattern (_SUBTOTAL_KEYWORDS,
    _GST_KEYWORDS, _TOTAL_KEYWORDS). These are case-insensitive regexes whose
    first capture group holds the amount, which is converted with
    _parse_amount().

    Fallback for the total: when the keyword pattern finds nothing (or its
    amount cannot be parsed), every number matched by _AMOUNT_PATTERN is
    collected in text order and the largest of the last four is used. This
    targets receipts where labels and values are separated, since the total
    is usually among the final numbers on the bill.

    Args:
        text: Raw OCR output as a multi-line string. Empty or None input
            yields all-None results, consistent with the other extractors.

    Returns:
        Dict with keys: 'subtotal', 'gst', 'total'.
        Each value is a float or None if not found.
    """
    if not text:
        return {"subtotal": None, "gst": None, "total": None}

    def _first_amount(pattern) -> float | None:
        # Amount captured by the first match of `pattern`, or None.
        hit = pattern.search(text)
        return _parse_amount(hit.group(1)) if hit else None

    total = _first_amount(_TOTAL_KEYWORDS)

    # --- Failure-Mode Fix: Global Max Fallback ---
    # SROIE receipts often separate labels and totals.
    if total is None:
        parsed = (_parse_amount(tok) for tok in _AMOUNT_PATTERN.findall(text))
        values = [value for value in parsed if value is not None]
        if values:
            # Maximum of the last 4 amounts found (usually bottom of bill)
            total = max(values[-4:])

    return {
        "subtotal": _first_amount(_SUBTOTAL_KEYWORDS),
        "gst": _first_amount(_GST_KEYWORDS),
        "total": total,
    }


def parse_invoice(text: str) -> dict:
    """
    Master function: parse all fields from raw OCR text.

    Calls each extractor and assembles a single dict. Any field that cannot
    be extracted is set to None — the UI renders None fields as empty inputs,
    prompting the user to fill them manually (human-in-the-loop design).

    Args:
        text: Raw OCR output as a multi-line string (from ocr.extract_text).
            None is treated as an empty string, so a missing OCR result
            produces an all-None record instead of raising.

    Returns:
        Dict with keys: vendor, date, invoice_number, subtotal, gst, total,
        raw_text. All values are str | float | None except raw_text (always str).
    """
    # Normalise a missing OCR result up front. This keeps the documented
    # "raw_text is always str" guarantee and lets every extractor see a string.
    text = text or ""

    amounts = extract_amounts(text)
    return {
        "vendor": extract_vendor(text),
        "date": extract_date(text),
        "invoice_number": extract_invoice_number(text),
        "subtotal": amounts["subtotal"],
        "gst": amounts["gst"],
        "total": amounts["total"],
        "raw_text": text,
    }