"""
extractor.py — Regex-based field parser for the Bill/Invoice Scanner.
Responsibilities:
- extract_vendor(): find the company/vendor name from raw OCR text
- extract_date(): find the invoice date in multiple date formats
- extract_invoice_number(): find the invoice/bill reference number
- extract_amounts(): find subtotal, GST/tax, and total amounts
- parse_invoice(): master function — calls all above, returns single dict
All functions accept a raw text string and return a value or None.
No imports from other project modules — this module is self-contained.
"""
from __future__ import annotations
import re
# ---------------------------------------------------------------------------
# Compiled regex patterns (compile once at module load for performance)
# ---------------------------------------------------------------------------
# Generic document titles, stored lowercase. The extractors compare a whole
# lowercased line against this set so that a title such as "TAX INVOICE" is
# never reported as a vendor name or an invoice reference.
_SKIP_HEADERS = {
    # invoice-style titles
    "invoice",
    "tax invoice",
    "gst invoice",
    "retail invoice",
    # bill-style titles
    "bill",
    "bill of supply",
    "restaurant bill",
    "restaurant",
    # receipt-style titles
    "receipt",
    "cash receipt",
    "sale receipt",
    # copy markers
    "original",
    "duplicate",
}
# Date patterns, listed in the order extract_date() tries them:
#   [0] DD/MM/YYYY or DD-MM-YYYY  (extract_vendor indexes [0] directly, so the
#       numeric pattern must stay first)
#   [1] DD Mon YYYY               e.g. 5 March 2024
#   [2] Mon DD, YYYY              e.g. Feb 22, 2024
#   [3] DD-Mon-YYYY / DD/Mon/YYYY e.g. 22-Feb-2024
# Every pattern exposes the complete date as group(1).
# Month abbreviation followed by any remaining letters ("Feb", "February").
_MONTH_NAME = r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
_DATE_PATTERNS = [
    re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"),
    re.compile(
        r"\b(\d{1,2}\s+" + _MONTH_NAME + r"\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
    # This pattern previously had no capture group, unlike its siblings.
    re.compile(
        r"\b(" + _MONTH_NAME + r"\s+\d{1,2},?\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
    re.compile(
        r"\b(\d{1,2}[-/]" + _MONTH_NAME + r"[-/]\d{2,4})\b",
        re.IGNORECASE,
    ),
]
# Invoice / bill number pattern: a label ("Invoice No.", "Inv#", "Bill No", ...)
# followed by an optional ':' or '-' and the reference, exposed as group(1).
#
# Two guards keep ordinary words from being read as "label + reference":
#   * (?![a-z]) - the label must not run straight into more letters, so
#     "Invento" or "Invoice Notes" cannot match as "inv" + "ento" / "no" + "tes".
#   * (?=[-A-Z0-9/]*\d) - the reference must contain at least one digit, so
#     "Invoice Date" is not read as reference "Date".
# The qualifier after "invoice" is optional so "Invoice 12345" still matches.
_INVOICE_NO_PATTERN = re.compile(
    r"\b(?:invoice\s*(?:no\.?|#|number|num\.?)?|inv\.?\s*(?:no\.?|#)?|bill\s*(?:no\.?|#))"
    r"(?![a-z])\s*[:\-]?\s*(?=[-A-Z0-9/]*\d)([A-Z0-9][-A-Z0-9/]{2,30})",
    re.IGNORECASE,
)
# Amount pattern: an optional currency marker (₹, Rs, Rs., $) followed by a
# number, exposed as group(1).
# The comma-grouped alternative requires at least one ",ddd" group. With zero
# groups allowed it matched the first three digits of an ungrouped number, so
# "1250.00" was split into "125" and "0.00". Ungrouped numbers now fall through
# to the plain \d+ alternative and are captured whole.
_AMOUNT_PATTERN = re.compile(
    r"(?:₹|Rs\.?|\$)?\s*(\d{1,3}(?:,\d{3})+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)"
)
# Keyword matchers for each amount field (case-insensitive).
#
# Total: a total-style keyword, then any run of whitespace (including line
# breaks), dots, colons, parentheses and the letters R/M (for "(RM)"), then an
# amount with exactly two decimals, exposed as group(1).
# The leading \b and (?<!sub[\s\-]) stop the bare "total" keyword from matching
# inside "Subtotal", "Sub Total" or "Sub-Total"; without them the subtotal line
# (which normally comes first) was reported as the total.
# re.DOTALL was dropped: the pattern contains no '.' wildcard, so it did nothing.
_TOTAL_KEYWORDS = re.compile(
    r"\b(?<!sub[\s\-])"
    r"(?:round\s*d\s*total|grand\s*total|total\s*payable|total\s*due|total\s*amount|net\s*amount|total|payable)\b"
    r"[\s.:()RM]*?"  # separators such as ": ", " (RM) ", "....", newlines
    r"([\d,]+\.\d{2})\b",
    re.IGNORECASE,
)
# Subtotal: a subtotal-style keyword, an optional ':' or '-', an optional
# currency marker, then the amount as group(1).
# "sub[\s\-]*total" covers "Subtotal", "Sub Total" and "Sub-Total" in one
# alternative. The amount must start with a digit so a stray comma after the
# label is not captured as the value.
_SUBTOTAL_KEYWORDS = re.compile(
    r"\b(?:sub[\s\-]*total|net\s*amount|amount\s*before\s*tax)\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|\$)?\s*(\d[\d,]*(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
# GST / tax: a tax keyword, an optional rate, an optional ':' or '-', an
# optional currency marker, then the amount as group(1).
#
# The rate is only recognised when parenthesised ("(9)", "(9%)") or followed by
# '%' ("18%"). When any bare digit run counted as a rate, the regex engine
# consumed the leading digits of the amount itself, so "GST 180.00" was captured
# as "0.00". The trailing (?![\d.]*\s*%) rejects a number that is itself a
# percentage, so "VAT 18%" with no amount is not reported as an amount of 18.
_GST_KEYWORDS = re.compile(
    r"\b(?:gst|cgst|sgst|igst|vat|tax|service\s*tax)\s*"
    r"(?:\(\s*\d+(?:\.\d+)?\s*%?\s*\)|\d+(?:\.\d+)?\s*%)?\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|\$)?\s*(\d[\d,]*(?:\.\d{1,2})?)(?![\d.]*\s*%)",
    re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _parse_amount(raw: str | None) -> float | None:
    """
    Parse a raw amount string (possibly with commas/currency symbols) to float.

    A leading currency marker (₹, Rs, Rs., $) and thousands commas are removed
    before conversion. The marker used to be left in place, so an input such as
    '₹ 1,250' returned None even though it is listed as supported below.

    Args:
        raw: A string like '1,250.00', '1250', '₹ 1,250', or None.

    Returns:
        Float value, or None if raw is None or cannot be parsed as a number.
    """
    if raw is None:
        return None
    # Strip one leading currency marker together with surrounding whitespace.
    without_symbol = re.sub(r"^\s*(?:₹|Rs\.?|\$)\s*", "", raw, flags=re.IGNORECASE)
    cleaned = without_symbol.replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        return None
# ---------------------------------------------------------------------------
# Field extractors
# ---------------------------------------------------------------------------
def extract_vendor(text: str) -> str | None:
    """
    Extract the vendor/company name from raw OCR text.

    Strategy: return the first line that survives all of these filters:
      - blank after stripping, or shorter than 3 characters
      - made up only of digits, whitespace and - / . , characters
      - equal (case-insensitively) to an entry in _SKIP_HEADERS
      - contains a date in any of the _DATE_PATTERNS formats
      - contains an invoice/bill number label matched by _INVOICE_NO_PATTERN

    Every date format is checked, not only the numeric one, so a line such as
    '22 Feb 2024' is not returned as the vendor.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        The stripped vendor line, or None if text is empty or no line qualifies.
    """
    if not text:
        return None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Blank and very short lines carry no usable name.
        if len(line) < 3:
            continue
        # Purely numeric/punctuation lines (phone numbers, amounts, ...).
        if re.fullmatch(r"[\d\s\-/.,]+", line):
            continue
        # Generic document titles such as 'TAX INVOICE'.
        if line.lower() in _SKIP_HEADERS:
            continue
        # Lines carrying a date in any supported format.
        if any(pattern.search(line) for pattern in _DATE_PATTERNS):
            continue
        # Lines carrying an invoice/bill number label.
        if _INVOICE_NO_PATTERN.search(line):
            continue
        return line

    return None
def extract_date(text: str) -> str | None:
    """
    Extract the invoice date from raw OCR text.

    Each pattern in _DATE_PATTERNS is searched against the whole text in list
    order, and the first pattern that matches anywhere supplies the result.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        The date exactly as written in the text, or None when the text is
        empty or no pattern matches.
    """
    if not text:
        return None

    # Lazily search pattern by pattern; stop at the first successful search.
    searches = (candidate.search(text) for candidate in _DATE_PATTERNS)
    hit = next((found for found in searches if found), None)
    if hit is None:
        return None

    # Use the capture group when the pattern defines one, else the whole match.
    if hit.lastindex:
        return hit.group(1)
    return hit.group(0)
# Label + reference pattern used by extract_invoice_number(), compiled once at
# import time instead of on every call.
#   * (?![a-z]) - the label must not run straight into more letters, so
#     "Billing" or "Invoice Notes" cannot match as "bill" + "ing" / "no" + "tes".
#   * (?=[-A-Z0-9/]*\d) - the reference must contain at least one digit, so
#     "Invoice Date: ..." is not returned as reference "Date".
# The reference itself is 3 to 31 characters drawn from letters, digits, '-', '/'.
_INVOICE_REF_PATTERN = re.compile(
    r"\b(?:inv(?:oice)?|bill)\s*(?:no\.?|#|num(?:ber)?)?(?![a-z])"
    r"\s*[:\-]?\s*(?=[-A-Z0-9/]*\d)([A-Z0-9][-A-Z0-9/]{2,30})",
    re.IGNORECASE,
)


def extract_invoice_number(text: str) -> str | None:
    """
    Extract the invoice/bill reference number from raw OCR text.

    Works line by line. A line that is exactly a generic header from
    _SKIP_HEADERS (e.g. 'TAX INVOICE') is ignored. Otherwise the line must
    contain an 'inv', 'invoice' or 'bill' label, optionally followed by
    'no', '#', 'num' or 'number' and a ':' or '-', and then a reference of at
    least 3 characters containing at least one digit, on the same line.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        The reference from the first matching line, or None if not found.
    """
    if not text:
        return None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Generic document titles never carry a reference.
        if line.lower() in _SKIP_HEADERS:
            continue
        match = _INVOICE_REF_PATTERN.search(line)
        if match is None:
            continue
        reference = match.group(1).strip()
        # Defensive: never report a generic header word as the reference.
        if reference.lower() not in _SKIP_HEADERS:
            return reference

    return None
def extract_amounts(text: str) -> dict[str, float | None]:
    """
    Extract subtotal, GST/tax, and total amounts from raw OCR text.

    Each field is located with its case-insensitive keyword regex
    (_SUBTOTAL_KEYWORDS, _GST_KEYWORDS, _TOTAL_KEYWORDS), and the captured
    number is converted with _parse_amount.

    Fallback for the total: when no total keyword matches (or its value cannot
    be parsed), every number matched by _AMOUNT_PATTERN is collected in
    document order and the largest of the last four is used, on the assumption
    that the total sits near the bottom of the bill.

    Empty or None text returns all-None, matching the other extractors,
    instead of raising TypeError inside the regex search.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Dict with keys: 'subtotal', 'gst', 'total'.
        Each value is a float or None if not found.
    """
    if not text:
        return {"subtotal": None, "gst": None, "total": None}

    total_match = _TOTAL_KEYWORDS.search(text)
    subtotal_match = _SUBTOTAL_KEYWORDS.search(text)
    gst_match = _GST_KEYWORDS.search(text)

    total = _parse_amount(total_match.group(1)) if total_match else None
    subtotal = _parse_amount(subtotal_match.group(1)) if subtotal_match else None
    gst = _parse_amount(gst_match.group(1)) if gst_match else None

    if total is None:
        # Receipts often put the label and the figure on separate lines, which
        # defeats the keyword regex; fall back to the biggest trailing number.
        candidates = [
            value
            for value in map(_parse_amount, _AMOUNT_PATTERN.findall(text))
            if value is not None
        ]
        if candidates:
            total = max(candidates[-4:])

    return {"subtotal": subtotal, "gst": gst, "total": total}
def parse_invoice(text: str) -> dict:
    """
    Master function: parse all fields from raw OCR text.

    Calls each extractor and assembles a single dict. Any field that cannot
    be extracted is set to None — the UI renders None fields as empty inputs,
    prompting the user to fill them manually (human-in-the-loop design).

    A None or empty input is normalised to "" before extraction, so the call
    does not raise and 'raw_text' is always a str.

    Args:
        text: Raw OCR output as a multi-line string (from ocr.extract_text).

    Returns:
        Dict with keys: vendor, date, invoice_number, subtotal, gst, total,
        raw_text. All values are str | float | None except raw_text (always str).
    """
    # Normalise a missing OCR result so every extractor receives a string.
    text = text or ""
    amounts = extract_amounts(text)
    return {
        "vendor": extract_vendor(text),
        "date": extract_date(text),
        "invoice_number": extract_invoice_number(text),
        "subtotal": amounts["subtotal"],
        "gst": amounts["gst"],
        "total": amounts["total"],
        "raw_text": text,
    }
|