# Bill-Invoice-Scanner-Pro / extractor.py
# DIVYANSHI SINGH
# Root project layout configured for deployment
# b0bec61
"""
extractor.py — Regex-based field parser for the Bill/Invoice Scanner.
Responsibilities:
- extract_vendor(): find the company/vendor name from raw OCR text
- extract_date(): find the invoice date in multiple date formats
- extract_invoice_number(): find the invoice/bill reference number
- extract_amounts(): find subtotal, GST/tax, and total amounts
- parse_invoice(): master function — calls all above, returns single dict
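All functions accept a raw text string. The single-field extractors return a
value or None; extract_amounts() and parse_invoice() return dicts whose
missing fields are None.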
No imports from other project modules — this module is self-contained.
"""
from __future__ import annotations
import re
# ---------------------------------------------------------------------------
# Compiled regex patterns (compile once at module load for performance)
# ---------------------------------------------------------------------------
# Generic document-title lines that must never be taken for the vendor name.
# Entries are lowercase; callers compare a lowercased, stripped line against
# this set by exact match.
_SKIP_HEADERS = {
    # invoice-style titles
    "invoice",
    "tax invoice",
    "gst invoice",
    "retail invoice",
    # bill / receipt titles
    "bill",
    "bill of supply",
    "restaurant bill",
    "receipt",
    "cash receipt",
    "sale receipt",
    # copy markers and other generic words
    "original",
    "duplicate",
    "restaurant",
}
# Date formats recognised, listed in the order the patterns appear:
#   1. numeric        DD/MM/YYYY or DD-MM-YYYY
#   2. day first      DD Mon YYYY
#   3. month first    Mon DD, YYYY   (this pattern has no capture group)
#   4. separated      DD-Mon-YYYY    e.g. 22-Feb-2024
# Month names match on their three-letter prefix, so "Feb" and "February"
# are both accepted.
_MONTH_NAME = r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
_DATE_PATTERNS = [
    re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"),
    re.compile(r"\b(\d{1,2}\s+" + _MONTH_NAME + r"\s+\d{2,4})\b", re.IGNORECASE),
    re.compile(r"\b" + _MONTH_NAME + r"\s+\d{1,2},?\s+\d{2,4}\b", re.IGNORECASE),
    re.compile(r"\b(\d{1,2}[-/]" + _MONTH_NAME + r"[-/]\d{2,4})\b", re.IGNORECASE),
]
# Invoice / bill number matcher: one of the labels below, an optional ':' or
# '-' separator, then a 3-31 character reference captured as group 1.
# NOTE(review): because of re.IGNORECASE the reference class also accepts
# lowercase letters, and the "inv" label needs no "no."/"#" suffix, so any
# word starting with "inv" plus three more letters (e.g. "Investments")
# matches as well.
_INVOICE_LABELS = (
    r"invoice\s*(?:no\.?|#|number|num\.?)",  # Invoice No. / Invoice # / Invoice Number
    r"inv\.?\s*(?:no\.?|#)?",                # INV / Inv. No / INV#
    r"bill\s*(?:no\.?|#)",                   # Bill No. / Bill #
)
_INVOICE_NO_PATTERN = re.compile(
    r"\b(?:" + "|".join(_INVOICE_LABELS) + r")"
    r"\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})",
    flags=re.IGNORECASE,
)
# Amount pattern: optional currency prefix (₹, Rs, Rs., $) followed by either
# a comma-grouped number ("1,250.00") or a plain run of digits ("1250.00"),
# each with up to two decimals. Group 1 is the number without the prefix.
# The comma-grouped alternative requires at least one ",ddd" group: if zero
# groups were allowed it would win the alternation after only three digits
# and cut "1250.00" into "125" and "0.00".
_AMOUNT_PATTERN = re.compile(
    r"(?:₹|Rs\.?|\$)?\s*"
    r"(\d{1,3}(?:,\d{3})+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)"
)
# Keyword matchers for each amount field (case-insensitive)
# Total: a total-style label, then any run of whitespace (including newlines),
# dots, colons, parentheses and currency markers (RM, Rs, ₹, $), then an
# amount with exactly two decimals captured as group 1.
# The two lookbehinds keep "Subtotal" / "Sub Total" / "Sub-Total" from being
# read as the total; without them the first subtotal line would win.
_TOTAL_KEYWORDS = re.compile(
    r"(?<!sub)(?<!sub[\s.\-])"
    r"(?:round\s*(?:ed|d)?\s*total|grand\s*total|total\s*payable|total\s*due"
    r"|total\s*amount|net\s*amount|total|payable)\b"
    r"[\s.:()RMs₹$]*?"  # separators and currency markers: ": (RM) ....", "Rs.", "₹"
    r"([\d,]+\.\d{2})\b",
    re.IGNORECASE,
)
# Subtotal: label, optional ':' or '-', optional currency prefix, then the
# amount (digits and commas, up to two decimals) captured as group 1.
_SUBTOTAL_KEYWORDS = re.compile(
    r"\b(?:subtotal|sub\s*total|net\s*amount|amount\s*before\s*tax)"  # label
    r"\s*[:\-]?\s*"                                                   # separator
    r"(?:₹|Rs\.?|\$)?\s*"                                             # currency
    r"([\d,]+(?:\.\d{1,2})?)",                                        # amount
    flags=re.IGNORECASE,
)
# GST / tax: label, optional rate, optional ':' or '-', optional currency
# prefix, then the amount captured as group 1.
#   * "tax" directly after "before " is ignored, since "amount before tax"
#     labels a subtotal line.
#   * The rate is either an explicit percentage ("18%", "(18%)", "@ 9%",
#     "2.5%") or a bare integer that is NOT followed by more digits, '.' or
#     ','. That guard stops the rate from swallowing the leading digits of
#     the amount ("GST 180.00" used to yield "0.00").
#   * The trailing lookahead rejects a number that is cut short or is itself
#     a percentage, so "GST 18%" with no amount does not report 18.
_GST_KEYWORDS = re.compile(
    r"\b(?<!before\s)(?:gst|cgst|sgst|igst|vat|tax|service\s*tax)\s*"
    r"(?:@\s*)?"
    r"(?:\(?\d+(?:\.\d+)?\s*%\)?|\(?\d+\)?(?![\d.,]))?"
    r"\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|\$)?\s*([\d,]+(?:\.\d{1,2})?)(?!\d|\s*%)",
    re.IGNORECASE,
)
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _parse_amount(raw: str) -> float | None:
    """
    Parse a raw amount string (possibly with commas/currency symbols) to float.

    A leading currency marker (₹, Rs, Rs., $ — case-insensitive) and all
    thousands-separator commas are removed before conversion.

    Args:
        raw: A string like '1,250.00', '1250', '₹ 1,250'. None is accepted.

    Returns:
        Float value, or None if raw is None or cannot be parsed as a number.
    """
    if raw is None:
        return None
    # Strip only a *leading* currency marker so stray letters elsewhere in the
    # string still make the conversion fail instead of being silently dropped.
    without_currency = re.sub(
        r"^\s*(?:₹|Rs\.?|\$)\s*", "", raw, flags=re.IGNORECASE
    )
    cleaned = without_currency.replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        return None
# ---------------------------------------------------------------------------
# Field extractors
# ---------------------------------------------------------------------------
def extract_vendor(text: str) -> str | None:
    """
    Extract the vendor/company name from raw OCR text.

    Strategy: return the first line that survives all of these filters:
      * at least 3 characters after stripping (blank lines drop out here),
      * not a known generic header such as 'TAX INVOICE',
      * not made up solely of digits, whitespace and '-', '/', '.', ','
        characters,
      * does not contain a date in any of the supported formats,
      * does not contain an invoice/bill number label.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Vendor name string (stripped), or None if no line qualifies or the
        text is empty.
    """
    if not text:
        return None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if len(line) < 3:
            continue
        if line.lower() in _SKIP_HEADERS:
            continue
        if re.fullmatch(r"[\d\s\-/.,]+", line):
            continue
        # Check every date format, not only the numeric one, so a line such
        # as "22 Feb 2024" is never returned as the vendor.
        if any(pattern.search(line) for pattern in _DATE_PATTERNS):
            continue
        if _INVOICE_NO_PATTERN.search(line):
            continue
        return line
    return None
def extract_date(text: str) -> str | None:
    """
    Find the invoice date in raw OCR text.

    The date patterns are tried one after another against the whole text, in
    list order (numeric DD/MM/YYYY first, then the written-month variants).
    The first pattern that matches anywhere decides the result.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        The date exactly as it appears in the text, or None when the text is
        empty or no pattern matches.
    """
    if not text:
        return None
    # Lazy generator: later patterns are only searched if earlier ones miss.
    hits = (regex.search(text) for regex in _DATE_PATTERNS)
    found = next((hit for hit in hits if hit), None)
    if found is None:
        return None
    # Patterns without a capture group report the whole match instead.
    if found.lastindex:
        return found.group(1)
    return found.group(0)
# Label ('Invoice', 'Inv', 'Bill', optionally followed by 'No.', '#', 'Num',
# 'Number'), an optional ':' or '-', then the reference as group 1.
# The lookahead demands at least one digit in the reference. The pattern is
# case-insensitive, so without it the reference class would accept ordinary
# words and "Invoice Date" would yield "Date", "Billing" would yield "ing".
_INVOICE_REF_PATTERN = re.compile(
    r"\b(?:inv(?:oice)?|bill)\s*(?:no\.?|#|num(?:ber)?)?\s*[:\-]?\s*"
    r"(?=[-A-Z0-9/]*\d)([A-Z0-9][-A-Z0-9/]{2,30})",
    re.IGNORECASE,
)


def extract_invoice_number(text: str) -> str | None:
    """
    Extract the invoice/bill reference number from raw OCR text.

    Matches common patterns: 'Invoice No.', 'INV#', 'Bill No:', etc.
    The text is scanned line by line, so the label and the reference must be
    on the same line. A reference is 3-31 characters of letters, digits,
    '-' and '/', and must contain at least one digit. Header lines such as
    'TAX INVOICE' and label-only lines such as 'Invoice Date' or
    'Invoice Number' therefore never produce a result.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Invoice number string from the first matching line, or None if not
        found or the text is empty.
    """
    if not text:
        return None
    for line in text.splitlines():
        match = _INVOICE_REF_PATTERN.search(line)
        if match:
            return match.group(1)
    return None
def extract_amounts(text: str) -> dict[str, float | None]:
    """
    Extract subtotal, GST/tax, and total amounts from raw OCR text.

    Each field is taken from the first match of its keyword pattern. If no
    total keyword matches, a fallback is used: every number in the text is
    collected and the largest of the last four becomes the total, since
    receipts often print the label and the total on separate lines and the
    total is usually near the bottom.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Dict with keys: 'subtotal', 'gst', 'total'.
        Each value is a float or None if not found. Empty or None text
        yields None for all three.
    """
    # Same empty-input contract as the other extractors: no text, no values.
    if not text:
        return {"subtotal": None, "gst": None, "total": None}

    def _first_amount(pattern: re.Pattern) -> float | None:
        """Return the parsed group-1 amount of the first match, or None."""
        match = pattern.search(text)
        return _parse_amount(match.group(1)) if match else None

    total = _first_amount(_TOTAL_KEYWORDS)
    if total is None:
        # Fallback: largest of the last four parseable numbers in the text.
        candidates = [
            value
            for value in map(_parse_amount, _AMOUNT_PATTERN.findall(text))
            if value is not None
        ]
        if candidates:
            total = max(candidates[-4:])

    return {
        "subtotal": _first_amount(_SUBTOTAL_KEYWORDS),
        "gst": _first_amount(_GST_KEYWORDS),
        "total": total,
    }
def parse_invoice(text: str) -> dict:
    """
    Master function: parse all fields from raw OCR text.

    Calls each extractor and assembles a single dict. Any field that cannot
    be extracted is set to None — the UI renders None fields as empty inputs,
    prompting the user to fill them manually (human-in-the-loop design).

    Args:
        text: Raw OCR output as a multi-line string (from ocr.extract_text).
            None is treated as empty text.

    Returns:
        Dict with keys: vendor, date, invoice_number, subtotal, gst, total,
        raw_text. All values are str | float | None except raw_text (always str).
    """
    # Normalise a missing OCR result to "" so every extractor receives a
    # string and raw_text keeps its "always str" guarantee.
    text = text or ""
    amounts = extract_amounts(text)
    return {
        "vendor": extract_vendor(text),
        "date": extract_date(text),
        "invoice_number": extract_invoice_number(text),
        "subtotal": amounts["subtotal"],
        "gst": amounts["gst"],
        "total": amounts["total"],
        "raw_text": text,
    }