Spaces:

Divya499
/

Bill-Invoice-Scanner-Pro

Sleeping

File size: 9,334 Bytes

b0bec61

"""
extractor.py — Regex-based field parser for the Bill/Invoice Scanner.

Responsibilities:
- extract_vendor(): find the company/vendor name from raw OCR text
- extract_date(): find the invoice date in multiple date formats
- extract_invoice_number(): find the invoice/bill reference number
- extract_amounts(): find subtotal, GST/tax, and total amounts
- parse_invoice(): master function — calls all above, returns single dict

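All functions accept a raw text string. The single-field extractors return a
value or None; extract_amounts() and parse_invoice() return dicts whose
missing fields are None.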
No imports from other project modules — this module is self-contained.
"""

from __future__ import annotations
import re


# ---------------------------------------------------------------------------
# Compiled regex patterns (compile once at module load for performance)
# ---------------------------------------------------------------------------

# Known header strings to skip when detecting vendor name.
# Entries are lowercase because callers compare them with lowercased lines.
_SKIP_HEADERS = {
    "tax invoice", "invoice", "bill", "receipt", "gst invoice",
    "retail invoice", "cash receipt", "sale receipt", "original",
    "duplicate", "restaurant bill", "restaurant", "bill of supply",
}

# Date patterns: DD/MM/YYYY · DD-MM-YYYY · DD Mon YYYY · Mon DD YYYY ·
# DD-Mon-YYYY · YYYY-MM-DD.
# Order is significant: extract_date() returns the hit of the first pattern in
# this list that matches anywhere in the text, so append new patterns at the
# end to keep existing priorities and indices stable.
# Every pattern exposes the date text as group(1).
_DATE_PATTERNS = [
    re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"),
    re.compile(
        r"\b(\d{1,2}\s+"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
    re.compile(
        r"\b((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"\s+\d{1,2},?\s+\d{2,4})\b",
        re.IGNORECASE,
    ),
    # DD-Mon-YYYY e.g. 22-Feb-2024
    re.compile(
        r"\b(\d{1,2}[-/]"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
        r"[-/]\d{2,4})\b",
        re.IGNORECASE,
    ),
    # ISO style YYYY-MM-DD e.g. 2024-02-22 (kept last: lowest priority)
    re.compile(r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b"),
]

# Invoice / bill number patterns
# NOTE: the bare `inv` branch has no trailing word boundary, so it also matches
# the start of longer words (e.g. "invoice" -> captures "oice").
_INVOICE_NO_PATTERN = re.compile(
    r"\b(?:invoice\s*(?:no\.?|#|number|num\.?)|inv\.?\s*(?:no\.?|#)?|bill\s*(?:no\.?|#))"
    r"\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})",
    re.IGNORECASE,
)

# Amount pattern: handles ₹ Rs. $ and comma-thousands.
# Also matches bare integers, so callers must filter the results themselves.
_AMOUNT_PATTERN = re.compile(
    r"(?:₹|Rs\.?|\$)?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)"
)

# Keyword matchers for each amount field (case-insensitive)
# Highly flexible to handle dots, currency markers, and multi-line gaps
_TOTAL_KEYWORDS = re.compile(
    # The keyword must start a word and must not be the tail of
    # "subtotal" / "sub total" / "sub-total", otherwise the subtotal line
    # (which normally comes first) would be reported as the total.
    r"(?<![a-z])(?<!sub\s)(?<!sub-)"
    r"(?:round\s*d\s*total|grand\s*total|total\s*payable|total\s*due|total\s*amount|net\s*amount|total|payable)\b"
    # Gap between label and value: whitespace (incl. newlines), dot leaders,
    # ':' '-' '(' ')' and the currency markers RM, Rs, ₹ and $.
    r"[\s.:()RMs₹$\-]*?"
    r"([\d,]+\.\d{2})\b",
    re.IGNORECASE | re.DOTALL,
)
_SUBTOTAL_KEYWORDS = re.compile(
    r"\b(?:subtotal|sub\s*total|net\s*amount|amount\s*before\s*tax)\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|\$)?\s*([\d,]+(?:\.\d{1,2})?)",
    re.IGNORECASE,
)
_GST_KEYWORDS = re.compile(
    r"\b(?:gst|cgst|sgst|igst|vat|tax|service\s*tax)\s*"
    # Optional tax rate: "(18%)", "(18)", "18%", "@ 18%". A bare number is NOT
    # accepted as a rate, so the regex cannot split "180.00" into a rate of
    # "18" and an amount of "0.00".
    r"(?:@\s*)?(?:\(\s*\d+(?:\.\d+)?\s*%?\s*\)|\d+(?:\.\d+)?\s*%)?\s*[:\-]?\s*"
    r"(?:₹|Rs\.?|\$)?\s*([\d,]+(?:\.\d{1,2})?)"
    # Reject a number that is itself a percentage ("GST 18%" has no amount).
    r"(?![\d.]*\s*%)",
    re.IGNORECASE,
)


# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------

# Leading currency marker (₹, Rs, Rs., RM, $) with optional surrounding spaces.
_CURRENCY_PREFIX = re.compile(r"^\s*(?:₹|Rs\.?|RM|\$)\s*", re.IGNORECASE)


def _parse_amount(raw: str) -> float | None:
    """
    Parse a raw amount string (possibly with commas/currency symbols) to float.

    A single leading currency marker (₹, Rs., RM, $) is removed, thousands
    separators (commas) are dropped, and the remainder is handed to float().

    Args:
        raw: A string like '1,250.00', '1250', '₹ 1,250'.

    Returns:
        Float value, or None if raw is None or cannot be parsed.
    """
    if raw is None:
        return None
    # Strip the currency marker first; float() cannot parse "₹ 1250".
    without_currency = _CURRENCY_PREFIX.sub("", raw, count=1)
    cleaned = without_currency.replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        return None


# ---------------------------------------------------------------------------
# Field extractors
# ---------------------------------------------------------------------------

def extract_vendor(text: str) -> str | None:
    """
    Extract the vendor/company name from raw OCR text.

    Strategy: the first non-empty line that contains at least one letter, is
    not a known generic header (e.g., 'TAX INVOICE'), and does not look like
    a date or an invoice-number line is usually the vendor name.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Vendor name string (stripped of surrounding whitespace), or None if
        not identifiable.
    """
    if not text:
        return None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Too short to be a name (also drops blank lines).
        if len(line) < 3:
            continue
        # A name needs at least one letter; this rejects pure numbers,
        # amounts and separator rows such as "-----", "=====" or "*****".
        if not any(ch.isalpha() for ch in line):
            continue
        # Compare headers after collapsing runs of whitespace and trimming
        # decoration, so "TAX  INVOICE" and "-- Tax Invoice: --" are skipped
        # just like the exact "tax invoice".
        header_key = " ".join(line.lower().split()).strip(" .:;-*#=_|")
        if header_key in _SKIP_HEADERS:
            continue
        # Skip lines that look like dates (any supported format, not only
        # the numeric one) or invoice numbers.
        if any(pattern.search(line) for pattern in _DATE_PATTERNS):
            continue
        if _INVOICE_NO_PATTERN.search(line):
            continue
        return line

    return None


def extract_date(text: str) -> str | None:
    """
    Locate the invoice date inside raw OCR text.

    The patterns in _DATE_PATTERNS are consulted one after another, in list
    order; the first pattern that hits anywhere in the text decides the
    result. Pattern priority therefore wins over position in the text.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        The date exactly as written in the text, or None when the text is
        empty or no pattern matches.
    """
    if not text:
        return None

    # Lazily search with each pattern and keep the first successful hit.
    searches = (regex.search(text) for regex in _DATE_PATTERNS)
    hit = next((found for found in searches if found), None)
    if hit is None:
        return None

    # Patterns without a capture group expose the date as the whole match.
    if not hit.lastindex:
        return hit.group(0)
    return hit.group(1)


# Label ('Invoice No', 'INV#', 'Bill No:', ...) followed on the same line by a
# reference token of 3-31 characters. Compiled once at import time.
_INVOICE_REF_PATTERN = re.compile(
    r"\b(?:inv(?:oice)?|bill)\s*(?:no\.?|#|num(?:ber)?)?\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})",
    re.IGNORECASE,
)


def extract_invoice_number(text: str) -> str | None:
    """
    Extract the invoice/bill reference number from raw OCR text.

    Matches common patterns: 'Invoice No.', 'INV#', 'Bill No:', etc.
    The text is scanned line by line so a label can only pair with a
    reference on the same line. Because the 'No/#/Number' part of the label
    is optional, the pattern also fires on phrases such as 'Invoice Date' or
    'Billing'; those are filtered out by requiring the captured reference to
    contain at least one digit and not to be a known generic header.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Invoice number string, or None if not found.
    """
    if not text:
        return None

    for line in text.splitlines():
        line = line.strip()
        # Skip generic headers entirely (e.g. a bare 'TAX INVOICE' line).
        if line.lower() in _SKIP_HEADERS:
            continue

        # Check every label on the line, not just the first: in
        # "Invoice Date: 01/02/2024  Invoice No: 123" the first hit is "Date".
        for match in _INVOICE_REF_PATTERN.finditer(line):
            candidate = match.group(1).strip()
            # Guard: the capture is just a header word ('ORIGINAL', ...).
            if candidate.lower() in _SKIP_HEADERS:
                continue
            # Guard: plain words ('Date', 'Total', 'ing') are not references.
            if not any(ch.isdigit() for ch in candidate):
                continue
            return candidate

    return None


# Text immediately before a 'total' keyword that marks it as a sub-total.
_SUB_PREFIX = re.compile(r"sub[\s\-]*$", re.IGNORECASE)


def extract_amounts(text: str) -> dict[str, float | None]:
    """
    Extract subtotal, GST/tax, and total amounts from raw OCR text.

    Each field is located with its case-insensitive keyword regex
    (_SUBTOTAL_KEYWORDS, _GST_KEYWORDS, _TOTAL_KEYWORDS) and the captured
    number is converted with _parse_amount().

    Total selection:
      1. The first total-keyword match that is not the tail of
         'Subtotal' / 'Sub Total' / 'Sub-Total'.
      2. Fallback when no keyword match yields a value: the largest of the
         last four amounts in the text. Amounts written with a decimal part
         are preferred; bare integers (phone numbers, years, invoice numbers)
         are only considered when the text contains no decimal amount.

    Args:
        text: Raw OCR output as a multi-line string.

    Returns:
        Dict with keys: 'subtotal', 'gst', 'total'.
        Each value is a float or None if not found. Empty or None text
        yields None for every key.
    """
    if not text:
        return {"subtotal": None, "gst": None, "total": None}

    subtotal_match = _SUBTOTAL_KEYWORDS.search(text)
    gst_match = _GST_KEYWORDS.search(text)
    subtotal = _parse_amount(subtotal_match.group(1)) if subtotal_match else None
    gst = _parse_amount(gst_match.group(1)) if gst_match else None

    total = None
    for match in _TOTAL_KEYWORDS.finditer(text):
        # A hit on the "total" inside "Sub Total" is the subtotal line, which
        # usually precedes the real total — skip it and keep looking.
        preceding = text[max(0, match.start() - 8):match.start()]
        if _SUB_PREFIX.search(preceding):
            continue
        total = _parse_amount(match.group(1))
        if total is not None:
            break

    # --- Failure-Mode Fix: Global Max Fallback ---
    # SROIE receipts often separate labels and totals.
    # If keyword match failed, take the largest currency-formatted number
    # near the bottom.
    if total is None:
        candidates = _AMOUNT_PATTERN.findall(text)
        # _AMOUNT_PATTERN also matches bare integers; prefer values that
        # actually look like money so a trailing phone number cannot win.
        with_decimals = [raw for raw in candidates if "." in raw]
        parsed = (_parse_amount(raw) for raw in (with_decimals or candidates))
        values = [value for value in parsed if value is not None]
        if values:
            # Maximum of the last 4 amounts found (usually bottom of bill).
            total = max(values[-4:])

    return {"subtotal": subtotal, "gst": gst, "total": total}


def parse_invoice(text: str) -> dict:
    """
    Master function: parse all fields from raw OCR text.

    Calls each extractor and assembles a single dict. Any field that cannot
    be extracted is set to None — the UI renders None fields as empty inputs,
    prompting the user to fill them manually (human-in-the-loop design).

    Args:
        text: Raw OCR output as a multi-line string (from ocr.extract_text).
            None is treated as an empty string.

    Returns:
        Dict with keys: vendor, date, invoice_number, subtotal, gst, total,
        raw_text. All values are str | float | None except raw_text (always str).
    """
    # Normalise a missing OCR result up front: the regex searches behind
    # extract_amounts() reject None, and raw_text is documented as always str.
    if text is None:
        text = ""

    amounts = extract_amounts(text)
    return {
        "vendor": extract_vendor(text),
        "date": extract_date(text),
        "invoice_number": extract_invoice_number(text),
        "subtotal": amounts["subtotal"],
        "gst": amounts["gst"],
        "total": amounts["total"],
        "raw_text": text,
    }