""" extractor.py — Regex-based field parser for the Bill/Invoice Scanner. Responsibilities: - extract_vendor(): find the company/vendor name from raw OCR text - extract_date(): find the invoice date in multiple date formats - extract_invoice_number(): find the invoice/bill reference number - extract_amounts(): find subtotal, GST/tax, and total amounts - parse_invoice(): master function — calls all above, returns single dict All functions accept a raw text string and return a value or None. No imports from other project modules — this module is self-contained. """ from __future__ import annotations import re # --------------------------------------------------------------------------- # Compiled regex patterns (compile once at module load for performance) # --------------------------------------------------------------------------- # Known header strings to skip when detecting vendor name _SKIP_HEADERS = { "tax invoice", "invoice", "bill", "receipt", "gst invoice", "retail invoice", "cash receipt", "sale receipt", "original", "duplicate", "restaurant bill", "restaurant", "bill of supply", } # Date patterns: DD/MM/YYYY · DD-MM-YYYY · DD Mon YYYY · Mon DD YYYY · DD-Mon-YYYY _DATE_PATTERNS = [ re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"), re.compile( r"\b(\d{1,2}\s+" r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*" r"\s+\d{2,4})\b", re.IGNORECASE, ), re.compile( r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*" r"\s+\d{1,2},?\s+\d{2,4}\b", re.IGNORECASE, ), # DD-Mon-YYYY e.g. 22-Feb-2024 re.compile( r"\b(\d{1,2}[-/]" r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*" r"[-/]\d{2,4})\b", re.IGNORECASE, ), ] # Invoice / bill number patterns _INVOICE_NO_PATTERN = re.compile( r"\b(?:invoice\s*(?:no\.?|#|number|num\.?)|inv\.?\s*(?:no\.?|#)?|bill\s*(?:no\.?|#))" r"\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})", re.IGNORECASE, ) # Amount pattern: handles ₹ Rs. $ and comma-thousands _AMOUNT_PATTERN = re.compile( r"(?:₹|Rs\.?|\$)?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)" ) # Keyword matchers for each amount field (case-insensitive) # Highly flexible to handle dots, RM, and multi-line gaps _TOTAL_KEYWORDS = re.compile( r"(?:round\s*d\s*total|grand\s*total|total\s*payable|total\s*due|total\s*amount|net\s*amount|total|payable)\b" r"[\s\.\:\(RM\)]*?" # Handle : (RM) .... etc r"([\d,]+\.\d{2})\b", re.IGNORECASE | re.DOTALL, ) _SUBTOTAL_KEYWORDS = re.compile( r"\b(?:subtotal|sub\s*total|net\s*amount|amount\s*before\s*tax)\s*[:\-]?\s*" r"(?:₹|Rs\.?|\$)?\s*([\d,]+(?:\.\d{1,2})?)", re.IGNORECASE, ) _GST_KEYWORDS = re.compile( r"\b(?:gst|cgst|sgst|igst|vat|tax|service\s*tax)\s*(?:\(?\d+%?\)?)?\s*[:\-]?\s*" r"(?:₹|Rs\.?|\$)?\s*([\d,]+(?:\.\d{1,2})?)", re.IGNORECASE, ) # --------------------------------------------------------------------------- # Helper # --------------------------------------------------------------------------- def _parse_amount(raw: str) -> float | None: """ Parse a raw amount string (possibly with commas/currency symbols) to float. Args: raw: A string like '1,250.00', '1250', '₹ 1,250'. Returns: Float value, or None if parsing fails. """ if raw is None: return None cleaned = raw.replace(",", "").strip() try: return float(cleaned) except ValueError: return None # --------------------------------------------------------------------------- # Field extractors # --------------------------------------------------------------------------- def extract_vendor(text: str) -> str | None: """ Extract the vendor/company name from raw OCR text. Strategy: the first non-empty, non-numeric line that is not a known generic header (e.g., 'TAX INVOICE') is usually the vendor name. Args: text: Raw OCR output as a multi-line string. Returns: Vendor name string, or None if not identifiable. """ if not text: return None lines = [ln.strip() for ln in text.splitlines() if ln.strip()] for line in lines: lower = line.lower() # Skip known generic headers if lower in _SKIP_HEADERS: continue # Skip lines that are purely numeric or very short if re.fullmatch(r"[\d\s\-/.,]+", line) or len(line) < 3: continue # Skip lines that look like dates or invoice numbers if _DATE_PATTERNS[0].search(line) or _INVOICE_NO_PATTERN.search(line): continue return line return None def extract_date(text: str) -> str | None: """ Extract the invoice date from raw OCR text. Tries patterns in sequence: numeric (DD/MM/YYYY), then written-month variants. Returns the first match found. Args: text: Raw OCR output as a multi-line string. Returns: Date string as found in the text, or None if not found. """ if not text: return None for pattern in _DATE_PATTERNS: match = pattern.search(text) if match: return match.group(1) if match.lastindex else match.group(0) return None def extract_invoice_number(text: str) -> str | None: """ Extract the invoice/bill reference number from raw OCR text. Matches common patterns: 'Invoice No.', 'INV#', 'Bill No:', etc. Avoids matching headers like 'TAX INVOICE' by checking line-by-line and ensuring the label is followed by a potential reference. Args: text: Raw OCR output as a multi-line string. Returns: Invoice number string, or None if not found. """ if not text: return None # Stricter pattern that avoids matching just 'INVOICE' followed by newline # Requires a label followed by at least 2 alphanumeric chars on the same line pattern = re.compile( r"\b(?:inv(?:oice)?|bill)\s*(?:no\.?|#|num(?:ber)?)?\s*[:\-]?\s*([A-Z0-9][-A-Z0-9/]{2,30})", re.IGNORECASE ) for line in text.splitlines(): line = line.strip() # Skip generic headers entirely (failure mode fix) if line.lower() in _SKIP_HEADERS: continue match = pattern.search(line) if match: # Additional guard: don't return the match if it's just a known header substring val = match.group(1).strip() if val.lower() not in _SKIP_HEADERS: return val return None def extract_amounts(text: str) -> dict[str, float | None]: """ Extract subtotal, GST/tax, and total amounts from raw OCR text. Uses case-insensitive keyword matching before each amount to correctly classify the value. The failure-mode fix for 'Total: None' is applied here — all keyword comparisons operate on lowercased text and the regex allows optional whitespace between the keyword and the colon/value. Args: text: Raw OCR output as a multi-line string. Returns: Dict with keys: 'subtotal', 'gst', 'total'. Each value is a float or None if not found. """ # Search for each amount type total_match = _TOTAL_KEYWORDS.search(text) subtotal_match = _SUBTOTAL_KEYWORDS.search(text) gst_match = _GST_KEYWORDS.search(text) total = _parse_amount(total_match.group(1)) if total_match else None # --- Failure-Mode Fix: Global Max Fallback --- # SROIE receipts often separate labels and totals. # If keyword match failed, take the largest currency-formatted number near the bottom. if total is None: all_amounts = _AMOUNT_PATTERN.findall(text) if all_amounts: # Clean and parse all found amounts numeric_vals = [] for m in all_amounts: v = _parse_amount(m) if v is not None: numeric_vals.append(v) if numeric_vals: # Take the maximum of the last 4 amounts found (usually bottom of bill) total = max(numeric_vals[-4:]) subtotal = _parse_amount(subtotal_match.group(1)) if subtotal_match else None gst = _parse_amount(gst_match.group(1)) if gst_match else None return {"subtotal": subtotal, "gst": gst, "total": total} def parse_invoice(text: str) -> dict: """ Master function: parse all fields from raw OCR text. Calls each extractor and assembles a single dict. Any field that cannot be extracted is set to None — the UI renders None fields as empty inputs, prompting the user to fill them manually (human-in-the-loop design). Args: text: Raw OCR output as a multi-line string (from ocr.extract_text). Returns: Dict with keys: vendor, date, invoice_number, subtotal, gst, total, raw_text. All values are str | float | None except raw_text (always str). """ amounts = extract_amounts(text) return { "vendor": extract_vendor(text), "date": extract_date(text), "invoice_number": extract_invoice_number(text), "subtotal": amounts["subtotal"], "gst": amounts["gst"], "total": amounts["total"], "raw_text": text, }