Spaces:

Divya499
/

Bill-Invoice-Scanner-Pro

Sleeping

DIVYANSHI SINGH

Root project layout configured for deployment

b0bec61 2 months ago

9.33 kB

	"""
	extractor.py — Regex-based field parser for the Bill/Invoice Scanner.

	Responsibilities:
	- extract_vendor(): find the company/vendor name from raw OCR text
	- extract_date(): find the invoice date in multiple date formats
	- extract_invoice_number(): find the invoice/bill reference number
	- extract_amounts(): find subtotal, GST/tax, and total amounts
	- parse_invoice(): master function — calls all above, returns single dict

	Each field extractor accepts a raw text string and returns the extracted value,
	or None when nothing is found; extract_amounts() and parse_invoice() return dicts.
	No imports from other project modules — this module is self-contained.
	"""

	from __future__ import annotations
	import re


	# ---------------------------------------------------------------------------
	# Compiled regex patterns (compile once at module load for performance)
	# ---------------------------------------------------------------------------

	# Known header strings to skip when detecting vendor name
	_SKIP_HEADERS = {
	"tax invoice", "invoice", "bill", "receipt", "gst invoice",
	"retail invoice", "cash receipt", "sale receipt", "original",
	"duplicate", "restaurant bill", "restaurant", "bill of supply",
	}

	# Date patterns: DD/MM/YYYY · DD-MM-YYYY · DD Mon YYYY · Mon DD YYYY · DD-Mon-YYYY
	_DATE_PATTERNS = [
	re.compile(r"\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b"),
	re.compile(
	r"\b(\d{1,2}\s+"
	r"(?:Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)[a-z]*"
	r"\s+\d{2,4})\b",
	re.IGNORECASE,
	),
	re.compile(
	r"\b(?:Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)[a-z]*"
	r"\s+\d{1,2},?\s+\d{2,4}\b",
	re.IGNORECASE,
	),
	# DD-Mon-YYYY e.g. 22-Feb-2024
	re.compile(
	r"\b(\d{1,2}[-/]"
	r"(?:Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)[a-z]*"
	r"[-/]\d{2,4})\b",
	re.IGNORECASE,
	),
	]

	# Invoice / bill number patterns
	_INVOICE_NO_PATTERN = re.compile(
	r"\b(?:invoice\s(?:no\.?\|#\|number\|num\.?)\|inv\.?\s(?:no\.?\|#)?\|bill\s*(?:no\.?\|#))"
	r"\s[:\-]?\s([A-Z0-9][-A-Z0-9/]{2,30})",
	re.IGNORECASE,
	)

	# Amount pattern: handles ₹ Rs. $ and comma-thousands
	_AMOUNT_PATTERN = re.compile(
	r"(?:₹\|Rs\.?\|\$)?\s(\d{1,3}(?:,\d{3})(?:\.\d{1,2})?\|\d+(?:\.\d{1,2})?)"
	)

	# Keyword matchers for each amount field (case-insensitive)
	# Highly flexible to handle dots, RM, and multi-line gaps
	_TOTAL_KEYWORDS = re.compile(
	r"(?:round\sd\stotal\|grand\stotal\|total\spayable\|total\sdue\|total\samount\|net\s*amount\|total\|payable)\b"
	r"[\s\.\:$RM$]*?" # Handle : (RM) .... etc
	r"([\d,]+\.\d{2})\b",
	re.IGNORECASE \| re.DOTALL,
	)
	_SUBTOTAL_KEYWORDS = re.compile(
	r"\b(?:subtotal\|sub\stotal\|net\samount\|amount\sbefore\stax)\s[:\-]?\s"
	r"(?:₹\|Rs\.?\|\$)?\s*([\d,]+(?:\.\d{1,2})?)",
	re.IGNORECASE,
	)
	_GST_KEYWORDS = re.compile(
	r"\b(?:gst\|cgst\|sgst\|igst\|vat\|tax\|service\stax)\s(?:$?\d+%?$?)?\s[:\-]?\s"
	r"(?:₹\|Rs\.?\|\$)?\s*([\d,]+(?:\.\d{1,2})?)",
	re.IGNORECASE,
	)


	# ---------------------------------------------------------------------------
	# Helper
	# ---------------------------------------------------------------------------

	def _parse_amount(raw: str) -> float \| None:
	"""
	Parse a raw amount string (possibly with commas/currency symbols) to float.

	Args:
	raw: A string like '1,250.00', '1250', '₹ 1,250'.

	Returns:
	Float value, or None if parsing fails.
	"""
	if raw is None:
	return None
	cleaned = raw.replace(",", "").strip()
	try:
	return float(cleaned)
	except ValueError:
	return None


	# ---------------------------------------------------------------------------
	# Field extractors
	# ---------------------------------------------------------------------------

	def extract_vendor(text: str) -> str | None:
	    """
	    Extract the vendor/company name from raw OCR text.

	    Strategy: the first non-empty, non-numeric line that is not a known
	    generic header (e.g., 'TAX INVOICE') is usually the vendor name.

	    Args:
	        text: Raw OCR output as a multi-line string.

	    Returns:
	        Vendor name string, or None if not identifiable.
	    """
	    if not text:
	        return None

	    for raw_line in text.splitlines():
	        line = raw_line.strip()
	        if not line:
	            continue
	        # Skip known generic headers (compared case-insensitively).
	        if line.lower() in _SKIP_HEADERS:
	            continue
	        # Skip lines that are purely numeric/punctuation or very short.
	        if re.fullmatch(r"[\d\s\-/.,]+", line) or len(line) < 3:
	            continue
	        # Skip lines that look like numeric dates or invoice numbers.
	        if _DATE_PATTERNS[0].search(line) or _INVOICE_NO_PATTERN.search(line):
	            continue
	        return line

	    return None


	def extract_date(text: str) -> str | None:
	    """
	    Extract the invoice date from raw OCR text.

	    Tries the patterns in _DATE_PATTERNS in sequence and returns the first
	    match found; the pattern order, not the position in the text, decides
	    which date wins.

	    Args:
	        text: Raw OCR output as a multi-line string.

	    Returns:
	        Date string as found in the text, or None if not found.
	    """
	    if not text:
	        return None

	    for pattern in _DATE_PATTERNS:
	        match = pattern.search(text)
	        if match:
	            # Patterns without a capture group report the whole match.
	            return match.group(1) if match.lastindex else match.group(0)

	    return None


	def extract_invoice_number(text: str) -> str | None:
	    """
	    Extract the invoice/bill reference number from raw OCR text.

	    Matches common patterns: 'Invoice No.', 'INV#', 'Bill No:', etc.
	    Avoids matching headers like 'TAX INVOICE' by checking line-by-line
	    and ensuring the label is followed by a potential reference.

	    Args:
	        text: Raw OCR output as a multi-line string.

	    Returns:
	        Invoice number string, or None if not found.
	    """
	    if not text:
	        return None

	    # Label, optional qualifier (no / # / number), optional ':' or '-', then
	    # the reference on the same line. The lookahead requires the reference to
	    # contain at least one digit: because the qualifier is optional, ordinary
	    # words would otherwise be returned ("Invoice Date" -> "Date",
	    # "Proforma Invoice" -> "oice").
	    pattern = re.compile(
	        r"\b(?:inv(?:oice)?|bill)\s*(?:no\.?|#|num(?:ber)?)?\s*[:\-]?\s*"
	        r"(?=[-A-Z0-9/]{0,30}\d)([A-Z0-9][-A-Z0-9/]{2,30})",
	        re.IGNORECASE,
	    )

	    for raw_line in text.splitlines():
	        line = raw_line.strip()
	        # Skip generic headers entirely (failure mode fix)
	        if line.lower() in _SKIP_HEADERS:
	            continue

	        match = pattern.search(line)
	        if match:
	            # Additional guard: don't return a value that is itself a known
	            # generic header string.
	            val = match.group(1).strip()
	            if val.lower() not in _SKIP_HEADERS:
	                return val

	    return None


	def extract_amounts(text: str) -> dict[str, float | None]:
	    """
	    Extract subtotal, GST/tax, and total amounts from raw OCR text.

	    Each field is located by a case-insensitive keyword pattern that precedes
	    the amount. When no total keyword matches, the total falls back to the
	    largest of the last four numbers found in the text.

	    Args:
	        text: Raw OCR output as a multi-line string.

	    Returns:
	        Dict with keys: 'subtotal', 'gst', 'total'.
	        Each value is a float or None if not found.
	    """
	    # Same empty-input contract as the other extractors: nothing to search.
	    if not text:
	        return {"subtotal": None, "gst": None, "total": None}

	    # Search for each amount type
	    total_match = _TOTAL_KEYWORDS.search(text)
	    subtotal_match = _SUBTOTAL_KEYWORDS.search(text)
	    gst_match = _GST_KEYWORDS.search(text)

	    total = _parse_amount(total_match.group(1)) if total_match else None

	    # --- Failure-Mode Fix: Global Max Fallback ---
	    # SROIE receipts often separate labels and totals.
	    # If keyword match failed, take the largest number near the bottom.
	    if total is None:
	        parsed = (_parse_amount(found) for found in _AMOUNT_PATTERN.findall(text))
	        numeric_vals = [value for value in parsed if value is not None]
	        if numeric_vals:
	            # Take the maximum of the last 4 amounts found (usually bottom of bill)
	            total = max(numeric_vals[-4:])

	    subtotal = _parse_amount(subtotal_match.group(1)) if subtotal_match else None
	    gst = _parse_amount(gst_match.group(1)) if gst_match else None

	    return {"subtotal": subtotal, "gst": gst, "total": total}


	def parse_invoice(text: str) -> dict:
	    """
	    Master function: parse all fields from raw OCR text.

	    Calls each extractor and assembles a single dict. Any field that cannot
	    be extracted is set to None — the UI renders None fields as empty inputs,
	    prompting the user to fill them manually (human-in-the-loop design).

	    Args:
	        text: Raw OCR output as a multi-line string (from ocr.extract_text).

	    Returns:
	        Dict with keys: vendor, date, invoice_number, subtotal, gst, total,
	        raw_text. All values are str | float | None except raw_text (always str).
	    """
	    # Normalise None/empty input so raw_text is always a str, as documented,
	    # and so no extractor receives None.
	    raw_text = text or ""
	    amounts = extract_amounts(raw_text)
	    return {
	        "vendor": extract_vendor(raw_text),
	        "date": extract_date(raw_text),
	        "invoice_number": extract_invoice_number(raw_text),
	        "subtotal": amounts["subtotal"],
	        "gst": amounts["gst"],
	        "total": amounts["total"],
	        "raw_text": raw_text,
	    }