""" backend/utils/layout_template.py INVOICE LAYOUT TEMPLATE MAPPER ================================ Converts any VLM/OCR extraction result into a STANDARDIZED output where each field always appears at the same position in the document — regardless of the source invoice's language, style, or quality (including handwritten/unclear images). CONCEPT: - Real invoices all share the same SPATIAL ZONES (top-left=vendor, top-right=invoice#, etc.) - VLM extracts raw field values from image - This module maps those values onto a fixed positional template - Output is always consistent: same field names, same order, same structure ADVANTAGE FOR HANDWRITTEN/UNCLEAR IMAGES: - Even if OCR reads garbled text, VLM can infer fields from position - Template forces a complete output with all expected fields (empty if not found) - Downstream systems always receive the same schema regardless of image quality STANDARD INVOICE ZONES (spatial reference): ┌─────────────────────────────────────────────────┐ │ [TOP-LEFT] │ [TOP-RIGHT] │ │ Vendor Name │ Invoice Number │ │ Vendor Address │ Invoice Date │ │ Vendor GSTIN │ Due Date │ ├─────────────────────────────────────────────────┤ │ [MID-LEFT] │ [MID-RIGHT] │ │ Bill To (Buyer) │ Ship To │ │ Buyer Address │ PO Number │ │ Buyer GSTIN │ │ ├─────────────────────────────────────────────────┤ │ [CENTER - TABLE] │ │ Item | Qty | Rate | HSN | Tax | Amount │ ├─────────────────────────────────────────────────┤ │ [BOTTOM-RIGHT] │ │ Subtotal / Taxable Amount │ │ CGST / SGST / IGST │ │ Total Amount │ └─────────────────────────────────────────────────┘ """ from __future__ import annotations import logging from typing import Any logger = logging.getLogger(__name__) # ── STANDARD TEMPLATE SCHEMA ────────────────────────────────────────────────── # Each entry: (canonical_field_name, zone, aliases_to_search_in_raw_extraction) INVOICE_TEMPLATE: list[tuple[str, str, list[str]]] = [ # ── VENDOR / SELLER (Top-Left) ──────────────────────────────────────────── ("vendor_name", "top_left", ["vendor/shop name", "vendor", "seller", "from", "company", "firm", "shop", "store", "biller", "issued_by", "supplier"]), ("vendor_address", "top_left", ["address", "vendor_address", "seller_address", "from_address", "office_address"]), ("vendor_gstin", "top_left", ["phone / gstin / tax id", "gstin", "gst", "gst_number", "gstin_number", "vendor_gstin", "seller_gstin", "tax_id"]), ("vendor_phone", "top_left", ["phone / gstin / tax id", "phone", "mobile", "contact", "tel", "telephone", "vendor_phone"]), ("vendor_email", "top_left", ["email", "e-mail", "vendor_email"]), # ── INVOICE META (Top-Right) ─────────────────────────────────────────────── ("invoice_number", "top_right", ["bill no / customer id", "invoice_number", "invoice_no", "bill_number", "bill_no", "receipt_no", "challan_no", "ref_no"]), ("invoice_date", "top_right", ["date", "invoice_date", "bill_date", "issued_date", "dated"]), ("due_date", "top_right", ["due_date", "payment_due", "pay_by"]), ("po_number", "top_right", ["po_number", "purchase_order", "order_number", "po_no"]), # ── BUYER / BILL-TO (Mid-Left) ──────────────────────────────────────────── ("buyer_name", "mid_left", ["buyer", "bill_to", "customer", "client", "consignee", "to", "billed_to", "sold_to"]), ("buyer_address", "mid_left", ["buyer_address", "bill_to_address", "customer_address", "delivery_address"]), ("buyer_gstin", "mid_left", ["buyer_gstin", "customer_gstin", "bill_to_gstin"]), # ── LINE ITEMS (Center Table) ───────────────────────────────────────────── ("items", "center", ["items", "line_items", "products", "goods", "services", "description", "particulars"]), # ── TAX SUMMARY (Bottom-Right) ──────────────────────────────────────────── ("subtotal", "bottom_right", ["subtotal", "taxable_amount", "taxable_value", "net_amount", "before_tax"]), ("cgst", "bottom_right", ["cgst", "central_gst", "cgst_amount"]), ("sgst", "bottom_right", ["sgst", "state_gst", "sgst_amount"]), ("igst", "bottom_right", ["igst", "integrated_gst", "igst_amount"]), ("total_tax", "bottom_right", ["total_tax", "tax_total", "gst_total", "vat"]), ("total_amount", "bottom_right", ["total amount", "total", "grand_total", "total_amount", "net_total", "amount_due", "bill_total", "payable", "final_amount"]), ("amount_in_words", "bottom_right", ["amount_in_words", "in_words", "rupees_in_words", "total_words"]), # ── PAYMENT INFO (Bottom) ───────────────────────────────────────────────── ("bank_name", "bottom", ["bank", "bank_name"]), ("account_number", "bottom", ["account_number", "account_no", "acc_no", "bank_account"]), ("ifsc", "bottom", ["ifsc", "ifsc_code", "bank_ifsc"]), ("upi", "bottom", ["upi", "upi_id", "gpay", "phonepe"]), # ── MISC ────────────────────────────────────────────────────────────────── ("notes", "footer", ["notes", "terms", "conditions", "remarks", "note"]), ] # Zone display order for the standardized output ZONE_ORDER = ["top_left", "top_right", "mid_left", "center", "bottom_right", "bottom", "footer"] ZONE_LABELS = { "top_left": "VENDOR / SELLER", "top_right": "INVOICE DETAILS", "mid_left": "BUYER / BILL TO", "center": "LINE ITEMS", "bottom_right": "TAX & TOTALS", "bottom": "PAYMENT INFO", "footer": "NOTES & TERMS", } def _normalize_key(key: str) -> str: """Normalize a key for fuzzy alias matching.""" return key.lower().strip().replace(" ", "_").replace("-", "_").replace(".", "") def _flatten_dict(d: dict[str, Any], parent_key: str = '') -> dict[str, Any]: items: list[tuple[str, Any]] = [] if isinstance(d, list): return {parent_key: d} for k, v in d.items(): new_key = f"{parent_key}_{k}" if parent_key else k if isinstance(v, dict): items.extend(_flatten_dict(v, new_key).items()) else: items.append((new_key, v)) return dict(items) def _find_value(raw: dict[str, Any], aliases: list[str]) -> Any: """ Search the raw extraction dict for a value matching any of the given aliases. Handles nested dicts and case-insensitive keys. Returns the first match found, or None. """ if not isinstance(raw, dict): return None flat_raw = _flatten_dict(raw) # Build normalized key map norm_map = {_normalize_key(k): v for k, v in flat_raw.items()} for alias in aliases: norm_alias = _normalize_key(alias) # Direct match if norm_alias in norm_map: val = norm_map[norm_alias] if val and str(val).strip(): return val # Partial match: alias is substring of a key or vice versa for norm_k, v in norm_map.items(): if (norm_alias in norm_k or norm_k in norm_alias) and v and str(v).strip(): return v # Last resort: check if the value appears in a "full_extraction" text blob full_text = raw.get("full_extraction", "") return None def _extract_from_full_text(full_text: str, aliases: list[str]) -> str | None: """ When raw extraction has a 'full_extraction' text blob (not structured), try to find a field value by looking for known label patterns. e.g., 'Invoice No: 1234' → '1234' """ if not full_text: return None import re for alias in aliases: # Pattern: "alias: value" or "alias - value" on a line pattern = rf'(?i){re.escape(alias.replace("_", "[ _-]"))}[\s:.\-]+([^\n]+)' match = re.search(pattern, full_text) if match: val = match.group(1).strip().rstrip(",;") if val: return val return None def _parse_markdown_table(full_text: str) -> list[dict[str, str]]: """ Parse a markdown table from the Digital Twin text into a list of dicts. Look for | Header | style lines. """ import re lines = full_text.splitlines() table_lines = [l.strip() for l in lines if l.strip().startswith("|")] if len(table_lines) < 3: # Need header, separator, and at least one row return [] # Simple parser: assume first line is header, second is separator try: header_raw = table_lines[0].strip("|").split("|") headers = [h.strip().lower() for h in header_raw] rows = [] for line in table_lines[2:]: if "---" in line: continue cells = [c.strip() for c in line.strip("|").split("|")] if len(cells) >= len(headers): row = {headers[i]: cells[i] for i in range(len(headers))} rows.append(row) return rows except Exception: return [] def map_to_standard_template(raw_extraction: dict[str, Any]) -> dict[str, Any]: """ Map any raw VLM extraction result to the standard invoice template. Args: raw_extraction: Raw dict from VLM (any key names, any structure) Returns: Standardized dict with canonical field names, always same structure. Empty string for fields not found. """ # Handle the case where VLM returned a 'full_extraction' text blob full_text = raw_extraction.get("full_extraction", "") is_text_blob = bool(full_text) and len(raw_extraction) <= 4 result: dict[str, Any] = {} matched_count = 0 for field_name, zone, aliases in INVOICE_TEMPLATE: val = None if field_name == "items" and full_text: # Special handling for table parsing from Digital Twin markdown val = _parse_markdown_table(full_text) if not val and not is_text_blob: val = _find_value(raw_extraction, aliases) if not val and full_text: val = _extract_from_full_text(full_text, aliases + [field_name]) result[field_name] = val or "" if val: matched_count += 1 # If nothing matched and we have a full_text blob, preserve it under a special key if matched_count == 0 and full_text: result["_raw_text"] = full_text logger.info("[layout_template] No structured fields found. Preserving raw text blob.") else: logger.info("[layout_template] Mapped %d/%d fields from extraction.", matched_count, len(INVOICE_TEMPLATE)) result["_template_version"] = "invoice_v1" return result def format_standardized_output(mapped: dict[str, Any]) -> str: """ Format the mapped fields into a human-readable standardized text output. Fields always appear in the same zone order — consistent across all invoices. Empty fields are shown as '—' to maintain the template structure. """ lines = ["=" * 60, " STANDARDIZED INVOICE EXTRACTION", "=" * 60] # Group by zone zone_fields: dict[str, list[tuple[str, str, Any]]] = {z: [] for z in ZONE_ORDER} for field_name, zone, aliases in INVOICE_TEMPLATE: val = mapped.get(field_name, "") zone_fields[zone].append((field_name, aliases[0].replace("_", " ").title(), val)) for zone in ZONE_ORDER: fields = zone_fields[zone] if not fields: continue # Skip empty zones if all(not v for _, _, v in fields): continue lines.append(f"\n[{ZONE_LABELS[zone]}]") lines.append("-" * 40) for field_name, label, val in fields: display_val = str(val).strip() if val else "\u2014" lines.append(f" {label:<25} {display_val}") # Raw text fallback raw_text = mapped.get("_raw_text", "") if raw_text: lines.append("\n[RAW EXTRACTED TEXT]") lines.append("-" * 40) lines.append(raw_text[:2000]) lines.append("\n" + "=" * 60) return "\n".join(lines)