""" Document Understanding OCR Extract structured invoice fields from OCR text with transparent confidence checks. """ from pathlib import Path import re import pandas as pd import plotly.express as px import streamlit as st st.set_page_config(page_title="Document Understanding OCR", page_icon="📄", layout="wide") def load_shared_css() -> None: current_dir = Path(__file__).resolve().parent candidates = [ current_dir / "shared" / "styles.css", current_dir.parent / "shared" / "styles.css", ] css_path = next(path for path in candidates if path.exists()) st.markdown(f"", unsafe_allow_html=True) load_shared_css() SAMPLE_TEXT = """INVOICE Vendor: Northwind Robotics GmbH Invoice No: INV-2026-0418 Date: 2026-04-18 Due Date: 2026-05-18 Bill To: Atlas Manufacturing Description Qty Unit Price Amount Vision sensor calibration 3 420.00 1260.00 Edge gateway support 2 310.00 620.00 Subtotal: 1880.00 Tax: 357.20 Total: 2237.20 EUR """ FIELD_PATTERNS = { "vendor": [r"Vendor:\s*(.+)", r"From:\s*(.+)"], "invoice_number": [r"Invoice\s*(?:No|#|Number):\s*([A-Z0-9\-]+)"], "invoice_date": [r"Date:\s*(\d{4}-\d{2}-\d{2})", r"Date:\s*(\d{1,2}/\d{1,2}/\d{4})"], "due_date": [r"Due Date:\s*(\d{4}-\d{2}-\d{2})", r"Payment Due:\s*(.+)"], "customer": [r"Bill To:\s*(.+)", r"Customer:\s*(.+)"], "subtotal": [r"Subtotal:\s*([0-9,.]+)"], "tax": [r"Tax:\s*([0-9,.]+)"], "total": [r"Total:\s*([0-9,.]+)\s*([A-Z]{3})?"], } def first_match(text: str, patterns): for pattern in patterns: match = re.search(pattern, text, re.IGNORECASE) if match: groups = [group for group in match.groups() if group] return " ".join(groups).strip(), 0.92 if len(groups[0]) > 2 else 0.68 return "", 0.0 def extract_line_items(text: str) -> pd.DataFrame: rows = [] for line in text.splitlines(): match = re.search(r"^(.+?)\s+(\d+)\s+([0-9,.]+)\s+([0-9,.]+)$", line.strip()) if match: description, qty, unit_price, amount = match.groups() rows.append({ "description": description.strip(), "quantity": int(qty), "unit_price": float(unit_price.replace(",", "")), "amount": float(amount.replace(",", "")), }) return pd.DataFrame(rows) def extract_document(text: str): fields = [] payload = {} for name, patterns in FIELD_PATTERNS.items(): value, confidence = first_match(text, patterns) payload[name] = value fields.append({"field": name, "value": value or "not found", "confidence": confidence}) items = extract_line_items(text) if not items.empty: computed_total = items["amount"].sum() payload["line_item_count"] = int(len(items)) payload["computed_line_total"] = round(float(computed_total), 2) field_df = pd.DataFrame(fields) completeness = round((field_df["confidence"] > 0).mean() * 100, 1) average_confidence = round(field_df["confidence"].mean() * 100, 1) return payload, field_df, items, completeness, average_confidence st.markdown("""
Turn noisy OCR text into auditable invoice JSON, field confidence, and line-item checks.