| """ |
| Document Understanding OCR |
| Extract structured invoice fields from OCR text with transparent confidence checks. |
| """ |
|
|
| from pathlib import Path |
| import re |
|
|
| import pandas as pd |
| import plotly.express as px |
| import streamlit as st |
|
|
|
|
# Configure the page; Streamlit requires this before any other st.* call.
# NOTE(review): the page_icon glyph "๐" looks mis-encoded (mojibake) — confirm
# the originally intended emoji.
st.set_page_config(page_title="Document Understanding OCR", page_icon="๐", layout="wide")
|
|
|
|
def load_shared_css() -> None:
    """Inject the shared stylesheet (if present) into the Streamlit page.

    Looks for ``shared/styles.css`` next to this file first, then one
    directory up (the usual multi-page Space layout).  If neither file
    exists the function is a no-op — the bare ``next()`` it replaced
    raised ``StopIteration`` and crashed the app at import time when the
    stylesheet was missing (e.g. a fresh clone without the shared assets).
    """
    current_dir = Path(__file__).resolve().parent
    candidates = (
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    )
    # next(..., None) degrades gracefully instead of raising when no
    # candidate path exists.
    css_path = next((path for path in candidates if path.exists()), None)
    if css_path is None:
        return
    st.markdown(f"<style>{css_path.read_text(encoding='utf-8')}</style>", unsafe_allow_html=True)
|
|
|
|
# Apply the shared Space styling once at import time, before any UI renders.
load_shared_css()
|
|
|
|
| SAMPLE_TEXT = """INVOICE |
| Vendor: Northwind Robotics GmbH |
| Invoice No: INV-2026-0418 |
| Date: 2026-04-18 |
| Due Date: 2026-05-18 |
| |
| Bill To: Atlas Manufacturing |
| |
| Description Qty Unit Price Amount |
| Vision sensor calibration 3 420.00 1260.00 |
| Edge gateway support 2 310.00 620.00 |
| |
| Subtotal: 1880.00 |
| Tax: 357.20 |
| Total: 2237.20 EUR |
| """ |
|
|
|
|
# Regex alternatives per output field, tried in order by first_match() with
# re.IGNORECASE.  Each pattern captures the field value in group 1; "total"
# additionally captures an optional 3-letter currency code in group 2.
FIELD_PATTERNS = {
    "vendor": [r"Vendor:\s*(.+)", r"From:\s*(.+)"],
    "invoice_number": [r"Invoice\s*(?:No|#|Number):\s*([A-Z0-9\-]+)"],
    # ISO date first, then US-style d/m/yyyy fallback.
    "invoice_date": [r"Date:\s*(\d{4}-\d{2}-\d{2})", r"Date:\s*(\d{1,2}/\d{1,2}/\d{4})"],
    "due_date": [r"Due Date:\s*(\d{4}-\d{2}-\d{2})", r"Payment Due:\s*(.+)"],
    "customer": [r"Bill To:\s*(.+)", r"Customer:\s*(.+)"],
    "subtotal": [r"Subtotal:\s*([0-9,.]+)"],
    "tax": [r"Tax:\s*([0-9,.]+)"],
    "total": [r"Total:\s*([0-9,.]+)\s*([A-Z]{3})?"],
}
|
|
|
|
def first_match(text: str, patterns):
    """Return the first regex capture in *text* plus a heuristic confidence.

    Patterns are tried in order (case-insensitive search); the first one
    that yields a non-empty capture wins, and all of its non-empty groups
    are joined with a single space.

    Confidence is a coarse heuristic, not a model probability: 0.92 when
    the first capture is longer than 2 characters, 0.68 for very short
    captures (more likely OCR noise), and 0.0 when nothing matched.

    Fix: the original indexed ``groups[0]`` unconditionally, which raised
    IndexError when a pattern matched but every capture group was empty or
    None (possible with optional groups); such matches now fall through to
    the next pattern instead.
    """
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        groups = [group for group in match.groups() if group]
        if not groups:
            # Match carried no evidence — try the next pattern.
            continue
        value = " ".join(groups).strip()
        confidence = 0.92 if len(groups[0]) > 2 else 0.68
        return value, confidence
    return "", 0.0
|
|
|
|
def extract_line_items(text: str) -> pd.DataFrame:
    """Parse tabular line items out of OCR text.

    A line item is any line whose stripped form ends with four whitespace-
    separated fields: description (free text), integer quantity, unit
    price, and amount.  Thousands separators in the numeric columns are
    stripped before conversion.  Returns an empty DataFrame when no line
    matches.
    """
    item_re = re.compile(r"^(.+?)\s+(\d+)\s+([0-9,.]+)\s+([0-9,.]+)$")
    parsed = []
    for raw_line in text.splitlines():
        hit = item_re.search(raw_line.strip())
        if not hit:
            continue
        desc, qty, price, total = hit.groups()
        parsed.append(
            {
                "description": desc.strip(),
                "quantity": int(qty),
                "unit_price": float(price.replace(",", "")),
                "amount": float(total.replace(",", "")),
            }
        )
    return pd.DataFrame(parsed)
|
|
|
|
def extract_document(text: str):
    """Run field and line-item extraction and derive summary metrics.

    Returns a 5-tuple: (payload dict, per-field DataFrame, line-item
    DataFrame, completeness %, average confidence %), the percentages
    rounded to one decimal place.
    """
    payload = {}
    field_rows = []
    for name, patterns in FIELD_PATTERNS.items():
        value, confidence = first_match(text, patterns)
        payload[name] = value
        field_rows.append(
            {
                "field": name,
                "value": value if value else "not found",
                "confidence": confidence,
            }
        )

    items = extract_line_items(text)
    if not items.empty:
        # Line-item aggregates let the UI cross-check the stated totals.
        payload["line_item_count"] = int(len(items))
        payload["computed_line_total"] = round(float(items["amount"].sum()), 2)

    field_df = pd.DataFrame(field_rows)
    found_share = (field_df["confidence"] > 0).mean()
    completeness = round(found_share * 100, 1)
    average_confidence = round(field_df["confidence"].mean() * 100, 1)
    return payload, field_df, items, completeness, average_confidence
|
|
|
|
| st.markdown(""" |
| <div class="hero"> |
| <div class="hf-badge">Document AI</div> |
| <h1>๐ Document Understanding OCR</h1> |
| <p>Turn noisy OCR text into auditable invoice JSON, field confidence, and line-item checks.</p> |
| <div class="pill-row"> |
| <span class="hf-chip">Schema extraction</span> |
| <span class="hf-chip">Confidence audit</span> |
| <span class="hf-chip">HF model-ready</span> |
| </div> |
| </div> |
| """, unsafe_allow_html=True) |
|
|
# Sidebar: methodology note plus the review threshold consumed later by the
# Validation tab (fields below it are flagged for human review).
with st.sidebar:
    st.markdown("### Technique")
    st.info("This Space demonstrates the post-OCR layer: regex/schema extraction, confidence scoring, validation, and export. A production Space can swap the text input for Donut, LayoutLM, or TrOCR inference on Hugging Face.")
    min_confidence = st.slider("Review threshold", 0.0, 1.0, 0.70, 0.05)
|
|
# Input layout: editable OCR text on the left (pre-filled with the sample
# invoice), a short capability pitch on the right.
left, right = st.columns([1.1, 0.9])
with left:
    text = st.text_area("OCR text", value=SAMPLE_TEXT, height=360)
with right:
    st.markdown("### What This Space Proves")
    st.markdown("""
- Extracts structured fields from unstructured document text.
- Separates evidence and confidence instead of pretending OCR is perfect.
- Produces JSON that can feed a dataset, API, or human review queue.
""")
|
|
# Re-run extraction on every Streamlit rerun and surface headline metrics.
payload, field_df, items, completeness, average_confidence = extract_document(text)

metric_cols = st.columns(3)
metric_cols[0].metric("Field completeness", f"{completeness}%")
metric_cols[1].metric("Average confidence", f"{average_confidence}%")
# line_item_count is only present in the payload when line items were found.
metric_cols[2].metric("Line items", int(payload.get("line_item_count", 0)))
|
|
| tab1, tab2, tab3 = st.tabs(["Extraction", "Validation", "Export"]) |
|
|
| with tab1: |
| st.dataframe(field_df, use_container_width=True, hide_index=True) |
| if not items.empty: |
| st.markdown("### Line Items") |
| st.dataframe(items, use_container_width=True, hide_index=True) |
|
|
# Validation tab: confidence bar chart plus a human-review queue of fields
# below the sidebar threshold.
with tab2:
    fig = px.bar(
        field_df,
        x="field",
        y="confidence",
        color="confidence",
        range_y=[0, 1],
        color_continuous_scale=["#b8a9d9", "#ffad7a", "#7accff"],
        title="Field-level confidence",
    )
    st.plotly_chart(fig, use_container_width=True)
    # min_confidence comes from the sidebar slider.
    review = field_df[field_df["confidence"] < min_confidence]
    if review.empty:
        st.success("All extracted fields are above the review threshold.")
    else:
        st.warning("Fields below threshold need human review.")
        st.dataframe(review, use_container_width=True, hide_index=True)
|
|
# Export tab: raw payload preview plus a JSON download of the same dict.
# NOTE(review): pd.Series(...).to_json serializes the flat scalar payload;
# json.dumps would be equivalent here — confirm no nested values are expected.
with tab3:
    st.json(payload)
    st.download_button(
        "Download JSON",
        data=pd.Series(payload).to_json(indent=2),
        file_name="document_extraction.json",
        mime="application/json",
        use_container_width=True,
    )
|
|