File size: 5,909 Bytes
e5ee651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Document Understanding OCR
Extract structured invoice fields from OCR text with transparent confidence checks.
"""

from pathlib import Path
import re

import pandas as pd
import plotly.express as px
import streamlit as st


# Page-level configuration for the browser tab. The icon is the "page facing
# up" emoji (U+1F4C4); it had been stored as a mis-decoded byte sequence.
st.set_page_config(page_title="Document Understanding OCR", page_icon="📄", layout="wide")


def load_shared_css() -> None:
    """Inject the shared stylesheet into the page when one is available.

    Looks for ``shared/styles.css`` next to this file and then one directory
    up, and embeds the first regular file found inside a ``<style>`` tag.

    The stylesheet is purely cosmetic, so when neither candidate exists the
    function returns without rendering anything rather than letting an
    exhausted ``next()`` raise ``StopIteration`` during app start-up.
    """
    current_dir = Path(__file__).resolve().parent
    candidates = (
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    )
    # is_file() also rejects a directory that happens to carry the same name,
    # which would otherwise fail later in read_text().
    css_path = next((path for path in candidates if path.is_file()), None)
    if css_path is None:
        return
    st.markdown(f"<style>{css_path.read_text(encoding='utf-8')}</style>", unsafe_allow_html=True)


# Apply the shared stylesheet before any page content is rendered.
load_shared_css()


# Default invoice used to pre-fill the "OCR text" box. It contains a label for
# every field in FIELD_PATTERNS and two line-item rows whose amounts add up to
# the stated subtotal (1260.00 + 620.00 = 1880.00).
SAMPLE_TEXT = """INVOICE
Vendor: Northwind Robotics GmbH
Invoice No: INV-2026-0418
Date: 2026-04-18
Due Date: 2026-05-18

Bill To: Atlas Manufacturing

Description                 Qty   Unit Price   Amount
Vision sensor calibration     3      420.00    1260.00
Edge gateway support          2      310.00     620.00

Subtotal: 1880.00
Tax: 357.20
Total: 2237.20 EUR
"""


# Candidate regexes for each invoice field, tried in list order. They are
# applied with re.IGNORECASE (see first_match), so every label must be
# anchored tightly enough not to match inside a longer label.
FIELD_PATTERNS = {
    "vendor": [r"Vendor:\s*(.+)", r"From:\s*(.+)"],
    "invoice_number": [r"Invoice\s*(?:No|#|Number):\s*([A-Z0-9\-]+)"],
    # The lookbehind stops "Due Date: ..." from being read as the invoice date
    # when the due date appears first or the invoice date is missing.
    "invoice_date": [
        r"(?<!Due\s)\bDate:\s*(\d{4}-\d{2}-\d{2})",
        r"(?<!Due\s)\bDate:\s*(\d{1,2}/\d{1,2}/\d{4})",
    ],
    "due_date": [r"Due Date:\s*(\d{4}-\d{2}-\d{2})", r"Payment Due:\s*(.+)"],
    "customer": [r"Bill To:\s*(.+)", r"Customer:\s*(.+)"],
    "subtotal": [r"Subtotal:\s*([0-9,.]+)"],
    "tax": [r"Tax:\s*([0-9,.]+)"],
    # \b and the lookbehind keep "Subtotal:" / "Sub-total:" from matching as
    # the total. The optional currency code must sit on the same line and is
    # matched case-sensitively ((?-i:...)), so a following word such as "Tax"
    # on the next line is not mistaken for a currency.
    "total": [r"(?<!Sub[\s\-])\bTotal:\s*([0-9,.]+)(?:[ \t]*((?-i:[A-Z]{3}))\b)?"],
}


def first_match(text: str, patterns):
    """Return the value captured by the first usable pattern, with a confidence.

    Each regex in *patterns* is searched in *text* case-insensitively, in
    order. The non-empty capture groups of the first usable match are
    stripped and joined with single spaces.

    A match that captures nothing usable (only whitespace, no capture groups,
    or only unmatched optional groups) is treated as a miss and the next
    pattern is tried. This keeps a bare label such as ``"Vendor:   "`` from
    being reported as a found field with an empty value.

    Returns:
        A ``(value, confidence)`` tuple. Confidence is a fixed heuristic:
        0.92 when the first captured group is longer than two characters,
        0.68 when it is shorter, and ``("", 0.0)`` when nothing matched.
    """
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match is None:
            continue
        captured = [group.strip() for group in match.groups() if group and group.strip()]
        if not captured:
            continue
        confidence = 0.92 if len(captured[0]) > 2 else 0.68
        return " ".join(captured), confidence
    return "", 0.0


def extract_line_items(text: str) -> pd.DataFrame:
    """Parse ``description  qty  unit_price  amount`` rows out of OCR text.

    A line qualifies when, after stripping, it ends with an integer quantity
    followed by two numeric tokens. Commas are treated as thousands
    separators and removed before conversion.

    Noisy OCR can produce tokens that fit the character class but are not
    numbers (for example ``"."`` or ``"1.2.3"``). Such rows are skipped
    instead of raising ``ValueError``.

    Returns:
        A DataFrame with the columns ``description``, ``quantity``,
        ``unit_price`` and ``amount``. The columns are present even when no
        row was parsed, in which case the frame is empty.
    """
    columns = ["description", "quantity", "unit_price", "amount"]
    # Compiled once per call rather than re-resolved for every line.
    row_pattern = re.compile(r"^(.+?)\s+(\d+)\s+([0-9,.]+)\s+([0-9,.]+)$")

    rows = []
    for line in text.splitlines():
        match = row_pattern.search(line.strip())
        if match is None:
            continue
        description, qty, unit_price, amount = match.groups()
        try:
            unit_price_value = float(unit_price.replace(",", ""))
            amount_value = float(amount.replace(",", ""))
        except ValueError:
            # Stray punctuation matched the numeric character class.
            continue
        rows.append({
            "description": description.strip(),
            "quantity": int(qty),
            "unit_price": unit_price_value,
            "amount": amount_value,
        })
    return pd.DataFrame(rows, columns=columns)


def extract_document(text: str):
    """Extract invoice fields and line items from OCR text and score the result.

    Every field in ``FIELD_PATTERNS`` is looked up with ``first_match``; line
    items come from ``extract_line_items``.

    Returns:
        A 5-tuple ``(payload, field_df, items, completeness,
        average_confidence)``:

        * ``payload`` maps each field name to its extracted value (empty
          string when not found). When at least one line item was parsed it
          also holds ``line_item_count`` and ``computed_line_total``.
        * ``field_df`` has one row per field with the columns ``field``,
          ``value`` ("not found" for misses) and ``confidence``.
        * ``items`` is the line-item DataFrame.
        * ``completeness`` is the percentage of fields with confidence above
          zero, and ``average_confidence`` the mean confidence as a
          percentage; both are rounded to one decimal.
    """
    matches = {
        name: first_match(text, patterns)
        for name, patterns in FIELD_PATTERNS.items()
    }

    payload = {name: value for name, (value, _score) in matches.items()}
    field_df = pd.DataFrame(
        [
            {"field": name, "value": value or "not found", "confidence": score}
            for name, (value, score) in matches.items()
        ]
    )

    items = extract_line_items(text)
    has_items = not items.empty
    if has_items:
        line_total = float(items["amount"].sum())
        payload["line_item_count"] = int(len(items))
        payload["computed_line_total"] = round(line_total, 2)

    scores = field_df["confidence"]
    completeness = round((scores > 0).mean() * 100, 1)
    average_confidence = round(scores.mean() * 100, 1)
    return payload, field_df, items, completeness, average_confidence


# Hero banner rendered as raw HTML so the shared stylesheet's classes apply.
# The heading emoji is "page facing up" (U+1F4C4), restored from a mis-decoded
# byte sequence.
st.markdown("""
<div class="hero">
  <div class="hf-badge">Document AI</div>
  <h1>📄 Document Understanding OCR</h1>
  <p>Turn noisy OCR text into auditable invoice JSON, field confidence, and line-item checks.</p>
  <div class="pill-row">
    <span class="hf-chip">Schema extraction</span>
    <span class="hf-chip">Confidence audit</span>
    <span class="hf-chip">HF model-ready</span>
  </div>
</div>
""", unsafe_allow_html=True)

# Sidebar: a short description of the technique plus the review threshold.
# Fields whose confidence falls below min_confidence are listed for human
# review in the Validation tab (slider from 0.0 to 1.0, default 0.70, step 0.05).
with st.sidebar:
    st.markdown("### Technique")
    st.info("This Space demonstrates the post-OCR layer: regex/schema extraction, confidence scoring, validation, and export. A production Space can swap the text input for Donut, LayoutLM, or TrOCR inference on Hugging Face.")
    min_confidence = st.slider("Review threshold", 0.0, 1.0, 0.70, 0.05)

# Two-column layout: the editable OCR text (pre-filled with SAMPLE_TEXT) on the
# left, and a short summary of what the demo shows on the right.
left, right = st.columns([1.1, 0.9])
with left:
    # Whatever the user leaves in this box is what gets parsed below.
    text = st.text_area("OCR text", value=SAMPLE_TEXT, height=360)
with right:
    st.markdown("### What This Space Proves")
    st.markdown("""
- Extracts structured fields from unstructured document text.
- Separates evidence and confidence instead of pretending OCR is perfect.
- Produces JSON that can feed a dataset, API, or human review queue.
""")

# Run the extraction on the text-area contents and show the headline metrics.
payload, field_df, items, completeness, average_confidence = extract_document(text)

metric_cols = st.columns(3)
metric_cols[0].metric("Field completeness", f"{completeness}%")
metric_cols[1].metric("Average confidence", f"{average_confidence}%")
# line_item_count is only added to the payload when at least one line item was
# parsed, hence the default of 0.
metric_cols[2].metric("Line items", int(payload.get("line_item_count", 0)))

tab1, tab2, tab3 = st.tabs(["Extraction", "Validation", "Export"])

# Extraction tab: the per-field table, plus the line-item table when any rows
# were parsed.
with tab1:
    st.dataframe(field_df, use_container_width=True, hide_index=True)
    if not items.empty:
        st.markdown("### Line Items")
        st.dataframe(items, use_container_width=True, hide_index=True)

# Validation tab: per-field confidence as a bar chart on a fixed 0-1 axis,
# followed by the fields that need human review.
with tab2:
    fig = px.bar(
        field_df,
        x="field",
        y="confidence",
        color="confidence",
        range_y=[0, 1],
        color_continuous_scale=["#b8a9d9", "#ffad7a", "#7accff"],
        title="Field-level confidence",
    )
    st.plotly_chart(fig, use_container_width=True)
    # Strictly below the sidebar threshold; fields that were not found carry a
    # confidence of 0.0 and so are listed whenever the threshold is above zero.
    review = field_df[field_df["confidence"] < min_confidence]
    if review.empty:
        st.success("All extracted fields are above the review threshold.")
    else:
        st.warning("Fields below threshold need human review.")
        st.dataframe(review, use_container_width=True, hide_index=True)

# Export tab: show the extracted payload and offer it as a JSON download.
with tab3:
    st.json(payload)
    st.download_button(
        "Download JSON",
        # The payload dict is wrapped in a Series to reuse pandas' JSON
        # serialiser for the download contents.
        data=pd.Series(payload).to_json(indent=2),
        file_name="document_extraction.json",
        mime="application/json",
        use_container_width=True,
    )