Seth0330 committed on
Commit
71ced98
·
verified ·
1 Parent(s): 564a3c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +336 -257
app.py CHANGED
@@ -1,260 +1,339 @@
1
- import sqlite3
2
- import threading
3
- import time
4
  import re
5
- from datetime import datetime
6
-
7
  import pandas as pd
 
8
  import streamlit as st
9
-
10
- # =========================
11
- # App Config
12
- # =========================
13
- st.set_page_config(page_title="Expo Game Timer", page_icon="⏱️", layout="centered")
14
-
15
- DB_PATH = "game.db"
16
- DB_LOCK = threading.Lock()
17
- TICK_SECONDS = 0.1 # ~10 fps refresh while the timer is running
18
-
19
- # =========================
20
- # DB Utilities
21
- # =========================
22
def init_db():
    """Create the ``results`` table if it does not exist and switch the DB to WAL mode.

    The whole operation runs under ``DB_LOCK``. The connection is closed in a
    ``finally`` clause so that a failing statement does not leak it.
    """
    with DB_LOCK:
        conn = sqlite3.connect(DB_PATH, check_same_thread=False, timeout=10)
        try:
            cur = conn.cursor()
            cur.execute("PRAGMA journal_mode=WAL;")
            cur.execute(
                """
                CREATE TABLE IF NOT EXISTS results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    email TEXT NOT NULL,
                    seconds REAL NOT NULL,
                    created_at TEXT NOT NULL
                )
                """
            )
            conn.commit()
        finally:
            conn.close()
40
-
41
def insert_result(name: str, email: str, seconds: float):
    """Insert one run into ``results`` and invalidate the cached result table.

    ``name`` is stored stripped, ``email`` stripped and lower-cased, and
    ``created_at`` is the current naive UTC time in ISO format. The connection
    is closed in ``finally`` so a failed INSERT/commit does not leak it; the
    exception still propagates to the caller.
    """
    now = datetime.utcnow().isoformat()
    with DB_LOCK:
        conn = sqlite3.connect(DB_PATH, check_same_thread=False, timeout=10)
        try:
            conn.execute(
                "INSERT INTO results (name, email, seconds, created_at) VALUES (?, ?, ?, ?)",
                (name.strip(), email.strip().lower(), float(seconds), now),
            )
            conn.commit()
        finally:
            conn.close()
    load_all_results.clear()  # bust cache so dashboard updates instantly
53
-
54
@st.cache_data(show_spinner=False)
def load_all_results() -> pd.DataFrame:
    """Return every stored run, newest first (``ORDER BY id DESC``).

    The result is cached by ``st.cache_data``; ``insert_result`` clears that
    cache after each write. The connection is closed even if the query raises.
    """
    with DB_LOCK:
        conn = sqlite3.connect(DB_PATH, check_same_thread=False, timeout=10)
        try:
            return pd.read_sql_query(
                "SELECT id, name, email, seconds, created_at FROM results ORDER BY id DESC",
                conn,
            )
        finally:
            conn.close()
64
-
65
- # =========================
66
- # Helpers
67
- # =========================
68
EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")


def valid_email(email: str) -> bool:
    """Return True when ``email`` looks like ``local@domain.tld``.

    Surrounding whitespace is ignored, matching how ``insert_result`` strips
    the address before storing it. ``None`` and empty strings are invalid.
    ``fullmatch`` is used so nothing may trail the address (``$`` alone would
    tolerate a final newline).
    """
    return EMAIL_RE.fullmatch((email or "").strip()) is not None
72
-
73
def format_seconds(s: float) -> str:
    """Format a duration in seconds as ``mm:ss.mmm``; negative input is clamped to zero.

    The value is rounded to whole milliseconds *before* splitting into minutes
    and seconds, so a value such as 59.9996 carries into the minute
    ("01:00.000") instead of rendering as "00:60.000".
    """
    total_ms = round(max(float(s), 0.0) * 1000)
    minutes, ms = divmod(total_ms, 60_000)
    return f"{minutes:02d}:{ms / 1000:06.3f}"
77
-
78
def ensure_session_state():
    """Seed ``st.session_state`` with the timer and form keys that are missing.

    Existing values are left untouched. ``start_time`` of ``None`` means the
    timer is not running; ``accumulated`` holds seconds from earlier runs.
    """
    defaults = {
        "start_time": None,
        "accumulated": 0.0,
        "name": "",
        "email": "",
    }
    state = st.session_state
    for key, initial in defaults.items():
        if key not in state:
            state[key] = initial
88
-
89
def is_running() -> bool:
    """Tell whether the stopwatch is running, i.e. a start time is recorded."""
    started_at = st.session_state.start_time
    return started_at is not None
91
-
92
def current_elapsed() -> float:
    """Return accumulated seconds plus the time since the current start, if any."""
    state = st.session_state
    total = state.accumulated
    if state.start_time is not None:
        # Timer is live: add the time since it was (re)started.
        total = state.accumulated + (time.perf_counter() - state.start_time)
    return total
98
-
99
def start_timer():
    """Record the start instant unless the timer is already running."""
    state = st.session_state
    if state.start_time is not None:
        return
    state.start_time = time.perf_counter()
102
-
103
def stop_timer():
    """Fold the running interval into ``accumulated`` and mark the timer stopped."""
    state = st.session_state
    if state.start_time is None:
        return
    state.accumulated += time.perf_counter() - state.start_time
    state.start_time = None
107
-
108
def reset_timer():
    """Stop the timer and discard all accumulated time."""
    state = st.session_state
    state.accumulated = 0.0
    state.start_time = None
111
-
112
def safe_rerun():
    """Trigger a Streamlit script rerun, using whichever rerun API is available.

    ``st.rerun`` is preferred; ``st.experimental_rerun`` is used only when
    ``st.rerun`` is absent. The fallback is chosen by attribute lookup rather
    than a broad ``except Exception``, so an unrelated error is not swallowed
    and retried against an API that may not exist.
    """
    rerun = getattr(st, "rerun", None)
    if rerun is None:
        rerun = st.experimental_rerun
    rerun()
117
-
118
- # =========================
119
- # UI
120
- # =========================
121
def header():
    """Render the centered page title and tagline as raw HTML."""
    banner_html = """
        <div style="text-align:center; margin-bottom: 0.5rem;">
        <h1 style="margin-bottom:0">⏱️ Expo Game Timer</h1>
        <p style="color:#666; margin-top:0.25rem">Record participants, time their run, track a live leaderboard, and export results.</p>
        </div>
        """
    st.markdown(banner_html, unsafe_allow_html=True)
131
-
132
def participant_form():
    """Render the name and email inputs side by side.

    The widgets are bound to the ``name`` and ``email`` session-state keys.
    """
    name_col, email_col = st.columns(2)
    name_col.text_input("Participant Name", key="name", placeholder="Jane Doe")
    email_col.text_input("Email", key="email", placeholder="jane@example.com")
138
-
139
def stopwatch_card():
    """Render the stopwatch: readout, Start/Stop/Reset buttons and the save section.

    While the timer is running the function sleeps ``TICK_SECONDS`` and then
    requests a rerun, which is what keeps the readout moving.
    """
    ensure_session_state()
    state = st.session_state

    st.markdown("### Stopwatch")
    with st.container(border=True):
        # Readout (refreshed on every rerun while the timer is running)
        elapsed = current_elapsed()
        st.markdown(
            f"<div style='font-size:3rem; text-align:center; font-variant-numeric: tabular-nums;'>{format_seconds(elapsed)}</div>",
            unsafe_allow_html=True,
        )

        start_col, stop_col, reset_col = st.columns(3)
        if start_col.button("▶️ Start", use_container_width=True, disabled=is_running()):
            start_timer()
            safe_rerun()
        if stop_col.button("⏸️ Stop", use_container_width=True, disabled=not is_running()):
            stop_timer()
            safe_rerun()
        nothing_to_reset = current_elapsed() == 0.0 and not is_running()
        if reset_col.button("↺ Reset", use_container_width=True, disabled=nothing_to_reset):
            reset_timer()
            safe_rerun()

        st.caption("Tip: Start the timer when the game begins and press Stop as soon as they finish. Then Save Result.")

        st.divider()
        hints_col, action_col = st.columns([2, 1])
        with hints_col:
            st.write("**Save this run**")
            if not state.name.strip():
                st.info("Enter a participant name.")
            if not state.email.strip():
                st.info("Enter a valid email.")
            if state.email and not valid_email(state.email):
                st.error("Please enter a valid email address.")

        with action_col:
            # Saving needs a name, a valid email, some elapsed time, and a stopped timer.
            can_save = bool(
                state.name.strip()
                and valid_email(state.email)
                and current_elapsed() > 0.0
                and not is_running()
            )
            if st.button("💾 Save Result", type="primary", use_container_width=True, disabled=not can_save):
                secs = round(current_elapsed(), 3)
                try:
                    insert_result(state.name, state.email, secs)
                    st.success(f"Saved: {state.name} {format_seconds(secs)}")
                    reset_timer()
                except Exception as e:
                    st.error(f"Failed to save result: {e}")
                safe_rerun()

    # Auto-refresh while running
    if is_running():
        time.sleep(TICK_SECONDS)
        safe_rerun()
199
-
200
def dashboard():
    """Render summary metrics, the three fastest runs, and a CSV download of all runs.

    Shows only an info message when no results have been saved yet.
    """
    st.markdown("### Dashboard")
    with st.container(border=True):
        results = load_all_results()
        if results.empty:
            st.info("No results yet. Save the first run to see stats and leaderboard.")
            return

        # Quick stats
        times = results["seconds"]
        count_col, best_col, avg_col = st.columns(3)
        count_col.metric("Total Participants (runs)", len(results))
        best_col.metric("Best Time", format_seconds(times.min()))
        avg_col.metric("Average Time", format_seconds(times.mean()))

        st.markdown("#### 🏆 Top 3 Fastest")
        podium = results.sort_values("seconds", ascending=True).head(3).copy()
        podium["Time"] = podium["seconds"].apply(format_seconds)
        podium_view = podium[["name", "email", "Time", "created_at"]].rename(
            columns={"name": "Name", "email": "Email", "created_at": "Recorded (UTC)"}
        )
        st.dataframe(podium_view, hide_index=True, use_container_width=True)

        # The full table is not displayed; it is only offered as a CSV export.
        export = results.copy()
        export["time_formatted"] = export["seconds"].apply(format_seconds)
        st.download_button(
            label="⬇️ Download all results (CSV)",
            data=export.to_csv(index=False).encode("utf-8"),
            file_name="game_results.csv",
            mime="text/csv",
            use_container_width=True,
        )
239
-
240
-
241
def footer_note():
    """Render the caption describing where data lives and how repeat attempts are handled."""
    note = (
        "Data is stored in a local SQLite database (`game.db`). "
        "Note: if the Space restarts or is rebuilt, the DB resets. "
        "Multiple attempts per email are allowed; use the CSV to post-process if you want best-per-email."
    )
    st.caption(note)
247
-
248
- # =========================
249
- # Main
250
- # =========================
251
def main():
    """Initialise the database, then render each page section top to bottom."""
    init_db()
    sections = (header, participant_form, stopwatch_card, dashboard, footer_note)
    for render in sections:
        render()


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
 
 
2
  import re
3
+ import json
4
+ import numpy as np
5
  import pandas as pd
6
+ from PIL import Image, ImageOps, ImageFilter
7
  import streamlit as st
8
+ import pytesseract
9
+ from pytesseract import Output
10
+
11
+ # PDF → images
12
+ try:
13
+ from pdf2image import convert_from_bytes
14
+ PDF_OK = True
15
+ except Exception:
16
+ PDF_OK = False
17
+
18
+ st.set_page_config(page_title="Invoice OCR (Tesseract) · Streamlit", layout="wide")
19
+
20
+ # --------------------------- Image utils ---------------------------
21
def preprocess(img: Image.Image) -> Image.Image:
    """Return a black-and-white version of ``img`` intended to help Tesseract.

    Steps: grayscale, autocontrast, mild unsharp mask, then a global threshold
    at 90% of the mean intensity, clamped to the range [110, 180].
    """
    gray = ImageOps.autocontrast(ImageOps.grayscale(img))
    # Mild unsharp mask to crisp up text edges.
    sharpened = gray.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
    pixels = np.array(sharpened)
    cutoff = np.clip(pixels.mean() * 0.9, 110, 180)  # heuristic
    binary = (pixels > cutoff).astype(np.uint8) * 255
    return Image.fromarray(binary)
32
+
33
def load_pages(file_bytes: bytes, name: str):
    """Decode an upload into a list of PIL images, one per page.

    Args:
        file_bytes: Raw content of the uploaded file.
        name: Original file name; a ``.pdf`` suffix (case-insensitive) selects
            PDF rendering at 300 dpi, anything else is opened as a single image.

    Returns:
        A list of images. The list is empty when the file cannot be decoded or
        PDF support is unavailable; in that case an error is shown via
        ``st.error`` instead of raising, so callers can test ``if not pages``.
    """
    name = (name or "").lower()
    if name.endswith(".pdf"):
        if not PDF_OK:
            st.error("pdf2image not available. Did you add poppler in apt.txt?")
            return []
        try:
            return convert_from_bytes(file_bytes, dpi=300)
        except Exception as exc:
            # UI boundary: report any rendering failure (corrupt file, missing
            # poppler binaries, ...) to the user instead of crashing the script.
            st.error(f"Could not render PDF: {exc}")
            return []

    try:
        return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
    except OSError as exc:
        # Raised when the bytes are not a readable image.
        st.error(f"Could not read image: {exc}")
        return []
45
+
46
+ # --------------------------- OCR ---------------------------
47
def ocr_tsv(img: Image.Image, lang="eng") -> pd.DataFrame:
    """Run Tesseract on ``img`` and return its TSV output as a DataFrame.

    Rows whose ``text`` is missing are dropped. Four geometry columns are
    added: ``x2``/``y2`` (right and bottom edges) and ``cx``/``cy`` (box centre).

    ``text`` is forced to string dtype at parse time. Otherwise a page whose
    tokens are all numeric can be parsed as a numeric column, which breaks the
    ``.str`` accessors used by the table extraction.
    """
    # The image is passed through unscaled so the boxes refer to its pixel grid.
    data = pytesseract.image_to_data(
        img,
        lang=lang,
        output_type=Output.DATAFRAME,
        pandas_config={"dtype": {"text": str}},
    )
    data = data.dropna(subset=["text"]).reset_index(drop=True)
    data["x2"] = data["left"] + data["width"]
    data["y2"] = data["top"] + data["height"]
    data["cx"] = data["left"] + data["width"] / 2
    data["cy"] = data["top"] + data["height"] / 2
    return data
59
+
60
def ocr_text(img: Image.Image, lang="eng") -> str:
    """Run Tesseract on ``img`` and return the recognised text as one string."""
    recognised = pytesseract.image_to_string(img, lang=lang)
    return recognised
62
+
63
+ # --------------------------- Key-field parsing ---------------------------
64
# Optional currency code or symbol in front of an amount.
CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|C\$|\$|€|£)?"
# Amount: either comma-grouped thousands ("1,234.56") or a plain digit run
# ("1234.56"), with an optional 1-2 digit fraction. The trailing lookahead
# rejects numbers that are really percentages (e.g. the "13" in "Tax (13%)").
MONEY = rf"{CURRENCY}\s?(?P<amt>(?:\d{{1,3}}(?:,\d{{3}})+|\d+)(?:\.\d{{1,2}})?)(?![\d.,]*\s?%)"

DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
# The captured invoice/PO token must contain at least one digit, so that
# "Invoice Date" or "PO Box" do not yield "Date" / "Box" under IGNORECASE.
INV_PAT = r"(?:\binvoice\s*(?:no\.?|#|number)?\s*[:\-]?\s*(?P<inv>(?=[A-Z0-9\-_/]*\d)[A-Z0-9\-_/]{4,}))"
# "PO" must start at a word boundary and not continue into a longer word
# (previously the "po" inside "Corporation" matched).
PO_PAT = r"(?:\b(?:p\.?o\.?(?![A-Z])|purchase\s*order)\s*(?:no\.?|#|number)?\s*[:\-]?\s*(?P<po>(?=[A-Z0-9\-_/]*\d)[A-Z0-9\-_/]{3,}))"
# The lookbehinds keep "Sub Total" / "Sub-Total" from being read as the total.
TOTAL_PAT = rf"(?:(?<!sub\s)(?<!sub-)\b(total(?:\s*amount)?|amount\s*due|grand\s*total)\b.*?{MONEY})"
SUBTOTAL_PAT = rf"(?:\bsub[\s\-]*total\b.*?{MONEY})"
TAX_PAT = rf"(?:\b(tax|gst|vat|hst)\b.*?{MONEY})"
73
+
74
def find_first(pattern, text, flags=re.IGNORECASE | re.DOTALL):
    """Search ``text`` for ``pattern`` and return ``(named_groups, match)``.

    Both elements are ``None`` when nothing matches. By default the search is
    case-insensitive and ``.`` also matches newlines.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        return None, None
    return match.groupdict(), match
77
+
78
def parse_fields(fulltext: str):
    """Pull header-level invoice fields out of raw OCR text with regex heuristics.

    Returns a dict with the keys ``invoice_number``, ``invoice_date``,
    ``po_number``, ``subtotal``, ``tax``, ``total`` and ``currency``; a field
    that is not found stays ``None``. Amounts are returned as strings with
    thousands separators removed. ``currency`` comes from whichever of
    subtotal, tax, total (checked in that order) last carried a currency
    marker, and symbols are mapped to ISO codes.
    """
    # Collapse runs of spaces/tabs and of blank lines.
    text = re.sub(r"[ \t]+", " ", fulltext)
    text = re.sub(r"\n{2,}", "\n", text)

    fields = dict.fromkeys(
        (
            "invoice_number",
            "invoice_date",
            "po_number",
            "subtotal",
            "tax",
            "total",
            "currency",
        )
    )

    # Identifier fields: first match of each pattern wins.
    for key, pattern, group in (
        ("invoice_number", INV_PAT, "inv"),
        ("po_number", PO_PAT, "po"),
    ):
        groups, _ = find_first(pattern, text)
        if groups and groups.get(group):
            fields[key] = groups[group].strip()

    # Date: prefer one right after an "invoice date" label, else the first date anywhere.
    labelled = re.search(rf"(invoice\s*date[:\-\s]*){DATE}", text, re.IGNORECASE)
    if labelled:
        fields["invoice_date"] = labelled.group("date")
    else:
        groups, _ = find_first(DATE, text)
        if groups and groups.get("date"):
            fields["invoice_date"] = groups["date"]

    # Monetary values; a later field's currency overrides an earlier one.
    for key, pattern in (
        ("subtotal", SUBTOTAL_PAT),
        ("tax", TAX_PAT),
        ("total", TOTAL_PAT),
    ):
        groups, _ = find_first(pattern, text)
        if groups and groups.get("amt"):
            fields[key] = groups["amt"].replace(",", "")
            fields["currency"] = groups.get("curr") or fields["currency"]

    # Normalize currency symbols to codes; other values pass through unchanged.
    symbol_codes = {"$": "USD", "C$": "CAD", "€": "EUR", "£": "GBP"}
    fields["currency"] = symbol_codes.get(fields["currency"], fields["currency"])

    return fields
137
+
138
+ # --------------------------- Line item parsing ---------------------------
139
HEAD_CANDIDATES = ["description", "item", "qty", "quantity", "price", "unit price", "rate", "amount", "total"]

# Single-word header tokens mapped to the output column they feed. Several
# tokens may share one output column (e.g. "Item" and "Description").
_HEADER_CANONICAL = {
    "description": "description",
    "item": "description",
    "qty": "quantity",
    "quantity": "quantity",
    "price": "unit_price",
    "rate": "unit_price",
    "amount": "line_total",
    "total": "line_total",
}
_ITEM_COLUMNS = ["description", "quantity", "unit_price", "line_total"]
# Words that mark the start of the totals/footer area (non-capturing on purpose:
# pandas warns when a ``str.contains`` pattern has capture groups).
_FOOTER_RE = r"(?:sub\s*total|amount\s*due|total|grand\s*total|balance)"


def guess_header_rows(tsv: pd.DataFrame) -> pd.DataFrame:
    """Find text lines that look like a line-item table header.

    Words are grouped by ``(block_num, par_num, line_num)``; each line's
    lower-cased text is scored by how many ``HEAD_CANDIDATES`` entries occur in
    it as substrings (so "unit price" also counts as "price").

    Returns:
        One row per line scoring at least 2, with the line's text and bounding
        box (``top``, ``bottom``, ``left``, ``right``), best score first and
        topmost first among ties. Empty when there are no candidates.
    """
    lines = []
    for keys, g in tsv.groupby(["block_num", "par_num", "line_num"], as_index=False):
        text = " ".join(w for w in g["text"].astype(str) if w.strip())
        if not text.strip():
            continue
        lines.append(
            {
                "block_num": keys[0],
                "par_num": keys[1],
                "line_num": keys[2],
                "text": text.lower(),
                "top": g["top"].min(),
                "bottom": g["y2"].max(),
                "left": g["left"].min(),
                "right": g["x2"].max(),
            }
        )
    L = pd.DataFrame(lines)
    if L.empty:
        return L

    L["header_score"] = L["text"].apply(lambda s: sum(1 for h in HEAD_CANDIDATES if h in s))
    return L[L["header_score"] >= 2].sort_values(["header_score", "top"], ascending=[False, True])


def extract_table(tsv: pd.DataFrame) -> pd.DataFrame:
    """Extract line items from word-level OCR output using header geometry.

    Algorithm:
      1. pick the best-scoring header line from ``guess_header_rows``;
      2. use the x-centres of the recognised header words as column anchors;
      3. take the words below the header and above the first footer word
         (subtotal / total / amount due / balance);
      4. per text line, assign every word to the nearest anchor.

    Header words are mapped to canonical column names *before* cells are
    collected, so headers such as "Item Description" or "Amount Total" feed a
    single output column instead of producing duplicate column names.

    Args:
        tsv: Word-level frame with ``block_num``, ``par_num``, ``line_num``,
            ``text``, ``top``, ``left``, ``x2``, ``y2`` and ``cx`` columns.

    Returns:
        A frame with whichever of ``description``, ``quantity``,
        ``unit_price``, ``line_total`` were detected (in that order), one row
        per item line; an empty frame when no table is found.
    """
    header_lines = guess_header_rows(tsv)
    if header_lines.empty:
        return pd.DataFrame()

    best = header_lines.iloc[0]
    band_top, band_bottom = best["top"], best["bottom"]

    # Words inside the header band (5 px tolerance) that are known header tokens.
    in_band = tsv[(tsv["top"] >= band_top - 5) & (tsv["y2"] <= band_bottom + 5)]
    canonical = in_band["text"].astype(str).str.lower().map(_HEADER_CANONICAL)
    header_words = in_band.assign(canonical=canonical).dropna(subset=["canonical"]).sort_values("cx")
    if header_words.empty:
        return pd.DataFrame()

    # (column name, x-centre) anchors, left to right; names may repeat.
    anchors = list(zip(header_words["canonical"], header_words["cx"]))
    column_names = list(dict.fromkeys(name for name, _ in anchors))

    # Item region: below the header, above the first footer word.
    below = tsv[tsv["top"] > band_bottom + 5].copy()
    is_footer = below["text"].astype(str).str.lower().str.contains(_FOOTER_RE, regex=True, na=False)
    if is_footer.any():
        footer_y = below.loc[is_footer, "top"].min()
        below = below[below["top"] < footer_y - 4]
    if below.empty:
        return pd.DataFrame()

    items = []
    for _, line in below.groupby(["block_num", "par_num", "line_num"]):
        ordered = line.sort_values("cx")
        cells = {name: [] for name in column_names}
        for word, cx in zip(ordered["text"].astype(str), ordered["cx"]):
            if not word.strip():
                continue
            nearest_name, _ = min(anchors, key=lambda a, cx=cx: abs(a[1] - cx))
            cells[nearest_name].append(word)
        row = {name: " ".join(words).strip() for name, words in cells.items()}
        if any(row.values()):  # skip lines that contributed no text
            items.append(row)

    if not items:
        return pd.DataFrame()

    df = pd.DataFrame(items)
    df = df[[c for c in _ITEM_COLUMNS if c in df.columns]]
    return df.reset_index(drop=True)
258
+
259
+ # --------------------------- App UI ---------------------------
260
# Page flow: upload -> choose page -> preprocess -> OCR -> parse -> display/download.
st.title("Invoice Extraction (Tesseract · Streamlit)")

st.sidebar.header("Settings")
# A blank language box would be passed straight to Tesseract; fall back to English.
lang = st.sidebar.text_input("Tesseract language(s)", value="eng").strip() or "eng"
show_tsv = st.sidebar.checkbox("Show raw OCR TSV", value=False)
show_fulltext = st.sidebar.checkbox("Show full OCR text", value=False)

up = st.file_uploader("Upload an invoice (PDF, PNG, JPG)", type=["pdf","png","jpg","jpeg"], accept_multiple_files=False)

if not up:
    st.info("Upload a scanned invoice PDF or an image to begin.")
    st.stop()

# getvalue() returns the whole buffer regardless of the stream position, so a
# rerun (page change, sidebar toggle) never sees an already-consumed stream.
pages = load_pages(up.getvalue(), up.name)
if not pages:
    st.stop()

# Page selector (for multi-page PDFs)
if len(pages) > 1:
    idx = st.number_input("Page", min_value=1, max_value=len(pages), value=1)
    img = pages[int(idx) - 1]
else:
    img = pages[0]

col_prev, col_data = st.columns([1.1, 1.3], gap="large")

with col_prev:
    st.subheader("Preview")
    st.image(img, use_column_width=True, caption="Original page")
    pre = preprocess(img)
    with st.expander("Preprocessed (for OCR)"):
        st.image(pre, use_column_width=True)

with col_data:
    st.subheader("Extraction")
    with st.spinner("Running Tesseract..."):
        tsv = ocr_tsv(pre, lang=lang)
        text = ocr_text(pre, lang=lang)

    key_fields = parse_fields(text)
    st.markdown("**Key Fields (heuristic)**")
    k1, k2, k3 = st.columns(3)
    with k1:
        st.write(f"**Invoice #:** {key_fields.get('invoice_number') or '—'}")
        st.write(f"**Invoice Date:** {key_fields.get('invoice_date') or '—'}")
    with k2:
        st.write(f"**PO #:** {key_fields.get('po_number') or '—'}")
        st.write(f"**Subtotal:** {key_fields.get('subtotal') or '—'}")
    with k3:
        st.write(f"**Tax:** {key_fields.get('tax') or '—'}")
        tot = key_fields.get('total') or '—'
        cur = key_fields.get('currency') or ''
        st.write(f"**Total:** {tot} {cur}".strip())

    st.markdown("**Line Items (auto-detected)**")
    items = extract_table(tsv)
    if items.empty:
        st.caption("No line items confidently detected. You can still download full OCR text.")
    else:
        st.dataframe(items, use_container_width=True)

    # Downloads
    result = {
        "file": up.name,
        "key_fields": key_fields,
        "items": items.to_dict(orient="records") if not items.empty else [],
        "full_text": text,
    }
    j = json.dumps(result, indent=2)
    st.download_button("Download JSON", data=j, file_name="invoice_extraction.json", mime="application/json")
    if not items.empty:
        csv = items.to_csv(index=False)
        st.download_button("Download Line Items CSV", data=csv, file_name="invoice_items.csv", mime="text/csv")

# Optional raw views (controlled by the sidebar checkboxes)
with st.expander("Advanced · Raw Outputs"):
    if show_fulltext:
        st.text_area("OCR Full Text", value=text, height=220)
    if show_tsv:
        st.dataframe(tsv.head(100), use_container_width=True)