Spaces:

Seth0330
/

DPT2

Runtime error

App Files Files Community

Seth0330 committed on Oct 23, 2025

Commit

71ced98

verified ·

1 Parent(s): 564a3c9

Update app.py

Browse files

Files changed (1) hide show

app.py +336 -257

app.py CHANGED Viewed

@@ -1,260 +1,339 @@
-import sqlite3
-import threading
-import time
 import re
-from datetime import datetime
 import pandas as pd
 import streamlit as st
-# =========================
-# App Config
-# =========================
-st.set_page_config(page_title="Expo Game Timer", page_icon="⏱️", layout="centered")
-DB_PATH = "game.db"
-DB_LOCK = threading.Lock()
-TICK_SECONDS = 0.1  # ~10 fps refresh while the timer is running
-# =========================
-# DB Utilities
-# =========================
-def init_db():
-    with DB_LOCK:
-        conn = sqlite3.connect(DB_PATH, check_same_thread=False, timeout=10)
-        cur = conn.cursor()
-        cur.execute("PRAGMA journal_mode=WAL;")
-        cur.execute(
-            """
-            CREATE TABLE IF NOT EXISTS results (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                name TEXT NOT NULL,
-                email TEXT NOT NULL,
-                seconds REAL NOT NULL,
-                created_at TEXT NOT NULL
-            )
-            """
-        )
-        conn.commit()
-        conn.close()
-def insert_result(name: str, email: str, seconds: float):
-    now = datetime.utcnow().isoformat()
-    with DB_LOCK:
-        conn = sqlite3.connect(DB_PATH, check_same_thread=False, timeout=10)
-        cur = conn.cursor()
-        cur.execute(
-            "INSERT INTO results (name, email, seconds, created_at) VALUES (?, ?, ?, ?)",
-            (name.strip(), email.strip().lower(), float(seconds), now),
-        )
-        conn.commit()
-        conn.close()
-    load_all_results.clear()  # bust cache so dashboard updates instantly
-@st.cache_data(show_spinner=False)
-def load_all_results() -> pd.DataFrame:
-    with DB_LOCK:
-        conn = sqlite3.connect(DB_PATH, check_same_thread=False, timeout=10)
-        df = pd.read_sql_query(
-            "SELECT id, name, email, seconds, created_at FROM results ORDER BY id DESC",
-            conn,
-        )
-        conn.close()
-    return df
-# =========================
-# Helpers
-# =========================
-EMAIL_RE = re.compile(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$")
-def valid_email(email: str) -> bool:
-    return bool(EMAIL_RE.match(email or ""))
-def format_seconds(s: float) -> str:
-    # mm:ss.mmm
-    m, sec = divmod(max(float(s), 0.0), 60)
-    return f"{int(m):02d}:{sec:06.3f}"
-def ensure_session_state():
-    ss = st.session_state
-    if "start_time" not in ss:   # None means not currently running
-        ss.start_time = None
-    if "accumulated" not in ss:  # seconds already accrued from past runs
-        ss.accumulated = 0.0
-    if "name" not in ss:
-        ss.name = ""
-    if "email" not in ss:
-        ss.email = ""
-def is_running() -> bool:
-    return st.session_state.start_time is not None
-def current_elapsed() -> float:
-    """Total elapsed = accumulated + (now - start_time if running)."""
-    ss = st.session_state
-    if ss.start_time is None:
-        return ss.accumulated
-    return ss.accumulated + (time.perf_counter() - ss.start_time)
-def start_timer():
-    if st.session_state.start_time is None:
-        st.session_state.start_time = time.perf_counter()
-def stop_timer():
-    if st.session_state.start_time is not None:
-        st.session_state.accumulated += (time.perf_counter() - st.session_state.start_time)
-        st.session_state.start_time = None
-def reset_timer():
-    st.session_state.start_time = None
-    st.session_state.accumulated = 0.0
-def safe_rerun():
-    try:
-        st.rerun()
-    except Exception:
-        st.experimental_rerun()
-# =========================
-# UI
-# =========================
-def header():
-    st.markdown(
-        """
-        <div style="text-align:center; margin-bottom: 0.5rem;">
-            <h1 style="margin-bottom:0">⏱️ Expo Game Timer</h1>
-            <p style="color:#666; margin-top:0.25rem">Record participants, time their run, track a live leaderboard, and export results.</p>
-        </div>
-        """,
-        unsafe_allow_html=True,
-    )
-def participant_form():
-    c1, c2 = st.columns(2)
-    with c1:
-        st.text_input("Participant Name", key="name", placeholder="Jane Doe")
-    with c2:
-        st.text_input("Email", key="email", placeholder="jane@example.com")
-def stopwatch_card():
-    ensure_session_state()
-    st.markdown("### Stopwatch")
-    with st.container(border=True):
-        # Display (updates continuously while running)
-        elapsed = current_elapsed()
-        st.markdown(
-            f"<div style='font-size:3rem; text-align:center; font-variant-numeric: tabular-nums;'>{format_seconds(elapsed)}</div>",
-            unsafe_allow_html=True,
-        )
-        b1, b2, b3 = st.columns(3)
-        with b1:
-            if st.button("▶️ Start", use_container_width=True, disabled=is_running()):
-                start_timer()
-                safe_rerun()
-        with b2:
-            if st.button("⏸️ Stop", use_container_width=True, disabled=not is_running()):
-                stop_timer()
-                safe_rerun()
-        with b3:
-            if st.button("↺ Reset", use_container_width=True, disabled=(current_elapsed() == 0.0 and not is_running())):
-                reset_timer()
-                safe_rerun()
-        st.caption("Tip: Start the timer when the game begins and press Stop as soon as they finish. Then Save Result.")
-        st.divider()
-        save_col1, save_col2 = st.columns([2, 1])
-        with save_col1:
-            st.write("**Save this run**")
-            if not st.session_state.name.strip():
-                st.info("Enter a participant name.")
-            if not st.session_state.email.strip():
-                st.info("Enter a valid email.")
-            if st.session_state.email and not valid_email(st.session_state.email):
-                st.error("Please enter a valid email address.")
-        with save_col2:
-            disabled_save = (
-                not st.session_state.name.strip()
-                or not valid_email(st.session_state.email)
-                or current_elapsed() <= 0.0
-                or is_running()  # don't allow saving while the timer is running
-            )
-            if st.button("💾 Save Result", type="primary", use_container_width=True, disabled=disabled_save):
-                secs = round(current_elapsed(), 3)
-                try:
-                    insert_result(st.session_state.name, st.session_state.email, secs)
-                    st.success(f"Saved: {st.session_state.name} — {format_seconds(secs)}")
-                    reset_timer()
-                except Exception as e:
-                    st.error(f"Failed to save result: {e}")
-                safe_rerun()
-    # Auto-refresh while running (simple, robust pattern)
-    if is_running():
-        time.sleep(TICK_SECONDS)
-        safe_rerun()
-def dashboard():
-    st.markdown("### Dashboard")
-    with st.container(border=True):
-        df = load_all_results()
-        if df.empty:
-            st.info("No results yet. Save the first run to see stats and leaderboard.")
-            return
-        # Quick stats
-        total = len(df)
-        best = df["seconds"].min()
-        avg = df["seconds"].mean()
-        s1, s2, s3 = st.columns(3)
-        s1.metric("Total Participants (runs)", total)
-        s2.metric("Best Time", format_seconds(best))
-        s3.metric("Average Time", format_seconds(avg))
-        st.markdown("#### 🏆 Top 3 Fastest")
-        top3 = df.sort_values("seconds", ascending=True).head(3).copy()
-        top3["Time"] = top3["seconds"].apply(format_seconds)
-        st.dataframe(
-            top3[["name", "email", "Time", "created_at"]]
-            .rename(columns={"name": "Name", "email": "Email", "created_at": "Recorded (UTC)"}),
-            hide_index=True,
-            use_container_width=True,
-        )
-        # --- No "All Results" table displayed ---
-        # Still provide CSV of the full dataset
-        csv_df = df.copy()
-        csv_df["time_formatted"] = csv_df["seconds"].apply(format_seconds)
-        st.download_button(
-            label="⬇️ Download all results (CSV)",
-            data=csv_df.to_csv(index=False).encode("utf-8"),
-            file_name="game_results.csv",
-            mime="text/csv",
-            use_container_width=True,
-        )
-def footer_note():
-    st.caption(
-        "Data is stored in a local SQLite database (`game.db`). "
-        "Note: if the Space restarts or is rebuilt, the DB resets. "
-        "Multiple attempts per email are allowed; use the CSV to post-process if you want best-per-email."
-    )
-# =========================
-# Main
-# =========================
-def main():
-    init_db()
-    header()
-    participant_form()
-    stopwatch_card()
-    dashboard()
-    footer_note()
-if __name__ == "__main__":
-    main()

+import io
 import re
+import json
+import numpy as np
 import pandas as pd
+from PIL import Image, ImageOps, ImageFilter
 import streamlit as st
+import pytesseract
+from pytesseract import Output
+# PDF → images
+try:
+    from pdf2image import convert_from_bytes
+    PDF_OK = True
+except Exception:
+    PDF_OK = False
+st.set_page_config(page_title="Invoice OCR (Tesseract) · Streamlit", layout="wide")
+# --------------------------- Image utils ---------------------------
+def preprocess(img: Image.Image) -> Image.Image:
+    """Produce a black-and-white copy of ``img`` to feed to Tesseract.
+    Steps, in the order applied: grayscale, autocontrast, mild unsharp mask,
+    then one global threshold derived from the mean brightness.
+    """
+    gray = ImageOps.autocontrast(ImageOps.grayscale(img))
+    # Mild unsharp mask to crisp up glyph edges before thresholding.
+    sharpened = gray.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
+    pixels = np.array(sharpened)
+    # Heuristic cutoff: 90% of the mean brightness, clamped to [110, 180].
+    cutoff = np.clip(pixels.mean() * 0.9, 110, 180)
+    binary = (pixels > cutoff).astype(np.uint8) * 255
+    return Image.fromarray(binary)
+def load_pages(file_bytes: bytes, name: str):
+    """Decode an uploaded file into a list of PIL images, one per page.
+    Args:
+        file_bytes: Raw bytes of the upload.
+        name: Original file name; a ``.pdf`` suffix (case-insensitive) selects
+            the PDF path, anything else is opened as a single image.
+    Returns:
+        A list of PIL images. On any failure an error is shown with
+        ``st.error`` and an empty list is returned, which the caller treats
+        as "nothing to process".
+    """
+    name = (name or "").lower()
+    if name.endswith(".pdf"):
+        if not PDF_OK:
+            # PDF_OK only records whether the pdf2image import succeeded; a
+            # missing poppler binary shows up later, at conversion time.
+            st.error("pdf2image is not installed. Add it to requirements.txt to enable PDF uploads.")
+            return []
+        try:
+            return convert_from_bytes(file_bytes, dpi=300)
+        except Exception as exc:  # UI boundary: report instead of crashing the app
+            st.error(
+                f"Could not convert the PDF to images: {exc}. "
+                "Is poppler-utils installed (packages.txt on Spaces)?"
+            )
+            return []
+    try:
+        return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
+    except (OSError, ValueError) as exc:
+        # PIL raises UnidentifiedImageError (an OSError) for unreadable images.
+        st.error(f"Could not read the image file: {exc}")
+        return []
+# --------------------------- OCR ---------------------------
+def ocr_tsv(img: Image.Image, lang="eng") -> pd.DataFrame:
+    """Run Tesseract on ``img`` and return its TSV output, one row per word.
+    Besides Tesseract's own columns, the frame gains ``x2``/``y2`` (right and
+    bottom edges) and ``cx``/``cy`` (box centers). Rows without text are
+    dropped and the ``text`` column is always ``str``.
+    """
+    # Important: keep original scale for better bbox geometry
+    data = pytesseract.image_to_data(
+        img,
+        lang=lang,
+        output_type=Output.DATAFRAME,
+        # Without an explicit dtype pandas infers the column type, so a page of
+        # purely numeric tokens yields floats and breaks later ``.str`` calls.
+        pandas_config={"dtype": {"text": str}},
+    )
+    # Drop NaNs that Tesseract sometimes emits
+    data = data.dropna(subset=["text"]).reset_index(drop=True)
+    # Belt and braces: guarantee string tokens for downstream ``.str`` use.
+    data["text"] = data["text"].astype(str)
+    # Compute right/bottom edges and centers for convenience
+    data["x2"] = data["left"] + data["width"]
+    data["y2"] = data["top"] + data["height"]
+    data["cx"] = data["left"] + data["width"] / 2
+    data["cy"] = data["top"] + data["height"] / 2
+    return data
+def ocr_text(img: Image.Image, lang="eng") -> str:
+    """Return Tesseract's plain-text transcription of ``img`` for language(s) ``lang``."""
+    transcription = pytesseract.image_to_string(img, lang=lang)
+    return transcription
+# --------------------------- Key-field parsing ---------------------------
+# Optional currency marker: ISO code or symbol.
+CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
+# Amount: comma-grouped thousands OR a plain digit run (so "1234.56" is not
+# truncated to "123"), followed by optional two-digit decimals.
+MONEY = rf"{CURRENCY}\s?(?P<amt>(?:\d{{1,3}}(?:,\d{{3}})+|\d+)(?:\.\d{{2}})?)"
+# Dates: ISO-like (YYYY-MM-DD), numeric (D/M/Y or M/D/Y), or "Month D, YYYY".
+DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
+# Identifiers must contain at least one digit; otherwise, under IGNORECASE,
+# "Invoice Date" would yield the "invoice number" "Date".
+INV_PAT = r"(?:\binvoice\s*(?:no\.?|#|number)?\s*[:\-]?\s*(?P<inv>(?=[A-Z0-9\-_/]*\d)[A-Z0-9\-_/]{4,}))"
+# \b keeps "po" from matching inside words such as "deposit"; "P.O." is accepted.
+PO_PAT  = r"(?:\bp\.?o\.?\s*(?:no\.?|#|number)?\s*[:\-]?\s*(?P<po>(?=[A-Z0-9\-_/]*\d)[A-Z0-9\-_/]{3,}))"
+# The lookbehinds stop "Sub Total"/"Sub-Total" from being read as the total.
+TOTAL_PAT = rf"(?:(?<!sub )(?<!sub-)\b(total(?:\s*amount)?|amount\s*due|grand\s*total)\b.*?{MONEY})"
+SUBTOTAL_PAT = rf"(?:\bsub[\s\-]*total\b.*?{MONEY})"
+TAX_PAT = rf"(?:\b(tax|gst|vat|hst)\b.*?{MONEY})"
+def find_first(pattern, text, flags=re.IGNORECASE | re.DOTALL):
+    """Search ``text`` for ``pattern`` and return ``(named_groups, match)``.
+    Both elements are ``None`` when nothing matches.
+    """
+    match = re.search(pattern, text, flags)
+    if match is None:
+        return None, None
+    return match.groupdict(), match
+def parse_fields(fulltext: str):
+    """Extract header-level invoice fields from raw OCR text.
+    Returns a dict with keys ``invoice_number``, ``invoice_date``,
+    ``po_number``, ``subtotal``, ``tax``, ``total`` and ``currency``; any field
+    the module-level patterns cannot find stays ``None``. Amounts are returned
+    as strings with thousands separators removed.
+    """
+    # Collapse runs of spaces/tabs, then squeeze consecutive newlines.
+    t = re.sub(r"\n{2,}", "\n", re.sub(r"[ \t]+", " ", fulltext))
+    out = dict.fromkeys(
+        ["invoice_number", "invoice_date", "po_number", "subtotal", "tax", "total", "currency"]
+    )
+    # Identifier fields: (output key, pattern, named group).
+    for key, pattern, group in (
+        ("invoice_number", INV_PAT, "inv"),
+        ("po_number", PO_PAT, "po"),
+    ):
+        found, _ = find_first(pattern, t)
+        if found and found.get(group):
+            out[key] = found[group].strip()
+    # Date: prefer one labelled "invoice date", else the first date anywhere.
+    labelled = re.search(rf"(invoice\s*date[:\-\s]*){DATE}", t, re.IGNORECASE)
+    if labelled:
+        out["invoice_date"] = labelled.group("date")
+    else:
+        found, _ = find_first(DATE, t)
+        if found and found.get("date"):
+            out["invoice_date"] = found["date"]
+    # Monetary fields, processed subtotal -> tax -> total so that the currency
+    # seen on a later field overrides an earlier one.
+    for key, pattern in (("subtotal", SUBTOTAL_PAT), ("tax", TAX_PAT), ("total", TOTAL_PAT)):
+        found, _ = find_first(pattern, t)
+        if found and found.get("amt"):
+            out[key] = found["amt"].replace(",", "")
+            out["currency"] = found.get("curr") or out["currency"]
+    # Normalize currency symbols to ISO codes; other values pass through.
+    sym_map = {"$": "USD", "C$": "CAD", "€": "EUR", "£": "GBP"}
+    out["currency"] = sym_map.get(out["currency"], out["currency"])
+    return out
+# --------------------------- Line item parsing ---------------------------
+HEAD_CANDIDATES = ["description", "item", "qty", "quantity", "price", "unit price", "rate", "amount", "total"]
+def guess_header_rows(tsv: pd.DataFrame) -> pd.DataFrame:
+    """Find OCR lines that look like a line-item table header.
+    Words are grouped into lines by (block_num, par_num, line_num). Each line
+    is scored by how many HEAD_CANDIDATES occur in its lower-cased text as
+    substrings. Lines scoring at least 2 are returned, best score first and
+    topmost first among ties; the result may be empty.
+    """
+    records = []
+    for (block, par, line), words in tsv.groupby(["block_num", "par_num", "line_num"], as_index=False):
+        joined = " ".join(w for w in words["text"].astype(str) if w.strip())
+        if not joined.strip():
+            continue
+        records.append(
+            {
+                "block_num": block,
+                "par_num": par,
+                "line_num": line,
+                "text": joined.lower(),
+                # Bounding box of the whole line.
+                "top": words["top"].min(),
+                "bottom": words["y2"].max(),
+                "left": words["left"].min(),
+                "right": words["x2"].max(),
+            }
+        )
+    candidates = pd.DataFrame(records)
+    if candidates.empty:
+        return candidates
+    candidates["header_score"] = candidates["text"].apply(
+        lambda s: sum(h in s for h in HEAD_CANDIDATES)
+    )
+    strong = candidates[candidates["header_score"] >= 2]
+    return strong.sort_values(["header_score", "top"], ascending=[False, True])
+# Single-word header tokens mapped to the canonical output column they feed.
+_HEADER_TO_CANONICAL = {
+    "description": "description",
+    "item": "description",
+    "qty": "quantity",
+    "quantity": "quantity",
+    "price": "unit_price",
+    "rate": "unit_price",
+    "amount": "line_total",
+    "total": "line_total",
+}
+_CANONICAL_ORDER = ["description", "quantity", "unit_price", "line_total"]
+# Non-capturing group: str.contains warns when the pattern has capture groups.
+_FOOTER_RE = r"(?:sub\s*total|amount\s*due|total|grand\s*total|balance)"
+def extract_table(tsv: pd.DataFrame) -> pd.DataFrame:
+    """
+    Simple geometry-driven itemization:
+    - find a header line (best candidate from guess_header_rows)
+    - take the x-centers of recognised header words as column anchors
+    - assign each word below the header to the nearest anchor
+    - stop at the first word that looks like a totals/footer label
+    Returns a frame with a subset of the columns description, quantity,
+    unit_price, line_total (in that order), or an empty frame when no header
+    or no item rows are found. Header words that share a canonical column
+    (e.g. "Item Description") are merged into one output column instead of
+    producing duplicate column labels.
+    """
+    header_lines = guess_header_rows(tsv)
+    if header_lines.empty:
+        return pd.DataFrame()
+    # Work on string tokens only so the .str accessor below is always valid.
+    tsv = tsv[tsv["text"].notna()].copy()
+    tsv["text"] = tsv["text"].astype(str)
+    # Take the top-scoring header
+    H = header_lines.iloc[0]
+    header_band_top, header_band_bottom = H["top"], H["bottom"]
+    # Words inside the header's vertical band (5px tolerance) that are known header tokens
+    in_band = (tsv["top"] >= header_band_top - 5) & (tsv["y2"] <= header_band_bottom + 5)
+    header_words = tsv[in_band]
+    header_words = header_words[header_words["text"].str.lower().isin(list(_HEADER_TO_CANONICAL))]
+    if header_words.empty:
+        return pd.DataFrame()
+    # Column anchors, left to right; several anchors may share a canonical name
+    header_words = header_words.sort_values("cx")
+    columns = [
+        {"name": _HEADER_TO_CANONICAL[text.lower()], "x": cx}
+        for text, cx in zip(header_words["text"], header_words["cx"])
+    ]
+    # Items region: words below header, but above totals area (heuristic)
+    below = tsv[tsv["top"] > header_band_bottom + 5]
+    # Stop at the first "total"-like word to avoid footer math rows
+    totals_mask = below["text"].str.lower().str.contains(_FOOTER_RE, regex=True, na=False)
+    if totals_mask.any():
+        footer_y = below.loc[totals_mask, "top"].min()
+        below = below[below["top"] < footer_y - 4]
+    if below.empty:
+        return pd.DataFrame()
+    # Group by OCR line, then split each line into columns by nearest anchor
+    items = []
+    for _, line in below.groupby(["block_num", "par_num", "line_num"]):
+        words = line.sort_values("cx")
+        cells = {c["name"]: [] for c in columns}
+        for text, cx in zip(words["text"], words["cx"]):
+            if not text.strip():
+                continue
+            nearest = min(columns, key=lambda c: abs(c["x"] - cx))
+            cells[nearest["name"]].append(text)
+        row = {name: " ".join(parts).strip() for name, parts in cells.items()}
+        # Skip lines that contributed no text to any column
+        if any(row.values()):
+            items.append(row)
+    if not items:
+        return pd.DataFrame()
+    df = pd.DataFrame(items)
+    df = df[[c for c in _CANONICAL_ORDER if c in df.columns]]
+    return df.reset_index(drop=True)
+# --------------------------- App UI ---------------------------
+# Streamlit script body: runs top to bottom on every rerun; widget order is layout.
+st.title("Invoice Extraction (Tesseract · Streamlit)")
+st.sidebar.header("Settings")
+lang = st.sidebar.text_input("Tesseract language(s)", value="eng")
+show_tsv = st.sidebar.checkbox("Show raw OCR TSV", value=False)
+show_fulltext = st.sidebar.checkbox("Show full OCR text", value=False)
+up = st.file_uploader("Upload an invoice (PDF, PNG, JPG)", type=["pdf","png","jpg","jpeg"], accept_multiple_files=False)
+# Nothing uploaded yet: show a hint and end this run.
+if not up:
+    st.info("Upload a scanned invoice PDF or an image to begin.")
+    st.stop()
+pages = load_pages(up.read(), up.name)
+# load_pages returns an empty list after reporting an error.
+if not pages:
+    st.stop()
+# Page selector (for multi-page PDFs)
+if len(pages) > 1:
+    idx = st.number_input("Page", min_value=1, max_value=len(pages), value=1)
+    # The widget is 1-based; the list is 0-based.
+    img = pages[idx-1]
+else:
+    img = pages[0]
+col_prev, col_data = st.columns([1.1, 1.3], gap="large")
+# Left column: original page plus the preprocessed image used for OCR.
+with col_prev:
+    st.subheader("Preview")
+    st.image(img, use_column_width=True, caption="Original page")
+    pre = preprocess(img)
+    with st.expander("Preprocessed (for OCR)"):
+        st.image(pre, use_column_width=True)
+# Right column: OCR, parsed fields, line items and downloads.
+with col_data:
+    st.subheader("Extraction")
+    # Two Tesseract passes over the preprocessed image: word boxes and plain text.
+    with st.spinner("Running Tesseract..."):
+        tsv = ocr_tsv(pre, lang=lang)
+        text = ocr_text(pre, lang=lang)
+    key_fields = parse_fields(text)
+    st.markdown("**Key Fields (heuristic)**")
+    k1, k2, k3 = st.columns(3)
+    with k1:
+        st.write(f"**Invoice #:** {key_fields.get('invoice_number') or '—'}")
+        st.write(f"**Invoice Date:** {key_fields.get('invoice_date') or '—'}")
+    with k2:
+        st.write(f"**PO #:** {key_fields.get('po_number') or '—'}")
+        st.write(f"**Subtotal:** {key_fields.get('subtotal') or '—'}")
+    with k3:
+        st.write(f"**Tax:** {key_fields.get('tax') or '—'}")
+        tot = key_fields.get('total') or '—'
+        cur = key_fields.get('currency') or ''
+        # strip() removes the trailing space left when no currency was found.
+        st.write(f"**Total:** {tot} {cur}".strip())
+    st.markdown("**Line Items (auto-detected)**")
+    items = extract_table(tsv)
+    if items.empty:
+        st.caption("No line items confidently detected. You can still download full OCR text.")
+    else:
+        st.dataframe(items, use_container_width=True)
+    # Downloads
+    result = {
+        "file": up.name,
+        "key_fields": key_fields,
+        "items": items.to_dict(orient="records") if not items.empty else [],
+        "full_text": text,
+    }
+    j = json.dumps(result, indent=2)
+    st.download_button("Download JSON", data=j, file_name="invoice_extraction.json", mime="application/json")
+    # The CSV download is offered only when line items were detected.
+    if not items.empty:
+        csv = items.to_csv(index=False)
+        st.download_button("Download Line Items CSV", data=csv, file_name="invoice_items.csv", mime="text/csv")
+# Optional raw views
+with st.expander("Advanced · Raw Outputs"):
+    if show_fulltext:
+        st.text_area("OCR Full Text", value=text, height=220)
+    if show_tsv:
+        # Only the first 100 OCR rows are shown.
+        st.dataframe(tsv.head(100), use_container_width=True)