DPT2

Sleeping

App Files Community

Seth0330 committed on Oct 23, 2025

Commit

1567f8d

verified ·

1 Parent(s): a664847

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -121

app.py CHANGED Viewed

@@ -1,47 +1,48 @@
-# Streamlit Invoice Extraction — Hugging Face Donut (no local .pth) + Tesseract tables
-# - Uses a pretrained model from HF Hub (default: naver-clova-ix/donut-base-finetuned-sroie)
-# - Extracts key fields via Donut JSON if available, else regex fallback
-# - Extracts line items via Tesseract word boxes + geometry heuristics
-# - Works on HF Spaces without any custom checkpoints
 import os, io, re, json
-from typing import List, Tuple, Dict
 import numpy as np
 import pandas as pd
 from PIL import Image, ImageOps, ImageFilter
 import streamlit as st
-# OCR for word boxes (detection only) + pdf to images
 import pytesseract
 from pytesseract import Output
 from pdf2image import convert_from_bytes
-# HF Donut (pretrained, downloaded automatically)
 import torch
 from transformers import DonutProcessor, VisionEncoderDecoderModel
-st.set_page_config(page_title="Invoice Extraction — Donut (HF) + Tesseract tables", layout="wide")
 # ----------------------------- Sidebar -----------------------------
-st.sidebar.header("Model (Hugging Face)")
 model_id = st.sidebar.text_input(
     "HF model id",
-    value="naver-clova-ix/donut-base-finetuned-sroie",  # good default for receipts/invoices (SROIE)
-    help="Examples: naver-clova-ix/donut-base-finetuned-sroie, naver-clova-ix/donut-base-finetuned-docvqa"
 )
 task_prompt = st.sidebar.text_input(
-    "Task prompt (for Donut models expecting prompts)",
-    value="<s_cord-v2>",  # SROIE/cord-style models typically ignore or use default; harmless to keep
-    help="Some Donut checkpoints use task-specific prompts; keep or adjust as needed."
 )
 det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
-show_boxes = st.sidebar.checkbox("Show word boxes", value=False)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-st.sidebar.markdown("---")
-st.sidebar.caption("Tip: If your model outputs JSON (e.g., SROIE), we’ll parse it for key fields. Otherwise we’ll regex from generated text.")
 # ----------------------------- Utilities -----------------------------
 def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
@@ -58,14 +59,13 @@ def preprocess_for_detection(img: Image.Image) -> Image.Image:
 @st.cache_resource(show_spinner=True)
 def load_donut(_model_id: str):
     processor = DonutProcessor.from_pretrained(_model_id)
     model = VisionEncoderDecoderModel.from_pretrained(_model_id)
-    model.to(device)
-    model.eval()
     return processor, model
 def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncoderDecoderModel, prompt: str):
-    # Donut expects RGB PIL Image; processor handles resizing/normalization
     inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = model.generate(
@@ -74,12 +74,10 @@ def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncode
             num_beams=1,
             early_stopping=True,
         )
-    # decode
     seq = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    # Donut models often emit JSON; try to parse
     parsed = None
     try:
-        # strip whitespace garbage around JSON
         start = seq.find("{")
         end = seq.rfind("}")
         if start != -1 and end != -1 and end > start:
@@ -88,7 +86,7 @@ def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncode
         parsed = None
     return seq, parsed
-# ----------------------------- Key fields & line items -----------------------------
 CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
 MONEY = rf"{CURRENCY}\s?(?P<amt>\d{{1,3}}(?:[,]\d{{3}})*(?:[.]\d{{2}})?)"
 DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
@@ -118,25 +116,15 @@ def parse_fields_regex(fulltext: str):
     return out
 def normalize_kv_from_donut(parsed: dict):
-    """Try to map common Donut outputs to our schema."""
-    txt = json.dumps(parsed).lower()
-    # heuristic mapping for typical SROIE/receipt keys
-    candidates = {
-        "invoice_number": ["invoice_number","invoice no","invoice_no","invoice","inv_no"],
-        "invoice_date":   ["date","invoice_date","bill_date"],
-        "po_number":      ["po_number","po","purchase_order"],
-        "subtotal":       ["subtotal","sub_total"],
-        "tax":            ["tax","gst","vat","hst"],
-        "total":          ["total","amount_total","amount_due","grand_total"]
-    }
     out = {k: None for k in ["invoice_number","invoice_date","po_number","subtotal","tax","total","currency"]}
-    # simple search: pick first occurrence
     def search_keys(obj, key_list):
-        # breadth-first scan
         if isinstance(obj, dict):
             for k, v in obj.items():
-                if any(kk in k.lower() for kk in key_list):
-                    return v
                 found = search_keys(v, key_list)
                 if found is not None:
                     return found
@@ -147,16 +135,24 @@ def normalize_kv_from_donut(parsed: dict):
                     return found
         return None
-    for outk, key_list in candidates.items():
-        val = search_keys(parsed, key_list)
-        if isinstance(val, (dict, list)):
-            val = None  # keep it simple; Donut sometimes nests values
         if isinstance(val, str):
-            out[outk] = val.strip()
-    # currency guess:
-    curr = re.search(r"(USD|CAD|EUR|GBP|\$|C\$|€|£)", json.dumps(parsed, ensure_ascii=False), re.I)
-    if curr:
-        sym = curr.group(1)
         out["currency"] = {"$":"USD","C$":"CAD","€":"EUR","£":"GBP"}.get(sym, sym.upper())
     return out
@@ -167,99 +163,58 @@ def detect_words(img: Image.Image, lang="eng") -> pd.DataFrame:
     df["y2"] = df["top"] + df["height"]
     return df[df["conf"] > -1]
-def crop_words(img: Image.Image, df: pd.DataFrame) -> List[Tuple[Image.Image, Dict]]:
-    crops, metas = [], []
-    for _, r in df.iterrows():
-        if str(r["text"]).strip() == "":
-            continue
-        box = (int(r["left"]), int(r["top"]), int(r["x2"]), int(r["y2"]))
-        c = img.crop(box)
-        crops.append(c)
-        metas.append({"box": box})
-    return crops, metas
-HEAD_CANDIDATES = ["description","item","qty","quantity","price","unit","rate","amount","total"]
-def items_from_wordgrid(df: pd.DataFrame) -> pd.DataFrame:
-    if df.empty:
         return pd.DataFrame()
-    df = df.copy()
-    df["cx"] = df["left"] + 0.5*df["width"]
-    df["cy"] = df["top"]  + 0.5*df["height"]
-    # group lines
     lines = []
-    for (b,p,l), g in df.groupby(["block_num","par_num","line_num"]):
-        text = " ".join([t for t in g["text"].astype(str) if t.strip()])
-        if text.strip():
-            lines.append({
-                "block_num":b,"par_num":p,"line_num":l,
-                "text": text.lower(),
-                "top": g["top"].min(), "bottom": (g["top"]+g["height"]).max(),
-                "left": g["left"].min(), "right": (g["left"]+g["width"]).max(),
-                "words": g.sort_values("left")[["left","top","width","height","text"]].values.tolist()
-            })
-    L = pd.DataFrame(lines)
-    if L.empty: return pd.DataFrame()
-    L["score"] = L["text"].apply(lambda s: sum(1 for h in HEAD_CANDIDATES if h in s))
-    headers = L[L["score"]>=2].sort_values(["score","top"], ascending=[False,True])
-    if headers.empty: return pd.DataFrame()
-    H = headers.iloc[0]
-    header_y = H["bottom"] + 4
-    # derive column anchors from header words positions
-    df_header = detect_words(img=None, lang="eng")  # placeholder to keep signature consistent
-    # get header band words
-    # reconstruct header band from original DF
-    # (we need original df back here; easier: pass it in as closure var)
-    # → we'll adapt: compute from global last_df if present
-    return_df = pd.DataFrame()
-    return return_df
-# We’ll implement a simpler, robust table extractor to avoid closure complexity:
-def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
-    # find header line
-    L = []
     for (b,p,l), g in tsv.groupby(["block_num","par_num","line_num"]):
         text = " ".join([w for w in g["text"].astype(str).tolist() if w.strip()])
         if text.strip():
-            L.append({
                 "block_num": b, "par_num": p, "line_num": l,
                 "text": text.lower(),
                 "top": g["top"].min(), "bottom": (g["top"]+g["height"]).max(),
                 "left": g["left"].min(), "right": (g["left"]+g["width"]).max()
             })
-    lines = pd.DataFrame(L)
-    if lines.empty:
         return pd.DataFrame()
     def score_header(s: str):
         return sum(1 for h in HEAD_CANDIDATES if h in s)
-    lines["header_score"] = lines["text"].apply(score_header)
-    hdrs = lines[lines["header_score"] >= 2].sort_values(["header_score","top"], ascending=[False,True])
     if hdrs.empty:
         return pd.DataFrame()
     H = hdrs.iloc[0]
     header_top, header_bottom = H["top"], H["bottom"]
-    # header words
     header_words = tsv[(tsv["top"] >= header_top - 5) & ((tsv["top"] + tsv["height"]) <= header_bottom + 5)]
     header_words = header_words.sort_values("left")
     if header_words.empty:
         return pd.DataFrame()
     xs = header_words["left"].tolist()
-    # items region
     below = tsv[tsv["top"] > header_bottom + 5].copy()
-    totals_mask = below["text"].str.lower().str.contains(r"(sub\s*total|amount\s*due|total|grand\s*total|balance)", regex=True, na=False)
     if totals_mask.any():
         stop_y = below.loc[totals_mask, "top"].min()
         below = below[below["top"] < stop_y - 4]
     if below.empty:
         return pd.DataFrame()
-    # build rows by assigning words to nearest header x
     rows = []
     for (b,p,l), g in below.groupby(["block_num","par_num","line_num"]):
         g = g.sort_values("left")
@@ -270,14 +225,15 @@ def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
             idx = int(np.abs(np.array(xs) - w["left"]).argmin())
             buckets[idx].append(str(w["text"]))
         vals = [" ".join(buckets[i]).strip() for i in range(len(xs))]
-        rows.append(vals)
     if not rows:
         return pd.DataFrame()
     df_rows = pd.DataFrame(rows).fillna("")
-    # name columns heuristically
     names = []
-    hdr_tokens = [t.lower() for t in header_words["text"].tolist()]
     for i in range(df_rows.shape[1]):
         wl = hdr_tokens[i] if i < len(hdr_tokens) else f"col_{i}"
         if "desc" in wl or wl in ["item","description"]:
@@ -291,24 +247,24 @@ def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
         else:
             names.append(f"col_{i}")
     df_rows.columns = names
-    # drop empty lines
     df_rows = df_rows[~(df_rows.fillna("").apply(lambda r: "".join(r.values), axis=1).str.strip()=="")]
     return df_rows.reset_index(drop=True)
 # ----------------------------- App -----------------------------
-st.title("Invoice Extraction — Donut (HF pretrained) + Tesseract tables")
 up = st.file_uploader("Upload an invoice (PDF/JPG/PNG)", type=["pdf","png","jpg","jpeg"])
 if not up:
     st.info("Upload a scanned invoice to begin.")
     st.stop()
-# load model once
 with st.spinner(f"Loading model '{model_id}' from Hugging Face…"):
     processor, donut_model = load_donut(model_id)
 pages = load_pages(up.read(), up.name)
 page_idx = 0
 if len(pages) > 1:
     page_idx = st.number_input("Page", 1, len(pages), 1) - 1
@@ -326,11 +282,11 @@ with col1:
 with col2:
     st.subheader("OCR & Extraction")
-    # 1) Donut extraction (key fields or full text)
     with st.spinner("Running Donut…"):
         seq, parsed = donut_infer(img, processor, donut_model, task_prompt)
-    # 2) Key fields
     if parsed:
         key_fields = normalize_kv_from_donut(parsed)
         donut_payload = parsed
@@ -351,7 +307,7 @@ with col2:
         cur = key_fields.get('currency') or ''
         st.write(f"**Total:** {tot} {cur}".strip())
-    # 3) Tesseract line items (geometry heuristic)
     with st.spinner("Detecting words with Tesseract (for table)…"):
         tsv = pytesseract.image_to_data(det_img, lang=det_lang, output_type=Output.DATAFRAME)
         tsv = tsv.dropna(subset=["text"]).reset_index(drop=True)

+# app.py
+# Invoice Extraction — Donut (public HF model, no token) + Tesseract tables
+# - Loads a public Donut checkpoint (default: naver-clova-ix/donut-base-finetuned-cord-v2)
+# - Pulls key fields from Donut JSON (if available) or falls back to regex
+# - Detects line-item tables via Tesseract word boxes + geometry heuristics
 import os, io, re, json
+from typing import List
 import numpy as np
 import pandas as pd
 from PIL import Image, ImageOps, ImageFilter
 import streamlit as st
+# OCR (detection only) and PDF->image
 import pytesseract
 from pytesseract import Output
 from pdf2image import convert_from_bytes
+# HF Donut (auto-downloads public model; no HF token required)
 import torch
 from transformers import DonutProcessor, VisionEncoderDecoderModel
+# ----------------------------- Page config -----------------------------
+st.set_page_config(
+    page_title="Invoice Extraction — Donut (public) + Tesseract tables",
+    layout="wide"
+)
+device = "cuda" if torch.cuda.is_available() else "cpu"
 # ----------------------------- Sidebar -----------------------------
+st.sidebar.header("Model (Hugging Face — public)")
 model_id = st.sidebar.text_input(
     "HF model id",
+    value="naver-clova-ix/donut-base-finetuned-cord-v2",
+    help="Use a public model id. Example: naver-clova-ix/donut-base-finetuned-cord-v2"
 )
 task_prompt = st.sidebar.text_input(
+    "Task prompt (Donut)",
+    value="<s_cord-v2>",
+    help="Keep default for CORD-like invoices; adjust if you change models."
 )
 det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
+show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
 # ----------------------------- Utilities -----------------------------
 def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
 @st.cache_resource(show_spinner=True)
 def load_donut(_model_id: str):
+    # Public checkpoints load without token
     processor = DonutProcessor.from_pretrained(_model_id)
     model = VisionEncoderDecoderModel.from_pretrained(_model_id)
+    model.to(device).eval()
     return processor, model
 def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncoderDecoderModel, prompt: str):
     inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = model.generate(
             num_beams=1,
             early_stopping=True,
         )
     seq = processor.batch_decode(outputs, skip_special_tokens=True)[0]
     parsed = None
+    # Try to parse JSON from the generated sequence
     try:
         start = seq.find("{")
         end = seq.rfind("}")
         if start != -1 and end != -1 and end > start:
         parsed = None
     return seq, parsed
+# ----------------------------- Key fields & tables -----------------------------
 CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
 MONEY = rf"{CURRENCY}\s?(?P<amt>\d{{1,3}}(?:[,]\d{{3}})*(?:[.]\d{{2}})?)"
 DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
     return out
 def normalize_kv_from_donut(parsed: dict):
+    """Map common Donut outputs to a simple invoice schema."""
     out = {k: None for k in ["invoice_number","invoice_date","po_number","subtotal","tax","total","currency"]}
     def search_keys(obj, key_list):
         if isinstance(obj, dict):
             for k, v in obj.items():
+                kl = k.lower()
+                if any(kk in kl for kk in key_list):
+                    return v if isinstance(v, str) else None
                 found = search_keys(v, key_list)
                 if found is not None:
                     return found
                     return found
         return None
+    mapping = {
+        "invoice_number": ["invoice_number","invoice no","invoice_no","invoice","inv_no","inv no"],
+        "invoice_date":   ["invoice_date","date","bill_date","document_date"],
+        "po_number":      ["po_number","po","purchase_order"],
+        "subtotal":       ["subtotal","sub_total"],
+        "tax":            ["tax","gst","vat","hst"],
+        "total":          ["total","amount_total","amount_due","grand_total"],
+    }
+    for k, keys in mapping.items():
+        val = search_keys(parsed, keys)
         if isinstance(val, str):
+            out[k] = val.strip()
+    # currency guess from JSON text
+    txt = json.dumps(parsed, ensure_ascii=False)
+    m = re.search(r"(USD|CAD|EUR|GBP|\$|C\$|€|£)", txt, re.I)
+    if m:
+        sym = m.group(1)
         out["currency"] = {"$":"USD","C$":"CAD","€":"EUR","£":"GBP"}.get(sym, sym.upper())
     return out
     df["y2"] = df["top"] + df["height"]
     return df[df["conf"] > -1]
+def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
+    """Geometry-driven table extraction using Tesseract TSV."""
+    HEAD_CANDIDATES = ["description","item","qty","quantity","price","unit","rate","amount","total"]
+    if tsv.empty:
         return pd.DataFrame()
+    # Build per-line metadata
     lines = []
     for (b,p,l), g in tsv.groupby(["block_num","par_num","line_num"]):
         text = " ".join([w for w in g["text"].astype(str).tolist() if w.strip()])
         if text.strip():
+            lines.append({
                 "block_num": b, "par_num": p, "line_num": l,
                 "text": text.lower(),
                 "top": g["top"].min(), "bottom": (g["top"]+g["height"]).max(),
                 "left": g["left"].min(), "right": (g["left"]+g["width"]).max()
             })
+    L = pd.DataFrame(lines)
+    if L.empty:
         return pd.DataFrame()
     def score_header(s: str):
         return sum(1 for h in HEAD_CANDIDATES if h in s)
+    L["header_score"] = L["text"].apply(score_header)
+    hdrs = L[L["header_score"] >= 2].sort_values(["header_score","top"], ascending=[False,True])
     if hdrs.empty:
         return pd.DataFrame()
     H = hdrs.iloc[0]
     header_top, header_bottom = H["top"], H["bottom"]
+    # Header words & their x-positions
     header_words = tsv[(tsv["top"] >= header_top - 5) & ((tsv["top"] + tsv["height"]) <= header_bottom + 5)]
     header_words = header_words.sort_values("left")
     if header_words.empty:
         return pd.DataFrame()
     xs = header_words["left"].tolist()
+    hdr_tokens = [t.lower() for t in header_words["text"].tolist()]
+    # Items region below header (stop before totals area)
     below = tsv[tsv["top"] > header_bottom + 5].copy()
+    totals_mask = below["text"].str.lower().str.contains(
+        r"(sub\s*total|amount\s*due|total|grand\s*total|balance)",
+        regex=True, na=False
+    )
     if totals_mask.any():
         stop_y = below.loc[totals_mask, "top"].min()
         below = below[below["top"] < stop_y - 4]
     if below.empty:
         return pd.DataFrame()
     rows = []
     for (b,p,l), g in below.groupby(["block_num","par_num","line_num"]):
         g = g.sort_values("left")
             idx = int(np.abs(np.array(xs) - w["left"]).argmin())
             buckets[idx].append(str(w["text"]))
         vals = [" ".join(buckets[i]).strip() for i in range(len(xs))]
+        if any(vals):
+            rows.append(vals)
     if not rows:
         return pd.DataFrame()
     df_rows = pd.DataFrame(rows).fillna("")
+    # Name columns heuristically from header tokens
     names = []
     for i in range(df_rows.shape[1]):
         wl = hdr_tokens[i] if i < len(hdr_tokens) else f"col_{i}"
         if "desc" in wl or wl in ["item","description"]:
         else:
             names.append(f"col_{i}")
     df_rows.columns = names
+    # Drop blank rows
     df_rows = df_rows[~(df_rows.fillna("").apply(lambda r: "".join(r.values), axis=1).str.strip()=="")]
     return df_rows.reset_index(drop=True)
 # ----------------------------- App -----------------------------
+st.title("Invoice Extraction — Donut (public checkpoint) + Tesseract tables")
 up = st.file_uploader("Upload an invoice (PDF/JPG/PNG)", type=["pdf","png","jpg","jpeg"])
 if not up:
     st.info("Upload a scanned invoice to begin.")
     st.stop()
+# Load HF model (public)
 with st.spinner(f"Loading model '{model_id}' from Hugging Face…"):
     processor, donut_model = load_donut(model_id)
 pages = load_pages(up.read(), up.name)
 page_idx = 0
 if len(pages) > 1:
     page_idx = st.number_input("Page", 1, len(pages), 1) - 1
 with col2:
     st.subheader("OCR & Extraction")
+    # 1) Donut for key-value extraction / text
     with st.spinner("Running Donut…"):
         seq, parsed = donut_infer(img, processor, donut_model, task_prompt)
+    # 2) Key fields from JSON (if available) else regex over generated text
     if parsed:
         key_fields = normalize_kv_from_donut(parsed)
         donut_payload = parsed
         cur = key_fields.get('currency') or ''
         st.write(f"**Total:** {tot} {cur}".strip())
+    # 3) Tesseract word boxes for line-item table (simple heuristic)
     with st.spinner("Detecting words with Tesseract (for table)…"):
         tsv = pytesseract.image_to_data(det_img, lang=det_lang, output_type=Output.DATAFRAME)
         tsv = tsv.dropna(subset=["text"]).reset_index(drop=True)