Seth0330 committed
Commit 1567f8d · verified · 1 Parent(s): a664847

Update app.py

Files changed (1):
  app.py +77 -121
app.py CHANGED
@@ -1,47 +1,48 @@
- # Streamlit Invoice Extraction — Hugging Face Donut (no local .pth) + Tesseract tables
- # - Uses a pretrained model from HF Hub (default: naver-clova-ix/donut-base-finetuned-sroie)
- # - Extracts key fields via Donut JSON if available, else regex fallback
- # - Extracts line items via Tesseract word boxes + geometry heuristics
- # - Works on HF Spaces without any custom checkpoints

  import os, io, re, json
- from typing import List, Tuple, Dict
-
  import numpy as np
  import pandas as pd
  from PIL import Image, ImageOps, ImageFilter

  import streamlit as st

- # OCR for word boxes (detection only) + pdf to images
  import pytesseract
  from pytesseract import Output
  from pdf2image import convert_from_bytes

- # HF Donut (pretrained, downloaded automatically)
  import torch
  from transformers import DonutProcessor, VisionEncoderDecoderModel

- st.set_page_config(page_title="Invoice Extraction Donut (HF) + Tesseract tables", layout="wide")

  # ----------------------------- Sidebar -----------------------------
- st.sidebar.header("Model (Hugging Face)")
  model_id = st.sidebar.text_input(
      "HF model id",
-     value="naver-clova-ix/donut-base-finetuned-sroie",  # good default for receipts/invoices (SROIE)
-     help="Examples: naver-clova-ix/donut-base-finetuned-sroie, naver-clova-ix/donut-base-finetuned-docvqa"
  )
  task_prompt = st.sidebar.text_input(
-     "Task prompt (for Donut models expecting prompts)",
-     value="<s_cord-v2>",  # SROIE/cord-style models typically ignore or use default; harmless to keep
-     help="Some Donut checkpoints use task-specific prompts; keep or adjust as needed."
  )
  det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
- show_boxes = st.sidebar.checkbox("Show word boxes", value=False)
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- st.sidebar.markdown("---")
- st.sidebar.caption("Tip: If your model outputs JSON (e.g., SROIE), we’ll parse it for key fields. Otherwise we’ll regex from generated text.")

  # ----------------------------- Utilities -----------------------------
  def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
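`load_pages` itself is unchanged in this commit and its body is collapsed by the diff viewer. A minimal sketch of what such a loader can look like, assuming pdf2image's `convert_from_bytes` for PDFs and Pillow for single images; the actual body is not shown in the diff and may differ:

```python
# Hypothetical sketch of a load_pages-style helper (body not shown in this diff).
import io
from typing import List
from PIL import Image
from pdf2image import convert_from_bytes

def load_pages_sketch(file_bytes: bytes, name: str) -> List[Image.Image]:
    if name.lower().endswith(".pdf"):
        # One RGB PIL image per PDF page (requires poppler on the system)
        return convert_from_bytes(file_bytes)
    return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
```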
@@ -58,14 +59,13 @@ def preprocess_for_detection(img: Image.Image) -> Image.Image:

  @st.cache_resource(show_spinner=True)
  def load_donut(_model_id: str):
      processor = DonutProcessor.from_pretrained(_model_id)
      model = VisionEncoderDecoderModel.from_pretrained(_model_id)
-     model.to(device)
-     model.eval()
      return processor, model

  def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncoderDecoderModel, prompt: str):
-     # Donut expects RGB PIL Image; processor handles resizing/normalization
      inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
      with torch.no_grad():
          outputs = model.generate(
@@ -74,12 +74,10 @@ def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncode
              num_beams=1,
              early_stopping=True,
          )
-     # decode
      seq = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-     # Donut models often emit JSON; try to parse
      parsed = None
      try:
-         # strip whitespace garbage around JSON
          start = seq.find("{")
          end = seq.rfind("}")
          if start != -1 and end != -1 and end > start:
@@ -88,7 +86,7 @@ def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncode
          parsed = None
      return seq, parsed

- # ----------------------------- Key fields & line items -----------------------------
  CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
  MONEY = rf"{CURRENCY}\s?(?P<amt>\d{{1,3}}(?:[,]\d{{3}})*(?:[.]\d{{2}})?)"
  DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
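As a standalone check of the patterns above: `MONEY` captures an optional currency token plus a comma-grouped amount, and `DATE` accepts ISO, slashed, and spelled-out forms. These constants are unchanged by the commit:

```python
# Quick, self-contained check of the MONEY/DATE patterns from app.py.
import re

CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
MONEY = rf"{CURRENCY}\s?(?P<amt>\d{{1,3}}(?:[,]\d{{3}})*(?:[.]\d{{2}})?)"
DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"

m = re.search(MONEY, "Amount Due: $1,234.56")
print(m.group("curr"), m.group("amt"))      # $ 1,234.56
d = re.search(DATE, "Invoice Date: 2024-03-17")
print(d.group("date"))                      # 2024-03-17
```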
@@ -118,25 +116,15 @@ def parse_fields_regex(fulltext: str):
      return out

  def normalize_kv_from_donut(parsed: dict):
-     """Try to map common Donut outputs to our schema."""
-     txt = json.dumps(parsed).lower()
-     # heuristic mapping for typical SROIE/receipt keys
-     candidates = {
-         "invoice_number": ["invoice_number","invoice no","invoice_no","invoice","inv_no"],
-         "invoice_date": ["date","invoice_date","bill_date"],
-         "po_number": ["po_number","po","purchase_order"],
-         "subtotal": ["subtotal","sub_total"],
-         "tax": ["tax","gst","vat","hst"],
-         "total": ["total","amount_total","amount_due","grand_total"]
-     }
      out = {k: None for k in ["invoice_number","invoice_date","po_number","subtotal","tax","total","currency"]}
-     # simple search: pick first occurrence
      def search_keys(obj, key_list):
-         # breadth-first scan
          if isinstance(obj, dict):
              for k, v in obj.items():
-                 if any(kk in k.lower() for kk in key_list):
-                     return v
                  found = search_keys(v, key_list)
                  if found is not None:
                      return found
@@ -147,16 +135,24 @@ def normalize_kv_from_donut(parsed: dict):
                  return found
          return None

-     for outk, key_list in candidates.items():
-         val = search_keys(parsed, key_list)
-         if isinstance(val, (dict, list)):
-             val = None  # keep it simple; Donut sometimes nests values
          if isinstance(val, str):
-             out[outk] = val.strip()
-     # currency guess:
-     curr = re.search(r"(USD|CAD|EUR|GBP|\$|C\$|€|£)", json.dumps(parsed, ensure_ascii=False), re.I)
-     if curr:
-         sym = curr.group(1)
          out["currency"] = {"$":"USD","C$":"CAD","€":"EUR","£":"GBP"}.get(sym, sym.upper())
      return out
@@ -167,99 +163,58 @@ def detect_words(img: Image.Image, lang="eng") -> pd.DataFrame:
      df["y2"] = df["top"] + df["height"]
      return df[df["conf"] > -1]

- def crop_words(img: Image.Image, df: pd.DataFrame) -> List[Tuple[Image.Image, Dict]]:
-     crops, metas = [], []
-     for _, r in df.iterrows():
-         if str(r["text"]).strip() == "":
-             continue
-         box = (int(r["left"]), int(r["top"]), int(r["x2"]), int(r["y2"]))
-         c = img.crop(box)
-         crops.append(c)
-         metas.append({"box": box})
-     return crops, metas
-
- HEAD_CANDIDATES = ["description","item","qty","quantity","price","unit","rate","amount","total"]
- def items_from_wordgrid(df: pd.DataFrame) -> pd.DataFrame:
-     if df.empty:
          return pd.DataFrame()
-     df = df.copy()
-     df["cx"] = df["left"] + 0.5*df["width"]
-     df["cy"] = df["top"] + 0.5*df["height"]

-     # group lines
      lines = []
-     for (b,p,l), g in df.groupby(["block_num","par_num","line_num"]):
-         text = " ".join([t for t in g["text"].astype(str) if t.strip()])
-         if text.strip():
-             lines.append({
-                 "block_num":b,"par_num":p,"line_num":l,
-                 "text": text.lower(),
-                 "top": g["top"].min(), "bottom": (g["top"]+g["height"]).max(),
-                 "left": g["left"].min(), "right": (g["left"]+g["width"]).max(),
-                 "words": g.sort_values("left")[["left","top","width","height","text"]].values.tolist()
-             })
-     L = pd.DataFrame(lines)
-     if L.empty: return pd.DataFrame()
-     L["score"] = L["text"].apply(lambda s: sum(1 for h in HEAD_CANDIDATES if h in s))
-     headers = L[L["score"]>=2].sort_values(["score","top"], ascending=[False,True])
-     if headers.empty: return pd.DataFrame()
-     H = headers.iloc[0]
-     header_y = H["bottom"] + 4
-
-     # derive column anchors from header words positions
-     df_header = detect_words(img=None, lang="eng")  # placeholder to keep signature consistent
-
-     # get header band words
-     # reconstruct header band from original DF
-     # (we need original df back here; easier: pass it in as closure var)
-     # → we'll adapt: compute from global last_df if present
-     return_df = pd.DataFrame()
-     return return_df
-
- # We’ll implement a simpler, robust table extractor to avoid closure complexity:
- def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
-     # find header line
-     L = []
      for (b,p,l), g in tsv.groupby(["block_num","par_num","line_num"]):
          text = " ".join([w for w in g["text"].astype(str).tolist() if w.strip()])
          if text.strip():
-             L.append({
                  "block_num": b, "par_num": p, "line_num": l,
                  "text": text.lower(),
                  "top": g["top"].min(), "bottom": (g["top"]+g["height"]).max(),
                  "left": g["left"].min(), "right": (g["left"]+g["width"]).max()
              })
-     lines = pd.DataFrame(L)
-     if lines.empty:
          return pd.DataFrame()

      def score_header(s: str):
          return sum(1 for h in HEAD_CANDIDATES if h in s)

-     lines["header_score"] = lines["text"].apply(score_header)
-     hdrs = lines[lines["header_score"] >= 2].sort_values(["header_score","top"], ascending=[False,True])
      if hdrs.empty:
          return pd.DataFrame()
      H = hdrs.iloc[0]
      header_top, header_bottom = H["top"], H["bottom"]

-     # header words
      header_words = tsv[(tsv["top"] >= header_top - 5) & ((tsv["top"] + tsv["height"]) <= header_bottom + 5)]
      header_words = header_words.sort_values("left")
      if header_words.empty:
          return pd.DataFrame()
      xs = header_words["left"].tolist()

-     # items region
      below = tsv[tsv["top"] > header_bottom + 5].copy()
-     totals_mask = below["text"].str.lower().str.contains(r"(sub\s*total|amount\s*due|total|grand\s*total|balance)", regex=True, na=False)
      if totals_mask.any():
          stop_y = below.loc[totals_mask, "top"].min()
          below = below[below["top"] < stop_y - 4]
      if below.empty:
          return pd.DataFrame()

-     # build rows by assigning words to nearest header x
      rows = []
      for (b,p,l), g in below.groupby(["block_num","par_num","line_num"]):
          g = g.sort_values("left")
@@ -270,14 +225,15 @@ def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
              idx = int(np.abs(np.array(xs) - w["left"]).argmin())
              buckets[idx].append(str(w["text"]))
          vals = [" ".join(buckets[i]).strip() for i in range(len(xs))]
-         rows.append(vals)
      if not rows:
          return pd.DataFrame()

      df_rows = pd.DataFrame(rows).fillna("")
-     # name columns heuristically
      names = []
-     hdr_tokens = [t.lower() for t in header_words["text"].tolist()]
      for i in range(df_rows.shape[1]):
          wl = hdr_tokens[i] if i < len(hdr_tokens) else f"col_{i}"
          if "desc" in wl or wl in ["item","description"]:
@@ -291,24 +247,24 @@ def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
          else:
              names.append(f"col_{i}")
      df_rows.columns = names
-     # drop empty lines
      df_rows = df_rows[~(df_rows.fillna("").apply(lambda r: "".join(r.values), axis=1).str.strip()=="")]
      return df_rows.reset_index(drop=True)

  # ----------------------------- App -----------------------------
- st.title("Invoice Extraction — Donut (HF pretrained) + Tesseract tables")

  up = st.file_uploader("Upload an invoice (PDF/JPG/PNG)", type=["pdf","png","jpg","jpeg"])
  if not up:
      st.info("Upload a scanned invoice to begin.")
      st.stop()

- # load model once
  with st.spinner(f"Loading model '{model_id}' from Hugging Face…"):
      processor, donut_model = load_donut(model_id)

  pages = load_pages(up.read(), up.name)
-
  page_idx = 0
  if len(pages) > 1:
      page_idx = st.number_input("Page", 1, len(pages), 1) - 1
@@ -326,11 +282,11 @@ with col1:
  with col2:
      st.subheader("OCR & Extraction")

-     # 1) Donut extraction (key fields or full text)
      with st.spinner("Running Donut…"):
          seq, parsed = donut_infer(img, processor, donut_model, task_prompt)

-     # 2) Key fields
      if parsed:
          key_fields = normalize_kv_from_donut(parsed)
          donut_payload = parsed
@@ -351,7 +307,7 @@ with col2:
          cur = key_fields.get('currency') or ''
          st.write(f"**Total:** {tot} {cur}".strip())

-     # 3) Tesseract line items (geometry heuristic)
      with st.spinner("Detecting words with Tesseract (for table)…"):
          tsv = pytesseract.image_to_data(det_img, lang=det_lang, output_type=Output.DATAFRAME)
          tsv = tsv.dropna(subset=["text"]).reset_index(drop=True)
 
+ # app.py
+ # Invoice Extraction Donut (public HF model, no token) + Tesseract tables
+ # - Loads a public Donut checkpoint (default: naver-clova-ix/donut-base-finetuned-cord-v2)
+ # - Pulls key fields from Donut JSON (if available) or falls back to regex
+ # - Detects line-item tables via Tesseract word boxes + geometry heuristics

  import os, io, re, json
+ from typing import List
  import numpy as np
  import pandas as pd
  from PIL import Image, ImageOps, ImageFilter

  import streamlit as st

+ # OCR (detection only) and PDF->image
  import pytesseract
  from pytesseract import Output
  from pdf2image import convert_from_bytes

+ # HF Donut (auto-downloads public model; no HF token required)
  import torch
  from transformers import DonutProcessor, VisionEncoderDecoderModel

+ # ----------------------------- Page config -----------------------------
+ st.set_page_config(
+     page_title="Invoice Extraction — Donut (public) + Tesseract tables",
+     layout="wide"
+ )
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"

  # ----------------------------- Sidebar -----------------------------
+ st.sidebar.header("Model (Hugging Face — public)")
  model_id = st.sidebar.text_input(
      "HF model id",
+     value="naver-clova-ix/donut-base-finetuned-cord-v2",
+     help="Use a public model id. Example: naver-clova-ix/donut-base-finetuned-cord-v2"
  )
  task_prompt = st.sidebar.text_input(
+     "Task prompt (Donut)",
+     value="<s_cord-v2>",
+     help="Keep default for CORD-like invoices; adjust if you change models."
  )
  det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
+ show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)

  # ----------------------------- Utilities -----------------------------
  def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
  ...

  @st.cache_resource(show_spinner=True)
  def load_donut(_model_id: str):
+     # Public checkpoints load without token
      processor = DonutProcessor.from_pretrained(_model_id)
      model = VisionEncoderDecoderModel.from_pretrained(_model_id)
+     model.to(device).eval()
      return processor, model

  def donut_infer(img: Image.Image, processor: DonutProcessor, model: VisionEncoderDecoderModel, prompt: str):
      inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
      with torch.no_grad():
          outputs = model.generate(
          ...
              num_beams=1,
              early_stopping=True,
          )
      seq = processor.batch_decode(outputs, skip_special_tokens=True)[0]
      parsed = None
+     # Try to parse JSON from the generated sequence
      try:
          start = seq.find("{")
          end = seq.rfind("}")
          if start != -1 and end != -1 and end > start:
          ...
          parsed = None
      return seq, parsed
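`donut_infer` recovers JSON by slicing from the first `{` to the last `}` before handing it to `json.loads`, which tolerates prompt tokens around the payload. The same step in isolation, with a made-up Donut-style sequence:

```python
# Bracket-slice JSON recovery, as in donut_infer (seq is an invented example).
import json

seq = '<s_cord-v2>{"total": {"total_price": "12.50"}}</s>'
start, end = seq.find("{"), seq.rfind("}")
payload = json.loads(seq[start:end + 1]) if (start != -1 and end > start) else None
print(payload)  # {'total': {'total_price': '12.50'}}
```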

+ # ----------------------------- Key fields & tables -----------------------------
  CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
  MONEY = rf"{CURRENCY}\s?(?P<amt>\d{{1,3}}(?:[,]\d{{3}})*(?:[.]\d{{2}})?)"
  DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
  ...
      return out

  def normalize_kv_from_donut(parsed: dict):
+     """Map common Donut outputs to a simple invoice schema."""
      out = {k: None for k in ["invoice_number","invoice_date","po_number","subtotal","tax","total","currency"]}
+
      def search_keys(obj, key_list):
          if isinstance(obj, dict):
              for k, v in obj.items():
+                 kl = k.lower()
+                 if any(kk in kl for kk in key_list):
+                     return v if isinstance(v, str) else None
                  found = search_keys(v, key_list)
                  if found is not None:
                      return found
          ...
                  return found
          return None

+     mapping = {
+         "invoice_number": ["invoice_number","invoice no","invoice_no","invoice","inv_no","inv no"],
+         "invoice_date": ["invoice_date","date","bill_date","document_date"],
+         "po_number": ["po_number","po","purchase_order"],
+         "subtotal": ["subtotal","sub_total"],
+         "tax": ["tax","gst","vat","hst"],
+         "total": ["total","amount_total","amount_due","grand_total"],
+     }
+     for k, keys in mapping.items():
+         val = search_keys(parsed, keys)
          if isinstance(val, str):
+             out[k] = val.strip()
+
+     # currency guess from JSON text
+     txt = json.dumps(parsed, ensure_ascii=False)
+     m = re.search(r"(USD|CAD|EUR|GBP|\$|C\$|€|£)", txt, re.I)
+     if m:
+         sym = m.group(1)
          out["currency"] = {"$":"USD","C$":"CAD","€":"EUR","£":"GBP"}.get(sym, sym.upper())
      return out
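Restated outside the app, the new `search_keys` returns the first string value whose key matches, and deliberately returns `None` as soon as a matching key holds a nested value. The list branch below is assumed from the collapsed context lines, not shown verbatim in the diff:

```python
# Behavior of the recursive key search (list branch assumed from collapsed context).
def search_keys(obj, key_list):
    if isinstance(obj, dict):
        for k, v in obj.items():
            kl = k.lower()
            if any(kk in kl for kk in key_list):
                return v if isinstance(v, str) else None
            found = search_keys(v, key_list)
            if found is not None:
                return found
    elif isinstance(obj, list):
        for item in obj:
            found = search_keys(item, key_list)
            if found is not None:
                return found
    return None

print(search_keys({"doc": {"invoice_no": "INV-001"}}, ["invoice_no", "invoice"]))  # INV-001
print(search_keys({"total": {"price": "12.50"}}, ["total"]))  # None — matched value is a dict
```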
 
 
  ...
      df["y2"] = df["top"] + df["height"]
      return df[df["conf"] > -1]

+ def items_from_words_simple(tsv: pd.DataFrame) -> pd.DataFrame:
+     """Geometry-driven table extraction using Tesseract TSV."""
+     HEAD_CANDIDATES = ["description","item","qty","quantity","price","unit","rate","amount","total"]
+     if tsv.empty:
          return pd.DataFrame()

+     # Build per-line metadata
      lines = []
      for (b,p,l), g in tsv.groupby(["block_num","par_num","line_num"]):
          text = " ".join([w for w in g["text"].astype(str).tolist() if w.strip()])
          if text.strip():
+             lines.append({
                  "block_num": b, "par_num": p, "line_num": l,
                  "text": text.lower(),
                  "top": g["top"].min(), "bottom": (g["top"]+g["height"]).max(),
                  "left": g["left"].min(), "right": (g["left"]+g["width"]).max()
              })
+     L = pd.DataFrame(lines)
+     if L.empty:
          return pd.DataFrame()

      def score_header(s: str):
          return sum(1 for h in HEAD_CANDIDATES if h in s)

+     L["header_score"] = L["text"].apply(score_header)
+     hdrs = L[L["header_score"] >= 2].sort_values(["header_score","top"], ascending=[False,True])
      if hdrs.empty:
          return pd.DataFrame()
+
      H = hdrs.iloc[0]
      header_top, header_bottom = H["top"], H["bottom"]

+     # Header words & their x-positions
      header_words = tsv[(tsv["top"] >= header_top - 5) & ((tsv["top"] + tsv["height"]) <= header_bottom + 5)]
      header_words = header_words.sort_values("left")
      if header_words.empty:
          return pd.DataFrame()
      xs = header_words["left"].tolist()
+     hdr_tokens = [t.lower() for t in header_words["text"].tolist()]

+     # Items region below header (stop before totals area)
      below = tsv[tsv["top"] > header_bottom + 5].copy()
+     totals_mask = below["text"].str.lower().str.contains(
+         r"(sub\s*total|amount\s*due|total|grand\s*total|balance)",
+         regex=True, na=False
+     )
      if totals_mask.any():
          stop_y = below.loc[totals_mask, "top"].min()
          below = below[below["top"] < stop_y - 4]
      if below.empty:
          return pd.DataFrame()

      rows = []
      for (b,p,l), g in below.groupby(["block_num","par_num","line_num"]):
          g = g.sort_values("left")
          ...
              idx = int(np.abs(np.array(xs) - w["left"]).argmin())
              buckets[idx].append(str(w["text"]))
          vals = [" ".join(buckets[i]).strip() for i in range(len(xs))]
+         if any(vals):
+             rows.append(vals)
      if not rows:
          return pd.DataFrame()

      df_rows = pd.DataFrame(rows).fillna("")
+
+     # Name columns heuristically from header tokens
      names = []
      for i in range(df_rows.shape[1]):
          wl = hdr_tokens[i] if i < len(hdr_tokens) else f"col_{i}"
          if "desc" in wl or wl in ["item","description"]:
          ...
          else:
              names.append(f"col_{i}")
      df_rows.columns = names
+
+     # Drop blank rows
      df_rows = df_rows[~(df_rows.fillna("").apply(lambda r: "".join(r.values), axis=1).str.strip()=="")]
      return df_rows.reset_index(drop=True)
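A usage sketch for the extractor above, with a synthetic DataFrame shaped like `pytesseract.image_to_data(..., output_type=Output.DATAFRAME)` output. Coordinates are invented, and it assumes `items_from_words_simple` from this app is in scope:

```python
# Synthetic Tesseract TSV: a header line, one item line, and a totals line.
import pandas as pd
# from app import items_from_words_simple  # assumes this app's function

tsv = pd.DataFrame([
    # block, par, line, left, top, width, height, text
    (1, 1, 1,  40, 100, 110, 14, "Description"),
    (1, 1, 1, 300, 100,  40, 14, "Qty"),
    (1, 1, 1, 420, 100,  60, 14, "Price"),
    (1, 1, 2,  40, 130, 120, 14, "Blue widget"),
    (1, 1, 2, 300, 130,  20, 14, "2"),
    (1, 1, 2, 420, 130,  50, 14, "9.99"),
    (1, 1, 3,  40, 170,  60, 14, "Total"),
], columns=["block_num","par_num","line_num","left","top","width","height","text"])

items = items_from_words_simple(tsv)
# Expected: a single item row under the description/qty/price header;
# the "Total" line is excluded by the totals mask.
```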
 
  # ----------------------------- App -----------------------------
+ st.title("Invoice Extraction — Donut (public checkpoint) + Tesseract tables")

  up = st.file_uploader("Upload an invoice (PDF/JPG/PNG)", type=["pdf","png","jpg","jpeg"])
  if not up:
      st.info("Upload a scanned invoice to begin.")
      st.stop()

+ # Load HF model (public)
  with st.spinner(f"Loading model '{model_id}' from Hugging Face…"):
      processor, donut_model = load_donut(model_id)

  pages = load_pages(up.read(), up.name)
  page_idx = 0
  if len(pages) > 1:
      page_idx = st.number_input("Page", 1, len(pages), 1) - 1
  ...
  with col2:
      st.subheader("OCR & Extraction")

+     # 1) Donut for key-value extraction / text
      with st.spinner("Running Donut…"):
          seq, parsed = donut_infer(img, processor, donut_model, task_prompt)

+     # 2) Key fields from JSON (if available) else regex over generated text
      if parsed:
          key_fields = normalize_kv_from_donut(parsed)
          donut_payload = parsed
      ...
          cur = key_fields.get('currency') or ''
          st.write(f"**Total:** {tot} {cur}".strip())

+     # 3) Tesseract word boxes for line-item table (simple heuristic)
      with st.spinner("Detecting words with Tesseract (for table)…"):
          tsv = pytesseract.image_to_data(det_img, lang=det_lang, output_type=Output.DATAFRAME)
          tsv = tsv.dropna(subset=["text"]).reset_index(drop=True)
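The diff is cut off here. A hedged sketch of how the cleaned TSV presumably feeds the table heuristic and gets rendered; the actual call site is outside the shown hunks:

```python
# Plausible continuation (not shown in this diff): run the geometry heuristic
# on the cleaned TSV and display the result in the Streamlit app.
items_df = items_from_words_simple(tsv)
if items_df.empty:
    st.warning("No line-item table detected.")
else:
    st.dataframe(items_df)
```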