Spaces:

Gabriel00A
/

invoice-staging

Paused

App Files Files Community

Gabriel00A committed on Nov 12, 2025

Commit

01da16a

verified ·

1 Parent(s): 5545434

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -25

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import io
 import zipfile
 from datetime import datetime
 from pathlib import Path
-from typing import List, Optional
 import gradio as gr
 import pandas as pd
@@ -61,7 +61,10 @@ Base.metadata.create_all(engine)
 def load_suppliers_from_csv():
     if not SUPPLIER_CSV.exists():
         return
-    df = pd.read_csv(SUPPLIER_CSV, dtype=str).fillna("")
     session = SessionLocal()
     for _, r in df.iterrows():
         name = str(r.get("name","")).strip()
@@ -82,8 +85,15 @@ TAXNO_RE = re.compile(r"([0-9A-Z]{15,20})")
 INVOICE_RE = re.compile(r"(发票代码|发票号码|发票号|Invoice No|Invoice No\.)[:：\s]*([A-Za-z0-9\-]+)")
 AMOUNT_RE = re.compile(r"([0-9]{1,3}(?:[,，][0-9]{3})*(?:\.[0-9]{1,2})?)")
-def do_ocr PIL_image_bytes(file_bytes: bytes) -> str:
-    img = Image.open(io.BytesIO(file_bytes))
     text = pytesseract.image_to_string(img, lang='chi_sim+eng')
     return text
@@ -118,16 +128,18 @@ def extract_fields(ocr_text: str) -> dict:
     if not name:
         lines = [ln.strip() for ln in ocr_text.splitlines() if ln.strip()]
         if lines:
-            name = lines[0][:80]
     return {"taxno": taxno, "invoice_no": inv, "total": total, "name": name, "raw": ocr_text}
 def match_supplier(session, extracted: dict, threshold:int=80):
     tax = extracted.get("taxno")
     if tax:
         sup = session.query(Supplier).filter(Supplier.tax_no==tax.strip()).first()
         if sup:
             return {"supplier_id": sup.id, "supplier_name": sup.name, "supplier_taxno": sup.tax_no, "score": 100.0}
     name = (extracted.get("name") or "").strip()
     if not name:
         return None
@@ -147,12 +159,12 @@ def match_supplier(session, extracted: dict, threshold:int=80):
 # ---------- Storage helpers ----------
 def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unknown"):
     ts = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
-    safe_name = f"{ts}_{filename}"
     path = STORAGE_DIR / safe_name
     with open(path, "wb") as f:
         f.write(file_bytes)
-    ocr_text = do_ocr PIL_image_bytes(file_bytes)
     extracted = extract_fields(ocr_text)
     session = SessionLocal()
     match = match_supplier(session, extracted)
@@ -195,10 +207,11 @@ def list_documents(limit=200):
     session.close()
     return rows
-def get_suppliers():
     session = SessionLocal()
     s = session.query(Supplier).order_by(Supplier.name).all()
     session.close()
     return [(str(x.id), x.name) for x in s]
 def confirm_document(doc_id: int, supplier_id: Optional[int]):
@@ -253,10 +266,19 @@ def export_zip(ids: List[int]):
 # ---------- Gradio UI ----------
 def upload_files(files, uploader):
     results = []
     for f in files:
-        content = f.read()
-        res = save_file_and_record(content, f.name, uploader or "unknown")
         results.append(res)
     return pd.DataFrame(results)
@@ -266,31 +288,51 @@ def refresh_list():
         return pd.DataFrame([], columns=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"])
     return pd.DataFrame(rows)
-def ui_confirm(doc_id, supplier_id):
-    ok = confirm_document(int(doc_id), int(supplier_id) if supplier_id else None)
-    return {"ok": ok}
-def ui_export(selected_ids):
-    if not selected_ids:
-        return None
-    ids = [int(x) for x in selected_ids]
     zp = export_zip(ids)
-    return zp
 with gr.Blocks() as demo:
     gr.Markdown("## 发票收票分拣（Staging）  \n上传发票 → 自动识别供应商 → 人工确认/导出")
     with gr.Row():
         with gr.Column(scale=3):
             uploader = gr.Textbox(label="上传人 (可选)", placeholder="front_desk")
-            file_inputs = gr.File(label="拖拽或选择发票（图片 / PDF 可行，但图片更稳定）", file_count="multiple")
             upload_btn = gr.Button("上传并识别")
-            upload_out = gr.Dataframe(headers=["id","filename","supplier","score","status","invoice_no","total"])
         with gr.Column(scale=2):
             refresh_btn = gr.Button("刷新列表")
-            list_out = gr.Dataframe(headers=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"])
             gr.Markdown("**确认匹配（人工）**")
             doc_id_in = gr.Textbox(label="文档 ID（从列表取）")
-            supplier_dd = gr.Dropdown(choices=[x[1] for x in get_suppliers()], label="选择供应商（或先新增 suppliers.csv）")
             confirm_btn = gr.Button("确认关联")
             confirm_out = gr.Textbox()
             gr.Markdown("**导出选中（输入逗号分隔的 ID 列表）**")
@@ -300,10 +342,11 @@ with gr.Blocks() as demo:
     upload_btn.click(lambda files,uploader: upload_files(files,uploader), inputs=[file_inputs,uploader], outputs=upload_out)
     refresh_btn.click(lambda: refresh_list(), inputs=[], outputs=list_out)
-    confirm_btn.click(lambda d,s: ui_confirm(d, None if not s else next((sid for sid,name in get_suppliers() if name==s), None)), inputs=[doc_id_in, supplier_dd], outputs=confirm_out)
-    export_btn.click(lambda txt: ui_export([x.strip() for x in txt.split(",") if x.strip()]), inputs=[export_ids], outputs=export_out)
-    gr.Markdown("**提示**：如果供应商不在下拉中，请在 repo 放入或更新 `suppliers.csv` 并 Rebuild 空间，或者修改数据库。")
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

 import zipfile
 from datetime import datetime
 from pathlib import Path
+from typing import List, Optional, Tuple
 import gradio as gr
 import pandas as pd
 def load_suppliers_from_csv():
     if not SUPPLIER_CSV.exists():
         return
+    try:
+        df = pd.read_csv(SUPPLIER_CSV, dtype=str).fillna("")
+    except Exception:
+        return
     session = SessionLocal()
     for _, r in df.iterrows():
         name = str(r.get("name","")).strip()
 INVOICE_RE = re.compile(r"(发票代码|发票号码|发票号|Invoice No|Invoice No\.)[:：\s]*([A-Za-z0-9\-]+)")
 AMOUNT_RE = re.compile(r"([0-9]{1,3}(?:[,，][0-9]{3})*(?:\.[0-9]{1,2})?)")
+def do_ocr(image_bytes: bytes) -> str:
+    """
+    用 pytesseract 对二进制图片数据做 OCR，返回识别出的纯文本。
+    """
+    img = Image.open(io.BytesIO(image_bytes))
+    try:
+        img = img.convert("RGB")
+    except Exception:
+        pass
     text = pytesseract.image_to_string(img, lang='chi_sim+eng')
     return text
     if not name:
         lines = [ln.strip() for ln in ocr_text.splitlines() if ln.strip()]
         if lines:
+            name = lines[0][:120]
     return {"taxno": taxno, "invoice_no": inv, "total": total, "name": name, "raw": ocr_text}
 def match_supplier(session, extracted: dict, threshold:int=80):
+    # 1) tax no exact match
     tax = extracted.get("taxno")
     if tax:
         sup = session.query(Supplier).filter(Supplier.tax_no==tax.strip()).first()
         if sup:
             return {"supplier_id": sup.id, "supplier_name": sup.name, "supplier_taxno": sup.tax_no, "score": 100.0}
+    # 2) name fuzzy match
     name = (extracted.get("name") or "").strip()
     if not name:
         return None
 # ---------- Storage helpers ----------
 def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unknown"):
     ts = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
+    safe_name = f"{ts}_{filename.replace(' ', '_')}"
     path = STORAGE_DIR / safe_name
     with open(path, "wb") as f:
         f.write(file_bytes)
+    ocr_text = do_ocr(file_bytes)
     extracted = extract_fields(ocr_text)
     session = SessionLocal()
     match = match_supplier(session, extracted)
     session.close()
     return rows
+def get_suppliers() -> List[Tuple[str,str]]:
     session = SessionLocal()
     s = session.query(Supplier).order_by(Supplier.name).all()
     session.close()
+    # return list of (id, name)
     return [(str(x.id), x.name) for x in s]
 def confirm_document(doc_id: int, supplier_id: Optional[int]):
 # ---------- Gradio UI ----------
 def _read_upload(item):
     """Return ``(content_bytes, display_name)`` for one uploaded item.

     ``item`` may be a path (``str``/``Path``) or an object exposing ``name``
     and/or ``read()``. NOTE(review): which of these Gradio hands over
     depends on the Gradio version and the File component's ``type`` setting;
     confirm against the version pinned for this Space.
     """
     if isinstance(item, (str, Path)):
         source = str(item)
     else:
         source = getattr(item, "name", None)
     has_path = isinstance(source, str) and source != ""

     if has_path and Path(source).is_file():
         # Prefer the file on disk: it is independent of the handle's
         # current read position.
         content = Path(source).read_bytes()
     elif callable(getattr(item, "read", None)):
         content = item.read()
     else:
         raise ValueError(f"cannot read uploaded item: {item!r}")

     # Only the final path component is kept: save_file_and_record embeds
     # this name into a file name under STORAGE_DIR, so directory parts of a
     # temp path would point at a directory that does not exist.
     display_name = Path(source).name if has_path else ""
     return content, display_name or "uploaded_file"


 def upload_files(files, uploader):
     """Store and OCR every uploaded file; return one result row per file.

     Args:
         files: the uploaded items from the file input (may be empty/None).
         uploader: free-text uploader name; falsy values become "unknown".

     Returns:
         A DataFrame built from the dicts returned by save_file_and_record,
         or an empty DataFrame with the upload-table columns when nothing
         was uploaded.

     Raises:
         ValueError: if an item is neither a readable path nor file-like.
     """
     if not files:
         return pd.DataFrame([], columns=["id","filename","supplier","score","status","invoice_no","total"])
     results = []
     for item in files:
         content, display_name = _read_upload(item)
         results.append(save_file_and_record(content, display_name, uploader or "unknown"))
     return pd.DataFrame(results)
         return pd.DataFrame([], columns=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"])
     return pd.DataFrame(rows)
+def ui_confirm(doc_id: str, supplier_name: str):
+    # doc_id is string from textbox; supplier_name is name selected in dropdown
+    try:
+        doc_id_i = int(doc_id)
+    except:
+        return "Invalid doc id"
+    if not supplier_name:
+        # mark unknown
+        ok = confirm_document(doc_id_i, None)
+        return "ok" if ok else "fail"
+    # find supplier id by name
+    session = SessionLocal()
+    s = session.query(Supplier).filter(Supplier.name==supplier_name).first()
+    session.close()
+    if not s:
+        return "supplier not found"
+    ok = confirm_document(doc_id_i, s.id)
+    return "ok" if ok else "fail"
+def ui_export(txt_ids: str):
+    if not txt_ids:
+        return ""
+    ids = [int(x.strip()) for x in txt_ids.split(",") if x.strip().isdigit()]
+    if not ids:
+        return ""
     zp = export_zip(ids)
+    return zp or ""
+# build UI
 with gr.Blocks() as demo:
     gr.Markdown("## 发票收票分拣（Staging）  \n上传发票 → 自动识别供应商 → 人工确认/导出")
     with gr.Row():
         with gr.Column(scale=3):
             uploader = gr.Textbox(label="上传人 (可选)", placeholder="front_desk")
+            file_inputs = gr.File(label="拖拽或选择发票（图片 优先）", file_count="multiple")
             upload_btn = gr.Button("上传并识别")
+            upload_out = gr.Dataframe(headers=["id","filename","supplier","score","status","invoice_no","total"], interactive=False)
         with gr.Column(scale=2):
             refresh_btn = gr.Button("刷新列表")
+            list_out = gr.Dataframe(headers=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"], interactive=False)
             gr.Markdown("**确认匹配（人工）**")
             doc_id_in = gr.Textbox(label="文档 ID（从列表取）")
+            # supplier dropdown choices filled at start; to refresh suppliers update repo or restart
+            supplier_choices = [name for (_id, name) in get_suppliers()]
+            supplier_dd = gr.Dropdown(choices=supplier_choices, label="选择供应商（或先通过 suppliers.csv 添加）")
             confirm_btn = gr.Button("确认关联")
             confirm_out = gr.Textbox()
             gr.Markdown("**导出选中（输入逗号分隔的 ID 列表）**")
     upload_btn.click(lambda files,uploader: upload_files(files,uploader), inputs=[file_inputs,uploader], outputs=upload_out)
     refresh_btn.click(lambda: refresh_list(), inputs=[], outputs=list_out)
+    confirm_btn.click(lambda d,s: ui_confirm(d, s), inputs=[doc_id_in, supplier_dd], outputs=confirm_out)
+    export_btn.click(lambda txt: ui_export(txt), inputs=[export_ids], outputs=export_out)
+    gr.Markdown("**提示**：如果供应商不在下拉中，请在 repo 放入或更新 `suppliers.csv` 并 Rebuild 空间，或在数据库新增。")
 if __name__ == "__main__":
+    # Gradio default port for Spaces is 7860
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)