Spaces:

Gabriel00A
/

invoice-staging

Paused

App Files Files Community

Gabriel00A committed on Nov 12, 2025

Commit

71d3ff0

verified ·

1 Parent(s): 17face9

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -158

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py (final robust version - supports PNG/JPG/PDF and returns errors nicely)
 import os
 import io
 import zipfile
@@ -16,8 +16,6 @@ from rapidfuzz import process, fuzz
 from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float, Text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
-# pdf2image for PDF -> image
 from pdf2image import convert_from_bytes
 # ---------- config ----------
@@ -56,13 +54,13 @@ class Document(Base):
     invoice_no = Column(String, nullable=True)
     total_amount = Column(Float, nullable=True)
     match_score = Column(Float, nullable=True)
-    status = Column(String, default="new")  # new / matched / unknown / exported
     raw_extracted = Column(Text, nullable=True)
     error = Column(Text, nullable=True)
 Base.metadata.create_all(engine)
-# load suppliers CSV if present
 def load_suppliers_from_csv():
     if not SUPPLIER_CSV.exists():
         return
@@ -85,44 +83,36 @@ def load_suppliers_from_csv():
 load_suppliers_from_csv()
-# ---------- OCR & extract ----------
 TAXNO_RE = re.compile(r"([0-9A-Z]{15,20})")
 INVOICE_RE = re.compile(r"(发票代码|发票号码|发票号|Invoice No|Invoice No\.)[:：\s]*([A-Za-z0-9\-]+)")
 AMOUNT_RE = re.compile(r"([0-9]{1,3}(?:[,，][0-9]{3})*(?:\.[0-9]{1,2})?)")
 def do_ocr(file_bytes: bytes) -> str:
     """
-    支持图片与 PDF：
-    - 先尝试把 bytes 当图片打开（PIL）
-    - 如果不是图片或识别无结果，尝试 pdf2image 将第一页转为图片再 OCR
-    返回拼接的识别文本（若多页可合并）。
     """
-    # 1) try image
     try:
         img = Image.open(io.BytesIO(file_bytes))
         img = img.convert("RGB")
         text = pytesseract.image_to_string(img, lang='chi_sim+eng')
-        if text and text.strip():
             return text
     except Exception:
-        # 不是图片或 PIL 打开失败 -> fall through to PDF attempt
         pass
-    # 2) try PDF -> images
     try:
-        images = convert_from_bytes(file_bytes, dpi=300)  # convert all pages (if many, okay)
         texts = []
         for im in images:
-            try:
-                t = pytesseract.image_to_string(im, lang='chi_sim+eng')
-            except Exception:
-                t = ""
-            if t and t.strip():
                 texts.append(t)
         return "\n\n".join(texts)
     except Exception as e:
-        # 无法处理为 PDF
-        print("do_ocr error:", e)
         return ""
 def extract_fields(ocr_text: str) -> dict:
@@ -181,7 +171,7 @@ def match_supplier(session, extracted: dict, threshold:int=80):
             return {"supplier_id": s.id, "supplier_name": s.name, "supplier_taxno": s.tax_no, "score": float(score)}
     return None
-# ---------- Storage helpers ----------
 def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unknown"):
     ts = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
     safe_name = f"{ts}_{filename.replace(' ', '_')}"
@@ -193,15 +183,12 @@ def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unkn
     try:
         ocr_text = do_ocr(file_bytes)
     except Exception as e:
-        ocr_text = ""
         print("OCR exception:", e, traceback.format_exc())
     extracted = extract_fields(ocr_text)
     session = SessionLocal()
     match = match_supplier(session, extracted)
-    # default status and error None
     status = "matched" if match else "unknown"
-    error_msg = None
     doc = Document(
         filename=filename,
@@ -216,8 +203,7 @@ def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unkn
         total_amount=extracted.get("total"),
         match_score=match["score"] if match else None,
         status=status,
-        raw_extracted=str(extracted),
-        error=error_msg
     )
     session.add(doc)
     session.commit()
@@ -225,53 +211,11 @@ def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unkn
     session.close()
     return {
         "id": doc.id, "filename": doc.filename, "supplier": doc.supplier_name,
-        "score": doc.match_score, "status": doc.status, "invoice_no": doc.invoice_no, "total": doc.total_amount
     }
-def list_documents(limit=200):
-    session = SessionLocal()
-    q = session.query(Document).order_by(Document.uploaded_at.desc()).limit(limit).all()
-    rows = []
-    for d in q:
-        rows.append({
-            "id": d.id, "filename": d.filename,
-            "uploaded_at": d.uploaded_at.strftime("%Y-%m-%d %H:%M:%S"),
-            "supplier_id": d.supplier_id, "supplier_name": d.supplier_name,
-            "supplier_taxno": d.supplier_taxno, "invoice_no": d.invoice_no,
-            "total_amount": d.total_amount, "score": d.match_score, "status": d.status
-        })
-    session.close()
-    return rows
-def get_suppliers() -> List[Tuple[str,str]]:
-    session = SessionLocal()
-    s = session.query(Supplier).order_by(Supplier.name).all()
-    session.close()
-    return [(str(x.id), x.name) for x in s]
-def confirm_document(doc_id: int, supplier_id: Optional[int]):
-    session = SessionLocal()
-    d = session.query(Document).get(doc_id)
-    if not d:
-        session.close()
-        return False
-    if supplier_id:
-        s = session.query(Supplier).get(supplier_id)
-        if not s:
-            session.close()
-            return False
-        d.supplier_id = s.id
-        d.supplier_name = s.name
-        d.supplier_taxno = s.tax_no
-        d.match_score = 100.0
-        d.status = "matched"
-    else:
-        d.status = "unknown"
-    session.add(d)
-    session.commit()
-    session.close()
-    return True
 def export_zip(ids: List[int]):
     session = SessionLocal()
     docs = session.query(Document).filter(Document.id.in_(ids)).all()
@@ -297,131 +241,100 @@ def export_zip(ids: List[int]):
             if os.path.exists(d.filepath):
                 zf.write(d.filepath, arcname=os.path.basename(d.filepath))
     session.close()
-    return str(zip_path)
-# ---------- Gradio UI ----------
-def _read_uploaded_file(f):
-    """
-    Robust reader: f may be a gradio file-like object with read(), or a local file path object.
-    返回 bytes 和 文件名
-    """
     try:
-        content = f.read()
-        name = getattr(f, "name", None) or getattr(f, "filename", None) or "uploaded_file"
-        # when gradio gives a SpooledTemporaryFile, name may be a path
-        return content, os.path.basename(name)
-    except Exception:
-        # fallback: if f has .name path, read from disk
-        try:
-            path = f.name
-            with open(path, "rb") as fh:
-                return fh.read(), os.path.basename(path)
-        except Exception as e:
-            raise RuntimeError(f"无法读取上传文件: {e}")
 def upload_files(files, uploader):
     results = []
     if not files:
         return pd.DataFrame([], columns=["id","filename","supplier","score","status","invoice_no","total"])
     for f in files:
         try:
-            content, filename = _read_uploaded_file(f)
-            res = save_file_and_record(content, filename, uploader or "unknown")
             results.append(res)
         except Exception as e:
-            # record error into DB so we can inspect later
-            try:
-                ts = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
-                safe_name = f"err_{ts}_{getattr(f, 'name', 'unknown')}"
-                p = STORAGE_DIR / safe_name
-                # try write raw if possible
-                try:
-                    with open(p, "wb") as fh:
-                        if hasattr(f, "read"):
-                            fh.write(f.read())
-                except Exception:
-                    pass
-            except Exception:
-                pass
-            # append a visible error row
             results.append({
-                "id": -1,
-                "filename": getattr(f, "name", "uploaded_file"),
-                "supplier": None,
-                "score": None,
-                "status": "error",
-                "invoice_no": None,
-                "total": None,
-                "error": str(e) + "\n" + traceback.format_exc()
             })
-            print("file processing error:", e, traceback.format_exc())
-    df = pd.DataFrame(results)
-    # ensure consistent columns for front-end display
-    cols = ["id","filename","supplier","score","status","invoice_no","total"]
-    for c in cols:
-        if c not in df.columns:
-            df[c] = None
-    return df[cols]
 def refresh_list():
-    rows = list_documents()
-    if not rows:
-        return pd.DataFrame([], columns=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"])
     return pd.DataFrame(rows)
 def ui_confirm(doc_id: str, supplier_name: str):
-    try:
-        doc_id_i = int(doc_id)
-    except:
         return "Invalid doc id"
-    if not supplier_name:
-        ok = confirm_document(doc_id_i, None)
-        return "ok" if ok else "fail"
     session = SessionLocal()
     s = session.query(Supplier).filter(Supplier.name==supplier_name).first()
-    session.close()
     if not s:
-        return "supplier not found"
-    ok = confirm_document(doc_id_i, s.id)
-    return "ok" if ok else "fail"
 def ui_export(txt_ids: str):
-    if not txt_ids:
-        return ""
     ids = [int(x.strip()) for x in txt_ids.split(",") if x.strip().isdigit()]
     if not ids:
         return ""
-    zp = export_zip(ids)
-    return zp or ""
 with gr.Blocks() as demo:
-    gr.Markdown("## 发票收票分拣（Staging）  \n上传发票 → 自动识别供应商 → 人工确认/导出")
     with gr.Row():
         with gr.Column(scale=3):
             uploader = gr.Textbox(label="上传人 (可选)", placeholder="front_desk")
-            file_inputs = gr.File(label="拖拽或选择发票（图片 优先）", file_count="multiple")
             upload_btn = gr.Button("上传并识别")
             upload_out = gr.Dataframe(headers=["id","filename","supplier","score","status","invoice_no","total"], interactive=False)
         with gr.Column(scale=2):
             refresh_btn = gr.Button("刷新列表")
             list_out = gr.Dataframe(headers=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"], interactive=False)
-            gr.Markdown("**确认匹配（人工）**")
-            doc_id_in = gr.Textbox(label="文档 ID（从列表取）")
-            supplier_choices = [name for (_id, name) in get_suppliers()]
-            supplier_dd = gr.Dropdown(choices=supplier_choices, label="选择供应商（或先通过 suppliers.csv 添加）")
             confirm_btn = gr.Button("确认关联")
-            confirm_out = gr.Textbox()
-            gr.Markdown("**导出选中（输入逗号分隔的 ID 列表）**")
-            export_ids = gr.Textbox(label="ID 列表，例如：1,2,3")
             export_btn = gr.Button("导出为 ZIP")
-            export_out = gr.Textbox()
-    upload_btn.click(lambda files,uploader: upload_files(files,uploader), inputs=[file_inputs,uploader], outputs=upload_out)
-    refresh_btn.click(lambda: refresh_list(), inputs=[], outputs=list_out)
-    confirm_btn.click(lambda d,s: ui_confirm(d, s), inputs=[doc_id_in, supplier_dd], outputs=confirm_out)
-    export_btn.click(lambda txt: ui_export(txt), inputs=[export_ids], outputs=export_out)
-    gr.Markdown("**提示**：如果供应商不在下拉中，请在 repo 放入或更新 `suppliers.csv` 并 Rebuild 空间，或在数据库新增。")
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

+# app.py (final version - supports PDF/JPG/PNG + export to repo root)
 import os
 import io
 import zipfile
 from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float, Text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from pdf2image import convert_from_bytes
 # ---------- config ----------
     invoice_no = Column(String, nullable=True)
     total_amount = Column(Float, nullable=True)
     match_score = Column(Float, nullable=True)
+    status = Column(String, default="new")
     raw_extracted = Column(Text, nullable=True)
     error = Column(Text, nullable=True)
 Base.metadata.create_all(engine)
+# ---------- Load Suppliers ----------
 def load_suppliers_from_csv():
     if not SUPPLIER_CSV.exists():
         return
 load_suppliers_from_csv()
+# ---------- OCR & Extraction ----------
 TAXNO_RE = re.compile(r"([0-9A-Z]{15,20})")
 INVOICE_RE = re.compile(r"(发票代码|发票号码|发票号|Invoice No|Invoice No\.)[:：\s]*([A-Za-z0-9\-]+)")
 AMOUNT_RE = re.compile(r"([0-9]{1,3}(?:[,，][0-9]{3})*(?:\.[0-9]{1,2})?)")
 def do_ocr(file_bytes: bytes) -> str:
     """
     Run OCR on an uploaded image or PDF and return the recognised text.

     The bytes are first opened as an image; if that fails, or the OCR
     result is only whitespace, they are rendered as a PDF at 300 dpi and
     every page is OCR'd. Page texts are joined with a blank line.

     Args:
         file_bytes: Raw content of the uploaded file.

     Returns:
         The recognised text, or "" when the input is empty or nothing
         could be recognised.
     """
     if not file_bytes:
         return ""
     # 1) Image
     try:
         img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
         text = pytesseract.image_to_string(img, lang='chi_sim+eng')
         if text and text.strip():
             return text
     except Exception:
         # Not an image (or OCR failed on it): fall through to the PDF path.
         pass
     # 2) PDF
     try:
         images = convert_from_bytes(file_bytes, dpi=300)
     except Exception as e:
         print("OCR error:", e)
         return ""
     texts = []
     for im in images:
         # Guard each page separately so one bad page does not throw away
         # the text already recognised on the others.
         try:
             t = pytesseract.image_to_string(im, lang='chi_sim+eng')
         except Exception as e:
             print("OCR error:", e)
             continue
         if t and t.strip():
             texts.append(t)
     return "\n\n".join(texts)
 def extract_fields(ocr_text: str) -> dict:
             return {"supplier_id": s.id, "supplier_name": s.name, "supplier_taxno": s.tax_no, "score": float(score)}
     return None
+# ---------- Save & Record ----------
 def save_file_and_record(file_bytes: bytes, filename: str, uploader: str = "unknown"):
     ts = datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
     safe_name = f"{ts}_{filename.replace(' ', '_')}"
     try:
         ocr_text = do_ocr(file_bytes)
     except Exception as e:
         print("OCR exception:", e, traceback.format_exc())
     extracted = extract_fields(ocr_text)
     session = SessionLocal()
     match = match_supplier(session, extracted)
     status = "matched" if match else "unknown"
     doc = Document(
         filename=filename,
         total_amount=extracted.get("total"),
         match_score=match["score"] if match else None,
         status=status,
+        raw_extracted=str(extracted)
     )
     session.add(doc)
     session.commit()
     session.close()
     return {
         "id": doc.id, "filename": doc.filename, "supplier": doc.supplier_name,
+        "score": doc.match_score, "status": doc.status,
+        "invoice_no": doc.invoice_no, "total": doc.total_amount
     }
+# ---------- Export ----------
 def export_zip(ids: List[int]):
     session = SessionLocal()
     docs = session.query(Document).filter(Document.id.in_(ids)).all()
             if os.path.exists(d.filepath):
                 zf.write(d.filepath, arcname=os.path.basename(d.filepath))
     session.close()
+    # === 新增：复制 ZIP 到 Space 根目录，让用户可直接下载 ===
+    import shutil
     try:
+        shutil.copy(zip_path, BASE_DIR / f"export_{ts}.zip")
+        print(f"✅ ZIP 已复制到根目录: export_{ts}.zip")
+    except Exception as e:
+        print("复制 ZIP 失败：", e)
+    return str(zip_path)
+# ---------- Gradio UI ----------
 def _upload_name(f) -> str:
     """Return the bare file name of an uploaded item (path string or file-like object)."""
     raw = f if isinstance(f, (str, os.PathLike)) else getattr(f, "name", None)
     # Only the base name is kept: the stored name is built from it, so
     # directory separators must not leak into it.
     return os.path.basename(str(raw)) if raw else "file"
 def _upload_bytes(f) -> bytes:
     """Return the raw content of an uploaded item (path string or file-like object)."""
     if not isinstance(f, (str, os.PathLike)) and hasattr(f, "read"):
         return f.read()
     path = f if isinstance(f, (str, os.PathLike)) else f.name
     with open(path, "rb") as fh:
         return fh.read()
 def upload_files(files, uploader):
     """
     Process a batch of uploaded files and return one result row per file.

     Args:
         files: Uploaded items from the file input; each may be a file-like
             object or a filesystem path. May be empty or None.
         uploader: Optional uploader name; "unknown" is used when empty.

     Returns:
         A DataFrame with columns id, filename, supplier, score, status,
         invoice_no and total. A file that fails to process yields a row
         with id -1 and status "error" instead of aborting the batch.
     """
     cols = ["id","filename","supplier","score","status","invoice_no","total"]
     results = []
     if not files:
         return pd.DataFrame([], columns=cols)
     for f in files:
         name = _upload_name(f)
         try:
             content = _upload_bytes(f)
             res = save_file_and_record(content, name, uploader or "unknown")
             results.append(res)
         except Exception as e:
             results.append({
                 "id": -1, "filename": name,
                 "supplier": None, "score": None, "status": "error",
                 "invoice_no": None, "total": None
             })
             print("File process error:", e, traceback.format_exc())
     # Pin the column order so the table always matches the UI headers.
     return pd.DataFrame(results, columns=cols)
 def refresh_list():
     """
     Return the 200 most recently uploaded documents as a table.

     Returns:
         A DataFrame with columns id, filename, uploaded_at, supplier_name,
         invoice_no, total_amount, score and status, newest first. The
         columns are present even when there are no documents.
     """
     columns = ["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"]
     session = SessionLocal()
     try:
         docs = session.query(Document).order_by(Document.uploaded_at.desc()).limit(200).all()
         rows = [{
             "id": d.id, "filename": d.filename,
             # Guard in case a row has no upload timestamp stored.
             "uploaded_at": d.uploaded_at.strftime("%Y-%m-%d %H:%M:%S") if d.uploaded_at else None,
             "supplier_name": d.supplier_name, "invoice_no": d.invoice_no,
             "total_amount": d.total_amount, "score": d.match_score, "status": d.status
         } for d in docs]
     finally:
         # Close the session even if the query or row conversion fails.
         session.close()
     return pd.DataFrame(rows, columns=columns)
 def ui_confirm(doc_id: str, supplier_name: str):
     """
     Manually link a document to a supplier chosen by name.

     Args:
         doc_id: Document id as typed by the user; surrounding whitespace
             is ignored.
         supplier_name: Exact name of the supplier to associate.

     Returns:
         "ok" on success, otherwise one of "Invalid doc id",
         "Doc not found" or "Supplier not found".
     """
     raw_id = "" if doc_id is None else str(doc_id).strip()
     # isdecimal() matches exactly what int() can parse; isdigit() also
     # accepts characters such as superscripts that make int() raise.
     if not raw_id.isdecimal():
         return "Invalid doc id"
     doc_id_i = int(raw_id)
     session = SessionLocal()
     try:
         d = session.query(Document).get(doc_id_i)
         if not d:
             return "Doc not found"
         s = session.query(Supplier).filter(Supplier.name==supplier_name).first()
         if not s:
             return "Supplier not found"
         d.supplier_id = s.id
         d.supplier_name = s.name
         d.supplier_taxno = s.tax_no
         # A manual confirmation is treated as a perfect match.
         d.match_score = 100.0
         d.status = "matched"
         session.add(d)
         session.commit()
         return "ok"
     finally:
         # Always release the session, including on early returns and errors.
         session.close()
 def ui_export(txt_ids: str):
     """
     Export the documents whose ids are listed in a comma-separated string.

     Args:
         txt_ids: Ids separated by "," (the full-width "，" is accepted too),
             e.g. "1,2,3". Entries that are not plain integers are ignored.

     Returns:
         The value returned by export_zip for the parsed ids, or "" when
         the input is empty or contains no valid id.
     """
     if not txt_ids:
         return ""
     parts = (p.strip() for p in txt_ids.replace("，", ",").split(","))
     # isdecimal() keeps only tokens int() can parse (isdigit() would let
     # characters such as "²" through and make int() raise).
     ids = [int(p) for p in parts if p.isdecimal()]
     if not ids:
         return ""
     return export_zip(ids)
 # Read the supplier names once, with a session that is explicitly closed,
 # instead of creating a throw-away session inline that is never released.
 _session = SessionLocal()
 try:
     _supplier_names = [s.name for s in _session.query(Supplier).all()]
 finally:
     _session.close()
 # ---------- UI layout: upload on the left, list / confirm / export on the right ----------
 with gr.Blocks() as demo:
     gr.Markdown("## 发票收票分拣（Staging）\n上传发票 → 自动识别供应商 → 人工确认/导出")
     with gr.Row():
         with gr.Column(scale=3):
             uploader = gr.Textbox(label="上传人 (可选)", placeholder="front_desk")
             file_inputs = gr.File(label="拖拽或选择发票（图片或 PDF）", file_count="multiple")
             upload_btn = gr.Button("上传并识别")
             upload_out = gr.Dataframe(headers=["id","filename","supplier","score","status","invoice_no","total"], interactive=False)
         with gr.Column(scale=2):
             refresh_btn = gr.Button("刷新列表")
             list_out = gr.Dataframe(headers=["id","filename","uploaded_at","supplier_name","invoice_no","total_amount","score","status"], interactive=False)
             doc_id_in = gr.Textbox(label="文档 ID（从列表复制）")
             supplier_dd = gr.Dropdown(choices=_supplier_names, label="选择供应商")
             confirm_btn = gr.Button("确认关联")
             confirm_out = gr.Textbox(label="结果")
             export_ids = gr.Textbox(label="导出 ID（如 1,2,3）")
             export_btn = gr.Button("导出为 ZIP")
             export_out = gr.Textbox(label="导出结果")
     # Wire each button to its handler.
     upload_btn.click(upload_files, inputs=[file_inputs, uploader], outputs=upload_out)
     refresh_btn.click(refresh_list, inputs=[], outputs=list_out)
     confirm_btn.click(ui_confirm, inputs=[doc_id_in, supplier_dd], outputs=confirm_out)
     export_btn.click(ui_export, inputs=[export_ids], outputs=export_out)
 if __name__ == "__main__":
     # Listen on all interfaces on port 7860, without a public share link.
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)