Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

App Files Files Community

ReneeHWT committed on Jun 13, 2025

Commit

df0821f

verified ·

1 Parent(s): 1cb3494

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -40

app.py CHANGED Viewed

@@ -1,52 +1,33 @@
 import gradio as gr
 import fitz  # PyMuPDF
-from PIL import Image
-import io
-import pytesseract
-import traceback
-import os
-def extract_text(file):
-    """
-    支援 file 可能是：
-    - 路徑字串 (Gradio 3.x 回傳)
-    - 類檔案物件 (早期回傳)
-    """
-    if not file:
-        return ""
-    try:
-        # 1. Read the PDF bytes (from disk when `file` is an existing path string)
-        if isinstance(file, str) and os.path.exists(file):
-            with open(file, "rb") as f:
-                pdf_bytes = f.read()
-        else:
-            # Otherwise `file` may be an uploaded file-like object; read it directly
-            pdf_bytes = file.read()
-        # 2. Open the in-memory bytes with PyMuPDF
-        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-        full_text = []
-        # 3. Render each page to a 300-dpi PNG and run Tesseract OCR on it
-        for i, page in enumerate(doc, start=1):
-            pix = page.get_pixmap(dpi=300)
-            img = Image.open(io.BytesIO(pix.tobytes("png")))
-            text = pytesseract.image_to_string(img, lang='eng+chi_tra')
-            full_text.append(f"--- Page {i} ---\n{text}")
-        return "\n\n".join(full_text).strip()
-    except Exception as e:
-        tb = traceback.format_exc()
-        return f"⚠️ 擷取失敗：{e}\n\n詳細 Traceback：\n{tb}"
-iface = gr.Interface(
     fn=extract_text,
-    inputs=gr.File(label="Upload your PDF"),
-    outputs=gr.Textbox(label="Extracted Text", lines=20),
-    title="PDF Text Extractor with OCR",
-    description="Upload a PDF and extract ALL text (including images) using Tesseract OCR."
 )
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
 import fitz  # PyMuPDF
+def extract_text(pdf_file):
+    # Open the PDF; the uploaded file's .name attribute is what gets passed to fitz.open
+    doc = fitz.open(pdf_file.name)
+    full_text = ""
+    # Plain-text extraction: concatenate page.get_text() for every page (no OCR step)
+    for page in doc:
+        full_text += page.get_text()
+    # Drop non-ASCII characters (keeps only ASCII text such as English letters, digits, punctuation)
+    filtered = full_text.encode("ascii", errors="ignore").decode()
+    # Write the result to a txt file; the fixed relative path "output.txt" is what gets returned
+    out_path = "output.txt"
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(filtered)
+    return out_path
+# Build the Gradio interface
+demo = gr.Interface(
     fn=extract_text,
+    inputs=gr.File(label="Upload PDF (.pdf)"),
+    outputs=gr.File(label="Download TXT"),
+    title="PDF → TXT (English only)",
+    description="Extract English text from PDF (純文字抽取) and download as .txt"
 )
 if __name__ == "__main__":
+    demo.launch()