Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

ReneeHWT committed on Jun 13, 2025

Commit

4f83151

verified ·

1 Parent(s): 97941f0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,26 +4,42 @@ from PIL import Image
 import io
 import pytesseract
 import traceback
 def extract_text(file):
-    if file is None:
         return ""
     try:
-        pdf_bytes = file.read()
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
         full_text = ""
         for i, page in enumerate(doc, start=1):
-            # 將每頁以 300 dpi 渲染成影像
             pix = page.get_pixmap(dpi=300)
             img = Image.open(io.BytesIO(pix.tobytes("png")))
-            # OCR
             page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
-            full_text += f"--- Page {i} ---\n" + page_text + "\n\n"
         return full_text.strip()
     except Exception as e:
-        # 捕捉任何錯誤並把 traceback 一併回傳
         tb = traceback.format_exc()
-        return f"⚠️ 擷取失敗，錯誤訊息：\n{str(e)}\n\n詳細追蹤：\n{tb}"
 iface = gr.Interface(
     fn=extract_text,
@@ -34,5 +50,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    # 若要讓局域網也能存取，可用 server_name="0.0.0.0"
     iface.launch()

 import io
 import pytesseract
 import traceback
+import os
 def _read_pdf_bytes(file):
     """Return the raw PDF bytes for any of the input shapes the UI may pass."""
     if isinstance(file, (bytes, bytearray)):
         return bytes(file)
     if isinstance(file, (str, os.PathLike)):
         # Open paths directly so that a missing file is reported as a clear
         # FileNotFoundError rather than "'str' object has no attribute 'read'".
         with open(file, "rb") as handle:
             return handle.read()
     if hasattr(file, "read"):
         return file.read()
     # Last resort: a wrapper object that only exposes the path via ``.name``.
     with open(file.name, "rb") as handle:
         return handle.read()

 def extract_text(file):
     """OCR every page of a PDF and return the recognised text.

     ``file`` may be:
     - a filesystem path (``str`` or ``os.PathLike``),
     - a file-like object with a ``read()`` method,
     - raw ``bytes`` / ``bytearray`` holding the PDF,
     - an object exposing the path through a ``name`` attribute.

     Returns an empty string when ``file`` is falsy. On success, returns the
     text of all pages, each preceded by a ``--- Page N ---`` header. On any
     failure, returns a message containing the error and its traceback instead
     of raising, so the caller always receives a string.
     """
     if not file:
         return ""
     try:
         # 1. Load the PDF bytes.
         pdf_bytes = _read_pdf_bytes(file)
         sections = []
         # 2. Open the PDF with PyMuPDF; the context manager closes the
         #    document even if rendering or OCR fails part-way through.
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             # 3. Render each page to an image at 300 dpi and OCR it.
             for i, page in enumerate(doc, start=1):
                 pix = page.get_pixmap(dpi=300)
                 img = Image.open(io.BytesIO(pix.tobytes("png")))
                 page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
                 sections.append(f"--- Page {i} ---\n{page_text}\n\n")
         return "".join(sections).strip()
     except Exception as e:
         # Top-level boundary: report the failure as text rather than raising.
         # NOTE(review): the full traceback is returned to the caller; confirm
         # that exposing it is acceptable for a publicly reachable deployment.
         tb = traceback.format_exc()
         return f"⚠️ 擷取失敗，錯誤訊息：\n{e}\n\n詳細追蹤：\n{tb}"
 iface = gr.Interface(
     fn=extract_text,
 )
 if __name__ == "__main__":
     iface.launch()