Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

ReneeHWT committed on Jun 13, 2025

Commit

c58425c

verified ·

1 Parent(s): 4f83151

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,43 +10,42 @@ def extract_text(file):
     """
     支援 file 可能是：
     - 路徑字串 (Gradio 3.x 回傳)
-    - File-like object (舊版或其他情境)
     """
     if not file:
         return ""
     try:
-        # 1. 讀取 PDF 位元組
         if isinstance(file, str) and os.path.exists(file):
-            # file 是路徑
             with open(file, "rb") as f:
                 pdf_bytes = f.read()
         else:
-            # file 是類檔案物件
             pdf_bytes = file.read()
-        # 2. 用 PyMuPDF 開啟 PDF
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-        full_text = ""
-        # 3. 每頁渲染、OCR
         for i, page in enumerate(doc, start=1):
             pix = page.get_pixmap(dpi=300)
             img = Image.open(io.BytesIO(pix.tobytes("png")))
-            page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
-            full_text += f"--- Page {i} ---\n{page_text}\n\n"
-        return full_text.strip()
     except Exception as e:
         tb = traceback.format_exc()
-        return f"⚠️ 擷取失敗，錯誤訊息：\n{e}\n\n詳細追蹤：\n{tb}"
 iface = gr.Interface(
     fn=extract_text,
     inputs=gr.File(label="Upload your PDF"),
     outputs=gr.Textbox(label="Extracted Text", lines=20),
     title="PDF Text Extractor with OCR",
-    description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
 )
 if __name__ == "__main__":

     """
     支援 file 可能是：
     - 路徑字串 (Gradio 3.x 回傳)
+    - 類檔案物件 (早期回傳)
     """
     if not file:
         return ""
     try:
+        # 1. 讀取 PDF bytes
         if isinstance(file, str) and os.path.exists(file):
             with open(file, "rb") as f:
                 pdf_bytes = f.read()
         else:
+            # file 可能是 uploaded file-like
             pdf_bytes = file.read()
+        # 2. 用 PyMuPDF 開啟
         doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = []
+        # 3. 每頁渲染 & OCR
         for i, page in enumerate(doc, start=1):
             pix = page.get_pixmap(dpi=300)
             img = Image.open(io.BytesIO(pix.tobytes("png")))
+            text = pytesseract.image_to_string(img, lang='eng+chi_tra')
+            full_text.append(f"--- Page {i} ---\n{text}")
+        return "\n\n".join(full_text).strip()
     except Exception as e:
         tb = traceback.format_exc()
+        return f"⚠️ 擷取失敗：{e}\n\n詳細 Traceback：\n{tb}"
 iface = gr.Interface(
     fn=extract_text,
     inputs=gr.File(label="Upload your PDF"),
     outputs=gr.Textbox(label="Extracted Text", lines=20),
     title="PDF Text Extractor with OCR",
+    description="Upload a PDF and extract ALL text (including images) using Tesseract OCR."
 )
 if __name__ == "__main__":