Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

ReneeHWT committed on Jun 13, 2025

Commit

97941f0

verified ·

1 Parent(s): 4ada510

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,27 +3,36 @@ import fitz  # PyMuPDF
 from PIL import Image
 import io
 import pytesseract
 def extract_text(file):
     if file is None:
         return ""
-    pdf_bytes = file.read()
-    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-    full_text = ""
-    for page in doc:
-        pix = page.get_pixmap(dpi=300)
-        img = Image.open(io.BytesIO(pix.tobytes("png")))
-        full_text += pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
-    return full_text
 iface = gr.Interface(
     fn=extract_text,
-    # Use the new API here: pass gr.File and gr.Textbox directly
     inputs=gr.File(label="Upload your PDF"),
-    outputs=gr.Textbox(label="Extracted Text"),
     title="PDF Text Extractor with OCR",
     description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
 )
 if __name__ == "__main__":
     iface.launch()

 from PIL import Image
 import io
 import pytesseract
+import traceback
 def extract_text(file):
     if file is None:
         return ""
+    try:
+        pdf_bytes = file.read()
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = ""
+        for i, page in enumerate(doc, start=1):
+            # Render each page to an image at 300 dpi
+            pix = page.get_pixmap(dpi=300)
+            img = Image.open(io.BytesIO(pix.tobytes("png")))
+            # Run OCR on the rendered page image
+            page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
+            full_text += f"--- Page {i} ---\n" + page_text + "\n\n"
+        return full_text.strip()
+    except Exception as e:
+        # Catch any error and return its message together with the traceback
+        tb = traceback.format_exc()
+        return f"⚠️ 擷取失敗，錯誤訊息：\n{str(e)}\n\n詳細追蹤：\n{tb}"
 iface = gr.Interface(
     fn=extract_text,
     inputs=gr.File(label="Upload your PDF"),
+    outputs=gr.Textbox(label="Extracted Text", lines=20),
     title="PDF Text Extractor with OCR",
     description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
 )
 if __name__ == "__main__":
+    # To make the app reachable from the local network as well, pass server_name="0.0.0.0"
     iface.launch()