Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

ReneeHWT committed on Jun 13, 2025

Commit

530ec5f

verified ·

1 Parent(s): f5b1291

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,34 +1,39 @@
 import gradio as gr
-from pdf2image import convert_from_bytes
 import pytesseract
 def extract_text(file):
     """
     Extracts text from a PDF file using OCR.
-    - Converts PDF pages to images.
     - Runs Tesseract OCR on each image.
     """
     if file is None:
         return ""
-    # Read the raw bytes of the PDF
     pdf_bytes = file.read()
-    # Convert each page to an image (dpi=300 for better OCR accuracy)
-    images = convert_from_bytes(pdf_bytes, dpi=300)
-    # Extract text with Tesseract OCR and accumulate it
-    text = ""
-    for img in images:
-        text += pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
-    return text
-# Build the Gradio interface
 iface = gr.Interface(
     fn=extract_text,
     inputs=gr.inputs.File(label="Upload your PDF"),
     outputs=gr.outputs.Textbox(label="Extracted Text"),
     title="PDF Text Extractor with OCR",
-    description="Upload a PDF file and extract all text (including from images) using Tesseract OCR."
 )
 if __name__ == "__main__":
-    # To generate a public link, change this to iface.launch(share=True)
     iface.launch()

+pip install gradio pymupdf pillow pytesseract
 import gradio as gr
+import fitz  # PyMuPDF
+from PIL import Image
+import io
 import pytesseract
 def extract_text(file):
     """
     Extracts text from a PDF file using OCR.
+    - Uses PyMuPDF to render each page as an image.
     - Runs Tesseract OCR on each image.
     """
     if file is None:
         return ""
     pdf_bytes = file.read()
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    full_text = ""
+    for page in doc:
+        # Render the page at 300 dpi
+        pix = page.get_pixmap(dpi=300)
+        img_data = pix.tobytes("png")
+        img = Image.open(io.BytesIO(img_data))
+        # OCR with English and Traditional Chinese language support
+        page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
+        full_text += page_text + "\n\n"
+    return full_text
 # Build the Gradio interface: a PDF file upload wired to extract_text, with the result shown in a textbox.
 iface = gr.Interface(
     fn=extract_text,
     inputs=gr.inputs.File(label="Upload your PDF"),
     outputs=gr.outputs.Textbox(label="Extracted Text"),
     title="PDF Text Extractor with OCR",
+    description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
 )
 if __name__ == "__main__":
     # Launch the interface only when this file is run as a script.
     iface.launch()