Spaces:

ReneeHWT
/

PDF_OCR_Extract

Sleeping

ReneeHWT committed on Jun 13, 2025

Commit

4ada510

verified ·

1 Parent(s): 2628b24

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,11 +5,6 @@ import io
 import pytesseract
 def extract_text(file):
-    """
-    Extracts text from a PDF file using OCR.
-    - Uses PyMuPDF to render each page as an image.
-    - Runs Tesseract OCR on each image.
-    """
     if file is None:
         return ""
     pdf_bytes = file.read()
@@ -18,14 +13,14 @@ def extract_text(file):
     for page in doc:
         pix = page.get_pixmap(dpi=300)
         img = Image.open(io.BytesIO(pix.tobytes("png")))
-        page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
-        full_text += page_text + "\n\n"
     return full_text
 iface = gr.Interface(
     fn=extract_text,
-    inputs=gr.inputs.File(label="Upload your PDF"),
-    outputs=gr.outputs.Textbox(label="Extracted Text"),
     title="PDF Text Extractor with OCR",
     description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
 )

 import pytesseract
 def extract_text(file):
     if file is None:
         return ""
     pdf_bytes = file.read()
     for page in doc:
         pix = page.get_pixmap(dpi=300)
         img = Image.open(io.BytesIO(pix.tobytes("png")))
+        full_text += pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
     return full_text
 iface = gr.Interface(
     fn=extract_text,
+    # 這裡用新 API，直接用 gr.File 與 gr.Textbox
+    inputs=gr.File(label="Upload your PDF"),
+    outputs=gr.Textbox(label="Extracted Text"),
     title="PDF Text Extractor with OCR",
     description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
 )