PDF_OCR_Extract / app.py
ReneeHWT's picture
Update app.py
df0821f verified
raw
history blame contribute delete
831 Bytes
import os
import tempfile

import fitz  # PyMuPDF
import gradio as gr
def extract_text(pdf_file):
    """Extract the ASCII-only text of an uploaded PDF into a .txt file.

    Args:
        pdf_file: The upload handed over by the Gradio ``File`` input. Either
            an object exposing the on-disk path as ``.name`` or a plain path
            string is accepted.

    Returns:
        Path (str) of a freshly written ``output.txt`` containing the text.

    Raises:
        ValueError: If no file was supplied (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        raise ValueError("No PDF file was uploaded.")

    # Uploads may arrive as a file-like wrapper (path in ``.name``) or as a
    # bare path string; fall back to the value itself when ``.name`` is absent.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Plain-text extraction. The context manager closes the document even if
    # a page fails to parse, so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text() for page in doc)

    # Drop every non-ASCII character (keeps English letters, digits,
    # punctuation and ASCII whitespace).
    filtered = full_text.encode("ascii", errors="ignore").decode("ascii")

    # Write into a per-call temporary directory so simultaneous requests do
    # not overwrite each other's result, while the download is still named
    # "output.txt". The directory is not removed here, because the returned
    # file must outlive this call.
    out_dir = tempfile.mkdtemp(prefix="pdf_txt_")
    out_path = os.path.join(out_dir, "output.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(filtered)
    return out_path
# Build the Gradio interface: one PDF upload in, one TXT download out.
_pdf_upload = gr.File(label="Upload PDF (.pdf)")
_txt_download = gr.File(label="Download TXT")

demo = gr.Interface(
    title="PDF → TXT (English only)",
    description="Extract English text from PDF (純文字抽取) and download as .txt",
    fn=extract_text,
    inputs=_pdf_upload,
    outputs=_txt_download,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()