import tempfile
from pathlib import Path

import fitz  # PyMuPDF
import gradio as gr


def extract_text(pdf_file) -> str:
    """Extract the ASCII-only text of an uploaded PDF into a ``.txt`` file.

    Args:
        pdf_file: The upload handed over by the ``gr.File`` input. Either a
            filesystem path string or an object exposing the path through a
            ``.name`` attribute (which form arrives depends on the Gradio
            version/configuration, so both are accepted).

    Returns:
        Path of the generated ``output.txt`` as a string.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please upload a PDF file.")

    # Accept both a plain path and a file-like wrapper with ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Plain-text extraction; the context manager closes the document even
    # if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text() for page in doc)

    # Drop every non-ASCII character (keeps only English letters, digits
    # and punctuation).
    filtered = full_text.encode("ascii", errors="ignore").decode()

    # Write into a fresh per-request directory so concurrent requests do not
    # overwrite each other, while the download is still named "output.txt".
    out_path = Path(tempfile.mkdtemp()) / "output.txt"
    out_path.write_text(filtered, encoding="utf-8")
    return str(out_path)


# Build the Gradio interface.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.File(label="Upload PDF (.pdf)"),
    outputs=gr.File(label="Download TXT"),
    title="PDF → TXT (English only)",
    description="Extract English text from PDF (純文字抽取) and download as .txt",
)

if __name__ == "__main__":
    demo.launch()