Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
def extract_text(pdf_file):
    """Extract the ASCII-only text of an uploaded PDF and save it as a .txt file.

    Args:
        pdf_file: The value delivered by the Gradio ``File`` input. Either an
            object exposing the on-disk path as ``.name`` or the path string
            itself is accepted (which form arrives presumably depends on the
            Gradio version/configuration -- confirm against the deployed one).

    Returns:
        The path of the written text file, always ``"output.txt"`` relative to
        the current working directory.

    Raises:
        gr.Error: If no file was uploaded (``pdf_file`` is ``None``).
    """
    if pdf_file is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please upload a PDF file.")

    # Fall back to the value itself when it has no ``.name`` attribute.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Open the PDF; the context manager closes the document even when
    # extraction fails, so no file handle is leaked per request.
    with fitz.open(pdf_path) as doc:
        # Plain-text extraction, page by page.
        full_text = "".join(page.get_text() for page in doc)

    # Drop every non-ASCII character; all ASCII characters are kept as-is.
    filtered = full_text.encode("ascii", errors="ignore").decode()

    # Write the result to a text file.
    # NOTE: the path is fixed, so every call overwrites the same file.
    out_path = "output.txt"
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(filtered)
    return out_path
# Build the Gradio interface: one PDF upload in, one downloadable text file out.
_pdf_upload = gr.File(label="Upload PDF (.pdf)")
_txt_download = gr.File(label="Download TXT")

demo = gr.Interface(
    extract_text,
    _pdf_upload,
    _txt_download,
    title="PDF → TXT (English only)",
    description="Extract English text from PDF (純文字抽取) and download as .txt",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()