Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,52 +1,33 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
-
from PIL import Image
|
| 4 |
-
import io
|
| 5 |
-
import pytesseract
|
| 6 |
-
import traceback
|
| 7 |
-
import os
|
| 8 |
|
| 9 |
-
def extract_text(
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
- 類檔案物件 (早期回傳)
|
| 14 |
-
"""
|
| 15 |
-
if not file:
|
| 16 |
-
return ""
|
| 17 |
-
try:
|
| 18 |
-
# 1. 讀取 PDF bytes
|
| 19 |
-
if isinstance(file, str) and os.path.exists(file):
|
| 20 |
-
with open(file, "rb") as f:
|
| 21 |
-
pdf_bytes = f.read()
|
| 22 |
-
else:
|
| 23 |
-
# file 可能是 uploaded file-like
|
| 24 |
-
pdf_bytes = file.read()
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
full_text
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
pix = page.get_pixmap(dpi=300)
|
| 33 |
-
img = Image.open(io.BytesIO(pix.tobytes("png")))
|
| 34 |
-
text = pytesseract.image_to_string(img, lang='eng+chi_tra')
|
| 35 |
-
full_text.append(f"--- Page {i} ---\n{text}")
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
| 40 |
-
tb = traceback.format_exc()
|
| 41 |
-
return f"⚠️ 擷取失敗:{e}\n\n詳細 Traceback:\n{tb}"
|
| 42 |
|
| 43 |
-
|
|
|
|
| 44 |
fn=extract_text,
|
| 45 |
-
inputs=gr.File(label="Upload
|
| 46 |
-
outputs=gr.
|
| 47 |
-
title="PDF
|
| 48 |
-
description="
|
| 49 |
)
|
| 50 |
|
| 51 |
if __name__ == "__main__":
|
| 52 |
-
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
def extract_text(pdf_file):
    """Extract the ASCII-only text layer of a PDF into a .txt file.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing that path as ``.name``.
            A falsy value (``None`` / empty string) means nothing was
            uploaded.

    Returns:
        Path of a newly created UTF-8 text file holding the extracted text
        with every non-ASCII character removed, or ``None`` when no file
        was supplied.
    """
    import os
    import tempfile

    # Nothing uploaded: return early instead of failing on `.name`.
    if not pdf_file:
        return None

    # NOTE(review): the upload may arrive as a plain path or as a wrapper
    # object with `.name`, depending on how gr.File is configured - both
    # are accepted here.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Open the PDF in a context manager so the document is closed even if
    # text extraction raises.
    with fitz.open(pdf_path) as doc:
        # Plain text extraction (no OCR), page by page.
        full_text = "".join(page.get_text() for page in doc)

    # Drop every non-ASCII character (keeps English letters, digits and
    # punctuation only).
    filtered = full_text.encode("ascii", errors="ignore").decode("ascii")

    # Write to a unique temp file rather than a shared "output.txt" so that
    # concurrent requests cannot overwrite each other's result.
    fd, out_path = tempfile.mkstemp(prefix="output_", suffix=".txt")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(filtered)

    return out_path
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
# Build the Gradio interface: a PDF upload as input, a TXT file as output.
demo = gr.Interface(
    title="PDF → TXT (English only)",
    description="Extract English text from PDF (純文字抽取) and download as .txt",
    fn=extract_text,
    inputs=gr.File(label="Upload PDF (.pdf)"),
    outputs=gr.File(label="Download TXT"),
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|