ReneeHWT committed on
Commit
df0821f
·
verified ·
1 Parent(s): 1cb3494

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -40
app.py CHANGED
@@ -1,52 +1,33 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
- from PIL import Image
4
- import io
5
- import pytesseract
6
- import traceback
7
- import os
8
 
9
- def extract_text(file):
10
- """
11
- 支援 file 可能是:
12
- - 路徑字串 (Gradio 3.x 回傳)
13
- - 類檔案物件 (早期回傳)
14
- """
15
- if not file:
16
- return ""
17
- try:
18
- # 1. 讀取 PDF bytes
19
- if isinstance(file, str) and os.path.exists(file):
20
- with open(file, "rb") as f:
21
- pdf_bytes = f.read()
22
- else:
23
- # file 可能是 uploaded file-like
24
- pdf_bytes = file.read()
25
 
26
- # 2. 用 PyMuPDF 開啟
27
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
28
- full_text = []
29
 
30
- # 3. 每頁渲染 & OCR
31
- for i, page in enumerate(doc, start=1):
32
- pix = page.get_pixmap(dpi=300)
33
- img = Image.open(io.BytesIO(pix.tobytes("png")))
34
- text = pytesseract.image_to_string(img, lang='eng+chi_tra')
35
- full_text.append(f"--- Page {i} ---\n{text}")
36
 
37
- return "\n\n".join(full_text).strip()
 
 
 
38
 
39
- except Exception as e:
40
- tb = traceback.format_exc()
41
- return f"⚠️ 擷取失敗:{e}\n\n詳細 Traceback:\n{tb}"
42
 
43
- iface = gr.Interface(
 
44
  fn=extract_text,
45
- inputs=gr.File(label="Upload your PDF"),
46
- outputs=gr.Textbox(label="Extracted Text", lines=20),
47
- title="PDF Text Extractor with OCR",
48
- description="Upload a PDF and extract ALL text (including images) using Tesseract OCR."
49
  )
50
 
51
  if __name__ == "__main__":
52
- iface.launch()
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
 
 
 
 
 
3
 
4
def extract_text(pdf_file):
    """Extract the ASCII-only text layer of a PDF and save it as a .txt file.

    Only the embedded text layer is read (no OCR), and every non-ASCII
    character is dropped so that the result contains English letters,
    digits and punctuation only.

    Args:
        pdf_file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path through a
            ``.name`` attribute. ``None`` or an empty value means that
            nothing was uploaded.

    Returns:
        The path of the generated ``output.txt`` file, or ``None`` when no
        file was supplied.
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    # Nothing uploaded: return "no file" instead of raising AttributeError.
    if not pdf_file:
        return None

    # Accept both a plain path and a file wrapper that carries ``.name``.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    # Plain text extraction; the context manager closes the document even
    # if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text() for page in doc)

    # Filter out non-ASCII characters (keep only letters, digits, punctuation).
    filtered = full_text.encode("ascii", errors="ignore").decode("ascii")

    # Write into a fresh per-call directory so concurrent requests never
    # overwrite each other's result, while keeping the "output.txt" name.
    out_dir = tempfile.mkdtemp(prefix="pdf_txt_")
    out_path = os.path.join(out_dir, "output.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(filtered)

    return out_path
 
 
22
 
23
# Build the Gradio interface: a PDF upload goes in, a TXT download comes out.
demo = gr.Interface(
    extract_text,
    gr.File(label="Upload PDF (.pdf)"),
    gr.File(label="Download TXT"),
    title="PDF TXT (English only)",
    description="Extract English text from PDF (純文字抽取) and download as .txt",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()