ReneeHWT committed on
Commit
97941f0
·
verified ·
1 Parent(s): 4ada510

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -10
app.py CHANGED
@@ -3,27 +3,36 @@ import fitz # PyMuPDF
3
  from PIL import Image
4
  import io
5
  import pytesseract
 
6
 
7
  def extract_text(file):
8
  if file is None:
9
  return ""
10
- pdf_bytes = file.read()
11
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
12
- full_text = ""
13
- for page in doc:
14
- pix = page.get_pixmap(dpi=300)
15
- img = Image.open(io.BytesIO(pix.tobytes("png")))
16
- full_text += pytesseract.image_to_string(img, lang='eng+chi_tra') + "\n\n"
17
- return full_text
 
 
 
 
 
 
 
 
18
 
19
  iface = gr.Interface(
20
  fn=extract_text,
21
- # 這裡用新 API,直接用 gr.File 與 gr.Textbox
22
  inputs=gr.File(label="Upload your PDF"),
23
- outputs=gr.Textbox(label="Extracted Text"),
24
  title="PDF Text Extractor with OCR",
25
  description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
26
  )
27
 
28
  if __name__ == "__main__":
 
29
  iface.launch()
 
3
  from PIL import Image
4
  import io
5
  import pytesseract
6
+ import traceback
7
 
8
  def extract_text(file):
9
  if file is None:
10
  return ""
11
+ try:
12
+ pdf_bytes = file.read()
13
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
14
+ full_text = ""
15
+ for i, page in enumerate(doc, start=1):
16
+ # 將每頁以 300 dpi 渲染成影像
17
+ pix = page.get_pixmap(dpi=300)
18
+ img = Image.open(io.BytesIO(pix.tobytes("png")))
19
+ # OCR
20
+ page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
21
+ full_text += f"--- Page {i} ---\n" + page_text + "\n\n"
22
+ return full_text.strip()
23
+ except Exception as e:
24
+ # 捕捉任何錯誤並把 traceback 一併回傳
25
+ tb = traceback.format_exc()
26
+ return f"⚠️ 擷取失敗,錯誤訊息:\n{str(e)}\n\n詳細追蹤:\n{tb}"
27
 
28
  iface = gr.Interface(
29
  fn=extract_text,
 
30
  inputs=gr.File(label="Upload your PDF"),
31
+ outputs=gr.Textbox(label="Extracted Text", lines=20),
32
  title="PDF Text Extractor with OCR",
33
  description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
34
  )
35
 
36
  if __name__ == "__main__":
37
+ # To allow access from the local network as well, use server_name="0.0.0.0"
38
  iface.launch()