ReneeHWT committed on
Commit
4f83151
·
verified ·
1 Parent(s): 97941f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -8
app.py CHANGED
@@ -4,26 +4,42 @@ from PIL import Image
4
  import io
5
  import pytesseract
6
  import traceback
 
7
 
8
  def extract_text(file):
9
- if file is None:
 
 
 
 
 
10
  return ""
11
  try:
12
- pdf_bytes = file.read()
 
 
 
 
 
 
 
 
 
13
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
14
  full_text = ""
 
 
15
  for i, page in enumerate(doc, start=1):
16
- # 將每頁以 300 dpi 渲染成影像
17
  pix = page.get_pixmap(dpi=300)
18
  img = Image.open(io.BytesIO(pix.tobytes("png")))
19
- # OCR
20
  page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
21
- full_text += f"--- Page {i} ---\n" + page_text + "\n\n"
 
22
  return full_text.strip()
 
23
  except Exception as e:
24
- # 捕捉任何錯誤並把 traceback 一併回傳
25
  tb = traceback.format_exc()
26
- return f"⚠️ 擷取失敗,錯誤訊息:\n{str(e)}\n\n詳細追蹤:\n{tb}"
27
 
28
  iface = gr.Interface(
29
  fn=extract_text,
@@ -34,5 +50,4 @@ iface = gr.Interface(
34
  )
35
 
36
  if __name__ == "__main__":
37
- # 若要讓局域網也能存取,可用 server_name="0.0.0.0"
38
  iface.launch()
 
4
  import io
5
  import pytesseract
6
  import traceback
7
+ import os
8
 
9
  def extract_text(file):
10
+ """
11
+ 支援 file 可能是:
12
+ - 路徑字串 (Gradio 3.x 回傳)
13
+ - File-like object (舊版或其他情境)
14
+ """
15
+ if not file:
16
  return ""
17
  try:
18
+ # 1. 讀取 PDF 位元組
19
+ if isinstance(file, str) and os.path.exists(file):
20
+ # file 是路徑
21
+ with open(file, "rb") as f:
22
+ pdf_bytes = f.read()
23
+ else:
24
+ # file 是類檔案物件
25
+ pdf_bytes = file.read()
26
+
27
+ # 2. 用 PyMuPDF 開啟 PDF
28
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
29
  full_text = ""
30
+
31
+ # 3. 每頁渲染、OCR
32
  for i, page in enumerate(doc, start=1):
 
33
  pix = page.get_pixmap(dpi=300)
34
  img = Image.open(io.BytesIO(pix.tobytes("png")))
 
35
  page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
36
+ full_text += f"--- Page {i} ---\n{page_text}\n\n"
37
+
38
  return full_text.strip()
39
+
40
  except Exception as e:
 
41
  tb = traceback.format_exc()
42
+ return f"⚠️ 擷取失敗,錯誤訊息:\n{e}\n\n詳細追蹤:\n{tb}"
43
 
44
  iface = gr.Interface(
45
  fn=extract_text,
 
50
  )
51
 
52
  if __name__ == "__main__":
 
53
  iface.launch()