ReneeHWT committed on
Commit
c58425c
·
verified ·
1 Parent(s): 4f83151

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -12
app.py CHANGED
@@ -10,43 +10,42 @@ def extract_text(file):
10
  """
11
  支援 file 可能是:
12
  - 路徑字串 (Gradio 3.x 回傳)
13
- - File-like object (舊版或其他情境)
14
  """
15
  if not file:
16
  return ""
17
  try:
18
- # 1. 讀取 PDF 位元組
19
  if isinstance(file, str) and os.path.exists(file):
20
- # file 是路徑
21
  with open(file, "rb") as f:
22
  pdf_bytes = f.read()
23
  else:
24
- # file 是類檔案物件
25
  pdf_bytes = file.read()
26
 
27
- # 2. 用 PyMuPDF 開啟 PDF
28
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
29
- full_text = ""
30
 
31
- # 3. 每頁渲染、OCR
32
  for i, page in enumerate(doc, start=1):
33
  pix = page.get_pixmap(dpi=300)
34
  img = Image.open(io.BytesIO(pix.tobytes("png")))
35
- page_text = pytesseract.image_to_string(img, lang='eng+chi_tra')
36
- full_text += f"--- Page {i} ---\n{page_text}\n\n"
37
 
38
- return full_text.strip()
39
 
40
  except Exception as e:
41
  tb = traceback.format_exc()
42
- return f"⚠️ 擷取失敗,錯誤訊息:\n{e}\n\n詳細追蹤:\n{tb}"
43
 
44
  iface = gr.Interface(
45
  fn=extract_text,
46
  inputs=gr.File(label="Upload your PDF"),
47
  outputs=gr.Textbox(label="Extracted Text", lines=20),
48
  title="PDF Text Extractor with OCR",
49
- description="Upload a PDF and extract ALL text (including from images) using Tesseract OCR."
50
  )
51
 
52
  if __name__ == "__main__":
 
10
  """
11
  支援 file 可能是:
12
  - 路徑字串 (Gradio 3.x 回傳)
13
+ - 類檔案物件 (早期回傳)
14
  """
15
  if not file:
16
  return ""
17
  try:
18
+ # 1. 讀取 PDF bytes
19
  if isinstance(file, str) and os.path.exists(file):
 
20
  with open(file, "rb") as f:
21
  pdf_bytes = f.read()
22
  else:
23
+ # file 可能是 uploaded file-like
24
  pdf_bytes = file.read()
25
 
26
+ # 2. 用 PyMuPDF 開啟
27
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
28
+ full_text = []
29
 
30
+ # 3. 每頁渲染 & OCR
31
  for i, page in enumerate(doc, start=1):
32
  pix = page.get_pixmap(dpi=300)
33
  img = Image.open(io.BytesIO(pix.tobytes("png")))
34
+ text = pytesseract.image_to_string(img, lang='eng+chi_tra')
35
+ full_text.append(f"--- Page {i} ---\n{text}")
36
 
37
+ return "\n\n".join(full_text).strip()
38
 
39
  except Exception as e:
40
  tb = traceback.format_exc()
41
+ return f"⚠️ 擷取失敗:{e}\n\n詳細 Traceback:\n{tb}"
42
 
43
  iface = gr.Interface(
44
  fn=extract_text,
45
  inputs=gr.File(label="Upload your PDF"),
46
  outputs=gr.Textbox(label="Extracted Text", lines=20),
47
  title="PDF Text Extractor with OCR",
48
+ description="Upload a PDF and extract ALL text (including images) using Tesseract OCR."
49
  )
50
 
51
  if __name__ == "__main__":