Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,43 +10,42 @@ def extract_text(file):
|
|
| 10 |
"""
|
| 11 |
支援 file 可能是:
|
| 12 |
- 路徑字串 (Gradio 3.x 回傳)
|
| 13 |
-
-
|
| 14 |
"""
|
| 15 |
if not file:
|
| 16 |
return ""
|
| 17 |
try:
|
| 18 |
-
# 1. 讀取 PDF
|
| 19 |
if isinstance(file, str) and os.path.exists(file):
|
| 20 |
-
# file 是路徑
|
| 21 |
with open(file, "rb") as f:
|
| 22 |
pdf_bytes = f.read()
|
| 23 |
else:
|
| 24 |
-
# file
|
| 25 |
pdf_bytes = file.read()
|
| 26 |
|
| 27 |
-
# 2. 用 PyMuPDF 開啟
|
| 28 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 29 |
-
full_text =
|
| 30 |
|
| 31 |
-
# 3.
|
| 32 |
for i, page in enumerate(doc, start=1):
|
| 33 |
pix = page.get_pixmap(dpi=300)
|
| 34 |
img = Image.open(io.BytesIO(pix.tobytes("png")))
|
| 35 |
-
|
| 36 |
-
full_text
|
| 37 |
|
| 38 |
-
return full_text.strip()
|
| 39 |
|
| 40 |
except Exception as e:
|
| 41 |
tb = traceback.format_exc()
|
| 42 |
-
return f"⚠️
|
| 43 |
|
| 44 |
iface = gr.Interface(
|
| 45 |
fn=extract_text,
|
| 46 |
inputs=gr.File(label="Upload your PDF"),
|
| 47 |
outputs=gr.Textbox(label="Extracted Text", lines=20),
|
| 48 |
title="PDF Text Extractor with OCR",
|
| 49 |
-
description="Upload a PDF and extract ALL text (including
|
| 50 |
)
|
| 51 |
|
| 52 |
if __name__ == "__main__":
|
|
|
|
| 10 |
"""
|
| 11 |
支援 file 可能是:
|
| 12 |
- 路徑字串 (Gradio 3.x 回傳)
|
| 13 |
+
- 類檔案物件 (早期回傳)
|
| 14 |
"""
|
| 15 |
if not file:
|
| 16 |
return ""
|
| 17 |
try:
|
| 18 |
+
# 1. 讀取 PDF bytes
|
| 19 |
if isinstance(file, str) and os.path.exists(file):
|
|
|
|
| 20 |
with open(file, "rb") as f:
|
| 21 |
pdf_bytes = f.read()
|
| 22 |
else:
|
| 23 |
+
# file 可能是 uploaded file-like
|
| 24 |
pdf_bytes = file.read()
|
| 25 |
|
| 26 |
+
# 2. 用 PyMuPDF 開啟
|
| 27 |
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
|
| 28 |
+
full_text = []
|
| 29 |
|
| 30 |
+
# 3. 每頁渲染 & OCR
|
| 31 |
for i, page in enumerate(doc, start=1):
|
| 32 |
pix = page.get_pixmap(dpi=300)
|
| 33 |
img = Image.open(io.BytesIO(pix.tobytes("png")))
|
| 34 |
+
text = pytesseract.image_to_string(img, lang='eng+chi_tra')
|
| 35 |
+
full_text.append(f"--- Page {i} ---\n{text}")
|
| 36 |
|
| 37 |
+
return "\n\n".join(full_text).strip()
|
| 38 |
|
| 39 |
except Exception as e:
|
| 40 |
tb = traceback.format_exc()
|
| 41 |
+
return f"⚠️ 擷取失敗:{e}\n\n詳細 Traceback:\n{tb}"
|
| 42 |
|
| 43 |
iface = gr.Interface(
|
| 44 |
fn=extract_text,
|
| 45 |
inputs=gr.File(label="Upload your PDF"),
|
| 46 |
outputs=gr.Textbox(label="Extracted Text", lines=20),
|
| 47 |
title="PDF Text Extractor with OCR",
|
| 48 |
+
description="Upload a PDF and extract ALL text (including images) using Tesseract OCR."
|
| 49 |
)
|
| 50 |
|
| 51 |
if __name__ == "__main__":
|