Update app.py
Browse files
app.py
CHANGED
|
@@ -35,13 +35,53 @@ import traceback
|
|
| 35 |
# ====================== Hugging Face Tesseract 环境 ==========================
|
| 36 |
import pytesseract
|
| 37 |
import os
|
|
|
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
|
| 44 |
-
raise RuntimeError("Tesseract not found! Check packages.txt and rebuild.")
|
| 45 |
|
| 46 |
# ====================== Module 1: PDF/Image to Markdown ======================
|
| 47 |
class PDFImageToMarkdown:
|
|
|
|
| 35 |
# ====================== Hugging Face Tesseract 环境 ==========================
|
| 36 |
import pytesseract
|
| 37 |
import os
|
| 38 |
+
import shutil
|
| 39 |
|
| 40 |
+
# 根据环境自动检测 Tesseract 路径
|
| 41 |
+
def setup_tesseract():
    """Point pytesseract at a Tesseract executable without ever raising.

    Lookup order:
      1. ``shutil.which('tesseract')``, i.e. the system PATH.
      2. A fixed list of common absolute install locations.

    The first hit is stored in ``pytesseract.pytesseract.tesseract_cmd`` and
    reported on stdout. If nothing is found, troubleshooting hints are printed
    and ``tesseract_cmd`` is set to the bare name ``'tesseract'`` instead of
    raising, so this function itself never fails.

    Returns:
        None. The result is the side effect on ``tesseract_cmd`` plus the
        messages printed to stdout.
    """
    # Absolute locations only: the bare command name is already covered by
    # the shutil.which() lookup below, so listing it here would make the
    # "not found" branch unreachable.
    candidate_paths = (
        '/usr/bin/tesseract',            # Hugging Face Spaces / standard Linux
        '/usr/local/bin/tesseract',      # local install on Linux/macOS
        '/opt/homebrew/bin/tesseract',   # macOS Homebrew
        '/opt/tesseract/bin/tesseract',  # some Linux distributions
    )

    # Prefer whatever the system PATH resolves to.
    tesseract_path = shutil.which('tesseract')

    if not tesseract_path:
        # PATH lookup failed; probe the well-known locations. Require an
        # executable regular file so a directory or a non-executable file
        # is not mistaken for the binary.
        tesseract_path = next(
            (
                path
                for path in candidate_paths
                if os.path.isfile(path) and os.access(path, os.X_OK)
            ),
            None,
        )

    if tesseract_path:
        pytesseract.pytesseract.tesseract_cmd = tesseract_path
        print(f"Found Tesseract at: {tesseract_path}")
        return

    # Nothing was found: explain what was tried and how to fix it.
    print("Tesseract not found in any of the expected locations:")
    for path in (*candidate_paths, 'tesseract'):
        print(f"  - {path}")
    print("\nTroubleshooting steps:")
    print("1. Ensure packages.txt contains 'tesseract-ocr'")
    print("2. Try rebuilding the Space")
    print("3. Check if the base image supports apt packages")

    # Deliberately do not raise: fall back to the bare command name and
    # leave any failure to pytesseract.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
|
| 82 |
|
| 83 |
+
# 设置 Tesseract
|
| 84 |
+
# Executed when this module is loaded. setup_tesseract() contains no raise
# statement, so a missing binary presumably only surfaces on the first
# pytesseract call (TODO confirm against pytesseract's behavior).
setup_tesseract()
|
|
|
|
| 85 |
|
| 86 |
# ====================== Module 1: PDF/Image to Markdown ======================
|
| 87 |
class PDFImageToMarkdown:
|