Desung committed on
Commit
8450ca5
·
verified ·
1 Parent(s): b02438a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -5
app.py CHANGED
@@ -35,13 +35,53 @@ import traceback
35
  # ====================== Hugging Face Tesseract 环境 ==========================
36
  import pytesseract
37
  import os
 
38
 
39
- # 硬编码 Hugging Face 环境 Tesseract 路径
40
- pytesseract.pytesseract.tesseract_cmd = r'/user/bin/tesseract'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # 验证路径是否存在(调试用)
43
- if not os.path.exists(pytesseract.pytesseract.tesseract_cmd):
44
- raise RuntimeError("Tesseract not found! Check packages.txt and rebuild.")
45
 
46
  # ====================== Module 1: PDF/Image to Markdown ======================
47
  class PDFImageToMarkdown:
 
35
  # ====================== Hugging Face Tesseract 环境 ==========================
36
  import pytesseract
37
  import os
38
+ import shutil
39
 
40
+ # 根据环境自动检测 Tesseract 路径
41
def setup_tesseract():
    """Point pytesseract at a usable Tesseract executable.

    Resolution order:
      1. ``shutil.which('tesseract')``, i.e. whatever the current PATH offers.
      2. A short list of well-known absolute install locations.

    If neither step finds an executable, troubleshooting hints are printed
    and ``pytesseract.pytesseract.tesseract_cmd`` is set to the bare command
    name ``'tesseract'``. No exception is raised here; the failure is left
    for pytesseract to report when OCR is actually attempted.

    Returns:
        None. The result is the assignment to
        ``pytesseract.pytesseract.tesseract_cmd``.
    """
    # Absolute locations tried only after the PATH lookup has failed.
    possible_paths = [
        '/usr/bin/tesseract',            # Hugging Face Spaces / standard Linux
        '/usr/local/bin/tesseract',      # local install on Linux/macOS
        '/opt/homebrew/bin/tesseract',   # macOS Homebrew
        '/opt/tesseract/bin/tesseract',  # some Linux distributions
    ]

    # Preferred: resolve through PATH, which respects the environment's setup.
    tesseract_path = shutil.which('tesseract')
    if tesseract_path:
        pytesseract.pytesseract.tesseract_cmd = tesseract_path
        print(f"Found Tesseract at: {tesseract_path}")
        return

    # PATH lookup failed; probe the well-known locations. Require an
    # executable regular file so a directory or a non-executable file with
    # the right name is not mistaken for the binary.
    for path in possible_paths:
        if os.path.isfile(path) and os.access(path, os.X_OK):
            pytesseract.pytesseract.tesseract_cmd = path
            print(f"Found Tesseract at: {path}")
            return

    # Nothing found: report every location that was checked (including the
    # PATH lookup) together with the usual remedies.
    print("Tesseract not found in any of the expected locations:")
    for path in [*possible_paths, 'tesseract']:
        print(f" - {path}")
    print("\nTroubleshooting steps:")
    print("1. Ensure packages.txt contains 'tesseract-ocr'")
    print("2. Try rebuilding the Space")
    print("3. Check if the base image supports apt packages")

    # Deliberately do not raise: fall back to the bare command name so the
    # application can still start and pytesseract surfaces the error itself.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
82
 
83
+ # 设置 Tesseract
84
+ setup_tesseract()
 
85
 
86
  # ====================== Module 1: PDF/Image to Markdown ======================
87
  class PDFImageToMarkdown: