Spaces:
Paused
Paused
fixing the tesseract path issues for full page ocr
Browse files- app.py +77 -32
- fix_tesseract_huggingface.py +144 -0
- packages.txt +6 -0
- setup.sh +43 -24
app.py
CHANGED
|
@@ -13,12 +13,83 @@ try:
|
|
| 13 |
except ImportError:
|
| 14 |
print("python-dotenv not installed, skipping .env file loading")
|
| 15 |
|
| 16 |
-
#
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# Load Gemini API key from environment variable
|
| 24 |
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
|
@@ -29,9 +100,6 @@ if not gemini_api_key:
|
|
| 29 |
else:
|
| 30 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
| 31 |
|
| 32 |
-
# Get the current directory
|
| 33 |
-
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 34 |
-
|
| 35 |
# Add the current directory to the Python path
|
| 36 |
sys.path.append(current_dir)
|
| 37 |
|
|
@@ -53,29 +121,6 @@ except ModuleNotFoundError:
|
|
| 53 |
# Try import again
|
| 54 |
from src.main import main
|
| 55 |
|
| 56 |
-
# Function to setup Tesseract
|
| 57 |
-
def setup_tesseract():
    """Prepare the local Tesseract data directory (pre-change version).

    Steps, in order:
      1. Create ``<current_dir>/tessdata`` if it does not exist.
      2. Point ``TESSDATA_PREFIX`` at that directory when the variable is
         unset or empty.
      3. Download ``eng.traineddata`` into the directory when it is missing.

    Returns:
        None. Download failures are reported with ``print`` and never raised.
    """
    # Create tessdata directory if it doesn't exist
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)

    # Set TESSDATA_PREFIX environment variable if not already set
    if not os.environ.get('TESSDATA_PREFIX'):
        os.environ['TESSDATA_PREFIX'] = tessdata_dir
        print(f"Set TESSDATA_PREFIX to {tessdata_dir}")

    # Download eng.traineddata if it doesn't exist
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        # Download to a side file and rename on success: an interrupted
        # transfer must not leave a truncated eng.traineddata behind, because
        # the existence check above would then skip the download forever.
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("Downloaded eng.traineddata")
        except Exception as e:
            # Deliberate best-effort boundary: setup must never crash the app.
            print(f"Error downloading eng.traineddata: {e}")
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass
|
| 78 |
-
|
| 79 |
# Call setup function at import time
|
| 80 |
setup_tesseract()
|
| 81 |
|
|
|
|
| 13 |
except ImportError:
|
| 14 |
print("python-dotenv not installed, skipping .env file loading")
|
| 15 |
|
| 16 |
+
# Get the current directory
|
| 17 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 18 |
+
|
| 19 |
+
# Function to setup Tesseract
|
| 20 |
+
def setup_tesseract():
    """Set up the Tesseract OCR environment for this process.

    Steps, in order:
      1. Create a ``tessdata`` directory next to this file.
      2. Download ``eng.traineddata`` into it when missing.
      3. If ``TESSDATA_PREFIX`` is unset or empty, point it at the first
         candidate directory that actually contains ``eng.traineddata``
         (local directory first, then common system locations); fall back to
         the local directory when none does.
      4. Point ``pytesseract`` at a tesseract executable, if pytesseract is
         importable.
      5. Try importing ``tesserocr`` and print its reported version.

    Returns:
        None. Every failure is reported with ``print``; nothing is raised for
        download, pytesseract or tesserocr problems.
    """
    # Imported here so the function does not depend on the module header,
    # which is not guaranteed to import these names.
    import shutil
    import urllib.request

    print("Setting up Tesseract OCR environment...")

    # Create tessdata directory if it doesn't exist
    base_dir = os.path.dirname(os.path.abspath(__file__))
    tessdata_dir = os.path.join(base_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)

    # Download eng.traineddata if it doesn't exist in our local tessdata.
    # This runs BEFORE choosing TESSDATA_PREFIX so the choice can be based on
    # where the model file really is.
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        # Download to a side file and rename on success so an interrupted
        # transfer never leaves a truncated eng.traineddata that later runs
        # would mistake for a valid one.
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("Downloaded eng.traineddata")
        except Exception as e:
            # Deliberate best-effort boundary: setup must never crash the app.
            print(f"Error downloading eng.traineddata: {e}")
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass

    # Set TESSDATA_PREFIX environment variable if not already set
    if not os.environ.get('TESSDATA_PREFIX'):
        possible_tessdata_dirs = [
            tessdata_dir,  # Our local tessdata directory
            "/usr/share/tesseract-ocr/4.00/tessdata",  # Common location in Hugging Face
            "/usr/share/tesseract-ocr/tessdata",  # Another common location
            "/usr/local/share/tessdata",  # Standard installation location
        ]

        # Use the first directory that holds the English model. Testing only
        # for directory existence would always select the local directory
        # (it was created above), making the system locations unreachable.
        for dir_path in possible_tessdata_dirs:
            if os.path.isfile(os.path.join(dir_path, "eng.traineddata")):
                os.environ['TESSDATA_PREFIX'] = dir_path
                print(f"Set TESSDATA_PREFIX to {dir_path}")
                break
        else:
            # No candidate has the model; use our local directory
            os.environ['TESSDATA_PREFIX'] = tessdata_dir
            print(f"No existing tessdata directory found, set TESSDATA_PREFIX to {tessdata_dir}")

    # Configure pytesseract
    try:
        import pytesseract
    except ImportError:
        print("pytesseract not installed")
    else:
        # Prefer whatever is on PATH, then fall back to well-known locations.
        tesseract_cmd = shutil.which("tesseract")
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
        else:
            common_locations = [
                "/usr/bin/tesseract",
                "/usr/local/bin/tesseract",
                "/app/tesseract/tesseract",
            ]
            for location in common_locations:
                if os.path.isfile(location) and os.access(location, os.X_OK):
                    pytesseract.pytesseract.tesseract_cmd = location
                    print(f"Set pytesseract.tesseract_cmd to {location}")
                    break
            else:
                print("Warning: Could not find tesseract executable")

    # Try to import tesserocr to verify it's working
    try:
        import tesserocr
        print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("tesserocr not installed or not working")
    except Exception as e:
        print(f"Error importing tesserocr: {e}")
|
| 93 |
|
| 94 |
# Load Gemini API key from environment variable
|
| 95 |
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
|
|
|
| 100 |
else:
|
| 101 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
| 102 |
|
|
|
|
|
|
|
|
|
|
| 103 |
# Add the current directory to the Python path
|
| 104 |
sys.path.append(current_dir)
|
| 105 |
|
|
|
|
| 121 |
# Try import again
|
| 122 |
from src.main import main
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# Call setup function at import time
|
| 125 |
setup_tesseract()
|
| 126 |
|
fix_tesseract_huggingface.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
"""
|
| 3 |
+
Script to diagnose and fix Tesseract issues in Hugging Face environments.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import shutil
|
| 9 |
+
import subprocess
|
| 10 |
+
import platform
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import urllib.request
|
| 13 |
+
|
| 14 |
+
def diagnose_tesseract():
    """Print a diagnostic report about the Tesseract installation.

    The report covers, in order: the operating system, whether ``tesseract``
    is on ``PATH`` (and the first line of its ``--version`` output), which of
    a fixed list of common install paths hold an executable, the state of
    ``TESSDATA_PREFIX`` and its ``eng.traineddata``, and whether
    ``pytesseract`` and ``tesserocr`` can be imported.

    Returns:
        None. Results are written to stdout only; nothing is modified.
    """
    print("=== Tesseract Diagnostics ===")

    # Check OS
    print(f"Operating System: {platform.system()} {platform.release()}")

    # Check if tesseract is in PATH
    tesseract_path = shutil.which("tesseract")
    if tesseract_path:
        print(f"✅ Tesseract found in PATH: {tesseract_path}")
        try:
            # Run the exact binary that was resolved above.
            version = subprocess.check_output(
                [tesseract_path, "--version"],
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
        except (subprocess.SubprocessError, OSError) as e:
            # OSError also covers PermissionError and similar launch failures.
            print(f"❌ Error running tesseract: {e}")
        else:
            # Guard against empty output, which would make [0] raise.
            version_lines = version.splitlines()
            banner = version_lines[0] if version_lines else "(no version output)"
            print(f"✅ Tesseract version info:\n{banner}")
    else:
        print("❌ Tesseract not found in PATH")

    # Check common installation locations
    common_locations = [
        "/usr/bin/tesseract",
        "/usr/local/bin/tesseract",
        "/opt/conda/bin/tesseract",
        "/app/tesseract/tesseract",
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    ]

    for location in common_locations:
        if os.path.isfile(location) and os.access(location, os.X_OK):
            print(f"✅ Tesseract executable found at: {location}")

    # Check TESSDATA_PREFIX
    tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
    if tessdata_prefix:
        print(f"✅ TESSDATA_PREFIX is set to: {tessdata_prefix}")
        if os.path.exists(tessdata_prefix):
            print("✅ TESSDATA_PREFIX directory exists")
            eng_traineddata = os.path.join(tessdata_prefix, "eng.traineddata")
            if os.path.exists(eng_traineddata):
                print(f"✅ eng.traineddata found at: {eng_traineddata}")
            else:
                print(f"❌ eng.traineddata not found at: {eng_traineddata}")
        else:
            print(f"❌ TESSDATA_PREFIX directory does not exist: {tessdata_prefix}")
    else:
        print("❌ TESSDATA_PREFIX environment variable not set")

    # Check pytesseract
    try:
        import pytesseract
        print("✅ pytesseract is installed")
        print(f"✅ pytesseract.tesseract_cmd = {pytesseract.pytesseract.tesseract_cmd}")
    except ImportError:
        print("❌ pytesseract is not installed")

    # Check tesserocr
    try:
        import tesserocr
        print("✅ tesserocr is installed")
        print(f"✅ tesserocr version: {tesserocr.tesseract_version()}")
    except ImportError:
        print("❌ tesserocr is not installed")
    except Exception as e:
        # The import can fail for non-ImportError reasons; report them too.
        print(f"❌ Error importing tesserocr: {e}")
|
| 81 |
+
|
| 82 |
+
def fix_tesseract():
    """Apply the standard fixes for a broken Tesseract setup.

    Steps, in order:
      1. Create a ``tessdata`` directory next to this script.
      2. Set ``TESSDATA_PREFIX`` (for this process) to that directory,
         overriding any existing value.
      3. Download ``eng.traineddata`` into it when missing.
      4. Point ``pytesseract`` at a tesseract executable, if pytesseract is
         importable.
      5. Record ``TESSDATA_PREFIX=<dir>`` in the ``.env`` file of the current
         working directory, once.

    Returns:
        None. Every failure is reported with ``print``; nothing is raised for
        download, pytesseract or ``.env`` problems.
    """
    print("\n=== Fixing Tesseract Issues ===")

    # Create local tessdata directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    tessdata_dir = os.path.join(current_dir, "tessdata")
    os.makedirs(tessdata_dir, exist_ok=True)
    print(f"✅ Created local tessdata directory: {tessdata_dir}")

    # Set TESSDATA_PREFIX to our local directory
    os.environ['TESSDATA_PREFIX'] = tessdata_dir
    print(f"✅ Set TESSDATA_PREFIX to: {tessdata_dir}")

    # Download eng.traineddata
    eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
    if not os.path.exists(eng_traineddata):
        # Download to a side file and rename on success so an interrupted
        # transfer never leaves a truncated eng.traineddata that later runs
        # would report as "already exists".
        partial_path = eng_traineddata + ".part"
        try:
            print("Downloading eng.traineddata...")
            url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, eng_traineddata)
            print("✅ Downloaded eng.traineddata")
        except Exception as e:
            # Deliberate best-effort boundary: keep going with the other fixes.
            print(f"❌ Error downloading eng.traineddata: {e}")
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass
    else:
        print("✅ eng.traineddata already exists")

    # Configure pytesseract
    try:
        import pytesseract
    except ImportError:
        print("❌ pytesseract not installed, please install it with: pip install pytesseract")
    else:
        tesseract_path = shutil.which("tesseract")
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            print(f"✅ Set pytesseract.tesseract_cmd to {tesseract_path}")
        else:
            # Try common locations
            common_locations = [
                "/usr/bin/tesseract",
                "/usr/local/bin/tesseract",
                "/app/tesseract/tesseract",
            ]
            for location in common_locations:
                if os.path.isfile(location) and os.access(location, os.X_OK):
                    pytesseract.pytesseract.tesseract_cmd = location
                    print(f"✅ Set pytesseract.tesseract_cmd to {location}")
                    break
            else:
                # Report the miss instead of finishing silently.
                print("❌ Could not find tesseract executable")

    # Add TESSDATA_PREFIX to .env file for persistence.
    # The key must be spelled exactly TESSDATA_PREFIX, and the line is only
    # appended when it is not already there so repeated runs do not pile up
    # duplicates.
    env_line = f"TESSDATA_PREFIX={tessdata_dir}"
    try:
        existing_lines = []
        if os.path.exists(".env"):
            with open(".env", encoding="utf-8", errors="replace") as f:
                existing_lines = f.read().splitlines()
        if env_line in existing_lines:
            print("✅ TESSDATA_PREFIX already present in .env file")
        else:
            with open(".env", "a", encoding="utf-8") as f:
                f.write(f"\n{env_line}\n")
            print("✅ Added TESSDATA_PREFIX to .env file")
    except OSError as e:
        print(f"❌ Error adding TESSDATA_PREFIX to .env file: {e}")

    print("\n=== Tesseract Fix Complete ===")
    print("Please restart your application for changes to take effect.")
|
| 141 |
+
|
| 142 |
+
if __name__ == "__main__":
    # Script entry point: report the current state first, then apply fixes.
    for step in (diagnose_tesseract, fix_tesseract):
        step()
|
packages.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr
|
| 2 |
+
tesseract-ocr-eng
|
| 3 |
+
libtesseract-dev
|
| 4 |
+
libleptonica-dev
|
| 5 |
+
imagemagick
|
| 6 |
+
poppler-utils
|
setup.sh
CHANGED
|
@@ -5,10 +5,28 @@ set -e
|
|
| 5 |
|
| 6 |
echo "Setting up Tesseract OCR environment..."
|
| 7 |
|
| 8 |
-
# Install
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
pip install -q -U google-genai
|
| 11 |
-
echo "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Create tessdata directory if it doesn't exist
|
| 14 |
mkdir -p tessdata
|
|
@@ -20,33 +38,34 @@ echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
|
|
| 20 |
# Download eng.traineddata if it doesn't exist
|
| 21 |
if [ ! -f "tessdata/eng.traineddata" ]; then
|
| 22 |
echo "Downloading eng.traineddata..."
|
| 23 |
-
wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
|
|
|
| 24 |
echo "Downloaded eng.traineddata"
|
| 25 |
else
|
| 26 |
echo "eng.traineddata already exists"
|
| 27 |
fi
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Verify Tesseract installation
|
| 36 |
echo "Verifying Tesseract installation..."
|
| 37 |
-
tesseract --version || echo "Tesseract not found in PATH"
|
| 38 |
-
|
| 39 |
-
# Test
|
| 40 |
-
echo "Testing
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
cat test_output.txt
|
| 49 |
-
fi
|
| 50 |
-
fi
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
| 5 |
|
| 6 |
echo "Setting up Tesseract OCR environment..."
|
| 7 |
|
| 8 |
+
# Install required packages if not already installed.
# Output is silenced with the POSIX form `>/dev/null 2>&1` rather than the
# bash-only `&>`: under a plain `sh`, `&>` runs the command in the background
# and the `if !` test no longer reflects its result.
if ! command -v tesseract >/dev/null 2>&1; then
    echo "Tesseract not found, attempting to install..."
    # apt may be unavailable or require root; both steps are best-effort.
    apt-get update -y || echo "Failed to update apt, continuing anyway"
    apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway"
fi

# Install Python dependencies
echo "Installing Python dependencies..."
pip install -q -U pytesseract pillow opencv-python-headless pdf2image
pip install -q -U google-genai
echo "Python dependencies installed successfully"

# Install tesserocr with pip
echo "Installing tesserocr..."
pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."

# If tesserocr installation failed, try with specific compiler flags
if ! python -c "import tesserocr" >/dev/null 2>&1; then
    echo "Trying alternative tesserocr installation..."
    CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway"
fi

# Create tessdata directory if it doesn't exist
mkdir -p tessdata
|
|
|
|
| 38 |
# Download eng.traineddata if it doesn't exist.
# The file is fetched to a .part name and renamed only on success, so a failed
# transfer cannot leave an empty or truncated eng.traineddata that later runs
# would treat as "already exists".
if [ ! -f "tessdata/eng.traineddata" ]; then
    echo "Downloading eng.traineddata..."
    traineddata_url="https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
    # curl needs -L to follow GitHub's redirect for /raw/ URLs and -f to fail
    # on HTTP errors instead of saving the error page.
    wget -O tessdata/eng.traineddata.part "$traineddata_url" || \
        curl -fL -o tessdata/eng.traineddata.part "$traineddata_url"
    mv -f tessdata/eng.traineddata.part tessdata/eng.traineddata
    echo "Downloaded eng.traineddata"
else
    echo "eng.traineddata already exists"
fi

# Try to copy to system locations (may fail in restricted environments)
for tessdata_dir in "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do
    if [ -d "$tessdata_dir" ]; then
        echo "Copying eng.traineddata to $tessdata_dir..."
        cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway"
    fi
done

# Verify Tesseract installation
echo "Verifying Tesseract installation..."
tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"

# Test tesserocr if installed
echo "Testing tesserocr..."
python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract"

# Test pytesseract.
# tesseract_cmd lives on the inner pytesseract.pytesseract module; reading it
# from the top-level package raises AttributeError and would always report
# "pytesseract not working".
echo "Testing pytesseract..."
python -c "import pytesseract; print(f'pytesseract path: {pytesseract.pytesseract.tesseract_cmd}')" || echo "pytesseract not working"

echo "Setup completed"

# Add TESSDATA_PREFIX to .env file for persistence.
# Only append when the exact line is missing, so re-running the script does
# not accumulate duplicate entries.
env_line="TESSDATA_PREFIX=$(pwd)/tessdata"
if ! grep -qxF -- "$env_line" .env 2>/dev/null; then
    echo "$env_line" >> .env
fi
|