Spaces:
Paused
Paused
# Set up a Tesseract OCR environment:
#   1. install the tesseract system packages (best effort, via apt-get),
#   2. install the Python bindings (pytesseract, tesserocr, ...),
#   3. fetch eng.traineddata into ./tessdata,
#   4. run smoke checks and persist TESSDATA_PREFIX to ./.env.
#
# Paths (tessdata/, .env) are relative to the current working directory.
# Only POSIX sh constructs are used, so the script runs under sh or bash.

# Exit on error. Steps that are deliberately best-effort are guarded with `||`.
set -e

echo "Setting up Tesseract OCR environment..."

# Install required packages if not already installed
if ! command -v tesseract >/dev/null 2>&1; then
  echo "Tesseract not found, attempting to install..."
  # apt may be unavailable or unprivileged here; failures are reported but
  # do not abort the script.
  apt-get update -y || echo "Failed to update apt, continuing anyway"
  apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev \
    || echo "Failed to install tesseract via apt, continuing anyway"
fi

# Install Python dependencies (a failure here aborts the script via set -e)
echo "Installing Python dependencies..."
pip install -q -U pytesseract pillow opencv-python-headless pdf2image
pip install -q -U google-genai
echo "Python dependencies installed successfully"

# Install tesserocr with pip
echo "Installing tesserocr..."
pip install -q -U tesserocr \
  || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."

# If tesserocr installation failed, try with specific compiler flags
if ! python -c "import tesserocr" >/dev/null 2>&1; then
  echo "Trying alternative tesserocr installation..."
  CPPFLAGS="-I/usr/local/include -I/usr/include" \
  LDFLAGS="-L/usr/local/lib -L/usr/lib" \
    pip install -q -U tesserocr \
    || echo "Failed to install tesserocr with compiler flags, continuing anyway"
fi

# Create tessdata directory if it doesn't exist
mkdir -p tessdata

# Set TESSDATA_PREFIX environment variable.
# Assignment and export are separate so a failing `pwd` is not masked by
# the exit status of `export`.
TESSDATA_PREFIX="$(pwd)/tessdata"
export TESSDATA_PREFIX
echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"

# Download eng.traineddata if it doesn't exist.
# -s (not -f) so that an empty file left behind by an earlier failed download
# is fetched again instead of being reported as present.
traineddata_url="https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
traineddata_file="tessdata/eng.traineddata"
if [ ! -s "$traineddata_file" ]; then
  echo "Downloading eng.traineddata..."
  # Download to a temporary name and move into place only on success, so a
  # partial download never masquerades as a valid model file.
  traineddata_tmp="${traineddata_file}.part"
  # curl needs -L because the GitHub /raw/ URL answers with a redirect, and
  # -f so an HTTP error page is not saved as the model.
  if { wget -O "$traineddata_tmp" "$traineddata_url" \
      || curl -fL -o "$traineddata_tmp" "$traineddata_url"; } \
    && [ -s "$traineddata_tmp" ]; then
    mv -f "$traineddata_tmp" "$traineddata_file"
    echo "Downloaded eng.traineddata"
  else
    rm -f "$traineddata_tmp"
    echo "Failed to download eng.traineddata" >&2
    exit 1
  fi
else
  echo "eng.traineddata already exists"
fi

# Try to copy to system locations (may fail in restricted environments)
for tessdata_dir in \
  "/usr/share/tesseract-ocr/4.00/tessdata" \
  "/usr/share/tesseract-ocr/tessdata" \
  "/usr/local/share/tessdata"; do
  if [ -d "$tessdata_dir" ]; then
    echo "Copying eng.traineddata to $tessdata_dir..."
    # stderr is silenced on purpose: the failure is reported by the echo.
    cp -f "$traineddata_file" "$tessdata_dir/" 2>/dev/null \
      || echo "Failed to copy to $tessdata_dir, continuing anyway"
  fi
done

# Verify Tesseract installation
echo "Verifying Tesseract installation..."
tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"

# Test tesserocr if installed
echo "Testing tesserocr..."
python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" \
  || echo "tesserocr not working, but may still be able to use pytesseract"

# Test pytesseract.
# tesseract_cmd lives in the pytesseract.pytesseract submodule; it is not
# re-exported at package level, so pytesseract.tesseract_cmd would raise
# AttributeError even when pytesseract is installed correctly.
echo "Testing pytesseract..."
python -c "import pytesseract; print(f'pytesseract path: {pytesseract.pytesseract.tesseract_cmd}')" \
  || echo "pytesseract not working"

# Add TESSDATA_PREFIX to .env file for persistence.
# Append only when this exact line is missing, so re-running the script does
# not pile up duplicate entries.
env_line="TESSDATA_PREFIX=$TESSDATA_PREFIX"
if [ ! -f .env ] || ! grep -qxF -- "$env_line" .env; then
  printf '%s\n' "$env_line" >> .env
fi

echo "Setup completed"