#!/bin/bash # Exit on error set -e echo "Setting up Tesseract OCR environment..." # Install required packages if not already installed if ! command -v tesseract &> /dev/null; then echo "Tesseract not found, attempting to install..." apt-get update -y || echo "Failed to update apt, continuing anyway" apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway" fi # Install Python dependencies echo "Installing Python dependencies..." pip install -q -U pytesseract pillow opencv-python-headless pdf2image pip install -q -U google-genai echo "Python dependencies installed successfully" # Install tesserocr with pip echo "Installing tesserocr..." pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..." # If tesserocr installation failed, try with specific compiler flags if ! python -c "import tesserocr" &> /dev/null; then echo "Trying alternative tesserocr installation..." CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway" fi # Create tessdata directory if it doesn't exist mkdir -p tessdata # Set TESSDATA_PREFIX environment variable export TESSDATA_PREFIX="$(pwd)/tessdata" echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX" # Download eng.traineddata if it doesn't exist if [ ! -f "tessdata/eng.traineddata" ]; then echo "Downloading eng.traineddata..." wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata || \ curl -o tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata echo "Downloaded eng.traineddata" else echo "eng.traineddata already exists" fi # Try to copy to system locations (may fail in restricted environments) for tessdata_dir in "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do if [ -d "$tessdata_dir" ]; then echo "Copying eng.traineddata to $tessdata_dir..." cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway" fi done # Verify Tesseract installation echo "Verifying Tesseract installation..." tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python" # Test tesserocr if installed echo "Testing tesserocr..." python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract" # Test pytesseract echo "Testing pytesseract..." python -c "import pytesseract; print(f'pytesseract path: {pytesseract.tesseract_cmd}')" || echo "pytesseract not working" echo "Setup completed" # Add TESSDATA_PREFIX to .env file for persistence echo "TESSDATA_PREFIX=$(pwd)/tessdata" >> .env