Spaces:
Paused
Paused
fix: download tesseract traineddata files directly from source
Browse files
build.sh
CHANGED
|
@@ -5,50 +5,73 @@ set -e
|
|
| 5 |
|
| 6 |
echo "Starting build process..."
|
| 7 |
|
| 8 |
-
# Install system dependencies for tesseract
|
| 9 |
echo "Installing Tesseract and dependencies..."
|
| 10 |
apt-get update && apt-get install -y \
|
| 11 |
tesseract-ocr \
|
| 12 |
tesseract-ocr-eng \
|
| 13 |
libtesseract-dev \
|
| 14 |
libleptonica-dev \
|
| 15 |
-
pkg-config
|
|
|
|
| 16 |
|
| 17 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
if ! command -v tesseract &> /dev/null; then
|
| 19 |
echo "Tesseract installation failed!"
|
| 20 |
exit 1
|
| 21 |
fi
|
| 22 |
echo "Tesseract version: $(tesseract --version)"
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
if [ -
|
| 27 |
-
echo "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
exit 1
|
| 29 |
fi
|
| 30 |
-
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
| 31 |
-
export TESSDATA_PREFIX
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
exit 1
|
| 40 |
fi
|
| 41 |
-
|
| 42 |
-
ls -l $TESSDATA_PREFIX
|
| 43 |
|
| 44 |
-
#
|
| 45 |
echo "Installing tesserocr from source..."
|
| 46 |
pip uninstall -y tesserocr || true
|
| 47 |
-
pip install --no-binary :all: tesserocr
|
| 48 |
|
| 49 |
-
#
|
| 50 |
-
echo "
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
# Install Python dependencies
|
| 54 |
echo "Installing Python dependencies..."
|
|
@@ -57,7 +80,7 @@ pip install -e .
|
|
| 57 |
# Create .env file if it doesn't exist
|
| 58 |
if [ ! -f .env ]; then
|
| 59 |
echo "Creating .env file..."
|
| 60 |
-
cp .env.example .env
|
| 61 |
fi
|
| 62 |
|
| 63 |
echo "Build process completed successfully!"
|
|
|
|
| 5 |
|
| 6 |
echo "Starting build process..."
|
| 7 |
|
| 8 |
+
# Install system dependencies for tesseract
|
| 9 |
echo "Installing Tesseract and dependencies..."
|
| 10 |
apt-get update && apt-get install -y \
|
| 11 |
tesseract-ocr \
|
| 12 |
tesseract-ocr-eng \
|
| 13 |
libtesseract-dev \
|
| 14 |
libleptonica-dev \
|
| 15 |
+
pkg-config \
|
| 16 |
+
wget
|
| 17 |
|
| 18 |
+
# Create tessdata directory
|
| 19 |
+
TESSDATA_DIR="${TESSDATA_DIR:-/usr/share/tesseract-ocr/4.00/tessdata}"  # overridable: newer distro packages use .../tesseract-ocr/5/tessdata
|
| 20 |
+
mkdir -p "$TESSDATA_DIR"
|
| 21 |
+
|
| 22 |
+
# Download traineddata files directly from the official repository
|
| 23 |
+
echo "Downloading Tesseract traineddata files..."
|
| 24 |
+
wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
| 25 |
+
wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata"
|
| 26 |
+
|
| 27 |
+
# Set and verify TESSDATA_PREFIX
|
| 28 |
+
export TESSDATA_PREFIX="$TESSDATA_DIR"
|
| 29 |
+
echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
|
| 30 |
+
|
| 31 |
+
# Verify tesseract installation and data files
|
| 32 |
+
echo "Verifying Tesseract installation..."
|
| 33 |
if ! command -v tesseract &> /dev/null; then
|
| 34 |
echo "Tesseract installation failed!"
|
| 35 |
exit 1
|
| 36 |
fi
|
| 37 |
echo "Tesseract version: $(tesseract --version)"
|
| 38 |
|
| 39 |
+
# Verify traineddata files
|
| 40 |
+
echo "Verifying traineddata files..."
|
| 41 |
+
if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
|
| 42 |
+
echo "eng.traineddata is missing!"
|
| 43 |
+
exit 1
|
| 44 |
+
fi
|
| 45 |
+
if [ ! -f "$TESSDATA_DIR/osd.traineddata" ]; then
|
| 46 |
+
echo "osd.traineddata is missing!"
|
| 47 |
exit 1
|
| 48 |
fi
|
|
|
|
|
|
|
| 49 |
|
| 50 |
+
echo "Traineddata files in $TESSDATA_DIR:"
|
| 51 |
+
ls -l "$TESSDATA_DIR"
|
| 52 |
|
| 53 |
+
# Test Tesseract functionality
|
| 54 |
+
echo "Testing Tesseract functionality..."
|
| 55 |
+
# A text file named test.png is not a decodable image, so OCR-ing it always fails; instead confirm the eng model is loadable from TESSDATA_PREFIX.
|
| 56 |
+
if ! tesseract --list-langs 2>&1 | grep -qx 'eng'; then
|
| 57 |
+
echo "Tesseract test failed!"
|
| 58 |
exit 1
|
| 59 |
fi
|
| 60 |
+
echo "Tesseract can see the eng language data"
|
|
|
|
| 61 |
|
| 62 |
+
# Clean and install tesserocr from source
|
| 63 |
echo "Installing tesserocr from source..."
|
| 64 |
pip uninstall -y tesserocr || true
|
| 65 |
+
CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" pip install --no-binary :all: tesserocr
|
| 66 |
|
| 67 |
+
# Verify tesserocr installation
|
| 68 |
+
echo "Verifying tesserocr installation..."
|
| 69 |
+
python3 -c "
|
| 70 |
+
import tesserocr
|
| 71 |
+
print(f'tesserocr version: {tesserocr.__version__}')
|
| 72 |
+
print(f'Available languages: {tesserocr.get_languages()}')
|
| 73 |
+
print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[0]}')  # get_languages() returns (tessdata path, languages); the path is element 0
|
| 74 |
+
"
|
| 75 |
|
| 76 |
# Install Python dependencies
|
| 77 |
echo "Installing Python dependencies..."
|
|
|
|
| 80 |
# Create .env file if it doesn't exist
|
| 81 |
if [ ! -f .env ]; then
|
| 82 |
echo "Creating .env file..."
|
| 83 |
+
cp .env.example .env || echo "Warning: .env.example not found"
|
| 84 |
fi
|
| 85 |
|
| 86 |
echo "Build process completed successfully!"
|