File size: 1,835 Bytes
befccc3
 
 
3b879b4
 
 
 
 
 
 
 
befccc3
3b879b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
befccc3
 
3b879b4
 
 
 
 
 
 
 
befccc3
 
3b879b4
 
 
 
 
 
befccc3
 
3b879b4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
# setup.sh - Setup script for Hugging Face Spaces
#
# Installs Tesseract OCR (with the Bengali and English language packs) and
# poppler-utils through apt-get, then makes sure ben/eng traineddata files are
# present in the tessdata directory, downloading them from the
# tesseract-ocr/tessdata GitHub repository as a best-effort fallback.
#
# Needs permission to run apt-get and to write to the tessdata directory.
#
# Environment:
#   TESSDATA_DIR  Optional. Overrides the auto-detected tessdata directory.

set -euo pipefail  # Exit on errors, unset variables and failed pipeline stages

readonly TESSDATA_URL_BASE="https://github.com/tesseract-ocr/tessdata/raw/main"
# Path this script historically assumed; kept as the last-resort fallback.
readonly LEGACY_TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"

#######################################
# Print a diagnostic message to stderr.
# Arguments: $* - message text
#######################################
warn() {
    printf '%s\n' "$*" >&2
}

#######################################
# Work out which tessdata directory should be used.
# Order of preference: TESSDATA_DIR from the environment, the directory named
# in the `tesseract --list-langs` header (printed by some Tesseract versions),
# the first existing /usr/share/tesseract-ocr/*/tessdata, the legacy 4.00 path.
# Outputs: the chosen directory on stdout
#######################################
detect_tessdata_dir() {
    local reported candidate

    if [[ -n "${TESSDATA_DIR:-}" ]]; then
        printf '%s\n' "$TESSDATA_DIR"
        return 0
    fi

    # Header form: List of available languages in "/path/to/tessdata/" (N):
    # Versions that omit the path simply produce no match here.
    reported="$(tesseract --list-langs 2>&1 \
        | sed -n 's/^List of available languages in "\(.*\)".*$/\1/p')" \
        || reported=""
    reported="${reported%/}"
    if [[ -n "$reported" && -d "$reported" ]]; then
        printf '%s\n' "$reported"
        return 0
    fi

    # An unmatched glob stays literal and is rejected by the -d test.
    for candidate in /usr/share/tesseract-ocr/*/tessdata; do
        if [[ -d "$candidate" ]]; then
            printf '%s\n' "$candidate"
            return 0
        fi
    done

    printf '%s\n' "$LEGACY_TESSDATA_DIR"
}

#######################################
# Download a URL to a file with whichever of wget/curl is installed.
# Arguments: $1 - URL, $2 - destination path
# Returns:   0 on success, non-zero if the download failed or no tool exists
#######################################
fetch_url() {
    local url=$1 dest=$2

    if command -v wget >/dev/null 2>&1; then
        wget -q -O "$dest" "$url"
    elif command -v curl >/dev/null 2>&1; then
        curl -fsSL -o "$dest" "$url"
    else
        warn "Neither wget nor curl is installed"
        return 1
    fi
}

#######################################
# Make sure <lang>.traineddata exists and is non-empty in TESSDATA_DIR.
# The file is downloaded to a temporary name and renamed into place only on
# success, so a failed download never leaves an empty or partial file that
# would be mistaken for valid data on the next run.
# Globals:   TESSDATA_DIR, TESSDATA_URL_BASE (read)
# Arguments: $1 - language code, $2 - emoji prefix, $3 - language name
# Returns:   0 always; a failed download only produces a warning
#######################################
ensure_language_data() {
    local lang=$1 icon=$2 label=$3
    local target="$TESSDATA_DIR/${lang}.traineddata"
    local tmp

    # -s rather than -f: a zero-byte leftover is not usable language data.
    if [[ -s "$target" ]]; then
        return 0
    fi

    echo "$icon Installing $label language data..."
    if ! tmp="$(mktemp "$TESSDATA_DIR/.${lang}.traineddata.XXXXXX")"; then
        warn "⚠️ Warning: Could not download $label language data"
        return 0
    fi

    # mktemp creates the file owner-only; make it world-readable before it
    # becomes visible under its final name.
    if fetch_url "$TESSDATA_URL_BASE/${lang}.traineddata" "$tmp" \
        && [[ -s "$tmp" ]] \
        && chmod 644 "$tmp" \
        && mv -f -- "$tmp" "$target"; then
        return 0
    fi

    rm -f -- "$tmp"
    warn "⚠️ Warning: Could not download $label language data"
}

echo "🚀 Setting up OCR dependencies for Hugging Face Spaces..."

# Update package list
echo "📦 Updating package list..."
apt-get update -qq

# Install system dependencies for OCR
echo "🔧 Installing system dependencies..."
apt-get install -y -qq tesseract-ocr tesseract-ocr-ben tesseract-ocr-eng poppler-utils

# Verify Tesseract installation
echo "✅ Verifying Tesseract installation..."
tesseract --version

# Check available languages
echo "🌐 Available Tesseract languages:"
tesseract --list-langs

# Resolve the tessdata directory (after installation, so the packaged
# directory can be found) and create it if it doesn't exist
TESSDATA_DIR="$(detect_tessdata_dir)"
readonly TESSDATA_DIR
if [[ ! -d "$TESSDATA_DIR" ]]; then
    echo "📁 Creating tessdata directory..."
    mkdir -p "$TESSDATA_DIR"
fi

# Download additional language data if needed
echo "📥 Ensuring language data is available..."
ensure_language_data ben "🇧🇩" "Bengali"
ensure_language_data eng "🇺🇸" "English"

# Set permissions (best-effort: failure here must not abort the setup)
chmod -R 755 "$TESSDATA_DIR" 2>/dev/null || true

echo "🎉 Setup completed successfully!"
echo "📊 Final verification:"
# Check each required language on its own so that a single missing language
# still triggers the warning.
available_langs="$(tesseract --list-langs 2>&1 || true)"
missing_langs=0
for lang in ben eng; do
    if grep -qx -- "$lang" <<<"$available_langs"; then
        printf '%s\n' "$lang"
    else
        missing_langs=1
    fi
done
if (( missing_langs != 0 )); then
    warn "⚠️ Some languages may not be available"
fi