Spaces:
Paused
Paused
tesseract ocr fix
Browse files
- build.sh +31 -18
- requirements.txt +7 -4
- src/main.py +7 -1
build.sh
CHANGED
|
@@ -10,9 +10,14 @@ echo "Installing Tesseract and dependencies..."
|
|
| 10 |
apt-get update && apt-get install -y \
|
| 11 |
tesseract-ocr \
|
| 12 |
tesseract-ocr-eng \
|
|
|
|
| 13 |
libtesseract-dev \
|
| 14 |
libleptonica-dev \
|
| 15 |
-
pkg-config
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# Verify tesseract installation
|
| 18 |
if ! command -v tesseract &> /dev/null; then
|
|
@@ -21,34 +26,42 @@ if ! command -v tesseract &> /dev/null; then
|
|
| 21 |
fi
|
| 22 |
echo "Tesseract version: $(tesseract --version)"
|
| 23 |
|
| 24 |
-
# Set
|
| 25 |
-
TESSDATA_PREFIX=
|
| 26 |
-
if [ -z "$TESSDATA_PREFIX" ]; then
|
| 27 |
-
echo "Could not find tessdata directory!"
|
| 28 |
-
exit 1
|
| 29 |
-
fi
|
| 30 |
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
| 31 |
-
export TESSDATA_PREFIX
|
| 32 |
|
| 33 |
# Add TESSDATA_PREFIX to environment for persistence
|
| 34 |
echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
|
| 35 |
|
| 36 |
-
# Verify tessdata directory
|
| 37 |
if [ ! -d "$TESSDATA_PREFIX" ]; then
|
| 38 |
-
echo "
|
| 39 |
-
|
| 40 |
fi
|
|
|
|
| 41 |
echo "Tessdata directory contents:"
|
| 42 |
ls -l $TESSDATA_PREFIX
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
echo "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
pip uninstall -y tesserocr || true
|
| 47 |
-
pip install --no-binary :all: tesserocr
|
| 48 |
|
| 49 |
-
# Install
|
| 50 |
-
echo "Installing
|
| 51 |
-
pip install
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
# Install Python dependencies
|
| 54 |
echo "Installing Python dependencies..."
|
|
@@ -57,7 +70,7 @@ pip install -e .
|
|
| 57 |
# Create .env file if it doesn't exist
|
| 58 |
if [ ! -f .env ]; then
|
| 59 |
echo "Creating .env file..."
|
| 60 |
-
cp .env.example .env
|
| 61 |
fi
|
| 62 |
|
| 63 |
echo "Build process completed successfully!"
|
|
|
|
| 10 |
apt-get update && apt-get install -y \
|
| 11 |
tesseract-ocr \
|
| 12 |
tesseract-ocr-eng \
|
| 13 |
+
tesseract-ocr-osd \
|
| 14 |
libtesseract-dev \
|
| 15 |
libleptonica-dev \
|
| 16 |
+
pkg-config \
|
| 17 |
+
build-essential
|
| 18 |
+
|
| 19 |
+
# Create tessdata directory if it doesn't exist
|
| 20 |
+
mkdir -p /usr/share/tesseract-ocr/4.00/tessdata
|
| 21 |
|
| 22 |
# Verify tesseract installation
|
| 23 |
if ! command -v tesseract &> /dev/null; then
|
|
|
|
| 26 |
fi
|
| 27 |
echo "Tesseract version: $(tesseract --version)"
|
| 28 |
|
| 29 |
+
# Set and export TESSDATA_PREFIX
|
| 30 |
+
export TESSDATA_PREFIX="/usr/share/tesseract-ocr/4.00/tessdata"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
|
|
|
| 32 |
|
| 33 |
# Add TESSDATA_PREFIX to environment for persistence
|
| 34 |
echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
|
| 35 |
|
| 36 |
+
# Verify tessdata directory and contents
|
| 37 |
if [ ! -d "$TESSDATA_PREFIX" ]; then
|
| 38 |
+
echo "Creating tessdata directory..."
|
| 39 |
+
mkdir -p "$TESSDATA_PREFIX"
|
| 40 |
fi
|
| 41 |
+
|
| 42 |
echo "Tessdata directory contents:"
|
| 43 |
ls -l $TESSDATA_PREFIX
|
| 44 |
|
| 45 |
+
# Test Tesseract functionality
|
| 46 |
+
echo "Testing Tesseract functionality..."
|
| 47 |
+
echo "Hello World" > test.png
|
| 48 |
+
if ! tesseract test.png stdout; then
|
| 49 |
+
echo "Tesseract test failed!"
|
| 50 |
+
exit 1
|
| 51 |
+
fi
|
| 52 |
+
rm test.png
|
| 53 |
+
|
| 54 |
+
# Clean any existing tesserocr installation
|
| 55 |
+
echo "Cleaning existing tesserocr installation..."
|
| 56 |
pip uninstall -y tesserocr || true
|
|
|
|
| 57 |
|
| 58 |
+
# Install tesserocr from source with proper configuration
|
| 59 |
+
echo "Installing tesserocr from source..."
|
| 60 |
+
CPPFLAGS=-I/usr/include/tesseract/ LDFLAGS=-L/usr/lib/x86_64-linux-gnu/ pip install --no-binary :all: tesserocr
|
| 61 |
+
|
| 62 |
+
# Verify tesserocr installation
|
| 63 |
+
echo "Verifying tesserocr installation..."
|
| 64 |
+
python3 -c "import tesserocr; print(f'tesserocr version: {tesserocr.__version__}')"
|
| 65 |
|
| 66 |
# Install Python dependencies
|
| 67 |
echo "Installing Python dependencies..."
|
|
|
|
| 70 |
# Create .env file if it doesn't exist
|
| 71 |
if [ ! -f .env ]; then
|
| 72 |
echo "Creating .env file..."
|
| 73 |
+
cp .env.example .env || echo "Warning: .env.example not found"
|
| 74 |
fi
|
| 75 |
|
| 76 |
echo "Build process completed successfully!"
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
docling==2.
|
| 2 |
gradio==5.14.0
|
| 3 |
grpcio-status==1.70.0
|
| 4 |
markdown==3.7
|
|
@@ -8,8 +8,11 @@ openai==1.61.1
|
|
| 8 |
pipdeptree==2.25.0
|
| 9 |
pytesseract==0.3.13
|
| 10 |
semchunk==2.2.2
|
|
|
|
|
|
|
|
|
|
| 11 |
tesseract==0.1.3
|
| 12 |
tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
#
|
|
|
|
| 1 |
+
docling==2.25.0
|
| 2 |
gradio==5.14.0
|
| 3 |
grpcio-status==1.70.0
|
| 4 |
markdown==3.7
|
|
|
|
| 8 |
pipdeptree==2.25.0
|
| 9 |
pytesseract==0.3.13
|
| 10 |
semchunk==2.2.2
|
| 11 |
+
Pillow>=9.0.0
|
| 12 |
+
numpy>=1.21.0
|
| 13 |
+
# Tesseract dependencies
|
| 14 |
tesseract==0.1.3
|
| 15 |
tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
|
| 16 |
+
# Additional dependencies for image processing
|
| 17 |
+
opencv-python-headless>=4.5.0 # Headless version for server environments
|
| 18 |
+
pdf2image>=1.16.0 # For PDF processing
|
src/main.py
CHANGED
|
@@ -186,7 +186,13 @@ def main():
|
|
| 186 |
[chatbot, chatbot]
|
| 187 |
)
|
| 188 |
|
| 189 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
|
| 192 |
if __name__ == "__main__":
|
|
|
|
| 186 |
[chatbot, chatbot]
|
| 187 |
)
|
| 188 |
|
| 189 |
+
demo.launch(
|
| 190 |
+
server_name="0.0.0.0",
|
| 191 |
+
server_port=7860,
|
| 192 |
+
root_path="",
|
| 193 |
+
show_error=True,
|
| 194 |
+
share=False # Explicitly disable sharing on Hugging Face
|
| 195 |
+
)
|
| 196 |
|
| 197 |
|
| 198 |
if __name__ == "__main__":
|