AnseMin committed on
Commit
b7d7d76
·
1 Parent(s): 08e21aa

tesseract ocr fix

Browse files
Files changed (3) hide show
  1. build.sh +31 -18
  2. requirements.txt +7 -4
  3. src/main.py +7 -1
build.sh CHANGED
@@ -10,9 +10,14 @@ echo "Installing Tesseract and dependencies..."
10
  apt-get update && apt-get install -y \
11
  tesseract-ocr \
12
  tesseract-ocr-eng \
 
13
  libtesseract-dev \
14
  libleptonica-dev \
15
- pkg-config
 
 
 
 
16
 
17
  # Verify tesseract installation
18
  if ! command -v tesseract &> /dev/null; then
@@ -21,34 +26,42 @@ if ! command -v tesseract &> /dev/null; then
21
  fi
22
  echo "Tesseract version: $(tesseract --version)"
23
 
24
- # Set TESSDATA_PREFIX environment variable
25
- TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
26
- if [ -z "$TESSDATA_PREFIX" ]; then
27
- echo "Could not find tessdata directory!"
28
- exit 1
29
- fi
30
  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
31
- export TESSDATA_PREFIX
32
 
33
  # Add TESSDATA_PREFIX to environment for persistence
34
  echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
35
 
36
- # Verify tessdata directory
37
  if [ ! -d "$TESSDATA_PREFIX" ]; then
38
- echo "Tessdata directory does not exist!"
39
- exit 1
40
  fi
 
41
  echo "Tessdata directory contents:"
42
  ls -l $TESSDATA_PREFIX
43
 
44
- # Uninstall any existing tesserocr and install from source
45
- echo "Installing tesserocr from source..."
 
 
 
 
 
 
 
 
 
46
  pip uninstall -y tesserocr || true
47
- pip install --no-binary :all: tesserocr
48
 
49
- # Install ocrmac
50
- echo "Installing ocrmac..."
51
- pip install ocrmac
 
 
 
 
52
 
53
  # Install Python dependencies
54
  echo "Installing Python dependencies..."
@@ -57,7 +70,7 @@ pip install -e .
57
  # Create .env file if it doesn't exist
58
  if [ ! -f .env ]; then
59
  echo "Creating .env file..."
60
- cp .env.example .env
61
  fi
62
 
63
  echo "Build process completed successfully!"
 
10
  apt-get update && apt-get install -y \
11
  tesseract-ocr \
12
  tesseract-ocr-eng \
13
+ tesseract-ocr-osd \
14
  libtesseract-dev \
15
  libleptonica-dev \
16
+ pkg-config \
17
+ build-essential
18
+
19
+ # Create tessdata directory if it doesn't exist
20
+ mkdir -p /usr/share/tesseract-ocr/4.00/tessdata
21
 
22
  # Verify tesseract installation
23
  if ! command -v tesseract &> /dev/null; then
 
26
  fi
27
  echo "Tesseract version: $(tesseract --version)"
28
 
29
+ # Set and export TESSDATA_PREFIX
30
+ export TESSDATA_PREFIX="/usr/share/tesseract-ocr/4.00/tessdata"
 
 
 
 
31
  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
 
32
 
33
  # Add TESSDATA_PREFIX to environment for persistence
34
  echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
35
 
36
+ # Verify tessdata directory and contents
37
  if [ ! -d "$TESSDATA_PREFIX" ]; then
38
+ echo "Creating tessdata directory..."
39
+ mkdir -p "$TESSDATA_PREFIX"
40
  fi
41
+
42
  echo "Tessdata directory contents:"
43
  ls -l $TESSDATA_PREFIX
44
 
45
+ # Test Tesseract functionality
46
+ echo "Testing Tesseract functionality..."
47
+ echo "Hello World" > test.png
48
+ if ! tesseract test.png stdout; then
49
+ echo "Tesseract test failed!"
50
+ exit 1
51
+ fi
52
+ rm test.png
53
+
54
+ # Clean any existing tesserocr installation
55
+ echo "Cleaning existing tesserocr installation..."
56
  pip uninstall -y tesserocr || true
 
57
 
58
+ # Install tesserocr from source with proper configuration
59
+ echo "Installing tesserocr from source..."
60
+ CPPFLAGS=-I/usr/include/tesseract/ LDFLAGS=-L/usr/lib/x86_64-linux-gnu/ pip install --no-binary :all: tesserocr
61
+
62
+ # Verify tesserocr installation
63
+ echo "Verifying tesserocr installation..."
64
+ python3 -c "import tesserocr; print(f'tesserocr version: {tesserocr.__version__}')"
65
 
66
  # Install Python dependencies
67
  echo "Installing Python dependencies..."
 
70
  # Create .env file if it doesn't exist
71
  if [ ! -f .env ]; then
72
  echo "Creating .env file..."
73
+ cp .env.example .env || echo "Warning: .env.example not found"
74
  fi
75
 
76
  echo "Build process completed successfully!"
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- docling==2.18.0
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
@@ -8,8 +8,11 @@ openai==1.61.1
8
  pipdeptree==2.25.0
9
  pytesseract==0.3.13
10
  semchunk==2.2.2
 
 
 
11
  tesseract==0.1.3
12
  tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
13
- Pillow>=9.0.0 # Required for image processing
14
- numpy>=1.21.0 # Required for image processing
15
- # Use pytesseract instead of tesserocr for cross-platform compatibility
 
1
+ docling==2.25.0
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
 
8
  pipdeptree==2.25.0
9
  pytesseract==0.3.13
10
  semchunk==2.2.2
11
+ Pillow>=9.0.0
12
+ numpy>=1.21.0
13
+ # Tesseract dependencies
14
  tesseract==0.1.3
15
  tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
16
+ # Additional dependencies for image processing
17
+ opencv-python-headless>=4.5.0 # Headless version for server environments
18
+ pdf2image>=1.16.0 # For PDF processing
src/main.py CHANGED
@@ -186,7 +186,13 @@ def main():
186
  [chatbot, chatbot]
187
  )
188
 
189
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
 
 
190
 
191
 
192
  if __name__ == "__main__":
 
186
  [chatbot, chatbot]
187
  )
188
 
189
+ demo.launch(
190
+ server_name="0.0.0.0",
191
+ server_port=7860,
192
+ root_path="",
193
+ show_error=True,
194
+ share=False # Explicitly disable sharing on Hugging Face
195
+ )
196
 
197
 
198
  if __name__ == "__main__":