AnseMin committed on
Commit
3860890
·
1 Parent(s): 292de5e

Change setup.sh to log where Tesseract is located in the Hugging Face Space

Browse files
Files changed (2) hide show
  1. setup.sh +8 -1
  2. src/parsers/docling_parser.py +4 -48
setup.sh CHANGED
@@ -49,4 +49,11 @@ if [ -f "test.png" ]; then
49
  fi
50
  fi
51
 
52
- echo "Setup completed"
 
 
 
 
 
 
 
 
49
  fi
50
  fi
51
 
52
+ echo "Setup completed"
53
+
54
+ # Add these diagnostic commands at the end of your setup.sh
55
+ echo "Checking Tesseract location:"
56
+ which tesseract || echo "Tesseract not found in PATH"
57
+ whereis tesseract
58
+ echo "Current PATH: $PATH"
59
+ echo "TESSDATA_PREFIX: $TESSDATA_PREFIX"
src/parsers/docling_parser.py CHANGED
@@ -123,30 +123,6 @@ class DoclingParser(DocumentParser):
123
 
124
  def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
125
  """Apply full force OCR to a document."""
126
- import subprocess
127
- import os
128
-
129
- # Try to find tesseract binary
130
- tesseract_cmd = 'tesseract'
131
- try:
132
- # Check if tesseract is available
133
- subprocess.run([tesseract_cmd, '--version'],
134
- stdout=subprocess.PIPE,
135
- stderr=subprocess.PIPE,
136
- check=True)
137
- except (subprocess.SubprocessError, FileNotFoundError):
138
- # Try common locations in Hugging Face environment
139
- potential_paths = [
140
- '/usr/bin/tesseract',
141
- '/usr/local/bin/tesseract',
142
- '/opt/conda/bin/tesseract'
143
- ]
144
-
145
- for path in potential_paths:
146
- if os.path.exists(path):
147
- tesseract_cmd = path
148
- break
149
-
150
  input_doc = Path(file_path)
151
 
152
  pipeline_options = PdfPipelineOptions()
@@ -154,22 +130,9 @@ class DoclingParser(DocumentParser):
154
  pipeline_options.do_table_structure = True
155
  pipeline_options.table_structure_options.do_cell_matching = True
156
 
157
- # Create OCR options with explicit tesseract path
158
- ocr_options = TesseractCliOcrOptions(
159
- force_full_page_ocr=True,
160
- tesseract_cmd=tesseract_cmd
161
- )
162
  pipeline_options.ocr_options = ocr_options
163
 
164
- # Set tessdata prefix if not already set
165
- if not os.environ.get('TESSDATA_PREFIX'):
166
- for prefix in ['/usr/share/tesseract-ocr/4.00/tessdata',
167
- '/usr/share/tessdata',
168
- '/usr/local/share/tessdata']:
169
- if os.path.exists(prefix):
170
- os.environ['TESSDATA_PREFIX'] = prefix
171
- break
172
-
173
  converter = DocumentConverter(
174
  format_options={
175
  InputFormat.PDF: PdfFormatOption(
@@ -178,16 +141,9 @@ class DoclingParser(DocumentParser):
178
  }
179
  )
180
 
181
- try:
182
- doc = converter.convert(input_doc).document
183
- return doc.export_to_markdown()
184
- except Exception as e:
185
- # Provide more helpful error message
186
- error_msg = str(e)
187
- if "Tesseract is not available" in error_msg:
188
- return f"Error: Tesseract OCR could not be found. Tried path: {tesseract_cmd}. Please ensure Tesseract is installed and in your PATH."
189
- return f"Error during full force OCR: {error_msg}"
190
 
191
 
192
  # Register the parser with the registry
193
- ParserRegistry.register(DoclingParser)
 
123
 
124
  def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
125
  """Apply full force OCR to a document."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  input_doc = Path(file_path)
127
 
128
  pipeline_options = PdfPipelineOptions()
 
130
  pipeline_options.do_table_structure = True
131
  pipeline_options.table_structure_options.do_cell_matching = True
132
 
133
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
 
 
 
 
134
  pipeline_options.ocr_options = ocr_options
135
 
 
 
 
 
 
 
 
 
 
136
  converter = DocumentConverter(
137
  format_options={
138
  InputFormat.PDF: PdfFormatOption(
 
141
  }
142
  )
143
 
144
+ doc = converter.convert(input_doc).document
145
+ return doc.export_to_markdown()
 
 
 
 
 
 
 
146
 
147
 
148
  # Register the parser with the registry
149
+ ParserRegistry.register(DoclingParser)