Spaces:

Ansemin101
/

Markit

Paused

App Files Files Community

AnseMin committed on Mar 9, 2025

Commit

3860890

1 Parent(s): 292de5e

Changing setup.sh to log where Tesseract is located in the Hugging Face Space

Browse files

Files changed (2) hide show

setup.sh +8 -1
src/parsers/docling_parser.py +4 -48

setup.sh CHANGED Viewed

@@ -49,4 +49,11 @@ if [ -f "test.png" ]; then
   fi
 fi
-echo "Setup completed"

   fi
 fi
+echo "Setup completed"
+# Add these diagnostic commands at the end of your setup.sh
+echo "Checking Tesseract location:"
+which tesseract || echo "Tesseract not found in PATH"
+whereis tesseract
+echo "Current PATH: $PATH"
+echo "TESSDATA_PREFIX: $TESSDATA_PREFIX"

src/parsers/docling_parser.py CHANGED Viewed

@@ -123,30 +123,6 @@ class DoclingParser(DocumentParser):
     def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
         """Apply full force OCR to a document."""
-        import subprocess
-        import os
-        # Try to find tesseract binary
-        tesseract_cmd = 'tesseract'
-        try:
-            # Check if tesseract is available
-            subprocess.run([tesseract_cmd, '--version'],
-                          stdout=subprocess.PIPE,
-                          stderr=subprocess.PIPE,
-                          check=True)
-        except (subprocess.SubprocessError, FileNotFoundError):
-            # Try common locations in Hugging Face environment
-            potential_paths = [
-                '/usr/bin/tesseract',
-                '/usr/local/bin/tesseract',
-                '/opt/conda/bin/tesseract'
-            ]
-            for path in potential_paths:
-                if os.path.exists(path):
-                    tesseract_cmd = path
-                    break
         input_doc = Path(file_path)
         pipeline_options = PdfPipelineOptions()
@@ -154,22 +130,9 @@ class DoclingParser(DocumentParser):
         pipeline_options.do_table_structure = True
         pipeline_options.table_structure_options.do_cell_matching = True
-        # Create OCR options with explicit tesseract path
-        ocr_options = TesseractCliOcrOptions(
-            force_full_page_ocr=True,
-            tesseract_cmd=tesseract_cmd
-        )
         pipeline_options.ocr_options = ocr_options
-        # Set tessdata prefix if not already set
-        if not os.environ.get('TESSDATA_PREFIX'):
-            for prefix in ['/usr/share/tesseract-ocr/4.00/tessdata',
-                          '/usr/share/tessdata',
-                          '/usr/local/share/tessdata']:
-                if os.path.exists(prefix):
-                    os.environ['TESSDATA_PREFIX'] = prefix
-                    break
         converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(
@@ -178,16 +141,9 @@ class DoclingParser(DocumentParser):
             }
         )
-        try:
-            doc = converter.convert(input_doc).document
-            return doc.export_to_markdown()
-        except Exception as e:
-            # Provide more helpful error message
-            error_msg = str(e)
-            if "Tesseract is not available" in error_msg:
-                return f"Error: Tesseract OCR could not be found. Tried path: {tesseract_cmd}. Please ensure Tesseract is installed and in your PATH."
-            return f"Error during full force OCR: {error_msg}"
 # Register the parser with the registry
-ParserRegistry.register(DoclingParser)

     def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
         """Apply full force OCR to a document."""
         input_doc = Path(file_path)
         pipeline_options = PdfPipelineOptions()
         pipeline_options.do_table_structure = True
         pipeline_options.table_structure_options.do_cell_matching = True
+        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
         pipeline_options.ocr_options = ocr_options
         converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(
             }
         )
+        doc = converter.convert(input_doc).document
+        return doc.export_to_markdown()
 # Register the parser with the registry
+ParserRegistry.register(DoclingParser)