Spaces:
Paused
Paused
Change setup.sh to log where Tesseract is located in the Hugging Face Space
Browse files
- setup.sh +8 -1
- src/parsers/docling_parser.py +4 -48
setup.sh
CHANGED
|
@@ -49,4 +49,11 @@ if [ -f "test.png" ]; then
|
|
| 49 |
fi
|
| 50 |
fi
|
| 51 |
|
| 52 |
-
echo "Setup completed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
fi
|
| 50 |
fi
|
| 51 |
|
| 52 |
+
echo "Setup completed"
|
| 53 |
+
|
| 54 |
+
# Add these diagnostic commands at the end of your setup.sh
|
| 55 |
+
echo "Checking Tesseract location:"
|
| 56 |
+
which tesseract || echo "Tesseract not found in PATH"
|
| 57 |
+
whereis tesseract
|
| 58 |
+
echo "Current PATH: $PATH"
|
| 59 |
+
echo "TESSDATA_PREFIX: $TESSDATA_PREFIX"
|
src/parsers/docling_parser.py
CHANGED
|
@@ -123,30 +123,6 @@ class DoclingParser(DocumentParser):
|
|
| 123 |
|
| 124 |
def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
|
| 125 |
"""Apply full force OCR to a document."""
|
| 126 |
-
import subprocess
|
| 127 |
-
import os
|
| 128 |
-
|
| 129 |
-
# Try to find tesseract binary
|
| 130 |
-
tesseract_cmd = 'tesseract'
|
| 131 |
-
try:
|
| 132 |
-
# Check if tesseract is available
|
| 133 |
-
subprocess.run([tesseract_cmd, '--version'],
|
| 134 |
-
stdout=subprocess.PIPE,
|
| 135 |
-
stderr=subprocess.PIPE,
|
| 136 |
-
check=True)
|
| 137 |
-
except (subprocess.SubprocessError, FileNotFoundError):
|
| 138 |
-
# Try common locations in Hugging Face environment
|
| 139 |
-
potential_paths = [
|
| 140 |
-
'/usr/bin/tesseract',
|
| 141 |
-
'/usr/local/bin/tesseract',
|
| 142 |
-
'/opt/conda/bin/tesseract'
|
| 143 |
-
]
|
| 144 |
-
|
| 145 |
-
for path in potential_paths:
|
| 146 |
-
if os.path.exists(path):
|
| 147 |
-
tesseract_cmd = path
|
| 148 |
-
break
|
| 149 |
-
|
| 150 |
input_doc = Path(file_path)
|
| 151 |
|
| 152 |
pipeline_options = PdfPipelineOptions()
|
|
@@ -154,22 +130,9 @@ class DoclingParser(DocumentParser):
|
|
| 154 |
pipeline_options.do_table_structure = True
|
| 155 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 156 |
|
| 157 |
-
|
| 158 |
-
ocr_options = TesseractCliOcrOptions(
|
| 159 |
-
force_full_page_ocr=True,
|
| 160 |
-
tesseract_cmd=tesseract_cmd
|
| 161 |
-
)
|
| 162 |
pipeline_options.ocr_options = ocr_options
|
| 163 |
|
| 164 |
-
# Set tessdata prefix if not already set
|
| 165 |
-
if not os.environ.get('TESSDATA_PREFIX'):
|
| 166 |
-
for prefix in ['/usr/share/tesseract-ocr/4.00/tessdata',
|
| 167 |
-
'/usr/share/tessdata',
|
| 168 |
-
'/usr/local/share/tessdata']:
|
| 169 |
-
if os.path.exists(prefix):
|
| 170 |
-
os.environ['TESSDATA_PREFIX'] = prefix
|
| 171 |
-
break
|
| 172 |
-
|
| 173 |
converter = DocumentConverter(
|
| 174 |
format_options={
|
| 175 |
InputFormat.PDF: PdfFormatOption(
|
|
@@ -178,16 +141,9 @@ class DoclingParser(DocumentParser):
|
|
| 178 |
}
|
| 179 |
)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
return doc.export_to_markdown()
|
| 184 |
-
except Exception as e:
|
| 185 |
-
# Provide more helpful error message
|
| 186 |
-
error_msg = str(e)
|
| 187 |
-
if "Tesseract is not available" in error_msg:
|
| 188 |
-
return f"Error: Tesseract OCR could not be found. Tried path: {tesseract_cmd}. Please ensure Tesseract is installed and in your PATH."
|
| 189 |
-
return f"Error during full force OCR: {error_msg}"
|
| 190 |
|
| 191 |
|
| 192 |
# Register the parser with the registry
|
| 193 |
-
ParserRegistry.register(DoclingParser)
|
|
|
|
| 123 |
|
| 124 |
def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
|
| 125 |
"""Apply full force OCR to a document."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
input_doc = Path(file_path)
|
| 127 |
|
| 128 |
pipeline_options = PdfPipelineOptions()
|
|
|
|
| 130 |
pipeline_options.do_table_structure = True
|
| 131 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 132 |
|
| 133 |
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
pipeline_options.ocr_options = ocr_options
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
converter = DocumentConverter(
|
| 137 |
format_options={
|
| 138 |
InputFormat.PDF: PdfFormatOption(
|
|
|
|
| 141 |
}
|
| 142 |
)
|
| 143 |
|
| 144 |
+
doc = converter.convert(input_doc).document
|
| 145 |
+
return doc.export_to_markdown()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
# Register the parser with the registry
|
| 149 |
+
ParserRegistry.register(DoclingParser)
|