Spaces:
Runtime error
Runtime error
Refactor OCR configuration in DoclingParser to use EasyOCR exclusively
Browse files
- Simplified the OCR method configuration by removing Tesseract support and defaulting to EasyOCR.
- Updated the supported OCR methods list to reflect the removal of Tesseract, enhancing clarity and maintainability.
- Improved logging to indicate the use of EasyOCR for CPU-only processing.
src/parsers/docling_parser.py
CHANGED
|
@@ -183,21 +183,9 @@ class DoclingParser(DocumentParser):
|
|
| 183 |
pipeline_options.do_table_structure = True
|
| 184 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 185 |
|
| 186 |
-
# Configure OCR method -
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
import subprocess
|
| 190 |
-
subprocess.run(["tesseract", "--version"], capture_output=True, check=True)
|
| 191 |
-
pipeline_options.ocr_options = TesseractOcrOptions()
|
| 192 |
-
logger.info("Using Tesseract OCR (CPU-only)")
|
| 193 |
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
| 194 |
-
logger.warning("Tesseract not available, falling back to EasyOCR")
|
| 195 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
| 196 |
-
logger.info("Using EasyOCR (CPU-only)")
|
| 197 |
-
else:
|
| 198 |
-
# Default to EasyOCR (including docling_easyocr and docling_default)
|
| 199 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
| 200 |
-
logger.info("Using EasyOCR (CPU-only)")
|
| 201 |
|
| 202 |
# Configure advanced features
|
| 203 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
|
@@ -238,13 +226,8 @@ class DoclingParser(DocumentParser):
|
|
| 238 |
pipeline_options.do_table_structure = True
|
| 239 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 240 |
|
| 241 |
-
# Configure OCR method
|
| 242 |
-
|
| 243 |
-
pipeline_options.ocr_options = TesseractOcrOptions()
|
| 244 |
-
elif ocr_method == "docling_easyocr":
|
| 245 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
| 246 |
-
else: # Default to EasyOCR
|
| 247 |
-
pipeline_options.ocr_options = EasyOcrOptions()
|
| 248 |
|
| 249 |
# Configure advanced features
|
| 250 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
|
@@ -308,21 +291,10 @@ class DoclingParser(DocumentParser):
|
|
| 308 |
@classmethod
|
| 309 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
| 310 |
"""Return list of supported OCR methods."""
|
| 311 |
-
|
| 312 |
{
|
| 313 |
"id": "docling_default",
|
| 314 |
-
"name": "
|
| 315 |
-
"default_params": {
|
| 316 |
-
"enable_tables": True,
|
| 317 |
-
"enable_code_enrichment": False,
|
| 318 |
-
"enable_formula_enrichment": False,
|
| 319 |
-
"enable_picture_classification": False,
|
| 320 |
-
"generate_picture_images": False
|
| 321 |
-
}
|
| 322 |
-
},
|
| 323 |
-
{
|
| 324 |
-
"id": "docling_easyocr",
|
| 325 |
-
"name": "Docling EasyOCR",
|
| 326 |
"default_params": {
|
| 327 |
"enable_tables": True,
|
| 328 |
"enable_code_enrichment": False,
|
|
@@ -332,26 +304,6 @@ class DoclingParser(DocumentParser):
|
|
| 332 |
}
|
| 333 |
}
|
| 334 |
]
|
| 335 |
-
|
| 336 |
-
# Add Tesseract method if available (requires system installation)
|
| 337 |
-
try:
|
| 338 |
-
import subprocess
|
| 339 |
-
subprocess.run(["tesseract", "--version"], capture_output=True, check=True)
|
| 340 |
-
methods.append({
|
| 341 |
-
"id": "docling_tesseract",
|
| 342 |
-
"name": "Docling Tesseract OCR",
|
| 343 |
-
"default_params": {
|
| 344 |
-
"enable_tables": True,
|
| 345 |
-
"enable_code_enrichment": False,
|
| 346 |
-
"enable_formula_enrichment": False,
|
| 347 |
-
"enable_picture_classification": False,
|
| 348 |
-
"generate_picture_images": False
|
| 349 |
-
}
|
| 350 |
-
})
|
| 351 |
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
| 352 |
-
logger.debug("Tesseract not available on system")
|
| 353 |
-
|
| 354 |
-
return methods
|
| 355 |
|
| 356 |
@classmethod
|
| 357 |
def get_description(cls) -> str:
|
|
|
|
| 183 |
pipeline_options.do_table_structure = True
|
| 184 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 185 |
|
| 186 |
+
# Configure OCR method - use EasyOCR with CPU enforcement
|
| 187 |
+
pipeline_options.ocr_options = EasyOcrOptions()
|
| 188 |
+
logger.info("Using EasyOCR (CPU-only)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
# Configure advanced features
|
| 191 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
|
|
|
| 226 |
pipeline_options.do_table_structure = True
|
| 227 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 228 |
|
| 229 |
+
# Configure OCR method - use EasyOCR
|
| 230 |
+
pipeline_options.ocr_options = EasyOcrOptions()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
# Configure advanced features
|
| 233 |
pipeline_options.do_table_structure = kwargs.get('enable_tables', True)
|
|
|
|
| 291 |
@classmethod
|
| 292 |
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
| 293 |
"""Return list of supported OCR methods."""
|
| 294 |
+
return [
|
| 295 |
{
|
| 296 |
"id": "docling_default",
|
| 297 |
+
"name": "EasyOCR",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
"default_params": {
|
| 299 |
"enable_tables": True,
|
| 300 |
"enable_code_enrichment": False,
|
|
|
|
| 304 |
}
|
| 305 |
}
|
| 306 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
|
| 308 |
@classmethod
|
| 309 |
def get_description(cls) -> str:
|