| from pathlib import Path |
|
|
| from docling.datamodel.base_models import InputFormat |
| from docling.datamodel.pipeline_options import ( |
| AcceleratorDevice, |
| AcceleratorOptions, |
| PdfPipelineOptions, |
| ) |
| from docling.datamodel.settings import settings |
| from docling.document_converter import DocumentConverter, PdfFormatOption |
| from docling_core.types.doc import ImageRefMode |
|
|
| from .settings import ENABLE_DEBUG_MODE |
|
|
# Root directory for Docling debug artifacts: it is handed to
# ``settings.debug.debug_output_path`` and searched by ``convert_docling``.
DOCLING_DEBUG_PATH: Path = Path("/tmp/docling")
|
|
| |
# Accelerator configuration: 8 threads, device chosen via AcceleratorDevice.AUTO.
accelerator_options = AcceleratorOptions(
    device=AcceleratorDevice.AUTO,
    num_threads=8,
)

# PDF pipeline configuration: OCR, table structure and formula enrichment are
# switched on, picture images are generated, and images_scale is set to 2.0.
pipeline_options = PdfPipelineOptions(
    accelerator_options=accelerator_options,
    do_ocr=True,
    do_table_structure=True,
    do_formula_enrichment=True,
    generate_picture_images=True,
    images_scale=2.0,
)
|
|
| |
# Global Docling debug settings: debug output goes under DOCLING_DEBUG_PATH,
# and both visualization flags follow the project's ENABLE_DEBUG_MODE switch.
settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
settings.debug.visualize_layout = settings.debug.visualize_tables = ENABLE_DEBUG_MODE
|
|
| |
# Shared converter instance, built once at import time; PDF inputs use the
# pipeline options configured in this module.
_format_options = {
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
}
docling_converter = DocumentConverter(format_options=_format_options)
|
|
|
|
def convert_docling(path: str, file_name: str) -> tuple[str, list[Path]]:
    """Convert a document with Docling and collect its debug images.

    Args:
        path: Location of the source document; passed unchanged to the
            module-level ``docling_converter``.
        file_name: Name used to locate the debug directory
            ``DOCLING_DEBUG_PATH / f"debug_{file_name}"``.

    Returns:
        A ``(markdown, debug_image_paths)`` pair: the converted document
        exported as Markdown with images embedded, and the sorted list of
        ``.png`` files found in the debug directory (empty when that
        directory does not exist).
    """
    result = docling_converter.convert(path)
    markdown = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)

    # NOTE(review): presumably Docling names its debug directory after the
    # input file's stem -- confirm callers pass a matching ``file_name``.
    debug_image_dir = DOCLING_DEBUG_PATH / f"debug_{file_name}"

    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # the path exists but is a regular file.
    if not debug_image_dir.is_dir():
        return markdown, []

    # Sorted so the result does not depend on filesystem enumeration order.
    debug_image_paths = sorted(
        entry for entry in debug_image_dir.iterdir() if entry.suffix == ".png"
    )
    return markdown, debug_image_paths
|
|