"""Tests for pipeline options — build_converter, convert_document routing, service forwarding. Requires the ``docling`` library (heavy, includes torch). Tests are skipped automatically when docling is not installed (e.g. in lightweight CI environments that only install docling-core). """ from __future__ import annotations from unittest.mock import AsyncMock, MagicMock, patch import pytest docling = pytest.importorskip("docling", reason="docling library not installed") from docling.datamodel.base_models import InputFormat # noqa: E402 from docling.datamodel.pipeline_options import ( # noqa: E402 PdfPipelineOptions, TableFormerMode, ) from domain.value_objects import ConversionOptions # noqa: E402 from infra.local_converter import ( # noqa: E402 _build_docling_converter as build_converter, ) from infra.local_converter import ( # noqa: E402 _convert_sync as convert_document, ) # --------------------------------------------------------------------------- # build_converter — verifies Docling pipeline options are wired correctly # --------------------------------------------------------------------------- class TestBuildConverter: """Verify that build_converter produces a DocumentConverter with the right PdfPipelineOptions.""" def _get_pipeline_options(self, converter) -> PdfPipelineOptions: """Extract PdfPipelineOptions from a DocumentConverter.""" fmt_opt = converter.format_to_options[InputFormat.PDF] return fmt_opt.pipeline_options def test_defaults(self): conv = build_converter(ConversionOptions()) opts = self._get_pipeline_options(conv) assert opts.do_ocr is True assert opts.do_table_structure is True assert opts.table_structure_options.mode == TableFormerMode.ACCURATE assert opts.do_code_enrichment is False assert opts.do_formula_enrichment is False assert opts.do_picture_classification is False assert opts.do_picture_description is False assert opts.generate_page_images is False assert opts.generate_picture_images is False assert opts.images_scale == 1.0 def test_ocr_disabled(self): conv = build_converter(ConversionOptions(do_ocr=False)) opts = self._get_pipeline_options(conv) assert opts.do_ocr is False def test_table_mode_fast(self): conv = build_converter(ConversionOptions(table_mode="fast")) opts = self._get_pipeline_options(conv) assert opts.table_structure_options.mode == TableFormerMode.FAST def test_table_mode_accurate(self): conv = build_converter(ConversionOptions(table_mode="accurate")) opts = self._get_pipeline_options(conv) assert opts.table_structure_options.mode == TableFormerMode.ACCURATE def test_table_structure_disabled(self): conv = build_converter(ConversionOptions(do_table_structure=False)) opts = self._get_pipeline_options(conv) assert opts.do_table_structure is False def test_code_enrichment_enabled(self): conv = build_converter(ConversionOptions(do_code_enrichment=True)) opts = self._get_pipeline_options(conv) assert opts.do_code_enrichment is True def test_formula_enrichment_enabled(self): conv = build_converter(ConversionOptions(do_formula_enrichment=True)) opts = self._get_pipeline_options(conv) assert opts.do_formula_enrichment is True def test_picture_classification_enabled(self): conv = build_converter(ConversionOptions(do_picture_classification=True)) opts = self._get_pipeline_options(conv) assert opts.do_picture_classification is True def test_picture_description_enabled(self): conv = build_converter(ConversionOptions(do_picture_description=True)) opts = self._get_pipeline_options(conv) assert opts.do_picture_description is True def test_generate_picture_images(self): conv = build_converter(ConversionOptions(generate_picture_images=True)) opts = self._get_pipeline_options(conv) assert opts.generate_picture_images is True def test_generate_page_images(self): conv = build_converter(ConversionOptions(generate_page_images=True)) opts = self._get_pipeline_options(conv) assert opts.generate_page_images is True def test_images_scale(self): conv = build_converter(ConversionOptions(images_scale=2.0)) opts = self._get_pipeline_options(conv) assert opts.images_scale == 2.0 def test_all_options_combined(self): conv = build_converter( ConversionOptions( do_ocr=False, do_table_structure=True, table_mode="fast", do_code_enrichment=True, do_formula_enrichment=True, do_picture_classification=True, do_picture_description=True, generate_picture_images=True, generate_page_images=True, images_scale=1.5, ) ) opts = self._get_pipeline_options(conv) assert opts.do_ocr is False assert opts.do_table_structure is True assert opts.table_structure_options.mode == TableFormerMode.FAST assert opts.do_code_enrichment is True assert opts.do_formula_enrichment is True assert opts.do_picture_classification is True assert opts.do_picture_description is True assert opts.generate_picture_images is True assert opts.generate_page_images is True assert opts.images_scale == 1.5 # --------------------------------------------------------------------------- # convert_document — default vs custom converter routing # --------------------------------------------------------------------------- class TestConvertDocumentRouting: """Verify convert_document uses default converter for default opts, custom otherwise.""" @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_default_converter_with_all_defaults(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_get_default.return_value = mock_conv convert_document("/tmp/test.pdf", ConversionOptions()) mock_get_default.assert_called_once() mock_build.assert_not_called() @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_ocr_disabled(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv convert_document("/tmp/test.pdf", ConversionOptions(do_ocr=False)) mock_build.assert_called_once() mock_get_default.assert_not_called() @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_table_mode_fast(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv opts = ConversionOptions(table_mode="fast") convert_document("/tmp/test.pdf", opts) mock_build.assert_called_once_with(opts) @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_code_enrichment_on(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv opts = ConversionOptions(do_code_enrichment=True) convert_document("/tmp/test.pdf", opts) mock_build.assert_called_once_with(opts) @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_formula_enrichment_on(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv convert_document("/tmp/test.pdf", ConversionOptions(do_formula_enrichment=True)) mock_build.assert_called_once() @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_picture_options_on(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv convert_document("/tmp/test.pdf", ConversionOptions(do_picture_classification=True)) mock_build.assert_called_once() @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_generate_images_on(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv convert_document("/tmp/test.pdf", ConversionOptions(generate_picture_images=True)) mock_build.assert_called_once() @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_uses_custom_converter_when_images_scale_changed(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv opts = ConversionOptions(images_scale=2.0) convert_document("/tmp/test.pdf", opts) mock_build.assert_called_once_with(opts) @patch("infra.local_converter._get_default_converter") @patch("infra.local_converter._build_docling_converter") def test_forwards_all_options_to_build_converter(self, mock_build, mock_get_default): mock_conv = MagicMock() mock_result = MagicMock() mock_result.document.pages = {} mock_result.document.iterate_items.return_value = [] mock_result.document.export_to_markdown.return_value = "" mock_result.document.export_to_html.return_value = "" mock_result.document.export_to_dict.return_value = {} mock_conv.convert.return_value = mock_result mock_build.return_value = mock_conv opts = ConversionOptions( do_ocr=False, do_table_structure=False, table_mode="fast", do_code_enrichment=True, do_formula_enrichment=True, do_picture_classification=True, do_picture_description=True, generate_picture_images=True, generate_page_images=True, images_scale=1.5, ) convert_document("/tmp/test.pdf", opts) mock_build.assert_called_once_with(opts) # --------------------------------------------------------------------------- # Service layer — pipeline options forwarding # --------------------------------------------------------------------------- class TestServiceForwardsPipelineOptions: """Verify analysis_service.create and _run_analysis forward pipeline options.""" @pytest.fixture def mock_doc(self): from domain.models import Document return Document(id="d1", filename="test.pdf", storage_path="/tmp/test.pdf") @pytest.fixture def mock_job(self): from domain.models import AnalysisJob return AnalysisJob(id="j1", document_id="d1", document_filename="test.pdf") @patch("services.analysis_service.document_repo") @patch("services.analysis_service.analysis_repo") @pytest.mark.asyncio async def test_create_passes_pipeline_options_to_run( self, mock_analysis_repo, mock_doc_repo, mock_doc, ): mock_doc_repo.find_by_id = AsyncMock(return_value=mock_doc) mock_analysis_repo.insert = AsyncMock() mock_converter = AsyncMock() from services.analysis_service import AnalysisService svc = AnalysisService(converter=mock_converter) opts = {"do_ocr": False, "table_mode": "fast"} with patch("services.analysis_service.asyncio.create_task") as mock_task: await svc.create("d1", pipeline_options=opts) mock_task.assert_called_once() @patch("services.analysis_service.document_repo") @patch("services.analysis_service.analysis_repo") @pytest.mark.asyncio async def test_create_passes_none_when_no_options( self, mock_analysis_repo, mock_doc_repo, mock_doc, ): mock_doc_repo.find_by_id = AsyncMock(return_value=mock_doc) mock_analysis_repo.insert = AsyncMock() mock_converter = AsyncMock() from services.analysis_service import AnalysisService svc = AnalysisService(converter=mock_converter) with patch("services.analysis_service.asyncio.create_task") as mock_task: await svc.create("d1") mock_task.assert_called_once() @patch("services.analysis_service.analysis_repo") @patch("services.analysis_service.document_repo") @pytest.mark.asyncio async def test_run_analysis_forwards_options_to_convert( self, mock_doc_repo, mock_analysis_repo, mock_job, ): from domain.value_objects import ConversionResult, PageDetail mock_analysis_repo.find_by_id = AsyncMock(return_value=mock_job) mock_analysis_repo.update_status = AsyncMock() mock_doc_repo.update_page_count = AsyncMock() mock_converter = AsyncMock() mock_converter.convert.return_value = ConversionResult( page_count=1, content_markdown="# Test", content_html="