"""Tests for the ServeConverter adapter (Docling Serve HTTP client).""" from __future__ import annotations import importlib import json from unittest.mock import AsyncMock, MagicMock, patch import httpx import pytest from domain.value_objects import ConversionOptions, ConversionResult from infra.serve_converter import ( ServeConverter, _build_form_data, _extract_bbox, _parse_response, ) def _has_docling() -> bool: """Return True if the heavy docling library is available.""" return importlib.util.find_spec("docling") is not None # --------------------------------------------------------------------------- # Unit tests — form data building # --------------------------------------------------------------------------- class TestBuildFormData: def test_default_options(self): data = _build_form_data(ConversionOptions()) assert data["do_ocr"] == "true" assert data["do_table_structure"] == "true" assert data["table_mode"] == "accurate" assert data["do_code_enrichment"] == "false" assert data["do_formula_enrichment"] == "false" assert data["do_picture_classification"] == "false" assert data["do_picture_description"] == "false" assert data["include_images"] == "false" assert data["generate_page_images"] == "false" assert data["images_scale"] == "1.0" assert set(data["to_formats"]) == {"md", "html", "json"} def test_custom_options(self): opts = ConversionOptions( do_ocr=False, table_mode="fast", images_scale=2.0, generate_picture_images=True, ) data = _build_form_data(opts) assert data["do_ocr"] == "false" assert data["table_mode"] == "fast" assert data["images_scale"] == "2.0" assert data["include_images"] == "true" # --------------------------------------------------------------------------- # Unit tests — response parsing # --------------------------------------------------------------------------- class TestParseResponse: def test_minimal_response(self): data = { "document": { "md_content": "# Hello", "html_content": "

Hello

", "json_content": { "pages": {"1": {"size": {"width": 612.0, "height": 792.0}}}, "texts": [], "tables": [], "pictures": [], }, } } result = _parse_response(data) assert isinstance(result, ConversionResult) assert result.content_markdown == "# Hello" assert result.content_html == "

Hello

" assert result.page_count == 1 assert result.pages[0].width == 612.0 assert result.document_json is not None def test_response_with_elements(self): data = { "document": { "md_content": "# Title\nText", "html_content": "

Title

Text

", "json_content": { "pages": {"1": {"size": {"width": 612.0, "height": 792.0}}}, "texts": [ { "label": "title", "text": "Title", "prov": [ { "page_no": 1, "bbox": { "l": 10, "t": 20, "r": 200, "b": 40, "coord_origin": "TOPLEFT", }, } ], }, { "label": "paragraph", "text": "Text", "prov": [ { "page_no": 1, "bbox": { "l": 10, "t": 50, "r": 200, "b": 70, "coord_origin": "TOPLEFT", }, } ], }, ], "tables": [], "pictures": [], }, } } result = _parse_response(data) assert len(result.pages[0].elements) == 2 assert result.pages[0].elements[0].type == "title" assert result.pages[0].elements[0].content == "Title" assert result.pages[0].elements[0].bbox == [10, 20, 200, 40] assert result.pages[0].elements[1].type == "text" def test_multi_page(self): data = { "document": { "md_content": "", "html_content": "", "json_content": { "pages": { "1": {"size": {"width": 612.0, "height": 792.0}}, "2": {"size": {"width": 595.0, "height": 842.0}}, }, "texts": [], "tables": [], "pictures": [], }, } } result = _parse_response(data) assert result.page_count == 2 assert result.pages[1].width == 595.0 def test_no_json_content(self): data = { "document": { "md_content": "text", "html_content": "

text

", } } result = _parse_response(data) assert result.content_markdown == "text" assert result.pages == [] assert result.page_count == 1 assert result.document_json is None def test_json_content_as_string(self): json_doc = { "pages": {"1": {"size": {"width": 612.0, "height": 792.0}}}, "texts": [], "tables": [], "pictures": [], } data = { "document": { "md_content": "", "html_content": "", "json_content": json.dumps(json_doc), } } result = _parse_response(data) assert result.page_count == 1 def test_json_content_malformed_string_falls_back(self): """Bug #5: malformed JSON string in json_content must not crash.""" data = { "document": { "md_content": "# Hello", "html_content": "

Hello

", "json_content": "NOT VALID JSON {{{", } } result = _parse_response(data) assert isinstance(result, ConversionResult) assert result.content_markdown == "# Hello" assert result.pages == [] assert result.page_count == 1 def test_tables_and_pictures(self): data = { "document": { "md_content": "", "html_content": "", "json_content": { "pages": {"1": {"size": {"width": 612.0, "height": 792.0}}}, "texts": [], "tables": [ { "label": "table", "text": "", "prov": [ { "page_no": 1, "bbox": { "l": 10, "t": 10, "r": 300, "b": 200, "coord_origin": "TOPLEFT", }, } ], }, ], "pictures": [ { "label": "picture", "text": "", "prov": [ { "page_no": 1, "bbox": { "l": 50, "t": 300, "r": 250, "b": 500, "coord_origin": "TOPLEFT", }, } ], }, ], }, } } result = _parse_response(data) types = [e.type for e in result.pages[0].elements] assert "table" in types assert "picture" in types # --------------------------------------------------------------------------- # Unit tests — bbox extraction # --------------------------------------------------------------------------- class TestExtractBbox: def test_topleft_passthrough(self): bbox = _extract_bbox( {"l": 10, "t": 20, "r": 100, "b": 50, "coord_origin": "TOPLEFT"}, 792.0 ) assert bbox == [10, 20, 100, 50] def test_bottomleft_conversion(self): # In BOTTOMLEFT: t (top of box) has higher y than b (bottom of box) bbox = _extract_bbox( {"l": 10, "t": 772, "r": 100, "b": 742, "coord_origin": "BOTTOMLEFT"}, 792.0 ) # new_top = 792 - 772 = 20, new_bottom = 792 - 742 = 50 assert bbox == [10, 20, 100, 50] def test_missing_coord_origin_defaults_topleft(self): bbox = _extract_bbox({"l": 10, "t": 20, "r": 100, "b": 50}, 792.0) assert bbox == [10, 20, 100, 50] def test_empty_dict(self): bbox = _extract_bbox({}, 792.0) assert bbox == [0.0, 0.0, 0.0, 0.0] def test_non_dict_returns_zeros(self): bbox = _extract_bbox("invalid", 792.0) assert bbox == [0.0, 0.0, 0.0, 0.0] # --------------------------------------------------------------------------- # Unit tests — label mapping # --------------------------------------------------------------------------- class TestLabelMapping: def test_known_labels(self): from infra.serve_converter import _LABEL_MAP assert _LABEL_MAP["table"] == "table" assert _LABEL_MAP["picture"] == "picture" assert _LABEL_MAP["figure"] == "picture" assert _LABEL_MAP["title"] == "title" assert _LABEL_MAP["section_header"] == "section_header" assert _LABEL_MAP["list_item"] == "list" assert _LABEL_MAP["formula"] == "formula" assert _LABEL_MAP["code"] == "code" assert _LABEL_MAP["paragraph"] == "text" def test_unknown_label_defaults_to_text(self): from infra.serve_converter import _LABEL_MAP assert _LABEL_MAP.get("unknown_thing", "text") == "text" # --------------------------------------------------------------------------- # Unit tests — ServeConverter # --------------------------------------------------------------------------- class TestServeConverter: def test_headers_with_api_key(self): conv = ServeConverter(base_url="http://localhost:5001", api_key="secret") assert conv._headers() == {"X-Api-Key": "secret"} def test_headers_without_api_key(self): conv = ServeConverter(base_url="http://localhost:5001") assert conv._headers() == {} def test_base_url_trailing_slash_stripped(self): conv = ServeConverter(base_url="http://localhost:5001/") assert conv._base_url == "http://localhost:5001" # --------------------------------------------------------------------------- # Integration tests — HTTP calls (mocked) # --------------------------------------------------------------------------- class TestServeConverterConvert: @pytest.mark.asyncio async def test_successful_conversion(self, tmp_path): test_file = tmp_path / "test.pdf" test_file.write_bytes(b"%PDF-1.4 fake content") serve_response = { "document": { "md_content": "# Converted", "html_content": "

Converted

", "json_content": { "pages": {"1": {"size": {"width": 612.0, "height": 792.0}}}, "texts": [ { "label": "title", "text": "Converted", "prov": [ { "page_no": 1, "bbox": { "l": 10, "t": 20, "r": 200, "b": 40, "coord_origin": "TOPLEFT", }, } ], }, ], "tables": [], "pictures": [], }, } } mock_response = MagicMock() mock_response.status_code = 200 mock_response.json.return_value = serve_response mock_response.raise_for_status = MagicMock() mock_client = AsyncMock() mock_client.post.return_value = mock_response mock_client.__aenter__ = AsyncMock(return_value=mock_client) mock_client.__aexit__ = AsyncMock(return_value=False) conv = ServeConverter(base_url="http://localhost:5001", api_key="test-key") with patch("infra.serve_converter.httpx.AsyncClient", return_value=mock_client): result = await conv.convert(str(test_file), ConversionOptions()) assert isinstance(result, ConversionResult) assert result.content_markdown == "# Converted" assert result.page_count == 1 assert len(result.pages[0].elements) == 1 assert result.pages[0].elements[0].type == "title" # Verify form fields sent as dict with list for repeated keys call_kwargs = mock_client.post.call_args sent_data = call_kwargs.kwargs.get("data", {}) assert sent_data["do_ocr"] == "true" assert set(sent_data["to_formats"]) == {"md", "html", "json"} @pytest.mark.asyncio async def test_http_error_raises(self, tmp_path): test_file = tmp_path / "test.pdf" test_file.write_bytes(b"%PDF-1.4 fake content") mock_response = MagicMock() mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( "Server Error", request=MagicMock(), response=MagicMock(status_code=500), ) mock_client = AsyncMock() mock_client.post.return_value = mock_response mock_client.__aenter__ = AsyncMock(return_value=mock_client) mock_client.__aexit__ = AsyncMock(return_value=False) conv = ServeConverter(base_url="http://localhost:5001") with ( patch("infra.serve_converter.httpx.AsyncClient", return_value=mock_client), pytest.raises(httpx.HTTPStatusError), ): await conv.convert(str(test_file), ConversionOptions()) @pytest.mark.asyncio async def test_health_check_success(self): mock_response = MagicMock() mock_response.status_code = 200 mock_client = AsyncMock() mock_client.get.return_value = mock_response mock_client.__aenter__ = AsyncMock(return_value=mock_client) mock_client.__aexit__ = AsyncMock(return_value=False) conv = ServeConverter(base_url="http://localhost:5001") with patch("infra.serve_converter.httpx.AsyncClient", return_value=mock_client): assert await conv.health_check() is True @pytest.mark.asyncio async def test_health_check_failure(self): mock_client = AsyncMock() mock_client.get.side_effect = httpx.ConnectError("Connection refused") mock_client.__aenter__ = AsyncMock(return_value=mock_client) mock_client.__aexit__ = AsyncMock(return_value=False) conv = ServeConverter(base_url="http://localhost:5001") with patch("infra.serve_converter.httpx.AsyncClient", return_value=mock_client): assert await conv.health_check() is False # --------------------------------------------------------------------------- # Integration — converter wiring in main.py # --------------------------------------------------------------------------- class TestConverterWiring: @pytest.mark.skipif( not _has_docling(), reason="docling library not installed", ) def test_local_engine_builds_local_converter(self): from infra.local_converter import LocalConverter from infra.settings import Settings with patch("main.settings", Settings(conversion_engine="local")): from main import _build_converter converter = _build_converter() assert isinstance(converter, LocalConverter) def test_remote_engine_builds_serve_converter(self): from infra.settings import Settings with patch( "main.settings", Settings(conversion_engine="remote", docling_serve_url="http://serve:5001"), ): from main import _build_converter converter = _build_converter() assert isinstance(converter, ServeConverter) assert converter._base_url == "http://serve:5001" def test_remote_engine_passes_api_key(self): from infra.settings import Settings with patch( "main.settings", Settings( conversion_engine="remote", docling_serve_url="http://serve:5001", docling_serve_api_key="my-key", ), ): from main import _build_converter converter = _build_converter() assert isinstance(converter, ServeConverter) assert converter._api_key == "my-key"