import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from zsgdp.config import load_config
from zsgdp.parsers.external import MarkerParser, _read_external_markdown, _read_marker_markdown, normalize_marker_markdown
from zsgdp.schema import DocumentProfile, PageProfile


class MarkerParserTests(unittest.TestCase):
    """Tests for the Marker parser wrapper and the external-markdown readers."""

    @staticmethod
    def _one_page_pdf_profile():
        """Build the single-page PDF profile shared by the normalizer tests."""
        only_page = PageProfile(page_num=1, digital_text_chars=20)
        return DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[only_page],
        )

    def test_normalize_marker_markdown_emits_common_schema(self):
        """Markdown with one table and one image yields one table and one figure."""
        result = normalize_marker_markdown(
            markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n\n![Chart](chart.png)",
            profile=self._one_page_pdf_profile(),
            source_path="sample.pdf",
        )

        self.assertEqual(result.parser_name, "marker")
        self.assertEqual(len(result.tables), 1)
        self.assertEqual(len(result.figures), 1)
        self.assertEqual(result.pages[0]["source_parser"], "marker")

    def test_marker_parser_runs_markdown_through_normalizer(self):
        """With Marker stubbed out, ``parse`` returns a marker-tagged candidate."""
        doc_profile = self._one_page_pdf_profile()

        # Pretend Marker is available and replace the markdown conversion with
        # a canned result, so only the patched return value is fed to ``parse``.
        with patch.object(MarkerParser, "available", return_value=True):
            with patch(
                "zsgdp.parsers.external.run_marker_to_markdown",
                return_value="# Report\n\nBody.",
            ):
                result = MarkerParser().parse("sample.pdf", doc_profile, load_config())

        self.assertEqual(result.parser_name, "marker")
        self.assertEqual(result.elements[0].source_parser, "marker")
        self.assertEqual(result.provenance["requested_pages"], [1])

    def test_read_marker_markdown_prefers_markdown_file(self):
        """``markdown.md`` is returned even when another ``.md`` file sits beside it."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            base = Path(tmp_dir)
            subdir = base / "sample"
            subdir.mkdir()
            (subdir / "other.md").write_text("# Other", encoding="utf-8")
            (subdir / "markdown.md").write_text("# Preferred", encoding="utf-8")

            text = _read_marker_markdown(base)

        self.assertEqual(text, "# Preferred")

    def test_read_external_markdown_falls_back_to_stdout(self):
        """An empty output directory makes the reader return the stdout text."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            text = _read_external_markdown(
                Path(tmp_dir),
                parser_name="mineru",
                stdout="# From stdout",
            )

        self.assertEqual(text, "# From stdout")


if __name__ == "__main__":
    unittest.main()