"""Tests for the zsgdp document pipeline and its CLI entry point."""

import tempfile
import unittest
from pathlib import Path

from zsgdp.cli import main
from zsgdp.pipeline import parse_document


class PipelineTests(unittest.TestCase):
    """Exercise ``parse_document`` and the ``main`` CLI against temp files.

    Every test works inside its own ``tempfile.TemporaryDirectory`` so no
    artifacts outlive the test run.
    """

    def _assert_artifacts_exist(self, directory: Path, *names: str) -> None:
        """Assert that each of ``names`` exists under ``directory``.

        The failure message names the missing artifact; a bare
        ``assertTrue(path.exists())`` would only report "False is not true".
        """
        for name in names:
            self.assertTrue(
                (directory / name).exists(),
                msg=f"expected artifact was not written: {directory / name}",
            )

    def test_parse_document_writes_outputs(self):
        """Parsing a small Markdown file fills the result and writes artifacts."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            input_path.write_text(
                "# Report\n\nThis is a test document.\n", encoding="utf-8"
            )

            parsed = parse_document(input_path, output_dir)

            self.assertTrue(parsed.elements)
            self.assertTrue(parsed.chunks)
            self.assertEqual(
                parsed.provenance["config_deployment"]["gpu_models_target"],
                "zeroshotGPU",
            )
            self.assertEqual(
                parsed.provenance["gpu_runtime"]["gpu_models_target"],
                "zeroshotGPU",
            )
            self.assertIn("chunking", parsed.provenance)
            self._assert_artifacts_exist(
                output_dir,
                "parsed_document.json",
                "elements.jsonl",
                "chunks.jsonl",
                "chunking_plan.json",
                "parser_metrics.json",
                "gpu_runtime.json",
                "artifact_manifest.json",
            )

    def test_parse_document_exports_gpu_tasks_when_visual_work_exists(self):
        """A PDF with a drawn rectangle yields GPU tasks and task artifacts.

        Skipped when PyMuPDF (``fitz``) is not installed, since it is needed
        to build the input PDF.
        """
        try:
            import fitz  # type: ignore
        except ImportError:
            self.skipTest("PyMuPDF is not installed")

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "visual.pdf"
            output_dir = tmp_path / "out"

            doc = fitz.open()
            try:
                page = doc.new_page()
                page.insert_text((72, 72), "Report")
                page.draw_rect(fitz.Rect(72, 120, 180, 180))
                doc.save(input_path)
            finally:
                # Close even if building or saving the PDF fails, so the
                # document handle is never leaked.
                doc.close()

            parsed = parse_document(
                input_path,
                output_dir,
                config_overrides={"chunking": {"vision_guided": True}},
            )

            self.assertTrue(parsed.provenance["gpu_tasks"])
            self.assertEqual(
                parsed.provenance["gpu_task_report"]["task_count"],
                len(parsed.provenance["gpu_tasks"]),
            )
            self._assert_artifacts_exist(
                output_dir,
                "gpu_tasks.jsonl",
                "gpu_task_report.json",
            )

    def test_parse_document_reverifies_after_repair(self):
        """A malformed table is flagged before repair but not in the final report.

        The input table has a two-column header and a three-cell data row.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "table.md"
            output_dir = tmp_path / "out"
            input_path.write_text(
                "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
                encoding="utf-8",
            )

            parsed = parse_document(input_path, output_dir)

            issue_types = [
                issue.issue_type for issue in parsed.quality_report.issues
            ]
            self.assertNotIn("invalid_table", issue_types)
            self.assertIn("pre_repair_quality", parsed.provenance)
            self.assertTrue(parsed.provenance["repair_iterations"])
            self.assertIn(
                "invalid_table",
                [
                    issue["issue_type"]
                    for issue in parsed.provenance["pre_repair_quality"]["issues"]
                ],
            )

    def test_export_chunks_cli(self):
        """``export-chunks`` exits 0 and writes JSON containing ``"chunk_id"``."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            chunks_out = tmp_path / "chunks.json"
            input_path.write_text(
                "# Report\n\nThis is a test document.\n", encoding="utf-8"
            )
            parse_document(input_path, output_dir)

            exit_code = main([
                "export-chunks",
                "--parsed", str(output_dir),
                "--format", "json",
                "--output", str(chunks_out),
            ])

            self.assertEqual(exit_code, 0)
            self.assertTrue(chunks_out.exists())
            self.assertIn('"chunk_id"', chunks_out.read_text(encoding="utf-8"))

    def test_parse_folder_cli_uses_workers_and_unique_output_dirs(self):
        """``parse-folder`` gives same-stem inputs separate output directories.

        ``same.md`` and ``same.txt`` share a stem; the test expects their
        manifests under ``same`` and ``same-txt`` respectively.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            docs = tmp_path / "docs"
            output_dir = tmp_path / "out"
            docs.mkdir()
            (docs / "same.md").write_text(
                "# Markdown\n\nHello world.\n", encoding="utf-8"
            )
            (docs / "same.txt").write_text(
                "Plain text document.\n", encoding="utf-8"
            )

            code = main([
                "parse-folder",
                "--input", str(docs),
                "--output", str(output_dir),
                "--workers", "2",
                "--gpu-workers", "1",
                "--parsers", "text",
            ])

            self.assertEqual(code, 0)
            self._assert_artifacts_exist(
                output_dir,
                str(Path("same") / "artifact_manifest.json"),
                str(Path("same-txt") / "artifact_manifest.json"),
            )

    def test_gpu_status_cli(self):
        """``gpu-status`` exits with code 0."""
        code = main(["gpu-status"])
        self.assertEqual(code, 0)


if __name__ == "__main__":
    unittest.main()