Spaces:
Running on Zero
Running on Zero
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from zsgdp.cli import main | |
| from zsgdp.pipeline import parse_document | |
class PipelineTests(unittest.TestCase):
    """End-to-end checks for ``parse_document`` and the ``main`` CLI entry point.

    Every test that touches the filesystem works inside its own
    ``tempfile.TemporaryDirectory``, which is removed when the test finishes.
    """

    def _assert_artifacts_exist(self, directory, names):
        """Assert that every file name in *names* exists directly under *directory*.

        Each name is checked in its own sub-test so that a missing artifact is
        reported by name and does not hide other missing artifacts.
        """
        for name in names:
            with self.subTest(artifact=name):
                self.assertTrue(
                    (directory / name).exists(),
                    "missing artifact: " + name,
                )

    def test_parse_document_writes_outputs(self):
        """Parsing a small Markdown file fills the result and writes the artifacts."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            input_path.write_text("# Report\n\nThis is a test document.\n", encoding="utf-8")

            parsed = parse_document(input_path, output_dir)

            self.assertTrue(parsed.elements)
            self.assertTrue(parsed.chunks)
            self.assertEqual(
                parsed.provenance["config_deployment"]["gpu_models_target"],
                "zeroshotGPU",
            )
            self.assertEqual(
                parsed.provenance["gpu_runtime"]["gpu_models_target"],
                "zeroshotGPU",
            )
            self.assertIn("chunking", parsed.provenance)
            self._assert_artifacts_exist(
                output_dir,
                [
                    "parsed_document.json",
                    "elements.jsonl",
                    "chunks.jsonl",
                    "chunking_plan.json",
                    "parser_metrics.json",
                    "gpu_runtime.json",
                    "artifact_manifest.json",
                ],
            )

    def test_parse_document_exports_gpu_tasks_when_visual_work_exists(self):
        """A PDF with drawn content produces GPU tasks and their report files.

        Skipped when PyMuPDF (``fitz``) cannot be imported.
        """
        try:
            import fitz  # type: ignore
        except ImportError:
            self.skipTest("PyMuPDF is not installed")

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "visual.pdf"
            output_dir = tmp_path / "out"

            # Build a one-page PDF holding a text run and a drawn rectangle.
            # NOTE(review): presumably the rectangle is what counts as "visual
            # work" for the pipeline - confirm against the parser.
            doc = fitz.open()
            try:
                page = doc.new_page()
                page.insert_text((72, 72), "Report")
                page.draw_rect(fitz.Rect(72, 120, 180, 180))
                doc.save(input_path)
            finally:
                # Release the document even if building or saving it fails.
                doc.close()

            parsed = parse_document(
                input_path,
                output_dir,
                config_overrides={"chunking": {"vision_guided": True}},
            )

            gpu_tasks = parsed.provenance["gpu_tasks"]
            self.assertTrue(gpu_tasks)
            # The report's task count must agree with the exported task list.
            self.assertEqual(
                parsed.provenance["gpu_task_report"]["task_count"],
                len(gpu_tasks),
            )
            self._assert_artifacts_exist(
                output_dir,
                ["gpu_tasks.jsonl", "gpu_task_report.json"],
            )

    def test_parse_document_reverifies_after_repair(self):
        """An invalid table is reported before repair but not in the final report."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "table.md"
            output_dir = tmp_path / "out"
            # The header declares two columns while the data row has three cells.
            input_path.write_text(
                "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
                encoding="utf-8",
            )

            parsed = parse_document(input_path, output_dir)

            final_issue_types = [issue.issue_type for issue in parsed.quality_report.issues]
            self.assertNotIn("invalid_table", final_issue_types)
            self.assertIn("pre_repair_quality", parsed.provenance)
            self.assertTrue(parsed.provenance["repair_iterations"])
            pre_repair_issue_types = [
                issue["issue_type"]
                for issue in parsed.provenance["pre_repair_quality"]["issues"]
            ]
            self.assertIn("invalid_table", pre_repair_issue_types)

    def test_export_chunks_cli(self):
        """``export-chunks`` writes a JSON file containing ``"chunk_id"`` and returns 0."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            chunks_out = tmp_path / "chunks.json"
            input_path.write_text("# Report\n\nThis is a test document.\n", encoding="utf-8")
            # The CLI reads from a directory of previously parsed output.
            parse_document(input_path, output_dir)

            exit_code = main([
                "export-chunks",
                "--parsed",
                str(output_dir),
                "--format",
                "json",
                "--output",
                str(chunks_out),
            ])

            self.assertEqual(exit_code, 0)
            self.assertTrue(chunks_out.exists())
            self.assertIn('"chunk_id"', chunks_out.read_text(encoding="utf-8"))

    def test_parse_folder_cli_uses_workers_and_unique_output_dirs(self):
        """``parse-folder`` gives two inputs with the same stem separate output dirs."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            docs = tmp_path / "docs"
            output_dir = tmp_path / "out"
            docs.mkdir()
            # Both files share the stem "same" and differ only by extension.
            (docs / "same.md").write_text("# Markdown\n\nHello world.\n", encoding="utf-8")
            (docs / "same.txt").write_text("Plain text document.\n", encoding="utf-8")

            code = main([
                "parse-folder",
                "--input",
                str(docs),
                "--output",
                str(output_dir),
                "--workers",
                "2",
                "--gpu-workers",
                "1",
                "--parsers",
                "text",
            ])

            self.assertEqual(code, 0)
            self._assert_artifacts_exist(
                output_dir,
                [
                    str(Path("same") / "artifact_manifest.json"),
                    str(Path("same-txt") / "artifact_manifest.json"),
                ],
            )

    def test_gpu_status_cli(self):
        """``gpu-status`` returns exit code 0."""
        code = main(["gpu-status"])
        self.assertEqual(code, 0)
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()