zeroshotGPU / tests /test_pipeline.py
Arjunvir Singh
Initial commit: zeroshotGPU MVP with full eval surface
db06ffa
import tempfile
import unittest
from pathlib import Path
from zsgdp.cli import main
from zsgdp.pipeline import parse_document
class PipelineTests(unittest.TestCase):
    """End-to-end checks for ``parse_document`` and the ``zsgdp`` CLI ``main``.

    Each test builds its inputs inside a fresh ``tempfile.TemporaryDirectory``
    so that nothing is left on disk once the test finishes.
    """

    def _assert_artifacts_exist(self, output_dir, names):
        """Assert that every file in *names* exists directly under *output_dir*.

        The failure message names the missing path, which a bare
        ``assertTrue(path.exists())`` would not do.
        """
        for name in names:
            artifact = output_dir / name
            self.assertTrue(
                artifact.exists(),
                f"expected artifact was not written: {artifact}",
            )

    def test_parse_document_writes_outputs(self):
        """Parsing a small Markdown file yields content, provenance and artifact files."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            input_path.write_text("# Report\n\nThis is a test document.\n", encoding="utf-8")

            parsed = parse_document(input_path, output_dir)

            # The parsed result must carry both elements and chunks.
            self.assertTrue(parsed.elements)
            self.assertTrue(parsed.chunks)

            # Provenance records the GPU model target in two places and includes chunking info.
            provenance = parsed.provenance
            self.assertEqual(provenance["config_deployment"]["gpu_models_target"], "zeroshotGPU")
            self.assertEqual(provenance["gpu_runtime"]["gpu_models_target"], "zeroshotGPU")
            self.assertIn("chunking", provenance)

            self._assert_artifacts_exist(
                output_dir,
                (
                    "parsed_document.json",
                    "elements.jsonl",
                    "chunks.jsonl",
                    "chunking_plan.json",
                    "parser_metrics.json",
                    "gpu_runtime.json",
                    "artifact_manifest.json",
                ),
            )

    def test_parse_document_exports_gpu_tasks_when_visual_work_exists(self):
        """A PDF with drawn content produces GPU tasks and their report files.

        Skipped when PyMuPDF (``fitz``) cannot be imported.
        """
        try:
            import fitz  # type: ignore
        except ImportError:
            self.skipTest("PyMuPDF is not installed")

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "visual.pdf"
            output_dir = tmp_path / "out"

            # Build a one-page PDF holding a line of text and a drawn rectangle.
            # NOTE(review): the rectangle is presumably what triggers the
            # "visual work" path in the pipeline -- confirm against the parser.
            doc = fitz.open()
            try:
                page = doc.new_page()
                page.insert_text((72, 72), "Report")
                page.draw_rect(fitz.Rect(72, 120, 180, 180))
                doc.save(input_path)
            finally:
                # Always release the document, even if building or saving it
                # fails, so the temporary directory can be removed cleanly.
                doc.close()

            parsed = parse_document(
                input_path,
                output_dir,
                config_overrides={"chunking": {"vision_guided": True}},
            )

            gpu_tasks = parsed.provenance["gpu_tasks"]
            self.assertTrue(gpu_tasks)
            # The report's task count must agree with the exported task list.
            self.assertEqual(parsed.provenance["gpu_task_report"]["task_count"], len(gpu_tasks))
            self._assert_artifacts_exist(output_dir, ("gpu_tasks.jsonl", "gpu_task_report.json"))

    def test_parse_document_reverifies_after_repair(self):
        """An invalid table is reported before repair and absent from the final report."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "table.md"
            output_dir = tmp_path / "out"
            # The data row has three cells under a two-column header.
            input_path.write_text(
                "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
                encoding="utf-8",
            )

            parsed = parse_document(input_path, output_dir)

            final_issue_types = [issue.issue_type for issue in parsed.quality_report.issues]
            self.assertNotIn("invalid_table", final_issue_types)

            self.assertIn("pre_repair_quality", parsed.provenance)
            self.assertTrue(parsed.provenance["repair_iterations"])

            pre_repair_issue_types = [
                issue["issue_type"]
                for issue in parsed.provenance["pre_repair_quality"]["issues"]
            ]
            self.assertIn("invalid_table", pre_repair_issue_types)

    def test_export_chunks_cli(self):
        """``export-chunks`` writes a JSON file containing chunk ids and exits with 0."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            chunks_out = tmp_path / "chunks.json"
            input_path.write_text("# Report\n\nThis is a test document.\n", encoding="utf-8")
            # Produce the parsed output directory that the CLI command reads from.
            parse_document(input_path, output_dir)

            exit_code = main(
                [
                    "export-chunks",
                    "--parsed",
                    str(output_dir),
                    "--format",
                    "json",
                    "--output",
                    str(chunks_out),
                ]
            )

            self.assertEqual(exit_code, 0)
            self.assertTrue(chunks_out.exists(), f"export-chunks did not write {chunks_out}")
            self.assertIn('"chunk_id"', chunks_out.read_text(encoding="utf-8"))

    def test_parse_folder_cli_uses_workers_and_unique_output_dirs(self):
        """``parse-folder`` gives two inputs sharing a stem separate output directories."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            docs = tmp_path / "docs"
            output_dir = tmp_path / "out"
            docs.mkdir()
            # Both files are named "same"; only their extensions differ.
            (docs / "same.md").write_text("# Markdown\n\nHello world.\n", encoding="utf-8")
            (docs / "same.txt").write_text("Plain text document.\n", encoding="utf-8")

            code = main(
                [
                    "parse-folder",
                    "--input",
                    str(docs),
                    "--output",
                    str(output_dir),
                    "--workers",
                    "2",
                    "--gpu-workers",
                    "1",
                    "--parsers",
                    "text",
                ]
            )

            self.assertEqual(code, 0)
            # Each input must have a manifest in its own directory.
            self._assert_artifacts_exist(output_dir / "same", ("artifact_manifest.json",))
            self._assert_artifacts_exist(output_dir / "same-txt", ("artifact_manifest.json",))

    def test_gpu_status_cli(self):
        """``gpu-status`` exits with 0."""
        code = main(["gpu-status"])
        self.assertEqual(code, 0)
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()