"""Tests for the Space UI entry point ``app.parse_uploaded_document``.

The tests exercise the success path, the upload guards (size, page count,
missing file), and batch/zip uploads.  Every test is skipped when importing
``app`` raised ``RuntimeError``; the exception message is the skip reason.
"""

import tempfile
import unittest
import zipfile
from pathlib import Path
from unittest.mock import patch

try:
    import app as space_app
except RuntimeError as exc:
    # Only RuntimeError is treated as "app unavailable"; any other import
    # failure still propagates and fails the run loudly.
    space_app = None
    APP_IMPORT_ERROR = str(exc)
else:
    APP_IMPORT_ERROR = ""

# Parse mode passed as the second argument by every test in this module.
_DEFAULT_MODE = "Default lightweight"

# Width of the tuple returned by ``parse_uploaded_document`` and the positions
# of the elements these tests inspect.
_OUTPUT_COUNT = 11
_SUMMARY = 1
_CHUNKING = 4
_ARTIFACT_VALIDATION = 8
_ARCHIVE_PATH = 9
_INDIVIDUAL_FILES = 10


class _UploadedFile:
    """Minimal upload stand-in exposing only a ``name`` attribute (a path).

    NOTE(review): presumably mirrors the object Gradio hands to the click
    handler -- confirm against ``app``.
    """

    def __init__(self, name: str):
        self.name = name


def _write_markdown(directory, filename: str, text: str) -> Path:
    """Write *text* as UTF-8 to ``directory/filename`` and return the path."""
    target = Path(directory) / filename
    target.write_text(text, encoding="utf-8")
    return target


def _build_zip(zip_path: Path, members) -> Path:
    """Create *zip_path* containing each file in *members* under its own name."""
    with zipfile.ZipFile(zip_path, "w") as archive:
        for member in members:
            archive.write(member, arcname=member.name)
    return zip_path


class _SpaceAppTestCase(unittest.TestCase):
    """Base class: skips every test when ``app`` could not be imported."""

    def setUp(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

    @staticmethod
    def _parse(upload):
        """Call the entry point with *upload* and the default parse mode.

        ``space_app`` is looked up at call time so ``patch.object`` overrides
        installed by a test are honoured.
        """
        return space_app.parse_uploaded_document(upload, _DEFAULT_MODE)

    def _parse_path(self, path):
        """Wrap *path* in an ``_UploadedFile`` and parse it."""
        return self._parse(_UploadedFile(str(path)))


class AppTests(_SpaceAppTestCase):
    """Success-path checks for a single markdown upload."""

    def test_parse_uploaded_document_returns_artifact_validation(self):
        """A valid upload yields a valid manifest, an archive and per-file artifacts."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = _write_markdown(
                tmp, "sample.md", "# Report\n\nHello from the Space UI.\n"
            )

            outputs = self._parse_path(input_path)

            self.assertEqual(len(outputs), _OUTPUT_COUNT)
            summary = outputs[_SUMMARY]
            artifact_validation = outputs[_ARTIFACT_VALIDATION]
            archive_path = outputs[_ARCHIVE_PATH]
            individual_files = outputs[_INDIVIDUAL_FILES]

            self.assertTrue(summary["artifact_manifest_valid"])
            self.assertTrue(artifact_validation["valid"])
            self.assertTrue(Path(archive_path).exists())

            # Per-artifact downloads.
            self.assertIsInstance(individual_files, list)
            self.assertGreater(len(individual_files), 0)
            names = [Path(p).name for p in individual_files]

            # Core artifacts every parse should produce.
            for required in (
                "parsed_document.json",
                "document.md",
                "chunks.jsonl",
                "artifact_manifest.json",
            ):
                self.assertIn(required, names)

            # Each path actually exists on disk so Gradio can serve it.
            for path in individual_files:
                self.assertTrue(Path(path).exists(), f"missing: {path}")

            # The archive zip is a separate artifact and must NOT appear in the
            # per-artifact list (zip is the bundled-everything view).
            self.assertNotIn(Path(archive_path).name, names)

            # Summary records the per-artifact count.
            self.assertEqual(
                summary["individual_artifact_count"], len(individual_files)
            )


class UploadGuardTests(_SpaceAppTestCase):
    """Checks for the rejection paths that run before parsing."""

    def test_oversized_upload_rejected_with_clear_message(self):
        """An upload larger than ``MAX_UPLOAD_BYTES`` is rejected with an MB message."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = _write_markdown(tmp, "huge.md", "# Big\n\n" + "x" * 4096)

            # Shrink the limit below the ~4 KiB fixture instead of writing a
            # genuinely large file.
            with patch.object(space_app, "MAX_UPLOAD_BYTES", 1024):
                outputs = self._parse_path(input_path)

            summary = outputs[_SUMMARY]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("MB", summary["error"])

    def test_high_page_count_rejected(self):
        """A profile reporting more pages than ``MAX_PAGE_COUNT`` is rejected."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = _write_markdown(
                tmp, "doc.md", "# Doc\n\nSomething small.\n"
            )

            class _FakeProfile:
                page_count = 1000

            with patch.object(space_app, "MAX_PAGE_COUNT", 50), patch.object(
                space_app, "profile_document", return_value=_FakeProfile()
            ):
                outputs = self._parse_path(input_path)

            summary = outputs[_SUMMARY]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("pages", summary["error"])

    def test_missing_upload_path_rejected(self):
        """An upload whose path does not exist is rejected as missing."""
        outputs = self._parse_path("/tmp/zsgdp-does-not-exist.md")

        summary = outputs[_SUMMARY]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("missing", summary["error"].lower())

    def test_error_paths_return_full_tuple_width(self):
        """Error returns keep the full tuple width and an empty file list."""
        # Drift guard: every return path (success + error) must yield 11 outputs
        # so the Gradio click handler doesn't error on shape mismatch.
        error_uploads = {
            # No upload at all.
            "no upload": None,
            # Missing-file rejection.
            "missing file": _UploadedFile("/tmp/zsgdp-does-not-exist-xyz.md"),
        }
        for label, upload in error_uploads.items():
            with self.subTest(case=label):
                outputs = self._parse(upload)
                self.assertEqual(len(outputs), _OUTPUT_COUNT)
                self.assertEqual(outputs[_INDIVIDUAL_FILES], [])

    def test_normal_upload_passes_guards(self):
        """A small, existing document is not flagged as rejected."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = _write_markdown(
                tmp, "ok.md", "# OK\n\nA normal document.\n"
            )

            outputs = self._parse_path(input_path)

            summary = outputs[_SUMMARY]
            self.assertNotIn("rejected", summary)


class BatchAndZipUploadTests(_SpaceAppTestCase):
    """Checks for zip archives, multi-file uploads and the chunk detail payload."""

    def test_zip_upload_extracts_and_parses_each_doc(self):
        """A zip of two markdown files is reported as a two-document batch."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            # Build a small zip with two markdown docs.
            doc_a = _write_markdown(tmp_path, "a.md", "# Doc A\n\nFirst.\n")
            doc_b = _write_markdown(tmp_path, "b.md", "# Doc B\n\nSecond.\n")
            zip_path = _build_zip(tmp_path / "batch.zip", [doc_a, doc_b])

            outputs = self._parse_path(zip_path)

            # Tuple width unchanged.
            self.assertEqual(len(outputs), _OUTPUT_COUNT)
            summary = outputs[_SUMMARY]

            # Batch metadata recorded.
            self.assertIn("batch", summary)
            batch = summary["batch"]
            self.assertEqual(batch["input_count"], 2)
            self.assertEqual(batch["successful_count"], 2)
            self.assertEqual(batch["failed_count"], 0)
            self.assertEqual(len(batch["documents"]), 2)

            # Aggregate metrics populated.
            agg = batch["aggregate"]
            self.assertGreater(agg["total_chunks"], 0)
            self.assertGreater(agg["mean_quality_score"], 0.0)

    def test_multiple_files_uploaded_as_list(self):
        """A list of two uploads is reported as a two-input batch."""
        with tempfile.TemporaryDirectory() as tmp:
            doc1 = _write_markdown(tmp, "one.md", "# One\n\nFirst doc.\n")
            doc2 = _write_markdown(tmp, "two.md", "# Two\n\nSecond doc.\n")

            outputs = self._parse(
                [_UploadedFile(str(doc1)), _UploadedFile(str(doc2))]
            )

            summary = outputs[_SUMMARY]
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)

    def test_zip_with_unsupported_files_filtered_out(self):
        """Unsupported zip members are not counted as batch inputs."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            doc_a = _write_markdown(tmp_path, "first.md", "# First\n\nContent A.\n")
            doc_b = _write_markdown(
                tmp_path, "second.md", "# Second\n\nContent B.\n"
            )
            junk = tmp_path / "ignore.exe"
            junk.write_bytes(b"\x00\x01")
            zip_path = _build_zip(tmp_path / "mixed.zip", [doc_a, doc_b, junk])

            outputs = self._parse_path(zip_path)

            summary = outputs[_SUMMARY]
            # The two .md files parsed; the .exe was filtered out before parsing.
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)
            self.assertEqual(summary["batch"]["successful_count"], 2)

    def test_chunk_detail_payload_present(self):
        """The chunking output carries a plan plus per-strategy detail blocks."""
        with tempfile.TemporaryDirectory() as tmp:
            doc = _write_markdown(
                tmp,
                "rich.md",
                "# Rich Doc\n\n"
                "First paragraph with some prose to chunk.\n\n"
                "Second paragraph with different content for variety.\n\n"
                "| A | B |\n| --- | --- |\n| 1 | 2 |\n",
            )

            outputs = self._parse_path(doc)

            chunking_payload = outputs[_CHUNKING]
            self.assertIn("plan", chunking_payload)
            self.assertIn("detail", chunking_payload)

            detail = chunking_payload["detail"]
            self.assertGreater(detail["total_chunks"], 0)
            self.assertIn("strategies", detail)

            # Each strategy block has the expected shape; subTest names the
            # strategy so a failure points at the offending block.
            for strategy_name, block in detail["strategies"].items():
                with self.subTest(strategy=strategy_name):
                    self.assertIn("count", block)
                    self.assertIn("samples", block)
                    self.assertIn("token_count_min", block)


if __name__ == "__main__":
    unittest.main()