# Tests for the Space frontend (running on ZeroGPU): zip + multi-file uploads,
# progress with stage labels, and the chunk detail tab.
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

# Importing the Space app can raise RuntimeError at import time (presumably the
# ZeroGPU/`spaces` runtime refusing to start outside a Space -- confirm against
# app.py). Capture the message so every test can skip with a clear reason
# instead of the whole module erroring at collection time.
try:
    import app as space_app
except RuntimeError as exc:
    space_app = None
    APP_IMPORT_ERROR = str(exc)
else:
    APP_IMPORT_ERROR = ""
| class _UploadedFile: | |
| def __init__(self, name: str): | |
| self.name = name | |
class AppTests(unittest.TestCase):
    """Happy-path checks for `parse_uploaded_document` on one markdown file."""

    def test_parse_uploaded_document_returns_artifact_validation(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            source = Path(tmp_dir) / "sample.md"
            source.write_text("# Report\n\nHello from the Space UI.\n", encoding="utf-8")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(source)), "Default lightweight"
            )
            self.assertEqual(len(result), 11)
            summary = result[1]
            artifact_validation = result[8]
            archive_path = result[9]
            individual_files = result[10]
            self.assertTrue(summary["artifact_manifest_valid"])
            self.assertTrue(artifact_validation["valid"])
            self.assertTrue(Path(archive_path).exists())
            # Per-artifact downloads.
            self.assertIsInstance(individual_files, list)
            self.assertGreater(len(individual_files), 0)
            artifact_names = [Path(entry).name for entry in individual_files]
            # Core artifacts every parse should produce.
            expected = (
                "parsed_document.json",
                "document.md",
                "chunks.jsonl",
                "artifact_manifest.json",
            )
            for required in expected:
                self.assertIn(required, artifact_names)
            # Each path actually exists on disk so Gradio can serve it.
            for path in individual_files:
                self.assertTrue(Path(path).exists(), f"missing: {path}")
            # The archive zip is a separate artifact and must NOT appear in the
            # per-artifact list (zip is the bundled-everything view).
            self.assertNotIn(Path(archive_path).name, artifact_names)
            # Summary records the per-artifact count.
            self.assertEqual(summary["individual_artifact_count"], len(individual_files))
class UploadGuardTests(unittest.TestCase):
    """Guard-rail behavior: size/page limits, missing files, tuple width."""

    def test_oversized_upload_rejected_with_clear_message(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            oversized = Path(tmp_dir) / "huge.md"
            oversized.write_text("# Big\n\n" + "x" * 4096, encoding="utf-8")
            # Shrink the byte limit so the ~4KB file trips the guard.
            with patch.object(space_app, "MAX_UPLOAD_BYTES", 1024):
                result = space_app.parse_uploaded_document(
                    _UploadedFile(str(oversized)), "Default lightweight"
                )
            summary = result[1]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("MB", summary["error"])

    def test_high_page_count_rejected(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            small_doc = Path(tmp_dir) / "doc.md"
            small_doc.write_text("# Doc\n\nSomething small.\n", encoding="utf-8")

            class _FakeProfile:
                # Far above the patched ceiling of 50 pages.
                page_count = 1000

            with patch.object(space_app, "MAX_PAGE_COUNT", 50), patch.object(
                space_app, "profile_document", return_value=_FakeProfile()
            ):
                result = space_app.parse_uploaded_document(
                    _UploadedFile(str(small_doc)), "Default lightweight"
                )
            summary = result[1]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("pages", summary["error"])

    def test_missing_upload_path_rejected(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        result = space_app.parse_uploaded_document(
            _UploadedFile("/tmp/zsgdp-does-not-exist.md"), "Default lightweight"
        )
        summary = result[1]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("missing", summary["error"].lower())

    def test_error_paths_return_full_tuple_width(self):
        # Drift guard: every return path (success + error) must yield 11 outputs
        # so the Gradio click handler doesn't error on shape mismatch.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        # No upload at all.
        result = space_app.parse_uploaded_document(None, "Default lightweight")
        self.assertEqual(len(result), 11)
        self.assertEqual(result[10], [])
        # Missing-file rejection.
        result = space_app.parse_uploaded_document(
            _UploadedFile("/tmp/zsgdp-does-not-exist-xyz.md"), "Default lightweight"
        )
        self.assertEqual(len(result), 11)
        self.assertEqual(result[10], [])

    def test_normal_upload_passes_guards(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            ok_doc = Path(tmp_dir) / "ok.md"
            ok_doc.write_text("# OK\n\nA normal document.\n", encoding="utf-8")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(ok_doc)), "Default lightweight"
            )
            summary = result[1]
            self.assertNotIn("rejected", summary)
class BatchAndZipUploadTests(unittest.TestCase):
    """Batch inputs: zip archives, multi-file lists, and chunk-detail payload."""

    def test_zip_upload_extracts_and_parses_each_doc(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        import zipfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            # Build a small zip with two markdown docs.
            first = workdir / "a.md"
            first.write_text("# Doc A\n\nFirst.\n", encoding="utf-8")
            second = workdir / "b.md"
            second.write_text("# Doc B\n\nSecond.\n", encoding="utf-8")
            archive = workdir / "batch.zip"
            with zipfile.ZipFile(archive, "w") as zf:
                zf.write(first, arcname="a.md")
                zf.write(second, arcname="b.md")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(archive)), "Default lightweight"
            )
            # Tuple width unchanged.
            self.assertEqual(len(result), 11)
            summary = result[1]
            # Batch metadata recorded.
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)
            self.assertEqual(summary["batch"]["successful_count"], 2)
            self.assertEqual(summary["batch"]["failed_count"], 0)
            self.assertEqual(len(summary["batch"]["documents"]), 2)
            # Aggregate metrics populated.
            aggregate = summary["batch"]["aggregate"]
            self.assertGreater(aggregate["total_chunks"], 0)
            self.assertGreater(aggregate["mean_quality_score"], 0.0)

    def test_multiple_files_uploaded_as_list(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            first = Path(tmp_dir) / "one.md"
            first.write_text("# One\n\nFirst doc.\n", encoding="utf-8")
            second = Path(tmp_dir) / "two.md"
            second.write_text("# Two\n\nSecond doc.\n", encoding="utf-8")
            result = space_app.parse_uploaded_document(
                [_UploadedFile(str(first)), _UploadedFile(str(second))],
                "Default lightweight",
            )
            summary = result[1]
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)

    def test_zip_with_unsupported_files_filtered_out(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        import zipfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            archive = workdir / "mixed.zip"
            first = workdir / "first.md"
            first.write_text("# First\n\nContent A.\n", encoding="utf-8")
            second = workdir / "second.md"
            second.write_text("# Second\n\nContent B.\n", encoding="utf-8")
            junk = workdir / "ignore.exe"
            junk.write_bytes(b"\x00\x01")
            with zipfile.ZipFile(archive, "w") as zf:
                zf.write(first, arcname="first.md")
                zf.write(second, arcname="second.md")
                zf.write(junk, arcname="ignore.exe")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(archive)), "Default lightweight"
            )
            summary = result[1]
            # The two .md files parsed; the .exe was filtered out before parsing.
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)
            self.assertEqual(summary["batch"]["successful_count"], 2)

    def test_chunk_detail_payload_present(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            rich_doc = Path(tmp_dir) / "rich.md"
            rich_doc.write_text(
                "# Rich Doc\n\n"
                "First paragraph with some prose to chunk.\n\n"
                "Second paragraph with different content for variety.\n\n"
                "| A | B |\n| --- | --- |\n| 1 | 2 |\n",
                encoding="utf-8",
            )
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(rich_doc)), "Default lightweight"
            )
            chunking_payload = result[4]
            self.assertIn("plan", chunking_payload)
            self.assertIn("detail", chunking_payload)
            detail = chunking_payload["detail"]
            self.assertGreater(detail["total_chunks"], 0)
            self.assertIn("strategies", detail)
            # Each strategy block has the expected shape.
            for strategy_name, block in detail["strategies"].items():
                self.assertIn("count", block)
                self.assertIn("samples", block)
                self.assertIn("token_count_min", block)
# Allow running this module directly: `python test_app.py`.
if __name__ == "__main__":
    unittest.main()