File size: 9,997 Bytes
db06ffa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e3af73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db06ffa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

# Import the application module under test. If the import raises RuntimeError,
# bind ``space_app`` to None and keep the error text so each test can skip
# with that message instead of erroring out.
# NOTE(review): only RuntimeError is caught; any other import failure
# (e.g. ImportError) still propagates — confirm that is intended.
try:
    import app as space_app
except RuntimeError as exc:
    space_app = None
    APP_IMPORT_ERROR = str(exc)
else:
    # Import succeeded: no skip reason to report.
    APP_IMPORT_ERROR = ""


class _UploadedFile:
    def __init__(self, name: str):
        self.name = name


class AppTests(unittest.TestCase):
    """Single-document check of ``space_app.parse_uploaded_document``."""

    def setUp(self):
        # Skip (rather than fail) when the app module could not be imported.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

    def test_parse_uploaded_document_returns_artifact_validation(self):
        """Parse one small markdown file and inspect the returned outputs."""
        with tempfile.TemporaryDirectory() as workdir:
            source = Path(workdir, "sample.md")
            source.write_text("# Report\n\nHello from the Space UI.\n", encoding="utf-8")
            upload = _UploadedFile(str(source))

            result = space_app.parse_uploaded_document(upload, "Default lightweight")

        # Width is checked first so the positional lookups below are meaningful.
        self.assertEqual(len(result), 11)
        summary = result[1]
        validation = result[8]
        archive = Path(result[9])
        per_artifact = result[10]

        self.assertTrue(summary["artifact_manifest_valid"])
        self.assertTrue(validation["valid"])
        self.assertTrue(archive.exists())

        # Per-artifact downloads: a non-empty list of file paths.
        self.assertIsInstance(per_artifact, list)
        self.assertGreater(len(per_artifact), 0)
        names = [Path(entry).name for entry in per_artifact]

        # Core artifacts every parse should produce.
        core_artifacts = (
            "parsed_document.json",
            "document.md",
            "chunks.jsonl",
            "artifact_manifest.json",
        )
        for required in core_artifacts:
            self.assertIn(required, names)

        # Each path must exist on disk so Gradio can serve it.
        for entry in per_artifact:
            self.assertTrue(Path(entry).exists(), f"missing: {entry}")

        # The zip is the bundled-everything view, so it must NOT be listed
        # among the individual artifacts.
        self.assertNotIn(archive.name, names)

        # Summary records the per-artifact count.
        self.assertEqual(summary["individual_artifact_count"], len(per_artifact))


class UploadGuardTests(unittest.TestCase):
    """Rejection guards of ``space_app.parse_uploaded_document``.

    Covers the oversized-upload, page-count and missing-path rejections, the
    output tuple width on error paths, and a normal upload that must pass.
    """

    # Number of outputs every return path (success or error) must yield so
    # the Gradio click handler doesn't error on shape mismatch.
    EXPECTED_OUTPUT_COUNT = 11

    def setUp(self):
        # Skip (rather than fail) when the app module could not be imported.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

    @staticmethod
    def _parse(upload):
        """Run the app entry point on ``upload`` with the argument all tests use."""
        return space_app.parse_uploaded_document(upload, "Default lightweight")

    def test_oversized_upload_rejected_with_clear_message(self):
        """A file larger than the patched ``MAX_UPLOAD_BYTES`` is rejected."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "huge.md"
            input_path.write_text("# Big\n\n" + "x" * 4096, encoding="utf-8")

            # Shrink the limit below the ~4 KiB file written above.
            with patch.object(space_app, "MAX_UPLOAD_BYTES", 1024):
                outputs = self._parse(_UploadedFile(str(input_path)))

        summary = outputs[1]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("MB", summary["error"])

    def test_high_page_count_rejected(self):
        """A profile reporting more pages than ``MAX_PAGE_COUNT`` is rejected."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "doc.md"
            input_path.write_text("# Doc\n\nSomething small.\n", encoding="utf-8")

            class _FakeProfile:
                # Only the attribute this test relies on is provided.
                page_count = 1000

            with patch.object(space_app, "MAX_PAGE_COUNT", 50), patch.object(
                space_app, "profile_document", return_value=_FakeProfile()
            ):
                outputs = self._parse(_UploadedFile(str(input_path)))

        summary = outputs[1]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("pages", summary["error"])

    def test_missing_upload_path_rejected(self):
        """An upload whose path does not exist is rejected as missing."""
        # A name inside a fresh temporary directory is guaranteed not to
        # exist, unlike a fixed path under a shared location such as /tmp.
        with tempfile.TemporaryDirectory() as tmp:
            missing_path = Path(tmp) / "zsgdp-does-not-exist.md"
            outputs = self._parse(_UploadedFile(str(missing_path)))

        summary = outputs[1]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("missing", summary["error"].lower())

    def test_error_paths_return_full_tuple_width(self):
        """Drift guard: error paths return the full-width output tuple."""
        with tempfile.TemporaryDirectory() as tmp:
            missing_upload = _UploadedFile(
                str(Path(tmp) / "zsgdp-does-not-exist-xyz.md")
            )
            scenarios = (
                ("no upload", None),
                ("missing file", missing_upload),
            )
            for label, upload in scenarios:
                # subTest keeps going after a failure, so both error paths
                # are always reported.
                with self.subTest(case=label):
                    outputs = self._parse(upload)
                    self.assertEqual(len(outputs), self.EXPECTED_OUTPUT_COUNT)
                    self.assertEqual(outputs[10], [])

    def test_normal_upload_passes_guards(self):
        """A small, existing markdown file is not flagged as rejected."""
        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "ok.md"
            input_path.write_text("# OK\n\nA normal document.\n", encoding="utf-8")
            outputs = self._parse(_UploadedFile(str(input_path)))

        summary = outputs[1]
        self.assertNotIn("rejected", summary)


class BatchAndZipUploadTests(unittest.TestCase):
    """Batch inputs (zip archives, file lists) and the chunking detail payload."""

    def setUp(self):
        # Skip (rather than fail) when the app module could not be imported.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

    @staticmethod
    def _parse(upload):
        """Run the app entry point on ``upload`` with the argument all tests use."""
        return space_app.parse_uploaded_document(upload, "Default lightweight")

    @staticmethod
    def _build_zip(directory, archive_name, members):
        """Write each member into ``directory`` and bundle them into a zip.

        Args:
            directory: ``Path`` of an existing directory to write into.
            archive_name: File name of the zip created inside ``directory``.
            members: Iterable of ``(arcname, payload)`` pairs. ``bytes``
                payloads are written verbatim, ``str`` payloads as UTF-8 text.

        Returns:
            ``Path`` of the created zip archive.
        """
        import zipfile  # local import: only the zip-based tests need it

        archive_path = directory / archive_name
        with zipfile.ZipFile(archive_path, "w") as zf:
            for arcname, payload in members:
                member_path = directory / arcname
                if isinstance(payload, bytes):
                    member_path.write_bytes(payload)
                else:
                    member_path.write_text(payload, encoding="utf-8")
                zf.write(member_path, arcname=arcname)
        return archive_path

    def test_zip_upload_extracts_and_parses_each_doc(self):
        """A zip holding two markdown docs yields batch metadata for both."""
        with tempfile.TemporaryDirectory() as tmp:
            zip_path = self._build_zip(
                Path(tmp),
                "batch.zip",
                (
                    ("a.md", "# Doc A\n\nFirst.\n"),
                    ("b.md", "# Doc B\n\nSecond.\n"),
                ),
            )
            outputs = self._parse(_UploadedFile(str(zip_path)))

        # Tuple width unchanged.
        self.assertEqual(len(outputs), 11)
        summary = outputs[1]
        # Batch metadata recorded.
        self.assertIn("batch", summary)
        batch = summary["batch"]
        self.assertEqual(batch["input_count"], 2)
        self.assertEqual(batch["successful_count"], 2)
        self.assertEqual(batch["failed_count"], 0)
        self.assertEqual(len(batch["documents"]), 2)
        # Aggregate metrics populated.
        agg = batch["aggregate"]
        self.assertGreater(agg["total_chunks"], 0)
        self.assertGreater(agg["mean_quality_score"], 0.0)

    def test_multiple_files_uploaded_as_list(self):
        """A list of two uploads is reported as a batch with two inputs."""
        with tempfile.TemporaryDirectory() as tmp:
            doc1 = Path(tmp) / "one.md"
            doc1.write_text("# One\n\nFirst doc.\n", encoding="utf-8")
            doc2 = Path(tmp) / "two.md"
            doc2.write_text("# Two\n\nSecond doc.\n", encoding="utf-8")

            outputs = self._parse(
                [_UploadedFile(str(doc1)), _UploadedFile(str(doc2))]
            )

        summary = outputs[1]
        self.assertIn("batch", summary)
        self.assertEqual(summary["batch"]["input_count"], 2)

    def test_zip_with_unsupported_files_filtered_out(self):
        """Only the two ``.md`` members of a mixed zip are counted as inputs."""
        with tempfile.TemporaryDirectory() as tmp:
            zip_path = self._build_zip(
                Path(tmp),
                "mixed.zip",
                (
                    ("first.md", "# First\n\nContent A.\n"),
                    ("second.md", "# Second\n\nContent B.\n"),
                    ("ignore.exe", b"\x00\x01"),
                ),
            )
            outputs = self._parse(_UploadedFile(str(zip_path)))

        summary = outputs[1]
        # The two .md files parsed; the .exe was filtered out before parsing.
        self.assertIn("batch", summary)
        self.assertEqual(summary["batch"]["input_count"], 2)
        self.assertEqual(summary["batch"]["successful_count"], 2)

    def test_chunk_detail_payload_present(self):
        """The chunking output carries a plan plus per-strategy detail blocks."""
        with tempfile.TemporaryDirectory() as tmp:
            doc = Path(tmp) / "rich.md"
            doc.write_text(
                "# Rich Doc\n\n"
                "First paragraph with some prose to chunk.\n\n"
                "Second paragraph with different content for variety.\n\n"
                "| A | B |\n| --- | --- |\n| 1 | 2 |\n",
                encoding="utf-8",
            )
            outputs = self._parse(_UploadedFile(str(doc)))

        chunking_payload = outputs[4]
        self.assertIn("plan", chunking_payload)
        self.assertIn("detail", chunking_payload)
        detail = chunking_payload["detail"]
        self.assertGreater(detail["total_chunks"], 0)
        self.assertIn("strategies", detail)
        # Each strategy block has the expected shape; subTest names the
        # offending strategy when one of them is malformed.
        for strategy_name, block in detail["strategies"].items():
            with self.subTest(strategy=strategy_name):
                self.assertIn("count", block)
                self.assertIn("samples", block)
                self.assertIn("token_count_min", block)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()