# Tests for the Space frontend (running on ZeroGPU): zip + multi-file uploads,
# progress with stage labels, and the chunk detail tab.
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

# Importing the Space app can raise RuntimeError at import time (presumably the
# ZeroGPU/`spaces` runtime refusing to start outside a Space -- confirm against
# app.py). Capture the message so every test can skip with a clear reason
# instead of the whole module erroring at collection time.
try:
    import app as space_app
except RuntimeError as exc:
    space_app = None
    APP_IMPORT_ERROR = str(exc)
else:
    APP_IMPORT_ERROR = ""
| class _UploadedFile: | |
| def __init__(self, name: str): | |
| self.name = name | |
class AppTests(unittest.TestCase):
    """Happy-path checks for `parse_uploaded_document` on one markdown file."""

    def test_parse_uploaded_document_returns_artifact_validation(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            source = Path(tmp_dir) / "sample.md"
            source.write_text("# Report\n\nHello from the Space UI.\n", encoding="utf-8")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(source)), "Default lightweight"
            )
            self.assertEqual(len(result), 11)
            summary = result[1]
            artifact_validation = result[8]
            archive_path = result[9]
            individual_files = result[10]
            self.assertTrue(summary["artifact_manifest_valid"])
            self.assertTrue(artifact_validation["valid"])
            self.assertTrue(Path(archive_path).exists())
            # Per-artifact downloads.
            self.assertIsInstance(individual_files, list)
            self.assertGreater(len(individual_files), 0)
            artifact_names = [Path(entry).name for entry in individual_files]
            # Core artifacts every parse should produce.
            expected = (
                "parsed_document.json",
                "document.md",
                "chunks.jsonl",
                "artifact_manifest.json",
            )
            for required in expected:
                self.assertIn(required, artifact_names)
            # Each path actually exists on disk so Gradio can serve it.
            for path in individual_files:
                self.assertTrue(Path(path).exists(), f"missing: {path}")
            # The archive zip is a separate artifact and must NOT appear in the
            # per-artifact list (zip is the bundled-everything view).
            self.assertNotIn(Path(archive_path).name, artifact_names)
            # Summary records the per-artifact count.
            self.assertEqual(summary["individual_artifact_count"], len(individual_files))
class UploadGuardTests(unittest.TestCase):
    """Guard-rail behavior: size/page limits, missing files, tuple width."""

    def test_oversized_upload_rejected_with_clear_message(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            oversized = Path(tmp_dir) / "huge.md"
            oversized.write_text("# Big\n\n" + "x" * 4096, encoding="utf-8")
            # Shrink the byte limit so the ~4KB file trips the guard.
            with patch.object(space_app, "MAX_UPLOAD_BYTES", 1024):
                result = space_app.parse_uploaded_document(
                    _UploadedFile(str(oversized)), "Default lightweight"
                )
            summary = result[1]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("MB", summary["error"])

    def test_high_page_count_rejected(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            small_doc = Path(tmp_dir) / "doc.md"
            small_doc.write_text("# Doc\n\nSomething small.\n", encoding="utf-8")

            class _FakeProfile:
                # Far above the patched ceiling of 50 pages.
                page_count = 1000

            with patch.object(space_app, "MAX_PAGE_COUNT", 50), patch.object(
                space_app, "profile_document", return_value=_FakeProfile()
            ):
                result = space_app.parse_uploaded_document(
                    _UploadedFile(str(small_doc)), "Default lightweight"
                )
            summary = result[1]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("pages", summary["error"])

    def test_missing_upload_path_rejected(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        result = space_app.parse_uploaded_document(
            _UploadedFile("/tmp/zsgdp-does-not-exist.md"), "Default lightweight"
        )
        summary = result[1]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("missing", summary["error"].lower())

    def test_error_paths_return_full_tuple_width(self):
        # Drift guard: every return path (success + error) must yield 11 outputs
        # so the Gradio click handler doesn't error on shape mismatch.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        # No upload at all.
        result = space_app.parse_uploaded_document(None, "Default lightweight")
        self.assertEqual(len(result), 11)
        self.assertEqual(result[10], [])
        # Missing-file rejection.
        result = space_app.parse_uploaded_document(
            _UploadedFile("/tmp/zsgdp-does-not-exist-xyz.md"), "Default lightweight"
        )
        self.assertEqual(len(result), 11)
        self.assertEqual(result[10], [])

    def test_normal_upload_passes_guards(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            ok_doc = Path(tmp_dir) / "ok.md"
            ok_doc.write_text("# OK\n\nA normal document.\n", encoding="utf-8")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(ok_doc)), "Default lightweight"
            )
            summary = result[1]
            self.assertNotIn("rejected", summary)
class BatchAndZipUploadTests(unittest.TestCase):
    """Batch inputs: zip archives, multi-file lists, and chunk-detail payload."""

    def test_zip_upload_extracts_and_parses_each_doc(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        import zipfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            # Build a small zip with two markdown docs.
            first = workdir / "a.md"
            first.write_text("# Doc A\n\nFirst.\n", encoding="utf-8")
            second = workdir / "b.md"
            second.write_text("# Doc B\n\nSecond.\n", encoding="utf-8")
            archive = workdir / "batch.zip"
            with zipfile.ZipFile(archive, "w") as zf:
                zf.write(first, arcname="a.md")
                zf.write(second, arcname="b.md")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(archive)), "Default lightweight"
            )
            # Tuple width unchanged.
            self.assertEqual(len(result), 11)
            summary = result[1]
            # Batch metadata recorded.
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)
            self.assertEqual(summary["batch"]["successful_count"], 2)
            self.assertEqual(summary["batch"]["failed_count"], 0)
            self.assertEqual(len(summary["batch"]["documents"]), 2)
            # Aggregate metrics populated.
            aggregate = summary["batch"]["aggregate"]
            self.assertGreater(aggregate["total_chunks"], 0)
            self.assertGreater(aggregate["mean_quality_score"], 0.0)

    def test_multiple_files_uploaded_as_list(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            first = Path(tmp_dir) / "one.md"
            first.write_text("# One\n\nFirst doc.\n", encoding="utf-8")
            second = Path(tmp_dir) / "two.md"
            second.write_text("# Two\n\nSecond doc.\n", encoding="utf-8")
            result = space_app.parse_uploaded_document(
                [_UploadedFile(str(first)), _UploadedFile(str(second))],
                "Default lightweight",
            )
            summary = result[1]
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)

    def test_zip_with_unsupported_files_filtered_out(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        import zipfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            workdir = Path(tmp_dir)
            archive = workdir / "mixed.zip"
            first = workdir / "first.md"
            first.write_text("# First\n\nContent A.\n", encoding="utf-8")
            second = workdir / "second.md"
            second.write_text("# Second\n\nContent B.\n", encoding="utf-8")
            junk = workdir / "ignore.exe"
            junk.write_bytes(b"\x00\x01")
            with zipfile.ZipFile(archive, "w") as zf:
                zf.write(first, arcname="first.md")
                zf.write(second, arcname="second.md")
                zf.write(junk, arcname="ignore.exe")
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(archive)), "Default lightweight"
            )
            summary = result[1]
            # The two .md files parsed; the .exe was filtered out before parsing.
            self.assertIn("batch", summary)
            self.assertEqual(summary["batch"]["input_count"], 2)
            self.assertEqual(summary["batch"]["successful_count"], 2)

    def test_chunk_detail_payload_present(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            rich_doc = Path(tmp_dir) / "rich.md"
            rich_doc.write_text(
                "# Rich Doc\n\n"
                "First paragraph with some prose to chunk.\n\n"
                "Second paragraph with different content for variety.\n\n"
                "| A | B |\n| --- | --- |\n| 1 | 2 |\n",
                encoding="utf-8",
            )
            result = space_app.parse_uploaded_document(
                _UploadedFile(str(rich_doc)), "Default lightweight"
            )
            chunking_payload = result[4]
            self.assertIn("plan", chunking_payload)
            self.assertIn("detail", chunking_payload)
            detail = chunking_payload["detail"]
            self.assertGreater(detail["total_chunks"], 0)
            self.assertIn("strategies", detail)
            # Each strategy block has the expected shape.
            for strategy_name, block in detail["strategies"].items():
                self.assertIn("count", block)
                self.assertIn("samples", block)
                self.assertIn("token_count_min", block)
# Allow running this module directly: `python test_app.py`.
if __name__ == "__main__":
    unittest.main()