Spaces:
Running on Zero
Running on Zero
File size: 2,347 Bytes
import tempfile
import unittest
from pathlib import Path
from zsgdp.benchmarks.parser_quality import run_parser_benchmark
from zsgdp.cli import main
class BenchmarkTests(unittest.TestCase):
    """End-to-end checks for the parser benchmark, via the API and the CLI.

    Each test builds a throwaway directory holding a single Markdown
    document, runs the benchmark against it, and verifies the summary
    contents and/or the files written to the output directory.
    """

    # Files the library-level test expects in the output directory.
    _API_ARTIFACTS = (
        "results.json",
        "leaderboard.csv",
        "parser_runs.csv",
        "chunk_runs.csv",
        "structure_runs.csv",
        "chunk_quality.csv",
        "throughput_runs.csv",
        "ablations.json",
    )

    # Files the CLI test expects in the output directory.
    _CLI_ARTIFACTS = (
        "leaderboard.csv",
        "chunk_runs.csv",
        "structure_runs.csv",
        "chunk_quality.csv",
        "throughput_runs.csv",
    )

    # Top-level sections the library-level test expects in the summary.
    _SUMMARY_SECTIONS = (
        "structure_quality",
        "chunking_quality",
        "throughput",
        "ablation_plan",
    )

    def _make_workspace(self):
        """Create a temp workspace containing one Markdown document.

        The temporary directory is removed automatically when the test
        finishes, whether it passes or fails.

        Returns:
            A ``(docs, out)`` pair of paths: ``docs`` is an existing directory
            holding ``one.md``; ``out`` is a not-yet-created sibling path
            to be used as the benchmark output directory.
        """
        tmp = tempfile.TemporaryDirectory()
        # addCleanup runs even when an assertion fails mid-test.
        self.addCleanup(tmp.cleanup)
        root = Path(tmp.name)
        docs = root / "docs"
        docs.mkdir()
        (docs / "one.md").write_text("# One\n\nHello world", encoding="utf-8")
        return docs, root / "bench"

    def _assert_artifacts_exist(self, out, names):
        """Assert that every file in *names* exists under directory *out*.

        Each file is checked in its own sub-test so a single run reports
        all missing artifacts by name instead of stopping at the first one.
        """
        for name in names:
            with self.subTest(artifact=name):
                self.assertTrue(
                    (out / name).exists(),
                    f"benchmark did not write {name!r} to {out}",
                )

    def test_run_parser_benchmark_writes_results(self):
        """run_parser_benchmark returns a populated summary and writes its files."""
        docs, out = self._make_workspace()

        summary = run_parser_benchmark(docs, out)

        self.assertEqual(summary["document_count"], 1)
        self.assertIn(
            "fixed_token_baseline",
            summary["documents"][0]["chunk_strategy_counts"],
        )
        self.assertTrue(summary["chunk_strategy_leaderboard"])
        for section in self._SUMMARY_SECTIONS:
            with self.subTest(section=section):
                self.assertIn(section, summary)
        self._assert_artifacts_exist(out, self._API_ARTIFACTS)

    def test_benchmark_cli(self):
        """The ``benchmark`` CLI subcommand exits with 0 and writes its CSV files."""
        docs, out = self._make_workspace()

        code = main(
            [
                "benchmark",
                "--input", str(docs),
                "--output", str(out),
                "--parsers", "text",
            ]
        )

        self.assertEqual(code, 0)
        self._assert_artifacts_exist(out, self._CLI_ARTIFACTS)
# Allow running this test module directly as a script; when it is merely
# imported (e.g. by a test collector) nothing is executed here.
if __name__ == "__main__":
    unittest.main()
|