Spaces:
Running on Zero
Running on Zero
| """Tests for non-UI dataset and trace export tooling.""" | |
| from __future__ import annotations | |
| import json | |
| import tempfile | |
| import unittest | |
| from collections import Counter | |
| from pathlib import Path | |
| from scripts.export_traces import export_trace_jsonl | |
| from scripts.generate_dataset import build_sft_records, write_sft_jsonl | |
| from scripts.generate_sample_traces import generate_sample_traces | |
| from scripts.prepare_curated_dataset import MODES, build_curated_records, write_jsonl | |
| from src.models.schema import TraceRecord | |
class DatasetToolingTest(unittest.TestCase):
    """Exercise the dataset builders, JSONL writers, and the trace exporter."""

    @staticmethod
    def _read_jsonl(path: Path) -> list[dict]:
        """Parse each line of the UTF-8 file at *path* as one JSON document."""
        return [
            json.loads(line)
            for line in path.read_text(encoding="utf-8").splitlines()
        ]

    @staticmethod
    def _assistant_payload(record: dict) -> dict:
        """Decode the JSON string held in the record's third message.

        The tests treat ``messages[2]`` as the assistant turn; its
        ``content`` field is expected to be a JSON-encoded object.
        """
        return json.loads(record["messages"][2]["content"])

    def test_build_sft_records(self) -> None:
        """``build_sft_records(7)`` yields seven well-formed preview records."""
        records = build_sft_records(7)
        # Assert the count before indexing so an empty result fails as a
        # readable assertion instead of an IndexError.
        self.assertEqual(len(records), 7)
        first = records[0]
        self.assertEqual(first["split"], "preview")
        self.assertIn("object_understanding", first)
        assistant_payload = self._assistant_payload(first)
        self.assertIn("persona", assistant_payload)
        self.assertIn("diary", assistant_payload)

    def test_write_sft_jsonl(self) -> None:
        """``write_sft_jsonl`` writes one parseable JSON row per record."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_path = Path(tmp_dir) / "preview.jsonl"
            write_sft_jsonl(build_sft_records(3), output_path)
            rows = self._read_jsonl(output_path)
            self.assertEqual(len(rows), 3)
            self.assertEqual(rows[0]["id"], "sft-preview-0001")

    def test_build_curated_records(self) -> None:
        """Default curated records are v1 train records with curation notes."""
        records = build_curated_records(10)
        self.assertEqual(len(records), 10)
        first = records[0]
        self.assertEqual(first["split"], "train")
        self.assertEqual(first["source"], "objectverse-diary-synthetic-curated-v1")
        self.assertIn("curation_notes", first)
        assistant_payload = self._assistant_payload(first)
        self.assertIn("persona", assistant_payload)
        self.assertIn("diary", assistant_payload)

    def test_build_curated_v2_records_has_broader_balanced_coverage(self) -> None:
        """v2 curated records cover many objects with an even mode split."""
        records = build_curated_records(200, version="v2")
        self.assertEqual(len(records), 200)
        first = records[0]
        self.assertEqual(first["source"], "objectverse-diary-synthetic-curated-v2")

        object_names = [
            record["object_understanding"]["object"]["name"]
            for record in records
        ]
        mode_counts = Counter(record["mode"] for record in records)
        object_mode_pairs = {
            (name, record["mode"]) for name, record in zip(object_names, records)
        }
        self.assertGreaterEqual(len(set(object_names)), 40)
        # Every mode in MODES must appear exactly 40 times, and no other mode
        # may appear at all.
        self.assertEqual(mode_counts, Counter({mode: 40 for mode in MODES}))
        # 200 distinct pairs means no (object, mode) combination repeats.
        self.assertEqual(len(object_mode_pairs), 200)

        self.assertIn("scene_detail", first)
        assistant_payload = self._assistant_payload(first)
        self.assertIn("persona", assistant_payload)
        self.assertIn("diary", assistant_payload)

    def test_write_curated_jsonl(self) -> None:
        """``write_jsonl`` writes one parseable JSON row per curated record."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_path = Path(tmp_dir) / "curated.jsonl"
            write_jsonl(build_curated_records(2), output_path)
            rows = self._read_jsonl(output_path)
            self.assertEqual(len(rows), 2)
            self.assertEqual(rows[0]["id"], "curated-synthetic-0001")

    def test_export_trace_jsonl(self) -> None:
        """Exporting the generated sample traces yields six valid rows."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            sample_dir = Path(tmp_dir) / "samples"
            output_path = Path(tmp_dir) / "public_traces.jsonl"
            generate_sample_traces(sample_dir)
            count = export_trace_jsonl(sample_dir, output_path)
            rows = self._read_jsonl(output_path)
            # The reported count and the number of written rows must agree.
            self.assertEqual(count, 6)
            self.assertEqual(len(rows), 6)
            # Validate every exported row, not only the first, so a malformed
            # later row cannot slip through; subTest reports each failing
            # index separately.
            for index, row in enumerate(rows):
                with self.subTest(row=index):
                    TraceRecord.model_validate(row)
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()