"""Tests for non-UI dataset and trace export tooling.""" from __future__ import annotations import json import tempfile import unittest from collections import Counter from pathlib import Path from scripts.export_traces import export_trace_jsonl from scripts.generate_dataset import build_sft_records, write_sft_jsonl from scripts.generate_sample_traces import generate_sample_traces from scripts.prepare_curated_dataset import MODES, build_curated_records, write_jsonl from src.models.schema import TraceRecord class DatasetToolingTest(unittest.TestCase): def test_build_sft_records(self) -> None: records = build_sft_records(7) assistant_payload = json.loads(records[0]["messages"][2]["content"]) self.assertEqual(len(records), 7) self.assertEqual(records[0]["split"], "preview") self.assertIn("object_understanding", records[0]) self.assertIn("persona", assistant_payload) self.assertIn("diary", assistant_payload) def test_write_sft_jsonl(self) -> None: with tempfile.TemporaryDirectory() as tmp_dir: output_path = Path(tmp_dir) / "preview.jsonl" write_sft_jsonl(build_sft_records(3), output_path) rows = [ json.loads(line) for line in output_path.read_text(encoding="utf-8").splitlines() ] self.assertEqual(len(rows), 3) self.assertEqual(rows[0]["id"], "sft-preview-0001") def test_build_curated_records(self) -> None: records = build_curated_records(10) assistant_payload = json.loads(records[0]["messages"][2]["content"]) self.assertEqual(len(records), 10) self.assertEqual(records[0]["split"], "train") self.assertEqual(records[0]["source"], "objectverse-diary-synthetic-curated-v1") self.assertIn("curation_notes", records[0]) self.assertIn("persona", assistant_payload) self.assertIn("diary", assistant_payload) def test_build_curated_v2_records_has_broader_balanced_coverage(self) -> None: records = build_curated_records(200, version="v2") object_names = [ record["object_understanding"]["object"]["name"] for record in records ] mode_counts = Counter(record["mode"] for record in records) object_mode_pairs = {(name, record["mode"]) for name, record in zip(object_names, records)} assistant_payload = json.loads(records[0]["messages"][2]["content"]) self.assertEqual(len(records), 200) self.assertEqual(records[0]["source"], "objectverse-diary-synthetic-curated-v2") self.assertGreaterEqual(len(set(object_names)), 40) self.assertEqual(mode_counts, Counter({mode: 40 for mode in MODES})) self.assertEqual(len(object_mode_pairs), 200) self.assertIn("scene_detail", records[0]) self.assertIn("persona", assistant_payload) self.assertIn("diary", assistant_payload) def test_write_curated_jsonl(self) -> None: with tempfile.TemporaryDirectory() as tmp_dir: output_path = Path(tmp_dir) / "curated.jsonl" write_jsonl(build_curated_records(2), output_path) rows = [ json.loads(line) for line in output_path.read_text(encoding="utf-8").splitlines() ] self.assertEqual(len(rows), 2) self.assertEqual(rows[0]["id"], "curated-synthetic-0001") def test_export_trace_jsonl(self) -> None: with tempfile.TemporaryDirectory() as tmp_dir: sample_dir = Path(tmp_dir) / "samples" output_path = Path(tmp_dir) / "public_traces.jsonl" generate_sample_traces(sample_dir) count = export_trace_jsonl(sample_dir, output_path) rows = [ json.loads(line) for line in output_path.read_text(encoding="utf-8").splitlines() ] self.assertEqual(count, 6) self.assertEqual(len(rows), 6) TraceRecord.model_validate(rows[0]) if __name__ == "__main__": unittest.main()