File size: 4,089 Bytes
bc02199
 
 
 
 
 
 
dd6cefc
bc02199
 
 
 
 
dd6cefc
bc02199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e874de
 
 
 
 
 
 
 
 
 
 
dd6cefc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e874de
 
 
 
 
 
 
 
 
 
 
 
bc02199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Tests for non-UI dataset and trace export tooling."""

from __future__ import annotations

import json
import tempfile
import unittest
from collections import Counter
from pathlib import Path

from scripts.export_traces import export_trace_jsonl
from scripts.generate_dataset import build_sft_records, write_sft_jsonl
from scripts.generate_sample_traces import generate_sample_traces
from scripts.prepare_curated_dataset import MODES, build_curated_records, write_jsonl
from src.models.schema import TraceRecord


class DatasetToolingTest(unittest.TestCase):
    """Exercise the dataset builders, JSONL writers, and trace exporter.

    Tests that touch the filesystem write into a ``TemporaryDirectory`` and
    read the output back before that directory is removed.
    """

    @staticmethod
    def _read_jsonl(path: Path) -> list[dict]:
        """Parse *path* as UTF-8 JSON Lines and return one object per line.

        Deliberately strict: a blank or malformed line raises
        ``json.JSONDecodeError`` so a writer regression fails the test
        instead of being skipped silently.
        """
        return [
            json.loads(line)
            for line in path.read_text(encoding="utf-8").splitlines()
        ]

    @staticmethod
    def _assistant_payload(record: dict) -> dict:
        """Decode the JSON stored in the third message's ``content`` field.

        NOTE(review): index 2 is presumably the assistant turn of the
        conversation -- confirm against the record builders.
        """
        return json.loads(record["messages"][2]["content"])

    def test_build_sft_records(self) -> None:
        """``build_sft_records(7)`` yields seven well-formed preview records."""
        records = build_sft_records(7)

        # Check the count before indexing so an empty result fails as a
        # readable assertion rather than an IndexError.
        self.assertEqual(len(records), 7)
        first = records[0]
        self.assertEqual(first["split"], "preview")
        self.assertIn("object_understanding", first)
        payload = self._assistant_payload(first)
        self.assertIn("persona", payload)
        self.assertIn("diary", payload)

    def test_write_sft_jsonl(self) -> None:
        """``write_sft_jsonl`` emits one JSON line per record, ids from 0001."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_path = Path(tmp_dir) / "preview.jsonl"
            write_sft_jsonl(build_sft_records(3), output_path)
            # Read inside the ``with`` block: the file vanishes on exit.
            rows = self._read_jsonl(output_path)

        self.assertEqual(len(rows), 3)
        self.assertEqual(rows[0]["id"], "sft-preview-0001")

    def test_build_curated_records(self) -> None:
        """Default curated records are v1 ``train`` rows with curation notes."""
        records = build_curated_records(10)

        # Length first, so indexing below cannot raise on an empty list.
        self.assertEqual(len(records), 10)
        first = records[0]
        self.assertEqual(first["split"], "train")
        self.assertEqual(first["source"], "objectverse-diary-synthetic-curated-v1")
        self.assertIn("curation_notes", first)
        payload = self._assistant_payload(first)
        self.assertIn("persona", payload)
        self.assertIn("diary", payload)

    def test_build_curated_v2_records_has_broader_balanced_coverage(self) -> None:
        """The v2 set of 200 is balanced across modes with no repeated pairs."""
        records = build_curated_records(200, version="v2")

        # Length first, so indexing below cannot raise on an empty list.
        self.assertEqual(len(records), 200)

        object_names = {
            record["object_understanding"]["object"]["name"]
            for record in records
        }
        mode_counts = Counter(record["mode"] for record in records)
        # A set collapses duplicates, so 200 distinct pairs means every
        # (object, mode) combination occurs exactly once.
        object_mode_pairs = {
            (record["object_understanding"]["object"]["name"], record["mode"])
            for record in records
        }

        first = records[0]
        self.assertEqual(first["source"], "objectverse-diary-synthetic-curated-v2")
        self.assertGreaterEqual(len(object_names), 40)
        self.assertEqual(mode_counts, Counter({mode: 40 for mode in MODES}))
        self.assertEqual(len(object_mode_pairs), 200)
        self.assertIn("scene_detail", first)
        payload = self._assistant_payload(first)
        self.assertIn("persona", payload)
        self.assertIn("diary", payload)

    def test_write_curated_jsonl(self) -> None:
        """``write_jsonl`` emits one JSON line per curated record."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_path = Path(tmp_dir) / "curated.jsonl"
            write_jsonl(build_curated_records(2), output_path)
            # Read inside the ``with`` block: the file vanishes on exit.
            rows = self._read_jsonl(output_path)

        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0]["id"], "curated-synthetic-0001")

    def test_export_trace_jsonl(self) -> None:
        """Exporting the generated sample traces writes six rows and returns 6."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            sample_dir = Path(tmp_dir) / "samples"
            output_path = Path(tmp_dir) / "public_traces.jsonl"
            generate_sample_traces(sample_dir)
            count = export_trace_jsonl(sample_dir, output_path)
            # Read inside the ``with`` block: the file vanishes on exit.
            rows = self._read_jsonl(output_path)

        self.assertEqual(count, 6)
        self.assertEqual(len(rows), 6)
        # NOTE(review): presumably raises when the row does not match the
        # TraceRecord schema (pydantic-style API) -- confirm.
        TraceRecord.model_validate(rows[0])


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()