File size: 6,812 Bytes
0bd7547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754345f
 
 
0bd7547
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import json

from agent.core.session_uploader import (
    _PERSONAL_TOKEN_ENV,
    _resolve_token,
    _update_upload_status,
    _upload_dataset_card,
    _write_claude_code_payload,
    _write_row_payload,
    dataset_card_readme,
    to_claude_code_jsonl,
)

# Synthetic, non-functional credentials used as fixtures by the scrubbing tests
# in this module. Each is a provider-style prefix followed by a run of a single
# repeated character.
HF_SECRET = f"hf_{'a' * 30}"
ANTHROPIC_SECRET = f"sk-ant-{'b' * 24}"
GITHUB_SECRET = f"ghp_{'c' * 36}"


def test_dataset_card_readme_has_metadata_and_public_warning():
    """Check the generated dataset card for its metadata, links and privacy warning.

    The card must open with a `---` line, contain every expected metadata
    entry, tag, link and warning sentence, and must not contain a
    "Repository:" link to the dataset hub.
    """
    card = dataset_card_readme("lewtun/ml-intern-sessions")

    # The very first line has to be the metadata delimiter.
    assert card.startswith("---\n")

    required_snippets = (
        'pretty_name: "ML Intern Session Traces"',
        "task_categories:\n- text-generation",
        "- agent-traces",
        "- coding-agent",
        "- ml-intern",
        'path: "sessions/**/*.jsonl"',
        "ML Intern demo: https://smolagents-ml-intern.hf.space",
        "ML Intern CLI: https://github.com/huggingface/ml-intern",
        "**WARNING: no comprehensive redaction or human review has been performed for this dataset.**",
        "automated best-effort scrubbing",
        "Do not make this dataset public",
    )
    for snippet in required_snippets:
        assert snippet in card

    # The card must not advertise a direct repository link.
    assert "Repository: https://huggingface.co/datasets/" not in card


def test_upload_dataset_card_only_for_claude_code_format():
    """Check that the dataset card is uploaded for `claude_code` but not for `row`.

    A recording stand-in replaces the API object so the test can inspect the
    keyword arguments passed to `upload_file` without any network access.
    """

    class _RecordingApi:
        """Collects the kwargs of every `upload_file` call in `self.calls`."""

        def __init__(self):
            self.calls = []

        def upload_file(self, **kwargs):
            self.calls.append(kwargs)

    repo_id = "lewtun/ml-intern-sessions"
    recorder = _RecordingApi()

    # The "row" format must not trigger any upload.
    _upload_dataset_card(recorder, repo_id, "hf_token", "row")
    assert recorder.calls == []

    # The "claude_code" format must trigger exactly one upload.
    _upload_dataset_card(recorder, repo_id, "hf_token", "claude_code")
    assert len(recorder.calls) == 1

    upload = recorder.calls[0]
    assert upload["path_in_repo"] == "README.md"
    assert upload["repo_id"] == repo_id
    assert upload["repo_type"] == "dataset"
    assert upload["token"] == "hf_token"

    # The uploaded payload is checked as bytes for the warning sentence.
    card_bytes = upload["path_or_fileobj"]
    assert b"no comprehensive redaction or human review" in card_bytes


def test_personal_token_env_takes_precedence_for_hf_token(monkeypatch):
    """Check that the personal-token variable wins when both variables are set.

    Both the personal-token variable and `HF_TOKEN` are set; resolving
    `HF_TOKEN` is expected to return the personal-token value.
    """
    env_overrides = (
        (_PERSONAL_TOKEN_ENV, "personal-token"),
        ("HF_TOKEN", "env-token"),
    )
    for var_name, var_value in env_overrides:
        monkeypatch.setenv(var_name, var_value)

    resolved = _resolve_token("HF_TOKEN")

    assert resolved == "personal-token"


def test_update_upload_status_preserves_other_uploader_fields(tmp_path):
    """Check that updating the personal upload fields leaves the other fields alone.

    A session file is seeded with a finished org upload and a pending
    personal upload. After the update, the org fields must be unchanged and
    the personal status/URL fields must hold the new values.
    """
    org_url = "https://huggingface.co/datasets/org/sessions"
    personal_url = "https://huggingface.co/datasets/user/ml-intern-sessions"

    seed = {
        "session_id": "123",
        "upload_status": "success",
        "upload_url": org_url,
        "personal_upload_status": "pending",
    }
    session_path = tmp_path / "session_123.json"
    session_path.write_text(json.dumps(seed))

    _update_upload_status(
        str(session_path),
        "personal_upload_status",
        "personal_upload_url",
        "success",
        personal_url,
    )

    stored = json.loads(session_path.read_text())

    # Fields from the earlier org upload survive untouched.
    assert stored["upload_status"] == "success"
    assert stored["upload_url"] == org_url

    # The personal upload fields reflect the new status and URL.
    assert stored["personal_upload_status"] == "success"
    assert stored["personal_upload_url"] == personal_url


def test_claude_code_jsonl_uses_message_timestamps():
    """Check that converted events carry the timestamps of their source messages.

    Three messages (user, assistant, tool) with distinct timestamps are
    converted; the resulting events must expose those timestamps in the same
    order, not the session start time.
    """
    expected_stamps = [
        "2026-01-01T00:00:01",
        "2026-01-01T00:00:02",
        "2026-01-01T00:00:03",
    ]
    first_stamp, second_stamp, third_stamp = expected_stamps

    user_message = {"role": "user", "content": "hello", "timestamp": first_stamp}
    assistant_message = {
        "role": "assistant",
        "content": "hi",
        "timestamp": second_stamp,
    }
    tool_message = {
        "role": "tool",
        "tool_call_id": "call-1",
        "content": "ok",
        "timestamp": third_stamp,
    }
    session = {
        "session_id": "session-123",
        "model_name": "anthropic/claude-opus-4-6",
        "session_start_time": "2026-01-01T00:00:00",
        "messages": [user_message, assistant_message, tool_message],
    }

    converted = to_claude_code_jsonl(session)

    actual_stamps = [entry["timestamp"] for entry in converted]
    assert actual_stamps == expected_stamps


def test_row_payload_scrubs_messages_events_and_tools(tmp_path):
    """Check that the row payload redacts secrets in messages, events and tools.

    One fake secret is planted in each of the three sections. The written
    file must contain none of the raw secrets and must contain the
    corresponding redaction markers.
    """
    session = {
        "session_id": "session-123",
        "user_id": "lewtun",
        "session_start_time": "2026-01-01T00:00:00",
        "session_end_time": "2026-01-01T00:00:03",
        "model_name": "anthropic/claude-opus-4-6",
        "total_cost_usd": 0.01,
        "messages": [{"role": "user", "content": "token " + HF_SECRET}],
        "events": [{"type": "debug", "content": "key " + ANTHROPIC_SECRET}],
        "tools": [{"name": "bash", "env": "GITHUB_TOKEN=" + GITHUB_SECRET}],
    }
    output_path = tmp_path / "row.jsonl"

    _write_row_payload(session, str(output_path))

    written = output_path.read_text()

    # None of the planted secrets may appear in the written file.
    for raw_secret in (HF_SECRET, ANTHROPIC_SECRET, GITHUB_SECRET):
        assert raw_secret not in written

    # Each secret must have been replaced by its redaction marker.
    for marker in (
        "[REDACTED_HF_TOKEN]",
        "[REDACTED_ANTHROPIC_KEY]",
        "GITHUB_TOKEN=[REDACTED]",
    ):
        assert marker in written


def test_claude_code_payload_scrubs_messages_before_conversion(tmp_path):
    """Check that the claude_code payload redacts secrets from every message kind.

    Fake secrets are planted in a user message, in the JSON-encoded arguments
    of an assistant tool call, and in a tool result. The written file must
    contain none of the raw secrets and must contain the redaction markers.
    """
    user_message = {
        "role": "user",
        "content": "token " + HF_SECRET,
        "timestamp": "2026-01-01T00:00:01",
    }
    # Tool-call arguments are a JSON string, so this secret sits inside
    # nested, serialized content.
    bash_call = {
        "id": "call-1",
        "function": {
            "name": "bash",
            "arguments": json.dumps({"key": ANTHROPIC_SECRET}),
        },
    }
    assistant_message = {
        "role": "assistant",
        "content": "running tool",
        "tool_calls": [bash_call],
        "timestamp": "2026-01-01T00:00:02",
    }
    tool_message = {
        "role": "tool",
        "tool_call_id": "call-1",
        "content": "GITHUB_TOKEN=" + GITHUB_SECRET,
        "timestamp": "2026-01-01T00:00:03",
    }
    session = {
        "session_id": "session-123",
        "model_name": "anthropic/claude-opus-4-6",
        "session_start_time": "2026-01-01T00:00:00",
        "messages": [user_message, assistant_message, tool_message],
    }
    output_path = tmp_path / "claude_code.jsonl"

    _write_claude_code_payload(session, str(output_path))

    written = output_path.read_text()

    # None of the planted secrets may appear in the written file.
    for raw_secret in (HF_SECRET, ANTHROPIC_SECRET, GITHUB_SECRET):
        assert raw_secret not in written

    # Each secret must have been replaced by its redaction marker.
    for marker in (
        "[REDACTED_HF_TOKEN]",
        "[REDACTED_ANTHROPIC_KEY]",
        "GITHUB_TOKEN=[REDACTED]",
    ):
        assert marker in written