| import json |
|
|
| from agent.core.session_uploader import ( |
| _PERSONAL_TOKEN_ENV, |
| _resolve_token, |
| _update_upload_status, |
| _upload_dataset_card, |
| _write_claude_code_payload, |
| _write_row_payload, |
| dataset_card_readme, |
| to_claude_code_jsonl, |
| ) |
|
|
# Placeholder credentials used by the payload tests below: they are embedded in
# session data and the tests assert that the raw values never reach the output.
HF_SECRET = f"hf_{'a' * 30}"
ANTHROPIC_SECRET = f"sk-ant-{'b' * 24}"
GITHUB_SECRET = f"ghp_{'c' * 36}"
|
|
|
|
def test_dataset_card_readme_has_metadata_and_public_warning():
    """Check the generated dataset card for its metadata, links and warning text."""
    card = dataset_card_readme("lewtun/ml-intern-sessions")

    # The card must open with a `---` delimiter line.
    assert card.startswith("---\n")

    # (snippet, must_be_present) pairs, checked in order.
    expectations = (
        ('pretty_name: "ML Intern Session Traces"', True),
        ("task_categories:\n- text-generation", True),
        ("- agent-traces", True),
        ("- coding-agent", True),
        ("- ml-intern", True),
        ('path: "sessions/**/*.jsonl"', True),
        ("ML Intern demo: https://smolagents-ml-intern.hf.space", True),
        ("ML Intern CLI: https://github.com/huggingface/ml-intern", True),
        # The card must not link back to a dataset repository URL.
        ("Repository: https://huggingface.co/datasets/", False),
        (
            "**WARNING: no comprehensive redaction or human review has been performed for this dataset.**",
            True,
        ),
        ("automated best-effort scrubbing", True),
        ("Do not make this dataset public", True),
    )
    for snippet, must_be_present in expectations:
        if must_be_present:
            assert snippet in card
        else:
            assert snippet not in card
|
|
|
|
def test_upload_dataset_card_only_for_claude_code_format():
    """A README upload happens for the "claude_code" format but not for "row"."""

    class RecordingApi:
        """Stand-in API object that records the keyword arguments of each upload."""

        def __init__(self):
            self.calls = []

        def upload_file(self, **kwargs):
            self.calls.append(kwargs)

    repo_id = "lewtun/ml-intern-sessions"
    token = "hf_token"
    recorder = RecordingApi()

    # The "row" format must not trigger any upload.
    _upload_dataset_card(recorder, repo_id, token, "row")
    assert not recorder.calls

    # The "claude_code" format uploads exactly one file: the README.
    _upload_dataset_card(recorder, repo_id, token, "claude_code")
    assert len(recorder.calls) == 1
    (upload,) = recorder.calls
    assert upload["path_in_repo"] == "README.md"
    assert upload["repo_id"] == repo_id
    assert upload["repo_type"] == "dataset"
    assert upload["token"] == token
    warning_fragment = b"no comprehensive redaction or human review"
    assert warning_fragment in upload["path_or_fileobj"]
|
|
|
|
def test_personal_token_env_takes_precedence_for_hf_token(monkeypatch):
    """With both variables set, resolving "HF_TOKEN" yields the personal token."""
    environment = (
        (_PERSONAL_TOKEN_ENV, "personal-token"),
        ("HF_TOKEN", "env-token"),
    )
    for variable, value in environment:
        monkeypatch.setenv(variable, value)

    resolved = _resolve_token("HF_TOKEN")

    assert resolved == "personal-token"
|
|
|
|
def test_update_upload_status_preserves_other_uploader_fields(tmp_path):
    """Updating the personal upload fields leaves the existing upload fields intact."""
    org_url = "https://huggingface.co/datasets/org/sessions"
    personal_url = "https://huggingface.co/datasets/user/ml-intern-sessions"

    # Seed a session file that already has a finished upload recorded.
    initial_state = {
        "session_id": "123",
        "upload_status": "success",
        "upload_url": org_url,
        "personal_upload_status": "pending",
    }
    session_path = tmp_path / "session_123.json"
    session_path.write_text(json.dumps(initial_state))

    _update_upload_status(
        str(session_path),
        "personal_upload_status",
        "personal_upload_url",
        "success",
        personal_url,
    )

    stored = json.loads(session_path.read_text())
    expected_fields = {
        "upload_status": "success",
        "upload_url": org_url,
        "personal_upload_status": "success",
        "personal_upload_url": personal_url,
    }
    for field, expected in expected_fields.items():
        assert stored[field] == expected
|
|
|
|
def test_claude_code_jsonl_uses_message_timestamps():
    """The converted events' timestamps equal the per-message timestamps, in order."""
    user_message = {
        "role": "user",
        "content": "hello",
        "timestamp": "2026-01-01T00:00:01",
    }
    assistant_message = {
        "role": "assistant",
        "content": "hi",
        "timestamp": "2026-01-01T00:00:02",
    }
    tool_message = {
        "role": "tool",
        "tool_call_id": "call-1",
        "content": "ok",
        "timestamp": "2026-01-01T00:00:03",
    }
    session = {
        "session_id": "session-123",
        "model_name": "anthropic/claude-opus-4-6",
        # Differs from every message timestamp on purpose.
        "session_start_time": "2026-01-01T00:00:00",
        "messages": [user_message, assistant_message, tool_message],
    }

    converted = to_claude_code_jsonl(session)

    observed = []
    for event in converted:
        observed.append(event["timestamp"])
    assert observed == [
        "2026-01-01T00:00:01",
        "2026-01-01T00:00:02",
        "2026-01-01T00:00:03",
    ]
|
|
|
|
def test_row_payload_scrubs_messages_events_and_tools(tmp_path):
    """Secrets placed in messages, events and tools are absent from the row payload."""
    output_path = tmp_path / "row.jsonl"

    # One fake secret in each of the three sections.
    messages = [{"role": "user", "content": "token " + HF_SECRET}]
    events = [{"type": "debug", "content": "key " + ANTHROPIC_SECRET}]
    tools = [{"name": "bash", "env": "GITHUB_TOKEN=" + GITHUB_SECRET}]
    session = {
        "session_id": "session-123",
        "user_id": "lewtun",
        "session_start_time": "2026-01-01T00:00:00",
        "session_end_time": "2026-01-01T00:00:03",
        "model_name": "anthropic/claude-opus-4-6",
        "total_cost_usd": 0.01,
        "messages": messages,
        "events": events,
        "tools": tools,
    }

    _write_row_payload(session, str(output_path))

    written = output_path.read_text()
    for secret in (HF_SECRET, ANTHROPIC_SECRET, GITHUB_SECRET):
        assert secret not in written
    for marker in (
        "[REDACTED_HF_TOKEN]",
        "[REDACTED_ANTHROPIC_KEY]",
        "GITHUB_TOKEN=[REDACTED]",
    ):
        assert marker in written
|
|
|
|
def test_claude_code_payload_scrubs_messages_before_conversion(tmp_path):
    """Secrets in message content, tool-call arguments and tool output are absent from the payload."""
    output_path = tmp_path / "claude_code.jsonl"

    user_message = {
        "role": "user",
        "content": "token " + HF_SECRET,
        "timestamp": "2026-01-01T00:00:01",
    }
    # This secret sits inside the JSON-encoded tool-call arguments.
    bash_call = {
        "id": "call-1",
        "function": {
            "name": "bash",
            "arguments": json.dumps({"key": ANTHROPIC_SECRET}),
        },
    }
    assistant_message = {
        "role": "assistant",
        "content": "running tool",
        "tool_calls": [bash_call],
        "timestamp": "2026-01-01T00:00:02",
    }
    tool_message = {
        "role": "tool",
        "tool_call_id": "call-1",
        "content": "GITHUB_TOKEN=" + GITHUB_SECRET,
        "timestamp": "2026-01-01T00:00:03",
    }
    session = {
        "session_id": "session-123",
        "model_name": "anthropic/claude-opus-4-6",
        "session_start_time": "2026-01-01T00:00:00",
        "messages": [user_message, assistant_message, tool_message],
    }

    _write_claude_code_payload(session, str(output_path))

    written = output_path.read_text()
    for secret in (HF_SECRET, ANTHROPIC_SECRET, GITHUB_SECRET):
        assert secret not in written
    for marker in (
        "[REDACTED_HF_TOKEN]",
        "[REDACTED_ANTHROPIC_KEY]",
        "GITHUB_TOKEN=[REDACTED]",
    ):
        assert marker in written
|
|