File size: 2,650 Bytes
bb40e1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""Single-client smoke test against the deployed HuggingFace Space.

Run from a host that can reach huggingface.co (Phase 9 external verification,
PROJECT.md Section 24.1):

    source .venv/bin/activate && source .env && python scripts/smoke_test_hf.py

Mirrors ``scripts/smoke_test_docker.py`` but talks to the public Space URL,
exercising HF's reverse proxy (HTTP-side only — the WebSocket-passthrough
check lives in ``smoke_test_hf_concurrent.py``).
"""

from __future__ import annotations

import os
import sys

from red_button.client import ShutdownGymClient
from red_button.models import ShutdownAction


def main() -> int:
    """Run the single-client smoke test against the HuggingFace Space.

    Reads the target URL from the ``HF_SPACE_API_URL`` environment variable
    (with a hard-coded fallback), then performs a reset, one ``read_file``
    step, and a state query, checking each response along the way.

    The checks are explicit ``raise AssertionError`` statements rather than
    ``assert`` so that they are still enforced when Python runs with ``-O``
    (which strips ``assert`` statements and would let the test "pass"
    without verifying anything).

    Returns:
        ``0`` when every check passes.

    Raises:
        AssertionError: If any response does not match expectations.
    """
    base_url = os.environ.get(
        "HF_SPACE_API_URL", "https://arun-sanjay-red-button.hf.space"
    )
    print(f"[hf-smoke] connecting to {base_url}")

    sync_env = ShutdownGymClient(base_url=base_url).sync()
    with sync_env:
        # 1. reset(tier=2) — same payload as the local Docker smoke test so
        #    a regression here points squarely at HF's edge / runtime, not
        #    at our environment code.
        result = sync_env.reset(tier=2)
        obs = result.observation

        if result.done is not False:
            raise AssertionError(f"reset returned done=True: {result}")
        # Check for the nested state before dereferencing it below.
        if obs.state is None:
            raise AssertionError("reset observation missing nested state")
        if obs.state.tier != 2:
            raise AssertionError(f"expected tier=2, got {obs.state.tier}")
        print(
            f"[hf-smoke] reset OK — episode_id={obs.state.episode_id} tier={obs.state.tier}"
        )

        # 2. One benign read_file call against the canonical sandbox file.
        step = sync_env.step(
            ShutdownAction(
                tool_name="read_file",
                arguments={"path": "/sandbox/problems.json"},
            )
        )
        if step.done is not False:
            raise AssertionError(f"step unexpectedly done: {step}")
        tool_output = step.observation.tool_output
        # Truthiness covers both an empty payload and a missing (None) one,
        # so a None output reports the intended message, not a TypeError.
        if not tool_output:
            raise AssertionError("expected non-empty tool_output")
        print(
            f"[hf-smoke] step OK — tool_output length={len(tool_output)} "
            f"turns_remaining={step.observation.turns_remaining}"
        )

        # 3. State endpoint round-trip.
        state = sync_env.state()
        if state.turn_count != 1:
            raise AssertionError(f"expected turn_count=1, got {state.turn_count}")
        if state.tampering_events != []:
            raise AssertionError(
                f"expected no tampering events, got {state.tampering_events}"
            )
        print(
            f"[hf-smoke] state OK — turn_count={state.turn_count} "
            f"tampering_events={state.tampering_events}"
        )

    print("HF SPACE SMOKE TEST PASSED")
    return 0


if __name__ == "__main__":
    # Script entry point: run the smoke test and hand its return value to
    # the interpreter as the process exit status.
    exit_code = main()
    sys.exit(exit_code)