lewtun HF Staff OpenAI Codex committed on
Commit
60f7188
·
unverified ·
1 Parent(s): 60474c1

Fix GPU sandbox hardware OAuth failure (#246)

Browse files

* Fix GPU sandbox hardware OAuth failure

Co-authored-by: OpenAI Codex <codex@openai.com>

* Address sandbox sleep-time review

Co-authored-by: OpenAI Codex <codex@openai.com>

---------

Co-authored-by: OpenAI Codex <codex@openai.com>

agent/tools/sandbox_client.py CHANGED
@@ -65,7 +65,6 @@ MAX_TIMEOUT = 1200
65
  WAIT_TIMEOUT = 600
66
  WAIT_INTERVAL = 5
67
  API_WAIT_TIMEOUT = 180
68
- HARDWARE_REQUEST_TIMEOUT = 60
69
  CPU_BASIC_HARDWARE = "cpu-basic"
70
 
71
 
@@ -78,58 +77,6 @@ def _is_transient_space_visibility_error(error: Exception) -> bool:
78
  return "Repository Not Found" in message or "404 Client Error" in message
79
 
80
 
81
- def _is_transient_space_management_error(error: Exception) -> bool:
82
- """Return True when a just-created private Space is not manageable yet."""
83
- response = getattr(error, "response", None)
84
- if getattr(response, "status_code", None) in {401, 404}:
85
- return True
86
- message = str(error)
87
- return (
88
- "Repository Not Found" in message
89
- or "401 Client Error" in message
90
- or "404 Client Error" in message
91
- )
92
-
93
-
94
- def _request_space_hardware_with_retry(
95
- api: HfApi,
96
- space_id: str,
97
- *,
98
- hardware: str,
99
- sleep_time: int | None,
100
- log: Callable[[str], object],
101
- check_cancel: Callable[[], object],
102
- ) -> None:
103
- """Request hardware, retrying while Hub permissions propagate for a new Space."""
104
- deadline = time.time() + HARDWARE_REQUEST_TIMEOUT
105
- attempt = 0
106
- while True:
107
- check_cancel()
108
- try:
109
- api.request_space_hardware(
110
- space_id,
111
- hardware=hardware,
112
- sleep_time=sleep_time,
113
- )
114
- return
115
- except Exception as e:
116
- if not _is_transient_space_management_error(e):
117
- raise
118
-
119
- remaining = deadline - time.time()
120
- if remaining <= 0:
121
- raise
122
-
123
- attempt += 1
124
- status_code = getattr(getattr(e, "response", None), "status_code", None)
125
- status = f"HTTP {status_code}" if status_code else type(e).__name__
126
- log(
127
- f" Hardware request not accepted yet ({status}); "
128
- f"retrying ({attempt})..."
129
- )
130
- time.sleep(min(WAIT_INTERVAL, remaining))
131
-
132
-
133
  _DOCKERFILE = """\
134
  FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
135
 
@@ -679,24 +626,21 @@ class Sandbox:
679
 
680
  _check_cancel()
681
 
682
- # ``duplicate_space`` already receives the target hardware. The extra
683
- # /hardware call is useful for paid tiers, but hosted OAuth tokens can
684
- # 401 on that endpoint for a fresh private Space even after duplication
685
- # succeeds. Avoid the redundant call for default CPU sandboxes when no
686
- # auto-sleep timer is requested; with sleep_time set, the hardware
687
- # endpoint is still needed to configure auto-sleep.
688
- if hardware == CPU_BASIC_HARDWARE and sleep_time is None:
689
- _log(f"Using duplicated Space hardware: {hardware}")
690
- else:
691
- _request_space_hardware_with_retry(
692
- api,
693
- space_id,
694
- hardware=hardware,
695
- sleep_time=sleep_time,
696
- log=_log,
697
- check_cancel=_check_cancel,
698
- )
699
- _log(f"Requested hardware: {hardware}")
700
 
701
  # Inject secrets BEFORE uploading server files (which triggers rebuild).
702
  # Secrets added after a Space is running aren't available until restart,
 
65
  WAIT_TIMEOUT = 600
66
  WAIT_INTERVAL = 5
67
  API_WAIT_TIMEOUT = 180
 
68
  CPU_BASIC_HARDWARE = "cpu-basic"
69
 
70
 
 
77
  return "Repository Not Found" in message or "404 Client Error" in message
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  _DOCKERFILE = """\
81
  FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
82
 
 
626
 
627
  _check_cancel()
628
 
629
+ # ``duplicate_space`` sends hardware and sleepTimeSeconds in the
630
+ # initial create request. Avoid a second /hardware call: deployed HF
631
+ # OAuth tokens can 401 on that endpoint for a just-created private
632
+ # Space even though duplication itself succeeded. We rely on the
633
+ # duplicate endpoint to honor sleepTimeSeconds for upgraded hardware;
634
+ # cpu-basic auto-sleep is fixed by the Hub.
635
+ _log(f"Using duplicated Space hardware: {hardware}")
636
+ if sleep_time is not None:
637
+ if hardware == CPU_BASIC_HARDWARE:
638
+ _log(
639
+ f"Requested duplicated Space sleep time: {sleep_time}s "
640
+ "(cpu-basic auto-sleep is fixed by the Hub)"
641
+ )
642
+ else:
643
+ _log(f"Using duplicated Space sleep time: {sleep_time}s")
 
 
 
644
 
645
  # Inject secrets BEFORE uploading server files (which triggers rebuild).
646
  # Secrets added after a Space is running aren't available until restart,
tests/unit/test_sandbox_private_spaces.py CHANGED
@@ -3,8 +3,6 @@ import threading
3
  import time
4
  from types import SimpleNamespace
5
 
6
- import pytest
7
-
8
  from agent.core import telemetry
9
  from agent.tools import sandbox_client, sandbox_tool
10
  from agent.tools.sandbox_client import Sandbox
@@ -17,6 +15,7 @@ def _fail_metadata_update(*args, **kwargs):
17
 
18
  def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
19
  duplicate_kwargs = {}
 
20
  requested_hardware = []
21
 
22
  class FakeApi:
@@ -44,11 +43,12 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
44
  )
45
  monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
46
 
47
- Sandbox.create(owner="alice", token="hf-token", log=lambda msg: None)
48
 
49
  assert duplicate_kwargs["private"] is True
50
  assert duplicate_kwargs["hardware"] == "cpu-basic"
51
  assert requested_hardware == []
 
52
 
53
 
54
  def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
@@ -98,32 +98,20 @@ def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
98
  assert runtime_calls == 2
99
 
100
 
101
- def test_sandbox_client_retries_transient_hardware_401(monkeypatch):
102
- hardware_calls = 0
103
  logs: list[str] = []
104
-
105
- class FakeResponse:
106
- status_code = 401
107
-
108
- class FakeHardware401(Exception):
109
- response = FakeResponse()
110
-
111
- def __str__(self):
112
- return "401 Client Error: Repository Not Found"
113
 
114
  class FakeApi:
115
  def __init__(self, token=None):
116
  self.token = token
117
 
118
  def duplicate_space(self, **kwargs):
119
- pass
120
 
121
  def request_space_hardware(self, space_id, hardware, sleep_time=None):
122
- nonlocal hardware_calls
123
- hardware_calls += 1
124
- if hardware_calls == 1:
125
- raise FakeHardware401()
126
- return SimpleNamespace(stage="BUILDING", hardware=None)
127
 
128
  def add_space_secret(self, *args, **kwargs):
129
  pass
@@ -144,58 +132,62 @@ def test_sandbox_client_retries_transient_hardware_401(monkeypatch):
144
  owner="alice",
145
  token="hf-token",
146
  hardware="t4-small",
 
147
  log=logs.append,
148
  )
149
 
150
  assert sandbox.space_id.startswith("alice/sandbox-")
151
- assert hardware_calls == 2
152
- assert any("Hardware request not accepted yet (HTTP 401)" in log for log in logs)
 
 
 
153
 
154
 
155
- def test_sandbox_hardware_retry_reraises_after_timeout(monkeypatch):
156
- calls = 0
157
  logs: list[str] = []
158
- sleeps: list[float] = []
159
 
160
- class FakeResponse:
161
- status_code = 401
 
162
 
163
- class FakeHardware401(Exception):
164
- response = FakeResponse()
165
 
166
- def __str__(self):
167
- return "401 Client Error: Repository Not Found"
168
 
169
- first_error = FakeHardware401("first")
170
- timeout_error = FakeHardware401("timeout")
171
 
172
- class FakeApi:
173
- def request_space_hardware(self, space_id, hardware, sleep_time=None):
174
- nonlocal calls
175
- calls += 1
176
- if calls == 1:
177
- raise first_error
178
- raise timeout_error
179
-
180
- timestamps = iter([100.0, 100.0, 161.0])
181
-
182
- monkeypatch.setattr(sandbox_client.time, "time", lambda: next(timestamps))
183
- monkeypatch.setattr(sandbox_client.time, "sleep", sleeps.append)
184
-
185
- with pytest.raises(FakeHardware401) as excinfo:
186
- sandbox_client._request_space_hardware_with_retry(
187
- FakeApi(),
188
- "alice/sandbox-12345678",
189
- hardware="cpu-basic",
190
- sleep_time=None,
191
- log=logs.append,
192
- check_cancel=lambda: None,
193
- )
194
 
195
- assert excinfo.value is timeout_error
196
- assert calls == 2
197
- assert sleeps == [sandbox_client.WAIT_INTERVAL]
198
- assert len(logs) == 1
 
 
 
 
199
 
200
 
201
  def test_sandbox_tool_forces_private_spaces(monkeypatch):
 
3
  import time
4
  from types import SimpleNamespace
5
 
 
 
6
  from agent.core import telemetry
7
  from agent.tools import sandbox_client, sandbox_tool
8
  from agent.tools.sandbox_client import Sandbox
 
15
 
16
  def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
17
  duplicate_kwargs = {}
18
+ logs: list[str] = []
19
  requested_hardware = []
20
 
21
  class FakeApi:
 
43
  )
44
  monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
45
 
46
+ Sandbox.create(owner="alice", token="hf-token", log=logs.append)
47
 
48
  assert duplicate_kwargs["private"] is True
49
  assert duplicate_kwargs["hardware"] == "cpu-basic"
50
  assert requested_hardware == []
51
+ assert not any("sleep time" in log for log in logs)
52
 
53
 
54
  def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
 
98
  assert runtime_calls == 2
99
 
100
 
101
+ def test_sandbox_client_configures_gpu_at_duplication(monkeypatch):
102
+ duplicate_kwargs = {}
103
  logs: list[str] = []
104
+ requested_hardware = []
 
 
 
 
 
 
 
 
105
 
106
  class FakeApi:
107
  def __init__(self, token=None):
108
  self.token = token
109
 
110
  def duplicate_space(self, **kwargs):
111
+ duplicate_kwargs.update(kwargs)
112
 
113
  def request_space_hardware(self, space_id, hardware, sleep_time=None):
114
+ requested_hardware.append((space_id, hardware, sleep_time))
 
 
 
 
115
 
116
  def add_space_secret(self, *args, **kwargs):
117
  pass
 
132
  owner="alice",
133
  token="hf-token",
134
  hardware="t4-small",
135
+ sleep_time=2700,
136
  log=logs.append,
137
  )
138
 
139
  assert sandbox.space_id.startswith("alice/sandbox-")
140
+ assert duplicate_kwargs["hardware"] == "t4-small"
141
+ assert duplicate_kwargs["sleep_time"] == 2700
142
+ assert requested_hardware == []
143
+ assert "Using duplicated Space hardware: t4-small" in logs
144
+ assert "Using duplicated Space sleep time: 2700s" in logs
145
 
146
 
147
+ def test_sandbox_client_logs_cpu_sleep_time_as_hub_fixed(monkeypatch):
148
+ duplicate_kwargs = {}
149
  logs: list[str] = []
150
+ requested_hardware = []
151
 
152
+ class FakeApi:
153
+ def __init__(self, token=None):
154
+ self.token = token
155
 
156
+ def duplicate_space(self, **kwargs):
157
+ duplicate_kwargs.update(kwargs)
158
 
159
+ def request_space_hardware(self, space_id, hardware, sleep_time=None):
160
+ requested_hardware.append((space_id, hardware, sleep_time))
161
 
162
+ def add_space_secret(self, *args, **kwargs):
163
+ pass
164
 
165
+ def get_space_runtime(self, space_id):
166
+ return SimpleNamespace(stage="RUNNING", hardware="cpu-basic")
167
+
168
+ monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
169
+ monkeypatch.setattr(
170
+ Sandbox,
171
+ "_setup_server",
172
+ staticmethod(lambda *args, **kwargs: None),
173
+ )
174
+ monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
175
+
176
+ Sandbox.create(
177
+ owner="alice",
178
+ token="hf-token",
179
+ sleep_time=2700,
180
+ log=logs.append,
181
+ )
 
 
 
 
 
182
 
183
+ assert duplicate_kwargs["hardware"] == "cpu-basic"
184
+ assert duplicate_kwargs["sleep_time"] == 2700
185
+ assert requested_hardware == []
186
+ assert "Using duplicated Space hardware: cpu-basic" in logs
187
+ assert (
188
+ "Requested duplicated Space sleep time: 2700s "
189
+ "(cpu-basic auto-sleep is fixed by the Hub)"
190
+ ) in logs
191
 
192
 
193
  def test_sandbox_tool_forces_private_spaces(monkeypatch):