Fix GPU sandbox hardware OAuth failure (#246)
Browse files
* Fix GPU sandbox hardware OAuth failure
Co-authored-by: OpenAI Codex <codex@openai.com>
* Address sandbox sleep-time review
Co-authored-by: OpenAI Codex <codex@openai.com>
---------
Co-authored-by: OpenAI Codex <codex@openai.com>
agent/tools/sandbox_client.py
CHANGED
|
@@ -65,7 +65,6 @@ MAX_TIMEOUT = 1200
|
|
| 65 |
WAIT_TIMEOUT = 600
|
| 66 |
WAIT_INTERVAL = 5
|
| 67 |
API_WAIT_TIMEOUT = 180
|
| 68 |
-
HARDWARE_REQUEST_TIMEOUT = 60
|
| 69 |
CPU_BASIC_HARDWARE = "cpu-basic"
|
| 70 |
|
| 71 |
|
|
@@ -78,58 +77,6 @@ def _is_transient_space_visibility_error(error: Exception) -> bool:
|
|
| 78 |
return "Repository Not Found" in message or "404 Client Error" in message
|
| 79 |
|
| 80 |
|
| 81 |
-
def _is_transient_space_management_error(error: Exception) -> bool:
|
| 82 |
-
"""Return True when a just-created private Space is not manageable yet."""
|
| 83 |
-
response = getattr(error, "response", None)
|
| 84 |
-
if getattr(response, "status_code", None) in {401, 404}:
|
| 85 |
-
return True
|
| 86 |
-
message = str(error)
|
| 87 |
-
return (
|
| 88 |
-
"Repository Not Found" in message
|
| 89 |
-
or "401 Client Error" in message
|
| 90 |
-
or "404 Client Error" in message
|
| 91 |
-
)
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def _request_space_hardware_with_retry(
|
| 95 |
-
api: HfApi,
|
| 96 |
-
space_id: str,
|
| 97 |
-
*,
|
| 98 |
-
hardware: str,
|
| 99 |
-
sleep_time: int | None,
|
| 100 |
-
log: Callable[[str], object],
|
| 101 |
-
check_cancel: Callable[[], object],
|
| 102 |
-
) -> None:
|
| 103 |
-
"""Request hardware, retrying while Hub permissions propagate for a new Space."""
|
| 104 |
-
deadline = time.time() + HARDWARE_REQUEST_TIMEOUT
|
| 105 |
-
attempt = 0
|
| 106 |
-
while True:
|
| 107 |
-
check_cancel()
|
| 108 |
-
try:
|
| 109 |
-
api.request_space_hardware(
|
| 110 |
-
space_id,
|
| 111 |
-
hardware=hardware,
|
| 112 |
-
sleep_time=sleep_time,
|
| 113 |
-
)
|
| 114 |
-
return
|
| 115 |
-
except Exception as e:
|
| 116 |
-
if not _is_transient_space_management_error(e):
|
| 117 |
-
raise
|
| 118 |
-
|
| 119 |
-
remaining = deadline - time.time()
|
| 120 |
-
if remaining <= 0:
|
| 121 |
-
raise
|
| 122 |
-
|
| 123 |
-
attempt += 1
|
| 124 |
-
status_code = getattr(getattr(e, "response", None), "status_code", None)
|
| 125 |
-
status = f"HTTP {status_code}" if status_code else type(e).__name__
|
| 126 |
-
log(
|
| 127 |
-
f" Hardware request not accepted yet ({status}); "
|
| 128 |
-
f"retrying ({attempt})..."
|
| 129 |
-
)
|
| 130 |
-
time.sleep(min(WAIT_INTERVAL, remaining))
|
| 131 |
-
|
| 132 |
-
|
| 133 |
_DOCKERFILE = """\
|
| 134 |
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
|
| 135 |
|
|
@@ -679,24 +626,21 @@ class Sandbox:
|
|
| 679 |
|
| 680 |
_check_cancel()
|
| 681 |
|
| 682 |
-
# ``duplicate_space``
|
| 683 |
-
#
|
| 684 |
-
# 401 on that endpoint for a
|
| 685 |
-
#
|
| 686 |
-
#
|
| 687 |
-
#
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
check_cancel=_check_cancel,
|
| 698 |
-
)
|
| 699 |
-
_log(f"Requested hardware: {hardware}")
|
| 700 |
|
| 701 |
# Inject secrets BEFORE uploading server files (which triggers rebuild).
|
| 702 |
# Secrets added after a Space is running aren't available until restart,
|
|
|
|
| 65 |
WAIT_TIMEOUT = 600
|
| 66 |
WAIT_INTERVAL = 5
|
| 67 |
API_WAIT_TIMEOUT = 180
|
|
|
|
| 68 |
CPU_BASIC_HARDWARE = "cpu-basic"
|
| 69 |
|
| 70 |
|
|
|
|
| 77 |
return "Repository Not Found" in message or "404 Client Error" in message
|
| 78 |
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
_DOCKERFILE = """\
|
| 81 |
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
|
| 82 |
|
|
|
|
| 626 |
|
| 627 |
_check_cancel()
|
| 628 |
|
| 629 |
+
# ``duplicate_space`` sends hardware and sleepTimeSeconds in the
|
| 630 |
+
# initial create request. Avoid a second /hardware call: deployed HF
|
| 631 |
+
# OAuth tokens can 401 on that endpoint for a just-created private
|
| 632 |
+
# Space even though duplication itself succeeded. We rely on the
|
| 633 |
+
# duplicate endpoint to honor sleepTimeSeconds for upgraded hardware;
|
| 634 |
+
# cpu-basic auto-sleep is fixed by the Hub.
|
| 635 |
+
_log(f"Using duplicated Space hardware: {hardware}")
|
| 636 |
+
if sleep_time is not None:
|
| 637 |
+
if hardware == CPU_BASIC_HARDWARE:
|
| 638 |
+
_log(
|
| 639 |
+
f"Requested duplicated Space sleep time: {sleep_time}s "
|
| 640 |
+
"(cpu-basic auto-sleep is fixed by the Hub)"
|
| 641 |
+
)
|
| 642 |
+
else:
|
| 643 |
+
_log(f"Using duplicated Space sleep time: {sleep_time}s")
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
# Inject secrets BEFORE uploading server files (which triggers rebuild).
|
| 646 |
# Secrets added after a Space is running aren't available until restart,
|
tests/unit/test_sandbox_private_spaces.py
CHANGED
|
@@ -3,8 +3,6 @@ import threading
|
|
| 3 |
import time
|
| 4 |
from types import SimpleNamespace
|
| 5 |
|
| 6 |
-
import pytest
|
| 7 |
-
|
| 8 |
from agent.core import telemetry
|
| 9 |
from agent.tools import sandbox_client, sandbox_tool
|
| 10 |
from agent.tools.sandbox_client import Sandbox
|
|
@@ -17,6 +15,7 @@ def _fail_metadata_update(*args, **kwargs):
|
|
| 17 |
|
| 18 |
def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
| 19 |
duplicate_kwargs = {}
|
|
|
|
| 20 |
requested_hardware = []
|
| 21 |
|
| 22 |
class FakeApi:
|
|
@@ -44,11 +43,12 @@ def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
|
| 44 |
)
|
| 45 |
monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
|
| 46 |
|
| 47 |
-
Sandbox.create(owner="alice", token="hf-token", log=
|
| 48 |
|
| 49 |
assert duplicate_kwargs["private"] is True
|
| 50 |
assert duplicate_kwargs["hardware"] == "cpu-basic"
|
| 51 |
assert requested_hardware == []
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
|
|
@@ -98,32 +98,20 @@ def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
|
|
| 98 |
assert runtime_calls == 2
|
| 99 |
|
| 100 |
|
| 101 |
-
def
|
| 102 |
-
|
| 103 |
logs: list[str] = []
|
| 104 |
-
|
| 105 |
-
class FakeResponse:
|
| 106 |
-
status_code = 401
|
| 107 |
-
|
| 108 |
-
class FakeHardware401(Exception):
|
| 109 |
-
response = FakeResponse()
|
| 110 |
-
|
| 111 |
-
def __str__(self):
|
| 112 |
-
return "401 Client Error: Repository Not Found"
|
| 113 |
|
| 114 |
class FakeApi:
|
| 115 |
def __init__(self, token=None):
|
| 116 |
self.token = token
|
| 117 |
|
| 118 |
def duplicate_space(self, **kwargs):
|
| 119 |
-
|
| 120 |
|
| 121 |
def request_space_hardware(self, space_id, hardware, sleep_time=None):
|
| 122 |
-
|
| 123 |
-
hardware_calls += 1
|
| 124 |
-
if hardware_calls == 1:
|
| 125 |
-
raise FakeHardware401()
|
| 126 |
-
return SimpleNamespace(stage="BUILDING", hardware=None)
|
| 127 |
|
| 128 |
def add_space_secret(self, *args, **kwargs):
|
| 129 |
pass
|
|
@@ -144,58 +132,62 @@ def test_sandbox_client_retries_transient_hardware_401(monkeypatch):
|
|
| 144 |
owner="alice",
|
| 145 |
token="hf-token",
|
| 146 |
hardware="t4-small",
|
|
|
|
| 147 |
log=logs.append,
|
| 148 |
)
|
| 149 |
|
| 150 |
assert sandbox.space_id.startswith("alice/sandbox-")
|
| 151 |
-
assert
|
| 152 |
-
assert
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
-
def
|
| 156 |
-
|
| 157 |
logs: list[str] = []
|
| 158 |
-
|
| 159 |
|
| 160 |
-
class
|
| 161 |
-
|
|
|
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
|
| 166 |
-
def
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
hardware="cpu-basic",
|
| 190 |
-
sleep_time=None,
|
| 191 |
-
log=logs.append,
|
| 192 |
-
check_cancel=lambda: None,
|
| 193 |
-
)
|
| 194 |
|
| 195 |
-
assert
|
| 196 |
-
assert
|
| 197 |
-
assert
|
| 198 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
|
| 201 |
def test_sandbox_tool_forces_private_spaces(monkeypatch):
|
|
|
|
| 3 |
import time
|
| 4 |
from types import SimpleNamespace
|
| 5 |
|
|
|
|
|
|
|
| 6 |
from agent.core import telemetry
|
| 7 |
from agent.tools import sandbox_client, sandbox_tool
|
| 8 |
from agent.tools.sandbox_client import Sandbox
|
|
|
|
| 15 |
|
| 16 |
def test_sandbox_client_defaults_to_private_spaces(monkeypatch):
|
| 17 |
duplicate_kwargs = {}
|
| 18 |
+
logs: list[str] = []
|
| 19 |
requested_hardware = []
|
| 20 |
|
| 21 |
class FakeApi:
|
|
|
|
| 43 |
)
|
| 44 |
monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
|
| 45 |
|
| 46 |
+
Sandbox.create(owner="alice", token="hf-token", log=logs.append)
|
| 47 |
|
| 48 |
assert duplicate_kwargs["private"] is True
|
| 49 |
assert duplicate_kwargs["hardware"] == "cpu-basic"
|
| 50 |
assert requested_hardware == []
|
| 51 |
+
assert not any("sleep time" in log for log in logs)
|
| 52 |
|
| 53 |
|
| 54 |
def test_sandbox_client_retries_transient_runtime_404(monkeypatch):
|
|
|
|
| 98 |
assert runtime_calls == 2
|
| 99 |
|
| 100 |
|
| 101 |
+
def test_sandbox_client_configures_gpu_at_duplication(monkeypatch):
|
| 102 |
+
duplicate_kwargs = {}
|
| 103 |
logs: list[str] = []
|
| 104 |
+
requested_hardware = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
class FakeApi:
|
| 107 |
def __init__(self, token=None):
|
| 108 |
self.token = token
|
| 109 |
|
| 110 |
def duplicate_space(self, **kwargs):
|
| 111 |
+
duplicate_kwargs.update(kwargs)
|
| 112 |
|
| 113 |
def request_space_hardware(self, space_id, hardware, sleep_time=None):
|
| 114 |
+
requested_hardware.append((space_id, hardware, sleep_time))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
def add_space_secret(self, *args, **kwargs):
|
| 117 |
pass
|
|
|
|
| 132 |
owner="alice",
|
| 133 |
token="hf-token",
|
| 134 |
hardware="t4-small",
|
| 135 |
+
sleep_time=2700,
|
| 136 |
log=logs.append,
|
| 137 |
)
|
| 138 |
|
| 139 |
assert sandbox.space_id.startswith("alice/sandbox-")
|
| 140 |
+
assert duplicate_kwargs["hardware"] == "t4-small"
|
| 141 |
+
assert duplicate_kwargs["sleep_time"] == 2700
|
| 142 |
+
assert requested_hardware == []
|
| 143 |
+
assert "Using duplicated Space hardware: t4-small" in logs
|
| 144 |
+
assert "Using duplicated Space sleep time: 2700s" in logs
|
| 145 |
|
| 146 |
|
| 147 |
+
def test_sandbox_client_logs_cpu_sleep_time_as_hub_fixed(monkeypatch):
|
| 148 |
+
duplicate_kwargs = {}
|
| 149 |
logs: list[str] = []
|
| 150 |
+
requested_hardware = []
|
| 151 |
|
| 152 |
+
class FakeApi:
|
| 153 |
+
def __init__(self, token=None):
|
| 154 |
+
self.token = token
|
| 155 |
|
| 156 |
+
def duplicate_space(self, **kwargs):
|
| 157 |
+
duplicate_kwargs.update(kwargs)
|
| 158 |
|
| 159 |
+
def request_space_hardware(self, space_id, hardware, sleep_time=None):
|
| 160 |
+
requested_hardware.append((space_id, hardware, sleep_time))
|
| 161 |
|
| 162 |
+
def add_space_secret(self, *args, **kwargs):
|
| 163 |
+
pass
|
| 164 |
|
| 165 |
+
def get_space_runtime(self, space_id):
|
| 166 |
+
return SimpleNamespace(stage="RUNNING", hardware="cpu-basic")
|
| 167 |
+
|
| 168 |
+
monkeypatch.setattr(sandbox_client, "HfApi", FakeApi)
|
| 169 |
+
monkeypatch.setattr(
|
| 170 |
+
Sandbox,
|
| 171 |
+
"_setup_server",
|
| 172 |
+
staticmethod(lambda *args, **kwargs: None),
|
| 173 |
+
)
|
| 174 |
+
monkeypatch.setattr(Sandbox, "_wait_for_api", lambda self, *args, **kwargs: None)
|
| 175 |
+
|
| 176 |
+
Sandbox.create(
|
| 177 |
+
owner="alice",
|
| 178 |
+
token="hf-token",
|
| 179 |
+
sleep_time=2700,
|
| 180 |
+
log=logs.append,
|
| 181 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
assert duplicate_kwargs["hardware"] == "cpu-basic"
|
| 184 |
+
assert duplicate_kwargs["sleep_time"] == 2700
|
| 185 |
+
assert requested_hardware == []
|
| 186 |
+
assert "Using duplicated Space hardware: cpu-basic" in logs
|
| 187 |
+
assert (
|
| 188 |
+
"Requested duplicated Space sleep time: 2700s "
|
| 189 |
+
"(cpu-basic auto-sleep is fixed by the Hub)"
|
| 190 |
+
) in logs
|
| 191 |
|
| 192 |
|
| 193 |
def test_sandbox_tool_forces_private_spaces(monkeypatch):
|