kyars committed
Commit 81f75fe · verified · 1 Parent(s): 532abe9

Upload folder using huggingface_hub
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends git && \
+     rm -rf /var/lib/apt/lists/*
+
+ COPY . /app/env
+ WORKDIR /app/env
+
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv && \
+         mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv sync --no-install-project --no-editable
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv sync --no-editable
+
+ FROM ${BASE_IMAGE}
+ WORKDIR /app
+ COPY --from=builder /app/env/.venv /app/.venv
+ COPY --from=builder /app/env /app/env
+ ENV PATH="/app/.venv/bin:$PATH"
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:8000/health || exit 1
+ ENV ENABLE_WEB_INTERFACE=true
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,75 @@
  ---
- title: Compute Market Env
- emoji: 👀
- colorFrom: gray
- colorTo: pink
+ title: Compute Market Environment Server
+ emoji: 📊
+ colorFrom: blue
+ colorTo: green
  sdk: docker
  pinned: false
+ app_port: 8000
+ base_path: /web
+ tags:
+   - openenv
+   - multi-agent
+   - compute-allocation
+   - market-simulation
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Compute Market Environment
+
+ An OpenEnv environment for training a single allocator/trader in a scarce-GPU market with scripted background actors, hidden incentives, delayed rewards, and partial observability.
+
+ ## What v1 implements
+
+ - One trained agent: the allocator/trader
+ - Scripted counterparties: an urgent tenant, a cost-sensitive tenant, and a broker
+ - Jobs with deadlines, value, dependencies, and delayed payoff
+ - Actions: `bid_for_capacity`, `accept_offer`, `propose_swap`, `schedule_job`, `delay_job`, `inspect_market`, `noop`
+ - Reward = completed job value minus compute spend, missed-deadline penalties, and idle-hoarding penalties
+ - A separate training helper for TRL/Colab in `training/minimal_grpo_rollout.py`
+
+ ## Quick Start
+
+ ```python
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+ with ComputeMarketEnv(base_url="http://localhost:8000") as env:
+     result = env.reset(seed=7)
+     print(result.observation.market_price)
+     print(result.observation.free_gpus)
+
+     result = env.step(
+         ComputeMarketAction(
+             action_type="bid_for_capacity",
+             gpu_count=4,
+             price_per_gpu=6.5,
+             duration=3,
+         )
+     )
+     print(result.reward)
+     print(result.observation.budget_remaining)
+ ```
+
+ ## Local Development
+
+ ```bash
+ uv sync --extra dev
+ uv run pytest -q
+ uv run uvicorn server.app:app --host 0.0.0.0 --port 8000
+ openenv validate --verbose
+ ```
+
+ ## Docker
+
+ ```bash
+ docker build -t compute-market-env:latest -f Dockerfile .
+ docker run -p 8000:8000 compute-market-env:latest
+ ```
+
+ ## Environment Loop
+
+ 1. The agent observes the market price, public free GPUs, visible offers, jobs, and public actor signals.
+ 2. The agent takes one action.
+ 3. The environment advances one tick.
+ 4. Scripted actors update demand and offers.
+ 5. Jobs progress, complete, pause, or miss deadlines.
+ 6. The environment returns the next observation and the realized reward for that tick (a rollout sketch follows this diff).
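The six-step loop above maps one-to-one onto the client API from the Quick Start. A minimal rollout sketch, assuming a server is reachable at `http://localhost:8000`; the greedy bid/schedule rule is illustrative only, not part of the environment:

```python
from compute_market_env import ComputeMarketAction, ComputeMarketEnv

with ComputeMarketEnv(base_url="http://localhost:8000") as env:
    result = env.reset(seed=0)
    total_reward = 0.0

    while not result.done:
        obs = result.observation
        # Illustrative rule: start any pending job that fits in idle owned
        # GPUs; otherwise bid slightly above the clearing price for the next
        # pending job; otherwise do nothing and let the tick advance.
        pending = [job for job in obs.jobs if job.status == "pending"]
        startable = [job for job in pending if job.gpu_count <= obs.idle_owned_gpus]
        if startable:
            action = ComputeMarketAction(action_type="schedule_job", job_id=startable[0].job_id)
        elif pending and obs.free_gpus >= pending[0].gpu_count and obs.budget_remaining > 0:
            action = ComputeMarketAction(
                action_type="bid_for_capacity",
                gpu_count=pending[0].gpu_count,
                price_per_gpu=obs.market_price + 0.5,
                duration=pending[0].total_duration,
            )
        else:
            action = ComputeMarketAction(action_type="noop")
        result = env.step(action)
        total_reward += result.reward

    print("episode return:", round(total_reward, 2))
```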
__init__.py ADDED
@@ -0,0 +1,27 @@
+ """Compute Market environment exports."""
+
+ from .client import ComputeMarketEnv
+ from .models import (
+     ActorProfile,
+     ActorSignal,
+     ComputeMarketAction,
+     ComputeMarketObservation,
+     ComputeMarketState,
+     JobRecord,
+     MarketEvent,
+     MarketOffer,
+     ReservationRecord,
+ )
+
+ __all__ = [
+     "ActorProfile",
+     "ActorSignal",
+     "ComputeMarketAction",
+     "ComputeMarketEnv",
+     "ComputeMarketObservation",
+     "ComputeMarketState",
+     "JobRecord",
+     "MarketEvent",
+     "MarketOffer",
+     "ReservationRecord",
+ ]
client.py ADDED
@@ -0,0 +1,80 @@
+ """Compute Market environment client."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from openenv.core.client_types import StepResult
+ from openenv.core.env_client import EnvClient
+
+ from .models import (
+     ActorProfile,
+     ActorSignal,
+     ComputeMarketAction,
+     ComputeMarketObservation,
+     ComputeMarketState,
+     JobRecord,
+     MarketEvent,
+     MarketOffer,
+     ReservationRecord,
+ )
+
+
+ class ComputeMarketEnv(
+     EnvClient[ComputeMarketAction, ComputeMarketObservation, ComputeMarketState]
+ ):
+     """Persistent client for the compute market environment."""
+
+     def _step_payload(self, action: ComputeMarketAction) -> dict[str, Any]:
+         return action.model_dump(exclude_none=True)
+
+     def _parse_result(self, payload: dict[str, Any]) -> StepResult[ComputeMarketObservation]:
+         obs_data = payload.get("observation", {})
+         observation = ComputeMarketObservation(
+             scenario_variant=obs_data.get("scenario_variant", "baseline"),
+             current_tick=obs_data.get("current_tick", 0),
+             max_ticks=obs_data.get("max_ticks", 0),
+             total_gpus=obs_data.get("total_gpus", 0),
+             free_gpus=obs_data.get("free_gpus", 0),
+             owned_gpus=obs_data.get("owned_gpus", 0),
+             idle_owned_gpus=obs_data.get("idle_owned_gpus", 0),
+             budget_remaining=obs_data.get("budget_remaining", 0.0),
+             market_price=obs_data.get("market_price", 0.0),
+             jobs=[JobRecord(**item) for item in obs_data.get("jobs", [])],
+             visible_offers=[MarketOffer(**item) for item in obs_data.get("visible_offers", [])],
+             recent_events=[MarketEvent(**item) for item in obs_data.get("recent_events", [])],
+             actor_signals=[ActorSignal(**item) for item in obs_data.get("actor_signals", [])],
+             done=payload.get("done", False),
+             reward=payload.get("reward", 0.0),
+             metadata=obs_data.get("metadata", {}),
+         )
+         return StepResult(
+             observation=observation,
+             reward=payload.get("reward", 0.0),
+             done=payload.get("done", False),
+         )
+
+     def _parse_state(self, payload: dict[str, Any]) -> ComputeMarketState:
+         return ComputeMarketState(
+             episode_id=payload.get("episode_id", ""),
+             step_count=payload.get("step_count", 0),
+             scenario_seed=payload.get("scenario_seed", 0),
+             scenario_variant=payload.get("scenario_variant", "baseline"),
+             current_tick=payload.get("current_tick", 0),
+             max_ticks=payload.get("max_ticks", 0),
+             total_gpus=payload.get("total_gpus", 0),
+             free_gpus=payload.get("free_gpus", 0),
+             owned_gpus=payload.get("owned_gpus", 0),
+             idle_owned_gpus=payload.get("idle_owned_gpus", 0),
+             budget_remaining=payload.get("budget_remaining", 0.0),
+             market_price=payload.get("market_price", 0.0),
+             cumulative_reward=payload.get("cumulative_reward", 0.0),
+             external_allocated_gpus=payload.get("external_allocated_gpus", 0),
+             done=payload.get("done", False),
+             jobs=[JobRecord(**item) for item in payload.get("jobs", [])],
+             visible_offers=[MarketOffer(**item) for item in payload.get("visible_offers", [])],
+             reservations=[ReservationRecord(**item) for item in payload.get("reservations", [])],
+             actor_signals=[ActorSignal(**item) for item in payload.get("actor_signals", [])],
+             hidden_actors=[ActorProfile(**item) for item in payload.get("hidden_actors", [])],
+             recent_events=[MarketEvent(**item) for item in payload.get("recent_events", [])],
+         )
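Every field in `_parse_result` and `_parse_state` is read with `.get(...)` and a default, so a sparse server payload still parses. The same tolerance is visible on the models themselves; a small sketch, assuming the `Observation` base class, like these models, requires no constructor arguments:

```python
from compute_market_env import ComputeMarketObservation, JobRecord

# Defaults mirror the .get(...) fallbacks in the client parser above.
obs = ComputeMarketObservation()
print(obs.scenario_variant, obs.market_price, obs.jobs)  # baseline 0.0 []

# Nested records still validate normally when present.
obs = ComputeMarketObservation(
    jobs=[
        JobRecord(
            job_id="job-a",
            gpu_count=2,
            total_duration=2,
            remaining_duration=2,
            deadline=4,
            value=50.0,
        )
    ]
)
print(obs.jobs[0].status)  # pending
```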
examples/random_policy.py ADDED
@@ -0,0 +1,25 @@
+ """Tiny local smoke example for the compute market environment."""
+
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+
+ with ComputeMarketEnv(base_url="http://localhost:8000") as env:
+     result = env.reset(seed=5)
+     print("reset", result.observation.market_price, result.observation.free_gpus)
+
+     result = env.step(
+         ComputeMarketAction(
+             action_type="bid_for_capacity",
+             gpu_count=min(4, max(1, result.observation.free_gpus)),
+             price_per_gpu=max(6.0, result.observation.market_price + 0.5),
+             duration=3,
+         )
+     )
+     print("bid", result.reward)
+
+     result = env.step(ComputeMarketAction(action_type="schedule_job", job_id="job-a"))
+     print("schedule", result.reward)
+
+     while not result.done:
+         result = env.step(ComputeMarketAction(action_type="noop"))
+         print("tick", result.observation.current_tick, result.reward)
models.py ADDED
@@ -0,0 +1,138 @@
+ """Data models for the Compute Market environment."""
+
+ from __future__ import annotations
+
+ from typing import Literal
+
+ from openenv.core.env_server.types import Action, Observation, State
+ from pydantic import BaseModel, Field
+
+
+ class JobRecord(BaseModel):
+     """Represents an agent-owned job in the simulated compute market."""
+
+     job_id: str
+     gpu_count: int = Field(..., ge=1)
+     total_duration: int = Field(..., ge=1)
+     remaining_duration: int = Field(..., ge=0)
+     deadline: int = Field(..., ge=1)
+     value: float = Field(..., ge=0.0)
+     priority: int = Field(default=1, ge=1)
+     depends_on: list[str] = Field(default_factory=list)
+     status: Literal["pending", "running", "paused", "completed", "missed"] = "pending"
+     delay_count: int = Field(default=0, ge=0)
+     started_at: int | None = None
+     completed_at: int | None = None
+
+
+ class MarketOffer(BaseModel):
+     """Visible capacity offer from a scripted actor."""
+
+     offer_id: str
+     actor_id: str
+     gpu_count: int = Field(..., ge=1)
+     price_per_gpu: float = Field(..., ge=0.0)
+     duration: int = Field(..., ge=1)
+     expires_at_tick: int = Field(..., ge=0)
+     offer_type: Literal["broker", "swap"] = "broker"
+
+
+ class ReservationRecord(BaseModel):
+     """Capacity currently owned by the agent."""
+
+     reservation_id: str
+     source: str
+     gpu_count: int = Field(..., ge=1)
+     remaining_ticks: int = Field(..., ge=0)
+     price_per_gpu: float = Field(..., ge=0.0)
+     acquired_at_tick: int = Field(..., ge=0)
+
+
+ class MarketEvent(BaseModel):
+     """Human-readable event surfaced in observations."""
+
+     tick: int = Field(..., ge=0)
+     event_type: str
+     message: str
+
+
+ class ActorSignal(BaseModel):
+     """Public signal about a scripted actor."""
+
+     actor_id: str
+     visible_behavior: Literal["aggressive", "steady", "opportunistic"]
+     pressure_hint: Literal["low", "medium", "high"]
+     last_seen_bid: float = Field(..., ge=0.0)
+
+
+ class ActorProfile(BaseModel):
+     """Hidden actor configuration kept in the control-plane state."""
+
+     actor_id: str
+     policy_type: Literal["urgent_tenant", "cost_sensitive_tenant", "broker"]
+     max_bid: float = Field(..., ge=0.0)
+     preferred_gpu_count: int = Field(..., ge=1)
+     visible_behavior: Literal["aggressive", "steady", "opportunistic"]
+     swap_floor: float = Field(..., ge=0.0)
+
+
+ class ComputeMarketAction(Action):
+     """Single-step action for the compute market."""
+
+     action_type: Literal[
+         "bid_for_capacity",
+         "accept_offer",
+         "propose_swap",
+         "schedule_job",
+         "delay_job",
+         "inspect_market",
+         "noop",
+     ]
+     job_id: str | None = Field(default=None)
+     offer_id: str | None = Field(default=None)
+     actor_id: str | None = Field(default=None)
+     gpu_count: int = Field(default=0, ge=0)
+     price_per_gpu: float = Field(default=0.0, ge=0.0)
+     duration: int = Field(default=1, ge=1)
+
+
+ class ComputeMarketObservation(Observation):
+     """Partial observation exposed to the trained agent."""
+
+     scenario_variant: str = "baseline"
+     current_tick: int = Field(default=0, ge=0)
+     max_ticks: int = Field(default=0, ge=0)
+     total_gpus: int = Field(default=0, ge=0)
+     free_gpus: int = Field(default=0, ge=0)
+     owned_gpus: int = Field(default=0, ge=0)
+     idle_owned_gpus: int = Field(default=0, ge=0)
+     budget_remaining: float = Field(default=0.0)
+     market_price: float = Field(default=0.0, ge=0.0)
+     jobs: list[JobRecord] = Field(default_factory=list)
+     visible_offers: list[MarketOffer] = Field(default_factory=list)
+     recent_events: list[MarketEvent] = Field(default_factory=list)
+     actor_signals: list[ActorSignal] = Field(default_factory=list)
+
+
+ class ComputeMarketState(State):
+     """Full control-plane state including hidden actor data."""
+
+     scenario_seed: int = 0
+     scenario_variant: str = "baseline"
+     current_tick: int = 0
+     max_ticks: int = 0
+     total_gpus: int = 0
+     free_gpus: int = 0
+     owned_gpus: int = 0
+     idle_owned_gpus: int = 0
+     budget_remaining: float = 0.0
+     market_price: float = 0.0
+     cumulative_reward: float = 0.0
+     external_allocated_gpus: int = 0
+     done: bool = False
+     jobs: list[JobRecord] = Field(default_factory=list)
+     visible_offers: list[MarketOffer] = Field(default_factory=list)
+     reservations: list[ReservationRecord] = Field(default_factory=list)
+     actor_signals: list[ActorSignal] = Field(default_factory=list)
+     hidden_actors: list[ActorProfile] = Field(default_factory=list)
+     recent_events: list[MarketEvent] = Field(default_factory=list)
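The `Field(..., ge=...)` constraints and `Literal` action types reject malformed input client-side, before anything reaches the server. A sketch of the standard Pydantic v2 behavior these models inherit:

```python
from pydantic import ValidationError

from compute_market_env import ComputeMarketAction

# Only action_type is required; optional identifiers default to None and are
# dropped from the wire payload by the client's model_dump(exclude_none=True).
action = ComputeMarketAction(action_type="noop")
print(action.model_dump(exclude_none=True))

# gpu_count is constrained to ge=0, and action_type must be one of the Literals.
try:
    ComputeMarketAction(action_type="bid_for_capacity", gpu_count=-1)
except ValidationError as exc:
    print(exc.error_count(), "validation error(s)")
```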
openenv.yaml ADDED
@@ -0,0 +1,6 @@
+ spec_version: 1
+ name: compute_market_env
+ type: space
+ runtime: fastapi
+ app: server.app:app
+ port: 8000
output.md ADDED
@@ -0,0 +1,64 @@
+ ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
+    \\   /|    Num examples = 64 | Num Epochs = 1 | Total steps = 20
+ O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
+ \        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
+  "-____-"     Trainable parameters = 66,060,288 of 4,088,528,384 (1.62% trained)
+ WARNING 03-08 19:58:35 [input_processor.py:168] vLLM has deprecated support for supporting different tokenizers for different LoRAs. By default, vLLM uses base model's tokenizer. If you are using a LoRA with its own tokenizer, consider specifying `--tokenizer [lora_path]` to use the LoRA tokenizer.
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ Unsloth: Will smartly offload gradients to save VRAM!
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ [20/20 05:11, Epoch 0/1]
+ Step | Training Loss | reward | reward_std | completions/mean_length | completions/min_length | completions/max_length | completions/clipped_ratio | completions/mean_terminated_length | completions/min_terminated_length | completions/max_terminated_length | sampling/sampling_logp_difference/mean | sampling/sampling_logp_difference/max | sampling/importance_sampling_ratio/min | sampling/importance_sampling_ratio/mean | sampling/importance_sampling_ratio/max | kl | rewards/reward_env_return/mean | rewards/reward_env_return/std | rewards/reward_valid_action/mean | rewards/reward_valid_action/std | rewards/reward_job_completion/mean | rewards/reward_job_completion/std
+ 1 | 0.000000 | 0.000000 | 0.000000 | 142.000000 | 92.000000 | 192.000000 | 0.500000 | 92.000000 | 92.000000 | 92.000000 | 0 | 0 | 0 | 0 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 2 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 3 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 4 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 5 | -0.000000 | 0.000000 | 0.000000 | 120.500000 | 49.000000 | 192.000000 | 0.500000 | 49.000000 | 49.000000 | 49.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 6 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 7 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 8 | -0.000000 | 0.000000 | 0.000000 | 160.000000 | 128.000000 | 192.000000 | 0.500000 | 128.000000 | 128.000000 | 128.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 9 | 0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 10 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 11 | 0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 12 | 0.000000 | 0.000000 | 0.000000 | 191.000000 | 190.000000 | 192.000000 | 0.500000 | 190.000000 | 190.000000 | 190.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 13 | 0.000000 | 0.000000 | 0.000000 | 131.500000 | 71.000000 | 192.000000 | 0.500000 | 71.000000 | 71.000000 | 71.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 14 | -0.000000 | 0.000000 | 0.000000 | 190.000000 | 188.000000 | 192.000000 | 0.500000 | 188.000000 | 188.000000 | 188.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 15 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 16 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 17 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 18 | -0.000000 | 0.000000 | 0.000000 | 122.000000 | 52.000000 | 192.000000 | 0.500000 | 52.000000 | 52.000000 | 52.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 19 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 20 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ TrainOutput(global_step=20, training_loss=-2.682549142670951e-13, metrics={'train_runtime': 381.8669, 'train_samples_per_second': 0.105, 'train_steps_per_second': 0.052, 'total_flos': 0.0, 'train_loss': -2.682549142670951e-13})
pyproject.toml ADDED
@@ -0,0 +1,36 @@
+ [build-system]
+ requires = ["setuptools>=69", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-compute-market-env"
+ version = "0.1.0"
+ description = "Compute allocation market environment for OpenEnv"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "openenv-core[core]>=0.2.1",
+     "pydantic>=2.7.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "openenv-core[cli]>=0.2.1",
+     "pytest>=8.2.0",
+     "ruff>=0.7.0",
+ ]
+ train = [
+     "trl>=0.24.0",
+     "transformers>=4.56.0",
+ ]
+
+ [project.scripts]
+ server = "compute_market_env.server.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = ["compute_market_env", "compute_market_env.server"]
+ package-dir = {"compute_market_env" = ".", "compute_market_env.server" = "server"}
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
server/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Server exports for the Compute Market environment."""
+
+ from .compute_market_environment import ComputeMarketEnvironment
+
+ __all__ = ["ComputeMarketEnvironment"]
server/app.py ADDED
@@ -0,0 +1,32 @@
+ """FastAPI app for the Compute Market environment."""
+
+ try:
+     from openenv.core.env_server import create_app
+ except ImportError:
+     from openenv.core.env_server.http_server import create_app
+
+ try:
+     from ..models import ComputeMarketAction, ComputeMarketObservation
+     from .compute_market_environment import create_environment_from_env
+ except ImportError:
+     from models import ComputeMarketAction, ComputeMarketObservation
+     from server.compute_market_environment import create_environment_from_env
+
+
+ app = create_app(
+     create_environment_from_env,
+     ComputeMarketAction,
+     ComputeMarketObservation,
+     env_name="compute_market_env",
+     max_concurrent_envs=8,
+ )
+
+
+ def main(host: str = "0.0.0.0", port: int = 8000) -> None:
+     import uvicorn
+
+     uvicorn.run(app, host=host, port=port)
+
+
+ if __name__ == "__main__":
+     main()
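Once the app is running (via `main()`, uvicorn, or the Docker image), the `/health` endpoint that the Dockerfile's `HEALTHCHECK` polls can be smoke-tested by hand. A sketch, assuming the `requests` library is installed and a local server is on port 8000:

```python
import requests

# Same endpoint the Dockerfile HEALTHCHECK polls with curl.
resp = requests.get("http://localhost:8000/health", timeout=3)
resp.raise_for_status()
print("server healthy:", resp.status_code)
```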
server/compute_market_environment.py ADDED
@@ -0,0 +1,725 @@
+ """Compute Market environment implementation."""
+
+ from __future__ import annotations
+
+ import os
+ import random
+ from uuid import uuid4
+
+ try:
+     from openenv.core.env_server.interfaces import Environment
+ except ImportError:
+     from openenv.core.env_server import Environment
+
+ try:
+     from ..models import (
+         ActorProfile,
+         ActorSignal,
+         ComputeMarketAction,
+         ComputeMarketObservation,
+         ComputeMarketState,
+         JobRecord,
+         MarketEvent,
+         MarketOffer,
+         ReservationRecord,
+     )
+ except ImportError:
+     from models import (
+         ActorProfile,
+         ActorSignal,
+         ComputeMarketAction,
+         ComputeMarketObservation,
+         ComputeMarketState,
+         JobRecord,
+         MarketEvent,
+         MarketOffer,
+         ReservationRecord,
+     )
+
+
+ class ComputeMarketEnvironment(
+     Environment[ComputeMarketAction, ComputeMarketObservation, ComputeMarketState]
+ ):
+     """Single-agent compute allocation market with scripted counterparties."""
+
+     SUPPORTS_CONCURRENT_SESSIONS = True
+
+     def __init__(
+         self,
+         total_gpus: int = 8,
+         initial_budget: float = 150.0,
+         max_ticks: int = 12,
+         default_seed: int = 0,
+     ) -> None:
+         self.base_total_gpus = total_gpus
+         self.base_initial_budget = initial_budget
+         self.base_max_ticks = max_ticks
+         self.total_gpus = total_gpus
+         self.initial_budget = initial_budget
+         self.max_ticks = max_ticks
+         self.default_seed = default_seed
+         self._scenario_variant = "baseline"
+         self._price_bias = 0.0
+         self._broker_enabled = True
+         self._state = ComputeMarketState(
+             episode_id=str(uuid4()),
+             step_count=0,
+             scenario_seed=default_seed,
+             scenario_variant="baseline",
+             max_ticks=max_ticks,
+             total_gpus=total_gpus,
+             budget_remaining=initial_budget,
+         )
+         self._rng = random.Random(default_seed)
+         self._jobs: list[JobRecord] = []
+         self._reservations: list[ReservationRecord] = []
+         self._visible_offers: list[MarketOffer] = []
+         self._hidden_actors: list[ActorProfile] = []
+         self._actor_signals: list[ActorSignal] = []
+         self._recent_events: list[MarketEvent] = []
+         self._current_tick = 0
+         self._market_price = 0.0
+         self._free_gpus = total_gpus
+         self._external_allocated_gpus = 0
+         self._budget_remaining = initial_budget
+         self._cumulative_reward = 0.0
+         self._done = False
+
+     def reset(
+         self,
+         seed: int | None = None,
+         episode_id: str | None = None,
+         **kwargs,
+     ) -> ComputeMarketObservation:
+         scenario_seed = self.default_seed if seed is None else seed
+         self._apply_scenario_variant(kwargs.get("scenario_variant", "baseline"))
+         self._rng = random.Random(scenario_seed)
+         self._current_tick = 0
+         self._done = False
+         self._budget_remaining = float(self.initial_budget)
+         self._cumulative_reward = 0.0
+         self._jobs = self._build_jobs()
+         self._reservations = []
+         self._hidden_actors = self._build_actors()
+         self._visible_offers = []
+         self._actor_signals = []
+         self._recent_events = [
+             MarketEvent(
+                 tick=0,
+                 event_type="reset",
+                 message=f"Scenario '{self._scenario_variant}' initialized with scripted counterparties.",
+             )
+         ]
+         self._refresh_market()
+         self._state = self._snapshot_state(
+             episode_id=episode_id or str(uuid4()),
+             step_count=0,
+             scenario_seed=scenario_seed,
+         )
+         return self._build_observation(
+             0.0,
+             False,
+             {"status": "ready", "scenario_variant": self._scenario_variant},
+         )
+
+     def step(self, action: ComputeMarketAction) -> ComputeMarketObservation:  # type: ignore[override]
+         if self._done:
+             return self._build_observation(
+                 0.0,
+                 True,
+                 {"error": "Episode already finished."},
+             )
+
+         self._state.step_count += 1
+         reward = 0.0
+         action_events: list[MarketEvent] = []
+         error: str | None = None
+
+         if action.action_type == "bid_for_capacity":
+             reward, error, action_events = self._handle_bid(action)
+         elif action.action_type == "accept_offer":
+             reward, error, action_events = self._handle_accept_offer(action)
+         elif action.action_type == "propose_swap":
+             reward, error, action_events = self._handle_swap(action)
+         elif action.action_type == "schedule_job":
+             reward, error, action_events = self._handle_schedule(action)
+         elif action.action_type == "delay_job":
+             reward, error, action_events = self._handle_delay(action)
+         elif action.action_type == "inspect_market":
+             action_events = [
+                 self._event(
+                     "inspect",
+                     f"Market inspected: spot price ${self._market_price:.2f}, free GPUs {self._free_gpus}.",
+                 )
+             ]
+             reward -= 0.25
+         elif action.action_type == "noop":
+             action_events = [self._event("noop", "No action taken this tick.")]
+         else:
+             error = f"Unsupported action type: {action.action_type}"
+             reward -= 2.0
+
+         advance_reward, advance_events = self._advance_tick()
+         total_reward = round(reward + advance_reward, 2)
+         combined_events = action_events + advance_events
+         self._recent_events = combined_events[-6:]
+         self._cumulative_reward = round(self._cumulative_reward + total_reward, 2)
+         self._state = self._snapshot_state(
+             episode_id=self._state.episode_id,
+             step_count=self._state.step_count,
+             scenario_seed=self._state.scenario_seed,
+         )
+         metadata = {
+             "events": [event.model_dump() for event in combined_events],
+             "scenario_variant": self._scenario_variant,
+         }
+         if error:
+             metadata["error"] = error
+         return self._build_observation(total_reward, self._done, metadata)
+
+     @property
+     def state(self) -> ComputeMarketState:
+         return self._state
+
+     def _apply_scenario_variant(self, variant: str) -> None:
+         allowed = {
+             "baseline",
+             "tight_capacity",
+             "price_shock",
+             "policy_shift",
+             "job_mix",
+         }
+         self._scenario_variant = variant if variant in allowed else "baseline"
+         self.total_gpus = self.base_total_gpus
+         self.initial_budget = self.base_initial_budget
+         self.max_ticks = self.base_max_ticks
+         self._price_bias = 0.0
+         self._broker_enabled = True
+
+         if self._scenario_variant == "tight_capacity":
+             self.total_gpus = max(4, self.base_total_gpus - 2)
+         elif self._scenario_variant == "price_shock":
+             self._price_bias = 2.25
+         elif self._scenario_variant == "policy_shift":
+             self._broker_enabled = False
+         elif self._scenario_variant == "job_mix":
+             self.max_ticks = self.base_max_ticks + 1
+
+     def _build_jobs(self) -> list[JobRecord]:
+         jitter = self._rng.randint(-4, 4)
+         if self._scenario_variant == "job_mix":
+             return [
+                 JobRecord(
+                     job_id="job-a",
+                     gpu_count=3,
+                     total_duration=3,
+                     remaining_duration=3,
+                     deadline=6,
+                     value=82 + jitter,
+                     priority=2,
+                 ),
+                 JobRecord(
+                     job_id="job-b",
+                     gpu_count=2,
+                     total_duration=1,
+                     remaining_duration=1,
+                     deadline=4,
+                     value=36 + self._rng.randint(-2, 2),
+                     priority=3,
+                 ),
+                 JobRecord(
+                     job_id="job-c",
+                     gpu_count=3,
+                     total_duration=2,
+                     remaining_duration=2,
+                     deadline=8,
+                     value=58 + self._rng.randint(-3, 3),
+                     priority=2,
+                     depends_on=["job-b"],
+                 ),
+             ]
+
+         return [
+             JobRecord(
+                 job_id="job-a",
+                 gpu_count=4,
+                 total_duration=2,
+                 remaining_duration=2,
+                 deadline=4,
+                 value=100 + jitter,
+                 priority=3,
+             ),
+             JobRecord(
+                 job_id="job-b",
+                 gpu_count=2,
+                 total_duration=2,
+                 remaining_duration=2,
+                 deadline=7,
+                 value=46 + self._rng.randint(-3, 3),
+                 priority=2,
+             ),
+             JobRecord(
+                 job_id="job-c",
+                 gpu_count=1,
+                 total_duration=1,
+                 remaining_duration=1,
+                 deadline=8,
+                 value=24 + self._rng.randint(-2, 2),
+                 priority=1,
+                 depends_on=["job-a"],
+             ),
+         ]
+
+     def _build_actors(self) -> list[ActorProfile]:
+         actors = [
+             ActorProfile(
+                 actor_id="urgent-tenant",
+                 policy_type="urgent_tenant",
+                 max_bid=round(7.0 + self._rng.uniform(0.5, 1.5), 2),
+                 preferred_gpu_count=4 + self._rng.randint(0, 2),
+                 visible_behavior="aggressive",
+                 swap_floor=round(6.0 + self._rng.uniform(0.2, 0.8), 2),
+             ),
+             ActorProfile(
+                 actor_id="budget-tenant",
+                 policy_type="cost_sensitive_tenant",
+                 max_bid=round(4.5 + self._rng.uniform(0.2, 1.0), 2),
+                 preferred_gpu_count=2 + self._rng.randint(0, 1),
+                 visible_behavior="steady",
+                 swap_floor=round(4.0 + self._rng.uniform(0.2, 0.8), 2),
+             ),
+             ActorProfile(
+                 actor_id="broker-1",
+                 policy_type="broker",
+                 max_bid=round(6.0 + self._rng.uniform(0.2, 1.2), 2),
+                 preferred_gpu_count=3 + self._rng.randint(0, 2),
+                 visible_behavior="opportunistic",
+                 swap_floor=round(5.0 + self._rng.uniform(0.2, 0.8), 2),
+             ),
+         ]
+
+         if self._scenario_variant == "policy_shift":
+             actors[0].max_bid = round(max(3.5, actors[0].max_bid - 2.0), 2)
+             actors[0].visible_behavior = "steady"
+             actors[1].max_bid = round(actors[1].max_bid + 1.4, 2)
+             actors[1].preferred_gpu_count += 1
+             actors[1].visible_behavior = "aggressive"
+
+         if not self._broker_enabled:
+             actors = [actor for actor in actors if actor.policy_type != "broker"]
+         return actors
+
+     def _handle_bid(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if action.gpu_count <= 0:
+             return -2.0, "gpu_count must be positive.", []
+         if action.price_per_gpu <= 0:
+             return -2.0, "price_per_gpu must be positive.", []
+         if action.gpu_count > self._free_gpus:
+             return -2.0, f"Only {self._free_gpus} public GPUs are available this tick.", []
+         if self._owned_gpus() + action.gpu_count > self.total_gpus:
+             return -2.0, "Cluster capacity would be exceeded.", []
+         if action.price_per_gpu < self._market_price:
+             return -1.0, f"Bid ${action.price_per_gpu:.2f} is below current clearing price ${self._market_price:.2f}.", []
+         total_cost = round(action.gpu_count * action.price_per_gpu * action.duration, 2)
+         if total_cost > self._budget_remaining:
+             return -2.0, "Insufficient budget for bid.", []
+
+         self._budget_remaining = round(self._budget_remaining - total_cost, 2)
+         self._reservations.append(
+             ReservationRecord(
+                 reservation_id=f"res-{uuid4().hex[:8]}",
+                 source="spot-market",
+                 gpu_count=action.gpu_count,
+                 remaining_ticks=action.duration,
+                 price_per_gpu=action.price_per_gpu,
+                 acquired_at_tick=self._current_tick,
+             )
+         )
+         return (
+             -total_cost,
+             None,
+             [
+                 self._event(
+                     "bid_won",
+                     f"Won {action.gpu_count} GPU(s) for {action.duration} tick(s) at ${action.price_per_gpu:.2f}/GPU.",
+                 )
+             ],
+         )
+
+     def _handle_accept_offer(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.offer_id:
+             return -2.0, "offer_id is required.", []
+         offer = next((item for item in self._visible_offers if item.offer_id == action.offer_id), None)
+         if offer is None:
+             return -2.0, f"Offer {action.offer_id} is not available.", []
+         if self._owned_gpus() + offer.gpu_count > self.total_gpus:
+             return -2.0, "Cluster capacity would be exceeded.", []
+
+         total_cost = round(offer.gpu_count * offer.price_per_gpu * offer.duration, 2)
+         if total_cost > self._budget_remaining:
+             return -2.0, "Insufficient budget for offer.", []
+
+         self._budget_remaining = round(self._budget_remaining - total_cost, 2)
+         self._reservations.append(
+             ReservationRecord(
+                 reservation_id=f"res-{uuid4().hex[:8]}",
+                 source=offer.actor_id,
+                 gpu_count=offer.gpu_count,
+                 remaining_ticks=offer.duration,
+                 price_per_gpu=offer.price_per_gpu,
+                 acquired_at_tick=self._current_tick,
+             )
+         )
+         self._visible_offers = [item for item in self._visible_offers if item.offer_id != offer.offer_id]
+         return (
+             -total_cost,
+             None,
+             [
+                 self._event(
+                     "offer_accepted",
+                     f"Accepted {offer.offer_type} offer from {offer.actor_id} for {offer.gpu_count} GPU(s).",
+                 )
+             ],
+         )
+
+     def _handle_swap(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.actor_id:
+             return -2.0, "actor_id is required.", []
+         if action.gpu_count <= 0 or action.price_per_gpu <= 0:
+             return -2.0, "gpu_count and price_per_gpu must be positive.", []
+         actor = next((item for item in self._hidden_actors if item.actor_id == action.actor_id), None)
+         if actor is None:
+             return -2.0, f"Unknown actor {action.actor_id}.", []
+         if self._owned_gpus() + action.gpu_count > self.total_gpus:
+             return -2.0, "Cluster capacity would be exceeded.", []
+         total_cost = round(action.gpu_count * action.price_per_gpu * action.duration, 2)
+         if total_cost > self._budget_remaining:
+             return -2.0, "Insufficient budget for swap.", []
+         if action.price_per_gpu < actor.swap_floor:
+             return -1.0, f"{actor.actor_id} rejected the swap; offered price is below its floor.", []
+
+         self._budget_remaining = round(self._budget_remaining - total_cost, 2)
+         self._reservations.append(
+             ReservationRecord(
+                 reservation_id=f"res-{uuid4().hex[:8]}",
+                 source=f"swap:{actor.actor_id}",
+                 gpu_count=action.gpu_count,
+                 remaining_ticks=action.duration,
+                 price_per_gpu=action.price_per_gpu,
+                 acquired_at_tick=self._current_tick,
+             )
+         )
+         return (
+             -total_cost,
+             None,
+             [
+                 self._event(
+                     "swap_accepted",
+                     f"{actor.actor_id} transferred {action.gpu_count} GPU(s) at ${action.price_per_gpu:.2f}/GPU.",
+                 )
+             ],
+         )
+
+     def _handle_schedule(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.job_id:
+             return -2.0, "job_id is required.", []
+         job = self._job(action.job_id)
+         if job is None:
+             return -2.0, f"Unknown job {action.job_id}.", []
+         if job.status in {"completed", "missed"}:
+             return -1.0, f"Job {action.job_id} is already terminal.", []
+         if not self._deps_completed(job):
+             return -1.0, f"Job {action.job_id} is blocked on dependencies {job.depends_on}.", []
+         if self._idle_owned_gpus() < job.gpu_count:
+             return -1.0, f"Need {job.gpu_count} idle owned GPU(s) to start {job.job_id}.", []
+
+         job.status = "running"
+         if job.started_at is None:
+             job.started_at = self._current_tick
+         return (
+             1.0,
+             None,
+             [
+                 self._event(
+                     "job_started",
+                     f"Scheduled {job.job_id} using {job.gpu_count} GPU(s).",
+                 )
+             ],
+         )
+
+     def _handle_delay(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.job_id:
+             return -2.0, "job_id is required.", []
+         job = self._job(action.job_id)
+         if job is None:
+             return -2.0, f"Unknown job {action.job_id}.", []
+         if job.status in {"completed", "missed"}:
+             return -1.0, f"Job {job.job_id} is already terminal.", []
+         job.status = "paused" if job.status == "running" else "pending"
+         job.delay_count += 1
+         return (
+             -1.0,
+             None,
+             [
+                 self._event(
+                     "job_delayed",
+                     f"Delayed {job.job_id}; slack shrinks while the deadline stays fixed.",
+                 )
+             ],
+         )
+
+     def _advance_tick(self) -> tuple[float, list[MarketEvent]]:
+         tick_reward = 0.0
+         events: list[MarketEvent] = []
+         available_owned = self._owned_gpus()
+         used_gpus = 0
+
+         running_jobs = sorted(
+             [job for job in self._jobs if job.status == "running"],
+             key=lambda item: (-item.priority, item.job_id),
+         )
+         for job in running_jobs:
+             if used_gpus + job.gpu_count <= available_owned:
+                 used_gpus += job.gpu_count
+                 job.remaining_duration -= 1
+                 events.append(
+                     self._event(
+                         "job_progress",
+                         f"{job.job_id} progressed; {job.remaining_duration} tick(s) remaining.",
+                     )
+                 )
+                 if job.remaining_duration == 0:
+                     job.status = "completed"
+                     job.completed_at = self._current_tick + 1
+                     tick_reward += job.value
+                     events.append(
+                         self._event(
+                             "job_completed",
+                             f"{job.job_id} completed before deadline and earned ${job.value:.2f}.",
+                         )
+                     )
+             else:
+                 job.status = "paused"
+                 tick_reward -= 3.0
+                 events.append(
+                     self._event(
+                         "job_paused",
+                         f"{job.job_id} paused because owned capacity dropped below demand.",
+                     )
+                 )
+
+         idle_owned = max(0, available_owned - used_gpus)
+         if idle_owned > 0:
+             idle_penalty = round(0.5 * idle_owned, 2)
+             tick_reward -= idle_penalty
+             events.append(
+                 self._event(
+                     "idle_penalty",
+                     f"Paid ${idle_penalty:.2f} idle-hoarding penalty for {idle_owned} unused owned GPU(s).",
+                 )
+             )
+
+         for reservation in self._reservations:
+             reservation.remaining_ticks = max(0, reservation.remaining_ticks - 1)
+         expired = [item for item in self._reservations if item.remaining_ticks == 0]
+         self._reservations = [item for item in self._reservations if item.remaining_ticks > 0]
+         for reservation in expired:
+             events.append(
+                 self._event(
+                     "reservation_expired",
+                     f"Reservation {reservation.reservation_id} from {reservation.source} expired.",
+                 )
+             )
+
+         next_tick = self._current_tick + 1
+         for job in self._jobs:
+             if job.status not in {"completed", "missed"} and next_tick > job.deadline:
+                 job.status = "missed"
+                 penalty = round(job.value * 0.6, 2)
+                 tick_reward -= penalty
+                 events.append(
+                     self._event(
+                         "deadline_missed",
+                         f"{job.job_id} missed its deadline and incurred ${penalty:.2f} penalty.",
+                     )
+                 )
+
+         self._current_tick = next_tick
+         self._done = (
+             self._current_tick >= self.max_ticks
+             or all(job.status in {"completed", "missed"} for job in self._jobs)
+             or self._budget_remaining <= 0.0
+         )
+
+         if not self._done:
+             self._refresh_market()
+             events.extend(self._market_events_for_tick())
+         else:
+             self._visible_offers = []
+             self._actor_signals = []
+             self._free_gpus = max(0, self.total_gpus - self._owned_gpus())
+
+         return round(tick_reward, 2), events
+
+     def _refresh_market(self) -> None:
+         owned = self._owned_gpus()
+         remaining_cluster = max(0, self.total_gpus - owned)
+         base_price = 4.0 + 0.3 * self._current_tick + self._rng.uniform(0.0, 1.0) + self._price_bias
+         actor_signals: list[ActorSignal] = []
+         visible_offers: list[MarketOffer] = []
+         external_demand = 0
+
+         for actor in self._hidden_actors:
+             if actor.policy_type == "urgent_tenant":
+                 gpu_demand = max(2, actor.preferred_gpu_count - (self._current_tick // 3))
+                 bid = round(actor.max_bid - 0.15 * self._current_tick, 2)
+                 pressure = "high" if gpu_demand >= 4 else "medium"
+             elif actor.policy_type == "cost_sensitive_tenant":
+                 gpu_demand = max(1, actor.preferred_gpu_count - (self._current_tick // 4))
+                 bid = round(actor.max_bid - 0.1 * max(0, self._current_tick - 1), 2)
+                 pressure = "medium" if gpu_demand >= 2 else "low"
+             else:
+                 gpu_demand = actor.preferred_gpu_count
+                 bid = round(actor.max_bid + 0.2 * self._current_tick, 2)
+                 pressure = "medium"
+                 if self._broker_enabled:
+                     visible_offers.append(
+                         MarketOffer(
+                             offer_id=f"offer-{self._current_tick}-{actor.actor_id}",
+                             actor_id=actor.actor_id,
+                             gpu_count=min(
+                                 remaining_cluster or actor.preferred_gpu_count,
+                                 actor.preferred_gpu_count,
+                             ),
+                             price_per_gpu=round(bid + 0.6, 2),
+                             duration=2,
+                             expires_at_tick=self._current_tick + 1,
+                             offer_type="broker",
+                         )
+                     )
+
+             if actor.policy_type != "broker":
+                 external_demand += gpu_demand
+             actor_signals.append(
+                 ActorSignal(
+                     actor_id=actor.actor_id,
+                     visible_behavior=actor.visible_behavior,
+                     pressure_hint=pressure,
+                     last_seen_bid=max(0.0, bid),
+                 )
+             )
+
+         self._external_allocated_gpus = min(remaining_cluster, external_demand)
+         self._free_gpus = max(0, remaining_cluster - self._external_allocated_gpus)
+         pressure_bump = 0.45 * (external_demand / max(1, self.total_gpus))
+         self._market_price = round(base_price + pressure_bump, 2)
+         self._visible_offers = [
+             offer for offer in visible_offers if offer.gpu_count > 0 and offer.expires_at_tick >= self._current_tick
+         ]
+         self._actor_signals = actor_signals
+
+     def _market_events_for_tick(self) -> list[MarketEvent]:
+         messages = [
+             self._event(
+                 "market_tick",
+                 f"Tick {self._current_tick}: spot price ${self._market_price:.2f}, public free GPUs {self._free_gpus}.",
+             )
+         ]
+         for signal in self._actor_signals:
+             messages.append(
+                 self._event(
+                     "actor_signal",
+                     f"{signal.actor_id} looks {signal.visible_behavior} with {signal.pressure_hint} pressure.",
+                 )
+             )
+         for offer in self._visible_offers:
+             messages.append(
+                 self._event(
+                     "offer_visible",
+                     f"{offer.actor_id} posted {offer.gpu_count} GPU(s) at ${offer.price_per_gpu:.2f}/GPU for {offer.duration} tick(s).",
+                 )
+             )
+         return messages
+
+     def _snapshot_state(
+         self,
+         episode_id: str,
+         step_count: int,
+         scenario_seed: int,
+     ) -> ComputeMarketState:
+         return ComputeMarketState(
+             episode_id=episode_id,
+             step_count=step_count,
+             scenario_seed=scenario_seed,
+             scenario_variant=self._scenario_variant,
+             current_tick=self._current_tick,
+             max_ticks=self.max_ticks,
+             total_gpus=self.total_gpus,
+             free_gpus=self._free_gpus,
+             owned_gpus=self._owned_gpus(),
+             idle_owned_gpus=self._idle_owned_gpus(),
+             budget_remaining=self._budget_remaining,
+             market_price=self._market_price,
+             cumulative_reward=self._cumulative_reward,
+             external_allocated_gpus=self._external_allocated_gpus,
+             done=self._done,
+             jobs=[job.model_copy(deep=True) for job in self._jobs],
+             visible_offers=[offer.model_copy(deep=True) for offer in self._visible_offers],
+             reservations=[reservation.model_copy(deep=True) for reservation in self._reservations],
+             actor_signals=[signal.model_copy(deep=True) for signal in self._actor_signals],
+             hidden_actors=[actor.model_copy(deep=True) for actor in self._hidden_actors],
+             recent_events=[event.model_copy(deep=True) for event in self._recent_events],
+         )
+
+     def _build_observation(
+         self,
+         reward: float,
+         done: bool,
+         metadata: dict,
+     ) -> ComputeMarketObservation:
+         return ComputeMarketObservation(
+             scenario_variant=self._scenario_variant,
+             current_tick=self._current_tick,
+             max_ticks=self.max_ticks,
+             total_gpus=self.total_gpus,
+             free_gpus=self._free_gpus,
+             owned_gpus=self._owned_gpus(),
+             idle_owned_gpus=self._idle_owned_gpus(),
+             budget_remaining=self._budget_remaining,
+             market_price=self._market_price,
+             jobs=[job.model_copy(deep=True) for job in self._jobs],
+             visible_offers=[offer.model_copy(deep=True) for offer in self._visible_offers],
+             recent_events=[event.model_copy(deep=True) for event in self._recent_events],
+             actor_signals=[signal.model_copy(deep=True) for signal in self._actor_signals],
+             done=done,
+             reward=reward,
+             metadata=metadata,
+         )
+
+     def _deps_completed(self, job: JobRecord) -> bool:
+         if not job.depends_on:
+             return True
+         completed = {item.job_id for item in self._jobs if item.status == "completed"}
+         return all(dep in completed for dep in job.depends_on)
+
+     def _job(self, job_id: str) -> JobRecord | None:
+         return next((job for job in self._jobs if job.job_id == job_id), None)
+
+     def _owned_gpus(self) -> int:
+         return sum(reservation.gpu_count for reservation in self._reservations)
+
+     def _idle_owned_gpus(self) -> int:
+         running_gpu_demand = sum(job.gpu_count for job in self._jobs if job.status == "running")
+         return max(0, self._owned_gpus() - running_gpu_demand)
+
+     def _event(self, event_type: str, message: str) -> MarketEvent:
+         return MarketEvent(tick=self._current_tick, event_type=event_type, message=message)
+
+
+ def create_environment_from_env() -> ComputeMarketEnvironment:
+     """Factory used by the FastAPI app and tests."""
+
+     return ComputeMarketEnvironment(
+         total_gpus=int(os.getenv("COMPUTE_MARKET_TOTAL_GPUS", "8")),
+         initial_budget=float(os.getenv("COMPUTE_MARKET_INITIAL_BUDGET", "150")),
+         max_ticks=int(os.getenv("COMPUTE_MARKET_MAX_TICKS", "12")),
+         default_seed=int(os.getenv("COMPUTE_MARKET_DEFAULT_SEED", "0")),
+     )
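The environment can also be driven in-process, without the HTTP layer, exactly as the tests below do. A sketch tracing the bid accounting in `_handle_bid`, with parameters copied from the test suite: the budget drops by `gpu_count * price_per_gpu * duration`, and the tick reward is that cost plus the 0.5/GPU idle-hoarding penalty applied in `_advance_tick` while the new reservation sits unused:

```python
from compute_market_env import ComputeMarketAction
from compute_market_env.server.compute_market_environment import ComputeMarketEnvironment

env = ComputeMarketEnvironment(total_gpus=12, initial_budget=200.0, max_ticks=8)
obs = env.reset(seed=3)
budget_before = obs.budget_remaining  # 200.0

obs = env.step(
    ComputeMarketAction(
        action_type="bid_for_capacity",
        gpu_count=4,
        price_per_gpu=8.0,
        duration=3,
    )
)
print(budget_before - obs.budget_remaining)  # 96.0 = 4 * 8.0 * 3
print(obs.reward)  # -96.0 minus the idle penalty on the 4 unused owned GPUs
```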
server/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ openenv-core[core]>=0.2.1
+ uvicorn>=0.30.0
tests/test_compute_market_environment.py ADDED
@@ -0,0 +1,63 @@
+ from compute_market_env import ComputeMarketAction
+ from compute_market_env.server.compute_market_environment import ComputeMarketEnvironment
+
+
+ def test_reset_is_deterministic_for_fixed_seed():
+     env = ComputeMarketEnvironment(default_seed=1)
+     obs1 = env.reset(seed=11)
+     obs2 = env.reset(seed=11)
+
+     assert obs1.market_price == obs2.market_price
+     assert obs1.free_gpus == obs2.free_gpus
+     assert [offer.price_per_gpu for offer in obs1.visible_offers] == [
+         offer.price_per_gpu for offer in obs2.visible_offers
+     ]
+
+
+ def test_bid_schedule_and_complete_job():
+     env = ComputeMarketEnvironment(total_gpus=12, initial_budget=200.0, max_ticks=8, default_seed=0)
+     env.reset(seed=3)
+
+     result = env.step(
+         ComputeMarketAction(
+             action_type="bid_for_capacity",
+             gpu_count=4,
+             price_per_gpu=8.0,
+             duration=3,
+         )
+     )
+     assert result.reward < 0
+
+     result = env.step(ComputeMarketAction(action_type="schedule_job", job_id="job-a"))
+     assert any(job.status == "running" for job in result.jobs)
+
+     result = env.step(ComputeMarketAction(action_type="noop"))
+     job_a = next(job for job in result.jobs if job.job_id == "job-a")
+     assert job_a.status == "completed"
+     assert result.reward > 0
+
+
+ def test_invalid_action_returns_penalty_metadata():
+     env = ComputeMarketEnvironment(default_seed=2)
+     env.reset(seed=2)
+
+     result = env.step(ComputeMarketAction(action_type="schedule_job", job_id="missing-job"))
+
+     assert result.reward < 0
+     assert "error" in result.metadata
+
+
+ def test_tight_capacity_variant_changes_capacity():
+     env = ComputeMarketEnvironment(total_gpus=8, default_seed=0)
+     result = env.reset(seed=7, scenario_variant="tight_capacity")
+
+     assert result.scenario_variant == "tight_capacity"
+     assert result.total_gpus == 6
+
+
+ def test_policy_shift_variant_removes_broker_offer():
+     env = ComputeMarketEnvironment(default_seed=0)
+     result = env.reset(seed=7, scenario_variant="policy_shift")
+
+     assert result.scenario_variant == "policy_shift"
+     assert all(offer.actor_id != "broker-1" for offer in result.visible_offers)
@@ -0,0 +1,127 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4421ed68",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Compute Market Qwen3 GRPO\n",
9
+ "\n",
10
+ "<a href=\"https://colab.research.google.com/github/kiankyars/lambdatheta/blob/main/training/Compute_Market_Qwen3_GRPO.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
11
+ "\n",
12
+ "Minimal Colab notebook for GRPO on the OpenEnv compute market environment.\n",
13
+ "Use this first for a smoke run before trying the larger gpt-oss notebook."
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "id": "cbd5e394",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "%%capture\n",
24
+ "!pip install --upgrade uv\n",
25
+ "!uv pip install unsloth vllm --torch-backend=auto\n",
26
+ "!uv pip install --upgrade --no-cache-dir --no-deps unsloth unsloth_zoo\n",
27
+ "!uv pip install transformers==4.56.2 'trl>=0.24.0' datasets openenv-core\n",
28
+ "!git clone https://github.com/kiankyars/lambdatheta.git\n",
29
+ "!pip install git+https://huggingface.co/spaces/openenv-community/compute_market_env\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "bd1f857e",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import os, sys\n",
40
+ "os.environ['OPENENV_URL'] = 'https://openenv-community-compute-market-env.hf.space'\n",
41
+ "sys.path.append('/content/lambdatheta')\n",
42
+ "MAX_STEPS = 300\n"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "id": "b93e75ad",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "from unsloth import FastLanguageModel\n",
53
+ "import torch\n",
54
+ "from transformers import AutoTokenizer\n",
55
+ "from training.compute_market_grpo import build_trainer, build_dataset\n",
56
+ "\n",
57
+ "max_seq_length = 2048 # Can increase for longer reasoning traces\n",
58
+ "lora_rank = 32 # Larger rank = smarter, but slower\n",
59
+ "\n",
60
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
61
+ " model_name = \"unsloth/Qwen3-4B-Base\",\n",
62
+ " max_seq_length = max_seq_length,\n",
63
+ " load_in_4bit = False, # False for LoRA 16bit\n",
64
+ " fast_inference = True, # Enable vllm fast inference\n",
65
+ " max_lora_rank = lora_rank,\n",
66
+ " gpu_memory_utilization = 0.9, # Reduce if out of memory\n",
67
+ ")\n",
68
+ "\n",
69
+ "model = FastLanguageModel.get_peft_model(\n",
70
+ " model,\n",
71
+ " r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
72
+ " target_modules = [\n",
73
+ " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
74
+ " \"gate_proj\", \"up_proj\", \"down_proj\",\n",
75
+ " ],\n",
76
+ " lora_alpha = lora_rank*2, # *2 speeds up training\n",
77
+ " use_gradient_checkpointing = \"unsloth\", # Reduces memory usage\n",
78
+ " random_state = 3407,\n",
79
+ ")\n",
80
+ "\n",
81
+ "tokenizer = AutoTokenizer.from_pretrained(\"unsloth/Qwen3-4B-Base\")\n",
82
+ "if tokenizer.pad_token is None:\n",
83
+ " tokenizer.pad_token = tokenizer.eos_token\n"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "dataset = build_dataset(size=64)\n",
93
+ "trainer = build_trainer(\n",
94
+ " model=model,\n",
95
+ " tokenizer=tokenizer,\n",
96
+ " env_url=os.environ['OPENENV_URL'],\n",
97
+ " train_dataset=dataset,\n",
98
+ " output_dir='outputs/compute-market-qwen3-4b',\n",
99
+ " max_steps=MAX_STEPS,\n",
100
+ ")\n",
101
+ "trainer\n"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "trainer.train()\n"
111
+ ]
112
+ }
113
+ ],
114
+ "metadata": {
115
+ "kernelspec": {
116
+ "display_name": "Python 3",
117
+ "language": "python",
118
+ "name": "python3"
119
+ },
120
+ "language_info": {
121
+ "name": "python",
122
+ "version": "3.11"
123
+ }
124
+ },
125
+ "nbformat": 4,
126
+ "nbformat_minor": 5
127
+ }
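
The notebook points `OPENENV_URL` at the hosted Space, but the helpers it imports only read that variable, so the same cells can target a locally served environment for faster debugging; a sketch, assuming the uvicorn command from the README is running on port 8000:

```python
import os

# Swap the hosted Space for a local server before building the trainer.
os.environ["OPENENV_URL"] = "http://localhost:8000"
```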
training/OpenEnv_gpt_oss_(20B)_Reinforcement_Learning_2048_Game.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
training/Qwen3_(4B)_GRPO.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
training/__init__.py ADDED
@@ -0,0 +1 @@
+ """Training helpers for the compute market environment."""
training/compute_market_grpo.py ADDED
@@ -0,0 +1,314 @@
+ """GRPO training helpers for the Compute Market environment.
+
+ Designed to be imported from a Colab notebook after installing:
+ - the environment package from the Hugging Face Space or GitHub repo
+ - TRL / Unsloth / transformers runtime deps
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ from dataclasses import dataclass
+ from typing import Any
+
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+ ACTION_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)
+
+ SYSTEM_PROMPT = """You are a compute allocator trading for scarce GPU capacity.
+ Return exactly one JSON object with a valid action.
+ Allowed action_type values: bid_for_capacity, accept_offer, propose_swap, schedule_job, delay_job, inspect_market, noop.
+ Only include fields needed by the chosen action.
+ Be conservative with budget and prioritize completing valuable jobs before their deadlines."""
+
+ DEFAULT_TASK_PROMPT = (
+     "Maximize completed job value while minimizing spend, idle-hoarding penalties, "
+     "and missed deadlines in the compute market."
+ )
+
+
+ @dataclass
+ class RolloutSummary:
+     prompt_ids: list[int]
+     completion_ids: list[int]
+     logprobs: list[float]
+     episode_return: float
+     valid_action_reward: float
+     completion_bonus: float
+     transcripts: list[dict[str, Any]]
+
+
+ def observation_to_prompt(observation: Any, task_prompt: str = DEFAULT_TASK_PROMPT) -> str:
+     jobs = [job.model_dump() for job in observation.jobs]
+     offers = [offer.model_dump() for offer in observation.visible_offers]
+     signals = [signal.model_dump() for signal in observation.actor_signals]
+     events = [event.model_dump() for event in observation.recent_events]
+     return json.dumps(
+         {
+             "task": task_prompt,
+             "scenario_variant": getattr(observation, "scenario_variant", "baseline"),
+             "tick": observation.current_tick,
+             "max_ticks": observation.max_ticks,
+             "budget_remaining": observation.budget_remaining,
+             "market_price": observation.market_price,
+             "public_free_gpus": observation.free_gpus,
+             "owned_gpus": observation.owned_gpus,
+             "idle_owned_gpus": observation.idle_owned_gpus,
+             "jobs": jobs,
+             "visible_offers": offers,
+             "actor_signals": signals,
+             "recent_events": events,
+         },
+         indent=2,
+     )
+
+
+ def parse_action(text: str) -> tuple[ComputeMarketAction, bool]:
+     match = ACTION_JSON_RE.search(text)
+     if not match:
+         return ComputeMarketAction(action_type="inspect_market"), False
+     try:
+         payload = json.loads(match.group(0))
+         return ComputeMarketAction(**payload), True
+     except Exception:
+         return ComputeMarketAction(action_type="inspect_market"), False
+
+
+ def _count_completed_jobs(observation: Any) -> int:
+     return sum(1 for job in observation.jobs if job.status == "completed")
+
+
+ def rollout_once(
+     trainer: Any,
+     env: ComputeMarketEnv,
+     tokenizer: Any,
+     dataset_prompt: str,
+     system_prompt: str = SYSTEM_PROMPT,
+     max_turns: int = 6,
+     seed: int | None = None,
+ ) -> RolloutSummary:
+     from trl.experimental.openenv import generate_rollout_completions
+
+     result = env.reset(seed=seed)
+     prompt_ids: list[int] = []
+     completion_ids: list[int] = []
+     logprobs: list[float] = []
+     transcripts: list[dict[str, Any]] = []
+     rewards: list[float] = []
+     valid_action_reward = 0.0
+     completed_before = 0
+
+     for turn in range(max_turns):
+         if result.done:
+             break
+
+         prompt_text = observation_to_prompt(result.observation, dataset_prompt)
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": prompt_text},
+         ]
+         rendered_prompt = tokenizer.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             tokenize=False,
+             enable_thinking=False,
+         )
+
+         rollout_outputs = generate_rollout_completions(trainer, [rendered_prompt])[0]
+         prompt_ids.extend(rollout_outputs["prompt_ids"])
+         completion_ids.extend(rollout_outputs["completion_ids"])
+         logprobs.extend(rollout_outputs["logprobs"])
+         completion_text = rollout_outputs.get("text") or tokenizer.decode(
+             rollout_outputs["completion_ids"],
+             skip_special_tokens=True,
+         )
+
+         action, is_valid = parse_action(completion_text)
+         result = env.step(action)
+         reward = float(result.reward or 0.0)
+         rewards.append(reward)
+         valid_action_reward += 0.25 if is_valid else -1.0
+
+         completed_after = _count_completed_jobs(result.observation)
+         completion_gain = max(0, completed_after - completed_before)
+         completed_before = completed_after
+
+         transcripts.append(
+             {
+                 "turn": turn,
+                 "prompt": prompt_text,
+                 "completion": completion_text,
+                 "action": action.model_dump(exclude_none=True),
+                 "is_valid_action": is_valid,
+                 "reward": reward,
+                 "completed_jobs": completed_after,
+                 "completion_gain": completion_gain,
+             }
+         )
+
+     completion_bonus = float(completed_before)
+     return RolloutSummary(
+         prompt_ids=prompt_ids,
+         completion_ids=completion_ids,
+         logprobs=logprobs,
+         episode_return=sum(rewards),
+         valid_action_reward=valid_action_reward,
+         completion_bonus=completion_bonus,
+         transcripts=transcripts,
+     )
+
+
+ def rollout_func(
+     prompts: list[str],
+     trainer: Any | None = None,
+     tokenizer: Any | None = None,
+     env_url: str | None = None,
+     max_turns: int = 6,
+     seed_offset: int = 0,
+ ) -> dict[str, Any]:
+     if trainer is None:
+         raise ValueError("trainer is required")
+     if tokenizer is None:
+         raise ValueError("tokenizer is required")
+
+     env_url = env_url or os.environ.get("OPENENV_URL", "http://localhost:8000")
+     episode_prompt_ids = []
+     episode_completion_ids = []
+     episode_logprobs = []
+     episode_returns = []
+     validity_rewards = []
+     completion_bonuses = []
+     transcripts = []
+
+     with ComputeMarketEnv(base_url=env_url) as env:
+         for idx, prompt_text in enumerate(prompts):
+             episode = rollout_once(
+                 trainer=trainer,
+                 env=env,
+                 tokenizer=tokenizer,
+                 dataset_prompt=prompt_text,
+                 max_turns=max_turns,
+                 seed=seed_offset + idx,
+             )
+             episode_prompt_ids.append(episode.prompt_ids)
+             episode_completion_ids.append(episode.completion_ids)
+             episode_logprobs.append(episode.logprobs)
+             episode_returns.append(episode.episode_return)
+             validity_rewards.append(episode.valid_action_reward)
+             completion_bonuses.append(episode.completion_bonus)
+             transcripts.append(episode.transcripts)
+
+     return {
+         "prompt_ids": episode_prompt_ids,
+         "completion_ids": episode_completion_ids,
+         "logprobs": episode_logprobs,
+         "env_reward": episode_returns,
+         "valid_action_reward": validity_rewards,
+         "completion_bonus": completion_bonuses,
+         "transcripts": transcripts,
+     }
+
+
+ def reward_env_return(completions: list[Any], **kwargs: Any) -> list[float]:
+     rewards = kwargs.get("env_reward") or []
+     return [float(rewards[i]) if i < len(rewards) else 0.0 for i in range(len(completions))]
+
+
+ def reward_valid_action(completions: list[Any], **kwargs: Any) -> list[float]:
+     rewards = kwargs.get("valid_action_reward") or []
+     return [float(rewards[i]) if i < len(rewards) else 0.0 for i in range(len(completions))]
+
+
+ def reward_job_completion(completions: list[Any], **kwargs: Any) -> list[float]:
+     rewards = kwargs.get("completion_bonus") or []
+     return [float(rewards[i]) if i < len(rewards) else 0.0 for i in range(len(completions))]
+
+
+ def build_dataset(size: int = 128, prompt: str = DEFAULT_TASK_PROMPT):
+     from datasets import Dataset
+
+     return Dataset.from_dict({"prompt": [prompt] * size})
+
+
+ def build_grpo_config(
+     output_dir: str = "outputs/compute-market-qwen3-4b",
+     max_steps: int = 300,
+     learning_rate: float = 5e-6,
+     num_generations: int = 2,
+     max_prompt_length: int = 1800,
+     max_completion_length: int = 192,
+     use_vllm: bool = True,
+ ):
+     from trl import GRPOConfig
+
+     kwargs: dict[str, Any] = dict(
+         learning_rate=learning_rate,
+         weight_decay=0.001,
+         warmup_ratio=0.1,
+         lr_scheduler_type="linear",
+         optim="adamw_8bit",
+         logging_steps=1,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=1,
+         num_generations=num_generations,
+         max_prompt_length=max_prompt_length,
+         max_completion_length=max_completion_length,
+         max_steps=max_steps,
+         save_steps=max_steps,
+         report_to="none",
+         output_dir=output_dir,
+     )
+     if use_vllm:
+         kwargs.update(
+             use_vllm=True,
+             vllm_mode="colocate",
+             vllm_gpu_memory_utilization=0.15,
+         )
+     return GRPOConfig(**kwargs)
+
+
+ def build_trainer(
+     model: Any,
+     tokenizer: Any,
+     env_url: str,
+     train_dataset: Any | None = None,
+     output_dir: str = "outputs/compute-market-qwen3-4b",
+     max_steps: int = 300,
+     max_turns: int = 6,
+ ):
+     from trl import GRPOTrainer
+
+     train_dataset = train_dataset or build_dataset()
+     args = build_grpo_config(output_dir=output_dir, max_steps=max_steps)
+
+     def bound_rollout_func(prompts: list[str], trainer: Any | None = None, **_: Any) -> dict[str, Any]:
+         return rollout_func(
+             prompts,
+             trainer=trainer,
+             tokenizer=tokenizer,
+             env_url=env_url,
+             max_turns=max_turns,
+         )
+
+     return GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         reward_funcs=[
+             reward_env_return,
+             reward_valid_action,
+             reward_job_completion,
+         ],
+         train_dataset=train_dataset,
+         args=args,
+         rollout_func=bound_rollout_func,
+     )
+
+
+ def build_colab_setup_snippet(space_repo_id: str = "openenv-community/compute_market_env") -> str:
+     return f"""# Colab install\n!pip install --upgrade uv\n!uv pip install unsloth vllm --torch-backend=auto\n!uv pip install --upgrade --no-cache-dir --no-deps unsloth unsloth_zoo\n!uv pip install transformers==4.56.2 'trl>=0.24.0' datasets openenv-core\n!pip install git+https://huggingface.co/spaces/{space_repo_id}\n\nimport os\nos.environ['OPENENV_URL'] = 'https://{space_repo_id.replace('/', '-').replace('_', '-')}.hf.space'\n"""
+
+
+ if __name__ == "__main__":
+     print(build_colab_setup_snippet())
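
`parse_action` is deliberately forgiving: any completion without a parseable JSON object degrades to a harmless `inspect_market`, which the rollout then scores with the `-1.0` validity penalty instead of crashing mid-episode. A small illustration with hypothetical completion strings:

```python
from training.compute_market_grpo import parse_action

good = 'Bidding now. {"action_type": "bid_for_capacity", "gpu_count": 2, "price_per_gpu": 5.0, "duration": 2}'
bad = "Let me think about the market first."

action, ok = parse_action(good)
print(action.action_type, ok)  # bid_for_capacity True

action, ok = parse_action(bad)
print(action.action_type, ok)  # inspect_market False
```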
training/eval_compute_market_benchmark.py ADDED
@@ -0,0 +1,303 @@
+ """Tiny ID/OOD benchmark for Compute Market models.
+
+ Run this in Colab after installing the environment package from the HF Space and
+ cloning this repo for the helper code.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any
+
+ from compute_market_env import ComputeMarketEnv
+
+ from training.compute_market_grpo import (
+     DEFAULT_TASK_PROMPT,
+     SYSTEM_PROMPT,
+     observation_to_prompt,
+     parse_action,
+ )
+
+ BENCHMARK_SPLITS = {
+     "id_baseline": {
+         "tag": "ID",
+         "scenario_variant": "baseline",
+         "seeds": [11, 12, 13, 14, 15],
+     },
+     "ood_tight_capacity": {
+         "tag": "OOD",
+         "scenario_variant": "tight_capacity",
+         "seeds": [101, 102, 103, 104, 105],
+     },
+     "ood_price_shock": {
+         "tag": "OOD",
+         "scenario_variant": "price_shock",
+         "seeds": [201, 202, 203, 204, 205],
+     },
+     "ood_policy_shift": {
+         "tag": "OOD",
+         "scenario_variant": "policy_shift",
+         "seeds": [301, 302, 303, 304, 305],
+     },
+     "ood_job_mix": {
+         "tag": "OOD",
+         "scenario_variant": "job_mix",
+         "seeds": [401, 402, 403, 404, 405],
+     },
+ }
+
+
+ @dataclass
+ class EpisodeStats:
+     model_label: str
+     split: str
+     tag: str
+     seed: int
+     scenario_variant: str
+     total_return: float
+     completed_jobs: int
+     missed_jobs: int
+     invalid_actions: int
+     budget_remaining: float
+     turns: int
+
+
+ def load_model_and_tokenizer(
+     model_ref: str,
+     max_seq_length: int = 4096,
+     load_in_4bit: bool = True,
+     fast_inference: bool = True,
+ ):
+     from unsloth import FastLanguageModel
+
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=model_ref,
+         max_seq_length=max_seq_length,
+         load_in_4bit=load_in_4bit,
+         fast_inference=fast_inference,
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+     model.eval()
+     return model, tokenizer
+
+
+ def generate_completion(
+     model: Any,
+     tokenizer: Any,
+     observation: Any,
+     task_prompt: str = DEFAULT_TASK_PROMPT,
+     system_prompt: str = SYSTEM_PROMPT,
+     max_new_tokens: int = 160,
+ ) -> str:
+     prompt_text = observation_to_prompt(observation, task_prompt)
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": prompt_text},
+     ]
+     rendered_prompt = tokenizer.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=False,
+         enable_thinking=False,
+     )
+     inputs = tokenizer(rendered_prompt, return_tensors="pt")
+     inputs = {key: value.to(model.device) for key, value in inputs.items()}
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         do_sample=False,
+         # greedy decoding; temperature is not passed since it is ignored when sampling is off
+         pad_token_id=tokenizer.eos_token_id,
+     )
+     completion_ids = outputs[0][inputs["input_ids"].shape[1] :]
+     return tokenizer.decode(completion_ids, skip_special_tokens=True)
+
+
+ def run_episode(
+     env: ComputeMarketEnv,
+     model: Any,
+     tokenizer: Any,
+     model_label: str,
+     split: str,
+     tag: str,
+     seed: int,
+     scenario_variant: str,
+     max_turns: int = 6,
+ ) -> EpisodeStats:
+     result = env.reset(seed=seed, scenario_variant=scenario_variant)
+     total_return = 0.0
+     invalid_actions = 0
+     turns = 0
+
+     for _ in range(max_turns):
+         if result.done:
+             break
+         completion = generate_completion(model, tokenizer, result.observation)
+         action, is_valid = parse_action(completion)
+         if not is_valid:
+             invalid_actions += 1
+         result = env.step(action)
+         total_return += float(result.reward or 0.0)
+         turns += 1
+
+     jobs = result.observation.jobs
+     completed_jobs = sum(1 for job in jobs if job.status == "completed")
+     missed_jobs = sum(1 for job in jobs if job.status == "missed")
+     return EpisodeStats(
+         model_label=model_label,
+         split=split,
+         tag=tag,
+         seed=seed,
+         scenario_variant=scenario_variant,
+         total_return=round(total_return, 2),
+         completed_jobs=completed_jobs,
+         missed_jobs=missed_jobs,
+         invalid_actions=invalid_actions,
+         budget_remaining=round(result.observation.budget_remaining, 2),
+         turns=turns,
+     )
+
+
+ def evaluate_model(
+     model: Any,
+     tokenizer: Any,
+     model_label: str,
+     env_url: str,
+     benchmark_splits: dict[str, dict[str, Any]] = BENCHMARK_SPLITS,
+     max_turns: int = 6,
+ ) -> list[EpisodeStats]:
+     stats: list[EpisodeStats] = []
+     with ComputeMarketEnv(base_url=env_url) as env:
+         for split_name, config in benchmark_splits.items():
+             for seed in config["seeds"]:
+                 stats.append(
+                     run_episode(
+                         env=env,
+                         model=model,
+                         tokenizer=tokenizer,
+                         model_label=model_label,
+                         split=split_name,
+                         tag=config["tag"],
+                         seed=seed,
+                         scenario_variant=config["scenario_variant"],
+                         max_turns=max_turns,
+                     )
+                 )
+     return stats
+
+
+ def _safe_mean(values: list[float]) -> float:
+     return round(sum(values) / max(1, len(values)), 2)
+
+
+ def summarize_results(stats: list[EpisodeStats]) -> list[dict[str, Any]]:
+     grouped: dict[tuple[str, str], list[EpisodeStats]] = {}
+     for item in stats:
+         grouped.setdefault((item.model_label, item.split), []).append(item)
+
+     rows: list[dict[str, Any]] = []
+     for (model_label, split), items in grouped.items():
+         rows.append(
+             {
+                 "model": model_label,
+                 "split": split,
+                 "tag": items[0].tag,
+                 "scenario_variant": items[0].scenario_variant,
+                 "episodes": len(items),
+                 "mean_return": _safe_mean([item.total_return for item in items]),
+                 "mean_completed_jobs": _safe_mean([item.completed_jobs for item in items]),
+                 "mean_missed_jobs": _safe_mean([item.missed_jobs for item in items]),
+                 "mean_budget_remaining": _safe_mean([item.budget_remaining for item in items]),
+                 "invalid_action_rate": _safe_mean(
+                     [item.invalid_actions / max(1, item.turns) for item in items]
+                 ),
+             }
+         )
+
+     for model_label in sorted({item.model_label for item in stats}):
+         for tag_name in ["ID", "OOD"]:
+             items = [item for item in stats if item.model_label == model_label and item.tag == tag_name]
+             if not items:
+                 continue
+             rows.append(
+                 {
+                     "model": model_label,
+                     "split": f"aggregate_{tag_name.lower()}",
+                     "tag": tag_name,
+                     "scenario_variant": "mixed",
+                     "episodes": len(items),
+                     "mean_return": _safe_mean([item.total_return for item in items]),
+                     "mean_completed_jobs": _safe_mean([item.completed_jobs for item in items]),
+                     "mean_missed_jobs": _safe_mean([item.missed_jobs for item in items]),
+                     "mean_budget_remaining": _safe_mean([item.budget_remaining for item in items]),
+                     "invalid_action_rate": _safe_mean(
+                         [item.invalid_actions / max(1, item.turns) for item in items]
+                     ),
+                 }
+             )
+     return rows
+
+
+ def render_markdown_table(rows: list[dict[str, Any]]) -> str:
+     headers = [
+         "model",
+         "split",
+         "tag",
+         "mean_return",
+         "mean_completed_jobs",
+         "mean_missed_jobs",
+         "mean_budget_remaining",
+         "invalid_action_rate",
+     ]
+     lines = [
+         "| " + " | ".join(headers) + " |",
+         "|" + "|".join(["---"] * len(headers)) + "|",
+     ]
+     for row in rows:
+         lines.append(
+             "| "
+             + " | ".join(str(row.get(header, "")) for header in headers)
+             + " |"
+         )
+     return "\n".join(lines)
+
+
+ def benchmark_two_models(
+     base_model_ref: str,
+     tuned_model_ref: str,
+     env_url: str,
+     max_turns: int = 6,
+ ) -> tuple[list[EpisodeStats], list[dict[str, Any]], str]:
+     all_stats: list[EpisodeStats] = []
+     for label, model_ref in [
+         ("Qwen3-4B-Base", base_model_ref),
+         ("compute-market-qwen3-4b", tuned_model_ref),
+     ]:
+         model, tokenizer = load_model_and_tokenizer(model_ref)
+         all_stats.extend(
+             evaluate_model(
+                 model=model,
+                 tokenizer=tokenizer,
+                 model_label=label,
+                 env_url=env_url,
+                 max_turns=max_turns,
+             )
+         )
+     rows = summarize_results(all_stats)
+     return all_stats, rows, render_markdown_table(rows)
+
+
+ if __name__ == "__main__":
+     import os
+
+     env_url = os.environ.get(
+         "OPENENV_URL",
+         "https://openenv-community-compute-market-env.hf.space",
+     )
+     _, rows, table = benchmark_two_models(
+         base_model_ref="Qwen/Qwen3-4B",
+         tuned_model_ref="outputs/compute-market-qwen3-4b",
+         env_url=env_url,
+     )
+     print(table)
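
`summarize_results` and `render_markdown_table` are pure functions over `EpisodeStats`, so the reporting path can be exercised without a GPU or a live server. A minimal sketch with two hand-built episodes (the numbers here are made up):

```python
from training.eval_compute_market_benchmark import (
    EpisodeStats,
    render_markdown_table,
    summarize_results,
)

stats = [
    EpisodeStats(
        model_label="demo", split="id_baseline", tag="ID", seed=11,
        scenario_variant="baseline", total_return=4.5, completed_jobs=2,
        missed_jobs=0, invalid_actions=1, budget_remaining=120.0, turns=6,
    ),
    EpisodeStats(
        model_label="demo", split="id_baseline", tag="ID", seed=12,
        scenario_variant="baseline", total_return=-2.0, completed_jobs=1,
        missed_jobs=1, invalid_actions=0, budget_remaining=90.0, turns=6,
    ),
]

# Prints one per-split row plus the aggregate_id row as a markdown table.
print(render_markdown_table(summarize_results(stats)))
```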
training/minimal_grpo_rollout.py ADDED
@@ -0,0 +1,81 @@
+ """Minimal TRL/OpenEnv rollout helpers for Colab."""
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ from typing import Any
+
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+ ACTION_RE = re.compile(r"\{.*\}", re.DOTALL)
+ SYSTEM_PROMPT = """You trade for scarce GPU capacity.
+ Choose exactly one JSON action per turn.
+ Valid action_type values are: bid_for_capacity, accept_offer, propose_swap, schedule_job, delay_job, inspect_market, noop.
+ Return strict JSON only."""
+
+
+ def observation_to_prompt(observation) -> str:
+     jobs = [job.model_dump() for job in observation.jobs]
+     offers = [offer.model_dump() for offer in observation.visible_offers]
+     signals = [signal.model_dump() for signal in observation.actor_signals]
+     return json.dumps(
+         {
+             "tick": observation.current_tick,
+             "budget_remaining": observation.budget_remaining,
+             "market_price": observation.market_price,
+             "free_gpus": observation.free_gpus,
+             "owned_gpus": observation.owned_gpus,
+             "idle_owned_gpus": observation.idle_owned_gpus,
+             "jobs": jobs,
+             "visible_offers": offers,
+             "actor_signals": signals,
+         },
+         indent=2,
+     )
+
+
+ def parse_action(text: str) -> ComputeMarketAction:
+     match = ACTION_RE.search(text)
+     if not match:
+         return ComputeMarketAction(action_type="inspect_market")
+     try:
+         payload = json.loads(match.group(0))
+         return ComputeMarketAction(**payload)
+     except Exception:
+         return ComputeMarketAction(action_type="inspect_market")
+
+
+ def rollout_once(generate_completion, seed: int = 0, max_turns: int = 6) -> dict[str, Any]:
+     env_url = os.environ.get("OPENENV_URL", "http://localhost:8000")
+     rewards = []
+     actions = []
+     transcripts = []
+
+     with ComputeMarketEnv(base_url=env_url) as env:
+         result = env.reset(seed=seed)
+         for _ in range(max_turns):
+             if result.done:
+                 break
+             prompt = observation_to_prompt(result.observation)
+             completion = generate_completion(SYSTEM_PROMPT, prompt)
+             action = parse_action(completion)
+             result = env.step(action)
+             actions.append(action.model_dump(exclude_none=True))
+             rewards.append(float(result.reward or 0.0))
+             transcripts.append(
+                 {
+                     "prompt": prompt,
+                     "completion": completion,
+                     "action": action.model_dump(exclude_none=True),
+                     "reward": result.reward,
+                 }
+             )
+
+     return {
+         "actions": actions,
+         "rewards": rewards,
+         "return": sum(rewards),
+         "transcripts": transcripts,
+     }
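
`rollout_once` only needs a `generate_completion(system_prompt, prompt) -> str` callable, so the loop can be smoke-tested without any model at all; a sketch using a fixed no-op policy against whatever server `OPENENV_URL` points at (default http://localhost:8000):

```python
from training.minimal_grpo_rollout import rollout_once

def always_noop(system_prompt: str, prompt: str) -> str:
    # Stand-in for the LLM: always emit one valid JSON action.
    return '{"action_type": "noop"}'

summary = rollout_once(always_noop, seed=0, max_turns=3)
print(summary["return"], summary["actions"])
```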
training/sample_benchmark_results.md ADDED
@@ -0,0 +1,4 @@
+ | model | split | tag | mean_return | mean_completed_jobs | mean_missed_jobs | mean_budget_remaining | invalid_action_rate |
+ | ----------------------- | ------------- | --- | ----------- | ------------------- | ---------------- | --------------------- | ------------------- |
+ | Qwen3-4B-Base | aggregate_ood | OOD | -61.7 | 0.5 | 1.9 | 66.8 | 0.44 |
+ | compute-market-qwen3-4b | aggregate_ood | OOD | -18.4 | 1.1 | 1.2 | 41.7 | 0.16 |
uv.lock ADDED
The diff for this file is too large to render. See raw diff