kyars committed
Commit 81f75fe · verified · 1 Parent(s): 532abe9

Upload folder using huggingface_hub
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends git && \
+     rm -rf /var/lib/apt/lists/*
+
+ COPY . /app/env
+ WORKDIR /app/env
+
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv && \
+         mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv sync --no-install-project --no-editable
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv sync --no-editable
+
+ FROM ${BASE_IMAGE}
+ WORKDIR /app
+ COPY --from=builder /app/env/.venv /app/.venv
+ COPY --from=builder /app/env /app/env
+ ENV PATH="/app/.venv/bin:$PATH"
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:8000/health || exit 1
+ ENV ENABLE_WEB_INTERFACE=true
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,75 @@
  ---
- title: Compute Market Env
- emoji: 👀
- colorFrom: gray
- colorTo: pink
+ title: Compute Market Environment Server
+ emoji: 📊
+ colorFrom: blue
+ colorTo: green
  sdk: docker
  pinned: false
+ app_port: 8000
+ base_path: /web
+ tags:
+   - openenv
+   - multi-agent
+   - compute-allocation
+   - market-simulation
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Compute Market Environment
+
+ An OpenEnv environment for training a single allocator/trader in a scarce-GPU market with scripted background actors, hidden incentives, delayed rewards, and partial observability.
+
+ ## What v1 implements
+
+ - One trained agent: the allocator/trader
+ - Scripted counterparties: an urgent tenant, a cost-sensitive tenant, and a broker
+ - Jobs with deadlines, value, dependencies, and delayed payoff
+ - Actions: `bid_for_capacity`, `accept_offer`, `propose_swap`, `schedule_job`, `delay_job`, `inspect_market`, `noop`
+ - Reward = completed job value minus compute spend, missed-deadline penalties, and idle-hoarding penalties
+ - A separate training helper for TRL/Colab in `training/minimal_grpo_rollout.py`
+
+ ## Quick Start
+
+ ```python
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+ with ComputeMarketEnv(base_url="http://localhost:8000") as env:
+     result = env.reset(seed=7)
+     print(result.observation.market_price)
+     print(result.observation.free_gpus)
+
+     result = env.step(
+         ComputeMarketAction(
+             action_type="bid_for_capacity",
+             gpu_count=4,
+             price_per_gpu=6.5,
+             duration=3,
+         )
+     )
+     print(result.reward)
+     print(result.observation.budget_remaining)
+ ```
+
+ ## Local Development
+
+ ```bash
+ uv sync --extra dev
+ uv run pytest -q
+ uv run uvicorn server.app:app --host 0.0.0.0 --port 8000
+ openenv validate --verbose
+ ```
+
+ ## Docker
+
+ ```bash
+ docker build -t compute-market-env:latest -f Dockerfile .
+ docker run -p 8000:8000 compute-market-env:latest
+ ```
+
+ ## Environment Loop
+
+ 1. The agent observes the market price, public free GPUs, visible offers, jobs, and public actor signals.
+ 2. The agent takes one action.
+ 3. The environment advances one tick.
+ 4. Scripted actors update demand and offers.
+ 5. Jobs progress, complete, pause, or miss deadlines.
+ 6. The environment returns the next observation and the realized reward for that tick (a rollout sketch follows this diff).
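The six-step loop above maps one-to-one onto the client API from the Quick Start. A minimal rollout sketch, assuming a server is reachable at `http://localhost:8000`; the greedy bid/schedule rule is illustrative only, not part of the environment:

```python
from compute_market_env import ComputeMarketAction, ComputeMarketEnv

with ComputeMarketEnv(base_url="http://localhost:8000") as env:
    result = env.reset(seed=0)
    total_reward = 0.0

    while not result.done:
        obs = result.observation
        # Illustrative rule: start any pending job that fits in idle owned
        # GPUs; otherwise bid slightly above the clearing price for the next
        # pending job; otherwise do nothing and let the tick advance.
        pending = [job for job in obs.jobs if job.status == "pending"]
        startable = [job for job in pending if job.gpu_count <= obs.idle_owned_gpus]
        if startable:
            action = ComputeMarketAction(action_type="schedule_job", job_id=startable[0].job_id)
        elif pending and obs.free_gpus >= pending[0].gpu_count and obs.budget_remaining > 0:
            action = ComputeMarketAction(
                action_type="bid_for_capacity",
                gpu_count=pending[0].gpu_count,
                price_per_gpu=obs.market_price + 0.5,
                duration=pending[0].total_duration,
            )
        else:
            action = ComputeMarketAction(action_type="noop")
        result = env.step(action)
        total_reward += result.reward

    print("episode return:", round(total_reward, 2))
```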
__init__.py ADDED
@@ -0,0 +1,27 @@
+ """Compute Market environment exports."""
+
+ from .client import ComputeMarketEnv
+ from .models import (
+     ActorProfile,
+     ActorSignal,
+     ComputeMarketAction,
+     ComputeMarketObservation,
+     ComputeMarketState,
+     JobRecord,
+     MarketEvent,
+     MarketOffer,
+     ReservationRecord,
+ )
+
+ __all__ = [
+     "ActorProfile",
+     "ActorSignal",
+     "ComputeMarketAction",
+     "ComputeMarketEnv",
+     "ComputeMarketObservation",
+     "ComputeMarketState",
+     "JobRecord",
+     "MarketEvent",
+     "MarketOffer",
+     "ReservationRecord",
+ ]
client.py ADDED
@@ -0,0 +1,80 @@
+ """Compute Market environment client."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from openenv.core.client_types import StepResult
+ from openenv.core.env_client import EnvClient
+
+ from .models import (
+     ActorProfile,
+     ActorSignal,
+     ComputeMarketAction,
+     ComputeMarketObservation,
+     ComputeMarketState,
+     JobRecord,
+     MarketEvent,
+     MarketOffer,
+     ReservationRecord,
+ )
+
+
+ class ComputeMarketEnv(
+     EnvClient[ComputeMarketAction, ComputeMarketObservation, ComputeMarketState]
+ ):
+     """Persistent client for the compute market environment."""
+
+     def _step_payload(self, action: ComputeMarketAction) -> dict[str, Any]:
+         return action.model_dump(exclude_none=True)
+
+     def _parse_result(self, payload: dict[str, Any]) -> StepResult[ComputeMarketObservation]:
+         obs_data = payload.get("observation", {})
+         observation = ComputeMarketObservation(
+             scenario_variant=obs_data.get("scenario_variant", "baseline"),
+             current_tick=obs_data.get("current_tick", 0),
+             max_ticks=obs_data.get("max_ticks", 0),
+             total_gpus=obs_data.get("total_gpus", 0),
+             free_gpus=obs_data.get("free_gpus", 0),
+             owned_gpus=obs_data.get("owned_gpus", 0),
+             idle_owned_gpus=obs_data.get("idle_owned_gpus", 0),
+             budget_remaining=obs_data.get("budget_remaining", 0.0),
+             market_price=obs_data.get("market_price", 0.0),
+             jobs=[JobRecord(**item) for item in obs_data.get("jobs", [])],
+             visible_offers=[MarketOffer(**item) for item in obs_data.get("visible_offers", [])],
+             recent_events=[MarketEvent(**item) for item in obs_data.get("recent_events", [])],
+             actor_signals=[ActorSignal(**item) for item in obs_data.get("actor_signals", [])],
+             done=payload.get("done", False),
+             reward=payload.get("reward", 0.0),
+             metadata=obs_data.get("metadata", {}),
+         )
+         return StepResult(
+             observation=observation,
+             reward=payload.get("reward", 0.0),
+             done=payload.get("done", False),
+         )
+
+     def _parse_state(self, payload: dict[str, Any]) -> ComputeMarketState:
+         return ComputeMarketState(
+             episode_id=payload.get("episode_id", ""),
+             step_count=payload.get("step_count", 0),
+             scenario_seed=payload.get("scenario_seed", 0),
+             scenario_variant=payload.get("scenario_variant", "baseline"),
+             current_tick=payload.get("current_tick", 0),
+             max_ticks=payload.get("max_ticks", 0),
+             total_gpus=payload.get("total_gpus", 0),
+             free_gpus=payload.get("free_gpus", 0),
+             owned_gpus=payload.get("owned_gpus", 0),
+             idle_owned_gpus=payload.get("idle_owned_gpus", 0),
+             budget_remaining=payload.get("budget_remaining", 0.0),
+             market_price=payload.get("market_price", 0.0),
+             cumulative_reward=payload.get("cumulative_reward", 0.0),
+             external_allocated_gpus=payload.get("external_allocated_gpus", 0),
+             done=payload.get("done", False),
+             jobs=[JobRecord(**item) for item in payload.get("jobs", [])],
+             visible_offers=[MarketOffer(**item) for item in payload.get("visible_offers", [])],
+             reservations=[ReservationRecord(**item) for item in payload.get("reservations", [])],
+             actor_signals=[ActorSignal(**item) for item in payload.get("actor_signals", [])],
+             hidden_actors=[ActorProfile(**item) for item in payload.get("hidden_actors", [])],
+             recent_events=[MarketEvent(**item) for item in payload.get("recent_events", [])],
+         )
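Every field in `_parse_result` and `_parse_state` is read with `.get(...)` and a default, so a sparse server payload still parses. The same tolerance is visible on the models themselves; a small sketch, assuming the `Observation` base class, like these models, requires no constructor arguments:

```python
from compute_market_env import ComputeMarketObservation, JobRecord

# Defaults mirror the .get(...) fallbacks in the client parser above.
obs = ComputeMarketObservation()
print(obs.scenario_variant, obs.market_price, obs.jobs)  # baseline 0.0 []

# Nested records still validate normally when present.
obs = ComputeMarketObservation(
    jobs=[
        JobRecord(
            job_id="job-a",
            gpu_count=2,
            total_duration=2,
            remaining_duration=2,
            deadline=4,
            value=50.0,
        )
    ]
)
print(obs.jobs[0].status)  # pending
```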
examples/random_policy.py ADDED
@@ -0,0 +1,25 @@
+ """Tiny local smoke example for the compute market environment."""
+
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+
+ with ComputeMarketEnv(base_url="http://localhost:8000") as env:
+     result = env.reset(seed=5)
+     print("reset", result.observation.market_price, result.observation.free_gpus)
+
+     result = env.step(
+         ComputeMarketAction(
+             action_type="bid_for_capacity",
+             gpu_count=min(4, max(1, result.observation.free_gpus)),
+             price_per_gpu=max(6.0, result.observation.market_price + 0.5),
+             duration=3,
+         )
+     )
+     print("bid", result.reward)
+
+     result = env.step(ComputeMarketAction(action_type="schedule_job", job_id="job-a"))
+     print("schedule", result.reward)
+
+     while not result.done:
+         result = env.step(ComputeMarketAction(action_type="noop"))
+         print("tick", result.observation.current_tick, result.reward)
models.py ADDED
@@ -0,0 +1,138 @@
+ """Data models for the Compute Market environment."""
+
+ from __future__ import annotations
+
+ from typing import Literal
+
+ from openenv.core.env_server.types import Action, Observation, State
+ from pydantic import BaseModel, Field
+
+
+ class JobRecord(BaseModel):
+     """Represents an agent-owned job in the simulated compute market."""
+
+     job_id: str
+     gpu_count: int = Field(..., ge=1)
+     total_duration: int = Field(..., ge=1)
+     remaining_duration: int = Field(..., ge=0)
+     deadline: int = Field(..., ge=1)
+     value: float = Field(..., ge=0.0)
+     priority: int = Field(default=1, ge=1)
+     depends_on: list[str] = Field(default_factory=list)
+     status: Literal["pending", "running", "paused", "completed", "missed"] = "pending"
+     delay_count: int = Field(default=0, ge=0)
+     started_at: int | None = None
+     completed_at: int | None = None
+
+
+ class MarketOffer(BaseModel):
+     """Visible capacity offer from a scripted actor."""
+
+     offer_id: str
+     actor_id: str
+     gpu_count: int = Field(..., ge=1)
+     price_per_gpu: float = Field(..., ge=0.0)
+     duration: int = Field(..., ge=1)
+     expires_at_tick: int = Field(..., ge=0)
+     offer_type: Literal["broker", "swap"] = "broker"
+
+
+ class ReservationRecord(BaseModel):
+     """Capacity currently owned by the agent."""
+
+     reservation_id: str
+     source: str
+     gpu_count: int = Field(..., ge=1)
+     remaining_ticks: int = Field(..., ge=0)
+     price_per_gpu: float = Field(..., ge=0.0)
+     acquired_at_tick: int = Field(..., ge=0)
+
+
+ class MarketEvent(BaseModel):
+     """Human-readable event surfaced in observations."""
+
+     tick: int = Field(..., ge=0)
+     event_type: str
+     message: str
+
+
+ class ActorSignal(BaseModel):
+     """Public signal about a scripted actor."""
+
+     actor_id: str
+     visible_behavior: Literal["aggressive", "steady", "opportunistic"]
+     pressure_hint: Literal["low", "medium", "high"]
+     last_seen_bid: float = Field(..., ge=0.0)
+
+
+ class ActorProfile(BaseModel):
+     """Hidden actor configuration kept in the control-plane state."""
+
+     actor_id: str
+     policy_type: Literal["urgent_tenant", "cost_sensitive_tenant", "broker"]
+     max_bid: float = Field(..., ge=0.0)
+     preferred_gpu_count: int = Field(..., ge=1)
+     visible_behavior: Literal["aggressive", "steady", "opportunistic"]
+     swap_floor: float = Field(..., ge=0.0)
+
+
+ class ComputeMarketAction(Action):
+     """Single-step action for the compute market."""
+
+     action_type: Literal[
+         "bid_for_capacity",
+         "accept_offer",
+         "propose_swap",
+         "schedule_job",
+         "delay_job",
+         "inspect_market",
+         "noop",
+     ]
+     job_id: str | None = Field(default=None)
+     offer_id: str | None = Field(default=None)
+     actor_id: str | None = Field(default=None)
+     gpu_count: int = Field(default=0, ge=0)
+     price_per_gpu: float = Field(default=0.0, ge=0.0)
+     duration: int = Field(default=1, ge=1)
+
+
+ class ComputeMarketObservation(Observation):
+     """Partial observation exposed to the trained agent."""
+
+     scenario_variant: str = "baseline"
+     current_tick: int = Field(default=0, ge=0)
+     max_ticks: int = Field(default=0, ge=0)
+     total_gpus: int = Field(default=0, ge=0)
+     free_gpus: int = Field(default=0, ge=0)
+     owned_gpus: int = Field(default=0, ge=0)
+     idle_owned_gpus: int = Field(default=0, ge=0)
+     budget_remaining: float = Field(default=0.0)
+     market_price: float = Field(default=0.0, ge=0.0)
+     jobs: list[JobRecord] = Field(default_factory=list)
+     visible_offers: list[MarketOffer] = Field(default_factory=list)
+     recent_events: list[MarketEvent] = Field(default_factory=list)
+     actor_signals: list[ActorSignal] = Field(default_factory=list)
+
+
+ class ComputeMarketState(State):
+     """Full control-plane state including hidden actor data."""
+
+     scenario_seed: int = 0
+     scenario_variant: str = "baseline"
+     current_tick: int = 0
+     max_ticks: int = 0
+     total_gpus: int = 0
+     free_gpus: int = 0
+     owned_gpus: int = 0
+     idle_owned_gpus: int = 0
+     budget_remaining: float = 0.0
+     market_price: float = 0.0
+     cumulative_reward: float = 0.0
+     external_allocated_gpus: int = 0
+     done: bool = False
+     jobs: list[JobRecord] = Field(default_factory=list)
+     visible_offers: list[MarketOffer] = Field(default_factory=list)
+     reservations: list[ReservationRecord] = Field(default_factory=list)
+     actor_signals: list[ActorSignal] = Field(default_factory=list)
+     hidden_actors: list[ActorProfile] = Field(default_factory=list)
+     recent_events: list[MarketEvent] = Field(default_factory=list)
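The `Field(..., ge=...)` constraints and `Literal` action types reject malformed input client-side, before anything reaches the server. A sketch of the standard Pydantic v2 behavior these models inherit:

```python
from pydantic import ValidationError

from compute_market_env import ComputeMarketAction

# Only action_type is required; optional identifiers default to None and are
# dropped from the wire payload by the client's model_dump(exclude_none=True).
action = ComputeMarketAction(action_type="noop")
print(action.model_dump(exclude_none=True))

# gpu_count is constrained to ge=0, and action_type must be one of the Literals.
try:
    ComputeMarketAction(action_type="bid_for_capacity", gpu_count=-1)
except ValidationError as exc:
    print(exc.error_count(), "validation error(s)")
```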
openenv.yaml ADDED
@@ -0,0 +1,6 @@
+ spec_version: 1
+ name: compute_market_env
+ type: space
+ runtime: fastapi
+ app: server.app:app
+ port: 8000
output.md ADDED
@@ -0,0 +1,64 @@
+ ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
+    \\   /|    Num examples = 64 | Num Epochs = 1 | Total steps = 20
+ O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
+ \        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
+  "-____-"     Trainable parameters = 66,060,288 of 4,088,528,384 (1.62% trained)
+ WARNING 03-08 19:58:35 [input_processor.py:168] vLLM has deprecated support for supporting different tokenizers for different LoRAs. By default, vLLM uses base model's tokenizer. If you are using a LoRA with its own tokenizer, consider specifying `--tokenizer [lora_path]` to use the LoRA tokenizer.
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ Unsloth: Will smartly offload gradients to save VRAM!
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ [20/20 05:11, Epoch 0/1]
+ Step | Training Loss | reward | reward_std | completions/mean_length | completions/min_length | completions/max_length | completions/clipped_ratio | completions/mean_terminated_length | completions/min_terminated_length | completions/max_terminated_length | sampling/sampling_logp_difference/mean | sampling/sampling_logp_difference/max | sampling/importance_sampling_ratio/min | sampling/importance_sampling_ratio/mean | sampling/importance_sampling_ratio/max | kl | rewards/reward_env_return/mean | rewards/reward_env_return/std | rewards/reward_valid_action/mean | rewards/reward_valid_action/std | rewards/reward_job_completion/mean | rewards/reward_job_completion/std
+ 1 | 0.000000 | 0.000000 | 0.000000 | 142.000000 | 92.000000 | 192.000000 | 0.500000 | 92.000000 | 92.000000 | 92.000000 | 0 | 0 | 0 | 0 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 2 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 3 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 4 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 5 | -0.000000 | 0.000000 | 0.000000 | 120.500000 | 49.000000 | 192.000000 | 0.500000 | 49.000000 | 49.000000 | 49.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 6 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 7 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 8 | -0.000000 | 0.000000 | 0.000000 | 160.000000 | 128.000000 | 192.000000 | 0.500000 | 128.000000 | 128.000000 | 128.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 9 | 0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 10 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 11 | 0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 12 | 0.000000 | 0.000000 | 0.000000 | 191.000000 | 190.000000 | 192.000000 | 0.500000 | 190.000000 | 190.000000 | 190.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 13 | 0.000000 | 0.000000 | 0.000000 | 131.500000 | 71.000000 | 192.000000 | 0.500000 | 71.000000 | 71.000000 | 71.000000 | No Log | No Log | No Log | No Log | No Log | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 14 | -0.000000 | 0.000000 | 0.000000 | 190.000000 | 188.000000 | 192.000000 | 0.500000 | 188.000000 | 188.000000 | 188.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 15 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 16 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 17 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 18 | -0.000000 | 0.000000 | 0.000000 | 122.000000 | 52.000000 | 192.000000 | 0.500000 | 52.000000 | 52.000000 | 52.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 19 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ 20 | -0.000000 | 0.000000 | 0.000000 | 192.000000 | 192.000000 | 192.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | No Log | No Log | No Log | No Log | No Log | -0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ /usr/local/lib/python3.12/dist-packages/vllm/v1/attention/backends/flashinfer.py:908: DeprecationWarning:
+ Prefer using device seq_lens directly to avoid implicit H<>D sync.
+ If a CPU copy is needed, use `seq_lens.cpu()` instead.
+ Will be removed in a future release, please migrate as soon as possible.
+
+   seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None
+ /usr/local/lib/python3.12/dist-packages/jupyter_client/session.py:203: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).
+   return datetime.utcnow().replace(tzinfo=utc)
+ TrainOutput(global_step=20, training_loss=-2.682549142670951e-13, metrics={'train_runtime': 381.8669, 'train_samples_per_second': 0.105, 'train_steps_per_second': 0.052, 'total_flos': 0.0, 'train_loss': -2.682549142670951e-13})
pyproject.toml ADDED
@@ -0,0 +1,36 @@
+ [build-system]
+ requires = ["setuptools>=69", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-compute-market-env"
+ version = "0.1.0"
+ description = "Compute allocation market environment for OpenEnv"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "openenv-core[core]>=0.2.1",
+     "pydantic>=2.7.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "openenv-core[cli]>=0.2.1",
+     "pytest>=8.2.0",
+     "ruff>=0.7.0",
+ ]
+ train = [
+     "trl>=0.24.0",
+     "transformers>=4.56.0",
+ ]
+
+ [project.scripts]
+ server = "compute_market_env.server.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = ["compute_market_env", "compute_market_env.server"]
+ package-dir = {"compute_market_env" = ".", "compute_market_env.server" = "server"}
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
server/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Server exports for the Compute Market environment."""
+
+ from .compute_market_environment import ComputeMarketEnvironment
+
+ __all__ = ["ComputeMarketEnvironment"]
server/app.py ADDED
@@ -0,0 +1,32 @@
+ """FastAPI app for the Compute Market environment."""
+
+ try:
+     from openenv.core.env_server import create_app
+ except ImportError:
+     from openenv.core.env_server.http_server import create_app
+
+ try:
+     from ..models import ComputeMarketAction, ComputeMarketObservation
+     from .compute_market_environment import create_environment_from_env
+ except ImportError:
+     from models import ComputeMarketAction, ComputeMarketObservation
+     from server.compute_market_environment import create_environment_from_env
+
+
+ app = create_app(
+     create_environment_from_env,
+     ComputeMarketAction,
+     ComputeMarketObservation,
+     env_name="compute_market_env",
+     max_concurrent_envs=8,
+ )
+
+
+ def main(host: str = "0.0.0.0", port: int = 8000) -> None:
+     import uvicorn
+
+     uvicorn.run(app, host=host, port=port)
+
+
+ if __name__ == "__main__":
+     main()
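Once the app is running (via `main()`, uvicorn, or the Docker image), the `/health` endpoint that the Dockerfile's `HEALTHCHECK` polls can be smoke-tested by hand. A sketch, assuming the `requests` library is installed and a local server is on port 8000:

```python
import requests

# Same endpoint the Dockerfile HEALTHCHECK polls with curl.
resp = requests.get("http://localhost:8000/health", timeout=3)
resp.raise_for_status()
print("server healthy:", resp.status_code)
```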
server/compute_market_environment.py ADDED
@@ -0,0 +1,725 @@
+ """Compute Market environment implementation."""
+
+ from __future__ import annotations
+
+ import os
+ import random
+ from uuid import uuid4
+
+ try:
+     from openenv.core.env_server.interfaces import Environment
+ except ImportError:
+     from openenv.core.env_server import Environment
+
+ try:
+     from ..models import (
+         ActorProfile,
+         ActorSignal,
+         ComputeMarketAction,
+         ComputeMarketObservation,
+         ComputeMarketState,
+         JobRecord,
+         MarketEvent,
+         MarketOffer,
+         ReservationRecord,
+     )
+ except ImportError:
+     from models import (
+         ActorProfile,
+         ActorSignal,
+         ComputeMarketAction,
+         ComputeMarketObservation,
+         ComputeMarketState,
+         JobRecord,
+         MarketEvent,
+         MarketOffer,
+         ReservationRecord,
+     )
+
+
+ class ComputeMarketEnvironment(
+     Environment[ComputeMarketAction, ComputeMarketObservation, ComputeMarketState]
+ ):
+     """Single-agent compute allocation market with scripted counterparties."""
+
+     SUPPORTS_CONCURRENT_SESSIONS = True
+
+     def __init__(
+         self,
+         total_gpus: int = 8,
+         initial_budget: float = 150.0,
+         max_ticks: int = 12,
+         default_seed: int = 0,
+     ) -> None:
+         self.base_total_gpus = total_gpus
+         self.base_initial_budget = initial_budget
+         self.base_max_ticks = max_ticks
+         self.total_gpus = total_gpus
+         self.initial_budget = initial_budget
+         self.max_ticks = max_ticks
+         self.default_seed = default_seed
+         self._scenario_variant = "baseline"
+         self._price_bias = 0.0
+         self._broker_enabled = True
+         self._state = ComputeMarketState(
+             episode_id=str(uuid4()),
+             step_count=0,
+             scenario_seed=default_seed,
+             scenario_variant="baseline",
+             max_ticks=max_ticks,
+             total_gpus=total_gpus,
+             budget_remaining=initial_budget,
+         )
+         self._rng = random.Random(default_seed)
+         self._jobs: list[JobRecord] = []
+         self._reservations: list[ReservationRecord] = []
+         self._visible_offers: list[MarketOffer] = []
+         self._hidden_actors: list[ActorProfile] = []
+         self._actor_signals: list[ActorSignal] = []
+         self._recent_events: list[MarketEvent] = []
+         self._current_tick = 0
+         self._market_price = 0.0
+         self._free_gpus = total_gpus
+         self._external_allocated_gpus = 0
+         self._budget_remaining = initial_budget
+         self._cumulative_reward = 0.0
+         self._done = False
+
+     def reset(
+         self,
+         seed: int | None = None,
+         episode_id: str | None = None,
+         **kwargs,
+     ) -> ComputeMarketObservation:
+         scenario_seed = self.default_seed if seed is None else seed
+         self._apply_scenario_variant(kwargs.get("scenario_variant", "baseline"))
+         self._rng = random.Random(scenario_seed)
+         self._current_tick = 0
+         self._done = False
+         self._budget_remaining = float(self.initial_budget)
+         self._cumulative_reward = 0.0
+         self._jobs = self._build_jobs()
+         self._reservations = []
+         self._hidden_actors = self._build_actors()
+         self._visible_offers = []
+         self._actor_signals = []
+         self._recent_events = [
+             MarketEvent(
+                 tick=0,
+                 event_type="reset",
+                 message=f"Scenario '{self._scenario_variant}' initialized with scripted counterparties.",
+             )
+         ]
+         self._refresh_market()
+         self._state = self._snapshot_state(
+             episode_id=episode_id or str(uuid4()),
+             step_count=0,
+             scenario_seed=scenario_seed,
+         )
+         return self._build_observation(
+             0.0,
+             False,
+             {"status": "ready", "scenario_variant": self._scenario_variant},
+         )
+
+     def step(self, action: ComputeMarketAction) -> ComputeMarketObservation:  # type: ignore[override]
+         if self._done:
+             return self._build_observation(
+                 0.0,
+                 True,
+                 {"error": "Episode already finished."},
+             )
+
+         self._state.step_count += 1
+         reward = 0.0
+         action_events: list[MarketEvent] = []
+         error: str | None = None
+
+         if action.action_type == "bid_for_capacity":
+             reward, error, action_events = self._handle_bid(action)
+         elif action.action_type == "accept_offer":
+             reward, error, action_events = self._handle_accept_offer(action)
+         elif action.action_type == "propose_swap":
+             reward, error, action_events = self._handle_swap(action)
+         elif action.action_type == "schedule_job":
+             reward, error, action_events = self._handle_schedule(action)
+         elif action.action_type == "delay_job":
+             reward, error, action_events = self._handle_delay(action)
+         elif action.action_type == "inspect_market":
+             action_events = [
+                 self._event(
+                     "inspect",
+                     f"Market inspected: spot price ${self._market_price:.2f}, free GPUs {self._free_gpus}.",
+                 )
+             ]
+             reward -= 0.25
+         elif action.action_type == "noop":
+             action_events = [self._event("noop", "No action taken this tick.")]
+         else:
+             error = f"Unsupported action type: {action.action_type}"
+             reward -= 2.0
+
+         advance_reward, advance_events = self._advance_tick()
+         total_reward = round(reward + advance_reward, 2)
+         combined_events = action_events + advance_events
+         self._recent_events = combined_events[-6:]
+         self._cumulative_reward = round(self._cumulative_reward + total_reward, 2)
+         self._state = self._snapshot_state(
+             episode_id=self._state.episode_id,
+             step_count=self._state.step_count,
+             scenario_seed=self._state.scenario_seed,
+         )
+         metadata = {
+             "events": [event.model_dump() for event in combined_events],
+             "scenario_variant": self._scenario_variant,
+         }
+         if error:
+             metadata["error"] = error
+         return self._build_observation(total_reward, self._done, metadata)
+
+     @property
+     def state(self) -> ComputeMarketState:
+         return self._state
+
+     def _apply_scenario_variant(self, variant: str) -> None:
+         allowed = {
+             "baseline",
+             "tight_capacity",
+             "price_shock",
+             "policy_shift",
+             "job_mix",
+         }
+         self._scenario_variant = variant if variant in allowed else "baseline"
+         self.total_gpus = self.base_total_gpus
+         self.initial_budget = self.base_initial_budget
+         self.max_ticks = self.base_max_ticks
+         self._price_bias = 0.0
+         self._broker_enabled = True
+
+         if self._scenario_variant == "tight_capacity":
+             self.total_gpus = max(4, self.base_total_gpus - 2)
+         elif self._scenario_variant == "price_shock":
+             self._price_bias = 2.25
+         elif self._scenario_variant == "policy_shift":
+             self._broker_enabled = False
+         elif self._scenario_variant == "job_mix":
+             self.max_ticks = self.base_max_ticks + 1
+
+     def _build_jobs(self) -> list[JobRecord]:
+         jitter = self._rng.randint(-4, 4)
+         if self._scenario_variant == "job_mix":
+             return [
+                 JobRecord(
+                     job_id="job-a",
+                     gpu_count=3,
+                     total_duration=3,
+                     remaining_duration=3,
+                     deadline=6,
+                     value=82 + jitter,
+                     priority=2,
+                 ),
+                 JobRecord(
+                     job_id="job-b",
+                     gpu_count=2,
+                     total_duration=1,
+                     remaining_duration=1,
+                     deadline=4,
+                     value=36 + self._rng.randint(-2, 2),
+                     priority=3,
+                 ),
+                 JobRecord(
+                     job_id="job-c",
+                     gpu_count=3,
+                     total_duration=2,
+                     remaining_duration=2,
+                     deadline=8,
+                     value=58 + self._rng.randint(-3, 3),
+                     priority=2,
+                     depends_on=["job-b"],
+                 ),
+             ]
+
+         return [
+             JobRecord(
+                 job_id="job-a",
+                 gpu_count=4,
+                 total_duration=2,
+                 remaining_duration=2,
+                 deadline=4,
+                 value=100 + jitter,
+                 priority=3,
+             ),
+             JobRecord(
+                 job_id="job-b",
+                 gpu_count=2,
+                 total_duration=2,
+                 remaining_duration=2,
+                 deadline=7,
+                 value=46 + self._rng.randint(-3, 3),
+                 priority=2,
+             ),
+             JobRecord(
+                 job_id="job-c",
+                 gpu_count=1,
+                 total_duration=1,
+                 remaining_duration=1,
+                 deadline=8,
+                 value=24 + self._rng.randint(-2, 2),
+                 priority=1,
+                 depends_on=["job-a"],
+             ),
+         ]
+
+     def _build_actors(self) -> list[ActorProfile]:
+         actors = [
+             ActorProfile(
+                 actor_id="urgent-tenant",
+                 policy_type="urgent_tenant",
+                 max_bid=round(7.0 + self._rng.uniform(0.5, 1.5), 2),
+                 preferred_gpu_count=4 + self._rng.randint(0, 2),
+                 visible_behavior="aggressive",
+                 swap_floor=round(6.0 + self._rng.uniform(0.2, 0.8), 2),
+             ),
+             ActorProfile(
+                 actor_id="budget-tenant",
+                 policy_type="cost_sensitive_tenant",
+                 max_bid=round(4.5 + self._rng.uniform(0.2, 1.0), 2),
+                 preferred_gpu_count=2 + self._rng.randint(0, 1),
+                 visible_behavior="steady",
+                 swap_floor=round(4.0 + self._rng.uniform(0.2, 0.8), 2),
+             ),
+             ActorProfile(
+                 actor_id="broker-1",
+                 policy_type="broker",
+                 max_bid=round(6.0 + self._rng.uniform(0.2, 1.2), 2),
+                 preferred_gpu_count=3 + self._rng.randint(0, 2),
+                 visible_behavior="opportunistic",
+                 swap_floor=round(5.0 + self._rng.uniform(0.2, 0.8), 2),
+             ),
+         ]
+
+         if self._scenario_variant == "policy_shift":
+             actors[0].max_bid = round(max(3.5, actors[0].max_bid - 2.0), 2)
+             actors[0].visible_behavior = "steady"
+             actors[1].max_bid = round(actors[1].max_bid + 1.4, 2)
+             actors[1].preferred_gpu_count += 1
+             actors[1].visible_behavior = "aggressive"
+
+         if not self._broker_enabled:
+             actors = [actor for actor in actors if actor.policy_type != "broker"]
+         return actors
+
+     def _handle_bid(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if action.gpu_count <= 0:
+             return -2.0, "gpu_count must be positive.", []
+         if action.price_per_gpu <= 0:
+             return -2.0, "price_per_gpu must be positive.", []
+         if action.gpu_count > self._free_gpus:
+             return -2.0, f"Only {self._free_gpus} public GPUs are available this tick.", []
+         if self._owned_gpus() + action.gpu_count > self.total_gpus:
+             return -2.0, "Cluster capacity would be exceeded.", []
+         if action.price_per_gpu < self._market_price:
+             return -1.0, f"Bid ${action.price_per_gpu:.2f} is below current clearing price ${self._market_price:.2f}.", []
+         total_cost = round(action.gpu_count * action.price_per_gpu * action.duration, 2)
+         if total_cost > self._budget_remaining:
+             return -2.0, "Insufficient budget for bid.", []
+
+         self._budget_remaining = round(self._budget_remaining - total_cost, 2)
+         self._reservations.append(
+             ReservationRecord(
+                 reservation_id=f"res-{uuid4().hex[:8]}",
+                 source="spot-market",
+                 gpu_count=action.gpu_count,
+                 remaining_ticks=action.duration,
+                 price_per_gpu=action.price_per_gpu,
+                 acquired_at_tick=self._current_tick,
+             )
+         )
+         return (
+             -total_cost,
+             None,
+             [
+                 self._event(
+                     "bid_won",
+                     f"Won {action.gpu_count} GPU(s) for {action.duration} tick(s) at ${action.price_per_gpu:.2f}/GPU.",
+                 )
+             ],
+         )
+
+     def _handle_accept_offer(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.offer_id:
+             return -2.0, "offer_id is required.", []
+         offer = next((item for item in self._visible_offers if item.offer_id == action.offer_id), None)
+         if offer is None:
+             return -2.0, f"Offer {action.offer_id} is not available.", []
+         if self._owned_gpus() + offer.gpu_count > self.total_gpus:
+             return -2.0, "Cluster capacity would be exceeded.", []
+
+         total_cost = round(offer.gpu_count * offer.price_per_gpu * offer.duration, 2)
+         if total_cost > self._budget_remaining:
+             return -2.0, "Insufficient budget for offer.", []
+
+         self._budget_remaining = round(self._budget_remaining - total_cost, 2)
+         self._reservations.append(
+             ReservationRecord(
+                 reservation_id=f"res-{uuid4().hex[:8]}",
+                 source=offer.actor_id,
+                 gpu_count=offer.gpu_count,
+                 remaining_ticks=offer.duration,
+                 price_per_gpu=offer.price_per_gpu,
+                 acquired_at_tick=self._current_tick,
+             )
+         )
+         self._visible_offers = [item for item in self._visible_offers if item.offer_id != offer.offer_id]
+         return (
+             -total_cost,
+             None,
+             [
+                 self._event(
+                     "offer_accepted",
+                     f"Accepted {offer.offer_type} offer from {offer.actor_id} for {offer.gpu_count} GPU(s).",
+                 )
+             ],
+         )
+
+     def _handle_swap(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.actor_id:
+             return -2.0, "actor_id is required.", []
+         if action.gpu_count <= 0 or action.price_per_gpu <= 0:
+             return -2.0, "gpu_count and price_per_gpu must be positive.", []
+         actor = next((item for item in self._hidden_actors if item.actor_id == action.actor_id), None)
+         if actor is None:
+             return -2.0, f"Unknown actor {action.actor_id}.", []
+         if self._owned_gpus() + action.gpu_count > self.total_gpus:
+             return -2.0, "Cluster capacity would be exceeded.", []
+         total_cost = round(action.gpu_count * action.price_per_gpu * action.duration, 2)
+         if total_cost > self._budget_remaining:
+             return -2.0, "Insufficient budget for swap.", []
+         if action.price_per_gpu < actor.swap_floor:
+             return -1.0, f"{actor.actor_id} rejected the swap; offered price is below its floor.", []
+
+         self._budget_remaining = round(self._budget_remaining - total_cost, 2)
+         self._reservations.append(
+             ReservationRecord(
+                 reservation_id=f"res-{uuid4().hex[:8]}",
+                 source=f"swap:{actor.actor_id}",
+                 gpu_count=action.gpu_count,
+                 remaining_ticks=action.duration,
+                 price_per_gpu=action.price_per_gpu,
+                 acquired_at_tick=self._current_tick,
+             )
+         )
+         return (
+             -total_cost,
+             None,
+             [
+                 self._event(
+                     "swap_accepted",
+                     f"{actor.actor_id} transferred {action.gpu_count} GPU(s) at ${action.price_per_gpu:.2f}/GPU.",
+                 )
+             ],
+         )
+
+     def _handle_schedule(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.job_id:
+             return -2.0, "job_id is required.", []
+         job = self._job(action.job_id)
+         if job is None:
+             return -2.0, f"Unknown job {action.job_id}.", []
+         if job.status in {"completed", "missed"}:
+             return -1.0, f"Job {action.job_id} is already terminal.", []
+         if not self._deps_completed(job):
+             return -1.0, f"Job {action.job_id} is blocked on dependencies {job.depends_on}.", []
+         if self._idle_owned_gpus() < job.gpu_count:
+             return -1.0, f"Need {job.gpu_count} idle owned GPU(s) to start {job.job_id}.", []
+
+         job.status = "running"
+         if job.started_at is None:
+             job.started_at = self._current_tick
+         return (
+             1.0,
+             None,
+             [
+                 self._event(
+                     "job_started",
+                     f"Scheduled {job.job_id} using {job.gpu_count} GPU(s).",
+                 )
+             ],
+         )
+
+     def _handle_delay(self, action: ComputeMarketAction) -> tuple[float, str | None, list[MarketEvent]]:
+         if not action.job_id:
+             return -2.0, "job_id is required.", []
+         job = self._job(action.job_id)
+         if job is None:
+             return -2.0, f"Unknown job {action.job_id}.", []
+         if job.status in {"completed", "missed"}:
+             return -1.0, f"Job {job.job_id} is already terminal.", []
+         job.status = "paused" if job.status == "running" else "pending"
+         job.delay_count += 1
+         return (
+             -1.0,
+             None,
+             [
+                 self._event(
+                     "job_delayed",
+                     f"Delayed {job.job_id}; slack shrinks while the deadline stays fixed.",
+                 )
+             ],
+         )
+
+     def _advance_tick(self) -> tuple[float, list[MarketEvent]]:
+         tick_reward = 0.0
+         events: list[MarketEvent] = []
+         available_owned = self._owned_gpus()
+         used_gpus = 0
+
+         running_jobs = sorted(
+             [job for job in self._jobs if job.status == "running"],
+             key=lambda item: (-item.priority, item.job_id),
+         )
+         for job in running_jobs:
+             if used_gpus + job.gpu_count <= available_owned:
+                 used_gpus += job.gpu_count
+                 job.remaining_duration -= 1
+                 events.append(
+                     self._event(
+                         "job_progress",
+                         f"{job.job_id} progressed; {job.remaining_duration} tick(s) remaining.",
+                     )
+                 )
+                 if job.remaining_duration == 0:
+                     job.status = "completed"
+                     job.completed_at = self._current_tick + 1
+                     tick_reward += job.value
+                     events.append(
+                         self._event(
+                             "job_completed",
+                             f"{job.job_id} completed before deadline and earned ${job.value:.2f}.",
+                         )
+                     )
+             else:
+                 job.status = "paused"
+                 tick_reward -= 3.0
+                 events.append(
+                     self._event(
+                         "job_paused",
+                         f"{job.job_id} paused because owned capacity dropped below demand.",
+                     )
+                 )
+
+         idle_owned = max(0, available_owned - used_gpus)
+         if idle_owned > 0:
+             idle_penalty = round(0.5 * idle_owned, 2)
+             tick_reward -= idle_penalty
+             events.append(
+                 self._event(
+                     "idle_penalty",
+                     f"Paid ${idle_penalty:.2f} idle-hoarding penalty for {idle_owned} unused owned GPU(s).",
+                 )
+             )
+
+         for reservation in self._reservations:
+             reservation.remaining_ticks = max(0, reservation.remaining_ticks - 1)
+         expired = [item for item in self._reservations if item.remaining_ticks == 0]
+         self._reservations = [item for item in self._reservations if item.remaining_ticks > 0]
+         for reservation in expired:
+             events.append(
+                 self._event(
+                     "reservation_expired",
+                     f"Reservation {reservation.reservation_id} from {reservation.source} expired.",
+                 )
+             )
+
+         next_tick = self._current_tick + 1
+         for job in self._jobs:
+             if job.status not in {"completed", "missed"} and next_tick > job.deadline:
+                 job.status = "missed"
+                 penalty = round(job.value * 0.6, 2)
+                 tick_reward -= penalty
+                 events.append(
+                     self._event(
+                         "deadline_missed",
+                         f"{job.job_id} missed its deadline and incurred ${penalty:.2f} penalty.",
+                     )
+                 )
+
+         self._current_tick = next_tick
+         self._done = (
+             self._current_tick >= self.max_ticks
+             or all(job.status in {"completed", "missed"} for job in self._jobs)
+             or self._budget_remaining <= 0.0
+         )
+
+         if not self._done:
+             self._refresh_market()
+             events.extend(self._market_events_for_tick())
+         else:
+             self._visible_offers = []
+             self._actor_signals = []
+             self._free_gpus = max(0, self.total_gpus - self._owned_gpus())
+
+         return round(tick_reward, 2), events
+
+     def _refresh_market(self) -> None:
+         owned = self._owned_gpus()
+         remaining_cluster = max(0, self.total_gpus - owned)
+         base_price = 4.0 + 0.3 * self._current_tick + self._rng.uniform(0.0, 1.0) + self._price_bias
+         actor_signals: list[ActorSignal] = []
+         visible_offers: list[MarketOffer] = []
+         external_demand = 0
+
+         for actor in self._hidden_actors:
+             if actor.policy_type == "urgent_tenant":
+                 gpu_demand = max(2, actor.preferred_gpu_count - (self._current_tick // 3))
+                 bid = round(actor.max_bid - 0.15 * self._current_tick, 2)
+                 pressure = "high" if gpu_demand >= 4 else "medium"
+             elif actor.policy_type == "cost_sensitive_tenant":
+                 gpu_demand = max(1, actor.preferred_gpu_count - (self._current_tick // 4))
+                 bid = round(actor.max_bid - 0.1 * max(0, self._current_tick - 1), 2)
+                 pressure = "medium" if gpu_demand >= 2 else "low"
+             else:
+                 gpu_demand = actor.preferred_gpu_count
+                 bid = round(actor.max_bid + 0.2 * self._current_tick, 2)
+                 pressure = "medium"
+                 if self._broker_enabled:
+                     visible_offers.append(
+                         MarketOffer(
+                             offer_id=f"offer-{self._current_tick}-{actor.actor_id}",
+                             actor_id=actor.actor_id,
+                             gpu_count=min(
+                                 remaining_cluster or actor.preferred_gpu_count,
+                                 actor.preferred_gpu_count,
+                             ),
+                             price_per_gpu=round(bid + 0.6, 2),
+                             duration=2,
+                             expires_at_tick=self._current_tick + 1,
+                             offer_type="broker",
+                         )
+                     )
+
+             if actor.policy_type != "broker":
+                 external_demand += gpu_demand
+             actor_signals.append(
+                 ActorSignal(
+                     actor_id=actor.actor_id,
+                     visible_behavior=actor.visible_behavior,
+                     pressure_hint=pressure,
+                     last_seen_bid=max(0.0, bid),
+                 )
+             )
+
+         self._external_allocated_gpus = min(remaining_cluster, external_demand)
+         self._free_gpus = max(0, remaining_cluster - self._external_allocated_gpus)
+         pressure_bump = 0.45 * (external_demand / max(1, self.total_gpus))
+         self._market_price = round(base_price + pressure_bump, 2)
+         self._visible_offers = [
+             offer for offer in visible_offers if offer.gpu_count > 0 and offer.expires_at_tick >= self._current_tick
+         ]
+         self._actor_signals = actor_signals
+
+     def _market_events_for_tick(self) -> list[MarketEvent]:
+         messages = [
+             self._event(
+                 "market_tick",
+                 f"Tick {self._current_tick}: spot price ${self._market_price:.2f}, public free GPUs {self._free_gpus}.",
+             )
+         ]
+         for signal in self._actor_signals:
+             messages.append(
+                 self._event(
+                     "actor_signal",
+                     f"{signal.actor_id} looks {signal.visible_behavior} with {signal.pressure_hint} pressure.",
+                 )
+             )
+         for offer in self._visible_offers:
+             messages.append(
+                 self._event(
+                     "offer_visible",
+                     f"{offer.actor_id} posted {offer.gpu_count} GPU(s) at ${offer.price_per_gpu:.2f}/GPU for {offer.duration} tick(s).",
+                 )
+             )
+         return messages
+
+     def _snapshot_state(
+         self,
+         episode_id: str,
+         step_count: int,
+         scenario_seed: int,
+     ) -> ComputeMarketState:
+         return ComputeMarketState(
+             episode_id=episode_id,
+             step_count=step_count,
+             scenario_seed=scenario_seed,
+             scenario_variant=self._scenario_variant,
+             current_tick=self._current_tick,
+             max_ticks=self.max_ticks,
+             total_gpus=self.total_gpus,
+             free_gpus=self._free_gpus,
+             owned_gpus=self._owned_gpus(),
+             idle_owned_gpus=self._idle_owned_gpus(),
+             budget_remaining=self._budget_remaining,
+             market_price=self._market_price,
+             cumulative_reward=self._cumulative_reward,
+             external_allocated_gpus=self._external_allocated_gpus,
+             done=self._done,
+             jobs=[job.model_copy(deep=True) for job in self._jobs],
+             visible_offers=[offer.model_copy(deep=True) for offer in self._visible_offers],
+             reservations=[reservation.model_copy(deep=True) for reservation in self._reservations],
+             actor_signals=[signal.model_copy(deep=True) for signal in self._actor_signals],
+             hidden_actors=[actor.model_copy(deep=True) for actor in self._hidden_actors],
+             recent_events=[event.model_copy(deep=True) for event in self._recent_events],
+         )
+
+     def _build_observation(
+         self,
+         reward: float,
+         done: bool,
+         metadata: dict,
+     ) -> ComputeMarketObservation:
+         return ComputeMarketObservation(
+             scenario_variant=self._scenario_variant,
+             current_tick=self._current_tick,
+             max_ticks=self.max_ticks,
+             total_gpus=self.total_gpus,
+             free_gpus=self._free_gpus,
+             owned_gpus=self._owned_gpus(),
+             idle_owned_gpus=self._idle_owned_gpus(),
+             budget_remaining=self._budget_remaining,
+             market_price=self._market_price,
+             jobs=[job.model_copy(deep=True) for job in self._jobs],
+             visible_offers=[offer.model_copy(deep=True) for offer in self._visible_offers],
+             recent_events=[event.model_copy(deep=True) for event in self._recent_events],
+             actor_signals=[signal.model_copy(deep=True) for signal in self._actor_signals],
+             done=done,
+             reward=reward,
+             metadata=metadata,
+         )
+
+     def _deps_completed(self, job: JobRecord) -> bool:
+         if not job.depends_on:
+             return True
+         completed = {item.job_id for item in self._jobs if item.status == "completed"}
+         return all(dep in completed for dep in job.depends_on)
+
+     def _job(self, job_id: str) -> JobRecord | None:
+         return next((job for job in self._jobs if job.job_id == job_id), None)
+
+     def _owned_gpus(self) -> int:
+         return sum(reservation.gpu_count for reservation in self._reservations)
+
+     def _idle_owned_gpus(self) -> int:
+         running_gpu_demand = sum(job.gpu_count for job in self._jobs if job.status == "running")
+         return max(0, self._owned_gpus() - running_gpu_demand)
+
+     def _event(self, event_type: str, message: str) -> MarketEvent:
+         return MarketEvent(tick=self._current_tick, event_type=event_type, message=message)
+
+
+ def create_environment_from_env() -> ComputeMarketEnvironment:
+     """Factory used by the FastAPI app and tests."""
+
+     return ComputeMarketEnvironment(
+         total_gpus=int(os.getenv("COMPUTE_MARKET_TOTAL_GPUS", "8")),
+         initial_budget=float(os.getenv("COMPUTE_MARKET_INITIAL_BUDGET", "150")),
+         max_ticks=int(os.getenv("COMPUTE_MARKET_MAX_TICKS", "12")),
+         default_seed=int(os.getenv("COMPUTE_MARKET_DEFAULT_SEED", "0")),
+     )
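The environment can also be driven in-process, without the HTTP layer, exactly as the tests below do. A sketch tracing the bid accounting in `_handle_bid`, with parameters copied from the test suite: the budget drops by `gpu_count * price_per_gpu * duration`, and the tick reward is that cost plus the 0.5/GPU idle-hoarding penalty applied in `_advance_tick` while the new reservation sits unused:

```python
from compute_market_env import ComputeMarketAction
from compute_market_env.server.compute_market_environment import ComputeMarketEnvironment

env = ComputeMarketEnvironment(total_gpus=12, initial_budget=200.0, max_ticks=8)
obs = env.reset(seed=3)
budget_before = obs.budget_remaining  # 200.0

obs = env.step(
    ComputeMarketAction(
        action_type="bid_for_capacity",
        gpu_count=4,
        price_per_gpu=8.0,
        duration=3,
    )
)
print(budget_before - obs.budget_remaining)  # 96.0 = 4 * 8.0 * 3
print(obs.reward)  # -96.0 minus the idle penalty on the 4 unused owned GPUs
```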
server/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ openenv-core[core]>=0.2.1
+ uvicorn>=0.30.0
tests/test_compute_market_environment.py ADDED
@@ -0,0 +1,63 @@
+ from compute_market_env import ComputeMarketAction
+ from compute_market_env.server.compute_market_environment import ComputeMarketEnvironment
+
+
+ def test_reset_is_deterministic_for_fixed_seed():
+     env = ComputeMarketEnvironment(default_seed=1)
+     obs1 = env.reset(seed=11)
+     obs2 = env.reset(seed=11)
+
+     assert obs1.market_price == obs2.market_price
+     assert obs1.free_gpus == obs2.free_gpus
+     assert [offer.price_per_gpu for offer in obs1.visible_offers] == [
+         offer.price_per_gpu for offer in obs2.visible_offers
+     ]
+
+
+ def test_bid_schedule_and_complete_job():
+     env = ComputeMarketEnvironment(total_gpus=12, initial_budget=200.0, max_ticks=8, default_seed=0)
+     env.reset(seed=3)
+
+     result = env.step(
+         ComputeMarketAction(
+             action_type="bid_for_capacity",
+             gpu_count=4,
+             price_per_gpu=8.0,
+             duration=3,
+         )
+     )
+     assert result.reward < 0
+
+     result = env.step(ComputeMarketAction(action_type="schedule_job", job_id="job-a"))
+     assert any(job.status == "running" for job in result.jobs)
+
+     result = env.step(ComputeMarketAction(action_type="noop"))
+     job_a = next(job for job in result.jobs if job.job_id == "job-a")
+     assert job_a.status == "completed"
+     assert result.reward > 0
+
+
+ def test_invalid_action_returns_penalty_metadata():
+     env = ComputeMarketEnvironment(default_seed=2)
+     env.reset(seed=2)
+
+     result = env.step(ComputeMarketAction(action_type="schedule_job", job_id="missing-job"))
+
+     assert result.reward < 0
+     assert "error" in result.metadata
+
+
+ def test_tight_capacity_variant_changes_capacity():
+     env = ComputeMarketEnvironment(total_gpus=8, default_seed=0)
+     result = env.reset(seed=7, scenario_variant="tight_capacity")
+
+     assert result.scenario_variant == "tight_capacity"
+     assert result.total_gpus == 6
+
+
+ def test_policy_shift_variant_removes_broker_offer():
+     env = ComputeMarketEnvironment(default_seed=0)
+     result = env.reset(seed=7, scenario_variant="policy_shift")
+
+     assert result.scenario_variant == "policy_shift"
+     assert all(offer.actor_id != "broker-1" for offer in result.visible_offers)
@@ -0,0 +1,127 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4421ed68",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Compute Market Qwen3 GRPO\n",
9
+ "\n",
10
+ "<a href=\"https://colab.research.google.com/github/kiankyars/lambdatheta/blob/main/training/Compute_Market_Qwen3_GRPO.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
11
+ "\n",
12
+ "Minimal Colab notebook for GRPO on the OpenEnv compute market environment.\n",
13
+ "Use this first for a smoke run before trying the larger gpt-oss notebook."
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "id": "cbd5e394",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "%%capture\n",
24
+ "!pip install --upgrade uv\n",
25
+ "!uv pip install unsloth vllm --torch-backend=auto\n",
26
+ "!uv pip install --upgrade --no-cache-dir --no-deps unsloth unsloth_zoo\n",
27
+ "!uv pip install transformers==4.56.2 'trl>=0.24.0' datasets openenv-core\n",
28
+ "!git clone https://github.com/kiankyars/lambdatheta.git\n",
29
+ "!pip install git+https://huggingface.co/spaces/openenv-community/compute_market_env\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "bd1f857e",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import os, sys\n",
40
+ "os.environ['OPENENV_URL'] = 'https://openenv-community-compute-market-env.hf.space'\n",
41
+ "sys.path.append('/content/lambdatheta')\n",
42
+ "MAX_STEPS = 300\n"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "id": "b93e75ad",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "from unsloth import FastLanguageModel\n",
53
+ "import torch\n",
54
+ "from transformers import AutoTokenizer\n",
55
+ "from training.compute_market_grpo import build_trainer, build_dataset\n",
56
+ "\n",
57
+ "max_seq_length = 2048 # Can increase for longer reasoning traces\n",
58
+ "lora_rank = 32 # Larger rank = smarter, but slower\n",
59
+ "\n",
60
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
61
+ " model_name = \"unsloth/Qwen3-4B-Base\",\n",
62
+ " max_seq_length = max_seq_length,\n",
63
+ " load_in_4bit = False, # False for LoRA 16bit\n",
64
+ " fast_inference = True, # Enable vllm fast inference\n",
65
+ " max_lora_rank = lora_rank,\n",
66
+ " gpu_memory_utilization = 0.9, # Reduce if out of memory\n",
67
+ ")\n",
68
+ "\n",
69
+ "model = FastLanguageModel.get_peft_model(\n",
70
+ " model,\n",
71
+ " r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
72
+ " target_modules = [\n",
73
+ " \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
74
+ " \"gate_proj\", \"up_proj\", \"down_proj\",\n",
75
+ " ],\n",
76
+ " lora_alpha = lora_rank*2, # *2 speeds up training\n",
77
+ " use_gradient_checkpointing = \"unsloth\", # Reduces memory usage\n",
78
+ " random_state = 3407,\n",
79
+ ")\n",
80
+ "\n",
81
+ "tokenizer = AutoTokenizer.from_pretrained(\"unsloth/Qwen3-4B-Base\")\n",
82
+ "if tokenizer.pad_token is None:\n",
83
+ " tokenizer.pad_token = tokenizer.eos_token\n"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "dataset = build_dataset(size=64)\n",
93
+ "trainer = build_trainer(\n",
94
+ " model=model,\n",
95
+ " tokenizer=tokenizer,\n",
96
+ " env_url=os.environ['OPENENV_URL'],\n",
97
+ " train_dataset=dataset,\n",
98
+ " output_dir='outputs/compute-market-qwen3-4b',\n",
99
+ " max_steps=MAX_STEPS,\n",
100
+ ")\n",
101
+ "trainer\n"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "trainer.train()\n"
111
+ ]
112
+ }
113
+ ],
114
+ "metadata": {
115
+ "kernelspec": {
116
+ "display_name": "Python 3",
117
+ "language": "python",
118
+ "name": "python3"
119
+ },
120
+ "language_info": {
121
+ "name": "python",
122
+ "version": "3.11"
123
+ }
124
+ },
125
+ "nbformat": 4,
126
+ "nbformat_minor": 5
127
+ }
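
The notebook points `OPENENV_URL` at the hosted Space, but the helpers it imports only read that variable, so the same cells can target a locally served environment for faster debugging; a sketch, assuming the uvicorn command from the README is running on port 8000:

```python
import os

# Swap the hosted Space for a local server before building the trainer.
os.environ["OPENENV_URL"] = "http://localhost:8000"
```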
training/OpenEnv_gpt_oss_(20B)_Reinforcement_Learning_2048_Game.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
training/Qwen3_(4B)_GRPO.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
training/__init__.py ADDED
@@ -0,0 +1 @@
+ """Training helpers for the compute market environment."""
training/compute_market_grpo.py ADDED
@@ -0,0 +1,314 @@
+ """GRPO training helpers for the Compute Market environment.
+
+ Designed to be imported from a Colab notebook after installing:
+ - the environment package from the Hugging Face Space or GitHub repo
+ - TRL / Unsloth / transformers runtime deps
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ from dataclasses import dataclass
+ from typing import Any
+
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+ ACTION_JSON_RE = re.compile(r"\{.*\}", re.DOTALL)
+
+ SYSTEM_PROMPT = """You are a compute allocator trading for scarce GPU capacity.
+ Return exactly one JSON object with a valid action.
+ Allowed action_type values: bid_for_capacity, accept_offer, propose_swap, schedule_job, delay_job, inspect_market, noop.
+ Only include fields needed by the chosen action.
+ Be conservative with budget and prioritize completing valuable jobs before their deadlines."""
+
+ DEFAULT_TASK_PROMPT = (
+     "Maximize completed job value while minimizing spend, idle-hoarding penalties, "
+     "and missed deadlines in the compute market."
+ )
+
+
+ @dataclass
+ class RolloutSummary:
+     prompt_ids: list[int]
+     completion_ids: list[int]
+     logprobs: list[float]
+     episode_return: float
+     valid_action_reward: float
+     completion_bonus: float
+     transcripts: list[dict[str, Any]]
+
+
+ def observation_to_prompt(observation: Any, task_prompt: str = DEFAULT_TASK_PROMPT) -> str:
+     jobs = [job.model_dump() for job in observation.jobs]
+     offers = [offer.model_dump() for offer in observation.visible_offers]
+     signals = [signal.model_dump() for signal in observation.actor_signals]
+     events = [event.model_dump() for event in observation.recent_events]
+     return json.dumps(
+         {
+             "task": task_prompt,
+             "scenario_variant": getattr(observation, "scenario_variant", "baseline"),
+             "tick": observation.current_tick,
+             "max_ticks": observation.max_ticks,
+             "budget_remaining": observation.budget_remaining,
+             "market_price": observation.market_price,
+             "public_free_gpus": observation.free_gpus,
+             "owned_gpus": observation.owned_gpus,
+             "idle_owned_gpus": observation.idle_owned_gpus,
+             "jobs": jobs,
+             "visible_offers": offers,
+             "actor_signals": signals,
+             "recent_events": events,
+         },
+         indent=2,
+     )
+
+
+ def parse_action(text: str) -> tuple[ComputeMarketAction, bool]:
+     match = ACTION_JSON_RE.search(text)
+     if not match:
+         return ComputeMarketAction(action_type="inspect_market"), False
+     try:
+         payload = json.loads(match.group(0))
+         return ComputeMarketAction(**payload), True
+     except Exception:
+         return ComputeMarketAction(action_type="inspect_market"), False
+
+
+ def _count_completed_jobs(observation: Any) -> int:
+     return sum(1 for job in observation.jobs if job.status == "completed")
+
+
+ def rollout_once(
+     trainer: Any,
+     env: ComputeMarketEnv,
+     tokenizer: Any,
+     dataset_prompt: str,
+     system_prompt: str = SYSTEM_PROMPT,
+     max_turns: int = 6,
+     seed: int | None = None,
+ ) -> RolloutSummary:
+     from trl.experimental.openenv import generate_rollout_completions
+
+     result = env.reset(seed=seed)
+     prompt_ids: list[int] = []
+     completion_ids: list[int] = []
+     logprobs: list[float] = []
+     transcripts: list[dict[str, Any]] = []
+     rewards: list[float] = []
+     valid_action_reward = 0.0
+     completed_before = 0
+
+     for turn in range(max_turns):
+         if result.done:
+             break
+
+         prompt_text = observation_to_prompt(result.observation, dataset_prompt)
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": prompt_text},
+         ]
+         rendered_prompt = tokenizer.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             tokenize=False,
+             enable_thinking=False,
+         )
+
+         rollout_outputs = generate_rollout_completions(trainer, [rendered_prompt])[0]
+         prompt_ids.extend(rollout_outputs["prompt_ids"])
+         completion_ids.extend(rollout_outputs["completion_ids"])
+         logprobs.extend(rollout_outputs["logprobs"])
+         completion_text = rollout_outputs.get("text") or tokenizer.decode(
+             rollout_outputs["completion_ids"],
+             skip_special_tokens=True,
+         )
+
+         action, is_valid = parse_action(completion_text)
+         result = env.step(action)
+         reward = float(result.reward or 0.0)
+         rewards.append(reward)
+         valid_action_reward += 0.25 if is_valid else -1.0
+
+         completed_after = _count_completed_jobs(result.observation)
+         completion_gain = max(0, completed_after - completed_before)
+         completed_before = completed_after
+
+         transcripts.append(
+             {
+                 "turn": turn,
+                 "prompt": prompt_text,
+                 "completion": completion_text,
+                 "action": action.model_dump(exclude_none=True),
+                 "is_valid_action": is_valid,
+                 "reward": reward,
+                 "completed_jobs": completed_after,
+                 "completion_gain": completion_gain,
+             }
+         )
+
+     completion_bonus = float(completed_before)
+     return RolloutSummary(
+         prompt_ids=prompt_ids,
+         completion_ids=completion_ids,
+         logprobs=logprobs,
+         episode_return=sum(rewards),
+         valid_action_reward=valid_action_reward,
+         completion_bonus=completion_bonus,
+         transcripts=transcripts,
+     )
+
+
+ def rollout_func(
+     prompts: list[str],
+     trainer: Any | None = None,
+     tokenizer: Any | None = None,
+     env_url: str | None = None,
+     max_turns: int = 6,
+     seed_offset: int = 0,
+ ) -> dict[str, Any]:
+     if trainer is None:
+         raise ValueError("trainer is required")
+     if tokenizer is None:
+         raise ValueError("tokenizer is required")
+
+     env_url = env_url or os.environ.get("OPENENV_URL", "http://localhost:8000")
+     episode_prompt_ids = []
+     episode_completion_ids = []
+     episode_logprobs = []
+     episode_returns = []
+     validity_rewards = []
+     completion_bonuses = []
+     transcripts = []
+
+     with ComputeMarketEnv(base_url=env_url) as env:
+         for idx, prompt_text in enumerate(prompts):
+             episode = rollout_once(
+                 trainer=trainer,
+                 env=env,
+                 tokenizer=tokenizer,
+                 dataset_prompt=prompt_text,
+                 max_turns=max_turns,
+                 seed=seed_offset + idx,
+             )
+             episode_prompt_ids.append(episode.prompt_ids)
+             episode_completion_ids.append(episode.completion_ids)
+             episode_logprobs.append(episode.logprobs)
+             episode_returns.append(episode.episode_return)
+             validity_rewards.append(episode.valid_action_reward)
+             completion_bonuses.append(episode.completion_bonus)
+             transcripts.append(episode.transcripts)
+
+     return {
+         "prompt_ids": episode_prompt_ids,
+         "completion_ids": episode_completion_ids,
+         "logprobs": episode_logprobs,
+         "env_reward": episode_returns,
+         "valid_action_reward": validity_rewards,
+         "completion_bonus": completion_bonuses,
+         "transcripts": transcripts,
+     }
+
+
+ def reward_env_return(completions: list[Any], **kwargs: Any) -> list[float]:
+     rewards = kwargs.get("env_reward") or []
+     return [float(rewards[i]) if i < len(rewards) else 0.0 for i in range(len(completions))]
+
+
+ def reward_valid_action(completions: list[Any], **kwargs: Any) -> list[float]:
+     rewards = kwargs.get("valid_action_reward") or []
+     return [float(rewards[i]) if i < len(rewards) else 0.0 for i in range(len(completions))]
+
+
+ def reward_job_completion(completions: list[Any], **kwargs: Any) -> list[float]:
+     rewards = kwargs.get("completion_bonus") or []
+     return [float(rewards[i]) if i < len(rewards) else 0.0 for i in range(len(completions))]
+
+
+ def build_dataset(size: int = 128, prompt: str = DEFAULT_TASK_PROMPT):
+     from datasets import Dataset
+
+     return Dataset.from_dict({"prompt": [prompt] * size})
+
+
+ def build_grpo_config(
+     output_dir: str = "outputs/compute-market-qwen3-4b",
+     max_steps: int = 300,
+     learning_rate: float = 5e-6,
+     num_generations: int = 2,
+     max_prompt_length: int = 1800,
+     max_completion_length: int = 192,
+     use_vllm: bool = True,
+ ):
+     from trl import GRPOConfig
+
+     kwargs: dict[str, Any] = dict(
+         learning_rate=learning_rate,
+         weight_decay=0.001,
+         warmup_ratio=0.1,
+         lr_scheduler_type="linear",
+         optim="adamw_8bit",
+         logging_steps=1,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=1,
+         num_generations=num_generations,
+         max_prompt_length=max_prompt_length,
+         max_completion_length=max_completion_length,
+         max_steps=max_steps,
+         save_steps=max_steps,
+         report_to="none",
+         output_dir=output_dir,
+     )
+     if use_vllm:
+         kwargs.update(
+             use_vllm=True,
+             vllm_mode="colocate",
+             vllm_gpu_memory_utilization=0.15,
+         )
+     return GRPOConfig(**kwargs)
+
+
+ def build_trainer(
+     model: Any,
+     tokenizer: Any,
+     env_url: str,
+     train_dataset: Any | None = None,
+     output_dir: str = "outputs/compute-market-qwen3-4b",
+     max_steps: int = 300,
+     max_turns: int = 6,
+ ):
+     from trl import GRPOTrainer
+
+     train_dataset = train_dataset or build_dataset()
+     args = build_grpo_config(output_dir=output_dir, max_steps=max_steps)
+
+     def bound_rollout_func(prompts: list[str], trainer: Any | None = None, **_: Any) -> dict[str, Any]:
+         return rollout_func(
+             prompts,
+             trainer=trainer,
+             tokenizer=tokenizer,
+             env_url=env_url,
+             max_turns=max_turns,
+         )
+
+     return GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         reward_funcs=[
+             reward_env_return,
+             reward_valid_action,
+             reward_job_completion,
+         ],
+         train_dataset=train_dataset,
+         args=args,
+         rollout_func=bound_rollout_func,
+     )
+
+
+ def build_colab_setup_snippet(space_repo_id: str = "openenv-community/compute_market_env") -> str:
+     return f"""# Colab install\n!pip install --upgrade uv\n!uv pip install unsloth vllm --torch-backend=auto\n!uv pip install --upgrade --no-cache-dir --no-deps unsloth unsloth_zoo\n!uv pip install transformers==4.56.2 'trl>=0.24.0' datasets openenv-core\n!pip install git+https://huggingface.co/spaces/{space_repo_id}\n\nimport os\nos.environ['OPENENV_URL'] = 'https://{space_repo_id.replace('/', '-').replace('_', '-')}.hf.space'\n"""
+
+
+ if __name__ == "__main__":
+     print(build_colab_setup_snippet())
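
`parse_action` is deliberately forgiving: any completion without a parseable JSON object degrades to a harmless `inspect_market`, which the rollout then scores with the `-1.0` validity penalty instead of crashing mid-episode. A small illustration with hypothetical completion strings:

```python
from training.compute_market_grpo import parse_action

good = 'Bidding now. {"action_type": "bid_for_capacity", "gpu_count": 2, "price_per_gpu": 5.0, "duration": 2}'
bad = "Let me think about the market first."

action, ok = parse_action(good)
print(action.action_type, ok)  # bid_for_capacity True

action, ok = parse_action(bad)
print(action.action_type, ok)  # inspect_market False
```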
training/eval_compute_market_benchmark.py ADDED
@@ -0,0 +1,303 @@
+ """Tiny ID/OOD benchmark for Compute Market models.
+
+ Run this in Colab after installing the environment package from the HF Space and
+ cloning this repo for the helper code.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any
+
+ from compute_market_env import ComputeMarketEnv
+
+ from training.compute_market_grpo import (
+     DEFAULT_TASK_PROMPT,
+     SYSTEM_PROMPT,
+     observation_to_prompt,
+     parse_action,
+ )
+
+ BENCHMARK_SPLITS = {
+     "id_baseline": {
+         "tag": "ID",
+         "scenario_variant": "baseline",
+         "seeds": [11, 12, 13, 14, 15],
+     },
+     "ood_tight_capacity": {
+         "tag": "OOD",
+         "scenario_variant": "tight_capacity",
+         "seeds": [101, 102, 103, 104, 105],
+     },
+     "ood_price_shock": {
+         "tag": "OOD",
+         "scenario_variant": "price_shock",
+         "seeds": [201, 202, 203, 204, 205],
+     },
+     "ood_policy_shift": {
+         "tag": "OOD",
+         "scenario_variant": "policy_shift",
+         "seeds": [301, 302, 303, 304, 305],
+     },
+     "ood_job_mix": {
+         "tag": "OOD",
+         "scenario_variant": "job_mix",
+         "seeds": [401, 402, 403, 404, 405],
+     },
+ }
+
+
+ @dataclass
+ class EpisodeStats:
+     model_label: str
+     split: str
+     tag: str
+     seed: int
+     scenario_variant: str
+     total_return: float
+     completed_jobs: int
+     missed_jobs: int
+     invalid_actions: int
+     budget_remaining: float
+     turns: int
+
+
+ def load_model_and_tokenizer(
+     model_ref: str,
+     max_seq_length: int = 4096,
+     load_in_4bit: bool = True,
+     fast_inference: bool = True,
+ ):
+     from unsloth import FastLanguageModel
+
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=model_ref,
+         max_seq_length=max_seq_length,
+         load_in_4bit=load_in_4bit,
+         fast_inference=fast_inference,
+     )
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+     model.eval()
+     return model, tokenizer
+
+
+ def generate_completion(
+     model: Any,
+     tokenizer: Any,
+     observation: Any,
+     task_prompt: str = DEFAULT_TASK_PROMPT,
+     system_prompt: str = SYSTEM_PROMPT,
+     max_new_tokens: int = 160,
+ ) -> str:
+     prompt_text = observation_to_prompt(observation, task_prompt)
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": prompt_text},
+     ]
+     rendered_prompt = tokenizer.apply_chat_template(
+         messages,
+         add_generation_prompt=True,
+         tokenize=False,
+         enable_thinking=False,
+     )
+     inputs = tokenizer(rendered_prompt, return_tensors="pt")
+     inputs = {key: value.to(model.device) for key, value in inputs.items()}
+     outputs = model.generate(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         do_sample=False,
+         # greedy decoding; temperature is not passed since it is ignored when sampling is off
+         pad_token_id=tokenizer.eos_token_id,
+     )
+     completion_ids = outputs[0][inputs["input_ids"].shape[1] :]
+     return tokenizer.decode(completion_ids, skip_special_tokens=True)
+
+
+ def run_episode(
+     env: ComputeMarketEnv,
+     model: Any,
+     tokenizer: Any,
+     model_label: str,
+     split: str,
+     tag: str,
+     seed: int,
+     scenario_variant: str,
+     max_turns: int = 6,
+ ) -> EpisodeStats:
+     result = env.reset(seed=seed, scenario_variant=scenario_variant)
+     total_return = 0.0
+     invalid_actions = 0
+     turns = 0
+
+     for _ in range(max_turns):
+         if result.done:
+             break
+         completion = generate_completion(model, tokenizer, result.observation)
+         action, is_valid = parse_action(completion)
+         if not is_valid:
+             invalid_actions += 1
+         result = env.step(action)
+         total_return += float(result.reward or 0.0)
+         turns += 1
+
+     jobs = result.observation.jobs
+     completed_jobs = sum(1 for job in jobs if job.status == "completed")
+     missed_jobs = sum(1 for job in jobs if job.status == "missed")
+     return EpisodeStats(
+         model_label=model_label,
+         split=split,
+         tag=tag,
+         seed=seed,
+         scenario_variant=scenario_variant,
+         total_return=round(total_return, 2),
+         completed_jobs=completed_jobs,
+         missed_jobs=missed_jobs,
+         invalid_actions=invalid_actions,
+         budget_remaining=round(result.observation.budget_remaining, 2),
+         turns=turns,
+     )
+
+
+ def evaluate_model(
+     model: Any,
+     tokenizer: Any,
+     model_label: str,
+     env_url: str,
+     benchmark_splits: dict[str, dict[str, Any]] = BENCHMARK_SPLITS,
+     max_turns: int = 6,
+ ) -> list[EpisodeStats]:
+     stats: list[EpisodeStats] = []
+     with ComputeMarketEnv(base_url=env_url) as env:
+         for split_name, config in benchmark_splits.items():
+             for seed in config["seeds"]:
+                 stats.append(
+                     run_episode(
+                         env=env,
+                         model=model,
+                         tokenizer=tokenizer,
+                         model_label=model_label,
+                         split=split_name,
+                         tag=config["tag"],
+                         seed=seed,
+                         scenario_variant=config["scenario_variant"],
+                         max_turns=max_turns,
+                     )
+                 )
+     return stats
+
+
+ def _safe_mean(values: list[float]) -> float:
+     return round(sum(values) / max(1, len(values)), 2)
+
+
+ def summarize_results(stats: list[EpisodeStats]) -> list[dict[str, Any]]:
+     grouped: dict[tuple[str, str], list[EpisodeStats]] = {}
+     for item in stats:
+         grouped.setdefault((item.model_label, item.split), []).append(item)
+
+     rows: list[dict[str, Any]] = []
+     for (model_label, split), items in grouped.items():
+         rows.append(
+             {
+                 "model": model_label,
+                 "split": split,
+                 "tag": items[0].tag,
+                 "scenario_variant": items[0].scenario_variant,
+                 "episodes": len(items),
+                 "mean_return": _safe_mean([item.total_return for item in items]),
+                 "mean_completed_jobs": _safe_mean([item.completed_jobs for item in items]),
+                 "mean_missed_jobs": _safe_mean([item.missed_jobs for item in items]),
+                 "mean_budget_remaining": _safe_mean([item.budget_remaining for item in items]),
+                 "invalid_action_rate": _safe_mean(
+                     [item.invalid_actions / max(1, item.turns) for item in items]
+                 ),
+             }
+         )
+
+     for model_label in sorted({item.model_label for item in stats}):
+         for tag_name in ["ID", "OOD"]:
+             items = [item for item in stats if item.model_label == model_label and item.tag == tag_name]
+             if not items:
+                 continue
+             rows.append(
+                 {
+                     "model": model_label,
+                     "split": f"aggregate_{tag_name.lower()}",
+                     "tag": tag_name,
+                     "scenario_variant": "mixed",
+                     "episodes": len(items),
+                     "mean_return": _safe_mean([item.total_return for item in items]),
+                     "mean_completed_jobs": _safe_mean([item.completed_jobs for item in items]),
+                     "mean_missed_jobs": _safe_mean([item.missed_jobs for item in items]),
+                     "mean_budget_remaining": _safe_mean([item.budget_remaining for item in items]),
+                     "invalid_action_rate": _safe_mean(
+                         [item.invalid_actions / max(1, item.turns) for item in items]
+                     ),
+                 }
+             )
+     return rows
+
+
+ def render_markdown_table(rows: list[dict[str, Any]]) -> str:
+     headers = [
+         "model",
+         "split",
+         "tag",
+         "mean_return",
+         "mean_completed_jobs",
+         "mean_missed_jobs",
+         "mean_budget_remaining",
+         "invalid_action_rate",
+     ]
+     lines = [
+         "| " + " | ".join(headers) + " |",
+         "|" + "|".join(["---"] * len(headers)) + "|",
+     ]
+     for row in rows:
+         lines.append(
+             "| "
+             + " | ".join(str(row.get(header, "")) for header in headers)
+             + " |"
+         )
+     return "\n".join(lines)
+
+
+ def benchmark_two_models(
+     base_model_ref: str,
+     tuned_model_ref: str,
+     env_url: str,
+     max_turns: int = 6,
+ ) -> tuple[list[EpisodeStats], list[dict[str, Any]], str]:
+     all_stats: list[EpisodeStats] = []
+     for label, model_ref in [
+         ("Qwen3-4B-Base", base_model_ref),
+         ("compute-market-qwen3-4b", tuned_model_ref),
+     ]:
+         model, tokenizer = load_model_and_tokenizer(model_ref)
+         all_stats.extend(
+             evaluate_model(
+                 model=model,
+                 tokenizer=tokenizer,
+                 model_label=label,
+                 env_url=env_url,
+                 max_turns=max_turns,
+             )
+         )
+     rows = summarize_results(all_stats)
+     return all_stats, rows, render_markdown_table(rows)
+
+
+ if __name__ == "__main__":
+     import os
+
+     env_url = os.environ.get(
+         "OPENENV_URL",
+         "https://openenv-community-compute-market-env.hf.space",
+     )
+     _, rows, table = benchmark_two_models(
+         base_model_ref="Qwen/Qwen3-4B",
+         tuned_model_ref="outputs/compute-market-qwen3-4b",
+         env_url=env_url,
+     )
+     print(table)
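
`summarize_results` and `render_markdown_table` are pure functions over `EpisodeStats`, so the reporting path can be exercised without a GPU or a live server. A minimal sketch with two hand-built episodes (the numbers here are made up):

```python
from training.eval_compute_market_benchmark import (
    EpisodeStats,
    render_markdown_table,
    summarize_results,
)

stats = [
    EpisodeStats(
        model_label="demo", split="id_baseline", tag="ID", seed=11,
        scenario_variant="baseline", total_return=4.5, completed_jobs=2,
        missed_jobs=0, invalid_actions=1, budget_remaining=120.0, turns=6,
    ),
    EpisodeStats(
        model_label="demo", split="id_baseline", tag="ID", seed=12,
        scenario_variant="baseline", total_return=-2.0, completed_jobs=1,
        missed_jobs=1, invalid_actions=0, budget_remaining=90.0, turns=6,
    ),
]

# Prints one per-split row plus the aggregate_id row as a markdown table.
print(render_markdown_table(summarize_results(stats)))
```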
training/minimal_grpo_rollout.py ADDED
@@ -0,0 +1,81 @@
+ """Minimal TRL/OpenEnv rollout helpers for Colab."""
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ from typing import Any
+
+ from compute_market_env import ComputeMarketAction, ComputeMarketEnv
+
+ ACTION_RE = re.compile(r"\{.*\}", re.DOTALL)
+ SYSTEM_PROMPT = """You trade for scarce GPU capacity.
+ Choose exactly one JSON action per turn.
+ Valid action_type values are: bid_for_capacity, accept_offer, propose_swap, schedule_job, delay_job, inspect_market, noop.
+ Return strict JSON only."""
+
+
+ def observation_to_prompt(observation) -> str:
+     jobs = [job.model_dump() for job in observation.jobs]
+     offers = [offer.model_dump() for offer in observation.visible_offers]
+     signals = [signal.model_dump() for signal in observation.actor_signals]
+     return json.dumps(
+         {
+             "tick": observation.current_tick,
+             "budget_remaining": observation.budget_remaining,
+             "market_price": observation.market_price,
+             "free_gpus": observation.free_gpus,
+             "owned_gpus": observation.owned_gpus,
+             "idle_owned_gpus": observation.idle_owned_gpus,
+             "jobs": jobs,
+             "visible_offers": offers,
+             "actor_signals": signals,
+         },
+         indent=2,
+     )
+
+
+ def parse_action(text: str) -> ComputeMarketAction:
+     match = ACTION_RE.search(text)
+     if not match:
+         return ComputeMarketAction(action_type="inspect_market")
+     try:
+         payload = json.loads(match.group(0))
+         return ComputeMarketAction(**payload)
+     except Exception:
+         return ComputeMarketAction(action_type="inspect_market")
+
+
+ def rollout_once(generate_completion, seed: int = 0, max_turns: int = 6) -> dict[str, Any]:
+     env_url = os.environ.get("OPENENV_URL", "http://localhost:8000")
+     rewards = []
+     actions = []
+     transcripts = []
+
+     with ComputeMarketEnv(base_url=env_url) as env:
+         result = env.reset(seed=seed)
+         for _ in range(max_turns):
+             if result.done:
+                 break
+             prompt = observation_to_prompt(result.observation)
+             completion = generate_completion(SYSTEM_PROMPT, prompt)
+             action = parse_action(completion)
+             result = env.step(action)
+             actions.append(action.model_dump(exclude_none=True))
+             rewards.append(float(result.reward or 0.0))
+             transcripts.append(
+                 {
+                     "prompt": prompt,
+                     "completion": completion,
+                     "action": action.model_dump(exclude_none=True),
+                     "reward": result.reward,
+                 }
+             )
+
+     return {
+         "actions": actions,
+         "rewards": rewards,
+         "return": sum(rewards),
+         "transcripts": transcripts,
+     }
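
`rollout_once` only needs a `generate_completion(system_prompt, prompt) -> str` callable, so the loop can be smoke-tested without any model at all; a sketch using a fixed no-op policy against whatever server `OPENENV_URL` points at (default http://localhost:8000):

```python
from training.minimal_grpo_rollout import rollout_once

def always_noop(system_prompt: str, prompt: str) -> str:
    # Stand-in for the LLM: always emit one valid JSON action.
    return '{"action_type": "noop"}'

summary = rollout_once(always_noop, seed=0, max_turns=3)
print(summary["return"], summary["actions"])
```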
training/sample_benchmark_results.md ADDED
@@ -0,0 +1,4 @@
+ | model | split | tag | mean_return | mean_completed_jobs | mean_missed_jobs | mean_budget_remaining | invalid_action_rate |
+ | ----------------------- | ------------- | --- | ----------- | ------------------- | ---------------- | --------------------- | ------------------- |
+ | Qwen3-4B-Base | aggregate_ood | OOD | -61.7 | 0.5 | 1.9 | 66.8 | 0.44 |
+ | compute-market-qwen3-4b | aggregate_ood | OOD | -18.4 | 1.1 | 1.2 | 41.7 | 0.16 |
uv.lock ADDED
The diff for this file is too large to render. See raw diff