CreativeEngineer Claude Opus 4.6 committed on
Commit
3bfd80a
·
1 Parent(s): 2dba2cf

feat: add HF Space deployment + GRPO training notebook

Browse files

- Add root-level re-export files (__init__.py, client.py, models.py)
for OpenEnv packaging convention
- Switch Dockerfile base from openenv-base to python:3.12-slim for
reliable HF Space builds
- Add Colab-ready GRPO training notebook using Unsloth + TRL
with environment reward functions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Fusion Design Lab — OpenEnv P1 stellarator environment."""
client.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Root-level re-export for OpenEnv packaging convention."""
2
+
3
+ from fusion_lab.client import FusionLabClient
4
+
5
+ __all__ = ["FusionLabClient"]
models.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Root-level re-export for OpenEnv packaging convention."""
2
+
3
+ from fusion_lab.models import (
4
+ ActionIntent,
5
+ DirectionName,
6
+ EvaluationFidelityName,
7
+ LowDimBoundaryParams,
8
+ MagnitudeName,
9
+ ParameterName,
10
+ StellaratorAction,
11
+ StellaratorObservation,
12
+ StellaratorState,
13
+ default_low_dim_boundary_params,
14
+ )
15
+
16
+ __all__ = [
17
+ "ActionIntent",
18
+ "DirectionName",
19
+ "EvaluationFidelityName",
20
+ "LowDimBoundaryParams",
21
+ "MagnitudeName",
22
+ "ParameterName",
23
+ "StellaratorAction",
24
+ "StellaratorObservation",
25
+ "StellaratorState",
26
+ "default_low_dim_boundary_params",
27
+ ]
server/Dockerfile CHANGED
@@ -1,43 +1,33 @@
1
- ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
2
- FROM ${BASE_IMAGE} AS builder
3
 
4
  WORKDIR /app
5
 
6
  RUN apt-get update && \
7
- apt-get install -y --no-install-recommends git && \
8
  rm -rf /var/lib/apt/lists/*
9
 
10
- ARG BUILD_MODE=standalone
11
- ARG ENV_NAME=fusion_design_lab
12
-
13
  COPY . /app/env
14
 
15
  WORKDIR /app/env
16
 
17
- RUN if ! command -v uv >/dev/null 2>&1; then \
18
- curl -LsSf https://astral.sh/uv/install.sh | sh && \
19
- mv /root/.local/bin/uv /usr/local/bin/uv && \
20
- mv /root/.local/bin/uvx /usr/local/bin/uvx; \
21
- fi
22
 
23
  RUN --mount=type=cache,target=/root/.cache/uv \
24
- if [ -f uv.lock ]; then \
25
- uv sync --frozen --no-install-project --no-editable; \
26
- else \
27
- uv sync --no-install-project --no-editable; \
28
- fi
29
 
30
  RUN --mount=type=cache,target=/root/.cache/uv \
31
- if [ -f uv.lock ]; then \
32
- uv sync --frozen --no-editable; \
33
- else \
34
- uv sync --no-editable; \
35
- fi
36
 
37
- FROM ${BASE_IMAGE}
38
 
39
  WORKDIR /app
40
 
 
 
 
 
41
  COPY --from=builder /app/env/.venv /app/.venv
42
  COPY --from=builder /app/env /app/env
43
 
 
1
+ FROM python:3.12-slim AS builder
 
2
 
3
  WORKDIR /app
4
 
5
  RUN apt-get update && \
6
+ apt-get install -y --no-install-recommends git curl && \
7
  rm -rf /var/lib/apt/lists/*
8
 
 
 
 
9
  COPY . /app/env
10
 
11
  WORKDIR /app/env
12
 
13
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
14
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
15
+ mv /root/.local/bin/uvx /usr/local/bin/uvx
 
 
16
 
17
  RUN --mount=type=cache,target=/root/.cache/uv \
18
+ uv sync --frozen --no-install-project --no-editable
 
 
 
 
19
 
20
  RUN --mount=type=cache,target=/root/.cache/uv \
21
+ uv sync --frozen --no-editable
 
 
 
 
22
 
23
+ FROM python:3.12-slim
24
 
25
  WORKDIR /app
26
 
27
+ RUN apt-get update && \
28
+ apt-get install -y --no-install-recommends curl && \
29
+ rm -rf /var/lib/apt/lists/*
30
+
31
  COPY --from=builder /app/env/.venv /app/.venv
32
  COPY --from=builder /app/env /app/env
33
 
training/notebooks/fusion_design_lab_training.ipynb ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7fb27b941602401d91542211134fc71a",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Fusion Design Lab — GRPO Training\n",
9
+ "\n",
10
+ "Train an LLM to optimize stellarator fusion reactor designs using **GRPO** (Group Relative Policy Optimization) with **Unsloth** and **TRL**.\n",
11
+ "\n",
12
+ "The agent interacts with a constrained optimization environment where it adjusts 4 geometric knobs of a stellarator boundary, aiming to **minimize max elongation** while satisfying 3 hard physics constraints:\n",
13
+ "- `aspect_ratio ≤ 4.0`\n",
14
+ "- `average_triangularity ≤ -0.5`\n",
15
+ "- `edge_iota_over_nfp ≥ 0.3`\n",
16
+ "\n",
17
+ "Each episode has **6 evaluations** budgeted. The agent produces a plan of actions and the environment scores it via the `constellaration` physics verifier.\n",
18
+ "\n",
19
+ "**Environment deployed at**: https://creativeengineer-fusion-design-lab.hf.space\n",
20
+ "\n",
21
+ "**Runtime**: Select GPU (T4 or better) via `Runtime > Change runtime type`."
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "markdown",
26
+ "id": "acae54e37e7d407bbb7b55eff062a284",
27
+ "metadata": {},
28
+ "source": [
29
+ "## 1. Install Dependencies"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "9a63283cbaf04dbcab1f6479b197f3a8",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "%%capture\n",
40
+ "!pip install unsloth vllm\n",
41
+ "!pip install --no-deps trl\n",
42
+ "!pip install constellaration openenv-core[core] pydantic fastapi uvicorn\n",
43
+ "!pip install matplotlib"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "markdown",
48
+ "id": "8dd0d8092fe74a7c96281538738b07e2",
49
+ "metadata": {},
50
+ "source": [
51
+ "## 2. Load Model with Unsloth"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "id": "72eea5119410473aa328ad9291626812",
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "from unsloth import FastLanguageModel\n",
62
+ "\n",
63
+ "MODEL_NAME = \"unsloth/Qwen3-0.6B\"\n",
64
+ "MAX_SEQ_LENGTH = 2048\n",
65
+ "\n",
66
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
67
+ " model_name=MODEL_NAME,\n",
68
+ " max_seq_length=MAX_SEQ_LENGTH,\n",
69
+ " load_in_4bit=True,\n",
70
+ " fast_inference=True,\n",
71
+ ")\n",
72
+ "\n",
73
+ "model = FastLanguageModel.get_peft_model(\n",
74
+ " model,\n",
75
+ " r=32,\n",
76
+ " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
77
+ " lora_alpha=32,\n",
78
+ " use_gradient_checkpointing=\"unsloth\",\n",
79
+ ")\n",
80
+ "\n",
81
+ "print(f\"Model loaded: {MODEL_NAME}\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "id": "8edb47106e1a46a883d545849b8ab81b",
87
+ "metadata": {},
88
+ "source": [
89
+ "## 3. Setup Stellarator Environment\n",
90
+ "\n",
91
+ "We install the environment package directly from the HF Space repository so training runs locally (no network latency). The same environment is deployed at the HF Space URL above."
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "10185d26023b46108eb7d9f57d49d2b3",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "%%capture\n",
102
+ "!pip install git+https://huggingface.co/spaces/CreativeEngineer/fusion-design-lab"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "id": "8763a12b2bbd4a93a75aff182afb95dc",
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "import json\n",
113
+ "import re\n",
114
+ "from typing import Final\n",
115
+ "\n",
116
+ "from fusion_lab.models import StellaratorAction, StellaratorObservation\n",
117
+ "from server.contract import RESET_SEEDS\n",
118
+ "from server.environment import BUDGET, StellaratorEnvironment\n",
119
+ "\n",
120
+ "AVAILABLE_ACTIONS: Final[list[dict[str, str]]] = [\n",
121
+ " {\"intent\": \"run\", \"parameter\": p, \"direction\": d, \"magnitude\": m}\n",
122
+ " for p in [\"aspect_ratio\", \"elongation\", \"rotational_transform\", \"triangularity_scale\"]\n",
123
+ " for d in [\"increase\", \"decrease\"]\n",
124
+ " for m in [\"small\", \"medium\", \"large\"]\n",
125
+ "] + [\n",
126
+ " {\"intent\": \"restore_best\"},\n",
127
+ " {\"intent\": \"submit\"},\n",
128
+ "]\n",
129
+ "\n",
130
+ "ACTION_LABELS: Final[list[str]] = [\n",
131
+ " f\"{a['intent']} {a.get('parameter', '')} {a.get('direction', '')} {a.get('magnitude', '')}\".strip()\n",
132
+ " for a in AVAILABLE_ACTIONS\n",
133
+ "]\n",
134
+ "\n",
135
+ "# Quick smoke test\n",
136
+ "env = StellaratorEnvironment()\n",
137
+ "obs = env.reset(seed=0)\n",
138
+ "print(\n",
139
+ " f\"Environment ready. Initial score: {obs.p1_score:.4f}, feasibility: {obs.p1_feasibility:.4f}\"\n",
140
+ ")\n",
141
+ "print(f\"Budget: {obs.budget_remaining}, Constraints satisfied: {obs.constraints_satisfied}\")"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "markdown",
146
+ "id": "7623eae2785240b9bd12b16a66d81610",
147
+ "metadata": {},
148
+ "source": [
149
+ "## 4. Prompt Template & Action Parsing\n",
150
+ "\n",
151
+ "Each training sample is a prompt describing the stellarator task and initial state. The model generates a plan of actions to optimize the design."
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "id": "7cdc8c89c7104fffa095e18ddfef8986",
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "SYSTEM_PROMPT: Final[\n",
162
+ " str\n",
163
+ "] = \"\"\"You are an expert stellarator fusion reactor designer. Your goal is to optimize a stellarator design by adjusting 4 geometric parameters to minimize max elongation while satisfying physics constraints.\n",
164
+ "\n",
165
+ "Constraints:\n",
166
+ "- aspect_ratio <= 4.0\n",
167
+ "- average_triangularity <= -0.5\n",
168
+ "- edge_iota_over_nfp >= 0.3\n",
169
+ "\n",
170
+ "Available parameters: aspect_ratio, elongation, rotational_transform, triangularity_scale\n",
171
+ "Available directions: increase, decrease\n",
172
+ "Available magnitudes: small, medium, large\n",
173
+ "\n",
174
+ "You have a budget of 6 evaluations. Output a plan of actions as a JSON array. Each action is an object with keys: intent, parameter, direction, magnitude. The last action should be {\"intent\": \"submit\"} to finalize your design.\n",
175
+ "\n",
176
+ "Example:\n",
177
+ "[{\"intent\":\"run\",\"parameter\":\"triangularity_scale\",\"direction\":\"increase\",\"magnitude\":\"small\"},{\"intent\":\"run\",\"parameter\":\"rotational_transform\",\"direction\":\"increase\",\"magnitude\":\"medium\"},{\"intent\":\"submit\"}]\"\"\"\n",
178
+ "\n",
179
+ "\n",
180
+ "def format_observation(obs: StellaratorObservation) -> str:\n",
181
+ " return (\n",
182
+ " f\"Current stellarator state:\\n\"\n",
183
+ " f\" max_elongation: {obs.max_elongation:.4f}\\n\"\n",
184
+ " f\" aspect_ratio: {obs.aspect_ratio:.4f} (constraint: <= 4.0)\\n\"\n",
185
+ " f\" average_triangularity: {obs.average_triangularity:.6f} (constraint: <= -0.5)\\n\"\n",
186
+ " f\" edge_iota_over_nfp: {obs.edge_iota_over_nfp:.4f} (constraint: >= 0.3)\\n\"\n",
187
+ " f\" p1_score: {obs.p1_score:.4f}\\n\"\n",
188
+ " f\" feasibility: {obs.p1_feasibility:.4f}\\n\"\n",
189
+ " f\" constraints_satisfied: {obs.constraints_satisfied}\\n\"\n",
190
+ " f\" budget_remaining: {obs.budget_remaining}\\n\"\n",
191
+ " f\"\\nGenerate an action plan as a JSON array to optimize this design.\"\n",
192
+ " )\n",
193
+ "\n",
194
+ "\n",
195
+ "def build_prompt(obs: StellaratorObservation) -> str:\n",
196
+ " return (\n",
197
+ " f\"<|im_start|>system\\n{SYSTEM_PROMPT}<|im_end|>\\n\"\n",
198
+ " f\"<|im_start|>user\\n{format_observation(obs)}<|im_end|>\\n\"\n",
199
+ " f\"<|im_start|>assistant\\n\"\n",
200
+ " )\n",
201
+ "\n",
202
+ "\n",
203
+ "def parse_action_plan(text: str) -> list[StellaratorAction]:\n",
204
+ " \"\"\"Parse a JSON action plan from model output.\"\"\"\n",
205
+ " # Find JSON array in the text\n",
206
+ " match = re.search(r\"\\[.*?\\]\", text, re.DOTALL)\n",
207
+ " if not match:\n",
208
+ " return []\n",
209
+ " try:\n",
210
+ " raw = json.loads(match.group())\n",
211
+ " except json.JSONDecodeError:\n",
212
+ " return []\n",
213
+ " actions = []\n",
214
+ " for item in raw:\n",
215
+ " if not isinstance(item, dict) or \"intent\" not in item:\n",
216
+ " continue\n",
217
+ " intent = item[\"intent\"]\n",
218
+ " if intent == \"submit\":\n",
219
+ " actions.append(StellaratorAction(intent=\"submit\"))\n",
220
+ " break\n",
221
+ " if intent == \"restore_best\":\n",
222
+ " actions.append(StellaratorAction(intent=\"restore_best\"))\n",
223
+ " continue\n",
224
+ " if intent == \"run\":\n",
225
+ " p = item.get(\"parameter\", \"\")\n",
226
+ " d = item.get(\"direction\", \"\")\n",
227
+ " m = item.get(\"magnitude\", \"small\")\n",
228
+ " if p in (\n",
229
+ " \"aspect_ratio\",\n",
230
+ " \"elongation\",\n",
231
+ " \"rotational_transform\",\n",
232
+ " \"triangularity_scale\",\n",
233
+ " ) and d in (\"increase\", \"decrease\"):\n",
234
+ " if m not in (\"small\", \"medium\", \"large\"):\n",
235
+ " m = \"small\"\n",
236
+ " actions.append(\n",
237
+ " StellaratorAction(intent=\"run\", parameter=p, direction=d, magnitude=m)\n",
238
+ " )\n",
239
+ " return actions\n",
240
+ "\n",
241
+ "\n",
242
+ "# Test prompt\n",
243
+ "env = StellaratorEnvironment()\n",
244
+ "obs = env.reset(seed=0)\n",
245
+ "prompt = build_prompt(obs)\n",
246
+ "print(prompt[:500])\n",
247
+ "print(\"...\")"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "markdown",
252
+ "id": "b118ea5561624da68c537baed56e602f",
253
+ "metadata": {},
254
+ "source": [
255
+ "## 5. Training Dataset\n",
256
+ "\n",
257
+ "Create prompts from all 3 reset seeds. Each prompt is an initial observation that the model must optimize."
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "id": "938c804e27f84196a10c8828c723f798",
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "from datasets import Dataset\n",
268
+ "\n",
269
+ "prompts = []\n",
270
+ "for seed_idx in range(len(RESET_SEEDS)):\n",
271
+ " env = StellaratorEnvironment()\n",
272
+ " obs = env.reset(seed=seed_idx)\n",
273
+ " prompt = build_prompt(obs)\n",
274
+ " # Repeat each seed to create a larger training set\n",
275
+ " for _ in range(50):\n",
276
+ " prompts.append({\"prompt\": prompt, \"seed_idx\": seed_idx})\n",
277
+ "\n",
278
+ "dataset = Dataset.from_list(prompts)\n",
279
+ "dataset = dataset.shuffle(seed=42)\n",
280
+ "print(f\"Training dataset: {len(dataset)} samples from {len(RESET_SEEDS)} seeds\")"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "markdown",
285
+ "id": "504fb2a444614c0babb325280ed9130a",
286
+ "metadata": {},
287
+ "source": [
288
+ "## 6. Reward Functions\n",
289
+ "\n",
290
+ "Two reward signals:\n",
291
+ "1. **Format reward**: Does the completion contain a valid JSON action plan?\n",
292
+ "2. **Environment reward**: Execute the plan in the stellarator environment and return cumulative reward."
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": null,
298
+ "id": "59bbdb311c014d738909a11f9e486628",
299
+ "metadata": {},
300
+ "outputs": [],
301
+ "source": [
302
+ "import traceback\n",
303
+ "\n",
304
+ "\n",
305
+ "def format_reward_fn(completions: list[str], **kwargs) -> list[float]:\n",
306
+ " \"\"\"Reward for producing a valid, parseable action plan.\"\"\"\n",
307
+ " rewards = []\n",
308
+ " for completion in completions:\n",
309
+ " actions = parse_action_plan(completion)\n",
310
+ " if len(actions) == 0:\n",
311
+ " rewards.append(-1.0)\n",
312
+ " elif any(a.intent == \"submit\" for a in actions):\n",
313
+ " rewards.append(1.0) # valid plan ending with submit\n",
314
+ " else:\n",
315
+ " rewards.append(0.0) # valid actions but no submit\n",
316
+ " return rewards\n",
317
+ "\n",
318
+ "\n",
319
+ "def environment_reward_fn(\n",
320
+ " completions: list[str], seed_idx: list[int] | None = None, **kwargs\n",
321
+ ") -> list[float]:\n",
322
+ " \"\"\"Execute each action plan in the environment and return cumulative reward.\"\"\"\n",
323
+ " rewards = []\n",
324
+ " seeds = seed_idx if seed_idx is not None else [0] * len(completions)\n",
325
+ " for i, completion in enumerate(completions):\n",
326
+ " try:\n",
327
+ " actions = parse_action_plan(completion)\n",
328
+ " if len(actions) == 0:\n",
329
+ " rewards.append(-3.0)\n",
330
+ " continue\n",
331
+ " env = StellaratorEnvironment()\n",
332
+ " env.reset(seed=int(seeds[i]) % len(RESET_SEEDS))\n",
333
+ " total_reward = 0.0\n",
334
+ " for action in actions[:BUDGET]:\n",
335
+ " obs = env.step(action)\n",
336
+ " total_reward += float(obs.reward or 0.0)\n",
337
+ " if obs.done:\n",
338
+ " break\n",
339
+ " rewards.append(total_reward)\n",
340
+ " except Exception:\n",
341
+ " traceback.print_exc()\n",
342
+ " rewards.append(-3.0)\n",
343
+ " return rewards\n",
344
+ "\n",
345
+ "\n",
346
+ "# Test reward functions with a hand-crafted plan\n",
347
+ "test_plan = json.dumps(\n",
348
+ " [\n",
349
+ " {\n",
350
+ " \"intent\": \"run\",\n",
351
+ " \"parameter\": \"triangularity_scale\",\n",
352
+ " \"direction\": \"increase\",\n",
353
+ " \"magnitude\": \"small\",\n",
354
+ " },\n",
355
+ " {\n",
356
+ " \"intent\": \"run\",\n",
357
+ " \"parameter\": \"rotational_transform\",\n",
358
+ " \"direction\": \"increase\",\n",
359
+ " \"magnitude\": \"medium\",\n",
360
+ " },\n",
361
+ " {\"intent\": \"submit\"},\n",
362
+ " ]\n",
363
+ ")\n",
364
+ "print(f\"Format reward: {format_reward_fn([test_plan])}\")\n",
365
+ "print(f\"Environment reward: {environment_reward_fn([test_plan], seed_idx=[0])}\")"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "markdown",
370
+ "id": "b43b363d81ae4b689946ece5c682cd59",
371
+ "metadata": {},
372
+ "source": [
373
+ "## 7. GRPO Training\n",
374
+ "\n",
375
+ "Train the model using Group Relative Policy Optimization. GRPO generates multiple completions per prompt and updates the policy toward higher-reward completions."
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": null,
381
+ "id": "8a65eabff63a45729fe45fb5ade58bdc",
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": [
385
+ "from trl import GRPOConfig, GRPOTrainer\n",
386
+ "\n",
387
+ "MAX_PROMPT_LENGTH = 768\n",
388
+ "MAX_COMPLETION_LENGTH = MAX_SEQ_LENGTH - MAX_PROMPT_LENGTH\n",
389
+ "\n",
390
+ "training_args = GRPOConfig(\n",
391
+ " output_dir=\"./grpo_fusion_output\",\n",
392
+ " learning_rate=2e-4,\n",
393
+ " num_generations=4,\n",
394
+ " max_completion_length=MAX_COMPLETION_LENGTH,\n",
395
+ " max_prompt_length=MAX_PROMPT_LENGTH,\n",
396
+ " per_device_train_batch_size=4,\n",
397
+ " gradient_accumulation_steps=1,\n",
398
+ " max_steps=60,\n",
399
+ " temperature=1.0,\n",
400
+ " logging_steps=1,\n",
401
+ " save_steps=20,\n",
402
+ " bf16=True,\n",
403
+ " report_to=\"none\",\n",
404
+ " seed=42,\n",
405
+ ")\n",
406
+ "\n",
407
+ "trainer = GRPOTrainer(\n",
408
+ " model=model,\n",
409
+ " processing_class=tokenizer,\n",
410
+ " reward_funcs=[format_reward_fn, environment_reward_fn],\n",
411
+ " args=training_args,\n",
412
+ " train_dataset=dataset,\n",
413
+ ")\n",
414
+ "\n",
415
+ "print(\"Starting GRPO training...\")\n",
416
+ "train_result = trainer.train()\n",
417
+ "print(f\"Training complete. Total steps: {train_result.global_step}\")"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "markdown",
422
+ "id": "c3933fab20d04ec698c2621248eb3be0",
423
+ "metadata": {},
424
+ "source": [
425
+ "## 8. Training Results\n",
426
+ "\n",
427
+ "Visualize reward improvement over training steps."
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": null,
433
+ "id": "4dd4641cc4064e0191573fe9c69df29b",
434
+ "metadata": {},
435
+ "outputs": [],
436
+ "source": [
437
+ "import matplotlib.pyplot as plt\n",
438
+ "\n",
439
+ "log_history = trainer.state.log_history\n",
440
+ "steps = [entry[\"step\"] for entry in log_history if \"loss\" in entry]\n",
441
+ "losses = [entry[\"loss\"] for entry in log_history if \"loss\" in entry]\n",
442
+ "\n",
443
+ "# Extract reward metrics if available\n",
444
+ "reward_steps = [\n",
445
+ " entry[\"step\"]\n",
446
+ " for entry in log_history\n",
447
+ " if \"reward\" in entry or \"rewards/environment_reward_fn\" in entry\n",
448
+ "]\n",
449
+ "rewards = [\n",
450
+ " entry.get(\"reward\", entry.get(\"rewards/environment_reward_fn\", 0))\n",
451
+ " for entry in log_history\n",
452
+ " if \"reward\" in entry or \"rewards/environment_reward_fn\" in entry\n",
453
+ "]\n",
454
+ "\n",
455
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
456
+ "\n",
457
+ "axes[0].plot(steps, losses, \"b-\", alpha=0.7)\n",
458
+ "axes[0].set_xlabel(\"Step\")\n",
459
+ "axes[0].set_ylabel(\"Loss\")\n",
460
+ "axes[0].set_title(\"GRPO Training Loss\")\n",
461
+ "axes[0].grid(True, alpha=0.3)\n",
462
+ "\n",
463
+ "if rewards:\n",
464
+ " axes[1].plot(reward_steps, rewards, \"g-o\", alpha=0.7, markersize=3)\n",
465
+ " axes[1].set_xlabel(\"Step\")\n",
466
+ " axes[1].set_ylabel(\"Mean Reward\")\n",
467
+ " axes[1].set_title(\"Environment Reward Over Training\")\n",
468
+ " axes[1].grid(True, alpha=0.3)\n",
469
+ "else:\n",
470
+ " axes[1].text(0.5, 0.5, \"Reward metrics not logged\", ha=\"center\", va=\"center\")\n",
471
+ "\n",
472
+ "plt.suptitle(\"Fusion Design Lab — GRPO Training Curves\", fontsize=14, fontweight=\"bold\")\n",
473
+ "plt.tight_layout()\n",
474
+ "plt.savefig(\"training_curves.png\", dpi=150, bbox_inches=\"tight\")\n",
475
+ "plt.show()\n",
476
+ "print(\"Saved training_curves.png\")"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "markdown",
481
+ "id": "8309879909854d7188b41380fd92a7c3",
482
+ "metadata": {},
483
+ "source": [
484
+ "## 9. Evaluate Trained Policy\n",
485
+ "\n",
486
+ "Generate action plans from the trained model and compare against random baselines."
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": null,
492
+ "id": "3ed186c9a28b402fb0bc4494df01f08d",
493
+ "metadata": {},
494
+ "outputs": [],
495
+ "source": [
496
+ "import random\n",
497
+ "\n",
498
+ "FastLanguageModel.for_inference(model)\n",
499
+ "\n",
500
+ "\n",
501
+ "def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
502
+ " \"\"\"Run one episode using the trained model.\"\"\"\n",
503
+ " env = StellaratorEnvironment()\n",
504
+ " obs = env.reset(seed=seed_idx)\n",
505
+ " prompt = build_prompt(obs)\n",
506
+ " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
507
+ " outputs = model.generate(\n",
508
+ " **inputs,\n",
509
+ " max_new_tokens=MAX_COMPLETION_LENGTH,\n",
510
+ " temperature=0.7,\n",
511
+ " do_sample=True,\n",
512
+ " )\n",
513
+ " completion = tokenizer.decode(\n",
514
+ " outputs[0][inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True\n",
515
+ " )\n",
516
+ " actions = parse_action_plan(completion)\n",
517
+ " trace = []\n",
518
+ " total_reward = 0.0\n",
519
+ " for action in actions[:BUDGET]:\n",
520
+ " obs = env.step(action)\n",
521
+ " r = float(obs.reward or 0.0)\n",
522
+ " total_reward += r\n",
523
+ " trace.append(\n",
524
+ " f\" {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied}\".strip()\n",
525
+ " )\n",
526
+ " if obs.done:\n",
527
+ " break\n",
528
+ " return total_reward, trace\n",
529
+ "\n",
530
+ "\n",
531
+ "def run_random_episode(seed_idx: int) -> float:\n",
532
+ " \"\"\"Run one episode with random actions for comparison.\"\"\"\n",
533
+ " env = StellaratorEnvironment()\n",
534
+ " env.reset(seed=seed_idx)\n",
535
+ " total_reward = 0.0\n",
536
+ " for step in range(BUDGET - 1):\n",
537
+ " spec = random.choice(AVAILABLE_ACTIONS[:24]) # run actions only\n",
538
+ " action = StellaratorAction(**spec)\n",
539
+ " obs = env.step(action)\n",
540
+ " total_reward += float(obs.reward or 0.0)\n",
541
+ " if obs.done:\n",
542
+ " return total_reward\n",
543
+ " # submit on last step\n",
544
+ " obs = env.step(StellaratorAction(intent=\"submit\"))\n",
545
+ " total_reward += float(obs.reward or 0.0)\n",
546
+ " return total_reward\n",
547
+ "\n",
548
+ "\n",
549
+ "# Evaluate\n",
550
+ "print(\"=\" * 60)\n",
551
+ "print(\"TRAINED MODEL EPISODES\")\n",
552
+ "print(\"=\" * 60)\n",
553
+ "trained_rewards = []\n",
554
+ "for seed in range(len(RESET_SEEDS)):\n",
555
+ " reward, trace = run_episode_with_model(seed)\n",
556
+ " trained_rewards.append(reward)\n",
557
+ " print(f\"\\nSeed {seed} — Total reward: {reward:.3f}\")\n",
558
+ " for line in trace:\n",
559
+ " print(f\" {line}\")\n",
560
+ "\n",
561
+ "print(f\"\\nMean trained reward: {sum(trained_rewards) / len(trained_rewards):.3f}\")\n",
562
+ "\n",
563
+ "print(\"\\n\" + \"=\" * 60)\n",
564
+ "print(\"RANDOM BASELINE (10 episodes per seed)\")\n",
565
+ "print(\"=\" * 60)\n",
566
+ "random_rewards = []\n",
567
+ "for seed in range(len(RESET_SEEDS)):\n",
568
+ " seed_rewards = [run_random_episode(seed) for _ in range(10)]\n",
569
+ " random_rewards.extend(seed_rewards)\n",
570
+ " print(\n",
571
+ " f\"Seed {seed} — Mean: {sum(seed_rewards) / len(seed_rewards):.3f}, Best: {max(seed_rewards):.3f}\"\n",
572
+ " )\n",
573
+ "\n",
574
+ "print(f\"\\nMean random reward: {sum(random_rewards) / len(random_rewards):.3f}\")\n",
575
+ "print(f\"Mean trained reward: {sum(trained_rewards) / len(trained_rewards):.3f}\")"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "markdown",
580
+ "id": "cb1e1581032b452c9409d6c6813c49d1",
581
+ "metadata": {},
582
+ "source": [
583
+ "## 10. Connect to Deployed HF Space\n",
584
+ "\n",
585
+ "Demonstrate connecting to the live environment on Hugging Face Spaces."
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": null,
591
+ "id": "379cbbc1e968416e875cc15c1202d7eb",
592
+ "metadata": {},
593
+ "outputs": [],
594
+ "source": [
595
+ "from fusion_lab.client import FusionLabClient\n",
596
+ "from fusion_lab.models import StellaratorAction\n",
597
+ "\n",
598
+ "HF_SPACE_URL = \"https://creativeengineer-fusion-design-lab.hf.space\"\n",
599
+ "\n",
600
+ "with FusionLabClient(base_url=HF_SPACE_URL).sync() as client:\n",
601
+ " obs = client.reset()\n",
602
+ " print(f\"Connected to HF Space: {HF_SPACE_URL}\")\n",
603
+ " print(\"Initial observation:\")\n",
604
+ " print(f\" max_elongation: {obs.observation.max_elongation:.4f}\")\n",
605
+ " print(f\" aspect_ratio: {obs.observation.aspect_ratio:.4f}\")\n",
606
+ " print(f\" p1_score: {obs.observation.p1_score:.4f}\")\n",
607
+ " print(f\" constraints_satisfied: {obs.observation.constraints_satisfied}\")\n",
608
+ " print(f\" budget_remaining: {obs.observation.budget_remaining}\")\n",
609
+ "\n",
610
+ " # Run one action from the trained model\n",
611
+ " prompt = build_prompt(obs.observation)\n",
612
+ " inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
613
+ " outputs = model.generate(\n",
614
+ " **inputs, max_new_tokens=MAX_COMPLETION_LENGTH, temperature=0.7, do_sample=True\n",
615
+ " )\n",
616
+ " completion = tokenizer.decode(\n",
617
+ " outputs[0][inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True\n",
618
+ " )\n",
619
+ " actions = parse_action_plan(completion)\n",
620
+ "\n",
621
+ " print(f\"\\nModel generated {len(actions)} actions:\")\n",
622
+ " for i, action in enumerate(actions[:BUDGET]):\n",
623
+ " result = client.step(action)\n",
624
+ " print(\n",
625
+ " f\" Step {i + 1}: {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} → reward={result.reward:.3f}\"\n",
626
+ " )\n",
627
+ " if result.done:\n",
628
+ " print(f\" Episode done. Final score: {result.observation.p1_score:.4f}\")\n",
629
+ " break\n",
630
+ "\n",
631
+ "print(\"\\nDone! Environment is live and accessible for training and evaluation.\")"
632
+ ]
633
+ }
634
+ ],
635
+ "metadata": {
636
+ "accelerator": "GPU",
637
+ "colab": {
638
+ "gpuType": "T4",
639
+ "provenance": []
640
+ },
641
+ "kernelspec": {
642
+ "display_name": "Python 3",
643
+ "language": "python",
644
+ "name": "python3"
645
+ },
646
+ "language_info": {
647
+ "name": "python",
648
+ "version": "3.12.0"
649
+ }
650
+ },
651
+ "nbformat": 4,
652
+ "nbformat_minor": 5
653
+ }