CreativeEngineer committed on
Commit
8254ade
·
1 Parent(s): 5f2da5f

fix: align notebook evaluation with grpo rewards

Browse files
training/notebooks/fusion_design_lab_training.ipynb CHANGED
@@ -326,7 +326,13 @@
326
  "cell_type": "markdown",
327
  "id": "504fb2a444614c0babb325280ed9130a",
328
  "metadata": {},
329
- "source": "## 6. Reward Function\n\nThe environment reward executes each generated action plan in the stellarator environment and returns the cumulative V0 reward. The environment's built-in reward decomposes feasibility (+3/-3 crossing bonuses, feasibility progress), objective (max elongation improvement), step costs, submit bonuses, and failure penalties — see `server/environment.py:_compute_reward`.\n\nIf the model's plan ends before the episode terminates (no submit, budget not exhausted), the reward function auto-submits so terminal reward terms always fire. This ensures GRPO optimizes the full episode return, not truncated partial returns."
 
 
 
 
 
 
330
  },
331
  {
332
  "cell_type": "code",
@@ -543,6 +549,28 @@
543
  "FastLanguageModel.for_inference(model)\n",
544
  "\n",
545
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  "def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
547
  " \"\"\"Run one episode using the trained model.\"\"\"\n",
548
  " env = StellaratorEnvironment()\n",
@@ -561,15 +589,24 @@
561
  " actions = parse_action_plan(completion)\n",
562
  " trace = []\n",
563
  " total_reward = 0.0\n",
 
564
  " for action in actions[:BUDGET]:\n",
565
  " obs = env.step(action)\n",
566
  " r = float(obs.reward) if obs.reward is not None else 0.0\n",
567
  " total_reward += r\n",
568
  " trace.append(\n",
569
- "        f\"  {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied}\".strip()\n",
570
  " )\n",
571
  " if obs.done:\n",
 
572
  " break\n",
 
 
 
 
 
 
 
573
  " return total_reward, trace\n",
574
  "\n",
575
  "\n",
@@ -669,18 +706,29 @@
669
  " actions = parse_action_plan(completion)\n",
670
  "\n",
671
  " print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
672
- " for i, action in enumerate(actions[:BUDGET]):\n",
 
673
  " result = env.step(action)\n",
674
  " step_obs = result.observation\n",
675
  " reward = float(result.reward) if result.reward is not None else 0.0\n",
676
  " print(\n",
677
- " f\" Step {i + 1}: {action.intent} {action.parameter or ''} \"\n",
678
  " f\"{action.direction or ''} {action.magnitude or ''} \"\n",
679
- "        f\"→ reward={reward:.3f}, score={step_obs.p1_score:.4f}\"\n",
680
  " )\n",
681
  " if result.done:\n",
682
  " print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
 
683
  " break\n",
 
 
 
 
 
 
 
 
 
684
  "\n",
685
  "print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
686
  ]
 
326
  "cell_type": "markdown",
327
  "id": "504fb2a444614c0babb325280ed9130a",
328
  "metadata": {},
329
+ "source": [
330
+ "## 6. Reward Function\n",
331
+ "\n",
332
+ "The environment reward executes each generated action plan in the stellarator environment and returns the cumulative V0 reward. The environment's built-in reward decomposes feasibility (+3/-3 crossing bonuses, feasibility progress), objective (max elongation improvement), step costs, submit bonuses, and failure penalties — see `server/environment.py:_compute_reward_breakdown(...)`.\n",
333
+ "\n",
334
+ "If the model's plan ends before the episode terminates (no submit, budget not exhausted), the reward function auto-submits so terminal reward terms always fire. This ensures GRPO optimizes the full episode return, not truncated partial returns. The live observation telemetry also exposes `reward_breakdown` and `action_monitor` for debugging reward behavior.\n"
335
+ ]
336
  },
337
  {
338
  "cell_type": "code",
 
549
  "FastLanguageModel.for_inference(model)\n",
550
  "\n",
551
  "\n",
552
+ "def reward_term_summary(obs: StellaratorObservation) -> str:\n",
553
+ " breakdown = obs.reward_breakdown.model_dump()\n",
554
+ " terms = []\n",
555
+ " for key, value in breakdown.items():\n",
556
+ " if key in {\n",
557
+ " \"intent\",\n",
558
+ " \"total\",\n",
559
+ " \"evaluation_failed\",\n",
560
+ " \"recovered_from_failure\",\n",
561
+ " \"reference_constraints_satisfied\",\n",
562
+ " \"reference_score\",\n",
563
+ " \"reference_feasibility\",\n",
564
+ " \"reference_max_elongation\",\n",
565
+ " \"initial_reference_score\",\n",
566
+ " \"terminal_score_ratio\",\n",
567
+ " }:\n",
568
+ " continue\n",
569
+ " if isinstance(value, (int, float)) and float(value) != 0.0:\n",
570
+ " terms.append(f\"{key}={float(value):+.3f}\")\n",
571
+ " return \", \".join(terms) if terms else \"none\"\n",
572
+ "\n",
573
+ "\n",
574
  "def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
575
  " \"\"\"Run one episode using the trained model.\"\"\"\n",
576
  " env = StellaratorEnvironment()\n",
 
589
  " actions = parse_action_plan(completion)\n",
590
  " trace = []\n",
591
  " total_reward = 0.0\n",
592
+ " done = False\n",
593
  " for action in actions[:BUDGET]:\n",
594
  " obs = env.step(action)\n",
595
  " r = float(obs.reward) if obs.reward is not None else 0.0\n",
596
  " total_reward += r\n",
597
  " trace.append(\n",
598
+ "        f\"  {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\".strip()\n",
599
  " )\n",
600
  " if obs.done:\n",
601
+ " done = True\n",
602
  " break\n",
603
+ " if not done:\n",
604
+ " obs = env.step(StellaratorAction(intent=\"submit\"))\n",
605
+ " r = float(obs.reward) if obs.reward is not None else 0.0\n",
606
+ " total_reward += r\n",
607
+ " trace.append(\n",
608
+ "        f\"  submit (auto) → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\"\n",
609
+ " )\n",
610
  " return total_reward, trace\n",
611
  "\n",
612
  "\n",
 
706
  " actions = parse_action_plan(completion)\n",
707
  "\n",
708
  " print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
709
+ " done = False\n",
710
+ " for i, action in enumerate(actions[:BUDGET], start=1):\n",
711
  " result = env.step(action)\n",
712
  " step_obs = result.observation\n",
713
  " reward = float(result.reward) if result.reward is not None else 0.0\n",
714
  " print(\n",
715
+ " f\" Step {i}: {action.intent} {action.parameter or ''} \"\n",
716
  " f\"{action.direction or ''} {action.magnitude or ''} \"\n",
717
+ "        f\"→ reward={reward:.3f}, score={step_obs.p1_score:.4f}, terms={reward_term_summary(step_obs)}\"\n",
718
  " )\n",
719
  " if result.done:\n",
720
  " print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
721
+ " done = True\n",
722
  " break\n",
723
+ " if not done:\n",
724
+ " result = env.step(StellaratorAction(intent=\"submit\"))\n",
725
+ " step_obs = result.observation\n",
726
+ " reward = float(result.reward) if result.reward is not None else 0.0\n",
727
+ " print(\n",
728
+ "        f\"  Auto-submit → reward={reward:.3f}, score={step_obs.p1_score:.4f}, terms={reward_term_summary(step_obs)}\"\n",
729
+ " )\n",
730
+ " if result.done:\n",
731
+ " print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
732
  "\n",
733
  "print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
734
  ]