Commit ·
8254ade
1
Parent(s): 5f2da5f
fix: align notebook evaluation with grpo rewards
Browse files
training/notebooks/fusion_design_lab_training.ipynb
CHANGED
|
@@ -326,7 +326,13 @@
|
|
| 326 |
"cell_type": "markdown",
|
| 327 |
"id": "504fb2a444614c0babb325280ed9130a",
|
| 328 |
"metadata": {},
|
| 329 |
-
"source":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
},
|
| 331 |
{
|
| 332 |
"cell_type": "code",
|
|
@@ -543,6 +549,28 @@
|
|
| 543 |
"FastLanguageModel.for_inference(model)\n",
|
| 544 |
"\n",
|
| 545 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 546 |
"def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
|
| 547 |
" \"\"\"Run one episode using the trained model.\"\"\"\n",
|
| 548 |
" env = StellaratorEnvironment()\n",
|
|
@@ -561,15 +589,24 @@
|
|
| 561 |
" actions = parse_action_plan(completion)\n",
|
| 562 |
" trace = []\n",
|
| 563 |
" total_reward = 0.0\n",
|
|
|
|
| 564 |
" for action in actions[:BUDGET]:\n",
|
| 565 |
" obs = env.step(action)\n",
|
| 566 |
" r = float(obs.reward) if obs.reward is not None else 0.0\n",
|
| 567 |
" total_reward += r\n",
|
| 568 |
" trace.append(\n",
|
| 569 |
-
" f\" {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} β reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied}\".strip()\n",
|
| 570 |
" )\n",
|
| 571 |
" if obs.done:\n",
|
|
|
|
| 572 |
" break\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
" return total_reward, trace\n",
|
| 574 |
"\n",
|
| 575 |
"\n",
|
|
@@ -669,18 +706,29 @@
|
|
| 669 |
" actions = parse_action_plan(completion)\n",
|
| 670 |
"\n",
|
| 671 |
" print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
|
| 672 |
-
"
|
|
|
|
| 673 |
" result = env.step(action)\n",
|
| 674 |
" step_obs = result.observation\n",
|
| 675 |
" reward = float(result.reward) if result.reward is not None else 0.0\n",
|
| 676 |
" print(\n",
|
| 677 |
-
" f\" Step {i
|
| 678 |
" f\"{action.direction or ''} {action.magnitude or ''} \"\n",
|
| 679 |
-
" f\"β reward={reward:.3f}, score={step_obs.p1_score:.4f}\"\n",
|
| 680 |
" )\n",
|
| 681 |
" if result.done:\n",
|
| 682 |
" print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
|
|
|
|
| 683 |
" break\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 684 |
"\n",
|
| 685 |
"print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
|
| 686 |
]
|
|
|
|
| 326 |
"cell_type": "markdown",
|
| 327 |
"id": "504fb2a444614c0babb325280ed9130a",
|
| 328 |
"metadata": {},
|
| 329 |
+
"source": [
|
| 330 |
+
"## 6. Reward Function\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"The environment reward executes each generated action plan in the stellarator environment and returns the cumulative V0 reward. The environment's built-in reward decomposes feasibility (+3/-3 crossing bonuses, feasibility progress), objective (max elongation improvement), step costs, submit bonuses, and failure penalties — see `server/environment.py:_compute_reward_breakdown(...)`.\n",
|
| 333 |
+
"\n",
|
| 334 |
+
"If the model's plan ends before the episode terminates (no submit, budget not exhausted), the reward function auto-submits so terminal reward terms always fire. This ensures GRPO optimizes the full episode return, not truncated partial returns. The live observation telemetry also exposes `reward_breakdown` and `action_monitor` for debugging reward behavior.\n"
|
| 335 |
+
]
|
| 336 |
},
|
| 337 |
{
|
| 338 |
"cell_type": "code",
|
|
|
|
| 549 |
"FastLanguageModel.for_inference(model)\n",
|
| 550 |
"\n",
|
| 551 |
"\n",
|
| 552 |
+
"def reward_term_summary(obs: StellaratorObservation) -> str:\n",
|
| 553 |
+
" breakdown = obs.reward_breakdown.model_dump()\n",
|
| 554 |
+
" terms = []\n",
|
| 555 |
+
" for key, value in breakdown.items():\n",
|
| 556 |
+
" if key in {\n",
|
| 557 |
+
" \"intent\",\n",
|
| 558 |
+
" \"total\",\n",
|
| 559 |
+
" \"evaluation_failed\",\n",
|
| 560 |
+
" \"recovered_from_failure\",\n",
|
| 561 |
+
" \"reference_constraints_satisfied\",\n",
|
| 562 |
+
" \"reference_score\",\n",
|
| 563 |
+
" \"reference_feasibility\",\n",
|
| 564 |
+
" \"reference_max_elongation\",\n",
|
| 565 |
+
" \"initial_reference_score\",\n",
|
| 566 |
+
" \"terminal_score_ratio\",\n",
|
| 567 |
+
" }:\n",
|
| 568 |
+
" continue\n",
|
| 569 |
+
" if isinstance(value, (int, float)) and float(value) != 0.0:\n",
|
| 570 |
+
" terms.append(f\"{key}={float(value):+.3f}\")\n",
|
| 571 |
+
" return \", \".join(terms) if terms else \"none\"\n",
|
| 572 |
+
"\n",
|
| 573 |
+
"\n",
|
| 574 |
"def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
|
| 575 |
" \"\"\"Run one episode using the trained model.\"\"\"\n",
|
| 576 |
" env = StellaratorEnvironment()\n",
|
|
|
|
| 589 |
" actions = parse_action_plan(completion)\n",
|
| 590 |
" trace = []\n",
|
| 591 |
" total_reward = 0.0\n",
|
| 592 |
+
" done = False\n",
|
| 593 |
" for action in actions[:BUDGET]:\n",
|
| 594 |
" obs = env.step(action)\n",
|
| 595 |
" r = float(obs.reward) if obs.reward is not None else 0.0\n",
|
| 596 |
" total_reward += r\n",
|
| 597 |
" trace.append(\n",
|
| 598 |
+
" f\" {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} β reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\".strip()\n",
|
| 599 |
" )\n",
|
| 600 |
" if obs.done:\n",
|
| 601 |
+
" done = True\n",
|
| 602 |
" break\n",
|
| 603 |
+
" if not done:\n",
|
| 604 |
+
" obs = env.step(StellaratorAction(intent=\"submit\"))\n",
|
| 605 |
+
" r = float(obs.reward) if obs.reward is not None else 0.0\n",
|
| 606 |
+
" total_reward += r\n",
|
| 607 |
+
" trace.append(\n",
|
| 608 |
+
" f\" submit (auto) β reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\"\n",
|
| 609 |
+
" )\n",
|
| 610 |
" return total_reward, trace\n",
|
| 611 |
"\n",
|
| 612 |
"\n",
|
|
|
|
| 706 |
" actions = parse_action_plan(completion)\n",
|
| 707 |
"\n",
|
| 708 |
" print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
|
| 709 |
+
" done = False\n",
|
| 710 |
+
" for i, action in enumerate(actions[:BUDGET], start=1):\n",
|
| 711 |
" result = env.step(action)\n",
|
| 712 |
" step_obs = result.observation\n",
|
| 713 |
" reward = float(result.reward) if result.reward is not None else 0.0\n",
|
| 714 |
" print(\n",
|
| 715 |
+
" f\" Step {i}: {action.intent} {action.parameter or ''} \"\n",
|
| 716 |
" f\"{action.direction or ''} {action.magnitude or ''} \"\n",
|
| 717 |
+
" f\"β reward={reward:.3f}, score={step_obs.p1_score:.4f}, terms={reward_term_summary(step_obs)}\"\n",
|
| 718 |
" )\n",
|
| 719 |
" if result.done:\n",
|
| 720 |
" print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
|
| 721 |
+
" done = True\n",
|
| 722 |
" break\n",
|
| 723 |
+
" if not done:\n",
|
| 724 |
+
" result = env.step(StellaratorAction(intent=\"submit\"))\n",
|
| 725 |
+
" step_obs = result.observation\n",
|
| 726 |
+
" reward = float(result.reward) if result.reward is not None else 0.0\n",
|
| 727 |
+
" print(\n",
|
| 728 |
+
" f\" Auto-submit β reward={reward:.3f}, score={step_obs.p1_score:.4f}, terms={reward_term_summary(step_obs)}\"\n",
|
| 729 |
+
" )\n",
|
| 730 |
+
" if result.done:\n",
|
| 731 |
+
" print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
|
| 732 |
"\n",
|
| 733 |
"print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
|
| 734 |
]
|