CreativeEngineer committed on
Commit
8254ade
·
1 Parent(s): 5f2da5f

fix: align notebook evaluation with grpo rewards

Browse files
training/notebooks/fusion_design_lab_training.ipynb CHANGED
@@ -326,7 +326,13 @@
326
  "cell_type": "markdown",
327
  "id": "504fb2a444614c0babb325280ed9130a",
328
  "metadata": {},
329
- "source": "## 6. Reward Function\n\nThe environment reward executes each generated action plan in the stellarator environment and returns the cumulative V0 reward. The environment's built-in reward decomposes feasibility (+3/-3 crossing bonuses, feasibility progress), objective (max elongation improvement), step costs, submit bonuses, and failure penalties — see `server/environment.py:_compute_reward`.\n\nIf the model's plan ends before the episode terminates (no submit, budget not exhausted), the reward function auto-submits so terminal reward terms always fire. This ensures GRPO optimizes the full episode return, not truncated partial returns."
 
 
 
 
 
 
330
  },
331
  {
332
  "cell_type": "code",
@@ -543,6 +549,28 @@
543
  "FastLanguageModel.for_inference(model)\n",
544
  "\n",
545
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  "def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
547
  " \"\"\"Run one episode using the trained model.\"\"\"\n",
548
  " env = StellaratorEnvironment()\n",
@@ -561,15 +589,24 @@
561
  " actions = parse_action_plan(completion)\n",
562
  " trace = []\n",
563
  " total_reward = 0.0\n",
 
564
  " for action in actions[:BUDGET]:\n",
565
  " obs = env.step(action)\n",
566
  " r = float(obs.reward) if obs.reward is not None else 0.0\n",
567
  " total_reward += r\n",
568
  " trace.append(\n",
569
- "        f\"  {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied}\".strip()\n",
570
  " )\n",
571
  " if obs.done:\n",
 
572
  " break\n",
 
 
 
 
 
 
 
573
  " return total_reward, trace\n",
574
  "\n",
575
  "\n",
@@ -669,18 +706,29 @@
669
  " actions = parse_action_plan(completion)\n",
670
  "\n",
671
  " print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
672
- " for i, action in enumerate(actions[:BUDGET]):\n",
 
673
  " result = env.step(action)\n",
674
  " step_obs = result.observation\n",
675
  " reward = float(result.reward) if result.reward is not None else 0.0\n",
676
  " print(\n",
677
- " f\" Step {i + 1}: {action.intent} {action.parameter or ''} \"\n",
678
  " f\"{action.direction or ''} {action.magnitude or ''} \"\n",
679
- "        f\"→ reward={reward:.3f}, score={step_obs.p1_score:.4f}\"\n",
680
  " )\n",
681
  " if result.done:\n",
682
  " print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
 
683
  " break\n",
 
 
 
 
 
 
 
 
 
684
  "\n",
685
  "print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
686
  ]
 
326
  "cell_type": "markdown",
327
  "id": "504fb2a444614c0babb325280ed9130a",
328
  "metadata": {},
329
+ "source": [
330
+ "## 6. Reward Function\n",
331
+ "\n",
332
+ "The environment reward executes each generated action plan in the stellarator environment and returns the cumulative V0 reward. The environment's built-in reward decomposes feasibility (+3/-3 crossing bonuses, feasibility progress), objective (max elongation improvement), step costs, submit bonuses, and failure penalties — see `server/environment.py:_compute_reward_breakdown(...)`.\n",
333
+ "\n",
334
+ "If the model's plan ends before the episode terminates (no submit, budget not exhausted), the reward function auto-submits so terminal reward terms always fire. This ensures GRPO optimizes the full episode return, not truncated partial returns. The live observation telemetry also exposes `reward_breakdown` and `action_monitor` for debugging reward behavior.\n"
335
+ ]
336
  },
337
  {
338
  "cell_type": "code",
 
549
  "FastLanguageModel.for_inference(model)\n",
550
  "\n",
551
  "\n",
552
+ "def reward_term_summary(obs: StellaratorObservation) -> str:\n",
553
+ " breakdown = obs.reward_breakdown.model_dump()\n",
554
+ " terms = []\n",
555
+ " for key, value in breakdown.items():\n",
556
+ " if key in {\n",
557
+ " \"intent\",\n",
558
+ " \"total\",\n",
559
+ " \"evaluation_failed\",\n",
560
+ " \"recovered_from_failure\",\n",
561
+ " \"reference_constraints_satisfied\",\n",
562
+ " \"reference_score\",\n",
563
+ " \"reference_feasibility\",\n",
564
+ " \"reference_max_elongation\",\n",
565
+ " \"initial_reference_score\",\n",
566
+ " \"terminal_score_ratio\",\n",
567
+ " }:\n",
568
+ " continue\n",
569
+ " if isinstance(value, (int, float)) and float(value) != 0.0:\n",
570
+ " terms.append(f\"{key}={float(value):+.3f}\")\n",
571
+ " return \", \".join(terms) if terms else \"none\"\n",
572
+ "\n",
573
+ "\n",
574
  "def run_episode_with_model(seed_idx: int) -> tuple[float, list[str]]:\n",
575
  " \"\"\"Run one episode using the trained model.\"\"\"\n",
576
  " env = StellaratorEnvironment()\n",
 
589
  " actions = parse_action_plan(completion)\n",
590
  " trace = []\n",
591
  " total_reward = 0.0\n",
592
+ " done = False\n",
593
  " for action in actions[:BUDGET]:\n",
594
  " obs = env.step(action)\n",
595
  " r = float(obs.reward) if obs.reward is not None else 0.0\n",
596
  " total_reward += r\n",
597
  " trace.append(\n",
598
+ "        f\"  {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\".strip()\n",
599
  " )\n",
600
  " if obs.done:\n",
601
+ " done = True\n",
602
  " break\n",
603
+ " if not done:\n",
604
+ " obs = env.step(StellaratorAction(intent=\"submit\"))\n",
605
+ " r = float(obs.reward) if obs.reward is not None else 0.0\n",
606
+ " total_reward += r\n",
607
+ " trace.append(\n",
608
+ "        f\"  submit (auto) → reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\"\n",
609
+ " )\n",
610
  " return total_reward, trace\n",
611
  "\n",
612
  "\n",
 
706
  " actions = parse_action_plan(completion)\n",
707
  "\n",
708
  " print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
709
+ " done = False\n",
710
+ " for i, action in enumerate(actions[:BUDGET], start=1):\n",
711
  " result = env.step(action)\n",
712
  " step_obs = result.observation\n",
713
  " reward = float(result.reward) if result.reward is not None else 0.0\n",
714
  " print(\n",
715
+ " f\" Step {i}: {action.intent} {action.parameter or ''} \"\n",
716
  " f\"{action.direction or ''} {action.magnitude or ''} \"\n",
717
+ "        f\"→ reward={reward:.3f}, score={step_obs.p1_score:.4f}, terms={reward_term_summary(step_obs)}\"\n",
718
  " )\n",
719
  " if result.done:\n",
720
  " print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
721
+ " done = True\n",
722
  " break\n",
723
+ " if not done:\n",
724
+ " result = env.step(StellaratorAction(intent=\"submit\"))\n",
725
+ " step_obs = result.observation\n",
726
+ " reward = float(result.reward) if result.reward is not None else 0.0\n",
727
+ " print(\n",
728
+ "        f\"  Auto-submit → reward={reward:.3f}, score={step_obs.p1_score:.4f}, terms={reward_term_summary(step_obs)}\"\n",
729
+ " )\n",
730
+ " if result.done:\n",
731
+ " print(f\" Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
732
  "\n",
733
  "print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
734
  ]