Spaces:

CreativeEngineer
/

fusion-design-lab

Paused

App Files Files Community

CreativeEngineer committed on Mar 8

Commit

ddcb837

1 Parent(s): 9c3599b

refactor: align colab notebook with shared llm helpers

Browse files

Files changed (1) hide show

training/notebooks/fusion_design_lab_training.ipynb +54 -179

training/notebooks/fusion_design_lab_training.ipynb CHANGED Viewed

@@ -112,24 +112,28 @@
     "import json\n",
     "from typing import Final\n",
     "\n",
-    "from fusion_lab.models import StellaratorAction, StellaratorObservation\n",
     "from server.contract import RESET_SEEDS\n",
     "from server.environment import BUDGET, StellaratorEnvironment\n",
     "\n",
-    "AVAILABLE_ACTIONS: Final[list[dict[str, str]]] = [\n",
     "    {\"intent\": \"run\", \"parameter\": p, \"direction\": d, \"magnitude\": m}\n",
-    "    for p in [\"aspect_ratio\", \"elongation\", \"rotational_transform\", \"triangularity_scale\"]\n",
-    "    for d in [\"increase\", \"decrease\"]\n",
-    "    for m in [\"small\", \"medium\", \"large\"]\n",
-    "] + [\n",
-    "    {\"intent\": \"restore_best\"},\n",
     "]\n",
     "\n",
-    "ACTION_LABELS: Final[list[str]] = [\n",
-    "    f\"{a['intent']} {a.get('parameter', '')} {a.get('direction', '')} {a.get('magnitude', '')}\".strip()\n",
-    "    for a in AVAILABLE_ACTIONS\n",
     "]\n",
-    "\n",
     "# Quick smoke test\n",
     "env = StellaratorEnvironment()\n",
     "obs = env.reset(seed=0)\n",
@@ -156,135 +160,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "SYSTEM_PROMPT: Final[\n",
-    "    str\n",
-    "] = \"\"\"You are an expert stellarator fusion reactor designer. Your goal is to optimize a stellarator design by adjusting 4 geometric parameters to minimize max elongation while satisfying physics constraints.\n",
-    "\n",
-    "Constraints:\n",
-    "- aspect_ratio <= 4.0\n",
-    "- average_triangularity <= -0.5\n",
-    "- edge_iota_over_nfp >= 0.3\n",
-    "\n",
-    "Available parameters: aspect_ratio, elongation, rotational_transform, triangularity_scale\n",
-    "Available directions: increase, decrease\n",
-    "Available magnitudes: small, medium, large\n",
-    "\n",
-    "You have a budget of 6 low-fidelity evaluations. Output a short plan of run actions as a JSON array. Each action is an object with keys: intent, parameter, direction, magnitude. Do not output submit.\n",
-    "\n",
-    "Example:\n",
-    "[{\"intent\":\"run\",\"parameter\":\"triangularity_scale\",\"direction\":\"increase\",\"magnitude\":\"small\"},{\"intent\":\"run\",\"parameter\":\"rotational_transform\",\"direction\":\"increase\",\"magnitude\":\"medium\"}]\"\"\"\n",
-    "\n",
-    "\n",
-    "def format_observation(obs: StellaratorObservation) -> str:\n",
-    "    return (\n",
-    "        f\"Current stellarator state:\\n\"\n",
-    "        f\"  max_elongation: {obs.max_elongation:.4f}\\n\"\n",
-    "        f\"  aspect_ratio: {obs.aspect_ratio:.4f} (constraint: <= 4.0)\\n\"\n",
-    "        f\"  average_triangularity: {obs.average_triangularity:.6f} (constraint: <= -0.5)\\n\"\n",
-    "        f\"  edge_iota_over_nfp: {obs.edge_iota_over_nfp:.4f} (constraint: >= 0.3)\\n\"\n",
-    "        f\"  p1_score: {obs.p1_score:.4f}\\n\"\n",
-    "        f\"  feasibility: {obs.p1_feasibility:.4f}\\n\"\n",
-    "        f\"  constraints_satisfied: {obs.constraints_satisfied}\\n\"\n",
-    "        f\"  budget_remaining: {obs.budget_remaining}\\n\"\n",
-    "        f\"\\nGenerate an action plan as a JSON array to optimize this design.\"\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "def build_prompt(obs: StellaratorObservation) -> str:\n",
-    "    return (\n",
-    "        f\"<|im_start|>system\\n{SYSTEM_PROMPT}<|im_end|>\\n\"\n",
-    "        f\"<|im_start|>user\\n{format_observation(obs)}<|im_end|>\\n\"\n",
-    "        f\"<|im_start|>assistant\\n\"\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "def _extract_json_array(text: str) -> str | None:\n",
-    "    \"\"\"Return the first balanced [...] substring that parses as a JSON array.\n",
-    "\n",
-    "    Iterates through every [ in text, finds its balanced closing ],\n",
-    "    and attempts json.loads. Returns the first candidate that decodes as a\n",
-    "    JSON list, skipping prose fragments like [draft].\n",
-    "    \"\"\"\n",
-    "    start = text.find(\"[\")\n",
-    "    while start != -1:\n",
-    "        depth = 0\n",
-    "        in_string = False\n",
-    "        escape = False\n",
-    "        matched_end: int | None = None\n",
-    "        for index in range(start, len(text)):\n",
-    "            char = text[index]\n",
-    "            if in_string:\n",
-    "                if escape:\n",
-    "                    escape = False\n",
-    "                elif char == \"\\\\\":\n",
-    "                    escape = True\n",
-    "                elif char == '\"':\n",
-    "                    in_string = False\n",
-    "                continue\n",
-    "            if char == '\"':\n",
-    "                in_string = True\n",
-    "            elif char == \"[\":\n",
-    "                depth += 1\n",
-    "            elif char == \"]\":\n",
-    "                depth -= 1\n",
-    "                if depth == 0:\n",
-    "                    matched_end = index\n",
-    "                    break\n",
-    "        if matched_end is not None:\n",
-    "            candidate = text[start : matched_end + 1]\n",
-    "            try:\n",
-    "                decoded = json.loads(candidate)\n",
-    "                if isinstance(decoded, list):\n",
-    "                    return candidate\n",
-    "            except (json.JSONDecodeError, ValueError):\n",
-    "                pass\n",
-    "        start = text.find(\"[\", start + 1)\n",
-    "    return None\n",
-    "\n",
-    "\n",
-    "def parse_action_plan(text: str) -> list[StellaratorAction]:\n",
-    "    \"\"\"Parse a JSON action plan from model output.\"\"\"\n",
-    "    array_text = _extract_json_array(text)\n",
-    "    if array_text is None:\n",
-    "        return []\n",
-    "    try:\n",
-    "        raw = json.loads(array_text)\n",
-    "    except json.JSONDecodeError:\n",
-    "        return []\n",
-    "    actions = []\n",
-    "    for item in raw:\n",
-    "        if not isinstance(item, dict) or \"intent\" not in item:\n",
-    "            continue\n",
-    "        intent = item[\"intent\"]\n",
-    "        if intent == \"submit\":\n",
-    "            continue\n",
-    "        if intent == \"restore_best\":\n",
-    "            actions.append(StellaratorAction(intent=\"restore_best\"))\n",
-    "            continue\n",
-    "        if intent == \"run\":\n",
-    "            p = item.get(\"parameter\", \"\")\n",
-    "            d = item.get(\"direction\", \"\")\n",
-    "            m = item.get(\"magnitude\", \"small\")\n",
-    "            if p in (\n",
-    "                \"aspect_ratio\",\n",
-    "                \"elongation\",\n",
-    "                \"rotational_transform\",\n",
-    "                \"triangularity_scale\",\n",
-    "            ) and d in (\"increase\", \"decrease\"):\n",
-    "                if m not in (\"small\", \"medium\", \"large\"):\n",
-    "                    m = \"small\"\n",
-    "                actions.append(\n",
-    "                    StellaratorAction(intent=\"run\", parameter=p, direction=d, magnitude=m)\n",
-    "                )\n",
-    "    return actions\n",
-    "\n",
-    "\n",
-    "# Test prompt\n",
     "env = StellaratorEnvironment()\n",
     "obs = env.reset(seed=0)\n",
     "prompt = build_prompt(obs)\n",
     "print(prompt[:500])\n",
-    "print(\"...\")"
    ]
   },
   {
@@ -308,8 +202,7 @@
     "\n",
     "prompts = []\n",
     "for seed_idx in range(len(RESET_SEEDS)):\n",
-    "    env = StellaratorEnvironment()\n",
-    "    obs = env.reset(seed=seed_idx)\n",
     "    prompt = build_prompt(obs)\n",
     "    # Repeat each seed to create a larger training set\n",
     "    for _ in range(50):\n",
@@ -327,9 +220,9 @@
    "source": [
     "## 6. Reward Function\n",
     "\n",
-    "The environment reward executes each generated action plan in the stellarator environment and returns the cumulative V0 reward. The environment's built-in reward decomposes feasibility (+3/-3 crossing bonuses, feasibility progress), objective (max elongation improvement), step costs, submit bonuses, and failure penalties \u2014 see `server/environment.py:_compute_reward_breakdown(...)`.\n",
     "\n",
-    "If the model's plan ends before the episode terminates (no submit, budget not exhausted), the reward function auto-submits so terminal reward terms always fire. This ensures GRPO optimizes the full episode return, not truncated partial returns. The live observation telemetry also exposes `reward_breakdown` and `action_monitor` for debugging reward behavior.\n"
    ]
   },
   {
@@ -360,17 +253,11 @@
     "            if len(actions) == 0:\n",
     "                rewards.append(-3.0)\n",
     "                continue\n",
-    "            env = StellaratorEnvironment()\n",
-    "            env.reset(seed=int(seeds[i]) % len(RESET_SEEDS))\n",
-    "            total_reward = 0.0\n",
-    "            for action in actions[:BUDGET]:\n",
-    "                if action.intent == \"submit\":\n",
-    "                    continue\n",
-    "                obs = env.step(action)\n",
-    "                total_reward += float(obs.reward) if obs.reward is not None else 0.0\n",
-    "                if obs.done:\n",
-    "                    break\n",
-    "            rewards.append(total_reward)\n",
     "        except Exception:\n",
     "            traceback.print_exc()\n",
     "            rewards.append(-3.0)\n",
@@ -543,8 +430,11 @@
     "FastLanguageModel.for_inference(model)\n",
     "\n",
     "\n",
-    "def reward_term_summary(obs: StellaratorObservation) -> str:\n",
-    "    breakdown = obs.reward_breakdown.model_dump()\n",
     "    terms = []\n",
     "    for key, value in breakdown.items():\n",
     "        if key in {\n",
@@ -581,35 +471,22 @@
     "        outputs[0][inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True\n",
     "    )\n",
     "    actions = parse_action_plan(completion)\n",
-    "    trace = []\n",
-    "    total_reward = 0.0\n",
-    "    for action in actions[:BUDGET]:\n",
-    "        if action.intent == \"submit\":\n",
-    "            continue\n",
-    "        obs = env.step(action)\n",
-    "        r = float(obs.reward) if obs.reward is not None else 0.0\n",
-    "        total_reward += r\n",
-    "        trace.append(\n",
-    "            f\"  {action.intent} {action.parameter or ''} {action.direction or ''} {action.magnitude or ''} \u2192 reward={r:.3f} score={obs.p1_score:.4f} feasible={obs.constraints_satisfied} terms={reward_term_summary(obs)}\".strip()\n",
     "        )\n",
-    "        if obs.done:\n",
-    "            break\n",
-    "    return total_reward, trace\n",
     "\n",
     "\n",
     "def run_random_episode(seed_idx: int) -> float:\n",
     "    \"\"\"Run one episode with random actions for comparison.\"\"\"\n",
-    "    env = StellaratorEnvironment()\n",
-    "    env.reset(seed=seed_idx)\n",
-    "    total_reward = 0.0\n",
-    "    for step in range(BUDGET):\n",
-    "        spec = random.choice(AVAILABLE_ACTIONS[:24])  # run actions only\n",
-    "        action = StellaratorAction(**spec)\n",
-    "        obs = env.step(action)\n",
-    "        total_reward += float(obs.reward) if obs.reward is not None else 0.0\n",
-    "        if obs.done:\n",
-    "            return total_reward\n",
-    "    return total_reward\n",
     "\n",
     "\n",
     "# Evaluate\n",
@@ -690,7 +567,6 @@
     "    actions = parse_action_plan(completion)\n",
     "\n",
     "    print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
-    "    done = False\n",
     "    for i, action in enumerate(actions[:BUDGET], start=1):\n",
     "        if action.intent == \"submit\":\n",
     "            continue\n",
@@ -704,7 +580,6 @@
     "        )\n",
     "        if result.done:\n",
     "            print(f\"  Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
-    "            done = True\n",
     "            break\n",
     "print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
    ]

     "import json\n",
     "from typing import Final\n",
     "\n",
+    "from fusion_lab.llm_agent import (\n",
+    "    RUN_DIRECTIONS,\n",
+    "    RUN_MAGNITUDES,\n",
+    "    RUN_PARAMETERS,\n",
+    "    build_prompt,\n",
+    "    parse_action_plan,\n",
+    "    run_episode_with_actions,\n",
+    ")\n",
+    "from fusion_lab.models import StellaratorAction\n",
     "from server.contract import RESET_SEEDS\n",
     "from server.environment import BUDGET, StellaratorEnvironment\n",
     "\n",
+    "RUN_ACTION_SPECS: Final[list[dict[str, str]]] = [\n",
     "    {\"intent\": \"run\", \"parameter\": p, \"direction\": d, \"magnitude\": m}\n",
+    "    for p in RUN_PARAMETERS\n",
+    "    for d in RUN_DIRECTIONS\n",
+    "    for m in RUN_MAGNITUDES\n",
     "]\n",
     "\n",
+    "AVAILABLE_ACTIONS: Final[list[dict[str, str]]] = RUN_ACTION_SPECS + [\n",
+    "    {\"intent\": \"restore_best\"},\n",
     "]\n",
     "# Quick smoke test\n",
     "env = StellaratorEnvironment()\n",
     "obs = env.reset(seed=0)\n",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Shared helper smoke test\n",
     "env = StellaratorEnvironment()\n",
     "obs = env.reset(seed=0)\n",
     "prompt = build_prompt(obs)\n",
     "print(prompt[:500])\n",
+    "print(\"...\")\n",
+    "\n",
+    "sample_completion = json.dumps(\n",
+    "    [\n",
+    "        {\n",
+    "            \"intent\": \"run\",\n",
+    "            \"parameter\": \"triangularity_scale\",\n",
+    "            \"direction\": \"increase\",\n",
+    "            \"magnitude\": \"small\",\n",
+    "        },\n",
+    "        {\"intent\": \"submit\"},\n",
+    "    ]\n",
+    ")\n",
+    "print(parse_action_plan(sample_completion))"
    ]
   },
   {
     "\n",
     "prompts = []\n",
     "for seed_idx in range(len(RESET_SEEDS)):\n",
+    "    obs = StellaratorEnvironment().reset(seed=seed_idx)\n",
     "    prompt = build_prompt(obs)\n",
     "    # Repeat each seed to create a larger training set\n",
     "    for _ in range(50):\n",
    "source": [
     "## 6. Reward Function\n",
     "\n",
+    "The reward function executes each generated action plan in the live stellarator environment and returns the cumulative low-fidelity Reward V0. The environment's built-in reward decomposes into feasibility terms (+3/-3 crossing bonuses, feasibility progress), an objective term (max elongation improvement), step costs, and failure penalties \u2014 see `server/environment.py:_compute_reward_breakdown(...)`.\n",
     "\n",
+    "For the current training workflow, the notebook ignores `submit` and does not auto-submit. GRPO therefore optimizes the low-fidelity `run` path only. The live observation telemetry still exposes `reward_breakdown` and `action_monitor` for debugging reward behavior.\n"
    ]
   },
   {
     "            if len(actions) == 0:\n",
     "                rewards.append(-3.0)\n",
     "                continue\n",
+    "            trace = run_episode_with_actions(\n",
+    "                actions,\n",
+    "                seed_idx=int(seeds[i]) % len(RESET_SEEDS),\n",
+    "            )\n",
+    "            rewards.append(trace.total_reward)\n",
     "        except Exception:\n",
     "            traceback.print_exc()\n",
     "            rewards.append(-3.0)\n",
     "FastLanguageModel.for_inference(model)\n",
     "\n",
     "\n",
+    "def reward_term_summary(step_or_obs: object) -> str:\n",
+    "    breakdown_obj = getattr(step_or_obs, \"reward_breakdown\")\n",
+    "    breakdown = (\n",
+    "        breakdown_obj.model_dump() if hasattr(breakdown_obj, \"model_dump\") else breakdown_obj\n",
+    "    )\n",
     "    terms = []\n",
     "    for key, value in breakdown.items():\n",
     "        if key in {\n",
     "        outputs[0][inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True\n",
     "    )\n",
     "    actions = parse_action_plan(completion)\n",
+    "    episode = run_episode_with_actions(actions, seed_idx=seed_idx)\n",
+    "    trace = [\n",
+    "        (\n",
+    "            f\"{step.action_label} \u2192 reward={step.reward:.3f} \"\n",
+    "            f\"score={step.p1_score:.4f} feasible={step.constraints_satisfied} \"\n",
+    "            f\"terms={reward_term_summary(step)}\"\n",
     "        )\n",
+    "        for step in episode.steps\n",
+    "    ]\n",
+    "    return episode.total_reward, trace\n",
     "\n",
     "\n",
     "def run_random_episode(seed_idx: int) -> float:\n",
     "    \"\"\"Run one episode with random actions for comparison.\"\"\"\n",
+    "    actions = [StellaratorAction(**random.choice(RUN_ACTION_SPECS)) for _ in range(BUDGET)]\n",
+    "    return run_episode_with_actions(actions, seed_idx=seed_idx).total_reward\n",
     "\n",
     "\n",
     "# Evaluate\n",
     "    actions = parse_action_plan(completion)\n",
     "\n",
     "    print(f\"\\nTrained model generated {len(actions)} actions for remote env:\")\n",
     "    for i, action in enumerate(actions[:BUDGET], start=1):\n",
     "        if action.intent == \"submit\":\n",
     "            continue\n",
     "        )\n",
     "        if result.done:\n",
     "            print(f\"  Episode done. Final score: {step_obs.p1_score:.4f}\")\n",
     "            break\n",
     "print(\"\\nEnvironment is live and accessible for training and evaluation.\")"
    ]