Spaces:

Prajwal782007
/

Gridmind

Running

App Files Community

Prajwal782007 committed on Apr 25

Commit

29b9cd0

1 Parent(s): 9d42d14

feat: add GridMind GRPO training notebook for Meta PyTorch OpenEnv hackathon

Browse files

Files changed (1) hide show

scripts/gridmind_grpo_colab.ipynb +114 -195

scripts/gridmind_grpo_colab.ipynb CHANGED Viewed

@@ -695,6 +695,7 @@
     "    \"logging_steps\": 1,\n",
     "    \"save_steps\": 60,\n",
     "    \"report_to\": \"none\",\n",
     "    \"dataloader_num_workers\": 0,\n",
     "    \"remove_unused_columns\": False,\n",
     "}\n",
@@ -732,6 +733,53 @@
     "step_numbers = []\n",
     "step_reward_means = []\n",
     "training_log_history = []\n",
     "\n",
     "class LossCaptureCallback(TrainerCallback):\n",
     "    def on_log(self, args, state, control, logs=None, **kwargs):\n",
@@ -740,7 +788,25 @@
     "        step = state.global_step\n",
     "        row = {\"step\": step}\n",
     "        row.update({k: float(v) if isinstance(v, (int, float)) else v for k, v in logs.items()})\n",
     "        training_log_history.append(row)\n",
     "        loss = logs.get(\"loss\", logs.get(\"train_loss\", None))\n",
     "        if loss is not None:\n",
     "            step_losses.append(float(loss))\n",
@@ -767,6 +833,17 @@
     "    callbacks=[LossCaptureCallback()],\n",
     ")\n",
     "\n",
     "print(\"\\nStarting GRPO training with QLoRA...\")\n",
     "print(\"Watch for non-zero loss values. If all zeros, reward variance is still too low.\\n\")\n",
     "print(f\"Steps: {getattr(grpo_config, 'max_steps', 60)} | Batch: {getattr(grpo_config, 'per_device_train_batch_size', 1)} | Generations: {getattr(grpo_config, 'num_generations', 4)}\")\n",
@@ -792,7 +869,9 @@
     "    import pandas as pd\n",
     "    import numpy as np\n",
     "    trainer_log_rows = [r for r in trainer.state.log_history if \"loss\" in r or \"reward\" in r or \"rewards / reward_func / mean\" in r]\n",
-    "    if trainer_log_rows:\n",
     "        training_metrics_df = pd.DataFrame(trainer_log_rows)\n",
     "        if \"step\" not in training_metrics_df.columns:\n",
     "            training_metrics_df.insert(0, \"step\", range(1, len(training_metrics_df) + 1))\n",
@@ -972,7 +1051,6 @@
     "import matplotlib\n",
     "matplotlib.use('Agg')\n",
     "import matplotlib.pyplot as plt\n",
-    "import matplotlib.gridspec as gridspec\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "import os\n",
@@ -980,11 +1058,13 @@
     "os.makedirs(\"results\", exist_ok=True)\n",
     "os.makedirs(\"plots\", exist_ok=True)\n",
     "\n",
-    "# Build a TRL-style metrics table from trainer logs. This matches the tabular\n",
-    "# output with columns like reward, reward_std, completion lengths, tools, and KL.\n",
     "if 'training_metrics_df' not in globals() or training_metrics_df is None:\n",
     "    trainer_log_rows = [r for r in trainer.state.log_history if \"loss\" in r or \"reward\" in r or \"rewards / reward_func / mean\" in r]\n",
-    "    training_metrics_df = pd.DataFrame(trainer_log_rows if trainer_log_rows else training_log_history)\n",
     "    if not training_metrics_df.empty and \"step\" not in training_metrics_df.columns:\n",
     "        training_metrics_df.insert(0, \"step\", range(1, len(training_metrics_df) + 1))\n",
     "\n",
@@ -992,17 +1072,7 @@
     "if not training_metrics_df.empty:\n",
     "    training_metrics_df.to_csv(training_metrics_path, index=False)\n",
     "    print(f\"Saved TRL metrics table to {training_metrics_path}\")\n",
-    "    preferred_cols = [\n",
-    "        \"step\", \"loss\", \"reward\", \"reward_std\",\n",
-    "        \"completions / mean_length\", \"completions / min_length\", \"completions / max_length\",\n",
-    "        \"completions / clipped_ratio\", \"completions / mean_terminated_length\",\n",
-    "        \"completions / min_terminated_length\", \"completions / max_terminated_length\",\n",
-    "        \"tools / call_frequency\", \"tools / failure_frequency\", \"kl\",\n",
-    "        \"rewards / reward_func / mean\", \"rewards / reward_func / std\",\n",
-    "    ]\n",
-    "    display_cols = [c for c in preferred_cols if c in training_metrics_df.columns]\n",
-    "    if display_cols:\n",
-    "        print(\"Step 6 already displayed the full TRL-style metrics table; Step 8 only reuses it for plots.\")\n",
     "\n",
     "tasks = [1, 2, 3, 4]\n",
     "task_labels = [\n",
@@ -1034,179 +1104,31 @@
     "        out.append(sum(w) / len(w))\n",
     "    return out\n",
     "\n",
-    "C = {\n",
-    "    'bg': '#0d1117', 'panel': '#161b22', 'grid': '#21262d',\n",
-    "    'text': '#e6edf3', 'subtext': '#8b949e', 'random': '#f85149',\n",
-    "    'heuristic': '#58a6ff', 'trained': '#3fb950', 'reward': '#d29922',\n",
-    "    'loss': '#bc8cff', 'border': '#30363d',\n",
-    "}\n",
-    "\n",
-    "def style_ax(ax, title):\n",
-    "    ax.set_facecolor(C['panel'])\n",
-    "    ax.set_title(title, color=C['text'], fontsize=12, fontweight='bold', pad=10)\n",
-    "    ax.tick_params(colors=C['subtext'], labelsize=9)\n",
-    "    ax.grid(alpha=0.15, color=C['grid'], linewidth=0.8)\n",
-    "    for spine in ax.spines.values():\n",
-    "        spine.set_edgecolor(C['border'])\n",
-    "    ax.xaxis.label.set_color(C['subtext'])\n",
-    "    ax.yaxis.label.set_color(C['subtext'])\n",
-    "\n",
-    "fig = plt.figure(figsize=(12, 8))\n",
-    "fig.patch.set_facecolor(C['bg'])\n",
-    "gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.50, wspace=0.38,\n",
-    "                       left=0.07, right=0.97, top=0.91, bottom=0.07)\n",
-    "\n",
-    "# Panel A: policy comparison across all tasks.\n",
-    "ax_bar = fig.add_subplot(gs[0, :])\n",
-    "ax_bar.set_facecolor(C['panel'])\n",
-    "x = np.arange(len(tasks))\n",
-    "w = 0.24\n",
-    "br = ax_bar.bar(x - w, random_vals, w, label='Random Policy', color=C['random'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
-    "bh = ax_bar.bar(x, heuristic_vals, w, label='Heuristic Baseline', color=C['heuristic'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
-    "bt = ax_bar.bar(x + w, trained_vals, w, label='Trained LLM (GRPO)', color=C['trained'], alpha=0.85, zorder=3, edgecolor=C['bg'], linewidth=0.5)\n",
-    "\n",
-    "for bars, col in [(br, C['random']), (bh, C['heuristic']), (bt, C['trained'])]:\n",
-    "    for bar in bars:\n",
-    "        h = bar.get_height()\n",
-    "        ax_bar.text(bar.get_x() + bar.get_width()/2, h + 0.012, f'{h:.3f}',\n",
-    "                    ha='center', va='bottom', fontsize=8.5, color=col, fontweight='bold', zorder=4)\n",
-    "\n",
-    "for i in range(len(tasks)):\n",
-    "    h_val = heuristic_vals[i]\n",
-    "    t_val = trained_vals[i]\n",
-    "    pct = ((t_val - h_val) / h_val * 100) if h_val > 0 else 0\n",
-    "    color = C['trained'] if pct >= 0 else C['random']\n",
-    "    sign = '+' if pct >= 0 else '-'\n",
-    "    ax_bar.text(x[i] + w, max(h_val, t_val) + 0.06, f'{sign}{abs(pct):.1f}%',\n",
-    "                ha='center', fontsize=10, color=color, fontweight='bold', zorder=4)\n",
-    "\n",
-    "ax_bar.axhline(baseline_avg, color=C['heuristic'], linestyle=':', linewidth=1.5, alpha=0.6,\n",
-    "               label=f'Heuristic avg ({baseline_avg:.3f})', zorder=2)\n",
-    "ax_bar.axhline(trained_avg, color=C['trained'], linestyle=':', linewidth=1.5, alpha=0.6,\n",
-    "               label=f'Trained avg ({trained_avg:.3f})', zorder=2)\n",
-    "ax_bar.set_xticks(x)\n",
-    "ax_bar.set_xticklabels(task_labels, color=C['text'], fontsize=10)\n",
-    "ax_bar.set_ylabel('Grade Score (0.0 to 1.0, higher is better)', fontsize=11, color=C['subtext'])\n",
-    "ax_bar.set_ylim(0, 1.15)\n",
-    "ax_bar.set_title('GridMind-RL Policy Performance Across All 4 Hackathon Themes\\nRandom vs Heuristic Baseline vs GRPO Fine-Tuned LLM',\n",
-    "                 color=C['text'], fontsize=13, fontweight='bold', pad=12)\n",
-    "ax_bar.legend(fontsize=10, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9,\n",
-    "              edgecolor=C['border'], ncol=3, loc='upper right')\n",
-    "ax_bar.grid(axis='y', alpha=0.15, color=C['grid'], zorder=1)\n",
-    "for spine in ax_bar.spines.values():\n",
-    "    spine.set_edgecolor(C['border'])\n",
-    "ax_bar.tick_params(colors=C['subtext'])\n",
-    "\n",
-    "# Panel B: reward signal over time.\n",
-    "ax_rew = fig.add_subplot(gs[1, 0])\n",
-    "style_ax(ax_rew, 'GRPO Training: Reward Signal per Step')\n",
-    "if not training_metrics_df.empty and (\"reward\" in training_metrics_df.columns or \"rewards / reward_func / mean\" in training_metrics_df.columns):\n",
-    "    reward_col = \"reward\" if \"reward\" in training_metrics_df.columns else \"rewards / reward_func / mean\"\n",
-    "    std_col = \"reward_std\" if \"reward_std\" in training_metrics_df.columns else \"rewards / reward_func / std\"\n",
-    "    reward_df = training_metrics_df[[\"step\", reward_col] + ([std_col] if std_col in training_metrics_df.columns else [])].dropna(subset=[reward_col])\n",
-    "    steps_r = reward_df[\"step\"].astype(float).tolist()\n",
-    "    raw = reward_df[reward_col].astype(float).tolist()\n",
-    "    ax_rew.plot(steps_r, raw, alpha=0.28, color=C['reward'], linewidth=1.2, marker='o', markersize=2, label='Logged reward')\n",
-    "    ax_rew.plot(steps_r, smooth(raw, window=6), color=C['reward'], linewidth=2.5, label='Smoothed reward')\n",
-    "    if std_col in reward_df.columns:\n",
-    "        std = reward_df[std_col].fillna(0).astype(float).to_numpy()\n",
-    "        raw_np = np.array(raw)\n",
-    "        steps_np = np.array(steps_r)\n",
-    "        ax_rew.fill_between(steps_np, raw_np - std, raw_np + std, color=C['reward'], alpha=0.12, label='Reward std')\n",
-    "    if len(steps_r) > 8:\n",
-    "        z = np.polyfit(steps_r, raw, 1)\n",
-    "        p = np.poly1d(z)\n",
-    "        ax_rew.plot(steps_r, p(steps_r), '--', color='white', alpha=0.35, linewidth=1.5,\n",
-    "                    label=f'Trend ({z[0]:+.5f}/step)')\n",
-    "    ax_rew.set_xlabel('Training Step')\n",
-    "    ax_rew.set_ylabel('Reward Value')\n",
-    "    ax_rew.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
-    "    if np.var(raw) < 0.01:\n",
-    "        ax_rew.text(0.5, 0.5, 'Low reward variance detected.\\nThis graph exposes weak learning signal.',\n",
-    "                    transform=ax_rew.transAxes, ha='center', va='center', color=C['random'], fontsize=10,\n",
-    "                    bbox=dict(boxstyle='round', facecolor=C['panel'], alpha=0.8))\n",
-    "elif training_rewards and len(training_rewards) >= 4:\n",
-    "    raw = training_rewards\n",
-    "    steps_r = list(range(1, len(raw) + 1))\n",
-    "    ax_rew.plot(steps_r, raw, alpha=0.20, color=C['reward'], linewidth=1)\n",
-    "    ax_rew.plot(steps_r, smooth(raw, window=6), color=C['reward'], linewidth=2.5, label='Smoothed reward')\n",
-    "    ax_rew.set_xlabel('Reward Function Call')\n",
-    "    ax_rew.set_ylabel('Reward Value')\n",
-    "    ax_rew.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
-    "else:\n",
-    "    ax_rew.text(0.5, 0.5, 'No training rewards captured.\\nRe-run with fixed reward function.',\n",
-    "                transform=ax_rew.transAxes, ha='center', va='center', color=C['subtext'], fontsize=11)\n",
-    "\n",
-    "# Panel C: training loss, with reward variance fallback.\n",
-    "ax_loss = fig.add_subplot(gs[1, 1])\n",
-    "style_ax(ax_loss, 'GRPO Training Loss per Step')\n",
-    "if step_losses and len(step_losses) >= 2:\n",
-    "    ax_loss.plot(step_numbers, step_losses, alpha=0.25, color=C['loss'], linewidth=1)\n",
-    "    ax_loss.plot(step_numbers, smooth(step_losses, window=4), color=C['loss'], linewidth=2.5, label='Smoothed loss')\n",
-    "    non_zero = [l for l in step_losses if abs(l) > 1e-7]\n",
-    "    pct_nz = len(non_zero) / len(step_losses) * 100\n",
-    "    note_color = C['trained'] if pct_nz > 50 else C['random']\n",
-    "    ax_loss.text(0.04, 0.96, f'Non-zero steps: {len(non_zero)}/{len(step_losses)} ({pct_nz:.0f}%)',\n",
-    "                 transform=ax_loss.transAxes, va='top', color=note_color, fontsize=9,\n",
-    "                 bbox=dict(boxstyle='round', facecolor=C['panel'], alpha=0.8))\n",
-    "    ax_loss.set_xlabel('Training Step')\n",
-    "    ax_loss.set_ylabel('Loss')\n",
-    "    ax_loss.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
-    "else:\n",
-    "    proxy_loss = []\n",
-    "    for i in range(0, len(training_rewards), 4):\n",
-    "        chunk = training_rewards[i:i+4]\n",
-    "        if len(chunk) > 1:\n",
-    "            proxy_loss.append(float(np.var(chunk)))\n",
-    "    if proxy_loss:\n",
-    "        ax_loss.plot(range(1, len(proxy_loss) + 1), proxy_loss, color=C['loss'], linewidth=2,\n",
-    "                     label='Reward variance proxy')\n",
-    "        ax_loss.set_xlabel('Training Batch')\n",
-    "        ax_loss.set_ylabel('Reward Variance')\n",
-    "        ax_loss.legend(fontsize=9, facecolor=C['grid'], labelcolor=C['text'], framealpha=0.9, edgecolor=C['border'])\n",
-    "        ax_loss.text(0.5, 0.92, 'Loss not captured - showing reward variance proxy',\n",
-    "                     transform=ax_loss.transAxes, ha='center', color=C['subtext'], fontsize=8)\n",
-    "    else:\n",
-    "        ax_loss.text(0.5, 0.5, 'No loss data available.', transform=ax_loss.transAxes,\n",
-    "                     ha='center', va='center', color=C['subtext'], fontsize=11)\n",
-    "\n",
-    "fig.suptitle(\n",
-    "    'GridMind-RL - Meta OpenEnv Hackathon - Multi-Agent Industrial Energy Management\\n'\n",
-    "    f'Model: Qwen2.5-1.5B + QLoRA + GRPO | Overall improvement vs heuristic: {overall_improvement:+.1f}%',\n",
-    "    color=C['text'], fontsize=14, fontweight='bold', y=0.97\n",
-    ")\n",
-    "\n",
-    "dashboard_path = 'results/gridmind_training_dashboard.png'\n",
-    "fig.savefig(dashboard_path, dpi=100, facecolor=fig.get_facecolor(), bbox_inches='tight')\n",
-    "plt.close(fig)\n",
-    "\n",
-    "# Standalone training reward curve for reports/slides.\n",
     "reward_curve_path = 'results/gridmind_training_reward_curve.png'\n",
-    "fig_reward, ax_reward = plt.subplots(figsize=(11, 6))\n",
-    "fig_reward.patch.set_facecolor(C['bg'])\n",
-    "style_ax(ax_reward, 'Training Reward Curve')\n",
     "if not training_metrics_df.empty and (\"reward\" in training_metrics_df.columns or \"rewards / reward_func / mean\" in training_metrics_df.columns):\n",
     "    reward_col = \"reward\" if \"reward\" in training_metrics_df.columns else \"rewards / reward_func / mean\"\n",
     "    std_col = \"reward_std\" if \"reward_std\" in training_metrics_df.columns else \"rewards / reward_func / std\"\n",
     "    reward_df = training_metrics_df[[\"step\", reward_col] + ([std_col] if std_col in training_metrics_df.columns else [])].dropna(subset=[reward_col])\n",
     "    xs = reward_df[\"step\"].astype(float).to_numpy()\n",
     "    ys = reward_df[reward_col].astype(float).to_numpy()\n",
-    "    ax_reward.plot(xs, ys, color=C['reward'], alpha=0.35, linewidth=1.2, marker='o', markersize=2, label='Reward')\n",
-    "    ax_reward.plot(xs, smooth(ys.tolist(), window=6), color=C['trained'], linewidth=2.5, label='Smoothed reward')\n",
     "    if std_col in reward_df.columns:\n",
     "        std = reward_df[std_col].fillna(0).astype(float).to_numpy()\n",
-    "        ax_reward.fill_between(xs, ys - std, ys + std, color=C['reward'], alpha=0.12, label='Reward std')\n",
-    "    if len(xs) > 8:\n",
-    "        z = np.polyfit(xs, ys, 1)\n",
-    "        p = np.poly1d(z)\n",
-    "        ax_reward.plot(xs, p(xs), '--', color=C['text'], alpha=0.45, linewidth=1.5, label=f'Trend ({z[0]:+.5f}/step)')\n",
-    "    ax_reward.set_xlabel('Training Step')\n",
-    "    ax_reward.set_ylabel('Reward')\n",
-    "    ax_reward.legend(facecolor=C['grid'], labelcolor=C['text'], edgecolor=C['border'])\n",
     "else:\n",
-    "    ax_reward.text(0.5, 0.5, 'No logged reward data available.', transform=ax_reward.transAxes,\n",
-    "                   ha='center', va='center', color=C['subtext'])\n",
-    "fig_reward.savefig(reward_curve_path, dpi=100, facecolor=fig_reward.get_facecolor(), bbox_inches='tight')\n",
     "plt.close(fig_reward)\n",
     "\n",
     "# Reference-style simple plots from trainer.state.log_history.\n",
@@ -1267,26 +1189,23 @@
     "    print(\"No loss logs found; skipped plots/loss_curve.png\")\n",
     "\n",
     "# Separate before/after comparison graph for quick judge inspection.\n",
-    "fig2, ax2 = plt.subplots(figsize=(11, 6))\n",
-    "fig2.patch.set_facecolor(C['bg'])\n",
-    "ax2.set_facecolor(C['panel'])\n",
-    "ax2.bar(x - w/2, heuristic_vals, w, label='Heuristic Baseline', color=C['heuristic'], alpha=0.9)\n",
-    "ax2.bar(x + w/2, trained_vals, w, label='Trained LLM (GRPO)', color=C['trained'], alpha=0.9)\n",
     "ax2.set_xticks(x)\n",
-    "ax2.set_xticklabels(task_labels, color=C['text'])\n",
     "ax2.set_ylim(0, 1.05)\n",
-    "ax2.set_ylabel('Grade Score', color=C['subtext'])\n",
-    "ax2.set_title('Before/After Policy Score Comparison', color=C['text'], fontweight='bold')\n",
-    "ax2.legend(facecolor=C['grid'], labelcolor=C['text'], edgecolor=C['border'])\n",
-    "ax2.grid(axis='y', alpha=0.15, color=C['grid'])\n",
-    "ax2.tick_params(colors=C['subtext'])\n",
-    "for spine in ax2.spines.values():\n",
-    "    spine.set_edgecolor(C['border'])\n",
     "comparison_path = 'results/gridmind_before_after_comparison.png'\n",
-    "fig2.savefig(comparison_path, dpi=100, facecolor=fig2.get_facecolor(), bbox_inches='tight')\n",
     "plt.close(fig2)\n",
     "\n",
-    "print(f\"Saved dashboard graph to {dashboard_path}\")\n",
     "print(f\"Saved training reward curve to {reward_curve_path}\")\n",
     "print(f\"Saved simple reward curve to {simple_reward_curve_path}\")\n",
     "if simple_loss_curve_path:\n",
@@ -1312,7 +1231,7 @@
     "    \"training_metrics_table\": training_metrics_path,\n",
     "    \"training_metrics_display_table\": training_metrics_display_path if 'training_metrics_display_path' in globals() else None,\n",
     "    \"graphs\": {\n",
-    "        \"dashboard\": dashboard_path,\n",
     "        \"training_reward_curve\": reward_curve_path,\n",
     "        \"simple_reward_curve\": simple_reward_curve_path,\n",
     "        \"simple_loss_curve\": simple_loss_curve_path,\n",

     "    \"logging_steps\": 1,\n",
     "    \"save_steps\": 60,\n",
     "    \"report_to\": \"none\",\n",
+    "    \"disable_tqdm\": True,\n",
     "    \"dataloader_num_workers\": 0,\n",
     "    \"remove_unused_columns\": False,\n",
     "}\n",
     "step_numbers = []\n",
     "step_reward_means = []\n",
     "training_log_history = []\n",
+    "training_table_rows = []\n",
+    "_training_table_header_printed = [False]\n",
+    "\n",
+    "TRAINING_TABLE_COLUMNS = [\n",
+    "    (\"Step\", \"step\"),\n",
+    "    (\"Training Loss\", \"loss\"),\n",
+    "    (\"reward\", \"reward\"),\n",
+    "    (\"reward_std\", \"reward_std\"),\n",
+    "    (\"completions / mean_length\", \"completions / mean_length\"),\n",
+    "    (\"completions / min_length\", \"completions / min_length\"),\n",
+    "    (\"completions / max_length\", \"completions / max_length\"),\n",
+    "    (\"completions / clipped_ratio\", \"completions / clipped_ratio\"),\n",
+    "    (\"completions / mean_terminated_length\", \"completions / mean_terminated_length\"),\n",
+    "    (\"completions / min_terminated_length\", \"completions / min_terminated_length\"),\n",
+    "    (\"completions / max_terminated_length\", \"completions / max_terminated_length\"),\n",
+    "    (\"tools / call_frequency\", \"tools / call_frequency\"),\n",
+    "    (\"tools / failure_frequency\", \"tools / failure_frequency\"),\n",
+    "    (\"kl\", \"kl\"),\n",
+    "    (\"rewards / reward_func / mean\", \"rewards / reward_func / mean\"),\n",
+    "    (\"rewards / reward_func / std\", \"rewards / reward_func / std\"),\n",
+    "]\n",
+    "\n",
+    "def _metric_value(logs, *keys, default=float(\"nan\")):\n",
+    "    for key in keys:\n",
+    "        if key in logs and logs[key] is not None:\n",
+    "            return logs[key]\n",
+    "    return default\n",
+    "\n",
+    "def _fmt_metric(value):\n",
+    "    try:\n",
+    "        if value is None or (isinstance(value, float) and value != value):\n",
+    "            return \"\"\n",
+    "        if isinstance(value, int):\n",
+    "            return str(value)\n",
+    "        return f\"{float(value):.6f}\"\n",
+    "    except Exception:\n",
+    "        return str(value)\n",
+    "\n",
+    "def _print_training_table_row(row):\n",
+    "    widths = [6, 14, 10, 10, 26, 25, 25, 29, 38, 37, 37, 24, 27, 10, 28, 27]\n",
+    "    if not _training_table_header_printed[0]:\n",
+    "        header = \"  \".join(label.ljust(widths[i]) for i, (label, _) in enumerate(TRAINING_TABLE_COLUMNS))\n",
+    "        print(\"\\n\" + header)\n",
+    "        print(\"-\" * len(header))\n",
+    "        _training_table_header_printed[0] = True\n",
+    "    values = [_fmt_metric(row.get(source, float(\"nan\"))).ljust(widths[i]) for i, (_, source) in enumerate(TRAINING_TABLE_COLUMNS)]\n",
+    "    print(\"  \".join(values))\n",
     "\n",
     "class LossCaptureCallback(TrainerCallback):\n",
     "    def on_log(self, args, state, control, logs=None, **kwargs):\n",
     "        step = state.global_step\n",
     "        row = {\"step\": step}\n",
     "        row.update({k: float(v) if isinstance(v, (int, float)) else v for k, v in logs.items()})\n",
+    "        if \"loss\" not in row and \"train_loss\" in row:\n",
+    "            row[\"loss\"] = row[\"train_loss\"]\n",
+    "        recent_rewards = training_rewards[max(0, len(training_rewards)-4):]\n",
+    "        if recent_rewards:\n",
+    "            if \"reward\" not in row and \"rewards / reward_func / mean\" not in row:\n",
+    "                row[\"reward\"] = sum(recent_rewards) / len(recent_rewards)\n",
+    "            if \"reward_std\" not in row and \"rewards / reward_func / std\" not in row and len(recent_rewards) > 1:\n",
+    "                row[\"reward_std\"] = statistics.pstdev(recent_rewards)\n",
+    "        if \"rewards / reward_func / mean\" not in row and \"reward\" in row:\n",
+    "            row[\"rewards / reward_func / mean\"] = row[\"reward\"]\n",
+    "        if \"rewards / reward_func / std\" not in row and \"reward_std\" in row:\n",
+    "            row[\"rewards / reward_func / std\"] = row[\"reward_std\"]\n",
+    "        if \"tools / call_frequency\" not in row:\n",
+    "            row[\"tools / call_frequency\"] = float(\"nan\")\n",
+    "        if \"tools / failure_frequency\" not in row:\n",
+    "            row[\"tools / failure_frequency\"] = 0.0\n",
     "        training_log_history.append(row)\n",
+    "        training_table_rows.append(row)\n",
+    "        _print_training_table_row(row)\n",
     "        loss = logs.get(\"loss\", logs.get(\"train_loss\", None))\n",
     "        if loss is not None:\n",
     "            step_losses.append(float(loss))\n",
     "    callbacks=[LossCaptureCallback()],\n",
     ")\n",
     "\n",
+    "# Remove the default Trainer progress/notebook callbacks so only the custom\n",
+    "# TRL-style table appears during training.\n",
+    "from transformers.trainer_callback import ProgressCallback, PrinterCallback\n",
+    "trainer.remove_callback(ProgressCallback)\n",
+    "trainer.remove_callback(PrinterCallback)\n",
+    "try:\n",
+    "    from transformers.utils.notebook import NotebookProgressCallback\n",
+    "    trainer.remove_callback(NotebookProgressCallback)\n",
+    "except Exception:\n",
+    "    pass\n",
+    "\n",
     "print(\"\\nStarting GRPO training with QLoRA...\")\n",
     "print(\"Watch for non-zero loss values. If all zeros, reward variance is still too low.\\n\")\n",
     "print(f\"Steps: {getattr(grpo_config, 'max_steps', 60)} | Batch: {getattr(grpo_config, 'per_device_train_batch_size', 1)} | Generations: {getattr(grpo_config, 'num_generations', 4)}\")\n",
     "    import pandas as pd\n",
     "    import numpy as np\n",
     "    trainer_log_rows = [r for r in trainer.state.log_history if \"loss\" in r or \"reward\" in r or \"rewards / reward_func / mean\" in r]\n",
+    "    if training_table_rows:\n",
+    "        training_metrics_df = pd.DataFrame(training_table_rows)\n",
+    "    elif trainer_log_rows:\n",
     "        training_metrics_df = pd.DataFrame(trainer_log_rows)\n",
     "        if \"step\" not in training_metrics_df.columns:\n",
     "            training_metrics_df.insert(0, \"step\", range(1, len(training_metrics_df) + 1))\n",
     "import matplotlib\n",
     "matplotlib.use('Agg')\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "import os\n",
     "os.makedirs(\"results\", exist_ok=True)\n",
     "os.makedirs(\"plots\", exist_ok=True)\n",
     "\n",
+    "# Reuse the Step 6 metrics table and only do lightweight exports here.\n",
     "if 'training_metrics_df' not in globals() or training_metrics_df is None:\n",
     "    trainer_log_rows = [r for r in trainer.state.log_history if \"loss\" in r or \"reward\" in r or \"rewards / reward_func / mean\" in r]\n",
+    "    if 'training_table_rows' in globals() and training_table_rows:\n",
+    "        training_metrics_df = pd.DataFrame(training_table_rows)\n",
+    "    else:\n",
+    "        training_metrics_df = pd.DataFrame(trainer_log_rows if trainer_log_rows else training_log_history)\n",
     "    if not training_metrics_df.empty and \"step\" not in training_metrics_df.columns:\n",
     "        training_metrics_df.insert(0, \"step\", range(1, len(training_metrics_df) + 1))\n",
     "\n",
     "if not training_metrics_df.empty:\n",
     "    training_metrics_df.to_csv(training_metrics_path, index=False)\n",
     "    print(f\"Saved TRL metrics table to {training_metrics_path}\")\n",
+    "    print(\"Step 8 reuses the Step 6 table and only saves files.\")\n",
     "\n",
     "tasks = [1, 2, 3, 4]\n",
     "task_labels = [\n",
     "        out.append(sum(w) / len(w))\n",
     "    return out\n",
     "\n",
     "reward_curve_path = 'results/gridmind_training_reward_curve.png'\n",
+    "fig_reward, ax_reward = plt.subplots(figsize=(10, 5))\n",
     "if not training_metrics_df.empty and (\"reward\" in training_metrics_df.columns or \"rewards / reward_func / mean\" in training_metrics_df.columns):\n",
     "    reward_col = \"reward\" if \"reward\" in training_metrics_df.columns else \"rewards / reward_func / mean\"\n",
     "    std_col = \"reward_std\" if \"reward_std\" in training_metrics_df.columns else \"rewards / reward_func / std\"\n",
     "    reward_df = training_metrics_df[[\"step\", reward_col] + ([std_col] if std_col in training_metrics_df.columns else [])].dropna(subset=[reward_col])\n",
     "    xs = reward_df[\"step\"].astype(float).to_numpy()\n",
     "    ys = reward_df[reward_col].astype(float).to_numpy()\n",
+    "    ax_reward.plot(xs, ys, color=\"#4285f4\", linewidth=2, label=\"GRPO Reward\")\n",
+    "    if len(ys) > 5:\n",
+    "        window = max(3, len(ys) // 10)\n",
+    "        smoothed = [sum(ys[max(0, i-window):i+1]) / len(ys[max(0, i-window):i+1]) for i in range(len(ys))]\n",
+    "        ax_reward.plot(xs[:len(smoothed)], smoothed, color=\"#ea4335\", linewidth=2, linestyle=\"--\", label=f\"Smoothed (window={window})\")\n",
     "    if std_col in reward_df.columns:\n",
     "        std = reward_df[std_col].fillna(0).astype(float).to_numpy()\n",
+    "        ax_reward.fill_between(xs, ys - std, ys + std, color=\"#4285f4\", alpha=0.12)\n",
     "else:\n",
+    "    ax_reward.text(0.5, 0.5, 'No logged reward data available.', transform=ax_reward.transAxes, ha='center', va='center')\n",
+    "ax_reward.set_xlabel('Training Step', fontsize=12)\n",
+    "ax_reward.set_ylabel('Reward', fontsize=12)\n",
+    "ax_reward.set_title('GridMind-RL GRPO Training - Reward Curve', fontsize=14, fontweight='bold')\n",
+    "ax_reward.legend()\n",
+    "ax_reward.grid(True, alpha=0.3)\n",
+    "fig_reward.tight_layout()\n",
+    "fig_reward.savefig(reward_curve_path, dpi=100)\n",
     "plt.close(fig_reward)\n",
     "\n",
     "# Reference-style simple plots from trainer.state.log_history.\n",
     "    print(\"No loss logs found; skipped plots/loss_curve.png\")\n",
     "\n",
     "# Separate before/after comparison graph for quick judge inspection.\n",
+    "fig2, ax2 = plt.subplots(figsize=(10, 5))\n",
+    "x = np.arange(len(tasks))\n",
+    "w = 0.35\n",
+    "ax2.bar(x - w/2, heuristic_vals, w, label='Heuristic Baseline', color=\"#58a6ff\", alpha=0.9)\n",
+    "ax2.bar(x + w/2, trained_vals, w, label='Trained LLM (GRPO)', color=\"#3fb950\", alpha=0.9)\n",
     "ax2.set_xticks(x)\n",
+    "ax2.set_xticklabels(task_labels)\n",
     "ax2.set_ylim(0, 1.05)\n",
+    "ax2.set_ylabel('Grade Score')\n",
+    "ax2.set_title('Before/After Policy Score Comparison', fontweight='bold')\n",
+    "ax2.legend()\n",
+    "ax2.grid(axis='y', alpha=0.3)\n",
+    "fig2.tight_layout()\n",
     "comparison_path = 'results/gridmind_before_after_comparison.png'\n",
+    "fig2.savefig(comparison_path, dpi=100)\n",
     "plt.close(fig2)\n",
     "\n",
     "print(f\"Saved training reward curve to {reward_curve_path}\")\n",
     "print(f\"Saved simple reward curve to {simple_reward_curve_path}\")\n",
     "if simple_loss_curve_path:\n",
     "    \"training_metrics_table\": training_metrics_path,\n",
     "    \"training_metrics_display_table\": training_metrics_display_path if 'training_metrics_display_path' in globals() else None,\n",
     "    \"graphs\": {\n",
+    "        \"dashboard\": None,\n",
     "        \"training_reward_curve\": reward_curve_path,\n",
     "        \"simple_reward_curve\": simple_reward_curve_path,\n",
     "        \"simple_loss_curve\": simple_loss_curve_path,\n",