Spaces:

ayushozha
/

replicalab

Running

maxxie114 Claude Sonnet 4.6 committed on Mar 8

Commit

cb80a59

1 Parent(s): a85a0ef

Add clean training summary cell to GRPO notebook

Adds a compact 5-column table (step, reward, reward_std, completion_len, kl)
alongside the full 18-column TRL output, with printed milestones for judges.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

notebooks/grpo_training.ipynb +9 -1

notebooks/grpo_training.ipynb CHANGED Viewed

@@ -1685,6 +1685,14 @@
     "print(\"Training complete.\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 15,
@@ -1902,4 +1910,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}

     "print(\"Training complete.\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "id": "riivpk3u3g",
+   "source": "# ── 9b. Training summary (key metrics only) ───────────────────────────────────\n# The full TRL table has 18 columns — this shows just the signal that matters.\nimport pandas as pd\n\n_training_log = [\n    (5,   0.040704,  -0.070000, 0.106792, 218.9, 0.000364),\n    (10,  -0.021271, -0.027500, 0.065828, 165.2, 0.015877),\n    (15,  0.034168,  -0.002500, 0.030774, 177.3, 0.041233),\n    (20,  0.020938,   0.017500, 0.070535, 155.0, 0.107568),\n    (25,  -0.002000,  0.032500, 0.010774, 144.4, 0.185280),\n    (30,  0.032288,   0.035000, 0.030000, 126.8, 40.920438),\n    (35,  0.000408,   0.050000, 0.000000, 119.8, 0.413161),\n    (40,  0.019699,   0.010000, 0.073805, 123.3, 0.522195),\n    (45,  0.027926,   0.040000, 0.030000, 105.8, 0.467126),\n    (50,  -0.014295,  0.037500, 0.032080, 106.9, 0.732997),\n    (55,  -0.096492,  0.062500, 0.048410, 147.2, 0.239230),\n    (60,  0.051567,   0.047500, 0.060302, 136.3, 0.238321),\n    (65,  -0.102409,  0.085000, 0.064889, 159.2, 0.194201),\n    (70,  0.012411,   0.030000, 0.118153, 166.8, 0.361337),\n    (75,  -0.074794,  0.112500, 0.048854, 167.3, 0.289481),\n    (80,  0.069143,   0.070000, 0.110079, 205.4, 0.336399),\n    (85,  -0.037919,  0.087500, 0.092213, 162.5, 0.595924),\n    (90,  -0.000692,  0.107500, 0.052016, 162.6, 0.304314),\n    (95,  -0.059527,  0.105000, 0.078040, 144.7, 0.313946),\n    (100, -0.035350,  0.122500, 0.055000, 146.1, 0.643397),\n    (105, -0.009459,  0.110000, 0.080000, 152.1, 0.379797),\n    (110, -0.013286,  0.115000, 0.070000, 146.0, 0.729111),\n    (115, -0.002842,  0.130000, 0.033665, 139.1, 0.538091),\n    (120, -0.004330,  0.142500, 0.015000, 137.4, 1.426445),\n    (125,  0.062069,  0.102500, 0.065415, 157.5, 0.665843),\n    (130,  0.049791,  0.112500, 0.075000, 158.5, 0.518018),\n    (135, -0.025211,  0.130000, 0.050000, 147.1, 0.384765),\n    (140,  0.000373,  0.150000, 0.000000, 153.1, 0.372639),\n    (145, -0.001576,  0.062500, 0.115829, 147.8, 0.396434),\n    (150, 
-0.020348,  0.077500, 0.085829, 167.6, 0.303009),\n    (155, -0.022019,  0.150000, 0.015774, 162.6, 0.395963),\n    (160, -0.121579,  0.127500, 0.045000, 140.9, 0.583458),\n    (165,  0.029664,  0.115000, 0.070000, 159.6, 0.443787),\n    (170,  0.116292,  0.132500, 0.035000, 167.1, 0.346723),\n    (175,  0.000281,  0.150000, 0.000000, 154.7, 0.280489),\n    (180, -0.004622,  0.132500, 0.035000, 138.6, 0.325211),\n    (185, -0.100962,  0.135000, 0.040000, 173.6, 0.391806),\n    (190,  0.000338,  0.150000, 0.000000, 146.2, 0.339465),\n    (195,  0.000396,  0.150000, 0.000000, 151.1, 0.389999),\n    (200, -0.011246,  0.105000, 0.090000, 140.8, 1.234660),\n]\n\ndf_log = pd.DataFrame(_training_log,\n    columns=[\"step\", \"loss\", \"reward\", \"reward_std\", \"completion_len\", \"kl\"])\n\n# Key milestones\nprint(\"=== GRPO Training Summary (Qwen3.5-0.8B, 200 steps) ===\\n\")\nprint(df_log[[\"step\", \"reward\", \"reward_std\", \"completion_len\", \"kl\"]].to_string(index=False))\nprint(f\"\\nStart  reward (step   5): {df_log.iloc[0]['reward']:+.4f}\")\nprint(f\"Final  reward (step 200): {df_log.iloc[-1]['reward']:+.4f}\")\nprint(f\"Peak   reward           : {df_log['reward'].max():+.4f}  (step {int(df_log.loc[df_log['reward'].idxmax(), 'step'])})\")\nprint(f\"Avg reward (last 20 steps): {df_log.tail(20)['reward'].mean():+.4f}\")\nprint(f\"Completion length: {df_log.iloc[0]['completion_len']:.0f} → {df_log.iloc[-1]['completion_len']:.0f} tokens\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
   {
    "cell_type": "code",
    "execution_count": 15,
  },
  "nbformat": 4,
  "nbformat_minor": 5
+}