maxxie114 Claude Sonnet 4.6 committed on
Commit
cb80a59
·
1 Parent(s): a85a0ef

Add clean training summary cell to GRPO notebook

Browse files

Adds a compact 5-column table (step, reward, reward_std, completion_len, kl)
alongside the full 18-column TRL output, with printed milestones for judges.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. notebooks/grpo_training.ipynb +9 -1
notebooks/grpo_training.ipynb CHANGED
@@ -1685,6 +1685,14 @@
1685
  "print(\"Training complete.\")"
1686
  ]
1687
  },
 
 
 
 
 
 
 
 
1688
  {
1689
  "cell_type": "code",
1690
  "execution_count": 15,
@@ -1902,4 +1910,4 @@
1902
  },
1903
  "nbformat": 4,
1904
  "nbformat_minor": 5
1905
- }
 
1685
  "print(\"Training complete.\")"
1686
  ]
1687
  },
1688
+ {
1689
+ "cell_type": "code",
1690
+ "id": "riivpk3u3g",
1691
+ "source": "# ── 9b. Training summary (key metrics only) ───────────────────────────────────\n# The full TRL table has 18 columns — this shows just the signal that matters.\nimport pandas as pd\n\n_training_log = [\n (5, 0.040704, -0.070000, 0.106792, 218.9, 0.000364),\n (10, -0.021271, -0.027500, 0.065828, 165.2, 0.015877),\n (15, 0.034168, -0.002500, 0.030774, 177.3, 0.041233),\n (20, 0.020938, 0.017500, 0.070535, 155.0, 0.107568),\n (25, -0.002000, 0.032500, 0.010774, 144.4, 0.185280),\n (30, 0.032288, 0.035000, 0.030000, 126.8, 40.920438),\n (35, 0.000408, 0.050000, 0.000000, 119.8, 0.413161),\n (40, 0.019699, 0.010000, 0.073805, 123.3, 0.522195),\n (45, 0.027926, 0.040000, 0.030000, 105.8, 0.467126),\n (50, -0.014295, 0.037500, 0.032080, 106.9, 0.732997),\n (55, -0.096492, 0.062500, 0.048410, 147.2, 0.239230),\n (60, 0.051567, 0.047500, 0.060302, 136.3, 0.238321),\n (65, -0.102409, 0.085000, 0.064889, 159.2, 0.194201),\n (70, 0.012411, 0.030000, 0.118153, 166.8, 0.361337),\n (75, -0.074794, 0.112500, 0.048854, 167.3, 0.289481),\n (80, 0.069143, 0.070000, 0.110079, 205.4, 0.336399),\n (85, -0.037919, 0.087500, 0.092213, 162.5, 0.595924),\n (90, -0.000692, 0.107500, 0.052016, 162.6, 0.304314),\n (95, -0.059527, 0.105000, 0.078040, 144.7, 0.313946),\n (100, -0.035350, 0.122500, 0.055000, 146.1, 0.643397),\n (105, -0.009459, 0.110000, 0.080000, 152.1, 0.379797),\n (110, -0.013286, 0.115000, 0.070000, 146.0, 0.729111),\n (115, -0.002842, 0.130000, 0.033665, 139.1, 0.538091),\n (120, -0.004330, 0.142500, 0.015000, 137.4, 1.426445),\n (125, 0.062069, 0.102500, 0.065415, 157.5, 0.665843),\n (130, 0.049791, 0.112500, 0.075000, 158.5, 0.518018),\n (135, -0.025211, 0.130000, 0.050000, 147.1, 0.384765),\n (140, 0.000373, 0.150000, 0.000000, 153.1, 0.372639),\n (145, -0.001576, 0.062500, 0.115829, 147.8, 0.396434),\n (150, -0.020348, 0.077500, 0.085829, 167.6, 0.303009),\n (155, -0.022019, 0.150000, 0.015774, 162.6, 0.395963),\n (160, -0.121579, 0.127500, 0.045000, 140.9, 0.583458),\n (165, 0.029664, 0.115000, 0.070000, 159.6, 0.443787),\n (170, 0.116292, 0.132500, 0.035000, 167.1, 0.346723),\n (175, 0.000281, 0.150000, 0.000000, 154.7, 0.280489),\n (180, -0.004622, 0.132500, 0.035000, 138.6, 0.325211),\n (185, -0.100962, 0.135000, 0.040000, 173.6, 0.391806),\n (190, 0.000338, 0.150000, 0.000000, 146.2, 0.339465),\n (195, 0.000396, 0.150000, 0.000000, 151.1, 0.389999),\n (200, -0.011246, 0.105000, 0.090000, 140.8, 1.234660),\n]\n\ndf_log = pd.DataFrame(_training_log,\n columns=[\"step\", \"loss\", \"reward\", \"reward_std\", \"completion_len\", \"kl\"])\n\n# Key milestones\nprint(\"=== GRPO Training Summary (Qwen3.5-0.8B, 200 steps) ===\\n\")\nprint(df_log[[\"step\", \"reward\", \"reward_std\", \"completion_len\", \"kl\"]].to_string(index=False))\nprint(f\"\\nStart reward (step 5): {df_log.iloc[0]['reward']:+.4f}\")\nprint(f\"Final reward (step 200): {df_log.iloc[-1]['reward']:+.4f}\")\nprint(f\"Peak reward : {df_log['reward'].max():+.4f} (step {int(df_log.loc[df_log['reward'].idxmax(), 'step'])})\")\nprint(f\"Avg reward (last 20 steps): {df_log.tail(20)['reward'].mean():+.4f}\")\nprint(f\"Completion length: {df_log.iloc[0]['completion_len']:.0f} → {df_log.iloc[-1]['completion_len']:.0f} tokens\")",
1692
+ "metadata": {},
1693
+ "execution_count": null,
1694
+ "outputs": []
1695
+ },
1696
  {
1697
  "cell_type": "code",
1698
  "execution_count": 15,
 
1910
  },
1911
  "nbformat": 4,
1912
  "nbformat_minor": 5
1913
+ }