{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 399.8,
"eval_steps": 10,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 606.59375,
"epoch": 0.8,
"grad_norm": 1.3200632494048739,
"kl": 0.0,
"learning_rate": 5e-08,
"loss": 0.043,
"reward": 11.62500011920929,
"reward_std": 5.327881373465061,
"rewards/accuracy_reward_staging": 0.9671875108033419,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 622.75,
"epoch": 1.8,
"grad_norm": 1.245967192961327,
"kl": 0.0,
"learning_rate": 1e-07,
"loss": 0.0103,
"reward": 11.717187702655792,
"reward_std": 5.550888277590275,
"rewards/accuracy_reward_staging": 0.973281254991889,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 562.15625,
"epoch": 2.8,
"grad_norm": 1.3598583264581219,
"kl": 0.0012388229370117188,
"learning_rate": 1.5e-07,
"loss": 0.0066,
"reward": 11.326562702655792,
"reward_std": 4.338181830942631,
"rewards/accuracy_reward_staging": 0.935781279578805,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 633.84375,
"epoch": 3.8,
"grad_norm": 1.2847021449800877,
"kl": 0.001153707504272461,
"learning_rate": 2e-07,
"loss": 0.0018,
"reward": 11.992187529802322,
"reward_std": 5.119553402066231,
"rewards/accuracy_reward_staging": 1.0039062658324838,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 617.796875,
"epoch": 4.8,
"grad_norm": 1.3189665452513488,
"kl": 0.0013086795806884766,
"learning_rate": 2.5e-07,
"loss": 0.085,
"reward": 10.701562702655792,
"reward_std": 5.677764259278774,
"rewards/accuracy_reward_staging": 0.8795312605798244,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 610.96875,
"epoch": 5.8,
"grad_norm": 1.2985032171428867,
"kl": 0.001140594482421875,
"learning_rate": 3e-07,
"loss": -0.0194,
"reward": 11.771875321865082,
"reward_std": 4.783194027841091,
"rewards/accuracy_reward_staging": 0.9771875143051147,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 601.046875,
"epoch": 6.8,
"grad_norm": 1.3341628850896736,
"kl": 0.0011951923370361328,
"learning_rate": 3.5e-07,
"loss": -0.0596,
"reward": 10.256250113248825,
"reward_std": 4.867078542709351,
"rewards/accuracy_reward_staging": 0.827187517657876,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 606.59375,
"epoch": 7.8,
"grad_norm": 1.285368092332121,
"kl": 0.0010988712310791016,
"learning_rate": 4e-07,
"loss": -0.0081,
"reward": 9.478125274181366,
"reward_std": 3.7967969875317067,
"rewards/accuracy_reward_staging": 0.7493750108405948,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.984375,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 622.796875,
"epoch": 8.8,
"grad_norm": 1.2619218024838643,
"kl": 0.0011816024780273438,
"learning_rate": 4.5e-07,
"loss": 0.022,
"reward": 9.204687714576721,
"reward_std": 4.6997692584991455,
"rewards/accuracy_reward_staging": 0.7235937705263495,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 9
},
{
"epoch": 9.8,
"grad_norm": 1.3090135878224105,
"learning_rate": 5e-07,
"loss": 0.0552,
"step": 10
},
{
"epoch": 9.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 598.6,
"eval_kl": 0.001386260986328125,
"eval_loss": -0.007845744490623474,
"eval_reward": 10.647500252723693,
"eval_reward_std": 5.141342180967331,
"eval_rewards/accuracy_reward_staging": 0.8647500067949295,
"eval_rewards/format_reward": 1.0,
"eval_rewards/format_reward_staging": 1.0,
"eval_runtime": 128.7421,
"eval_samples_per_second": 0.155,
"eval_steps_per_second": 0.039,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 616.1640625,
"epoch": 10.8,
"grad_norm": 1.2259380562812106,
"kl": 0.0011686086654663086,
"learning_rate": 5.5e-07,
"loss": 0.021,
"reward": 10.721875354647636,
"reward_std": 4.659499041736126,
"rewards/accuracy_reward_staging": 0.8792187599465251,
"rewards/format_reward": 0.9609375,
"rewards/format_reward_staging": 0.96875,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 614.578125,
"epoch": 11.8,
"grad_norm": 1.1553077299419614,
"kl": 0.0011510848999023438,
"learning_rate": 6e-07,
"loss": -0.025,
"reward": 10.503125250339508,
"reward_std": 5.1386475414037704,
"rewards/accuracy_reward_staging": 0.8581250142306089,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.96875,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 612.90625,
"epoch": 12.8,
"grad_norm": 1.3576949395163465,
"kl": 0.0011174678802490234,
"learning_rate": 6.5e-07,
"loss": -0.0006,
"reward": 10.643750101327896,
"reward_std": 4.954892493784428,
"rewards/accuracy_reward_staging": 0.8737500086426735,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 628.53125,
"epoch": 13.8,
"grad_norm": 1.2887380340653378,
"kl": 0.0011186599731445312,
"learning_rate": 7e-07,
"loss": -0.0706,
"reward": 8.992187768220901,
"reward_std": 4.08132154494524,
"rewards/accuracy_reward_staging": 0.7023437591269612,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 655.21875,
"epoch": 14.8,
"grad_norm": 1.3384872721535677,
"kl": 0.0011413097381591797,
"learning_rate": 7.5e-07,
"loss": -0.0176,
"reward": 10.270312935113907,
"reward_std": 5.108375668525696,
"rewards/accuracy_reward_staging": 0.837968748062849,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 620.21875,
"epoch": 15.8,
"grad_norm": 1.3078665462402237,
"kl": 0.0012836456298828125,
"learning_rate": 8e-07,
"loss": -0.0292,
"reward": 10.915625095367432,
"reward_std": 5.460881970822811,
"rewards/accuracy_reward_staging": 0.8946875166147947,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 619.3125,
"epoch": 16.8,
"grad_norm": 1.3598193439821062,
"kl": 0.0014238357543945312,
"learning_rate": 8.499999999999999e-07,
"loss": 0.008,
"reward": 10.659375220537186,
"reward_std": 5.2481329292058945,
"rewards/accuracy_reward_staging": 0.8690625205636024,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 649.84375,
"epoch": 17.8,
"grad_norm": 1.3496406035053183,
"kl": 0.0015878677368164062,
"learning_rate": 9e-07,
"loss": 0.0445,
"reward": 10.225000023841858,
"reward_std": 4.735325090587139,
"rewards/accuracy_reward_staging": 0.8225000277161598,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 572.265625,
"epoch": 18.8,
"grad_norm": 1.3727510323236323,
"kl": 0.0017957687377929688,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0497,
"reward": 9.406250178813934,
"reward_std": 4.422982223331928,
"rewards/accuracy_reward_staging": 0.7437500189989805,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 19
},
{
"epoch": 19.8,
"grad_norm": 1.2508053733094389,
"learning_rate": 1e-06,
"loss": -0.0193,
"step": 20
},
{
"epoch": 19.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 644.85,
"eval_kl": 0.0017383575439453125,
"eval_loss": 0.013418617658317089,
"eval_reward": 12.118750143051148,
"eval_reward_std": 5.42993243932724,
"eval_rewards/accuracy_reward_staging": 1.0181250065565108,
"eval_rewards/format_reward": 0.9375,
"eval_rewards/format_reward_staging": 1.0,
"eval_runtime": 138.2951,
"eval_samples_per_second": 0.145,
"eval_steps_per_second": 0.036,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 618.1875,
"epoch": 20.8,
"grad_norm": 1.2769652330197667,
"kl": 0.0017538070678710938,
"learning_rate": 1.05e-06,
"loss": 0.0169,
"reward": 10.289843887090683,
"reward_std": 4.1728136613965034,
"rewards/accuracy_reward_staging": 0.8297656457871199,
"rewards/format_reward": 0.9921875,
"rewards/format_reward_staging": 1.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 586.828125,
"epoch": 21.8,
"grad_norm": 1.2550140515179407,
"kl": 0.0023107528686523438,
"learning_rate": 1.1e-06,
"loss": 0.0203,
"reward": 10.453125178813934,
"reward_std": 5.255412273108959,
"rewards/accuracy_reward_staging": 0.8484374992549419,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 609.78125,
"epoch": 22.8,
"grad_norm": 1.1649225522797726,
"kl": 0.0023360252380371094,
"learning_rate": 1.1499999999999998e-06,
"loss": 0.0301,
"reward": 10.517187863588333,
"reward_std": 4.380151428282261,
"rewards/accuracy_reward_staging": 0.8532812558114529,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 640.1875,
"epoch": 23.8,
"grad_norm": 1.2148463960481974,
"kl": 0.0026197433471679688,
"learning_rate": 1.2e-06,
"loss": 0.0016,
"reward": 10.815625190734863,
"reward_std": 4.644554391503334,
"rewards/accuracy_reward_staging": 0.8893750132992864,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 605.65625,
"epoch": 24.8,
"grad_norm": 1.2403401257649775,
"kl": 0.0026373863220214844,
"learning_rate": 1.2499999999999999e-06,
"loss": 0.0521,
"reward": 10.245312750339508,
"reward_std": 4.28605642169714,
"rewards/accuracy_reward_staging": 0.8323437552899122,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 598.609375,
"epoch": 25.8,
"grad_norm": 1.3147177916400645,
"kl": 0.0031595230102539062,
"learning_rate": 1.3e-06,
"loss": 0.0414,
"reward": 10.621875256299973,
"reward_std": 5.236618235707283,
"rewards/accuracy_reward_staging": 0.8684375081211329,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 625.40625,
"epoch": 26.8,
"grad_norm": 1.2154796498346485,
"kl": 0.003124237060546875,
"learning_rate": 1.35e-06,
"loss": -0.0316,
"reward": 11.971875131130219,
"reward_std": 4.97715250402689,
"rewards/accuracy_reward_staging": 1.0034375116229057,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 669.484375,
"epoch": 27.8,
"grad_norm": 1.2295951327912775,
"kl": 0.004219532012939453,
"learning_rate": 1.4e-06,
"loss": 0.0328,
"reward": 10.487500160932541,
"reward_std": 4.095494709908962,
"rewards/accuracy_reward_staging": 0.8581250123679638,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 642.875,
"epoch": 28.8,
"grad_norm": 1.322550495751781,
"kl": 0.00588226318359375,
"learning_rate": 1.4499999999999999e-06,
"loss": 0.06,
"reward": 10.221875250339508,
"reward_std": 4.399149507284164,
"rewards/accuracy_reward_staging": 0.8268750105053186,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.96875,
"step": 29
},
{
"epoch": 29.8,
"grad_norm": 1.2977104318293018,
"learning_rate": 1.5e-06,
"loss": 0.0428,
"step": 30
},
{
"epoch": 29.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 615.9875,
"eval_kl": 0.005646514892578125,
"eval_loss": -0.0016943871742114425,
"eval_reward": 10.84500024318695,
"eval_reward_std": 4.436952286958695,
"eval_rewards/accuracy_reward_staging": 0.8932500079274177,
"eval_rewards/format_reward": 0.925,
"eval_rewards/format_reward_staging": 0.9875,
"eval_runtime": 140.4185,
"eval_samples_per_second": 0.142,
"eval_steps_per_second": 0.036,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 602.734375,
"epoch": 30.8,
"grad_norm": 1.2642898418324884,
"kl": 0.006984233856201172,
"learning_rate": 1.55e-06,
"loss": -0.0342,
"reward": 10.63593776524067,
"reward_std": 4.29075089469552,
"rewards/accuracy_reward_staging": 0.865937520749867,
"rewards/format_reward": 0.9765625,
"rewards/format_reward_staging": 1.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 562.6875,
"epoch": 31.8,
"grad_norm": 1.3685850711260872,
"kl": 0.0068912506103515625,
"learning_rate": 1.6e-06,
"loss": 0.0138,
"reward": 10.934375166893005,
"reward_std": 5.272631503641605,
"rewards/accuracy_reward_staging": 0.8996875174343586,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.953125,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 612.984375,
"epoch": 32.8,
"grad_norm": 1.2202427650766963,
"kl": 0.00725555419921875,
"learning_rate": 1.6499999999999999e-06,
"loss": 0.0152,
"reward": 11.089062750339508,
"reward_std": 5.698997817933559,
"rewards/accuracy_reward_staging": 0.9151562694460154,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 637.859375,
"epoch": 33.8,
"grad_norm": 1.238938352076857,
"kl": 0.00881195068359375,
"learning_rate": 1.6999999999999998e-06,
"loss": -0.0175,
"reward": 10.721875220537186,
"reward_std": 4.769842825829983,
"rewards/accuracy_reward_staging": 0.8784375097602606,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 659.125,
"epoch": 34.8,
"grad_norm": 1.2817921497759655,
"kl": 0.009099960327148438,
"learning_rate": 1.75e-06,
"loss": -0.0368,
"reward": 11.725000083446503,
"reward_std": 5.023090958595276,
"rewards/accuracy_reward_staging": 0.9881250187754631,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.9375,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 627.671875,
"epoch": 35.8,
"grad_norm": 1.2427123786378356,
"kl": 0.00952911376953125,
"learning_rate": 1.8e-06,
"loss": 0.0311,
"reward": 10.635937720537186,
"reward_std": 4.218031510710716,
"rewards/accuracy_reward_staging": 0.8682812862098217,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 606.109375,
"epoch": 36.8,
"grad_norm": 1.2742370115467172,
"kl": 0.012310028076171875,
"learning_rate": 1.85e-06,
"loss": 0.0282,
"reward": 11.673437774181366,
"reward_std": 3.9488272815942764,
"rewards/accuracy_reward_staging": 0.9720312729477882,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 1.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 651.328125,
"epoch": 37.8,
"grad_norm": 1.1726321374661877,
"kl": 0.012788772583007812,
"learning_rate": 1.8999999999999998e-06,
"loss": 0.0163,
"reward": 10.459375202655792,
"reward_std": 4.296897903084755,
"rewards/accuracy_reward_staging": 0.852187518030405,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 617.984375,
"epoch": 38.8,
"grad_norm": 1.2806374425181677,
"kl": 0.017009735107421875,
"learning_rate": 1.95e-06,
"loss": -0.0057,
"reward": 10.478125214576721,
"reward_std": 4.7161330208182335,
"rewards/accuracy_reward_staging": 0.8525000084191561,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 1.0,
"step": 39
},
{
"epoch": 39.8,
"grad_norm": 1.362092657080068,
"learning_rate": 2e-06,
"loss": -0.0269,
"step": 40
},
{
"epoch": 39.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 603.9875,
"eval_kl": 0.0187530517578125,
"eval_loss": 0.0036536618135869503,
"eval_reward": 11.611250162124634,
"eval_reward_std": 5.22377119064331,
"eval_rewards/accuracy_reward_staging": 0.9636250138282776,
"eval_rewards/format_reward": 0.975,
"eval_rewards/format_reward_staging": 1.0,
"eval_runtime": 132.1448,
"eval_samples_per_second": 0.151,
"eval_steps_per_second": 0.038,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 634.0,
"epoch": 40.8,
"grad_norm": 1.2640957862139377,
"kl": 0.018407821655273438,
"learning_rate": 1.999961923064171e-06,
"loss": -0.0634,
"reward": 11.232812687754631,
"reward_std": 5.111758019775152,
"rewards/accuracy_reward_staging": 0.9334375113248825,
"rewards/format_reward": 0.9296875,
"rewards/format_reward_staging": 0.96875,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 635.765625,
"epoch": 41.8,
"grad_norm": 1.268606166424927,
"kl": 0.01999664306640625,
"learning_rate": 1.9998476951563913e-06,
"loss": 0.0283,
"reward": 12.45000010728836,
"reward_std": 4.9740989953279495,
"rewards/accuracy_reward_staging": 1.0450000185519457,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 594.1875,
"epoch": 42.8,
"grad_norm": 1.3740022650965131,
"kl": 0.020366668701171875,
"learning_rate": 1.999657324975557e-06,
"loss": -0.0149,
"reward": 11.234375149011612,
"reward_std": 5.008681446313858,
"rewards/accuracy_reward_staging": 0.9250000100582838,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.984375,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 622.640625,
"epoch": 43.8,
"grad_norm": 1.2026028918578335,
"kl": 0.02169036865234375,
"learning_rate": 1.9993908270190957e-06,
"loss": 0.0018,
"reward": 11.873437762260437,
"reward_std": 4.005194254219532,
"rewards/accuracy_reward_staging": 0.9920312594622374,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 637.0625,
"epoch": 44.8,
"grad_norm": 1.281104821624419,
"kl": 0.022716522216796875,
"learning_rate": 1.999048221581858e-06,
"loss": 0.0455,
"reward": 11.17031267285347,
"reward_std": 4.456828519701958,
"rewards/accuracy_reward_staging": 0.9201562497764826,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 652.5,
"epoch": 45.8,
"grad_norm": 1.3614150864044823,
"kl": 0.0223236083984375,
"learning_rate": 1.998629534754574e-06,
"loss": 0.0205,
"reward": 10.348437696695328,
"reward_std": 4.60803659260273,
"rewards/accuracy_reward_staging": 0.8426562454551458,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 626.359375,
"epoch": 46.8,
"grad_norm": 1.245324461252277,
"kl": 0.0242156982421875,
"learning_rate": 1.9981347984218667e-06,
"loss": 0.0056,
"reward": 13.950000077486038,
"reward_std": 5.2063538283109665,
"rewards/accuracy_reward_staging": 1.2059375159442425,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 593.96875,
"epoch": 47.8,
"grad_norm": 1.3313171277926144,
"kl": 0.02922821044921875,
"learning_rate": 1.997564050259824e-06,
"loss": 0.0449,
"reward": 12.739062905311584,
"reward_std": 4.18791925907135,
"rewards/accuracy_reward_staging": 1.075468771159649,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 580.21875,
"epoch": 48.8,
"grad_norm": 1.3629349178288628,
"kl": 0.03372955322265625,
"learning_rate": 1.996917333733128e-06,
"loss": 0.0174,
"reward": 11.535937696695328,
"reward_std": 3.948319137096405,
"rewards/accuracy_reward_staging": 0.9614062653854489,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.96875,
"step": 49
},
{
"epoch": 49.8,
"grad_norm": 1.210177545645759,
"learning_rate": 1.9961946980917456e-06,
"loss": 0.0148,
"step": 50
},
{
"epoch": 49.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 681.0125,
"eval_kl": 0.0308837890625,
"eval_loss": 0.084346242249012,
"eval_reward": 12.700000238418578,
"eval_reward_std": 4.6470307350158695,
"eval_rewards/accuracy_reward_staging": 1.0800000175833702,
"eval_rewards/format_reward": 0.9375,
"eval_rewards/format_reward_staging": 0.9625,
"eval_runtime": 184.5613,
"eval_samples_per_second": 0.108,
"eval_steps_per_second": 0.027,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 635.09375,
"epoch": 50.8,
"grad_norm": 1.3066463171281288,
"kl": 0.033504486083984375,
"learning_rate": 1.9953961983671786e-06,
"loss": 0.026,
"reward": 11.925000175833702,
"reward_std": 5.033060222864151,
"rewards/accuracy_reward_staging": 0.9956250190734863,
"rewards/format_reward": 0.9765625,
"rewards/format_reward_staging": 0.9921875,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 633.59375,
"epoch": 51.8,
"grad_norm": 1.4900533150827393,
"kl": 0.040374755859375,
"learning_rate": 1.994521895368273e-06,
"loss": 0.0244,
"reward": 11.634375095367432,
"reward_std": 4.955964259803295,
"rewards/accuracy_reward_staging": 0.9665625263005495,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 642.84375,
"epoch": 52.8,
"grad_norm": 1.3018535470423898,
"kl": 0.0359039306640625,
"learning_rate": 1.9935718556765874e-06,
"loss": 0.0176,
"reward": 13.220312714576721,
"reward_std": 6.300683185458183,
"rewards/accuracy_reward_staging": 1.1282812524586916,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 625.59375,
"epoch": 53.8,
"grad_norm": 1.1747250249399175,
"kl": 0.0344696044921875,
"learning_rate": 1.992546151641322e-06,
"loss": 0.0279,
"reward": 12.729687660932541,
"reward_std": 3.8526118397712708,
"rewards/accuracy_reward_staging": 1.0854687504470348,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 616.421875,
"epoch": 54.8,
"grad_norm": 1.4193243535352469,
"kl": 0.0395355224609375,
"learning_rate": 1.9914448613738106e-06,
"loss": 0.0064,
"reward": 13.129687666893005,
"reward_std": 5.78121767193079,
"rewards/accuracy_reward_staging": 1.1223437692970037,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 593.609375,
"epoch": 55.8,
"grad_norm": 1.3523573855480968,
"kl": 0.04061126708984375,
"learning_rate": 1.99026806874157e-06,
"loss": 0.0142,
"reward": 13.071875303983688,
"reward_std": 6.487003266811371,
"rewards/accuracy_reward_staging": 1.1118750274181366,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 606.71875,
"epoch": 56.8,
"grad_norm": 1.295861523746061,
"kl": 0.04427337646484375,
"learning_rate": 1.989015863361917e-06,
"loss": 0.0139,
"reward": 13.359375238418579,
"reward_std": 6.011465005576611,
"rewards/accuracy_reward_staging": 1.1375000104308128,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 632.140625,
"epoch": 57.8,
"grad_norm": 1.358415139567083,
"kl": 0.0454254150390625,
"learning_rate": 1.9876883405951377e-06,
"loss": 0.0217,
"reward": 12.626562535762787,
"reward_std": 4.465259864926338,
"rewards/accuracy_reward_staging": 1.0657812729477882,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 598.5,
"epoch": 58.8,
"grad_norm": 1.3223218232049598,
"kl": 0.046783447265625,
"learning_rate": 1.986285601537231e-06,
"loss": 0.0257,
"reward": 12.182812690734863,
"reward_std": 6.196883611381054,
"rewards/accuracy_reward_staging": 1.0292187482118607,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 59
},
{
"epoch": 59.8,
"grad_norm": 1.2192909123573732,
"learning_rate": 1.984807753012208e-06,
"loss": 0.0537,
"step": 60
},
{
"epoch": 59.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 641.15,
"eval_kl": 0.04580078125,
"eval_loss": -0.05469979718327522,
"eval_reward": 13.055000233650208,
"eval_reward_std": 5.354214292764664,
"eval_rewards/accuracy_reward_staging": 1.1130000218749045,
"eval_rewards/format_reward": 0.9625,
"eval_rewards/format_reward_staging": 0.9625,
"eval_runtime": 152.2218,
"eval_samples_per_second": 0.131,
"eval_steps_per_second": 0.033,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 637.5546875,
"epoch": 60.8,
"grad_norm": 1.2273226448891168,
"kl": 0.046173095703125,
"learning_rate": 1.9832549075639547e-06,
"loss": -0.0281,
"reward": 12.067969009280205,
"reward_std": 5.051002878695726,
"rewards/accuracy_reward_staging": 1.0114843952469528,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 626.640625,
"epoch": 61.8,
"grad_norm": 1.2972851245460888,
"kl": 0.062713623046875,
"learning_rate": 1.981627183447664e-06,
"loss": 0.0389,
"reward": 12.120312690734863,
"reward_std": 3.9745979011058807,
"rewards/accuracy_reward_staging": 1.0229687616229057,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 669.671875,
"epoch": 62.8,
"grad_norm": 1.433658602561295,
"kl": 0.05224609375,
"learning_rate": 1.9799247046208295e-06,
"loss": 0.0548,
"reward": 13.040625154972076,
"reward_std": 5.295711062848568,
"rewards/accuracy_reward_staging": 1.1118750162422657,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.96875,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 618.8125,
"epoch": 63.8,
"grad_norm": 1.2699526702646982,
"kl": 0.0532379150390625,
"learning_rate": 1.9781476007338054e-06,
"loss": 0.0405,
"reward": 12.978125303983688,
"reward_std": 5.858064912259579,
"rewards/accuracy_reward_staging": 1.100937519222498,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 622.25,
"epoch": 64.8,
"grad_norm": 1.2887790245964135,
"kl": 0.0642242431640625,
"learning_rate": 1.976296007119933e-06,
"loss": 0.0309,
"reward": 13.806250274181366,
"reward_std": 5.3699341379106045,
"rewards/accuracy_reward_staging": 1.1900000236928463,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 626.75,
"epoch": 65.8,
"grad_norm": 1.2658566344825595,
"kl": 0.0558319091796875,
"learning_rate": 1.9743700647852355e-06,
"loss": -0.0173,
"reward": 12.885937601327896,
"reward_std": 5.130606591701508,
"rewards/accuracy_reward_staging": 1.0917187500745058,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 603.84375,
"epoch": 66.8,
"grad_norm": 1.2421251352393117,
"kl": 0.0610198974609375,
"learning_rate": 1.9723699203976766e-06,
"loss": 0.0279,
"reward": 12.806250214576721,
"reward_std": 4.9495924392249435,
"rewards/accuracy_reward_staging": 1.0806250050663948,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 610.90625,
"epoch": 67.8,
"grad_norm": 1.2743636487669978,
"kl": 0.07061767578125,
"learning_rate": 1.9702957262759963e-06,
"loss": 0.0096,
"reward": 12.381250366568565,
"reward_std": 4.895804196596146,
"rewards/accuracy_reward_staging": 1.0443749986588955,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 578.734375,
"epoch": 68.8,
"grad_norm": 19.774598485500963,
"kl": 0.20587158203125,
"learning_rate": 1.9681476403781077e-06,
"loss": 0.0525,
"reward": 13.853125363588333,
"reward_std": 4.133783400058746,
"rewards/accuracy_reward_staging": 1.1900000181049109,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 69
},
{
"epoch": 69.8,
"grad_norm": 1.2630722978620754,
"learning_rate": 1.965925826289068e-06,
"loss": -0.05,
"step": 70
},
{
"epoch": 69.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 644.375,
"eval_kl": 0.06834716796875,
"eval_loss": 0.05038486793637276,
"eval_reward": 13.201250171661377,
"eval_reward_std": 5.853598284721374,
"eval_rewards/accuracy_reward_staging": 1.1376250192523003,
"eval_rewards/format_reward": 0.8875,
"eval_rewards/format_reward_staging": 0.9375,
"eval_runtime": 152.6561,
"eval_samples_per_second": 0.131,
"eval_steps_per_second": 0.033,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 617.921875,
"epoch": 70.8,
"grad_norm": 1.3309524076426662,
"kl": 0.0696563720703125,
"learning_rate": 1.963630453208623e-06,
"loss": 0.0613,
"reward": 13.469531431794167,
"reward_std": 5.184730686247349,
"rewards/accuracy_reward_staging": 1.1508593847975135,
"rewards/format_reward": 0.9765625,
"rewards/format_reward_staging": 0.984375,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 673.703125,
"epoch": 71.8,
"grad_norm": 1.3030149455750017,
"kl": 0.0673980712890625,
"learning_rate": 1.9612616959383188e-06,
"loss": 0.0537,
"reward": 14.193750262260437,
"reward_std": 4.9487489387393,
"rewards/accuracy_reward_staging": 1.2318750098347664,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 646.5625,
"epoch": 72.8,
"grad_norm": 1.2069285825783527,
"kl": 0.0667724609375,
"learning_rate": 1.958819734868193e-06,
"loss": 0.0452,
"reward": 13.829687654972076,
"reward_std": 4.151405468583107,
"rewards/accuracy_reward_staging": 1.1845312640070915,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 675.25,
"epoch": 73.8,
"grad_norm": 12.30456605431996,
"kl": 0.144561767578125,
"learning_rate": 1.9563047559630356e-06,
"loss": 0.0238,
"reward": 15.618750125169754,
"reward_std": 5.085296101868153,
"rewards/accuracy_reward_staging": 1.3665625024586916,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 627.640625,
"epoch": 74.8,
"grad_norm": 435.4454579019399,
"kl": 2.6427154541015625,
"learning_rate": 1.953716950748227e-06,
"loss": 0.1019,
"reward": 14.799999952316284,
"reward_std": 4.404626630246639,
"rewards/accuracy_reward_staging": 1.2831250056624413,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 591.453125,
"epoch": 75.8,
"grad_norm": 1.5728290641754585,
"kl": 0.0894012451171875,
"learning_rate": 1.9510565162951534e-06,
"loss": 0.0154,
"reward": 14.187500238418579,
"reward_std": 4.889563232660294,
"rewards/accuracy_reward_staging": 1.2265625279396772,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 618.296875,
"epoch": 76.8,
"grad_norm": 2.0802354331013824,
"kl": 0.113311767578125,
"learning_rate": 1.948323655206199e-06,
"loss": 0.031,
"reward": 14.854687571525574,
"reward_std": 4.158232696354389,
"rewards/accuracy_reward_staging": 1.2885937616229057,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 652.46875,
"epoch": 77.8,
"grad_norm": 1.388498981753556,
"kl": 0.086456298828125,
"learning_rate": 1.945518575599317e-06,
"loss": 0.0197,
"reward": 14.209375262260437,
"reward_std": 5.606824688613415,
"rewards/accuracy_reward_staging": 1.2318750135600567,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 604.703125,
"epoch": 78.8,
"grad_norm": 1.2332136505931421,
"kl": 0.080596923828125,
"learning_rate": 1.9426414910921785e-06,
"loss": 0.0222,
"reward": 14.440624922513962,
"reward_std": 4.533382810652256,
"rewards/accuracy_reward_staging": 1.247187502682209,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 79
},
{
"epoch": 79.8,
"grad_norm": 1.2077461933733504,
"learning_rate": 1.9396926207859082e-06,
"loss": 0.0239,
"step": 80
},
{
"epoch": 79.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 592.1125,
"eval_kl": 0.077099609375,
"eval_loss": 0.03515242785215378,
"eval_reward": 14.468750166893006,
"eval_reward_std": 4.639398086071014,
"eval_rewards/accuracy_reward_staging": 1.2506250083446502,
"eval_rewards/format_reward": 0.9875,
"eval_rewards/format_reward_staging": 0.975,
"eval_runtime": 133.8538,
"eval_samples_per_second": 0.149,
"eval_steps_per_second": 0.037,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 644.046875,
"epoch": 80.8,
"grad_norm": 1.673636310153387,
"kl": 0.07987213134765625,
"learning_rate": 1.9366721892483973e-06,
"loss": 0.0333,
"reward": 14.308594018220901,
"reward_std": 3.8065029891440645,
"rewards/accuracy_reward_staging": 1.233984388411045,
"rewards/format_reward": 0.9765625,
"rewards/format_reward_staging": 0.9921875,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 644.65625,
"epoch": 81.8,
"grad_norm": 1.2475722094071593,
"kl": 0.07330322265625,
"learning_rate": 1.9335804264972015e-06,
"loss": -0.0326,
"reward": 12.793750315904617,
"reward_std": 4.888111189007759,
"rewards/accuracy_reward_staging": 1.080937497317791,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 600.578125,
"epoch": 82.8,
"grad_norm": 1.1629304999796195,
"kl": 0.0791168212890625,
"learning_rate": 1.9304175679820247e-06,
"loss": 0.0416,
"reward": 12.628125369548798,
"reward_std": 4.35176794230938,
"rewards/accuracy_reward_staging": 1.0706250071525574,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 557.15625,
"epoch": 83.8,
"grad_norm": 1.439668786268354,
"kl": 0.084381103515625,
"learning_rate": 1.9271838545667875e-06,
"loss": 0.0776,
"reward": 12.189062774181366,
"reward_std": 4.2925035655498505,
"rewards/accuracy_reward_staging": 1.0235937517136335,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 594.0625,
"epoch": 84.8,
"grad_norm": 1.3254589188157202,
"kl": 0.0747528076171875,
"learning_rate": 1.9238795325112867e-06,
"loss": 0.0619,
"reward": 15.909375101327896,
"reward_std": 5.054341539740562,
"rewards/accuracy_reward_staging": 1.3925000187009573,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 595.28125,
"epoch": 85.8,
"grad_norm": 1.2110318535178632,
"kl": 0.0708465576171875,
"learning_rate": 1.9205048534524403e-06,
"loss": 0.0277,
"reward": 13.023437589406967,
"reward_std": 4.805721327662468,
"rewards/accuracy_reward_staging": 1.1070312801748514,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 613.15625,
"epoch": 86.8,
"grad_norm": 1.3901357478872436,
"kl": 0.0800323486328125,
"learning_rate": 1.917060074385124e-06,
"loss": 0.0142,
"reward": 14.523437321186066,
"reward_std": 4.984271876513958,
"rewards/accuracy_reward_staging": 1.257031261920929,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 603.078125,
"epoch": 87.8,
"grad_norm": 1.338201226896483,
"kl": 0.0803375244140625,
"learning_rate": 1.9135454576426007e-06,
"loss": 0.0304,
"reward": 14.829687654972076,
"reward_std": 6.223069980740547,
"rewards/accuracy_reward_staging": 1.2939062640070915,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.953125,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 622.9375,
"epoch": 88.8,
"grad_norm": 2.521080873125213,
"kl": 0.1446990966796875,
"learning_rate": 1.909961270876543e-06,
"loss": 0.0222,
"reward": 14.043749928474426,
"reward_std": 5.054637104272842,
"rewards/accuracy_reward_staging": 1.2106250263750553,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 89
},
{
"epoch": 89.8,
"grad_norm": 1.20377714499055,
"learning_rate": 1.9063077870366499e-06,
"loss": 0.048,
"step": 90
},
{
"epoch": 89.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 648.2375,
"eval_kl": 0.078076171875,
"eval_loss": 0.07422037422657013,
"eval_reward": 14.006250190734864,
"eval_reward_std": 4.856718444824219,
"eval_rewards/accuracy_reward_staging": 1.2081250160932542,
"eval_rewards/format_reward": 0.95,
"eval_rewards/format_reward_staging": 0.975,
"eval_runtime": 145.5911,
"eval_samples_per_second": 0.137,
"eval_steps_per_second": 0.034,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 618.7265625,
"epoch": 90.8,
"grad_norm": 1.151785130187796,
"kl": 0.0818328857421875,
"learning_rate": 1.9025852843498606e-06,
"loss": -0.0394,
"reward": 14.92031255364418,
"reward_std": 5.041832268238068,
"rewards/accuracy_reward_staging": 1.2959375083446503,
"rewards/format_reward": 0.9765625,
"rewards/format_reward_staging": 0.984375,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 650.859375,
"epoch": 91.8,
"grad_norm": 1.0808576276629485,
"kl": 0.0752716064453125,
"learning_rate": 1.8987940462991669e-06,
"loss": 0.0142,
"reward": 13.932812631130219,
"reward_std": 5.106485404074192,
"rewards/accuracy_reward_staging": 1.1948437727987766,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 653.96875,
"epoch": 92.8,
"grad_norm": 1.1761749428173782,
"kl": 0.0748443603515625,
"learning_rate": 1.894934361602025e-06,
"loss": 0.061,
"reward": 13.756249964237213,
"reward_std": 4.668057285249233,
"rewards/accuracy_reward_staging": 1.185000006109476,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 643.140625,
"epoch": 93.8,
"grad_norm": 1.1826779106192027,
"kl": 0.08154296875,
"learning_rate": 1.8910065241883678e-06,
"loss": 0.0113,
"reward": 15.871875286102295,
"reward_std": 5.009915418922901,
"rewards/accuracy_reward_staging": 1.3918750323355198,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 640.71875,
"epoch": 94.8,
"grad_norm": 1.220691618497292,
"kl": 0.087249755859375,
"learning_rate": 1.8870108331782216e-06,
"loss": 0.0364,
"reward": 15.275000274181366,
"reward_std": 5.186372339725494,
"rewards/accuracy_reward_staging": 1.3353125043213367,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.953125,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 638.9375,
"epoch": 95.8,
"grad_norm": 1.262017317075815,
"kl": 0.092254638671875,
"learning_rate": 1.8829475928589268e-06,
"loss": 0.0112,
"reward": 11.41250017285347,
"reward_std": 5.790772080421448,
"rewards/accuracy_reward_staging": 0.9459375087171793,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 650.140625,
"epoch": 96.8,
"grad_norm": 1.555339606868479,
"kl": 0.088897705078125,
"learning_rate": 1.8788171126619653e-06,
"loss": 0.0167,
"reward": 13.132812529802322,
"reward_std": 5.363104030489922,
"rewards/accuracy_reward_staging": 1.1226562578231096,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 685.40625,
"epoch": 97.8,
"grad_norm": 1.1855081064145405,
"kl": 0.0860595703125,
"learning_rate": 1.8746197071393956e-06,
"loss": -0.0101,
"reward": 14.51250010728836,
"reward_std": 5.773593910038471,
"rewards/accuracy_reward_staging": 1.2575000114738941,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 632.890625,
"epoch": 98.8,
"grad_norm": 1.2154159531082918,
"kl": 0.092254638671875,
"learning_rate": 1.8703556959398995e-06,
"loss": 0.0378,
"reward": 13.392187654972076,
"reward_std": 5.218963444232941,
"rewards/accuracy_reward_staging": 1.1423437520861626,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 99
},
{
"epoch": 99.8,
"grad_norm": 1.2652701809880356,
"learning_rate": 1.8660254037844386e-06,
"loss": 0.0158,
"step": 100
},
{
"epoch": 99.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 596.325,
"eval_kl": 0.170458984375,
"eval_loss": 0.038617830723524094,
"eval_reward": 13.322500276565552,
"eval_reward_std": 4.48788731098175,
"eval_rewards/accuracy_reward_staging": 1.1335000172257423,
"eval_rewards/format_reward": 0.9875,
"eval_rewards/format_reward_staging": 1.0,
"eval_runtime": 137.9647,
"eval_samples_per_second": 0.145,
"eval_steps_per_second": 0.036,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 625.546875,
"epoch": 100.8,
"grad_norm": 1.158984032803143,
"kl": 0.0985870361328125,
"learning_rate": 1.8616291604415257e-06,
"loss": 0.0013,
"reward": 13.735937476158142,
"reward_std": 5.272327609360218,
"rewards/accuracy_reward_staging": 1.174375013448298,
"rewards/format_reward": 0.9921875,
"rewards/format_reward_staging": 1.0,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 610.265625,
"epoch": 101.8,
"grad_norm": 1.1842728351019054,
"kl": 0.095916748046875,
"learning_rate": 1.8571673007021123e-06,
"loss": 0.0156,
"reward": 15.284374952316284,
"reward_std": 4.7574154287576675,
"rewards/accuracy_reward_staging": 1.330000001937151,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 640.171875,
"epoch": 102.8,
"grad_norm": 1.2460167784481468,
"kl": 0.09686279296875,
"learning_rate": 1.852640164354092e-06,
"loss": -0.0181,
"reward": 14.125000357627869,
"reward_std": 4.396180346608162,
"rewards/accuracy_reward_staging": 1.2203124929219484,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.96875,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 632.03125,
"epoch": 103.8,
"grad_norm": 1.2093960835662856,
"kl": 0.096099853515625,
"learning_rate": 1.8480480961564257e-06,
"loss": -0.0125,
"reward": 15.537500262260437,
"reward_std": 4.605620868504047,
"rewards/accuracy_reward_staging": 1.3553125225007534,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 626.1875,
"epoch": 104.8,
"grad_norm": 16.01421157394276,
"kl": 0.21142578125,
"learning_rate": 1.8433914458128857e-06,
"loss": 0.0579,
"reward": 13.903125166893005,
"reward_std": 6.153450347483158,
"rewards/accuracy_reward_staging": 1.1950000002980232,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 646.25,
"epoch": 105.8,
"grad_norm": 1.3180024354800577,
"kl": 0.10040283203125,
"learning_rate": 1.838670567945424e-06,
"loss": 0.068,
"reward": 13.818750381469727,
"reward_std": 5.729592114686966,
"rewards/accuracy_reward_staging": 1.189687505364418,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.984375,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 607.875,
"epoch": 106.8,
"grad_norm": 2.9653782850707398,
"kl": 0.14129638671875,
"learning_rate": 1.833885822067168e-06,
"loss": 0.0536,
"reward": 15.423437595367432,
"reward_std": 6.023381091654301,
"rewards/accuracy_reward_staging": 1.3454687595367432,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 644.546875,
"epoch": 107.8,
"grad_norm": 1.4820796355726387,
"kl": 0.09906005859375,
"learning_rate": 1.8290375725550415e-06,
"loss": 0.097,
"reward": 14.023437529802322,
"reward_std": 6.225167877972126,
"rewards/accuracy_reward_staging": 1.2054687719792128,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 585.25,
"epoch": 108.8,
"grad_norm": 1.653023129371335,
"kl": 0.114166259765625,
"learning_rate": 1.8241261886220154e-06,
"loss": 0.0807,
"reward": 14.356250017881393,
"reward_std": 5.447244621813297,
"rewards/accuracy_reward_staging": 1.2371875084936619,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 1.0,
"step": 109
},
{
"epoch": 109.8,
"grad_norm": 1.2519961806572524,
"learning_rate": 1.8191520442889917e-06,
"loss": 0.0487,
"step": 110
},
{
"epoch": 109.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 607.9625,
"eval_kl": 0.098583984375,
"eval_loss": 0.015465144999325275,
"eval_reward": 14.107500028610229,
"eval_reward_std": 5.255042427778244,
"eval_rewards/accuracy_reward_staging": 1.2157500088214874,
"eval_rewards/format_reward": 0.9625,
"eval_rewards/format_reward_staging": 0.9875,
"eval_runtime": 141.9444,
"eval_samples_per_second": 0.141,
"eval_steps_per_second": 0.035,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 595.484375,
"epoch": 110.8,
"grad_norm": 1.2811054745764274,
"kl": 0.112457275390625,
"learning_rate": 1.8141155183563193e-06,
"loss": 0.0085,
"reward": 14.736718833446503,
"reward_std": 5.505104329437017,
"rewards/accuracy_reward_staging": 1.2760156439617276,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.9921875,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 589.59375,
"epoch": 111.8,
"grad_norm": 1.2357687466322653,
"kl": 0.1142578125,
"learning_rate": 1.8090169943749474e-06,
"loss": -0.0064,
"reward": 13.935937494039536,
"reward_std": 4.747958414256573,
"rewards/accuracy_reward_staging": 1.195156266912818,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.984375,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 604.953125,
"epoch": 112.8,
"grad_norm": 1.4229471167256136,
"kl": 0.14752197265625,
"learning_rate": 1.803856860617217e-06,
"loss": 0.0281,
"reward": 13.79843756556511,
"reward_std": 5.385790981352329,
"rewards/accuracy_reward_staging": 1.1845312714576721,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.96875,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 611.328125,
"epoch": 113.8,
"grad_norm": 24.422765952176345,
"kl": 0.330352783203125,
"learning_rate": 1.7986355100472927e-06,
"loss": 0.0504,
"reward": 14.092187762260437,
"reward_std": 5.095987647771835,
"rewards/accuracy_reward_staging": 1.2107812762260437,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.984375,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 567.453125,
"epoch": 114.8,
"grad_norm": 5.413230038864145,
"kl": 0.17083740234375,
"learning_rate": 1.7933533402912351e-06,
"loss": 0.0736,
"reward": 13.521874904632568,
"reward_std": 4.76890967041254,
"rewards/accuracy_reward_staging": 1.1584375277161598,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 589.703125,
"epoch": 115.8,
"grad_norm": 1.9014947184206379,
"kl": 0.17449951171875,
"learning_rate": 1.7880107536067217e-06,
"loss": 0.0221,
"reward": 12.971875101327896,
"reward_std": 5.6128582283854485,
"rewards/accuracy_reward_staging": 1.1065624989569187,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 593.71875,
"epoch": 116.8,
"grad_norm": 5.015821786557117,
"kl": 0.3743896484375,
"learning_rate": 1.7826081568524138e-06,
"loss": 0.0006,
"reward": 14.44375005364418,
"reward_std": 5.561103023588657,
"rewards/accuracy_reward_staging": 1.247500006109476,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 541.34375,
"epoch": 117.8,
"grad_norm": 2.862584895806164,
"kl": 0.159332275390625,
"learning_rate": 1.7771459614569707e-06,
"loss": -0.0004,
"reward": 13.903124928474426,
"reward_std": 4.877812258899212,
"rewards/accuracy_reward_staging": 1.1950000133365393,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.953125,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 552.203125,
"epoch": 118.8,
"grad_norm": 1.2925016251536252,
"kl": 0.136077880859375,
"learning_rate": 1.7716245833877198e-06,
"loss": 0.0437,
"reward": 14.979687631130219,
"reward_std": 5.167752608656883,
"rewards/accuracy_reward_staging": 1.3042187504470348,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 119
},
{
"epoch": 119.8,
"grad_norm": 1.268704610469971,
"learning_rate": 1.766044443118978e-06,
"loss": 0.0381,
"step": 120
},
{
"epoch": 119.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 613.875,
"eval_kl": 0.1421142578125,
"eval_loss": 0.042472995817661285,
"eval_reward": 14.473749923706055,
"eval_reward_std": 4.867543476819992,
"eval_rewards/accuracy_reward_staging": 1.2536250054836273,
"eval_rewards/format_reward": 0.95,
"eval_rewards/format_reward_staging": 0.9875,
"eval_runtime": 140.6458,
"eval_samples_per_second": 0.142,
"eval_steps_per_second": 0.036,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 632.1796875,
"epoch": 120.8,
"grad_norm": 1.232115329282542,
"kl": 0.1269073486328125,
"learning_rate": 1.760405965600031e-06,
"loss": 0.0088,
"reward": 14.570312559604645,
"reward_std": 5.063761539757252,
"rewards/accuracy_reward_staging": 1.262500001117587,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.9765625,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 631.671875,
"epoch": 121.8,
"grad_norm": 1.3808041357571472,
"kl": 0.14166259765625,
"learning_rate": 1.7547095802227721e-06,
"loss": 0.0158,
"reward": 13.695312559604645,
"reward_std": 5.737492188811302,
"rewards/accuracy_reward_staging": 1.1742187663912773,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.953125,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 635.234375,
"epoch": 122.8,
"grad_norm": 1.247279598577896,
"kl": 0.12890625,
"learning_rate": 1.7489557207890023e-06,
"loss": 0.0455,
"reward": 12.946875035762787,
"reward_std": 4.728762552142143,
"rewards/accuracy_reward_staging": 1.100937519222498,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.96875,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 578.359375,
"epoch": 123.8,
"grad_norm": 1.3012048204012603,
"kl": 0.14288330078125,
"learning_rate": 1.743144825477394e-06,
"loss": 0.0237,
"reward": 14.440625131130219,
"reward_std": 5.539311669766903,
"rewards/accuracy_reward_staging": 1.2471875082701445,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 618.046875,
"epoch": 124.8,
"grad_norm": 1.3325921669841398,
"kl": 0.1463623046875,
"learning_rate": 1.737277336810124e-06,
"loss": 0.0604,
"reward": 12.951562702655792,
"reward_std": 3.5424299761652946,
"rewards/accuracy_reward_staging": 1.1014062613248825,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 600.65625,
"epoch": 125.8,
"grad_norm": 1.692915618997029,
"kl": 0.158050537109375,
"learning_rate": 1.7313537016191704e-06,
"loss": 0.0314,
"reward": 15.428125023841858,
"reward_std": 5.270965404808521,
"rewards/accuracy_reward_staging": 1.3459375128149986,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 616.15625,
"epoch": 126.8,
"grad_norm": 1.1976264555767795,
"kl": 0.1231689453125,
"learning_rate": 1.7253743710122874e-06,
"loss": -0.0521,
"reward": 15.524999856948853,
"reward_std": 4.5880225002765656,
"rewards/accuracy_reward_staging": 1.3556249924004078,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 636.265625,
"epoch": 127.8,
"grad_norm": 1.5736176734740694,
"kl": 0.14605712890625,
"learning_rate": 1.719339800338651e-06,
"loss": -0.0053,
"reward": 13.117187559604645,
"reward_std": 4.341549597680569,
"rewards/accuracy_reward_staging": 1.1273437663912773,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.953125,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 646.5,
"epoch": 128.8,
"grad_norm": 2.8516785822154693,
"kl": 0.19573974609375,
"learning_rate": 1.7132504491541815e-06,
"loss": -0.0363,
"reward": 13.059375166893005,
"reward_std": 4.451074585318565,
"rewards/accuracy_reward_staging": 1.1121875122189522,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 129
},
{
"epoch": 129.8,
"grad_norm": 1.330165854405246,
"learning_rate": 1.7071067811865474e-06,
"loss": 0.0375,
"step": 130
},
{
"epoch": 129.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 637.2125,
"eval_kl": 0.1220947265625,
"eval_loss": 0.0286346934735775,
"eval_reward": 14.201250052452087,
"eval_reward_std": 5.366847103834152,
"eval_rewards/accuracy_reward_staging": 1.2251250058412553,
"eval_rewards/format_reward": 0.9625,
"eval_rewards/format_reward_staging": 0.9875,
"eval_runtime": 144.6284,
"eval_samples_per_second": 0.138,
"eval_steps_per_second": 0.035,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 584.5859375,
"epoch": 130.8,
"grad_norm": 1.307152401229373,
"kl": 0.1373443603515625,
"learning_rate": 1.7009092642998508e-06,
"loss": -0.0099,
"reward": 13.771875083446503,
"reward_std": 5.035757407546043,
"rewards/accuracy_reward_staging": 1.1803125254809856,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 645.203125,
"epoch": 131.8,
"grad_norm": 1.2708814145835345,
"kl": 0.129486083984375,
"learning_rate": 1.6946583704589972e-06,
"loss": 0.0643,
"reward": 12.623437643051147,
"reward_std": 5.542073376476765,
"rewards/accuracy_reward_staging": 1.0654687564820051,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.96875,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 600.453125,
"epoch": 132.8,
"grad_norm": 1.293321609591619,
"kl": 0.135955810546875,
"learning_rate": 1.6883545756937537e-06,
"loss": -0.0023,
"reward": 13.604687690734863,
"reward_std": 5.285826697945595,
"rewards/accuracy_reward_staging": 1.163593776524067,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 622.75,
"epoch": 133.8,
"grad_norm": 1.25404998096487,
"kl": 0.123260498046875,
"learning_rate": 1.6819983600624985e-06,
"loss": -0.0007,
"reward": 13.665625005960464,
"reward_std": 5.407533464720473,
"rewards/accuracy_reward_staging": 1.1743750125169754,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.96875,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 543.90625,
"epoch": 134.8,
"grad_norm": 1.4502660038061592,
"kl": 0.146087646484375,
"learning_rate": 1.6755902076156602e-06,
"loss": 0.0388,
"reward": 13.443750143051147,
"reward_std": 6.642620116472244,
"rewards/accuracy_reward_staging": 1.1490625031292439,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.96875,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 652.296875,
"epoch": 135.8,
"grad_norm": 1.2487523655371195,
"kl": 0.114105224609375,
"learning_rate": 1.669130606358858e-06,
"loss": 0.0277,
"reward": 14.48281255364418,
"reward_std": 5.782497301697731,
"rewards/accuracy_reward_staging": 1.2576562836766243,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.9375,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 624.75,
"epoch": 136.8,
"grad_norm": 1.3402764758528873,
"kl": 0.133087158203125,
"learning_rate": 1.6626200482157374e-06,
"loss": 0.0515,
"reward": 12.865624994039536,
"reward_std": 4.773457303643227,
"rewards/accuracy_reward_staging": 1.0959375277161598,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.953125,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 581.875,
"epoch": 137.8,
"grad_norm": 1.2528044000047398,
"kl": 0.121673583984375,
"learning_rate": 1.6560590289905071e-06,
"loss": 0.0046,
"reward": 14.821875035762787,
"reward_std": 5.201211467385292,
"rewards/accuracy_reward_staging": 1.2837499883025885,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 0.984375,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 611.15625,
"epoch": 138.8,
"grad_norm": 1.6131227618811717,
"kl": 0.118804931640625,
"learning_rate": 1.6494480483301835e-06,
"loss": 0.0186,
"reward": 14.048437416553497,
"reward_std": 4.07040748000145,
"rewards/accuracy_reward_staging": 1.2157812491059303,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 139
},
{
"epoch": 139.8,
"grad_norm": 1.348569648877767,
"learning_rate": 1.6427876096865393e-06,
"loss": 0.0477,
"step": 140
},
{
"epoch": 139.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 630.375,
"eval_kl": 0.135888671875,
"eval_loss": 0.021134015172719955,
"eval_reward": 13.206250190734863,
"eval_reward_std": 4.864674496650696,
"eval_rewards/accuracy_reward_staging": 1.1293750241398812,
"eval_rewards/format_reward": 0.95,
"eval_rewards/format_reward_staging": 0.9625,
"eval_runtime": 154.772,
"eval_samples_per_second": 0.129,
"eval_steps_per_second": 0.032,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 667.46875,
"epoch": 140.8,
"grad_norm": 1.3550397685799231,
"kl": 0.1424407958984375,
"learning_rate": 1.6360782202777638e-06,
"loss": 0.022,
"reward": 13.272656485438347,
"reward_std": 5.271991036832333,
"rewards/accuracy_reward_staging": 1.135859395377338,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.9765625,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 581.15625,
"epoch": 141.8,
"grad_norm": 1.274351995849414,
"kl": 0.1263427734375,
"learning_rate": 1.6293203910498375e-06,
"loss": 0.0166,
"reward": 13.015625029802322,
"reward_std": 5.490728512406349,
"rewards/accuracy_reward_staging": 1.1062499918043613,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 1.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 581.9375,
"epoch": 142.8,
"grad_norm": 1.4551759286658215,
"kl": 0.131011962890625,
"learning_rate": 1.6225146366376196e-06,
"loss": 0.0763,
"reward": 13.957812488079071,
"reward_std": 5.092830486595631,
"rewards/accuracy_reward_staging": 1.2004687525331974,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 616.640625,
"epoch": 143.8,
"grad_norm": 1.246097895354405,
"kl": 0.108062744140625,
"learning_rate": 1.615661475325658e-06,
"loss": 0.0785,
"reward": 13.190625131130219,
"reward_std": 4.567478813230991,
"rewards/accuracy_reward_staging": 1.1268750242888927,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.953125,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 641.4375,
"epoch": 144.8,
"grad_norm": 1.3492898310297994,
"kl": 0.137298583984375,
"learning_rate": 1.6087614290087205e-06,
"loss": 0.0778,
"reward": 13.187500149011612,
"reward_std": 5.05699796974659,
"rewards/accuracy_reward_staging": 1.1281250044703484,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.984375,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 615.421875,
"epoch": 145.8,
"grad_norm": 1.2389330703315027,
"kl": 0.11785888671875,
"learning_rate": 1.6018150231520484e-06,
"loss": -0.0105,
"reward": 13.951562494039536,
"reward_std": 4.975374720990658,
"rewards/accuracy_reward_staging": 1.1951562650501728,
"rewards/format_reward": 1.0,
"rewards/format_reward_staging": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 628.953125,
"epoch": 146.8,
"grad_norm": 1.1484661979759183,
"kl": 0.10455322265625,
"learning_rate": 1.5948227867513413e-06,
"loss": 0.0083,
"reward": 13.078125178813934,
"reward_std": 4.869858503341675,
"rewards/accuracy_reward_staging": 1.117187526077032,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 644.734375,
"epoch": 147.8,
"grad_norm": 3.8787328503041603,
"kl": 0.148834228515625,
"learning_rate": 1.587785252292473e-06,
"loss": 0.0525,
"reward": 12.510937660932541,
"reward_std": 5.20218176394701,
"rewards/accuracy_reward_staging": 1.0557812713086605,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.984375,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 622.6875,
"epoch": 148.8,
"grad_norm": 1.2193760476315514,
"kl": 0.125579833984375,
"learning_rate": 1.5807029557109397e-06,
"loss": 0.0084,
"reward": 13.721875101327896,
"reward_std": 5.613097697496414,
"rewards/accuracy_reward_staging": 1.180000003427267,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.96875,
"step": 149
},
{
"epoch": 149.8,
"grad_norm": 1.3382966777749914,
"learning_rate": 1.573576436351046e-06,
"loss": 0.0283,
"step": 150
},
{
"epoch": 149.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 617.3625,
"eval_kl": 0.1303466796875,
"eval_loss": 0.013808819465339184,
"eval_reward": 13.037500023841858,
"eval_reward_std": 4.972468680143356,
"eval_rewards/accuracy_reward_staging": 1.1087500318884849,
"eval_rewards/format_reward": 0.9625,
"eval_rewards/format_reward_staging": 0.9875,
"eval_runtime": 145.0293,
"eval_samples_per_second": 0.138,
"eval_steps_per_second": 0.034,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 587.78125,
"epoch": 150.8,
"grad_norm": 1.288894243312563,
"kl": 0.124786376953125,
"learning_rate": 1.5664062369248328e-06,
"loss": 0.0259,
"reward": 14.903124868869781,
"reward_std": 6.114742249250412,
"rewards/accuracy_reward_staging": 1.2934375274926424,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 1.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 604.6875,
"epoch": 151.8,
"grad_norm": 1.3773815626279793,
"kl": 0.126007080078125,
"learning_rate": 1.5591929034707466e-06,
"loss": 0.0712,
"reward": 14.687500357627869,
"reward_std": 5.71131344884634,
"rewards/accuracy_reward_staging": 1.273437511175871,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.96875,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 615.921875,
"epoch": 152.8,
"grad_norm": 1.4485533003534696,
"kl": 0.135894775390625,
"learning_rate": 1.551936985312058e-06,
"loss": 0.0497,
"reward": 15.312499761581421,
"reward_std": 4.291183479130268,
"rewards/accuracy_reward_staging": 1.3562500244006515,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.921875,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 652.75,
"epoch": 153.8,
"grad_norm": 1.283755345470008,
"kl": 0.14154052734375,
"learning_rate": 1.544639035015027e-06,
"loss": 0.0349,
"reward": 13.10937511920929,
"reward_std": 5.929120138287544,
"rewards/accuracy_reward_staging": 1.1281250044703484,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.96875,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 579.21875,
"epoch": 154.8,
"grad_norm": 4.3223327812781625,
"kl": 0.21929931640625,
"learning_rate": 1.537299608346824e-06,
"loss": 0.0015,
"reward": 16.134375244379044,
"reward_std": 5.386517338454723,
"rewards/accuracy_reward_staging": 1.4243750181049109,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 616.15625,
"epoch": 155.8,
"grad_norm": 1.600841914035174,
"kl": 0.154449462890625,
"learning_rate": 1.5299192642332049e-06,
"loss": 0.0289,
"reward": 14.656250029802322,
"reward_std": 5.294310428202152,
"rewards/accuracy_reward_staging": 1.2781250141561031,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.984375,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 589.578125,
"epoch": 156.8,
"grad_norm": 1.4513044601818303,
"kl": 0.16229248046875,
"learning_rate": 1.5224985647159488e-06,
"loss": 0.0441,
"reward": 14.590624958276749,
"reward_std": 5.033867612481117,
"rewards/accuracy_reward_staging": 1.273124998435378,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9375,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 599.515625,
"epoch": 157.8,
"grad_norm": 1.3153063149563227,
"kl": 0.148529052734375,
"learning_rate": 1.5150380749100543e-06,
"loss": 0.0618,
"reward": 15.898437559604645,
"reward_std": 4.4645668268203735,
"rewards/accuracy_reward_staging": 1.4023437574505806,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 604.359375,
"epoch": 158.8,
"grad_norm": 8.479997272303532,
"kl": 0.165130615234375,
"learning_rate": 1.5075383629607041e-06,
"loss": 0.0777,
"reward": 13.142187595367432,
"reward_std": 5.132370471954346,
"rewards/accuracy_reward_staging": 1.132968744263053,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 159
},
{
"epoch": 159.8,
"grad_norm": 1.1917014388947642,
"learning_rate": 1.5e-06,
"loss": -0.0404,
"step": 160
},
{
"epoch": 159.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 638.1625,
"eval_kl": 0.14365234375,
"eval_loss": 0.08888934552669525,
"eval_reward": 14.796250009536744,
"eval_reward_std": 5.661263364553451,
"eval_rewards/accuracy_reward_staging": 1.295874996483326,
"eval_rewards/format_reward": 0.9,
"eval_rewards/format_reward_staging": 0.9375,
"eval_runtime": 146.8527,
"eval_samples_per_second": 0.136,
"eval_steps_per_second": 0.034,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 651.7421875,
"epoch": 160.8,
"grad_norm": 1.2387655968511657,
"kl": 0.1446075439453125,
"learning_rate": 1.4924235601034672e-06,
"loss": 0.0701,
"reward": 15.571094110608101,
"reward_std": 5.18467765673995,
"rewards/accuracy_reward_staging": 1.3766406429931521,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9609375,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 600.4375,
"epoch": 161.8,
"grad_norm": 1.229465314986415,
"kl": 0.13531494140625,
"learning_rate": 1.4848096202463372e-06,
"loss": -0.0057,
"reward": 16.17187523841858,
"reward_std": 4.812445372343063,
"rewards/accuracy_reward_staging": 1.4234375022351742,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 1.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 582.125,
"epoch": 162.8,
"grad_norm": 1.3529028258154412,
"kl": 0.1470947265625,
"learning_rate": 1.4771587602596083e-06,
"loss": 0.0891,
"reward": 15.801562637090683,
"reward_std": 4.80203927308321,
"rewards/accuracy_reward_staging": 1.392656246200204,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 611.609375,
"epoch": 163.8,
"grad_norm": 1.4068895541908286,
"kl": 0.15350341796875,
"learning_rate": 1.4694715627858908e-06,
"loss": 0.0591,
"reward": 14.951562643051147,
"reward_std": 5.22976279258728,
"rewards/accuracy_reward_staging": 1.3045312520116568,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 609.671875,
"epoch": 164.8,
"grad_norm": 1.2658845630248547,
"kl": 0.1463623046875,
"learning_rate": 1.461748613235034e-06,
"loss": 0.0266,
"reward": 14.03125,
"reward_std": 5.7937397211790085,
"rewards/accuracy_reward_staging": 1.215625025331974,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.953125,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 589.875,
"epoch": 165.8,
"grad_norm": 1.3344028785966706,
"kl": 0.1558837890625,
"learning_rate": 1.4539904997395467e-06,
"loss": 0.0711,
"reward": 15.390625029802322,
"reward_std": 4.8061781376600266,
"rewards/accuracy_reward_staging": 1.3515625335276127,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.984375,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 607.0625,
"epoch": 166.8,
"grad_norm": 1.6122384935377623,
"kl": 0.178619384765625,
"learning_rate": 1.4461978131098087e-06,
"loss": 0.0224,
"reward": 12.757812649011612,
"reward_std": 5.976896375417709,
"rewards/accuracy_reward_staging": 1.0867187604308128,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 555.21875,
"epoch": 167.8,
"grad_norm": 1.3011797765950504,
"kl": 0.15509033203125,
"learning_rate": 1.4383711467890773e-06,
"loss": -0.0096,
"reward": 16.72499978542328,
"reward_std": 4.908542029559612,
"rewards/accuracy_reward_staging": 1.4756250157952309,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.984375,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 619.546875,
"epoch": 168.8,
"grad_norm": 1.71128104516776,
"kl": 0.15618896484375,
"learning_rate": 1.430511096808295e-06,
"loss": 0.1091,
"reward": 15.999999672174454,
"reward_std": 4.338721185922623,
"rewards/accuracy_reward_staging": 1.4124999977648258,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 169
},
{
"epoch": 169.8,
"grad_norm": 1.406915756546695,
"learning_rate": 1.4226182617406994e-06,
"loss": 0.0365,
"step": 170
},
{
"epoch": 169.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 599.3375,
"eval_kl": 0.1652587890625,
"eval_loss": 0.05737342685461044,
"eval_reward": 15.347499918937682,
"eval_reward_std": 5.264538067579269,
"eval_rewards/accuracy_reward_staging": 1.346000000834465,
"eval_rewards/format_reward": 0.925,
"eval_rewards/format_reward_staging": 0.9625,
"eval_runtime": 128.6979,
"eval_samples_per_second": 0.155,
"eval_steps_per_second": 0.039,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 607.75,
"epoch": 170.8,
"grad_norm": 1.4282138543951368,
"kl": 0.17828369140625,
"learning_rate": 1.414693242656239e-06,
"loss": -0.0169,
"reward": 15.774218946695328,
"reward_std": 5.630698639899492,
"rewards/accuracy_reward_staging": 1.3899218812584877,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.953125,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 597.484375,
"epoch": 171.8,
"grad_norm": 2.842026235946327,
"kl": 0.22344970703125,
"learning_rate": 1.4067366430758004e-06,
"loss": 0.0117,
"reward": 13.404687702655792,
"reward_std": 5.611785896122456,
"rewards/accuracy_reward_staging": 1.1482812836766243,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.953125,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 593.34375,
"epoch": 172.8,
"grad_norm": 1.355496874052129,
"kl": 0.178131103515625,
"learning_rate": 1.3987490689252462e-06,
"loss": 0.0242,
"reward": 15.953125089406967,
"reward_std": 4.999604664742947,
"rewards/accuracy_reward_staging": 1.403124986216426,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 1.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 636.546875,
"epoch": 173.8,
"grad_norm": 1.2708001429966154,
"kl": 0.1480712890625,
"learning_rate": 1.3907311284892735e-06,
"loss": 0.0781,
"reward": 16.860937863588333,
"reward_std": 5.687329366803169,
"rewards/accuracy_reward_staging": 1.4970312751829624,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.96875,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 629.34375,
"epoch": 174.8,
"grad_norm": 1.2881497238833757,
"kl": 0.184661865234375,
"learning_rate": 1.3826834323650898e-06,
"loss": 0.0465,
"reward": 14.581250131130219,
"reward_std": 5.882216438651085,
"rewards/accuracy_reward_staging": 1.2831250075250864,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.875,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 636.1875,
"epoch": 175.8,
"grad_norm": 1.3087334876201022,
"kl": 0.180084228515625,
"learning_rate": 1.374606593415912e-06,
"loss": 0.0515,
"reward": 15.878125071525574,
"reward_std": 5.341188468039036,
"rewards/accuracy_reward_staging": 1.4081249758601189,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.9375,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 620.109375,
"epoch": 176.8,
"grad_norm": 1.4376210080253358,
"kl": 0.1929931640625,
"learning_rate": 1.3665012267242972e-06,
"loss": 0.0086,
"reward": 13.495312541723251,
"reward_std": 5.61242138594389,
"rewards/accuracy_reward_staging": 1.1635937709361315,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9375,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 626.03125,
"epoch": 177.8,
"grad_norm": 1772489.589709048,
"kl": 7520.199645996094,
"learning_rate": 1.3583679495453e-06,
"loss": 413.327,
"reward": 14.464062601327896,
"reward_std": 6.149559870362282,
"rewards/accuracy_reward_staging": 1.2729687709361315,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.875,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 622.34375,
"epoch": 178.8,
"grad_norm": 1.2715649881539406,
"kl": 0.173065185546875,
"learning_rate": 1.3502073812594674e-06,
"loss": -0.0255,
"reward": 16.45468744635582,
"reward_std": 5.025842607021332,
"rewards/accuracy_reward_staging": 1.4610937684774399,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 179
},
{
"epoch": 179.8,
"grad_norm": 1.3470338291497799,
"learning_rate": 1.3420201433256689e-06,
"loss": 0.0132,
"step": 180
},
{
"epoch": 179.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 644.8375,
"eval_kl": 0.1693603515625,
"eval_loss": 0.07019542157649994,
"eval_reward": 15.515000081062317,
"eval_reward_std": 4.510845869779587,
"eval_rewards/accuracy_reward_staging": 1.3702499970793725,
"eval_rewards/format_reward": 0.8625,
"eval_rewards/format_reward_staging": 0.95,
"eval_runtime": 140.9488,
"eval_samples_per_second": 0.142,
"eval_steps_per_second": 0.035,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 602.9765625,
"epoch": 180.8,
"grad_norm": 1.3491479753013993,
"kl": 0.183990478515625,
"learning_rate": 1.3338068592337708e-06,
"loss": 0.0611,
"reward": 15.047656297683716,
"reward_std": 5.386000510305166,
"rewards/accuracy_reward_staging": 1.3149218782782555,
"rewards/format_reward": 0.9453125,
"rewards/format_reward_staging": 0.953125,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 595.71875,
"epoch": 181.8,
"grad_norm": 1.3770681193765026,
"kl": 0.19049072265625,
"learning_rate": 1.3255681544571566e-06,
"loss": 0.0247,
"reward": 15.55312505364418,
"reward_std": 5.63801646232605,
"rewards/accuracy_reward_staging": 1.3615624941885471,
"rewards/format_reward": 0.953125,
"rewards/format_reward_staging": 0.984375,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 615.59375,
"epoch": 182.8,
"grad_norm": 1.4258618367759082,
"kl": 0.18341064453125,
"learning_rate": 1.3173046564050923e-06,
"loss": 0.0382,
"reward": 15.492187589406967,
"reward_std": 5.1280196234583855,
"rewards/accuracy_reward_staging": 1.3664062581956387,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9375,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 657.21875,
"epoch": 183.8,
"grad_norm": 1.4496706627719373,
"kl": 0.2099609375,
"learning_rate": 1.3090169943749473e-06,
"loss": -0.0093,
"reward": 15.481249988079071,
"reward_std": 4.195666573941708,
"rewards/accuracy_reward_staging": 1.360624998807907,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 643.625,
"epoch": 184.8,
"grad_norm": 12.416744832467893,
"kl": 0.35125732421875,
"learning_rate": 1.3007057995042729e-06,
"loss": 0.0552,
"reward": 16.317187398672104,
"reward_std": 5.490992607548833,
"rewards/accuracy_reward_staging": 1.4489062502980232,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.96875,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 623.515625,
"epoch": 185.8,
"grad_norm": 1.3589509266087427,
"kl": 0.20068359375,
"learning_rate": 1.2923717047227368e-06,
"loss": 0.0935,
"reward": 13.135937482118607,
"reward_std": 5.502887517213821,
"rewards/accuracy_reward_staging": 1.1276562362909317,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.953125,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 617.5625,
"epoch": 186.8,
"grad_norm": 5.614254453009779,
"kl": 0.2403564453125,
"learning_rate": 1.2840153447039228e-06,
"loss": 0.0561,
"reward": 14.834374755620956,
"reward_std": 6.0779377073049545,
"rewards/accuracy_reward_staging": 1.2990624997764826,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.953125,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 636.03125,
"epoch": 187.8,
"grad_norm": 1.3627377674239467,
"kl": 0.1864013671875,
"learning_rate": 1.275637355816999e-06,
"loss": 0.078,
"reward": 13.956250250339508,
"reward_std": 5.885370120406151,
"rewards/accuracy_reward_staging": 1.2112500164657831,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.921875,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 605.875,
"epoch": 188.8,
"grad_norm": 1.4102127840998477,
"kl": 0.1846923828125,
"learning_rate": 1.2672383760782567e-06,
"loss": 0.0346,
"reward": 14.612500011920929,
"reward_std": 6.431307382881641,
"rewards/accuracy_reward_staging": 1.2737499997019768,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.953125,
"step": 189
},
{
"epoch": 189.8,
"grad_norm": 1.2657153918632975,
"learning_rate": 1.2588190451025207e-06,
"loss": 0.0905,
"step": 190
},
{
"epoch": 189.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 639.3875,
"eval_kl": 0.183544921875,
"eval_loss": 0.09273408353328705,
"eval_reward": 14.913750052452087,
"eval_reward_std": 4.527625149488449,
"eval_rewards/accuracy_reward_staging": 1.312625017762184,
"eval_rewards/format_reward": 0.875,
"eval_rewards/format_reward_staging": 0.9125,
"eval_runtime": 149.5357,
"eval_samples_per_second": 0.134,
"eval_steps_per_second": 0.033,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 637.03125,
"epoch": 190.8,
"grad_norm": 1.5561907060681732,
"kl": 0.20361328125,
"learning_rate": 1.2503800040544414e-06,
"loss": 0.027,
"reward": 13.858593851327896,
"reward_std": 5.260060213506222,
"rewards/accuracy_reward_staging": 1.199921895749867,
"rewards/format_reward": 0.9140625,
"rewards/format_reward_staging": 0.9453125,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 614.578125,
"epoch": 191.8,
"grad_norm": 1.4112962880914386,
"kl": 0.173828125,
"learning_rate": 1.2419218955996676e-06,
"loss": 0.0162,
"reward": 15.854687571525574,
"reward_std": 4.764394700527191,
"rewards/accuracy_reward_staging": 1.3979687802493572,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.96875,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 628.078125,
"epoch": 192.8,
"grad_norm": 1.329564996171061,
"kl": 0.178253173828125,
"learning_rate": 1.2334453638559054e-06,
"loss": 0.0255,
"reward": 13.543750017881393,
"reward_std": 5.472193785011768,
"rewards/accuracy_reward_staging": 1.168437510728836,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9375,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 657.796875,
"epoch": 193.8,
"grad_norm": 1.8007713245581922,
"kl": 0.198944091796875,
"learning_rate": 1.2249510543438651e-06,
"loss": 0.0516,
"reward": 14.325000017881393,
"reward_std": 4.515300907194614,
"rewards/accuracy_reward_staging": 1.2465624995529652,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.953125,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 623.984375,
"epoch": 194.8,
"grad_norm": 1.2974682020385033,
"kl": 0.1806640625,
"learning_rate": 1.2164396139381029e-06,
"loss": 0.0383,
"reward": 14.945312559604645,
"reward_std": 5.147151567041874,
"rewards/accuracy_reward_staging": 1.308593761175871,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.9375,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 581.5625,
"epoch": 195.8,
"grad_norm": 1.260278322445417,
"kl": 0.18658447265625,
"learning_rate": 1.207911690817759e-06,
"loss": 0.0477,
"reward": 14.231250047683716,
"reward_std": 5.530913561582565,
"rewards/accuracy_reward_staging": 1.2418750207871199,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 604.90625,
"epoch": 196.8,
"grad_norm": 1.3096099803251504,
"kl": 0.1900634765625,
"learning_rate": 1.1993679344171972e-06,
"loss": 0.0251,
"reward": 15.50156244635582,
"reward_std": 5.51077751070261,
"rewards/accuracy_reward_staging": 1.3610937595367432,
"rewards/format_reward": 0.96875,
"rewards/format_reward_staging": 0.921875,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 605.453125,
"epoch": 197.8,
"grad_norm": 1.3586492398815606,
"kl": 0.189208984375,
"learning_rate": 1.1908089953765447e-06,
"loss": 0.0523,
"reward": 13.85781279206276,
"reward_std": 4.988245405256748,
"rewards/accuracy_reward_staging": 1.1998437773436308,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.953125,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 635.609375,
"epoch": 198.8,
"grad_norm": 1.340689220119322,
"kl": 0.17425537109375,
"learning_rate": 1.1822355254921476e-06,
"loss": 0.042,
"reward": 15.098437517881393,
"reward_std": 4.625658318400383,
"rewards/accuracy_reward_staging": 1.3145312629640102,
"rewards/format_reward": 0.984375,
"rewards/format_reward_staging": 0.96875,
"step": 199
},
{
"epoch": 199.8,
"grad_norm": 1.38676313511267,
"learning_rate": 1.1736481776669305e-06,
"loss": 0.0078,
"step": 200
},
{
"epoch": 199.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 617.9625,
"eval_kl": 0.227685546875,
"eval_loss": -0.011267063207924366,
"eval_reward": 15.479999876022339,
"eval_reward_std": 5.917063271999359,
"eval_rewards/accuracy_reward_staging": 1.358000010251999,
"eval_rewards/format_reward": 0.95,
"eval_rewards/format_reward_staging": 0.95,
"eval_runtime": 140.7854,
"eval_samples_per_second": 0.142,
"eval_steps_per_second": 0.036,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 619.3828125,
"epoch": 200.8,
"grad_norm": 1.2618291004150899,
"kl": 0.187408447265625,
"learning_rate": 1.1650476058606774e-06,
"loss": 0.0638,
"reward": 14.914843901991844,
"reward_std": 5.3753106370568275,
"rewards/accuracy_reward_staging": 1.3063281308859587,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9609375,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 661.421875,
"epoch": 201.8,
"grad_norm": 1.3037648155909478,
"kl": 0.1773681640625,
"learning_rate": 1.156434465040231e-06,
"loss": 0.089,
"reward": 14.482812568545341,
"reward_std": 5.621522009372711,
"rewards/accuracy_reward_staging": 1.263906242325902,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.90625,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 634.109375,
"epoch": 202.8,
"grad_norm": 7.276649221582544,
"kl": 0.24066162109375,
"learning_rate": 1.1478094111296109e-06,
"loss": 0.0672,
"reward": 14.378125011920929,
"reward_std": 5.686573512852192,
"rewards/accuracy_reward_staging": 1.2518749982118607,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.953125,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 630.71875,
"epoch": 203.8,
"grad_norm": 1.231495456965766,
"kl": 0.2060546875,
"learning_rate": 1.1391731009600653e-06,
"loss": 0.1112,
"reward": 14.843750059604645,
"reward_std": 4.25174543261528,
"rewards/accuracy_reward_staging": 1.298437513411045,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.953125,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 692.5625,
"epoch": 204.8,
"grad_norm": 1.350931922850144,
"kl": 0.224212646484375,
"learning_rate": 1.1305261922200517e-06,
"loss": 0.0238,
"reward": 15.984374910593033,
"reward_std": 5.367278844118118,
"rewards/accuracy_reward_staging": 1.4234375320374966,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 690.828125,
"epoch": 205.8,
"grad_norm": 1.318954712841193,
"kl": 0.19384765625,
"learning_rate": 1.1218693434051474e-06,
"loss": 0.0885,
"reward": 16.017187118530273,
"reward_std": 5.8362889885902405,
"rewards/accuracy_reward_staging": 1.41890624538064,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.921875,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 632.796875,
"epoch": 206.8,
"grad_norm": 1.4124473460728744,
"kl": 0.241455078125,
"learning_rate": 1.1132032137679068e-06,
"loss": 0.0533,
"reward": 16.976562440395355,
"reward_std": 5.929606184363365,
"rewards/accuracy_reward_staging": 1.5070312470197678,
"rewards/format_reward": 0.9375,
"rewards/format_reward_staging": 0.96875,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 646.421875,
"epoch": 207.8,
"grad_norm": 1.6773375733949765,
"kl": 0.25189208984375,
"learning_rate": 1.1045284632676535e-06,
"loss": 0.0348,
"reward": 14.135937511920929,
"reward_std": 4.714122384786606,
"rewards/accuracy_reward_staging": 1.2307812497019768,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9375,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 617.140625,
"epoch": 208.8,
"grad_norm": 1.4348854069875467,
"kl": 0.22174072265625,
"learning_rate": 1.095845752520224e-06,
"loss": 0.0495,
"reward": 14.285937666893005,
"reward_std": 5.9770321398973465,
"rewards/accuracy_reward_staging": 1.2473437581211329,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.890625,
"step": 209
},
{
"epoch": 209.8,
"grad_norm": 1.1605399733788089,
"learning_rate": 1.0871557427476583e-06,
"loss": 0.0552,
"step": 210
},
{
"epoch": 209.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 655.7875,
"eval_kl": 0.201123046875,
"eval_loss": -0.0035779415629804134,
"eval_reward": 16.239999866485597,
"eval_reward_std": 4.2823482871055605,
"eval_rewards/accuracy_reward_staging": 1.4415000066161157,
"eval_rewards/format_reward": 0.9,
"eval_rewards/format_reward_staging": 0.925,
"eval_runtime": 142.3261,
"eval_samples_per_second": 0.141,
"eval_steps_per_second": 0.035,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 646.625,
"epoch": 210.8,
"grad_norm": 1.3085478801780013,
"kl": 0.21307373046875,
"learning_rate": 1.078459095727845e-06,
"loss": 0.0801,
"reward": 14.151562541723251,
"reward_std": 4.665051084011793,
"rewards/accuracy_reward_staging": 1.2393749924376607,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.9140625,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 627.609375,
"epoch": 211.8,
"grad_norm": 1.3675217879957127,
"kl": 0.22113037109375,
"learning_rate": 1.069756473744125e-06,
"loss": 0.0458,
"reward": 15.82500010728836,
"reward_std": 5.255921743810177,
"rewards/accuracy_reward_staging": 1.4075000062584877,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.9375,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 617.390625,
"epoch": 212.8,
"grad_norm": 1.380807253404178,
"kl": 0.20758056640625,
"learning_rate": 1.061048539534857e-06,
"loss": 0.0209,
"reward": 15.371874898672104,
"reward_std": 6.567200765013695,
"rewards/accuracy_reward_staging": 1.3559375293552876,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.921875,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 620.953125,
"epoch": 213.8,
"grad_norm": 1.3902160353507123,
"kl": 0.21124267578125,
"learning_rate": 1.052335956242944e-06,
"loss": 0.0436,
"reward": 15.621874839067459,
"reward_std": 5.122038297355175,
"rewards/accuracy_reward_staging": 1.382499996572733,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.953125,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 655.359375,
"epoch": 214.8,
"grad_norm": 1.3806424551742382,
"kl": 0.22705078125,
"learning_rate": 1.043619387365336e-06,
"loss": -0.0022,
"reward": 13.870312362909317,
"reward_std": 5.269171215593815,
"rewards/accuracy_reward_staging": 1.2057812418788671,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.90625,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 655.09375,
"epoch": 215.8,
"grad_norm": 2.656219707386371,
"kl": 0.2349853515625,
"learning_rate": 1.034899496702501e-06,
"loss": 0.0562,
"reward": 14.140625029802322,
"reward_std": 5.58522791415453,
"rewards/accuracy_reward_staging": 1.2312500067055225,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9375,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 622.8125,
"epoch": 216.8,
"grad_norm": 1.5209208648591772,
"kl": 0.20660400390625,
"learning_rate": 1.0261769483078732e-06,
"loss": 0.0816,
"reward": 14.775000303983688,
"reward_std": 4.954350218176842,
"rewards/accuracy_reward_staging": 1.2931250091642141,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.96875,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 650.234375,
"epoch": 217.8,
"grad_norm": 1.55208660193829,
"kl": 0.210693359375,
"learning_rate": 1.0174524064372837e-06,
"loss": 0.0744,
"reward": 14.27812522649765,
"reward_std": 4.545742444694042,
"rewards/accuracy_reward_staging": 1.2575000151991844,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.921875,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 617.234375,
"epoch": 218.8,
"grad_norm": 1.3928599706734481,
"kl": 0.20306396484375,
"learning_rate": 1.0087265354983738e-06,
"loss": 0.0865,
"reward": 14.462500095367432,
"reward_std": 5.321807600557804,
"rewards/accuracy_reward_staging": 1.2571874894201756,
"rewards/format_reward": 0.90625,
"rewards/format_reward_staging": 0.984375,
"step": 219
},
{
"epoch": 219.8,
"grad_norm": 1.5191361705497182,
"learning_rate": 1e-06,
"loss": 0.0929,
"step": 220
},
{
"epoch": 219.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 621.85,
"eval_kl": 0.18828125,
"eval_loss": 0.07175321877002716,
"eval_reward": 15.787499904632568,
"eval_reward_std": 4.991747093200684,
"eval_rewards/accuracy_reward_staging": 1.39125002771616,
"eval_rewards/format_reward": 0.925,
"eval_rewards/format_reward_staging": 0.95,
"eval_runtime": 143.7256,
"eval_samples_per_second": 0.139,
"eval_steps_per_second": 0.035,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 626.6953125,
"epoch": 220.8,
"grad_norm": 1.1979189103689327,
"kl": 0.20904541015625,
"learning_rate": 9.912734645016263e-07,
"loss": 0.0653,
"reward": 14.830468773841858,
"reward_std": 4.90237557888031,
"rewards/accuracy_reward_staging": 1.3041406441479921,
"rewards/format_reward": 0.8515625,
"rewards/format_reward_staging": 0.9375,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 618.171875,
"epoch": 221.8,
"grad_norm": 1.3245626778227455,
"kl": 0.19866943359375,
"learning_rate": 9.825475935627165e-07,
"loss": 0.0378,
"reward": 15.185937464237213,
"reward_std": 6.512602657079697,
"rewards/accuracy_reward_staging": 1.3451562821865082,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.921875,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 655.546875,
"epoch": 222.8,
"grad_norm": 1.2627576071000894,
"kl": 0.22076416015625,
"learning_rate": 9.73823051692127e-07,
"loss": 0.0823,
"reward": 14.023437589406967,
"reward_std": 5.150766499340534,
"rewards/accuracy_reward_staging": 1.2210937440395355,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.9375,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 630.046875,
"epoch": 223.8,
"grad_norm": 1.4630069125653442,
"kl": 0.22393798828125,
"learning_rate": 9.651005032974993e-07,
"loss": 0.1163,
"reward": 15.462500154972076,
"reward_std": 4.448259741067886,
"rewards/accuracy_reward_staging": 1.3556250110268593,
"rewards/format_reward": 0.921875,
"rewards/format_reward_staging": 0.984375,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 626.4375,
"epoch": 224.8,
"grad_norm": 1.3737324086080591,
"kl": 0.24346923828125,
"learning_rate": 9.56380612634664e-07,
"loss": 0.073,
"reward": 13.575000047683716,
"reward_std": 5.982485473155975,
"rewards/accuracy_reward_staging": 1.174687497317791,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.9375,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 696.59375,
"epoch": 225.8,
"grad_norm": 1.4368525693959429,
"kl": 0.2528076171875,
"learning_rate": 9.476640437570561e-07,
"loss": 0.0855,
"reward": 14.559374749660492,
"reward_std": 6.478231497108936,
"rewards/accuracy_reward_staging": 1.2887500040233135,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.890625,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 655.390625,
"epoch": 226.8,
"grad_norm": 1.3423833107341736,
"kl": 0.2432861328125,
"learning_rate": 9.38951460465143e-07,
"loss": 0.0918,
"reward": 13.806249856948853,
"reward_std": 5.268295802175999,
"rewards/accuracy_reward_staging": 1.2056250125169754,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.921875,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 720.40625,
"epoch": 227.8,
"grad_norm": 2.0887623964966555,
"kl": 0.26153564453125,
"learning_rate": 9.302435262558747e-07,
"loss": 0.0741,
"reward": 14.531249985098839,
"reward_std": 6.193335264921188,
"rewards/accuracy_reward_staging": 1.2843750081956387,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.84375,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 699.34375,
"epoch": 228.8,
"grad_norm": 1.5817414267449967,
"kl": 0.25372314453125,
"learning_rate": 9.215409042721551e-07,
"loss": 0.1477,
"reward": 14.853124976158142,
"reward_std": 7.3317131996154785,
"rewards/accuracy_reward_staging": 1.3150000125169754,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.890625,
"step": 229
},
{
"epoch": 229.8,
"grad_norm": 1.4707093666889888,
"learning_rate": 9.128442572523417e-07,
"loss": 0.1039,
"step": 230
},
{
"epoch": 229.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 699.725,
"eval_kl": 0.27763671875,
"eval_loss": 0.13621756434440613,
"eval_reward": 15.102499961853027,
"eval_reward_std": 5.6639987349510195,
"eval_rewards/accuracy_reward_staging": 1.337749996781349,
"eval_rewards/format_reward": 0.825,
"eval_rewards/format_reward_staging": 0.9,
"eval_runtime": 176.4013,
"eval_samples_per_second": 0.113,
"eval_steps_per_second": 0.028,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 750.5234375,
"epoch": 230.8,
"grad_norm": 2.5022162810953015,
"kl": 0.310821533203125,
"learning_rate": 9.04154247479776e-07,
"loss": 0.0621,
"reward": 13.921874970197678,
"reward_std": 5.641379028558731,
"rewards/accuracy_reward_staging": 1.2312500048428774,
"rewards/format_reward": 0.7421875,
"rewards/format_reward_staging": 0.8671875,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 737.5,
"epoch": 231.8,
"grad_norm": 1.3203764343240048,
"kl": 0.2919921875,
"learning_rate": 8.954715367323466e-07,
"loss": 0.1254,
"reward": 13.312500059604645,
"reward_std": 5.994187116622925,
"rewards/accuracy_reward_staging": 1.1750000100582838,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.859375,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 663.015625,
"epoch": 232.8,
"grad_norm": 1.6833018973839629,
"kl": 0.306640625,
"learning_rate": 8.867967862320933e-07,
"loss": 0.0829,
"reward": 11.451562643051147,
"reward_std": 6.721396386623383,
"rewards/accuracy_reward_staging": 0.9779687598347664,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.796875,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 745.671875,
"epoch": 233.8,
"grad_norm": 1.5784019252688062,
"kl": 0.3076171875,
"learning_rate": 8.781306565948526e-07,
"loss": 0.0822,
"reward": 13.026562601327896,
"reward_std": 4.835877507925034,
"rewards/accuracy_reward_staging": 1.143281283788383,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.8125,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 716.4375,
"epoch": 234.8,
"grad_norm": 1.5923082293463926,
"kl": 0.3375244140625,
"learning_rate": 8.694738077799486e-07,
"loss": 0.0811,
"reward": 13.98749989271164,
"reward_std": 7.312740258872509,
"rewards/accuracy_reward_staging": 1.2425000295042992,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.765625,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 728.125,
"epoch": 235.8,
"grad_norm": 1.520271972538422,
"kl": 0.3232421875,
"learning_rate": 8.608268990399348e-07,
"loss": 0.1051,
"reward": 13.317187458276749,
"reward_std": 7.371540606021881,
"rewards/accuracy_reward_staging": 1.1739062573760748,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.859375,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 804.15625,
"epoch": 236.8,
"grad_norm": 1.3584766642039745,
"kl": 0.3104248046875,
"learning_rate": 8.521905888703893e-07,
"loss": 0.1753,
"reward": 12.36562493443489,
"reward_std": 7.354695707559586,
"rewards/accuracy_reward_staging": 1.0818749964237213,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.765625,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 691.6875,
"epoch": 237.8,
"grad_norm": 1.4062373976771998,
"kl": 0.3116455078125,
"learning_rate": 8.435655349597689e-07,
"loss": 0.1024,
"reward": 13.564062476158142,
"reward_std": 6.342557780444622,
"rewards/accuracy_reward_staging": 1.2001562491059303,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.765625,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 825.25,
"epoch": 238.8,
"grad_norm": 1.3912617565210863,
"kl": 0.326416015625,
"learning_rate": 8.349523941393223e-07,
"loss": 0.1304,
"reward": 13.695312589406967,
"reward_std": 7.123799741268158,
"rewards/accuracy_reward_staging": 1.211718775331974,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.796875,
"step": 239
},
{
"epoch": 239.8,
"grad_norm": 1.5595564499799097,
"learning_rate": 8.263518223330696e-07,
"loss": 0.2716,
"step": 240
},
{
"epoch": 239.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 710.4875,
"eval_kl": 0.301416015625,
"eval_loss": 0.1302235871553421,
"eval_reward": 13.53874992132187,
"eval_reward_std": 5.90446172952652,
"eval_rewards/accuracy_reward_staging": 1.1926250100135802,
"eval_rewards/format_reward": 0.825,
"eval_rewards/format_reward_staging": 0.7875,
"eval_runtime": 172.1092,
"eval_samples_per_second": 0.116,
"eval_steps_per_second": 0.029,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 820.21875,
"epoch": 240.8,
"grad_norm": 1.3594222983218733,
"kl": 0.29791259765625,
"learning_rate": 8.177644745078525e-07,
"loss": 0.1216,
"reward": 13.201562486588955,
"reward_std": 6.912581101059914,
"rewards/accuracy_reward_staging": 1.1646875077858567,
"rewards/format_reward": 0.7734375,
"rewards/format_reward_staging": 0.78125,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 714.53125,
"epoch": 241.8,
"grad_norm": 1.3736471708376299,
"kl": 0.31427001953125,
"learning_rate": 8.091910046234551e-07,
"loss": 0.1539,
"reward": 14.295312345027924,
"reward_std": 5.410611517727375,
"rewards/accuracy_reward_staging": 1.2639062507078052,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.8125,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 731.6875,
"epoch": 242.8,
"grad_norm": 1.4519054784859347,
"kl": 0.33892822265625,
"learning_rate": 8.006320655828029e-07,
"loss": 0.1532,
"reward": 11.943750023841858,
"reward_std": 7.561622552573681,
"rewards/accuracy_reward_staging": 1.0662500150501728,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.640625,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 807.59375,
"epoch": 243.8,
"grad_norm": 1.4438718122124043,
"kl": 0.340576171875,
"learning_rate": 7.920883091822408e-07,
"loss": 0.1606,
"reward": 10.796874925494194,
"reward_std": 7.172753885388374,
"rewards/accuracy_reward_staging": 0.9265625104308128,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.71875,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 755.296875,
"epoch": 244.8,
"grad_norm": 1.5583845174659146,
"kl": 0.3447265625,
"learning_rate": 7.835603860618971e-07,
"loss": 0.2097,
"reward": 11.843750149011612,
"reward_std": 7.400421276688576,
"rewards/accuracy_reward_staging": 1.0296875108033419,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.75,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 759.03125,
"epoch": 245.8,
"grad_norm": 1.3528957695912824,
"kl": 0.357666015625,
"learning_rate": 7.750489456561351e-07,
"loss": 0.1164,
"reward": 11.779687464237213,
"reward_std": 6.353302523493767,
"rewards/accuracy_reward_staging": 1.0295312507078052,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.71875,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 848.734375,
"epoch": 246.8,
"grad_norm": 1.554460288339238,
"kl": 0.3973388671875,
"learning_rate": 7.665546361440949e-07,
"loss": 0.1267,
"reward": 12.23906247317791,
"reward_std": 7.234898805618286,
"rewards/accuracy_reward_staging": 1.0817187651991844,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.71875,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 822.703125,
"epoch": 247.8,
"grad_norm": 1.433844235366676,
"kl": 0.3592529296875,
"learning_rate": 7.580781044003324e-07,
"loss": 0.0235,
"reward": 14.143749997019768,
"reward_std": 5.741221696138382,
"rewards/accuracy_reward_staging": 1.2581250127404928,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.796875,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 903.109375,
"epoch": 248.8,
"grad_norm": 1.3566149279604043,
"kl": 0.31982421875,
"learning_rate": 7.496199959455583e-07,
"loss": 0.2022,
"reward": 11.91718764603138,
"reward_std": 6.288423582911491,
"rewards/accuracy_reward_staging": 1.0448437514714897,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.75,
"step": 249
},
{
"epoch": 249.8,
"grad_norm": 3.5268979600947743,
"learning_rate": 7.411809548974791e-07,
"loss": 0.2181,
"step": 250
},
{
"epoch": 249.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 863.7875,
"eval_kl": 0.37177734375,
"eval_loss": 0.22468861937522888,
"eval_reward": 12.728750014305115,
"eval_reward_std": 6.379789352416992,
"eval_rewards/accuracy_reward_staging": 1.1216249838471413,
"eval_rewards/format_reward": 0.775,
"eval_rewards/format_reward_staging": 0.7375,
"eval_runtime": 243.2396,
"eval_samples_per_second": 0.082,
"eval_steps_per_second": 0.021,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 816.5703125,
"epoch": 250.8,
"grad_norm": 1.3710303728682416,
"kl": 0.4068603515625,
"learning_rate": 7.327616239217431e-07,
"loss": 0.176,
"reward": 12.385937452316284,
"reward_std": 6.818997707217932,
"rewards/accuracy_reward_staging": 1.0815625092945993,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.7734375,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 811.671875,
"epoch": 251.8,
"grad_norm": 1.361752204741976,
"kl": 0.39990234375,
"learning_rate": 7.243626441830009e-07,
"loss": 0.1261,
"reward": 11.423437476158142,
"reward_std": 7.801801845431328,
"rewards/accuracy_reward_staging": 0.9985937401652336,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.65625,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 796.375,
"epoch": 252.8,
"grad_norm": 1.4560468845981351,
"kl": 0.3995361328125,
"learning_rate": 7.159846552960773e-07,
"loss": 0.1045,
"reward": 12.606250077486038,
"reward_std": 7.680657230317593,
"rewards/accuracy_reward_staging": 1.107500022277236,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.71875,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 951.03125,
"epoch": 253.8,
"grad_norm": 1.411783783971026,
"kl": 0.36572265625,
"learning_rate": 7.076282952772633e-07,
"loss": 0.2697,
"reward": 10.959374994039536,
"reward_std": 8.183534801006317,
"rewards/accuracy_reward_staging": 0.9631250090897083,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.578125,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 828.109375,
"epoch": 254.8,
"grad_norm": 1.573986411740295,
"kl": 0.385498046875,
"learning_rate": 6.992942004957269e-07,
"loss": 0.2294,
"reward": 11.017187476158142,
"reward_std": 8.27015207707882,
"rewards/accuracy_reward_staging": 0.9517187681049109,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.71875,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 896.015625,
"epoch": 255.8,
"grad_norm": 1.5503448731327603,
"kl": 0.4189453125,
"learning_rate": 6.909830056250526e-07,
"loss": 0.233,
"reward": 12.285937294363976,
"reward_std": 7.363780289888382,
"rewards/accuracy_reward_staging": 1.081718772649765,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.71875,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 891.390625,
"epoch": 256.8,
"grad_norm": 1.5949891985949471,
"kl": 0.38238525390625,
"learning_rate": 6.82695343594908e-07,
"loss": 0.2359,
"reward": 13.309375122189522,
"reward_std": 7.619817182421684,
"rewards/accuracy_reward_staging": 1.1700000204145908,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.734375,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 892.046875,
"epoch": 257.8,
"grad_norm": 1.6964810618377213,
"kl": 0.375,
"learning_rate": 6.744318455428435e-07,
"loss": 0.3474,
"reward": 11.334375008940697,
"reward_std": 7.634813725948334,
"rewards/accuracy_reward_staging": 0.9834375139325857,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.6875,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 723.828125,
"epoch": 258.8,
"grad_norm": 5.6325638555717505,
"kl": 0.4317626953125,
"learning_rate": 6.661931407662291e-07,
"loss": 0.1424,
"reward": 12.10312506556511,
"reward_std": 7.089647740125656,
"rewards/accuracy_reward_staging": 1.0399999842047691,
"rewards/format_reward": 0.890625,
"rewards/format_reward_staging": 0.8125,
"step": 259
},
{
"epoch": 259.8,
"grad_norm": 1.4896714438939402,
"learning_rate": 6.579798566743313e-07,
"loss": 0.2349,
"step": 260
},
{
"epoch": 259.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 904.775,
"eval_kl": 0.505517578125,
"eval_loss": 0.21720127761363983,
"eval_reward": 11.496249973773956,
"eval_reward_std": 7.962953209877014,
"eval_rewards/accuracy_reward_staging": 1.0021250128746033,
"eval_rewards/format_reward": 0.825,
"eval_rewards/format_reward_staging": 0.65,
"eval_runtime": 252.62,
"eval_samples_per_second": 0.079,
"eval_steps_per_second": 0.02,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 851.5546875,
"epoch": 260.8,
"grad_norm": 1.4904521266628086,
"kl": 0.43194580078125,
"learning_rate": 6.497926187405324e-07,
"loss": 0.1751,
"reward": 12.633593738079071,
"reward_std": 7.350790940225124,
"rewards/accuracy_reward_staging": 1.1102343881502748,
"rewards/format_reward": 0.8046875,
"rewards/format_reward_staging": 0.7265625,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 854.90625,
"epoch": 261.8,
"grad_norm": 1.329154678834471,
"kl": 0.3721923828125,
"learning_rate": 6.416320504546997e-07,
"loss": 0.1341,
"reward": 12.78281256556511,
"reward_std": 6.5289479941129684,
"rewards/accuracy_reward_staging": 1.1282812533900142,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.734375,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 775.671875,
"epoch": 262.8,
"grad_norm": 1.7527252884904974,
"kl": 0.3619384765625,
"learning_rate": 6.334987732757028e-07,
"loss": 0.2659,
"reward": 11.443749904632568,
"reward_std": 6.490789204835892,
"rewards/accuracy_reward_staging": 0.9881250048056245,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.734375,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 837.03125,
"epoch": 263.8,
"grad_norm": 1.5611285133893833,
"kl": 0.426025390625,
"learning_rate": 6.253934065840879e-07,
"loss": 0.1938,
"reward": 10.160937517881393,
"reward_std": 6.217289835214615,
"rewards/accuracy_reward_staging": 0.8582812617532909,
"rewards/format_reward": 0.875,
"rewards/format_reward_staging": 0.703125,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 982.140625,
"epoch": 264.8,
"grad_norm": 1.6308836635992412,
"kl": 0.4444580078125,
"learning_rate": 6.173165676349102e-07,
"loss": 0.3391,
"reward": 11.560937523841858,
"reward_std": 8.43397456407547,
"rewards/accuracy_reward_staging": 1.0092187635600567,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.6875,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 917.21875,
"epoch": 265.8,
"grad_norm": 8.538915516124892,
"kl": 0.4678955078125,
"learning_rate": 6.092688715107263e-07,
"loss": 0.1789,
"reward": 12.584374994039536,
"reward_std": 5.365057937800884,
"rewards/accuracy_reward_staging": 1.1021875068545341,
"rewards/format_reward": 0.859375,
"rewards/format_reward_staging": 0.703125,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 978.140625,
"epoch": 266.8,
"grad_norm": 11.685381653606097,
"kl": 0.5882568359375,
"learning_rate": 6.012509310747538e-07,
"loss": 0.191,
"reward": 10.77499994635582,
"reward_std": 7.996917471289635,
"rewards/accuracy_reward_staging": 0.9415625259280205,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.609375,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 925.015625,
"epoch": 267.8,
"grad_norm": 1.4792658565609935,
"kl": 0.4212646484375,
"learning_rate": 5.932633569241999e-07,
"loss": 0.1863,
"reward": 11.715624958276749,
"reward_std": 6.7979661747813225,
"rewards/accuracy_reward_staging": 1.0200000181794167,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.671875,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 845.140625,
"epoch": 268.8,
"grad_norm": 5.20673071015397,
"kl": 0.4212646484375,
"learning_rate": 5.853067573437611e-07,
"loss": 0.3561,
"reward": 13.192187458276749,
"reward_std": 7.627192087471485,
"rewards/accuracy_reward_staging": 1.164531260728836,
"rewards/format_reward": 0.828125,
"rewards/format_reward_staging": 0.71875,
"step": 269
},
{
"epoch": 269.8,
"grad_norm": 1.3473441966408635,
"learning_rate": 5.773817382593007e-07,
"loss": 0.2184,
"step": 270
},
{
"epoch": 269.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 896.7625,
"eval_kl": 0.446728515625,
"eval_loss": 0.2557651400566101,
"eval_reward": 11.58375017642975,
"eval_reward_std": 7.534060525894165,
"eval_rewards/accuracy_reward_staging": 1.0096250101923943,
"eval_rewards/format_reward": 0.8,
"eval_rewards/format_reward_staging": 0.6875,
"eval_runtime": 257.488,
"eval_samples_per_second": 0.078,
"eval_steps_per_second": 0.019,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 945.8359375,
"epoch": 270.8,
"grad_norm": 1.8166241750746335,
"kl": 0.4503173828125,
"learning_rate": 5.694889031917046e-07,
"loss": 0.2195,
"reward": 10.942968711256981,
"reward_std": 7.070375669747591,
"rewards/accuracy_reward_staging": 0.9497656342573464,
"rewards/format_reward": 0.7890625,
"rewards/format_reward_staging": 0.65625,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 860.984375,
"epoch": 271.8,
"grad_norm": 1.595405733714329,
"kl": 0.455810546875,
"learning_rate": 5.616288532109224e-07,
"loss": 0.2539,
"reward": 11.723437696695328,
"reward_std": 7.531994827091694,
"rewards/accuracy_reward_staging": 1.0301562547683716,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.671875,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 892.765625,
"epoch": 272.8,
"grad_norm": 1.9592298387429952,
"kl": 0.4637451171875,
"learning_rate": 5.538021868901912e-07,
"loss": 0.2683,
"reward": 10.093750059604645,
"reward_std": 6.664774626493454,
"rewards/accuracy_reward_staging": 0.8687500087544322,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.609375,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 958.265625,
"epoch": 273.8,
"grad_norm": 98.566029551543,
"kl": 1.8087158203125,
"learning_rate": 5.460095002604532e-07,
"loss": 0.3116,
"reward": 9.573437586426735,
"reward_std": 7.937933571636677,
"rewards/accuracy_reward_staging": 0.8276562560349703,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.578125,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 915.875,
"epoch": 274.8,
"grad_norm": 1.54560131594488,
"kl": 0.4423828125,
"learning_rate": 5.382513867649663e-07,
"loss": 0.2622,
"reward": 10.931249961256981,
"reward_std": 7.524611636996269,
"rewards/accuracy_reward_staging": 0.9571875012479722,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.609375,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 856.765625,
"epoch": 275.8,
"grad_norm": 3.0239354302219548,
"kl": 0.5811767578125,
"learning_rate": 5.305284372141095e-07,
"loss": 0.2451,
"reward": 11.835937544703484,
"reward_std": 6.662468932569027,
"rewards/accuracy_reward_staging": 1.039843776728958,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.703125,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 932.6875,
"epoch": 276.8,
"grad_norm": 1.616305493128834,
"kl": 0.538818359375,
"learning_rate": 5.228412397403915e-07,
"loss": 0.2794,
"reward": 9.204687595367432,
"reward_std": 8.746634840965271,
"rewards/accuracy_reward_staging": 0.7923437561839819,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.546875,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 853.65625,
"epoch": 277.8,
"grad_norm": 1.5898137384166846,
"kl": 0.4730224609375,
"learning_rate": 5.15190379753663e-07,
"loss": 0.2771,
"reward": 9.235937505960464,
"reward_std": 8.327273309230804,
"rewards/accuracy_reward_staging": 0.786093763075769,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.578125,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 823.4375,
"epoch": 278.8,
"grad_norm": 2.44549487611301,
"kl": 0.52734375,
"learning_rate": 5.07576439896533e-07,
"loss": 0.2175,
"reward": 11.657812714576721,
"reward_std": 8.151460975408554,
"rewards/accuracy_reward_staging": 1.0251562464982271,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.65625,
"step": 279
},
{
"epoch": 279.8,
"grad_norm": 2.0646627929666184,
"learning_rate": 5.000000000000002e-07,
"loss": 0.357,
"step": 280
},
{
"epoch": 279.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 810.45,
"eval_kl": 0.4798828125,
"eval_loss": 0.1928146332502365,
"eval_reward": 12.850000190734864,
"eval_reward_std": 7.15499917268753,
"eval_rewards/accuracy_reward_staging": 1.1475000128149986,
"eval_rewards/format_reward": 0.725,
"eval_rewards/format_reward_staging": 0.65,
"eval_runtime": 242.0902,
"eval_samples_per_second": 0.083,
"eval_steps_per_second": 0.021,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 885.203125,
"epoch": 280.8,
"grad_norm": 1.6342686921474878,
"kl": 0.453857421875,
"learning_rate": 4.924616370392961e-07,
"loss": 0.2134,
"reward": 11.285156175494194,
"reward_std": 7.196100067347288,
"rewards/accuracy_reward_staging": 0.9832031358964741,
"rewards/format_reward": 0.8046875,
"rewards/format_reward_staging": 0.6484375,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 827.796875,
"epoch": 281.8,
"grad_norm": 1.9777507622588804,
"kl": 0.501708984375,
"learning_rate": 4.849619250899458e-07,
"loss": 0.2286,
"reward": 9.923437535762787,
"reward_std": 6.76015942543745,
"rewards/accuracy_reward_staging": 0.8532812669873238,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.640625,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 760.53125,
"epoch": 282.8,
"grad_norm": 1.983747741222898,
"kl": 0.5208740234375,
"learning_rate": 4.775014352840512e-07,
"loss": 0.2496,
"reward": 10.876562595367432,
"reward_std": 8.217148587107658,
"rewards/accuracy_reward_staging": 0.9735937742516398,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.53125,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 800.21875,
"epoch": 283.8,
"grad_norm": 9.98641000305957,
"kl": 0.650634765625,
"learning_rate": 4.700807357667952e-07,
"loss": 0.224,
"reward": 10.59218743443489,
"reward_std": 8.216856330633163,
"rewards/accuracy_reward_staging": 0.910781248472631,
"rewards/format_reward": 0.84375,
"rewards/format_reward_staging": 0.640625,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 724.0625,
"epoch": 284.8,
"grad_norm": 3.1107143716405066,
"kl": 0.58544921875,
"learning_rate": 4.62700391653176e-07,
"loss": 0.1371,
"reward": 12.684374868869781,
"reward_std": 5.954333983361721,
"rewards/accuracy_reward_staging": 1.1121875066310167,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.765625,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 673.5,
"epoch": 285.8,
"grad_norm": 1.6272052880635073,
"kl": 0.486572265625,
"learning_rate": 4.5536096498497287e-07,
"loss": 0.1643,
"reward": 10.56093743443489,
"reward_std": 8.084580287337303,
"rewards/accuracy_reward_staging": 0.9310937505215406,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.5625,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 651.046875,
"epoch": 286.8,
"grad_norm": 44.36285030093844,
"kl": 0.7974853515625,
"learning_rate": 4.480630146879418e-07,
"loss": 0.161,
"reward": 11.729687303304672,
"reward_std": 8.256633162498474,
"rewards/accuracy_reward_staging": 1.02453126385808,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.703125,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 694.03125,
"epoch": 287.8,
"grad_norm": 1.805744925603759,
"kl": 0.5167236328125,
"learning_rate": 4.408070965292533e-07,
"loss": 0.1719,
"reward": 12.934375196695328,
"reward_std": 8.110588558018208,
"rewards/accuracy_reward_staging": 1.149687498807907,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.703125,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 806.671875,
"epoch": 288.8,
"grad_norm": 365.24516465790214,
"kl": 15.93310546875,
"learning_rate": 4.335937630751674e-07,
"loss": 0.422,
"reward": 10.040624976158142,
"reward_std": 6.964074335992336,
"rewards/accuracy_reward_staging": 0.8900000145658851,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.53125,
"step": 289
},
{
"epoch": 289.8,
"grad_norm": 2.018184843282375,
"learning_rate": 4.2642356364895417e-07,
"loss": 0.159,
"step": 290
},
{
"epoch": 289.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 676.325,
"eval_kl": 0.47177734375,
"eval_loss": 0.08975062519311905,
"eval_reward": 12.25,
"eval_reward_std": 7.105983757972718,
"eval_rewards/accuracy_reward_staging": 1.080000001192093,
"eval_rewards/format_reward": 0.75,
"eval_rewards/format_reward_staging": 0.7,
"eval_runtime": 172.0863,
"eval_samples_per_second": 0.116,
"eval_steps_per_second": 0.029,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 756.625,
"epoch": 290.8,
"grad_norm": 4.304003408743737,
"kl": 0.55126953125,
"learning_rate": 4.192970442890602e-07,
"loss": 0.1843,
"reward": 10.37187498062849,
"reward_std": 6.766278941184282,
"rewards/accuracy_reward_staging": 0.9106250035110861,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.59375,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 722.5,
"epoch": 291.8,
"grad_norm": 1.4716878010716028,
"kl": 0.4508056640625,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.1393,
"reward": 12.068749815225601,
"reward_std": 6.005900785326958,
"rewards/accuracy_reward_staging": 1.0553124994039536,
"rewards/format_reward": 0.8125,
"rewards/format_reward_staging": 0.703125,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 757.15625,
"epoch": 292.8,
"grad_norm": 1.6973289397039133,
"kl": 0.4842529296875,
"learning_rate": 4.0517721324865884e-07,
"loss": 0.2115,
"reward": 9.153125151991844,
"reward_std": 7.310723379254341,
"rewards/accuracy_reward_staging": 0.7809374900534749,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.578125,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 700.765625,
"epoch": 293.8,
"grad_norm": 2.1555090676393096,
"kl": 0.51806640625,
"learning_rate": 3.981849768479516e-07,
"loss": 0.2632,
"reward": 11.881249964237213,
"reward_std": 7.455913960933685,
"rewards/accuracy_reward_staging": 1.0396875217556953,
"rewards/format_reward": 0.78125,
"rewards/format_reward_staging": 0.703125,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 643.59375,
"epoch": 294.8,
"grad_norm": 1.6704306436530678,
"kl": 0.459716796875,
"learning_rate": 3.912385709912793e-07,
"loss": 0.1352,
"reward": 11.445312559604645,
"reward_std": 7.4289940893650055,
"rewards/accuracy_reward_staging": 1.008593775331974,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.65625,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 684.578125,
"epoch": 295.8,
"grad_norm": 1.7855400540888517,
"kl": 0.528076171875,
"learning_rate": 3.843385246743417e-07,
"loss": 0.1663,
"reward": 9.97968752682209,
"reward_std": 8.184111461043358,
"rewards/accuracy_reward_staging": 0.8635937534272671,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.609375,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 602.75,
"epoch": 296.8,
"grad_norm": 38.11905446217515,
"kl": 1.2889404296875,
"learning_rate": 3.774853633623806e-07,
"loss": 0.2038,
"reward": 11.821874991059303,
"reward_std": 8.198160663247108,
"rewards/accuracy_reward_staging": 1.0400000140070915,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.671875,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 585.765625,
"epoch": 297.8,
"grad_norm": 1.8197669464076098,
"kl": 0.5150146484375,
"learning_rate": 3.706796089501627e-07,
"loss": 0.0749,
"reward": 11.434375017881393,
"reward_std": 7.715316243469715,
"rewards/accuracy_reward_staging": 0.9934375211596489,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.703125,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 613.03125,
"epoch": 298.8,
"grad_norm": 1.889354762217911,
"kl": 0.4669189453125,
"learning_rate": 3.639217797222359e-07,
"loss": 0.1515,
"reward": 11.481249898672104,
"reward_std": 7.798143312335014,
"rewards/accuracy_reward_staging": 1.0090625192970037,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.671875,
"step": 299
},
{
"epoch": 299.8,
"grad_norm": 2.6622348023095666,
"learning_rate": 3.5721239031346063e-07,
"loss": 0.2454,
"step": 300
},
{
"epoch": 299.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 590.3125,
"eval_kl": 0.5787109375,
"eval_loss": 0.10901384055614471,
"eval_reward": 11.113749873638152,
"eval_reward_std": 8.458426451683044,
"eval_rewards/accuracy_reward_staging": 0.9851250126957893,
"eval_rewards/format_reward": 0.65,
"eval_rewards/format_reward_staging": 0.6125,
"eval_runtime": 141.3322,
"eval_samples_per_second": 0.142,
"eval_steps_per_second": 0.035,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 614.75,
"epoch": 300.8,
"grad_norm": 1.5247800909935867,
"kl": 0.50823974609375,
"learning_rate": 3.5055195166981646e-07,
"loss": 0.0888,
"reward": 11.17812480777502,
"reward_std": 7.858443755656481,
"rewards/accuracy_reward_staging": 0.9834375130012631,
"rewards/format_reward": 0.7421875,
"rewards/format_reward_staging": 0.6015625,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 590.609375,
"epoch": 301.8,
"grad_norm": 11.67646802939455,
"kl": 0.736083984375,
"learning_rate": 3.4394097100949283e-07,
"loss": 0.119,
"reward": 11.040624797344208,
"reward_std": 6.185135334730148,
"rewards/accuracy_reward_staging": 0.9775000158697367,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.609375,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 592.828125,
"epoch": 302.8,
"grad_norm": 2.07620138910771,
"kl": 0.5858154296875,
"learning_rate": 3.373799517842627e-07,
"loss": 0.1362,
"reward": 9.728124901652336,
"reward_std": 6.484044134616852,
"rewards/accuracy_reward_staging": 0.8431250131689012,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.625,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 590.421875,
"epoch": 303.8,
"grad_norm": 1.80968722219858,
"kl": 0.577392578125,
"learning_rate": 3.308693936411421e-07,
"loss": 0.1018,
"reward": 9.63281275331974,
"reward_std": 6.755122885107994,
"rewards/accuracy_reward_staging": 0.827343761920929,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.609375,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 568.5625,
"epoch": 304.8,
"grad_norm": 2.0406619631130307,
"kl": 0.5341796875,
"learning_rate": 3.2440979238433974e-07,
"loss": 0.1264,
"reward": 10.746875032782555,
"reward_std": 7.6421735137701035,
"rewards/accuracy_reward_staging": 0.9465625076554716,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.609375,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 624.703125,
"epoch": 305.8,
"grad_norm": 2524.9442790112334,
"kl": 23.13818359375,
"learning_rate": 3.180016399375016e-07,
"loss": 0.9704,
"reward": 10.040624886751175,
"reward_std": 7.3795405626297,
"rewards/accuracy_reward_staging": 0.8853125032037497,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.546875,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 599.765625,
"epoch": 306.8,
"grad_norm": 22.332538208315107,
"kl": 0.8387451171875,
"learning_rate": 3.1164542430624586e-07,
"loss": 0.1017,
"reward": 11.599999964237213,
"reward_std": 6.432632386684418,
"rewards/accuracy_reward_staging": 1.035000003874302,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.625,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 612.078125,
"epoch": 307.8,
"grad_norm": 103.35305685554316,
"kl": 0.972900390625,
"learning_rate": 3.0534162954100263e-07,
"loss": 0.1358,
"reward": 9.193749964237213,
"reward_std": 7.3355728685855865,
"rewards/accuracy_reward_staging": 0.7959375064820051,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.578125,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 570.96875,
"epoch": 308.8,
"grad_norm": 50.34680954325702,
"kl": 1.0068359375,
"learning_rate": 2.990907357001491e-07,
"loss": 0.0898,
"reward": 9.468749985098839,
"reward_std": 7.638161733746529,
"rewards/accuracy_reward_staging": 0.8328124992549419,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.546875,
"step": 309
},
{
"epoch": 309.8,
"grad_norm": 2.404150826295122,
"learning_rate": 2.9289321881345254e-07,
"loss": 0.125,
"step": 310
},
{
"epoch": 309.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 544.0375,
"eval_kl": 0.5705078125,
"eval_loss": 0.10134752094745636,
"eval_reward": 11.172499823570252,
"eval_reward_std": 7.30764594078064,
"eval_rewards/accuracy_reward_staging": 0.9935000017285347,
"eval_rewards/format_reward": 0.625,
"eval_rewards/format_reward_staging": 0.6125,
"eval_runtime": 125.2508,
"eval_samples_per_second": 0.16,
"eval_steps_per_second": 0.04,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 594.6796875,
"epoch": 310.8,
"grad_norm": 4.496947390349957,
"kl": 0.5538330078125,
"learning_rate": 2.867495508458185e-07,
"loss": 0.0803,
"reward": 11.534374952316284,
"reward_std": 6.893470458686352,
"rewards/accuracy_reward_staging": 1.0221875067800283,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.640625,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 604.53125,
"epoch": 311.8,
"grad_norm": 1.8183482676019072,
"kl": 0.561279296875,
"learning_rate": 2.8066019966134904e-07,
"loss": 0.1272,
"reward": 10.914062589406967,
"reward_std": 7.2590411230921745,
"rewards/accuracy_reward_staging": 0.9617187697440386,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.640625,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 588.84375,
"epoch": 312.8,
"grad_norm": 2.2241952673115684,
"kl": 0.574951171875,
"learning_rate": 2.7462562898771256e-07,
"loss": 0.0993,
"reward": 11.553124994039536,
"reward_std": 8.42781974375248,
"rewards/accuracy_reward_staging": 1.0303124962374568,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.578125,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 551.140625,
"epoch": 313.8,
"grad_norm": 10.414899131526484,
"kl": 0.54833984375,
"learning_rate": 2.6864629838082954e-07,
"loss": 0.0799,
"reward": 11.815624713897705,
"reward_std": 6.541705533862114,
"rewards/accuracy_reward_staging": 1.0659374967217445,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.5625,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 553.046875,
"epoch": 314.8,
"grad_norm": 47.0803214727609,
"kl": 0.74169921875,
"learning_rate": 2.62722663189876e-07,
"loss": 0.082,
"reward": 8.54062494635582,
"reward_std": 8.188691228628159,
"rewards/accuracy_reward_staging": 0.7603125041350722,
"rewards/format_reward": 0.453125,
"rewards/format_reward_staging": 0.484375,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 568.03125,
"epoch": 315.8,
"grad_norm": 1.697069435904432,
"kl": 0.52001953125,
"learning_rate": 2.568551745226056e-07,
"loss": 0.0445,
"reward": 11.301562577486038,
"reward_std": 7.986581727862358,
"rewards/accuracy_reward_staging": 0.9989062640815973,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.65625,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 604.609375,
"epoch": 316.8,
"grad_norm": 1.9224538433799612,
"kl": 0.5042724609375,
"learning_rate": 2.510442792109978e-07,
"loss": 0.1747,
"reward": 11.126562386751175,
"reward_std": 8.233518898487091,
"rewards/accuracy_reward_staging": 0.9829687615856528,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.59375,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 540.96875,
"epoch": 317.8,
"grad_norm": 1.8852768343458246,
"kl": 0.50634765625,
"learning_rate": 2.45290419777228e-07,
"loss": 0.0711,
"reward": 10.67187511920929,
"reward_std": 7.687072329223156,
"rewards/accuracy_reward_staging": 0.9312500208616257,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.671875,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 586.21875,
"epoch": 318.8,
"grad_norm": 1.9063098360507449,
"kl": 0.60400390625,
"learning_rate": 2.395940343999691e-07,
"loss": 0.072,
"reward": 9.948437467217445,
"reward_std": 8.150914326310158,
"rewards/accuracy_reward_staging": 0.8745312532410026,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.546875,
"step": 319
},
{
"epoch": 319.8,
"grad_norm": 2.0586826121884196,
"learning_rate": 2.339555568810221e-07,
"loss": 0.0995,
"step": 320
},
{
"epoch": 319.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 624.5875,
"eval_kl": 1.31884765625,
"eval_loss": 0.14124900102615356,
"eval_reward": 10.472500038146972,
"eval_reward_std": 8.316550683975219,
"eval_rewards/accuracy_reward_staging": 0.9397500105202198,
"eval_rewards/format_reward": 0.55,
"eval_rewards/format_reward_staging": 0.525,
"eval_runtime": 166.2792,
"eval_samples_per_second": 0.12,
"eval_steps_per_second": 0.03,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 531.609375,
"epoch": 320.8,
"grad_norm": 3.9996163379427734,
"kl": 0.682861328125,
"learning_rate": 2.283754166122802e-07,
"loss": 0.1015,
"reward": 11.93515631556511,
"reward_std": 7.991424214094877,
"rewards/accuracy_reward_staging": 1.0638281423598528,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.640625,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 536.390625,
"epoch": 321.8,
"grad_norm": 3.0819342365087095,
"kl": 0.537841796875,
"learning_rate": 2.228540385430291e-07,
"loss": 0.0322,
"reward": 10.235937386751175,
"reward_std": 7.312522612512112,
"rewards/accuracy_reward_staging": 0.9110937523655593,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.5625,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 564.328125,
"epoch": 322.8,
"grad_norm": 2.26675304418193,
"kl": 0.61669921875,
"learning_rate": 2.1739184314758607e-07,
"loss": 0.1112,
"reward": 9.662499949336052,
"reward_std": 7.8978844210505486,
"rewards/accuracy_reward_staging": 0.8537500146776438,
"rewards/format_reward": 0.578125,
"rewards/format_reward_staging": 0.546875,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 561.875,
"epoch": 323.8,
"grad_norm": 1.9954959778735344,
"kl": 0.582763671875,
"learning_rate": 2.1198924639327808e-07,
"loss": 0.1118,
"reward": 9.614062517881393,
"reward_std": 8.075859874486923,
"rewards/accuracy_reward_staging": 0.848906246945262,
"rewards/format_reward": 0.578125,
"rewards/format_reward_staging": 0.546875,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 513.0625,
"epoch": 324.8,
"grad_norm": 1.9291434012665167,
"kl": 0.508056640625,
"learning_rate": 2.0664665970876495e-07,
"loss": 0.09,
"reward": 11.331250041723251,
"reward_std": 7.7114517986774445,
"rewards/accuracy_reward_staging": 0.9878125172108412,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.6875,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 531.9375,
"epoch": 325.8,
"grad_norm": 2.130045177921353,
"kl": 0.6065673828125,
"learning_rate": 2.0136448995270738e-07,
"loss": 0.1138,
"reward": 9.910937532782555,
"reward_std": 7.951791629195213,
"rewards/accuracy_reward_staging": 0.8645312692970037,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.609375,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 590.3125,
"epoch": 326.8,
"grad_norm": 14.919466822673119,
"kl": 0.8441162109375,
"learning_rate": 1.961431393827827e-07,
"loss": 0.1238,
"reward": 9.217187523841858,
"reward_std": 7.790774069726467,
"rewards/accuracy_reward_staging": 0.7967187650501728,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.59375,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 581.578125,
"epoch": 327.8,
"grad_norm": 2.201816976112108,
"kl": 0.5943603515625,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.1018,
"reward": 10.471874967217445,
"reward_std": 7.630053393542767,
"rewards/accuracy_reward_staging": 0.9268750208429992,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.5625,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 556.71875,
"epoch": 328.8,
"grad_norm": 5.906945093072998,
"kl": 0.6585693359375,
"learning_rate": 1.8588448164368087e-07,
"loss": 0.0954,
"reward": 10.989062532782555,
"reward_std": 7.685741938650608,
"rewards/accuracy_reward_staging": 0.972343759611249,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.59375,
"step": 329
},
{
"epoch": 329.8,
"grad_norm": 2.0853287849886426,
"learning_rate": 1.8084795571100809e-07,
"loss": 0.0689,
"step": 330
},
{
"epoch": 329.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 538.2375,
"eval_kl": 0.81767578125,
"eval_loss": 0.1242348700761795,
"eval_reward": 11.619999825954437,
"eval_reward_std": 8.338389962911606,
"eval_rewards/accuracy_reward_staging": 1.0357500161975621,
"eval_rewards/format_reward": 0.6375,
"eval_rewards/format_reward_staging": 0.625,
"eval_runtime": 130.5157,
"eval_samples_per_second": 0.153,
"eval_steps_per_second": 0.038,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 552.21875,
"epoch": 330.8,
"grad_norm": 1.9411099886639303,
"kl": 0.55401611328125,
"learning_rate": 1.758738113779843e-07,
"loss": 0.1164,
"reward": 12.410937391221523,
"reward_std": 6.827382728457451,
"rewards/accuracy_reward_staging": 1.1129687398206443,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.625,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 550.75,
"epoch": 331.8,
"grad_norm": 2.1891389339166056,
"kl": 0.5709228515625,
"learning_rate": 1.7096242744495838e-07,
"loss": -0.0025,
"reward": 11.460937261581421,
"reward_std": 6.8272934183478355,
"rewards/accuracy_reward_staging": 1.0195312476716936,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.59375,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 524.5,
"epoch": 332.8,
"grad_norm": 2.0777537885395376,
"kl": 0.536376953125,
"learning_rate": 1.661141779328319e-07,
"loss": 0.0535,
"reward": 12.75,
"reward_std": 7.35980150103569,
"rewards/accuracy_reward_staging": 1.1328125018626451,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.703125,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 499.59375,
"epoch": 333.8,
"grad_norm": 4.1435358861337095,
"kl": 0.5528564453125,
"learning_rate": 1.6132943205457606e-07,
"loss": 0.0848,
"reward": 10.412499874830246,
"reward_std": 8.24751353263855,
"rewards/accuracy_reward_staging": 0.9053125167265534,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.640625,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 528.609375,
"epoch": 334.8,
"grad_norm": 2.837368202419978,
"kl": 0.573486328125,
"learning_rate": 1.566085541871145e-07,
"loss": 0.0901,
"reward": 13.03749993443489,
"reward_std": 7.35784338414669,
"rewards/accuracy_reward_staging": 1.1756250127218664,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.640625,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 536.09375,
"epoch": 335.8,
"grad_norm": 1.9595695246588565,
"kl": 0.5914306640625,
"learning_rate": 1.5195190384357404e-07,
"loss": 0.0832,
"reward": 9.493749916553497,
"reward_std": 6.751585811376572,
"rewards/accuracy_reward_staging": 0.8478124998509884,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.46875,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 519.390625,
"epoch": 336.8,
"grad_norm": 2.110347695686011,
"kl": 0.519287109375,
"learning_rate": 1.473598356459078e-07,
"loss": 0.0886,
"reward": 12.745312631130219,
"reward_std": 7.440838478505611,
"rewards/accuracy_reward_staging": 1.122968764975667,
"rewards/format_reward": 0.796875,
"rewards/format_reward_staging": 0.71875,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 547.65625,
"epoch": 337.8,
"grad_norm": 1.727304431939535,
"kl": 0.4608154296875,
"learning_rate": 1.4283269929788776e-07,
"loss": 0.0791,
"reward": 12.464062497019768,
"reward_std": 6.970313638448715,
"rewards/accuracy_reward_staging": 1.1073437612503767,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.6875,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 516.578125,
"epoch": 338.8,
"grad_norm": 45.73313544647479,
"kl": 0.8197021484375,
"learning_rate": 1.3837083955847417e-07,
"loss": 0.102,
"reward": 11.170312657952309,
"reward_std": 7.535245016217232,
"rewards/accuracy_reward_staging": 0.9779687668196857,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.671875,
"step": 339
},
{
"epoch": 339.8,
"grad_norm": 2.6477795175870518,
"learning_rate": 1.3397459621556128e-07,
"loss": 0.1162,
"step": 340
},
{
"epoch": 339.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 529.1375,
"eval_kl": 23.68701171875,
"eval_loss": 1.2650219202041626,
"eval_reward": 10.880000054836273,
"eval_reward_std": 7.727067697048187,
"eval_rewards/accuracy_reward_staging": 0.9605000212788581,
"eval_rewards/format_reward": 0.65,
"eval_rewards/format_reward_staging": 0.625,
"eval_runtime": 120.192,
"eval_samples_per_second": 0.166,
"eval_steps_per_second": 0.042,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 560.4375,
"epoch": 340.8,
"grad_norm": 2.062257482287411,
"kl": 0.56280517578125,
"learning_rate": 1.296443040601003e-07,
"loss": 0.0435,
"reward": 10.376562610268593,
"reward_std": 8.002589859068394,
"rewards/accuracy_reward_staging": 0.9189062397927046,
"rewards/format_reward": 0.6171875,
"rewards/format_reward_staging": 0.5703125,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 539.75,
"epoch": 341.8,
"grad_norm": 2.2745767425023513,
"kl": 0.53656005859375,
"learning_rate": 1.2538029286060424e-07,
"loss": 0.0623,
"reward": 11.88906241953373,
"reward_std": 8.809318155050278,
"rewards/accuracy_reward_staging": 1.0607812739908695,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.609375,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 537.71875,
"epoch": 342.8,
"grad_norm": 1.9073471200149112,
"kl": 0.48541259765625,
"learning_rate": 1.2118288733803472e-07,
"loss": 0.1154,
"reward": 14.631249979138374,
"reward_std": 6.283873476088047,
"rewards/accuracy_reward_staging": 1.3256250023841858,
"rewards/format_reward": 0.703125,
"rewards/format_reward_staging": 0.671875,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 548.6875,
"epoch": 343.8,
"grad_norm": 2.2191165734186025,
"kl": 0.6094970703125,
"learning_rate": 1.1705240714107301e-07,
"loss": 0.1021,
"reward": 9.640625074505806,
"reward_std": 8.212526381015778,
"rewards/accuracy_reward_staging": 0.8593750139698386,
"rewards/format_reward": 0.53125,
"rewards/format_reward_staging": 0.515625,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 513.8125,
"epoch": 344.8,
"grad_norm": 1.9854032388069334,
"kl": 0.523681640625,
"learning_rate": 1.1298916682177828e-07,
"loss": 0.0055,
"reward": 14.028124868869781,
"reward_std": 6.586119674146175,
"rewards/accuracy_reward_staging": 1.2528125252574682,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.734375,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 538.0,
"epoch": 345.8,
"grad_norm": 1.922770111948013,
"kl": 0.4796142578125,
"learning_rate": 1.089934758116322e-07,
"loss": 0.0353,
"reward": 13.531249672174454,
"reward_std": 7.330604811664671,
"rewards/accuracy_reward_staging": 1.2109374832361937,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.703125,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 556.25,
"epoch": 346.8,
"grad_norm": 1.9617536727014244,
"kl": 0.6065673828125,
"learning_rate": 1.05065638397975e-07,
"loss": 0.0607,
"reward": 10.37343730032444,
"reward_std": 7.6566493809223175,
"rewards/accuracy_reward_staging": 0.9264062475413084,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.546875,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 514.421875,
"epoch": 347.8,
"grad_norm": 5.910623715329322,
"kl": 0.586669921875,
"learning_rate": 1.0120595370083318e-07,
"loss": 0.0441,
"reward": 10.33906227350235,
"reward_std": 5.6453575268387794,
"rewards/accuracy_reward_staging": 0.9151562377810478,
"rewards/format_reward": 0.578125,
"rewards/format_reward_staging": 0.609375,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 505.5625,
"epoch": 348.8,
"grad_norm": 2.2977218203304983,
"kl": 0.5316162109375,
"learning_rate": 9.741471565013958e-08,
"loss": 0.0605,
"reward": 12.892187714576721,
"reward_std": 7.930499374866486,
"rewards/accuracy_reward_staging": 1.1439062617719173,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.703125,
"step": 349
},
{
"epoch": 349.8,
"grad_norm": 1.8861216023880358,
"learning_rate": 9.369221296335006e-08,
"loss": 0.1235,
"step": 350
},
{
"epoch": 349.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 520.25,
"eval_kl": 0.54873046875,
"eval_loss": 0.06522668898105621,
"eval_reward": 12.468749737739563,
"eval_reward_std": 7.662279307842255,
"eval_rewards/accuracy_reward_staging": 1.1093750052154063,
"eval_rewards/format_reward": 0.725,
"eval_rewards/format_reward_staging": 0.65,
"eval_runtime": 127.6467,
"eval_samples_per_second": 0.157,
"eval_steps_per_second": 0.039,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 525.09375,
"epoch": 350.8,
"grad_norm": 2.5663009167562287,
"kl": 0.54559326171875,
"learning_rate": 9.003872912345689e-08,
"loss": 0.0985,
"reward": 10.989843875169754,
"reward_std": 7.33668365329504,
"rewards/accuracy_reward_staging": 0.9677343829534948,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.640625,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 566.671875,
"epoch": 351.8,
"grad_norm": 3.306328897391672,
"kl": 0.5091552734375,
"learning_rate": 8.645454235739902e-08,
"loss": 0.11,
"reward": 11.37343755364418,
"reward_std": 7.7504191398620605,
"rewards/accuracy_reward_staging": 1.009218767285347,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.609375,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 525.484375,
"epoch": 352.8,
"grad_norm": 1.979792175196284,
"kl": 0.520751953125,
"learning_rate": 8.293992561487595e-08,
"loss": 0.034,
"reward": 10.399999901652336,
"reward_std": 6.3909139558672905,
"rewards/accuracy_reward_staging": 0.9212500043213367,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.5625,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 505.015625,
"epoch": 353.8,
"grad_norm": 94.2447409316946,
"kl": 2.2459716796875,
"learning_rate": 7.949514654755962e-08,
"loss": 0.1011,
"reward": 10.320312514901161,
"reward_std": 6.939793795347214,
"rewards/accuracy_reward_staging": 0.9054687642492354,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.609375,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 551.140625,
"epoch": 354.8,
"grad_norm": 2.0400310680209444,
"kl": 0.5252685546875,
"learning_rate": 7.612046748871326e-08,
"loss": 0.0919,
"reward": 10.507812529802322,
"reward_std": 6.574328362941742,
"rewards/accuracy_reward_staging": 0.9257812593132257,
"rewards/format_reward": 0.625,
"rewards/format_reward_staging": 0.625,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 509.78125,
"epoch": 355.8,
"grad_norm": 10.548554530431085,
"kl": 0.63427734375,
"learning_rate": 7.281614543321269e-08,
"loss": 0.0484,
"reward": 12.018749877810478,
"reward_std": 7.941742122173309,
"rewards/accuracy_reward_staging": 1.0721874982118607,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.625,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 496.859375,
"epoch": 356.8,
"grad_norm": 2.9275259721912263,
"kl": 0.5072021484375,
"learning_rate": 6.958243201797553e-08,
"loss": 0.0034,
"reward": 13.354687303304672,
"reward_std": 6.157866388559341,
"rewards/accuracy_reward_staging": 1.1901562474668026,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.734375,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 497.078125,
"epoch": 357.8,
"grad_norm": 3.259670897497323,
"kl": 0.56689453125,
"learning_rate": 6.641957350279837e-08,
"loss": 0.1212,
"reward": 10.450000002980232,
"reward_std": 8.285482600331306,
"rewards/accuracy_reward_staging": 0.9262500102631748,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.59375,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 586.21875,
"epoch": 358.8,
"grad_norm": 3.6157884851011115,
"kl": 0.7269287109375,
"learning_rate": 6.332781075160243e-08,
"loss": 0.1011,
"reward": 8.88906255364418,
"reward_std": 8.714880511164665,
"rewards/accuracy_reward_staging": 0.7857812475413084,
"rewards/format_reward": 0.53125,
"rewards/format_reward_staging": 0.5,
"step": 359
},
{
"epoch": 359.8,
"grad_norm": 3.272297448604345,
"learning_rate": 6.030737921409168e-08,
"loss": 0.0793,
"step": 360
},
{
"epoch": 359.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 534.5875,
"eval_kl": 0.5283203125,
"eval_loss": 0.10396303236484528,
"eval_reward": 10.23999993801117,
"eval_reward_std": 7.775251030921936,
"eval_rewards/accuracy_reward_staging": 0.9027499988675117,
"eval_rewards/format_reward": 0.6375,
"eval_rewards/format_reward_staging": 0.575,
"eval_runtime": 125.3052,
"eval_samples_per_second": 0.16,
"eval_steps_per_second": 0.04,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 529.671875,
"epoch": 360.8,
"grad_norm": 2.443851018491839,
"kl": 0.60870361328125,
"learning_rate": 5.735850890782157e-08,
"loss": 0.0695,
"reward": 11.39765627682209,
"reward_std": 7.690338987857103,
"rewards/accuracy_reward_staging": 1.0139843788929284,
"rewards/format_reward": 0.6328125,
"rewards/format_reward_staging": 0.625,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 525.890625,
"epoch": 361.8,
"grad_norm": 4.571161669605146,
"kl": 0.7027587890625,
"learning_rate": 5.448142440068315e-08,
"loss": 0.0926,
"reward": 12.457812368869781,
"reward_std": 8.532767742872238,
"rewards/accuracy_reward_staging": 1.097343759611249,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.71875,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 540.40625,
"epoch": 362.8,
"grad_norm": 2.6127548669423333,
"kl": 0.56591796875,
"learning_rate": 5.1676344793800675e-08,
"loss": 0.1056,
"reward": 10.995312467217445,
"reward_std": 7.668447345495224,
"rewards/accuracy_reward_staging": 0.9682812597602606,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.65625,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 518.546875,
"epoch": 363.8,
"grad_norm": 2.0552105230802944,
"kl": 0.52197265625,
"learning_rate": 4.8943483704846465e-08,
"loss": 0.1321,
"reward": 11.596875041723251,
"reward_std": 8.213591203093529,
"rewards/accuracy_reward_staging": 1.0190625078976154,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.6875,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 505.765625,
"epoch": 364.8,
"grad_norm": 2.326851393730122,
"kl": 0.5279541015625,
"learning_rate": 4.6283049251773176e-08,
"loss": 0.0553,
"reward": 11.739062711596489,
"reward_std": 7.672401025891304,
"rewards/accuracy_reward_staging": 1.0348437502980232,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.640625,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 549.078125,
"epoch": 365.8,
"grad_norm": 5.111208968520272,
"kl": 0.6072998046875,
"learning_rate": 4.3695244036964564e-08,
"loss": 0.082,
"reward": 9.354687571525574,
"reward_std": 7.511838540434837,
"rewards/accuracy_reward_staging": 0.8167187636718154,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.578125,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 517.296875,
"epoch": 366.8,
"grad_norm": 2.581279648987323,
"kl": 0.5887451171875,
"learning_rate": 4.1180265131806946e-08,
"loss": 0.0205,
"reward": 10.934374988079071,
"reward_std": 8.458094909787178,
"rewards/accuracy_reward_staging": 0.9778125118464231,
"rewards/format_reward": 0.59375,
"rewards/format_reward_staging": 0.5625,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 518.40625,
"epoch": 367.8,
"grad_norm": 2.035652649324917,
"kl": 0.569091796875,
"learning_rate": 3.87383040616811e-08,
"loss": 0.0615,
"reward": 11.773437261581421,
"reward_std": 7.848160028457642,
"rewards/accuracy_reward_staging": 1.0507812481373549,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.609375,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 568.703125,
"epoch": 368.8,
"grad_norm": 2.8704472982245526,
"kl": 0.57958984375,
"learning_rate": 3.636954679137705e-08,
"loss": 0.081,
"reward": 9.676562517881393,
"reward_std": 8.41864463686943,
"rewards/accuracy_reward_staging": 0.8692187499254942,
"rewards/format_reward": 0.515625,
"rewards/format_reward_staging": 0.46875,
"step": 369
},
{
"epoch": 369.8,
"grad_norm": 8.688192162327972,
"learning_rate": 3.4074173710931796e-08,
"loss": 0.092,
"step": 370
},
{
"epoch": 369.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 537.7875,
"eval_kl": 0.5083984375,
"eval_loss": 0.07736475765705109,
"eval_reward": 11.122499930858613,
"eval_reward_std": 6.3223115285858515,
"eval_rewards/accuracy_reward_staging": 0.989750000461936,
"eval_rewards/format_reward": 0.6375,
"eval_rewards/format_reward_staging": 0.5875,
"eval_runtime": 137.4717,
"eval_samples_per_second": 0.145,
"eval_steps_per_second": 0.036,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 548.3203125,
"epoch": 370.8,
"grad_norm": 2.9935915085575524,
"kl": 0.7093505859375,
"learning_rate": 3.185235962189237e-08,
"loss": 0.086,
"reward": 10.013281270861626,
"reward_std": 7.9125730618834496,
"rewards/accuracy_reward_staging": 0.892734372522682,
"rewards/format_reward": 0.5625,
"rewards/format_reward_staging": 0.5234375,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 507.0,
"epoch": 371.8,
"grad_norm": 2.2371928274108455,
"kl": 0.517822265625,
"learning_rate": 2.9704273724003526e-08,
"loss": 0.0676,
"reward": 11.292187675833702,
"reward_std": 6.194611236453056,
"rewards/accuracy_reward_staging": 1.0042187473736703,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.59375,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 559.84375,
"epoch": 372.8,
"grad_norm": 2.531647136891463,
"kl": 0.52001953125,
"learning_rate": 2.7630079602323443e-08,
"loss": 0.1587,
"reward": 10.668749883770943,
"reward_std": 8.836217179894447,
"rewards/accuracy_reward_staging": 0.9575000163167715,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.546875,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 503.234375,
"epoch": 373.8,
"grad_norm": 1.8905690009692866,
"kl": 0.56494140625,
"learning_rate": 2.5629935214764864e-08,
"loss": 0.0793,
"reward": 10.550000071525574,
"reward_std": 7.391293793916702,
"rewards/accuracy_reward_staging": 0.931562501937151,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.59375,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 518.109375,
"epoch": 374.8,
"grad_norm": 4.31839552805722,
"kl": 0.7208251953125,
"learning_rate": 2.3703992880066636e-08,
"loss": 0.0576,
"reward": 11.893750101327896,
"reward_std": 7.522340267896652,
"rewards/accuracy_reward_staging": 1.0612500254064798,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.625,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 560.953125,
"epoch": 375.8,
"grad_norm": 2.2893855348554824,
"kl": 0.4915771484375,
"learning_rate": 2.185239926619431e-08,
"loss": 0.0645,
"reward": 11.574999958276749,
"reward_std": 8.68027800321579,
"rewards/accuracy_reward_staging": 1.0246875025331974,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.65625,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 503.046875,
"epoch": 376.8,
"grad_norm": 2.0521716339261573,
"kl": 0.5130615234375,
"learning_rate": 2.007529537917041e-08,
"loss": 0.0747,
"reward": 11.942187368869781,
"reward_std": 6.6959647461771965,
"rewards/accuracy_reward_staging": 1.0660937773063779,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.640625,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 544.703125,
"epoch": 377.8,
"grad_norm": 2.6744855588179597,
"kl": 0.6314697265625,
"learning_rate": 1.8372816552336023e-08,
"loss": 0.0892,
"reward": 10.899999856948853,
"reward_std": 7.818521216511726,
"rewards/accuracy_reward_staging": 0.957187520340085,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.640625,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 536.0,
"epoch": 378.8,
"grad_norm": 2.1993782801564827,
"kl": 0.5281982421875,
"learning_rate": 1.6745092436045492e-08,
"loss": 0.09,
"reward": 10.765625104308128,
"reward_std": 9.52804271876812,
"rewards/accuracy_reward_staging": 0.9468750059604645,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.625,
"step": 379
},
{
"epoch": 379.8,
"grad_norm": 2.850728275624988,
"learning_rate": 1.519224698779198e-08,
"loss": 0.0848,
"step": 380
},
{
"epoch": 379.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 504.675,
"eval_kl": 0.4578125,
"eval_loss": 0.07416948676109314,
"eval_reward": 11.396249985694885,
"eval_reward_std": 8.601353228092194,
"eval_rewards/accuracy_reward_staging": 1.0146250143647193,
"eval_rewards/format_reward": 0.6375,
"eval_rewards/format_reward_staging": 0.6125,
"eval_runtime": 121.4493,
"eval_samples_per_second": 0.165,
"eval_steps_per_second": 0.041,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 510.578125,
"epoch": 380.8,
"grad_norm": 2.9736298141622295,
"kl": 0.55657958984375,
"learning_rate": 1.3714398462768562e-08,
"loss": 0.0869,
"reward": 12.212500043213367,
"reward_std": 7.73147202283144,
"rewards/accuracy_reward_staging": 1.0837499964982271,
"rewards/format_reward": 0.6953125,
"rewards/format_reward_staging": 0.6796875,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 522.1875,
"epoch": 381.8,
"grad_norm": 2.0373989754015125,
"kl": 0.5072021484375,
"learning_rate": 1.231165940486234e-08,
"loss": 0.034,
"reward": 12.49843743443489,
"reward_std": 6.0733470767736435,
"rewards/accuracy_reward_staging": 1.1014062575995922,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.75,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 498.09375,
"epoch": 382.8,
"grad_norm": 2.0529799885966997,
"kl": 0.5218505859375,
"learning_rate": 1.0984136638083175e-08,
"loss": 0.0674,
"reward": 12.098437368869781,
"reward_std": 7.62692953646183,
"rewards/accuracy_reward_staging": 1.0817187586799264,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.640625,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 523.203125,
"epoch": 383.8,
"grad_norm": 2.0301601145066495,
"kl": 0.521240234375,
"learning_rate": 9.731931258429638e-09,
"loss": 0.059,
"reward": 10.729687452316284,
"reward_std": 5.984116218984127,
"rewards/accuracy_reward_staging": 0.9464062480255961,
"rewards/format_reward": 0.671875,
"rewards/format_reward_staging": 0.59375,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 502.109375,
"epoch": 384.8,
"grad_norm": 4.517612366443085,
"kl": 0.602294921875,
"learning_rate": 8.555138626189618e-09,
"loss": 0.0919,
"reward": 10.614062532782555,
"reward_std": 7.197298094630241,
"rewards/accuracy_reward_staging": 0.9317187555134296,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.640625,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 531.5625,
"epoch": 385.8,
"grad_norm": 1.996721078816804,
"kl": 0.542236328125,
"learning_rate": 7.453848358678017e-09,
"loss": 0.0893,
"reward": 12.423437505960464,
"reward_std": 7.174239456653595,
"rewards/accuracy_reward_staging": 1.0970312654972076,
"rewards/format_reward": 0.765625,
"rewards/format_reward_staging": 0.6875,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 529.5,
"epoch": 386.8,
"grad_norm": 1.6692450152429934,
"kl": 0.4757080078125,
"learning_rate": 6.4281443234125434e-09,
"loss": 0.0622,
"reward": 10.457812458276749,
"reward_std": 8.408779114484787,
"rewards/accuracy_reward_staging": 0.9207812584936619,
"rewards/format_reward": 0.6875,
"rewards/format_reward_staging": 0.5625,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 516.28125,
"epoch": 387.8,
"grad_norm": 72.65003366147363,
"kl": 1.12158203125,
"learning_rate": 5.47810463172671e-09,
"loss": 0.0677,
"reward": 14.623437404632568,
"reward_std": 7.604200206696987,
"rewards/accuracy_reward_staging": 1.3154687583446503,
"rewards/format_reward": 0.75,
"rewards/format_reward_staging": 0.71875,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 540.546875,
"epoch": 388.8,
"grad_norm": 2.13782688418701,
"kl": 0.5604248046875,
"learning_rate": 4.603801632821147e-09,
"loss": 0.0741,
"reward": 11.571875154972076,
"reward_std": 8.498483955860138,
"rewards/accuracy_reward_staging": 1.0556250140070915,
"rewards/format_reward": 0.515625,
"rewards/format_reward_staging": 0.5,
"step": 389
},
{
"epoch": 389.8,
"grad_norm": 2.550655354845835,
"learning_rate": 3.805301908254455e-09,
"loss": 0.0785,
"step": 390
},
{
"epoch": 389.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 543.7,
"eval_kl": 0.77880859375,
"eval_loss": 0.05070864409208298,
"eval_reward": 11.214999914169312,
"eval_reward_std": 7.508172661066055,
"eval_rewards/accuracy_reward_staging": 1.0102500110864638,
"eval_rewards/format_reward": 0.575,
"eval_rewards/format_reward_staging": 0.5375,
"eval_runtime": 147.1522,
"eval_samples_per_second": 0.136,
"eval_steps_per_second": 0.034,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 514.140625,
"epoch": 390.8,
"grad_norm": 2.7881900199384537,
"kl": 0.58929443359375,
"learning_rate": 3.082666266872036e-09,
"loss": 0.0336,
"reward": 12.8234374076128,
"reward_std": 7.5515576638281345,
"rewards/accuracy_reward_staging": 1.1495312573388219,
"rewards/format_reward": 0.6640625,
"rewards/format_reward_staging": 0.6640625,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 524.25,
"epoch": 391.8,
"grad_norm": 2.166883481802826,
"kl": 0.585693359375,
"learning_rate": 2.435949740175802e-09,
"loss": 0.0693,
"reward": 9.915624856948853,
"reward_std": 7.9057832062244415,
"rewards/accuracy_reward_staging": 0.8790625054389238,
"rewards/format_reward": 0.578125,
"rewards/format_reward_staging": 0.546875,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 558.03125,
"epoch": 392.8,
"grad_norm": 2.013874424977512,
"kl": 0.5469970703125,
"learning_rate": 1.86520157813308e-09,
"loss": -0.0047,
"reward": 10.384374856948853,
"reward_std": 7.076003402471542,
"rewards/accuracy_reward_staging": 0.9103124821558595,
"rewards/format_reward": 0.65625,
"rewards/format_reward_staging": 0.625,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 573.859375,
"epoch": 393.8,
"grad_norm": 4.782888144712166,
"kl": 0.7337646484375,
"learning_rate": 1.3704652454261667e-09,
"loss": 0.0785,
"reward": 10.860937401652336,
"reward_std": 8.005559802055359,
"rewards/accuracy_reward_staging": 0.9782812423072755,
"rewards/format_reward": 0.546875,
"rewards/format_reward_staging": 0.53125,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 509.453125,
"epoch": 394.8,
"grad_norm": 5.775714842908385,
"kl": 0.88623046875,
"learning_rate": 9.517784181422018e-10,
"loss": 0.0558,
"reward": 10.265624895691872,
"reward_std": 7.404541149735451,
"rewards/accuracy_reward_staging": 0.9109375043772161,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.546875,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 543.453125,
"epoch": 395.8,
"grad_norm": 2.0434917013326266,
"kl": 0.5186767578125,
"learning_rate": 6.091729809042379e-10,
"loss": 0.0542,
"reward": 11.81874991953373,
"reward_std": 9.100275874137878,
"rewards/accuracy_reward_staging": 1.0662500150501728,
"rewards/format_reward": 0.609375,
"rewards/format_reward_staging": 0.546875,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 546.1875,
"epoch": 396.8,
"grad_norm": 1.7418802984126296,
"kl": 0.4542236328125,
"learning_rate": 3.426750244427401e-10,
"loss": 0.07,
"reward": 12.928124815225601,
"reward_std": 7.5549889877438545,
"rewards/accuracy_reward_staging": 1.1490625096485019,
"rewards/format_reward": 0.71875,
"rewards/format_reward_staging": 0.71875,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 492.109375,
"epoch": 397.8,
"grad_norm": 3.724982054081198,
"kl": 0.7119140625,
"learning_rate": 1.5230484360873042e-10,
"loss": 0.0322,
"reward": 12.485937371850014,
"reward_std": 7.725762560963631,
"rewards/accuracy_reward_staging": 1.1079687606543303,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.671875,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 550.203125,
"epoch": 398.8,
"grad_norm": 7.1968629953818315,
"kl": 0.607421875,
"learning_rate": 3.8076935828690315e-11,
"loss": 0.1134,
"reward": 9.825000047683716,
"reward_std": 8.716731041669846,
"rewards/accuracy_reward_staging": 0.8637500181794167,
"rewards/format_reward": 0.640625,
"rewards/format_reward_staging": 0.546875,
"step": 399
},
{
"epoch": 399.8,
"grad_norm": 2.578825219229897,
"learning_rate": 0.0,
"loss": 0.0625,
"step": 400
},
{
"epoch": 399.8,
"eval_clip_ratio": 0.0,
"eval_completion_length": 564.25,
"eval_kl": 0.541796875,
"eval_loss": 0.1237805113196373,
"eval_reward": 12.454999792575837,
"eval_reward_std": 7.5952778339385985,
"eval_rewards/accuracy_reward_staging": 1.1054999977350235,
"eval_rewards/format_reward": 0.7125,
"eval_rewards/format_reward_staging": 0.6875,
"eval_runtime": 144.5952,
"eval_samples_per_second": 0.138,
"eval_steps_per_second": 0.035,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 498.03125,
"epoch": 399.8,
"kl": 0.4832763671875,
"reward": 14.412499785423279,
"reward_std": 7.106586746871471,
"rewards/accuracy_reward_staging": 1.294375006109476,
"rewards/format_reward": 0.734375,
"rewards/format_reward_staging": 0.734375,
"step": 400,
"total_flos": 0.0,
"train_loss": 1.0999555667603271,
"train_runtime": 38247.6092,
"train_samples_per_second": 0.209,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 400,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}