AgPerry's picture
Training in progress, step 100, checkpoint
53dfe81 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.007072135785007072,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 228.94644165039062,
"epoch": 7.072135785007072e-05,
"grad_norm": 5.001040503294373,
"kl": 0.0,
"learning_rate": 9.999961446907352e-07,
"loss": -0.0,
"reward": 1.427711844444275,
"reward_std": 0.43736881017684937,
"rewards/accuracy_reward": 0.5027117729187012,
"rewards/format_reward": 0.7410714626312256,
"step": 1,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 243.33929443359375,
"epoch": 0.00014144271570014144,
"grad_norm": 2.8203915378362665,
"kl": 0.000698089599609375,
"learning_rate": 9.999845788223948e-07,
"loss": 0.0,
"reward": 0.9055423736572266,
"reward_std": 0.4032002389431,
"rewards/accuracy_reward": 0.1912565976381302,
"rewards/format_reward": 0.6964285969734192,
"step": 2,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 304.08929443359375,
"epoch": 0.00021216407355021216,
"grad_norm": 3.229814928119297,
"kl": 0.000713348388671875,
"learning_rate": 9.999653025733385e-07,
"loss": 0.0,
"reward": 1.260606288909912,
"reward_std": 0.4095536172389984,
"rewards/accuracy_reward": 0.29989194869995117,
"rewards/format_reward": 0.848214328289032,
"step": 3,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 269.51788330078125,
"epoch": 0.0002828854314002829,
"grad_norm": 2.4757985886885496,
"kl": 0.001068115234375,
"learning_rate": 9.999383162408303e-07,
"loss": 0.0,
"reward": 1.1197317838668823,
"reward_std": 0.2747226357460022,
"rewards/accuracy_reward": 0.23401744663715363,
"rewards/format_reward": 0.8035714626312256,
"step": 4,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.7142857142857143,
"completion_length": 265.3214416503906,
"epoch": 0.0003536067892503536,
"grad_norm": 5.853654137415192,
"kl": 0.001983642578125,
"learning_rate": 9.999036202410323e-07,
"loss": 0.0001,
"reward": 0.9350484013557434,
"reward_std": 0.36939921975135803,
"rewards/accuracy_reward": 0.22254842519760132,
"rewards/format_reward": 0.6696428656578064,
"step": 5,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 308.64288330078125,
"epoch": 0.0004243281471004243,
"grad_norm": 4.2687495467742265,
"kl": 0.0017852783203125,
"learning_rate": 9.998612151090002e-07,
"loss": 0.0001,
"reward": 1.3212175369262695,
"reward_std": 0.21474608778953552,
"rewards/accuracy_reward": 0.4033604562282562,
"rewards/format_reward": 0.910714328289032,
"step": 6,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 269.9464416503906,
"epoch": 0.0004950495049504951,
"grad_norm": 1.8718838996670093,
"kl": 0.003631591796875,
"learning_rate": 9.998111014986734e-07,
"loss": 0.0001,
"reward": 1.1820130348205566,
"reward_std": 0.40140631794929504,
"rewards/accuracy_reward": 0.3248700797557831,
"rewards/format_reward": 0.8392857313156128,
"step": 7,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 261.5357360839844,
"epoch": 0.0005657708628005657,
"grad_norm": 33.10783433713444,
"kl": 0.0037689208984375,
"learning_rate": 9.997532801828658e-07,
"loss": 0.0002,
"reward": 1.4435728788375854,
"reward_std": 0.2840394377708435,
"rewards/accuracy_reward": 0.4275014102458954,
"rewards/format_reward": 0.9375000596046448,
"step": 8,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 229.94644165039062,
"epoch": 0.0006364922206506365,
"grad_norm": 2.322068277450259,
"kl": 0.00531005859375,
"learning_rate": 9.996877520532534e-07,
"loss": 0.0002,
"reward": 1.2075822353363037,
"reward_std": 0.12404467165470123,
"rewards/accuracy_reward": 0.17365358769893646,
"rewards/format_reward": 0.9910714626312256,
"step": 9,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 302.5,
"epoch": 0.0007072135785007072,
"grad_norm": 1.903621240007123,
"kl": 0.002532958984375,
"learning_rate": 9.996145181203615e-07,
"loss": 0.0001,
"reward": 1.2701517343521118,
"reward_std": 0.18076138198375702,
"rewards/accuracy_reward": 0.2558659315109253,
"rewards/format_reward": 0.910714328289032,
"step": 10,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 219.00001525878906,
"epoch": 0.0007779349363507779,
"grad_norm": 4.3082825593447485,
"kl": 0.00830078125,
"learning_rate": 9.995335795135475e-07,
"loss": 0.0003,
"reward": 1.4937142133712769,
"reward_std": 0.1519784927368164,
"rewards/accuracy_reward": 0.4294286072254181,
"rewards/format_reward": 0.9642857313156128,
"step": 11,
"temporal_rewards": 0.5
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 166.7857208251953,
"epoch": 0.0008486562942008486,
"grad_norm": 1.7997888001626345,
"kl": 0.01519775390625,
"learning_rate": 9.99444937480985e-07,
"loss": 0.0006,
"reward": 1.480262279510498,
"reward_std": 0.17392753064632416,
"rewards/accuracy_reward": 0.4463335871696472,
"rewards/format_reward": 1.0,
"step": 12,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 279.6071472167969,
"epoch": 0.0009193776520509194,
"grad_norm": 11.803394899600724,
"kl": 0.00799560546875,
"learning_rate": 9.993485933896437e-07,
"loss": 0.0003,
"reward": 1.4244685173034668,
"reward_std": 0.19528640806674957,
"rewards/accuracy_reward": 0.23161137104034424,
"rewards/format_reward": 0.9910714626312256,
"step": 13,
"temporal_rewards": 0.7857142686843872
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 237.6428680419922,
"epoch": 0.0009900990099009901,
"grad_norm": 1.8875797724521384,
"kl": 0.01171875,
"learning_rate": 9.99244548725269e-07,
"loss": 0.0005,
"reward": 1.4068427085876465,
"reward_std": 0.26988983154296875,
"rewards/accuracy_reward": 0.3711283504962921,
"rewards/format_reward": 0.9821429252624512,
"step": 14,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.42857142857142855,
"completion_length": 240.3035888671875,
"epoch": 0.0010608203677510608,
"grad_norm": 2.3109230196325914,
"kl": 0.015380859375,
"learning_rate": 9.99132805092358e-07,
"loss": 0.0006,
"reward": 1.1580581665039062,
"reward_std": 0.13049285113811493,
"rewards/accuracy_reward": 0.07948664575815201,
"rewards/format_reward": 0.9821429252624512,
"step": 15,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 293.4107360839844,
"epoch": 0.0011315417256011315,
"grad_norm": 4.0653699922309885,
"kl": 0.007232666015625,
"learning_rate": 9.990133642141357e-07,
"loss": 0.0003,
"reward": 1.4591931104660034,
"reward_std": 0.11937069892883301,
"rewards/accuracy_reward": 0.46455028653144836,
"rewards/format_reward": 0.973214328289032,
"step": 16,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 209.5178680419922,
"epoch": 0.0012022630834512022,
"grad_norm": 3.010805401903002,
"kl": 0.006134033203125,
"learning_rate": 9.988862279325284e-07,
"loss": 0.0002,
"reward": 1.2082678079605103,
"reward_std": 0.18017704784870148,
"rewards/accuracy_reward": 0.1743391901254654,
"rewards/format_reward": 0.9821429252624512,
"step": 17,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 304.6607360839844,
"epoch": 0.001272984441301273,
"grad_norm": 2.8112272464882273,
"kl": 0.0086669921875,
"learning_rate": 9.98751398208135e-07,
"loss": 0.0003,
"reward": 1.2044445276260376,
"reward_std": 0.12818890810012817,
"rewards/accuracy_reward": 0.24908724427223206,
"rewards/format_reward": 0.848214328289032,
"step": 18,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 245.10714721679688,
"epoch": 0.0013437057991513438,
"grad_norm": 4.69936573628269,
"kl": 0.00701904296875,
"learning_rate": 9.986088771201963e-07,
"loss": 0.0003,
"reward": 1.3298383951187134,
"reward_std": 0.19001968204975128,
"rewards/accuracy_reward": 0.37983840703964233,
"rewards/format_reward": 0.9285714626312256,
"step": 19,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 268.3571472167969,
"epoch": 0.0014144271570014145,
"grad_norm": 5.228045202997088,
"kl": 0.00811767578125,
"learning_rate": 9.98458666866564e-07,
"loss": 0.0003,
"reward": 1.3763288259506226,
"reward_std": 0.16895097494125366,
"rewards/accuracy_reward": 0.2674001157283783,
"rewards/format_reward": 0.9910714626312256,
"step": 20,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 289.21429443359375,
"epoch": 0.0014851485148514852,
"grad_norm": 1.991182727557945,
"kl": 0.007293701171875,
"learning_rate": 9.983007697636658e-07,
"loss": 0.0003,
"reward": 1.26073157787323,
"reward_std": 0.242427796125412,
"rewards/accuracy_reward": 0.23573148250579834,
"rewards/format_reward": 1.0,
"step": 21,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 302.375,
"epoch": 0.0015558698727015559,
"grad_norm": 2.228743183846523,
"kl": 0.0084228515625,
"learning_rate": 9.981351882464707e-07,
"loss": 0.0003,
"reward": 1.2869592905044556,
"reward_std": 0.3141626715660095,
"rewards/accuracy_reward": 0.2548163831233978,
"rewards/format_reward": 0.9910714626312256,
"step": 22,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 316.01788330078125,
"epoch": 0.0016265912305516265,
"grad_norm": 3.8272666470851493,
"kl": 0.0081787109375,
"learning_rate": 9.979619248684501e-07,
"loss": 0.0003,
"reward": 1.33269202709198,
"reward_std": 0.13754969835281372,
"rewards/accuracy_reward": 0.3916205167770386,
"rewards/format_reward": 0.8571429252624512,
"step": 23,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 308.3571472167969,
"epoch": 0.0016973125884016972,
"grad_norm": 2.7805250038590934,
"kl": 0.0098876953125,
"learning_rate": 9.9778098230154e-07,
"loss": 0.0004,
"reward": 1.541878581047058,
"reward_std": 0.16751347482204437,
"rewards/accuracy_reward": 0.3454500734806061,
"rewards/format_reward": 1.0,
"step": 24,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 240.5178680419922,
"epoch": 0.001768033946251768,
"grad_norm": 7.148190397206045,
"kl": 0.0142822265625,
"learning_rate": 9.975923633360984e-07,
"loss": 0.0006,
"reward": 1.4286309480667114,
"reward_std": 0.1863246113061905,
"rewards/accuracy_reward": 0.38934507966041565,
"rewards/format_reward": 1.0,
"step": 25,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 314.125,
"epoch": 0.0018387553041018388,
"grad_norm": 2.7600877644928796,
"kl": 0.01080322265625,
"learning_rate": 9.973960708808631e-07,
"loss": 0.0004,
"reward": 1.4452130794525146,
"reward_std": 0.29430562257766724,
"rewards/accuracy_reward": 0.3523559868335724,
"rewards/format_reward": 0.9821429252624512,
"step": 26,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 287.375,
"epoch": 0.0019094766619519095,
"grad_norm": 2.1898194956106405,
"kl": 0.0128173828125,
"learning_rate": 9.971921079629069e-07,
"loss": 0.0005,
"reward": 1.3160984516143799,
"reward_std": 0.1425773948431015,
"rewards/accuracy_reward": 0.22145557403564453,
"rewards/format_reward": 0.9910714626312256,
"step": 27,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 323.1785888671875,
"epoch": 0.0019801980198019802,
"grad_norm": 2.3090632087802643,
"kl": 0.00872802734375,
"learning_rate": 9.969804777275898e-07,
"loss": 0.0004,
"reward": 1.4885876178741455,
"reward_std": 0.1416071355342865,
"rewards/accuracy_reward": 0.48501619696617126,
"rewards/format_reward": 0.9642857313156128,
"step": 28,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 277.71429443359375,
"epoch": 0.002050919377652051,
"grad_norm": 2.563040364496041,
"kl": 0.01495361328125,
"learning_rate": 9.967611834385122e-07,
"loss": 0.0006,
"reward": 1.5186972618103027,
"reward_std": 0.21927528083324432,
"rewards/accuracy_reward": 0.5436971187591553,
"rewards/format_reward": 0.9642857313156128,
"step": 29,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 279.3035888671875,
"epoch": 0.0021216407355021216,
"grad_norm": 5.357871739418545,
"kl": 0.010009765625,
"learning_rate": 9.965342284774631e-07,
"loss": 0.0004,
"reward": 1.4721060991287231,
"reward_std": 0.11375095695257187,
"rewards/accuracy_reward": 0.3828202784061432,
"rewards/format_reward": 1.0,
"step": 30,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 273.8035888671875,
"epoch": 0.0021923620933521925,
"grad_norm": 16.74341524407899,
"kl": 0.01470947265625,
"learning_rate": 9.962996163443688e-07,
"loss": 0.0006,
"reward": 1.4044029712677002,
"reward_std": 0.3334062397480011,
"rewards/accuracy_reward": 0.3222600221633911,
"rewards/format_reward": 0.973214328289032,
"step": 31,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.8571428571428571,
"all_wrong": 0.0,
"completion_length": 234.19644165039062,
"epoch": 0.002263083451202263,
"grad_norm": 0.8595633494464696,
"kl": 0.01556396484375,
"learning_rate": 9.960573506572389e-07,
"loss": 0.0006,
"reward": 1.9745900630950928,
"reward_std": 0.0232665054500103,
"rewards/accuracy_reward": 0.8781614899635315,
"rewards/format_reward": 1.0,
"step": 32,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 419.2321472167969,
"epoch": 0.002333804809052334,
"grad_norm": 1.5338994037252882,
"kl": 0.0084228515625,
"learning_rate": 9.958074351521096e-07,
"loss": 0.0003,
"reward": 1.3846343755722046,
"reward_std": 0.33544743061065674,
"rewards/accuracy_reward": 0.3524913489818573,
"rewards/format_reward": 0.9642857313156128,
"step": 33,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 322.75,
"epoch": 0.0024045261669024044,
"grad_norm": 3.835179363483981,
"kl": 0.01275634765625,
"learning_rate": 9.955498736829874e-07,
"loss": 0.0005,
"reward": 1.4323281049728394,
"reward_std": 0.29070112109184265,
"rewards/accuracy_reward": 0.43232807517051697,
"rewards/format_reward": 0.9642857313156128,
"step": 34,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 338.375,
"epoch": 0.0024752475247524753,
"grad_norm": 2.5354088480923265,
"kl": 0.01483154296875,
"learning_rate": 9.952846702217885e-07,
"loss": 0.0006,
"reward": 1.4049376249313354,
"reward_std": 0.26813623309135437,
"rewards/accuracy_reward": 0.3192232847213745,
"rewards/format_reward": 1.0,
"step": 35,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 257.1071472167969,
"epoch": 0.002545968882602546,
"grad_norm": 2.1307554136020066,
"kl": 0.017822265625,
"learning_rate": 9.950118288582787e-07,
"loss": 0.0007,
"reward": 1.4344677925109863,
"reward_std": 0.20196352899074554,
"rewards/accuracy_reward": 0.3826819956302643,
"rewards/format_reward": 1.0,
"step": 36,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.42857142857142855,
"completion_length": 197.37501525878906,
"epoch": 0.0026166902404526167,
"grad_norm": 1.055288306131669,
"kl": 0.01806640625,
"learning_rate": 9.947313538000092e-07,
"loss": 0.0007,
"reward": 1.4836164712905884,
"reward_std": 0.10975757986307144,
"rewards/accuracy_reward": 0.47290223836898804,
"rewards/format_reward": 1.0,
"step": 37,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 244.23214721679688,
"epoch": 0.0026874115983026876,
"grad_norm": 2.103619483245465,
"kl": 0.0272216796875,
"learning_rate": 9.944432493722524e-07,
"loss": 0.0011,
"reward": 1.3943806886672974,
"reward_std": 0.0798025131225586,
"rewards/accuracy_reward": 0.3586663007736206,
"rewards/format_reward": 1.0,
"step": 38,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 363.5714416503906,
"epoch": 0.002758132956152758,
"grad_norm": 1.7740744148730756,
"kl": 0.0142822265625,
"learning_rate": 9.941475200179346e-07,
"loss": 0.0006,
"reward": 1.3610663414001465,
"reward_std": 0.15090212225914001,
"rewards/accuracy_reward": 0.33428052067756653,
"rewards/format_reward": 0.9553571939468384,
"step": 39,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 346.76788330078125,
"epoch": 0.002828854314002829,
"grad_norm": 2.37933402889077,
"kl": 0.01202392578125,
"learning_rate": 9.938441702975689e-07,
"loss": 0.0005,
"reward": 1.4156906604766846,
"reward_std": 0.1760970950126648,
"rewards/accuracy_reward": 0.38533341884613037,
"rewards/format_reward": 0.973214328289032,
"step": 40,
"temporal_rewards": 0.5
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 341.7321472167969,
"epoch": 0.0028995756718528994,
"grad_norm": 1.80289901611109,
"kl": 0.01226806640625,
"learning_rate": 9.935332048891826e-07,
"loss": 0.0005,
"reward": 1.5363633632659912,
"reward_std": 0.15127059817314148,
"rewards/accuracy_reward": 0.4327918589115143,
"rewards/format_reward": 1.0,
"step": 41,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 379.6785888671875,
"epoch": 0.0029702970297029703,
"grad_norm": 6.9951116771129564,
"kl": 0.01312255859375,
"learning_rate": 9.932146285882476e-07,
"loss": 0.0005,
"reward": 1.4107944965362549,
"reward_std": 0.2239990383386612,
"rewards/accuracy_reward": 0.2965086102485657,
"rewards/format_reward": 0.9821429252624512,
"step": 42,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 361.9821472167969,
"epoch": 0.003041018387553041,
"grad_norm": 14.405879650265113,
"kl": 0.01422119140625,
"learning_rate": 9.928884463076043e-07,
"loss": 0.0006,
"reward": 1.4136697053909302,
"reward_std": 0.2486438900232315,
"rewards/accuracy_reward": 0.3136696517467499,
"rewards/format_reward": 0.9821429252624512,
"step": 43,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 350.0714416503906,
"epoch": 0.0031117397454031117,
"grad_norm": 27.566161680922317,
"kl": 0.0147705078125,
"learning_rate": 9.925546630773868e-07,
"loss": 0.0006,
"reward": 1.2689402103424072,
"reward_std": 0.16928793489933014,
"rewards/accuracy_reward": 0.2725115716457367,
"rewards/format_reward": 0.8750000596046448,
"step": 44,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 385.83929443359375,
"epoch": 0.0031824611032531826,
"grad_norm": 12.206659175892238,
"kl": 0.0146484375,
"learning_rate": 9.922132840449458e-07,
"loss": 0.0006,
"reward": 1.1907211542129517,
"reward_std": 0.2152532935142517,
"rewards/accuracy_reward": 0.13179250061511993,
"rewards/format_reward": 0.9196429252624512,
"step": 45,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 374.9107360839844,
"epoch": 0.003253182461103253,
"grad_norm": 2.4250446537484573,
"kl": 0.01446533203125,
"learning_rate": 9.91864314474768e-07,
"loss": 0.0006,
"reward": 1.376240611076355,
"reward_std": 0.20905308425426483,
"rewards/accuracy_reward": 0.28338348865509033,
"rewards/format_reward": 0.9642857313156128,
"step": 46,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 433.7321472167969,
"epoch": 0.003323903818953324,
"grad_norm": 4.7936130008464275,
"kl": 0.0084228515625,
"learning_rate": 9.915077597483958e-07,
"loss": 0.0003,
"reward": 1.392529845237732,
"reward_std": 0.2959822416305542,
"rewards/accuracy_reward": 0.369315505027771,
"rewards/format_reward": 0.9910714626312256,
"step": 47,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 423.4821472167969,
"epoch": 0.0033946251768033945,
"grad_norm": 1.1925199674831055,
"kl": 0.00927734375,
"learning_rate": 9.911436253643443e-07,
"loss": 0.0004,
"reward": 1.4139832258224487,
"reward_std": 0.2553583085536957,
"rewards/accuracy_reward": 0.4854118824005127,
"rewards/format_reward": 0.8928571939468384,
"step": 48,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 360.1071472167969,
"epoch": 0.0034653465346534654,
"grad_norm": 6.970466371308957,
"kl": 0.01214599609375,
"learning_rate": 9.907719169380162e-07,
"loss": 0.0005,
"reward": 1.4226962327957153,
"reward_std": 0.15648144483566284,
"rewards/accuracy_reward": 0.29769620299339294,
"rewards/format_reward": 1.0,
"step": 49,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 245.75001525878906,
"epoch": 0.003536067892503536,
"grad_norm": 11.750858368118319,
"kl": 0.016845703125,
"learning_rate": 9.90392640201615e-07,
"loss": 0.0007,
"reward": 1.2471274137496948,
"reward_std": 0.20660826563835144,
"rewards/accuracy_reward": 0.2274845540523529,
"rewards/format_reward": 1.0,
"step": 50,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 296.96429443359375,
"epoch": 0.0036067892503536068,
"grad_norm": 1.9771563601864632,
"kl": 0.01513671875,
"learning_rate": 9.900058010040577e-07,
"loss": 0.0006,
"reward": 1.4364941120147705,
"reward_std": 0.2468460500240326,
"rewards/accuracy_reward": 0.3489939868450165,
"rewards/format_reward": 0.9910714626312256,
"step": 51,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 282.7857360839844,
"epoch": 0.0036775106082036777,
"grad_norm": 2.490876400127454,
"kl": 0.0164794921875,
"learning_rate": 9.89611405310883e-07,
"loss": 0.0007,
"reward": 1.4430785179138184,
"reward_std": 0.1914571076631546,
"rewards/accuracy_reward": 0.34843552112579346,
"rewards/format_reward": 1.0,
"step": 52,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 255.35714721679688,
"epoch": 0.003748231966053748,
"grad_norm": 1.9265114881852106,
"kl": 0.0167236328125,
"learning_rate": 9.8920945920416e-07,
"loss": 0.0007,
"reward": 1.6397331953048706,
"reward_std": 0.16647037863731384,
"rewards/accuracy_reward": 0.5183045864105225,
"rewards/format_reward": 1.0,
"step": 53,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 328.51788330078125,
"epoch": 0.003818953323903819,
"grad_norm": 1.9871436921834855,
"kl": 0.01239013671875,
"learning_rate": 9.887999688823954e-07,
"loss": 0.0005,
"reward": 1.36974036693573,
"reward_std": 0.1781034767627716,
"rewards/accuracy_reward": 0.34116891026496887,
"rewards/format_reward": 0.9821429252624512,
"step": 54,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 255.71429443359375,
"epoch": 0.0038896746817538895,
"grad_norm": 1.9418580621335162,
"kl": 0.015869140625,
"learning_rate": 9.883829406604361e-07,
"loss": 0.0006,
"reward": 1.2471375465393066,
"reward_std": 0.20031888782978058,
"rewards/accuracy_reward": 0.2346375435590744,
"rewards/format_reward": 0.9910714626312256,
"step": 55,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 309.5535888671875,
"epoch": 0.0039603960396039604,
"grad_norm": 2.0813942224085347,
"kl": 0.0146484375,
"learning_rate": 9.879583809693736e-07,
"loss": 0.0006,
"reward": 1.356779932975769,
"reward_std": 0.13400611281394958,
"rewards/accuracy_reward": 0.29785144329071045,
"rewards/format_reward": 0.9910714626312256,
"step": 56,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 371.83929443359375,
"epoch": 0.004031117397454031,
"grad_norm": 2.64005633723031,
"kl": 0.01141357421875,
"learning_rate": 9.875262963564435e-07,
"loss": 0.0005,
"reward": 1.4918296337127686,
"reward_std": 0.37153175473213196,
"rewards/accuracy_reward": 0.5061153769493103,
"rewards/format_reward": 0.9464285969734192,
"step": 57,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 272.1071472167969,
"epoch": 0.004101838755304102,
"grad_norm": 2.3327288177522463,
"kl": 0.019775390625,
"learning_rate": 9.870866934849246e-07,
"loss": 0.0008,
"reward": 1.3908780813217163,
"reward_std": 0.18408146500587463,
"rewards/accuracy_reward": 0.35516369342803955,
"rewards/format_reward": 1.0,
"step": 58,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 361.7500305175781,
"epoch": 0.004172560113154172,
"grad_norm": 2.4090170910345385,
"kl": 0.016845703125,
"learning_rate": 9.866395791340374e-07,
"loss": 0.0007,
"reward": 1.3034508228302002,
"reward_std": 0.23496000468730927,
"rewards/accuracy_reward": 0.22845058143138885,
"rewards/format_reward": 0.973214328289032,
"step": 59,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 338.3035888671875,
"epoch": 0.004243281471004243,
"grad_norm": 2.0023139679744824,
"kl": 0.0140380859375,
"learning_rate": 9.861849601988383e-07,
"loss": 0.0006,
"reward": 1.3882596492767334,
"reward_std": 0.1839255839586258,
"rewards/accuracy_reward": 0.4061168134212494,
"rewards/format_reward": 0.9464285969734192,
"step": 60,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 273.375,
"epoch": 0.004314002828854314,
"grad_norm": 2.2569790201906055,
"kl": 0.0203857421875,
"learning_rate": 9.857228436901134e-07,
"loss": 0.0008,
"reward": 1.6071990728378296,
"reward_std": 0.1727701723575592,
"rewards/accuracy_reward": 0.4821990430355072,
"rewards/format_reward": 1.0,
"step": 61,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 281.625,
"epoch": 0.004384724186704385,
"grad_norm": 1.6524578433340213,
"kl": 0.0125732421875,
"learning_rate": 9.852532367342712e-07,
"loss": 0.0005,
"reward": 1.617965579032898,
"reward_std": 0.2325797826051712,
"rewards/accuracy_reward": 0.5501083731651306,
"rewards/format_reward": 1.0,
"step": 62,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 237.1785888671875,
"epoch": 0.004455445544554455,
"grad_norm": 2.83202259372297,
"kl": 0.02685546875,
"learning_rate": 9.847761465732316e-07,
"loss": 0.0011,
"reward": 1.5921242237091064,
"reward_std": 0.13505886495113373,
"rewards/accuracy_reward": 0.5099811553955078,
"rewards/format_reward": 0.9821429252624512,
"step": 63,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 280.3214416503906,
"epoch": 0.004526166902404526,
"grad_norm": 6.503797696065202,
"kl": 0.0166015625,
"learning_rate": 9.842915805643156e-07,
"loss": 0.0007,
"reward": 1.2465910911560059,
"reward_std": 0.15690433979034424,
"rewards/accuracy_reward": 0.2215910702943802,
"rewards/format_reward": 0.9821429252624512,
"step": 64,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 299.71429443359375,
"epoch": 0.004596888260254597,
"grad_norm": 2.597081597057807,
"kl": 0.01318359375,
"learning_rate": 9.837995461801299e-07,
"loss": 0.0005,
"reward": 1.2907322645187378,
"reward_std": 0.18411685526371002,
"rewards/accuracy_reward": 0.24787509441375732,
"rewards/format_reward": 1.0,
"step": 65,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 254.4285888671875,
"epoch": 0.004667609618104668,
"grad_norm": 3.0012617684710476,
"kl": 0.0184326171875,
"learning_rate": 9.833000510084537e-07,
"loss": 0.0007,
"reward": 1.5473493337631226,
"reward_std": 0.28692588210105896,
"rewards/accuracy_reward": 0.43663495779037476,
"rewards/format_reward": 0.9910714626312256,
"step": 66,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.14285714285714285,
"completion_length": 310.3571472167969,
"epoch": 0.004738330975954739,
"grad_norm": 2.5744316518646677,
"kl": 0.0164794921875,
"learning_rate": 9.827931027521203e-07,
"loss": 0.0007,
"reward": 1.5562368631362915,
"reward_std": 0.056368716061115265,
"rewards/accuracy_reward": 0.4740940034389496,
"rewards/format_reward": 1.0,
"step": 67,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 285.1607360839844,
"epoch": 0.004809052333804809,
"grad_norm": 4.4218169067197675,
"kl": 0.02099609375,
"learning_rate": 9.82278709228899e-07,
"loss": 0.0008,
"reward": 1.6907390356063843,
"reward_std": 0.14986543357372284,
"rewards/accuracy_reward": 0.48716750741004944,
"rewards/format_reward": 1.0,
"step": 68,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 268.83929443359375,
"epoch": 0.00487977369165488,
"grad_norm": 36.58319370122495,
"kl": 0.0150146484375,
"learning_rate": 9.817568783713743e-07,
"loss": 0.0006,
"reward": 1.422197699546814,
"reward_std": 0.1646842360496521,
"rewards/accuracy_reward": 0.29362624883651733,
"rewards/format_reward": 1.0,
"step": 69,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 404.3214416503906,
"epoch": 0.0049504950495049506,
"grad_norm": 1.9404067621178873,
"kl": 0.0120849609375,
"learning_rate": 9.812276182268236e-07,
"loss": 0.0005,
"reward": 1.441391110420227,
"reward_std": 0.3151356279850006,
"rewards/accuracy_reward": 0.3735339939594269,
"rewards/format_reward": 0.9642857313156128,
"step": 70,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 287.5714416503906,
"epoch": 0.0050212164073550215,
"grad_norm": 1.5992858525788907,
"kl": 0.0206298828125,
"learning_rate": 9.80690936957093e-07,
"loss": 0.0008,
"reward": 1.2283703088760376,
"reward_std": 0.19715102016925812,
"rewards/accuracy_reward": 0.1426558941602707,
"rewards/format_reward": 1.0,
"step": 71,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 234.46429443359375,
"epoch": 0.005091937765205092,
"grad_norm": 3.2452184263458235,
"kl": 0.020263671875,
"learning_rate": 9.801468428384716e-07,
"loss": 0.0008,
"reward": 1.4352792501449585,
"reward_std": 0.1383657157421112,
"rewards/accuracy_reward": 0.4245648682117462,
"rewards/format_reward": 1.0,
"step": 72,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 226.6607208251953,
"epoch": 0.005162659123055162,
"grad_norm": 3.2995827121320156,
"kl": 0.0206298828125,
"learning_rate": 9.795953442615637e-07,
"loss": 0.0008,
"reward": 1.4032503366470337,
"reward_std": 0.18326207995414734,
"rewards/accuracy_reward": 0.30860739946365356,
"rewards/format_reward": 1.0,
"step": 73,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 273.4821472167969,
"epoch": 0.005233380480905233,
"grad_norm": 2.014919533244763,
"kl": 0.01708984375,
"learning_rate": 9.790364497311595e-07,
"loss": 0.0007,
"reward": 1.494827151298523,
"reward_std": 0.2938782870769501,
"rewards/accuracy_reward": 0.47696998715400696,
"rewards/format_reward": 1.0,
"step": 74,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 333.3035888671875,
"epoch": 0.005304101838755304,
"grad_norm": 2.149668464492747,
"kl": 0.0152587890625,
"learning_rate": 9.784701678661044e-07,
"loss": 0.0006,
"reward": 1.6914342641830444,
"reward_std": 0.2628679871559143,
"rewards/accuracy_reward": 0.5807199478149414,
"rewards/format_reward": 1.0,
"step": 75,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 198.37501525878906,
"epoch": 0.005374823196605375,
"grad_norm": 2.941637870652214,
"kl": 0.023681640625,
"learning_rate": 9.77896507399165e-07,
"loss": 0.0009,
"reward": 1.6653074026107788,
"reward_std": 0.17388883233070374,
"rewards/accuracy_reward": 0.6617358326911926,
"rewards/format_reward": 1.0,
"step": 76,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 292.9107360839844,
"epoch": 0.005445544554455445,
"grad_norm": 2.575279782360163,
"kl": 0.022216796875,
"learning_rate": 9.773154771768955e-07,
"loss": 0.0009,
"reward": 1.3348206281661987,
"reward_std": 0.2901800870895386,
"rewards/accuracy_reward": 0.24732069671154022,
"rewards/format_reward": 1.0,
"step": 77,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 395.9285888671875,
"epoch": 0.005516265912305516,
"grad_norm": 2.25007048941082,
"kl": 0.014892578125,
"learning_rate": 9.767270861595004e-07,
"loss": 0.0006,
"reward": 1.4193731546401978,
"reward_std": 0.2683948278427124,
"rewards/accuracy_reward": 0.26937323808670044,
"rewards/format_reward": 0.9464285969734192,
"step": 78,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.2857142857142857,
"completion_length": 348.3035888671875,
"epoch": 0.005586987270155587,
"grad_norm": 5.781423242439702,
"kl": 0.02099609375,
"learning_rate": 9.761313434206977e-07,
"loss": 0.0008,
"reward": 1.4807766675949097,
"reward_std": 0.09172937273979187,
"rewards/accuracy_reward": 0.5164910554885864,
"rewards/format_reward": 0.8392857313156128,
"step": 79,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 330.75,
"epoch": 0.005657708628005658,
"grad_norm": 3.517785506448118,
"kl": 0.011962890625,
"learning_rate": 9.755282581475767e-07,
"loss": 0.0005,
"reward": 1.233546495437622,
"reward_std": 0.3027758002281189,
"rewards/accuracy_reward": 0.27283215522766113,
"rewards/format_reward": 0.9285714626312256,
"step": 80,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 346.1607360839844,
"epoch": 0.005728429985855729,
"grad_norm": 1.6029567912993616,
"kl": 0.0162353515625,
"learning_rate": 9.749178396404588e-07,
"loss": 0.0007,
"reward": 1.4679009914398193,
"reward_std": 0.15201067924499512,
"rewards/accuracy_reward": 0.43218663334846497,
"rewards/format_reward": 0.9642857313156128,
"step": 81,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 235.75001525878906,
"epoch": 0.005799151343705799,
"grad_norm": 1.4899619569405949,
"kl": 0.0201416015625,
"learning_rate": 9.743000973127523e-07,
"loss": 0.0008,
"reward": 1.4392858743667603,
"reward_std": 0.2590990662574768,
"rewards/accuracy_reward": 0.3999999761581421,
"rewards/format_reward": 1.0,
"step": 82,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 229.08929443359375,
"epoch": 0.00586987270155587,
"grad_norm": 2.3086478791631824,
"kl": 0.0255126953125,
"learning_rate": 9.73675040690808e-07,
"loss": 0.001,
"reward": 1.3768136501312256,
"reward_std": 0.2634865641593933,
"rewards/accuracy_reward": 0.31967079639434814,
"rewards/format_reward": 1.0,
"step": 83,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 282.125,
"epoch": 0.005940594059405941,
"grad_norm": 2.016686773728144,
"kl": 0.0206298828125,
"learning_rate": 9.730426794137726e-07,
"loss": 0.0008,
"reward": 1.5875691175460815,
"reward_std": 0.17722778022289276,
"rewards/accuracy_reward": 0.5679263472557068,
"rewards/format_reward": 0.9910714626312256,
"step": 84,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 272.5535888671875,
"epoch": 0.006011315417256012,
"grad_norm": 2.4549952656895093,
"kl": 0.0245361328125,
"learning_rate": 9.72403023233439e-07,
"loss": 0.001,
"reward": 1.4239870309829712,
"reward_std": 0.1376960575580597,
"rewards/accuracy_reward": 0.35255834460258484,
"rewards/format_reward": 1.0,
"step": 85,
"temporal_rewards": 0.5714285373687744
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 334.625,
"epoch": 0.006082036775106082,
"grad_norm": 4.839946932590676,
"kl": 0.0185546875,
"learning_rate": 9.717560820140968e-07,
"loss": 0.0007,
"reward": 1.5320364236831665,
"reward_std": 0.18894881010055542,
"rewards/accuracy_reward": 0.4463222026824951,
"rewards/format_reward": 1.0,
"step": 86,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 302.9285888671875,
"epoch": 0.0061527581329561525,
"grad_norm": 14.12798359766534,
"kl": 0.025634765625,
"learning_rate": 9.711018657323798e-07,
"loss": 0.001,
"reward": 1.564679503440857,
"reward_std": 0.1213698536157608,
"rewards/accuracy_reward": 0.3789650797843933,
"rewards/format_reward": 1.0,
"step": 87,
"temporal_rewards": 0.714285671710968
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 369.3571472167969,
"epoch": 0.006223479490806223,
"grad_norm": 1.1787384215611914,
"kl": 0.01434326171875,
"learning_rate": 9.704403844771127e-07,
"loss": 0.0006,
"reward": 1.3432508707046509,
"reward_std": 0.24372150003910065,
"rewards/accuracy_reward": 0.3253936171531677,
"rewards/format_reward": 0.9464285969734192,
"step": 88,
"temporal_rewards": 0.5
},
{
"all_correct": 0.0,
"all_wrong": 0.0,
"completion_length": 285.46429443359375,
"epoch": 0.006294200848656294,
"grad_norm": 2.1919194899374355,
"kl": 0.02587890625,
"learning_rate": 9.697716484491545e-07,
"loss": 0.001,
"reward": 1.250388503074646,
"reward_std": 0.09060105681419373,
"rewards/accuracy_reward": 0.21824556589126587,
"rewards/format_reward": 1.0,
"step": 89,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.0,
"completion_length": 256.39288330078125,
"epoch": 0.006364922206506365,
"grad_norm": 2.4092854894258298,
"kl": 0.0218505859375,
"learning_rate": 9.69095667961242e-07,
"loss": 0.0009,
"reward": 1.5603086948394775,
"reward_std": 0.2149655967950821,
"rewards/accuracy_reward": 0.53352290391922,
"rewards/format_reward": 0.9464285969734192,
"step": 90,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 353.64288330078125,
"epoch": 0.006435643564356435,
"grad_norm": 2.2816781220518316,
"kl": 0.0189208984375,
"learning_rate": 9.684124534378306e-07,
"loss": 0.0008,
"reward": 1.5044071674346924,
"reward_std": 0.3355371654033661,
"rewards/accuracy_reward": 0.4311927855014801,
"rewards/format_reward": 1.0,
"step": 91,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 383.0000305175781,
"epoch": 0.006506364922206506,
"grad_norm": 3.1153295514142254,
"kl": 0.0198974609375,
"learning_rate": 9.677220154149337e-07,
"loss": 0.0008,
"reward": 1.3627312183380127,
"reward_std": 0.22414493560791016,
"rewards/accuracy_reward": 0.30380263924598694,
"rewards/format_reward": 0.9553571939468384,
"step": 92,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 390.08929443359375,
"epoch": 0.006577086280056577,
"grad_norm": 2.2845054552496777,
"kl": 0.019775390625,
"learning_rate": 9.670243645399592e-07,
"loss": 0.0008,
"reward": 1.2227200269699097,
"reward_std": 0.11306477338075638,
"rewards/accuracy_reward": 0.27629145979881287,
"rewards/format_reward": 0.8571429252624512,
"step": 93,
"temporal_rewards": 0.4285714328289032
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.2857142857142857,
"completion_length": 329.5357360839844,
"epoch": 0.006647807637906648,
"grad_norm": 4.302480420224573,
"kl": 0.0201416015625,
"learning_rate": 9.66319511571547e-07,
"loss": 0.0008,
"reward": 1.516809105873108,
"reward_std": 0.19412218034267426,
"rewards/accuracy_reward": 0.36680904030799866,
"rewards/format_reward": 1.0,
"step": 94,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.2857142857142857,
"completion_length": 381.8214416503906,
"epoch": 0.006718528995756719,
"grad_norm": 2.8590305141700845,
"kl": 0.01556396484375,
"learning_rate": 9.656074673794017e-07,
"loss": 0.0006,
"reward": 1.4072080850601196,
"reward_std": 0.26467329263687134,
"rewards/accuracy_reward": 0.32327938079833984,
"rewards/format_reward": 0.9464285969734192,
"step": 95,
"temporal_rewards": 0.6428571343421936
},
{
"all_correct": 0.0,
"all_wrong": 0.14285714285714285,
"completion_length": 380.4464416503906,
"epoch": 0.006789250353606789,
"grad_norm": 1.8422918377808415,
"kl": 0.017333984375,
"learning_rate": 9.648882429441256e-07,
"loss": 0.0007,
"reward": 1.3689515590667725,
"reward_std": 0.08133874088525772,
"rewards/accuracy_reward": 0.29752302169799805,
"rewards/format_reward": 1.0,
"step": 96,
"temporal_rewards": 0.5
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 268.71429443359375,
"epoch": 0.00685997171145686,
"grad_norm": 1.9635116777359027,
"kl": 0.0234375,
"learning_rate": 9.641618493570494e-07,
"loss": 0.0009,
"reward": 1.5267155170440674,
"reward_std": 0.1548340767621994,
"rewards/accuracy_reward": 0.5052869319915771,
"rewards/format_reward": 1.0,
"step": 97,
"temporal_rewards": 0.5
},
{
"all_correct": 0.2857142857142857,
"all_wrong": 0.0,
"completion_length": 317.7857360839844,
"epoch": 0.006930693069306931,
"grad_norm": 3.1660820324790895,
"kl": 0.0223388671875,
"learning_rate": 9.634282978200603e-07,
"loss": 0.0009,
"reward": 1.617742896080017,
"reward_std": 0.18577314913272858,
"rewards/accuracy_reward": 0.5284570455551147,
"rewards/format_reward": 0.9821429252624512,
"step": 98,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.42857142857142855,
"completion_length": 383.9821472167969,
"epoch": 0.007001414427157002,
"grad_norm": 1.737409775959772,
"kl": 0.0186767578125,
"learning_rate": 9.62687599645431e-07,
"loss": 0.0007,
"reward": 1.293892741203308,
"reward_std": 0.15461252629756927,
"rewards/accuracy_reward": 0.3153212070465088,
"rewards/format_reward": 0.9464285969734192,
"step": 99,
"temporal_rewards": 0.5
},
{
"all_correct": 0.14285714285714285,
"all_wrong": 0.14285714285714285,
"completion_length": 371.6964416503906,
"epoch": 0.007072135785007072,
"grad_norm": 2.208121527474034,
"kl": 0.0196533203125,
"learning_rate": 9.619397662556433e-07,
"loss": 0.0008,
"reward": 1.5583664178848267,
"reward_std": 0.17507988214492798,
"rewards/accuracy_reward": 0.47622358798980713,
"rewards/format_reward": 0.9821429252624512,
"step": 100,
"temporal_rewards": 0.5714285373687744
}
],
"logging_steps": 1.0,
"max_steps": 800,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}