checkpoint-500_7b / trainer_state.json
ugaoo's picture
Upload folder using huggingface_hub
ee7d4aa verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9463722397476341,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 405.5555674235026,
"epoch": 0.0037854889589905363,
"grad_norm": 1.5204231066145135,
"kl": 0.0,
"learning_rate": 5.555555555555555e-09,
"loss": 0.0329,
"reward": 0.3750000099341075,
"reward_std": 0.3891436904668808,
"rewards/equation_reward_func": 0.3472222325702508,
"rewards/format_reward_func": 0.027777778605620067,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 426.1597366333008,
"epoch": 0.007570977917981073,
"grad_norm": 1.6075594847685568,
"kl": 0.00020535786946614584,
"learning_rate": 1.111111111111111e-08,
"loss": 0.0004,
"reward": 0.35416667846341926,
"reward_std": 0.40144437551498413,
"rewards/equation_reward_func": 0.3333333432674408,
"rewards/format_reward_func": 0.02083333395421505,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 389.59028879801434,
"epoch": 0.011356466876971609,
"grad_norm": 1.7738388926882676,
"kl": 0.00020662943522135416,
"learning_rate": 1.6666666666666667e-08,
"loss": 0.0068,
"reward": 0.3611111206312974,
"reward_std": 0.34669753164052963,
"rewards/equation_reward_func": 0.3263888942698638,
"rewards/format_reward_func": 0.034722223257025085,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 407.18751271565753,
"epoch": 0.015141955835962145,
"grad_norm": 1.7413085729902613,
"kl": 0.00020869572957356772,
"learning_rate": 2.222222222222222e-08,
"loss": -0.0175,
"reward": 0.3750000149011612,
"reward_std": 0.43933459122975665,
"rewards/equation_reward_func": 0.36111112497746944,
"rewards/format_reward_func": 0.013888889302810034,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 436.61806615193683,
"epoch": 0.01892744479495268,
"grad_norm": 1.5523678031549322,
"kl": 0.0001990795135498047,
"learning_rate": 2.7777777777777774e-08,
"loss": 0.0009,
"reward": 0.3958333482344945,
"reward_std": 0.4240533635020256,
"rewards/equation_reward_func": 0.3888889029622078,
"rewards/format_reward_func": 0.006944444651405017,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 417.5763982137044,
"epoch": 0.022712933753943218,
"grad_norm": 2.1871520648907357,
"kl": 0.0002460479736328125,
"learning_rate": 3.3333333333333334e-08,
"loss": 0.0668,
"reward": 0.31944445210198563,
"reward_std": 0.3596703422566255,
"rewards/equation_reward_func": 0.31250000807146233,
"rewards/format_reward_func": 0.006944444651405017,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 403.26390075683594,
"epoch": 0.026498422712933754,
"grad_norm": 1.6906264767615913,
"kl": 0.00021004676818847656,
"learning_rate": 3.888888888888889e-08,
"loss": 0.0052,
"reward": 0.3611111268401146,
"reward_std": 0.42362942298253375,
"rewards/equation_reward_func": 0.354166679084301,
"rewards/format_reward_func": 0.006944444651405017,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 393.6944580078125,
"epoch": 0.03028391167192429,
"grad_norm": 1.8134093955469572,
"kl": 0.0002319812774658203,
"learning_rate": 4.444444444444444e-08,
"loss": 0.0291,
"reward": 0.4097222362955411,
"reward_std": 0.43579815079768497,
"rewards/equation_reward_func": 0.3888889004786809,
"rewards/format_reward_func": 0.02083333395421505,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 440.2708460489909,
"epoch": 0.03406940063091483,
"grad_norm": 1.4606786067632986,
"kl": 0.0002152125040690104,
"learning_rate": 5e-08,
"loss": 0.0191,
"reward": 0.3888889004786809,
"reward_std": 0.44846897075573605,
"rewards/equation_reward_func": 0.3680555671453476,
"rewards/format_reward_func": 0.02083333395421505,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 411.56251525878906,
"epoch": 0.03785488958990536,
"grad_norm": 8.74750891658874,
"kl": 0.00022975603739420572,
"learning_rate": 5.555555555555555e-08,
"loss": 0.0165,
"reward": 0.38888889861603576,
"reward_std": 0.3779858859876792,
"rewards/equation_reward_func": 0.38888889861603576,
"rewards/format_reward_func": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 410.5694580078125,
"epoch": 0.0416403785488959,
"grad_norm": 2.0666026097367185,
"kl": 0.0002140204111735026,
"learning_rate": 6.111111111111111e-08,
"loss": 0.0489,
"reward": 0.4305555671453476,
"reward_std": 0.4184086322784424,
"rewards/equation_reward_func": 0.40277779164413613,
"rewards/format_reward_func": 0.027777778605620067,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 398.5972366333008,
"epoch": 0.045425867507886436,
"grad_norm": 1.4353693037214081,
"kl": 0.00022824605305989584,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0561,
"reward": 0.39583334140479565,
"reward_std": 0.38249212006727856,
"rewards/equation_reward_func": 0.37500000807146233,
"rewards/format_reward_func": 0.02083333395421505,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 406.06251525878906,
"epoch": 0.04921135646687697,
"grad_norm": 2.0297758030760487,
"kl": 0.00023778279622395834,
"learning_rate": 7.222222222222221e-08,
"loss": -0.036,
"reward": 0.2847222263614337,
"reward_std": 0.35836515327294666,
"rewards/equation_reward_func": 0.2638888942698638,
"rewards/format_reward_func": 0.02083333395421505,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 404.18751271565753,
"epoch": 0.05299684542586751,
"grad_norm": 1.7807227026542323,
"kl": 0.0002464453379313151,
"learning_rate": 7.777777777777778e-08,
"loss": -0.0037,
"reward": 0.3819444552063942,
"reward_std": 0.3984878833095233,
"rewards/equation_reward_func": 0.37500001055498916,
"rewards/format_reward_func": 0.006944444651405017,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 443.22918192545575,
"epoch": 0.056782334384858045,
"grad_norm": 1.4639198723709446,
"kl": 0.0002082983652750651,
"learning_rate": 8.333333333333333e-08,
"loss": 0.0215,
"reward": 0.28472222946584225,
"reward_std": 0.35284433389703435,
"rewards/equation_reward_func": 0.26388889613250893,
"rewards/format_reward_func": 0.02083333395421505,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 417.1458460489909,
"epoch": 0.06056782334384858,
"grad_norm": 1.551783394227111,
"kl": 0.0002196629842122396,
"learning_rate": 8.888888888888888e-08,
"loss": -0.0381,
"reward": 0.4236111231148243,
"reward_std": 0.4627470038831234,
"rewards/equation_reward_func": 0.409722230086724,
"rewards/format_reward_func": 0.013888889302810034,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 425.0972366333008,
"epoch": 0.06435331230283911,
"grad_norm": 1.6519839518228945,
"kl": 0.0002177556355794271,
"learning_rate": 9.444444444444444e-08,
"loss": 0.0149,
"reward": 0.28472222946584225,
"reward_std": 0.36097555483380955,
"rewards/equation_reward_func": 0.26388889489074546,
"rewards/format_reward_func": 0.02083333395421505,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 416.1597315470378,
"epoch": 0.06813880126182965,
"grad_norm": 1.6298419922409495,
"kl": 0.00024358431498209635,
"learning_rate": 1e-07,
"loss": 0.0544,
"reward": 0.31250000931322575,
"reward_std": 0.406619085619847,
"rewards/equation_reward_func": 0.27777778667708236,
"rewards/format_reward_func": 0.034722223257025085,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 406.13195546468097,
"epoch": 0.07192429022082018,
"grad_norm": 1.7781933588930947,
"kl": 0.00020241737365722656,
"learning_rate": 1.0555555555555555e-07,
"loss": 0.0181,
"reward": 0.5208333432674408,
"reward_std": 0.48631447553634644,
"rewards/equation_reward_func": 0.5000000074505806,
"rewards/format_reward_func": 0.02083333395421505,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 442.87500890096027,
"epoch": 0.07570977917981073,
"grad_norm": 1.7856778327927993,
"kl": 0.00023746490478515625,
"learning_rate": 1.111111111111111e-07,
"loss": -0.0027,
"reward": 0.32638889613250893,
"reward_std": 0.37259839847683907,
"rewards/equation_reward_func": 0.31250000807146233,
"rewards/format_reward_func": 0.013888889302810034,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 375.65973409016925,
"epoch": 0.07949526813880126,
"grad_norm": 1.9930393569793248,
"kl": 0.00021648406982421875,
"learning_rate": 1.1666666666666667e-07,
"loss": 0.0641,
"reward": 0.4236111268401146,
"reward_std": 0.38463745390375453,
"rewards/equation_reward_func": 0.4027777910232544,
"rewards/format_reward_func": 0.02083333395421505,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 411.6736195882161,
"epoch": 0.0832807570977918,
"grad_norm": 1.4888095636144503,
"kl": 0.0002304712931315104,
"learning_rate": 1.2222222222222222e-07,
"loss": 0.0313,
"reward": 0.31944445210198563,
"reward_std": 0.3178868380685647,
"rewards/equation_reward_func": 0.28472223194936913,
"rewards/format_reward_func": 0.034722223257025085,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 389.4166793823242,
"epoch": 0.08706624605678233,
"grad_norm": 1.6283738307368585,
"kl": 0.00023396809895833334,
"learning_rate": 1.2777777777777777e-07,
"loss": 0.0686,
"reward": 0.2986111169060071,
"reward_std": 0.37988172471523285,
"rewards/equation_reward_func": 0.2847222288449605,
"rewards/format_reward_func": 0.013888889302810034,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 405.71528879801434,
"epoch": 0.09085173501577287,
"grad_norm": 12.938622660748152,
"kl": 0.00023698806762695312,
"learning_rate": 1.3333333333333334e-07,
"loss": -0.0074,
"reward": 0.2361111187686523,
"reward_std": 0.3309923857450485,
"rewards/equation_reward_func": 0.22222222574055195,
"rewards/format_reward_func": 0.013888889302810034,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 434.0694580078125,
"epoch": 0.0946372239747634,
"grad_norm": 3.061348126208135,
"kl": 0.00024008750915527344,
"learning_rate": 1.3888888888888888e-07,
"loss": -0.012,
"reward": 0.27083334264655906,
"reward_std": 0.34488533437252045,
"rewards/equation_reward_func": 0.25694445086022216,
"rewards/format_reward_func": 0.013888889302810034,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 413.0694529215495,
"epoch": 0.09842271293375394,
"grad_norm": 3.454024027390986,
"kl": 0.0003235340118408203,
"learning_rate": 1.4444444444444442e-07,
"loss": -0.0064,
"reward": 0.40972223194936913,
"reward_std": 0.3772713306049506,
"rewards/equation_reward_func": 0.40277778916060925,
"rewards/format_reward_func": 0.006944444651405017,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 446.2986195882161,
"epoch": 0.10220820189274447,
"grad_norm": 1.6865767812775654,
"kl": 0.00020933151245117188,
"learning_rate": 1.5e-07,
"loss": 0.0067,
"reward": 0.3750000136593978,
"reward_std": 0.36897342403729755,
"rewards/equation_reward_func": 0.3611111231148243,
"rewards/format_reward_func": 0.013888889302810034,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 441.1736246744792,
"epoch": 0.10599369085173502,
"grad_norm": 2.2986869828700334,
"kl": 0.0004001458485921224,
"learning_rate": 1.5555555555555556e-07,
"loss": 0.0206,
"reward": 0.3541666716337204,
"reward_std": 0.3243444561958313,
"rewards/equation_reward_func": 0.3472222263614337,
"rewards/format_reward_func": 0.006944444651405017,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 432.27085240681964,
"epoch": 0.10977917981072555,
"grad_norm": 2.1906732682758645,
"kl": 0.0002334117889404297,
"learning_rate": 1.611111111111111e-07,
"loss": 0.0075,
"reward": 0.40972223194936913,
"reward_std": 0.4255252617100875,
"rewards/equation_reward_func": 0.39583334513008595,
"rewards/format_reward_func": 0.013888889302810034,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 426.7569529215495,
"epoch": 0.11356466876971609,
"grad_norm": 2.041653144281195,
"kl": 0.0002582073211669922,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.0211,
"reward": 0.3680555634200573,
"reward_std": 0.40922948469718295,
"rewards/equation_reward_func": 0.361111119389534,
"rewards/format_reward_func": 0.006944444651405017,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 400.68751271565753,
"epoch": 0.11735015772870662,
"grad_norm": 1.93215349954409,
"kl": 0.0002829233805338542,
"learning_rate": 1.7222222222222222e-07,
"loss": -0.033,
"reward": 0.4097222400208314,
"reward_std": 0.45954596251249313,
"rewards/equation_reward_func": 0.3888889004786809,
"rewards/format_reward_func": 0.02083333395421505,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 429.3125178019206,
"epoch": 0.12113564668769716,
"grad_norm": 1.8873459369947734,
"kl": 0.0002741813659667969,
"learning_rate": 1.7777777777777776e-07,
"loss": 0.0397,
"reward": 0.4305555708706379,
"reward_std": 0.41432634244362515,
"rewards/equation_reward_func": 0.4027777848144372,
"rewards/format_reward_func": 0.027777778605620067,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 421.88890329996747,
"epoch": 0.12492113564668769,
"grad_norm": 1.688287521815126,
"kl": 0.00026599566141764325,
"learning_rate": 1.833333333333333e-07,
"loss": 0.0008,
"reward": 0.3472222276031971,
"reward_std": 0.3376887192328771,
"rewards/equation_reward_func": 0.3402777823309104,
"rewards/format_reward_func": 0.006944444651405017,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 452.2708447774251,
"epoch": 0.12870662460567822,
"grad_norm": 1.37663800323155,
"kl": 0.0003294944763183594,
"learning_rate": 1.8888888888888888e-07,
"loss": 0.0556,
"reward": 0.3263888992369175,
"reward_std": 0.2874133574465911,
"rewards/equation_reward_func": 0.312500008692344,
"rewards/format_reward_func": 0.013888889302810034,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 436.71528879801434,
"epoch": 0.13249211356466878,
"grad_norm": 1.4483289492436444,
"kl": 0.00029428799947102863,
"learning_rate": 1.9444444444444445e-07,
"loss": 0.0386,
"reward": 0.29861111876865226,
"reward_std": 0.3121309739847978,
"rewards/equation_reward_func": 0.28472222822407883,
"rewards/format_reward_func": 0.013888889302810034,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 414.62501271565753,
"epoch": 0.1362776025236593,
"grad_norm": 1.4269628803342047,
"kl": 0.0002837181091308594,
"learning_rate": 2e-07,
"loss": 0.0402,
"reward": 0.32638889861603576,
"reward_std": 0.35836514706412953,
"rewards/equation_reward_func": 0.29861111752688885,
"rewards/format_reward_func": 0.027777778605620067,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 398.69445546468097,
"epoch": 0.14006309148264984,
"grad_norm": 1.3415769326825684,
"kl": 0.00044043858846028644,
"learning_rate": 2.0555555555555553e-07,
"loss": -0.0574,
"reward": 0.3333333420256774,
"reward_std": 0.33815376708904904,
"rewards/equation_reward_func": 0.31250000682969886,
"rewards/format_reward_func": 0.02083333395421505,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 397.2708460489909,
"epoch": 0.14384858044164037,
"grad_norm": 1.6736466199506606,
"kl": 0.0003532568613688151,
"learning_rate": 2.111111111111111e-07,
"loss": 0.0355,
"reward": 0.3958333519597848,
"reward_std": 0.3349916177491347,
"rewards/equation_reward_func": 0.3750000173846881,
"rewards/format_reward_func": 0.02083333395421505,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 389.6388982137044,
"epoch": 0.14763406940063092,
"grad_norm": 1.6778745969677393,
"kl": 0.0004076957702636719,
"learning_rate": 2.1666666666666667e-07,
"loss": -0.0089,
"reward": 0.3819444514811039,
"reward_std": 0.3766806833446026,
"rewards/equation_reward_func": 0.3680555646618207,
"rewards/format_reward_func": 0.013888889302810034,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 369.4305648803711,
"epoch": 0.15141955835962145,
"grad_norm": 1.7763243336052263,
"kl": 0.0004963874816894531,
"learning_rate": 2.222222222222222e-07,
"loss": 0.0596,
"reward": 0.3541666716337204,
"reward_std": 0.4322179580728213,
"rewards/equation_reward_func": 0.3125000049670537,
"rewards/format_reward_func": 0.0416666679084301,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 418.77778879801434,
"epoch": 0.15520504731861198,
"grad_norm": 2.4954676920223084,
"kl": 0.0005669593811035156,
"learning_rate": 2.2777777777777776e-07,
"loss": 0.0353,
"reward": 0.4583333469927311,
"reward_std": 0.4091739282011986,
"rewards/equation_reward_func": 0.4305555659035842,
"rewards/format_reward_func": 0.027777778605620067,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 389.9583460489909,
"epoch": 0.1589905362776025,
"grad_norm": 1.9030809806319569,
"kl": 0.0004928906758626302,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.047,
"reward": 0.4236111255983512,
"reward_std": 0.4178568907082081,
"rewards/equation_reward_func": 0.4097222325702508,
"rewards/format_reward_func": 0.013888889302810034,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 442.4930648803711,
"epoch": 0.16277602523659307,
"grad_norm": 1.3770817743623749,
"kl": 0.0006133715311686198,
"learning_rate": 2.388888888888889e-07,
"loss": 0.0004,
"reward": 0.4166666753590107,
"reward_std": 0.37612894798318547,
"rewards/equation_reward_func": 0.3958333469927311,
"rewards/format_reward_func": 0.02083333395421505,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 443.9444529215495,
"epoch": 0.1665615141955836,
"grad_norm": 1.4650231298226628,
"kl": 0.0006745656331380209,
"learning_rate": 2.4444444444444445e-07,
"loss": -0.0017,
"reward": 0.3750000111758709,
"reward_std": 0.3954201638698578,
"rewards/equation_reward_func": 0.36805556652446586,
"rewards/format_reward_func": 0.006944444651405017,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 429.5069630940755,
"epoch": 0.17034700315457413,
"grad_norm": 1.9750543844698667,
"kl": 0.000976403554280599,
"learning_rate": 2.5e-07,
"loss": 0.0418,
"reward": 0.36805556528270245,
"reward_std": 0.4254308380186558,
"rewards/equation_reward_func": 0.34027778543531895,
"rewards/format_reward_func": 0.027777778605620067,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 414.8263982137044,
"epoch": 0.17413249211356466,
"grad_norm": 3.4819218815816417,
"kl": 0.000812689463297526,
"learning_rate": 2.5555555555555553e-07,
"loss": -0.073,
"reward": 0.4166666766007741,
"reward_std": 0.3864077205459277,
"rewards/equation_reward_func": 0.4097222313284874,
"rewards/format_reward_func": 0.006944444651405017,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 358.4236195882161,
"epoch": 0.17791798107255521,
"grad_norm": 3.17839003794858,
"kl": 0.0010786056518554688,
"learning_rate": 2.6111111111111113e-07,
"loss": 0.0271,
"reward": 0.4305555634200573,
"reward_std": 0.4322568451364835,
"rewards/equation_reward_func": 0.4027777872979641,
"rewards/format_reward_func": 0.027777778605620067,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 421.7361208597819,
"epoch": 0.18170347003154574,
"grad_norm": 1.581354739245434,
"kl": 0.0016581217447916667,
"learning_rate": 2.6666666666666667e-07,
"loss": -0.0128,
"reward": 0.40972223194936913,
"reward_std": 0.4322179468969504,
"rewards/equation_reward_func": 0.40277778667708236,
"rewards/format_reward_func": 0.006944444651405017,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 378.01390075683594,
"epoch": 0.18548895899053627,
"grad_norm": 1.6207630586247244,
"kl": 0.0008861223856608073,
"learning_rate": 2.7222222222222216e-07,
"loss": -0.0275,
"reward": 0.48611112001041573,
"reward_std": 0.38690390810370445,
"rewards/equation_reward_func": 0.4513888992369175,
"rewards/format_reward_func": 0.034722223257025085,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 410.0486208597819,
"epoch": 0.1892744479495268,
"grad_norm": 1.6017744101080356,
"kl": 0.0018717447916666667,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0666,
"reward": 0.4166666778425376,
"reward_std": 0.4304381770392259,
"rewards/equation_reward_func": 0.38888889489074546,
"rewards/format_reward_func": 0.027777778605620067,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 417.06945546468097,
"epoch": 0.19305993690851736,
"grad_norm": 1.781222955044469,
"kl": 0.0016377766927083333,
"learning_rate": 2.833333333333333e-07,
"loss": 0.01,
"reward": 0.3750000037252903,
"reward_std": 0.4017697374025981,
"rewards/equation_reward_func": 0.3472222263614337,
"rewards/format_reward_func": 0.027777778605620067,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 394.3333460489909,
"epoch": 0.1968454258675079,
"grad_norm": 2.071755215843617,
"kl": 0.0020945866902669272,
"learning_rate": 2.8888888888888885e-07,
"loss": 0.0326,
"reward": 0.46527778543531895,
"reward_std": 0.4230251908302307,
"rewards/equation_reward_func": 0.42361111876865226,
"rewards/format_reward_func": 0.0416666679084301,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 381.5902913411458,
"epoch": 0.20063091482649842,
"grad_norm": 4.141898501255106,
"kl": 0.002117792765299479,
"learning_rate": 2.9444444444444444e-07,
"loss": 0.0573,
"reward": 0.3819444632778565,
"reward_std": 0.35283846283952397,
"rewards/equation_reward_func": 0.36111112249394256,
"rewards/format_reward_func": 0.02083333395421505,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 427.81251271565753,
"epoch": 0.20441640378548895,
"grad_norm": 1.8973650475253758,
"kl": 0.0040442148844401045,
"learning_rate": 3e-07,
"loss": 0.0408,
"reward": 0.40277778543531895,
"reward_std": 0.3682141068081061,
"rewards/equation_reward_func": 0.38194445582727593,
"rewards/format_reward_func": 0.02083333395421505,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 419.87501017252606,
"epoch": 0.2082018927444795,
"grad_norm": 1.7756216737476342,
"kl": 0.0018666585286458333,
"learning_rate": 3.055555555555556e-07,
"loss": -0.0149,
"reward": 0.4722222350537777,
"reward_std": 0.4178180123368899,
"rewards/equation_reward_func": 0.45833334513008595,
"rewards/format_reward_func": 0.013888889302810034,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 418.2222315470378,
"epoch": 0.21198738170347003,
"grad_norm": 1.5674546331115733,
"kl": 0.0031108856201171875,
"learning_rate": 3.111111111111111e-07,
"loss": -0.0077,
"reward": 0.36805556279917556,
"reward_std": 0.36998799939950305,
"rewards/equation_reward_func": 0.34027778543531895,
"rewards/format_reward_func": 0.027777778605620067,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 365.8333460489909,
"epoch": 0.21577287066246056,
"grad_norm": 2.163488952988276,
"kl": 0.003872553507486979,
"learning_rate": 3.166666666666666e-07,
"loss": 0.0248,
"reward": 0.44444446079432964,
"reward_std": 0.44138550013303757,
"rewards/equation_reward_func": 0.41666667722165585,
"rewards/format_reward_func": 0.027777778605620067,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 377.95140329996747,
"epoch": 0.2195583596214511,
"grad_norm": 1.9890628606938958,
"kl": 0.0058383941650390625,
"learning_rate": 3.222222222222222e-07,
"loss": 0.0108,
"reward": 0.44444446203609306,
"reward_std": 0.4227793253958225,
"rewards/equation_reward_func": 0.4097222338120143,
"rewards/format_reward_func": 0.034722223257025085,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 390.0833485921224,
"epoch": 0.22334384858044165,
"grad_norm": 1.6950159200633848,
"kl": 0.0033391316731770835,
"learning_rate": 3.2777777777777776e-07,
"loss": 0.0218,
"reward": 0.5000000149011612,
"reward_std": 0.46135225395361584,
"rewards/equation_reward_func": 0.472222238779068,
"rewards/format_reward_func": 0.027777778605620067,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 434.47223409016925,
"epoch": 0.22712933753943218,
"grad_norm": 1.6912972588519333,
"kl": 0.0037129720052083335,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0303,
"reward": 0.402777789781491,
"reward_std": 0.37431980296969414,
"rewards/equation_reward_func": 0.3888888992369175,
"rewards/format_reward_func": 0.013888889302810034,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 379.2916768391927,
"epoch": 0.2309148264984227,
"grad_norm": 1.7665244345202618,
"kl": 0.010921478271484375,
"learning_rate": 3.388888888888889e-07,
"loss": 0.0464,
"reward": 0.38194445086022216,
"reward_std": 0.3815583561857541,
"rewards/equation_reward_func": 0.3402777835726738,
"rewards/format_reward_func": 0.041666667287548385,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 346.40973409016925,
"epoch": 0.23470031545741324,
"grad_norm": 1.7777246622024319,
"kl": 0.0057525634765625,
"learning_rate": 3.4444444444444444e-07,
"loss": 0.073,
"reward": 0.5555555739750465,
"reward_std": 0.5072049958010515,
"rewards/equation_reward_func": 0.5138889104127884,
"rewards/format_reward_func": 0.0416666679084301,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 394.50001525878906,
"epoch": 0.2384858044164038,
"grad_norm": 1.2724495191880816,
"kl": 0.005407969156901042,
"learning_rate": 3.5e-07,
"loss": 0.0085,
"reward": 0.5416666809469461,
"reward_std": 0.4325893906255563,
"rewards/equation_reward_func": 0.5138889035830895,
"rewards/format_reward_func": 0.027777778605620067,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 426.51390584309894,
"epoch": 0.24227129337539433,
"grad_norm": 1.7084837781655138,
"kl": 0.013666788736979166,
"learning_rate": 3.5555555555555553e-07,
"loss": -0.0031,
"reward": 0.40277779288589954,
"reward_std": 0.4025290633241336,
"rewards/equation_reward_func": 0.36805557149151963,
"rewards/format_reward_func": 0.034722223257025085,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 393.2986195882161,
"epoch": 0.24605678233438485,
"grad_norm": 1.6652131128084247,
"kl": 0.009862263997395834,
"learning_rate": 3.6111111111111107e-07,
"loss": 0.076,
"reward": 0.5277777922650179,
"reward_std": 0.4298570702473323,
"rewards/equation_reward_func": 0.5000000111758709,
"rewards/format_reward_func": 0.027777778605620067,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 387.29862213134766,
"epoch": 0.24984227129337538,
"grad_norm": 1.4235829543070357,
"kl": 0.006196339925130208,
"learning_rate": 3.666666666666666e-07,
"loss": 0.0928,
"reward": 0.5763888955116272,
"reward_std": 0.4299643337726593,
"rewards/equation_reward_func": 0.5277777959903082,
"rewards/format_reward_func": 0.048611112559835114,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 412.95140329996747,
"epoch": 0.25362776025236594,
"grad_norm": 1.5899891064412919,
"kl": 0.010592142740885416,
"learning_rate": 3.722222222222222e-07,
"loss": -0.0172,
"reward": 0.46527778543531895,
"reward_std": 0.4477427862584591,
"rewards/equation_reward_func": 0.39583334264655906,
"rewards/format_reward_func": 0.06944444589316845,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 375.7916768391927,
"epoch": 0.25741324921135644,
"grad_norm": 1.8581102947843942,
"kl": 0.0097503662109375,
"learning_rate": 3.7777777777777775e-07,
"loss": 0.0656,
"reward": 0.3819444539646308,
"reward_std": 0.4159533294538657,
"rewards/equation_reward_func": 0.3263888979951541,
"rewards/format_reward_func": 0.055555557211240135,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 402.8333460489909,
"epoch": 0.261198738170347,
"grad_norm": 1.8615199132284905,
"kl": 0.00942230224609375,
"learning_rate": 3.8333333333333335e-07,
"loss": 0.1235,
"reward": 0.3958333407839139,
"reward_std": 0.4607119709253311,
"rewards/equation_reward_func": 0.33333334637184936,
"rewards/format_reward_func": 0.06250000186264515,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 388.50695037841797,
"epoch": 0.26498422712933756,
"grad_norm": 1.4132445755845657,
"kl": 0.031420389811197914,
"learning_rate": 3.888888888888889e-07,
"loss": -0.0025,
"reward": 0.6388889079292616,
"reward_std": 0.4517383811374505,
"rewards/equation_reward_func": 0.5972222362955412,
"rewards/format_reward_func": 0.0416666679084301,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 399.69445546468097,
"epoch": 0.26876971608832806,
"grad_norm": 1.6231049342643675,
"kl": 0.011019388834635416,
"learning_rate": 3.9444444444444444e-07,
"loss": -0.0123,
"reward": 0.5486111268401146,
"reward_std": 0.4811764856179555,
"rewards/equation_reward_func": 0.486111119389534,
"rewards/format_reward_func": 0.06250000186264515,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 386.03473409016925,
"epoch": 0.2725552050473186,
"grad_norm": 1.5773905454247337,
"kl": 0.04953765869140625,
"learning_rate": 4e-07,
"loss": 0.0301,
"reward": 0.5486111262192329,
"reward_std": 0.44820784653226536,
"rewards/equation_reward_func": 0.48611111876865226,
"rewards/format_reward_func": 0.06250000124176343,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 398.125005086263,
"epoch": 0.2763406940063092,
"grad_norm": 1.2456280183136625,
"kl": 0.025739034016927082,
"learning_rate": 4.055555555555555e-07,
"loss": 0.0387,
"reward": 0.5000000186264515,
"reward_std": 0.321004219353199,
"rewards/equation_reward_func": 0.4444444552063942,
"rewards/format_reward_func": 0.055555557211240135,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 366.2986195882161,
"epoch": 0.2801261829652997,
"grad_norm": 1.5044443965642595,
"kl": 0.021631876627604168,
"learning_rate": 4.1111111111111107e-07,
"loss": 0.0575,
"reward": 0.43055557273328304,
"reward_std": 0.34387076273560524,
"rewards/equation_reward_func": 0.38888889613250893,
"rewards/format_reward_func": 0.0416666679084301,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 372.86112721761066,
"epoch": 0.28391167192429023,
"grad_norm": 2.033550282447034,
"kl": 0.02593231201171875,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.1121,
"reward": 0.6805555721124014,
"reward_std": 0.5391590123375257,
"rewards/equation_reward_func": 0.5486111287027597,
"rewards/format_reward_func": 0.1319444477558136,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 340.0138969421387,
"epoch": 0.28769716088328073,
"grad_norm": 2.2987693590791363,
"kl": 0.0315399169921875,
"learning_rate": 4.222222222222222e-07,
"loss": 0.0078,
"reward": 0.5694444663822651,
"reward_std": 0.4559611765046914,
"rewards/equation_reward_func": 0.45833334761361283,
"rewards/format_reward_func": 0.11111111318071683,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 378.94445546468097,
"epoch": 0.2914826498422713,
"grad_norm": 1.9562773273854706,
"kl": 0.09186299641927083,
"learning_rate": 4.2777777777777775e-07,
"loss": 0.0201,
"reward": 0.6041666741172472,
"reward_std": 0.49037906900048256,
"rewards/equation_reward_func": 0.500000017384688,
"rewards/format_reward_func": 0.10416666915019353,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 360.2152913411458,
"epoch": 0.29526813880126185,
"grad_norm": 1.8560446617911341,
"kl": 0.060872395833333336,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0208,
"reward": 0.5208333457509676,
"reward_std": 0.4362143650650978,
"rewards/equation_reward_func": 0.4444444564481576,
"rewards/format_reward_func": 0.07638888992369175,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 397.4930725097656,
"epoch": 0.29905362776025235,
"grad_norm": 1.7722682766121323,
"kl": 0.03704833984375,
"learning_rate": 4.3888888888888884e-07,
"loss": 0.087,
"reward": 0.6736111355324587,
"reward_std": 0.4783005639910698,
"rewards/equation_reward_func": 0.5277777922650179,
"rewards/format_reward_func": 0.14583333705862364,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 340.4583485921224,
"epoch": 0.3028391167192429,
"grad_norm": 1.8020563834520036,
"kl": 0.052164713541666664,
"learning_rate": 4.444444444444444e-07,
"loss": 0.0567,
"reward": 0.4791666753590107,
"reward_std": 0.42059509828686714,
"rewards/equation_reward_func": 0.4027777860562007,
"rewards/format_reward_func": 0.07638888992369175,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 331.2986234029134,
"epoch": 0.30662460567823346,
"grad_norm": 1.816923697504477,
"kl": 0.14789835611979166,
"learning_rate": 4.5e-07,
"loss": 0.0408,
"reward": 0.6319444521019856,
"reward_std": 0.40209560344616574,
"rewards/equation_reward_func": 0.5208333420256773,
"rewards/format_reward_func": 0.11111111318071683,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 374.74306615193683,
"epoch": 0.31041009463722397,
"grad_norm": 1.6763316060977995,
"kl": 0.04315185546875,
"learning_rate": 4.555555555555555e-07,
"loss": 0.0657,
"reward": 0.652777798473835,
"reward_std": 0.47464097539583844,
"rewards/equation_reward_func": 0.4722222313284874,
"rewards/format_reward_func": 0.180555559694767,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 377.5416768391927,
"epoch": 0.3141955835962145,
"grad_norm": 1.6769854770195711,
"kl": 0.069580078125,
"learning_rate": 4.611111111111111e-07,
"loss": 0.0887,
"reward": 0.6527777897814909,
"reward_std": 0.5109836533665657,
"rewards/equation_reward_func": 0.4722222400208314,
"rewards/format_reward_func": 0.180555559694767,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 355.3680648803711,
"epoch": 0.317981072555205,
"grad_norm": 1.5011515214179916,
"kl": 0.19896443684895834,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0834,
"reward": 0.7638889228304228,
"reward_std": 0.5609942426284155,
"rewards/equation_reward_func": 0.569444460173448,
"rewards/format_reward_func": 0.1944444483766953,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 353.7361195882161,
"epoch": 0.3217665615141956,
"grad_norm": 15.47233883807027,
"kl": 0.056732177734375,
"learning_rate": 4.722222222222222e-07,
"loss": 0.0426,
"reward": 0.7500000124176344,
"reward_std": 0.5186516791582108,
"rewards/equation_reward_func": 0.5000000161429247,
"rewards/format_reward_func": 0.25000000931322575,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 308.31251017252606,
"epoch": 0.32555205047318614,
"grad_norm": 2.1360230493136325,
"kl": 0.19114176432291666,
"learning_rate": 4.777777777777778e-07,
"loss": 0.1307,
"reward": 0.8055555882553259,
"reward_std": 0.5721215779582659,
"rewards/equation_reward_func": 0.5000000086923441,
"rewards/format_reward_func": 0.30555556404093903,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 314.75000762939453,
"epoch": 0.32933753943217664,
"grad_norm": 2.7019951929627415,
"kl": 0.1749267578125,
"learning_rate": 4.833333333333333e-07,
"loss": 0.0575,
"reward": 0.826388897995154,
"reward_std": 0.5692646453777949,
"rewards/equation_reward_func": 0.5138889017204443,
"rewards/format_reward_func": 0.312500008692344,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 297.9722328186035,
"epoch": 0.3331230283911672,
"grad_norm": 2.3496202759776788,
"kl": 0.3324991861979167,
"learning_rate": 4.888888888888889e-07,
"loss": 0.0734,
"reward": 0.868055577079455,
"reward_std": 0.6303805137674013,
"rewards/equation_reward_func": 0.486111128081878,
"rewards/format_reward_func": 0.3819444564481576,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 349.2708384195964,
"epoch": 0.33690851735015775,
"grad_norm": 7.352824169943818,
"kl": 0.4977823893229167,
"learning_rate": 4.944444444444445e-07,
"loss": 0.0453,
"reward": 0.770833362514774,
"reward_std": 0.6322847319145998,
"rewards/equation_reward_func": 0.3958333370586236,
"rewards/format_reward_func": 0.37500001179675263,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 297.1666781107585,
"epoch": 0.34069400630914826,
"grad_norm": 24.940449042597443,
"kl": 4.795857747395833,
"learning_rate": 5e-07,
"loss": 0.0703,
"reward": 1.0555555820465088,
"reward_std": 0.5782317991058031,
"rewards/equation_reward_func": 0.5555555745959282,
"rewards/format_reward_func": 0.5000000211099783,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 271.9027849833171,
"epoch": 0.3444794952681388,
"grad_norm": 2.5871456184851414,
"kl": 14.199259440104166,
"learning_rate": 4.999998543120144e-07,
"loss": 0.0763,
"reward": 1.0902778108914692,
"reward_std": 0.5762393027544022,
"rewards/equation_reward_func": 0.5694444589316845,
"rewards/format_reward_func": 0.5208333482344946,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 309.31250890096027,
"epoch": 0.3482649842271293,
"grad_norm": 2.0327389699765313,
"kl": 0.7781168619791666,
"learning_rate": 4.999994172482276e-07,
"loss": 0.1347,
"reward": 0.895833358168602,
"reward_std": 0.5533264875411987,
"rewards/equation_reward_func": 0.43750001055498916,
"rewards/format_reward_func": 0.4583333469927311,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 268.13889567057294,
"epoch": 0.35205047318611987,
"grad_norm": 15.941396290884985,
"kl": 4.468831380208333,
"learning_rate": 4.99998688809149e-07,
"loss": 0.0794,
"reward": 0.979166696468989,
"reward_std": 0.5592605446775755,
"rewards/equation_reward_func": 0.38194445582727593,
"rewards/format_reward_func": 0.5972222338120142,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 247.0416742960612,
"epoch": 0.35583596214511043,
"grad_norm": 2.4432100547197657,
"kl": 0.603515625,
"learning_rate": 4.999976689956274e-07,
"loss": 0.023,
"reward": 1.1041666915019352,
"reward_std": 0.5778869986534119,
"rewards/equation_reward_func": 0.5138888973742723,
"rewards/format_reward_func": 0.5902777935067812,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 248.41667302449545,
"epoch": 0.35962145110410093,
"grad_norm": 4.823568956607298,
"kl": 1.6413167317708333,
"learning_rate": 4.999963578088516e-07,
"loss": 0.0856,
"reward": 1.0694444874922435,
"reward_std": 0.719012883802255,
"rewards/equation_reward_func": 0.5069444564481577,
"rewards/format_reward_func": 0.562500017384688,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 273.97917556762695,
"epoch": 0.3634069400630915,
"grad_norm": 2.743096965012267,
"kl": 0.9237467447916666,
"learning_rate": 4.999947552503497e-07,
"loss": 0.1483,
"reward": 1.1319444874922435,
"reward_std": 0.6314157545566559,
"rewards/equation_reward_func": 0.5208333457509676,
"rewards/format_reward_func": 0.6111111268401146,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 246.41667556762695,
"epoch": 0.36719242902208205,
"grad_norm": 2.638981910331043,
"kl": 0.8427327473958334,
"learning_rate": 4.999928613219894e-07,
"loss": 0.1078,
"reward": 1.0625000471870105,
"reward_std": 0.6069262598951658,
"rewards/equation_reward_func": 0.4583333469927311,
"rewards/format_reward_func": 0.6041666877766451,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 273.4652849833171,
"epoch": 0.37097791798107255,
"grad_norm": 3.1586256673049946,
"kl": 0.6038411458333334,
"learning_rate": 4.999906760259783e-07,
"loss": 0.0848,
"reward": 1.1944444874922435,
"reward_std": 0.5770174351831278,
"rewards/equation_reward_func": 0.548611123735706,
"rewards/format_reward_func": 0.6458333482344946,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 279.87500762939453,
"epoch": 0.3747634069400631,
"grad_norm": 2.3306411923794284,
"kl": 0.4184977213541667,
"learning_rate": 4.999881993648632e-07,
"loss": 0.1264,
"reward": 1.1805555820465088,
"reward_std": 0.5819496115048727,
"rewards/equation_reward_func": 0.506944460173448,
"rewards/format_reward_func": 0.6736111318071684,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 285.9930610656738,
"epoch": 0.3785488958990536,
"grad_norm": 2.9295320476594964,
"kl": 0.7996622721354166,
"learning_rate": 4.999854313415308e-07,
"loss": 0.1193,
"reward": 1.1388889302810032,
"reward_std": 0.5301796098550161,
"rewards/equation_reward_func": 0.39583334264655906,
"rewards/format_reward_func": 0.7430555721124014,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 269.12500890096027,
"epoch": 0.38233438485804416,
"grad_norm": 2.6552410076798028,
"kl": 1.19091796875,
"learning_rate": 4.999823719592071e-07,
"loss": 0.216,
"reward": 1.2777778208255768,
"reward_std": 0.5021173569063345,
"rewards/equation_reward_func": 0.506944457689921,
"rewards/format_reward_func": 0.7708333532015482,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 274.5069516499837,
"epoch": 0.3861198738170347,
"grad_norm": 5.1879891100690285,
"kl": 2.9518229166666665,
"learning_rate": 4.999790212214579e-07,
"loss": 0.1756,
"reward": 1.2430555870135624,
"reward_std": 0.58441444983085,
"rewards/equation_reward_func": 0.479166679084301,
"rewards/format_reward_func": 0.7638889054457346,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 248.79167302449545,
"epoch": 0.3899053627760252,
"grad_norm": 3.1959715484572917,
"kl": 1.0735677083333333,
"learning_rate": 4.999753791321885e-07,
"loss": 0.1732,
"reward": 1.3750000496705372,
"reward_std": 0.5170091787974039,
"rewards/equation_reward_func": 0.5902777947485447,
"rewards/format_reward_func": 0.7847222487131754,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 244.10417302449545,
"epoch": 0.3936908517350158,
"grad_norm": 14.582535270082563,
"kl": 6.297200520833333,
"learning_rate": 4.999714456956438e-07,
"loss": 0.0727,
"reward": 1.2986111442248027,
"reward_std": 0.5151846868296465,
"rewards/equation_reward_func": 0.5069444552063942,
"rewards/format_reward_func": 0.7916666815678278,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 242.52778244018555,
"epoch": 0.39747634069400634,
"grad_norm": 151.19489080244557,
"kl": 27.640625,
"learning_rate": 4.99967220916408e-07,
"loss": 0.0915,
"reward": 1.3958333532015483,
"reward_std": 0.48437386751174927,
"rewards/equation_reward_func": 0.5486111175268888,
"rewards/format_reward_func": 0.8472222437461218,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 185.7916717529297,
"epoch": 0.40126182965299684,
"grad_norm": 37.78796510721226,
"kl": 9.738444010416666,
"learning_rate": 4.999627047994053e-07,
"loss": 0.0349,
"reward": 1.4375000596046448,
"reward_std": 0.48517493655284244,
"rewards/equation_reward_func": 0.5763889048248529,
"rewards/format_reward_func": 0.8611111342906952,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 248.83333841959634,
"epoch": 0.4050473186119874,
"grad_norm": 15.557449401743996,
"kl": 1.7869466145833333,
"learning_rate": 4.999578973498994e-07,
"loss": 0.0905,
"reward": 1.2916667064030964,
"reward_std": 0.5043560986717542,
"rewards/equation_reward_func": 0.4652777872979641,
"rewards/format_reward_func": 0.8263889054457346,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 221.31945164998373,
"epoch": 0.4088328075709779,
"grad_norm": 2.746698671118404,
"kl": 2.4767252604166665,
"learning_rate": 4.999527985734931e-07,
"loss": 0.1176,
"reward": 1.3958333681027095,
"reward_std": 0.4606535832087199,
"rewards/equation_reward_func": 0.5486111280818781,
"rewards/format_reward_func": 0.8472222437461218,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 265.2291742960612,
"epoch": 0.41261829652996845,
"grad_norm": 8.47115489110944,
"kl": 2.67626953125,
"learning_rate": 4.999474084761293e-07,
"loss": 0.1801,
"reward": 1.4375000496705372,
"reward_std": 0.4704290193816026,
"rewards/equation_reward_func": 0.5763889029622078,
"rewards/format_reward_func": 0.8611111293236414,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 216.90278498331705,
"epoch": 0.416403785488959,
"grad_norm": 2.6419184728296528,
"kl": 1.8723958333333333,
"learning_rate": 4.999417270640898e-07,
"loss": 0.0151,
"reward": 1.3125000447034836,
"reward_std": 0.5176352287332217,
"rewards/equation_reward_func": 0.4513888967533906,
"rewards/format_reward_func": 0.8611111342906952,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 237.16667556762695,
"epoch": 0.4201892744479495,
"grad_norm": 3.5632903162508525,
"kl": 1.9169108072916667,
"learning_rate": 4.999357543439968e-07,
"loss": 0.2532,
"reward": 1.3263889302810032,
"reward_std": 0.46584198499719304,
"rewards/equation_reward_func": 0.44444444961845875,
"rewards/format_reward_func": 0.8819444676240286,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 250.10417048136392,
"epoch": 0.42397476340694007,
"grad_norm": 2.836432685345919,
"kl": 2.11572265625,
"learning_rate": 4.999294903228113e-07,
"loss": 0.0877,
"reward": 1.3541666964689891,
"reward_std": 0.5378451521197954,
"rewards/equation_reward_func": 0.5347222381581863,
"rewards/format_reward_func": 0.8194444676240286,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 256.95139439900714,
"epoch": 0.4277602523659306,
"grad_norm": 15.018378327595181,
"kl": 8.984842936197916,
"learning_rate": 4.999229350078339e-07,
"loss": 0.116,
"reward": 1.4513889253139496,
"reward_std": 0.4579727239906788,
"rewards/equation_reward_func": 0.562500019868215,
"rewards/format_reward_func": 0.8888889054457346,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 249.65973154703775,
"epoch": 0.43154574132492113,
"grad_norm": 2.9312130360707225,
"kl": 1.6197916666666667,
"learning_rate": 4.99916088406705e-07,
"loss": 0.1031,
"reward": 1.4722222586472828,
"reward_std": 0.491986704369386,
"rewards/equation_reward_func": 0.5763889017204443,
"rewards/format_reward_func": 0.895833358168602,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 237.29167048136392,
"epoch": 0.4353312302839117,
"grad_norm": 318.50068999038837,
"kl": 10.434326171875,
"learning_rate": 4.999089505274044e-07,
"loss": 0.073,
"reward": 1.326388920346896,
"reward_std": 0.42563923199971515,
"rewards/equation_reward_func": 0.4375000149011612,
"rewards/format_reward_func": 0.8888889153798422,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 247.90973027547201,
"epoch": 0.4391167192429022,
"grad_norm": 4.524241932647995,
"kl": 1.73681640625,
"learning_rate": 4.999015213782511e-07,
"loss": 0.0973,
"reward": 1.4375000496705372,
"reward_std": 0.5439534323910872,
"rewards/equation_reward_func": 0.6597222462296486,
"rewards/format_reward_func": 0.7777777959903082,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 241.83334096272787,
"epoch": 0.44290220820189274,
"grad_norm": 2.1566548938944345,
"kl": 6.7578125,
"learning_rate": 4.998938009679042e-07,
"loss": 0.0664,
"reward": 1.4027778307596843,
"reward_std": 0.5103383002181848,
"rewards/equation_reward_func": 0.5972222362955412,
"rewards/format_reward_func": 0.8055555820465088,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 265.61112213134766,
"epoch": 0.4466876971608833,
"grad_norm": 10.590567425193024,
"kl": 1.2277018229166667,
"learning_rate": 4.998857893053613e-07,
"loss": 0.0824,
"reward": 1.4166666964689891,
"reward_std": 0.5057607839504877,
"rewards/equation_reward_func": 0.5555555745959282,
"rewards/format_reward_func": 0.8611111342906952,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 270.2361195882161,
"epoch": 0.4504731861198738,
"grad_norm": 2.807056262560773,
"kl": 2.3780517578125,
"learning_rate": 4.998774863999605e-07,
"loss": 0.1143,
"reward": 1.3888889302810032,
"reward_std": 0.38816434393326443,
"rewards/equation_reward_func": 0.5138889023413261,
"rewards/format_reward_func": 0.8750000149011612,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 282.70834096272785,
"epoch": 0.45425867507886436,
"grad_norm": 3.1612686868215154,
"kl": 1.0327962239583333,
"learning_rate": 4.998688922613787e-07,
"loss": 0.0685,
"reward": 1.4305555919806163,
"reward_std": 0.5549860845009486,
"rewards/equation_reward_func": 0.6041666784634193,
"rewards/format_reward_func": 0.8263889104127884,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 215.09722646077475,
"epoch": 0.4580441640378549,
"grad_norm": 2.621718223845123,
"kl": 6.5284423828125,
"learning_rate": 4.998600068996324e-07,
"loss": 0.099,
"reward": 1.319444477558136,
"reward_std": 0.42932410165667534,
"rewards/equation_reward_func": 0.5000000211099783,
"rewards/format_reward_func": 0.8194444626569748,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 264.2569529215495,
"epoch": 0.4618296529968454,
"grad_norm": 4.352484643043419,
"kl": 0.9919026692708334,
"learning_rate": 4.998508303250775e-07,
"loss": 0.0482,
"reward": 1.48611115415891,
"reward_std": 0.5396140466133753,
"rewards/equation_reward_func": 0.6041666865348816,
"rewards/format_reward_func": 0.8819444527228674,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 243.7986183166504,
"epoch": 0.465615141955836,
"grad_norm": 5.13544206444599,
"kl": 2.6465657552083335,
"learning_rate": 4.998413625484094e-07,
"loss": 0.1093,
"reward": 1.2500000298023224,
"reward_std": 0.4689197850724061,
"rewards/equation_reward_func": 0.43055556155741215,
"rewards/format_reward_func": 0.8194444676240286,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 251.54167302449545,
"epoch": 0.4694006309148265,
"grad_norm": 2.679299182739803,
"kl": 0.9150797526041666,
"learning_rate": 4.998316035806628e-07,
"loss": 0.1428,
"reward": 1.3888889253139496,
"reward_std": 0.47515800098578137,
"rewards/equation_reward_func": 0.5486111318071684,
"rewards/format_reward_func": 0.8402777910232544,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 268.2152837117513,
"epoch": 0.47318611987381703,
"grad_norm": 1.8553455629336253,
"kl": 1.0334065755208333,
"learning_rate": 4.998215534332118e-07,
"loss": 0.0777,
"reward": 1.4861111640930176,
"reward_std": 0.40932964409391087,
"rewards/equation_reward_func": 0.6250000124176344,
"rewards/format_reward_func": 0.8611111293236414,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 267.91667556762695,
"epoch": 0.4769716088328076,
"grad_norm": 2.3104189096021246,
"kl": 1.7921549479166667,
"learning_rate": 4.998112121177698e-07,
"loss": 0.0391,
"reward": 1.3888889253139496,
"reward_std": 0.4704259845117728,
"rewards/equation_reward_func": 0.5138889048248529,
"rewards/format_reward_func": 0.8750000149011612,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 244.9861183166504,
"epoch": 0.4807570977917981,
"grad_norm": 5.244107644705432,
"kl": 3.0703328450520835,
"learning_rate": 4.9980057964639e-07,
"loss": 0.1144,
"reward": 1.3680555870135624,
"reward_std": 0.3652517894903819,
"rewards/equation_reward_func": 0.4444444552063942,
"rewards/format_reward_func": 0.9236111243565878,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 242.57639694213867,
"epoch": 0.48454258675078865,
"grad_norm": 3.0461805649624036,
"kl": 0.55810546875,
"learning_rate": 4.99789656031464e-07,
"loss": 0.1251,
"reward": 1.451388915379842,
"reward_std": 0.43821969131628674,
"rewards/equation_reward_func": 0.5416666828095913,
"rewards/format_reward_func": 0.9097222437461218,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 268.93056360880536,
"epoch": 0.48832807570977915,
"grad_norm": 2.7875837232126814,
"kl": 19.790120442708332,
"learning_rate": 4.997784412857239e-07,
"loss": 0.1328,
"reward": 1.4444445073604584,
"reward_std": 0.481424443423748,
"rewards/equation_reward_func": 0.5625000136593977,
"rewards/format_reward_func": 0.881944457689921,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 243.06944783528647,
"epoch": 0.4921135646687697,
"grad_norm": 3.3379512405703986,
"kl": 2.3311360677083335,
"learning_rate": 4.997669354222401e-07,
"loss": 0.0831,
"reward": 1.4444444874922435,
"reward_std": 0.48848551760117215,
"rewards/equation_reward_func": 0.5555555758376917,
"rewards/format_reward_func": 0.8888889104127884,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 257.15972900390625,
"epoch": 0.49589905362776027,
"grad_norm": 5.920825260861832,
"kl": 2.1470540364583335,
"learning_rate": 4.99755138454423e-07,
"loss": 0.0901,
"reward": 1.4166667014360428,
"reward_std": 0.40707051381468773,
"rewards/equation_reward_func": 0.5208333519597849,
"rewards/format_reward_func": 0.8958333532015482,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 275.7361208597819,
"epoch": 0.49968454258675077,
"grad_norm": 119.84927693026204,
"kl": 16.074625651041668,
"learning_rate": 4.997430503960219e-07,
"loss": 0.1126,
"reward": 1.4236111442248027,
"reward_std": 0.44205466161171597,
"rewards/equation_reward_func": 0.5347222350537777,
"rewards/format_reward_func": 0.8888889104127884,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 283.8333396911621,
"epoch": 0.5034700315457413,
"grad_norm": 2.3654173997862147,
"kl": 1.8214518229166667,
"learning_rate": 4.997306712611255e-07,
"loss": 0.1992,
"reward": 1.4097222487131755,
"reward_std": 0.4522901251912117,
"rewards/equation_reward_func": 0.5416666778425375,
"rewards/format_reward_func": 0.8680555721124014,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 255.3055648803711,
"epoch": 0.5072555205047319,
"grad_norm": 8.674419859591838,
"kl": 1.6661783854166667,
"learning_rate": 4.997180010641617e-07,
"loss": 0.0642,
"reward": 1.4236111640930176,
"reward_std": 0.4788891275723775,
"rewards/equation_reward_func": 0.5277777922650179,
"rewards/format_reward_func": 0.8958333532015482,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 258.4166742960612,
"epoch": 0.5110410094637224,
"grad_norm": 4.561358423608036,
"kl": 1.7342122395833333,
"learning_rate": 4.997050398198976e-07,
"loss": 0.008,
"reward": 1.3125000496705372,
"reward_std": 0.4775065655509631,
"rewards/equation_reward_func": 0.39583334140479565,
"rewards/format_reward_func": 0.9166666815678278,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 295.40278752644855,
"epoch": 0.5148264984227129,
"grad_norm": 3.607344267562202,
"kl": 1.3319905598958333,
"learning_rate": 4.996917875434397e-07,
"loss": 0.0834,
"reward": 1.36111115415891,
"reward_std": 0.4735433558622996,
"rewards/equation_reward_func": 0.46527778419355553,
"rewards/format_reward_func": 0.8958333532015482,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 261.59722900390625,
"epoch": 0.5186119873817034,
"grad_norm": 2.5043328754943537,
"kl": 1.7516276041666667,
"learning_rate": 4.996782442502337e-07,
"loss": 0.1104,
"reward": 1.3750000496705372,
"reward_std": 0.49132541194558144,
"rewards/equation_reward_func": 0.49305557397504646,
"rewards/format_reward_func": 0.8819444527228674,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 286.08334096272785,
"epoch": 0.522397476340694,
"grad_norm": 6.295893654204792,
"kl": 4.499348958333333,
"learning_rate": 4.996644099560641e-07,
"loss": 0.1441,
"reward": 1.4722222685813904,
"reward_std": 0.5132550907631716,
"rewards/equation_reward_func": 0.5833333507180214,
"rewards/format_reward_func": 0.8888889153798422,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 273.93056360880536,
"epoch": 0.5261829652996846,
"grad_norm": 7.804266363603045,
"kl": 1.0720621744791667,
"learning_rate": 4.996502846770549e-07,
"loss": 0.1438,
"reward": 1.3402778059244156,
"reward_std": 0.4476064319411914,
"rewards/equation_reward_func": 0.4305555584530036,
"rewards/format_reward_func": 0.9097222338120142,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 240.97222900390625,
"epoch": 0.5299684542586751,
"grad_norm": 224.30396185994454,
"kl": 22.011637369791668,
"learning_rate": 4.996358684296693e-07,
"loss": 0.1255,
"reward": 1.3680556019147236,
"reward_std": 0.41704921424388885,
"rewards/equation_reward_func": 0.45138889985779923,
"rewards/format_reward_func": 0.9166666815678278,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 245.88889694213867,
"epoch": 0.5337539432176656,
"grad_norm": 2.9359173813915618,
"kl": 4.063395182291667,
"learning_rate": 4.996211612307092e-07,
"loss": 0.1143,
"reward": 1.3333333482344945,
"reward_std": 0.4269623930255572,
"rewards/equation_reward_func": 0.4791666803260644,
"rewards/format_reward_func": 0.8541666865348816,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 288.9722264607747,
"epoch": 0.5375394321766561,
"grad_norm": 2.754515790547288,
"kl": 1.142822265625,
"learning_rate": 4.996061630973162e-07,
"loss": 0.1758,
"reward": 1.4722222636143367,
"reward_std": 0.38450759773453075,
"rewards/equation_reward_func": 0.5763889042039713,
"rewards/format_reward_func": 0.8958333482344946,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 273.65278752644855,
"epoch": 0.5413249211356467,
"grad_norm": 3.6074116845363675,
"kl": 62.014078776041664,
"learning_rate": 4.995908740469706e-07,
"loss": 0.2716,
"reward": 1.3750000447034836,
"reward_std": 0.4357808977365494,
"rewards/equation_reward_func": 0.45833334140479565,
"rewards/format_reward_func": 0.9166666815678278,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 254.61111958821616,
"epoch": 0.5451104100946372,
"grad_norm": 2.833549076551859,
"kl": 0.9168701171875,
"learning_rate": 4.995752940974918e-07,
"loss": 0.1139,
"reward": 1.4652778108914692,
"reward_std": 0.500111423432827,
"rewards/equation_reward_func": 0.5416666865348816,
"rewards/format_reward_func": 0.9236111293236414,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 266.50000890096027,
"epoch": 0.5488958990536278,
"grad_norm": 2.0285819176753637,
"kl": 0.7223714192708334,
"learning_rate": 4.995594232670383e-07,
"loss": 0.0795,
"reward": 1.5000000298023224,
"reward_std": 0.3858482278883457,
"rewards/equation_reward_func": 0.5902777904023727,
"rewards/format_reward_func": 0.909722238779068,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 242.50000953674316,
"epoch": 0.5526813880126183,
"grad_norm": 2.469130613713446,
"kl": 7.028157552083333,
"learning_rate": 4.995432615741076e-07,
"loss": 0.0928,
"reward": 1.5208333730697632,
"reward_std": 0.3851733220120271,
"rewards/equation_reward_func": 0.5972222313284874,
"rewards/format_reward_func": 0.9236111293236414,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 256.7430610656738,
"epoch": 0.5564668769716088,
"grad_norm": 2.818264802652981,
"kl": 0.8765869140625,
"learning_rate": 4.995268090375362e-07,
"loss": 0.134,
"reward": 1.4930555919806163,
"reward_std": 0.4773927927017212,
"rewards/equation_reward_func": 0.6041666840513548,
"rewards/format_reward_func": 0.8888889054457346,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 269.722230275472,
"epoch": 0.5602523659305993,
"grad_norm": 2.6290072054683082,
"kl": 1.646728515625,
"learning_rate": 4.995100656764996e-07,
"loss": 0.111,
"reward": 1.3402778108914692,
"reward_std": 0.45711999386548996,
"rewards/equation_reward_func": 0.430555568387111,
"rewards/format_reward_func": 0.9097222437461218,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 283.94445419311523,
"epoch": 0.5640378548895899,
"grad_norm": 3.52286689241144,
"kl": 1.5117594401041667,
"learning_rate": 4.994930315105124e-07,
"loss": 0.1291,
"reward": 1.4722222586472828,
"reward_std": 0.4221850348015626,
"rewards/equation_reward_func": 0.5763889029622078,
"rewards/format_reward_func": 0.8958333532015482,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 260.83334096272785,
"epoch": 0.5678233438485805,
"grad_norm": 3.048640806478669,
"kl": 8.982340494791666,
"learning_rate": 4.994757065594279e-07,
"loss": 0.1167,
"reward": 1.4236111442248027,
"reward_std": 0.4365849755704403,
"rewards/equation_reward_func": 0.534722238779068,
"rewards/format_reward_func": 0.8888889104127884,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 251.4861208597819,
"epoch": 0.571608832807571,
"grad_norm": 5.880882126873241,
"kl": 2.01953125,
"learning_rate": 4.994580908434383e-07,
"loss": 0.2153,
"reward": 1.3750000298023224,
"reward_std": 0.4684516203900178,
"rewards/equation_reward_func": 0.47916667846341926,
"rewards/format_reward_func": 0.8958333482344946,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 308.96528244018555,
"epoch": 0.5753943217665615,
"grad_norm": 7.940563386747667,
"kl": 2.2464192708333335,
"learning_rate": 4.994401843830749e-07,
"loss": 0.2154,
"reward": 1.2638889352480571,
"reward_std": 0.516243410607179,
"rewards/equation_reward_func": 0.41666668343047303,
"rewards/format_reward_func": 0.8472222437461218,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 232.1180623372396,
"epoch": 0.579179810725552,
"grad_norm": 305.4523441721458,
"kl": 29.108561197916668,
"learning_rate": 4.994219871992076e-07,
"loss": 0.2207,
"reward": 1.4375000298023224,
"reward_std": 0.45513641958435375,
"rewards/equation_reward_func": 0.5277778009573618,
"rewards/format_reward_func": 0.909722238779068,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 264.7777862548828,
"epoch": 0.5829652996845426,
"grad_norm": 2.0715581627005784,
"kl": 1.160400390625,
"learning_rate": 4.994034993130455e-07,
"loss": 0.1089,
"reward": 1.3958333730697632,
"reward_std": 0.3590660902361075,
"rewards/equation_reward_func": 0.47222223443289596,
"rewards/format_reward_func": 0.9236111243565878,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 257.13195419311523,
"epoch": 0.5867507886435331,
"grad_norm": 3.5395487394835476,
"kl": 1.2493489583333333,
"learning_rate": 4.993847207461362e-07,
"loss": 0.1119,
"reward": 1.3194444924592972,
"reward_std": 0.40260318542520207,
"rewards/equation_reward_func": 0.4236111293236415,
"rewards/format_reward_func": 0.8958333532015482,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 248.52778498331705,
"epoch": 0.5905362776025237,
"grad_norm": 3.948194000938186,
"kl": 1.1299641927083333,
"learning_rate": 4.993656515203662e-07,
"loss": 0.1778,
"reward": 1.3819444874922435,
"reward_std": 0.39707954103748005,
"rewards/equation_reward_func": 0.465277789781491,
"rewards/format_reward_func": 0.9166666766007742,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 251.06250508626303,
"epoch": 0.5943217665615141,
"grad_norm": 3.3126376703610187,
"kl": 1.5913899739583333,
"learning_rate": 4.993462916579606e-07,
"loss": 0.1415,
"reward": 1.4027778059244156,
"reward_std": 0.415769978115956,
"rewards/equation_reward_func": 0.5069444589316845,
"rewards/format_reward_func": 0.8958333532015482,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 267.7361195882161,
"epoch": 0.5981072555205047,
"grad_norm": 3.6712572603171045,
"kl": 1.0328776041666667,
"learning_rate": 4.993266411814837e-07,
"loss": 0.1356,
"reward": 1.5138889253139496,
"reward_std": 0.43073243647813797,
"rewards/equation_reward_func": 0.6180555745959282,
"rewards/format_reward_func": 0.8958333482344946,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 260.7569516499837,
"epoch": 0.6018927444794953,
"grad_norm": 2.898300493316585,
"kl": 1.694091796875,
"learning_rate": 4.993067001138379e-07,
"loss": 0.1933,
"reward": 1.3958333830038707,
"reward_std": 0.45616808036963147,
"rewards/equation_reward_func": 0.5138889054457346,
"rewards/format_reward_func": 0.881944457689921,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 243.38889439900717,
"epoch": 0.6056782334384858,
"grad_norm": 2.3012954584762206,
"kl": 1.1136881510416667,
"learning_rate": 4.992864684782648e-07,
"loss": 0.0314,
"reward": 1.423611159125964,
"reward_std": 0.4477810760339101,
"rewards/equation_reward_func": 0.4930555696288745,
"rewards/format_reward_func": 0.9305555721124014,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 252.68750635782877,
"epoch": 0.6094637223974764,
"grad_norm": 10.558231881280353,
"kl": 7.27197265625,
"learning_rate": 4.992659462983445e-07,
"loss": 0.1837,
"reward": 1.4444444874922435,
"reward_std": 0.4468059837818146,
"rewards/equation_reward_func": 0.5416666797051827,
"rewards/format_reward_func": 0.9027777959903082,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 223.90278244018555,
"epoch": 0.6132492113564669,
"grad_norm": 8.24083470994998,
"kl": 1.4090983072916667,
"learning_rate": 4.992451335979955e-07,
"loss": 0.0984,
"reward": 1.4513889253139496,
"reward_std": 0.34703291207551956,
"rewards/equation_reward_func": 0.5000000136593977,
"rewards/format_reward_func": 0.951388900478681,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 246.9583396911621,
"epoch": 0.6170347003154574,
"grad_norm": 3.165272632330998,
"kl": 1.4227701822916667,
"learning_rate": 4.992240304014751e-07,
"loss": 0.0434,
"reward": 1.381944477558136,
"reward_std": 0.3748237465818723,
"rewards/equation_reward_func": 0.4583333457509677,
"rewards/format_reward_func": 0.9236111243565878,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 248.64584477742514,
"epoch": 0.6208201892744479,
"grad_norm": 3.802073252683938,
"kl": 1.0417887369791667,
"learning_rate": 4.992026367333793e-07,
"loss": 0.0662,
"reward": 1.5347222487131755,
"reward_std": 0.3855091730753581,
"rewards/equation_reward_func": 0.6111111243565878,
"rewards/format_reward_func": 0.9236111243565878,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 251.90278498331705,
"epoch": 0.6246056782334385,
"grad_norm": 3.1898348924774695,
"kl": 1.318359375,
"learning_rate": 4.991809526186423e-07,
"loss": 0.1018,
"reward": 1.4930555919806163,
"reward_std": 0.4848398119211197,
"rewards/equation_reward_func": 0.5694444676240286,
"rewards/format_reward_func": 0.9236111293236414,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 270.7916781107585,
"epoch": 0.628391167192429,
"grad_norm": 48.65745253251759,
"kl": 9.658447265625,
"learning_rate": 4.991589780825373e-07,
"loss": 0.2243,
"reward": 1.6180556019147236,
"reward_std": 0.38904641941189766,
"rewards/equation_reward_func": 0.722222238779068,
"rewards/format_reward_func": 0.8958333532015482,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 247.05556360880533,
"epoch": 0.6321766561514196,
"grad_norm": 12.85876415564074,
"kl": 2.3059895833333335,
"learning_rate": 4.991367131506753e-07,
"loss": 0.0952,
"reward": 1.4930555919806163,
"reward_std": 0.44105598827203113,
"rewards/equation_reward_func": 0.5902777935067812,
"rewards/format_reward_func": 0.9027777959903082,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 252.79861958821616,
"epoch": 0.63596214511041,
"grad_norm": 3.766932765553029,
"kl": 1.00732421875,
"learning_rate": 4.991141578490066e-07,
"loss": 0.108,
"reward": 1.4305556019147236,
"reward_std": 0.4160829931497574,
"rewards/equation_reward_func": 0.5138888967533907,
"rewards/format_reward_func": 0.9166666766007742,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 257.6805610656738,
"epoch": 0.6397476340694006,
"grad_norm": 8.363557603327017,
"kl": 2.90673828125,
"learning_rate": 4.990913122038193e-07,
"loss": 0.0988,
"reward": 1.506944477558136,
"reward_std": 0.4711163180569808,
"rewards/equation_reward_func": 0.5833333494762579,
"rewards/format_reward_func": 0.9236111293236414,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 278.05556360880536,
"epoch": 0.6435331230283912,
"grad_norm": 2.4831862429823874,
"kl": 1.1470540364583333,
"learning_rate": 4.9906817624174e-07,
"loss": 0.1149,
"reward": 1.4583333780368168,
"reward_std": 0.40201255182425183,
"rewards/equation_reward_func": 0.5486111318071684,
"rewards/format_reward_func": 0.9097222338120142,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 270.8125114440918,
"epoch": 0.6473186119873817,
"grad_norm": 96.69755111218885,
"kl": 18.217529296875,
"learning_rate": 4.990447499897339e-07,
"loss": 0.1482,
"reward": 1.4166666964689891,
"reward_std": 0.4657805400590102,
"rewards/equation_reward_func": 0.500000019868215,
"rewards/format_reward_func": 0.9166666865348816,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 280.9513982137044,
"epoch": 0.6511041009463723,
"grad_norm": 4.4626269454999035,
"kl": 1.0166829427083333,
"learning_rate": 4.990210334751042e-07,
"loss": 0.2191,
"reward": 1.4305555919806163,
"reward_std": 0.5064363280932108,
"rewards/equation_reward_func": 0.5208333445092043,
"rewards/format_reward_func": 0.909722238779068,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 290.0277862548828,
"epoch": 0.6548895899053627,
"grad_norm": 42.001704471875875,
"kl": 7.866048177083333,
"learning_rate": 4.989970267254928e-07,
"loss": 0.3399,
"reward": 1.37500003973643,
"reward_std": 0.4621751358111699,
"rewards/equation_reward_func": 0.5138889104127884,
"rewards/format_reward_func": 0.8611111243565878,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 287.0277849833171,
"epoch": 0.6586750788643533,
"grad_norm": 401.0064206569611,
"kl": 13.825358072916666,
"learning_rate": 4.989727297688796e-07,
"loss": 0.2614,
"reward": 1.4930555919806163,
"reward_std": 0.48149604598681134,
"rewards/equation_reward_func": 0.6319444614152113,
"rewards/format_reward_func": 0.8611111342906952,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 267.2222277323405,
"epoch": 0.6624605678233438,
"grad_norm": 5.58193017827173,
"kl": 1.5638020833333333,
"learning_rate": 4.989481426335828e-07,
"loss": 0.2184,
"reward": 1.4791667064030964,
"reward_std": 0.32900576541821164,
"rewards/equation_reward_func": 0.583333345130086,
"rewards/format_reward_func": 0.8958333482344946,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 312.2222315470378,
"epoch": 0.6662460567823344,
"grad_norm": 2.903611804665768,
"kl": 1.7395833333333333,
"learning_rate": 4.989232653482587e-07,
"loss": 0.2021,
"reward": 1.4305555919806163,
"reward_std": 0.4162732983628909,
"rewards/equation_reward_func": 0.5486111268401146,
"rewards/format_reward_func": 0.8819444676240286,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 295.96528752644855,
"epoch": 0.670031545741325,
"grad_norm": 8.614948807031883,
"kl": 1.4444986979166667,
"learning_rate": 4.98898097941902e-07,
"loss": 0.2504,
"reward": 1.3194444825251896,
"reward_std": 0.3698546774685383,
"rewards/equation_reward_func": 0.43750001179675263,
"rewards/format_reward_func": 0.881944457689921,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 296.6111208597819,
"epoch": 0.6738170347003155,
"grad_norm": 18.17568858303832,
"kl": 4.43408203125,
"learning_rate": 4.988726404438453e-07,
"loss": 0.2654,
"reward": 1.2569444924592972,
"reward_std": 0.5792658850550652,
"rewards/equation_reward_func": 0.43750001303851604,
"rewards/format_reward_func": 0.8194444626569748,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 297.1527862548828,
"epoch": 0.677602523659306,
"grad_norm": 3.3997948870685444,
"kl": 2.4781901041666665,
"learning_rate": 4.988468928837595e-07,
"loss": 0.2077,
"reward": 1.4027778307596843,
"reward_std": 0.43186015884081524,
"rewards/equation_reward_func": 0.5625000142802795,
"rewards/format_reward_func": 0.8402777959903082,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 259.7013969421387,
"epoch": 0.6813880126182965,
"grad_norm": 3.261696651794849,
"kl": 2.1082763671875,
"learning_rate": 4.988208552916535e-07,
"loss": 0.1781,
"reward": 1.388888920346896,
"reward_std": 0.4762779163817565,
"rewards/equation_reward_func": 0.5069444607943296,
"rewards/format_reward_func": 0.881944457689921,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 285.75695673624676,
"epoch": 0.6851735015772871,
"grad_norm": 436.9538386873056,
"kl": 90.0078125,
"learning_rate": 4.987945276978741e-07,
"loss": 0.6442,
"reward": 1.2361111342906952,
"reward_std": 0.47308399528265,
"rewards/equation_reward_func": 0.3888889054457347,
"rewards/format_reward_func": 0.8472222437461218,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 253.26389821370444,
"epoch": 0.6889589905362776,
"grad_norm": 6.275698981154313,
"kl": 1.6064453125,
"learning_rate": 4.987679101331063e-07,
"loss": 0.2335,
"reward": 1.4861111442248027,
"reward_std": 0.4897613674402237,
"rewards/equation_reward_func": 0.6041666890184084,
"rewards/format_reward_func": 0.8819444626569748,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 225.61111958821616,
"epoch": 0.6927444794952682,
"grad_norm": 5.577871979120166,
"kl": 0.7556966145833334,
"learning_rate": 4.987410026283729e-07,
"loss": 0.1068,
"reward": 1.48611115415891,
"reward_std": 0.5080769136548042,
"rewards/equation_reward_func": 0.5763889054457346,
"rewards/format_reward_func": 0.9097222437461218,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 208.28472900390625,
"epoch": 0.6965299684542586,
"grad_norm": 34.81645021530138,
"kl": 5.219563802083333,
"learning_rate": 4.98713805215035e-07,
"loss": 0.1549,
"reward": 1.4583333830038707,
"reward_std": 0.40722255781292915,
"rewards/equation_reward_func": 0.5625000142802795,
"rewards/format_reward_func": 0.8958333532015482,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 267.7986183166504,
"epoch": 0.7003154574132492,
"grad_norm": 10.388065090744742,
"kl": 10.697916666666666,
"learning_rate": 4.986863179247908e-07,
"loss": 0.1906,
"reward": 1.3750000447034836,
"reward_std": 0.47181837012370426,
"rewards/equation_reward_func": 0.5069444558272759,
"rewards/format_reward_func": 0.8680555721124014,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 229.8611157735189,
"epoch": 0.7041009463722397,
"grad_norm": 4.562876059825846,
"kl": 4.035807291666667,
"learning_rate": 4.986585407896771e-07,
"loss": 0.223,
"reward": 1.4027778208255768,
"reward_std": 0.5173191850384077,
"rewards/equation_reward_func": 0.5486111293236414,
"rewards/format_reward_func": 0.854166696468989,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 229.1666742960612,
"epoch": 0.7078864353312303,
"grad_norm": 6.466655997110351,
"kl": 758.0651041666666,
"learning_rate": 4.986304738420683e-07,
"loss": 0.4869,
"reward": 1.4305555820465088,
"reward_std": 0.4751903774837653,
"rewards/equation_reward_func": 0.5763888955116272,
"rewards/format_reward_func": 0.8541666865348816,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 251.9236208597819,
"epoch": 0.7116719242902209,
"grad_norm": 49.29790482270018,
"kl": 13.262369791666666,
"learning_rate": 4.986021171146764e-07,
"loss": 0.3513,
"reward": 1.354166716337204,
"reward_std": 0.5414688164989153,
"rewards/equation_reward_func": 0.5138889054457346,
"rewards/format_reward_func": 0.8402777959903082,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 254.1666774749756,
"epoch": 0.7154574132492113,
"grad_norm": 5.643615815413666,
"kl": 7.41162109375,
"learning_rate": 4.985734706405516e-07,
"loss": 0.2591,
"reward": 1.2777778059244156,
"reward_std": 0.4625398740172386,
"rewards/equation_reward_func": 0.4513889004786809,
"rewards/format_reward_func": 0.8263889203468958,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 289.9583396911621,
"epoch": 0.7192429022082019,
"grad_norm": 304.8418060986503,
"kl": 665.8196614583334,
"learning_rate": 4.98544534453081e-07,
"loss": 1.0021,
"reward": 1.2708333830038707,
"reward_std": 0.4970496619741122,
"rewards/equation_reward_func": 0.534722234432896,
"rewards/format_reward_func": 0.736111139257749,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 247.37500635782877,
"epoch": 0.7230283911671924,
"grad_norm": 11.586701386430356,
"kl": 8.091145833333334,
"learning_rate": 4.985153085859902e-07,
"loss": 0.2491,
"reward": 1.43750003973643,
"reward_std": 0.5147989491621653,
"rewards/equation_reward_func": 0.6458333432674408,
"rewards/format_reward_func": 0.7916666865348816,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 264.2847315470378,
"epoch": 0.726813880126183,
"grad_norm": 9.752593632001947,
"kl": 11.559244791666666,
"learning_rate": 4.984857930733419e-07,
"loss": 0.3493,
"reward": 1.1111111392577488,
"reward_std": 0.47952866181731224,
"rewards/equation_reward_func": 0.347222230086724,
"rewards/format_reward_func": 0.7638889104127884,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 269.0902849833171,
"epoch": 0.7305993690851735,
"grad_norm": 9.316145758908815,
"kl": 11.126953125,
"learning_rate": 4.984559879495366e-07,
"loss": 0.3237,
"reward": 1.201388920346896,
"reward_std": 0.6368941242496172,
"rewards/equation_reward_func": 0.4861111231148243,
"rewards/format_reward_func": 0.7152778009573618,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 255.90972900390625,
"epoch": 0.7343848580441641,
"grad_norm": 8.827779574351993,
"kl": 121.453125,
"learning_rate": 4.984258932493123e-07,
"loss": 0.5189,
"reward": 1.2638889352480571,
"reward_std": 0.5239984119931856,
"rewards/equation_reward_func": 0.5555555671453476,
"rewards/format_reward_func": 0.708333358168602,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 237.27084159851074,
"epoch": 0.7381703470031545,
"grad_norm": 8.013012272149158,
"kl": 24.984375,
"learning_rate": 4.983955090077444e-07,
"loss": 0.2832,
"reward": 1.1597222437461217,
"reward_std": 0.5535530770818392,
"rewards/equation_reward_func": 0.48611112497746944,
"rewards/format_reward_func": 0.6736111268401146,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 249.63889821370444,
"epoch": 0.7419558359621451,
"grad_norm": 438.51211315014166,
"kl": 126.08072916666667,
"learning_rate": 4.983648352602459e-07,
"loss": 0.3395,
"reward": 1.1250000298023224,
"reward_std": 0.6015344088276228,
"rewards/equation_reward_func": 0.4930555646618207,
"rewards/format_reward_func": 0.6319444750746092,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 195.81250508626303,
"epoch": 0.7457413249211357,
"grad_norm": 13.870204564822584,
"kl": 9.074869791666666,
"learning_rate": 4.983338720425672e-07,
"loss": 0.2873,
"reward": 1.1805555770794551,
"reward_std": 0.6060735906163851,
"rewards/equation_reward_func": 0.4583333407839139,
"rewards/format_reward_func": 0.7222222437461218,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 254.87500699361166,
"epoch": 0.7495268138801262,
"grad_norm": 82.14394465970908,
"kl": 38.481770833333336,
"learning_rate": 4.98302619390796e-07,
"loss": 0.3067,
"reward": 1.1250000298023224,
"reward_std": 0.4945492781698704,
"rewards/equation_reward_func": 0.5277777904023727,
"rewards/format_reward_func": 0.5972222425043583,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 221.51389439900717,
"epoch": 0.7533123028391168,
"grad_norm": 17.556835883262877,
"kl": 97.25,
"learning_rate": 4.982710773413576e-07,
"loss": 0.3719,
"reward": 1.131944477558136,
"reward_std": 0.588702150930961,
"rewards/equation_reward_func": 0.5763889079292616,
"rewards/format_reward_func": 0.555555577079455,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 197.82639376322427,
"epoch": 0.7570977917981072,
"grad_norm": 26.30378944955965,
"kl": 17.8984375,
"learning_rate": 4.98239245931014e-07,
"loss": 0.3139,
"reward": 1.1805555870135624,
"reward_std": 0.5916161189476649,
"rewards/equation_reward_func": 0.5902777959903082,
"rewards/format_reward_func": 0.5902777959903082,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 222.54861704508463,
"epoch": 0.7608832807570978,
"grad_norm": 10.696318069471166,
"kl": 14.2109375,
"learning_rate": 4.982071251968652e-07,
"loss": 0.2388,
"reward": 1.1041666964689891,
"reward_std": 0.5821270644664764,
"rewards/equation_reward_func": 0.5069444638987383,
"rewards/format_reward_func": 0.5972222437461218,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 219.2916742960612,
"epoch": 0.7646687697160883,
"grad_norm": 132.22587525968703,
"kl": 40.453125,
"learning_rate": 4.981747151763478e-07,
"loss": 0.2509,
"reward": 1.0208333631356556,
"reward_std": 0.6254869078596433,
"rewards/equation_reward_func": 0.493055568387111,
"rewards/format_reward_func": 0.5277777959903082,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 213.25000445048013,
"epoch": 0.7684542586750789,
"grad_norm": 53.75808201656059,
"kl": 28.166666666666668,
"learning_rate": 4.981420159072359e-07,
"loss": 0.3216,
"reward": 0.923611139257749,
"reward_std": 0.5980016005535921,
"rewards/equation_reward_func": 0.39583334513008595,
"rewards/format_reward_func": 0.5277777860562006,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 255.5486208597819,
"epoch": 0.7722397476340694,
"grad_norm": 76.3126195344439,
"kl": 24.140625,
"learning_rate": 4.981090274276405e-07,
"loss": 0.2661,
"reward": 1.0833333681027095,
"reward_std": 0.6427489096919695,
"rewards/equation_reward_func": 0.5833333482344946,
"rewards/format_reward_func": 0.500000019868215,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 223.21528244018555,
"epoch": 0.7760252365930599,
"grad_norm": 13589.418456534844,
"kl": 1149.7135416666667,
"learning_rate": 4.9807574977601e-07,
"loss": 2.3024,
"reward": 0.9375000447034836,
"reward_std": 0.6095106812814871,
"rewards/equation_reward_func": 0.42361112497746944,
"rewards/format_reward_func": 0.5138889029622078,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 227.54167366027832,
"epoch": 0.7798107255520504,
"grad_norm": 33.78184675982937,
"kl": 29.5703125,
"learning_rate": 4.980421829911295e-07,
"loss": 0.269,
"reward": 0.8541667014360428,
"reward_std": 0.6479750176270803,
"rewards/equation_reward_func": 0.43055556900799274,
"rewards/format_reward_func": 0.4236111268401146,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 274.4166742960612,
"epoch": 0.783596214511041,
"grad_norm": 17.81633266386669,
"kl": 28.666666666666668,
"learning_rate": 4.980083271121214e-07,
"loss": 0.3345,
"reward": 0.909722238779068,
"reward_std": 0.6108483547965685,
"rewards/equation_reward_func": 0.5277777959903082,
"rewards/format_reward_func": 0.3819444576899211,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 251.74306106567383,
"epoch": 0.7873817034700316,
"grad_norm": 22.608432736994907,
"kl": 51.177083333333336,
"learning_rate": 4.979741821784445e-07,
"loss": 0.2628,
"reward": 0.8680555870135626,
"reward_std": 0.6757829288641611,
"rewards/equation_reward_func": 0.4583333457509677,
"rewards/format_reward_func": 0.40972222822407883,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 265.33334096272785,
"epoch": 0.7911671924290221,
"grad_norm": 13.10404823539201,
"kl": 27.53125,
"learning_rate": 4.979397482298952e-07,
"loss": 0.3222,
"reward": 0.7916666939854622,
"reward_std": 0.619778610765934,
"rewards/equation_reward_func": 0.38194445582727593,
"rewards/format_reward_func": 0.4097222313284874,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 245.4513931274414,
"epoch": 0.7949526813880127,
"grad_norm": 17.285612572481327,
"kl": 25.333333333333332,
"learning_rate": 4.979050253066063e-07,
"loss": 0.2375,
"reward": 0.937500019868215,
"reward_std": 0.5681246320406595,
"rewards/equation_reward_func": 0.5000000136593977,
"rewards/format_reward_func": 0.4375000074505806,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 247.6736183166504,
"epoch": 0.7987381703470031,
"grad_norm": 114.10864728746037,
"kl": 68.94791666666667,
"learning_rate": 4.978700134490473e-07,
"loss": 0.3221,
"reward": 0.9861111293236414,
"reward_std": 0.6230639989177386,
"rewards/equation_reward_func": 0.4791666778425376,
"rewards/format_reward_func": 0.5069444589316845,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 268.19445037841797,
"epoch": 0.8025236593059937,
"grad_norm": 27.278356050728746,
"kl": 63.755208333333336,
"learning_rate": 4.97834712698025e-07,
"loss": 0.3404,
"reward": 0.9027778077870607,
"reward_std": 0.6374689054985841,
"rewards/equation_reward_func": 0.5208333469927311,
"rewards/format_reward_func": 0.3819444514811039,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 261.0486183166504,
"epoch": 0.8063091482649842,
"grad_norm": 112.26698272573795,
"kl": 125.875,
"learning_rate": 4.977991230946823e-07,
"loss": 0.3086,
"reward": 0.9791666915019354,
"reward_std": 0.6475708857178688,
"rewards/equation_reward_func": 0.5763889091710249,
"rewards/format_reward_func": 0.40277778916060925,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 304.7708435058594,
"epoch": 0.8100946372239748,
"grad_norm": 88.42317906709971,
"kl": 145.625,
"learning_rate": 4.977632446804992e-07,
"loss": 0.3789,
"reward": 0.784722238779068,
"reward_std": 0.6482410331567129,
"rewards/equation_reward_func": 0.451388909171025,
"rewards/format_reward_func": 0.33333334388832253,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 276.3194529215495,
"epoch": 0.8138801261829653,
"grad_norm": 57.008405478973984,
"kl": 90.27083333333333,
"learning_rate": 4.97727077497292e-07,
"loss": 0.3829,
"reward": 0.8888889054457346,
"reward_std": 0.58370058486859,
"rewards/equation_reward_func": 0.5763889079292616,
"rewards/format_reward_func": 0.3125000074505806,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 274.0277837117513,
"epoch": 0.8176656151419558,
"grad_norm": 41.9648702558559,
"kl": 93.64973958333333,
"learning_rate": 4.976906215872137e-07,
"loss": 0.2295,
"reward": 0.8263889054457346,
"reward_std": 0.6093253418803215,
"rewards/equation_reward_func": 0.4861111243565877,
"rewards/format_reward_func": 0.34027778419355553,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 366.2569580078125,
"epoch": 0.8214511041009463,
"grad_norm": 23.70562238362353,
"kl": 49.619791666666664,
"learning_rate": 4.976538769927538e-07,
"loss": 0.2481,
"reward": 0.5763888992369175,
"reward_std": 0.6349846472342809,
"rewards/equation_reward_func": 0.3541666728754838,
"rewards/format_reward_func": 0.2222222244987885,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 322.51389439900714,
"epoch": 0.8252365930599369,
"grad_norm": 173.94515224795077,
"kl": 55.354166666666664,
"learning_rate": 4.976168437567384e-07,
"loss": 0.2866,
"reward": 0.7361111330489317,
"reward_std": 0.5293329904476801,
"rewards/equation_reward_func": 0.4652777922650178,
"rewards/format_reward_func": 0.2708333395421505,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 370.1041781107585,
"epoch": 0.8290220820189275,
"grad_norm": 49.42785489603644,
"kl": 38.208333333333336,
"learning_rate": 4.975795219223298e-07,
"loss": 0.2725,
"reward": 0.6250000161429247,
"reward_std": 0.6038348153233528,
"rewards/equation_reward_func": 0.39583334513008595,
"rewards/format_reward_func": 0.2291666710128387,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 358.9305674235026,
"epoch": 0.832807570977918,
"grad_norm": 63.93647805172191,
"kl": 38.565104166666664,
"learning_rate": 4.975419115330267e-07,
"loss": 0.2397,
"reward": 0.6388889079292616,
"reward_std": 0.5783760311702887,
"rewards/equation_reward_func": 0.40972224312524,
"rewards/format_reward_func": 0.2291666722546021,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 343.5486208597819,
"epoch": 0.8365930599369085,
"grad_norm": 18.35613118498554,
"kl": 40.0,
"learning_rate": 4.975040126326641e-07,
"loss": 0.3459,
"reward": 0.7291666766007742,
"reward_std": 0.6189329201976458,
"rewards/equation_reward_func": 0.46527778916060925,
"rewards/format_reward_func": 0.2638888967533906,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 334.00001017252606,
"epoch": 0.840378548895899,
"grad_norm": 61.315133397385836,
"kl": 64.54166666666667,
"learning_rate": 4.974658252654134e-07,
"loss": 0.3642,
"reward": 0.6111111268401146,
"reward_std": 0.6266890317201614,
"rewards/equation_reward_func": 0.3402777823309104,
"rewards/format_reward_func": 0.27083334140479565,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 287.41667556762695,
"epoch": 0.8441640378548896,
"grad_norm": 111.02963258009683,
"kl": 73.3125,
"learning_rate": 4.974273494757822e-07,
"loss": 0.2892,
"reward": 0.736111139257749,
"reward_std": 0.5954531555374464,
"rewards/equation_reward_func": 0.430555568387111,
"rewards/format_reward_func": 0.3055555609365304,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 315.1458447774251,
"epoch": 0.8479495268138801,
"grad_norm": 416.3212915048579,
"kl": 112.47135416666667,
"learning_rate": 4.973885853086141e-07,
"loss": 0.3557,
"reward": 0.7083333532015482,
"reward_std": 0.595863493780295,
"rewards/equation_reward_func": 0.43750001055498916,
"rewards/format_reward_func": 0.27083333892126876,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 282.88195419311523,
"epoch": 0.8517350157728707,
"grad_norm": 84.57887686537714,
"kl": 97.375,
"learning_rate": 4.973495328090889e-07,
"loss": 0.4201,
"reward": 0.5625000124176344,
"reward_std": 0.6184229714175066,
"rewards/equation_reward_func": 0.24305556466182074,
"rewards/format_reward_func": 0.31944445334374905,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 340.87501271565753,
"epoch": 0.8555205047318613,
"grad_norm": 139.9178717709,
"kl": 92.39583333333333,
"learning_rate": 4.973101920227225e-07,
"loss": 0.3206,
"reward": 0.5555555683871111,
"reward_std": 0.6198337351282438,
"rewards/equation_reward_func": 0.3263888955116272,
"rewards/format_reward_func": 0.22916667287548384,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 351.2916768391927,
"epoch": 0.8593059936908517,
"grad_norm": 168.90078404576994,
"kl": 58.34375,
"learning_rate": 4.972705629953667e-07,
"loss": 0.3032,
"reward": 0.7083333482344946,
"reward_std": 0.6670572757720947,
"rewards/equation_reward_func": 0.395833349476258,
"rewards/format_reward_func": 0.3125000074505806,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 346.7777913411458,
"epoch": 0.8630914826498423,
"grad_norm": 53.736984247481196,
"kl": 71.42708333333333,
"learning_rate": 4.97230645773209e-07,
"loss": 0.3515,
"reward": 0.6180555665244659,
"reward_std": 0.5822310447692871,
"rewards/equation_reward_func": 0.3680555621782939,
"rewards/format_reward_func": 0.25000000682969886,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 329.21528879801434,
"epoch": 0.8668769716088328,
"grad_norm": 61.81724196878047,
"kl": 71.97395833333333,
"learning_rate": 4.971904404027736e-07,
"loss": 0.3712,
"reward": 0.5972222362955412,
"reward_std": 0.6221836258967718,
"rewards/equation_reward_func": 0.34722223319113255,
"rewards/format_reward_func": 0.2500000062088172,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 400.7083511352539,
"epoch": 0.8706624605678234,
"grad_norm": 89.56376909680318,
"kl": 93.92708333333333,
"learning_rate": 4.971499469309197e-07,
"loss": 0.3209,
"reward": 0.5486111330489317,
"reward_std": 0.5003731027245522,
"rewards/equation_reward_func": 0.3611111169060071,
"rewards/format_reward_func": 0.18750000558793545,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 379.7430674235026,
"epoch": 0.8744479495268139,
"grad_norm": 96.80934872874563,
"kl": 71.47916666666667,
"learning_rate": 4.971091654048427e-07,
"loss": 0.2863,
"reward": 0.4166666828095913,
"reward_std": 0.5312095309297243,
"rewards/equation_reward_func": 0.26388889489074546,
"rewards/format_reward_func": 0.15277778233091036,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 379.1250063578288,
"epoch": 0.8782334384858044,
"grad_norm": 313.3277909671654,
"kl": 157.80208333333334,
"learning_rate": 4.970680958720733e-07,
"loss": 0.5211,
"reward": 0.48611112746099633,
"reward_std": 0.5439305094381174,
"rewards/equation_reward_func": 0.2222222313284874,
"rewards/format_reward_func": 0.2638888992369175,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 365.0416742960612,
"epoch": 0.8820189274447949,
"grad_norm": 121.11532507159346,
"kl": 131.42708333333334,
"learning_rate": 4.970267383804787e-07,
"loss": 0.4011,
"reward": 0.4375000149011612,
"reward_std": 0.5364614203572273,
"rewards/equation_reward_func": 0.28472223070760566,
"rewards/format_reward_func": 0.1527777792265018,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 354.7916742960612,
"epoch": 0.8858044164037855,
"grad_norm": 90.17803998443027,
"kl": 152.9375,
"learning_rate": 4.96985092978261e-07,
"loss": 0.4152,
"reward": 0.4305555696288745,
"reward_std": 0.5253821363051733,
"rewards/equation_reward_func": 0.2847222325702508,
"rewards/format_reward_func": 0.14583333457509676,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 281.70834223429364,
"epoch": 0.889589905362776,
"grad_norm": 90.52105948028516,
"kl": 115.625,
"learning_rate": 4.969431597139581e-07,
"loss": 0.2493,
"reward": 0.5000000074505806,
"reward_std": 0.6266848891973495,
"rewards/equation_reward_func": 0.3472222338120143,
"rewards/format_reward_func": 0.15277778171002865,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 323.0347315470378,
"epoch": 0.8933753943217666,
"grad_norm": 113.45003802315175,
"kl": 83.2734375,
"learning_rate": 4.969009386364433e-07,
"loss": 0.3054,
"reward": 0.4861111131807168,
"reward_std": 0.581800473233064,
"rewards/equation_reward_func": 0.2986111218730609,
"rewards/format_reward_func": 0.1875000068296989,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 427.35418065388996,
"epoch": 0.897160883280757,
"grad_norm": 28.936753072783624,
"kl": 65.74479166666667,
"learning_rate": 4.968584297949254e-07,
"loss": 0.2886,
"reward": 0.4305555659035842,
"reward_std": 0.5503566016753515,
"rewards/equation_reward_func": 0.3194444527228673,
"rewards/format_reward_func": 0.11111111442248027,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 356.95834096272785,
"epoch": 0.9009463722397476,
"grad_norm": 53.210272136279166,
"kl": 67.9296875,
"learning_rate": 4.968156332389489e-07,
"loss": 0.2718,
"reward": 0.652777798473835,
"reward_std": 0.6074397390087446,
"rewards/equation_reward_func": 0.44444445582727593,
"rewards/format_reward_func": 0.20833333830038706,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 345.06250254313153,
"epoch": 0.9047318611987382,
"grad_norm": 68.31437143998066,
"kl": 26.390625,
"learning_rate": 4.967725490183929e-07,
"loss": 0.2034,
"reward": 0.5625000111758709,
"reward_std": 0.6406622032324473,
"rewards/equation_reward_func": 0.35416667970518273,
"rewards/format_reward_func": 0.2083333389212688,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 384.29168192545575,
"epoch": 0.9085173501577287,
"grad_norm": 46.566871330301204,
"kl": 38.3125,
"learning_rate": 4.967291771834726e-07,
"loss": 0.2743,
"reward": 0.5138889116545519,
"reward_std": 0.6012993454933167,
"rewards/equation_reward_func": 0.3472222375373046,
"rewards/format_reward_func": 0.1666666685293118,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 403.1458435058594,
"epoch": 0.9123028391167193,
"grad_norm": 52.966354403482825,
"kl": 58.018229166666664,
"learning_rate": 4.96685517784738e-07,
"loss": 0.1692,
"reward": 0.5555555783212185,
"reward_std": 0.5279722325503826,
"rewards/equation_reward_func": 0.3819444576899211,
"rewards/format_reward_func": 0.17361111318071684,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 415.2847328186035,
"epoch": 0.9160883280757098,
"grad_norm": 76.22102872372605,
"kl": 36.552083333333336,
"learning_rate": 4.966415708730742e-07,
"loss": 0.2723,
"reward": 0.4930555745959282,
"reward_std": 0.5246221944689751,
"rewards/equation_reward_func": 0.31944445210198563,
"rewards/format_reward_func": 0.173611115043362,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 414.2291793823242,
"epoch": 0.9198738170347003,
"grad_norm": 126.65545998025205,
"kl": 60.572916666666664,
"learning_rate": 4.965973364997015e-07,
"loss": 0.2943,
"reward": 0.5138889017204443,
"reward_std": 0.6207031682133675,
"rewards/equation_reward_func": 0.3402777835726738,
"rewards/format_reward_func": 0.1736111156642437,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 448.9166768391927,
"epoch": 0.9236593059936908,
"grad_norm": 39.174744576485224,
"kl": 56.713541666666664,
"learning_rate": 4.965528147161752e-07,
"loss": 0.2663,
"reward": 0.46527779412766296,
"reward_std": 0.4942639557023843,
"rewards/equation_reward_func": 0.30555556776622933,
"rewards/format_reward_func": 0.1597222244987885,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 486.6319580078125,
"epoch": 0.9274447949526814,
"grad_norm": 48.44066729068605,
"kl": 102.69791666666667,
"learning_rate": 4.965080055743858e-07,
"loss": 0.2164,
"reward": 0.36805556279917556,
"reward_std": 0.503364427636067,
"rewards/equation_reward_func": 0.22916667411724725,
"rewards/format_reward_func": 0.1388888917863369,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 491.8541844685872,
"epoch": 0.931230283911672,
"grad_norm": 46.336260606492786,
"kl": 81.82291666666667,
"learning_rate": 4.964629091265583e-07,
"loss": 0.2553,
"reward": 0.36805556900799274,
"reward_std": 0.39493420471747714,
"rewards/equation_reward_func": 0.2430555603156487,
"rewards/format_reward_func": 0.1250000031044086,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 470.3472315470378,
"epoch": 0.9350157728706625,
"grad_norm": 117.64457418851589,
"kl": 107.79166666666667,
"learning_rate": 4.964175254252529e-07,
"loss": 0.2875,
"reward": 0.29166667473812896,
"reward_std": 0.40408586089809734,
"rewards/equation_reward_func": 0.1944444508602222,
"rewards/format_reward_func": 0.09722222449878852,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 440.8472391764323,
"epoch": 0.938801261829653,
"grad_norm": 122.35688787505664,
"kl": 80.23958333333333,
"learning_rate": 4.963718545233644e-07,
"loss": 0.2675,
"reward": 0.2916666815678279,
"reward_std": 0.4292173832654953,
"rewards/equation_reward_func": 0.1527777804682652,
"rewards/format_reward_func": 0.1388888917863369,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 451.4514045715332,
"epoch": 0.9425867507886435,
"grad_norm": 61.21324750008328,
"kl": 64.75,
"learning_rate": 4.963258964741226e-07,
"loss": 0.3291,
"reward": 0.3819444589316845,
"reward_std": 0.4863445957501729,
"rewards/equation_reward_func": 0.26388889489074546,
"rewards/format_reward_func": 0.11805555845300357,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 448.3680674235026,
"epoch": 0.9463722397476341,
"grad_norm": 76.04318455804953,
"kl": 56.770833333333336,
"learning_rate": 4.962796513310916e-07,
"loss": 0.2302,
"reward": 0.3333333383003871,
"reward_std": 0.4893345981836319,
"rewards/equation_reward_func": 0.2222222276031971,
"rewards/format_reward_func": 0.11111111318071683,
"step": 500
}
],
"logging_steps": 2,
"max_steps": 6000,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}