Ours_Dr / trainer_state.json
LLucass's picture
Model save
c5927e2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11428571428571428,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 2700.4271850585938,
"cov_mean": -2.6832926778297406e-05,
"cov_std": 0.24635104648768902,
"entropy": 0.36865234375,
"epoch": 0.001142857142857143,
"grad_norm": 0.35615867376327515,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0696,
"reward": 0.7604166893288493,
"reward_std": 0.4268697127699852,
"rewards/accuracy_reward": 0.25000001303851604,
"rewards/format_reward": 0.5104166669771075,
"step": 1,
"w_high_ratio": 0.2208261415362358,
"w_low_ratio": 0.027151118498295546,
"w_max": 2.1915207505226135,
"w_mean": 1.4711343348026276,
"w_min": 1.404075949984986e-37,
"w_std": 0.24041971936821938
},
{
"completion_length": 3127.3958435058594,
"cov_mean": -1.8215427189716138e-05,
"cov_std": 0.18336841650307178,
"entropy": 0.353515625,
"epoch": 0.002285714285714286,
"grad_norm": 0.18010225892066956,
"kl": 0.0,
"learning_rate": 1e-07,
"loss": 0.0533,
"reward": 0.6458333637565374,
"reward_std": 0.4249730706214905,
"rewards/accuracy_reward": 0.2812500102445483,
"rewards/format_reward": 0.3645833386108279,
"step": 2,
"w_high_ratio": 0.05701034888625145,
"w_low_ratio": 0.023528859252110124,
"w_max": 1.811183512210846,
"w_mean": 1.2113382518291473,
"w_min": 0.0,
"w_std": 0.15613791532814503
},
{
"completion_length": 3691.0626220703125,
"cov_mean": 2.796226033296989e-05,
"cov_std": 0.1637928392738104,
"entropy": 0.44189453125,
"epoch": 0.0034285714285714284,
"grad_norm": 0.1356951743364334,
"kl": 3.916025161743164e-05,
"learning_rate": 2e-07,
"loss": 0.052,
"reward": 0.19791667256504297,
"reward_std": 0.3607826754450798,
"rewards/accuracy_reward": 0.05208333395421505,
"rewards/format_reward": 0.14583333674818277,
"step": 3,
"w_high_ratio": 0.0,
"w_low_ratio": 0.02235229848884046,
"w_max": 1.460817277431488,
"w_mean": 1.082369714975357,
"w_min": 3.3280838527714405e-44,
"w_std": 0.12309953197836876
},
{
"completion_length": 2353.2709350585938,
"cov_mean": 1.0425418167869793e-05,
"cov_std": 0.3036706894636154,
"entropy": 0.41259765625,
"epoch": 0.004571428571428572,
"grad_norm": 0.190170019865036,
"kl": 3.3348798751831055e-05,
"learning_rate": 3e-07,
"loss": 0.0459,
"reward": 0.8750000149011612,
"reward_std": 0.5107106417417526,
"rewards/accuracy_reward": 0.1875000037252903,
"rewards/format_reward": 0.6875000298023224,
"step": 4,
"w_high_ratio": 0.2652290016412735,
"w_low_ratio": 0.034206886775791645,
"w_max": 2.106997400522232,
"w_mean": 1.5420070886611938,
"w_min": 2.4617042843759845e-36,
"w_std": 0.2812090367078781
},
{
"completion_length": 3485.1771850585938,
"cov_mean": 3.2382055223934003e-06,
"cov_std": 0.29665667191147804,
"entropy": 0.4609375,
"epoch": 0.005714285714285714,
"grad_norm": 0.2197088897228241,
"kl": 4.2125582695007324e-05,
"learning_rate": 4e-07,
"loss": 0.0803,
"reward": 0.46875001303851604,
"reward_std": 0.5515270829200745,
"rewards/accuracy_reward": 0.1145833358168602,
"rewards/format_reward": 0.35416667722165585,
"step": 5,
"w_high_ratio": 0.008333034813404083,
"w_low_ratio": 0.04545952333137393,
"w_max": 1.5202394425868988,
"w_mean": 1.1503158807754517,
"w_min": 5.693325166185118e-29,
"w_std": 0.23378031328320503
},
{
"completion_length": 3451.2500610351562,
"cov_mean": -4.464495305001037e-05,
"cov_std": 0.236886378377676,
"entropy": 0.46142578125,
"epoch": 0.006857142857142857,
"grad_norm": 0.13218647241592407,
"kl": 4.482269287109375e-05,
"learning_rate": 5e-07,
"loss": 0.0517,
"reward": 0.3645833507180214,
"reward_std": 0.515114888548851,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.28125000186264515,
"step": 6,
"w_high_ratio": 0.0625,
"w_low_ratio": 0.031897591426968575,
"w_max": 1.5891262888908386,
"w_mean": 1.1359511613845825,
"w_min": 0.0,
"w_std": 0.14766533859074116
},
{
"completion_length": 3224.3125610351562,
"cov_mean": 4.349886694399174e-06,
"cov_std": 0.3991788253188133,
"entropy": 0.38671875,
"epoch": 0.008,
"grad_norm": 0.22412240505218506,
"kl": 2.1651387214660645e-05,
"learning_rate": 6e-07,
"loss": 0.0742,
"reward": 0.8541666865348816,
"reward_std": 0.6870906725525856,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/format_reward": 0.6250000149011612,
"step": 7,
"w_high_ratio": 0.047733694314956665,
"w_low_ratio": 0.053672163281589746,
"w_max": 1.5988431572914124,
"w_mean": 1.2651265263557434,
"w_min": 6.1929912552473734e-37,
"w_std": 0.2889493927359581
},
{
"completion_length": 2800.9583740234375,
"cov_mean": 1.0622998161124997e-06,
"cov_std": 0.15430260822176933,
"entropy": 0.33740234375,
"epoch": 0.009142857142857144,
"grad_norm": 0.11328813433647156,
"kl": 1.7002224922180176e-05,
"learning_rate": 7e-07,
"loss": 0.0184,
"reward": 0.8958333730697632,
"reward_std": 0.25296592339873314,
"rewards/accuracy_reward": 0.4062500149011612,
"rewards/format_reward": 0.4895833432674408,
"step": 8,
"w_high_ratio": 0.0,
"w_low_ratio": 0.01592865912243724,
"w_max": 1.5333127677440643,
"w_mean": 1.2431240677833557,
"w_min": 0.25,
"w_std": 0.11287659406661987
},
{
"completion_length": 3369.791748046875,
"cov_mean": -2.2509159407491097e-05,
"cov_std": 0.20683829113841057,
"entropy": 0.45263671875,
"epoch": 0.010285714285714285,
"grad_norm": 0.1632954180240631,
"kl": 4.3064355850219727e-05,
"learning_rate": 8e-07,
"loss": 0.0417,
"reward": 0.4583333507180214,
"reward_std": 0.3903508894145489,
"rewards/accuracy_reward": 0.1145833358168602,
"rewards/format_reward": 0.3437500074505806,
"step": 9,
"w_high_ratio": 0.0,
"w_low_ratio": 0.029203591868281364,
"w_max": 1.6281995177268982,
"w_mean": 1.1540252268314362,
"w_min": 1.9273542721946577e-23,
"w_std": 0.15844954177737236
},
{
"completion_length": 2794.2291870117188,
"cov_mean": 7.258828873091261e-06,
"cov_std": 0.22622444108128548,
"entropy": 0.34716796875,
"epoch": 0.011428571428571429,
"grad_norm": 0.11491074413061142,
"kl": 2.664327621459961e-05,
"learning_rate": 9e-07,
"loss": 0.0765,
"reward": 0.6145833432674408,
"reward_std": 0.4795500487089157,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.4479166828095913,
"step": 10,
"w_high_ratio": 0.1683393381536007,
"w_low_ratio": 0.03143396740779281,
"w_max": 1.8651617467403412,
"w_mean": 1.2822044789791107,
"w_min": 2.2624703592113335e-38,
"w_std": 0.20682579837739468
},
{
"completion_length": 3703.197998046875,
"cov_mean": -2.311449361513951e-05,
"cov_std": 0.18362887762486935,
"entropy": 0.39697265625,
"epoch": 0.012571428571428572,
"grad_norm": 0.12180526554584503,
"kl": 2.73287296295166e-05,
"learning_rate": 1e-06,
"loss": 0.0401,
"reward": 0.26041667722165585,
"reward_std": 0.3744332268834114,
"rewards/accuracy_reward": 0.10416667256504297,
"rewards/format_reward": 0.15625000558793545,
"step": 11,
"w_high_ratio": 0.027614232152700424,
"w_low_ratio": 0.024976021610200405,
"w_max": 1.3482708036899567,
"w_mean": 1.0835402309894562,
"w_min": 0.25,
"w_std": 0.12668619584292173
},
{
"completion_length": 2611.260498046875,
"cov_mean": 1.0368909215685562e-05,
"cov_std": 0.22750693373382092,
"entropy": 0.3984375,
"epoch": 0.013714285714285714,
"grad_norm": 0.1790972799062729,
"kl": 2.802908420562744e-05,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0576,
"reward": 0.7395833805203438,
"reward_std": 0.4462515264749527,
"rewards/accuracy_reward": 0.1354166716337204,
"rewards/format_reward": 0.604166679084301,
"step": 12,
"w_high_ratio": 0.10206323117017746,
"w_low_ratio": 0.0320228124037385,
"w_max": 2.1756480634212494,
"w_mean": 1.4741427898406982,
"w_min": 4.420129648185005e-23,
"w_std": 0.23598888516426086
},
{
"completion_length": 3224.041748046875,
"cov_mean": -2.2661331968265586e-05,
"cov_std": 0.15935274586081505,
"entropy": 0.38427734375,
"epoch": 0.014857142857142857,
"grad_norm": 0.24569930136203766,
"kl": 1.7702579498291016e-05,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0498,
"reward": 0.604166679084301,
"reward_std": 0.30622391402721405,
"rewards/accuracy_reward": 0.21875000558793545,
"rewards/format_reward": 0.385416679084301,
"step": 13,
"w_high_ratio": 0.20555464923381805,
"w_low_ratio": 0.01726952870376408,
"w_max": 1.8901410400867462,
"w_mean": 1.31855970621109,
"w_min": 1.1411503146395655e-35,
"w_std": 0.14450976066291332
},
{
"completion_length": 3125.197998046875,
"cov_mean": -2.9986793833813863e-05,
"cov_std": 0.18112273141741753,
"entropy": 0.3623046875,
"epoch": 0.016,
"grad_norm": 0.14652569591999054,
"kl": 7.249414920806885e-06,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0596,
"reward": 0.5000000111758709,
"reward_std": 0.3975026085972786,
"rewards/accuracy_reward": 0.13541667070239782,
"rewards/format_reward": 0.3645833432674408,
"step": 14,
"w_high_ratio": 0.0,
"w_low_ratio": 0.026353970635682344,
"w_max": 1.6898008584976196,
"w_mean": 1.1696374714374542,
"w_min": 2.786338774190119e-34,
"w_std": 0.17131789773702621
},
{
"completion_length": 2945.3959350585938,
"cov_mean": 3.4355500702076824e-06,
"cov_std": 0.18097041826695204,
"entropy": 0.37109375,
"epoch": 0.017142857142857144,
"grad_norm": 0.08377102017402649,
"kl": 2.804398536682129e-05,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0282,
"reward": 0.6875000149011612,
"reward_std": 0.37770550325512886,
"rewards/accuracy_reward": 0.2395833395421505,
"rewards/format_reward": 0.4479166716337204,
"step": 15,
"w_high_ratio": 0.0,
"w_low_ratio": 0.027025693794712424,
"w_max": 1.4225987792015076,
"w_mean": 1.1180275976657867,
"w_min": 8.951121255272305e-16,
"w_std": 0.14841708727180958
},
{
"completion_length": 3842.3646240234375,
"cov_mean": -3.1302830393542536e-05,
"cov_std": 0.16068686172366142,
"entropy": 0.458984375,
"epoch": 0.018285714285714287,
"grad_norm": 0.1221930980682373,
"kl": 2.4199485778808594e-05,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0431,
"reward": 0.1562500037252903,
"reward_std": 0.3155686669051647,
"rewards/accuracy_reward": 0.05208333395421505,
"rewards/format_reward": 0.10416666977107525,
"step": 16,
"w_high_ratio": 0.0,
"w_low_ratio": 0.022678226232528687,
"w_max": 1.1968038976192474,
"w_mean": 1.0266980826854706,
"w_min": 0.25,
"w_std": 0.10604305937886238
},
{
"completion_length": 2433.1875915527344,
"cov_mean": 2.4951528757810593e-05,
"cov_std": 0.27749199233949184,
"entropy": 0.44970703125,
"epoch": 0.019428571428571427,
"grad_norm": 0.13208113610744476,
"kl": 5.91278076171875e-05,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0048,
"reward": 0.8854166865348816,
"reward_std": 0.4504813477396965,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/format_reward": 0.6145833358168602,
"step": 17,
"w_high_ratio": 0.171035997569561,
"w_low_ratio": 0.03585993289016187,
"w_max": 2.199991285800934,
"w_mean": 1.4317797720432281,
"w_min": 0.25,
"w_std": 0.24799126759171486
},
{
"completion_length": 3167.4791870117188,
"cov_mean": -2.742706919889315e-05,
"cov_std": 0.2498251087963581,
"entropy": 0.369140625,
"epoch": 0.02057142857142857,
"grad_norm": 0.15079385042190552,
"kl": 2.0952895283699036e-05,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0577,
"reward": 0.5729166939854622,
"reward_std": 0.5097959190607071,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.4062500149011612,
"step": 18,
"w_high_ratio": 0.0,
"w_low_ratio": 0.034274401143193245,
"w_max": 1.457490622997284,
"w_mean": 1.1403506994247437,
"w_min": 0.0,
"w_std": 0.16276290826499462
},
{
"completion_length": 3139.635498046875,
"cov_mean": 1.529891596874222e-05,
"cov_std": 0.13174043968319893,
"entropy": 0.39208984375,
"epoch": 0.021714285714285714,
"grad_norm": 0.08939936012029648,
"kl": 5.739927291870117e-05,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0233,
"reward": 0.8541666828095913,
"reward_std": 0.3213166669011116,
"rewards/accuracy_reward": 0.3750000111758709,
"rewards/format_reward": 0.47916666977107525,
"step": 19,
"w_high_ratio": 0.035190850496292114,
"w_low_ratio": 0.016721592284739017,
"w_max": 1.647162914276123,
"w_mean": 1.2506683766841888,
"w_min": 0.25,
"w_std": 0.09711403585970402
},
{
"completion_length": 2464.385498046875,
"cov_mean": 3.157450896651426e-05,
"cov_std": 0.283736914396286,
"entropy": 0.3369140625,
"epoch": 0.022857142857142857,
"grad_norm": 0.3488742411136627,
"kl": 9.429454803466797e-05,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0914,
"reward": 0.9375000596046448,
"reward_std": 0.4943716749548912,
"rewards/accuracy_reward": 0.2604166716337204,
"rewards/format_reward": 0.6770833656191826,
"step": 20,
"w_high_ratio": 0.0566110759973526,
"w_low_ratio": 0.031579687260091305,
"w_max": 2.3184494078159332,
"w_mean": 1.4481623768806458,
"w_min": 0.0,
"w_std": 0.28026906587183475
},
{
"completion_length": 2847.21875,
"cov_mean": -2.243259518763807e-05,
"cov_std": 0.18684318475425243,
"entropy": 0.423828125,
"epoch": 0.024,
"grad_norm": 0.13671471178531647,
"kl": 0.00033351778984069824,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0498,
"reward": 0.6562500204890966,
"reward_std": 0.38981083035469055,
"rewards/accuracy_reward": 0.18750000465661287,
"rewards/format_reward": 0.4687500027939677,
"step": 21,
"w_high_ratio": 0.08715118188410997,
"w_low_ratio": 0.021108672255650163,
"w_max": 1.9608261287212372,
"w_mean": 1.387522131204605,
"w_min": 4.576730842832761e-23,
"w_std": 0.14338573440909386
},
{
"completion_length": 1849.3542175292969,
"cov_mean": -5.1019123930018395e-05,
"cov_std": 0.208794716745615,
"entropy": 0.3994140625,
"epoch": 0.025142857142857144,
"grad_norm": 0.1823095828294754,
"kl": 0.00039577484130859375,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0503,
"reward": 1.1041666716337204,
"reward_std": 0.363413717597723,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/format_reward": 0.833333358168602,
"step": 22,
"w_high_ratio": 0.3683718554675579,
"w_low_ratio": 0.028410385129973292,
"w_max": 2.5447845458984375,
"w_mean": 1.7170847058296204,
"w_min": 1.7296456515038733e-32,
"w_std": 0.18113290891051292
},
{
"completion_length": 2786.604217529297,
"cov_mean": 6.156731569717522e-05,
"cov_std": 0.21581846103072166,
"entropy": 0.3828125,
"epoch": 0.026285714285714287,
"grad_norm": 0.10202132165431976,
"kl": 0.00020551681518554688,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0436,
"reward": 0.708333358168602,
"reward_std": 0.47307053953409195,
"rewards/accuracy_reward": 0.22916667722165585,
"rewards/format_reward": 0.4791666716337204,
"step": 23,
"w_high_ratio": 0.05155515298247337,
"w_low_ratio": 0.03038623696193099,
"w_max": 1.8245242238044739,
"w_mean": 1.2686880826950073,
"w_min": 2.0108632963061125e-43,
"w_std": 0.19650068879127502
},
{
"completion_length": 2932.14599609375,
"cov_mean": 3.685860542645969e-05,
"cov_std": 0.19567562174052,
"entropy": 0.35986328125,
"epoch": 0.027428571428571427,
"grad_norm": 0.11536505818367004,
"kl": 0.00012372806668281555,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0259,
"reward": 0.8229167014360428,
"reward_std": 0.41623104363679886,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/format_reward": 0.5520833507180214,
"step": 24,
"w_high_ratio": 0.05616182088851929,
"w_low_ratio": 0.025760386954061687,
"w_max": 1.7301380336284637,
"w_mean": 1.2399356663227081,
"w_min": 0.25,
"w_std": 0.16614723671227694
},
{
"completion_length": 2980.104248046875,
"cov_mean": 5.5617931593587855e-06,
"cov_std": 0.20383853651583195,
"entropy": 0.43408203125,
"epoch": 0.02857142857142857,
"grad_norm": 0.13895268738269806,
"kl": 0.0003798753023147583,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0328,
"reward": 0.6145833609625697,
"reward_std": 0.4326799139380455,
"rewards/accuracy_reward": 0.20833334140479565,
"rewards/format_reward": 0.4062500102445483,
"step": 25,
"w_high_ratio": 0.15471260249614716,
"w_low_ratio": 0.027910931850783527,
"w_max": 1.878886878490448,
"w_mean": 1.3457823991775513,
"w_min": 2.1938871947043534e-19,
"w_std": 0.21430648770183325
},
{
"completion_length": 3121.822998046875,
"cov_mean": -1.5570902263561948e-05,
"cov_std": 0.12479476444423199,
"entropy": 0.41845703125,
"epoch": 0.029714285714285714,
"grad_norm": 0.055989839136600494,
"kl": 5.704164505004883e-05,
"learning_rate": 9.397114317029974e-07,
"loss": 0.006,
"reward": 0.7395833656191826,
"reward_std": 0.2778088226914406,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.4895833358168602,
"step": 26,
"w_high_ratio": 0.05233287438750267,
"w_low_ratio": 0.016803464153781533,
"w_max": 1.667470008134842,
"w_mean": 1.2069356143474579,
"w_min": 1.8282741064045888e-40,
"w_std": 0.12309898342937231
},
{
"completion_length": 3419.697998046875,
"cov_mean": -8.16069814391085e-06,
"cov_std": 0.2326441928744316,
"entropy": 0.45654296875,
"epoch": 0.030857142857142857,
"grad_norm": 0.1154065951704979,
"kl": 0.0001367814838886261,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0629,
"reward": 0.4687500223517418,
"reward_std": 0.4522514268755913,
"rewards/accuracy_reward": 0.11458333861082792,
"rewards/format_reward": 0.3541666716337204,
"step": 27,
"w_high_ratio": 0.125,
"w_low_ratio": 0.03899317281320691,
"w_max": 1.7131148278713226,
"w_mean": 1.2755843102931976,
"w_min": 2.3244726053753363e-31,
"w_std": 0.15411211177706718
},
{
"completion_length": 3003.3334350585938,
"cov_mean": 3.4548415897006635e-06,
"cov_std": 0.18210824206471443,
"entropy": 0.40576171875,
"epoch": 0.032,
"grad_norm": 0.10329318046569824,
"kl": 0.00033906102180480957,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0466,
"reward": 0.729166679084301,
"reward_std": 0.4190382733941078,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.4375000149011612,
"step": 28,
"w_high_ratio": 0.045407865196466446,
"w_low_ratio": 0.020685997209511697,
"w_max": 1.9191896319389343,
"w_mean": 1.28102046251297,
"w_min": 1.1079171471645686e-36,
"w_std": 0.15209556370973587
},
{
"completion_length": 3622.8438110351562,
"cov_mean": -1.4215014289220562e-05,
"cov_std": 0.19959762692451477,
"entropy": 0.43701171875,
"epoch": 0.03314285714285714,
"grad_norm": 0.11793094128370285,
"kl": 0.00043398141860961914,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0686,
"reward": 0.2812500149011612,
"reward_std": 0.343124657869339,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.2187500074505806,
"step": 29,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03135715611279011,
"w_max": 1.389756977558136,
"w_mean": 1.1044960916042328,
"w_min": 0.5,
"w_std": 0.15529824048280716
},
{
"completion_length": 3211.6563110351562,
"cov_mean": 2.0011442074974184e-05,
"cov_std": 0.3438211902976036,
"entropy": 0.40087890625,
"epoch": 0.03428571428571429,
"grad_norm": 0.15425726771354675,
"kl": 0.0005748271942138672,
"learning_rate": 9.046048391230247e-07,
"loss": 0.096,
"reward": 0.7708333544433117,
"reward_std": 0.6762835085391998,
"rewards/accuracy_reward": 0.2812500139698386,
"rewards/format_reward": 0.489583358168602,
"step": 30,
"w_high_ratio": 0.0955454632639885,
"w_low_ratio": 0.04459251323714852,
"w_max": 1.7270594835281372,
"w_mean": 1.2695180475711823,
"w_min": 0.0,
"w_std": 0.22970640659332275
},
{
"completion_length": 3313.7500610351562,
"cov_mean": -6.734976523148362e-07,
"cov_std": 0.1561581064015627,
"entropy": 0.38623046875,
"epoch": 0.03542857142857143,
"grad_norm": 0.12495917081832886,
"kl": 0.00024247169494628906,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0659,
"reward": 0.47916669212281704,
"reward_std": 0.3414399288594723,
"rewards/accuracy_reward": 0.16666667722165585,
"rewards/format_reward": 0.31250001303851604,
"step": 31,
"w_high_ratio": 0.12074629962444305,
"w_low_ratio": 0.02205055020749569,
"w_max": 1.9694485068321228,
"w_mean": 1.323824942111969,
"w_min": 3.479372530225627e-30,
"w_std": 0.15388164669275284
},
{
"completion_length": 3430.5521850585938,
"cov_mean": -1.8880080915550934e-05,
"cov_std": 0.24603740125894547,
"entropy": 0.4443359375,
"epoch": 0.036571428571428574,
"grad_norm": 0.10446158051490784,
"kl": 0.00040030479431152344,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0417,
"reward": 0.6875000223517418,
"reward_std": 0.4970519095659256,
"rewards/accuracy_reward": 0.2604166679084301,
"rewards/format_reward": 0.4270833507180214,
"step": 32,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03472677152603865,
"w_max": 1.578925609588623,
"w_mean": 1.1632550954818726,
"w_min": 0.0,
"w_std": 0.17994992434978485
},
{
"completion_length": 3569.229248046875,
"cov_mean": -1.2864127711509354e-05,
"cov_std": 0.21655914932489395,
"entropy": 0.3828125,
"epoch": 0.037714285714285714,
"grad_norm": 0.11952047049999237,
"kl": 0.00048720836639404297,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0213,
"reward": 0.5833333507180214,
"reward_std": 0.4569981172680855,
"rewards/accuracy_reward": 0.229166679084301,
"rewards/format_reward": 0.3541666716337204,
"step": 33,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03175507392734289,
"w_max": 1.3123357892036438,
"w_mean": 1.0974721312522888,
"w_min": 0.25,
"w_std": 0.16161495074629784
},
{
"completion_length": 2714.0000610351562,
"cov_mean": -3.014505455212202e-05,
"cov_std": 0.24434123933315277,
"entropy": 0.462890625,
"epoch": 0.038857142857142854,
"grad_norm": 0.24054297804832458,
"kl": 0.0010285377502441406,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0264,
"reward": 0.8541666865348816,
"reward_std": 0.43565599620342255,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.5208333432674408,
"step": 34,
"w_high_ratio": 0.08968023210763931,
"w_low_ratio": 0.03058682754635811,
"w_max": 1.8369105458259583,
"w_mean": 1.3255797028541565,
"w_min": 0.25,
"w_std": 0.23013706505298615
},
{
"completion_length": 3206.260498046875,
"cov_mean": 1.827982691793295e-05,
"cov_std": 0.2361072190105915,
"entropy": 0.42578125,
"epoch": 0.04,
"grad_norm": 0.13136403262615204,
"kl": 0.0009332895278930664,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0513,
"reward": 0.5625000149011612,
"reward_std": 0.47499874979257584,
"rewards/accuracy_reward": 0.1979166679084301,
"rewards/format_reward": 0.3645833469927311,
"step": 35,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03539817640557885,
"w_max": 1.4059478044509888,
"w_mean": 1.1172049045562744,
"w_min": 0.0,
"w_std": 0.1749916821718216
},
{
"completion_length": 3699.416748046875,
"cov_mean": -1.4997711559772142e-05,
"cov_std": 0.20064959302544594,
"entropy": 0.50439453125,
"epoch": 0.04114285714285714,
"grad_norm": 0.1509845107793808,
"kl": 0.0011619925498962402,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0429,
"reward": 0.2500000102445483,
"reward_std": 0.41391417384147644,
"rewards/accuracy_reward": 0.031250000931322575,
"rewards/format_reward": 0.21875000279396772,
"step": 36,
"w_high_ratio": 0.0,
"w_low_ratio": 0.029611330712214112,
"w_max": 1.3176401853561401,
"w_mean": 1.0739335417747498,
"w_min": 0.0,
"w_std": 0.15683909878134727
},
{
"completion_length": 3516.1563110351562,
"cov_mean": -2.547230405980372e-05,
"cov_std": 0.11416707932949066,
"entropy": 0.43994140625,
"epoch": 0.04228571428571429,
"grad_norm": 0.08760611712932587,
"kl": 0.0007746219635009766,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0156,
"reward": 0.22916667442768812,
"reward_std": 0.19299374520778656,
"rewards/accuracy_reward": 0.010416666977107525,
"rewards/format_reward": 0.21875001024454832,
"step": 37,
"w_high_ratio": 0.05747595056891441,
"w_low_ratio": 0.013685875572264194,
"w_max": 1.6239450573921204,
"w_mean": 1.1775790452957153,
"w_min": 0.25,
"w_std": 0.12097344920039177
},
{
"completion_length": 3670.822998046875,
"cov_mean": -4.859739419771358e-06,
"cov_std": 0.11942135915160179,
"entropy": 0.4833984375,
"epoch": 0.04342857142857143,
"grad_norm": 0.08295677602291107,
"kl": 0.0007152557373046875,
"learning_rate": 8.145033635316128e-07,
"loss": 0.016,
"reward": 0.322916679084301,
"reward_std": 0.24508872628211975,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/format_reward": 0.1770833432674408,
"step": 38,
"w_high_ratio": 0.0,
"w_low_ratio": 0.015665842220187187,
"w_max": 1.2278587818145752,
"w_mean": 1.0692134499549866,
"w_min": 0.5,
"w_std": 0.08390428125858307
},
{
"completion_length": 3133.5521850585938,
"cov_mean": 1.6014040738809854e-05,
"cov_std": 0.15454116463661194,
"entropy": 0.38427734375,
"epoch": 0.044571428571428574,
"grad_norm": 0.09236446022987366,
"kl": 0.0011830329895019531,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0191,
"reward": 0.770833358168602,
"reward_std": 0.30482664704322815,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.520833358168602,
"step": 39,
"w_high_ratio": 0.0,
"w_low_ratio": 0.021826621610671282,
"w_max": 1.4440618753433228,
"w_mean": 1.1500347554683685,
"w_min": 0.25,
"w_std": 0.10476426035165787
},
{
"completion_length": 2921.5938110351562,
"cov_mean": 3.307407860120293e-05,
"cov_std": 0.18591826409101486,
"entropy": 0.4111328125,
"epoch": 0.045714285714285714,
"grad_norm": 0.1281704157590866,
"kl": 0.0041351318359375,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0448,
"reward": 0.6770833507180214,
"reward_std": 0.39515648037195206,
"rewards/accuracy_reward": 0.17708333395421505,
"rewards/format_reward": 0.5000000149011612,
"step": 40,
"w_high_ratio": 0.09708013385534286,
"w_low_ratio": 0.02820506482385099,
"w_max": 1.9662592709064484,
"w_mean": 1.3416504263877869,
"w_min": 1.7285604841107016e-17,
"w_std": 0.17729274183511734
},
{
"completion_length": 3497.4063110351562,
"cov_mean": -7.282104343175888e-05,
"cov_std": 0.2843910865485668,
"entropy": 0.40771484375,
"epoch": 0.046857142857142854,
"grad_norm": 0.16035234928131104,
"kl": 0.0008625984191894531,
"learning_rate": 7.75e-07,
"loss": 0.0447,
"reward": 0.4583333507180214,
"reward_std": 0.5345464050769806,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.3333333507180214,
"step": 41,
"w_high_ratio": 0.015127741731703281,
"w_low_ratio": 0.04166511259973049,
"w_max": 1.6356081068515778,
"w_mean": 1.1487390100955963,
"w_min": 3.531651517161998e-24,
"w_std": 0.21111097559332848
},
{
"completion_length": 3070.854248046875,
"cov_mean": 4.5878337004978675e-06,
"cov_std": 0.0855317497625947,
"entropy": 0.48388671875,
"epoch": 0.048,
"grad_norm": 0.06175260245800018,
"kl": 0.0006914138793945312,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0063,
"reward": 0.3229166716337204,
"reward_std": 0.17353228479623795,
"rewards/accuracy_reward": 0.010416666977107525,
"rewards/format_reward": 0.3125000074505806,
"step": 42,
"w_high_ratio": 0.05438845232129097,
"w_low_ratio": 0.008411283954046667,
"w_max": 1.544800043106079,
"w_mean": 1.1694203615188599,
"w_min": 0.5,
"w_std": 0.07276808470487595
},
{
"completion_length": 3378.8125610351562,
"cov_mean": -2.5848277346085524e-05,
"cov_std": 0.2625325694680214,
"entropy": 0.43701171875,
"epoch": 0.04914285714285714,
"grad_norm": 0.18838584423065186,
"kl": 0.0014505386352539062,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0805,
"reward": 0.479166679084301,
"reward_std": 0.5196144729852676,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/format_reward": 0.2916666716337204,
"step": 43,
"w_high_ratio": 0.05749715492129326,
"w_low_ratio": 0.038668573601171374,
"w_max": 1.9480818212032318,
"w_mean": 1.1805387139320374,
"w_min": 2.3359345679368763e-34,
"w_std": 0.1971494909375906
},
{
"completion_length": 2916.9791717529297,
"cov_mean": -3.640100658230949e-06,
"cov_std": 0.23978274501860142,
"entropy": 0.41162109375,
"epoch": 0.05028571428571429,
"grad_norm": 0.16967085003852844,
"kl": 0.005632162094116211,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0373,
"reward": 0.7708333460614085,
"reward_std": 0.43626825511455536,
"rewards/accuracy_reward": 0.2812500074505806,
"rewards/format_reward": 0.4895833386108279,
"step": 44,
"w_high_ratio": 0.11527429521083832,
"w_low_ratio": 0.0326957437209785,
"w_max": 1.8800793588161469,
"w_mean": 1.356241375207901,
"w_min": 0.0,
"w_std": 0.22198213264346123
},
{
"completion_length": 3686.1875610351562,
"cov_mean": -1.2132580195611808e-05,
"cov_std": 0.17040352895855904,
"entropy": 0.4228515625,
"epoch": 0.05142857142857143,
"grad_norm": 0.0952582135796547,
"kl": 0.0019674301147460938,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0175,
"reward": 0.4687500149011612,
"reward_std": 0.3746139518916607,
"rewards/accuracy_reward": 0.2187500074505806,
"rewards/format_reward": 0.2500000074505806,
"step": 45,
"w_high_ratio": 0.0,
"w_low_ratio": 0.022616846952587366,
"w_max": 1.2881874740123749,
"w_mean": 1.0595116317272186,
"w_min": 0.25,
"w_std": 0.11086289770901203
},
{
"completion_length": 3573.8229370117188,
"cov_mean": -5.21990023116814e-06,
"cov_std": 0.09164197091013193,
"entropy": 0.5361328125,
"epoch": 0.052571428571428575,
"grad_norm": 0.061532407999038696,
"kl": 0.0024976730346679688,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0226,
"reward": 0.20833333488553762,
"reward_std": 0.21344273164868355,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.1666666679084301,
"step": 46,
"w_high_ratio": 0.05877559259533882,
"w_low_ratio": 0.011987740639597178,
"w_max": 1.5505282580852509,
"w_mean": 1.1429267823696136,
"w_min": 3.571343117882909e-28,
"w_std": 0.09135792590677738
},
{
"completion_length": 3139.9375610351562,
"cov_mean": 1.948155477293767e-05,
"cov_std": 0.3308473080396652,
"entropy": 0.43017578125,
"epoch": 0.053714285714285714,
"grad_norm": 0.287928968667984,
"kl": 0.0011713504791259766,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0536,
"reward": 0.8541666716337204,
"reward_std": 0.5766339302062988,
"rewards/accuracy_reward": 0.322916679084301,
"rewards/format_reward": 0.5312500149011612,
"step": 47,
"w_high_ratio": 0.12357743084430695,
"w_low_ratio": 0.038592321798205376,
"w_max": 2.0394512712955475,
"w_mean": 1.3565464913845062,
"w_min": 0.25,
"w_std": 0.26638074964284897
},
{
"completion_length": 3051.385467529297,
"cov_mean": -6.733167197126022e-06,
"cov_std": 0.19061635434627533,
"entropy": 0.4501953125,
"epoch": 0.054857142857142854,
"grad_norm": 0.1374855488538742,
"kl": 0.00525665283203125,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0486,
"reward": 0.5208333432674408,
"reward_std": 0.37667082995176315,
"rewards/accuracy_reward": 0.1770833395421505,
"rewards/format_reward": 0.34375000838190317,
"step": 48,
"w_high_ratio": 0.12677159160375595,
"w_low_ratio": 0.023622059728950262,
"w_max": 2.024912714958191,
"w_mean": 1.3519074320793152,
"w_min": 7.271374424684445e-33,
"w_std": 0.19886896945536137
},
{
"completion_length": 2580.0521850585938,
"cov_mean": 2.3781666641298216e-05,
"cov_std": 0.2574050724506378,
"entropy": 0.39306640625,
"epoch": 0.056,
"grad_norm": 0.13522955775260925,
"kl": 0.0030527114868164062,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0379,
"reward": 0.8020833730697632,
"reward_std": 0.47789302468299866,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/format_reward": 0.5937500149011612,
"step": 49,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03599585313349962,
"w_max": 1.5133522152900696,
"w_mean": 1.177164077758789,
"w_min": 3.952712643244228e-41,
"w_std": 0.19642843678593636
},
{
"completion_length": 3276.2188110351562,
"cov_mean": 3.7818183841409336e-05,
"cov_std": 0.1878571268171072,
"entropy": 0.36767578125,
"epoch": 0.05714285714285714,
"grad_norm": 0.10174579173326492,
"kl": 0.0021190643310546875,
"learning_rate": 6.435602608679916e-07,
"loss": 0.04,
"reward": 0.5937500102445483,
"reward_std": 0.3714478053152561,
"rewards/accuracy_reward": 0.2604166716337204,
"rewards/format_reward": 0.3333333386108279,
"step": 50,
"w_high_ratio": 0.0,
"w_low_ratio": 0.0255408501252532,
"w_max": 1.3753422796726227,
"w_mean": 1.1391299068927765,
"w_min": 0.0,
"w_std": 0.12728617619723082
},
{
"completion_length": 2626.7084350585938,
"cov_mean": -8.886720934242476e-06,
"cov_std": 0.154384421184659,
"entropy": 0.46826171875,
"epoch": 0.05828571428571429,
"grad_norm": 0.10762708634138107,
"kl": 0.006221771240234375,
"learning_rate": 6.281416799501187e-07,
"loss": 0.038,
"reward": 0.5416666865348816,
"reward_std": 0.27000611275434494,
"rewards/accuracy_reward": 0.08333333674818277,
"rewards/format_reward": 0.4583333358168602,
"step": 51,
"w_high_ratio": 0.0,
"w_low_ratio": 0.022287086583673954,
"w_max": 1.522942990064621,
"w_mean": 1.16915962100029,
"w_min": 0.25,
"w_std": 0.10866253450512886
},
{
"completion_length": 3225.6875610351562,
"cov_mean": -3.420543362153694e-05,
"cov_std": 0.3140456900000572,
"entropy": 0.41357421875,
"epoch": 0.05942857142857143,
"grad_norm": 0.16037067770957947,
"kl": 0.0017061233520507812,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0307,
"reward": 0.833333358168602,
"reward_std": 0.5814172253012657,
"rewards/accuracy_reward": 0.3854166865348816,
"rewards/format_reward": 0.4479166716337204,
"step": 52,
"w_high_ratio": 0.0,
"w_low_ratio": 0.041595788672566414,
"w_max": 1.4235666990280151,
"w_mean": 1.1377580165863037,
"w_min": 0.25,
"w_std": 0.20165112614631653
},
{
"completion_length": 3038.3126220703125,
"cov_mean": 3.059411119465949e-05,
"cov_std": 0.34266950748860836,
"entropy": 0.43994140625,
"epoch": 0.060571428571428575,
"grad_norm": 0.18063929677009583,
"kl": 0.004637241363525391,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0228,
"reward": 0.9062500447034836,
"reward_std": 0.5983624011278152,
"rewards/accuracy_reward": 0.3229166828095913,
"rewards/format_reward": 0.583333358168602,
"step": 53,
"w_high_ratio": 0.0893278568983078,
"w_low_ratio": 0.048161128303036094,
"w_max": 1.605346292257309,
"w_mean": 1.2202682793140411,
"w_min": 3.5021185811519566e-32,
"w_std": 0.2196234930306673
},
{
"completion_length": 3048.3021850585938,
"cov_mean": -3.5217308322899044e-05,
"cov_std": 0.40455804020166397,
"entropy": 0.4111328125,
"epoch": 0.061714285714285715,
"grad_norm": 0.26634302735328674,
"kl": 0.0014967918395996094,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0485,
"reward": 1.0104167088866234,
"reward_std": 0.6799703985452652,
"rewards/accuracy_reward": 0.4375000111758709,
"rewards/format_reward": 0.5729166828095913,
"step": 54,
"w_high_ratio": 0.14361883699893951,
"w_low_ratio": 0.03988745156675577,
"w_max": 1.914384812116623,
"w_mean": 1.3435330390930176,
"w_min": 0.0,
"w_std": 0.2648167684674263
},
{
"completion_length": 3372.3959350585938,
"cov_mean": 5.4979325341264484e-06,
"cov_std": 0.25056118331849575,
"entropy": 0.44189453125,
"epoch": 0.06285714285714286,
"grad_norm": 0.0977473184466362,
"kl": 0.0019044876098632812,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0381,
"reward": 0.6562500223517418,
"reward_std": 0.4964308738708496,
"rewards/accuracy_reward": 0.27083334140479565,
"rewards/format_reward": 0.3854166716337204,
"step": 55,
"w_high_ratio": 0.0,
"w_low_ratio": 0.0342027700971812,
"w_max": 1.4320927858352661,
"w_mean": 1.1150383353233337,
"w_min": 0.0,
"w_std": 0.18786128982901573
},
{
"completion_length": 3225.2709045410156,
"cov_mean": 5.9108706409460865e-06,
"cov_std": 0.22232061624526978,
"entropy": 0.43115234375,
"epoch": 0.064,
"grad_norm": 0.11878591775894165,
"kl": 0.00154876708984375,
"learning_rate": 5.5e-07,
"loss": 0.0112,
"reward": 0.6458333656191826,
"reward_std": 0.39009611308574677,
"rewards/accuracy_reward": 0.19791666697710752,
"rewards/format_reward": 0.4479166828095913,
"step": 56,
"w_high_ratio": 0.0,
"w_low_ratio": 0.028656802838668227,
"w_max": 1.5001116394996643,
"w_mean": 1.2033225297927856,
"w_min": 1.9247332911380988e-31,
"w_std": 0.18417230807244778
},
{
"completion_length": 3587.6458740234375,
"cov_mean": -4.519502726907376e-05,
"cov_std": 0.24361642450094223,
"entropy": 0.34423828125,
"epoch": 0.06514285714285714,
"grad_norm": 0.11000871658325195,
"kl": 0.0006794929504394531,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0423,
"reward": 0.4375,
"reward_std": 0.467288788408041,
"rewards/accuracy_reward": 0.1354166716337204,
"rewards/format_reward": 0.3020833358168602,
"step": 57,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03233239706605673,
"w_max": 1.2505627870559692,
"w_mean": 1.0757884085178375,
"w_min": 0.25,
"w_std": 0.14500370249152184
},
{
"completion_length": 2626.947998046875,
"cov_mean": -1.4388041108759353e-05,
"cov_std": 0.2534067742526531,
"entropy": 0.38427734375,
"epoch": 0.06628571428571428,
"grad_norm": 0.15062791109085083,
"kl": 0.0038776397705078125,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0665,
"reward": 0.916666716337204,
"reward_std": 0.43652553856372833,
"rewards/accuracy_reward": 0.2812500009313226,
"rewards/format_reward": 0.635416692122817,
"step": 58,
"w_high_ratio": 0.12001378461718559,
"w_low_ratio": 0.03707017982378602,
"w_max": 2.2393843233585358,
"w_mean": 1.5285146832466125,
"w_min": 2.338311714957214e-41,
"w_std": 0.2577071785926819
},
{
"completion_length": 3455.0208740234375,
"cov_mean": -2.0748303086293163e-05,
"cov_std": 0.2206678595393896,
"entropy": 0.3994140625,
"epoch": 0.06742857142857143,
"grad_norm": 0.1635066419839859,
"kl": 0.0013880729675292969,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0447,
"reward": 0.39583333395421505,
"reward_std": 0.3752100467681885,
"rewards/accuracy_reward": 0.14583333861082792,
"rewards/format_reward": 0.2500000102445483,
"step": 59,
"w_high_ratio": 0.121368907392025,
"w_low_ratio": 0.02756796986795962,
"w_max": 1.8036501705646515,
"w_mean": 1.2463297247886658,
"w_min": 0.25,
"w_std": 0.14911611750721931
},
{
"completion_length": 3254.92724609375,
"cov_mean": -1.7401176137354923e-05,
"cov_std": 0.17664196342229843,
"entropy": 0.396484375,
"epoch": 0.06857142857142857,
"grad_norm": 0.08288750052452087,
"kl": 0.0025072097778320312,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0281,
"reward": 0.4583333432674408,
"reward_std": 0.39030885696411133,
"rewards/accuracy_reward": 0.11458333488553762,
"rewards/format_reward": 0.34375000558793545,
"step": 60,
"w_high_ratio": 0.0,
"w_low_ratio": 0.022112081991508603,
"w_max": 1.5709031820297241,
"w_mean": 1.1311749517917633,
"w_min": 4.6449540846206874e-42,
"w_std": 0.12603357434272766
},
{
"completion_length": 3376.6771850585938,
"cov_mean": 2.7930617193305807e-05,
"cov_std": 0.22145200800150633,
"entropy": 0.400390625,
"epoch": 0.06971428571428571,
"grad_norm": 0.2130555361509323,
"kl": 0.0013718605041503906,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0505,
"reward": 0.6666666669771075,
"reward_std": 0.4248874858021736,
"rewards/accuracy_reward": 0.1979166716337204,
"rewards/format_reward": 0.4687500102445483,
"step": 61,
"w_high_ratio": 0.02656024508178234,
"w_low_ratio": 0.02897683286573738,
"w_max": 1.5726596117019653,
"w_mean": 1.1504198908805847,
"w_min": 0.25,
"w_std": 0.15992471296340227
},
{
"completion_length": 2804.3438110351562,
"cov_mean": 5.110432311994373e-05,
"cov_std": 0.3370564728975296,
"entropy": 0.385986328125,
"epoch": 0.07085714285714285,
"grad_norm": 0.20217926800251007,
"kl": 0.0056667327880859375,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0984,
"reward": 0.854166679084301,
"reward_std": 0.6372481435537338,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/format_reward": 0.5625000074505806,
"step": 62,
"w_high_ratio": 0.15436114370822906,
"w_low_ratio": 0.04324930440634489,
"w_max": 2.3450452983379364,
"w_mean": 1.4747015237808228,
"w_min": 2.7272514644043088e-36,
"w_std": 0.31457675993442535
},
{
"completion_length": 2748.1876220703125,
"cov_mean": 9.652720564190531e-06,
"cov_std": 0.3283480554819107,
"entropy": 0.43310546875,
"epoch": 0.072,
"grad_norm": 0.15674079954624176,
"kl": 0.0032906532287597656,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0725,
"reward": 1.031250037252903,
"reward_std": 0.5360563546419144,
"rewards/accuracy_reward": 0.3541666753590107,
"rewards/format_reward": 0.6770833432674408,
"step": 63,
"w_high_ratio": 0.0625,
"w_low_ratio": 0.04385069524869323,
"w_max": 1.8759834170341492,
"w_mean": 1.3464274108409882,
"w_min": 0.0,
"w_std": 0.1958361305296421
},
{
"completion_length": 3358.3334350585938,
"cov_mean": -4.510660664891475e-05,
"cov_std": 0.2794957533478737,
"entropy": 0.44921875,
"epoch": 0.07314285714285715,
"grad_norm": 0.12678052484989166,
"kl": 0.005690097808837891,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0603,
"reward": 0.5937500149011612,
"reward_std": 0.583733007311821,
"rewards/accuracy_reward": 0.2187500037252903,
"rewards/format_reward": 0.3750000074505806,
"step": 64,
"w_high_ratio": 0.044796403497457504,
"w_low_ratio": 0.03829633165150881,
"w_max": 1.6352408528327942,
"w_mean": 1.167921930551529,
"w_min": 2.6764800668604006e-43,
"w_std": 0.20047394558787346
},
{
"completion_length": 3026.5209350585938,
"cov_mean": 1.992606485146098e-05,
"cov_std": 0.17664698883891106,
"entropy": 0.38525390625,
"epoch": 0.07428571428571429,
"grad_norm": 0.08567796647548676,
"kl": 0.0034112930297851562,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0309,
"reward": 0.6250000260770321,
"reward_std": 0.32293669879436493,
"rewards/accuracy_reward": 0.16666666697710752,
"rewards/format_reward": 0.4583333395421505,
"step": 65,
"w_high_ratio": 0.0,
"w_low_ratio": 0.029133206233382225,
"w_max": 1.3120096027851105,
"w_mean": 1.1221435964107513,
"w_min": 1.0468715588988584e-22,
"w_std": 0.12845914252102375
},
{
"completion_length": 2413.1250610351562,
"cov_mean": -4.743512135974015e-06,
"cov_std": 0.05960770323872566,
"entropy": 0.35693359375,
"epoch": 0.07542857142857143,
"grad_norm": 0.08712891489267349,
"kl": 0.0032825469970703125,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0194,
"reward": 0.8645833432674408,
"reward_std": 0.13795074447989464,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.4895833358168602,
"step": 66,
"w_high_ratio": 0.04171403869986534,
"w_low_ratio": 0.006300564622506499,
"w_max": 1.5700030624866486,
"w_mean": 1.1926406025886536,
"w_min": 0.5264238715171814,
"w_std": 0.06322706118226051
},
{
"completion_length": 3792.9271240234375,
"cov_mean": -9.492634717389592e-06,
"cov_std": 0.12886795960366726,
"entropy": 0.36865234375,
"epoch": 0.07657142857142857,
"grad_norm": 0.0828777328133583,
"kl": 0.002822399139404297,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0179,
"reward": 0.21875000558793545,
"reward_std": 0.2869785502552986,
"rewards/accuracy_reward": 0.052083334885537624,
"rewards/format_reward": 0.16666667442768812,
"step": 67,
"w_high_ratio": 0.0,
"w_low_ratio": 0.017554222606122494,
"w_max": 1.1846114993095398,
"w_mean": 1.0396882444620132,
"w_min": 0.25,
"w_std": 0.09070100169628859
},
{
"completion_length": 2557.6146240234375,
"cov_mean": 1.5981570413714508e-05,
"cov_std": 0.31408151611685753,
"entropy": 0.44384765625,
"epoch": 0.07771428571428571,
"grad_norm": 0.20466886460781097,
"kl": 0.0033721923828125,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0566,
"reward": 0.8020833507180214,
"reward_std": 0.5168246552348137,
"rewards/accuracy_reward": 0.2395833432674408,
"rewards/format_reward": 0.5625000149011612,
"step": 68,
"w_high_ratio": 0.04399501532316208,
"w_low_ratio": 0.042061637388542295,
"w_max": 1.8962246477603912,
"w_mean": 1.305674433708191,
"w_min": 4.959685019058809e-39,
"w_std": 0.2475343719124794
},
{
"completion_length": 3009.7084350585938,
"cov_mean": -4.52714293714962e-05,
"cov_std": 0.23395150154829025,
"entropy": 0.5517578125,
"epoch": 0.07885714285714286,
"grad_norm": 0.18218518793582916,
"kl": 0.014312744140625,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0752,
"reward": 0.4687500149011612,
"reward_std": 0.43293242901563644,
"rewards/accuracy_reward": 0.09375000279396772,
"rewards/format_reward": 0.37500000558793545,
"step": 69,
"w_high_ratio": 0.057301439344882965,
"w_low_ratio": 0.0333517212420702,
"w_max": 1.9855602085590363,
"w_mean": 1.2992196083068848,
"w_min": 7.707141553786494e-45,
"w_std": 0.19260139763355255
},
{
"completion_length": 3280.0833740234375,
"cov_mean": 2.8561088129208656e-05,
"cov_std": 0.19498306885361671,
"entropy": 0.3720703125,
"epoch": 0.08,
"grad_norm": 0.10543849319219589,
"kl": 0.010352134704589844,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0185,
"reward": 0.5416667014360428,
"reward_std": 0.3840207904577255,
"rewards/accuracy_reward": 0.1250000074505806,
"rewards/format_reward": 0.4166666716337204,
"step": 70,
"w_high_ratio": 0.0,
"w_low_ratio": 0.02808787301182747,
"w_max": 1.2882064878940582,
"w_mean": 1.0968604385852814,
"w_min": 0.25,
"w_std": 0.1326066516339779
},
{
"completion_length": 2855.7708740234375,
"cov_mean": 4.004434208582097e-05,
"cov_std": 0.13545112498104572,
"entropy": 0.42724609375,
"epoch": 0.08114285714285714,
"grad_norm": 0.11269883066415787,
"kl": 0.014951705932617188,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0154,
"reward": 0.5729166967794299,
"reward_std": 0.2492993399500847,
"rewards/accuracy_reward": 0.1979166716337204,
"rewards/format_reward": 0.3750000027939677,
"step": 71,
"w_high_ratio": 0.08807118237018585,
"w_low_ratio": 0.01654834917280823,
"w_max": 1.8029770255088806,
"w_mean": 1.2483810186386108,
"w_min": 0.25,
"w_std": 0.1558239422738552
},
{
"completion_length": 3550.322998046875,
"cov_mean": -4.4116359276813455e-07,
"cov_std": 0.24292385205626488,
"entropy": 0.5107421875,
"epoch": 0.08228571428571428,
"grad_norm": 0.20913389325141907,
"kl": 0.0040874481201171875,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0498,
"reward": 0.3750000102445483,
"reward_std": 0.40810926631093025,
"rewards/accuracy_reward": 0.0729166716337204,
"rewards/format_reward": 0.3020833386108279,
"step": 72,
"w_high_ratio": 0.033293891698122025,
"w_low_ratio": 0.03450615704059601,
"w_max": 1.6246004700660706,
"w_mean": 1.1392557322978973,
"w_min": 2.138569791073276e-36,
"w_std": 0.19269496202468872
},
{
"completion_length": 3837.1146240234375,
"cov_mean": 2.1203804863034748e-06,
"cov_std": 0.15915799140930176,
"entropy": 0.51171875,
"epoch": 0.08342857142857144,
"grad_norm": 0.08118956536054611,
"kl": 0.0014677047729492188,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0146,
"reward": 0.2708333432674408,
"reward_std": 0.25903886556625366,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/format_reward": 0.1666666716337204,
"step": 73,
"w_high_ratio": 0.0,
"w_low_ratio": 0.020282023586332798,
"w_max": 1.1690112948417664,
"w_mean": 1.027027040719986,
"w_min": 0.5,
"w_std": 0.09119972214102745
},
{
"completion_length": 3503.1250610351562,
"cov_mean": -4.496445217228029e-05,
"cov_std": 0.2548239603638649,
"entropy": 0.41748046875,
"epoch": 0.08457142857142858,
"grad_norm": 0.1568731665611267,
"kl": 0.00296783447265625,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0655,
"reward": 0.5625000074505806,
"reward_std": 0.4641239196062088,
"rewards/accuracy_reward": 0.2604166679084301,
"rewards/format_reward": 0.3020833358168602,
"step": 74,
"w_high_ratio": 0.0,
"w_low_ratio": 0.033767144195735455,
"w_max": 1.4706333875656128,
"w_mean": 1.078809916973114,
"w_min": 0.25,
"w_std": 0.1507711410522461
},
{
"completion_length": 3346.1875610351562,
"cov_mean": -7.328241736104246e-06,
"cov_std": 0.20871411636471748,
"entropy": 0.404296875,
"epoch": 0.08571428571428572,
"grad_norm": 0.10099554806947708,
"kl": 0.0059261322021484375,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0554,
"reward": 0.572916679084301,
"reward_std": 0.3865407630801201,
"rewards/accuracy_reward": 0.1875,
"rewards/format_reward": 0.3854166716337204,
"step": 75,
"w_high_ratio": 0.0,
"w_low_ratio": 0.032186293974518776,
"w_max": 1.5220047235488892,
"w_mean": 1.1460089683532715,
"w_min": 0.25,
"w_std": 0.17850109934806824
},
{
"completion_length": 3092.635498046875,
"cov_mean": 4.942317445966182e-06,
"cov_std": 0.20054961927235126,
"entropy": 0.44091796875,
"epoch": 0.08685714285714285,
"grad_norm": 0.14435574412345886,
"kl": 0.001827239990234375,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.02,
"reward": 0.583333358168602,
"reward_std": 0.34913603961467743,
"rewards/accuracy_reward": 0.10416666977107525,
"rewards/format_reward": 0.4791666939854622,
"step": 76,
"w_high_ratio": 0.0,
"w_low_ratio": 0.030099061783403158,
"w_max": 1.574487328529358,
"w_mean": 1.184053212404251,
"w_min": 1.2891045632755887e-30,
"w_std": 0.1306541245430708
},
{
"completion_length": 3414.0938110351562,
"cov_mean": 4.712834697784274e-05,
"cov_std": 0.23044732213020325,
"entropy": 0.45263671875,
"epoch": 0.088,
"grad_norm": 0.27278050780296326,
"kl": 0.0013208389282226562,
"learning_rate": 2.488912271385139e-07,
"loss": 0.015,
"reward": 0.510416672565043,
"reward_std": 0.4236603006720543,
"rewards/accuracy_reward": 0.1354166716337204,
"rewards/format_reward": 0.37500001583248377,
"step": 77,
"w_high_ratio": 0.006981382612138987,
"w_low_ratio": 0.02834776253439486,
"w_max": 1.5653101801872253,
"w_mean": 1.1680251359939575,
"w_min": 5.385388925526598e-27,
"w_std": 0.172462142072618
},
{
"completion_length": 3528.5001220703125,
"cov_mean": -2.2849541437608423e-05,
"cov_std": 0.24372886680066586,
"entropy": 0.41015625,
"epoch": 0.08914285714285715,
"grad_norm": 0.13359344005584717,
"kl": 0.0022611618041992188,
"learning_rate": 2.374037332934512e-07,
"loss": 0.037,
"reward": 0.6562500186264515,
"reward_std": 0.5271749570965767,
"rewards/accuracy_reward": 0.2812500111758709,
"rewards/format_reward": 0.3750000111758709,
"step": 78,
"w_high_ratio": 0.041247133165597916,
"w_low_ratio": 0.027018944965675473,
"w_max": 1.6266585290431976,
"w_mean": 1.1576823890209198,
"w_min": 1.890902738208876e-34,
"w_std": 0.16164034884423018
},
{
"completion_length": 2676.187545776367,
"cov_mean": 1.6489982044731732e-05,
"cov_std": 0.21785889007151127,
"entropy": 0.34912109375,
"epoch": 0.09028571428571429,
"grad_norm": 0.1319928616285324,
"kl": 0.0022878646850585938,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0289,
"reward": 0.843750037252903,
"reward_std": 0.4268086552619934,
"rewards/accuracy_reward": 0.26041667349636555,
"rewards/format_reward": 0.5833333544433117,
"step": 79,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03145516477525234,
"w_max": 1.4992458820343018,
"w_mean": 1.168209046125412,
"w_min": 0.0,
"w_std": 0.1453277636319399
},
{
"completion_length": 3573.8021240234375,
"cov_mean": -9.463059541303664e-06,
"cov_std": 0.1826024018228054,
"entropy": 0.48974609375,
"epoch": 0.09142857142857143,
"grad_norm": 0.0996597409248352,
"kl": 0.002468109130859375,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0271,
"reward": 0.520833358168602,
"reward_std": 0.3810138627886772,
"rewards/accuracy_reward": 0.1979166679084301,
"rewards/format_reward": 0.3229166753590107,
"step": 80,
"w_high_ratio": 0.0,
"w_low_ratio": 0.02435835381038487,
"w_max": 1.368900626897812,
"w_mean": 1.106232464313507,
"w_min": 0.25,
"w_std": 0.11319147422909737
},
{
"completion_length": 3308.625,
"cov_mean": -1.9006092770723626e-05,
"cov_std": 0.1842699982225895,
"entropy": 0.58740234375,
"epoch": 0.09257142857142857,
"grad_norm": 0.15561415255069733,
"kl": 0.005096435546875,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0328,
"reward": 0.4062500149011612,
"reward_std": 0.28905032202601433,
"rewards/accuracy_reward": 0.11458333395421505,
"rewards/format_reward": 0.291666679084301,
"step": 81,
"w_high_ratio": 0.125,
"w_low_ratio": 0.02731443475931883,
"w_max": 1.497319370508194,
"w_mean": 1.1954041719436646,
"w_min": 0.25,
"w_std": 0.12777045369148254
},
{
"completion_length": 3054.5938110351562,
"cov_mean": -1.190575176224229e-05,
"cov_std": 0.13467486761510372,
"entropy": 0.453125,
"epoch": 0.09371428571428571,
"grad_norm": 0.06870616227388382,
"kl": 0.0037078857421875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0283,
"reward": 0.6458333432674408,
"reward_std": 0.2705298960208893,
"rewards/accuracy_reward": 0.2187500074505806,
"rewards/format_reward": 0.4270833432674408,
"step": 82,
"w_high_ratio": 0.0,
"w_low_ratio": 0.016570631880313158,
"w_max": 1.6605907380580902,
"w_mean": 1.2332959175109863,
"w_min": 0.25,
"w_std": 0.10085548926144838
},
{
"completion_length": 3103.7188110351562,
"cov_mean": -1.1289954500171007e-05,
"cov_std": 0.25786374136805534,
"entropy": 0.4951171875,
"epoch": 0.09485714285714286,
"grad_norm": 0.11641041934490204,
"kl": 0.0033721923828125,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0561,
"reward": 0.541666679084301,
"reward_std": 0.5305610671639442,
"rewards/accuracy_reward": 0.17708333488553762,
"rewards/format_reward": 0.3645833507180214,
"step": 83,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03598734503611922,
"w_max": 1.3528369665145874,
"w_mean": 1.095271646976471,
"w_min": 0.0,
"w_std": 0.1604925710707903
},
{
"completion_length": 3278.104248046875,
"cov_mean": 3.2929374356172048e-06,
"cov_std": 0.246146522462368,
"entropy": 0.45849609375,
"epoch": 0.096,
"grad_norm": 0.15934637188911438,
"kl": 0.0013265609741210938,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0172,
"reward": 0.7187500298023224,
"reward_std": 0.432245634496212,
"rewards/accuracy_reward": 0.3125000149011612,
"rewards/format_reward": 0.4062500149011612,
"step": 84,
"w_high_ratio": 0.0,
"w_low_ratio": 0.02858129981905222,
"w_max": 1.45015150308609,
"w_mean": 1.134983777999878,
"w_min": 0.25,
"w_std": 0.15726573020219803
},
{
"completion_length": 3484.729248046875,
"cov_mean": -8.481749773636693e-06,
"cov_std": 0.29308537393808365,
"entropy": 0.37841796875,
"epoch": 0.09714285714285714,
"grad_norm": 0.1379634290933609,
"kl": 0.0017528533935546875,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0623,
"reward": 0.5104166716337204,
"reward_std": 0.6195737272500992,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.3437500037252903,
"step": 85,
"w_high_ratio": 0.0,
"w_low_ratio": 0.03877481259405613,
"w_max": 1.4117690026760101,
"w_mean": 1.119108110666275,
"w_min": 0.0,
"w_std": 0.18827635422348976
},
{
"completion_length": 3158.8125,
"cov_mean": 6.843166598713424e-06,
"cov_std": 0.12266075890511274,
"entropy": 0.47119140625,
"epoch": 0.09828571428571428,
"grad_norm": 0.09657198935747147,
"kl": 0.0029735565185546875,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.0174,
"reward": 0.541666679084301,
"reward_std": 0.2581377625465393,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.4166666716337204,
"step": 86,
"w_high_ratio": 0.0,
"w_low_ratio": 0.018843807047232985,
"w_max": 1.3829069435596466,
"w_mean": 1.131670981645584,
"w_min": 0.25,
"w_std": 0.11318285018205643
},
{
"completion_length": 3175.2188720703125,
"cov_mean": -5.853003926858946e-05,
"cov_std": 0.30259813368320465,
"entropy": 0.556640625,
"epoch": 0.09942857142857142,
"grad_norm": 0.38111400604248047,
"kl": 0.007335662841796875,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0778,
"reward": 0.7187500149011612,
"reward_std": 0.5092682540416718,
"rewards/accuracy_reward": 0.2291666753590107,
"rewards/format_reward": 0.4895833432674408,
"step": 87,
"w_high_ratio": 0.11183382570743561,
"w_low_ratio": 0.03777051903307438,
"w_max": 1.9268704950809479,
"w_mean": 1.334629088640213,
"w_min": 5.605193857299268e-45,
"w_std": 0.27019689977169037
},
{
"completion_length": 3190.9375610351562,
"cov_mean": 7.67477886256529e-05,
"cov_std": 0.46318161487579346,
"entropy": 0.5048828125,
"epoch": 0.10057142857142858,
"grad_norm": 0.2448384165763855,
"kl": 0.020694732666015625,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0906,
"reward": 0.7604167014360428,
"reward_std": 0.7656450867652893,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.4687500149011612,
"step": 88,
"w_high_ratio": 0.08163053542375565,
"w_low_ratio": 0.06242929771542549,
"w_max": 1.8597923815250397,
"w_mean": 1.2850928604602814,
"w_min": 2.421886516239847e-28,
"w_std": 0.3251073509454727
},
{
"completion_length": 3597.2396850585938,
"cov_mean": -4.437502229848178e-05,
"cov_std": 0.29206302016973495,
"entropy": 0.45703125,
"epoch": 0.10171428571428572,
"grad_norm": 0.16698718070983887,
"kl": 0.00389862060546875,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0647,
"reward": 0.4687500149011612,
"reward_std": 0.5414880514144897,
"rewards/accuracy_reward": 0.1770833395421505,
"rewards/format_reward": 0.291666679084301,
"step": 89,
"w_high_ratio": 0.0,
"w_low_ratio": 0.04101241147145629,
"w_max": 1.4729963839054108,
"w_mean": 1.0995305478572845,
"w_min": 0.0,
"w_std": 0.19675205275416374
},
{
"completion_length": 2862.125,
"cov_mean": -5.355579560273327e-06,
"cov_std": 0.0925431028008461,
"entropy": 0.638671875,
"epoch": 0.10285714285714286,
"grad_norm": 0.09346118569374084,
"kl": 0.01725006103515625,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0269,
"reward": 0.4791666716337204,
"reward_std": 0.19776283204555511,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/format_reward": 0.4583333432674408,
"step": 90,
"w_high_ratio": 0.12039810419082642,
"w_low_ratio": 0.014701983891427517,
"w_max": 2.006953328847885,
"w_mean": 1.3530822694301605,
"w_min": 0.5,
"w_std": 0.09125572815537453
},
{
"completion_length": 3527.2813110351562,
"cov_mean": -6.6539573708723765e-06,
"cov_std": 0.19490721449255943,
"entropy": 0.47802734375,
"epoch": 0.104,
"grad_norm": 0.16878993809223175,
"kl": 0.0045948028564453125,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0023,
"reward": 0.48958333395421505,
"reward_std": 0.31570227444171906,
"rewards/accuracy_reward": 0.17708333861082792,
"rewards/format_reward": 0.3125000027939677,
"step": 91,
"w_high_ratio": 0.0432400144636631,
"w_low_ratio": 0.025078749749809504,
"w_max": 1.4252241849899292,
"w_mean": 1.1511092782020569,
"w_min": 0.25,
"w_std": 0.14859570004045963
},
{
"completion_length": 2996.8333740234375,
"cov_mean": 5.000362762075383e-06,
"cov_std": 0.274563018232584,
"entropy": 0.44921875,
"epoch": 0.10514285714285715,
"grad_norm": 0.2672693729400635,
"kl": 0.011915206909179688,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0151,
"reward": 0.6875000223517418,
"reward_std": 0.3997742757201195,
"rewards/accuracy_reward": 0.1562500074505806,
"rewards/format_reward": 0.5312500074505806,
"step": 92,
"w_high_ratio": 0.057363301515579224,
"w_low_ratio": 0.03872442920692265,
"w_max": 1.9733782410621643,
"w_mean": 1.2820636332035065,
"w_min": 1.1237891646640876e-26,
"w_std": 0.21614115312695503
},
{
"completion_length": 3868.3333740234375,
"cov_mean": 2.6024475801023073e-05,
"cov_std": 0.11580366268754005,
"entropy": 0.59130859375,
"epoch": 0.10628571428571429,
"grad_norm": 0.07178976386785507,
"kl": 0.00457763671875,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0217,
"reward": 0.10416666883975267,
"reward_std": 0.23858631029725075,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/format_reward": 0.08333333488553762,
"step": 93,
"w_high_ratio": 0.0,
"w_low_ratio": 0.022031503496691585,
"w_max": 1.2298710346221924,
"w_mean": 1.0156493484973907,
"w_min": 9.954022423630139e-25,
"w_std": 0.08076347131282091
},
{
"completion_length": 3368.0834350585938,
"cov_mean": -9.980105346585333e-07,
"cov_std": 0.14382942207157612,
"entropy": 0.56640625,
"epoch": 0.10742857142857143,
"grad_norm": 0.09354749321937561,
"kl": 0.010030746459960938,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0167,
"reward": 0.479166679084301,
"reward_std": 0.26436545327305794,
"rewards/accuracy_reward": 0.15625000651925802,
"rewards/format_reward": 0.3229166679084301,
"step": 94,
"w_high_ratio": 0.0,
"w_low_ratio": 0.020133810699917376,
"w_max": 1.4812421798706055,
"w_mean": 1.140279084444046,
"w_min": 2.8643974210955766e-17,
"w_std": 0.10274781100451946
},
{
"completion_length": 3727.229248046875,
"cov_mean": 2.8429121812223457e-07,
"cov_std": 0.1946401260793209,
"entropy": 0.4609375,
"epoch": 0.10857142857142857,
"grad_norm": 0.09702739864587784,
"kl": 0.0018458366394042969,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.039,
"reward": 0.2916666707023978,
"reward_std": 0.4342379942536354,
"rewards/accuracy_reward": 0.07291666977107525,
"rewards/format_reward": 0.21875000838190317,
"step": 95,
"w_high_ratio": 0.0,
"w_low_ratio": 0.027697827550582588,
"w_max": 1.2496315836906433,
"w_mean": 1.0440161526203156,
"w_min": 0.0,
"w_std": 0.13094050344079733
},
{
"completion_length": 3174.6876220703125,
"cov_mean": 0.00011378643011994427,
"cov_std": 0.20376956462860107,
"entropy": 0.4482421875,
"epoch": 0.10971428571428571,
"grad_norm": 0.13061358034610748,
"kl": 0.005021095275878906,
"learning_rate": 1.068365111445064e-07,
"loss": 0.007,
"reward": 0.6562500074505806,
"reward_std": 0.3372773453593254,
"rewards/accuracy_reward": 0.2812500037252903,
"rewards/format_reward": 0.3750000074505806,
"step": 96,
"w_high_ratio": 0.08483665436506271,
"w_low_ratio": 0.023257225286215544,
"w_max": 1.6405883729457855,
"w_mean": 1.170585960149765,
"w_min": 0.25,
"w_std": 0.1635773852467537
},
{
"completion_length": 3578.6458740234375,
"cov_mean": 5.654522146869567e-05,
"cov_std": 0.2582091810181737,
"entropy": 0.4716796875,
"epoch": 0.11085714285714286,
"grad_norm": 0.23674072325229645,
"kl": 0.0026683807373046875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0282,
"reward": 0.48958334419876337,
"reward_std": 0.4029072895646095,
"rewards/accuracy_reward": 0.21875000558793545,
"rewards/format_reward": 0.27083334140479565,
"step": 97,
"w_high_ratio": 0.0427275113761425,
"w_low_ratio": 0.02712295390665531,
"w_max": 1.5456224977970123,
"w_mean": 1.134882390499115,
"w_min": 8.233329127140463e-42,
"w_std": 0.17803996708244085
},
{
"completion_length": 3275.70849609375,
"cov_mean": -2.5506165911792777e-05,
"cov_std": 0.22730276361107826,
"entropy": 0.43359375,
"epoch": 0.112,
"grad_norm": 0.154100239276886,
"kl": 0.0016241073608398438,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0791,
"reward": 0.5208333432674408,
"reward_std": 0.45262154936790466,
"rewards/accuracy_reward": 0.14583333674818277,
"rewards/format_reward": 0.3750000149011612,
"step": 98,
"w_high_ratio": 0.031526632606983185,
"w_low_ratio": 0.030860408674925566,
"w_max": 1.7042989134788513,
"w_mean": 1.219407707452774,
"w_min": 1.0509738482436128e-44,
"w_std": 0.1900232806801796
},
{
"completion_length": 3084.593795776367,
"cov_mean": -6.495133902717498e-06,
"cov_std": 0.13577165454626083,
"entropy": 0.386474609375,
"epoch": 0.11314285714285714,
"grad_norm": 0.07463299483060837,
"kl": 0.008008956909179688,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0185,
"reward": 0.5208333488553762,
"reward_std": 0.2819661721587181,
"rewards/accuracy_reward": 0.19791667815297842,
"rewards/format_reward": 0.3229166669771075,
"step": 99,
"w_high_ratio": 0.04817802831530571,
"w_low_ratio": 0.017340978607535362,
"w_max": 1.389538049697876,
"w_mean": 1.1735607981681824,
"w_min": 0.25,
"w_std": 0.1109000938013196
},
{
"completion_length": 3224.5834350585938,
"cov_mean": -4.5620060973305954e-05,
"cov_std": 0.26372817903757095,
"entropy": 0.42724609375,
"epoch": 0.11428571428571428,
"grad_norm": 0.13250450789928436,
"kl": 0.009317398071289062,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0352,
"reward": 0.7083333656191826,
"reward_std": 0.4755344055593014,
"rewards/accuracy_reward": 0.260416679084301,
"rewards/format_reward": 0.447916679084301,
"step": 100,
"w_high_ratio": 0.05065765231847763,
"w_low_ratio": 0.03619965072721243,
"w_max": 1.6908635199069977,
"w_mean": 1.1874340772628784,
"w_min": 1.2465886678904214e-38,
"w_std": 0.2075340449810028
},
{
"epoch": 0.11428571428571428,
"step": 100,
"total_flos": 0.0,
"train_loss": 0.041623960277065636,
"train_runtime": 8415.8875,
"train_samples_per_second": 1.141,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}