Qwen-2.5-3B-grpo-code / trainer_state.json
mlxha's picture
Model save
d055e09 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25028683363649085,
"eval_steps": 500,
"global_step": 559,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.5625,
"completions/max_terminated_length": 714.5625,
"completions/mean_length": 534.09375,
"completions/mean_terminated_length": 534.09375,
"completions/min_length": 398.375,
"completions/min_terminated_length": 398.375,
"epoch": 0.00044774031061984047,
"grad_norm": 1.0911544979292094,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0164,
"num_tokens": 143396.0,
"reward": 0.1745322283823043,
"reward_std": 0.14398040855303407,
"rewards/code_reward/mean": 0.10812597409676528,
"rewards/code_reward/std": 0.11770913819782436,
"rewards/format_reward/mean": 0.6640625,
"rewards/format_reward/std": 0.44056092016398907,
"step": 1
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 782.5,
"completions/max_terminated_length": 782.5,
"completions/mean_length": 584.078125,
"completions/mean_terminated_length": 584.078125,
"completions/min_length": 428.625,
"completions/min_terminated_length": 428.625,
"epoch": 0.0022387015530992023,
"grad_norm": 1.0592214128916255,
"kl": 0.00044733285903930664,
"learning_rate": 2.1428571428571428e-07,
"loss": 0.0004,
"num_tokens": 772676.0,
"reward": 0.15631713026959915,
"reward_std": 0.14780386447091587,
"rewards/code_reward/mean": 0.09889525244216202,
"rewards/code_reward/std": 0.12754268431308446,
"rewards/format_reward/mean": 0.57421875,
"rewards/format_reward/std": 0.42564064590260386,
"step": 5
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 743.5625,
"completions/max_terminated_length": 743.5625,
"completions/mean_length": 566.3671875,
"completions/mean_terminated_length": 566.3671875,
"completions/min_length": 412.7625,
"completions/min_terminated_length": 412.7625,
"epoch": 0.004477403106198405,
"grad_norm": 0.8586902730302105,
"kl": 0.0006687402725219727,
"learning_rate": 4.821428571428572e-07,
"loss": 0.02,
"num_tokens": 1514535.0,
"reward": 0.21928292746888473,
"reward_std": 0.17851990209892393,
"rewards/code_reward/mean": 0.15240792171971407,
"rewards/code_reward/std": 0.15891524556500372,
"rewards/format_reward/mean": 0.66875,
"rewards/format_reward/std": 0.42151433378458025,
"step": 10
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 796.05,
"completions/max_terminated_length": 796.05,
"completions/mean_length": 587.6890625,
"completions/mean_terminated_length": 587.6890625,
"completions/min_length": 413.575,
"completions/min_terminated_length": 413.575,
"epoch": 0.0067161046592976075,
"grad_norm": 0.6576078448734007,
"kl": 0.002119898796081543,
"learning_rate": 7.5e-07,
"loss": 0.0262,
"num_tokens": 2322384.0,
"reward": 0.19728650886099786,
"reward_std": 0.153143038158305,
"rewards/code_reward/mean": 0.11244275536737405,
"rewards/code_reward/std": 0.13675388206611389,
"rewards/format_reward/mean": 0.8484375,
"rewards/format_reward/std": 0.2670775193721056,
"step": 15
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 736.6,
"completions/max_terminated_length": 736.6,
"completions/mean_length": 539.6390625,
"completions/mean_terminated_length": 539.6390625,
"completions/min_length": 379.7375,
"completions/min_terminated_length": 379.7375,
"epoch": 0.00895480621239681,
"grad_norm": 0.7320311360828152,
"kl": 0.002538633346557617,
"learning_rate": 1.017857142857143e-06,
"loss": 0.0079,
"num_tokens": 3037089.0,
"reward": 0.21442170465597882,
"reward_std": 0.14227938583353533,
"rewards/code_reward/mean": 0.12176544930553064,
"rewards/code_reward/std": 0.13433333449356724,
"rewards/format_reward/mean": 0.9265625,
"rewards/format_reward/std": 0.156092469394207,
"step": 20
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 770.625,
"completions/max_terminated_length": 770.625,
"completions/mean_length": 554.2328125,
"completions/mean_terminated_length": 554.2328125,
"completions/min_length": 395.7,
"completions/min_terminated_length": 395.7,
"epoch": 0.011193507765496012,
"grad_norm": 0.5737573580410903,
"kl": 0.00330963134765625,
"learning_rate": 1.2857142857142856e-06,
"loss": 0.0233,
"num_tokens": 3787614.0,
"reward": 0.22332688504830003,
"reward_std": 0.11517863497429062,
"rewards/code_reward/mean": 0.12520187861009618,
"rewards/code_reward/std": 0.11075652101717423,
"rewards/format_reward/mean": 0.98125,
"rewards/format_reward/std": 0.04998054876923561,
"step": 25
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.775,
"completions/max_terminated_length": 696.775,
"completions/mean_length": 527.1390625,
"completions/mean_terminated_length": 527.1390625,
"completions/min_length": 375.6,
"completions/min_terminated_length": 375.6,
"epoch": 0.013432209318595215,
"grad_norm": 0.6412924324023244,
"kl": 0.004455375671386719,
"learning_rate": 1.5535714285714287e-06,
"loss": 0.0292,
"num_tokens": 4536623.0,
"reward": 0.2232258369214833,
"reward_std": 0.13004211404477245,
"rewards/code_reward/mean": 0.12400708374771056,
"rewards/code_reward/std": 0.12888794834143483,
"rewards/format_reward/mean": 0.9921875,
"rewards/format_reward/std": 0.022097086533904076,
"step": 30
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 688.475,
"completions/max_terminated_length": 688.475,
"completions/mean_length": 493.4390625,
"completions/mean_terminated_length": 493.4390625,
"completions/min_length": 351.2875,
"completions/min_terminated_length": 351.2875,
"epoch": 0.015670910871694418,
"grad_norm": 0.4441774535858199,
"kl": 0.005760383605957031,
"learning_rate": 1.8214285714285714e-06,
"loss": 0.0183,
"num_tokens": 5238216.0,
"reward": 0.23461700212210418,
"reward_std": 0.13695308727037628,
"rewards/code_reward/mean": 0.1349294964238652,
"rewards/code_reward/std": 0.13671803568140603,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 35
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 815.8875,
"completions/max_terminated_length": 623.275,
"completions/mean_length": 488.790625,
"completions/mean_terminated_length": 464.4939735412598,
"completions/min_length": 338.1875,
"completions/min_terminated_length": 338.1875,
"epoch": 0.01790961242479362,
"grad_norm": 0.7054891224478806,
"kl": 0.007607078552246094,
"learning_rate": 2.089285714285714e-06,
"loss": 0.0369,
"num_tokens": 5931842.0,
"reward": 0.2295066607184708,
"reward_std": 0.13071401379711461,
"rewards/code_reward/mean": 0.12997540423093595,
"rewards/code_reward/std": 0.1293881902238354,
"rewards/format_reward/mean": 0.9953125,
"rewards/format_reward/std": 0.013258251920342445,
"step": 40
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 595.2375,
"completions/max_terminated_length": 595.2375,
"completions/mean_length": 450.5140625,
"completions/mean_terminated_length": 450.5140625,
"completions/min_length": 334.7125,
"completions/min_terminated_length": 334.7125,
"epoch": 0.020148313977892823,
"grad_norm": 0.7270226911269254,
"kl": 0.008572006225585937,
"learning_rate": 2.357142857142857e-06,
"loss": 0.0023,
"num_tokens": 6579707.0,
"reward": 0.29843434747308495,
"reward_std": 0.13938394124270417,
"rewards/code_reward/mean": 0.19905934149210225,
"rewards/code_reward/std": 0.13823813095805235,
"rewards/format_reward/mean": 0.99375,
"rewards/format_reward/std": 0.01767766922712326,
"step": 45
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 771.7375,
"completions/max_terminated_length": 676.5875,
"completions/mean_length": 519.5140625,
"completions/mean_terminated_length": 507.42098236083984,
"completions/min_length": 371.625,
"completions/min_terminated_length": 371.625,
"epoch": 0.022387015530992024,
"grad_norm": 0.8864298285641923,
"kl": 0.009959030151367187,
"learning_rate": 2.6250000000000003e-06,
"loss": 0.0247,
"num_tokens": 7287164.0,
"reward": 0.24341339743696153,
"reward_std": 0.13661439061979763,
"rewards/code_reward/mean": 0.14700714359642006,
"rewards/code_reward/std": 0.13423215872608124,
"rewards/format_reward/mean": 0.9640625,
"rewards/format_reward/std": 0.06212893389165401,
"step": 50
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 683.3625,
"completions/max_terminated_length": 683.3625,
"completions/mean_length": 516.046875,
"completions/mean_terminated_length": 516.046875,
"completions/min_length": 386.925,
"completions/min_terminated_length": 386.925,
"epoch": 0.024625717084091225,
"grad_norm": 0.6382131264055037,
"kl": 0.008373641967773437,
"learning_rate": 2.892857142857143e-06,
"loss": 0.021,
"num_tokens": 7971626.0,
"reward": 0.3139894030056894,
"reward_std": 0.15021034325327492,
"rewards/code_reward/mean": 0.21695814684353537,
"rewards/code_reward/std": 0.14836436581681484,
"rewards/format_reward/mean": 0.9703125,
"rewards/format_reward/std": 0.046608568355441096,
"step": 55
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 661.7125,
"completions/max_terminated_length": 661.7125,
"completions/mean_length": 510.025,
"completions/mean_terminated_length": 510.025,
"completions/min_length": 382.3125,
"completions/min_terminated_length": 382.3125,
"epoch": 0.02686441863719043,
"grad_norm": 0.7435002543363699,
"kl": 0.009385299682617188,
"learning_rate": 2.9997366975852433e-06,
"loss": 0.0148,
"num_tokens": 8701666.0,
"reward": 0.24593741996213794,
"reward_std": 0.12826487933343741,
"rewards/code_reward/mean": 0.1460936620060238,
"rewards/code_reward/std": 0.1278229385818122,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 60
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 694.2875,
"completions/max_terminated_length": 694.2875,
"completions/mean_length": 517.9140625,
"completions/mean_terminated_length": 517.9140625,
"completions/min_length": 369.8125,
"completions/min_terminated_length": 369.8125,
"epoch": 0.02910312019028963,
"grad_norm": 0.5685331001634877,
"kl": 0.012218093872070313,
"learning_rate": 2.9981279620139177e-06,
"loss": 0.0053,
"num_tokens": 9438523.0,
"reward": 0.24741017883643507,
"reward_std": 0.13050166001776234,
"rewards/code_reward/mean": 0.1474101732033887,
"rewards/code_reward/std": 0.13050166381872258,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 780.825,
"completions/max_terminated_length": 780.825,
"completions/mean_length": 535.19375,
"completions/mean_terminated_length": 535.19375,
"completions/min_length": 392.3875,
"completions/min_terminated_length": 392.3875,
"epoch": 0.031341821743388835,
"grad_norm": 0.5336222407251643,
"kl": 0.0148834228515625,
"learning_rate": 2.9950583368363777e-06,
"loss": 0.007,
"num_tokens": 10157391.0,
"reward": 0.296136565413326,
"reward_std": 0.16640246821043547,
"rewards/code_reward/mean": 0.19644906022003852,
"rewards/code_reward/std": 0.1658487796317786,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 70
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 797.0375,
"completions/max_terminated_length": 797.0375,
"completions/mean_length": 589.8359375,
"completions/mean_terminated_length": 589.8359375,
"completions/min_length": 426.275,
"completions/min_terminated_length": 426.275,
"epoch": 0.033580523296488037,
"grad_norm": 0.6369858920854246,
"kl": 0.017626190185546876,
"learning_rate": 2.990530815377378e-06,
"loss": 0.0087,
"num_tokens": 10930742.0,
"reward": 0.27690047658979894,
"reward_std": 0.12926372148940574,
"rewards/code_reward/mean": 0.177994220439723,
"rewards/code_reward/std": 0.12826473288878332,
"rewards/format_reward/mean": 0.9890625,
"rewards/format_reward/std": 0.027883462235331537,
"step": 75
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 758.7125,
"completions/max_terminated_length": 758.7125,
"completions/mean_length": 559.50625,
"completions/mean_terminated_length": 559.50625,
"completions/min_length": 398.6,
"completions/min_terminated_length": 398.6,
"epoch": 0.03581922484958724,
"grad_norm": 0.7033361512162839,
"kl": 0.01629638671875,
"learning_rate": 2.984549812619624e-06,
"loss": -0.0033,
"num_tokens": 11662834.0,
"reward": 0.2631410426460207,
"reward_std": 0.11843040494713933,
"rewards/code_reward/mean": 0.16329728582059033,
"rewards/code_reward/std": 0.11832479977165349,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 80
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 671.2875,
"completions/max_terminated_length": 671.2875,
"completions/mean_length": 507.8875,
"completions/mean_terminated_length": 507.8875,
"completions/min_length": 377.5625,
"completions/min_terminated_length": 377.5625,
"epoch": 0.03805792640268644,
"grad_norm": 0.5966458269379716,
"kl": 0.0168060302734375,
"learning_rate": 2.9771211608985266e-06,
"loss": 0.0047,
"num_tokens": 12352234.0,
"reward": 0.32661316031590104,
"reward_std": 0.1419034074380761,
"rewards/code_reward/mean": 0.2267694047826808,
"rewards/code_reward/std": 0.14197679209755734,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 85
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 664.3375,
"completions/max_terminated_length": 664.3375,
"completions/mean_length": 502.846875,
"completions/mean_terminated_length": 502.846875,
"completions/min_length": 376.3875,
"completions/min_terminated_length": 376.3875,
"epoch": 0.04029662795578565,
"grad_norm": 0.6882916695583966,
"kl": 0.017774200439453124,
"learning_rate": 2.968252104214841e-06,
"loss": 0.0162,
"num_tokens": 13055856.0,
"reward": 0.26416925797238944,
"reward_std": 0.15208177534805145,
"rewards/code_reward/mean": 0.16432550169847673,
"rewards/code_reward/std": 0.1518084899900714,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 90
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 717.9125,
"completions/max_terminated_length": 717.9125,
"completions/mean_length": 529.171875,
"completions/mean_terminated_length": 529.171875,
"completions/min_length": 389.075,
"completions/min_terminated_length": 389.075,
"epoch": 0.04253532950888485,
"grad_norm": 0.5867793943695734,
"kl": 0.01979522705078125,
"learning_rate": 2.9579512911707257e-06,
"loss": 0.012,
"num_tokens": 13781566.0,
"reward": 0.29845606358721855,
"reward_std": 0.14189217127859594,
"rewards/code_reward/mean": 0.19845605657319537,
"rewards/code_reward/std": 0.1418921749223955,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 750.525,
"completions/max_terminated_length": 750.525,
"completions/mean_length": 518.6890625,
"completions/mean_terminated_length": 518.6890625,
"completions/min_length": 371.575,
"completions/min_terminated_length": 371.575,
"epoch": 0.04477403106198405,
"grad_norm": 0.6756173457255675,
"kl": 0.023876953125,
"learning_rate": 2.9462287665361157e-06,
"loss": 0.017,
"num_tokens": 14508775.0,
"reward": 0.2731386865489185,
"reward_std": 0.1473583393584704,
"rewards/code_reward/mean": 0.17345118119992547,
"rewards/code_reward/std": 0.14723392758751289,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 100
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 700.8,
"completions/max_terminated_length": 700.8,
"completions/mean_length": 474.2953125,
"completions/mean_terminated_length": 474.2953125,
"completions/min_length": 339.4,
"completions/min_terminated_length": 339.4,
"epoch": 0.04701273261508325,
"grad_norm": 0.6213587460739984,
"kl": 0.027069091796875,
"learning_rate": 2.9330959614536314e-06,
"loss": 0.016,
"num_tokens": 15178396.0,
"reward": 0.29834548365324737,
"reward_std": 0.13777082363376394,
"rewards/code_reward/mean": 0.19834547787031626,
"rewards/code_reward/std": 0.1377708253567107,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 652.6625,
"completions/max_terminated_length": 652.6625,
"completions/mean_length": 483.6328125,
"completions/mean_terminated_length": 483.6328125,
"completions/min_length": 346.775,
"completions/min_terminated_length": 346.775,
"epoch": 0.04925143416818245,
"grad_norm": 0.6361080911543711,
"kl": 0.02613983154296875,
"learning_rate": 2.9185656822915747e-06,
"loss": -0.0057,
"num_tokens": 15867273.0,
"reward": 0.29470919668674467,
"reward_std": 0.12798963281093165,
"rewards/code_reward/mean": 0.19486543894308853,
"rewards/code_reward/std": 0.12776418880966958,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 110
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.1375,
"completions/max_terminated_length": 741.1375,
"completions/mean_length": 536.115625,
"completions/mean_terminated_length": 536.115625,
"completions/min_length": 393.3125,
"completions/min_terminated_length": 393.3125,
"epoch": 0.05149013572128166,
"grad_norm": 0.6555109923730857,
"kl": 0.0231597900390625,
"learning_rate": 2.9026520981558844e-06,
"loss": 0.009,
"num_tokens": 16604459.0,
"reward": 0.2888658272102475,
"reward_std": 0.15465332815947477,
"rewards/code_reward/mean": 0.1888658216179465,
"rewards/code_reward/std": 0.15465332991443576,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 690.875,
"completions/max_terminated_length": 690.875,
"completions/mean_length": 532.3109375,
"completions/mean_terminated_length": 532.3109375,
"completions/min_length": 403.2,
"completions/min_terminated_length": 403.2,
"epoch": 0.05372883727438086,
"grad_norm": 0.6650346931830148,
"kl": 0.024253082275390626,
"learning_rate": 2.8853707270732253e-06,
"loss": 0.0132,
"num_tokens": 17335906.0,
"reward": 0.35356655940413473,
"reward_std": 0.18680918092140927,
"rewards/code_reward/mean": 0.2537228013883578,
"rewards/code_reward/std": 0.18692450551316142,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 120
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 694.6625,
"completions/max_terminated_length": 694.6625,
"completions/mean_length": 517.6953125,
"completions/mean_terminated_length": 517.6953125,
"completions/min_length": 385.7,
"completions/min_terminated_length": 385.7,
"epoch": 0.05596753882748006,
"grad_norm": 0.6254170069058008,
"kl": 0.025794219970703126,
"learning_rate": 2.8667384208586865e-06,
"loss": 0.0043,
"num_tokens": 18058943.0,
"reward": 0.3556826992891729,
"reward_std": 0.1363969652389642,
"rewards/code_reward/mean": 0.25583894047886135,
"rewards/code_reward/std": 0.1366761433542706,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 125
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 706.0375,
"completions/max_terminated_length": 706.0375,
"completions/mean_length": 543.7203125,
"completions/mean_terminated_length": 543.7203125,
"completions/min_length": 405.2875,
"completions/min_terminated_length": 405.2875,
"epoch": 0.05820624038057926,
"grad_norm": 0.6216908171231574,
"kl": 0.02597503662109375,
"learning_rate": 2.846773348682845e-06,
"loss": 0.0007,
"num_tokens": 18775148.0,
"reward": 0.2654763679020107,
"reward_std": 0.13124802198726684,
"rewards/code_reward/mean": 0.1659451116924174,
"rewards/code_reward/std": 0.13088461093138903,
"rewards/format_reward/mean": 0.9953125,
"rewards/format_reward/std": 0.013258251920342445,
"step": 130
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.7125,
"completions/max_terminated_length": 732.7125,
"completions/mean_length": 550.0796875,
"completions/mean_terminated_length": 550.0796875,
"completions/min_length": 407.625,
"completions/min_terminated_length": 407.625,
"epoch": 0.06044494193367847,
"grad_norm": 0.6438445755693917,
"kl": 0.02695159912109375,
"learning_rate": 2.8254949793542194e-06,
"loss": 0.0133,
"num_tokens": 19516591.0,
"reward": 0.30453283004462717,
"reward_std": 0.15741582050104624,
"rewards/code_reward/mean": 0.20453282294183736,
"rewards/code_reward/std": 0.15741582473565358,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 706.0,
"completions/max_terminated_length": 706.0,
"completions/mean_length": 532.575,
"completions/mean_terminated_length": 532.575,
"completions/min_length": 389.0625,
"completions/min_terminated_length": 389.0625,
"epoch": 0.06268364348677767,
"grad_norm": 0.6680496022315153,
"kl": 0.02960357666015625,
"learning_rate": 2.802924062334391e-06,
"loss": 0.0146,
"num_tokens": 20241207.0,
"reward": 0.3066130679100752,
"reward_std": 0.18769313739612697,
"rewards/code_reward/mean": 0.2072380588942906,
"rewards/code_reward/std": 0.18655281127139461,
"rewards/format_reward/mean": 0.99375,
"rewards/format_reward/std": 0.01767766922712326,
"step": 140
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 711.9125,
"completions/max_terminated_length": 711.9125,
"completions/mean_length": 507.396875,
"completions/mean_terminated_length": 507.396875,
"completions/min_length": 354.5,
"completions/min_terminated_length": 354.5,
"epoch": 0.06492234503987687,
"grad_norm": 0.6756739470558975,
"kl": 0.02831573486328125,
"learning_rate": 2.779082607504298e-06,
"loss": 0.015,
"num_tokens": 20963517.0,
"reward": 0.28101985761895776,
"reward_std": 0.17780419969349168,
"rewards/code_reward/mean": 0.1811761005956214,
"rewards/code_reward/std": 0.17785799705889077,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 145
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 606.0625,
"completions/max_terminated_length": 606.0625,
"completions/mean_length": 448.28125,
"completions/mean_terminated_length": 448.28125,
"completions/min_length": 322.1375,
"completions/min_terminated_length": 322.1375,
"epoch": 0.06716104659297607,
"grad_norm": 0.5719295032736604,
"kl": 0.02752532958984375,
"learning_rate": 2.7539938637014514e-06,
"loss": 0.0092,
"num_tokens": 21610025.0,
"reward": 0.3294339914806187,
"reward_std": 0.15150615764432587,
"rewards/code_reward/mean": 0.22959023197181522,
"rewards/code_reward/std": 0.15142173281637952,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 150
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 658.65,
"completions/max_terminated_length": 658.65,
"completions/mean_length": 498.0640625,
"completions/mean_terminated_length": 498.0640625,
"completions/min_length": 366.95,
"completions/min_terminated_length": 366.95,
"epoch": 0.06939974814607527,
"grad_norm": 0.5384545916898208,
"kl": 0.02587127685546875,
"learning_rate": 2.7276822960489817e-06,
"loss": -0.0011,
"num_tokens": 22304426.0,
"reward": 0.3431210536509752,
"reward_std": 0.15553151002968663,
"rewards/code_reward/mean": 0.24312104810524032,
"rewards/code_reward/std": 0.1555315111123491,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 765.3125,
"completions/max_terminated_length": 765.3125,
"completions/mean_length": 542.303125,
"completions/mean_terminated_length": 542.303125,
"completions/min_length": 394.875,
"completions/min_terminated_length": 394.875,
"epoch": 0.07163844969917448,
"grad_norm": 0.5684225568583741,
"kl": 0.030213165283203124,
"learning_rate": 2.7001735620986323e-06,
"loss": 0.0162,
"num_tokens": 23031900.0,
"reward": 0.29204714838415385,
"reward_std": 0.15333203882328234,
"rewards/code_reward/mean": 0.19235964192193933,
"rewards/code_reward/std": 0.15339574370882475,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 160
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 842.8375,
"completions/max_terminated_length": 842.8375,
"completions/mean_length": 574.2078125,
"completions/mean_terminated_length": 574.2078125,
"completions/min_length": 428.4375,
"completions/min_terminated_length": 428.4375,
"epoch": 0.07387715125227368,
"grad_norm": 0.5435945556590033,
"kl": 0.027813720703125,
"learning_rate": 2.671494486810974e-06,
"loss": 0.0106,
"num_tokens": 23789657.0,
"reward": 0.3045080302283168,
"reward_std": 0.16393477989186067,
"rewards/code_reward/mean": 0.20466427168576046,
"rewards/code_reward/std": 0.1636599010293139,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 743.1375,
"completions/max_terminated_length": 743.1375,
"completions/mean_length": 544.6453125,
"completions/mean_terminated_length": 544.6453125,
"completions/min_length": 400.9375,
"completions/min_terminated_length": 400.9375,
"epoch": 0.07611585280537288,
"grad_norm": 0.5551210527840703,
"kl": 0.03048858642578125,
"learning_rate": 2.641673036397215e-06,
"loss": 0.0108,
"num_tokens": 24537942.0,
"reward": 0.2919698000885546,
"reward_std": 0.14443947067193222,
"rewards/code_reward/mean": 0.19228229282743997,
"rewards/code_reward/std": 0.1438393424032256,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 170
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 700.1125,
"completions/max_terminated_length": 700.1125,
"completions/mean_length": 538.8515625,
"completions/mean_terminated_length": 538.8515625,
"completions/min_length": 408.6,
"completions/min_terminated_length": 408.6,
"epoch": 0.07835455435847209,
"grad_norm": 0.6528644572698393,
"kl": 0.028961181640625,
"learning_rate": 2.610738291048138e-06,
"loss": 0.0133,
"num_tokens": 25267431.0,
"reward": 0.274235178809613,
"reward_std": 0.1538910755480174,
"rewards/code_reward/mean": 0.17439142313669437,
"rewards/code_reward/std": 0.15372212599031626,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 175
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 697.1375,
"completions/max_terminated_length": 697.1375,
"completions/mean_length": 517.7078125,
"completions/mean_terminated_length": 517.7078125,
"completions/min_length": 373.8875,
"completions/min_terminated_length": 373.8875,
"epoch": 0.0805932559115713,
"grad_norm": 0.5356003023929052,
"kl": 0.027515411376953125,
"learning_rate": 2.5787204165767413e-06,
"loss": 0.0123,
"num_tokens": 26025444.0,
"reward": 0.31410480896010995,
"reward_std": 0.17551766034448518,
"rewards/code_reward/mean": 0.21410479900659993,
"rewards/code_reward/std": 0.17551766115357167,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 180
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 627.9375,
"completions/max_terminated_length": 627.9375,
"completions/mean_length": 466.7078125,
"completions/mean_terminated_length": 466.7078125,
"completions/min_length": 336.625,
"completions/min_terminated_length": 336.625,
"epoch": 0.0828319574646705,
"grad_norm": 0.5875291034180061,
"kl": 0.03062591552734375,
"learning_rate": 2.545650635002249e-06,
"loss": 0.014,
"num_tokens": 26715345.0,
"reward": 0.3225731427781284,
"reward_std": 0.14460668399697169,
"rewards/code_reward/mean": 0.22288563377878745,
"rewards/code_reward/std": 0.1446731591859134,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 185
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 645.05,
"completions/max_terminated_length": 645.05,
"completions/mean_length": 468.11875,
"completions/mean_terminated_length": 468.11875,
"completions/min_length": 320.5625,
"completions/min_terminated_length": 320.5625,
"epoch": 0.0850706590177697,
"grad_norm": 0.5981227815110649,
"kl": 0.03143310546875,
"learning_rate": 2.511561194104161e-06,
"loss": 0.0158,
"num_tokens": 27388005.0,
"reward": 0.30132306115701796,
"reward_std": 0.11532193489110795,
"rewards/code_reward/mean": 0.20147930511957385,
"rewards/code_reward/std": 0.11487999467644841,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 190
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 694.2375,
"completions/max_terminated_length": 694.2375,
"completions/mean_length": 508.18125,
"completions/mean_terminated_length": 508.18125,
"completions/min_length": 354.7,
"completions/min_terminated_length": 354.7,
"epoch": 0.0873093605708689,
"grad_norm": 0.7051969480561995,
"kl": 0.030487060546875,
"learning_rate": 2.4764853359760447e-06,
"loss": 0.0074,
"num_tokens": 28089689.0,
"reward": 0.2780560509301722,
"reward_std": 0.13229238498024642,
"rewards/code_reward/mean": 0.17805604453606066,
"rewards/code_reward/std": 0.13229238652565983,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 679.9625,
"completions/max_terminated_length": 679.9625,
"completions/mean_length": 510.11875,
"completions/mean_terminated_length": 510.11875,
"completions/min_length": 362.825,
"completions/min_terminated_length": 362.825,
"epoch": 0.0895480621239681,
"grad_norm": 0.5512771391532328,
"kl": 0.02972869873046875,
"learning_rate": 2.440457264609727e-06,
"loss": 0.0022,
"num_tokens": 28787549.0,
"reward": 0.2989016550593078,
"reward_std": 0.15942465648986398,
"rewards/code_reward/mean": 0.1989016504448955,
"rewards/code_reward/std": 0.15942465687403456,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 742.85,
"completions/max_terminated_length": 742.85,
"completions/mean_length": 550.4421875,
"completions/mean_terminated_length": 550.4421875,
"completions/min_length": 397.275,
"completions/min_terminated_length": 397.275,
"epoch": 0.0917867636770673,
"grad_norm": 0.6115605511607163,
"kl": 0.02950439453125,
"learning_rate": 2.403512112541498e-06,
"loss": 0.0262,
"num_tokens": 29531328.0,
"reward": 0.3011234959587455,
"reward_std": 0.13739942002575845,
"rewards/code_reward/mean": 0.20127973848429975,
"rewards/code_reward/std": 0.13699884270899929,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 205
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 738.225,
"completions/max_terminated_length": 738.225,
"completions/mean_length": 538.3640625,
"completions/mean_terminated_length": 538.3640625,
"completions/min_length": 388.85,
"completions/min_terminated_length": 388.85,
"epoch": 0.0940254652301665,
"grad_norm": 0.6180896800134059,
"kl": 0.02983551025390625,
"learning_rate": 2.365685906592846e-06,
"loss": 0.013,
"num_tokens": 30274617.0,
"reward": 0.28743315050378443,
"reward_std": 0.14888401252392214,
"rewards/code_reward/mean": 0.18743314441671827,
"rewards/code_reward/std": 0.14888401648786384,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 657.8125,
"completions/max_terminated_length": 657.8125,
"completions/mean_length": 508.53125,
"completions/mean_terminated_length": 508.53125,
"completions/min_length": 375.2625,
"completions/min_terminated_length": 375.2625,
"epoch": 0.0962641667832657,
"grad_norm": 0.5149353831339782,
"kl": 0.0354248046875,
"learning_rate": 2.327015532739145e-06,
"loss": -0.0035,
"num_tokens": 30968253.0,
"reward": 0.3200162294320762,
"reward_std": 0.16002128778782207,
"rewards/code_reward/mean": 0.22001622177049285,
"rewards/code_reward/std": 0.16002129036933183,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 215
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 693.8375,
"completions/max_terminated_length": 693.8375,
"completions/mean_length": 516.459375,
"completions/mean_terminated_length": 516.459375,
"completions/min_length": 385.2,
"completions/min_terminated_length": 385.2,
"epoch": 0.0985028683363649,
"grad_norm": 0.583768547911125,
"kl": 0.032080078125,
"learning_rate": 2.2875387001405366e-06,
"loss": -0.0004,
"num_tokens": 31677939.0,
"reward": 0.2827278276905417,
"reward_std": 0.12490762829547748,
"rewards/code_reward/mean": 0.182884071078297,
"rewards/code_reward/std": 0.12475912600348238,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 220
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 689.225,
"completions/max_terminated_length": 689.225,
"completions/mean_length": 511.4609375,
"completions/mean_terminated_length": 511.4609375,
"completions/min_length": 375.7,
"completions/min_terminated_length": 375.7,
"epoch": 0.10074156988946412,
"grad_norm": 0.47416592884978,
"kl": 0.03255615234375,
"learning_rate": 2.2472939043700894e-06,
"loss": 0.0104,
"num_tokens": 32366802.0,
"reward": 0.288489468768239,
"reward_std": 0.14980540352989918,
"rewards/code_reward/mean": 0.18880196339305258,
"rewards/code_reward/std": 0.14945577481121292,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 225
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 709.1375,
"completions/max_terminated_length": 709.1375,
"completions/mean_length": 537.6703125,
"completions/mean_terminated_length": 537.6703125,
"completions/min_length": 400.325,
"completions/min_terminated_length": 400.325,
"epoch": 0.10298027144256332,
"grad_norm": 0.6526599784556473,
"kl": 0.031103515625,
"learning_rate": 2.206320389875099e-06,
"loss": 0.0004,
"num_tokens": 33092199.0,
"reward": 0.27060003159567714,
"reward_std": 0.14649803503416478,
"rewards/code_reward/mean": 0.1706000213016523,
"rewards/code_reward/std": 0.14649803435604553,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 688.975,
"completions/max_terminated_length": 688.975,
"completions/mean_length": 537.1703125,
"completions/mean_terminated_length": 537.1703125,
"completions/min_length": 413.1,
"completions/min_terminated_length": 413.1,
"epoch": 0.10521897299566252,
"grad_norm": 0.578479817853106,
"kl": 0.031402587890625,
"learning_rate": 2.1646581117081187e-06,
"loss": 0.0118,
"num_tokens": 33813252.0,
"reward": 0.24227329418063165,
"reward_std": 0.14281967077986338,
"rewards/code_reward/mean": 0.1422732870618347,
"rewards/code_reward/std": 0.1428196722699795,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 235
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 681.5375,
"completions/max_terminated_length": 681.5375,
"completions/mean_length": 533.6359375,
"completions/mean_terminated_length": 533.6359375,
"completions/min_length": 407.2125,
"completions/min_terminated_length": 407.2125,
"epoch": 0.10745767454876172,
"grad_norm": 0.6265187392839973,
"kl": 0.03284759521484375,
"learning_rate": 2.122347696565059e-06,
"loss": 0.0139,
"num_tokens": 34549147.0,
"reward": 0.33532751044258474,
"reward_std": 0.1622638524393551,
"rewards/code_reward/mean": 0.2353275064189802,
"rewards/code_reward/std": 0.1622638531640405,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 240
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 703.1125,
"completions/max_terminated_length": 703.1125,
"completions/mean_length": 537.425,
"completions/mean_terminated_length": 537.425,
"completions/min_length": 406.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.10969637610186092,
"grad_norm": 0.6224509400429641,
"kl": 0.0321990966796875,
"learning_rate": 2.079430403168327e-06,
"loss": 0.0205,
"num_tokens": 35271579.0,
"reward": 0.3003238163888454,
"reward_std": 0.17288763520191425,
"rewards/code_reward/mean": 0.20032380691118307,
"rewards/code_reward/std": 0.1728876391222002,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 245
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 647.8375,
"completions/max_terminated_length": 647.8375,
"completions/mean_length": 501.9875,
"completions/mean_terminated_length": 501.9875,
"completions/min_length": 376.6375,
"completions/min_terminated_length": 376.6375,
"epoch": 0.11193507765496012,
"grad_norm": 0.6263781800971838,
"kl": 0.03351898193359375,
"learning_rate": 2.0359480820336594e-06,
"loss": 0.0094,
"num_tokens": 35965555.0,
"reward": 0.31694198679178953,
"reward_std": 0.1596899228548864,
"rewards/code_reward/mean": 0.2170982286144863,
"rewards/code_reward/std": 0.15924798299674875,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 250
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 636.8625,
"completions/max_terminated_length": 636.8625,
"completions/mean_length": 492.7109375,
"completions/mean_terminated_length": 492.7109375,
"completions/min_length": 373.775,
"completions/min_terminated_length": 373.775,
"epoch": 0.11417377920805932,
"grad_norm": 0.6373686980037819,
"kl": 0.0332763671875,
"learning_rate": 1.9919431346598687e-06,
"loss": 0.0146,
"num_tokens": 36669402.0,
"reward": 0.30089313965290787,
"reward_std": 0.1563536574365571,
"rewards/code_reward/mean": 0.20089313458884134,
"rewards/code_reward/std": 0.15635365938651374,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.975,
"completions/max_terminated_length": 649.975,
"completions/mean_length": 497.025,
"completions/mean_terminated_length": 497.025,
"completions/min_length": 369.1625,
"completions/min_terminated_length": 369.1625,
"epoch": 0.11641248076115852,
"grad_norm": 0.6124161391568931,
"kl": 0.03218841552734375,
"learning_rate": 1.947458472181296e-06,
"loss": 0.0024,
"num_tokens": 37365858.0,
"reward": 0.31037036776542665,
"reward_std": 0.15006352393247652,
"rewards/code_reward/mean": 0.21037036021152744,
"rewards/code_reward/std": 0.15006352449127008,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 666.975,
"completions/max_terminated_length": 666.975,
"completions/mean_length": 511.603125,
"completions/mean_terminated_length": 511.603125,
"completions/min_length": 381.725,
"completions/min_terminated_length": 381.725,
"epoch": 0.11865118231425772,
"grad_norm": 0.5345035896498757,
"kl": 0.0315277099609375,
"learning_rate": 1.9025374735233068e-06,
"loss": 0.0154,
"num_tokens": 38086620.0,
"reward": 0.32326241619884966,
"reward_std": 0.14852707152604125,
"rewards/code_reward/mean": 0.2234186581481481,
"rewards/code_reward/std": 0.14887304982403293,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 265
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 715.9125,
"completions/max_terminated_length": 715.9125,
"completions/mean_length": 527.903125,
"completions/mean_terminated_length": 527.903125,
"completions/min_length": 398.2875,
"completions/min_terminated_length": 398.2875,
"epoch": 0.12088988386735694,
"grad_norm": 0.573684423194082,
"kl": 0.0300811767578125,
"learning_rate": 1.8572239431016146e-06,
"loss": 0.0126,
"num_tokens": 38809214.0,
"reward": 0.2911208848468959,
"reward_std": 0.13888704897253773,
"rewards/code_reward/mean": 0.19127712811168748,
"rewards/code_reward/std": 0.13906100282329134,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 270
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 685.725,
"completions/max_terminated_length": 685.725,
"completions/mean_length": 533.9171875,
"completions/mean_terminated_length": 533.9171875,
"completions/min_length": 391.1375,
"completions/min_terminated_length": 391.1375,
"epoch": 0.12312858542045614,
"grad_norm": 0.4830812794073296,
"kl": 0.03022613525390625,
"learning_rate": 1.8115620681066946e-06,
"loss": 0.0069,
"num_tokens": 39531329.0,
"reward": 0.37973827524110676,
"reward_std": 0.17492547728470526,
"rewards/code_reward/mean": 0.2798945170710795,
"rewards/code_reward/std": 0.1747873265412636,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 275
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 764.4625,
"completions/max_terminated_length": 764.4625,
"completions/mean_length": 555.675,
"completions/mean_terminated_length": 555.675,
"completions/min_length": 406.3,
"completions/min_terminated_length": 406.3,
"epoch": 0.12536728697355534,
"grad_norm": 0.48053859411140093,
"kl": 0.02889251708984375,
"learning_rate": 1.765596375414936e-06,
"loss": 0.0177,
"num_tokens": 40297449.0,
"reward": 0.26671807700768113,
"reward_std": 0.14942678074003196,
"rewards/code_reward/mean": 0.1671868214616552,
"rewards/code_reward/std": 0.14887573684682137,
"rewards/format_reward/mean": 0.9953125,
"rewards/format_reward/std": 0.013258251920342445,
"step": 280
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.825,
"completions/max_terminated_length": 669.825,
"completions/mean_length": 511.8296875,
"completions/mean_terminated_length": 511.8296875,
"completions/min_length": 369.2625,
"completions/min_terminated_length": 369.2625,
"epoch": 0.12760598852665453,
"grad_norm": 0.5012923928076685,
"kl": 0.03134918212890625,
"learning_rate": 1.7193716881685532e-06,
"loss": 0.0171,
"num_tokens": 41000340.0,
"reward": 0.33275858471170067,
"reward_std": 0.16071395185717846,
"rewards/code_reward/mean": 0.23291482530039503,
"rewards/code_reward/std": 0.1606017280719243,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 285
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 857.1125,
"completions/max_terminated_length": 763.7375,
"completions/mean_length": 575.0640625,
"completions/mean_terminated_length": 563.0296878814697,
"completions/min_length": 400.15,
"completions/min_terminated_length": 400.15,
"epoch": 0.12984469007975374,
"grad_norm": 0.630042919145043,
"kl": 0.030005645751953126,
"learning_rate": 1.6729330820665925e-06,
"loss": 0.0156,
"num_tokens": 41754885.0,
"reward": 0.28822933994233607,
"reward_std": 0.1465720217616763,
"rewards/code_reward/mean": 0.18854183692019433,
"rewards/code_reward/std": 0.14625606250483542,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 290
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 716.25,
"completions/max_terminated_length": 716.25,
"completions/mean_length": 537.20625,
"completions/mean_terminated_length": 537.20625,
"completions/min_length": 392.175,
"completions/min_terminated_length": 392.175,
"epoch": 0.13208339163285296,
"grad_norm": 0.5107272382559945,
"kl": 0.03038330078125,
"learning_rate": 1.6263258414096618e-06,
"loss": 0.0154,
"num_tokens": 42470809.0,
"reward": 0.33072368800640106,
"reward_std": 0.2061192358552944,
"rewards/code_reward/mean": 0.23087992868968285,
"rewards/code_reward/std": 0.20595307812327518,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 295
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 730.0875,
"completions/max_terminated_length": 730.0875,
"completions/mean_length": 539.428125,
"completions/mean_terminated_length": 539.428125,
"completions/min_length": 383.3125,
"completions/min_terminated_length": 383.3125,
"epoch": 0.13432209318595215,
"grad_norm": 0.537918547301204,
"kl": 0.0288299560546875,
"learning_rate": 1.5795954149412446e-06,
"loss": 0.0083,
"num_tokens": 43193235.0,
"reward": 0.34142726445570587,
"reward_std": 0.14466436323709786,
"rewards/code_reward/mean": 0.24173975624726154,
"rewards/code_reward/std": 0.14445849329931662,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 300
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 759.3,
"completions/max_terminated_length": 759.3,
"completions/mean_length": 557.8890625,
"completions/mean_terminated_length": 557.8890625,
"completions/min_length": 399.75,
"completions/min_terminated_length": 399.75,
"epoch": 0.13656079473905136,
"grad_norm": 0.584098194483569,
"kl": 0.02829742431640625,
"learning_rate": 1.5327873715286555e-06,
"loss": 0.0094,
"num_tokens": 43930988.0,
"reward": 0.2912998185493052,
"reward_std": 0.1508971786039183,
"rewards/code_reward/mean": 0.1914560628225445,
"rewards/code_reward/std": 0.15074160079238935,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 305
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 703.5875,
"completions/max_terminated_length": 703.5875,
"completions/mean_length": 531.625,
"completions/mean_terminated_length": 531.625,
"completions/min_length": 398.075,
"completions/min_terminated_length": 398.075,
"epoch": 0.13879949629215055,
"grad_norm": 0.5610344866641752,
"kl": 0.029935455322265624,
"learning_rate": 1.4859473557268605e-06,
"loss": 0.0228,
"num_tokens": 44630804.0,
"reward": 0.31272673439234494,
"reward_std": 0.160137642340851,
"rewards/code_reward/mean": 0.21272672956984023,
"rewards/code_reward/std": 0.16013764539093245,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 754.0625,
"completions/max_terminated_length": 673.325,
"completions/mean_length": 516.14375,
"completions/mean_terminated_length": 504.68303604125975,
"completions/min_length": 370.5375,
"completions/min_terminated_length": 370.5375,
"epoch": 0.14103819784524976,
"grad_norm": 0.5678371154254215,
"kl": 0.0303436279296875,
"learning_rate": 1.4391210432684911e-06,
"loss": 0.0172,
"num_tokens": 45353968.0,
"reward": 0.30479407841339706,
"reward_std": 0.1592210401489865,
"rewards/code_reward/mean": 0.20510657107515726,
"rewards/code_reward/std": 0.15867348304018378,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 315
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 662.725,
"completions/max_terminated_length": 662.725,
"completions/mean_length": 500.4296875,
"completions/mean_terminated_length": 500.4296875,
"completions/min_length": 366.975,
"completions/min_terminated_length": 366.975,
"epoch": 0.14327689939834895,
"grad_norm": 0.6017406713534427,
"kl": 0.03122406005859375,
"learning_rate": 1.3923540965234527e-06,
"loss": 0.0166,
"num_tokens": 46065395.0,
"reward": 0.3366297990083694,
"reward_std": 0.14250589827133808,
"rewards/code_reward/mean": 0.23662979124492267,
"rewards/code_reward/std": 0.14250590050360187,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 756.5875,
"completions/max_terminated_length": 663.425,
"completions/mean_length": 501.0203125,
"completions/mean_terminated_length": 489.0866073608398,
"completions/min_length": 363.675,
"completions/min_terminated_length": 363.675,
"epoch": 0.14551560095144817,
"grad_norm": 0.6420872754773821,
"kl": 0.03084869384765625,
"learning_rate": 1.3456921199715669e-06,
"loss": 0.0183,
"num_tokens": 46769624.0,
"reward": 0.274929376039654,
"reward_std": 0.14380120979622008,
"rewards/code_reward/mean": 0.17508561803842895,
"rewards/code_reward/std": 0.1433592700981535,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 325
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 662.975,
"completions/max_terminated_length": 662.975,
"completions/mean_length": 503.740625,
"completions/mean_terminated_length": 503.740625,
"completions/min_length": 379.2,
"completions/min_terminated_length": 379.2,
"epoch": 0.14775430250454735,
"grad_norm": 0.5635596946152118,
"kl": 0.0294403076171875,
"learning_rate": 1.2991806157316646e-06,
"loss": 0.0095,
"num_tokens": 47486962.0,
"reward": 0.2972354737110436,
"reward_std": 0.11910657306143549,
"rewards/code_reward/mean": 0.19739172172703548,
"rewards/code_reward/std": 0.11866463308397215,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 330
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.7625,
"completions/max_terminated_length": 714.7625,
"completions/mean_length": 533.565625,
"completions/mean_terminated_length": 533.565625,
"completions/min_length": 388.1125,
"completions/min_terminated_length": 388.1125,
"epoch": 0.14999300405764657,
"grad_norm": 0.60933960917403,
"kl": 0.02783966064453125,
"learning_rate": 1.2528649391904927e-06,
"loss": 0.0078,
"num_tokens": 48202916.0,
"reward": 0.2663810454308987,
"reward_std": 0.13506472197477706,
"rewards/code_reward/mean": 0.1665372904652031,
"rewards/code_reward/std": 0.13477403752622194,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 335
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 735.2125,
"completions/max_terminated_length": 735.2125,
"completions/mean_length": 552.1953125,
"completions/mean_terminated_length": 552.1953125,
"completions/min_length": 409.1875,
"completions/min_terminated_length": 409.1875,
"epoch": 0.15223170561074575,
"grad_norm": 0.49791400131307495,
"kl": 0.025357818603515624,
"learning_rate": 1.2067902547747076e-06,
"loss": 0.0164,
"num_tokens": 48932801.0,
"reward": 0.3229883606545627,
"reward_std": 0.1690987061272608,
"rewards/code_reward/mean": 0.2231446014760877,
"rewards/code_reward/std": 0.16865676557354164,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 340
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.15,
"completions/max_terminated_length": 669.15,
"completions/mean_length": 518.7078125,
"completions/mean_terminated_length": 518.7078125,
"completions/min_length": 389.6,
"completions/min_terminated_length": 389.6,
"epoch": 0.15447040716384497,
"grad_norm": 0.5738371722180043,
"kl": 0.02738189697265625,
"learning_rate": 1.1610014919090847e-06,
"loss": 0.0011,
"num_tokens": 49618094.0,
"reward": 0.36557651134207847,
"reward_std": 0.1583593948977068,
"rewards/code_reward/mean": 0.2655765014962526,
"rewards/code_reward/std": 0.15835939861135556,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 345
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 763.775,
"completions/max_terminated_length": 763.775,
"completions/mean_length": 575.278125,
"completions/mean_terminated_length": 575.278125,
"completions/min_length": 430.725,
"completions/min_terminated_length": 430.725,
"epoch": 0.15670910871694418,
"grad_norm": 0.5330527785978358,
"kl": 0.02547607421875,
"learning_rate": 1.1155433012038849e-06,
"loss": 0.013,
"num_tokens": 50367344.0,
"reward": 0.3111037847585976,
"reward_std": 0.1396631282143062,
"rewards/code_reward/mean": 0.21110378042503725,
"rewards/code_reward/std": 0.1396631306008203,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 784.2,
"completions/max_terminated_length": 784.2,
"completions/mean_length": 581.1390625,
"completions/mean_terminated_length": 581.1390625,
"completions/min_length": 428.975,
"completions/min_terminated_length": 428.975,
"epoch": 0.15894781027004337,
"grad_norm": 0.5161816982734899,
"kl": 0.0270263671875,
"learning_rate": 1.0704600109141044e-06,
"loss": 0.0081,
"num_tokens": 51121985.0,
"reward": 0.2939129492267966,
"reward_std": 0.13785594400105766,
"rewards/code_reward/mean": 0.19406919270550133,
"rewards/code_reward/std": 0.1376118804764701,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 355
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 759.9875,
"completions/max_terminated_length": 759.9875,
"completions/mean_length": 552.328125,
"completions/mean_terminated_length": 552.328125,
"completions/min_length": 402.8875,
"completions/min_terminated_length": 402.8875,
"epoch": 0.1611865118231426,
"grad_norm": 0.6003534969440923,
"kl": 0.026979827880859376,
"learning_rate": 1.0257955837130725e-06,
"loss": 0.0035,
"num_tokens": 51844651.0,
"reward": 0.28144540255889294,
"reward_std": 0.12947248641576153,
"rewards/code_reward/mean": 0.18144539590430214,
"rewards/code_reward/std": 0.1294724913313985,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 743.025,
"completions/max_terminated_length": 743.025,
"completions/mean_length": 557.615625,
"completions/mean_terminated_length": 557.615625,
"completions/min_length": 414.6,
"completions/min_terminated_length": 414.6,
"epoch": 0.16342521337624177,
"grad_norm": 0.5589064006858112,
"kl": 0.026873779296875,
"learning_rate": 9.815935738225377e-07,
"loss": 0.0076,
"num_tokens": 52581373.0,
"reward": 0.31030982043594124,
"reward_std": 0.14550057554297383,
"rewards/code_reward/mean": 0.21030981277799582,
"rewards/code_reward/std": 0.14550057782616932,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 781.1625,
"completions/max_terminated_length": 781.1625,
"completions/mean_length": 594.5203125,
"completions/mean_terminated_length": 594.5203125,
"completions/min_length": 453.1125,
"completions/min_terminated_length": 453.1125,
"epoch": 0.165663914929341,
"grad_norm": 0.5310916132708259,
"kl": 0.02626190185546875,
"learning_rate": 9.378970845410571e-07,
"loss": 0.0095,
"num_tokens": 53352410.0,
"reward": 0.2810199284926057,
"reward_std": 0.1396817062428454,
"rewards/code_reward/mean": 0.18101992065639932,
"rewards/code_reward/std": 0.13968170815496705,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 730.95,
"completions/max_terminated_length": 730.95,
"completions/mean_length": 549.3125,
"completions/mean_terminated_length": 549.3125,
"completions/min_length": 412.425,
"completions/min_terminated_length": 412.425,
"epoch": 0.16790261648244018,
"grad_norm": 0.5669991229498123,
"kl": 0.026529693603515626,
"learning_rate": 8.947487262120971e-07,
"loss": 0.0094,
"num_tokens": 54086442.0,
"reward": 0.2867281662300229,
"reward_std": 0.12997563436510973,
"rewards/code_reward/mean": 0.1867281592771178,
"rewards/code_reward/std": 0.1299756362393964,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 746.9125,
"completions/max_terminated_length": 746.9125,
"completions/mean_length": 568.5546875,
"completions/mean_terminated_length": 568.5546875,
"completions/min_length": 416.3125,
"completions/min_terminated_length": 416.3125,
"epoch": 0.1701413180355394,
"grad_norm": 0.520216538993176,
"kl": 0.02662353515625,
"learning_rate": 8.521905746728408e-07,
"loss": 0.0137,
"num_tokens": 54836845.0,
"reward": 0.3280904936604202,
"reward_std": 0.13382616126909852,
"rewards/code_reward/mean": 0.22809048727212938,
"rewards/code_reward/std": 0.13382616304443218,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.6,
"completions/max_terminated_length": 705.6,
"completions/mean_length": 541.746875,
"completions/mean_terminated_length": 541.746875,
"completions/min_length": 402.8875,
"completions/min_terminated_length": 402.8875,
"epoch": 0.17238001958863858,
"grad_norm": 0.5872381716257123,
"kl": 0.02609405517578125,
"learning_rate": 8.102641302242105e-07,
"loss": 0.015,
"num_tokens": 55553251.0,
"reward": 0.3441149082966149,
"reward_std": 0.18714927716646343,
"rewards/code_reward/mean": 0.24411489552585408,
"rewards/code_reward/std": 0.18714928096160294,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 385
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 689.0875,
"completions/max_terminated_length": 689.0875,
"completions/mean_length": 521.6296875,
"completions/mean_terminated_length": 521.6296875,
"completions/min_length": 382.25,
"completions/min_terminated_length": 382.25,
"epoch": 0.1746187211417378,
"grad_norm": 0.6105825047365391,
"kl": 0.02542877197265625,
"learning_rate": 7.690102771621219e-07,
"loss": 0.0134,
"num_tokens": 56255086.0,
"reward": 0.35199374333024025,
"reward_std": 0.1669875715917442,
"rewards/code_reward/mean": 0.25199373266659675,
"rewards/code_reward/std": 0.16698757499689237,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 390
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 702.6125,
"completions/max_terminated_length": 702.6125,
"completions/mean_length": 547.8109375,
"completions/mean_terminated_length": 547.8109375,
"completions/min_length": 407.5,
"completions/min_terminated_length": 407.5,
"epoch": 0.176857422694837,
"grad_norm": 0.4867528744361068,
"kl": 0.02476806640625,
"learning_rate": 7.284692439094368e-07,
"loss": 0.0058,
"num_tokens": 56994181.0,
"reward": 0.3013323726132512,
"reward_std": 0.15418729400844314,
"rewards/code_reward/mean": 0.20133236556430348,
"rewards/code_reward/std": 0.15418729329830966,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 395
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 716.2875,
"completions/max_terminated_length": 716.2875,
"completions/mean_length": 523.3640625,
"completions/mean_terminated_length": 523.3640625,
"completions/min_length": 390.775,
"completions/min_terminated_length": 390.775,
"epoch": 0.1790961242479362,
"grad_norm": 0.5350752422212481,
"kl": 0.025472259521484374,
"learning_rate": 6.886805637874772e-07,
"loss": 0.0033,
"num_tokens": 57711366.0,
"reward": 0.3107692304067314,
"reward_std": 0.1176008581998758,
"rewards/code_reward/mean": 0.21076922266220208,
"rewards/code_reward/std": 0.11760085919813719,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 708.1125,
"completions/max_terminated_length": 708.1125,
"completions/mean_length": 536.55,
"completions/mean_terminated_length": 536.55,
"completions/min_length": 393.9625,
"completions/min_terminated_length": 393.9625,
"epoch": 0.1813348258010354,
"grad_norm": 0.5886059510700545,
"kl": 0.02571868896484375,
"learning_rate": 6.496830364653691e-07,
"loss": 0.0107,
"num_tokens": 58433174.0,
"reward": 0.29278530003502967,
"reward_std": 0.14287365710479208,
"rewards/code_reward/mean": 0.19278529447619802,
"rewards/code_reward/std": 0.14287365918862632,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 727.0,
"completions/max_terminated_length": 727.0,
"completions/mean_length": 555.8109375,
"completions/mean_terminated_length": 555.8109375,
"completions/min_length": 411.6375,
"completions/min_terminated_length": 411.6375,
"epoch": 0.1835735273541346,
"grad_norm": 0.5723292784440707,
"kl": 0.02495880126953125,
"learning_rate": 6.115146901248015e-07,
"loss": 0.0128,
"num_tokens": 59179325.0,
"reward": 0.2888775954954326,
"reward_std": 0.13932973612099886,
"rewards/code_reward/mean": 0.18903384153090882,
"rewards/code_reward/std": 0.13905893911141903,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 410
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 738.7,
"completions/max_terminated_length": 738.7,
"completions/mean_length": 555.56875,
"completions/mean_terminated_length": 555.56875,
"completions/min_length": 405.975,
"completions/min_terminated_length": 405.975,
"epoch": 0.1858122289072338,
"grad_norm": 0.5951785187855096,
"kl": 0.024257659912109375,
"learning_rate": 5.742127443770959e-07,
"loss": -0.0082,
"num_tokens": 59914129.0,
"reward": 0.32972582541406154,
"reward_std": 0.17325325938872993,
"rewards/code_reward/mean": 0.22988206883310341,
"rewards/code_reward/std": 0.17343256894964726,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 415
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 789.075,
"completions/max_terminated_length": 789.075,
"completions/mean_length": 567.95625,
"completions/mean_terminated_length": 567.95625,
"completions/min_length": 411.5125,
"completions/min_terminated_length": 411.5125,
"epoch": 0.188050930460333,
"grad_norm": 0.5882458355124174,
"kl": 0.025067138671875,
"learning_rate": 5.378135739687457e-07,
"loss": 0.011,
"num_tokens": 60679605.0,
"reward": 0.3126169110648334,
"reward_std": 0.15262282044568565,
"rewards/code_reward/mean": 0.2126169038747321,
"rewards/code_reward/std": 0.15262282300391233,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 743.6,
"completions/max_terminated_length": 743.6,
"completions/mean_length": 563.934375,
"completions/mean_terminated_length": 563.934375,
"completions/min_length": 407.6,
"completions/min_terminated_length": 407.6,
"epoch": 0.19028963201343221,
"grad_norm": 0.5844514039485758,
"kl": 0.0232269287109375,
"learning_rate": 5.023526733108258e-07,
"loss": 0.0058,
"num_tokens": 61442035.0,
"reward": 0.28377067698165775,
"reward_std": 0.14047583957435564,
"rewards/code_reward/mean": 0.18392692334891764,
"rewards/code_reward/std": 0.1400338972482132,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 425
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 706.925,
"completions/max_terminated_length": 706.925,
"completions/mean_length": 529.33125,
"completions/mean_terminated_length": 529.33125,
"completions/min_length": 382.5125,
"completions/min_terminated_length": 382.5125,
"epoch": 0.1925283335665314,
"grad_norm": 0.6068829854257141,
"kl": 0.024706268310546876,
"learning_rate": 4.6786462186684726e-07,
"loss": 0.0148,
"num_tokens": 62163871.0,
"reward": 0.3680115182884037,
"reward_std": 0.15169981086510234,
"rewards/code_reward/mean": 0.26801150970277376,
"rewards/code_reward/std": 0.1516998124890961,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 731.0,
"completions/max_terminated_length": 731.0,
"completions/mean_length": 537.5578125,
"completions/mean_terminated_length": 537.5578125,
"completions/min_length": 382.8875,
"completions/min_terminated_length": 382.8875,
"epoch": 0.19476703511963062,
"grad_norm": 0.6072526841260757,
"kl": 0.02366943359375,
"learning_rate": 4.3438305043282314e-07,
"loss": 0.0105,
"num_tokens": 62868964.0,
"reward": 0.288625252712518,
"reward_std": 0.14189330035296735,
"rewards/code_reward/mean": 0.1886252475058427,
"rewards/code_reward/std": 0.14189330387162044,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0015625,
"completions/max_length": 849.4625,
"completions/max_terminated_length": 760.275,
"completions/mean_length": 566.0203125,
"completions/mean_terminated_length": 554.5283485412598,
"completions/min_length": 385.2875,
"completions/min_terminated_length": 385.2875,
"epoch": 0.1970057366727298,
"grad_norm": 0.5544203482669213,
"kl": 0.02349853515625,
"learning_rate": 4.019406083424222e-07,
"loss": 0.024,
"num_tokens": 63645545.0,
"reward": 0.2873677465133369,
"reward_std": 0.1426866902038455,
"rewards/code_reward/mean": 0.18768023860175162,
"rewards/code_reward/std": 0.14217363530769944,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 440
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.725,
"completions/max_terminated_length": 696.725,
"completions/mean_length": 515.0796875,
"completions/mean_terminated_length": 515.0796875,
"completions/min_length": 375.0375,
"completions/min_terminated_length": 375.0375,
"epoch": 0.19924443822582902,
"grad_norm": 0.5972791947559509,
"kl": 0.02529144287109375,
"learning_rate": 3.7056893162918063e-07,
"loss": 0.0201,
"num_tokens": 64322420.0,
"reward": 0.3194495734758675,
"reward_std": 0.1723791634547524,
"rewards/code_reward/mean": 0.2194495657022344,
"rewards/code_reward/std": 0.17237916672602296,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 713.55,
"completions/max_terminated_length": 713.55,
"completions/mean_length": 530.815625,
"completions/mean_terminated_length": 530.815625,
"completions/min_length": 378.8375,
"completions/min_terminated_length": 378.8375,
"epoch": 0.20148313977892823,
"grad_norm": 0.4622077467737105,
"kl": 0.0240142822265625,
"learning_rate": 3.4029861217683744e-07,
"loss": 0.0039,
"num_tokens": 65055550.0,
"reward": 0.288273274153471,
"reward_std": 0.13295620558201335,
"rewards/code_reward/mean": 0.1882732652418781,
"rewards/code_reward/std": 0.13295620674616657,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.5,
"completions/max_terminated_length": 823.5,
"completions/mean_length": 554.6484375,
"completions/mean_terminated_length": 554.6484375,
"completions/min_length": 385.95,
"completions/min_terminated_length": 385.95,
"epoch": 0.20372184133202742,
"grad_norm": 0.5779355252222167,
"kl": 0.0229461669921875,
"learning_rate": 3.111591678878596e-07,
"loss": 0.0175,
"num_tokens": 65784213.0,
"reward": 0.2769805608317256,
"reward_std": 0.1467181654064916,
"rewards/code_reward/mean": 0.17698055310174823,
"rewards/code_reward/std": 0.14671816679183394,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 455
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 701.2,
"completions/max_terminated_length": 701.2,
"completions/mean_length": 516.5953125,
"completions/mean_terminated_length": 516.5953125,
"completions/min_length": 376.525,
"completions/min_terminated_length": 376.525,
"epoch": 0.20596054288512664,
"grad_norm": 0.7260007034342328,
"kl": 0.02352447509765625,
"learning_rate": 2.831790138992526e-07,
"loss": 0.0016,
"num_tokens": 66491018.0,
"reward": 0.2927206911146641,
"reward_std": 0.1309030485805124,
"rewards/code_reward/mean": 0.19272068199061324,
"rewards/code_reward/std": 0.1309030512755271,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 460
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 642.7125,
"completions/max_terminated_length": 642.7125,
"completions/mean_length": 490.215625,
"completions/mean_terminated_length": 490.215625,
"completions/min_length": 364.1375,
"completions/min_terminated_length": 364.1375,
"epoch": 0.20819924443822582,
"grad_norm": 0.594813196893333,
"kl": 0.024704742431640624,
"learning_rate": 2.563854348737275e-07,
"loss": 0.0158,
"num_tokens": 67154060.0,
"reward": 0.3452305795624852,
"reward_std": 0.1497524828504538,
"rewards/code_reward/mean": 0.24523057123151376,
"rewards/code_reward/std": 0.14975248328992166,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 698.1125,
"completions/max_terminated_length": 698.1125,
"completions/mean_length": 521.234375,
"completions/mean_terminated_length": 521.234375,
"completions/min_length": 375.2875,
"completions/min_terminated_length": 375.2875,
"epoch": 0.21043794599132504,
"grad_norm": 0.5197145225969613,
"kl": 0.0245391845703125,
"learning_rate": 2.3080455839324343e-07,
"loss": 0.0051,
"num_tokens": 67889866.0,
"reward": 0.28930564858019353,
"reward_std": 0.13884065752499736,
"rewards/code_reward/mean": 0.18961814382928424,
"rewards/code_reward/std": 0.13826202357886358,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.005786375701427459,
"step": 470
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 676.2625,
"completions/max_terminated_length": 676.2625,
"completions/mean_length": 497.1359375,
"completions/mean_terminated_length": 497.1359375,
"completions/min_length": 347.875,
"completions/min_terminated_length": 347.875,
"epoch": 0.21267664754442422,
"grad_norm": 0.6912907179100062,
"kl": 0.024478912353515625,
"learning_rate": 2.064613294808664e-07,
"loss": 0.0116,
"num_tokens": 68564793.0,
"reward": 0.36915110973641274,
"reward_std": 0.15182709340006112,
"rewards/code_reward/mean": 0.26946360208967235,
"rewards/code_reward/std": 0.15149775308091193,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 475
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 691.4625,
"completions/max_terminated_length": 691.4625,
"completions/mean_length": 522.4828125,
"completions/mean_terminated_length": 522.4828125,
"completions/min_length": 379.275,
"completions/min_terminated_length": 379.275,
"epoch": 0.21491534909752344,
"grad_norm": 0.5419894781125532,
"kl": 0.022618865966796874,
"learning_rate": 1.83379486275794e-07,
"loss": 0.0007,
"num_tokens": 69262638.0,
"reward": 0.3051586433313787,
"reward_std": 0.12916497962432913,
"rewards/code_reward/mean": 0.20515863316832111,
"rewards/code_reward/std": 0.12916498319245875,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 686.775,
"completions/max_terminated_length": 686.775,
"completions/mean_length": 508.7515625,
"completions/mean_terminated_length": 508.7515625,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.21715405065062263,
"grad_norm": 0.688891040637323,
"kl": 0.02315826416015625,
"learning_rate": 1.6158153688526895e-07,
"loss": 0.0091,
"num_tokens": 69978223.0,
"reward": 0.3311784929595888,
"reward_std": 0.17100559230602813,
"rewards/code_reward/mean": 0.2311784830279066,
"rewards/code_reward/std": 0.17100559424143286,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 485
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 693.6375,
"completions/max_terminated_length": 693.6375,
"completions/mean_length": 522.1390625,
"completions/mean_terminated_length": 522.1390625,
"completions/min_length": 380.6875,
"completions/min_terminated_length": 380.6875,
"epoch": 0.21939275220372184,
"grad_norm": 0.5956108322738943,
"kl": 0.0235626220703125,
"learning_rate": 1.4108873743594274e-07,
"loss": 0.0124,
"num_tokens": 70730304.0,
"reward": 0.2989473403431475,
"reward_std": 0.13880361177725717,
"rewards/code_reward/mean": 0.19894733218825422,
"rewards/code_reward/std": 0.13880361234769226,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 683.5375,
"completions/max_terminated_length": 683.5375,
"completions/mean_length": 519.221875,
"completions/mean_terminated_length": 519.221875,
"completions/min_length": 386.7125,
"completions/min_terminated_length": 386.7125,
"epoch": 0.22163145375682106,
"grad_norm": 0.5427250334330507,
"kl": 0.023455810546875,
"learning_rate": 1.2192107134610586e-07,
"loss": 0.0135,
"num_tokens": 71448214.0,
"reward": 0.29871292021125556,
"reward_std": 0.12881716500851326,
"rewards/code_reward/mean": 0.19886916641116842,
"rewards/code_reward/std": 0.1288392253103666,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 495
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 625.8,
"completions/max_terminated_length": 625.8,
"completions/mean_length": 472.728125,
"completions/mean_terminated_length": 472.728125,
"completions/min_length": 342.7375,
"completions/min_terminated_length": 342.7375,
"epoch": 0.22387015530992024,
"grad_norm": 0.6108848009428748,
"kl": 0.02464752197265625,
"learning_rate": 1.0409722983898928e-07,
"loss": 0.0093,
"num_tokens": 72117280.0,
"reward": 0.39794372050091625,
"reward_std": 0.1893569786072476,
"rewards/code_reward/mean": 0.29794371249881807,
"rewards/code_reward/std": 0.18935698276618496,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 500
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 665.15,
"completions/max_terminated_length": 665.15,
"completions/mean_length": 497.05625,
"completions/mean_terminated_length": 497.05625,
"completions/min_length": 354.125,
"completions/min_terminated_length": 354.125,
"epoch": 0.22610885686301946,
"grad_norm": 0.5589824240842507,
"kl": 0.0255523681640625,
"learning_rate": 8.763459371614036e-08,
"loss": 0.0183,
"num_tokens": 72815756.0,
"reward": 0.2931746931746602,
"reward_std": 0.15111528622946935,
"rewards/code_reward/mean": 0.19333093738896423,
"rewards/code_reward/std": 0.15067334883497097,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 505
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 708.7375,
"completions/max_terminated_length": 708.7375,
"completions/mean_length": 525.8828125,
"completions/mean_terminated_length": 525.8828125,
"completions/min_length": 383.925,
"completions/min_terminated_length": 383.925,
"epoch": 0.22834755841611865,
"grad_norm": 0.5657630399821236,
"kl": 0.023580169677734374,
"learning_rate": 7.254921640864954e-08,
"loss": 0.005,
"num_tokens": 73527777.0,
"reward": 0.29336653435602783,
"reward_std": 0.15301572528260293,
"rewards/code_reward/mean": 0.19352277733851225,
"rewards/code_reward/std": 0.15257378248206804,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 510
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 744.025,
"completions/max_terminated_length": 744.025,
"completions/mean_length": 538.8359375,
"completions/mean_terminated_length": 538.8359375,
"completions/min_length": 391.7625,
"completions/min_terminated_length": 391.7625,
"epoch": 0.23058625996921786,
"grad_norm": 0.5974194655481591,
"kl": 0.02305145263671875,
"learning_rate": 5.885580832275245e-08,
"loss": 0.0084,
"num_tokens": 74267080.0,
"reward": 0.2840338280424476,
"reward_std": 0.1604262540466152,
"rewards/code_reward/mean": 0.1840338213412906,
"rewards/code_reward/std": 0.16042625640693586,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 515
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 747.9375,
"completions/max_terminated_length": 747.9375,
"completions/mean_length": 543.165625,
"completions/mean_terminated_length": 543.165625,
"completions/min_length": 395.525,
"completions/min_terminated_length": 395.525,
"epoch": 0.23282496152231705,
"grad_norm": 0.6721925638851708,
"kl": 0.023305511474609374,
"learning_rate": 4.6567722495074685e-08,
"loss": 0.0021,
"num_tokens": 75032546.0,
"reward": 0.26900712195783855,
"reward_std": 0.15734463239787147,
"rewards/code_reward/mean": 0.16900711600319482,
"rewards/code_reward/std": 0.1573446374386549,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 520
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 688.1875,
"completions/max_terminated_length": 688.1875,
"completions/mean_length": 528.38125,
"completions/mean_terminated_length": 528.38125,
"completions/min_length": 385.1,
"completions/min_terminated_length": 385.1,
"epoch": 0.23506366307541626,
"grad_norm": 0.47513348986208354,
"kl": 0.023612213134765626,
"learning_rate": 3.5696941571505434e-08,
"loss": 0.0069,
"num_tokens": 75779806.0,
"reward": 0.2989699838683009,
"reward_std": 0.144676909170812,
"rewards/code_reward/mean": 0.19896997831820046,
"rewards/code_reward/std": 0.14467690934252458,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 525
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.4625,
"completions/max_terminated_length": 678.4625,
"completions/mean_length": 507.25625,
"completions/mean_terminated_length": 507.25625,
"completions/min_length": 360.5125,
"completions/min_terminated_length": 360.5125,
"epoch": 0.23730236462851545,
"grad_norm": 0.5125208061016464,
"kl": 0.02255859375,
"learning_rate": 2.625406612240039e-08,
"loss": 0.006,
"num_tokens": 76477890.0,
"reward": 0.3240066308528185,
"reward_std": 0.16057187110418453,
"rewards/code_reward/mean": 0.22400662462459878,
"rewards/code_reward/std": 0.16057187146507204,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 530
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 747.75,
"completions/max_terminated_length": 747.75,
"completions/mean_length": 534.534375,
"completions/mean_terminated_length": 534.534375,
"completions/min_length": 385.9125,
"completions/min_terminated_length": 385.9125,
"epoch": 0.23954106618161466,
"grad_norm": 0.4891371425966553,
"kl": 0.02330169677734375,
"learning_rate": 1.8248304305504505e-08,
"loss": 0.0196,
"num_tokens": 77209744.0,
"reward": 0.333328259550035,
"reward_std": 0.14479399558040312,
"rewards/code_reward/mean": 0.23332825346733443,
"rewards/code_reward/std": 0.1447939975943882,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 535
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 667.125,
"completions/max_terminated_length": 667.125,
"completions/mean_length": 501.2875,
"completions/mean_terminated_length": 501.2875,
"completions/min_length": 358.6,
"completions/min_terminated_length": 358.6,
"epoch": 0.24177976773471388,
"grad_norm": 0.5700891782095214,
"kl": 0.02592926025390625,
"learning_rate": 1.1687462886677713e-08,
"loss": 0.006,
"num_tokens": 77919416.0,
"reward": 0.313872685469687,
"reward_std": 0.1551548853807617,
"rewards/code_reward/mean": 0.2140289287781343,
"rewards/code_reward/std": 0.15488540646038018,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 540
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 657.0375,
"completions/max_terminated_length": 657.0375,
"completions/mean_length": 498.54375,
"completions/mean_terminated_length": 498.54375,
"completions/min_length": 368.6625,
"completions/min_terminated_length": 368.6625,
"epoch": 0.24401846928781307,
"grad_norm": 0.6409927990786405,
"kl": 0.02302703857421875,
"learning_rate": 6.577939627179785e-09,
"loss": 0.0125,
"num_tokens": 78597028.0,
"reward": 0.3173367108218372,
"reward_std": 0.16166887313302142,
"rewards/code_reward/mean": 0.21764920413697836,
"rewards/code_reward/std": 0.16100556787860115,
"rewards/format_reward/mean": 0.996875,
"rewards/format_reward/std": 0.00883883461356163,
"step": 545
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.375,
"completions/max_terminated_length": 687.375,
"completions/mean_length": 507.9859375,
"completions/mean_terminated_length": 507.9859375,
"completions/min_length": 373.4375,
"completions/min_terminated_length": 373.4375,
"epoch": 0.24625717084091228,
"grad_norm": 0.5411596737613947,
"kl": 0.024321746826171876,
"learning_rate": 2.9247170449338e-09,
"loss": 0.005,
"num_tokens": 79308787.0,
"reward": 0.3536563721485436,
"reward_std": 0.12869162768765818,
"rewards/code_reward/mean": 0.2538126138912048,
"rewards/code_reward/std": 0.1283905382733792,
"rewards/format_reward/mean": 0.9984375,
"rewards/format_reward/std": 0.004419417306780815,
"step": 550
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 674.4875,
"completions/max_terminated_length": 674.4875,
"completions/mean_length": 508.71875,
"completions/mean_terminated_length": 508.71875,
"completions/min_length": 376.7,
"completions/min_terminated_length": 376.7,
"epoch": 0.24849587239401147,
"grad_norm": 0.6381506844335124,
"kl": 0.022603607177734374,
"learning_rate": 7.313575558583474e-10,
"loss": 0.0068,
"num_tokens": 79983935.0,
"reward": 0.3423418626189232,
"reward_std": 0.13657438448863105,
"rewards/code_reward/mean": 0.24234185529057867,
"rewards/code_reward/std": 0.1365743855072651,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 555
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 651.59375,
"completions/max_terminated_length": 651.59375,
"completions/mean_length": 491.193359375,
"completions/mean_terminated_length": 491.193359375,
"completions/min_length": 356.890625,
"completions/min_terminated_length": 356.890625,
"epoch": 0.25028683363649085,
"kl": 0.023431777954101562,
"num_tokens": 80543474.0,
"reward": 0.3967649736441672,
"reward_std": 0.1777252904503257,
"rewards/code_reward/mean": 0.2967649649071973,
"rewards/code_reward/std": 0.17772529531794135,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 559,
"total_flos": 0.0,
"train_loss": 0.001293145966497858,
"train_runtime": 17459.8588,
"train_samples_per_second": 0.512,
"train_steps_per_second": 0.032
}
],
"logging_steps": 5,
"max_steps": 559,
"num_input_tokens_seen": 80543474,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}