diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2635 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25028683363649085, + "eval_steps": 500, + "global_step": 559, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.5625, + "completions/max_terminated_length": 714.5625, + "completions/mean_length": 534.09375, + "completions/mean_terminated_length": 534.09375, + "completions/min_length": 398.375, + "completions/min_terminated_length": 398.375, + "epoch": 0.00044774031061984047, + "grad_norm": 1.0911544979292094, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0164, + "num_tokens": 143396.0, + "reward": 0.1745322283823043, + "reward_std": 0.14398040855303407, + "rewards/code_reward/mean": 0.10812597409676528, + "rewards/code_reward/std": 0.11770913819782436, + "rewards/format_reward/mean": 0.6640625, + "rewards/format_reward/std": 0.44056092016398907, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.5, + "completions/max_terminated_length": 782.5, + "completions/mean_length": 584.078125, + "completions/mean_terminated_length": 584.078125, + "completions/min_length": 428.625, + "completions/min_terminated_length": 428.625, + "epoch": 0.0022387015530992023, + "grad_norm": 1.0592214128916255, + "kl": 0.00044733285903930664, + "learning_rate": 2.1428571428571428e-07, + "loss": 0.0004, + "num_tokens": 772676.0, + "reward": 0.15631713026959915, + "reward_std": 0.14780386447091587, + "rewards/code_reward/mean": 0.09889525244216202, + "rewards/code_reward/std": 0.12754268431308446, + "rewards/format_reward/mean": 0.57421875, + "rewards/format_reward/std": 0.42564064590260386, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.5625, + "completions/max_terminated_length": 743.5625, + "completions/mean_length": 566.3671875, + "completions/mean_terminated_length": 566.3671875, + "completions/min_length": 412.7625, + "completions/min_terminated_length": 412.7625, + "epoch": 0.004477403106198405, + "grad_norm": 0.8586902730302105, + "kl": 0.0006687402725219727, + "learning_rate": 4.821428571428572e-07, + "loss": 0.02, + "num_tokens": 1514535.0, + "reward": 0.21928292746888473, + "reward_std": 0.17851990209892393, + "rewards/code_reward/mean": 0.15240792171971407, + "rewards/code_reward/std": 0.15891524556500372, + "rewards/format_reward/mean": 0.66875, + "rewards/format_reward/std": 0.42151433378458025, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.05, + "completions/max_terminated_length": 796.05, + "completions/mean_length": 587.6890625, + "completions/mean_terminated_length": 587.6890625, + "completions/min_length": 413.575, + "completions/min_terminated_length": 413.575, + "epoch": 0.0067161046592976075, + "grad_norm": 0.6576078448734007, + "kl": 0.002119898796081543, + "learning_rate": 7.5e-07, + "loss": 0.0262, + "num_tokens": 2322384.0, + "reward": 0.19728650886099786, + "reward_std": 0.153143038158305, + "rewards/code_reward/mean": 0.11244275536737405, + "rewards/code_reward/std": 0.13675388206611389, + "rewards/format_reward/mean": 0.8484375, + "rewards/format_reward/std": 0.2670775193721056, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.6, + "completions/max_terminated_length": 736.6, + "completions/mean_length": 539.6390625, + "completions/mean_terminated_length": 539.6390625, + "completions/min_length": 379.7375, + "completions/min_terminated_length": 379.7375, + "epoch": 0.00895480621239681, + "grad_norm": 0.7320311360828152, + "kl": 0.002538633346557617, + "learning_rate": 1.017857142857143e-06, + "loss": 0.0079, + "num_tokens": 3037089.0, + "reward": 0.21442170465597882, + "reward_std": 0.14227938583353533, + "rewards/code_reward/mean": 0.12176544930553064, + "rewards/code_reward/std": 0.13433333449356724, + "rewards/format_reward/mean": 0.9265625, + "rewards/format_reward/std": 0.156092469394207, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.625, + "completions/max_terminated_length": 770.625, + "completions/mean_length": 554.2328125, + "completions/mean_terminated_length": 554.2328125, + "completions/min_length": 395.7, + "completions/min_terminated_length": 395.7, + "epoch": 0.011193507765496012, + "grad_norm": 0.5737573580410903, + "kl": 0.00330963134765625, + "learning_rate": 1.2857142857142856e-06, + "loss": 0.0233, + "num_tokens": 3787614.0, + "reward": 0.22332688504830003, + "reward_std": 0.11517863497429062, + "rewards/code_reward/mean": 0.12520187861009618, + "rewards/code_reward/std": 0.11075652101717423, + "rewards/format_reward/mean": 0.98125, + "rewards/format_reward/std": 0.04998054876923561, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.775, + "completions/max_terminated_length": 696.775, + "completions/mean_length": 527.1390625, + "completions/mean_terminated_length": 527.1390625, + "completions/min_length": 375.6, + "completions/min_terminated_length": 375.6, + "epoch": 0.013432209318595215, + "grad_norm": 0.6412924324023244, + "kl": 0.004455375671386719, + "learning_rate": 1.5535714285714287e-06, + "loss": 0.0292, + "num_tokens": 4536623.0, + "reward": 0.2232258369214833, + "reward_std": 0.13004211404477245, + "rewards/code_reward/mean": 0.12400708374771056, + "rewards/code_reward/std": 0.12888794834143483, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.022097086533904076, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.475, + "completions/max_terminated_length": 688.475, + "completions/mean_length": 493.4390625, + "completions/mean_terminated_length": 493.4390625, + "completions/min_length": 351.2875, + "completions/min_terminated_length": 351.2875, + "epoch": 0.015670910871694418, + "grad_norm": 0.4441774535858199, + "kl": 0.005760383605957031, + "learning_rate": 1.8214285714285714e-06, + "loss": 0.0183, + "num_tokens": 5238216.0, + "reward": 0.23461700212210418, + "reward_std": 0.13695308727037628, + "rewards/code_reward/mean": 0.1349294964238652, + "rewards/code_reward/std": 0.13671803568140603, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 815.8875, + "completions/max_terminated_length": 623.275, + "completions/mean_length": 488.790625, + "completions/mean_terminated_length": 464.4939735412598, + "completions/min_length": 338.1875, + "completions/min_terminated_length": 338.1875, + "epoch": 0.01790961242479362, + "grad_norm": 0.7054891224478806, + "kl": 0.007607078552246094, + "learning_rate": 2.089285714285714e-06, + "loss": 0.0369, + "num_tokens": 5931842.0, + "reward": 0.2295066607184708, + "reward_std": 0.13071401379711461, + "rewards/code_reward/mean": 0.12997540423093595, + "rewards/code_reward/std": 0.1293881902238354, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.013258251920342445, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.2375, + "completions/max_terminated_length": 595.2375, + "completions/mean_length": 450.5140625, + "completions/mean_terminated_length": 450.5140625, + "completions/min_length": 334.7125, + "completions/min_terminated_length": 334.7125, + "epoch": 0.020148313977892823, + "grad_norm": 0.7270226911269254, + "kl": 0.008572006225585937, + "learning_rate": 2.357142857142857e-06, + "loss": 0.0023, + "num_tokens": 6579707.0, + "reward": 0.29843434747308495, + "reward_std": 0.13938394124270417, + "rewards/code_reward/mean": 0.19905934149210225, + "rewards/code_reward/std": 0.13823813095805235, + "rewards/format_reward/mean": 0.99375, + "rewards/format_reward/std": 0.01767766922712326, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 771.7375, + "completions/max_terminated_length": 676.5875, + "completions/mean_length": 519.5140625, + "completions/mean_terminated_length": 507.42098236083984, + "completions/min_length": 371.625, + "completions/min_terminated_length": 371.625, + "epoch": 0.022387015530992024, + "grad_norm": 0.8864298285641923, + "kl": 0.009959030151367187, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0247, + "num_tokens": 7287164.0, + "reward": 0.24341339743696153, + "reward_std": 0.13661439061979763, + "rewards/code_reward/mean": 0.14700714359642006, + "rewards/code_reward/std": 0.13423215872608124, + "rewards/format_reward/mean": 0.9640625, + "rewards/format_reward/std": 0.06212893389165401, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.3625, + "completions/max_terminated_length": 683.3625, + "completions/mean_length": 516.046875, + "completions/mean_terminated_length": 516.046875, + "completions/min_length": 386.925, + "completions/min_terminated_length": 386.925, + "epoch": 0.024625717084091225, + "grad_norm": 0.6382131264055037, + "kl": 0.008373641967773437, + "learning_rate": 2.892857142857143e-06, + "loss": 0.021, + "num_tokens": 7971626.0, + "reward": 0.3139894030056894, + "reward_std": 0.15021034325327492, + "rewards/code_reward/mean": 0.21695814684353537, + "rewards/code_reward/std": 0.14836436581681484, + "rewards/format_reward/mean": 0.9703125, + "rewards/format_reward/std": 0.046608568355441096, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.7125, + "completions/max_terminated_length": 661.7125, + "completions/mean_length": 510.025, + "completions/mean_terminated_length": 510.025, + "completions/min_length": 382.3125, + "completions/min_terminated_length": 382.3125, + "epoch": 0.02686441863719043, + "grad_norm": 0.7435002543363699, + "kl": 0.009385299682617188, + "learning_rate": 2.9997366975852433e-06, + "loss": 0.0148, + "num_tokens": 8701666.0, + "reward": 0.24593741996213794, + "reward_std": 0.12826487933343741, + "rewards/code_reward/mean": 0.1460936620060238, + "rewards/code_reward/std": 0.1278229385818122, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.2875, + "completions/max_terminated_length": 694.2875, + "completions/mean_length": 517.9140625, + "completions/mean_terminated_length": 517.9140625, + "completions/min_length": 369.8125, + "completions/min_terminated_length": 369.8125, + "epoch": 0.02910312019028963, + "grad_norm": 0.5685331001634877, + "kl": 0.012218093872070313, + "learning_rate": 2.9981279620139177e-06, + "loss": 0.0053, + "num_tokens": 9438523.0, + "reward": 0.24741017883643507, + "reward_std": 0.13050166001776234, + "rewards/code_reward/mean": 0.1474101732033887, + "rewards/code_reward/std": 0.13050166381872258, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.825, + "completions/max_terminated_length": 780.825, + "completions/mean_length": 535.19375, + "completions/mean_terminated_length": 535.19375, + "completions/min_length": 392.3875, + "completions/min_terminated_length": 392.3875, + "epoch": 0.031341821743388835, + "grad_norm": 0.5336222407251643, + "kl": 0.0148834228515625, + "learning_rate": 2.9950583368363777e-06, + "loss": 0.007, + "num_tokens": 10157391.0, + "reward": 0.296136565413326, + "reward_std": 0.16640246821043547, + "rewards/code_reward/mean": 0.19644906022003852, + "rewards/code_reward/std": 0.1658487796317786, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0375, + "completions/max_terminated_length": 797.0375, + "completions/mean_length": 589.8359375, + "completions/mean_terminated_length": 589.8359375, + "completions/min_length": 426.275, + "completions/min_terminated_length": 426.275, + "epoch": 0.033580523296488037, + "grad_norm": 0.6369858920854246, + "kl": 0.017626190185546876, + "learning_rate": 2.990530815377378e-06, + "loss": 0.0087, + "num_tokens": 10930742.0, + "reward": 0.27690047658979894, + "reward_std": 0.12926372148940574, + "rewards/code_reward/mean": 0.177994220439723, + "rewards/code_reward/std": 0.12826473288878332, + "rewards/format_reward/mean": 0.9890625, + "rewards/format_reward/std": 0.027883462235331537, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.7125, + "completions/max_terminated_length": 758.7125, + "completions/mean_length": 559.50625, + "completions/mean_terminated_length": 559.50625, + "completions/min_length": 398.6, + "completions/min_terminated_length": 398.6, + "epoch": 0.03581922484958724, + "grad_norm": 0.7033361512162839, + "kl": 0.01629638671875, + "learning_rate": 2.984549812619624e-06, + "loss": -0.0033, + "num_tokens": 11662834.0, + "reward": 0.2631410426460207, + "reward_std": 0.11843040494713933, + "rewards/code_reward/mean": 0.16329728582059033, + "rewards/code_reward/std": 0.11832479977165349, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.2875, + "completions/max_terminated_length": 671.2875, + "completions/mean_length": 507.8875, + "completions/mean_terminated_length": 507.8875, + "completions/min_length": 377.5625, + "completions/min_terminated_length": 377.5625, + "epoch": 0.03805792640268644, + "grad_norm": 0.5966458269379716, + "kl": 0.0168060302734375, + "learning_rate": 2.9771211608985266e-06, + "loss": 0.0047, + "num_tokens": 12352234.0, + "reward": 0.32661316031590104, + "reward_std": 0.1419034074380761, + "rewards/code_reward/mean": 0.2267694047826808, + "rewards/code_reward/std": 0.14197679209755734, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.3375, + "completions/max_terminated_length": 664.3375, + "completions/mean_length": 502.846875, + "completions/mean_terminated_length": 502.846875, + "completions/min_length": 376.3875, + "completions/min_terminated_length": 376.3875, + "epoch": 0.04029662795578565, + "grad_norm": 0.6882916695583966, + "kl": 0.017774200439453124, + "learning_rate": 2.968252104214841e-06, + "loss": 0.0162, + "num_tokens": 13055856.0, + "reward": 0.26416925797238944, + "reward_std": 0.15208177534805145, + "rewards/code_reward/mean": 0.16432550169847673, + "rewards/code_reward/std": 0.1518084899900714, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.9125, + "completions/max_terminated_length": 717.9125, + "completions/mean_length": 529.171875, + "completions/mean_terminated_length": 529.171875, + "completions/min_length": 389.075, + "completions/min_terminated_length": 389.075, + "epoch": 0.04253532950888485, + "grad_norm": 0.5867793943695734, + "kl": 0.01979522705078125, + "learning_rate": 2.9579512911707257e-06, + "loss": 0.012, + "num_tokens": 13781566.0, + "reward": 0.29845606358721855, + "reward_std": 0.14189217127859594, + "rewards/code_reward/mean": 0.19845605657319537, + "rewards/code_reward/std": 0.1418921749223955, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.525, + "completions/max_terminated_length": 750.525, + "completions/mean_length": 518.6890625, + "completions/mean_terminated_length": 518.6890625, + "completions/min_length": 371.575, + "completions/min_terminated_length": 371.575, + "epoch": 0.04477403106198405, + "grad_norm": 0.6756173457255675, + "kl": 0.023876953125, + "learning_rate": 2.9462287665361157e-06, + "loss": 0.017, + "num_tokens": 14508775.0, + "reward": 0.2731386865489185, + "reward_std": 0.1473583393584704, + "rewards/code_reward/mean": 0.17345118119992547, + "rewards/code_reward/std": 0.14723392758751289, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.8, + "completions/max_terminated_length": 700.8, + "completions/mean_length": 474.2953125, + "completions/mean_terminated_length": 474.2953125, + "completions/min_length": 339.4, + "completions/min_terminated_length": 339.4, + "epoch": 0.04701273261508325, + "grad_norm": 0.6213587460739984, + "kl": 0.027069091796875, + "learning_rate": 2.9330959614536314e-06, + "loss": 0.016, + "num_tokens": 15178396.0, + "reward": 0.29834548365324737, + "reward_std": 0.13777082363376394, + "rewards/code_reward/mean": 0.19834547787031626, + "rewards/code_reward/std": 0.1377708253567107, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.6625, + "completions/max_terminated_length": 652.6625, + "completions/mean_length": 483.6328125, + "completions/mean_terminated_length": 483.6328125, + "completions/min_length": 346.775, + "completions/min_terminated_length": 346.775, + "epoch": 0.04925143416818245, + "grad_norm": 0.6361080911543711, + "kl": 0.02613983154296875, + "learning_rate": 2.9185656822915747e-06, + "loss": -0.0057, + "num_tokens": 15867273.0, + "reward": 0.29470919668674467, + "reward_std": 0.12798963281093165, + "rewards/code_reward/mean": 0.19486543894308853, + "rewards/code_reward/std": 0.12776418880966958, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.1375, + "completions/max_terminated_length": 741.1375, + "completions/mean_length": 536.115625, + "completions/mean_terminated_length": 536.115625, + "completions/min_length": 393.3125, + "completions/min_terminated_length": 393.3125, + "epoch": 0.05149013572128166, + "grad_norm": 0.6555109923730857, + "kl": 0.0231597900390625, + "learning_rate": 2.9026520981558844e-06, + "loss": 0.009, + "num_tokens": 16604459.0, + "reward": 0.2888658272102475, + "reward_std": 0.15465332815947477, + "rewards/code_reward/mean": 0.1888658216179465, + "rewards/code_reward/std": 0.15465332991443576, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.875, + "completions/max_terminated_length": 690.875, + "completions/mean_length": 532.3109375, + "completions/mean_terminated_length": 532.3109375, + "completions/min_length": 403.2, + "completions/min_terminated_length": 403.2, + "epoch": 0.05372883727438086, + "grad_norm": 0.6650346931830148, + "kl": 0.024253082275390626, + "learning_rate": 2.8853707270732253e-06, + "loss": 0.0132, + "num_tokens": 17335906.0, + "reward": 0.35356655940413473, + "reward_std": 0.18680918092140927, + "rewards/code_reward/mean": 0.2537228013883578, + "rewards/code_reward/std": 0.18692450551316142, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.6625, + "completions/max_terminated_length": 694.6625, + "completions/mean_length": 517.6953125, + "completions/mean_terminated_length": 517.6953125, + "completions/min_length": 385.7, + "completions/min_terminated_length": 385.7, + "epoch": 0.05596753882748006, + "grad_norm": 0.6254170069058008, + "kl": 0.025794219970703126, + "learning_rate": 2.8667384208586865e-06, + "loss": 0.0043, + "num_tokens": 18058943.0, + "reward": 0.3556826992891729, + "reward_std": 0.1363969652389642, + "rewards/code_reward/mean": 0.25583894047886135, + "rewards/code_reward/std": 0.1366761433542706, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0375, + "completions/max_terminated_length": 706.0375, + "completions/mean_length": 543.7203125, + "completions/mean_terminated_length": 543.7203125, + "completions/min_length": 405.2875, + "completions/min_terminated_length": 405.2875, + "epoch": 0.05820624038057926, + "grad_norm": 0.6216908171231574, + "kl": 0.02597503662109375, + "learning_rate": 2.846773348682845e-06, + "loss": 0.0007, + "num_tokens": 18775148.0, + "reward": 0.2654763679020107, + "reward_std": 0.13124802198726684, + "rewards/code_reward/mean": 0.1659451116924174, + "rewards/code_reward/std": 0.13088461093138903, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.013258251920342445, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.7125, + "completions/max_terminated_length": 732.7125, + "completions/mean_length": 550.0796875, + "completions/mean_terminated_length": 550.0796875, + "completions/min_length": 407.625, + "completions/min_terminated_length": 407.625, + "epoch": 0.06044494193367847, + "grad_norm": 0.6438445755693917, + "kl": 0.02695159912109375, + "learning_rate": 2.8254949793542194e-06, + "loss": 0.0133, + "num_tokens": 19516591.0, + "reward": 0.30453283004462717, + "reward_std": 0.15741582050104624, + "rewards/code_reward/mean": 0.20453282294183736, + "rewards/code_reward/std": 0.15741582473565358, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 532.575, + "completions/mean_terminated_length": 532.575, + "completions/min_length": 389.0625, + "completions/min_terminated_length": 389.0625, + "epoch": 0.06268364348677767, + "grad_norm": 0.6680496022315153, + "kl": 0.02960357666015625, + "learning_rate": 2.802924062334391e-06, + "loss": 0.0146, + "num_tokens": 20241207.0, + "reward": 0.3066130679100752, + "reward_std": 0.18769313739612697, + "rewards/code_reward/mean": 0.2072380588942906, + "rewards/code_reward/std": 0.18655281127139461, + "rewards/format_reward/mean": 0.99375, + "rewards/format_reward/std": 0.01767766922712326, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.9125, + "completions/max_terminated_length": 711.9125, + "completions/mean_length": 507.396875, + "completions/mean_terminated_length": 507.396875, + "completions/min_length": 354.5, + "completions/min_terminated_length": 354.5, + "epoch": 0.06492234503987687, + "grad_norm": 0.6756739470558975, + "kl": 0.02831573486328125, + "learning_rate": 2.779082607504298e-06, + "loss": 0.015, + "num_tokens": 20963517.0, + "reward": 0.28101985761895776, + "reward_std": 0.17780419969349168, + "rewards/code_reward/mean": 0.1811761005956214, + "rewards/code_reward/std": 0.17785799705889077, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0625, + "completions/max_terminated_length": 606.0625, + "completions/mean_length": 448.28125, + "completions/mean_terminated_length": 448.28125, + "completions/min_length": 322.1375, + "completions/min_terminated_length": 322.1375, + "epoch": 0.06716104659297607, + "grad_norm": 0.5719295032736604, + "kl": 0.02752532958984375, + "learning_rate": 2.7539938637014514e-06, + "loss": 0.0092, + "num_tokens": 21610025.0, + "reward": 0.3294339914806187, + "reward_std": 0.15150615764432587, + "rewards/code_reward/mean": 0.22959023197181522, + "rewards/code_reward/std": 0.15142173281637952, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.65, + "completions/max_terminated_length": 658.65, + "completions/mean_length": 498.0640625, + "completions/mean_terminated_length": 498.0640625, + "completions/min_length": 366.95, + "completions/min_terminated_length": 366.95, + "epoch": 0.06939974814607527, + "grad_norm": 0.5384545916898208, + "kl": 0.02587127685546875, + "learning_rate": 2.7276822960489817e-06, + "loss": -0.0011, + "num_tokens": 22304426.0, + "reward": 0.3431210536509752, + "reward_std": 0.15553151002968663, + "rewards/code_reward/mean": 0.24312104810524032, + "rewards/code_reward/std": 0.1555315111123491, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.3125, + "completions/max_terminated_length": 765.3125, + "completions/mean_length": 542.303125, + "completions/mean_terminated_length": 542.303125, + "completions/min_length": 394.875, + "completions/min_terminated_length": 394.875, + "epoch": 0.07163844969917448, + "grad_norm": 0.5684225568583741, + "kl": 0.030213165283203124, + "learning_rate": 2.7001735620986323e-06, + "loss": 0.0162, + "num_tokens": 23031900.0, + "reward": 0.29204714838415385, + "reward_std": 0.15333203882328234, + "rewards/code_reward/mean": 0.19235964192193933, + "rewards/code_reward/std": 0.15339574370882475, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.8375, + "completions/max_terminated_length": 842.8375, + "completions/mean_length": 574.2078125, + "completions/mean_terminated_length": 574.2078125, + "completions/min_length": 428.4375, + "completions/min_terminated_length": 428.4375, + "epoch": 0.07387715125227368, + "grad_norm": 0.5435945556590033, + "kl": 0.027813720703125, + "learning_rate": 2.671494486810974e-06, + "loss": 0.0106, + "num_tokens": 23789657.0, + "reward": 0.3045080302283168, + "reward_std": 0.16393477989186067, + "rewards/code_reward/mean": 0.20466427168576046, + "rewards/code_reward/std": 0.1636599010293139, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.1375, + "completions/max_terminated_length": 743.1375, + "completions/mean_length": 544.6453125, + "completions/mean_terminated_length": 544.6453125, + "completions/min_length": 400.9375, + "completions/min_terminated_length": 400.9375, + "epoch": 0.07611585280537288, + "grad_norm": 0.5551210527840703, + "kl": 0.03048858642578125, + "learning_rate": 2.641673036397215e-06, + "loss": 0.0108, + "num_tokens": 24537942.0, + "reward": 0.2919698000885546, + "reward_std": 0.14443947067193222, + "rewards/code_reward/mean": 0.19228229282743997, + "rewards/code_reward/std": 0.1438393424032256, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.1125, + "completions/max_terminated_length": 700.1125, + "completions/mean_length": 538.8515625, + "completions/mean_terminated_length": 538.8515625, + "completions/min_length": 408.6, + "completions/min_terminated_length": 408.6, + "epoch": 0.07835455435847209, + "grad_norm": 0.6528644572698393, + "kl": 0.028961181640625, + "learning_rate": 2.610738291048138e-06, + "loss": 0.0133, + "num_tokens": 25267431.0, + "reward": 0.274235178809613, + "reward_std": 0.1538910755480174, + "rewards/code_reward/mean": 0.17439142313669437, + "rewards/code_reward/std": 0.15372212599031626, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.1375, + "completions/max_terminated_length": 697.1375, + "completions/mean_length": 517.7078125, + "completions/mean_terminated_length": 517.7078125, + "completions/min_length": 373.8875, + "completions/min_terminated_length": 373.8875, + "epoch": 0.0805932559115713, + "grad_norm": 0.5356003023929052, + "kl": 0.027515411376953125, + "learning_rate": 2.5787204165767413e-06, + "loss": 0.0123, + "num_tokens": 26025444.0, + "reward": 0.31410480896010995, + "reward_std": 0.17551766034448518, + "rewards/code_reward/mean": 0.21410479900659993, + "rewards/code_reward/std": 0.17551766115357167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.9375, + "completions/max_terminated_length": 627.9375, + "completions/mean_length": 466.7078125, + "completions/mean_terminated_length": 466.7078125, + "completions/min_length": 336.625, + "completions/min_terminated_length": 336.625, + "epoch": 0.0828319574646705, + "grad_norm": 0.5875291034180061, + "kl": 0.03062591552734375, + "learning_rate": 2.545650635002249e-06, + "loss": 0.014, + "num_tokens": 26715345.0, + "reward": 0.3225731427781284, + "reward_std": 0.14460668399697169, + "rewards/code_reward/mean": 0.22288563377878745, + "rewards/code_reward/std": 0.1446731591859134, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.05, + "completions/max_terminated_length": 645.05, + "completions/mean_length": 468.11875, + "completions/mean_terminated_length": 468.11875, + "completions/min_length": 320.5625, + "completions/min_terminated_length": 320.5625, + "epoch": 0.0850706590177697, + "grad_norm": 0.5981227815110649, + "kl": 0.03143310546875, + "learning_rate": 2.511561194104161e-06, + "loss": 0.0158, + "num_tokens": 27388005.0, + "reward": 0.30132306115701796, + "reward_std": 0.11532193489110795, + "rewards/code_reward/mean": 0.20147930511957385, + "rewards/code_reward/std": 0.11487999467644841, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.2375, + "completions/max_terminated_length": 694.2375, + "completions/mean_length": 508.18125, + "completions/mean_terminated_length": 508.18125, + "completions/min_length": 354.7, + "completions/min_terminated_length": 354.7, + "epoch": 0.0873093605708689, + "grad_norm": 0.7051969480561995, + "kl": 0.030487060546875, + "learning_rate": 2.4764853359760447e-06, + "loss": 0.0074, + "num_tokens": 28089689.0, + "reward": 0.2780560509301722, + "reward_std": 0.13229238498024642, + "rewards/code_reward/mean": 0.17805604453606066, + "rewards/code_reward/std": 0.13229238652565983, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.9625, + "completions/max_terminated_length": 679.9625, + "completions/mean_length": 510.11875, + "completions/mean_terminated_length": 510.11875, + "completions/min_length": 362.825, + "completions/min_terminated_length": 362.825, + "epoch": 0.0895480621239681, + "grad_norm": 0.5512771391532328, + "kl": 0.02972869873046875, + "learning_rate": 2.440457264609727e-06, + "loss": 0.0022, + "num_tokens": 28787549.0, + "reward": 0.2989016550593078, + "reward_std": 0.15942465648986398, + "rewards/code_reward/mean": 0.1989016504448955, + "rewards/code_reward/std": 0.15942465687403456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.85, + "completions/max_terminated_length": 742.85, + "completions/mean_length": 550.4421875, + "completions/mean_terminated_length": 550.4421875, + "completions/min_length": 397.275, + "completions/min_terminated_length": 397.275, + "epoch": 0.0917867636770673, + "grad_norm": 0.6115605511607163, + "kl": 0.02950439453125, + "learning_rate": 2.403512112541498e-06, + "loss": 0.0262, + "num_tokens": 29531328.0, + "reward": 0.3011234959587455, + "reward_std": 0.13739942002575845, + "rewards/code_reward/mean": 0.20127973848429975, + "rewards/code_reward/std": 0.13699884270899929, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.225, + "completions/max_terminated_length": 738.225, + "completions/mean_length": 538.3640625, + "completions/mean_terminated_length": 538.3640625, + "completions/min_length": 388.85, + "completions/min_terminated_length": 388.85, + "epoch": 0.0940254652301665, + "grad_norm": 0.6180896800134059, + "kl": 0.02983551025390625, + "learning_rate": 2.365685906592846e-06, + "loss": 0.013, + "num_tokens": 30274617.0, + "reward": 0.28743315050378443, + "reward_std": 0.14888401252392214, + "rewards/code_reward/mean": 0.18743314441671827, + "rewards/code_reward/std": 0.14888401648786384, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.8125, + "completions/max_terminated_length": 657.8125, + "completions/mean_length": 508.53125, + "completions/mean_terminated_length": 508.53125, + "completions/min_length": 375.2625, + "completions/min_terminated_length": 375.2625, + "epoch": 0.0962641667832657, + "grad_norm": 0.5149353831339782, + "kl": 0.0354248046875, + "learning_rate": 2.327015532739145e-06, + "loss": -0.0035, + "num_tokens": 30968253.0, + "reward": 0.3200162294320762, + "reward_std": 0.16002128778782207, + "rewards/code_reward/mean": 0.22001622177049285, + "rewards/code_reward/std": 0.16002129036933183, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.8375, + "completions/max_terminated_length": 693.8375, + "completions/mean_length": 516.459375, + "completions/mean_terminated_length": 516.459375, + "completions/min_length": 385.2, + "completions/min_terminated_length": 385.2, + "epoch": 0.0985028683363649, + "grad_norm": 0.583768547911125, + "kl": 0.032080078125, + "learning_rate": 2.2875387001405366e-06, + "loss": -0.0004, + "num_tokens": 31677939.0, + "reward": 0.2827278276905417, + "reward_std": 0.12490762829547748, + "rewards/code_reward/mean": 0.182884071078297, + "rewards/code_reward/std": 0.12475912600348238, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.225, + "completions/max_terminated_length": 689.225, + "completions/mean_length": 511.4609375, + "completions/mean_terminated_length": 511.4609375, + "completions/min_length": 375.7, + "completions/min_terminated_length": 375.7, + "epoch": 0.10074156988946412, + "grad_norm": 0.47416592884978, + "kl": 0.03255615234375, + "learning_rate": 2.2472939043700894e-06, + "loss": 0.0104, + "num_tokens": 32366802.0, + "reward": 0.288489468768239, + "reward_std": 0.14980540352989918, + "rewards/code_reward/mean": 0.18880196339305258, + "rewards/code_reward/std": 0.14945577481121292, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.1375, + "completions/max_terminated_length": 709.1375, + "completions/mean_length": 537.6703125, + "completions/mean_terminated_length": 537.6703125, + "completions/min_length": 400.325, + "completions/min_terminated_length": 400.325, + "epoch": 0.10298027144256332, + "grad_norm": 0.6526599784556473, + "kl": 0.031103515625, + "learning_rate": 2.206320389875099e-06, + "loss": 0.0004, + "num_tokens": 33092199.0, + "reward": 0.27060003159567714, + "reward_std": 0.14649803503416478, + "rewards/code_reward/mean": 0.1706000213016523, + "rewards/code_reward/std": 0.14649803435604553, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.975, + "completions/max_terminated_length": 688.975, + "completions/mean_length": 537.1703125, + "completions/mean_terminated_length": 537.1703125, + "completions/min_length": 413.1, + "completions/min_terminated_length": 413.1, + "epoch": 0.10521897299566252, + "grad_norm": 0.578479817853106, + "kl": 0.031402587890625, + "learning_rate": 2.1646581117081187e-06, + "loss": 0.0118, + "num_tokens": 33813252.0, + "reward": 0.24227329418063165, + "reward_std": 0.14281967077986338, + "rewards/code_reward/mean": 0.1422732870618347, + "rewards/code_reward/std": 0.1428196722699795, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.5375, + "completions/max_terminated_length": 681.5375, + "completions/mean_length": 533.6359375, + "completions/mean_terminated_length": 533.6359375, + "completions/min_length": 407.2125, + "completions/min_terminated_length": 407.2125, + "epoch": 0.10745767454876172, + "grad_norm": 0.6265187392839973, + "kl": 0.03284759521484375, + "learning_rate": 2.122347696565059e-06, + "loss": 0.0139, + "num_tokens": 34549147.0, + "reward": 0.33532751044258474, + "reward_std": 0.1622638524393551, + "rewards/code_reward/mean": 0.2353275064189802, + "rewards/code_reward/std": 0.1622638531640405, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.1125, + "completions/max_terminated_length": 703.1125, + "completions/mean_length": 537.425, + "completions/mean_terminated_length": 537.425, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.10969637610186092, + "grad_norm": 0.6224509400429641, + "kl": 0.0321990966796875, + "learning_rate": 2.079430403168327e-06, + "loss": 0.0205, + "num_tokens": 35271579.0, + "reward": 0.3003238163888454, + "reward_std": 0.17288763520191425, + "rewards/code_reward/mean": 0.20032380691118307, + "rewards/code_reward/std": 0.1728876391222002, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.8375, + "completions/max_terminated_length": 647.8375, + "completions/mean_length": 501.9875, + "completions/mean_terminated_length": 501.9875, + "completions/min_length": 376.6375, + "completions/min_terminated_length": 376.6375, + "epoch": 0.11193507765496012, + "grad_norm": 0.6263781800971838, + "kl": 0.03351898193359375, + "learning_rate": 2.0359480820336594e-06, + "loss": 0.0094, + "num_tokens": 35965555.0, + "reward": 0.31694198679178953, + "reward_std": 0.1596899228548864, + "rewards/code_reward/mean": 0.2170982286144863, + "rewards/code_reward/std": 0.15924798299674875, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.8625, + "completions/max_terminated_length": 636.8625, + "completions/mean_length": 492.7109375, + "completions/mean_terminated_length": 492.7109375, + "completions/min_length": 373.775, + "completions/min_terminated_length": 373.775, + "epoch": 0.11417377920805932, + "grad_norm": 0.6373686980037819, + "kl": 0.0332763671875, + "learning_rate": 1.9919431346598687e-06, + "loss": 0.0146, + "num_tokens": 36669402.0, + "reward": 0.30089313965290787, + "reward_std": 0.1563536574365571, + "rewards/code_reward/mean": 0.20089313458884134, + "rewards/code_reward/std": 0.15635365938651374, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.975, + "completions/max_terminated_length": 649.975, + "completions/mean_length": 497.025, + "completions/mean_terminated_length": 497.025, + "completions/min_length": 369.1625, + "completions/min_terminated_length": 369.1625, + "epoch": 0.11641248076115852, + "grad_norm": 0.6124161391568931, + "kl": 0.03218841552734375, + "learning_rate": 1.947458472181296e-06, + "loss": 0.0024, + "num_tokens": 37365858.0, + "reward": 0.31037036776542665, + "reward_std": 0.15006352393247652, + "rewards/code_reward/mean": 0.21037036021152744, + "rewards/code_reward/std": 0.15006352449127008, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.975, + "completions/max_terminated_length": 666.975, + "completions/mean_length": 511.603125, + "completions/mean_terminated_length": 511.603125, + "completions/min_length": 381.725, + "completions/min_terminated_length": 381.725, + "epoch": 0.11865118231425772, + "grad_norm": 0.5345035896498757, + "kl": 0.0315277099609375, + "learning_rate": 1.9025374735233068e-06, + "loss": 0.0154, + "num_tokens": 38086620.0, + "reward": 0.32326241619884966, + "reward_std": 0.14852707152604125, + "rewards/code_reward/mean": 0.2234186581481481, + "rewards/code_reward/std": 0.14887304982403293, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.9125, + "completions/max_terminated_length": 715.9125, + "completions/mean_length": 527.903125, + "completions/mean_terminated_length": 527.903125, + "completions/min_length": 398.2875, + "completions/min_terminated_length": 398.2875, + "epoch": 0.12088988386735694, + "grad_norm": 0.573684423194082, + "kl": 0.0300811767578125, + "learning_rate": 1.8572239431016146e-06, + "loss": 0.0126, + "num_tokens": 38809214.0, + "reward": 0.2911208848468959, + "reward_std": 0.13888704897253773, + "rewards/code_reward/mean": 0.19127712811168748, + "rewards/code_reward/std": 0.13906100282329134, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.725, + "completions/max_terminated_length": 685.725, + "completions/mean_length": 533.9171875, + "completions/mean_terminated_length": 533.9171875, + "completions/min_length": 391.1375, + "completions/min_terminated_length": 391.1375, + "epoch": 0.12312858542045614, + "grad_norm": 0.4830812794073296, + "kl": 0.03022613525390625, + "learning_rate": 1.8115620681066946e-06, + "loss": 0.0069, + "num_tokens": 39531329.0, + "reward": 0.37973827524110676, + "reward_std": 0.17492547728470526, + "rewards/code_reward/mean": 0.2798945170710795, + "rewards/code_reward/std": 0.1747873265412636, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.4625, + "completions/max_terminated_length": 764.4625, + "completions/mean_length": 555.675, + "completions/mean_terminated_length": 555.675, + "completions/min_length": 406.3, + "completions/min_terminated_length": 406.3, + "epoch": 0.12536728697355534, + "grad_norm": 0.48053859411140093, + "kl": 0.02889251708984375, + "learning_rate": 1.765596375414936e-06, + "loss": 0.0177, + "num_tokens": 40297449.0, + "reward": 0.26671807700768113, + "reward_std": 0.14942678074003196, + "rewards/code_reward/mean": 0.1671868214616552, + "rewards/code_reward/std": 0.14887573684682137, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.013258251920342445, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.825, + "completions/max_terminated_length": 669.825, + "completions/mean_length": 511.8296875, + "completions/mean_terminated_length": 511.8296875, + "completions/min_length": 369.2625, + "completions/min_terminated_length": 369.2625, + "epoch": 0.12760598852665453, + "grad_norm": 0.5012923928076685, + "kl": 0.03134918212890625, + "learning_rate": 1.7193716881685532e-06, + "loss": 0.0171, + "num_tokens": 41000340.0, + "reward": 0.33275858471170067, + "reward_std": 0.16071395185717846, + "rewards/code_reward/mean": 0.23291482530039503, + "rewards/code_reward/std": 0.1606017280719243, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 857.1125, + "completions/max_terminated_length": 763.7375, + "completions/mean_length": 575.0640625, + "completions/mean_terminated_length": 563.0296878814697, + "completions/min_length": 400.15, + "completions/min_terminated_length": 400.15, + "epoch": 0.12984469007975374, + "grad_norm": 0.630042919145043, + "kl": 0.030005645751953126, + "learning_rate": 1.6729330820665925e-06, + "loss": 0.0156, + "num_tokens": 41754885.0, + "reward": 0.28822933994233607, + "reward_std": 0.1465720217616763, + "rewards/code_reward/mean": 0.18854183692019433, + "rewards/code_reward/std": 0.14625606250483542, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.25, + "completions/max_terminated_length": 716.25, + "completions/mean_length": 537.20625, + "completions/mean_terminated_length": 537.20625, + "completions/min_length": 392.175, + "completions/min_terminated_length": 392.175, + "epoch": 0.13208339163285296, + "grad_norm": 0.5107272382559945, + "kl": 0.03038330078125, + "learning_rate": 1.6263258414096618e-06, + "loss": 0.0154, + "num_tokens": 42470809.0, + "reward": 0.33072368800640106, + "reward_std": 0.2061192358552944, + "rewards/code_reward/mean": 0.23087992868968285, + "rewards/code_reward/std": 0.20595307812327518, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0875, + "completions/max_terminated_length": 730.0875, + "completions/mean_length": 539.428125, + "completions/mean_terminated_length": 539.428125, + "completions/min_length": 383.3125, + "completions/min_terminated_length": 383.3125, + "epoch": 0.13432209318595215, + "grad_norm": 0.537918547301204, + "kl": 0.0288299560546875, + "learning_rate": 1.5795954149412446e-06, + "loss": 0.0083, + "num_tokens": 43193235.0, + "reward": 0.34142726445570587, + "reward_std": 0.14466436323709786, + "rewards/code_reward/mean": 0.24173975624726154, + "rewards/code_reward/std": 0.14445849329931662, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.3, + "completions/max_terminated_length": 759.3, + "completions/mean_length": 557.8890625, + "completions/mean_terminated_length": 557.8890625, + "completions/min_length": 399.75, + "completions/min_terminated_length": 399.75, + "epoch": 0.13656079473905136, + "grad_norm": 0.584098194483569, + "kl": 0.02829742431640625, + "learning_rate": 1.5327873715286555e-06, + "loss": 0.0094, + "num_tokens": 43930988.0, + "reward": 0.2912998185493052, + "reward_std": 0.1508971786039183, + "rewards/code_reward/mean": 0.1914560628225445, + "rewards/code_reward/std": 0.15074160079238935, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.5875, + "completions/max_terminated_length": 703.5875, + "completions/mean_length": 531.625, + "completions/mean_terminated_length": 531.625, + "completions/min_length": 398.075, + "completions/min_terminated_length": 398.075, + "epoch": 0.13879949629215055, + "grad_norm": 0.5610344866641752, + "kl": 0.029935455322265624, + "learning_rate": 1.4859473557268605e-06, + "loss": 0.0228, + "num_tokens": 44630804.0, + "reward": 0.31272673439234494, + "reward_std": 0.160137642340851, + "rewards/code_reward/mean": 0.21272672956984023, + "rewards/code_reward/std": 0.16013764539093245, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 754.0625, + "completions/max_terminated_length": 673.325, + "completions/mean_length": 516.14375, + "completions/mean_terminated_length": 504.68303604125975, + "completions/min_length": 370.5375, + "completions/min_terminated_length": 370.5375, + "epoch": 0.14103819784524976, + "grad_norm": 0.5678371154254215, + "kl": 0.0303436279296875, + "learning_rate": 1.4391210432684911e-06, + "loss": 0.0172, + "num_tokens": 45353968.0, + "reward": 0.30479407841339706, + "reward_std": 0.1592210401489865, + "rewards/code_reward/mean": 0.20510657107515726, + "rewards/code_reward/std": 0.15867348304018378, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.725, + "completions/max_terminated_length": 662.725, + "completions/mean_length": 500.4296875, + "completions/mean_terminated_length": 500.4296875, + "completions/min_length": 366.975, + "completions/min_terminated_length": 366.975, + "epoch": 0.14327689939834895, + "grad_norm": 0.6017406713534427, + "kl": 0.03122406005859375, + "learning_rate": 1.3923540965234527e-06, + "loss": 0.0166, + "num_tokens": 46065395.0, + "reward": 0.3366297990083694, + "reward_std": 0.14250589827133808, + "rewards/code_reward/mean": 0.23662979124492267, + "rewards/code_reward/std": 0.14250590050360187, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 756.5875, + "completions/max_terminated_length": 663.425, + "completions/mean_length": 501.0203125, + "completions/mean_terminated_length": 489.0866073608398, + "completions/min_length": 363.675, + "completions/min_terminated_length": 363.675, + "epoch": 0.14551560095144817, + "grad_norm": 0.6420872754773821, + "kl": 0.03084869384765625, + "learning_rate": 1.3456921199715669e-06, + "loss": 0.0183, + "num_tokens": 46769624.0, + "reward": 0.274929376039654, + "reward_std": 0.14380120979622008, + "rewards/code_reward/mean": 0.17508561803842895, + "rewards/code_reward/std": 0.1433592700981535, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.975, + "completions/max_terminated_length": 662.975, + "completions/mean_length": 503.740625, + "completions/mean_terminated_length": 503.740625, + "completions/min_length": 379.2, + "completions/min_terminated_length": 379.2, + "epoch": 0.14775430250454735, + "grad_norm": 0.5635596946152118, + "kl": 0.0294403076171875, + "learning_rate": 1.2991806157316646e-06, + "loss": 0.0095, + "num_tokens": 47486962.0, + "reward": 0.2972354737110436, + "reward_std": 0.11910657306143549, + "rewards/code_reward/mean": 0.19739172172703548, + "rewards/code_reward/std": 0.11866463308397215, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.7625, + "completions/max_terminated_length": 714.7625, + "completions/mean_length": 533.565625, + "completions/mean_terminated_length": 533.565625, + "completions/min_length": 388.1125, + "completions/min_terminated_length": 388.1125, + "epoch": 0.14999300405764657, + "grad_norm": 0.60933960917403, + "kl": 0.02783966064453125, + "learning_rate": 1.2528649391904927e-06, + "loss": 0.0078, + "num_tokens": 48202916.0, + "reward": 0.2663810454308987, + "reward_std": 0.13506472197477706, + "rewards/code_reward/mean": 0.1665372904652031, + "rewards/code_reward/std": 0.13477403752622194, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.2125, + "completions/max_terminated_length": 735.2125, + "completions/mean_length": 552.1953125, + "completions/mean_terminated_length": 552.1953125, + "completions/min_length": 409.1875, + "completions/min_terminated_length": 409.1875, + "epoch": 0.15223170561074575, + "grad_norm": 0.49791400131307495, + "kl": 0.025357818603515624, + "learning_rate": 1.2067902547747076e-06, + "loss": 0.0164, + "num_tokens": 48932801.0, + "reward": 0.3229883606545627, + "reward_std": 0.1690987061272608, + "rewards/code_reward/mean": 0.2231446014760877, + "rewards/code_reward/std": 0.16865676557354164, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.15, + "completions/max_terminated_length": 669.15, + "completions/mean_length": 518.7078125, + "completions/mean_terminated_length": 518.7078125, + "completions/min_length": 389.6, + "completions/min_terminated_length": 389.6, + "epoch": 0.15447040716384497, + "grad_norm": 0.5738371722180043, + "kl": 0.02738189697265625, + "learning_rate": 1.1610014919090847e-06, + "loss": 0.0011, + "num_tokens": 49618094.0, + "reward": 0.36557651134207847, + "reward_std": 0.1583593948977068, + "rewards/code_reward/mean": 0.2655765014962526, + "rewards/code_reward/std": 0.15835939861135556, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.775, + "completions/max_terminated_length": 763.775, + "completions/mean_length": 575.278125, + "completions/mean_terminated_length": 575.278125, + "completions/min_length": 430.725, + "completions/min_terminated_length": 430.725, + "epoch": 0.15670910871694418, + "grad_norm": 0.5330527785978358, + "kl": 0.02547607421875, + "learning_rate": 1.1155433012038849e-06, + "loss": 0.013, + "num_tokens": 50367344.0, + "reward": 0.3111037847585976, + "reward_std": 0.1396631282143062, + "rewards/code_reward/mean": 0.21110378042503725, + "rewards/code_reward/std": 0.1396631306008203, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.2, + "completions/max_terminated_length": 784.2, + "completions/mean_length": 581.1390625, + "completions/mean_terminated_length": 581.1390625, + "completions/min_length": 428.975, + "completions/min_terminated_length": 428.975, + "epoch": 0.15894781027004337, + "grad_norm": 0.5161816982734899, + "kl": 0.0270263671875, + "learning_rate": 1.0704600109141044e-06, + "loss": 0.0081, + "num_tokens": 51121985.0, + "reward": 0.2939129492267966, + "reward_std": 0.13785594400105766, + "rewards/code_reward/mean": 0.19406919270550133, + "rewards/code_reward/std": 0.1376118804764701, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.9875, + "completions/max_terminated_length": 759.9875, + "completions/mean_length": 552.328125, + "completions/mean_terminated_length": 552.328125, + "completions/min_length": 402.8875, + "completions/min_terminated_length": 402.8875, + "epoch": 0.1611865118231426, + "grad_norm": 0.6003534969440923, + "kl": 0.026979827880859376, + "learning_rate": 1.0257955837130725e-06, + "loss": 0.0035, + "num_tokens": 51844651.0, + "reward": 0.28144540255889294, + "reward_std": 0.12947248641576153, + "rewards/code_reward/mean": 0.18144539590430214, + "rewards/code_reward/std": 0.1294724913313985, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.025, + "completions/max_terminated_length": 743.025, + "completions/mean_length": 557.615625, + "completions/mean_terminated_length": 557.615625, + "completions/min_length": 414.6, + "completions/min_terminated_length": 414.6, + "epoch": 0.16342521337624177, + "grad_norm": 0.5589064006858112, + "kl": 0.026873779296875, + "learning_rate": 9.815935738225377e-07, + "loss": 0.0076, + "num_tokens": 52581373.0, + "reward": 0.31030982043594124, + "reward_std": 0.14550057554297383, + "rewards/code_reward/mean": 0.21030981277799582, + "rewards/code_reward/std": 0.14550057782616932, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.1625, + "completions/max_terminated_length": 781.1625, + "completions/mean_length": 594.5203125, + "completions/mean_terminated_length": 594.5203125, + "completions/min_length": 453.1125, + "completions/min_terminated_length": 453.1125, + "epoch": 0.165663914929341, + "grad_norm": 0.5310916132708259, + "kl": 0.02626190185546875, + "learning_rate": 9.378970845410571e-07, + "loss": 0.0095, + "num_tokens": 53352410.0, + "reward": 0.2810199284926057, + "reward_std": 0.1396817062428454, + "rewards/code_reward/mean": 0.18101992065639932, + "rewards/code_reward/std": 0.13968170815496705, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.95, + "completions/max_terminated_length": 730.95, + "completions/mean_length": 549.3125, + "completions/mean_terminated_length": 549.3125, + "completions/min_length": 412.425, + "completions/min_terminated_length": 412.425, + "epoch": 0.16790261648244018, + "grad_norm": 0.5669991229498123, + "kl": 0.026529693603515626, + "learning_rate": 8.947487262120971e-07, + "loss": 0.0094, + "num_tokens": 54086442.0, + "reward": 0.2867281662300229, + "reward_std": 0.12997563436510973, + "rewards/code_reward/mean": 0.1867281592771178, + "rewards/code_reward/std": 0.1299756362393964, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.9125, + "completions/max_terminated_length": 746.9125, + "completions/mean_length": 568.5546875, + "completions/mean_terminated_length": 568.5546875, + "completions/min_length": 416.3125, + "completions/min_terminated_length": 416.3125, + "epoch": 0.1701413180355394, + "grad_norm": 0.520216538993176, + "kl": 0.02662353515625, + "learning_rate": 8.521905746728408e-07, + "loss": 0.0137, + "num_tokens": 54836845.0, + "reward": 0.3280904936604202, + "reward_std": 0.13382616126909852, + "rewards/code_reward/mean": 0.22809048727212938, + "rewards/code_reward/std": 0.13382616304443218, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.6, + "completions/max_terminated_length": 705.6, + "completions/mean_length": 541.746875, + "completions/mean_terminated_length": 541.746875, + "completions/min_length": 402.8875, + "completions/min_terminated_length": 402.8875, + "epoch": 0.17238001958863858, + "grad_norm": 0.5872381716257123, + "kl": 0.02609405517578125, + "learning_rate": 8.102641302242105e-07, + "loss": 0.015, + "num_tokens": 55553251.0, + "reward": 0.3441149082966149, + "reward_std": 0.18714927716646343, + "rewards/code_reward/mean": 0.24411489552585408, + "rewards/code_reward/std": 0.18714928096160294, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0875, + "completions/max_terminated_length": 689.0875, + "completions/mean_length": 521.6296875, + "completions/mean_terminated_length": 521.6296875, + "completions/min_length": 382.25, + "completions/min_terminated_length": 382.25, + "epoch": 0.1746187211417378, + "grad_norm": 0.6105825047365391, + "kl": 0.02542877197265625, + "learning_rate": 7.690102771621219e-07, + "loss": 0.0134, + "num_tokens": 56255086.0, + "reward": 0.35199374333024025, + "reward_std": 0.1669875715917442, + "rewards/code_reward/mean": 0.25199373266659675, + "rewards/code_reward/std": 0.16698757499689237, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.6125, + "completions/max_terminated_length": 702.6125, + "completions/mean_length": 547.8109375, + "completions/mean_terminated_length": 547.8109375, + "completions/min_length": 407.5, + "completions/min_terminated_length": 407.5, + "epoch": 0.176857422694837, + "grad_norm": 0.4867528744361068, + "kl": 0.02476806640625, + "learning_rate": 7.284692439094368e-07, + "loss": 0.0058, + "num_tokens": 56994181.0, + "reward": 0.3013323726132512, + "reward_std": 0.15418729400844314, + "rewards/code_reward/mean": 0.20133236556430348, + "rewards/code_reward/std": 0.15418729329830966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.2875, + "completions/max_terminated_length": 716.2875, + "completions/mean_length": 523.3640625, + "completions/mean_terminated_length": 523.3640625, + "completions/min_length": 390.775, + "completions/min_terminated_length": 390.775, + "epoch": 0.1790961242479362, + "grad_norm": 0.5350752422212481, + "kl": 0.025472259521484374, + "learning_rate": 6.886805637874772e-07, + "loss": 0.0033, + "num_tokens": 57711366.0, + "reward": 0.3107692304067314, + "reward_std": 0.1176008581998758, + "rewards/code_reward/mean": 0.21076922266220208, + "rewards/code_reward/std": 0.11760085919813719, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.1125, + "completions/max_terminated_length": 708.1125, + "completions/mean_length": 536.55, + "completions/mean_terminated_length": 536.55, + "completions/min_length": 393.9625, + "completions/min_terminated_length": 393.9625, + "epoch": 0.1813348258010354, + "grad_norm": 0.5886059510700545, + "kl": 0.02571868896484375, + "learning_rate": 6.496830364653691e-07, + "loss": 0.0107, + "num_tokens": 58433174.0, + "reward": 0.29278530003502967, + "reward_std": 0.14287365710479208, + "rewards/code_reward/mean": 0.19278529447619802, + "rewards/code_reward/std": 0.14287365918862632, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 555.8109375, + "completions/mean_terminated_length": 555.8109375, + "completions/min_length": 411.6375, + "completions/min_terminated_length": 411.6375, + "epoch": 0.1835735273541346, + "grad_norm": 0.5723292784440707, + "kl": 0.02495880126953125, + "learning_rate": 6.115146901248015e-07, + "loss": 0.0128, + "num_tokens": 59179325.0, + "reward": 0.2888775954954326, + "reward_std": 0.13932973612099886, + "rewards/code_reward/mean": 0.18903384153090882, + "rewards/code_reward/std": 0.13905893911141903, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.7, + "completions/max_terminated_length": 738.7, + "completions/mean_length": 555.56875, + "completions/mean_terminated_length": 555.56875, + "completions/min_length": 405.975, + "completions/min_terminated_length": 405.975, + "epoch": 0.1858122289072338, + "grad_norm": 0.5951785187855096, + "kl": 0.024257659912109375, + "learning_rate": 5.742127443770959e-07, + "loss": -0.0082, + "num_tokens": 59914129.0, + "reward": 0.32972582541406154, + "reward_std": 0.17325325938872993, + "rewards/code_reward/mean": 0.22988206883310341, + "rewards/code_reward/std": 0.17343256894964726, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.075, + "completions/max_terminated_length": 789.075, + "completions/mean_length": 567.95625, + "completions/mean_terminated_length": 567.95625, + "completions/min_length": 411.5125, + "completions/min_terminated_length": 411.5125, + "epoch": 0.188050930460333, + "grad_norm": 0.5882458355124174, + "kl": 0.025067138671875, + "learning_rate": 5.378135739687457e-07, + "loss": 0.011, + "num_tokens": 60679605.0, + "reward": 0.3126169110648334, + "reward_std": 0.15262282044568565, + "rewards/code_reward/mean": 0.2126169038747321, + "rewards/code_reward/std": 0.15262282300391233, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.6, + "completions/max_terminated_length": 743.6, + "completions/mean_length": 563.934375, + "completions/mean_terminated_length": 563.934375, + "completions/min_length": 407.6, + "completions/min_terminated_length": 407.6, + "epoch": 0.19028963201343221, + "grad_norm": 0.5844514039485758, + "kl": 0.0232269287109375, + "learning_rate": 5.023526733108258e-07, + "loss": 0.0058, + "num_tokens": 61442035.0, + "reward": 0.28377067698165775, + "reward_std": 0.14047583957435564, + "rewards/code_reward/mean": 0.18392692334891764, + "rewards/code_reward/std": 0.1400338972482132, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.925, + "completions/max_terminated_length": 706.925, + "completions/mean_length": 529.33125, + "completions/mean_terminated_length": 529.33125, + "completions/min_length": 382.5125, + "completions/min_terminated_length": 382.5125, + "epoch": 0.1925283335665314, + "grad_norm": 0.6068829854257141, + "kl": 0.024706268310546876, + "learning_rate": 4.6786462186684726e-07, + "loss": 0.0148, + "num_tokens": 62163871.0, + "reward": 0.3680115182884037, + "reward_std": 0.15169981086510234, + "rewards/code_reward/mean": 0.26801150970277376, + "rewards/code_reward/std": 0.1516998124890961, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 537.5578125, + "completions/mean_terminated_length": 537.5578125, + "completions/min_length": 382.8875, + "completions/min_terminated_length": 382.8875, + "epoch": 0.19476703511963062, + "grad_norm": 0.6072526841260757, + "kl": 0.02366943359375, + "learning_rate": 4.3438305043282314e-07, + "loss": 0.0105, + "num_tokens": 62868964.0, + "reward": 0.288625252712518, + "reward_std": 0.14189330035296735, + "rewards/code_reward/mean": 0.1886252475058427, + "rewards/code_reward/std": 0.14189330387162044, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 849.4625, + "completions/max_terminated_length": 760.275, + "completions/mean_length": 566.0203125, + "completions/mean_terminated_length": 554.5283485412598, + "completions/min_length": 385.2875, + "completions/min_terminated_length": 385.2875, + "epoch": 0.1970057366727298, + "grad_norm": 0.5544203482669213, + "kl": 0.02349853515625, + "learning_rate": 4.019406083424222e-07, + "loss": 0.024, + "num_tokens": 63645545.0, + "reward": 0.2873677465133369, + "reward_std": 0.1426866902038455, + "rewards/code_reward/mean": 0.18768023860175162, + "rewards/code_reward/std": 0.14217363530769944, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.725, + "completions/max_terminated_length": 696.725, + "completions/mean_length": 515.0796875, + "completions/mean_terminated_length": 515.0796875, + "completions/min_length": 375.0375, + "completions/min_terminated_length": 375.0375, + "epoch": 0.19924443822582902, + "grad_norm": 0.5972791947559509, + "kl": 0.02529144287109375, + "learning_rate": 3.7056893162918063e-07, + "loss": 0.0201, + "num_tokens": 64322420.0, + "reward": 0.3194495734758675, + "reward_std": 0.1723791634547524, + "rewards/code_reward/mean": 0.2194495657022344, + "rewards/code_reward/std": 0.17237916672602296, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.55, + "completions/max_terminated_length": 713.55, + "completions/mean_length": 530.815625, + "completions/mean_terminated_length": 530.815625, + "completions/min_length": 378.8375, + "completions/min_terminated_length": 378.8375, + "epoch": 0.20148313977892823, + "grad_norm": 0.4622077467737105, + "kl": 0.0240142822265625, + "learning_rate": 3.4029861217683744e-07, + "loss": 0.0039, + "num_tokens": 65055550.0, + "reward": 0.288273274153471, + "reward_std": 0.13295620558201335, + "rewards/code_reward/mean": 0.1882732652418781, + "rewards/code_reward/std": 0.13295620674616657, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.5, + "completions/max_terminated_length": 823.5, + "completions/mean_length": 554.6484375, + "completions/mean_terminated_length": 554.6484375, + "completions/min_length": 385.95, + "completions/min_terminated_length": 385.95, + "epoch": 0.20372184133202742, + "grad_norm": 0.5779355252222167, + "kl": 0.0229461669921875, + "learning_rate": 3.111591678878596e-07, + "loss": 0.0175, + "num_tokens": 65784213.0, + "reward": 0.2769805608317256, + "reward_std": 0.1467181654064916, + "rewards/code_reward/mean": 0.17698055310174823, + "rewards/code_reward/std": 0.14671816679183394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.2, + "completions/max_terminated_length": 701.2, + "completions/mean_length": 516.5953125, + "completions/mean_terminated_length": 516.5953125, + "completions/min_length": 376.525, + "completions/min_terminated_length": 376.525, + "epoch": 0.20596054288512664, + "grad_norm": 0.7260007034342328, + "kl": 0.02352447509765625, + "learning_rate": 2.831790138992526e-07, + "loss": 0.0016, + "num_tokens": 66491018.0, + "reward": 0.2927206911146641, + "reward_std": 0.1309030485805124, + "rewards/code_reward/mean": 0.19272068199061324, + "rewards/code_reward/std": 0.1309030512755271, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.7125, + "completions/max_terminated_length": 642.7125, + "completions/mean_length": 490.215625, + "completions/mean_terminated_length": 490.215625, + "completions/min_length": 364.1375, + "completions/min_terminated_length": 364.1375, + "epoch": 0.20819924443822582, + "grad_norm": 0.594813196893333, + "kl": 0.024704742431640624, + "learning_rate": 2.563854348737275e-07, + "loss": 0.0158, + "num_tokens": 67154060.0, + "reward": 0.3452305795624852, + "reward_std": 0.1497524828504538, + "rewards/code_reward/mean": 0.24523057123151376, + "rewards/code_reward/std": 0.14975248328992166, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.1125, + "completions/max_terminated_length": 698.1125, + "completions/mean_length": 521.234375, + "completions/mean_terminated_length": 521.234375, + "completions/min_length": 375.2875, + "completions/min_terminated_length": 375.2875, + "epoch": 0.21043794599132504, + "grad_norm": 0.5197145225969613, + "kl": 0.0245391845703125, + "learning_rate": 2.3080455839324343e-07, + "loss": 0.0051, + "num_tokens": 67889866.0, + "reward": 0.28930564858019353, + "reward_std": 0.13884065752499736, + "rewards/code_reward/mean": 0.18961814382928424, + "rewards/code_reward/std": 0.13826202357886358, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.005786375701427459, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.2625, + "completions/max_terminated_length": 676.2625, + "completions/mean_length": 497.1359375, + "completions/mean_terminated_length": 497.1359375, + "completions/min_length": 347.875, + "completions/min_terminated_length": 347.875, + "epoch": 0.21267664754442422, + "grad_norm": 0.6912907179100062, + "kl": 0.024478912353515625, + "learning_rate": 2.064613294808664e-07, + "loss": 0.0116, + "num_tokens": 68564793.0, + "reward": 0.36915110973641274, + "reward_std": 0.15182709340006112, + "rewards/code_reward/mean": 0.26946360208967235, + "rewards/code_reward/std": 0.15149775308091193, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.4625, + "completions/max_terminated_length": 691.4625, + "completions/mean_length": 522.4828125, + "completions/mean_terminated_length": 522.4828125, + "completions/min_length": 379.275, + "completions/min_terminated_length": 379.275, + "epoch": 0.21491534909752344, + "grad_norm": 0.5419894781125532, + "kl": 0.022618865966796874, + "learning_rate": 1.83379486275794e-07, + "loss": 0.0007, + "num_tokens": 69262638.0, + "reward": 0.3051586433313787, + "reward_std": 0.12916497962432913, + "rewards/code_reward/mean": 0.20515863316832111, + "rewards/code_reward/std": 0.12916498319245875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.775, + "completions/max_terminated_length": 686.775, + "completions/mean_length": 508.7515625, + "completions/mean_terminated_length": 508.7515625, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.21715405065062263, + "grad_norm": 0.688891040637323, + "kl": 0.02315826416015625, + "learning_rate": 1.6158153688526895e-07, + "loss": 0.0091, + "num_tokens": 69978223.0, + "reward": 0.3311784929595888, + "reward_std": 0.17100559230602813, + "rewards/code_reward/mean": 0.2311784830279066, + "rewards/code_reward/std": 0.17100559424143286, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.6375, + "completions/max_terminated_length": 693.6375, + "completions/mean_length": 522.1390625, + "completions/mean_terminated_length": 522.1390625, + "completions/min_length": 380.6875, + "completions/min_terminated_length": 380.6875, + "epoch": 0.21939275220372184, + "grad_norm": 0.5956108322738943, + "kl": 0.0235626220703125, + "learning_rate": 1.4108873743594274e-07, + "loss": 0.0124, + "num_tokens": 70730304.0, + "reward": 0.2989473403431475, + "reward_std": 0.13880361177725717, + "rewards/code_reward/mean": 0.19894733218825422, + "rewards/code_reward/std": 0.13880361234769226, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.5375, + "completions/max_terminated_length": 683.5375, + "completions/mean_length": 519.221875, + "completions/mean_terminated_length": 519.221875, + "completions/min_length": 386.7125, + "completions/min_terminated_length": 386.7125, + "epoch": 0.22163145375682106, + "grad_norm": 0.5427250334330507, + "kl": 0.023455810546875, + "learning_rate": 1.2192107134610586e-07, + "loss": 0.0135, + "num_tokens": 71448214.0, + "reward": 0.29871292021125556, + "reward_std": 0.12881716500851326, + "rewards/code_reward/mean": 0.19886916641116842, + "rewards/code_reward/std": 0.1288392253103666, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.8, + "completions/max_terminated_length": 625.8, + "completions/mean_length": 472.728125, + "completions/mean_terminated_length": 472.728125, + "completions/min_length": 342.7375, + "completions/min_terminated_length": 342.7375, + "epoch": 0.22387015530992024, + "grad_norm": 0.6108848009428748, + "kl": 0.02464752197265625, + "learning_rate": 1.0409722983898928e-07, + "loss": 0.0093, + "num_tokens": 72117280.0, + "reward": 0.39794372050091625, + "reward_std": 0.1893569786072476, + "rewards/code_reward/mean": 0.29794371249881807, + "rewards/code_reward/std": 0.18935698276618496, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.15, + "completions/max_terminated_length": 665.15, + "completions/mean_length": 497.05625, + "completions/mean_terminated_length": 497.05625, + "completions/min_length": 354.125, + "completions/min_terminated_length": 354.125, + "epoch": 0.22610885686301946, + "grad_norm": 0.5589824240842507, + "kl": 0.0255523681640625, + "learning_rate": 8.763459371614036e-08, + "loss": 0.0183, + "num_tokens": 72815756.0, + "reward": 0.2931746931746602, + "reward_std": 0.15111528622946935, + "rewards/code_reward/mean": 0.19333093738896423, + "rewards/code_reward/std": 0.15067334883497097, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.7375, + "completions/max_terminated_length": 708.7375, + "completions/mean_length": 525.8828125, + "completions/mean_terminated_length": 525.8828125, + "completions/min_length": 383.925, + "completions/min_terminated_length": 383.925, + "epoch": 0.22834755841611865, + "grad_norm": 0.5657630399821236, + "kl": 0.023580169677734374, + "learning_rate": 7.254921640864954e-08, + "loss": 0.005, + "num_tokens": 73527777.0, + "reward": 0.29336653435602783, + "reward_std": 0.15301572528260293, + "rewards/code_reward/mean": 0.19352277733851225, + "rewards/code_reward/std": 0.15257378248206804, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.025, + "completions/max_terminated_length": 744.025, + "completions/mean_length": 538.8359375, + "completions/mean_terminated_length": 538.8359375, + "completions/min_length": 391.7625, + "completions/min_terminated_length": 391.7625, + "epoch": 0.23058625996921786, + "grad_norm": 0.5974194655481591, + "kl": 0.02305145263671875, + "learning_rate": 5.885580832275245e-08, + "loss": 0.0084, + "num_tokens": 74267080.0, + "reward": 0.2840338280424476, + "reward_std": 0.1604262540466152, + "rewards/code_reward/mean": 0.1840338213412906, + "rewards/code_reward/std": 0.16042625640693586, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.9375, + "completions/max_terminated_length": 747.9375, + "completions/mean_length": 543.165625, + "completions/mean_terminated_length": 543.165625, + "completions/min_length": 395.525, + "completions/min_terminated_length": 395.525, + "epoch": 0.23282496152231705, + "grad_norm": 0.6721925638851708, + "kl": 0.023305511474609374, + "learning_rate": 4.6567722495074685e-08, + "loss": 0.0021, + "num_tokens": 75032546.0, + "reward": 0.26900712195783855, + "reward_std": 0.15734463239787147, + "rewards/code_reward/mean": 0.16900711600319482, + "rewards/code_reward/std": 0.1573446374386549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.1875, + "completions/max_terminated_length": 688.1875, + "completions/mean_length": 528.38125, + "completions/mean_terminated_length": 528.38125, + "completions/min_length": 385.1, + "completions/min_terminated_length": 385.1, + "epoch": 0.23506366307541626, + "grad_norm": 0.47513348986208354, + "kl": 0.023612213134765626, + "learning_rate": 3.5696941571505434e-08, + "loss": 0.0069, + "num_tokens": 75779806.0, + "reward": 0.2989699838683009, + "reward_std": 0.144676909170812, + "rewards/code_reward/mean": 0.19896997831820046, + "rewards/code_reward/std": 0.14467690934252458, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.4625, + "completions/max_terminated_length": 678.4625, + "completions/mean_length": 507.25625, + "completions/mean_terminated_length": 507.25625, + "completions/min_length": 360.5125, + "completions/min_terminated_length": 360.5125, + "epoch": 0.23730236462851545, + "grad_norm": 0.5125208061016464, + "kl": 0.02255859375, + "learning_rate": 2.625406612240039e-08, + "loss": 0.006, + "num_tokens": 76477890.0, + "reward": 0.3240066308528185, + "reward_std": 0.16057187110418453, + "rewards/code_reward/mean": 0.22400662462459878, + "rewards/code_reward/std": 0.16057187146507204, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.75, + "completions/max_terminated_length": 747.75, + "completions/mean_length": 534.534375, + "completions/mean_terminated_length": 534.534375, + "completions/min_length": 385.9125, + "completions/min_terminated_length": 385.9125, + "epoch": 0.23954106618161466, + "grad_norm": 0.4891371425966553, + "kl": 0.02330169677734375, + "learning_rate": 1.8248304305504505e-08, + "loss": 0.0196, + "num_tokens": 77209744.0, + "reward": 0.333328259550035, + "reward_std": 0.14479399558040312, + "rewards/code_reward/mean": 0.23332825346733443, + "rewards/code_reward/std": 0.1447939975943882, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.125, + "completions/max_terminated_length": 667.125, + "completions/mean_length": 501.2875, + "completions/mean_terminated_length": 501.2875, + "completions/min_length": 358.6, + "completions/min_terminated_length": 358.6, + "epoch": 0.24177976773471388, + "grad_norm": 0.5700891782095214, + "kl": 0.02592926025390625, + "learning_rate": 1.1687462886677713e-08, + "loss": 0.006, + "num_tokens": 77919416.0, + "reward": 0.313872685469687, + "reward_std": 0.1551548853807617, + "rewards/code_reward/mean": 0.2140289287781343, + "rewards/code_reward/std": 0.15488540646038018, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0375, + "completions/max_terminated_length": 657.0375, + "completions/mean_length": 498.54375, + "completions/mean_terminated_length": 498.54375, + "completions/min_length": 368.6625, + "completions/min_terminated_length": 368.6625, + "epoch": 0.24401846928781307, + "grad_norm": 0.6409927990786405, + "kl": 0.02302703857421875, + "learning_rate": 6.577939627179785e-09, + "loss": 0.0125, + "num_tokens": 78597028.0, + "reward": 0.3173367108218372, + "reward_std": 0.16166887313302142, + "rewards/code_reward/mean": 0.21764920413697836, + "rewards/code_reward/std": 0.16100556787860115, + "rewards/format_reward/mean": 0.996875, + "rewards/format_reward/std": 0.00883883461356163, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.375, + "completions/max_terminated_length": 687.375, + "completions/mean_length": 507.9859375, + "completions/mean_terminated_length": 507.9859375, + "completions/min_length": 373.4375, + "completions/min_terminated_length": 373.4375, + "epoch": 0.24625717084091228, + "grad_norm": 0.5411596737613947, + "kl": 0.024321746826171876, + "learning_rate": 2.9247170449338e-09, + "loss": 0.005, + "num_tokens": 79308787.0, + "reward": 0.3536563721485436, + "reward_std": 0.12869162768765818, + "rewards/code_reward/mean": 0.2538126138912048, + "rewards/code_reward/std": 0.1283905382733792, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.004419417306780815, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.4875, + "completions/max_terminated_length": 674.4875, + "completions/mean_length": 508.71875, + "completions/mean_terminated_length": 508.71875, + "completions/min_length": 376.7, + "completions/min_terminated_length": 376.7, + "epoch": 0.24849587239401147, + "grad_norm": 0.6381506844335124, + "kl": 0.022603607177734374, + "learning_rate": 7.313575558583474e-10, + "loss": 0.0068, + "num_tokens": 79983935.0, + "reward": 0.3423418626189232, + "reward_std": 0.13657438448863105, + "rewards/code_reward/mean": 0.24234185529057867, + "rewards/code_reward/std": 0.1365743855072651, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.59375, + "completions/max_terminated_length": 651.59375, + "completions/mean_length": 491.193359375, + "completions/mean_terminated_length": 491.193359375, + "completions/min_length": 356.890625, + "completions/min_terminated_length": 356.890625, + "epoch": 0.25028683363649085, + "kl": 0.023431777954101562, + "num_tokens": 80543474.0, + "reward": 0.3967649736441672, + "reward_std": 0.1777252904503257, + "rewards/code_reward/mean": 0.2967649649071973, + "rewards/code_reward/std": 0.17772529531794135, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 559, + "total_flos": 0.0, + "train_loss": 0.001293145966497858, + "train_runtime": 17459.8588, + "train_samples_per_second": 0.512, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 5, + "max_steps": 559, + "num_input_tokens_seen": 80543474, + "num_train_epochs": 1, + "save_steps": 25, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}