| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.25028683363649085, |
| "eval_steps": 500, |
| "global_step": 559, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.5625, |
| "completions/max_terminated_length": 714.5625, |
| "completions/mean_length": 534.09375, |
| "completions/mean_terminated_length": 534.09375, |
| "completions/min_length": 398.375, |
| "completions/min_terminated_length": 398.375, |
| "epoch": 0.00044774031061984047, |
| "grad_norm": 1.0911544979292094, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0164, |
| "num_tokens": 143396.0, |
| "reward": 0.1745322283823043, |
| "reward_std": 0.14398040855303407, |
| "rewards/code_reward/mean": 0.10812597409676528, |
| "rewards/code_reward/std": 0.11770913819782436, |
| "rewards/format_reward/mean": 0.6640625, |
| "rewards/format_reward/std": 0.44056092016398907, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 782.5, |
| "completions/max_terminated_length": 782.5, |
| "completions/mean_length": 584.078125, |
| "completions/mean_terminated_length": 584.078125, |
| "completions/min_length": 428.625, |
| "completions/min_terminated_length": 428.625, |
| "epoch": 0.0022387015530992023, |
| "grad_norm": 1.0592214128916255, |
| "kl": 0.00044733285903930664, |
| "learning_rate": 2.1428571428571428e-07, |
| "loss": 0.0004, |
| "num_tokens": 772676.0, |
| "reward": 0.15631713026959915, |
| "reward_std": 0.14780386447091587, |
| "rewards/code_reward/mean": 0.09889525244216202, |
| "rewards/code_reward/std": 0.12754268431308446, |
| "rewards/format_reward/mean": 0.57421875, |
| "rewards/format_reward/std": 0.42564064590260386, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 743.5625, |
| "completions/max_terminated_length": 743.5625, |
| "completions/mean_length": 566.3671875, |
| "completions/mean_terminated_length": 566.3671875, |
| "completions/min_length": 412.7625, |
| "completions/min_terminated_length": 412.7625, |
| "epoch": 0.004477403106198405, |
| "grad_norm": 0.8586902730302105, |
| "kl": 0.0006687402725219727, |
| "learning_rate": 4.821428571428572e-07, |
| "loss": 0.02, |
| "num_tokens": 1514535.0, |
| "reward": 0.21928292746888473, |
| "reward_std": 0.17851990209892393, |
| "rewards/code_reward/mean": 0.15240792171971407, |
| "rewards/code_reward/std": 0.15891524556500372, |
| "rewards/format_reward/mean": 0.66875, |
| "rewards/format_reward/std": 0.42151433378458025, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 796.05, |
| "completions/max_terminated_length": 796.05, |
| "completions/mean_length": 587.6890625, |
| "completions/mean_terminated_length": 587.6890625, |
| "completions/min_length": 413.575, |
| "completions/min_terminated_length": 413.575, |
| "epoch": 0.0067161046592976075, |
| "grad_norm": 0.6576078448734007, |
| "kl": 0.002119898796081543, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0262, |
| "num_tokens": 2322384.0, |
| "reward": 0.19728650886099786, |
| "reward_std": 0.153143038158305, |
| "rewards/code_reward/mean": 0.11244275536737405, |
| "rewards/code_reward/std": 0.13675388206611389, |
| "rewards/format_reward/mean": 0.8484375, |
| "rewards/format_reward/std": 0.2670775193721056, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 736.6, |
| "completions/max_terminated_length": 736.6, |
| "completions/mean_length": 539.6390625, |
| "completions/mean_terminated_length": 539.6390625, |
| "completions/min_length": 379.7375, |
| "completions/min_terminated_length": 379.7375, |
| "epoch": 0.00895480621239681, |
| "grad_norm": 0.7320311360828152, |
| "kl": 0.002538633346557617, |
| "learning_rate": 1.017857142857143e-06, |
| "loss": 0.0079, |
| "num_tokens": 3037089.0, |
| "reward": 0.21442170465597882, |
| "reward_std": 0.14227938583353533, |
| "rewards/code_reward/mean": 0.12176544930553064, |
| "rewards/code_reward/std": 0.13433333449356724, |
| "rewards/format_reward/mean": 0.9265625, |
| "rewards/format_reward/std": 0.156092469394207, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 770.625, |
| "completions/max_terminated_length": 770.625, |
| "completions/mean_length": 554.2328125, |
| "completions/mean_terminated_length": 554.2328125, |
| "completions/min_length": 395.7, |
| "completions/min_terminated_length": 395.7, |
| "epoch": 0.011193507765496012, |
| "grad_norm": 0.5737573580410903, |
| "kl": 0.00330963134765625, |
| "learning_rate": 1.2857142857142856e-06, |
| "loss": 0.0233, |
| "num_tokens": 3787614.0, |
| "reward": 0.22332688504830003, |
| "reward_std": 0.11517863497429062, |
| "rewards/code_reward/mean": 0.12520187861009618, |
| "rewards/code_reward/std": 0.11075652101717423, |
| "rewards/format_reward/mean": 0.98125, |
| "rewards/format_reward/std": 0.04998054876923561, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.775, |
| "completions/max_terminated_length": 696.775, |
| "completions/mean_length": 527.1390625, |
| "completions/mean_terminated_length": 527.1390625, |
| "completions/min_length": 375.6, |
| "completions/min_terminated_length": 375.6, |
| "epoch": 0.013432209318595215, |
| "grad_norm": 0.6412924324023244, |
| "kl": 0.004455375671386719, |
| "learning_rate": 1.5535714285714287e-06, |
| "loss": 0.0292, |
| "num_tokens": 4536623.0, |
| "reward": 0.2232258369214833, |
| "reward_std": 0.13004211404477245, |
| "rewards/code_reward/mean": 0.12400708374771056, |
| "rewards/code_reward/std": 0.12888794834143483, |
| "rewards/format_reward/mean": 0.9921875, |
| "rewards/format_reward/std": 0.022097086533904076, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 688.475, |
| "completions/max_terminated_length": 688.475, |
| "completions/mean_length": 493.4390625, |
| "completions/mean_terminated_length": 493.4390625, |
| "completions/min_length": 351.2875, |
| "completions/min_terminated_length": 351.2875, |
| "epoch": 0.015670910871694418, |
| "grad_norm": 0.4441774535858199, |
| "kl": 0.005760383605957031, |
| "learning_rate": 1.8214285714285714e-06, |
| "loss": 0.0183, |
| "num_tokens": 5238216.0, |
| "reward": 0.23461700212210418, |
| "reward_std": 0.13695308727037628, |
| "rewards/code_reward/mean": 0.1349294964238652, |
| "rewards/code_reward/std": 0.13671803568140603, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 815.8875, |
| "completions/max_terminated_length": 623.275, |
| "completions/mean_length": 488.790625, |
| "completions/mean_terminated_length": 464.4939735412598, |
| "completions/min_length": 338.1875, |
| "completions/min_terminated_length": 338.1875, |
| "epoch": 0.01790961242479362, |
| "grad_norm": 0.7054891224478806, |
| "kl": 0.007607078552246094, |
| "learning_rate": 2.089285714285714e-06, |
| "loss": 0.0369, |
| "num_tokens": 5931842.0, |
| "reward": 0.2295066607184708, |
| "reward_std": 0.13071401379711461, |
| "rewards/code_reward/mean": 0.12997540423093595, |
| "rewards/code_reward/std": 0.1293881902238354, |
| "rewards/format_reward/mean": 0.9953125, |
| "rewards/format_reward/std": 0.013258251920342445, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 595.2375, |
| "completions/max_terminated_length": 595.2375, |
| "completions/mean_length": 450.5140625, |
| "completions/mean_terminated_length": 450.5140625, |
| "completions/min_length": 334.7125, |
| "completions/min_terminated_length": 334.7125, |
| "epoch": 0.020148313977892823, |
| "grad_norm": 0.7270226911269254, |
| "kl": 0.008572006225585937, |
| "learning_rate": 2.357142857142857e-06, |
| "loss": 0.0023, |
| "num_tokens": 6579707.0, |
| "reward": 0.29843434747308495, |
| "reward_std": 0.13938394124270417, |
| "rewards/code_reward/mean": 0.19905934149210225, |
| "rewards/code_reward/std": 0.13823813095805235, |
| "rewards/format_reward/mean": 0.99375, |
| "rewards/format_reward/std": 0.01767766922712326, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 771.7375, |
| "completions/max_terminated_length": 676.5875, |
| "completions/mean_length": 519.5140625, |
| "completions/mean_terminated_length": 507.42098236083984, |
| "completions/min_length": 371.625, |
| "completions/min_terminated_length": 371.625, |
| "epoch": 0.022387015530992024, |
| "grad_norm": 0.8864298285641923, |
| "kl": 0.009959030151367187, |
| "learning_rate": 2.6250000000000003e-06, |
| "loss": 0.0247, |
| "num_tokens": 7287164.0, |
| "reward": 0.24341339743696153, |
| "reward_std": 0.13661439061979763, |
| "rewards/code_reward/mean": 0.14700714359642006, |
| "rewards/code_reward/std": 0.13423215872608124, |
| "rewards/format_reward/mean": 0.9640625, |
| "rewards/format_reward/std": 0.06212893389165401, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 683.3625, |
| "completions/max_terminated_length": 683.3625, |
| "completions/mean_length": 516.046875, |
| "completions/mean_terminated_length": 516.046875, |
| "completions/min_length": 386.925, |
| "completions/min_terminated_length": 386.925, |
| "epoch": 0.024625717084091225, |
| "grad_norm": 0.6382131264055037, |
| "kl": 0.008373641967773437, |
| "learning_rate": 2.892857142857143e-06, |
| "loss": 0.021, |
| "num_tokens": 7971626.0, |
| "reward": 0.3139894030056894, |
| "reward_std": 0.15021034325327492, |
| "rewards/code_reward/mean": 0.21695814684353537, |
| "rewards/code_reward/std": 0.14836436581681484, |
| "rewards/format_reward/mean": 0.9703125, |
| "rewards/format_reward/std": 0.046608568355441096, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 661.7125, |
| "completions/max_terminated_length": 661.7125, |
| "completions/mean_length": 510.025, |
| "completions/mean_terminated_length": 510.025, |
| "completions/min_length": 382.3125, |
| "completions/min_terminated_length": 382.3125, |
| "epoch": 0.02686441863719043, |
| "grad_norm": 0.7435002543363699, |
| "kl": 0.009385299682617188, |
| "learning_rate": 2.9997366975852433e-06, |
| "loss": 0.0148, |
| "num_tokens": 8701666.0, |
| "reward": 0.24593741996213794, |
| "reward_std": 0.12826487933343741, |
| "rewards/code_reward/mean": 0.1460936620060238, |
| "rewards/code_reward/std": 0.1278229385818122, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 694.2875, |
| "completions/max_terminated_length": 694.2875, |
| "completions/mean_length": 517.9140625, |
| "completions/mean_terminated_length": 517.9140625, |
| "completions/min_length": 369.8125, |
| "completions/min_terminated_length": 369.8125, |
| "epoch": 0.02910312019028963, |
| "grad_norm": 0.5685331001634877, |
| "kl": 0.012218093872070313, |
| "learning_rate": 2.9981279620139177e-06, |
| "loss": 0.0053, |
| "num_tokens": 9438523.0, |
| "reward": 0.24741017883643507, |
| "reward_std": 0.13050166001776234, |
| "rewards/code_reward/mean": 0.1474101732033887, |
| "rewards/code_reward/std": 0.13050166381872258, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 780.825, |
| "completions/max_terminated_length": 780.825, |
| "completions/mean_length": 535.19375, |
| "completions/mean_terminated_length": 535.19375, |
| "completions/min_length": 392.3875, |
| "completions/min_terminated_length": 392.3875, |
| "epoch": 0.031341821743388835, |
| "grad_norm": 0.5336222407251643, |
| "kl": 0.0148834228515625, |
| "learning_rate": 2.9950583368363777e-06, |
| "loss": 0.007, |
| "num_tokens": 10157391.0, |
| "reward": 0.296136565413326, |
| "reward_std": 0.16640246821043547, |
| "rewards/code_reward/mean": 0.19644906022003852, |
| "rewards/code_reward/std": 0.1658487796317786, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 797.0375, |
| "completions/max_terminated_length": 797.0375, |
| "completions/mean_length": 589.8359375, |
| "completions/mean_terminated_length": 589.8359375, |
| "completions/min_length": 426.275, |
| "completions/min_terminated_length": 426.275, |
| "epoch": 0.033580523296488037, |
| "grad_norm": 0.6369858920854246, |
| "kl": 0.017626190185546876, |
| "learning_rate": 2.990530815377378e-06, |
| "loss": 0.0087, |
| "num_tokens": 10930742.0, |
| "reward": 0.27690047658979894, |
| "reward_std": 0.12926372148940574, |
| "rewards/code_reward/mean": 0.177994220439723, |
| "rewards/code_reward/std": 0.12826473288878332, |
| "rewards/format_reward/mean": 0.9890625, |
| "rewards/format_reward/std": 0.027883462235331537, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 758.7125, |
| "completions/max_terminated_length": 758.7125, |
| "completions/mean_length": 559.50625, |
| "completions/mean_terminated_length": 559.50625, |
| "completions/min_length": 398.6, |
| "completions/min_terminated_length": 398.6, |
| "epoch": 0.03581922484958724, |
| "grad_norm": 0.7033361512162839, |
| "kl": 0.01629638671875, |
| "learning_rate": 2.984549812619624e-06, |
| "loss": -0.0033, |
| "num_tokens": 11662834.0, |
| "reward": 0.2631410426460207, |
| "reward_std": 0.11843040494713933, |
| "rewards/code_reward/mean": 0.16329728582059033, |
| "rewards/code_reward/std": 0.11832479977165349, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 671.2875, |
| "completions/max_terminated_length": 671.2875, |
| "completions/mean_length": 507.8875, |
| "completions/mean_terminated_length": 507.8875, |
| "completions/min_length": 377.5625, |
| "completions/min_terminated_length": 377.5625, |
| "epoch": 0.03805792640268644, |
| "grad_norm": 0.5966458269379716, |
| "kl": 0.0168060302734375, |
| "learning_rate": 2.9771211608985266e-06, |
| "loss": 0.0047, |
| "num_tokens": 12352234.0, |
| "reward": 0.32661316031590104, |
| "reward_std": 0.1419034074380761, |
| "rewards/code_reward/mean": 0.2267694047826808, |
| "rewards/code_reward/std": 0.14197679209755734, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 664.3375, |
| "completions/max_terminated_length": 664.3375, |
| "completions/mean_length": 502.846875, |
| "completions/mean_terminated_length": 502.846875, |
| "completions/min_length": 376.3875, |
| "completions/min_terminated_length": 376.3875, |
| "epoch": 0.04029662795578565, |
| "grad_norm": 0.6882916695583966, |
| "kl": 0.017774200439453124, |
| "learning_rate": 2.968252104214841e-06, |
| "loss": 0.0162, |
| "num_tokens": 13055856.0, |
| "reward": 0.26416925797238944, |
| "reward_std": 0.15208177534805145, |
| "rewards/code_reward/mean": 0.16432550169847673, |
| "rewards/code_reward/std": 0.1518084899900714, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 717.9125, |
| "completions/max_terminated_length": 717.9125, |
| "completions/mean_length": 529.171875, |
| "completions/mean_terminated_length": 529.171875, |
| "completions/min_length": 389.075, |
| "completions/min_terminated_length": 389.075, |
| "epoch": 0.04253532950888485, |
| "grad_norm": 0.5867793943695734, |
| "kl": 0.01979522705078125, |
| "learning_rate": 2.9579512911707257e-06, |
| "loss": 0.012, |
| "num_tokens": 13781566.0, |
| "reward": 0.29845606358721855, |
| "reward_std": 0.14189217127859594, |
| "rewards/code_reward/mean": 0.19845605657319537, |
| "rewards/code_reward/std": 0.1418921749223955, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 750.525, |
| "completions/max_terminated_length": 750.525, |
| "completions/mean_length": 518.6890625, |
| "completions/mean_terminated_length": 518.6890625, |
| "completions/min_length": 371.575, |
| "completions/min_terminated_length": 371.575, |
| "epoch": 0.04477403106198405, |
| "grad_norm": 0.6756173457255675, |
| "kl": 0.023876953125, |
| "learning_rate": 2.9462287665361157e-06, |
| "loss": 0.017, |
| "num_tokens": 14508775.0, |
| "reward": 0.2731386865489185, |
| "reward_std": 0.1473583393584704, |
| "rewards/code_reward/mean": 0.17345118119992547, |
| "rewards/code_reward/std": 0.14723392758751289, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 700.8, |
| "completions/max_terminated_length": 700.8, |
| "completions/mean_length": 474.2953125, |
| "completions/mean_terminated_length": 474.2953125, |
| "completions/min_length": 339.4, |
| "completions/min_terminated_length": 339.4, |
| "epoch": 0.04701273261508325, |
| "grad_norm": 0.6213587460739984, |
| "kl": 0.027069091796875, |
| "learning_rate": 2.9330959614536314e-06, |
| "loss": 0.016, |
| "num_tokens": 15178396.0, |
| "reward": 0.29834548365324737, |
| "reward_std": 0.13777082363376394, |
| "rewards/code_reward/mean": 0.19834547787031626, |
| "rewards/code_reward/std": 0.1377708253567107, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 652.6625, |
| "completions/max_terminated_length": 652.6625, |
| "completions/mean_length": 483.6328125, |
| "completions/mean_terminated_length": 483.6328125, |
| "completions/min_length": 346.775, |
| "completions/min_terminated_length": 346.775, |
| "epoch": 0.04925143416818245, |
| "grad_norm": 0.6361080911543711, |
| "kl": 0.02613983154296875, |
| "learning_rate": 2.9185656822915747e-06, |
| "loss": -0.0057, |
| "num_tokens": 15867273.0, |
| "reward": 0.29470919668674467, |
| "reward_std": 0.12798963281093165, |
| "rewards/code_reward/mean": 0.19486543894308853, |
| "rewards/code_reward/std": 0.12776418880966958, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.1375, |
| "completions/max_terminated_length": 741.1375, |
| "completions/mean_length": 536.115625, |
| "completions/mean_terminated_length": 536.115625, |
| "completions/min_length": 393.3125, |
| "completions/min_terminated_length": 393.3125, |
| "epoch": 0.05149013572128166, |
| "grad_norm": 0.6555109923730857, |
| "kl": 0.0231597900390625, |
| "learning_rate": 2.9026520981558844e-06, |
| "loss": 0.009, |
| "num_tokens": 16604459.0, |
| "reward": 0.2888658272102475, |
| "reward_std": 0.15465332815947477, |
| "rewards/code_reward/mean": 0.1888658216179465, |
| "rewards/code_reward/std": 0.15465332991443576, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 690.875, |
| "completions/max_terminated_length": 690.875, |
| "completions/mean_length": 532.3109375, |
| "completions/mean_terminated_length": 532.3109375, |
| "completions/min_length": 403.2, |
| "completions/min_terminated_length": 403.2, |
| "epoch": 0.05372883727438086, |
| "grad_norm": 0.6650346931830148, |
| "kl": 0.024253082275390626, |
| "learning_rate": 2.8853707270732253e-06, |
| "loss": 0.0132, |
| "num_tokens": 17335906.0, |
| "reward": 0.35356655940413473, |
| "reward_std": 0.18680918092140927, |
| "rewards/code_reward/mean": 0.2537228013883578, |
| "rewards/code_reward/std": 0.18692450551316142, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 694.6625, |
| "completions/max_terminated_length": 694.6625, |
| "completions/mean_length": 517.6953125, |
| "completions/mean_terminated_length": 517.6953125, |
| "completions/min_length": 385.7, |
| "completions/min_terminated_length": 385.7, |
| "epoch": 0.05596753882748006, |
| "grad_norm": 0.6254170069058008, |
| "kl": 0.025794219970703126, |
| "learning_rate": 2.8667384208586865e-06, |
| "loss": 0.0043, |
| "num_tokens": 18058943.0, |
| "reward": 0.3556826992891729, |
| "reward_std": 0.1363969652389642, |
| "rewards/code_reward/mean": 0.25583894047886135, |
| "rewards/code_reward/std": 0.1366761433542706, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 706.0375, |
| "completions/max_terminated_length": 706.0375, |
| "completions/mean_length": 543.7203125, |
| "completions/mean_terminated_length": 543.7203125, |
| "completions/min_length": 405.2875, |
| "completions/min_terminated_length": 405.2875, |
| "epoch": 0.05820624038057926, |
| "grad_norm": 0.6216908171231574, |
| "kl": 0.02597503662109375, |
| "learning_rate": 2.846773348682845e-06, |
| "loss": 0.0007, |
| "num_tokens": 18775148.0, |
| "reward": 0.2654763679020107, |
| "reward_std": 0.13124802198726684, |
| "rewards/code_reward/mean": 0.1659451116924174, |
| "rewards/code_reward/std": 0.13088461093138903, |
| "rewards/format_reward/mean": 0.9953125, |
| "rewards/format_reward/std": 0.013258251920342445, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.7125, |
| "completions/max_terminated_length": 732.7125, |
| "completions/mean_length": 550.0796875, |
| "completions/mean_terminated_length": 550.0796875, |
| "completions/min_length": 407.625, |
| "completions/min_terminated_length": 407.625, |
| "epoch": 0.06044494193367847, |
| "grad_norm": 0.6438445755693917, |
| "kl": 0.02695159912109375, |
| "learning_rate": 2.8254949793542194e-06, |
| "loss": 0.0133, |
| "num_tokens": 19516591.0, |
| "reward": 0.30453283004462717, |
| "reward_std": 0.15741582050104624, |
| "rewards/code_reward/mean": 0.20453282294183736, |
| "rewards/code_reward/std": 0.15741582473565358, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 706.0, |
| "completions/max_terminated_length": 706.0, |
| "completions/mean_length": 532.575, |
| "completions/mean_terminated_length": 532.575, |
| "completions/min_length": 389.0625, |
| "completions/min_terminated_length": 389.0625, |
| "epoch": 0.06268364348677767, |
| "grad_norm": 0.6680496022315153, |
| "kl": 0.02960357666015625, |
| "learning_rate": 2.802924062334391e-06, |
| "loss": 0.0146, |
| "num_tokens": 20241207.0, |
| "reward": 0.3066130679100752, |
| "reward_std": 0.18769313739612697, |
| "rewards/code_reward/mean": 0.2072380588942906, |
| "rewards/code_reward/std": 0.18655281127139461, |
| "rewards/format_reward/mean": 0.99375, |
| "rewards/format_reward/std": 0.01767766922712326, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 711.9125, |
| "completions/max_terminated_length": 711.9125, |
| "completions/mean_length": 507.396875, |
| "completions/mean_terminated_length": 507.396875, |
| "completions/min_length": 354.5, |
| "completions/min_terminated_length": 354.5, |
| "epoch": 0.06492234503987687, |
| "grad_norm": 0.6756739470558975, |
| "kl": 0.02831573486328125, |
| "learning_rate": 2.779082607504298e-06, |
| "loss": 0.015, |
| "num_tokens": 20963517.0, |
| "reward": 0.28101985761895776, |
| "reward_std": 0.17780419969349168, |
| "rewards/code_reward/mean": 0.1811761005956214, |
| "rewards/code_reward/std": 0.17785799705889077, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 606.0625, |
| "completions/max_terminated_length": 606.0625, |
| "completions/mean_length": 448.28125, |
| "completions/mean_terminated_length": 448.28125, |
| "completions/min_length": 322.1375, |
| "completions/min_terminated_length": 322.1375, |
| "epoch": 0.06716104659297607, |
| "grad_norm": 0.5719295032736604, |
| "kl": 0.02752532958984375, |
| "learning_rate": 2.7539938637014514e-06, |
| "loss": 0.0092, |
| "num_tokens": 21610025.0, |
| "reward": 0.3294339914806187, |
| "reward_std": 0.15150615764432587, |
| "rewards/code_reward/mean": 0.22959023197181522, |
| "rewards/code_reward/std": 0.15142173281637952, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 658.65, |
| "completions/max_terminated_length": 658.65, |
| "completions/mean_length": 498.0640625, |
| "completions/mean_terminated_length": 498.0640625, |
| "completions/min_length": 366.95, |
| "completions/min_terminated_length": 366.95, |
| "epoch": 0.06939974814607527, |
| "grad_norm": 0.5384545916898208, |
| "kl": 0.02587127685546875, |
| "learning_rate": 2.7276822960489817e-06, |
| "loss": -0.0011, |
| "num_tokens": 22304426.0, |
| "reward": 0.3431210536509752, |
| "reward_std": 0.15553151002968663, |
| "rewards/code_reward/mean": 0.24312104810524032, |
| "rewards/code_reward/std": 0.1555315111123491, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 765.3125, |
| "completions/max_terminated_length": 765.3125, |
| "completions/mean_length": 542.303125, |
| "completions/mean_terminated_length": 542.303125, |
| "completions/min_length": 394.875, |
| "completions/min_terminated_length": 394.875, |
| "epoch": 0.07163844969917448, |
| "grad_norm": 0.5684225568583741, |
| "kl": 0.030213165283203124, |
| "learning_rate": 2.7001735620986323e-06, |
| "loss": 0.0162, |
| "num_tokens": 23031900.0, |
| "reward": 0.29204714838415385, |
| "reward_std": 0.15333203882328234, |
| "rewards/code_reward/mean": 0.19235964192193933, |
| "rewards/code_reward/std": 0.15339574370882475, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 842.8375, |
| "completions/max_terminated_length": 842.8375, |
| "completions/mean_length": 574.2078125, |
| "completions/mean_terminated_length": 574.2078125, |
| "completions/min_length": 428.4375, |
| "completions/min_terminated_length": 428.4375, |
| "epoch": 0.07387715125227368, |
| "grad_norm": 0.5435945556590033, |
| "kl": 0.027813720703125, |
| "learning_rate": 2.671494486810974e-06, |
| "loss": 0.0106, |
| "num_tokens": 23789657.0, |
| "reward": 0.3045080302283168, |
| "reward_std": 0.16393477989186067, |
| "rewards/code_reward/mean": 0.20466427168576046, |
| "rewards/code_reward/std": 0.1636599010293139, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 743.1375, |
| "completions/max_terminated_length": 743.1375, |
| "completions/mean_length": 544.6453125, |
| "completions/mean_terminated_length": 544.6453125, |
| "completions/min_length": 400.9375, |
| "completions/min_terminated_length": 400.9375, |
| "epoch": 0.07611585280537288, |
| "grad_norm": 0.5551210527840703, |
| "kl": 0.03048858642578125, |
| "learning_rate": 2.641673036397215e-06, |
| "loss": 0.0108, |
| "num_tokens": 24537942.0, |
| "reward": 0.2919698000885546, |
| "reward_std": 0.14443947067193222, |
| "rewards/code_reward/mean": 0.19228229282743997, |
| "rewards/code_reward/std": 0.1438393424032256, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 700.1125, |
| "completions/max_terminated_length": 700.1125, |
| "completions/mean_length": 538.8515625, |
| "completions/mean_terminated_length": 538.8515625, |
| "completions/min_length": 408.6, |
| "completions/min_terminated_length": 408.6, |
| "epoch": 0.07835455435847209, |
| "grad_norm": 0.6528644572698393, |
| "kl": 0.028961181640625, |
| "learning_rate": 2.610738291048138e-06, |
| "loss": 0.0133, |
| "num_tokens": 25267431.0, |
| "reward": 0.274235178809613, |
| "reward_std": 0.1538910755480174, |
| "rewards/code_reward/mean": 0.17439142313669437, |
| "rewards/code_reward/std": 0.15372212599031626, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 697.1375, |
| "completions/max_terminated_length": 697.1375, |
| "completions/mean_length": 517.7078125, |
| "completions/mean_terminated_length": 517.7078125, |
| "completions/min_length": 373.8875, |
| "completions/min_terminated_length": 373.8875, |
| "epoch": 0.0805932559115713, |
| "grad_norm": 0.5356003023929052, |
| "kl": 0.027515411376953125, |
| "learning_rate": 2.5787204165767413e-06, |
| "loss": 0.0123, |
| "num_tokens": 26025444.0, |
| "reward": 0.31410480896010995, |
| "reward_std": 0.17551766034448518, |
| "rewards/code_reward/mean": 0.21410479900659993, |
| "rewards/code_reward/std": 0.17551766115357167, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 627.9375, |
| "completions/max_terminated_length": 627.9375, |
| "completions/mean_length": 466.7078125, |
| "completions/mean_terminated_length": 466.7078125, |
| "completions/min_length": 336.625, |
| "completions/min_terminated_length": 336.625, |
| "epoch": 0.0828319574646705, |
| "grad_norm": 0.5875291034180061, |
| "kl": 0.03062591552734375, |
| "learning_rate": 2.545650635002249e-06, |
| "loss": 0.014, |
| "num_tokens": 26715345.0, |
| "reward": 0.3225731427781284, |
| "reward_std": 0.14460668399697169, |
| "rewards/code_reward/mean": 0.22288563377878745, |
| "rewards/code_reward/std": 0.1446731591859134, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 645.05, |
| "completions/max_terminated_length": 645.05, |
| "completions/mean_length": 468.11875, |
| "completions/mean_terminated_length": 468.11875, |
| "completions/min_length": 320.5625, |
| "completions/min_terminated_length": 320.5625, |
| "epoch": 0.0850706590177697, |
| "grad_norm": 0.5981227815110649, |
| "kl": 0.03143310546875, |
| "learning_rate": 2.511561194104161e-06, |
| "loss": 0.0158, |
| "num_tokens": 27388005.0, |
| "reward": 0.30132306115701796, |
| "reward_std": 0.11532193489110795, |
| "rewards/code_reward/mean": 0.20147930511957385, |
| "rewards/code_reward/std": 0.11487999467644841, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 694.2375, |
| "completions/max_terminated_length": 694.2375, |
| "completions/mean_length": 508.18125, |
| "completions/mean_terminated_length": 508.18125, |
| "completions/min_length": 354.7, |
| "completions/min_terminated_length": 354.7, |
| "epoch": 0.0873093605708689, |
| "grad_norm": 0.7051969480561995, |
| "kl": 0.030487060546875, |
| "learning_rate": 2.4764853359760447e-06, |
| "loss": 0.0074, |
| "num_tokens": 28089689.0, |
| "reward": 0.2780560509301722, |
| "reward_std": 0.13229238498024642, |
| "rewards/code_reward/mean": 0.17805604453606066, |
| "rewards/code_reward/std": 0.13229238652565983, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 679.9625, |
| "completions/max_terminated_length": 679.9625, |
| "completions/mean_length": 510.11875, |
| "completions/mean_terminated_length": 510.11875, |
| "completions/min_length": 362.825, |
| "completions/min_terminated_length": 362.825, |
| "epoch": 0.0895480621239681, |
| "grad_norm": 0.5512771391532328, |
| "kl": 0.02972869873046875, |
| "learning_rate": 2.440457264609727e-06, |
| "loss": 0.0022, |
| "num_tokens": 28787549.0, |
| "reward": 0.2989016550593078, |
| "reward_std": 0.15942465648986398, |
| "rewards/code_reward/mean": 0.1989016504448955, |
| "rewards/code_reward/std": 0.15942465687403456, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 742.85, |
| "completions/max_terminated_length": 742.85, |
| "completions/mean_length": 550.4421875, |
| "completions/mean_terminated_length": 550.4421875, |
| "completions/min_length": 397.275, |
| "completions/min_terminated_length": 397.275, |
| "epoch": 0.0917867636770673, |
| "grad_norm": 0.6115605511607163, |
| "kl": 0.02950439453125, |
| "learning_rate": 2.403512112541498e-06, |
| "loss": 0.0262, |
| "num_tokens": 29531328.0, |
| "reward": 0.3011234959587455, |
| "reward_std": 0.13739942002575845, |
| "rewards/code_reward/mean": 0.20127973848429975, |
| "rewards/code_reward/std": 0.13699884270899929, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 738.225, |
| "completions/max_terminated_length": 738.225, |
| "completions/mean_length": 538.3640625, |
| "completions/mean_terminated_length": 538.3640625, |
| "completions/min_length": 388.85, |
| "completions/min_terminated_length": 388.85, |
| "epoch": 0.0940254652301665, |
| "grad_norm": 0.6180896800134059, |
| "kl": 0.02983551025390625, |
| "learning_rate": 2.365685906592846e-06, |
| "loss": 0.013, |
| "num_tokens": 30274617.0, |
| "reward": 0.28743315050378443, |
| "reward_std": 0.14888401252392214, |
| "rewards/code_reward/mean": 0.18743314441671827, |
| "rewards/code_reward/std": 0.14888401648786384, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 657.8125, |
| "completions/max_terminated_length": 657.8125, |
| "completions/mean_length": 508.53125, |
| "completions/mean_terminated_length": 508.53125, |
| "completions/min_length": 375.2625, |
| "completions/min_terminated_length": 375.2625, |
| "epoch": 0.0962641667832657, |
| "grad_norm": 0.5149353831339782, |
| "kl": 0.0354248046875, |
| "learning_rate": 2.327015532739145e-06, |
| "loss": -0.0035, |
| "num_tokens": 30968253.0, |
| "reward": 0.3200162294320762, |
| "reward_std": 0.16002128778782207, |
| "rewards/code_reward/mean": 0.22001622177049285, |
| "rewards/code_reward/std": 0.16002129036933183, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 693.8375, |
| "completions/max_terminated_length": 693.8375, |
| "completions/mean_length": 516.459375, |
| "completions/mean_terminated_length": 516.459375, |
| "completions/min_length": 385.2, |
| "completions/min_terminated_length": 385.2, |
| "epoch": 0.0985028683363649, |
| "grad_norm": 0.583768547911125, |
| "kl": 0.032080078125, |
| "learning_rate": 2.2875387001405366e-06, |
| "loss": -0.0004, |
| "num_tokens": 31677939.0, |
| "reward": 0.2827278276905417, |
| "reward_std": 0.12490762829547748, |
| "rewards/code_reward/mean": 0.182884071078297, |
| "rewards/code_reward/std": 0.12475912600348238, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 689.225, |
| "completions/max_terminated_length": 689.225, |
| "completions/mean_length": 511.4609375, |
| "completions/mean_terminated_length": 511.4609375, |
| "completions/min_length": 375.7, |
| "completions/min_terminated_length": 375.7, |
| "epoch": 0.10074156988946412, |
| "grad_norm": 0.47416592884978, |
| "kl": 0.03255615234375, |
| "learning_rate": 2.2472939043700894e-06, |
| "loss": 0.0104, |
| "num_tokens": 32366802.0, |
| "reward": 0.288489468768239, |
| "reward_std": 0.14980540352989918, |
| "rewards/code_reward/mean": 0.18880196339305258, |
| "rewards/code_reward/std": 0.14945577481121292, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 709.1375, |
| "completions/max_terminated_length": 709.1375, |
| "completions/mean_length": 537.6703125, |
| "completions/mean_terminated_length": 537.6703125, |
| "completions/min_length": 400.325, |
| "completions/min_terminated_length": 400.325, |
| "epoch": 0.10298027144256332, |
| "grad_norm": 0.6526599784556473, |
| "kl": 0.031103515625, |
| "learning_rate": 2.206320389875099e-06, |
| "loss": 0.0004, |
| "num_tokens": 33092199.0, |
| "reward": 0.27060003159567714, |
| "reward_std": 0.14649803503416478, |
| "rewards/code_reward/mean": 0.1706000213016523, |
| "rewards/code_reward/std": 0.14649803435604553, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 688.975, |
| "completions/max_terminated_length": 688.975, |
| "completions/mean_length": 537.1703125, |
| "completions/mean_terminated_length": 537.1703125, |
| "completions/min_length": 413.1, |
| "completions/min_terminated_length": 413.1, |
| "epoch": 0.10521897299566252, |
| "grad_norm": 0.578479817853106, |
| "kl": 0.031402587890625, |
| "learning_rate": 2.1646581117081187e-06, |
| "loss": 0.0118, |
| "num_tokens": 33813252.0, |
| "reward": 0.24227329418063165, |
| "reward_std": 0.14281967077986338, |
| "rewards/code_reward/mean": 0.1422732870618347, |
| "rewards/code_reward/std": 0.1428196722699795, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 681.5375, |
| "completions/max_terminated_length": 681.5375, |
| "completions/mean_length": 533.6359375, |
| "completions/mean_terminated_length": 533.6359375, |
| "completions/min_length": 407.2125, |
| "completions/min_terminated_length": 407.2125, |
| "epoch": 0.10745767454876172, |
| "grad_norm": 0.6265187392839973, |
| "kl": 0.03284759521484375, |
| "learning_rate": 2.122347696565059e-06, |
| "loss": 0.0139, |
| "num_tokens": 34549147.0, |
| "reward": 0.33532751044258474, |
| "reward_std": 0.1622638524393551, |
| "rewards/code_reward/mean": 0.2353275064189802, |
| "rewards/code_reward/std": 0.1622638531640405, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 703.1125, |
| "completions/max_terminated_length": 703.1125, |
| "completions/mean_length": 537.425, |
| "completions/mean_terminated_length": 537.425, |
| "completions/min_length": 406.0, |
| "completions/min_terminated_length": 406.0, |
| "epoch": 0.10969637610186092, |
| "grad_norm": 0.6224509400429641, |
| "kl": 0.0321990966796875, |
| "learning_rate": 2.079430403168327e-06, |
| "loss": 0.0205, |
| "num_tokens": 35271579.0, |
| "reward": 0.3003238163888454, |
| "reward_std": 0.17288763520191425, |
| "rewards/code_reward/mean": 0.20032380691118307, |
| "rewards/code_reward/std": 0.1728876391222002, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 647.8375, |
| "completions/max_terminated_length": 647.8375, |
| "completions/mean_length": 501.9875, |
| "completions/mean_terminated_length": 501.9875, |
| "completions/min_length": 376.6375, |
| "completions/min_terminated_length": 376.6375, |
| "epoch": 0.11193507765496012, |
| "grad_norm": 0.6263781800971838, |
| "kl": 0.03351898193359375, |
| "learning_rate": 2.0359480820336594e-06, |
| "loss": 0.0094, |
| "num_tokens": 35965555.0, |
| "reward": 0.31694198679178953, |
| "reward_std": 0.1596899228548864, |
| "rewards/code_reward/mean": 0.2170982286144863, |
| "rewards/code_reward/std": 0.15924798299674875, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 636.8625, |
| "completions/max_terminated_length": 636.8625, |
| "completions/mean_length": 492.7109375, |
| "completions/mean_terminated_length": 492.7109375, |
| "completions/min_length": 373.775, |
| "completions/min_terminated_length": 373.775, |
| "epoch": 0.11417377920805932, |
| "grad_norm": 0.6373686980037819, |
| "kl": 0.0332763671875, |
| "learning_rate": 1.9919431346598687e-06, |
| "loss": 0.0146, |
| "num_tokens": 36669402.0, |
| "reward": 0.30089313965290787, |
| "reward_std": 0.1563536574365571, |
| "rewards/code_reward/mean": 0.20089313458884134, |
| "rewards/code_reward/std": 0.15635365938651374, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 649.975, |
| "completions/max_terminated_length": 649.975, |
| "completions/mean_length": 497.025, |
| "completions/mean_terminated_length": 497.025, |
| "completions/min_length": 369.1625, |
| "completions/min_terminated_length": 369.1625, |
| "epoch": 0.11641248076115852, |
| "grad_norm": 0.6124161391568931, |
| "kl": 0.03218841552734375, |
| "learning_rate": 1.947458472181296e-06, |
| "loss": 0.0024, |
| "num_tokens": 37365858.0, |
| "reward": 0.31037036776542665, |
| "reward_std": 0.15006352393247652, |
| "rewards/code_reward/mean": 0.21037036021152744, |
| "rewards/code_reward/std": 0.15006352449127008, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 666.975, |
| "completions/max_terminated_length": 666.975, |
| "completions/mean_length": 511.603125, |
| "completions/mean_terminated_length": 511.603125, |
| "completions/min_length": 381.725, |
| "completions/min_terminated_length": 381.725, |
| "epoch": 0.11865118231425772, |
| "grad_norm": 0.5345035896498757, |
| "kl": 0.0315277099609375, |
| "learning_rate": 1.9025374735233068e-06, |
| "loss": 0.0154, |
| "num_tokens": 38086620.0, |
| "reward": 0.32326241619884966, |
| "reward_std": 0.14852707152604125, |
| "rewards/code_reward/mean": 0.2234186581481481, |
| "rewards/code_reward/std": 0.14887304982403293, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 715.9125, |
| "completions/max_terminated_length": 715.9125, |
| "completions/mean_length": 527.903125, |
| "completions/mean_terminated_length": 527.903125, |
| "completions/min_length": 398.2875, |
| "completions/min_terminated_length": 398.2875, |
| "epoch": 0.12088988386735694, |
| "grad_norm": 0.573684423194082, |
| "kl": 0.0300811767578125, |
| "learning_rate": 1.8572239431016146e-06, |
| "loss": 0.0126, |
| "num_tokens": 38809214.0, |
| "reward": 0.2911208848468959, |
| "reward_std": 0.13888704897253773, |
| "rewards/code_reward/mean": 0.19127712811168748, |
| "rewards/code_reward/std": 0.13906100282329134, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 685.725, |
| "completions/max_terminated_length": 685.725, |
| "completions/mean_length": 533.9171875, |
| "completions/mean_terminated_length": 533.9171875, |
| "completions/min_length": 391.1375, |
| "completions/min_terminated_length": 391.1375, |
| "epoch": 0.12312858542045614, |
| "grad_norm": 0.4830812794073296, |
| "kl": 0.03022613525390625, |
| "learning_rate": 1.8115620681066946e-06, |
| "loss": 0.0069, |
| "num_tokens": 39531329.0, |
| "reward": 0.37973827524110676, |
| "reward_std": 0.17492547728470526, |
| "rewards/code_reward/mean": 0.2798945170710795, |
| "rewards/code_reward/std": 0.1747873265412636, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 764.4625, |
| "completions/max_terminated_length": 764.4625, |
| "completions/mean_length": 555.675, |
| "completions/mean_terminated_length": 555.675, |
| "completions/min_length": 406.3, |
| "completions/min_terminated_length": 406.3, |
| "epoch": 0.12536728697355534, |
| "grad_norm": 0.48053859411140093, |
| "kl": 0.02889251708984375, |
| "learning_rate": 1.765596375414936e-06, |
| "loss": 0.0177, |
| "num_tokens": 40297449.0, |
| "reward": 0.26671807700768113, |
| "reward_std": 0.14942678074003196, |
| "rewards/code_reward/mean": 0.1671868214616552, |
| "rewards/code_reward/std": 0.14887573684682137, |
| "rewards/format_reward/mean": 0.9953125, |
| "rewards/format_reward/std": 0.013258251920342445, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.825, |
| "completions/max_terminated_length": 669.825, |
| "completions/mean_length": 511.8296875, |
| "completions/mean_terminated_length": 511.8296875, |
| "completions/min_length": 369.2625, |
| "completions/min_terminated_length": 369.2625, |
| "epoch": 0.12760598852665453, |
| "grad_norm": 0.5012923928076685, |
| "kl": 0.03134918212890625, |
| "learning_rate": 1.7193716881685532e-06, |
| "loss": 0.0171, |
| "num_tokens": 41000340.0, |
| "reward": 0.33275858471170067, |
| "reward_std": 0.16071395185717846, |
| "rewards/code_reward/mean": 0.23291482530039503, |
| "rewards/code_reward/std": 0.1606017280719243, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 857.1125, |
| "completions/max_terminated_length": 763.7375, |
| "completions/mean_length": 575.0640625, |
| "completions/mean_terminated_length": 563.0296878814697, |
| "completions/min_length": 400.15, |
| "completions/min_terminated_length": 400.15, |
| "epoch": 0.12984469007975374, |
| "grad_norm": 0.630042919145043, |
| "kl": 0.030005645751953126, |
| "learning_rate": 1.6729330820665925e-06, |
| "loss": 0.0156, |
| "num_tokens": 41754885.0, |
| "reward": 0.28822933994233607, |
| "reward_std": 0.1465720217616763, |
| "rewards/code_reward/mean": 0.18854183692019433, |
| "rewards/code_reward/std": 0.14625606250483542, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 716.25, |
| "completions/max_terminated_length": 716.25, |
| "completions/mean_length": 537.20625, |
| "completions/mean_terminated_length": 537.20625, |
| "completions/min_length": 392.175, |
| "completions/min_terminated_length": 392.175, |
| "epoch": 0.13208339163285296, |
| "grad_norm": 0.5107272382559945, |
| "kl": 0.03038330078125, |
| "learning_rate": 1.6263258414096618e-06, |
| "loss": 0.0154, |
| "num_tokens": 42470809.0, |
| "reward": 0.33072368800640106, |
| "reward_std": 0.2061192358552944, |
| "rewards/code_reward/mean": 0.23087992868968285, |
| "rewards/code_reward/std": 0.20595307812327518, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 730.0875, |
| "completions/max_terminated_length": 730.0875, |
| "completions/mean_length": 539.428125, |
| "completions/mean_terminated_length": 539.428125, |
| "completions/min_length": 383.3125, |
| "completions/min_terminated_length": 383.3125, |
| "epoch": 0.13432209318595215, |
| "grad_norm": 0.537918547301204, |
| "kl": 0.0288299560546875, |
| "learning_rate": 1.5795954149412446e-06, |
| "loss": 0.0083, |
| "num_tokens": 43193235.0, |
| "reward": 0.34142726445570587, |
| "reward_std": 0.14466436323709786, |
| "rewards/code_reward/mean": 0.24173975624726154, |
| "rewards/code_reward/std": 0.14445849329931662, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 759.3, |
| "completions/max_terminated_length": 759.3, |
| "completions/mean_length": 557.8890625, |
| "completions/mean_terminated_length": 557.8890625, |
| "completions/min_length": 399.75, |
| "completions/min_terminated_length": 399.75, |
| "epoch": 0.13656079473905136, |
| "grad_norm": 0.584098194483569, |
| "kl": 0.02829742431640625, |
| "learning_rate": 1.5327873715286555e-06, |
| "loss": 0.0094, |
| "num_tokens": 43930988.0, |
| "reward": 0.2912998185493052, |
| "reward_std": 0.1508971786039183, |
| "rewards/code_reward/mean": 0.1914560628225445, |
| "rewards/code_reward/std": 0.15074160079238935, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 703.5875, |
| "completions/max_terminated_length": 703.5875, |
| "completions/mean_length": 531.625, |
| "completions/mean_terminated_length": 531.625, |
| "completions/min_length": 398.075, |
| "completions/min_terminated_length": 398.075, |
| "epoch": 0.13879949629215055, |
| "grad_norm": 0.5610344866641752, |
| "kl": 0.029935455322265624, |
| "learning_rate": 1.4859473557268605e-06, |
| "loss": 0.0228, |
| "num_tokens": 44630804.0, |
| "reward": 0.31272673439234494, |
| "reward_std": 0.160137642340851, |
| "rewards/code_reward/mean": 0.21272672956984023, |
| "rewards/code_reward/std": 0.16013764539093245, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 754.0625, |
| "completions/max_terminated_length": 673.325, |
| "completions/mean_length": 516.14375, |
| "completions/mean_terminated_length": 504.68303604125975, |
| "completions/min_length": 370.5375, |
| "completions/min_terminated_length": 370.5375, |
| "epoch": 0.14103819784524976, |
| "grad_norm": 0.5678371154254215, |
| "kl": 0.0303436279296875, |
| "learning_rate": 1.4391210432684911e-06, |
| "loss": 0.0172, |
| "num_tokens": 45353968.0, |
| "reward": 0.30479407841339706, |
| "reward_std": 0.1592210401489865, |
| "rewards/code_reward/mean": 0.20510657107515726, |
| "rewards/code_reward/std": 0.15867348304018378, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 662.725, |
| "completions/max_terminated_length": 662.725, |
| "completions/mean_length": 500.4296875, |
| "completions/mean_terminated_length": 500.4296875, |
| "completions/min_length": 366.975, |
| "completions/min_terminated_length": 366.975, |
| "epoch": 0.14327689939834895, |
| "grad_norm": 0.6017406713534427, |
| "kl": 0.03122406005859375, |
| "learning_rate": 1.3923540965234527e-06, |
| "loss": 0.0166, |
| "num_tokens": 46065395.0, |
| "reward": 0.3366297990083694, |
| "reward_std": 0.14250589827133808, |
| "rewards/code_reward/mean": 0.23662979124492267, |
| "rewards/code_reward/std": 0.14250590050360187, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 756.5875, |
| "completions/max_terminated_length": 663.425, |
| "completions/mean_length": 501.0203125, |
| "completions/mean_terminated_length": 489.0866073608398, |
| "completions/min_length": 363.675, |
| "completions/min_terminated_length": 363.675, |
| "epoch": 0.14551560095144817, |
| "grad_norm": 0.6420872754773821, |
| "kl": 0.03084869384765625, |
| "learning_rate": 1.3456921199715669e-06, |
| "loss": 0.0183, |
| "num_tokens": 46769624.0, |
| "reward": 0.274929376039654, |
| "reward_std": 0.14380120979622008, |
| "rewards/code_reward/mean": 0.17508561803842895, |
| "rewards/code_reward/std": 0.1433592700981535, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 662.975, |
| "completions/max_terminated_length": 662.975, |
| "completions/mean_length": 503.740625, |
| "completions/mean_terminated_length": 503.740625, |
| "completions/min_length": 379.2, |
| "completions/min_terminated_length": 379.2, |
| "epoch": 0.14775430250454735, |
| "grad_norm": 0.5635596946152118, |
| "kl": 0.0294403076171875, |
| "learning_rate": 1.2991806157316646e-06, |
| "loss": 0.0095, |
| "num_tokens": 47486962.0, |
| "reward": 0.2972354737110436, |
| "reward_std": 0.11910657306143549, |
| "rewards/code_reward/mean": 0.19739172172703548, |
| "rewards/code_reward/std": 0.11866463308397215, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.7625, |
| "completions/max_terminated_length": 714.7625, |
| "completions/mean_length": 533.565625, |
| "completions/mean_terminated_length": 533.565625, |
| "completions/min_length": 388.1125, |
| "completions/min_terminated_length": 388.1125, |
| "epoch": 0.14999300405764657, |
| "grad_norm": 0.60933960917403, |
| "kl": 0.02783966064453125, |
| "learning_rate": 1.2528649391904927e-06, |
| "loss": 0.0078, |
| "num_tokens": 48202916.0, |
| "reward": 0.2663810454308987, |
| "reward_std": 0.13506472197477706, |
| "rewards/code_reward/mean": 0.1665372904652031, |
| "rewards/code_reward/std": 0.13477403752622194, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 735.2125, |
| "completions/max_terminated_length": 735.2125, |
| "completions/mean_length": 552.1953125, |
| "completions/mean_terminated_length": 552.1953125, |
| "completions/min_length": 409.1875, |
| "completions/min_terminated_length": 409.1875, |
| "epoch": 0.15223170561074575, |
| "grad_norm": 0.49791400131307495, |
| "kl": 0.025357818603515624, |
| "learning_rate": 1.2067902547747076e-06, |
| "loss": 0.0164, |
| "num_tokens": 48932801.0, |
| "reward": 0.3229883606545627, |
| "reward_std": 0.1690987061272608, |
| "rewards/code_reward/mean": 0.2231446014760877, |
| "rewards/code_reward/std": 0.16865676557354164, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.15, |
| "completions/max_terminated_length": 669.15, |
| "completions/mean_length": 518.7078125, |
| "completions/mean_terminated_length": 518.7078125, |
| "completions/min_length": 389.6, |
| "completions/min_terminated_length": 389.6, |
| "epoch": 0.15447040716384497, |
| "grad_norm": 0.5738371722180043, |
| "kl": 0.02738189697265625, |
| "learning_rate": 1.1610014919090847e-06, |
| "loss": 0.0011, |
| "num_tokens": 49618094.0, |
| "reward": 0.36557651134207847, |
| "reward_std": 0.1583593948977068, |
| "rewards/code_reward/mean": 0.2655765014962526, |
| "rewards/code_reward/std": 0.15835939861135556, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 763.775, |
| "completions/max_terminated_length": 763.775, |
| "completions/mean_length": 575.278125, |
| "completions/mean_terminated_length": 575.278125, |
| "completions/min_length": 430.725, |
| "completions/min_terminated_length": 430.725, |
| "epoch": 0.15670910871694418, |
| "grad_norm": 0.5330527785978358, |
| "kl": 0.02547607421875, |
| "learning_rate": 1.1155433012038849e-06, |
| "loss": 0.013, |
| "num_tokens": 50367344.0, |
| "reward": 0.3111037847585976, |
| "reward_std": 0.1396631282143062, |
| "rewards/code_reward/mean": 0.21110378042503725, |
| "rewards/code_reward/std": 0.1396631306008203, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 784.2, |
| "completions/max_terminated_length": 784.2, |
| "completions/mean_length": 581.1390625, |
| "completions/mean_terminated_length": 581.1390625, |
| "completions/min_length": 428.975, |
| "completions/min_terminated_length": 428.975, |
| "epoch": 0.15894781027004337, |
| "grad_norm": 0.5161816982734899, |
| "kl": 0.0270263671875, |
| "learning_rate": 1.0704600109141044e-06, |
| "loss": 0.0081, |
| "num_tokens": 51121985.0, |
| "reward": 0.2939129492267966, |
| "reward_std": 0.13785594400105766, |
| "rewards/code_reward/mean": 0.19406919270550133, |
| "rewards/code_reward/std": 0.1376118804764701, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 759.9875, |
| "completions/max_terminated_length": 759.9875, |
| "completions/mean_length": 552.328125, |
| "completions/mean_terminated_length": 552.328125, |
| "completions/min_length": 402.8875, |
| "completions/min_terminated_length": 402.8875, |
| "epoch": 0.1611865118231426, |
| "grad_norm": 0.6003534969440923, |
| "kl": 0.026979827880859376, |
| "learning_rate": 1.0257955837130725e-06, |
| "loss": 0.0035, |
| "num_tokens": 51844651.0, |
| "reward": 0.28144540255889294, |
| "reward_std": 0.12947248641576153, |
| "rewards/code_reward/mean": 0.18144539590430214, |
| "rewards/code_reward/std": 0.1294724913313985, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 743.025, |
| "completions/max_terminated_length": 743.025, |
| "completions/mean_length": 557.615625, |
| "completions/mean_terminated_length": 557.615625, |
| "completions/min_length": 414.6, |
| "completions/min_terminated_length": 414.6, |
| "epoch": 0.16342521337624177, |
| "grad_norm": 0.5589064006858112, |
| "kl": 0.026873779296875, |
| "learning_rate": 9.815935738225377e-07, |
| "loss": 0.0076, |
| "num_tokens": 52581373.0, |
| "reward": 0.31030982043594124, |
| "reward_std": 0.14550057554297383, |
| "rewards/code_reward/mean": 0.21030981277799582, |
| "rewards/code_reward/std": 0.14550057782616932, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 781.1625, |
| "completions/max_terminated_length": 781.1625, |
| "completions/mean_length": 594.5203125, |
| "completions/mean_terminated_length": 594.5203125, |
| "completions/min_length": 453.1125, |
| "completions/min_terminated_length": 453.1125, |
| "epoch": 0.165663914929341, |
| "grad_norm": 0.5310916132708259, |
| "kl": 0.02626190185546875, |
| "learning_rate": 9.378970845410571e-07, |
| "loss": 0.0095, |
| "num_tokens": 53352410.0, |
| "reward": 0.2810199284926057, |
| "reward_std": 0.1396817062428454, |
| "rewards/code_reward/mean": 0.18101992065639932, |
| "rewards/code_reward/std": 0.13968170815496705, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 730.95, |
| "completions/max_terminated_length": 730.95, |
| "completions/mean_length": 549.3125, |
| "completions/mean_terminated_length": 549.3125, |
| "completions/min_length": 412.425, |
| "completions/min_terminated_length": 412.425, |
| "epoch": 0.16790261648244018, |
| "grad_norm": 0.5669991229498123, |
| "kl": 0.026529693603515626, |
| "learning_rate": 8.947487262120971e-07, |
| "loss": 0.0094, |
| "num_tokens": 54086442.0, |
| "reward": 0.2867281662300229, |
| "reward_std": 0.12997563436510973, |
| "rewards/code_reward/mean": 0.1867281592771178, |
| "rewards/code_reward/std": 0.1299756362393964, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 746.9125, |
| "completions/max_terminated_length": 746.9125, |
| "completions/mean_length": 568.5546875, |
| "completions/mean_terminated_length": 568.5546875, |
| "completions/min_length": 416.3125, |
| "completions/min_terminated_length": 416.3125, |
| "epoch": 0.1701413180355394, |
| "grad_norm": 0.520216538993176, |
| "kl": 0.02662353515625, |
| "learning_rate": 8.521905746728408e-07, |
| "loss": 0.0137, |
| "num_tokens": 54836845.0, |
| "reward": 0.3280904936604202, |
| "reward_std": 0.13382616126909852, |
| "rewards/code_reward/mean": 0.22809048727212938, |
| "rewards/code_reward/std": 0.13382616304443218, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.6, |
| "completions/max_terminated_length": 705.6, |
| "completions/mean_length": 541.746875, |
| "completions/mean_terminated_length": 541.746875, |
| "completions/min_length": 402.8875, |
| "completions/min_terminated_length": 402.8875, |
| "epoch": 0.17238001958863858, |
| "grad_norm": 0.5872381716257123, |
| "kl": 0.02609405517578125, |
| "learning_rate": 8.102641302242105e-07, |
| "loss": 0.015, |
| "num_tokens": 55553251.0, |
| "reward": 0.3441149082966149, |
| "reward_std": 0.18714927716646343, |
| "rewards/code_reward/mean": 0.24411489552585408, |
| "rewards/code_reward/std": 0.18714928096160294, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 689.0875, |
| "completions/max_terminated_length": 689.0875, |
| "completions/mean_length": 521.6296875, |
| "completions/mean_terminated_length": 521.6296875, |
| "completions/min_length": 382.25, |
| "completions/min_terminated_length": 382.25, |
| "epoch": 0.1746187211417378, |
| "grad_norm": 0.6105825047365391, |
| "kl": 0.02542877197265625, |
| "learning_rate": 7.690102771621219e-07, |
| "loss": 0.0134, |
| "num_tokens": 56255086.0, |
| "reward": 0.35199374333024025, |
| "reward_std": 0.1669875715917442, |
| "rewards/code_reward/mean": 0.25199373266659675, |
| "rewards/code_reward/std": 0.16698757499689237, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 702.6125, |
| "completions/max_terminated_length": 702.6125, |
| "completions/mean_length": 547.8109375, |
| "completions/mean_terminated_length": 547.8109375, |
| "completions/min_length": 407.5, |
| "completions/min_terminated_length": 407.5, |
| "epoch": 0.176857422694837, |
| "grad_norm": 0.4867528744361068, |
| "kl": 0.02476806640625, |
| "learning_rate": 7.284692439094368e-07, |
| "loss": 0.0058, |
| "num_tokens": 56994181.0, |
| "reward": 0.3013323726132512, |
| "reward_std": 0.15418729400844314, |
| "rewards/code_reward/mean": 0.20133236556430348, |
| "rewards/code_reward/std": 0.15418729329830966, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 716.2875, |
| "completions/max_terminated_length": 716.2875, |
| "completions/mean_length": 523.3640625, |
| "completions/mean_terminated_length": 523.3640625, |
| "completions/min_length": 390.775, |
| "completions/min_terminated_length": 390.775, |
| "epoch": 0.1790961242479362, |
| "grad_norm": 0.5350752422212481, |
| "kl": 0.025472259521484374, |
| "learning_rate": 6.886805637874772e-07, |
| "loss": 0.0033, |
| "num_tokens": 57711366.0, |
| "reward": 0.3107692304067314, |
| "reward_std": 0.1176008581998758, |
| "rewards/code_reward/mean": 0.21076922266220208, |
| "rewards/code_reward/std": 0.11760085919813719, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 708.1125, |
| "completions/max_terminated_length": 708.1125, |
| "completions/mean_length": 536.55, |
| "completions/mean_terminated_length": 536.55, |
| "completions/min_length": 393.9625, |
| "completions/min_terminated_length": 393.9625, |
| "epoch": 0.1813348258010354, |
| "grad_norm": 0.5886059510700545, |
| "kl": 0.02571868896484375, |
| "learning_rate": 6.496830364653691e-07, |
| "loss": 0.0107, |
| "num_tokens": 58433174.0, |
| "reward": 0.29278530003502967, |
| "reward_std": 0.14287365710479208, |
| "rewards/code_reward/mean": 0.19278529447619802, |
| "rewards/code_reward/std": 0.14287365918862632, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 727.0, |
| "completions/max_terminated_length": 727.0, |
| "completions/mean_length": 555.8109375, |
| "completions/mean_terminated_length": 555.8109375, |
| "completions/min_length": 411.6375, |
| "completions/min_terminated_length": 411.6375, |
| "epoch": 0.1835735273541346, |
| "grad_norm": 0.5723292784440707, |
| "kl": 0.02495880126953125, |
| "learning_rate": 6.115146901248015e-07, |
| "loss": 0.0128, |
| "num_tokens": 59179325.0, |
| "reward": 0.2888775954954326, |
| "reward_std": 0.13932973612099886, |
| "rewards/code_reward/mean": 0.18903384153090882, |
| "rewards/code_reward/std": 0.13905893911141903, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 738.7, |
| "completions/max_terminated_length": 738.7, |
| "completions/mean_length": 555.56875, |
| "completions/mean_terminated_length": 555.56875, |
| "completions/min_length": 405.975, |
| "completions/min_terminated_length": 405.975, |
| "epoch": 0.1858122289072338, |
| "grad_norm": 0.5951785187855096, |
| "kl": 0.024257659912109375, |
| "learning_rate": 5.742127443770959e-07, |
| "loss": -0.0082, |
| "num_tokens": 59914129.0, |
| "reward": 0.32972582541406154, |
| "reward_std": 0.17325325938872993, |
| "rewards/code_reward/mean": 0.22988206883310341, |
| "rewards/code_reward/std": 0.17343256894964726, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 789.075, |
| "completions/max_terminated_length": 789.075, |
| "completions/mean_length": 567.95625, |
| "completions/mean_terminated_length": 567.95625, |
| "completions/min_length": 411.5125, |
| "completions/min_terminated_length": 411.5125, |
| "epoch": 0.188050930460333, |
| "grad_norm": 0.5882458355124174, |
| "kl": 0.025067138671875, |
| "learning_rate": 5.378135739687457e-07, |
| "loss": 0.011, |
| "num_tokens": 60679605.0, |
| "reward": 0.3126169110648334, |
| "reward_std": 0.15262282044568565, |
| "rewards/code_reward/mean": 0.2126169038747321, |
| "rewards/code_reward/std": 0.15262282300391233, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 743.6, |
| "completions/max_terminated_length": 743.6, |
| "completions/mean_length": 563.934375, |
| "completions/mean_terminated_length": 563.934375, |
| "completions/min_length": 407.6, |
| "completions/min_terminated_length": 407.6, |
| "epoch": 0.19028963201343221, |
| "grad_norm": 0.5844514039485758, |
| "kl": 0.0232269287109375, |
| "learning_rate": 5.023526733108258e-07, |
| "loss": 0.0058, |
| "num_tokens": 61442035.0, |
| "reward": 0.28377067698165775, |
| "reward_std": 0.14047583957435564, |
| "rewards/code_reward/mean": 0.18392692334891764, |
| "rewards/code_reward/std": 0.1400338972482132, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 706.925, |
| "completions/max_terminated_length": 706.925, |
| "completions/mean_length": 529.33125, |
| "completions/mean_terminated_length": 529.33125, |
| "completions/min_length": 382.5125, |
| "completions/min_terminated_length": 382.5125, |
| "epoch": 0.1925283335665314, |
| "grad_norm": 0.6068829854257141, |
| "kl": 0.024706268310546876, |
| "learning_rate": 4.6786462186684726e-07, |
| "loss": 0.0148, |
| "num_tokens": 62163871.0, |
| "reward": 0.3680115182884037, |
| "reward_std": 0.15169981086510234, |
| "rewards/code_reward/mean": 0.26801150970277376, |
| "rewards/code_reward/std": 0.1516998124890961, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 731.0, |
| "completions/max_terminated_length": 731.0, |
| "completions/mean_length": 537.5578125, |
| "completions/mean_terminated_length": 537.5578125, |
| "completions/min_length": 382.8875, |
| "completions/min_terminated_length": 382.8875, |
| "epoch": 0.19476703511963062, |
| "grad_norm": 0.6072526841260757, |
| "kl": 0.02366943359375, |
| "learning_rate": 4.3438305043282314e-07, |
| "loss": 0.0105, |
| "num_tokens": 62868964.0, |
| "reward": 0.288625252712518, |
| "reward_std": 0.14189330035296735, |
| "rewards/code_reward/mean": 0.1886252475058427, |
| "rewards/code_reward/std": 0.14189330387162044, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 849.4625, |
| "completions/max_terminated_length": 760.275, |
| "completions/mean_length": 566.0203125, |
| "completions/mean_terminated_length": 554.5283485412598, |
| "completions/min_length": 385.2875, |
| "completions/min_terminated_length": 385.2875, |
| "epoch": 0.1970057366727298, |
| "grad_norm": 0.5544203482669213, |
| "kl": 0.02349853515625, |
| "learning_rate": 4.019406083424222e-07, |
| "loss": 0.024, |
| "num_tokens": 63645545.0, |
| "reward": 0.2873677465133369, |
| "reward_std": 0.1426866902038455, |
| "rewards/code_reward/mean": 0.18768023860175162, |
| "rewards/code_reward/std": 0.14217363530769944, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.725, |
| "completions/max_terminated_length": 696.725, |
| "completions/mean_length": 515.0796875, |
| "completions/mean_terminated_length": 515.0796875, |
| "completions/min_length": 375.0375, |
| "completions/min_terminated_length": 375.0375, |
| "epoch": 0.19924443822582902, |
| "grad_norm": 0.5972791947559509, |
| "kl": 0.02529144287109375, |
| "learning_rate": 3.7056893162918063e-07, |
| "loss": 0.0201, |
| "num_tokens": 64322420.0, |
| "reward": 0.3194495734758675, |
| "reward_std": 0.1723791634547524, |
| "rewards/code_reward/mean": 0.2194495657022344, |
| "rewards/code_reward/std": 0.17237916672602296, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 713.55, |
| "completions/max_terminated_length": 713.55, |
| "completions/mean_length": 530.815625, |
| "completions/mean_terminated_length": 530.815625, |
| "completions/min_length": 378.8375, |
| "completions/min_terminated_length": 378.8375, |
| "epoch": 0.20148313977892823, |
| "grad_norm": 0.4622077467737105, |
| "kl": 0.0240142822265625, |
| "learning_rate": 3.4029861217683744e-07, |
| "loss": 0.0039, |
| "num_tokens": 65055550.0, |
| "reward": 0.288273274153471, |
| "reward_std": 0.13295620558201335, |
| "rewards/code_reward/mean": 0.1882732652418781, |
| "rewards/code_reward/std": 0.13295620674616657, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 823.5, |
| "completions/max_terminated_length": 823.5, |
| "completions/mean_length": 554.6484375, |
| "completions/mean_terminated_length": 554.6484375, |
| "completions/min_length": 385.95, |
| "completions/min_terminated_length": 385.95, |
| "epoch": 0.20372184133202742, |
| "grad_norm": 0.5779355252222167, |
| "kl": 0.0229461669921875, |
| "learning_rate": 3.111591678878596e-07, |
| "loss": 0.0175, |
| "num_tokens": 65784213.0, |
| "reward": 0.2769805608317256, |
| "reward_std": 0.1467181654064916, |
| "rewards/code_reward/mean": 0.17698055310174823, |
| "rewards/code_reward/std": 0.14671816679183394, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 701.2, |
| "completions/max_terminated_length": 701.2, |
| "completions/mean_length": 516.5953125, |
| "completions/mean_terminated_length": 516.5953125, |
| "completions/min_length": 376.525, |
| "completions/min_terminated_length": 376.525, |
| "epoch": 0.20596054288512664, |
| "grad_norm": 0.7260007034342328, |
| "kl": 0.02352447509765625, |
| "learning_rate": 2.831790138992526e-07, |
| "loss": 0.0016, |
| "num_tokens": 66491018.0, |
| "reward": 0.2927206911146641, |
| "reward_std": 0.1309030485805124, |
| "rewards/code_reward/mean": 0.19272068199061324, |
| "rewards/code_reward/std": 0.1309030512755271, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 642.7125, |
| "completions/max_terminated_length": 642.7125, |
| "completions/mean_length": 490.215625, |
| "completions/mean_terminated_length": 490.215625, |
| "completions/min_length": 364.1375, |
| "completions/min_terminated_length": 364.1375, |
| "epoch": 0.20819924443822582, |
| "grad_norm": 0.594813196893333, |
| "kl": 0.024704742431640624, |
| "learning_rate": 2.563854348737275e-07, |
| "loss": 0.0158, |
| "num_tokens": 67154060.0, |
| "reward": 0.3452305795624852, |
| "reward_std": 0.1497524828504538, |
| "rewards/code_reward/mean": 0.24523057123151376, |
| "rewards/code_reward/std": 0.14975248328992166, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 698.1125, |
| "completions/max_terminated_length": 698.1125, |
| "completions/mean_length": 521.234375, |
| "completions/mean_terminated_length": 521.234375, |
| "completions/min_length": 375.2875, |
| "completions/min_terminated_length": 375.2875, |
| "epoch": 0.21043794599132504, |
| "grad_norm": 0.5197145225969613, |
| "kl": 0.0245391845703125, |
| "learning_rate": 2.3080455839324343e-07, |
| "loss": 0.0051, |
| "num_tokens": 67889866.0, |
| "reward": 0.28930564858019353, |
| "reward_std": 0.13884065752499736, |
| "rewards/code_reward/mean": 0.18961814382928424, |
| "rewards/code_reward/std": 0.13826202357886358, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.005786375701427459, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 676.2625, |
| "completions/max_terminated_length": 676.2625, |
| "completions/mean_length": 497.1359375, |
| "completions/mean_terminated_length": 497.1359375, |
| "completions/min_length": 347.875, |
| "completions/min_terminated_length": 347.875, |
| "epoch": 0.21267664754442422, |
| "grad_norm": 0.6912907179100062, |
| "kl": 0.024478912353515625, |
| "learning_rate": 2.064613294808664e-07, |
| "loss": 0.0116, |
| "num_tokens": 68564793.0, |
| "reward": 0.36915110973641274, |
| "reward_std": 0.15182709340006112, |
| "rewards/code_reward/mean": 0.26946360208967235, |
| "rewards/code_reward/std": 0.15149775308091193, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 691.4625, |
| "completions/max_terminated_length": 691.4625, |
| "completions/mean_length": 522.4828125, |
| "completions/mean_terminated_length": 522.4828125, |
| "completions/min_length": 379.275, |
| "completions/min_terminated_length": 379.275, |
| "epoch": 0.21491534909752344, |
| "grad_norm": 0.5419894781125532, |
| "kl": 0.022618865966796874, |
| "learning_rate": 1.83379486275794e-07, |
| "loss": 0.0007, |
| "num_tokens": 69262638.0, |
| "reward": 0.3051586433313787, |
| "reward_std": 0.12916497962432913, |
| "rewards/code_reward/mean": 0.20515863316832111, |
| "rewards/code_reward/std": 0.12916498319245875, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 686.775, |
| "completions/max_terminated_length": 686.775, |
| "completions/mean_length": 508.7515625, |
| "completions/mean_terminated_length": 508.7515625, |
| "completions/min_length": 362.0, |
| "completions/min_terminated_length": 362.0, |
| "epoch": 0.21715405065062263, |
| "grad_norm": 0.688891040637323, |
| "kl": 0.02315826416015625, |
| "learning_rate": 1.6158153688526895e-07, |
| "loss": 0.0091, |
| "num_tokens": 69978223.0, |
| "reward": 0.3311784929595888, |
| "reward_std": 0.17100559230602813, |
| "rewards/code_reward/mean": 0.2311784830279066, |
| "rewards/code_reward/std": 0.17100559424143286, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 693.6375, |
| "completions/max_terminated_length": 693.6375, |
| "completions/mean_length": 522.1390625, |
| "completions/mean_terminated_length": 522.1390625, |
| "completions/min_length": 380.6875, |
| "completions/min_terminated_length": 380.6875, |
| "epoch": 0.21939275220372184, |
| "grad_norm": 0.5956108322738943, |
| "kl": 0.0235626220703125, |
| "learning_rate": 1.4108873743594274e-07, |
| "loss": 0.0124, |
| "num_tokens": 70730304.0, |
| "reward": 0.2989473403431475, |
| "reward_std": 0.13880361177725717, |
| "rewards/code_reward/mean": 0.19894733218825422, |
| "rewards/code_reward/std": 0.13880361234769226, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 683.5375, |
| "completions/max_terminated_length": 683.5375, |
| "completions/mean_length": 519.221875, |
| "completions/mean_terminated_length": 519.221875, |
| "completions/min_length": 386.7125, |
| "completions/min_terminated_length": 386.7125, |
| "epoch": 0.22163145375682106, |
| "grad_norm": 0.5427250334330507, |
| "kl": 0.023455810546875, |
| "learning_rate": 1.2192107134610586e-07, |
| "loss": 0.0135, |
| "num_tokens": 71448214.0, |
| "reward": 0.29871292021125556, |
| "reward_std": 0.12881716500851326, |
| "rewards/code_reward/mean": 0.19886916641116842, |
| "rewards/code_reward/std": 0.1288392253103666, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 625.8, |
| "completions/max_terminated_length": 625.8, |
| "completions/mean_length": 472.728125, |
| "completions/mean_terminated_length": 472.728125, |
| "completions/min_length": 342.7375, |
| "completions/min_terminated_length": 342.7375, |
| "epoch": 0.22387015530992024, |
| "grad_norm": 0.6108848009428748, |
| "kl": 0.02464752197265625, |
| "learning_rate": 1.0409722983898928e-07, |
| "loss": 0.0093, |
| "num_tokens": 72117280.0, |
| "reward": 0.39794372050091625, |
| "reward_std": 0.1893569786072476, |
| "rewards/code_reward/mean": 0.29794371249881807, |
| "rewards/code_reward/std": 0.18935698276618496, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 665.15, |
| "completions/max_terminated_length": 665.15, |
| "completions/mean_length": 497.05625, |
| "completions/mean_terminated_length": 497.05625, |
| "completions/min_length": 354.125, |
| "completions/min_terminated_length": 354.125, |
| "epoch": 0.22610885686301946, |
| "grad_norm": 0.5589824240842507, |
| "kl": 0.0255523681640625, |
| "learning_rate": 8.763459371614036e-08, |
| "loss": 0.0183, |
| "num_tokens": 72815756.0, |
| "reward": 0.2931746931746602, |
| "reward_std": 0.15111528622946935, |
| "rewards/code_reward/mean": 0.19333093738896423, |
| "rewards/code_reward/std": 0.15067334883497097, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 505 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 708.7375, |
| "completions/max_terminated_length": 708.7375, |
| "completions/mean_length": 525.8828125, |
| "completions/mean_terminated_length": 525.8828125, |
| "completions/min_length": 383.925, |
| "completions/min_terminated_length": 383.925, |
| "epoch": 0.22834755841611865, |
| "grad_norm": 0.5657630399821236, |
| "kl": 0.023580169677734374, |
| "learning_rate": 7.254921640864954e-08, |
| "loss": 0.005, |
| "num_tokens": 73527777.0, |
| "reward": 0.29336653435602783, |
| "reward_std": 0.15301572528260293, |
| "rewards/code_reward/mean": 0.19352277733851225, |
| "rewards/code_reward/std": 0.15257378248206804, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 744.025, |
| "completions/max_terminated_length": 744.025, |
| "completions/mean_length": 538.8359375, |
| "completions/mean_terminated_length": 538.8359375, |
| "completions/min_length": 391.7625, |
| "completions/min_terminated_length": 391.7625, |
| "epoch": 0.23058625996921786, |
| "grad_norm": 0.5974194655481591, |
| "kl": 0.02305145263671875, |
| "learning_rate": 5.885580832275245e-08, |
| "loss": 0.0084, |
| "num_tokens": 74267080.0, |
| "reward": 0.2840338280424476, |
| "reward_std": 0.1604262540466152, |
| "rewards/code_reward/mean": 0.1840338213412906, |
| "rewards/code_reward/std": 0.16042625640693586, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 747.9375, |
| "completions/max_terminated_length": 747.9375, |
| "completions/mean_length": 543.165625, |
| "completions/mean_terminated_length": 543.165625, |
| "completions/min_length": 395.525, |
| "completions/min_terminated_length": 395.525, |
| "epoch": 0.23282496152231705, |
| "grad_norm": 0.6721925638851708, |
| "kl": 0.023305511474609374, |
| "learning_rate": 4.6567722495074685e-08, |
| "loss": 0.0021, |
| "num_tokens": 75032546.0, |
| "reward": 0.26900712195783855, |
| "reward_std": 0.15734463239787147, |
| "rewards/code_reward/mean": 0.16900711600319482, |
| "rewards/code_reward/std": 0.1573446374386549, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 688.1875, |
| "completions/max_terminated_length": 688.1875, |
| "completions/mean_length": 528.38125, |
| "completions/mean_terminated_length": 528.38125, |
| "completions/min_length": 385.1, |
| "completions/min_terminated_length": 385.1, |
| "epoch": 0.23506366307541626, |
| "grad_norm": 0.47513348986208354, |
| "kl": 0.023612213134765626, |
| "learning_rate": 3.5696941571505434e-08, |
| "loss": 0.0069, |
| "num_tokens": 75779806.0, |
| "reward": 0.2989699838683009, |
| "reward_std": 0.144676909170812, |
| "rewards/code_reward/mean": 0.19896997831820046, |
| "rewards/code_reward/std": 0.14467690934252458, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 525 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.4625, |
| "completions/max_terminated_length": 678.4625, |
| "completions/mean_length": 507.25625, |
| "completions/mean_terminated_length": 507.25625, |
| "completions/min_length": 360.5125, |
| "completions/min_terminated_length": 360.5125, |
| "epoch": 0.23730236462851545, |
| "grad_norm": 0.5125208061016464, |
| "kl": 0.02255859375, |
| "learning_rate": 2.625406612240039e-08, |
| "loss": 0.006, |
| "num_tokens": 76477890.0, |
| "reward": 0.3240066308528185, |
| "reward_std": 0.16057187110418453, |
| "rewards/code_reward/mean": 0.22400662462459878, |
| "rewards/code_reward/std": 0.16057187146507204, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 747.75, |
| "completions/max_terminated_length": 747.75, |
| "completions/mean_length": 534.534375, |
| "completions/mean_terminated_length": 534.534375, |
| "completions/min_length": 385.9125, |
| "completions/min_terminated_length": 385.9125, |
| "epoch": 0.23954106618161466, |
| "grad_norm": 0.4891371425966553, |
| "kl": 0.02330169677734375, |
| "learning_rate": 1.8248304305504505e-08, |
| "loss": 0.0196, |
| "num_tokens": 77209744.0, |
| "reward": 0.333328259550035, |
| "reward_std": 0.14479399558040312, |
| "rewards/code_reward/mean": 0.23332825346733443, |
| "rewards/code_reward/std": 0.1447939975943882, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 535 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 667.125, |
| "completions/max_terminated_length": 667.125, |
| "completions/mean_length": 501.2875, |
| "completions/mean_terminated_length": 501.2875, |
| "completions/min_length": 358.6, |
| "completions/min_terminated_length": 358.6, |
| "epoch": 0.24177976773471388, |
| "grad_norm": 0.5700891782095214, |
| "kl": 0.02592926025390625, |
| "learning_rate": 1.1687462886677713e-08, |
| "loss": 0.006, |
| "num_tokens": 77919416.0, |
| "reward": 0.313872685469687, |
| "reward_std": 0.1551548853807617, |
| "rewards/code_reward/mean": 0.2140289287781343, |
| "rewards/code_reward/std": 0.15488540646038018, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 657.0375, |
| "completions/max_terminated_length": 657.0375, |
| "completions/mean_length": 498.54375, |
| "completions/mean_terminated_length": 498.54375, |
| "completions/min_length": 368.6625, |
| "completions/min_terminated_length": 368.6625, |
| "epoch": 0.24401846928781307, |
| "grad_norm": 0.6409927990786405, |
| "kl": 0.02302703857421875, |
| "learning_rate": 6.577939627179785e-09, |
| "loss": 0.0125, |
| "num_tokens": 78597028.0, |
| "reward": 0.3173367108218372, |
| "reward_std": 0.16166887313302142, |
| "rewards/code_reward/mean": 0.21764920413697836, |
| "rewards/code_reward/std": 0.16100556787860115, |
| "rewards/format_reward/mean": 0.996875, |
| "rewards/format_reward/std": 0.00883883461356163, |
| "step": 545 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.375, |
| "completions/max_terminated_length": 687.375, |
| "completions/mean_length": 507.9859375, |
| "completions/mean_terminated_length": 507.9859375, |
| "completions/min_length": 373.4375, |
| "completions/min_terminated_length": 373.4375, |
| "epoch": 0.24625717084091228, |
| "grad_norm": 0.5411596737613947, |
| "kl": 0.024321746826171876, |
| "learning_rate": 2.9247170449338e-09, |
| "loss": 0.005, |
| "num_tokens": 79308787.0, |
| "reward": 0.3536563721485436, |
| "reward_std": 0.12869162768765818, |
| "rewards/code_reward/mean": 0.2538126138912048, |
| "rewards/code_reward/std": 0.1283905382733792, |
| "rewards/format_reward/mean": 0.9984375, |
| "rewards/format_reward/std": 0.004419417306780815, |
| "step": 550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 674.4875, |
| "completions/max_terminated_length": 674.4875, |
| "completions/mean_length": 508.71875, |
| "completions/mean_terminated_length": 508.71875, |
| "completions/min_length": 376.7, |
| "completions/min_terminated_length": 376.7, |
| "epoch": 0.24849587239401147, |
| "grad_norm": 0.6381506844335124, |
| "kl": 0.022603607177734374, |
| "learning_rate": 7.313575558583474e-10, |
| "loss": 0.0068, |
| "num_tokens": 79983935.0, |
| "reward": 0.3423418626189232, |
| "reward_std": 0.13657438448863105, |
| "rewards/code_reward/mean": 0.24234185529057867, |
| "rewards/code_reward/std": 0.1365743855072651, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 651.59375, |
| "completions/max_terminated_length": 651.59375, |
| "completions/mean_length": 491.193359375, |
| "completions/mean_terminated_length": 491.193359375, |
| "completions/min_length": 356.890625, |
| "completions/min_terminated_length": 356.890625, |
| "epoch": 0.25028683363649085, |
| "kl": 0.023431777954101562, |
| "num_tokens": 80543474.0, |
| "reward": 0.3967649736441672, |
| "reward_std": 0.1777252904503257, |
| "rewards/code_reward/mean": 0.2967649649071973, |
| "rewards/code_reward/std": 0.17772529531794135, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 559, |
| "total_flos": 0.0, |
| "train_loss": 0.001293145966497858, |
| "train_runtime": 17459.8588, |
| "train_samples_per_second": 0.512, |
| "train_steps_per_second": 0.032 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 559, |
| "num_input_tokens_seen": 80543474, |
| "num_train_epochs": 1, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|