{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00075, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 1586.6875, "completions/mean_terminated_length": 1586.6875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 2.3952305614948273, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.051290396600961685, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0056, "num_tokens": 71951.0, "reward": -8.338340759277344, "reward_std": 11.816058158874512, "rewards/rollout_reward_func/mean": -8.338340759277344, "rewards/rollout_reward_func/std": 12.670792579650879, "sampling/importance_sampling_ratio/max": 0.24942533671855927, "sampling/importance_sampling_ratio/mean": 0.020489878952503204, "sampling/importance_sampling_ratio/min": 3.589864120350601e-15, "sampling/sampling_logp_difference/max": 13.334996223449707, "sampling/sampling_logp_difference/mean": 0.37166523933410645, "step": 1, "step_time": 40.07221162000002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3952305614948273, "epoch": 2e-05, "grad_norm": 0.051755864173173904, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0056, "step": 2, "step_time": 6.337008879999928 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.003962862421758473, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003962862421758473, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1689.21875, "completions/mean_terminated_length": 1689.21875, "completions/min_length": 1392.0, "completions/min_terminated_length": 1392.0, "entropy": 2.241749197244644, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.032931093126535416, "kl": 0.0010736112235463224, "learning_rate": 5.714285714285715e-07, "loss": 0.0001, "num_tokens": 147616.0, "reward": -8.11627197265625, "reward_std": 9.453106880187988, "rewards/rollout_reward_func/mean": -8.11627197265625, "rewards/rollout_reward_func/std": 10.086986541748047, "sampling/importance_sampling_ratio/max": 0.038908205926418304, "sampling/importance_sampling_ratio/mean": 0.013574006035923958, "sampling/importance_sampling_ratio/min": 9.307732536101287e-13, "sampling/sampling_logp_difference/max": 8.446062088012695, "sampling/sampling_logp_difference/mean": 0.23850713670253754, "step": 3, "step_time": 42.26439957300016 }, { "clip_ratio/high_max": 0.010532152839004993, "clip_ratio/high_mean": 0.005266076419502497, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00659586361143738, "entropy": 2.2402877509593964, "epoch": 4e-05, "grad_norm": 0.030311495065689087, "kl": 0.0013166169228497893, "learning_rate": 8.571428571428572e-07, "loss": 0.0001, "step": 4, "step_time": 6.949824775000025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014204545877873898, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1671.5, "completions/mean_terminated_length": 1671.5, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 2.280731201171875, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.035965703427791595, "kl": 0.0009596906966180541, "learning_rate": 1.142857142857143e-06, "loss": -0.0034, "num_tokens": 222393.0, "reward": -2.6346850395202637, "reward_std": 16.439598083496094, "rewards/rollout_reward_func/mean": -2.6346850395202637, "rewards/rollout_reward_func/std": 17.489192962646484, "sampling/importance_sampling_ratio/max": 0.07573997974395752, "sampling/importance_sampling_ratio/mean": 0.014259650371968746, "sampling/importance_sampling_ratio/min": 0.00022367587371263653, "sampling/sampling_logp_difference/max": 1.2608689069747925, "sampling/sampling_logp_difference/mean": 0.21580657362937927, "step": 5, "step_time": 42.164671801000054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012499999720603228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012499999720603228, "entropy": 2.279596298933029, "epoch": 6e-05, "grad_norm": 0.038707196712493896, "kl": 0.0009941701391653623, "learning_rate": 1.4285714285714286e-06, "loss": -0.0034, "step": 6, "step_time": 6.197612690000028 }, { "clip_ratio/high_max": 0.008154743583872914, "clip_ratio/high_mean": 0.004077371791936457, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005466260714456439, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1571.40625, "completions/mean_terminated_length": 1571.40625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "entropy": 2.281369060277939, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.043208107352256775, "kl": 0.0007937230257084593, "learning_rate": 1.7142857142857145e-06, "loss": 0.0023, "num_tokens": 294768.0, "reward": -12.273736953735352, "reward_std": 11.2011137008667, "rewards/rollout_reward_func/mean": -12.273736953735352, "rewards/rollout_reward_func/std": 10.951950073242188, "sampling/importance_sampling_ratio/max": 0.2913915514945984, "sampling/importance_sampling_ratio/mean": 0.029564352706074715, "sampling/importance_sampling_ratio/min": 1.11443969016186e-13, "sampling/sampling_logp_difference/max": 11.392870903015137, "sampling/sampling_logp_difference/mean": 0.24621161818504333, "step": 7, "step_time": 41.23813219300007 }, { "clip_ratio/high_max": 0.006393861956894398, "clip_ratio/high_mean": 0.003196930978447199, "clip_ratio/low_mean": 0.0026959646493196487, "clip_ratio/low_min": 0.0025510203558951616, "clip_ratio/region_mean": 0.005892895627766848, "entropy": 2.281323105096817, "epoch": 8e-05, "grad_norm": 0.037575479596853256, "kl": 0.0011317383105051704, "learning_rate": 2.0000000000000003e-06, "loss": 0.0023, "step": 8, "step_time": 6.953839896999966 }, { "clip_ratio/high_max": 0.009784075664356351, "clip_ratio/high_mean": 0.006221824907697737, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007551612099632621, "completions/clipped_ratio": 0.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 1631.96875, "completions/mean_terminated_length": 1631.96875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 2.2518777698278427, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.024869710206985474, "kl": 0.0008677325677126646, "learning_rate": 2.285714285714286e-06, "loss": 0.002, "num_tokens": 368485.0, "reward": -5.027488708496094, "reward_std": 9.159595489501953, "rewards/rollout_reward_func/mean": -5.027488708496094, "rewards/rollout_reward_func/std": 10.158669471740723, "sampling/importance_sampling_ratio/max": 0.0989551916718483, "sampling/importance_sampling_ratio/mean": 0.01718847081065178, "sampling/importance_sampling_ratio/min": 3.3642640756559317e-11, "sampling/sampling_logp_difference/max": 9.060235977172852, "sampling/sampling_logp_difference/mean": 0.25719159841537476, "step": 9, "step_time": 45.75393526599987 }, { "clip_ratio/high_max": 0.004852556856349111, "clip_ratio/high_mean": 0.0024262784281745553, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003756065620109439, "entropy": 2.247532531619072, "epoch": 0.0001, "grad_norm": 0.025674326345324516, "kl": 0.0010098924503836315, "learning_rate": 2.571428571428571e-06, "loss": 0.002, "step": 10, "step_time": 6.291163318000031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "completions/clipped_ratio": 0.03125, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1693.3125, "completions/mean_terminated_length": 1702.54833984375, "completions/min_length": 1389.0, "completions/min_terminated_length": 1389.0, "entropy": 2.2023140490055084, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.025482358410954475, "kl": 0.0009734490522532724, "learning_rate": 2.8571428571428573e-06, "loss": -0.0018, "num_tokens": 444310.0, "reward": -6.636538028717041, "reward_std": 13.808101654052734, "rewards/rollout_reward_func/mean": -6.636538028717041, "rewards/rollout_reward_func/std": 13.634783744812012, "sampling/importance_sampling_ratio/max": 0.04252217337489128, "sampling/importance_sampling_ratio/mean": 0.015543580055236816, "sampling/importance_sampling_ratio/min": 8.082223128483037e-25, "sampling/sampling_logp_difference/max": 17.322786331176758, "sampling/sampling_logp_difference/mean": 0.3324906826019287, "step": 11, "step_time": 49.50245009800017 }, { "clip_ratio/high_max": 0.005110554862767458, "clip_ratio/high_mean": 0.002555277431383729, "clip_ratio/low_mean": 0.0014204545877873898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003975732019171119, "entropy": 2.2029761970043182, "epoch": 0.00012, "grad_norm": 0.030912674963474274, "kl": 0.0009688141508377157, "learning_rate": 3.142857142857143e-06, "loss": -0.0018, "step": 12, "step_time": 7.69119761200011 }, { "clip_ratio/high_max": 0.013187056640163064, "clip_ratio/high_mean": 0.006593528320081532, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007952223997563124, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1663.0, "completions/mean_terminated_length": 1663.0, "completions/min_length": 1240.0, "completions/min_terminated_length": 1240.0, "entropy": 2.258611023426056, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.026398880407214165, "kl": 0.0010122761159436777, "learning_rate": 3.428571428571429e-06, "loss": 0.0013, "num_tokens": 519593.0, "reward": -8.102012634277344, "reward_std": 10.601648330688477, "rewards/rollout_reward_func/mean": -8.102012634277344, "rewards/rollout_reward_func/std": 11.444524765014648, "sampling/importance_sampling_ratio/max": 0.0403943695127964, "sampling/importance_sampling_ratio/mean": 0.01241688709706068, "sampling/importance_sampling_ratio/min": 6.642031217564404e-14, "sampling/sampling_logp_difference/max": 11.826959609985352, "sampling/sampling_logp_difference/mean": 0.2676909267902374, "step": 13, "step_time": 51.398302784000066 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.002418017713353038, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005135408951900899, "entropy": 2.251807004213333, "epoch": 0.00014, "grad_norm": 0.027899302542209625, "kl": 0.001143605462857522, "learning_rate": 3.7142857142857146e-06, "loss": 0.0013, "step": 14, "step_time": 6.214198535999913 }, { "clip_ratio/high_max": 0.010549242608249187, "clip_ratio/high_mean": 0.0066044083796441555, "clip_ratio/low_mean": 0.002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009263982763513923, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1673.4375, "completions/mean_terminated_length": 1673.4375, "completions/min_length": 1376.0, "completions/min_terminated_length": 1376.0, "entropy": 2.269966244697571, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.028693798929452896, "kl": 0.0012732810355373658, "learning_rate": 4.000000000000001e-06, "loss": -0.0002, "num_tokens": 594885.0, "reward": -9.916947364807129, "reward_std": 10.726478576660156, "rewards/rollout_reward_func/mean": -9.916947364807129, "rewards/rollout_reward_func/std": 11.980547904968262, "sampling/importance_sampling_ratio/max": 0.02854122966527939, "sampling/importance_sampling_ratio/mean": 0.01421053521335125, "sampling/importance_sampling_ratio/min": 2.1058147088061363e-13, "sampling/sampling_logp_difference/max": 9.987288475036621, "sampling/sampling_logp_difference/mean": 0.2753928303718567, "step": 15, "step_time": 52.00545187799992 }, { "clip_ratio/high_max": 0.007995169144123793, "clip_ratio/high_mean": 0.003997584572061896, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003997584572061896, "entropy": 2.2705667912960052, "epoch": 0.00016, "grad_norm": 0.021315351128578186, "kl": 0.0012536912836367264, "learning_rate": 4.2857142857142855e-06, "loss": -0.0002, "step": 16, "step_time": 7.3722107160000405 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.00638433254789561, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009856554795987904, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1863.21875, "completions/mean_terminated_length": 1863.21875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "entropy": 2.299890086054802, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.01769721694290638, "kl": 0.0012724917105515487, "learning_rate": 4.571428571428572e-06, "loss": 0.0017, "num_tokens": 676591.0, "reward": -8.684309959411621, "reward_std": 9.59238338470459, "rewards/rollout_reward_func/mean": -8.684309959411621, "rewards/rollout_reward_func/std": 10.695920944213867, "sampling/importance_sampling_ratio/max": 0.041902750730514526, "sampling/importance_sampling_ratio/mean": 0.009395781904459, "sampling/importance_sampling_ratio/min": 3.6452324480957535e-20, "sampling/sampling_logp_difference/max": 12.486509323120117, "sampling/sampling_logp_difference/mean": 0.3327711820602417, "step": 17, "step_time": 58.969859735 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.004615322104655206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0080875443527475, "entropy": 2.297407776117325, "epoch": 0.00018, "grad_norm": 0.01824762113392353, "kl": 0.001823120612243656, "learning_rate": 4.857142857142858e-06, "loss": 0.0017, "step": 18, "step_time": 6.952743392999764 }, { "clip_ratio/high_max": 0.009362624026834965, "clip_ratio/high_mean": 0.005838719545863569, "clip_ratio/low_mean": 0.00231799460016191, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008156714029610157, "completions/clipped_ratio": 0.0, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1801.53125, "completions/mean_terminated_length": 1801.53125, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "entropy": 2.322386711835861, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.02586973085999489, "kl": 0.0017871049421955831, "learning_rate": 5.142857142857142e-06, "loss": -0.0024, "num_tokens": 756167.0, "reward": -7.00605583190918, "reward_std": 16.651020050048828, "rewards/rollout_reward_func/mean": -7.00605583190918, "rewards/rollout_reward_func/std": 18.44582176208496, "sampling/importance_sampling_ratio/max": 0.07095864415168762, "sampling/importance_sampling_ratio/mean": 0.011077712289988995, "sampling/importance_sampling_ratio/min": 2.652188067117779e-20, "sampling/sampling_logp_difference/max": 15.625991821289062, "sampling/sampling_logp_difference/mean": 0.3300427198410034, "step": 19, "step_time": 58.73655633899966 }, { "clip_ratio/high_max": 0.0068662879057228565, "clip_ratio/high_mean": 0.0034331439528614283, "clip_ratio/low_mean": 0.0032655425602570176, "clip_ratio/low_min": 0.0019841270986944437, "clip_ratio/region_mean": 0.006698686513118446, "entropy": 2.3259487748146057, "epoch": 0.0002, "grad_norm": 0.030740659683942795, "kl": 0.0018880682764574885, "learning_rate": 5.428571428571429e-06, "loss": -0.0023, "step": 20, "step_time": 7.368399762000195 }, { "clip_ratio/high_max": 0.0018939394503831863, "clip_ratio/high_mean": 0.0009469697251915932, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009469697251915932, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1879.1875, "completions/mean_terminated_length": 1879.1875, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "entropy": 2.2097203731536865, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.02687995322048664, "kl": 0.001916136905492749, "learning_rate": 5.7142857142857145e-06, "loss": -0.0002, "num_tokens": 838107.0, "reward": -2.38564395904541, "reward_std": 11.885160446166992, "rewards/rollout_reward_func/mean": -2.38564395904541, "rewards/rollout_reward_func/std": 15.615160942077637, "sampling/importance_sampling_ratio/max": 0.08406510949134827, "sampling/importance_sampling_ratio/mean": 0.012403802014887333, "sampling/importance_sampling_ratio/min": 7.145396301283091e-16, "sampling/sampling_logp_difference/max": 12.1469087600708, "sampling/sampling_logp_difference/mean": 0.2651059329509735, "step": 21, "step_time": 59.30298631099993 }, { "clip_ratio/high_max": 0.0018939394503831863, "clip_ratio/high_mean": 0.0009469697251915932, "clip_ratio/low_mean": 0.0012019231216982007, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002148892846889794, "entropy": 2.212069094181061, "epoch": 0.00022, "grad_norm": 0.02957003004848957, "kl": 0.0021696449111914262, "learning_rate": 6e-06, "loss": -0.0002, "step": 22, "step_time": 6.887106450000374 }, { "clip_ratio/high_max": 0.002358490601181984, "clip_ratio/high_mean": 0.001179245300590992, "clip_ratio/low_mean": 0.0033517052652314305, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0045309505658224225, "completions/clipped_ratio": 0.0, "completions/max_length": 2068.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1889.875, "completions/mean_terminated_length": 1889.875, "completions/min_length": 1525.0, "completions/min_terminated_length": 1525.0, "entropy": 2.1723521649837494, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.022851742804050446, "kl": 0.003631043611676432, "learning_rate": 6.285714285714286e-06, "loss": -0.0045, "num_tokens": 919971.0, "reward": -6.60746955871582, "reward_std": 11.911310195922852, "rewards/rollout_reward_func/mean": -6.60746955871582, "rewards/rollout_reward_func/std": 12.598552703857422, "sampling/importance_sampling_ratio/max": 0.022619599476456642, "sampling/importance_sampling_ratio/mean": 0.00866013765335083, "sampling/importance_sampling_ratio/min": 1.8946926625573762e-16, "sampling/sampling_logp_difference/max": 12.770915031433105, "sampling/sampling_logp_difference/mean": 0.2685585618019104, "step": 23, "step_time": 66.98576966499968 }, { "clip_ratio/high_max": 0.004673305433243513, "clip_ratio/high_mean": 0.0023366527166217566, "clip_ratio/low_mean": 0.0012254902394488454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003562142956070602, "entropy": 2.1722511053085327, "epoch": 0.00024, "grad_norm": 0.026614658534526825, "kl": 0.004136091796681285, "learning_rate": 6.571428571428572e-06, "loss": -0.0045, "step": 24, "step_time": 6.895327041999735 }, { "clip_ratio/high_max": 0.005816170945763588, "clip_ratio/high_mean": 0.002908085472881794, "clip_ratio/low_mean": 0.002100988756865263, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005009074229747057, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 2112.0625, "completions/mean_terminated_length": 2112.0625, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 2.1737034767866135, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.013818159699440002, "kl": 0.0030502460795105435, "learning_rate": 6.857142857142858e-06, "loss": 0.0031, "num_tokens": 1008470.0, "reward": -4.425507068634033, "reward_std": 14.518598556518555, "rewards/rollout_reward_func/mean": -4.425507068634033, "rewards/rollout_reward_func/std": 15.62421989440918, "sampling/importance_sampling_ratio/max": 0.2350578010082245, "sampling/importance_sampling_ratio/mean": 0.013511145487427711, "sampling/importance_sampling_ratio/min": 2.6986267040271933e-18, "sampling/sampling_logp_difference/max": 18.317167282104492, "sampling/sampling_logp_difference/mean": 0.2883983850479126, "step": 25, "step_time": 75.20574624999995 }, { "clip_ratio/high_max": 0.0036231884732842445, "clip_ratio/high_mean": 0.0018115942366421223, "clip_ratio/low_mean": 0.003012447035871446, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004824041272513568, "entropy": 2.1722775399684906, "epoch": 0.00026, "grad_norm": 0.013290848582983017, "kl": 0.0030411222542170435, "learning_rate": 7.1428571428571436e-06, "loss": 0.0031, "step": 26, "step_time": 7.616344220999963 }, { "clip_ratio/high_max": 0.006330503383651376, "clip_ratio/high_mean": 0.003165251691825688, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003165251691825688, "completions/clipped_ratio": 0.0, "completions/max_length": 2344.0, "completions/max_terminated_length": 2344.0, "completions/mean_length": 2142.9375, "completions/mean_terminated_length": 2142.9375, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "entropy": 2.320455104112625, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.01896420307457447, "kl": 0.002832541649695486, "learning_rate": 7.428571428571429e-06, "loss": 0.0027, "num_tokens": 1098577.0, "reward": -13.397226333618164, "reward_std": 12.59660816192627, "rewards/rollout_reward_func/mean": -13.397226333618164, "rewards/rollout_reward_func/std": 14.914674758911133, "sampling/importance_sampling_ratio/max": 0.1061810627579689, "sampling/importance_sampling_ratio/mean": 0.008775782771408558, "sampling/importance_sampling_ratio/min": 7.311078333300841e-31, "sampling/sampling_logp_difference/max": 18.154491424560547, "sampling/sampling_logp_difference/mean": 0.3568466603755951, "step": 27, "step_time": 76.95750552300024 }, { "clip_ratio/high_max": 0.00859679956920445, "clip_ratio/high_mean": 0.004298399784602225, "clip_ratio/low_mean": 0.0019767729099839926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006275172694586217, "entropy": 2.319840043783188, "epoch": 0.00028, "grad_norm": 0.01972121186554432, "kl": 0.003210447379387915, "learning_rate": 7.714285714285716e-06, "loss": 0.0027, "step": 28, "step_time": 7.581604469000013 }, { "clip_ratio/high_max": 0.004324290552176535, "clip_ratio/high_mean": 0.0021621452760882676, "clip_ratio/low_mean": 0.0010593220358714461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032214673119597137, "completions/clipped_ratio": 0.03125, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 2059.75, "completions/mean_terminated_length": 2059.258056640625, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "entropy": 2.2485389709472656, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.017093749716877937, "kl": 0.0025801300653256476, "learning_rate": 8.000000000000001e-06, "loss": -0.0023, "num_tokens": 1185175.0, "reward": -10.598124504089355, "reward_std": 11.341144561767578, "rewards/rollout_reward_func/mean": -10.598124504089355, "rewards/rollout_reward_func/std": 12.110883712768555, "sampling/importance_sampling_ratio/max": 0.04109551012516022, "sampling/importance_sampling_ratio/mean": 0.006022544577717781, "sampling/importance_sampling_ratio/min": 2.8597005269629276e-29, "sampling/sampling_logp_difference/max": 13.06977367401123, "sampling/sampling_logp_difference/mean": 0.28142935037612915, "step": 29, "step_time": 77.39934379400052 }, { "clip_ratio/high_max": 0.010767225176095963, "clip_ratio/high_mean": 0.005383612588047981, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005383612588047981, "entropy": 2.2509743869304657, "epoch": 0.0003, "grad_norm": 0.01782657578587532, "kl": 0.002700227312743664, "learning_rate": 8.285714285714287e-06, "loss": -0.0023, "step": 30, "step_time": 7.306394946999944 }, { "clip_ratio/high_max": 0.008653939235955477, "clip_ratio/high_mean": 0.004326969617977738, "clip_ratio/low_mean": 0.0040421567391604185, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008369126415345818, "completions/clipped_ratio": 0.0, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 2102.53125, "completions/mean_terminated_length": 2102.53125, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 2.3924789130687714, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.02806187979876995, "kl": 0.0026975919608958066, "learning_rate": 8.571428571428571e-06, "loss": 0.0024, "num_tokens": 1273982.0, "reward": -8.974254608154297, "reward_std": 14.775591850280762, "rewards/rollout_reward_func/mean": -8.974254608154297, "rewards/rollout_reward_func/std": 14.762958526611328, "sampling/importance_sampling_ratio/max": 0.08338890224695206, "sampling/importance_sampling_ratio/mean": 0.00720847537741065, "sampling/importance_sampling_ratio/min": 3.938114484780906e-20, "sampling/sampling_logp_difference/max": 12.979628562927246, "sampling/sampling_logp_difference/mean": 0.3850451111793518, "step": 31, "step_time": 78.26030889900039 }, { "clip_ratio/high_max": 0.013500758213922381, "clip_ratio/high_mean": 0.006750379106961191, "clip_ratio/low_mean": 0.0008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007618434610776603, "entropy": 2.3886922001838684, "epoch": 0.00032, "grad_norm": 0.02739325352013111, "kl": 0.0021556210485869087, "learning_rate": 8.857142857142858e-06, "loss": 0.0024, "step": 32, "step_time": 7.6260315599995465 }, { "clip_ratio/high_max": 0.005404347903095186, "clip_ratio/high_mean": 0.002702173951547593, "clip_ratio/low_mean": 0.001923076924867928, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004625250876415521, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 2267.0, "completions/mean_terminated_length": 2267.0, "completions/min_length": 1508.0, "completions/min_terminated_length": 1508.0, "entropy": 2.2084928154945374, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.00787295587360859, "kl": 0.0015827158640604466, "learning_rate": 9.142857142857144e-06, "loss": -0.0001, "num_tokens": 1367561.0, "reward": -5.180222988128662, "reward_std": 17.71061134338379, "rewards/rollout_reward_func/mean": -5.180222988128662, "rewards/rollout_reward_func/std": 20.794387817382812, "sampling/importance_sampling_ratio/max": 0.011497444473206997, "sampling/importance_sampling_ratio/mean": 0.004206728655844927, "sampling/importance_sampling_ratio/min": 2.4751660744964696e-22, "sampling/sampling_logp_difference/max": 21.120386123657227, "sampling/sampling_logp_difference/mean": 0.29462122917175293, "step": 33, "step_time": 80.87171731000012 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0037393163074739277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005384053161833435, "entropy": 2.217702329158783, "epoch": 0.00034, "grad_norm": 0.01408409047871828, "kl": 0.002227816090453416, "learning_rate": 9.42857142857143e-06, "loss": -0.0001, "step": 34, "step_time": 8.029501602999972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 2335.4375, "completions/mean_terminated_length": 2335.4375, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "entropy": 2.327633857727051, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.01498799491673708, "kl": 0.0023266323769348674, "learning_rate": 9.714285714285715e-06, "loss": 0.0035, "num_tokens": 1463740.0, "reward": -9.247687339782715, "reward_std": 12.140151977539062, "rewards/rollout_reward_func/mean": -9.247687339782715, "rewards/rollout_reward_func/std": 13.351397514343262, "sampling/importance_sampling_ratio/max": 0.0675792247056961, "sampling/importance_sampling_ratio/mean": 0.005599465221166611, "sampling/importance_sampling_ratio/min": 7.004530503774031e-41, "sampling/sampling_logp_difference/max": 17.645164489746094, "sampling/sampling_logp_difference/mean": 0.38000231981277466, "step": 35, "step_time": 80.66461238600027 }, { "clip_ratio/high_max": 0.009387426427565515, "clip_ratio/high_mean": 0.0046937132137827575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0046937132137827575, "entropy": 2.3282170593738556, "epoch": 0.00036, "grad_norm": 0.016069183126091957, "kl": 0.0027750690060202032, "learning_rate": 1e-05, "loss": 0.0035, "step": 36, "step_time": 8.139321300999654 }, { "clip_ratio/high_max": 0.0018382353009656072, "clip_ratio/high_mean": 0.0009191176504828036, "clip_ratio/low_mean": 0.0010080644860863686, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019271821365691721, "completions/clipped_ratio": 0.03125, "completions/max_length": 2526.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 2198.28125, "completions/mean_terminated_length": 2192.419189453125, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "entropy": 2.2252254486083984, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.0396745391190052, "kl": 0.0024436857129330747, "learning_rate": 9.999999999948591e-06, "loss": 0.0037, "num_tokens": 1555280.0, "reward": -10.988698959350586, "reward_std": 11.484918594360352, "rewards/rollout_reward_func/mean": -10.988698959350586, "rewards/rollout_reward_func/std": 13.74112606048584, "sampling/importance_sampling_ratio/max": 0.1169009730219841, "sampling/importance_sampling_ratio/mean": 0.009122872725129128, "sampling/importance_sampling_ratio/min": 6.878056424215178e-17, "sampling/sampling_logp_difference/max": 17.073537826538086, "sampling/sampling_logp_difference/mean": 0.2771769165992737, "step": 37, "step_time": 79.2830818729999 }, { "clip_ratio/high_max": 0.001923076924867928, "clip_ratio/high_mean": 0.000961538462433964, "clip_ratio/low_mean": 0.0015243901871144772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002485928649548441, "entropy": 2.230591207742691, "epoch": 0.00038, "grad_norm": 0.0169509444385767, "kl": 0.002311948119313456, "learning_rate": 9.999999999794362e-06, "loss": 0.0037, "step": 38, "step_time": 8.018443904999458 }, { "clip_ratio/high_max": 0.003639505594037473, "clip_ratio/high_mean": 0.0018197527970187366, "clip_ratio/low_mean": 0.0008333333535119891, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026530861505307257, "completions/clipped_ratio": 0.0, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 2258.4375, "completions/mean_terminated_length": 2258.4375, "completions/min_length": 1440.0, "completions/min_terminated_length": 1440.0, "entropy": 2.2081351578235626, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.009336345829069614, "kl": 0.0021164097925066017, "learning_rate": 9.999999999537309e-06, "loss": 0.0004, "num_tokens": 1649454.0, "reward": -10.517179489135742, "reward_std": 9.59807014465332, "rewards/rollout_reward_func/mean": -10.517179489135742, "rewards/rollout_reward_func/std": 9.964343070983887, "sampling/importance_sampling_ratio/max": 0.012029355391860008, "sampling/importance_sampling_ratio/mean": 0.003381735645234585, "sampling/importance_sampling_ratio/min": 1.5594409263266153e-17, "sampling/sampling_logp_difference/max": 18.87306022644043, "sampling/sampling_logp_difference/mean": 0.362891286611557, "step": 39, "step_time": 80.33267479000028 }, { "clip_ratio/high_max": 0.009275426273234189, "clip_ratio/high_mean": 0.005543510254938155, "clip_ratio/low_mean": 0.0009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006520072813145816, "entropy": 2.216031640768051, "epoch": 0.0004, "grad_norm": 0.010434896685183048, "kl": 0.001769829061231576, "learning_rate": 9.999999999177437e-06, "loss": 0.0004, "step": 40, "step_time": 8.050674242000014 }, { "clip_ratio/high_max": 0.00331838964484632, "clip_ratio/high_mean": 0.00165919482242316, "clip_ratio/low_mean": 0.001736446050927043, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003395640873350203, "completions/clipped_ratio": 0.0, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 2683.46875, "completions/mean_terminated_length": 2683.46875, "completions/min_length": 2241.0, "completions/min_terminated_length": 2241.0, "entropy": 2.259254366159439, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.012253516353666782, "kl": 0.0026411395665491, "learning_rate": 9.999999998714745e-06, "loss": 0.0007, "num_tokens": 1756670.0, "reward": -11.294092178344727, "reward_std": 12.929180145263672, "rewards/rollout_reward_func/mean": -11.294092178344727, "rewards/rollout_reward_func/std": 12.838841438293457, "sampling/importance_sampling_ratio/max": 0.005970899015665054, "sampling/importance_sampling_ratio/mean": 0.0017320181941613555, "sampling/importance_sampling_ratio/min": 1.393524575839586e-16, "sampling/sampling_logp_difference/max": 12.961386680603027, "sampling/sampling_logp_difference/mean": 0.3150481879711151, "step": 41, "step_time": 91.85784664900052 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017123287543654442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00344843982020393, "entropy": 2.2678737342357635, "epoch": 0.00042, "grad_norm": 0.009364346042275429, "kl": 0.002545906128943898, "learning_rate": 9.999999998149234e-06, "loss": 0.0007, "step": 42, "step_time": 8.943521398999792 }, { "clip_ratio/high_max": 0.0031066687079146504, "clip_ratio/high_mean": 0.0015533343539573252, "clip_ratio/low_mean": 0.0063974635559134185, "clip_ratio/low_min": 0.0017857142956927419, "clip_ratio/region_mean": 0.007950797851663083, "completions/clipped_ratio": 0.0, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 2525.125, "completions/mean_terminated_length": 2525.125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 2.2919381260871887, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.012219212017953396, "kl": 0.002040403662249446, "learning_rate": 9.999999997480901e-06, "loss": -0.0004, "num_tokens": 1858672.0, "reward": -11.096410751342773, "reward_std": 12.537565231323242, "rewards/rollout_reward_func/mean": -11.096410751342773, "rewards/rollout_reward_func/std": 12.965230941772461, "sampling/importance_sampling_ratio/max": 0.17080777883529663, "sampling/importance_sampling_ratio/mean": 0.0070959883742034435, "sampling/importance_sampling_ratio/min": 2.0908910633189203e-31, "sampling/sampling_logp_difference/max": 18.829500198364258, "sampling/sampling_logp_difference/mean": 0.3667473793029785, "step": 43, "step_time": 91.86805722899953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007183908019214869, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007183908019214869, "entropy": 2.2995532751083374, "epoch": 0.00044, "grad_norm": 0.011969598941504955, "kl": 0.0019854862766806036, "learning_rate": 9.999999996709749e-06, "loss": -0.0004, "step": 44, "step_time": 9.18175687799976 }, { "clip_ratio/high_max": 0.012487898580729961, "clip_ratio/high_mean": 0.007980060297995806, "clip_ratio/low_mean": 0.0025955072487704456, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010575567546766251, "completions/clipped_ratio": 0.03125, "completions/max_length": 2880.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 2663.9375, "completions/mean_terminated_length": 2665.419189453125, "completions/min_length": 2200.0, "completions/min_terminated_length": 2200.0, "entropy": 2.3340508341789246, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.02003672532737255, "kl": 0.002692480251425877, "learning_rate": 9.999999995835775e-06, "loss": 0.0006, "num_tokens": 1965272.0, "reward": -13.600004196166992, "reward_std": 14.701010704040527, "rewards/rollout_reward_func/mean": -13.600004196166992, "rewards/rollout_reward_func/std": 14.582523345947266, "sampling/importance_sampling_ratio/max": 0.005967509467154741, "sampling/importance_sampling_ratio/mean": 0.001800880883820355, "sampling/importance_sampling_ratio/min": 3.8321957037099815e-21, "sampling/sampling_logp_difference/max": 16.676918029785156, "sampling/sampling_logp_difference/mean": 0.38897034525871277, "step": 45, "step_time": 90.54392121900082 }, { "clip_ratio/high_max": 0.0064415192464366555, "clip_ratio/high_mean": 0.00408881512703374, "clip_ratio/low_mean": 0.0017152255750261247, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005804040702059865, "entropy": 2.334753155708313, "epoch": 0.00046, "grad_norm": 0.009386158548295498, "kl": 0.0021710961300414056, "learning_rate": 9.999999994858982e-06, "loss": 0.0006, "step": 46, "step_time": 10.22533744600014 }, { "clip_ratio/high_max": 0.003598798648454249, "clip_ratio/high_mean": 0.002667454886250198, "clip_ratio/low_mean": 0.0009057971183210611, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003573252004571259, "completions/clipped_ratio": 0.0, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 2587.71875, "completions/mean_terminated_length": 2587.71875, "completions/min_length": 1094.0, "completions/min_terminated_length": 1094.0, "entropy": 2.2846151292324066, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.01407544780522585, "kl": 0.003350261024024803, "learning_rate": 9.999999993779367e-06, "loss": -0.0001, "num_tokens": 2069829.0, "reward": -9.709592819213867, "reward_std": 8.844062805175781, "rewards/rollout_reward_func/mean": -9.709592819213867, "rewards/rollout_reward_func/std": 9.250896453857422, "sampling/importance_sampling_ratio/max": 0.03837426379323006, "sampling/importance_sampling_ratio/mean": 0.003228831337764859, "sampling/importance_sampling_ratio/min": 1.3482284797513267e-27, "sampling/sampling_logp_difference/max": 19.295942306518555, "sampling/sampling_logp_difference/mean": 0.35356980562210083, "step": 47, "step_time": 92.8158456350011 }, { "clip_ratio/high_max": 0.008580301189795136, "clip_ratio/high_mean": 0.004290150594897568, "clip_ratio/low_mean": 0.001728165545500815, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006018316023983061, "entropy": 2.2845455408096313, "epoch": 0.00048, "grad_norm": 0.013240635395050049, "kl": 0.003361153867444955, "learning_rate": 9.999999992596935e-06, "loss": -0.0001, "step": 48, "step_time": 8.908816245000708 }, { "clip_ratio/high_max": 0.0016025641234591603, "clip_ratio/high_mean": 0.0008012820617295802, "clip_ratio/low_mean": 0.0010775862028822303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018788682646118104, "completions/clipped_ratio": 0.03125, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 2473.59375, "completions/mean_terminated_length": 2460.774169921875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 2.234508216381073, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.013633599504828453, "kl": 0.0016216770527535118, "learning_rate": 9.999999991311679e-06, "loss": -0.0006, "num_tokens": 2169988.0, "reward": -11.157156944274902, "reward_std": 9.950779914855957, "rewards/rollout_reward_func/mean": -11.157156944274902, "rewards/rollout_reward_func/std": 11.374809265136719, "sampling/importance_sampling_ratio/max": 0.28713250160217285, "sampling/importance_sampling_ratio/mean": 0.018523240461945534, "sampling/importance_sampling_ratio/min": 2.006449198810831e-20, "sampling/sampling_logp_difference/max": 18.636167526245117, "sampling/sampling_logp_difference/mean": 0.3026620149612427, "step": 49, "step_time": 87.0453093810006 }, { "clip_ratio/high_max": 0.009976116823963821, "clip_ratio/high_mean": 0.006601028784643859, "clip_ratio/low_mean": 0.002857419603969902, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009458448505029082, "entropy": 2.2327195405960083, "epoch": 0.0005, "grad_norm": 0.013543589971959591, "kl": 0.001988013260415755, "learning_rate": 9.999999989923604e-06, "loss": -0.0006, "step": 50, "step_time": 10.626580007000484 }, { "clip_ratio/high_max": 0.0028511597774922848, "clip_ratio/high_mean": 0.0014255798887461424, "clip_ratio/low_mean": 0.0024358974769711494, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003861477307509631, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 2764.375, "completions/mean_terminated_length": 2764.375, "completions/min_length": 2371.0, "completions/min_terminated_length": 2371.0, "entropy": 2.372197538614273, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.00828390009701252, "kl": 0.002092195674777031, "learning_rate": 9.999999988432709e-06, "loss": -0.0009, "num_tokens": 2279279.0, "reward": -9.928876876831055, "reward_std": 11.465417861938477, "rewards/rollout_reward_func/mean": -9.928876876831055, "rewards/rollout_reward_func/std": 12.258295059204102, "sampling/importance_sampling_ratio/max": 0.005203698296099901, "sampling/importance_sampling_ratio/mean": 0.0008916730294004083, "sampling/importance_sampling_ratio/min": 6.668663636418266e-29, "sampling/sampling_logp_difference/max": 15.371074676513672, "sampling/sampling_logp_difference/mean": 0.41038042306900024, "step": 51, "step_time": 97.36285335000048 }, { "clip_ratio/high_max": 0.004987157532013953, "clip_ratio/high_mean": 0.0024935787660069764, "clip_ratio/low_mean": 0.002425754675641656, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004919333383440971, "entropy": 2.3701044023036957, "epoch": 0.00052, "grad_norm": 0.009349919855594635, "kl": 0.0021661788632627577, "learning_rate": 9.999999986838993e-06, "loss": -0.001, "step": 52, "step_time": 9.234601858999667 }, { "clip_ratio/high_max": 0.0029559049289673567, "clip_ratio/high_mean": 0.0014779524644836783, "clip_ratio/low_mean": 0.0007022471982054412, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021801996626891196, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/max_terminated_length": 3143.0, "completions/mean_length": 2911.6875, "completions/mean_terminated_length": 2911.6875, "completions/min_length": 1747.0, "completions/min_terminated_length": 1747.0, "entropy": 2.095397859811783, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.007544730789959431, "kl": 0.0014459962039836682, "learning_rate": 9.999999985142457e-06, "loss": 0.0005, "num_tokens": 2393822.0, "reward": -9.227861404418945, "reward_std": 13.615285873413086, "rewards/rollout_reward_func/mean": -9.227861404418945, "rewards/rollout_reward_func/std": 13.280508995056152, "sampling/importance_sampling_ratio/max": 0.00818893313407898, "sampling/importance_sampling_ratio/mean": 0.0016003338387236, "sampling/importance_sampling_ratio/min": 1.1838428666831955e-38, "sampling/sampling_logp_difference/max": 19.036222457885742, "sampling/sampling_logp_difference/mean": 0.27891838550567627, "step": 53, "step_time": 94.85144506799998 }, { "clip_ratio/high_max": 0.007602313125971705, "clip_ratio/high_mean": 0.004487969708861783, "clip_ratio/low_mean": 0.0009469697251915932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434939434053376, "entropy": 2.092746779322624, "epoch": 0.00054, "grad_norm": 0.007749093230813742, "kl": 0.001536271462100558, "learning_rate": 9.999999983343101e-06, "loss": 0.0005, "step": 54, "step_time": 10.820570559000771 }, { "clip_ratio/high_max": 0.0016666667070239782, "clip_ratio/high_mean": 0.0008333333535119891, "clip_ratio/low_mean": 0.0015822785208001733, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024156119325198233, "completions/clipped_ratio": 0.03125, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 2945.40625, "completions/mean_terminated_length": 2943.9677734375, "completions/min_length": 2679.0, "completions/min_terminated_length": 2679.0, "entropy": 2.1801984012126923, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.006448639556765556, "kl": 0.0018499534926377237, "learning_rate": 9.999999981440923e-06, "loss": -0.0001, "num_tokens": 2508892.0, "reward": -13.496590614318848, "reward_std": 9.91647720336914, "rewards/rollout_reward_func/mean": -13.496590614318848, "rewards/rollout_reward_func/std": 10.541536331176758, "sampling/importance_sampling_ratio/max": 0.004021179396659136, "sampling/importance_sampling_ratio/mean": 0.0013151702005416155, "sampling/importance_sampling_ratio/min": 1.5261994142437563e-12, "sampling/sampling_logp_difference/max": 8.930527687072754, "sampling/sampling_logp_difference/mean": 0.2391294538974762, "step": 55, "step_time": 97.94088426899998 }, { "clip_ratio/high_max": 0.004934780183248222, "clip_ratio/high_mean": 0.0032585294102318585, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032585294102318585, "entropy": 2.1794978380203247, "epoch": 0.00056, "grad_norm": 0.006780738476663828, "kl": 0.0018694552418310195, "learning_rate": 9.999999979435926e-06, "loss": -0.0001, "step": 56, "step_time": 9.654078997999477 }, { "clip_ratio/high_max": 0.005736267426982522, "clip_ratio/high_mean": 0.002868133713491261, "clip_ratio/low_mean": 0.002268664597067982, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005136798310559243, "completions/clipped_ratio": 0.03125, "completions/max_length": 3438.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 3171.25, "completions/mean_terminated_length": 3164.322509765625, "completions/min_length": 2483.0, "completions/min_terminated_length": 2483.0, "entropy": 2.1486852020025253, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.005101061426103115, "kl": 0.001483209984144196, "learning_rate": 9.999999977328107e-06, "loss": 0.0002, "num_tokens": 2631790.0, "reward": -14.85343074798584, "reward_std": 14.20746898651123, "rewards/rollout_reward_func/mean": -14.85343074798584, "rewards/rollout_reward_func/std": 14.325145721435547, "sampling/importance_sampling_ratio/max": 0.005566018167883158, "sampling/importance_sampling_ratio/mean": 0.0012064384063705802, "sampling/importance_sampling_ratio/min": 5.148599772634619e-16, "sampling/sampling_logp_difference/max": 12.547440528869629, "sampling/sampling_logp_difference/mean": 0.24704553186893463, "step": 57, "step_time": 104.96052551200137 }, { "clip_ratio/high_max": 0.004558469052426517, "clip_ratio/high_mean": 0.0022792345262132585, "clip_ratio/low_mean": 0.0007716049440205097, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030508394702337682, "entropy": 2.1473026871681213, "epoch": 0.00058, "grad_norm": 0.004236625507473946, "kl": 0.001287619597860612, "learning_rate": 9.99999997511747e-06, "loss": 0.0002, "step": 58, "step_time": 11.613219467000363 }, { "clip_ratio/high_max": 0.0030487803742289543, "clip_ratio/high_mean": 0.0015243901871144772, "clip_ratio/low_mean": 0.005422163347247988, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006946553534362465, "completions/clipped_ratio": 0.0, "completions/max_length": 3445.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 3177.1875, "completions/mean_terminated_length": 3177.1875, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 2.161461815237999, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 0.012280529364943504, "kl": 0.002224226642283611, "learning_rate": 9.999999972804012e-06, "loss": -0.0006, "num_tokens": 2754194.0, "reward": -13.905904769897461, "reward_std": 10.438383102416992, "rewards/rollout_reward_func/mean": -13.905904769897461, "rewards/rollout_reward_func/std": 11.29019832611084, "sampling/importance_sampling_ratio/max": 0.02268226072192192, "sampling/importance_sampling_ratio/mean": 0.0018939973087981343, "sampling/importance_sampling_ratio/min": 2.697007293984262e-28, "sampling/sampling_logp_difference/max": 17.155614852905273, "sampling/sampling_logp_difference/mean": 0.2826978266239166, "step": 59, "step_time": 100.61191374800092 }, { "clip_ratio/high_max": 0.008298700326122344, "clip_ratio/high_mean": 0.004149350163061172, "clip_ratio/low_mean": 0.006984907551668584, "clip_ratio/low_min": 0.0015432098880410194, "clip_ratio/region_mean": 0.011134257889352739, "entropy": 2.1591842770576477, "epoch": 0.0006, "grad_norm": 0.01143638975918293, "kl": 0.0023646633271710016, "learning_rate": 9.999999970387732e-06, "loss": -0.0006, "step": 60, "step_time": 10.382511724000324 }, { "clip_ratio/high_max": 0.0015432098880410194, "clip_ratio/high_mean": 0.0007716049440205097, "clip_ratio/low_mean": 0.002259911096189171, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003031516040209681, "completions/clipped_ratio": 0.0, "completions/max_length": 3416.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 3135.84375, "completions/mean_terminated_length": 3135.84375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 2.217515081167221, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.006441728211939335, "kl": 0.0010476857314642984, "learning_rate": 9.999999967868633e-06, "loss": 0.0009, "num_tokens": 2876044.0, "reward": -14.489093780517578, "reward_std": 11.084500312805176, "rewards/rollout_reward_func/mean": -14.489093780517578, "rewards/rollout_reward_func/std": 12.003732681274414, "sampling/importance_sampling_ratio/max": 0.25691941380500793, "sampling/importance_sampling_ratio/mean": 0.008919022977352142, "sampling/importance_sampling_ratio/min": 6.2882773049331024e-24, "sampling/sampling_logp_difference/max": 12.479897499084473, "sampling/sampling_logp_difference/mean": 0.29724666476249695, "step": 61, "step_time": 102.78593670800046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007812500116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007812500116415322, "entropy": 2.212161585688591, "epoch": 0.00062, "grad_norm": 0.005762570537626743, "kl": 0.001050639031745959, "learning_rate": 9.999999965246713e-06, "loss": 0.0009, "step": 62, "step_time": 10.780980200999693 }, { "clip_ratio/high_max": 0.00550881412345916, "clip_ratio/high_mean": 0.00275440706172958, "clip_ratio/low_mean": 0.0014889392768964171, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004243346338625997, "completions/clipped_ratio": 0.0, "completions/max_length": 3413.0, "completions/max_terminated_length": 3413.0, "completions/mean_length": 2979.78125, "completions/mean_terminated_length": 2979.78125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 2.111812949180603, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.03990187868475914, "kl": 0.001456589438021183, "learning_rate": 9.999999962521974e-06, "loss": -0.0043, "num_tokens": 2992561.0, "reward": -9.381729125976562, "reward_std": 10.014938354492188, "rewards/rollout_reward_func/mean": -9.381729125976562, "rewards/rollout_reward_func/std": 10.544285774230957, "sampling/importance_sampling_ratio/max": 0.23116491734981537, "sampling/importance_sampling_ratio/mean": 0.008492819964885712, "sampling/importance_sampling_ratio/min": 1.2417058145283537e-26, "sampling/sampling_logp_difference/max": 17.79161262512207, "sampling/sampling_logp_difference/mean": 0.31178855895996094, "step": 63, "step_time": 105.0415441539999 }, { "clip_ratio/high_max": 0.004596683429554105, "clip_ratio/high_mean": 0.0022983417147770524, "clip_ratio/low_mean": 0.002260544220916927, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004558885877486318, "entropy": 2.108848959207535, "epoch": 0.00064, "grad_norm": 0.039187680929899216, "kl": 0.00118074486090336, "learning_rate": 9.999999959694412e-06, "loss": -0.0042, "step": 64, "step_time": 10.22475211300025 }, { "clip_ratio/high_max": 0.0014204545877873898, "clip_ratio/high_mean": 0.0014204545877873898, "clip_ratio/low_mean": 0.0014124744920991361, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002832929079886526, "completions/clipped_ratio": 0.0, "completions/max_length": 3596.0, "completions/max_terminated_length": 3596.0, "completions/mean_length": 3147.21875, "completions/mean_terminated_length": 3147.21875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 2.150854080915451, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.009450007230043411, "kl": 0.0011926015722565353, "learning_rate": 9.999999956764034e-06, "loss": -0.0003, "num_tokens": 3114343.0, "reward": -11.12032413482666, "reward_std": 18.984477996826172, "rewards/rollout_reward_func/mean": -11.12032413482666, "rewards/rollout_reward_func/std": 21.110273361206055, "sampling/importance_sampling_ratio/max": 0.04695241525769234, "sampling/importance_sampling_ratio/mean": 0.002132077468559146, "sampling/importance_sampling_ratio/min": 4.1585366301487205e-15, "sampling/sampling_logp_difference/max": 12.402358055114746, "sampling/sampling_logp_difference/mean": 0.2630866467952728, "step": 65, "step_time": 106.09026804299992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006793478387407959, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006793478387407959, "entropy": 2.1468848437070847, "epoch": 0.00066, "grad_norm": 0.006407527253031731, "kl": 0.0014346542666316964, "learning_rate": 9.999999953730833e-06, "loss": -0.0003, "step": 66, "step_time": 10.735191880000002 }, { "clip_ratio/high_max": 0.0025255101500079036, "clip_ratio/high_mean": 0.0012627550750039518, "clip_ratio/low_mean": 0.0006443298771046102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001907084952108562, "completions/clipped_ratio": 0.0, "completions/max_length": 3504.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 2932.6875, "completions/mean_terminated_length": 2932.6875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 2.0600955486297607, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.018519844859838486, "kl": 0.001339123715297319, "learning_rate": 9.99999995059481e-06, "loss": 0.0045, "num_tokens": 3229150.0, "reward": -14.67924690246582, "reward_std": 13.668952941894531, "rewards/rollout_reward_func/mean": -14.67924690246582, "rewards/rollout_reward_func/std": 13.640398979187012, "sampling/importance_sampling_ratio/max": 0.251010000705719, "sampling/importance_sampling_ratio/mean": 0.013134480454027653, "sampling/importance_sampling_ratio/min": 3.752377749352698e-26, "sampling/sampling_logp_difference/max": 17.058979034423828, "sampling/sampling_logp_difference/mean": 0.27970272302627563, "step": 67, "step_time": 98.07139246399856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014369714772328734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014369714772328734, "entropy": 2.056602507829666, "epoch": 0.00068, "grad_norm": 0.018268967047333717, "kl": 0.0017321475461358204, "learning_rate": 9.99999994735597e-06, "loss": 0.0045, "step": 68, "step_time": 10.463535391000278 }, { "clip_ratio/high_max": 0.0014367816038429737, "clip_ratio/high_mean": 0.0013193523045629263, "clip_ratio/low_mean": 0.00210745120421052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003426803508773446, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2997.875, "completions/mean_terminated_length": 2997.875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 2.179373413324356, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.021575244143605232, "kl": 0.0025506179299554788, "learning_rate": 9.999999944014306e-06, "loss": 0.003, "num_tokens": 3346039.0, "reward": -12.414968490600586, "reward_std": 17.504671096801758, "rewards/rollout_reward_func/mean": -12.414968490600586, "rewards/rollout_reward_func/std": 18.227092742919922, "sampling/importance_sampling_ratio/max": 0.16923412680625916, "sampling/importance_sampling_ratio/mean": 0.007539688143879175, "sampling/importance_sampling_ratio/min": 5.152622264460075e-33, "sampling/sampling_logp_difference/max": 20.113666534423828, "sampling/sampling_logp_difference/mean": 0.35040098428726196, "step": 69, "step_time": 98.19044223200035 }, { "clip_ratio/high_max": 0.006122596096247435, "clip_ratio/high_mean": 0.0036622595507651567, "clip_ratio/low_mean": 0.0029755067662335932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00663776631699875, "entropy": 2.1783816814422607, "epoch": 0.0007, "grad_norm": 0.014992612414062023, "kl": 0.0022299339179880917, "learning_rate": 9.999999940569825e-06, "loss": 0.003, "step": 70, "step_time": 10.582511048000015 }, { "clip_ratio/high_max": 0.0025389643851667643, "clip_ratio/high_mean": 0.0012694821925833821, "clip_ratio/low_mean": 0.0027696280158124864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004039110150188208, "completions/clipped_ratio": 0.0, "completions/max_length": 3695.0, "completions/max_terminated_length": 3695.0, "completions/mean_length": 3466.03125, "completions/mean_terminated_length": 3466.03125, "completions/min_length": 2833.0, "completions/min_terminated_length": 2833.0, "entropy": 2.0841893553733826, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.015231302939355373, "kl": 0.002882544751628302, "learning_rate": 9.999999937022522e-06, "loss": -0.0002, "num_tokens": 3477790.0, "reward": -11.157001495361328, "reward_std": 9.339273452758789, "rewards/rollout_reward_func/mean": -11.157001495361328, "rewards/rollout_reward_func/std": 10.505151748657227, "sampling/importance_sampling_ratio/max": 0.002954554045572877, "sampling/importance_sampling_ratio/mean": 0.0008290851255878806, "sampling/importance_sampling_ratio/min": 4.875183740817929e-34, "sampling/sampling_logp_difference/max": 12.618881225585938, "sampling/sampling_logp_difference/mean": 0.25001800060272217, "step": 71, "step_time": 109.58023633999892 }, { "clip_ratio/high_max": 0.0010683761211112142, "clip_ratio/high_mean": 0.0005341880605556071, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005341880605556071, "entropy": 2.0846132934093475, "epoch": 0.00072, "grad_norm": 0.009933187626302242, "kl": 0.002236059895949438, "learning_rate": 9.999999933372398e-06, "loss": -0.0002, "step": 72, "step_time": 10.986284594999688 }, { "clip_ratio/high_max": 0.010420707054436207, "clip_ratio/high_mean": 0.006717559706885368, "clip_ratio/low_mean": 0.006201559328474104, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012919119151774794, "completions/clipped_ratio": 0.0, "completions/max_length": 3855.0, "completions/max_terminated_length": 3855.0, "completions/mean_length": 3299.0, "completions/mean_terminated_length": 3299.0, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "entropy": 2.229044407606125, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.014358018524944782, "kl": 0.0021708139684051275, "learning_rate": 9.999999929619456e-06, "loss": -0.0008, "num_tokens": 3603657.0, "reward": -13.489961624145508, "reward_std": 14.68436336517334, "rewards/rollout_reward_func/mean": -13.489961624145508, "rewards/rollout_reward_func/std": 15.650424003601074, "sampling/importance_sampling_ratio/max": 0.059434566646814346, "sampling/importance_sampling_ratio/mean": 0.002726445673033595, "sampling/importance_sampling_ratio/min": 5.1101291228379537e-39, "sampling/sampling_logp_difference/max": 17.89449691772461, "sampling/sampling_logp_difference/mean": 0.3607054650783539, "step": 73, "step_time": 111.9296666260002 }, { "clip_ratio/high_max": 0.010632613790221512, "clip_ratio/high_mean": 0.0059673485811799765, "clip_ratio/low_mean": 0.0011278195888735354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007095168228261173, "entropy": 2.2271364331245422, "epoch": 0.00074, "grad_norm": 0.01777312532067299, "kl": 0.0019678229436976835, "learning_rate": 9.99999992576369e-06, "loss": -0.0008, "step": 74, "step_time": 11.365622766999422 }, { "clip_ratio/high_max": 0.007670001010410488, "clip_ratio/high_mean": 0.004492895153816789, "clip_ratio/low_mean": 0.005284888495225459, "clip_ratio/low_min": 0.0012886597542092204, "clip_ratio/region_mean": 0.009777783416211605, "completions/clipped_ratio": 0.03125, "completions/max_length": 3973.0, "completions/max_terminated_length": 3973.0, "completions/mean_length": 3476.15625, "completions/mean_terminated_length": 3471.67724609375, "completions/min_length": 1666.0, "completions/min_terminated_length": 1666.0, "entropy": 2.1275693476200104, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.00603072764351964, "kl": 0.0015271206430043094, "learning_rate": 9.999999921805106e-06, "loss": -0.0, "num_tokens": 3735493.0, "reward": -10.968659400939941, "reward_std": 9.847198486328125, "rewards/rollout_reward_func/mean": -10.968659400939941, "rewards/rollout_reward_func/std": 9.763160705566406, "sampling/importance_sampling_ratio/max": 0.007295163348317146, "sampling/importance_sampling_ratio/mean": 0.0008178789867088199, "sampling/importance_sampling_ratio/min": 3.297161219819182e-30, "sampling/sampling_logp_difference/max": 16.79660987854004, "sampling/sampling_logp_difference/mean": 0.2770848870277405, "step": 75, "step_time": 116.77516848400046 } ], "logging_steps": 1.0, "max_steps": 600000, "num_input_tokens_seen": 3735493, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }