{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0002, "grad_norm": 0.4551391005516052, "kl": 0.0006267073404160328, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 8600.0, "reward": 0.7076416015625, "reward_std": 0.014151658862829208, "rewards//mean": 0.7076416015625, "rewards//std": 0.0565522275865078, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0004, "grad_norm": 0.4617377817630768, "kl": 0.0006313717240118422, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "num_tokens": 17200.0, "reward": 0.72869873046875, "reward_std": 0.012252680957317352, "rewards//mean": 0.72869873046875, "rewards//std": 0.06718378514051437, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0006, "grad_norm": 0.47510647773742676, "kl": 0.0007176821964094415, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "num_tokens": 25872.0, "reward": 0.7418212890625, "reward_std": 0.01208210177719593, "rewards//mean": 0.7418212890625, "rewards//std": 0.05206342041492462, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0008, "grad_norm": 0.4658019244670868, "kl": 0.0007370488237938844, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "num_tokens": 34600.0, "reward": 0.70538330078125, "reward_std": 0.015659615397453308, "rewards//mean": 0.70538330078125, "rewards//std": 0.05750183388590813, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.001, "grad_norm": 0.4073522984981537, "kl": 0.0006538679663208313, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "num_tokens": 43304.0, "reward": 0.71893310546875, "reward_std": 0.014116497710347176, "rewards//mean": 0.71893310546875, "rewards//std": 0.06529892981052399, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0012, "grad_norm": 0.45524272322654724, "kl": 0.0006982390259508975, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "num_tokens": 51992.0, "reward": 0.71209716796875, "reward_std": 0.017292022705078125, "rewards//mean": 0.71209716796875, "rewards//std": 0.060261402279138565, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0014, "grad_norm": 0.44682666659355164, "kl": 0.0009362539058201946, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "num_tokens": 60696.0, "reward": 0.7254638671875, "reward_std": 0.014938775449991226, "rewards//mean": 0.7254638671875, "rewards//std": 0.04342832788825035, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0016, "grad_norm": 0.453555703163147, "kl": 0.0006852814112789929, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "num_tokens": 69336.0, "reward": 0.72991943359375, "reward_std": 0.01723172701895237, "rewards//mean": 0.72991943359375, "rewards//std": 0.05862419679760933, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0018, "grad_norm": 0.49192875623703003, "kl": 0.000731307256501168, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "num_tokens": 78008.0, "reward": 0.70782470703125, "reward_std": 0.018347103148698807, "rewards//mean": 0.70782470703125, "rewards//std": 0.058675043284893036, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.002, "grad_norm": 0.4813508093357086, "kl": 0.0007482333021471277, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "num_tokens": 86648.0, "reward": 0.678955078125, "reward_std": 0.012709339149296284, "rewards//mean": 0.678955078125, "rewards//std": 0.05669618770480156, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0022, "grad_norm": 0.47774913907051086, "kl": 0.0006810418926761486, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "num_tokens": 95456.0, "reward": 0.70257568359375, "reward_std": 0.016511568799614906, "rewards//mean": 0.70257568359375, "rewards//std": 0.05309594050049782, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0024, "grad_norm": 0.45333075523376465, "kl": 0.0007765620030113496, "learning_rate": 1.1e-06, "loss": 0.0, "num_tokens": 104112.0, "reward": 0.701904296875, "reward_std": 0.014031937345862389, "rewards//mean": 0.701904296875, "rewards//std": 0.04883308708667755, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0026, "grad_norm": 0.4627256989479065, "kl": 0.0007952243977342732, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "num_tokens": 112728.0, "reward": 0.7293701171875, "reward_std": 0.013355368748307228, "rewards//mean": 0.7293701171875, "rewards//std": 0.048412635922431946, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0028, "grad_norm": 0.529376745223999, "kl": 0.0007697402106714435, "learning_rate": 1.3e-06, "loss": 0.0, "num_tokens": 121352.0, "reward": 0.75030517578125, "reward_std": 0.017826953902840614, "rewards//mean": 0.75030517578125, "rewards//std": 0.044097088277339935, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.003, "grad_norm": 0.5399572253227234, "kl": 0.0007980391455930658, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "num_tokens": 130072.0, "reward": 0.7301025390625, "reward_std": 0.016819626092910767, "rewards//mean": 0.7301025390625, "rewards//std": 0.06331897526979446, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0032, "grad_norm": 0.48803454637527466, "kl": 0.0007127898643375374, "learning_rate": 1.5e-06, "loss": 0.0, "num_tokens": 138680.0, "reward": 0.72723388671875, "reward_std": 0.01550104096531868, "rewards//mean": 0.72723388671875, "rewards//std": 0.05276963487267494, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0034, "grad_norm": 0.4796734154224396, "kl": 0.0007568690125481226, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "num_tokens": 147320.0, "reward": 0.67901611328125, "reward_std": 0.012344243004918098, "rewards//mean": 0.67901611328125, "rewards//std": 0.05607627332210541, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0036, "grad_norm": 0.4370866119861603, "kl": 0.0007120481022866443, "learning_rate": 1.7000000000000002e-06, "loss": 0.0, "num_tokens": 155984.0, "reward": 0.7203369140625, "reward_std": 0.010229130275547504, "rewards//mean": 0.7203369140625, "rewards//std": 0.04977063462138176, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0038, "grad_norm": 0.47938504815101624, "kl": 0.0007277187978615984, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "num_tokens": 164608.0, "reward": 0.702392578125, "reward_std": 0.020038627088069916, "rewards//mean": 0.702392578125, "rewards//std": 0.06601254642009735, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.004, "grad_norm": 0.5836920142173767, "kl": 0.000848565723572392, "learning_rate": 1.9000000000000002e-06, "loss": 0.0, "num_tokens": 173168.0, "reward": 0.71051025390625, "reward_std": 0.016375649720430374, "rewards//mean": 0.71051025390625, "rewards//std": 0.0618068166077137, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0042, "grad_norm": 0.4508132040500641, "kl": 0.0007047736216918565, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "num_tokens": 181792.0, "reward": 0.71771240234375, "reward_std": 0.01673172414302826, "rewards//mean": 0.71771240234375, "rewards//std": 0.06672913581132889, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0044, "grad_norm": 0.4673251509666443, "kl": 0.0007006596788414754, "learning_rate": 2.1000000000000002e-06, "loss": 0.0, "num_tokens": 190392.0, "reward": 0.72503662109375, "reward_std": 0.016404539346694946, "rewards//mean": 0.72503662109375, "rewards//std": 0.04331747442483902, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0046, "grad_norm": 0.752646267414093, "kl": 0.0010592954204184934, "learning_rate": 2.2e-06, "loss": 0.0, "num_tokens": 199080.0, "reward": 0.70477294921875, "reward_std": 0.013751041144132614, "rewards//mean": 0.70477294921875, "rewards//std": 0.05169031769037247, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0048, "grad_norm": 0.5306899547576904, "kl": 0.000849992771691177, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "num_tokens": 207720.0, "reward": 0.7218017578125, "reward_std": 0.017207415774464607, "rewards//mean": 0.7218017578125, "rewards//std": 0.0644102469086647, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.005, "grad_norm": 0.44910290837287903, "kl": 0.0007659951588721015, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "num_tokens": 216296.0, "reward": 0.70526123046875, "reward_std": 0.01439366303384304, "rewards//mean": 0.70526123046875, "rewards//std": 0.060455020517110825, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0052, "grad_norm": 0.5068986415863037, "kl": 0.0008941922133089975, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 224968.0, "reward": 0.732421875, "reward_std": 0.017151644453406334, "rewards//mean": 0.732421875, "rewards//std": 0.05576201528310776, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0054, "grad_norm": 0.5920788645744324, "kl": 0.0007423345305141993, "learning_rate": 2.6e-06, "loss": 0.0, "num_tokens": 233528.0, "reward": 0.6807861328125, "reward_std": 0.018578065559267998, "rewards//mean": 0.6807861328125, "rewards//std": 0.0869307816028595, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0056, "grad_norm": 0.5785125494003296, "kl": 0.0010453350987518206, "learning_rate": 2.7000000000000004e-06, "loss": 0.0, "num_tokens": 242280.0, "reward": 0.68853759765625, "reward_std": 0.016351919621229172, "rewards//mean": 0.68853759765625, "rewards//std": 0.06393762677907944, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0058, "grad_norm": 0.503767728805542, "kl": 0.0008367645714315586, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "num_tokens": 250976.0, "reward": 0.71185302734375, "reward_std": 0.01879560574889183, "rewards//mean": 0.71185302734375, "rewards//std": 0.05082531273365021, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.006, "grad_norm": 0.5023078918457031, "kl": 0.0007173106641857885, "learning_rate": 2.9e-06, "loss": 0.0, "num_tokens": 259632.0, "reward": 0.70556640625, "reward_std": 0.017621025443077087, "rewards//mean": 0.70556640625, "rewards//std": 0.0475139394402504, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0062, "grad_norm": 0.4851153492927551, "kl": 0.0007923798766569234, "learning_rate": 3e-06, "loss": 0.0, "num_tokens": 268208.0, "reward": 0.7398681640625, "reward_std": 0.01781066693365574, "rewards//mean": 0.7398681640625, "rewards//std": 0.05101071298122406, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0064, "grad_norm": 0.4202479422092438, "kl": 0.0007000941404839978, "learning_rate": 3.1000000000000004e-06, "loss": 0.0, "num_tokens": 276824.0, "reward": 0.711669921875, "reward_std": 0.014990320429205894, "rewards//mean": 0.711669921875, "rewards//std": 0.05982654541730881, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0066, "grad_norm": 0.5245676636695862, "kl": 0.0007394420899800025, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "num_tokens": 285520.0, "reward": 0.73822021484375, "reward_std": 0.014528016559779644, "rewards//mean": 0.73822021484375, "rewards//std": 0.05750894173979759, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0068, "grad_norm": 0.41506427526474, "kl": 0.0006638994673267007, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "num_tokens": 294144.0, "reward": 0.705078125, "reward_std": 0.018270526081323624, "rewards//mean": 0.705078125, "rewards//std": 0.05205077305436134, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.007, "grad_norm": 0.6635000705718994, "kl": 0.0008408781359321438, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "num_tokens": 302888.0, "reward": 0.7369384765625, "reward_std": 0.010665340349078178, "rewards//mean": 0.7369384765625, "rewards//std": 0.041731830686330795, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0072, "grad_norm": 0.5000082850456238, "kl": 0.0007324635953409597, "learning_rate": 3.5e-06, "loss": 0.0, "num_tokens": 311504.0, "reward": 0.7015380859375, "reward_std": 0.015993408858776093, "rewards//mean": 0.7015380859375, "rewards//std": 0.05921553075313568, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0074, "grad_norm": 0.4788602292537689, "kl": 0.0007234790246002376, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "num_tokens": 320208.0, "reward": 0.71978759765625, "reward_std": 0.015359117649495602, "rewards//mean": 0.71978759765625, "rewards//std": 0.050679761916399, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0076, "grad_norm": 0.5380129218101501, "kl": 0.0006790131737943739, "learning_rate": 3.7e-06, "loss": 0.0, "num_tokens": 328904.0, "reward": 0.71307373046875, "reward_std": 0.017655834555625916, "rewards//mean": 0.71307373046875, "rewards//std": 0.039130210876464844, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0078, "grad_norm": 0.47228410840034485, "kl": 0.0006949657399673015, "learning_rate": 3.8000000000000005e-06, "loss": 0.0, "num_tokens": 337632.0, "reward": 0.705322265625, "reward_std": 0.01650204323232174, "rewards//mean": 0.705322265625, "rewards//std": 0.0695849284529686, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.008, "grad_norm": 0.5206338763237, "kl": 0.0007481320935767144, "learning_rate": 3.900000000000001e-06, "loss": 0.0, "num_tokens": 346264.0, "reward": 0.69219970703125, "reward_std": 0.016963649541139603, "rewards//mean": 0.69219970703125, "rewards//std": 0.05523912236094475, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0082, "grad_norm": 0.49259892106056213, "kl": 0.0007038385519990698, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "num_tokens": 354912.0, "reward": 0.71923828125, "reward_std": 0.018238678574562073, "rewards//mean": 0.71923828125, "rewards//std": 0.060513366013765335, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0084, "grad_norm": 0.42862868309020996, "kl": 0.0006526454817503691, "learning_rate": 4.1e-06, "loss": 0.0, "num_tokens": 363544.0, "reward": 0.73712158203125, "reward_std": 0.012668399140238762, "rewards//mean": 0.73712158203125, "rewards//std": 0.05271969735622406, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0086, "grad_norm": 0.43543925881385803, "kl": 0.0007449448385159485, "learning_rate": 4.2000000000000004e-06, "loss": 0.0, "num_tokens": 372144.0, "reward": 0.73724365234375, "reward_std": 0.017016027122735977, "rewards//mean": 0.73724365234375, "rewards//std": 0.04933328554034233, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0088, "grad_norm": 0.5209327340126038, "kl": 0.0007311399604077451, "learning_rate": 4.3e-06, "loss": 0.0, "num_tokens": 380848.0, "reward": 0.7359619140625, "reward_std": 0.010918927378952503, "rewards//mean": 0.7359619140625, "rewards//std": 0.0476677305996418, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.009, "grad_norm": 0.5357726216316223, "kl": 0.0008118156329146586, "learning_rate": 4.4e-06, "loss": 0.0, "num_tokens": 389504.0, "reward": 0.66937255859375, "reward_std": 0.01259728241711855, "rewards//mean": 0.66937255859375, "rewards//std": 0.051660146564245224, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0092, "grad_norm": 0.45110824704170227, "kl": 0.0006952350158826448, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 398200.0, "reward": 0.724853515625, "reward_std": 0.013849527575075626, "rewards//mean": 0.724853515625, "rewards//std": 0.0702190026640892, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0094, "grad_norm": 0.47275251150131226, "kl": 0.0007834887073840946, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "num_tokens": 406848.0, "reward": 0.69769287109375, "reward_std": 0.015200081281363964, "rewards//mean": 0.69769287109375, "rewards//std": 0.07077126950025558, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0096, "grad_norm": 0.42265617847442627, "kl": 0.000766771991038695, "learning_rate": 4.7e-06, "loss": 0.0, "num_tokens": 415520.0, "reward": 0.70733642578125, "reward_std": 0.017255615442991257, "rewards//mean": 0.70733642578125, "rewards//std": 0.0646086260676384, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0098, "grad_norm": 0.46693459153175354, "kl": 0.0007007038075244054, "learning_rate": 4.800000000000001e-06, "loss": 0.0, "num_tokens": 424088.0, "reward": 0.7169189453125, "reward_std": 0.017792128026485443, "rewards//mean": 0.7169189453125, "rewards//std": 0.06479453295469284, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.01, "grad_norm": 0.46468374133110046, "kl": 0.0007802178952260874, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "num_tokens": 432696.0, "reward": 0.707275390625, "reward_std": 0.01039247214794159, "rewards//mean": 0.707275390625, "rewards//std": 0.06051086261868477, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0102, "grad_norm": 0.4285586476325989, "kl": 0.000754905202484224, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 441352.0, "reward": 0.72601318359375, "reward_std": 0.015443078242242336, "rewards//mean": 0.72601318359375, "rewards//std": 0.04845963418483734, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0104, "grad_norm": 0.46686625480651855, "kl": 0.00072819792694645, "learning_rate": 4.9999994965001495e-06, "loss": 0.0, "num_tokens": 449960.0, "reward": 0.7313232421875, "reward_std": 0.013708039186894894, "rewards//mean": 0.7313232421875, "rewards//std": 0.057744208723306656, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0106, "grad_norm": 0.5042674541473389, "kl": 0.0007860148616600782, "learning_rate": 4.999997986000801e-06, "loss": 0.0, "num_tokens": 458512.0, "reward": 0.71722412109375, "reward_std": 0.015048246830701828, "rewards//mean": 0.71722412109375, "rewards//std": 0.05379721522331238, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0108, "grad_norm": 0.4119553864002228, "kl": 0.0006910300071467645, "learning_rate": 4.999995468502563e-06, "loss": 0.0, "num_tokens": 467048.0, "reward": 0.73016357421875, "reward_std": 0.01429541502147913, "rewards//mean": 0.73016357421875, "rewards//std": 0.054377954453229904, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.011, "grad_norm": 0.4966086745262146, "kl": 0.0007905790334916674, "learning_rate": 4.9999919440064484e-06, "loss": 0.0, "num_tokens": 475728.0, "reward": 0.69805908203125, "reward_std": 0.016646619886159897, "rewards//mean": 0.69805908203125, "rewards//std": 0.06764635443687439, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0112, "grad_norm": 0.4910055100917816, "kl": 0.0007386217839666642, "learning_rate": 4.999987412513878e-06, "loss": 0.0, "num_tokens": 484360.0, "reward": 0.72802734375, "reward_std": 0.015091042965650558, "rewards//mean": 0.72802734375, "rewards//std": 0.06086854264140129, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0114, "grad_norm": 0.4628139138221741, "kl": 0.0007929667408461682, "learning_rate": 4.999981874026677e-06, "loss": 0.0, "num_tokens": 493000.0, "reward": 0.72283935546875, "reward_std": 0.015091566368937492, "rewards//mean": 0.72283935546875, "rewards//std": 0.056771133095026016, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0116, "grad_norm": 0.5012953877449036, "kl": 0.0007925045429146849, "learning_rate": 4.9999753285470756e-06, "loss": 0.0, "num_tokens": 501632.0, "reward": 0.71661376953125, "reward_std": 0.015197820030152798, "rewards//mean": 0.71661376953125, "rewards//std": 0.051168158650398254, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0118, "grad_norm": 0.5567752718925476, "kl": 0.0009017333650263026, "learning_rate": 4.9999677760777114e-06, "loss": 0.0, "num_tokens": 510288.0, "reward": 0.72381591796875, "reward_std": 0.013100363314151764, "rewards//mean": 0.72381591796875, "rewards//std": 0.04677506536245346, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.012, "grad_norm": 0.41019734740257263, "kl": 0.0007576563366455957, "learning_rate": 4.999959216621626e-06, "loss": 0.0, "num_tokens": 518952.0, "reward": 0.721435546875, "reward_std": 0.01749395951628685, "rewards//mean": 0.721435546875, "rewards//std": 0.0656224712729454, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0122, "grad_norm": 0.5433821082115173, "kl": 0.0012775656068697572, "learning_rate": 4.999949650182267e-06, "loss": 0.0001, "num_tokens": 527520.0, "reward": 0.71600341796875, "reward_std": 0.014195160940289497, "rewards//mean": 0.71600341796875, "rewards//std": 0.040854718536138535, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0124, "grad_norm": 0.4710741639137268, "kl": 0.0007979272995726205, "learning_rate": 4.999939076763487e-06, "loss": 0.0, "num_tokens": 536112.0, "reward": 0.7099609375, "reward_std": 0.015660030767321587, "rewards//mean": 0.7099609375, "rewards//std": 0.0644836500287056, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0126, "grad_norm": 0.5351571440696716, "kl": 0.0008906932343961671, "learning_rate": 4.999927496369547e-06, "loss": 0.0, "num_tokens": 544736.0, "reward": 0.6954345703125, "reward_std": 0.01476267259567976, "rewards//mean": 0.6954345703125, "rewards//std": 0.04151507467031479, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0128, "grad_norm": 0.4595547318458557, "kl": 0.0008543600051780231, "learning_rate": 4.99991490900511e-06, "loss": 0.0, "num_tokens": 553408.0, "reward": 0.71673583984375, "reward_std": 0.012672960758209229, "rewards//mean": 0.71673583984375, "rewards//std": 0.05019837245345116, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.013, "grad_norm": 0.4643782079219818, "kl": 0.0008414792246185243, "learning_rate": 4.999901314675246e-06, "loss": 0.0, "num_tokens": 562064.0, "reward": 0.71978759765625, "reward_std": 0.01744549721479416, "rewards//mean": 0.71978759765625, "rewards//std": 0.05583685636520386, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0132, "grad_norm": 0.4610508680343628, "kl": 0.0008548630212317221, "learning_rate": 4.999886713385432e-06, "loss": 0.0, "num_tokens": 570664.0, "reward": 0.71099853515625, "reward_std": 0.016434110701084137, "rewards//mean": 0.71099853515625, "rewards//std": 0.06650462746620178, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0134, "grad_norm": 0.48795291781425476, "kl": 0.000795883170212619, "learning_rate": 4.999871105141549e-06, "loss": 0.0, "num_tokens": 579360.0, "reward": 0.7103271484375, "reward_std": 0.016300387680530548, "rewards//mean": 0.7103271484375, "rewards//std": 0.0523579940199852, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0136, "grad_norm": 0.4637707769870758, "kl": 0.0008550576094421558, "learning_rate": 4.9998544899498845e-06, "loss": 0.0, "num_tokens": 587992.0, "reward": 0.7423095703125, "reward_std": 0.014237109571695328, "rewards//mean": 0.7423095703125, "rewards//std": 0.04064841568470001, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0138, "grad_norm": 0.47836223244667053, "kl": 0.0008403940519201569, "learning_rate": 4.999836867817129e-06, "loss": 0.0, "num_tokens": 596608.0, "reward": 0.69384765625, "reward_std": 0.014926273375749588, "rewards//mean": 0.69384765625, "rewards//std": 0.040478430688381195, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.014, "grad_norm": 0.49338188767433167, "kl": 0.0009159344772342592, "learning_rate": 4.9998182387503825e-06, "loss": 0.0, "num_tokens": 605248.0, "reward": 0.70245361328125, "reward_std": 0.023733031004667282, "rewards//mean": 0.70245361328125, "rewards//std": 0.05866420641541481, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0142, "grad_norm": 0.3988666534423828, "kl": 0.0008121578212012537, "learning_rate": 4.999798602757149e-06, "loss": 0.0, "num_tokens": 613824.0, "reward": 0.69830322265625, "reward_std": 0.013719111680984497, "rewards//mean": 0.69830322265625, "rewards//std": 0.056460171937942505, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0144, "grad_norm": 0.4849626421928406, "kl": 0.0009415504609933123, "learning_rate": 4.9997779598453365e-06, "loss": 0.0, "num_tokens": 622328.0, "reward": 0.73828125, "reward_std": 0.01322929933667183, "rewards//mean": 0.73828125, "rewards//std": 0.06599832326173782, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0146, "grad_norm": 0.5434121489524841, "kl": 0.0016273690489470027, "learning_rate": 4.999756310023261e-06, "loss": 0.0001, "num_tokens": 630976.0, "reward": 0.6895751953125, "reward_std": 0.01436593197286129, "rewards//mean": 0.6895751953125, "rewards//std": 0.04783637821674347, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0148, "grad_norm": 0.4288107454776764, "kl": 0.0008081157502601855, "learning_rate": 4.999733653299643e-06, "loss": 0.0, "num_tokens": 639624.0, "reward": 0.73236083984375, "reward_std": 0.015878882259130478, "rewards//mean": 0.73236083984375, "rewards//std": 0.040517259389162064, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.015, "grad_norm": 0.48302745819091797, "kl": 0.0009941107928170823, "learning_rate": 4.9997099896836076e-06, "loss": 0.0, "num_tokens": 648320.0, "reward": 0.69451904296875, "reward_std": 0.013520974665880203, "rewards//mean": 0.69451904296875, "rewards//std": 0.06670349836349487, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0152, "grad_norm": 0.4562317728996277, "kl": 0.0008667359288665466, "learning_rate": 4.999685319184688e-06, "loss": 0.0, "num_tokens": 657080.0, "reward": 0.7342529296875, "reward_std": 0.013470092788338661, "rewards//mean": 0.7342529296875, "rewards//std": 0.047755297273397446, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0154, "grad_norm": 0.491922527551651, "kl": 0.000954871931753587, "learning_rate": 4.999659641812821e-06, "loss": 0.0, "num_tokens": 665720.0, "reward": 0.70245361328125, "reward_std": 0.015949761494994164, "rewards//mean": 0.70245361328125, "rewards//std": 0.07394102215766907, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0156, "grad_norm": 0.4906434118747711, "kl": 0.0010538664282648824, "learning_rate": 4.9996329575783486e-06, "loss": 0.0, "num_tokens": 674336.0, "reward": 0.71417236328125, "reward_std": 0.016342610120773315, "rewards//mean": 0.71417236328125, "rewards//std": 0.05462348833680153, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0158, "grad_norm": 0.488397479057312, "kl": 0.0009591389462002553, "learning_rate": 4.99960526649202e-06, "loss": 0.0, "num_tokens": 682968.0, "reward": 0.68719482421875, "reward_std": 0.01782170683145523, "rewards//mean": 0.68719482421875, "rewards//std": 0.07477407157421112, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.016, "grad_norm": 0.4663703143596649, "kl": 0.0009951836473192088, "learning_rate": 4.999576568564989e-06, "loss": 0.0, "num_tokens": 691640.0, "reward": 0.73382568359375, "reward_std": 0.011371012777090073, "rewards//mean": 0.73382568359375, "rewards//std": 0.06883952021598816, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0162, "grad_norm": 0.43239349126815796, "kl": 0.0008809622740955092, "learning_rate": 4.999546863808815e-06, "loss": 0.0, "num_tokens": 700264.0, "reward": 0.6995849609375, "reward_std": 0.011680185794830322, "rewards//mean": 0.6995849609375, "rewards//std": 0.04613397642970085, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0164, "grad_norm": 0.446817010641098, "kl": 0.0009117565277847461, "learning_rate": 4.999516152235463e-06, "loss": 0.0, "num_tokens": 708984.0, "reward": 0.73577880859375, "reward_std": 0.010488376021385193, "rewards//mean": 0.73577880859375, "rewards//std": 0.05107547715306282, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0166, "grad_norm": 0.43615901470184326, "kl": 0.0009348735839012079, "learning_rate": 4.999484433857305e-06, "loss": 0.0, "num_tokens": 717568.0, "reward": 0.723388671875, "reward_std": 0.013231704942882061, "rewards//mean": 0.723388671875, "rewards//std": 0.0505966916680336, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0168, "grad_norm": 0.496662974357605, "kl": 0.0011133336302009411, "learning_rate": 4.999451708687114e-06, "loss": 0.0, "num_tokens": 726304.0, "reward": 0.7135009765625, "reward_std": 0.017629370093345642, "rewards//mean": 0.7135009765625, "rewards//std": 0.05801933631300926, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.017, "grad_norm": 0.5092006921768188, "kl": 0.0010797181967063807, "learning_rate": 4.999417976738075e-06, "loss": 0.0, "num_tokens": 735000.0, "reward": 0.72796630859375, "reward_std": 0.011926619336009026, "rewards//mean": 0.72796630859375, "rewards//std": 0.06286849081516266, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0172, "grad_norm": 0.4398786425590515, "kl": 0.0010526972182560712, "learning_rate": 4.999383238023773e-06, "loss": 0.0, "num_tokens": 743648.0, "reward": 0.70025634765625, "reward_std": 0.013448704965412617, "rewards//mean": 0.70025634765625, "rewards//std": 0.07408399134874344, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0174, "grad_norm": 0.5147225260734558, "kl": 0.0012325849456829019, "learning_rate": 4.999347492558203e-06, "loss": 0.0, "num_tokens": 752416.0, "reward": 0.71722412109375, "reward_std": 0.014034003019332886, "rewards//mean": 0.71722412109375, "rewards//std": 0.07134390622377396, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0176, "grad_norm": 0.5070951581001282, "kl": 0.001082870177924633, "learning_rate": 4.999310740355761e-06, "loss": 0.0, "num_tokens": 761040.0, "reward": 0.7266845703125, "reward_std": 0.014604616910219193, "rewards//mean": 0.7266845703125, "rewards//std": 0.06855076551437378, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0178, "grad_norm": 0.4914563298225403, "kl": 0.0012297793437028304, "learning_rate": 4.9992729814312514e-06, "loss": 0.0, "num_tokens": 769656.0, "reward": 0.74017333984375, "reward_std": 0.019316695630550385, "rewards//mean": 0.74017333984375, "rewards//std": 0.06556382775306702, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.018, "grad_norm": 0.4564233124256134, "kl": 0.0011137609835714102, "learning_rate": 4.999234215799884e-06, "loss": 0.0, "num_tokens": 778248.0, "reward": 0.697509765625, "reward_std": 0.015206321142613888, "rewards//mean": 0.697509765625, "rewards//std": 0.019185619428753853, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0182, "grad_norm": 0.5231274366378784, "kl": 0.0012513935289462097, "learning_rate": 4.999194443477273e-06, "loss": 0.0001, "num_tokens": 786856.0, "reward": 0.689697265625, "reward_std": 0.018568219617009163, "rewards//mean": 0.689697265625, "rewards//std": 0.08479659259319305, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0184, "grad_norm": 0.45985209941864014, "kl": 0.0012780725373886526, "learning_rate": 4.99915366447944e-06, "loss": 0.0001, "num_tokens": 795544.0, "reward": 0.6876220703125, "reward_std": 0.016050245612859726, "rewards//mean": 0.6876220703125, "rewards//std": 0.06961309164762497, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0186, "grad_norm": 0.4588201940059662, "kl": 0.0010780094598885626, "learning_rate": 4.999111878822809e-06, "loss": 0.0, "num_tokens": 804104.0, "reward": 0.73370361328125, "reward_std": 0.015261776745319366, "rewards//mean": 0.73370361328125, "rewards//std": 0.05237804725766182, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0188, "grad_norm": 0.4953310489654541, "kl": 0.0013287549882079475, "learning_rate": 4.999069086524212e-06, "loss": 0.0001, "num_tokens": 812768.0, "reward": 0.74371337890625, "reward_std": 0.017101481556892395, "rewards//mean": 0.74371337890625, "rewards//std": 0.07072333991527557, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.019, "grad_norm": 0.5649318099021912, "kl": 0.0012824844452552497, "learning_rate": 4.999025287600886e-06, "loss": 0.0001, "num_tokens": 821400.0, "reward": 0.73883056640625, "reward_std": 0.012025142088532448, "rewards//mean": 0.73883056640625, "rewards//std": 0.04969830811023712, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0192, "grad_norm": 0.6112560629844666, "kl": 0.0013404397759586573, "learning_rate": 4.998980482070473e-06, "loss": 0.0001, "num_tokens": 829952.0, "reward": 0.68670654296875, "reward_std": 0.015257102437317371, "rewards//mean": 0.68670654296875, "rewards//std": 0.03606545925140381, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0194, "grad_norm": 0.5092772841453552, "kl": 0.0011790767530328594, "learning_rate": 4.9989346699510215e-06, "loss": 0.0, "num_tokens": 838520.0, "reward": 0.71392822265625, "reward_std": 0.013484635390341282, "rewards//mean": 0.71392822265625, "rewards//std": 0.04560608044266701, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0196, "grad_norm": 0.4867624044418335, "kl": 0.001275954389711842, "learning_rate": 4.9988878512609825e-06, "loss": 0.0001, "num_tokens": 847128.0, "reward": 0.708740234375, "reward_std": 0.014133438467979431, "rewards//mean": 0.708740234375, "rewards//std": 0.0834752693772316, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0198, "grad_norm": 0.5237188935279846, "kl": 0.001374387225951068, "learning_rate": 4.998840026019217e-06, "loss": 0.0001, "num_tokens": 855824.0, "reward": 0.6898193359375, "reward_std": 0.014914432540535927, "rewards//mean": 0.6898193359375, "rewards//std": 0.05682779848575592, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.02, "grad_norm": 0.4966977834701538, "kl": 0.0014171117945807055, "learning_rate": 4.998791194244988e-06, "loss": 0.0001, "num_tokens": 864520.0, "reward": 0.71148681640625, "reward_std": 0.014294968917965889, "rewards//mean": 0.71148681640625, "rewards//std": 0.08842907845973969, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0202, "grad_norm": 0.9231608510017395, "kl": 0.0014634677936555818, "learning_rate": 4.998741355957963e-06, "loss": 0.0001, "num_tokens": 873152.0, "reward": 0.709228515625, "reward_std": 0.015058237127959728, "rewards//mean": 0.709228515625, "rewards//std": 0.07786376029253006, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0204, "grad_norm": 0.49847280979156494, "kl": 0.001440991909475997, "learning_rate": 4.99869051117822e-06, "loss": 0.0001, "num_tokens": 881824.0, "reward": 0.7056884765625, "reward_std": 0.014790365472435951, "rewards//mean": 0.7056884765625, "rewards//std": 0.05627746134996414, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0206, "grad_norm": 0.5402097702026367, "kl": 0.0013479594781529158, "learning_rate": 4.998638659926238e-06, "loss": 0.0001, "num_tokens": 890368.0, "reward": 0.74066162109375, "reward_std": 0.014553939923644066, "rewards//mean": 0.74066162109375, "rewards//std": 0.05418301746249199, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0208, "grad_norm": 0.5026318430900574, "kl": 0.001596087298821658, "learning_rate": 4.998585802222902e-06, "loss": 0.0001, "num_tokens": 899128.0, "reward": 0.74249267578125, "reward_std": 0.022882359102368355, "rewards//mean": 0.74249267578125, "rewards//std": 0.04922054708003998, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.021, "grad_norm": 0.4722844958305359, "kl": 0.0016166606947081164, "learning_rate": 4.9985319380895035e-06, "loss": 0.0001, "num_tokens": 907808.0, "reward": 0.7147216796875, "reward_std": 0.01173408329486847, "rewards//mean": 0.7147216796875, "rewards//std": 0.05263826996088028, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0212, "grad_norm": 0.5120245218276978, "kl": 0.0016622742696199566, "learning_rate": 4.99847706754774e-06, "loss": 0.0001, "num_tokens": 916384.0, "reward": 0.74267578125, "reward_std": 0.013011830858886242, "rewards//mean": 0.74267578125, "rewards//std": 0.04836273938417435, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0214, "grad_norm": 0.5261712670326233, "kl": 0.0015574333810945973, "learning_rate": 4.998421190619712e-06, "loss": 0.0001, "num_tokens": 925000.0, "reward": 0.728759765625, "reward_std": 0.01469873171299696, "rewards//mean": 0.728759765625, "rewards//std": 0.029047802090644836, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0216, "grad_norm": 0.4722696542739868, "kl": 0.0016149339353432879, "learning_rate": 4.998364307327927e-06, "loss": 0.0001, "num_tokens": 933680.0, "reward": 0.72637939453125, "reward_std": 0.014723455533385277, "rewards//mean": 0.72637939453125, "rewards//std": 0.057769205421209335, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0218, "grad_norm": 0.471443235874176, "kl": 0.00147363574069459, "learning_rate": 4.998306417695298e-06, "loss": 0.0001, "num_tokens": 942360.0, "reward": 0.70672607421875, "reward_std": 0.013419017195701599, "rewards//mean": 0.70672607421875, "rewards//std": 0.061058465391397476, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.022, "grad_norm": 0.6062172651290894, "kl": 0.0026222791202599183, "learning_rate": 4.998247521745142e-06, "loss": 0.0001, "num_tokens": 951000.0, "reward": 0.74151611328125, "reward_std": 0.018515393137931824, "rewards//mean": 0.74151611328125, "rewards//std": 0.05745021253824234, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0222, "grad_norm": 0.6000130772590637, "kl": 0.0021851205674465746, "learning_rate": 4.998187619501185e-06, "loss": 0.0001, "num_tokens": 959688.0, "reward": 0.674560546875, "reward_std": 0.01506463810801506, "rewards//mean": 0.674560546875, "rewards//std": 0.07186568528413773, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0224, "grad_norm": 0.4527592062950134, "kl": 0.0016580721858190373, "learning_rate": 4.998126710987552e-06, "loss": 0.0001, "num_tokens": 968352.0, "reward": 0.72381591796875, "reward_std": 0.014463772997260094, "rewards//mean": 0.72381591796875, "rewards//std": 0.05746760219335556, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0226, "grad_norm": 0.5143523812294006, "kl": 0.002175911722588353, "learning_rate": 4.998064796228779e-06, "loss": 0.0001, "num_tokens": 976936.0, "reward": 0.7242431640625, "reward_std": 0.014481933787465096, "rewards//mean": 0.7242431640625, "rewards//std": 0.05567820370197296, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0228, "grad_norm": 0.48747143149375916, "kl": 0.0017545891605550423, "learning_rate": 4.998001875249804e-06, "loss": 0.0001, "num_tokens": 985552.0, "reward": 0.71710205078125, "reward_std": 0.013459138572216034, "rewards//mean": 0.71710205078125, "rewards//std": 0.04908997192978859, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.023, "grad_norm": 0.4331216514110565, "kl": 0.0021779875096399337, "learning_rate": 4.997937948075973e-06, "loss": 0.0001, "num_tokens": 994264.0, "reward": 0.72625732421875, "reward_std": 0.014490745961666107, "rewards//mean": 0.72625732421875, "rewards//std": 0.04755370691418648, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0232, "grad_norm": 0.5705957412719727, "kl": 0.002923560852650553, "learning_rate": 4.997873014733036e-06, "loss": 0.0001, "num_tokens": 1002832.0, "reward": 0.71453857421875, "reward_std": 0.013323797844350338, "rewards//mean": 0.71453857421875, "rewards//std": 0.05091845244169235, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0234, "grad_norm": 0.45392346382141113, "kl": 0.0019422014956944622, "learning_rate": 4.997807075247147e-06, "loss": 0.0001, "num_tokens": 1011448.0, "reward": 0.70611572265625, "reward_std": 0.015927188098430634, "rewards//mean": 0.70611572265625, "rewards//std": 0.04444960504770279, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0236, "grad_norm": 0.46781232953071594, "kl": 0.0021620240004267544, "learning_rate": 4.9977401296448655e-06, "loss": 0.0001, "num_tokens": 1020104.0, "reward": 0.71502685546875, "reward_std": 0.014722153544425964, "rewards//mean": 0.71502685546875, "rewards//std": 0.04867687448859215, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0238, "grad_norm": 0.46179869771003723, "kl": 0.0022078527545090765, "learning_rate": 4.99767217795316e-06, "loss": 0.0001, "num_tokens": 1028696.0, "reward": 0.72418212890625, "reward_std": 0.011131198145449162, "rewards//mean": 0.72418212890625, "rewards//std": 0.0295663233846426, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.024, "grad_norm": 0.5357843637466431, "kl": 0.002367166889598593, "learning_rate": 4.997603220199399e-06, "loss": 0.0001, "num_tokens": 1037384.0, "reward": 0.7073974609375, "reward_std": 0.013118360191583633, "rewards//mean": 0.7073974609375, "rewards//std": 0.05458429455757141, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0242, "grad_norm": 0.4789181649684906, "kl": 0.0018334937049075961, "learning_rate": 4.99753325641136e-06, "loss": 0.0001, "num_tokens": 1046176.0, "reward": 0.72113037109375, "reward_std": 0.011054251343011856, "rewards//mean": 0.72113037109375, "rewards//std": 0.0613209493458271, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0244, "grad_norm": 0.4643182158470154, "kl": 0.0028950390114914626, "learning_rate": 4.997462286617224e-06, "loss": 0.0001, "num_tokens": 1054816.0, "reward": 0.7144775390625, "reward_std": 0.014479342848062515, "rewards//mean": 0.7144775390625, "rewards//std": 0.06932283937931061, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0246, "grad_norm": 0.45720401406288147, "kl": 0.002508550591301173, "learning_rate": 4.997390310845578e-06, "loss": 0.0001, "num_tokens": 1063496.0, "reward": 0.73211669921875, "reward_std": 0.018719132989645004, "rewards//mean": 0.73211669921875, "rewards//std": 0.043240174651145935, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0248, "grad_norm": 0.4856073260307312, "kl": 0.005021717923227698, "learning_rate": 4.997317329125413e-06, "loss": 0.0002, "num_tokens": 1072104.0, "reward": 0.739501953125, "reward_std": 0.018492162227630615, "rewards//mean": 0.739501953125, "rewards//std": 0.05581790953874588, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.025, "grad_norm": 0.45310959219932556, "kl": 0.0023723691556369886, "learning_rate": 4.997243341486126e-06, "loss": 0.0001, "num_tokens": 1080752.0, "reward": 0.7279052734375, "reward_std": 0.01648058369755745, "rewards//mean": 0.7279052734375, "rewards//std": 0.05334783345460892, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0252, "grad_norm": 0.44085827469825745, "kl": 0.0022444947535404935, "learning_rate": 4.997168347957521e-06, "loss": 0.0001, "num_tokens": 1089368.0, "reward": 0.72052001953125, "reward_std": 0.015059070661664009, "rewards//mean": 0.72052001953125, "rewards//std": 0.0704086497426033, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0254, "grad_norm": 0.47659870982170105, "kl": 0.002819010740495287, "learning_rate": 4.997092348569802e-06, "loss": 0.0001, "num_tokens": 1097992.0, "reward": 0.734130859375, "reward_std": 0.012258566915988922, "rewards//mean": 0.734130859375, "rewards//std": 0.03378882259130478, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0256, "grad_norm": 0.4902152121067047, "kl": 0.0033210097899427637, "learning_rate": 4.9970153433535855e-06, "loss": 0.0001, "num_tokens": 1106576.0, "reward": 0.71600341796875, "reward_std": 0.01741008460521698, "rewards//mean": 0.71600341796875, "rewards//std": 0.04795122891664505, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0258, "grad_norm": 0.467317670583725, "kl": 0.0020708676311187446, "learning_rate": 4.996937332339887e-06, "loss": 0.0001, "num_tokens": 1115160.0, "reward": 0.71319580078125, "reward_std": 0.010551651939749718, "rewards//mean": 0.71319580078125, "rewards//std": 0.04744407534599304, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.026, "grad_norm": 0.5055683851242065, "kl": 0.0025406221684534103, "learning_rate": 4.996858315560129e-06, "loss": 0.0001, "num_tokens": 1123808.0, "reward": 0.72503662109375, "reward_std": 0.015444736927747726, "rewards//mean": 0.72503662109375, "rewards//std": 0.06147873401641846, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0262, "grad_norm": 0.5300055146217346, "kl": 0.004421208694111556, "learning_rate": 4.9967782930461405e-06, "loss": 0.0002, "num_tokens": 1132528.0, "reward": 0.71380615234375, "reward_std": 0.012746734544634819, "rewards//mean": 0.71380615234375, "rewards//std": 0.05404258891940117, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0264, "grad_norm": 0.5882632732391357, "kl": 0.004176836780970916, "learning_rate": 4.9966972648301535e-06, "loss": 0.0002, "num_tokens": 1141160.0, "reward": 0.73260498046875, "reward_std": 0.013695623725652695, "rewards//mean": 0.73260498046875, "rewards//std": 0.0439189113676548, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0266, "grad_norm": 0.5080795288085938, "kl": 0.0037878667353652418, "learning_rate": 4.996615230944808e-06, "loss": 0.0002, "num_tokens": 1149744.0, "reward": 0.7181396484375, "reward_std": 0.021421236917376518, "rewards//mean": 0.7181396484375, "rewards//std": 0.053551748394966125, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0268, "grad_norm": 0.5161112546920776, "kl": 0.002877964361687191, "learning_rate": 4.996532191423145e-06, "loss": 0.0001, "num_tokens": 1158344.0, "reward": 0.71826171875, "reward_std": 0.017237350344657898, "rewards//mean": 0.71826171875, "rewards//std": 0.0640067458152771, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.027, "grad_norm": 0.501823902130127, "kl": 0.003536474294378422, "learning_rate": 4.996448146298615e-06, "loss": 0.0001, "num_tokens": 1166920.0, "reward": 0.68341064453125, "reward_std": 0.01601843349635601, "rewards//mean": 0.68341064453125, "rewards//std": 0.041225045919418335, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0272, "grad_norm": 0.5185138583183289, "kl": 0.004472654749406502, "learning_rate": 4.996363095605069e-06, "loss": 0.0002, "num_tokens": 1175528.0, "reward": 0.7242431640625, "reward_std": 0.018114212900400162, "rewards//mean": 0.7242431640625, "rewards//std": 0.05369402840733528, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0274, "grad_norm": 0.49364981055259705, "kl": 0.003177185819367878, "learning_rate": 4.996277039376767e-06, "loss": 0.0001, "num_tokens": 1184144.0, "reward": 0.72088623046875, "reward_std": 0.01708320900797844, "rewards//mean": 0.72088623046875, "rewards//std": 0.054411906749010086, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0276, "grad_norm": 0.5048626661300659, "kl": 0.003767449234146625, "learning_rate": 4.9961899776483725e-06, "loss": 0.0002, "num_tokens": 1192832.0, "reward": 0.72998046875, "reward_std": 0.013601850718259811, "rewards//mean": 0.72998046875, "rewards//std": 0.047309596091508865, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0278, "grad_norm": 0.5485401153564453, "kl": 0.005610850581433624, "learning_rate": 4.996101910454953e-06, "loss": 0.0002, "num_tokens": 1201472.0, "reward": 0.70831298828125, "reward_std": 0.018774503841996193, "rewards//mean": 0.70831298828125, "rewards//std": 0.07297329604625702, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.028, "grad_norm": 0.5312867164611816, "kl": 0.004881514410953969, "learning_rate": 4.996012837831983e-06, "loss": 0.0002, "num_tokens": 1210176.0, "reward": 0.7449951171875, "reward_std": 0.01512197032570839, "rewards//mean": 0.7449951171875, "rewards//std": 0.04492507874965668, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0282, "grad_norm": 0.7407464981079102, "kl": 0.005604632286122069, "learning_rate": 4.9959227598153395e-06, "loss": 0.0002, "num_tokens": 1218872.0, "reward": 0.7197265625, "reward_std": 0.018171414732933044, "rewards//mean": 0.7197265625, "rewards//std": 0.04184069111943245, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0284, "grad_norm": 0.5207117199897766, "kl": 0.0038259811990428716, "learning_rate": 4.995831676441307e-06, "loss": 0.0002, "num_tokens": 1227624.0, "reward": 0.72515869140625, "reward_std": 0.0129172932356596, "rewards//mean": 0.72515869140625, "rewards//std": 0.047344744205474854, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0286, "grad_norm": 0.4676206111907959, "kl": 0.004619606072083116, "learning_rate": 4.995739587746574e-06, "loss": 0.0002, "num_tokens": 1236264.0, "reward": 0.734375, "reward_std": 0.01602023094892502, "rewards//mean": 0.734375, "rewards//std": 0.04370923712849617, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0288, "grad_norm": 0.4712521731853485, "kl": 0.005641886862576939, "learning_rate": 4.995646493768234e-06, "loss": 0.0002, "num_tokens": 1244984.0, "reward": 0.69970703125, "reward_std": 0.016067516058683395, "rewards//mean": 0.69970703125, "rewards//std": 0.07418610900640488, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.029, "grad_norm": 0.4732939898967743, "kl": 0.003699015636811964, "learning_rate": 4.995552394543784e-06, "loss": 0.0001, "num_tokens": 1253544.0, "reward": 0.68414306640625, "reward_std": 0.017361916601657867, "rewards//mean": 0.68414306640625, "rewards//std": 0.05543471500277519, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0292, "grad_norm": 0.5424770712852478, "kl": 0.0069044766132719815, "learning_rate": 4.995457290111129e-06, "loss": 0.0003, "num_tokens": 1262272.0, "reward": 0.74951171875, "reward_std": 0.014574643224477768, "rewards//mean": 0.74951171875, "rewards//std": 0.048895664513111115, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0294, "grad_norm": 0.5275890231132507, "kl": 0.004333419870818034, "learning_rate": 4.995361180508575e-06, "loss": 0.0002, "num_tokens": 1270984.0, "reward": 0.72943115234375, "reward_std": 0.013785232789814472, "rewards//mean": 0.72943115234375, "rewards//std": 0.055875882506370544, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0296, "grad_norm": 0.5140582919120789, "kl": 0.006125797764980234, "learning_rate": 4.995264065774837e-06, "loss": 0.0002, "num_tokens": 1279664.0, "reward": 0.71893310546875, "reward_std": 0.011785900220274925, "rewards//mean": 0.71893310546875, "rewards//std": 0.05134063959121704, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0298, "grad_norm": 0.5618065595626831, "kl": 0.006264124327572063, "learning_rate": 4.99516594594903e-06, "loss": 0.0003, "num_tokens": 1288360.0, "reward": 0.71807861328125, "reward_std": 0.013035988435149193, "rewards//mean": 0.71807861328125, "rewards//std": 0.053489912301301956, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.03, "grad_norm": 0.502047598361969, "kl": 0.005594079499132931, "learning_rate": 4.9950668210706795e-06, "loss": 0.0002, "num_tokens": 1297032.0, "reward": 0.732177734375, "reward_std": 0.0157606303691864, "rewards//mean": 0.732177734375, "rewards//std": 0.03976839408278465, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0302, "grad_norm": 0.5016326904296875, "kl": 0.00834939838387072, "learning_rate": 4.994966691179712e-06, "loss": 0.0003, "num_tokens": 1305632.0, "reward": 0.703125, "reward_std": 0.014876965433359146, "rewards//mean": 0.703125, "rewards//std": 0.04747569188475609, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0304, "grad_norm": 0.5275666117668152, "kl": 0.006890498974826187, "learning_rate": 4.9948655563164585e-06, "loss": 0.0003, "num_tokens": 1314272.0, "reward": 0.71551513671875, "reward_std": 0.015125017613172531, "rewards//mean": 0.71551513671875, "rewards//std": 0.05302604287862778, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0306, "grad_norm": 0.5499895215034485, "kl": 0.0106024268316105, "learning_rate": 4.994763416521658e-06, "loss": 0.0004, "num_tokens": 1322960.0, "reward": 0.74847412109375, "reward_std": 0.017264600843191147, "rewards//mean": 0.74847412109375, "rewards//std": 0.058874648064374924, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0308, "grad_norm": 0.5384777784347534, "kl": 0.0059372307441663, "learning_rate": 4.994660271836452e-06, "loss": 0.0002, "num_tokens": 1331608.0, "reward": 0.73358154296875, "reward_std": 0.016230447217822075, "rewards//mean": 0.73358154296875, "rewards//std": 0.06771613657474518, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.031, "grad_norm": 0.5435893535614014, "kl": 0.007853654562495649, "learning_rate": 4.994556122302387e-06, "loss": 0.0003, "num_tokens": 1340352.0, "reward": 0.7288818359375, "reward_std": 0.013145558536052704, "rewards//mean": 0.7288818359375, "rewards//std": 0.05553446710109711, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0312, "grad_norm": 0.5240859985351562, "kl": 0.008158890006598085, "learning_rate": 4.994450967961413e-06, "loss": 0.0003, "num_tokens": 1349024.0, "reward": 0.72222900390625, "reward_std": 0.013540119864046574, "rewards//mean": 0.72222900390625, "rewards//std": 0.034443680197000504, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0314, "grad_norm": 0.6642475724220276, "kl": 0.007359111390542239, "learning_rate": 4.994344808855888e-06, "loss": 0.0003, "num_tokens": 1357544.0, "reward": 0.71826171875, "reward_std": 0.018803555518388748, "rewards//mean": 0.71826171875, "rewards//std": 0.05610194429755211, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0316, "grad_norm": 0.52338045835495, "kl": 0.009215345402481034, "learning_rate": 4.994237645028573e-06, "loss": 0.0004, "num_tokens": 1366224.0, "reward": 0.72686767578125, "reward_std": 0.010744665749371052, "rewards//mean": 0.72686767578125, "rewards//std": 0.0492648109793663, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0318, "grad_norm": 0.4993240237236023, "kl": 0.00795084226410836, "learning_rate": 4.994129476522632e-06, "loss": 0.0003, "num_tokens": 1374848.0, "reward": 0.72064208984375, "reward_std": 0.014471987262368202, "rewards//mean": 0.72064208984375, "rewards//std": 0.04946933686733246, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.032, "grad_norm": 0.5304632186889648, "kl": 0.010611457342747599, "learning_rate": 4.994020303381636e-06, "loss": 0.0004, "num_tokens": 1383456.0, "reward": 0.6920166015625, "reward_std": 0.015825100243091583, "rewards//mean": 0.6920166015625, "rewards//std": 0.06738641113042831, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0322, "grad_norm": 0.6055727005004883, "kl": 0.009950865583959967, "learning_rate": 4.993910125649561e-06, "loss": 0.0004, "num_tokens": 1392112.0, "reward": 0.7283935546875, "reward_std": 0.01466367393732071, "rewards//mean": 0.7283935546875, "rewards//std": 0.051102034747600555, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0324, "grad_norm": 0.6653439998626709, "kl": 0.015546724433079362, "learning_rate": 4.993798943370785e-06, "loss": 0.0006, "num_tokens": 1400784.0, "reward": 0.7451171875, "reward_std": 0.014343861490488052, "rewards//mean": 0.7451171875, "rewards//std": 0.0543498657643795, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0326, "grad_norm": 0.5136329531669617, "kl": 0.009563862986396998, "learning_rate": 4.993686756590093e-06, "loss": 0.0004, "num_tokens": 1409344.0, "reward": 0.74176025390625, "reward_std": 0.01264176145195961, "rewards//mean": 0.74176025390625, "rewards//std": 0.031986285001039505, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0328, "grad_norm": 0.6398639678955078, "kl": 0.008364403067389503, "learning_rate": 4.993573565352674e-06, "loss": 0.0003, "num_tokens": 1417920.0, "reward": 0.728271484375, "reward_std": 0.014848753809928894, "rewards//mean": 0.728271484375, "rewards//std": 0.043802645057439804, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.033, "grad_norm": 0.5334981083869934, "kl": 0.015317061392124742, "learning_rate": 4.993459369704121e-06, "loss": 0.0006, "num_tokens": 1426608.0, "reward": 0.71307373046875, "reward_std": 0.014587020501494408, "rewards//mean": 0.71307373046875, "rewards//std": 0.05228722095489502, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0332, "grad_norm": 0.6520506143569946, "kl": 0.014760783000383526, "learning_rate": 4.9933441696904315e-06, "loss": 0.0006, "num_tokens": 1435216.0, "reward": 0.72015380859375, "reward_std": 0.016750846058130264, "rewards//mean": 0.72015380859375, "rewards//std": 0.03944116830825806, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0334, "grad_norm": 0.6047248244285583, "kl": 0.011187054798938334, "learning_rate": 4.993227965358008e-06, "loss": 0.0004, "num_tokens": 1443832.0, "reward": 0.68414306640625, "reward_std": 0.016772452741861343, "rewards//mean": 0.68414306640625, "rewards//std": 0.07010506093502045, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0336, "grad_norm": 0.5380955338478088, "kl": 0.013987196376547217, "learning_rate": 4.99311075675366e-06, "loss": 0.0006, "num_tokens": 1452488.0, "reward": 0.73919677734375, "reward_std": 0.014058486558496952, "rewards//mean": 0.73919677734375, "rewards//std": 0.04323422163724899, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0338, "grad_norm": 0.5614820122718811, "kl": 0.00901414075633511, "learning_rate": 4.992992543924597e-06, "loss": 0.0004, "num_tokens": 1461184.0, "reward": 0.73291015625, "reward_std": 0.015211429446935654, "rewards//mean": 0.73291015625, "rewards//std": 0.052048444747924805, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.034, "grad_norm": 0.5328094363212585, "kl": 0.013381825701799244, "learning_rate": 4.992873326918434e-06, "loss": 0.0005, "num_tokens": 1469920.0, "reward": 0.7449951171875, "reward_std": 0.01563861407339573, "rewards//mean": 0.7449951171875, "rewards//std": 0.05957742780447006, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0342, "grad_norm": 0.568260133266449, "kl": 0.010022971488069743, "learning_rate": 4.992753105783194e-06, "loss": 0.0004, "num_tokens": 1478560.0, "reward": 0.718017578125, "reward_std": 0.01765732280910015, "rewards//mean": 0.718017578125, "rewards//std": 0.058326005935668945, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0344, "grad_norm": 0.6112688183784485, "kl": 0.011806668480858207, "learning_rate": 4.992631880567301e-06, "loss": 0.0005, "num_tokens": 1487232.0, "reward": 0.68988037109375, "reward_std": 0.014870746061205864, "rewards//mean": 0.68988037109375, "rewards//std": 0.060817260295152664, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0346, "grad_norm": 0.5378821492195129, "kl": 0.01460516860242933, "learning_rate": 4.992509651319585e-06, "loss": 0.0006, "num_tokens": 1495848.0, "reward": 0.71533203125, "reward_std": 0.01423371210694313, "rewards//mean": 0.71533203125, "rewards//std": 0.07337196916341782, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0348, "grad_norm": 0.6649944186210632, "kl": 0.014010886079631746, "learning_rate": 4.992386418089279e-06, "loss": 0.0006, "num_tokens": 1504680.0, "reward": 0.7298583984375, "reward_std": 0.012547630816698074, "rewards//mean": 0.7298583984375, "rewards//std": 0.052616409957408905, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.035, "grad_norm": 0.5258428454399109, "kl": 0.014397241990081966, "learning_rate": 4.992262180926022e-06, "loss": 0.0006, "num_tokens": 1513360.0, "reward": 0.706787109375, "reward_std": 0.015542633831501007, "rewards//mean": 0.706787109375, "rewards//std": 0.06885714083909988, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0352, "grad_norm": 0.4891396164894104, "kl": 0.012740209785988554, "learning_rate": 4.992136939879857e-06, "loss": 0.0005, "num_tokens": 1522112.0, "reward": 0.73828125, "reward_std": 0.01267999317497015, "rewards//mean": 0.73828125, "rewards//std": 0.04550643637776375, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0354, "grad_norm": 0.49961039423942566, "kl": 0.01649987616110593, "learning_rate": 4.992010695001229e-06, "loss": 0.0007, "num_tokens": 1530848.0, "reward": 0.7008056640625, "reward_std": 0.012512251734733582, "rewards//mean": 0.7008056640625, "rewards//std": 0.06246790289878845, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0356, "grad_norm": 0.8701191544532776, "kl": 0.019287034403532743, "learning_rate": 4.9918834463409925e-06, "loss": 0.0008, "num_tokens": 1539528.0, "reward": 0.70989990234375, "reward_std": 0.016380542889237404, "rewards//mean": 0.70989990234375, "rewards//std": 0.042210932821035385, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0358, "grad_norm": 0.6032683253288269, "kl": 0.014312484301626682, "learning_rate": 4.991755193950401e-06, "loss": 0.0006, "num_tokens": 1548192.0, "reward": 0.70318603515625, "reward_std": 0.011435788124799728, "rewards//mean": 0.70318603515625, "rewards//std": 0.054831214249134064, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.036, "grad_norm": 0.6240742802619934, "kl": 0.019559299282263964, "learning_rate": 4.991625937881117e-06, "loss": 0.0008, "num_tokens": 1556856.0, "reward": 0.733642578125, "reward_std": 0.015770340338349342, "rewards//mean": 0.733642578125, "rewards//std": 0.05675596743822098, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0362, "grad_norm": 0.6242550611495972, "kl": 0.014271960069891065, "learning_rate": 4.991495678185202e-06, "loss": 0.0006, "num_tokens": 1565488.0, "reward": 0.7296142578125, "reward_std": 0.012346789240837097, "rewards//mean": 0.7296142578125, "rewards//std": 0.07252856343984604, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0364, "grad_norm": 0.6350885629653931, "kl": 0.018139246094506234, "learning_rate": 4.991364414915126e-06, "loss": 0.0007, "num_tokens": 1574184.0, "reward": 0.72760009765625, "reward_std": 0.016285665333271027, "rewards//mean": 0.72760009765625, "rewards//std": 0.06219015270471573, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0366, "grad_norm": 0.640499472618103, "kl": 0.01825080724665895, "learning_rate": 4.9912321481237616e-06, "loss": 0.0007, "num_tokens": 1582888.0, "reward": 0.72503662109375, "reward_std": 0.010994457639753819, "rewards//mean": 0.72503662109375, "rewards//std": 0.03939354792237282, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0368, "grad_norm": 0.5733298659324646, "kl": 0.022698667307849973, "learning_rate": 4.991098877864386e-06, "loss": 0.0009, "num_tokens": 1591408.0, "reward": 0.7388916015625, "reward_std": 0.016481254249811172, "rewards//mean": 0.7388916015625, "rewards//std": 0.043859802186489105, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.037, "grad_norm": 0.54179447889328, "kl": 0.01430770373553969, "learning_rate": 4.99096460419068e-06, "loss": 0.0006, "num_tokens": 1600088.0, "reward": 0.76007080078125, "reward_std": 0.018877511844038963, "rewards//mean": 0.76007080078125, "rewards//std": 0.04165972024202347, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0372, "grad_norm": 0.7062174677848816, "kl": 0.027250012615695596, "learning_rate": 4.990829327156729e-06, "loss": 0.0011, "num_tokens": 1608712.0, "reward": 0.715576171875, "reward_std": 0.012214237824082375, "rewards//mean": 0.715576171875, "rewards//std": 0.052231352776288986, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0374, "grad_norm": 0.6777660846710205, "kl": 0.02611252712085843, "learning_rate": 4.990693046817023e-06, "loss": 0.001, "num_tokens": 1617400.0, "reward": 0.7432861328125, "reward_std": 0.01230183057487011, "rewards//mean": 0.7432861328125, "rewards//std": 0.034481290727853775, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0376, "grad_norm": 0.557637631893158, "kl": 0.018429984629619867, "learning_rate": 4.990555763226456e-06, "loss": 0.0007, "num_tokens": 1625992.0, "reward": 0.74969482421875, "reward_std": 0.01685107871890068, "rewards//mean": 0.74969482421875, "rewards//std": 0.045630306005477905, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0378, "grad_norm": 0.5212416648864746, "kl": 0.017173759290017188, "learning_rate": 4.990417476440326e-06, "loss": 0.0007, "num_tokens": 1634576.0, "reward": 0.70892333984375, "reward_std": 0.014218729920685291, "rewards//mean": 0.70892333984375, "rewards//std": 0.0681646466255188, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.038, "grad_norm": 0.5165665149688721, "kl": 0.015734603686723858, "learning_rate": 4.9902781865143326e-06, "loss": 0.0006, "num_tokens": 1643200.0, "reward": 0.733154296875, "reward_std": 0.014683052897453308, "rewards//mean": 0.733154296875, "rewards//std": 0.047851089388132095, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0382, "grad_norm": 0.683603823184967, "kl": 0.0229703092481941, "learning_rate": 4.990137893504585e-06, "loss": 0.0009, "num_tokens": 1651848.0, "reward": 0.70361328125, "reward_std": 0.014246530830860138, "rewards//mean": 0.70361328125, "rewards//std": 0.047676779329776764, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0384, "grad_norm": 0.5361626148223877, "kl": 0.02232282201293856, "learning_rate": 4.989996597467591e-06, "loss": 0.0009, "num_tokens": 1660472.0, "reward": 0.721923828125, "reward_std": 0.020648304373025894, "rewards//mean": 0.721923828125, "rewards//std": 0.050515249371528625, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0386, "grad_norm": 0.6549078822135925, "kl": 0.028085972415283322, "learning_rate": 4.989854298460265e-06, "loss": 0.0011, "num_tokens": 1669128.0, "reward": 0.73724365234375, "reward_std": 0.015218530781567097, "rewards//mean": 0.73724365234375, "rewards//std": 0.043220214545726776, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0388, "grad_norm": 0.6959535479545593, "kl": 0.03151222656015307, "learning_rate": 4.989710996539926e-06, "loss": 0.0013, "num_tokens": 1677784.0, "reward": 0.748291015625, "reward_std": 0.014268080703914165, "rewards//mean": 0.748291015625, "rewards//std": 0.0479370579123497, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.039, "grad_norm": 0.6917723417282104, "kl": 0.030924884602427483, "learning_rate": 4.989566691764296e-06, "loss": 0.0012, "num_tokens": 1686448.0, "reward": 0.7283935546875, "reward_std": 0.013681046664714813, "rewards//mean": 0.7283935546875, "rewards//std": 0.0343722440302372, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0392, "grad_norm": 0.7647262811660767, "kl": 0.02689244132488966, "learning_rate": 4.9894213841914994e-06, "loss": 0.0011, "num_tokens": 1695072.0, "reward": 0.695068359375, "reward_std": 0.014594485983252525, "rewards//mean": 0.695068359375, "rewards//std": 0.037756409496068954, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0394, "grad_norm": 0.7618040442466736, "kl": 0.03057428600732237, "learning_rate": 4.989275073880067e-06, "loss": 0.0012, "num_tokens": 1703680.0, "reward": 0.7230224609375, "reward_std": 0.018142350018024445, "rewards//mean": 0.7230224609375, "rewards//std": 0.05768650770187378, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0396, "grad_norm": 0.6719552278518677, "kl": 0.040109737077727914, "learning_rate": 4.989127760888932e-06, "loss": 0.0016, "num_tokens": 1712304.0, "reward": 0.721923828125, "reward_std": 0.012543957680463791, "rewards//mean": 0.721923828125, "rewards//std": 0.050654102116823196, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0398, "grad_norm": 0.6652910709381104, "kl": 0.02444028906757012, "learning_rate": 4.988979445277433e-06, "loss": 0.001, "num_tokens": 1720936.0, "reward": 0.70831298828125, "reward_std": 0.01572088897228241, "rewards//mean": 0.70831298828125, "rewards//std": 0.03865145146846771, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.04, "grad_norm": 0.5563658475875854, "kl": 0.021938784746453166, "learning_rate": 4.988830127105312e-06, "loss": 0.0009, "num_tokens": 1729552.0, "reward": 0.71014404296875, "reward_std": 0.01828540489077568, "rewards//mean": 0.71014404296875, "rewards//std": 0.038920748978853226, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0402, "grad_norm": 0.621466338634491, "kl": 0.04022008215542883, "learning_rate": 4.988679806432712e-06, "loss": 0.0016, "num_tokens": 1738184.0, "reward": 0.71197509765625, "reward_std": 0.017939604818820953, "rewards//mean": 0.71197509765625, "rewards//std": 0.047174979001283646, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0404, "grad_norm": 0.7120692729949951, "kl": 0.03540224314201623, "learning_rate": 4.988528483320184e-06, "loss": 0.0014, "num_tokens": 1746792.0, "reward": 0.72320556640625, "reward_std": 0.016589093953371048, "rewards//mean": 0.72320556640625, "rewards//std": 0.04446118324995041, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0406, "grad_norm": 0.6485600471496582, "kl": 0.02893989998847246, "learning_rate": 4.9883761578286805e-06, "loss": 0.0012, "num_tokens": 1755408.0, "reward": 0.73529052734375, "reward_std": 0.015300112776458263, "rewards//mean": 0.73529052734375, "rewards//std": 0.038799602538347244, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0408, "grad_norm": 0.5922731757164001, "kl": 0.034151540603488684, "learning_rate": 4.988222830019559e-06, "loss": 0.0014, "num_tokens": 1764008.0, "reward": 0.73095703125, "reward_std": 0.013814757578074932, "rewards//mean": 0.73095703125, "rewards//std": 0.046106573194265366, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.041, "grad_norm": 0.5785369277000427, "kl": 0.045372437103651464, "learning_rate": 4.988068499954578e-06, "loss": 0.0018, "num_tokens": 1772688.0, "reward": 0.73089599609375, "reward_std": 0.017161503434181213, "rewards//mean": 0.73089599609375, "rewards//std": 0.057122811675071716, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0412, "grad_norm": 0.6962729096412659, "kl": 0.043618743075057864, "learning_rate": 4.987913167695904e-06, "loss": 0.0017, "num_tokens": 1781256.0, "reward": 0.7265625, "reward_std": 0.016600284725427628, "rewards//mean": 0.7265625, "rewards//std": 0.045094750821590424, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0414, "grad_norm": 0.655792772769928, "kl": 0.030919409124180675, "learning_rate": 4.987756833306103e-06, "loss": 0.0012, "num_tokens": 1789976.0, "reward": 0.742431640625, "reward_std": 0.016554612666368484, "rewards//mean": 0.742431640625, "rewards//std": 0.06263887137174606, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0416, "grad_norm": 0.7749574184417725, "kl": 0.0421929273288697, "learning_rate": 4.987599496848147e-06, "loss": 0.0017, "num_tokens": 1798664.0, "reward": 0.72479248046875, "reward_std": 0.014618289656937122, "rewards//mean": 0.72479248046875, "rewards//std": 0.05295005068182945, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0418, "grad_norm": 0.759087324142456, "kl": 0.04364440473727882, "learning_rate": 4.987441158385411e-06, "loss": 0.0017, "num_tokens": 1807424.0, "reward": 0.68939208984375, "reward_std": 0.012192411348223686, "rewards//mean": 0.68939208984375, "rewards//std": 0.06599829345941544, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.042, "grad_norm": 0.6689143776893616, "kl": 0.05778496875427663, "learning_rate": 4.987281817981674e-06, "loss": 0.0023, "num_tokens": 1816088.0, "reward": 0.72271728515625, "reward_std": 0.014015286229550838, "rewards//mean": 0.72271728515625, "rewards//std": 0.049290310591459274, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0422, "grad_norm": 0.666499674320221, "kl": 0.043112656101584435, "learning_rate": 4.987121475701118e-06, "loss": 0.0017, "num_tokens": 1824672.0, "reward": 0.69085693359375, "reward_std": 0.013759355992078781, "rewards//mean": 0.69085693359375, "rewards//std": 0.06431391090154648, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0424, "grad_norm": 0.6885657906532288, "kl": 0.039523816551081836, "learning_rate": 4.986960131608329e-06, "loss": 0.0016, "num_tokens": 1833248.0, "reward": 0.76361083984375, "reward_std": 0.021125439554452896, "rewards//mean": 0.76361083984375, "rewards//std": 0.03490383177995682, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0426, "grad_norm": 0.7056341171264648, "kl": 0.03723469818942249, "learning_rate": 4.986797785768296e-06, "loss": 0.0015, "num_tokens": 1841808.0, "reward": 0.73406982421875, "reward_std": 0.013552498072385788, "rewards//mean": 0.73406982421875, "rewards//std": 0.058044735342264175, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0428, "grad_norm": 0.5871314406394958, "kl": 0.03060774417826906, "learning_rate": 4.986634438246413e-06, "loss": 0.0012, "num_tokens": 1850392.0, "reward": 0.7166748046875, "reward_std": 0.018129628151655197, "rewards//mean": 0.7166748046875, "rewards//std": 0.05835960432887077, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.043, "grad_norm": 0.556708812713623, "kl": 0.02814829646376893, "learning_rate": 4.986470089108476e-06, "loss": 0.0011, "num_tokens": 1859016.0, "reward": 0.7529296875, "reward_std": 0.017662834376096725, "rewards//mean": 0.7529296875, "rewards//std": 0.045255593955516815, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0432, "grad_norm": 0.771795928478241, "kl": 0.062261566519737244, "learning_rate": 4.986304738420684e-06, "loss": 0.0025, "num_tokens": 1867616.0, "reward": 0.7291259765625, "reward_std": 0.009812846779823303, "rewards//mean": 0.7291259765625, "rewards//std": 0.02753545716404915, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0434, "grad_norm": 0.6160389184951782, "kl": 0.04629370174370706, "learning_rate": 4.986138386249641e-06, "loss": 0.0019, "num_tokens": 1876296.0, "reward": 0.7247314453125, "reward_std": 0.018973447382450104, "rewards//mean": 0.7247314453125, "rewards//std": 0.04735804721713066, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0436, "grad_norm": 0.6583981513977051, "kl": 0.04990657512098551, "learning_rate": 4.985971032662352e-06, "loss": 0.002, "num_tokens": 1884904.0, "reward": 0.7359619140625, "reward_std": 0.017147067934274673, "rewards//mean": 0.7359619140625, "rewards//std": 0.04669753462076187, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0438, "grad_norm": 0.8134801983833313, "kl": 0.0423719760729, "learning_rate": 4.98580267772623e-06, "loss": 0.0017, "num_tokens": 1893616.0, "reward": 0.72003173828125, "reward_std": 0.016933666542172432, "rewards//mean": 0.72003173828125, "rewards//std": 0.06031915172934532, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.044, "grad_norm": 0.6741657853126526, "kl": 0.03965398599393666, "learning_rate": 4.985633321509086e-06, "loss": 0.0016, "num_tokens": 1902160.0, "reward": 0.68719482421875, "reward_std": 0.015195935033261776, "rewards//mean": 0.68719482421875, "rewards//std": 0.05944652855396271, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0442, "grad_norm": 0.9493129849433899, "kl": 0.046657787868753076, "learning_rate": 4.985462964079137e-06, "loss": 0.0019, "num_tokens": 1910880.0, "reward": 0.71942138671875, "reward_std": 0.016195297241210938, "rewards//mean": 0.71942138671875, "rewards//std": 0.05246121808886528, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0444, "grad_norm": 0.7087270021438599, "kl": 0.04575832257978618, "learning_rate": 4.985291605505004e-06, "loss": 0.0018, "num_tokens": 1919640.0, "reward": 0.702392578125, "reward_std": 0.0167884211987257, "rewards//mean": 0.702392578125, "rewards//std": 0.05223599076271057, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0446, "grad_norm": 0.6752747297286987, "kl": 0.04726569773629308, "learning_rate": 4.9851192458557084e-06, "loss": 0.0019, "num_tokens": 1928296.0, "reward": 0.71807861328125, "reward_std": 0.01220088079571724, "rewards//mean": 0.71807861328125, "rewards//std": 0.037438686937093735, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0448, "grad_norm": 0.7452352046966553, "kl": 0.05152314284350723, "learning_rate": 4.984945885200679e-06, "loss": 0.0021, "num_tokens": 1937088.0, "reward": 0.74041748046875, "reward_std": 0.013437759131193161, "rewards//mean": 0.74041748046875, "rewards//std": 0.040072083473205566, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.045, "grad_norm": 0.6877680420875549, "kl": 0.051004784647375345, "learning_rate": 4.984771523609744e-06, "loss": 0.002, "num_tokens": 1945688.0, "reward": 0.73846435546875, "reward_std": 0.016483765095472336, "rewards//mean": 0.73846435546875, "rewards//std": 0.0488903634250164, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0452, "grad_norm": 0.7118944525718689, "kl": 0.05141365760937333, "learning_rate": 4.9845961611531356e-06, "loss": 0.0021, "num_tokens": 1954320.0, "reward": 0.74041748046875, "reward_std": 0.015977714210748672, "rewards//mean": 0.74041748046875, "rewards//std": 0.05143725872039795, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0454, "grad_norm": 0.9068376421928406, "kl": 0.05422702501527965, "learning_rate": 4.984419797901491e-06, "loss": 0.0022, "num_tokens": 1962944.0, "reward": 0.71044921875, "reward_std": 0.01507932785898447, "rewards//mean": 0.71044921875, "rewards//std": 0.036301881074905396, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0456, "grad_norm": 0.7399264574050903, "kl": 0.05034496798180044, "learning_rate": 4.984242433925849e-06, "loss": 0.002, "num_tokens": 1971624.0, "reward": 0.75054931640625, "reward_std": 0.015884902328252792, "rewards//mean": 0.75054931640625, "rewards//std": 0.06389451771974564, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0458, "grad_norm": 0.7348447442054749, "kl": 0.049029057379812, "learning_rate": 4.984064069297652e-06, "loss": 0.002, "num_tokens": 1980280.0, "reward": 0.697021484375, "reward_std": 0.015012217685580254, "rewards//mean": 0.697021484375, "rewards//std": 0.056743159890174866, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.046, "grad_norm": 0.7991431355476379, "kl": 0.07105646608397365, "learning_rate": 4.983884704088745e-06, "loss": 0.0028, "num_tokens": 1988880.0, "reward": 0.695068359375, "reward_std": 0.01788986101746559, "rewards//mean": 0.695068359375, "rewards//std": 0.06029834970831871, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0462, "grad_norm": 0.6498074531555176, "kl": 0.05387417250312865, "learning_rate": 4.983704338371375e-06, "loss": 0.0022, "num_tokens": 1997536.0, "reward": 0.738037109375, "reward_std": 0.012314668856561184, "rewards//mean": 0.738037109375, "rewards//std": 0.05002864450216293, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0464, "grad_norm": 0.6092854142189026, "kl": 0.04524057498201728, "learning_rate": 4.983522972218196e-06, "loss": 0.0018, "num_tokens": 2006280.0, "reward": 0.7039794921875, "reward_std": 0.013910917565226555, "rewards//mean": 0.7039794921875, "rewards//std": 0.04475763440132141, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0466, "grad_norm": 0.5306748747825623, "kl": 0.047080037416890264, "learning_rate": 4.983340605702261e-06, "loss": 0.0019, "num_tokens": 2015040.0, "reward": 0.7469482421875, "reward_std": 0.011357417330145836, "rewards//mean": 0.7469482421875, "rewards//std": 0.03777945414185524, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0468, "grad_norm": 0.768358588218689, "kl": 0.03969805908855051, "learning_rate": 4.983157238897026e-06, "loss": 0.0016, "num_tokens": 2023640.0, "reward": 0.7327880859375, "reward_std": 0.015326134860515594, "rewards//mean": 0.7327880859375, "rewards//std": 0.04921766370534897, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.047, "grad_norm": 0.6886104941368103, "kl": 0.05683159315958619, "learning_rate": 4.982972871876353e-06, "loss": 0.0023, "num_tokens": 2032216.0, "reward": 0.7303466796875, "reward_std": 0.010093173943459988, "rewards//mean": 0.7303466796875, "rewards//std": 0.046449214220047, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0472, "grad_norm": 0.5981897711753845, "kl": 0.04994144011288881, "learning_rate": 4.982787504714503e-06, "loss": 0.002, "num_tokens": 2040856.0, "reward": 0.739501953125, "reward_std": 0.018706027418375015, "rewards//mean": 0.739501953125, "rewards//std": 0.05437158793210983, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0474, "grad_norm": 0.7223436832427979, "kl": 0.06479091383516788, "learning_rate": 4.982601137486144e-06, "loss": 0.0026, "num_tokens": 2049408.0, "reward": 0.71697998046875, "reward_std": 0.014704955741763115, "rewards//mean": 0.71697998046875, "rewards//std": 0.0494886115193367, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0476, "grad_norm": 0.6980536580085754, "kl": 0.04527810198487714, "learning_rate": 4.9824137702663424e-06, "loss": 0.0018, "num_tokens": 2058016.0, "reward": 0.71221923828125, "reward_std": 0.01778585836291313, "rewards//mean": 0.71221923828125, "rewards//std": 0.05478951334953308, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0478, "grad_norm": 1.1460226774215698, "kl": 0.06317855324596167, "learning_rate": 4.982225403130572e-06, "loss": 0.0025, "num_tokens": 2066648.0, "reward": 0.73974609375, "reward_std": 0.01631517894566059, "rewards//mean": 0.73974609375, "rewards//std": 0.03925115987658501, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.048, "grad_norm": 0.9314346313476562, "kl": 0.0637447414919734, "learning_rate": 4.982036036154706e-06, "loss": 0.0025, "num_tokens": 2075336.0, "reward": 0.71893310546875, "reward_std": 0.013403713703155518, "rewards//mean": 0.71893310546875, "rewards//std": 0.05558551847934723, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0482, "grad_norm": 0.6399586200714111, "kl": 0.04521946748718619, "learning_rate": 4.981845669415022e-06, "loss": 0.0018, "num_tokens": 2083952.0, "reward": 0.710693359375, "reward_std": 0.017721328884363174, "rewards//mean": 0.710693359375, "rewards//std": 0.032037414610385895, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0484, "grad_norm": 0.7257511019706726, "kl": 0.04954640497453511, "learning_rate": 4.981654302988198e-06, "loss": 0.002, "num_tokens": 2092624.0, "reward": 0.756591796875, "reward_std": 0.014372417703270912, "rewards//mean": 0.756591796875, "rewards//std": 0.04212266206741333, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0486, "grad_norm": 0.7490444183349609, "kl": 0.06151878600940108, "learning_rate": 4.9814619369513184e-06, "loss": 0.0025, "num_tokens": 2101160.0, "reward": 0.7039794921875, "reward_std": 0.01579831726849079, "rewards//mean": 0.7039794921875, "rewards//std": 0.06837564706802368, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0488, "grad_norm": 0.7180415391921997, "kl": 0.041042948490940034, "learning_rate": 4.981268571381867e-06, "loss": 0.0016, "num_tokens": 2109792.0, "reward": 0.732421875, "reward_std": 0.015042596496641636, "rewards//mean": 0.732421875, "rewards//std": 0.04142765328288078, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.049, "grad_norm": 0.7131726145744324, "kl": 0.04773534648120403, "learning_rate": 4.981074206357732e-06, "loss": 0.0019, "num_tokens": 2118552.0, "reward": 0.7369384765625, "reward_std": 0.011528298258781433, "rewards//mean": 0.7369384765625, "rewards//std": 0.04968539997935295, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0492, "grad_norm": 0.775935709476471, "kl": 0.0707710012793541, "learning_rate": 4.980878841957203e-06, "loss": 0.0028, "num_tokens": 2127248.0, "reward": 0.69036865234375, "reward_std": 0.012703044340014458, "rewards//mean": 0.69036865234375, "rewards//std": 0.05450669303536415, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0494, "grad_norm": 0.7658054232597351, "kl": 0.0640805927105248, "learning_rate": 4.980682478258973e-06, "loss": 0.0026, "num_tokens": 2135896.0, "reward": 0.72894287109375, "reward_std": 0.01790812611579895, "rewards//mean": 0.72894287109375, "rewards//std": 0.05467694625258446, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0496, "grad_norm": 0.5831694006919861, "kl": 0.04178021801635623, "learning_rate": 4.980485115342138e-06, "loss": 0.0017, "num_tokens": 2144536.0, "reward": 0.70562744140625, "reward_std": 0.019685041159391403, "rewards//mean": 0.70562744140625, "rewards//std": 0.06453383713960648, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0498, "grad_norm": 0.686213493347168, "kl": 0.05178135330788791, "learning_rate": 4.980286753286196e-06, "loss": 0.0021, "num_tokens": 2153104.0, "reward": 0.73675537109375, "reward_std": 0.015449654310941696, "rewards//mean": 0.73675537109375, "rewards//std": 0.022234952077269554, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.05, "grad_norm": 0.5769968032836914, "kl": 0.04894346767105162, "learning_rate": 4.980087392171045e-06, "loss": 0.002, "num_tokens": 2161736.0, "reward": 0.7308349609375, "reward_std": 0.009863736107945442, "rewards//mean": 0.7308349609375, "rewards//std": 0.04522731900215149, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0502, "grad_norm": 0.6053898930549622, "kl": 0.04904163209721446, "learning_rate": 4.9798870320769884e-06, "loss": 0.002, "num_tokens": 2170344.0, "reward": 0.71343994140625, "reward_std": 0.011166264303028584, "rewards//mean": 0.71343994140625, "rewards//std": 0.05631681904196739, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0504, "grad_norm": 0.7947268486022949, "kl": 0.0519161622505635, "learning_rate": 4.979685673084733e-06, "loss": 0.0021, "num_tokens": 2179056.0, "reward": 0.77587890625, "reward_std": 0.014476396143436432, "rewards//mean": 0.77587890625, "rewards//std": 0.032057251781225204, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0506, "grad_norm": 0.693539559841156, "kl": 0.05135406623594463, "learning_rate": 4.979483315275385e-06, "loss": 0.0021, "num_tokens": 2187640.0, "reward": 0.70477294921875, "reward_std": 0.012753374874591827, "rewards//mean": 0.70477294921875, "rewards//std": 0.05521116405725479, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0508, "grad_norm": 0.6642072796821594, "kl": 0.07001969963312149, "learning_rate": 4.979279958730454e-06, "loss": 0.0028, "num_tokens": 2196336.0, "reward": 0.74652099609375, "reward_std": 0.012642599642276764, "rewards//mean": 0.74652099609375, "rewards//std": 0.036172330379486084, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.051, "grad_norm": 0.7698396444320679, "kl": 0.05689065787009895, "learning_rate": 4.979075603531852e-06, "loss": 0.0023, "num_tokens": 2205024.0, "reward": 0.7344970703125, "reward_std": 0.014306074939668179, "rewards//mean": 0.7344970703125, "rewards//std": 0.04307273030281067, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0512, "grad_norm": 0.6658589839935303, "kl": 0.04703840473666787, "learning_rate": 4.978870249761893e-06, "loss": 0.0019, "num_tokens": 2213608.0, "reward": 0.74749755859375, "reward_std": 0.01808979921042919, "rewards//mean": 0.74749755859375, "rewards//std": 0.06493417918682098, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0514, "grad_norm": 0.5617851614952087, "kl": 0.04335748380981386, "learning_rate": 4.978663897503294e-06, "loss": 0.0017, "num_tokens": 2222264.0, "reward": 0.76544189453125, "reward_std": 0.01435675285756588, "rewards//mean": 0.76544189453125, "rewards//std": 0.05317598953843117, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0516, "grad_norm": 0.7687885761260986, "kl": 0.06642695516347885, "learning_rate": 4.978456546839175e-06, "loss": 0.0027, "num_tokens": 2230880.0, "reward": 0.71099853515625, "reward_std": 0.01172902062535286, "rewards//mean": 0.71099853515625, "rewards//std": 0.045694950968027115, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0518, "grad_norm": 0.747726559638977, "kl": 0.05708181764930487, "learning_rate": 4.978248197853053e-06, "loss": 0.0023, "num_tokens": 2239608.0, "reward": 0.76300048828125, "reward_std": 0.013816908933222294, "rewards//mean": 0.76300048828125, "rewards//std": 0.04265507683157921, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.052, "grad_norm": 0.7513110041618347, "kl": 0.06573500973172486, "learning_rate": 4.978038850628855e-06, "loss": 0.0026, "num_tokens": 2248296.0, "reward": 0.73321533203125, "reward_std": 0.012971704825758934, "rewards//mean": 0.73321533203125, "rewards//std": 0.03936932981014252, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0522, "grad_norm": 0.8820503354072571, "kl": 0.08888540370389819, "learning_rate": 4.977828505250903e-06, "loss": 0.0036, "num_tokens": 2256976.0, "reward": 0.71124267578125, "reward_std": 0.015261702239513397, "rewards//mean": 0.71124267578125, "rewards//std": 0.048025041818618774, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0524, "grad_norm": 0.5689162611961365, "kl": 0.058710358338430524, "learning_rate": 4.977617161803927e-06, "loss": 0.0023, "num_tokens": 2265672.0, "reward": 0.766357421875, "reward_std": 0.013638041913509369, "rewards//mean": 0.766357421875, "rewards//std": 0.047856152057647705, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0526, "grad_norm": 0.7432250380516052, "kl": 0.05571276368573308, "learning_rate": 4.977404820373053e-06, "loss": 0.0022, "num_tokens": 2274336.0, "reward": 0.75738525390625, "reward_std": 0.012658031657338142, "rewards//mean": 0.75738525390625, "rewards//std": 0.04126431792974472, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0528, "grad_norm": 0.5365133881568909, "kl": 0.06589075829833746, "learning_rate": 4.977191481043814e-06, "loss": 0.0026, "num_tokens": 2282984.0, "reward": 0.7647705078125, "reward_std": 0.012238910421729088, "rewards//mean": 0.7647705078125, "rewards//std": 0.036798667162656784, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.053, "grad_norm": 0.7748925089836121, "kl": 0.06631747842766345, "learning_rate": 4.976977143902143e-06, "loss": 0.0027, "num_tokens": 2291520.0, "reward": 0.71173095703125, "reward_std": 0.01299564354121685, "rewards//mean": 0.71173095703125, "rewards//std": 0.05950303003191948, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0532, "grad_norm": 0.6161185503005981, "kl": 0.07089789980091155, "learning_rate": 4.976761809034375e-06, "loss": 0.0028, "num_tokens": 2300080.0, "reward": 0.71710205078125, "reward_std": 0.015387684106826782, "rewards//mean": 0.71710205078125, "rewards//std": 0.03893085941672325, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0534, "grad_norm": 0.91294926404953, "kl": 0.10131165734492242, "learning_rate": 4.976545476527246e-06, "loss": 0.0041, "num_tokens": 2308656.0, "reward": 0.72802734375, "reward_std": 0.01398141123354435, "rewards//mean": 0.72802734375, "rewards//std": 0.042606472969055176, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0536, "grad_norm": 0.7760770320892334, "kl": 0.07038976019248366, "learning_rate": 4.976328146467895e-06, "loss": 0.0028, "num_tokens": 2317368.0, "reward": 0.75408935546875, "reward_std": 0.017939291894435883, "rewards//mean": 0.75408935546875, "rewards//std": 0.04169821739196777, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0538, "grad_norm": 0.7210671305656433, "kl": 0.07773464918136597, "learning_rate": 4.976109818943863e-06, "loss": 0.0031, "num_tokens": 2325992.0, "reward": 0.7349853515625, "reward_std": 0.013170282356441021, "rewards//mean": 0.7349853515625, "rewards//std": 0.03823667764663696, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.054, "grad_norm": 0.7028035521507263, "kl": 0.0827886825427413, "learning_rate": 4.975890494043092e-06, "loss": 0.0033, "num_tokens": 2334616.0, "reward": 0.75750732421875, "reward_std": 0.013806039467453957, "rewards//mean": 0.75750732421875, "rewards//std": 0.03447311371564865, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0542, "grad_norm": 0.6302931904792786, "kl": 0.07922178274020553, "learning_rate": 4.975670171853926e-06, "loss": 0.0032, "num_tokens": 2343264.0, "reward": 0.7408447265625, "reward_std": 0.01199992373585701, "rewards//mean": 0.7408447265625, "rewards//std": 0.03118186816573143, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0544, "grad_norm": 0.5366842746734619, "kl": 0.053182843839749694, "learning_rate": 4.975448852465111e-06, "loss": 0.0021, "num_tokens": 2351936.0, "reward": 0.73406982421875, "reward_std": 0.012117596343159676, "rewards//mean": 0.73406982421875, "rewards//std": 0.05836515128612518, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0546, "grad_norm": 0.7956821918487549, "kl": 0.10613921284675598, "learning_rate": 4.975226535965795e-06, "loss": 0.0042, "num_tokens": 2360560.0, "reward": 0.73834228515625, "reward_std": 0.013946297578513622, "rewards//mean": 0.73834228515625, "rewards//std": 0.045477114617824554, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0548, "grad_norm": 0.825093686580658, "kl": 0.06953645718749613, "learning_rate": 4.975003222445525e-06, "loss": 0.0028, "num_tokens": 2369160.0, "reward": 0.69384765625, "reward_std": 0.0194076094776392, "rewards//mean": 0.69384765625, "rewards//std": 0.048462796956300735, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.055, "grad_norm": 0.7195761799812317, "kl": 0.06491784797981381, "learning_rate": 4.974778911994254e-06, "loss": 0.0026, "num_tokens": 2377840.0, "reward": 0.7286376953125, "reward_std": 0.015999507158994675, "rewards//mean": 0.7286376953125, "rewards//std": 0.04492238163948059, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0552, "grad_norm": 0.6829413175582886, "kl": 0.06265947036445141, "learning_rate": 4.974553604702332e-06, "loss": 0.0025, "num_tokens": 2386504.0, "reward": 0.74267578125, "reward_std": 0.016472984105348587, "rewards//mean": 0.74267578125, "rewards//std": 0.03249996155500412, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0554, "grad_norm": 0.8279444575309753, "kl": 0.109421216417104, "learning_rate": 4.974327300660515e-06, "loss": 0.0044, "num_tokens": 2395128.0, "reward": 0.74176025390625, "reward_std": 0.013287878595292568, "rewards//mean": 0.74176025390625, "rewards//std": 0.03989148512482643, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0556, "grad_norm": 0.8692975640296936, "kl": 0.13027487508952618, "learning_rate": 4.974099999959957e-06, "loss": 0.0052, "num_tokens": 2403824.0, "reward": 0.71258544921875, "reward_std": 0.01673486828804016, "rewards//mean": 0.71258544921875, "rewards//std": 0.045876458287239075, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0558, "grad_norm": 0.6421030759811401, "kl": 0.075614667031914, "learning_rate": 4.973871702692215e-06, "loss": 0.003, "num_tokens": 2412440.0, "reward": 0.72467041015625, "reward_std": 0.015497658401727676, "rewards//mean": 0.72467041015625, "rewards//std": 0.04439099133014679, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.056, "grad_norm": 0.7968896627426147, "kl": 0.08366691786795855, "learning_rate": 4.973642408949247e-06, "loss": 0.0033, "num_tokens": 2421072.0, "reward": 0.693115234375, "reward_std": 0.013288882561028004, "rewards//mean": 0.693115234375, "rewards//std": 0.04501882195472717, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0562, "grad_norm": 0.8614963889122009, "kl": 0.10154542187228799, "learning_rate": 4.9734121188234115e-06, "loss": 0.0041, "num_tokens": 2429736.0, "reward": 0.72857666015625, "reward_std": 0.013583678752183914, "rewards//mean": 0.72857666015625, "rewards//std": 0.04098678007721901, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0564, "grad_norm": 1.0249260663986206, "kl": 0.10393307078629732, "learning_rate": 4.973180832407471e-06, "loss": 0.0042, "num_tokens": 2438336.0, "reward": 0.72576904296875, "reward_std": 0.01790856570005417, "rewards//mean": 0.72576904296875, "rewards//std": 0.03053005412220955, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0566, "grad_norm": 0.8311775922775269, "kl": 0.11601218301802874, "learning_rate": 4.972948549794587e-06, "loss": 0.0046, "num_tokens": 2446944.0, "reward": 0.7076416015625, "reward_std": 0.017834939062595367, "rewards//mean": 0.7076416015625, "rewards//std": 0.036632098257541656, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0568, "grad_norm": 0.8473641872406006, "kl": 0.12403094908222556, "learning_rate": 4.972715271078323e-06, "loss": 0.005, "num_tokens": 2455528.0, "reward": 0.748779296875, "reward_std": 0.014438275247812271, "rewards//mean": 0.748779296875, "rewards//std": 0.036623213440179825, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.057, "grad_norm": 1.0325453281402588, "kl": 0.15802243910729885, "learning_rate": 4.972480996352644e-06, "loss": 0.0063, "num_tokens": 2464104.0, "reward": 0.72064208984375, "reward_std": 0.011821886524558067, "rewards//mean": 0.72064208984375, "rewards//std": 0.04506784677505493, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0572, "grad_norm": 0.8279632329940796, "kl": 0.14361037500202656, "learning_rate": 4.9722457257119144e-06, "loss": 0.0057, "num_tokens": 2472768.0, "reward": 0.77099609375, "reward_std": 0.011357970535755157, "rewards//mean": 0.77099609375, "rewards//std": 0.027263915166258812, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0574, "grad_norm": 0.8311311602592468, "kl": 0.12657944671809673, "learning_rate": 4.972009459250903e-06, "loss": 0.0051, "num_tokens": 2481432.0, "reward": 0.72271728515625, "reward_std": 0.012970637530088425, "rewards//mean": 0.72271728515625, "rewards//std": 0.03625509515404701, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0576, "grad_norm": 1.064893126487732, "kl": 0.16098652640357614, "learning_rate": 4.971772197064776e-06, "loss": 0.0064, "num_tokens": 2490120.0, "reward": 0.73114013671875, "reward_std": 0.013818023726344109, "rewards//mean": 0.73114013671875, "rewards//std": 0.03643544018268585, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0578, "grad_norm": 0.957671046257019, "kl": 0.19215518794953823, "learning_rate": 4.971533939249105e-06, "loss": 0.0077, "num_tokens": 2498832.0, "reward": 0.73681640625, "reward_std": 0.014772996306419373, "rewards//mean": 0.73681640625, "rewards//std": 0.04102522134780884, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.058, "grad_norm": 0.9647096395492554, "kl": 0.1939010415226221, "learning_rate": 4.9712946858998576e-06, "loss": 0.0078, "num_tokens": 2507544.0, "reward": 0.75213623046875, "reward_std": 0.012323443777859211, "rewards//mean": 0.75213623046875, "rewards//std": 0.036742422729730606, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0582, "grad_norm": 0.9463180899620056, "kl": 0.1474277568049729, "learning_rate": 4.971054437113406e-06, "loss": 0.0059, "num_tokens": 2516200.0, "reward": 0.74615478515625, "reward_std": 0.01541908085346222, "rewards//mean": 0.74615478515625, "rewards//std": 0.0504126213490963, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0584, "grad_norm": 1.2873408794403076, "kl": 0.18494170205667615, "learning_rate": 4.9708131929865235e-06, "loss": 0.0074, "num_tokens": 2524768.0, "reward": 0.74810791015625, "reward_std": 0.01592198945581913, "rewards//mean": 0.74810791015625, "rewards//std": 0.04342567175626755, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0586, "grad_norm": 0.9283332228660583, "kl": 0.15322866081260145, "learning_rate": 4.970570953616383e-06, "loss": 0.0061, "num_tokens": 2533440.0, "reward": 0.74658203125, "reward_std": 0.012621916830539703, "rewards//mean": 0.74658203125, "rewards//std": 0.03913993015885353, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0588, "grad_norm": 2.470165967941284, "kl": 0.21047021076083183, "learning_rate": 4.970327719100556e-06, "loss": 0.0084, "num_tokens": 2542128.0, "reward": 0.7777099609375, "reward_std": 0.013461882248520851, "rewards//mean": 0.7777099609375, "rewards//std": 0.0339735671877861, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.059, "grad_norm": 0.9362149238586426, "kl": 0.14741351851262152, "learning_rate": 4.970083489537021e-06, "loss": 0.0059, "num_tokens": 2550680.0, "reward": 0.72869873046875, "reward_std": 0.014110936783254147, "rewards//mean": 0.72869873046875, "rewards//std": 0.03162695840001106, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0592, "grad_norm": 0.8844790458679199, "kl": 0.1842191582545638, "learning_rate": 4.96983826502415e-06, "loss": 0.0074, "num_tokens": 2559424.0, "reward": 0.7591552734375, "reward_std": 0.008938804268836975, "rewards//mean": 0.7591552734375, "rewards//std": 0.04523535072803497, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0594, "grad_norm": 1.2818913459777832, "kl": 0.27794970013201237, "learning_rate": 4.969592045660723e-06, "loss": 0.0111, "num_tokens": 2568040.0, "reward": 0.78643798828125, "reward_std": 0.01272084191441536, "rewards//mean": 0.78643798828125, "rewards//std": 0.030282124876976013, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0596, "grad_norm": 1.2021187543869019, "kl": 0.19154435116797686, "learning_rate": 4.969344831545914e-06, "loss": 0.0077, "num_tokens": 2576776.0, "reward": 0.75469970703125, "reward_std": 0.01579802855849266, "rewards//mean": 0.75469970703125, "rewards//std": 0.040667545050382614, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0598, "grad_norm": 1.1095116138458252, "kl": 0.2089026332832873, "learning_rate": 4.969096622779303e-06, "loss": 0.0084, "num_tokens": 2585392.0, "reward": 0.71600341796875, "reward_std": 0.01274610310792923, "rewards//mean": 0.71600341796875, "rewards//std": 0.0431722030043602, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.06, "grad_norm": 1.059348702430725, "kl": 0.20234669605270028, "learning_rate": 4.968847419460867e-06, "loss": 0.0081, "num_tokens": 2594032.0, "reward": 0.7236328125, "reward_std": 0.013915905728936195, "rewards//mean": 0.7236328125, "rewards//std": 0.0312344953417778, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0602, "grad_norm": 1.2769887447357178, "kl": 0.22755335364490747, "learning_rate": 4.968597221690986e-06, "loss": 0.0091, "num_tokens": 2602736.0, "reward": 0.72149658203125, "reward_std": 0.011407001875340939, "rewards//mean": 0.72149658203125, "rewards//std": 0.049202706664800644, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0604, "grad_norm": 1.2020509243011475, "kl": 0.24635399505496025, "learning_rate": 4.96834602957044e-06, "loss": 0.0099, "num_tokens": 2611384.0, "reward": 0.76763916015625, "reward_std": 0.016048027202486992, "rewards//mean": 0.76763916015625, "rewards//std": 0.04085657000541687, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0606, "grad_norm": 1.436669111251831, "kl": 0.25481397192925215, "learning_rate": 4.968093843200407e-06, "loss": 0.0102, "num_tokens": 2620152.0, "reward": 0.73895263671875, "reward_std": 0.016838543117046356, "rewards//mean": 0.73895263671875, "rewards//std": 0.05139780789613724, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0608, "grad_norm": 1.3443458080291748, "kl": 0.2639510128647089, "learning_rate": 4.96784066268247e-06, "loss": 0.0106, "num_tokens": 2628816.0, "reward": 0.745361328125, "reward_std": 0.014135653153061867, "rewards//mean": 0.745361328125, "rewards//std": 0.03560379147529602, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.061, "grad_norm": 1.522937297821045, "kl": 0.24147153925150633, "learning_rate": 4.967586488118609e-06, "loss": 0.0097, "num_tokens": 2637496.0, "reward": 0.7392578125, "reward_std": 0.01406162977218628, "rewards//mean": 0.7392578125, "rewards//std": 0.028467724099755287, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0612, "grad_norm": 1.3553171157836914, "kl": 0.24848425947129726, "learning_rate": 4.967331319611206e-06, "loss": 0.0099, "num_tokens": 2646104.0, "reward": 0.7559814453125, "reward_std": 0.014312982559204102, "rewards//mean": 0.7559814453125, "rewards//std": 0.03511122614145279, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0614, "grad_norm": 1.5322372913360596, "kl": 0.25969485752284527, "learning_rate": 4.9670751572630425e-06, "loss": 0.0104, "num_tokens": 2654744.0, "reward": 0.7608642578125, "reward_std": 0.013267126865684986, "rewards//mean": 0.7608642578125, "rewards//std": 0.027817683294415474, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0616, "grad_norm": 1.4864404201507568, "kl": 0.2588785719126463, "learning_rate": 4.9668180011773e-06, "loss": 0.0104, "num_tokens": 2663376.0, "reward": 0.71484375, "reward_std": 0.01414964348077774, "rewards//mean": 0.71484375, "rewards//std": 0.02796124666929245, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0618, "grad_norm": 1.291443943977356, "kl": 0.2382829338312149, "learning_rate": 4.966559851457562e-06, "loss": 0.0095, "num_tokens": 2671952.0, "reward": 0.7293701171875, "reward_std": 0.015213550999760628, "rewards//mean": 0.7293701171875, "rewards//std": 0.03245498239994049, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.062, "grad_norm": 1.3344167470932007, "kl": 0.2496517887338996, "learning_rate": 4.966300708207811e-06, "loss": 0.01, "num_tokens": 2680624.0, "reward": 0.73394775390625, "reward_std": 0.01914324052631855, "rewards//mean": 0.73394775390625, "rewards//std": 0.04364334046840668, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0622, "grad_norm": 1.3081969022750854, "kl": 0.30021103471517563, "learning_rate": 4.96604057153243e-06, "loss": 0.012, "num_tokens": 2689176.0, "reward": 0.764892578125, "reward_std": 0.01673274114727974, "rewards//mean": 0.764892578125, "rewards//std": 0.040132150053977966, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0624, "grad_norm": 1.3401750326156616, "kl": 0.2593008913099766, "learning_rate": 4.965779441536202e-06, "loss": 0.0104, "num_tokens": 2697864.0, "reward": 0.74420166015625, "reward_std": 0.01244452502578497, "rewards//mean": 0.74420166015625, "rewards//std": 0.03462515026330948, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0626, "grad_norm": 1.4522110223770142, "kl": 0.3523233197629452, "learning_rate": 4.965517318324308e-06, "loss": 0.0141, "num_tokens": 2706480.0, "reward": 0.71630859375, "reward_std": 0.011550749652087688, "rewards//mean": 0.71630859375, "rewards//std": 0.026689305901527405, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0628, "grad_norm": 1.3593254089355469, "kl": 0.3362383022904396, "learning_rate": 4.965254202002334e-06, "loss": 0.0134, "num_tokens": 2715056.0, "reward": 0.75225830078125, "reward_std": 0.015795797109603882, "rewards//mean": 0.75225830078125, "rewards//std": 0.0387437678873539, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.063, "grad_norm": 1.2711306810379028, "kl": 0.26307030860334635, "learning_rate": 4.964990092676263e-06, "loss": 0.0105, "num_tokens": 2723632.0, "reward": 0.74615478515625, "reward_std": 0.011496458202600479, "rewards//mean": 0.74615478515625, "rewards//std": 0.038194045424461365, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0632, "grad_norm": 1.2173610925674438, "kl": 0.272955933585763, "learning_rate": 4.964724990452476e-06, "loss": 0.0109, "num_tokens": 2732264.0, "reward": 0.72149658203125, "reward_std": 0.012493669055402279, "rewards//mean": 0.72149658203125, "rewards//std": 0.036469489336013794, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0634, "grad_norm": 1.143746256828308, "kl": 0.24079665448516607, "learning_rate": 4.9644588954377595e-06, "loss": 0.0096, "num_tokens": 2740960.0, "reward": 0.7584228515625, "reward_std": 0.015307648107409477, "rewards//mean": 0.7584228515625, "rewards//std": 0.03647971153259277, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0636, "grad_norm": 1.21216881275177, "kl": 0.25471873860806227, "learning_rate": 4.964191807739293e-06, "loss": 0.0102, "num_tokens": 2749528.0, "reward": 0.741943359375, "reward_std": 0.011934969574213028, "rewards//mean": 0.741943359375, "rewards//std": 0.02646489255130291, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0638, "grad_norm": 1.5371971130371094, "kl": 0.3100880701094866, "learning_rate": 4.963923727464661e-06, "loss": 0.0124, "num_tokens": 2758176.0, "reward": 0.7532958984375, "reward_std": 0.011484376154839993, "rewards//mean": 0.7532958984375, "rewards//std": 0.031002702191472054, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.064, "grad_norm": 1.2735233306884766, "kl": 0.2549311425536871, "learning_rate": 4.963654654721848e-06, "loss": 0.0102, "num_tokens": 2766760.0, "reward": 0.7633056640625, "reward_std": 0.01067253015935421, "rewards//mean": 0.7633056640625, "rewards//std": 0.03851751983165741, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0642, "grad_norm": 1.3543078899383545, "kl": 0.33309008402284235, "learning_rate": 4.963384589619233e-06, "loss": 0.0133, "num_tokens": 2775344.0, "reward": 0.75543212890625, "reward_std": 0.015507234260439873, "rewards//mean": 0.75543212890625, "rewards//std": 0.03842874616384506, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0644, "grad_norm": 1.2867757081985474, "kl": 0.35989778488874435, "learning_rate": 4.9631135322656e-06, "loss": 0.0144, "num_tokens": 2783880.0, "reward": 0.75213623046875, "reward_std": 0.013345220126211643, "rewards//mean": 0.75213623046875, "rewards//std": 0.0316346138715744, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0646, "grad_norm": 1.3027052879333496, "kl": 0.3117766585201025, "learning_rate": 4.962841482770131e-06, "loss": 0.0125, "num_tokens": 2792480.0, "reward": 0.75042724609375, "reward_std": 0.021965457126498222, "rewards//mean": 0.75042724609375, "rewards//std": 0.03844450041651726, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0648, "grad_norm": 1.2578948736190796, "kl": 0.31469903141260147, "learning_rate": 4.962568441242408e-06, "loss": 0.0126, "num_tokens": 2801088.0, "reward": 0.723388671875, "reward_std": 0.015950549393892288, "rewards//mean": 0.723388671875, "rewards//std": 0.03848705068230629, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.065, "grad_norm": 1.3456346988677979, "kl": 0.26953551825135946, "learning_rate": 4.962294407792411e-06, "loss": 0.0108, "num_tokens": 2809768.0, "reward": 0.747802734375, "reward_std": 0.015800267457962036, "rewards//mean": 0.747802734375, "rewards//std": 0.03215061128139496, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0652, "grad_norm": 1.6204522848129272, "kl": 0.34877745993435383, "learning_rate": 4.962019382530521e-06, "loss": 0.014, "num_tokens": 2818328.0, "reward": 0.756591796875, "reward_std": 0.017749017104506493, "rewards//mean": 0.756591796875, "rewards//std": 0.03980492055416107, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0654, "grad_norm": 1598.2496337890625, "kl": 9.96565246488899, "learning_rate": 4.961743365567517e-06, "loss": 0.3986, "num_tokens": 2826984.0, "reward": 0.71923828125, "reward_std": 0.014437740668654442, "rewards//mean": 0.71923828125, "rewards//std": 0.05055179446935654, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0656, "grad_norm": 1.0102441310882568, "kl": 0.30338819324970245, "learning_rate": 4.961466357014581e-06, "loss": 0.0121, "num_tokens": 2835544.0, "reward": 0.75927734375, "reward_std": 0.011439365334808826, "rewards//mean": 0.75927734375, "rewards//std": 0.02604631707072258, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0658, "grad_norm": 451.71466064453125, "kl": 1.1315573640167713, "learning_rate": 4.961188356983291e-06, "loss": 0.0453, "num_tokens": 2844184.0, "reward": 0.77728271484375, "reward_std": 0.014329792931675911, "rewards//mean": 0.77728271484375, "rewards//std": 0.030076975002884865, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.066, "grad_norm": 1.7841613292694092, "kl": 0.32306745275855064, "learning_rate": 4.960909365585624e-06, "loss": 0.0129, "num_tokens": 2852824.0, "reward": 0.7259521484375, "reward_std": 0.012303611263632774, "rewards//mean": 0.7259521484375, "rewards//std": 0.024766186252236366, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0662, "grad_norm": 1.2208861112594604, "kl": 0.35031407698988914, "learning_rate": 4.960629382933959e-06, "loss": 0.014, "num_tokens": 2861568.0, "reward": 0.75592041015625, "reward_std": 0.013036654330790043, "rewards//mean": 0.75592041015625, "rewards//std": 0.028763895854353905, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0664, "grad_norm": 45.699703216552734, "kl": 0.37541868537664413, "learning_rate": 4.960348409141074e-06, "loss": 0.015, "num_tokens": 2870168.0, "reward": 0.76629638671875, "reward_std": 0.014367573894560337, "rewards//mean": 0.76629638671875, "rewards//std": 0.03430584445595741, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0666, "grad_norm": 1.0523972511291504, "kl": 0.2711593806743622, "learning_rate": 4.960066444320143e-06, "loss": 0.0108, "num_tokens": 2878752.0, "reward": 0.72601318359375, "reward_std": 0.016535844653844833, "rewards//mean": 0.72601318359375, "rewards//std": 0.03955461084842682, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0668, "grad_norm": 1.1287596225738525, "kl": 0.27480507269501686, "learning_rate": 4.959783488584743e-06, "loss": 0.011, "num_tokens": 2887504.0, "reward": 0.7252197265625, "reward_std": 0.013544876128435135, "rewards//mean": 0.7252197265625, "rewards//std": 0.036889057606458664, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.067, "grad_norm": 1.9436277151107788, "kl": 0.26933558098971844, "learning_rate": 4.9594995420488475e-06, "loss": 0.0108, "num_tokens": 2896176.0, "reward": 0.70745849609375, "reward_std": 0.01362568698823452, "rewards//mean": 0.70745849609375, "rewards//std": 0.0338621586561203, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0672, "grad_norm": 1.3394960165023804, "kl": 0.2702603470534086, "learning_rate": 4.959214604826831e-06, "loss": 0.0108, "num_tokens": 2904808.0, "reward": 0.73876953125, "reward_std": 0.015888595953583717, "rewards//mean": 0.73876953125, "rewards//std": 0.03880433365702629, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0674, "grad_norm": 1.188582181930542, "kl": 0.3325184481218457, "learning_rate": 4.958928677033465e-06, "loss": 0.0133, "num_tokens": 2913544.0, "reward": 0.73760986328125, "reward_std": 0.01286403276026249, "rewards//mean": 0.73760986328125, "rewards//std": 0.03628222644329071, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0676, "grad_norm": 1.0829931497573853, "kl": 0.2599369240924716, "learning_rate": 4.9586417587839225e-06, "loss": 0.0104, "num_tokens": 2922208.0, "reward": 0.74237060546875, "reward_std": 0.009288130328059196, "rewards//mean": 0.74237060546875, "rewards//std": 0.0271921269595623, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0678, "grad_norm": 1.090649127960205, "kl": 0.35008446872234344, "learning_rate": 4.958353850193773e-06, "loss": 0.014, "num_tokens": 2930904.0, "reward": 0.7298583984375, "reward_std": 0.009747210890054703, "rewards//mean": 0.7298583984375, "rewards//std": 0.031018322333693504, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.068, "grad_norm": 1.0257357358932495, "kl": 0.2736462540924549, "learning_rate": 4.958064951378988e-06, "loss": 0.0109, "num_tokens": 2939600.0, "reward": 0.74664306640625, "reward_std": 0.013303879648447037, "rewards//mean": 0.74664306640625, "rewards//std": 0.026327185332775116, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0682, "grad_norm": 1.8254125118255615, "kl": 0.3379829414188862, "learning_rate": 4.957775062455933e-06, "loss": 0.0135, "num_tokens": 2948432.0, "reward": 0.74041748046875, "reward_std": 0.011445442214608192, "rewards//mean": 0.74041748046875, "rewards//std": 0.023927679285407066, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0684, "grad_norm": 1.151408314704895, "kl": 0.3058789037168026, "learning_rate": 4.957484183541378e-06, "loss": 0.0122, "num_tokens": 2957032.0, "reward": 0.775390625, "reward_std": 0.010520260781049728, "rewards//mean": 0.775390625, "rewards//std": 0.020039275288581848, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0686, "grad_norm": 3.0106112957000732, "kl": 0.330666683614254, "learning_rate": 4.957192314752487e-06, "loss": 0.0132, "num_tokens": 2965680.0, "reward": 0.7501220703125, "reward_std": 0.0123628880828619, "rewards//mean": 0.7501220703125, "rewards//std": 0.03424517437815666, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0688, "grad_norm": 9.348411560058594, "kl": 0.3467661701142788, "learning_rate": 4.9568994562068265e-06, "loss": 0.0139, "num_tokens": 2974304.0, "reward": 0.7127685546875, "reward_std": 0.013961128890514374, "rewards//mean": 0.7127685546875, "rewards//std": 0.038146305829286575, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.069, "grad_norm": 1.3383392095565796, "kl": 0.23608121927827597, "learning_rate": 4.9566056080223576e-06, "loss": 0.0094, "num_tokens": 2982920.0, "reward": 0.7254638671875, "reward_std": 0.019042517989873886, "rewards//mean": 0.7254638671875, "rewards//std": 0.05562053620815277, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0692, "grad_norm": 1.1454135179519653, "kl": 0.3010788504034281, "learning_rate": 4.9563107703174444e-06, "loss": 0.012, "num_tokens": 2991640.0, "reward": 0.76824951171875, "reward_std": 0.013647787272930145, "rewards//mean": 0.76824951171875, "rewards//std": 0.02736636996269226, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0694, "grad_norm": 1.3544073104858398, "kl": 0.34564877301454544, "learning_rate": 4.956014943210845e-06, "loss": 0.0138, "num_tokens": 3000224.0, "reward": 0.7606201171875, "reward_std": 0.011027848348021507, "rewards//mean": 0.7606201171875, "rewards//std": 0.029086079448461533, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0696, "grad_norm": 1.1409554481506348, "kl": 0.29469297640025616, "learning_rate": 4.9557181268217225e-06, "loss": 0.0118, "num_tokens": 3008880.0, "reward": 0.73162841796875, "reward_std": 0.017540261149406433, "rewards//mean": 0.73162841796875, "rewards//std": 0.03117768093943596, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0698, "grad_norm": 1.174091100692749, "kl": 0.34364804439246655, "learning_rate": 4.9554203212696304e-06, "loss": 0.0137, "num_tokens": 3017536.0, "reward": 0.73602294921875, "reward_std": 0.0178272295743227, "rewards//mean": 0.73602294921875, "rewards//std": 0.038189683109521866, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.07, "grad_norm": 1.0863193273544312, "kl": 0.2888754541054368, "learning_rate": 4.955121526674528e-06, "loss": 0.0116, "num_tokens": 3026160.0, "reward": 0.76239013671875, "reward_std": 0.015270760282874107, "rewards//mean": 0.76239013671875, "rewards//std": 0.04026879370212555, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0702, "grad_norm": 1.3240951299667358, "kl": 0.32703710440546274, "learning_rate": 4.9548217431567665e-06, "loss": 0.0131, "num_tokens": 3034792.0, "reward": 0.77569580078125, "reward_std": 0.013240109197795391, "rewards//mean": 0.77569580078125, "rewards//std": 0.0282268188893795, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0704, "grad_norm": 1.2549176216125488, "kl": 0.27426617220044136, "learning_rate": 4.9545209708371025e-06, "loss": 0.011, "num_tokens": 3043432.0, "reward": 0.75177001953125, "reward_std": 0.009705094620585442, "rewards//mean": 0.75177001953125, "rewards//std": 0.0316346138715744, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0706, "grad_norm": 1.207445502281189, "kl": 0.3094801548868418, "learning_rate": 4.9542192098366835e-06, "loss": 0.0124, "num_tokens": 3052008.0, "reward": 0.76214599609375, "reward_std": 0.015533574856817722, "rewards//mean": 0.76214599609375, "rewards//std": 0.029373183846473694, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0708, "grad_norm": 0.9336227774620056, "kl": 0.3114245068281889, "learning_rate": 4.95391646027706e-06, "loss": 0.0125, "num_tokens": 3060680.0, "reward": 0.7603759765625, "reward_std": 0.013422933407127857, "rewards//mean": 0.7603759765625, "rewards//std": 0.034570734947919846, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.071, "grad_norm": 0.9703741073608398, "kl": 0.3360181748867035, "learning_rate": 4.953612722280181e-06, "loss": 0.0134, "num_tokens": 3069312.0, "reward": 0.7603759765625, "reward_std": 0.014663382433354855, "rewards//mean": 0.7603759765625, "rewards//std": 0.032032448798418045, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0712, "grad_norm": 0.9895470142364502, "kl": 0.32697275839746, "learning_rate": 4.953307995968391e-06, "loss": 0.0131, "num_tokens": 3078032.0, "reward": 0.74468994140625, "reward_std": 0.010848162695765495, "rewards//mean": 0.74468994140625, "rewards//std": 0.025433441624045372, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0714, "grad_norm": 1.0759690999984741, "kl": 0.3138846158981323, "learning_rate": 4.953002281464432e-06, "loss": 0.0126, "num_tokens": 3086656.0, "reward": 0.738037109375, "reward_std": 0.011200450360774994, "rewards//mean": 0.738037109375, "rewards//std": 0.019008060917258263, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0716, "grad_norm": 1.1179635524749756, "kl": 0.2949511222541332, "learning_rate": 4.952695578891449e-06, "loss": 0.0118, "num_tokens": 3095184.0, "reward": 0.74249267578125, "reward_std": 0.010483279824256897, "rewards//mean": 0.74249267578125, "rewards//std": 0.034091606736183167, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0718, "grad_norm": 1.4193079471588135, "kl": 0.3405211642384529, "learning_rate": 4.9523878883729794e-06, "loss": 0.0136, "num_tokens": 3103768.0, "reward": 0.78057861328125, "reward_std": 0.01512511633336544, "rewards//mean": 0.78057861328125, "rewards//std": 0.028378715738654137, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.072, "grad_norm": 1.239197015762329, "kl": 0.34275365993380547, "learning_rate": 4.952079210032962e-06, "loss": 0.0137, "num_tokens": 3112464.0, "reward": 0.75830078125, "reward_std": 0.00690212519839406, "rewards//mean": 0.75830078125, "rewards//std": 0.029177792370319366, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0722, "grad_norm": 1.2645068168640137, "kl": 0.22989958012476563, "learning_rate": 4.951769543995731e-06, "loss": 0.0092, "num_tokens": 3121000.0, "reward": 0.7113037109375, "reward_std": 0.014688584953546524, "rewards//mean": 0.7113037109375, "rewards//std": 0.043239694088697433, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0724, "grad_norm": 0.9607939720153809, "kl": 0.2874492518603802, "learning_rate": 4.951458890386021e-06, "loss": 0.0115, "num_tokens": 3129608.0, "reward": 0.77667236328125, "reward_std": 0.019359227269887924, "rewards//mean": 0.77667236328125, "rewards//std": 0.03485913202166557, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0726, "grad_norm": 1.1458990573883057, "kl": 0.3057211823761463, "learning_rate": 4.951147249328964e-06, "loss": 0.0122, "num_tokens": 3138360.0, "reward": 0.74261474609375, "reward_std": 0.016560913994908333, "rewards//mean": 0.74261474609375, "rewards//std": 0.045068517327308655, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0728, "grad_norm": 1.1187580823898315, "kl": 0.3212597221136093, "learning_rate": 4.950834620950089e-06, "loss": 0.0129, "num_tokens": 3146912.0, "reward": 0.7310791015625, "reward_std": 0.015014410018920898, "rewards//mean": 0.7310791015625, "rewards//std": 0.04567229375243187, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.073, "grad_norm": 1.4113508462905884, "kl": 0.35538918152451515, "learning_rate": 4.9505210053753204e-06, "loss": 0.0142, "num_tokens": 3155568.0, "reward": 0.7164306640625, "reward_std": 0.013062847778201103, "rewards//mean": 0.7164306640625, "rewards//std": 0.049687836319208145, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0732, "grad_norm": 1.2345703840255737, "kl": 0.288858812302351, "learning_rate": 4.950206402730984e-06, "loss": 0.0116, "num_tokens": 3164336.0, "reward": 0.76116943359375, "reward_std": 0.013638054020702839, "rewards//mean": 0.76116943359375, "rewards//std": 0.04553765431046486, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0734, "grad_norm": 1.2489417791366577, "kl": 0.3488999232649803, "learning_rate": 4.949890813143802e-06, "loss": 0.014, "num_tokens": 3173144.0, "reward": 0.7589111328125, "reward_std": 0.010227528400719166, "rewards//mean": 0.7589111328125, "rewards//std": 0.03326575458049774, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0736, "grad_norm": 1.1428223848342896, "kl": 0.33989420160651207, "learning_rate": 4.949574236740893e-06, "loss": 0.0136, "num_tokens": 3181768.0, "reward": 0.73931884765625, "reward_std": 0.008424145169556141, "rewards//mean": 0.73931884765625, "rewards//std": 0.02814464643597603, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0738, "grad_norm": 1.1611658334732056, "kl": 0.32859067991375923, "learning_rate": 4.949256673649774e-06, "loss": 0.0131, "num_tokens": 3190400.0, "reward": 0.71636962890625, "reward_std": 0.011011059395968914, "rewards//mean": 0.71636962890625, "rewards//std": 0.041278988122940063, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.074, "grad_norm": 1.2848570346832275, "kl": 0.35491205751895905, "learning_rate": 4.94893812399836e-06, "loss": 0.0142, "num_tokens": 3199032.0, "reward": 0.77215576171875, "reward_std": 0.01237148605287075, "rewards//mean": 0.77215576171875, "rewards//std": 0.03036997839808464, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0742, "grad_norm": 1.300308346748352, "kl": 0.3731778897345066, "learning_rate": 4.948618587914963e-06, "loss": 0.0149, "num_tokens": 3207600.0, "reward": 0.7557373046875, "reward_std": 0.01102093979716301, "rewards//mean": 0.7557373046875, "rewards//std": 0.025770245119929314, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0744, "grad_norm": 1.1986355781555176, "kl": 0.33789339661598206, "learning_rate": 4.948298065528292e-06, "loss": 0.0135, "num_tokens": 3216240.0, "reward": 0.74041748046875, "reward_std": 0.012240855023264885, "rewards//mean": 0.74041748046875, "rewards//std": 0.03901242837309837, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0746, "grad_norm": 1.1584309339523315, "kl": 0.3138528410345316, "learning_rate": 4.947976556967452e-06, "loss": 0.0126, "num_tokens": 3224856.0, "reward": 0.790283203125, "reward_std": 0.010637026280164719, "rewards//mean": 0.790283203125, "rewards//std": 0.028278857469558716, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0748, "grad_norm": 1.1529393196105957, "kl": 0.32554432936012745, "learning_rate": 4.947654062361949e-06, "loss": 0.013, "num_tokens": 3233608.0, "reward": 0.7681884765625, "reward_std": 0.016204483807086945, "rewards//mean": 0.7681884765625, "rewards//std": 0.03761722892522812, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.075, "grad_norm": 1.137768030166626, "kl": 0.3275693580508232, "learning_rate": 4.9473305818416805e-06, "loss": 0.0131, "num_tokens": 3242224.0, "reward": 0.72662353515625, "reward_std": 0.012011650949716568, "rewards//mean": 0.72662353515625, "rewards//std": 0.04576380178332329, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0752, "grad_norm": 1.1497950553894043, "kl": 0.373350128531456, "learning_rate": 4.947006115536947e-06, "loss": 0.0149, "num_tokens": 3250880.0, "reward": 0.7626953125, "reward_std": 0.0052624596282839775, "rewards//mean": 0.7626953125, "rewards//std": 0.034618210047483444, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0754, "grad_norm": 1.2146048545837402, "kl": 0.31162706576287746, "learning_rate": 4.946680663578443e-06, "loss": 0.0125, "num_tokens": 3259592.0, "reward": 0.7554931640625, "reward_std": 0.01137151475995779, "rewards//mean": 0.7554931640625, "rewards//std": 0.03868066519498825, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0756, "grad_norm": 1.0558278560638428, "kl": 0.34958504140377045, "learning_rate": 4.946354226097261e-06, "loss": 0.014, "num_tokens": 3268216.0, "reward": 0.75439453125, "reward_std": 0.01355811208486557, "rewards//mean": 0.75439453125, "rewards//std": 0.04268030822277069, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0758, "grad_norm": 1.3659820556640625, "kl": 0.3736311122775078, "learning_rate": 4.946026803224888e-06, "loss": 0.0149, "num_tokens": 3276832.0, "reward": 0.77215576171875, "reward_std": 0.00643074419349432, "rewards//mean": 0.77215576171875, "rewards//std": 0.028365911915898323, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.076, "grad_norm": 1.0803120136260986, "kl": 0.3101845942437649, "learning_rate": 4.945698395093212e-06, "loss": 0.0124, "num_tokens": 3285440.0, "reward": 0.7320556640625, "reward_std": 0.012730730697512627, "rewards//mean": 0.7320556640625, "rewards//std": 0.04066479951143265, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0762, "grad_norm": 1.184386134147644, "kl": 0.33812105655670166, "learning_rate": 4.9453690018345144e-06, "loss": 0.0135, "num_tokens": 3294056.0, "reward": 0.765869140625, "reward_std": 0.010241934098303318, "rewards//mean": 0.765869140625, "rewards//std": 0.03962195664644241, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0764, "grad_norm": 1.0949220657348633, "kl": 0.32981303334236145, "learning_rate": 4.9450386235814755e-06, "loss": 0.0132, "num_tokens": 3302832.0, "reward": 0.74432373046875, "reward_std": 0.011420607566833496, "rewards//mean": 0.74432373046875, "rewards//std": 0.036144278943538666, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0766, "grad_norm": 0.9847798943519592, "kl": 0.2912776917219162, "learning_rate": 4.944707260467172e-06, "loss": 0.0117, "num_tokens": 3311488.0, "reward": 0.751953125, "reward_std": 0.012202553451061249, "rewards//mean": 0.751953125, "rewards//std": 0.03254092112183571, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0768, "grad_norm": 1.0881670713424683, "kl": 0.3519246131181717, "learning_rate": 4.944374912625076e-06, "loss": 0.0141, "num_tokens": 3320128.0, "reward": 0.721435546875, "reward_std": 0.010593841783702374, "rewards//mean": 0.721435546875, "rewards//std": 0.04617677628993988, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.077, "grad_norm": 0.9181442856788635, "kl": 0.348065834492445, "learning_rate": 4.944041580189057e-06, "loss": 0.0139, "num_tokens": 3328744.0, "reward": 0.7432861328125, "reward_std": 0.009224426001310349, "rewards//mean": 0.7432861328125, "rewards//std": 0.026073908433318138, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0772, "grad_norm": 0.9651432037353516, "kl": 0.2787171872332692, "learning_rate": 4.943707263293382e-06, "loss": 0.0111, "num_tokens": 3337352.0, "reward": 0.741943359375, "reward_std": 0.014116060920059681, "rewards//mean": 0.741943359375, "rewards//std": 0.03688023239374161, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0774, "grad_norm": 1.0564086437225342, "kl": 0.33349500969052315, "learning_rate": 4.943371962072714e-06, "loss": 0.0133, "num_tokens": 3345952.0, "reward": 0.779052734375, "reward_std": 0.009006861597299576, "rewards//mean": 0.779052734375, "rewards//std": 0.03128775954246521, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0776, "grad_norm": 0.905786395072937, "kl": 0.3205620348453522, "learning_rate": 4.9430356766621114e-06, "loss": 0.0128, "num_tokens": 3354552.0, "reward": 0.7352294921875, "reward_std": 0.00872873142361641, "rewards//mean": 0.7352294921875, "rewards//std": 0.030068732798099518, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0778, "grad_norm": 1.0599631071090698, "kl": 0.3847878910601139, "learning_rate": 4.942698407197031e-06, "loss": 0.0154, "num_tokens": 3363192.0, "reward": 0.741943359375, "reward_std": 0.008737495169043541, "rewards//mean": 0.741943359375, "rewards//std": 0.030614785850048065, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.078, "grad_norm": 0.9868512749671936, "kl": 0.32742261700332165, "learning_rate": 4.942360153813324e-06, "loss": 0.0131, "num_tokens": 3371832.0, "reward": 0.74371337890625, "reward_std": 0.007218184880912304, "rewards//mean": 0.74371337890625, "rewards//std": 0.035629238933324814, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0782, "grad_norm": 1.0115092992782593, "kl": 0.3397269584238529, "learning_rate": 4.9420209166472386e-06, "loss": 0.0136, "num_tokens": 3380448.0, "reward": 0.770751953125, "reward_std": 0.015268933959305286, "rewards//mean": 0.770751953125, "rewards//std": 0.03136507794260979, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0784, "grad_norm": 0.9326263666152954, "kl": 0.32895535230636597, "learning_rate": 4.9416806958354206e-06, "loss": 0.0132, "num_tokens": 3389040.0, "reward": 0.755126953125, "reward_std": 0.013349458575248718, "rewards//mean": 0.755126953125, "rewards//std": 0.031357355415821075, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0786, "grad_norm": 0.7784397602081299, "kl": 0.3131994195282459, "learning_rate": 4.9413394915149094e-06, "loss": 0.0125, "num_tokens": 3397736.0, "reward": 0.748779296875, "reward_std": 0.011831846088171005, "rewards//mean": 0.748779296875, "rewards//std": 0.03131871297955513, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0788, "grad_norm": 0.9137967824935913, "kl": 0.3138259369879961, "learning_rate": 4.940997303823144e-06, "loss": 0.0126, "num_tokens": 3406328.0, "reward": 0.7393798828125, "reward_std": 0.009359323419630527, "rewards//mean": 0.7393798828125, "rewards//std": 0.01415227074176073, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.079, "grad_norm": 0.8206107020378113, "kl": 0.2667227676138282, "learning_rate": 4.940654132897957e-06, "loss": 0.0107, "num_tokens": 3414960.0, "reward": 0.7227783203125, "reward_std": 0.015779396519064903, "rewards//mean": 0.7227783203125, "rewards//std": 0.04207717627286911, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0792, "grad_norm": 1.1698734760284424, "kl": 0.3585771173238754, "learning_rate": 4.940309978877576e-06, "loss": 0.0143, "num_tokens": 3423696.0, "reward": 0.72821044921875, "reward_std": 0.00786328874528408, "rewards//mean": 0.72821044921875, "rewards//std": 0.03962152823805809, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0794, "grad_norm": 0.9304360151290894, "kl": 0.2744011953473091, "learning_rate": 4.939964841900627e-06, "loss": 0.011, "num_tokens": 3432248.0, "reward": 0.74176025390625, "reward_std": 0.012099739164113998, "rewards//mean": 0.74176025390625, "rewards//std": 0.030537491664290428, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0796, "grad_norm": 0.9549676775932312, "kl": 0.32337421737611294, "learning_rate": 4.9396187221061324e-06, "loss": 0.0129, "num_tokens": 3440928.0, "reward": 0.7620849609375, "reward_std": 0.008526656776666641, "rewards//mean": 0.7620849609375, "rewards//std": 0.02168460376560688, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0798, "grad_norm": 0.8273400664329529, "kl": 0.30577356554567814, "learning_rate": 4.939271619633508e-06, "loss": 0.0122, "num_tokens": 3449560.0, "reward": 0.74383544921875, "reward_std": 0.014349598437547684, "rewards//mean": 0.74383544921875, "rewards//std": 0.032983046025037766, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.08, "grad_norm": 0.9757963418960571, "kl": 0.322640098631382, "learning_rate": 4.938923534622567e-06, "loss": 0.0129, "num_tokens": 3458264.0, "reward": 0.761962890625, "reward_std": 0.011695911176502705, "rewards//mean": 0.761962890625, "rewards//std": 0.02991858497262001, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0802, "grad_norm": 0.8469364643096924, "kl": 0.2975729079917073, "learning_rate": 4.938574467213519e-06, "loss": 0.0119, "num_tokens": 3466896.0, "reward": 0.76605224609375, "reward_std": 0.007424627430737019, "rewards//mean": 0.76605224609375, "rewards//std": 0.03303670138120651, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0804, "grad_norm": 0.8685148358345032, "kl": 0.3087438400834799, "learning_rate": 4.938224417546965e-06, "loss": 0.0123, "num_tokens": 3475584.0, "reward": 0.76739501953125, "reward_std": 0.011819308623671532, "rewards//mean": 0.76739501953125, "rewards//std": 0.03834869712591171, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0806, "grad_norm": 1.04423987865448, "kl": 0.2927042469382286, "learning_rate": 4.937873385763909e-06, "loss": 0.0117, "num_tokens": 3484184.0, "reward": 0.77471923828125, "reward_std": 0.018968980759382248, "rewards//mean": 0.77471923828125, "rewards//std": 0.0364985354244709, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0808, "grad_norm": 0.994342565536499, "kl": 0.250274121761322, "learning_rate": 4.9375213720057435e-06, "loss": 0.01, "num_tokens": 3492760.0, "reward": 0.76336669921875, "reward_std": 0.010855021886527538, "rewards//mean": 0.76336669921875, "rewards//std": 0.022775080054998398, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.081, "grad_norm": 0.8562494516372681, "kl": 0.241558950394392, "learning_rate": 4.937168376414261e-06, "loss": 0.0097, "num_tokens": 3501344.0, "reward": 0.7479248046875, "reward_std": 0.015270461328327656, "rewards//mean": 0.7479248046875, "rewards//std": 0.037885088473558426, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0812, "grad_norm": 0.8906681537628174, "kl": 0.2702600210905075, "learning_rate": 4.9368143991316485e-06, "loss": 0.0108, "num_tokens": 3510008.0, "reward": 0.75970458984375, "reward_std": 0.011862866580486298, "rewards//mean": 0.75970458984375, "rewards//std": 0.03386126458644867, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0814, "grad_norm": 0.83770751953125, "kl": 0.2678908761590719, "learning_rate": 4.936459440300487e-06, "loss": 0.0107, "num_tokens": 3518752.0, "reward": 0.7633056640625, "reward_std": 0.010113585740327835, "rewards//mean": 0.7633056640625, "rewards//std": 0.026380963623523712, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0816, "grad_norm": 0.7807222604751587, "kl": 0.28538877703249454, "learning_rate": 4.936103500063755e-06, "loss": 0.0114, "num_tokens": 3527400.0, "reward": 0.7591552734375, "reward_std": 0.0059563172981143, "rewards//mean": 0.7591552734375, "rewards//std": 0.03868379816412926, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0818, "grad_norm": 0.9765918850898743, "kl": 0.26162285543978214, "learning_rate": 4.935746578564825e-06, "loss": 0.0105, "num_tokens": 3536064.0, "reward": 0.7486572265625, "reward_std": 0.011643504723906517, "rewards//mean": 0.7486572265625, "rewards//std": 0.025781990960240364, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.082, "grad_norm": 0.9713819622993469, "kl": 0.2695306558161974, "learning_rate": 4.935388675947463e-06, "loss": 0.0108, "num_tokens": 3544736.0, "reward": 0.71881103515625, "reward_std": 0.010069970041513443, "rewards//mean": 0.71881103515625, "rewards//std": 0.03545119985938072, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0822, "grad_norm": 0.9451349377632141, "kl": 0.24540182575583458, "learning_rate": 4.935029792355834e-06, "loss": 0.0098, "num_tokens": 3553424.0, "reward": 0.73486328125, "reward_std": 0.014077549800276756, "rewards//mean": 0.73486328125, "rewards//std": 0.03547174483537674, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0824, "grad_norm": 0.7770392298698425, "kl": 0.2268413919955492, "learning_rate": 4.934669927934496e-06, "loss": 0.0091, "num_tokens": 3562040.0, "reward": 0.73150634765625, "reward_std": 0.013487438671290874, "rewards//mean": 0.73150634765625, "rewards//std": 0.04288830980658531, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0826, "grad_norm": 0.674587607383728, "kl": 0.28416365571320057, "learning_rate": 4.9343090828284025e-06, "loss": 0.0114, "num_tokens": 3570696.0, "reward": 0.74090576171875, "reward_std": 0.010635284706950188, "rewards//mean": 0.74090576171875, "rewards//std": 0.03285244479775429, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0828, "grad_norm": 0.8351263403892517, "kl": 0.2428603582084179, "learning_rate": 4.933947257182901e-06, "loss": 0.0097, "num_tokens": 3579256.0, "reward": 0.72833251953125, "reward_std": 0.00818649772554636, "rewards//mean": 0.72833251953125, "rewards//std": 0.034561701118946075, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.083, "grad_norm": 0.9314544200897217, "kl": 0.26274044439196587, "learning_rate": 4.933584451143736e-06, "loss": 0.0105, "num_tokens": 3587928.0, "reward": 0.72344970703125, "reward_std": 0.009486508555710316, "rewards//mean": 0.72344970703125, "rewards//std": 0.04276353493332863, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0832, "grad_norm": 0.8246920704841614, "kl": 0.2645695861428976, "learning_rate": 4.933220664857045e-06, "loss": 0.0106, "num_tokens": 3596568.0, "reward": 0.764892578125, "reward_std": 0.011997243389487267, "rewards//mean": 0.764892578125, "rewards//std": 0.02408854104578495, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0834, "grad_norm": 0.7828191518783569, "kl": 0.23742634430527687, "learning_rate": 4.93285589846936e-06, "loss": 0.0095, "num_tokens": 3605248.0, "reward": 0.77264404296875, "reward_std": 0.010765868239104748, "rewards//mean": 0.77264404296875, "rewards//std": 0.02570635825395584, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0836, "grad_norm": 0.8400963544845581, "kl": 0.2275092527270317, "learning_rate": 4.932490152127611e-06, "loss": 0.0091, "num_tokens": 3613840.0, "reward": 0.751708984375, "reward_std": 0.01651906780898571, "rewards//mean": 0.751708984375, "rewards//std": 0.042007505893707275, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0838, "grad_norm": 0.9282857179641724, "kl": 0.24514971487224102, "learning_rate": 4.93212342597912e-06, "loss": 0.0098, "num_tokens": 3622464.0, "reward": 0.7489013671875, "reward_std": 0.015580926090478897, "rewards//mean": 0.7489013671875, "rewards//std": 0.04658199101686478, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.084, "grad_norm": 0.9025923609733582, "kl": 0.23584598675370216, "learning_rate": 4.931755720171603e-06, "loss": 0.0094, "num_tokens": 3631032.0, "reward": 0.7366943359375, "reward_std": 0.011178325861692429, "rewards//mean": 0.7366943359375, "rewards//std": 0.03618305176496506, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0842, "grad_norm": 0.7268020510673523, "kl": 0.254648195579648, "learning_rate": 4.931387034853173e-06, "loss": 0.0102, "num_tokens": 3639672.0, "reward": 0.74639892578125, "reward_std": 0.009266193956136703, "rewards//mean": 0.74639892578125, "rewards//std": 0.03361223638057709, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0844, "grad_norm": 0.8384131789207458, "kl": 0.2413597498089075, "learning_rate": 4.9310173701723365e-06, "loss": 0.0097, "num_tokens": 3648336.0, "reward": 0.75970458984375, "reward_std": 0.008040700107812881, "rewards//mean": 0.75970458984375, "rewards//std": 0.026702843606472015, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0846, "grad_norm": 0.7907002568244934, "kl": 0.24139040149748325, "learning_rate": 4.930646726277994e-06, "loss": 0.0097, "num_tokens": 3656896.0, "reward": 0.7911376953125, "reward_std": 0.009463133290410042, "rewards//mean": 0.7911376953125, "rewards//std": 0.0322941355407238, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0848, "grad_norm": 0.750485360622406, "kl": 0.23888413794338703, "learning_rate": 4.930275103319441e-06, "loss": 0.0096, "num_tokens": 3665504.0, "reward": 0.73126220703125, "reward_std": 0.013249299488961697, "rewards//mean": 0.73126220703125, "rewards//std": 0.024657992646098137, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.085, "grad_norm": 0.9737104773521423, "kl": 0.2315953504294157, "learning_rate": 4.9299025014463665e-06, "loss": 0.0093, "num_tokens": 3674080.0, "reward": 0.7303466796875, "reward_std": 0.013063129037618637, "rewards//mean": 0.7303466796875, "rewards//std": 0.0421634316444397, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0852, "grad_norm": 0.7097475528717041, "kl": 0.2388191670179367, "learning_rate": 4.9295289208088545e-06, "loss": 0.0096, "num_tokens": 3682640.0, "reward": 0.74786376953125, "reward_std": 0.009970373474061489, "rewards//mean": 0.74786376953125, "rewards//std": 0.03215102478861809, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0854, "grad_norm": 0.78131502866745, "kl": 0.2351592220366001, "learning_rate": 4.929154361557384e-06, "loss": 0.0094, "num_tokens": 3691304.0, "reward": 0.77752685546875, "reward_std": 0.010701300576329231, "rewards//mean": 0.77752685546875, "rewards//std": 0.025538571178913116, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0856, "grad_norm": 0.813309907913208, "kl": 0.2603413835167885, "learning_rate": 4.928778823842828e-06, "loss": 0.0104, "num_tokens": 3700024.0, "reward": 0.75299072265625, "reward_std": 0.007520100101828575, "rewards//mean": 0.75299072265625, "rewards//std": 0.03588072210550308, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0858, "grad_norm": 0.7221551537513733, "kl": 0.18088798597455025, "learning_rate": 4.928402307816452e-06, "loss": 0.0072, "num_tokens": 3708784.0, "reward": 0.74078369140625, "reward_std": 0.01760246977210045, "rewards//mean": 0.74078369140625, "rewards//std": 0.04441485553979874, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.086, "grad_norm": 0.8385155200958252, "kl": 0.21827073767781258, "learning_rate": 4.928024813629917e-06, "loss": 0.0087, "num_tokens": 3717392.0, "reward": 0.7694091796875, "reward_std": 0.009142270311713219, "rewards//mean": 0.7694091796875, "rewards//std": 0.026874415576457977, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0862, "grad_norm": 0.6926012635231018, "kl": 0.22130563855171204, "learning_rate": 4.927646341435276e-06, "loss": 0.0089, "num_tokens": 3726008.0, "reward": 0.75091552734375, "reward_std": 0.010623462498188019, "rewards//mean": 0.75091552734375, "rewards//std": 0.028200527653098106, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0864, "grad_norm": 0.8035763502120972, "kl": 0.252463411539793, "learning_rate": 4.92726689138498e-06, "loss": 0.0101, "num_tokens": 3734584.0, "reward": 0.74298095703125, "reward_std": 0.011688274331390858, "rewards//mean": 0.74298095703125, "rewards//std": 0.026449372991919518, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0866, "grad_norm": 0.6842226982116699, "kl": 0.23155632801353931, "learning_rate": 4.92688646363187e-06, "loss": 0.0093, "num_tokens": 3743296.0, "reward": 0.7818603515625, "reward_std": 0.011189782060682774, "rewards//mean": 0.7818603515625, "rewards//std": 0.033827103674411774, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0868, "grad_norm": 0.6330403089523315, "kl": 0.21206197701394558, "learning_rate": 4.926505058329184e-06, "loss": 0.0085, "num_tokens": 3752000.0, "reward": 0.7442626953125, "reward_std": 0.01008138619363308, "rewards//mean": 0.7442626953125, "rewards//std": 0.02858632244169712, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.087, "grad_norm": 0.877251386642456, "kl": 0.22307201102375984, "learning_rate": 4.9261226756305495e-06, "loss": 0.0089, "num_tokens": 3760584.0, "reward": 0.71490478515625, "reward_std": 0.006664145737886429, "rewards//mean": 0.71490478515625, "rewards//std": 0.04637888818979263, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0872, "grad_norm": 0.6964420676231384, "kl": 0.2119651883840561, "learning_rate": 4.925739315689991e-06, "loss": 0.0085, "num_tokens": 3769168.0, "reward": 0.7415771484375, "reward_std": 0.00940138753503561, "rewards//mean": 0.7415771484375, "rewards//std": 0.03519562631845474, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0874, "grad_norm": 0.737678587436676, "kl": 0.22590421885252, "learning_rate": 4.925354978661928e-06, "loss": 0.009, "num_tokens": 3777808.0, "reward": 0.7454833984375, "reward_std": 0.008858283050358295, "rewards//mean": 0.7454833984375, "rewards//std": 0.027649568393826485, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0876, "grad_norm": 0.6990619897842407, "kl": 0.21303631737828255, "learning_rate": 4.924969664701168e-06, "loss": 0.0085, "num_tokens": 3786392.0, "reward": 0.74658203125, "reward_std": 0.011121334508061409, "rewards//mean": 0.74658203125, "rewards//std": 0.025176284834742546, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0878, "grad_norm": 0.6292837858200073, "kl": 0.21526087448000908, "learning_rate": 4.924583373962918e-06, "loss": 0.0086, "num_tokens": 3795064.0, "reward": 0.7734375, "reward_std": 0.010552143678069115, "rewards//mean": 0.7734375, "rewards//std": 0.02722390927374363, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.088, "grad_norm": 0.6078249216079712, "kl": 0.24454114213585854, "learning_rate": 4.924196106602774e-06, "loss": 0.0098, "num_tokens": 3803632.0, "reward": 0.74200439453125, "reward_std": 0.009583698585629463, "rewards//mean": 0.74200439453125, "rewards//std": 0.053606946021318436, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0882, "grad_norm": 0.7330633997917175, "kl": 0.2058623656630516, "learning_rate": 4.9238078627767285e-06, "loss": 0.0082, "num_tokens": 3812296.0, "reward": 0.77801513671875, "reward_std": 0.01102867629379034, "rewards//mean": 0.77801513671875, "rewards//std": 0.026662563905119896, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0884, "grad_norm": 0.7553256154060364, "kl": 0.21089842356741428, "learning_rate": 4.923418642641166e-06, "loss": 0.0084, "num_tokens": 3820952.0, "reward": 0.7528076171875, "reward_std": 0.011244535446166992, "rewards//mean": 0.7528076171875, "rewards//std": 0.039164479821920395, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0886, "grad_norm": 0.7146939039230347, "kl": 0.22701359912753105, "learning_rate": 4.923028446352864e-06, "loss": 0.0091, "num_tokens": 3829712.0, "reward": 0.71051025390625, "reward_std": 0.016265802085399628, "rewards//mean": 0.71051025390625, "rewards//std": 0.048049308359622955, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0888, "grad_norm": 0.807270884513855, "kl": 0.25617854483425617, "learning_rate": 4.922637274068993e-06, "loss": 0.0102, "num_tokens": 3838296.0, "reward": 0.72760009765625, "reward_std": 0.008829087018966675, "rewards//mean": 0.72760009765625, "rewards//std": 0.017555613070726395, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.089, "grad_norm": 0.8020032048225403, "kl": 0.2054343856871128, "learning_rate": 4.9222451259471185e-06, "loss": 0.0082, "num_tokens": 3846960.0, "reward": 0.74591064453125, "reward_std": 0.008869110606610775, "rewards//mean": 0.74591064453125, "rewards//std": 0.029458094388246536, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0892, "grad_norm": 0.6758546233177185, "kl": 0.22217164561152458, "learning_rate": 4.921852002145196e-06, "loss": 0.0089, "num_tokens": 3855688.0, "reward": 0.7484130859375, "reward_std": 0.008662059903144836, "rewards//mean": 0.7484130859375, "rewards//std": 0.0318770706653595, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0894, "grad_norm": 0.8543888926506042, "kl": 0.24924145638942719, "learning_rate": 4.921457902821578e-06, "loss": 0.01, "num_tokens": 3864312.0, "reward": 0.759765625, "reward_std": 0.017270008102059364, "rewards//mean": 0.759765625, "rewards//std": 0.029421651735901833, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0896, "grad_norm": 0.6530463695526123, "kl": 0.2602735925465822, "learning_rate": 4.921062828135006e-06, "loss": 0.0104, "num_tokens": 3872992.0, "reward": 0.75787353515625, "reward_std": 0.015410242602229118, "rewards//mean": 0.75787353515625, "rewards//std": 0.02605554088950157, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0898, "grad_norm": 0.9578370451927185, "kl": 0.2606674674898386, "learning_rate": 4.920666778244616e-06, "loss": 0.0104, "num_tokens": 3881712.0, "reward": 0.77020263671875, "reward_std": 0.008511267602443695, "rewards//mean": 0.77020263671875, "rewards//std": 0.030421772971749306, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.09, "grad_norm": 0.6777032017707825, "kl": 0.2329980656504631, "learning_rate": 4.920269753309937e-06, "loss": 0.0093, "num_tokens": 3890312.0, "reward": 0.7421875, "reward_std": 0.00921049527823925, "rewards//mean": 0.7421875, "rewards//std": 0.031932346522808075, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0902, "grad_norm": 0.7345518469810486, "kl": 0.23562784306704998, "learning_rate": 4.919871753490892e-06, "loss": 0.0094, "num_tokens": 3899016.0, "reward": 0.73236083984375, "reward_std": 0.009874099865555763, "rewards//mean": 0.73236083984375, "rewards//std": 0.04835017770528793, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0904, "grad_norm": 0.6494887471199036, "kl": 0.22641596291214228, "learning_rate": 4.919472778947793e-06, "loss": 0.0091, "num_tokens": 3907752.0, "reward": 0.7728271484375, "reward_std": 0.008734005503356457, "rewards//mean": 0.7728271484375, "rewards//std": 0.019265538081526756, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0906, "grad_norm": 0.6345562934875488, "kl": 0.22991832438856363, "learning_rate": 4.919072829841347e-06, "loss": 0.0092, "num_tokens": 3916496.0, "reward": 0.76580810546875, "reward_std": 0.009036125615239143, "rewards//mean": 0.76580810546875, "rewards//std": 0.03826966881752014, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0908, "grad_norm": 0.7087083458900452, "kl": 0.23438394255936146, "learning_rate": 4.918671906332656e-06, "loss": 0.0094, "num_tokens": 3925008.0, "reward": 0.732666015625, "reward_std": 0.011662309989333153, "rewards//mean": 0.732666015625, "rewards//std": 0.034244511276483536, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.091, "grad_norm": 0.7872934937477112, "kl": 0.2746329791843891, "learning_rate": 4.91827000858321e-06, "loss": 0.011, "num_tokens": 3933696.0, "reward": 0.73626708984375, "reward_std": 0.007544973865151405, "rewards//mean": 0.73626708984375, "rewards//std": 0.03399689868092537, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0912, "grad_norm": 0.6922155022621155, "kl": 0.22344142571091652, "learning_rate": 4.917867136754894e-06, "loss": 0.0089, "num_tokens": 3942304.0, "reward": 0.77667236328125, "reward_std": 0.013890949077904224, "rewards//mean": 0.77667236328125, "rewards//std": 0.032455623149871826, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0914, "grad_norm": 0.8709951043128967, "kl": 0.29020215198397636, "learning_rate": 4.917463291009984e-06, "loss": 0.0116, "num_tokens": 3950968.0, "reward": 0.750244140625, "reward_std": 0.00925417710095644, "rewards//mean": 0.750244140625, "rewards//std": 0.02082025073468685, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0916, "grad_norm": 0.7772853374481201, "kl": 0.2596856001764536, "learning_rate": 4.917058471511149e-06, "loss": 0.0104, "num_tokens": 3959640.0, "reward": 0.74822998046875, "reward_std": 0.012660522013902664, "rewards//mean": 0.74822998046875, "rewards//std": 0.03531729802489281, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0918, "grad_norm": 0.9631777405738831, "kl": 0.2647391799837351, "learning_rate": 4.916652678421451e-06, "loss": 0.0106, "num_tokens": 3968256.0, "reward": 0.76495361328125, "reward_std": 0.008287420496344566, "rewards//mean": 0.76495361328125, "rewards//std": 0.021716255694627762, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.092, "grad_norm": 0.6951785683631897, "kl": 0.24628694355487823, "learning_rate": 4.916245911904344e-06, "loss": 0.0099, "num_tokens": 3977024.0, "reward": 0.772216796875, "reward_std": 0.012418104335665703, "rewards//mean": 0.772216796875, "rewards//std": 0.024899380281567574, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0922, "grad_norm": 0.7456372976303101, "kl": 0.25542332231998444, "learning_rate": 4.9158381721236715e-06, "loss": 0.0102, "num_tokens": 3985664.0, "reward": 0.72442626953125, "reward_std": 0.011360350996255875, "rewards//mean": 0.72442626953125, "rewards//std": 0.03637805953621864, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0924, "grad_norm": 0.7007219791412354, "kl": 0.2571563795208931, "learning_rate": 4.915429459243673e-06, "loss": 0.0103, "num_tokens": 3994480.0, "reward": 0.75836181640625, "reward_std": 0.00619722343981266, "rewards//mean": 0.75836181640625, "rewards//std": 0.025561677291989326, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0926, "grad_norm": 0.6913952231407166, "kl": 0.2788440138101578, "learning_rate": 4.9150197734289764e-06, "loss": 0.0112, "num_tokens": 4003112.0, "reward": 0.7447509765625, "reward_std": 0.007206355221569538, "rewards//mean": 0.7447509765625, "rewards//std": 0.03733934462070465, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0928, "grad_norm": 0.675417423248291, "kl": 0.23573023825883865, "learning_rate": 4.9146091148446055e-06, "loss": 0.0094, "num_tokens": 4011720.0, "reward": 0.7464599609375, "reward_std": 0.015771940350532532, "rewards//mean": 0.7464599609375, "rewards//std": 0.04249957576394081, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.093, "grad_norm": 0.7302259802818298, "kl": 0.27733396738767624, "learning_rate": 4.91419748365597e-06, "loss": 0.0111, "num_tokens": 4020384.0, "reward": 0.734130859375, "reward_std": 0.007520634680986404, "rewards//mean": 0.734130859375, "rewards//std": 0.03848075866699219, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0932, "grad_norm": 0.7138959169387817, "kl": 0.2417994262650609, "learning_rate": 4.9137848800288775e-06, "loss": 0.0097, "num_tokens": 4029016.0, "reward": 0.75616455078125, "reward_std": 0.010032439604401588, "rewards//mean": 0.75616455078125, "rewards//std": 0.02122269943356514, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0934, "grad_norm": 0.7322536706924438, "kl": 0.27695489302277565, "learning_rate": 4.9133713041295235e-06, "loss": 0.0111, "num_tokens": 4037688.0, "reward": 0.7783203125, "reward_std": 0.009051885455846786, "rewards//mean": 0.7783203125, "rewards//std": 0.03194751217961311, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0936, "grad_norm": 0.8252130746841431, "kl": 0.26474233716726303, "learning_rate": 4.912956756124498e-06, "loss": 0.0106, "num_tokens": 4046304.0, "reward": 0.7481689453125, "reward_std": 0.0064436085522174835, "rewards//mean": 0.7481689453125, "rewards//std": 0.0320381224155426, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0938, "grad_norm": 0.6932944059371948, "kl": 0.27318440936505795, "learning_rate": 4.912541236180779e-06, "loss": 0.0109, "num_tokens": 4054992.0, "reward": 0.71624755859375, "reward_std": 0.009076687507331371, "rewards//mean": 0.71624755859375, "rewards//std": 0.03491336852312088, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.094, "grad_norm": 0.6723066568374634, "kl": 0.2577223964035511, "learning_rate": 4.9121247444657384e-06, "loss": 0.0103, "num_tokens": 4063624.0, "reward": 0.7515869140625, "reward_std": 0.005873112007975578, "rewards//mean": 0.7515869140625, "rewards//std": 0.02327904850244522, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0942, "grad_norm": 0.8410773277282715, "kl": 0.2808041274547577, "learning_rate": 4.91170728114714e-06, "loss": 0.0112, "num_tokens": 4072256.0, "reward": 0.76995849609375, "reward_std": 0.005484414286911488, "rewards//mean": 0.76995849609375, "rewards//std": 0.022884486243128777, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0944, "grad_norm": 0.6858919262886047, "kl": 0.2676295880228281, "learning_rate": 4.911288846393136e-06, "loss": 0.0107, "num_tokens": 4080920.0, "reward": 0.742431640625, "reward_std": 0.012072248384356499, "rewards//mean": 0.742431640625, "rewards//std": 0.022215967997908592, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0946, "grad_norm": 0.6118029356002808, "kl": 0.21254741679877043, "learning_rate": 4.910869440372274e-06, "loss": 0.0085, "num_tokens": 4089496.0, "reward": 0.739990234375, "reward_std": 0.01614592783153057, "rewards//mean": 0.739990234375, "rewards//std": 0.03497926890850067, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0948, "grad_norm": 0.7061100602149963, "kl": 0.25430491380393505, "learning_rate": 4.910449063253489e-06, "loss": 0.0102, "num_tokens": 4098072.0, "reward": 0.75494384765625, "reward_std": 0.01261814869940281, "rewards//mean": 0.75494384765625, "rewards//std": 0.03197965770959854, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.095, "grad_norm": 0.6372392773628235, "kl": 0.29845254495739937, "learning_rate": 4.9100277152061105e-06, "loss": 0.0119, "num_tokens": 4106672.0, "reward": 0.7095947265625, "reward_std": 0.009667182341217995, "rewards//mean": 0.7095947265625, "rewards//std": 0.045762356370687485, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0952, "grad_norm": 0.6701680421829224, "kl": 0.25746934674680233, "learning_rate": 4.9096053963998555e-06, "loss": 0.0103, "num_tokens": 4115384.0, "reward": 0.742919921875, "reward_std": 0.01074405387043953, "rewards//mean": 0.742919921875, "rewards//std": 0.027916818857192993, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0954, "grad_norm": 0.8074530363082886, "kl": 0.2788533419370651, "learning_rate": 4.909182107004835e-06, "loss": 0.0112, "num_tokens": 4124064.0, "reward": 0.73681640625, "reward_std": 0.006065527442842722, "rewards//mean": 0.73681640625, "rewards//std": 0.03234307840466499, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0956, "grad_norm": 0.7189854979515076, "kl": 0.27882962487637997, "learning_rate": 4.908757847191551e-06, "loss": 0.0112, "num_tokens": 4132704.0, "reward": 0.74261474609375, "reward_std": 0.01600596494972706, "rewards//mean": 0.74261474609375, "rewards//std": 0.043848197907209396, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0958, "grad_norm": 0.7320364117622375, "kl": 0.2482055276632309, "learning_rate": 4.908332617130893e-06, "loss": 0.0099, "num_tokens": 4141304.0, "reward": 0.76947021484375, "reward_std": 0.007574299816042185, "rewards//mean": 0.76947021484375, "rewards//std": 0.030989211052656174, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.096, "grad_norm": 0.735087513923645, "kl": 0.24100211029872298, "learning_rate": 4.907906416994146e-06, "loss": 0.0096, "num_tokens": 4149840.0, "reward": 0.77349853515625, "reward_std": 0.013887856155633926, "rewards//mean": 0.77349853515625, "rewards//std": 0.03502505645155907, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0962, "grad_norm": 0.693162739276886, "kl": 0.27953667007386684, "learning_rate": 4.907479246952981e-06, "loss": 0.0112, "num_tokens": 4158448.0, "reward": 0.718017578125, "reward_std": 0.00825615506619215, "rewards//mean": 0.718017578125, "rewards//std": 0.03687366470694542, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0964, "grad_norm": 0.6603836417198181, "kl": 0.23947274032980204, "learning_rate": 4.907051107179464e-06, "loss": 0.0096, "num_tokens": 4167072.0, "reward": 0.748046875, "reward_std": 0.013273896649479866, "rewards//mean": 0.748046875, "rewards//std": 0.0389009565114975, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0966, "grad_norm": 0.6945295929908752, "kl": 0.2583098318427801, "learning_rate": 4.9066219978460485e-06, "loss": 0.0103, "num_tokens": 4175656.0, "reward": 0.763916015625, "reward_std": 0.007394441869109869, "rewards//mean": 0.763916015625, "rewards//std": 0.03223337233066559, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0968, "grad_norm": 0.6192371845245361, "kl": 0.2828360088169575, "learning_rate": 4.90619191912558e-06, "loss": 0.0113, "num_tokens": 4184232.0, "reward": 0.7418212890625, "reward_std": 0.00855693407356739, "rewards//mean": 0.7418212890625, "rewards//std": 0.02327904850244522, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.097, "grad_norm": 0.6635370850563049, "kl": 0.2756698988378048, "learning_rate": 4.905760871191295e-06, "loss": 0.011, "num_tokens": 4192904.0, "reward": 0.71905517578125, "reward_std": 0.007833951152861118, "rewards//mean": 0.71905517578125, "rewards//std": 0.04106537625193596, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0972, "grad_norm": 0.669340968132019, "kl": 0.2513146288692951, "learning_rate": 4.9053288542168185e-06, "loss": 0.0101, "num_tokens": 4201520.0, "reward": 0.7288818359375, "reward_std": 0.011037398129701614, "rewards//mean": 0.7288818359375, "rewards//std": 0.022028131410479546, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0974, "grad_norm": 0.6306434273719788, "kl": 0.2504331525415182, "learning_rate": 4.904895868376167e-06, "loss": 0.01, "num_tokens": 4210128.0, "reward": 0.76690673828125, "reward_std": 0.010853402316570282, "rewards//mean": 0.76690673828125, "rewards//std": 0.03223472461104393, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0976, "grad_norm": 0.6316109299659729, "kl": 0.24033397436141968, "learning_rate": 4.904461913843747e-06, "loss": 0.0096, "num_tokens": 4218696.0, "reward": 0.73101806640625, "reward_std": 0.009801981970667839, "rewards//mean": 0.73101806640625, "rewards//std": 0.033093471080064774, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0978, "grad_norm": 0.692654013633728, "kl": 0.30100347846746445, "learning_rate": 4.904026990794356e-06, "loss": 0.012, "num_tokens": 4227304.0, "reward": 0.7596435546875, "reward_std": 0.009308423846960068, "rewards//mean": 0.7596435546875, "rewards//std": 0.032158851623535156, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.098, "grad_norm": 0.6925479769706726, "kl": 0.26741546019911766, "learning_rate": 4.903591099403181e-06, "loss": 0.0107, "num_tokens": 4235968.0, "reward": 0.7447509765625, "reward_std": 0.01731371134519577, "rewards//mean": 0.7447509765625, "rewards//std": 0.035796087235212326, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0982, "grad_norm": 0.5992981791496277, "kl": 0.24271851778030396, "learning_rate": 4.903154239845798e-06, "loss": 0.0097, "num_tokens": 4244616.0, "reward": 0.73974609375, "reward_std": 0.010727489367127419, "rewards//mean": 0.73974609375, "rewards//std": 0.02847197651863098, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0984, "grad_norm": 0.6184040307998657, "kl": 0.24116972647607327, "learning_rate": 4.902716412298174e-06, "loss": 0.0096, "num_tokens": 4253168.0, "reward": 0.72052001953125, "reward_std": 0.011508103460073471, "rewards//mean": 0.72052001953125, "rewards//std": 0.03347956761717796, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0986, "grad_norm": 0.6333391070365906, "kl": 0.2607311652973294, "learning_rate": 4.902277616936667e-06, "loss": 0.0104, "num_tokens": 4261768.0, "reward": 0.7559814453125, "reward_std": 0.009352114051580429, "rewards//mean": 0.7559814453125, "rewards//std": 0.030678750947117805, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0988, "grad_norm": 0.700835108757019, "kl": 0.2442559963092208, "learning_rate": 4.901837853938024e-06, "loss": 0.0098, "num_tokens": 4270328.0, "reward": 0.7232666015625, "reward_std": 0.011704965494573116, "rewards//mean": 0.7232666015625, "rewards//std": 0.039587363600730896, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.099, "grad_norm": 0.687675952911377, "kl": 0.28429468162357807, "learning_rate": 4.90139712347938e-06, "loss": 0.0114, "num_tokens": 4279016.0, "reward": 0.74224853515625, "reward_std": 0.006493415683507919, "rewards//mean": 0.74224853515625, "rewards//std": 0.02292017824947834, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0992, "grad_norm": 0.6724541187286377, "kl": 0.2627929784357548, "learning_rate": 4.900955425738262e-06, "loss": 0.0105, "num_tokens": 4287632.0, "reward": 0.73876953125, "reward_std": 0.008314857259392738, "rewards//mean": 0.73876953125, "rewards//std": 0.032192960381507874, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0994, "grad_norm": 0.6971147656440735, "kl": 0.2989068515598774, "learning_rate": 4.900512760892585e-06, "loss": 0.012, "num_tokens": 4296256.0, "reward": 0.73370361328125, "reward_std": 0.009461992420256138, "rewards//mean": 0.73370361328125, "rewards//std": 0.025719311088323593, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0996, "grad_norm": 0.7619755864143372, "kl": 0.28933385387063026, "learning_rate": 4.900069129120656e-06, "loss": 0.0116, "num_tokens": 4304944.0, "reward": 0.7659912109375, "reward_std": 0.009017490781843662, "rewards//mean": 0.7659912109375, "rewards//std": 0.03981613367795944, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0998, "grad_norm": 0.7067105174064636, "kl": 0.2626380883157253, "learning_rate": 4.899624530601168e-06, "loss": 0.0105, "num_tokens": 4313560.0, "reward": 0.72686767578125, "reward_std": 0.01192802656441927, "rewards//mean": 0.72686767578125, "rewards//std": 0.037255480885505676, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1, "grad_norm": 0.7758896946907043, "kl": 0.28313127160072327, "learning_rate": 4.899178965513206e-06, "loss": 0.0113, "num_tokens": 4322208.0, "reward": 0.76763916015625, "reward_std": 0.0051316977478563786, "rewards//mean": 0.76763916015625, "rewards//std": 0.022271685302257538, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1002, "grad_norm": 0.7543128132820129, "kl": 0.29646800085902214, "learning_rate": 4.8987324340362445e-06, "loss": 0.0119, "num_tokens": 4330888.0, "reward": 0.74884033203125, "reward_std": 0.007682453375309706, "rewards//mean": 0.74884033203125, "rewards//std": 0.027051476761698723, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1004, "grad_norm": 0.7896819114685059, "kl": 0.33157695457339287, "learning_rate": 4.898284936350144e-06, "loss": 0.0133, "num_tokens": 4339480.0, "reward": 0.732666015625, "reward_std": 0.007414411753416061, "rewards//mean": 0.732666015625, "rewards//std": 0.03544698655605316, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1006, "grad_norm": 0.6674100160598755, "kl": 0.26767886988818645, "learning_rate": 4.897836472635159e-06, "loss": 0.0107, "num_tokens": 4348208.0, "reward": 0.71868896484375, "reward_std": 0.0074168480932712555, "rewards//mean": 0.71868896484375, "rewards//std": 0.03925728052854538, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1008, "grad_norm": 0.7596335411071777, "kl": 0.2823500316590071, "learning_rate": 4.89738704307193e-06, "loss": 0.0113, "num_tokens": 4356904.0, "reward": 0.748046875, "reward_std": 0.005976288113743067, "rewards//mean": 0.748046875, "rewards//std": 0.02426011860370636, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.101, "grad_norm": 0.7461748123168945, "kl": 0.2936761640012264, "learning_rate": 4.896936647841485e-06, "loss": 0.0117, "num_tokens": 4365528.0, "reward": 0.75103759765625, "reward_std": 0.011169584468007088, "rewards//mean": 0.75103759765625, "rewards//std": 0.030783364549279213, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1012, "grad_norm": 0.7608941197395325, "kl": 0.28775020502507687, "learning_rate": 4.896485287125247e-06, "loss": 0.0115, "num_tokens": 4374120.0, "reward": 0.73614501953125, "reward_std": 0.008655678480863571, "rewards//mean": 0.73614501953125, "rewards//std": 0.02487071417272091, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1014, "grad_norm": 0.8823245167732239, "kl": 0.2823618911206722, "learning_rate": 4.896032961105021e-06, "loss": 0.0113, "num_tokens": 4382808.0, "reward": 0.75872802734375, "reward_std": 0.007658984046429396, "rewards//mean": 0.75872802734375, "rewards//std": 0.02521706186234951, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1016, "grad_norm": 0.6119314432144165, "kl": 0.2920355089008808, "learning_rate": 4.8955796699630045e-06, "loss": 0.0117, "num_tokens": 4391536.0, "reward": 0.767578125, "reward_std": 0.006091872230172157, "rewards//mean": 0.767578125, "rewards//std": 0.02380661852657795, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1018, "grad_norm": 0.6585849523544312, "kl": 0.2887584716081619, "learning_rate": 4.895125413881783e-06, "loss": 0.0116, "num_tokens": 4400152.0, "reward": 0.748291015625, "reward_std": 0.005973452236503363, "rewards//mean": 0.748291015625, "rewards//std": 0.030804071575403214, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.102, "grad_norm": 0.7349271178245544, "kl": 0.27199224196374416, "learning_rate": 4.894670193044332e-06, "loss": 0.0109, "num_tokens": 4408712.0, "reward": 0.72711181640625, "reward_std": 0.0059516276232898235, "rewards//mean": 0.72711181640625, "rewards//std": 0.03458928316831589, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1022, "grad_norm": 0.6272158026695251, "kl": 0.29213687032461166, "learning_rate": 4.894214007634014e-06, "loss": 0.0117, "num_tokens": 4417376.0, "reward": 0.763427734375, "reward_std": 0.00552313681691885, "rewards//mean": 0.763427734375, "rewards//std": 0.02124631404876709, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1024, "grad_norm": 0.7491189241409302, "kl": 0.2788054086267948, "learning_rate": 4.893756857834579e-06, "loss": 0.0112, "num_tokens": 4426096.0, "reward": 0.75390625, "reward_std": 0.015128916129469872, "rewards//mean": 0.75390625, "rewards//std": 0.04438012093305588, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1026, "grad_norm": 0.598261833190918, "kl": 0.2626344934105873, "learning_rate": 4.893298743830168e-06, "loss": 0.0105, "num_tokens": 4434696.0, "reward": 0.74615478515625, "reward_std": 0.012900955975055695, "rewards//mean": 0.74615478515625, "rewards//std": 0.03896622732281685, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1028, "grad_norm": 0.6159838438034058, "kl": 0.2789937425404787, "learning_rate": 4.89283966580531e-06, "loss": 0.0112, "num_tokens": 4443352.0, "reward": 0.7607421875, "reward_std": 0.0056966980919241905, "rewards//mean": 0.7607421875, "rewards//std": 0.02111050859093666, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.103, "grad_norm": 0.6811221241950989, "kl": 0.2887964155524969, "learning_rate": 4.8923796239449206e-06, "loss": 0.0116, "num_tokens": 4452008.0, "reward": 0.76751708984375, "reward_std": 0.006497236434370279, "rewards//mean": 0.76751708984375, "rewards//std": 0.029380397871136665, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1032, "grad_norm": 0.5849115252494812, "kl": 0.2598783317953348, "learning_rate": 4.891918618434305e-06, "loss": 0.0104, "num_tokens": 4460584.0, "reward": 0.7503662109375, "reward_std": 0.006470596417784691, "rewards//mean": 0.7503662109375, "rewards//std": 0.020818432793021202, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1034, "grad_norm": 0.774935245513916, "kl": 0.2854543272405863, "learning_rate": 4.891456649459156e-06, "loss": 0.0114, "num_tokens": 4469208.0, "reward": 0.76519775390625, "reward_std": 0.008341135457158089, "rewards//mean": 0.76519775390625, "rewards//std": 0.025450101122260094, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1036, "grad_norm": 0.7881346940994263, "kl": 0.3002116158604622, "learning_rate": 4.890993717205553e-06, "loss": 0.012, "num_tokens": 4477880.0, "reward": 0.7275390625, "reward_std": 0.007611682638525963, "rewards//mean": 0.7275390625, "rewards//std": 0.026013750582933426, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1038, "grad_norm": 0.7925445437431335, "kl": 0.2746951151639223, "learning_rate": 4.8905298218599685e-06, "loss": 0.011, "num_tokens": 4486520.0, "reward": 0.753173828125, "reward_std": 0.00768206175416708, "rewards//mean": 0.753173828125, "rewards//std": 0.03539228066802025, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.104, "grad_norm": 0.649895191192627, "kl": 0.3011128604412079, "learning_rate": 4.8900649636092565e-06, "loss": 0.012, "num_tokens": 4495064.0, "reward": 0.76141357421875, "reward_std": 0.008288520388305187, "rewards//mean": 0.76141357421875, "rewards//std": 0.029034705832600594, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1042, "grad_norm": 0.6826982498168945, "kl": 0.27436352148652077, "learning_rate": 4.889599142640663e-06, "loss": 0.011, "num_tokens": 4503728.0, "reward": 0.73187255859375, "reward_std": 0.007811560295522213, "rewards//mean": 0.73187255859375, "rewards//std": 0.031737327575683594, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1044, "grad_norm": 0.7717922329902649, "kl": 0.2864828519523144, "learning_rate": 4.889132359141822e-06, "loss": 0.0115, "num_tokens": 4512344.0, "reward": 0.76220703125, "reward_std": 0.005440828390419483, "rewards//mean": 0.76220703125, "rewards//std": 0.02468077465891838, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1046, "grad_norm": 0.6896048784255981, "kl": 0.27172659896314144, "learning_rate": 4.888664613300751e-06, "loss": 0.0109, "num_tokens": 4520896.0, "reward": 0.74566650390625, "reward_std": 0.008871195837855339, "rewards//mean": 0.74566650390625, "rewards//std": 0.02351093478500843, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1048, "grad_norm": 0.7394423484802246, "kl": 0.2659613937139511, "learning_rate": 4.888195905305859e-06, "loss": 0.0106, "num_tokens": 4529480.0, "reward": 0.73236083984375, "reward_std": 0.012172890827059746, "rewards//mean": 0.73236083984375, "rewards//std": 0.03177260234951973, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.105, "grad_norm": 0.6426916122436523, "kl": 0.27845061011612415, "learning_rate": 4.887726235345943e-06, "loss": 0.0111, "num_tokens": 4538064.0, "reward": 0.76239013671875, "reward_std": 0.008901185356080532, "rewards//mean": 0.76239013671875, "rewards//std": 0.023638714104890823, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1052, "grad_norm": 0.6519190669059753, "kl": 0.25384664349257946, "learning_rate": 4.8872556036101845e-06, "loss": 0.0102, "num_tokens": 4546688.0, "reward": 0.77862548828125, "reward_std": 0.009897831827402115, "rewards//mean": 0.77862548828125, "rewards//std": 0.029281822964549065, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1054, "grad_norm": 0.7181262969970703, "kl": 0.26998190581798553, "learning_rate": 4.886784010288155e-06, "loss": 0.0108, "num_tokens": 4555432.0, "reward": 0.78851318359375, "reward_std": 0.011629641987383366, "rewards//mean": 0.78851318359375, "rewards//std": 0.025476258248090744, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1056, "grad_norm": 0.6040878295898438, "kl": 0.28468160331249237, "learning_rate": 4.886311455569811e-06, "loss": 0.0114, "num_tokens": 4564064.0, "reward": 0.76861572265625, "reward_std": 0.0060232048854231834, "rewards//mean": 0.76861572265625, "rewards//std": 0.024147462099790573, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1058, "grad_norm": 0.6055912375450134, "kl": 0.2735178656876087, "learning_rate": 4.885837939645499e-06, "loss": 0.0109, "num_tokens": 4572752.0, "reward": 0.75860595703125, "reward_std": 0.005819528363645077, "rewards//mean": 0.75860595703125, "rewards//std": 0.025687508285045624, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.106, "grad_norm": 0.7213678956031799, "kl": 0.2826583944261074, "learning_rate": 4.885363462705949e-06, "loss": 0.0113, "num_tokens": 4581432.0, "reward": 0.7806396484375, "reward_std": 0.005271364934742451, "rewards//mean": 0.7806396484375, "rewards//std": 0.024071883410215378, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1062, "grad_norm": 0.6324445605278015, "kl": 0.30165957659482956, "learning_rate": 4.884888024942282e-06, "loss": 0.0121, "num_tokens": 4590056.0, "reward": 0.7506103515625, "reward_std": 0.0039962828159332275, "rewards//mean": 0.7506103515625, "rewards//std": 0.029759034514427185, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1064, "grad_norm": 0.6357660293579102, "kl": 0.2692461237311363, "learning_rate": 4.884411626546004e-06, "loss": 0.0108, "num_tokens": 4598776.0, "reward": 0.73028564453125, "reward_std": 0.006139049306511879, "rewards//mean": 0.73028564453125, "rewards//std": 0.020942674949765205, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1066, "grad_norm": 0.9847143292427063, "kl": 0.30338218063116074, "learning_rate": 4.883934267709007e-06, "loss": 0.0121, "num_tokens": 4607464.0, "reward": 0.77239990234375, "reward_std": 0.005770155228674412, "rewards//mean": 0.77239990234375, "rewards//std": 0.030767623335123062, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1068, "grad_norm": 0.8446683883666992, "kl": 0.3181992806494236, "learning_rate": 4.883455948623574e-06, "loss": 0.0127, "num_tokens": 4616104.0, "reward": 0.7333984375, "reward_std": 0.009834567084908485, "rewards//mean": 0.7333984375, "rewards//std": 0.032324355095624924, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.107, "grad_norm": 0.7224143743515015, "kl": 0.2985565960407257, "learning_rate": 4.882976669482368e-06, "loss": 0.0119, "num_tokens": 4624760.0, "reward": 0.7509765625, "reward_std": 0.006324178539216518, "rewards//mean": 0.7509765625, "rewards//std": 0.030273688957095146, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1072, "grad_norm": 0.6489189863204956, "kl": 0.2836180441081524, "learning_rate": 4.882496430478445e-06, "loss": 0.0113, "num_tokens": 4633392.0, "reward": 0.7471923828125, "reward_std": 0.00982951931655407, "rewards//mean": 0.7471923828125, "rewards//std": 0.027405409142374992, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1074, "grad_norm": 0.711584210395813, "kl": 0.2708862889558077, "learning_rate": 4.882015231805245e-06, "loss": 0.0108, "num_tokens": 4642072.0, "reward": 0.7578125, "reward_std": 0.008214281871914864, "rewards//mean": 0.7578125, "rewards//std": 0.027891865000128746, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1076, "grad_norm": 0.6661033034324646, "kl": 0.26852802373468876, "learning_rate": 4.881533073656594e-06, "loss": 0.0107, "num_tokens": 4650720.0, "reward": 0.75445556640625, "reward_std": 0.007886864244937897, "rewards//mean": 0.75445556640625, "rewards//std": 0.028012022376060486, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1078, "grad_norm": 0.7064325213432312, "kl": 0.2860189266502857, "learning_rate": 4.8810499562267066e-06, "loss": 0.0114, "num_tokens": 4659312.0, "reward": 0.769287109375, "reward_std": 0.006793664768338203, "rewards//mean": 0.769287109375, "rewards//std": 0.02851766347885132, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.108, "grad_norm": 0.6437175869941711, "kl": 0.25416005309671164, "learning_rate": 4.88056587971018e-06, "loss": 0.0102, "num_tokens": 4667896.0, "reward": 0.7423095703125, "reward_std": 0.007785624824464321, "rewards//mean": 0.7423095703125, "rewards//std": 0.028713131323456764, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1082, "grad_norm": 0.5998038053512573, "kl": 0.2681788485497236, "learning_rate": 4.880080844302004e-06, "loss": 0.0107, "num_tokens": 4676488.0, "reward": 0.75537109375, "reward_std": 0.006692672614008188, "rewards//mean": 0.75537109375, "rewards//std": 0.020033232867717743, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1084, "grad_norm": 0.6596677899360657, "kl": 0.24427864141762257, "learning_rate": 4.879594850197548e-06, "loss": 0.0098, "num_tokens": 4685128.0, "reward": 0.7476806640625, "reward_std": 0.010514110326766968, "rewards//mean": 0.7476806640625, "rewards//std": 0.03853638097643852, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1086, "grad_norm": 0.6608676910400391, "kl": 0.259847916662693, "learning_rate": 4.87910789759257e-06, "loss": 0.0104, "num_tokens": 4693824.0, "reward": 0.75299072265625, "reward_std": 0.005296648014336824, "rewards//mean": 0.75299072265625, "rewards//std": 0.048891909420490265, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1088, "grad_norm": 0.6944065093994141, "kl": 0.2824724651873112, "learning_rate": 4.878619986683215e-06, "loss": 0.0113, "num_tokens": 4702472.0, "reward": 0.750244140625, "reward_std": 0.007567228749394417, "rewards//mean": 0.750244140625, "rewards//std": 0.03861897811293602, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.109, "grad_norm": 0.6676400303840637, "kl": 0.26609333604574203, "learning_rate": 4.8781311176660144e-06, "loss": 0.0106, "num_tokens": 4711104.0, "reward": 0.779052734375, "reward_std": 0.010382162407040596, "rewards//mean": 0.779052734375, "rewards//std": 0.02472366951406002, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1092, "grad_norm": 0.6957513689994812, "kl": 0.2683636359870434, "learning_rate": 4.8776412907378845e-06, "loss": 0.0107, "num_tokens": 4719688.0, "reward": 0.71490478515625, "reward_std": 0.008024358190596104, "rewards//mean": 0.71490478515625, "rewards//std": 0.037217672914266586, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1094, "grad_norm": 0.6461659073829651, "kl": 0.2612381912767887, "learning_rate": 4.877150506096127e-06, "loss": 0.0104, "num_tokens": 4728272.0, "reward": 0.7481689453125, "reward_std": 0.006405083928257227, "rewards//mean": 0.7481689453125, "rewards//std": 0.02294633351266384, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1096, "grad_norm": 0.6994146704673767, "kl": 0.2628567796200514, "learning_rate": 4.8766587639384285e-06, "loss": 0.0105, "num_tokens": 4736888.0, "reward": 0.74835205078125, "reward_std": 0.006025420036166906, "rewards//mean": 0.74835205078125, "rewards//std": 0.028795981779694557, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1098, "grad_norm": 0.5853991508483887, "kl": 0.25532799772918224, "learning_rate": 4.876166064462866e-06, "loss": 0.0102, "num_tokens": 4745456.0, "reward": 0.7703857421875, "reward_std": 0.009814901277422905, "rewards//mean": 0.7703857421875, "rewards//std": 0.02585000917315483, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.11, "grad_norm": 0.6331080198287964, "kl": 0.2674791272729635, "learning_rate": 4.8756724078678955e-06, "loss": 0.0107, "num_tokens": 4754000.0, "reward": 0.7891845703125, "reward_std": 0.01081712357699871, "rewards//mean": 0.7891845703125, "rewards//std": 0.023632710799574852, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1102, "grad_norm": 0.789883553981781, "kl": 0.259792210534215, "learning_rate": 4.875177794352364e-06, "loss": 0.0104, "num_tokens": 4762808.0, "reward": 0.7608642578125, "reward_std": 0.006845149677246809, "rewards//mean": 0.7608642578125, "rewards//std": 0.0270137470215559, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1104, "grad_norm": 0.6203144788742065, "kl": 0.2684528976678848, "learning_rate": 4.8746822241155006e-06, "loss": 0.0107, "num_tokens": 4771488.0, "reward": 0.7344970703125, "reward_std": 0.007759134750813246, "rewards//mean": 0.7344970703125, "rewards//std": 0.02165386639535427, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1106, "grad_norm": 0.8759577870368958, "kl": 0.2989007495343685, "learning_rate": 4.874185697356921e-06, "loss": 0.012, "num_tokens": 4780144.0, "reward": 0.7337646484375, "reward_std": 0.009506863541901112, "rewards//mean": 0.7337646484375, "rewards//std": 0.02997797727584839, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1108, "grad_norm": 0.5836766958236694, "kl": 0.25084237568080425, "learning_rate": 4.873688214276628e-06, "loss": 0.01, "num_tokens": 4788744.0, "reward": 0.71270751953125, "reward_std": 0.008568651042878628, "rewards//mean": 0.71270751953125, "rewards//std": 0.03878828510642052, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.111, "grad_norm": 0.6924554109573364, "kl": 0.2806801497936249, "learning_rate": 4.873189775075005e-06, "loss": 0.0112, "num_tokens": 4797400.0, "reward": 0.734375, "reward_std": 0.0058593968860805035, "rewards//mean": 0.734375, "rewards//std": 0.03078145906329155, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1112, "grad_norm": 0.7575480341911316, "kl": 0.2851990181952715, "learning_rate": 4.872690379952824e-06, "loss": 0.0114, "num_tokens": 4806144.0, "reward": 0.79217529296875, "reward_std": 0.008231771178543568, "rewards//mean": 0.79217529296875, "rewards//std": 0.02772795408964157, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1114, "grad_norm": 0.625200092792511, "kl": 0.26935046166181564, "learning_rate": 4.8721900291112415e-06, "loss": 0.0108, "num_tokens": 4814792.0, "reward": 0.762939453125, "reward_std": 0.00905733834952116, "rewards//mean": 0.762939453125, "rewards//std": 0.03055935725569725, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1116, "grad_norm": 1.0318313837051392, "kl": 0.31965751200914383, "learning_rate": 4.871688722751799e-06, "loss": 0.0128, "num_tokens": 4823488.0, "reward": 0.77117919921875, "reward_std": 0.008040939457714558, "rewards//mean": 0.77117919921875, "rewards//std": 0.032330382615327835, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1118, "grad_norm": 0.6006855964660645, "kl": 0.24019759241491556, "learning_rate": 4.8711864610764235e-06, "loss": 0.0096, "num_tokens": 4832160.0, "reward": 0.76800537109375, "reward_std": 0.00946258008480072, "rewards//mean": 0.76800537109375, "rewards//std": 0.03378697484731674, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.112, "grad_norm": 0.7878683805465698, "kl": 0.2661712933331728, "learning_rate": 4.870683244287425e-06, "loss": 0.0106, "num_tokens": 4840776.0, "reward": 0.720458984375, "reward_std": 0.008840283378958702, "rewards//mean": 0.720458984375, "rewards//std": 0.04082028940320015, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1122, "grad_norm": 0.83661288022995, "kl": 0.2538266107439995, "learning_rate": 4.870179072587499e-06, "loss": 0.0102, "num_tokens": 4849480.0, "reward": 0.7720947265625, "reward_std": 0.009472770616412163, "rewards//mean": 0.7720947265625, "rewards//std": 0.025621790438890457, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1124, "grad_norm": 0.7142016887664795, "kl": 0.28109513595700264, "learning_rate": 4.869673946179726e-06, "loss": 0.0112, "num_tokens": 4858184.0, "reward": 0.764404296875, "reward_std": 0.00784839503467083, "rewards//mean": 0.764404296875, "rewards//std": 0.02929687686264515, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1126, "grad_norm": 0.7756282091140747, "kl": 0.27597307227551937, "learning_rate": 4.8691678652675715e-06, "loss": 0.011, "num_tokens": 4866856.0, "reward": 0.76470947265625, "reward_std": 0.008997954428195953, "rewards//mean": 0.76470947265625, "rewards//std": 0.02482014335691929, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1128, "grad_norm": 0.6697260141372681, "kl": 0.25615566223859787, "learning_rate": 4.8686608300548836e-06, "loss": 0.0102, "num_tokens": 4875376.0, "reward": 0.732421875, "reward_std": 0.007549830712378025, "rewards//mean": 0.732421875, "rewards//std": 0.030193578451871872, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.113, "grad_norm": 0.7746402621269226, "kl": 0.2652531825006008, "learning_rate": 4.868152840745896e-06, "loss": 0.0106, "num_tokens": 4884008.0, "reward": 0.7220458984375, "reward_std": 0.00881132297217846, "rewards//mean": 0.7220458984375, "rewards//std": 0.036915309727191925, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1132, "grad_norm": 0.7145909667015076, "kl": 0.2629079818725586, "learning_rate": 4.8676438975452276e-06, "loss": 0.0105, "num_tokens": 4892672.0, "reward": 0.7686767578125, "reward_std": 0.008167712949216366, "rewards//mean": 0.7686767578125, "rewards//std": 0.02817239984869957, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1134, "grad_norm": 0.7171170711517334, "kl": 0.22697240114212036, "learning_rate": 4.86713400065788e-06, "loss": 0.0091, "num_tokens": 4901272.0, "reward": 0.7269287109375, "reward_std": 0.009677172638475895, "rewards//mean": 0.7269287109375, "rewards//std": 0.038479775190353394, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1136, "grad_norm": 0.6745283603668213, "kl": 0.24715841561555862, "learning_rate": 4.866623150289241e-06, "loss": 0.0099, "num_tokens": 4909840.0, "reward": 0.75732421875, "reward_std": 0.008682534098625183, "rewards//mean": 0.75732421875, "rewards//std": 0.024094825610518456, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1138, "grad_norm": 0.6382898688316345, "kl": 0.25850758142769337, "learning_rate": 4.86611134664508e-06, "loss": 0.0103, "num_tokens": 4918384.0, "reward": 0.7513427734375, "reward_std": 0.0103992260992527, "rewards//mean": 0.7513427734375, "rewards//std": 0.020102253183722496, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.114, "grad_norm": 0.9113550186157227, "kl": 0.25279413163661957, "learning_rate": 4.865598589931552e-06, "loss": 0.0101, "num_tokens": 4927096.0, "reward": 0.7747802734375, "reward_std": 0.007576893083751202, "rewards//mean": 0.7747802734375, "rewards//std": 0.033690787851810455, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1142, "grad_norm": 0.6302274465560913, "kl": 0.25698636658489704, "learning_rate": 4.865084880355193e-06, "loss": 0.0103, "num_tokens": 4935776.0, "reward": 0.73907470703125, "reward_std": 0.007312727626413107, "rewards//mean": 0.73907470703125, "rewards//std": 0.026749854907393456, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1144, "grad_norm": 0.5283216834068298, "kl": 0.22627251036465168, "learning_rate": 4.864570218122928e-06, "loss": 0.0091, "num_tokens": 4944400.0, "reward": 0.73974609375, "reward_std": 0.01004891935735941, "rewards//mean": 0.73974609375, "rewards//std": 0.02998024970293045, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1146, "grad_norm": 0.6809362769126892, "kl": 0.24691984988749027, "learning_rate": 4.864054603442063e-06, "loss": 0.0099, "num_tokens": 4953000.0, "reward": 0.75897216796875, "reward_std": 0.009808078408241272, "rewards//mean": 0.75897216796875, "rewards//std": 0.025493483990430832, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1148, "grad_norm": 0.6428877711296082, "kl": 0.23521161265671253, "learning_rate": 4.863538036520285e-06, "loss": 0.0094, "num_tokens": 4961696.0, "reward": 0.7613525390625, "reward_std": 0.010448819026350975, "rewards//mean": 0.7613525390625, "rewards//std": 0.03130976855754852, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.115, "grad_norm": 0.6935397982597351, "kl": 0.23570332117378712, "learning_rate": 4.863020517565669e-06, "loss": 0.0094, "num_tokens": 4970296.0, "reward": 0.743896484375, "reward_std": 0.007338499650359154, "rewards//mean": 0.743896484375, "rewards//std": 0.024899380281567574, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1152, "grad_norm": 0.6086986660957336, "kl": 0.2525392398238182, "learning_rate": 4.862502046786671e-06, "loss": 0.0101, "num_tokens": 4979040.0, "reward": 0.748779296875, "reward_std": 0.007959538139402866, "rewards//mean": 0.748779296875, "rewards//std": 0.04012007638812065, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1154, "grad_norm": 0.6362279653549194, "kl": 0.21946440637111664, "learning_rate": 4.861982624392132e-06, "loss": 0.0088, "num_tokens": 4987720.0, "reward": 0.78765869140625, "reward_std": 0.009038891643285751, "rewards//mean": 0.78765869140625, "rewards//std": 0.020556749776005745, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1156, "grad_norm": 0.6334317326545715, "kl": 0.2258777841925621, "learning_rate": 4.861462250591273e-06, "loss": 0.009, "num_tokens": 4996352.0, "reward": 0.719482421875, "reward_std": 0.009759314358234406, "rewards//mean": 0.719482421875, "rewards//std": 0.03258461877703667, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1158, "grad_norm": 0.585574746131897, "kl": 0.2085122261196375, "learning_rate": 4.860940925593703e-06, "loss": 0.0083, "num_tokens": 5005168.0, "reward": 0.7581787109375, "reward_std": 0.010835596360266209, "rewards//mean": 0.7581787109375, "rewards//std": 0.03846088796854019, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.116, "grad_norm": 0.6512745022773743, "kl": 0.2308096420019865, "learning_rate": 4.86041864960941e-06, "loss": 0.0092, "num_tokens": 5013744.0, "reward": 0.75238037109375, "reward_std": 0.008994483388960361, "rewards//mean": 0.75238037109375, "rewards//std": 0.026278842240571976, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1162, "grad_norm": 0.5345540642738342, "kl": 0.2112738210707903, "learning_rate": 4.859895422848767e-06, "loss": 0.0085, "num_tokens": 5022304.0, "reward": 0.73822021484375, "reward_std": 0.008295232430100441, "rewards//mean": 0.73822021484375, "rewards//std": 0.04076458141207695, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1164, "grad_norm": 0.602761447429657, "kl": 0.21912161633372307, "learning_rate": 4.859371245522531e-06, "loss": 0.0088, "num_tokens": 5030944.0, "reward": 0.703369140625, "reward_std": 0.00949187483638525, "rewards//mean": 0.703369140625, "rewards//std": 0.028560098260641098, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1166, "grad_norm": 0.6504238247871399, "kl": 0.23803219199180603, "learning_rate": 4.8588461178418375e-06, "loss": 0.0095, "num_tokens": 5039632.0, "reward": 0.7572021484375, "reward_std": 0.00934354867786169, "rewards//mean": 0.7572021484375, "rewards//std": 0.02799776755273342, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1168, "grad_norm": 0.5838614702224731, "kl": 0.21050356701016426, "learning_rate": 4.858320040018212e-06, "loss": 0.0084, "num_tokens": 5048256.0, "reward": 0.75494384765625, "reward_std": 0.009613383561372757, "rewards//mean": 0.75494384765625, "rewards//std": 0.021341485902667046, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.117, "grad_norm": 0.5332762002944946, "kl": 0.2162165530025959, "learning_rate": 4.857793012263555e-06, "loss": 0.0086, "num_tokens": 5056808.0, "reward": 0.75299072265625, "reward_std": 0.00866914913058281, "rewards//mean": 0.75299072265625, "rewards//std": 0.030653268098831177, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1172, "grad_norm": 0.5869634747505188, "kl": 0.22080680169165134, "learning_rate": 4.857265034790155e-06, "loss": 0.0088, "num_tokens": 5065400.0, "reward": 0.7698974609375, "reward_std": 0.009216565638780594, "rewards//mean": 0.7698974609375, "rewards//std": 0.03380203619599342, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1174, "grad_norm": 0.6322459578514099, "kl": 0.20717273838818073, "learning_rate": 4.85673610781068e-06, "loss": 0.0083, "num_tokens": 5074064.0, "reward": 0.78057861328125, "reward_std": 0.010938020423054695, "rewards//mean": 0.78057861328125, "rewards//std": 0.028258977457880974, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1176, "grad_norm": 0.8341087102890015, "kl": 0.2445808406919241, "learning_rate": 4.856206231538184e-06, "loss": 0.0098, "num_tokens": 5082720.0, "reward": 0.74163818359375, "reward_std": 0.009593227878212929, "rewards//mean": 0.74163818359375, "rewards//std": 0.02997513860464096, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1178, "grad_norm": 0.5464284420013428, "kl": 0.20626290701329708, "learning_rate": 4.855675406186099e-06, "loss": 0.0083, "num_tokens": 5091344.0, "reward": 0.73687744140625, "reward_std": 0.013462206348776817, "rewards//mean": 0.73687744140625, "rewards//std": 0.03030860796570778, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.118, "grad_norm": 0.5974122285842896, "kl": 0.2238724958151579, "learning_rate": 4.855143631968242e-06, "loss": 0.009, "num_tokens": 5100024.0, "reward": 0.76165771484375, "reward_std": 0.010216079652309418, "rewards//mean": 0.76165771484375, "rewards//std": 0.022633064538240433, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1182, "grad_norm": 0.5498731732368469, "kl": 0.20982307940721512, "learning_rate": 4.854610909098813e-06, "loss": 0.0084, "num_tokens": 5108640.0, "reward": 0.740234375, "reward_std": 0.011146800592541695, "rewards//mean": 0.740234375, "rewards//std": 0.029065513983368874, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1184, "grad_norm": 0.5892578363418579, "kl": 0.22463841922581196, "learning_rate": 4.854077237792389e-06, "loss": 0.009, "num_tokens": 5117224.0, "reward": 0.74560546875, "reward_std": 0.010708848014473915, "rewards//mean": 0.74560546875, "rewards//std": 0.03500435873866081, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1186, "grad_norm": 0.5446705222129822, "kl": 0.19729456305503845, "learning_rate": 4.853542618263937e-06, "loss": 0.0079, "num_tokens": 5125776.0, "reward": 0.7720947265625, "reward_std": 0.0137183777987957, "rewards//mean": 0.7720947265625, "rewards//std": 0.02645888738334179, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1188, "grad_norm": 0.5078734159469604, "kl": 0.20582310110330582, "learning_rate": 4.8530070507288e-06, "loss": 0.0082, "num_tokens": 5134408.0, "reward": 0.7685546875, "reward_std": 0.008237513713538647, "rewards//mean": 0.7685546875, "rewards//std": 0.02214088849723339, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.119, "grad_norm": 0.6351189017295837, "kl": 0.2431877087801695, "learning_rate": 4.852470535402703e-06, "loss": 0.0097, "num_tokens": 5143040.0, "reward": 0.75970458984375, "reward_std": 0.009093962609767914, "rewards//mean": 0.75970458984375, "rewards//std": 0.021931972354650497, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1192, "grad_norm": 0.8271527886390686, "kl": 0.23255185037851334, "learning_rate": 4.851933072501756e-06, "loss": 0.0093, "num_tokens": 5151704.0, "reward": 0.7572021484375, "reward_std": 0.011184893548488617, "rewards//mean": 0.7572021484375, "rewards//std": 0.02380884252488613, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1194, "grad_norm": 0.7389485239982605, "kl": 0.21711469255387783, "learning_rate": 4.851394662242449e-06, "loss": 0.0087, "num_tokens": 5160400.0, "reward": 0.7572021484375, "reward_std": 0.012733031064271927, "rewards//mean": 0.7572021484375, "rewards//std": 0.027341259643435478, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1196, "grad_norm": 0.7461937665939331, "kl": 0.2250336967408657, "learning_rate": 4.850855304841653e-06, "loss": 0.009, "num_tokens": 5169024.0, "reward": 0.75531005859375, "reward_std": 0.006518941838294268, "rewards//mean": 0.75531005859375, "rewards//std": 0.027677135542035103, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1198, "grad_norm": 0.737876832485199, "kl": 0.21269006468355656, "learning_rate": 4.8503150005166225e-06, "loss": 0.0085, "num_tokens": 5177672.0, "reward": 0.76031494140625, "reward_std": 0.007318540476262569, "rewards//mean": 0.76031494140625, "rewards//std": 0.02775578200817108, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.12, "grad_norm": 0.6356746554374695, "kl": 0.22319811396300793, "learning_rate": 4.849773749484989e-06, "loss": 0.0089, "num_tokens": 5186400.0, "reward": 0.72882080078125, "reward_std": 0.008081294596195221, "rewards//mean": 0.72882080078125, "rewards//std": 0.03312958776950836, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1202, "grad_norm": 0.5195296406745911, "kl": 0.20459580793976784, "learning_rate": 4.849231551964771e-06, "loss": 0.0082, "num_tokens": 5195048.0, "reward": 0.73486328125, "reward_std": 0.009343027137219906, "rewards//mean": 0.73486328125, "rewards//std": 0.02382187359035015, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1204, "grad_norm": 0.6819605827331543, "kl": 0.22957494854927063, "learning_rate": 4.848688408174366e-06, "loss": 0.0092, "num_tokens": 5203712.0, "reward": 0.7725830078125, "reward_std": 0.008309951052069664, "rewards//mean": 0.7725830078125, "rewards//std": 0.02304638922214508, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1206, "grad_norm": 0.6028555631637573, "kl": 0.2299717701971531, "learning_rate": 4.84814431833255e-06, "loss": 0.0092, "num_tokens": 5212352.0, "reward": 0.76513671875, "reward_std": 0.011560057289898396, "rewards//mean": 0.76513671875, "rewards//std": 0.026951193809509277, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1208, "grad_norm": 0.7233465313911438, "kl": 0.21515335515141487, "learning_rate": 4.847599282658483e-06, "loss": 0.0086, "num_tokens": 5220912.0, "reward": 0.7650146484375, "reward_std": 0.009766367264091969, "rewards//mean": 0.7650146484375, "rewards//std": 0.03968513756990433, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.121, "grad_norm": 0.6206122636795044, "kl": 0.2212205920368433, "learning_rate": 4.847053301371706e-06, "loss": 0.0088, "num_tokens": 5229616.0, "reward": 0.75567626953125, "reward_std": 0.010858502238988876, "rewards//mean": 0.75567626953125, "rewards//std": 0.02769080549478531, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1212, "grad_norm": 0.6429035663604736, "kl": 0.22523421607911587, "learning_rate": 4.84650637469214e-06, "loss": 0.009, "num_tokens": 5238192.0, "reward": 0.73785400390625, "reward_std": 0.010889837518334389, "rewards//mean": 0.73785400390625, "rewards//std": 0.030354522168636322, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1214, "grad_norm": 0.7371296286582947, "kl": 0.22557489946484566, "learning_rate": 4.845958502840087e-06, "loss": 0.009, "num_tokens": 5246720.0, "reward": 0.7720947265625, "reward_std": 0.008845320902764797, "rewards//mean": 0.7720947265625, "rewards//std": 0.03540147468447685, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1216, "grad_norm": 0.8345296382904053, "kl": 0.2478946428745985, "learning_rate": 4.8454096860362284e-06, "loss": 0.0099, "num_tokens": 5255328.0, "reward": 0.74542236328125, "reward_std": 0.008497816510498524, "rewards//mean": 0.74542236328125, "rewards//std": 0.02872650697827339, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1218, "grad_norm": 0.9587209224700928, "kl": 0.22490216977894306, "learning_rate": 4.8448599245016306e-06, "loss": 0.009, "num_tokens": 5263992.0, "reward": 0.7431640625, "reward_std": 0.008303426206111908, "rewards//mean": 0.7431640625, "rewards//std": 0.033689215779304504, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.122, "grad_norm": 0.6855178475379944, "kl": 0.24397451616823673, "learning_rate": 4.844309218457735e-06, "loss": 0.0098, "num_tokens": 5272632.0, "reward": 0.74456787109375, "reward_std": 0.007743033580482006, "rewards//mean": 0.74456787109375, "rewards//std": 0.02703300304710865, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1222, "grad_norm": 0.8738477230072021, "kl": 0.24380822479724884, "learning_rate": 4.843757568126366e-06, "loss": 0.0098, "num_tokens": 5281264.0, "reward": 0.76068115234375, "reward_std": 0.008802594617009163, "rewards//mean": 0.76068115234375, "rewards//std": 0.02518342435359955, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1224, "grad_norm": 1.1533160209655762, "kl": 0.2847021333873272, "learning_rate": 4.84320497372973e-06, "loss": 0.0114, "num_tokens": 5290096.0, "reward": 0.7354736328125, "reward_std": 0.009391937404870987, "rewards//mean": 0.7354736328125, "rewards//std": 0.03238774463534355, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1226, "grad_norm": 0.9461967945098877, "kl": 0.2901354692876339, "learning_rate": 4.8426514354904096e-06, "loss": 0.0116, "num_tokens": 5298720.0, "reward": 0.75830078125, "reward_std": 0.01141531765460968, "rewards//mean": 0.75830078125, "rewards//std": 0.02832699380815029, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1228, "grad_norm": 0.8776152729988098, "kl": 0.2779980804771185, "learning_rate": 4.842096953631371e-06, "loss": 0.0111, "num_tokens": 5307344.0, "reward": 0.74365234375, "reward_std": 0.007914327085018158, "rewards//mean": 0.74365234375, "rewards//std": 0.03176122531294823, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.123, "grad_norm": 0.7875434160232544, "kl": 0.27193955332040787, "learning_rate": 4.841541528375961e-06, "loss": 0.0109, "num_tokens": 5315880.0, "reward": 0.7650146484375, "reward_std": 0.011012593284249306, "rewards//mean": 0.7650146484375, "rewards//std": 0.03217579424381256, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1232, "grad_norm": 0.9433616995811462, "kl": 0.331559956073761, "learning_rate": 4.840985159947902e-06, "loss": 0.0133, "num_tokens": 5324592.0, "reward": 0.759521484375, "reward_std": 0.008395890705287457, "rewards//mean": 0.759521484375, "rewards//std": 0.025189509615302086, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1234, "grad_norm": 1.2481539249420166, "kl": 0.3108705338090658, "learning_rate": 4.8404278485713005e-06, "loss": 0.0124, "num_tokens": 5333288.0, "reward": 0.75555419921875, "reward_std": 0.006895901635289192, "rewards//mean": 0.75555419921875, "rewards//std": 0.020772846415638924, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1236, "grad_norm": 1.051727056503296, "kl": 0.3007947914302349, "learning_rate": 4.839869594470642e-06, "loss": 0.012, "num_tokens": 5341944.0, "reward": 0.74566650390625, "reward_std": 0.009581143036484718, "rewards//mean": 0.74566650390625, "rewards//std": 0.02763991802930832, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1238, "grad_norm": 1.1710658073425293, "kl": 0.3423069082200527, "learning_rate": 4.839310397870791e-06, "loss": 0.0137, "num_tokens": 5350712.0, "reward": 0.80609130859375, "reward_std": 0.0071121640503406525, "rewards//mean": 0.80609130859375, "rewards//std": 0.022340906783938408, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.124, "grad_norm": 0.9958809018135071, "kl": 0.30707372911274433, "learning_rate": 4.838750258996992e-06, "loss": 0.0123, "num_tokens": 5359400.0, "reward": 0.7572021484375, "reward_std": 0.008860534057021141, "rewards//mean": 0.7572021484375, "rewards//std": 0.029439911246299744, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1242, "grad_norm": 1.331772804260254, "kl": 0.3642958104610443, "learning_rate": 4.838189178074867e-06, "loss": 0.0146, "num_tokens": 5368040.0, "reward": 0.7486572265625, "reward_std": 0.005241964012384415, "rewards//mean": 0.7486572265625, "rewards//std": 0.03716379404067993, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1244, "grad_norm": 1.1566888093948364, "kl": 0.32168351113796234, "learning_rate": 4.837627155330421e-06, "loss": 0.0129, "num_tokens": 5376656.0, "reward": 0.74420166015625, "reward_std": 0.010309340432286263, "rewards//mean": 0.74420166015625, "rewards//std": 0.035138972103595734, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1246, "grad_norm": 1.2128782272338867, "kl": 0.3658299557864666, "learning_rate": 4.837064190990036e-06, "loss": 0.0146, "num_tokens": 5385312.0, "reward": 0.74615478515625, "reward_std": 0.005186946596950293, "rewards//mean": 0.74615478515625, "rewards//std": 0.023983918130397797, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1248, "grad_norm": 1.2308634519577026, "kl": 0.35844672471284866, "learning_rate": 4.836500285280476e-06, "loss": 0.0143, "num_tokens": 5393888.0, "reward": 0.75299072265625, "reward_std": 0.0063725742511451244, "rewards//mean": 0.75299072265625, "rewards//std": 0.028277721256017685, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.125, "grad_norm": 1.1482653617858887, "kl": 0.345749881118536, "learning_rate": 4.83593543842888e-06, "loss": 0.0138, "num_tokens": 5402496.0, "reward": 0.7623291015625, "reward_std": 0.00810457207262516, "rewards//mean": 0.7623291015625, "rewards//std": 0.03183525428175926, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1252, "grad_norm": 1.252425193786621, "kl": 0.4149610288441181, "learning_rate": 4.835369650662767e-06, "loss": 0.0166, "num_tokens": 5411160.0, "reward": 0.7589111328125, "reward_std": 0.009770114906132221, "rewards//mean": 0.7589111328125, "rewards//std": 0.043982502073049545, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1254, "grad_norm": 1.0089774131774902, "kl": 0.3325713388621807, "learning_rate": 4.83480292221004e-06, "loss": 0.0133, "num_tokens": 5419800.0, "reward": 0.78118896484375, "reward_std": 0.007964782416820526, "rewards//mean": 0.78118896484375, "rewards//std": 0.027581803500652313, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1256, "grad_norm": 1.1872444152832031, "kl": 0.3795311227440834, "learning_rate": 4.834235253298973e-06, "loss": 0.0152, "num_tokens": 5428536.0, "reward": 0.74444580078125, "reward_std": 0.007206289563328028, "rewards//mean": 0.74444580078125, "rewards//std": 0.021763604134321213, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1258, "grad_norm": 1.326137661933899, "kl": 0.37480510398745537, "learning_rate": 4.833666644158227e-06, "loss": 0.015, "num_tokens": 5437144.0, "reward": 0.7896728515625, "reward_std": 0.00821404904127121, "rewards//mean": 0.7896728515625, "rewards//std": 0.026238271966576576, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.126, "grad_norm": 1.089011788368225, "kl": 0.30840661004185677, "learning_rate": 4.833097095016835e-06, "loss": 0.0123, "num_tokens": 5445856.0, "reward": 0.76019287109375, "reward_std": 0.008738137781620026, "rewards//mean": 0.76019287109375, "rewards//std": 0.028870007023215294, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1262, "grad_norm": 1.430968165397644, "kl": 0.3591831102967262, "learning_rate": 4.832526606104213e-06, "loss": 0.0144, "num_tokens": 5454512.0, "reward": 0.732421875, "reward_std": 0.008345866575837135, "rewards//mean": 0.732421875, "rewards//std": 0.032008104026317596, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1264, "grad_norm": 1.2066965103149414, "kl": 0.35580387338995934, "learning_rate": 4.831955177650153e-06, "loss": 0.0142, "num_tokens": 5463144.0, "reward": 0.76519775390625, "reward_std": 0.007233831100165844, "rewards//mean": 0.76519775390625, "rewards//std": 0.02703748270869255, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1266, "grad_norm": 1.33864426612854, "kl": 0.3583483546972275, "learning_rate": 4.831382809884826e-06, "loss": 0.0143, "num_tokens": 5471768.0, "reward": 0.751708984375, "reward_std": 0.006699239369481802, "rewards//mean": 0.751708984375, "rewards//std": 0.026021895930171013, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1268, "grad_norm": 1.0695289373397827, "kl": 0.331091046333313, "learning_rate": 4.830809503038781e-06, "loss": 0.0132, "num_tokens": 5480384.0, "reward": 0.7537841796875, "reward_std": 0.007534695789217949, "rewards//mean": 0.7537841796875, "rewards//std": 0.020481223240494728, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.127, "grad_norm": 1.05119788646698, "kl": 0.2932460140436888, "learning_rate": 4.830235257342948e-06, "loss": 0.0117, "num_tokens": 5488984.0, "reward": 0.73675537109375, "reward_std": 0.009710824117064476, "rewards//mean": 0.73675537109375, "rewards//std": 0.02311091497540474, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1272, "grad_norm": 0.9468859434127808, "kl": 0.27140727639198303, "learning_rate": 4.829660073028631e-06, "loss": 0.0109, "num_tokens": 5497624.0, "reward": 0.81549072265625, "reward_std": 0.006733256857842207, "rewards//mean": 0.81549072265625, "rewards//std": 0.017603833228349686, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1274, "grad_norm": 1.144340991973877, "kl": 0.3296862170100212, "learning_rate": 4.829083950327516e-06, "loss": 0.0132, "num_tokens": 5506312.0, "reward": 0.7391357421875, "reward_std": 0.007396392058581114, "rewards//mean": 0.7391357421875, "rewards//std": 0.02667086571455002, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1276, "grad_norm": 1.0231379270553589, "kl": 0.32026514038443565, "learning_rate": 4.828506889471664e-06, "loss": 0.0128, "num_tokens": 5514912.0, "reward": 0.73388671875, "reward_std": 0.006786069832742214, "rewards//mean": 0.73388671875, "rewards//std": 0.017696566879749298, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1278, "grad_norm": 1.3025801181793213, "kl": 0.3653467670083046, "learning_rate": 4.827928890693515e-06, "loss": 0.0146, "num_tokens": 5523536.0, "reward": 0.782958984375, "reward_std": 0.0090708639472723, "rewards//mean": 0.782958984375, "rewards//std": 0.0243386123329401, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.128, "grad_norm": 1.0168287754058838, "kl": 0.3438634071499109, "learning_rate": 4.8273499542258885e-06, "loss": 0.0138, "num_tokens": 5532184.0, "reward": 0.7303466796875, "reward_std": 0.007754259742796421, "rewards//mean": 0.7303466796875, "rewards//std": 0.027673648670315742, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1282, "grad_norm": 0.9684326648712158, "kl": 0.3083877395838499, "learning_rate": 4.826770080301978e-06, "loss": 0.0123, "num_tokens": 5540808.0, "reward": 0.7808837890625, "reward_std": 0.010280175134539604, "rewards//mean": 0.7808837890625, "rewards//std": 0.029550766572356224, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1284, "grad_norm": 1.223793864250183, "kl": 0.34201204031705856, "learning_rate": 4.826189269155357e-06, "loss": 0.0137, "num_tokens": 5549448.0, "reward": 0.7252197265625, "reward_std": 0.008540410548448563, "rewards//mean": 0.7252197265625, "rewards//std": 0.029250076040625572, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1286, "grad_norm": 1.2303458452224731, "kl": 0.36477214470505714, "learning_rate": 4.825607521019978e-06, "loss": 0.0146, "num_tokens": 5558080.0, "reward": 0.7886962890625, "reward_std": 0.00828670896589756, "rewards//mean": 0.7886962890625, "rewards//std": 0.022837886586785316, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1288, "grad_norm": 1.127670168876648, "kl": 0.36717789620161057, "learning_rate": 4.825024836130166e-06, "loss": 0.0147, "num_tokens": 5566712.0, "reward": 0.776611328125, "reward_std": 0.005144121125340462, "rewards//mean": 0.776611328125, "rewards//std": 0.02671085111796856, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.129, "grad_norm": 1.114711880683899, "kl": 0.28322512097656727, "learning_rate": 4.824441214720629e-06, "loss": 0.0113, "num_tokens": 5575304.0, "reward": 0.7742919921875, "reward_std": 0.011971713975071907, "rewards//mean": 0.7742919921875, "rewards//std": 0.034474264830350876, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1292, "grad_norm": 1.0570855140686035, "kl": 0.3079027570784092, "learning_rate": 4.823856657026448e-06, "loss": 0.0123, "num_tokens": 5584080.0, "reward": 0.75958251953125, "reward_std": 0.00948810763657093, "rewards//mean": 0.75958251953125, "rewards//std": 0.031992435455322266, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1294, "grad_norm": 1.053823471069336, "kl": 0.33063437044620514, "learning_rate": 4.823271163283084e-06, "loss": 0.0132, "num_tokens": 5592720.0, "reward": 0.7550048828125, "reward_std": 0.006211505271494389, "rewards//mean": 0.7550048828125, "rewards//std": 0.02459198608994484, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1296, "grad_norm": 0.8944997787475586, "kl": 0.2919867653399706, "learning_rate": 4.822684733726373e-06, "loss": 0.0117, "num_tokens": 5601432.0, "reward": 0.76715087890625, "reward_std": 0.00828872062265873, "rewards//mean": 0.76715087890625, "rewards//std": 0.014392371289432049, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1298, "grad_norm": 1.134657382965088, "kl": 0.2569319698959589, "learning_rate": 4.822097368592529e-06, "loss": 0.0103, "num_tokens": 5610040.0, "reward": 0.723388671875, "reward_std": 0.009949292987585068, "rewards//mean": 0.723388671875, "rewards//std": 0.02867857925593853, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.13, "grad_norm": 0.8463056087493896, "kl": 0.29777566343545914, "learning_rate": 4.821509068118143e-06, "loss": 0.0119, "num_tokens": 5618752.0, "reward": 0.73681640625, "reward_std": 0.00745723582804203, "rewards//mean": 0.73681640625, "rewards//std": 0.026231637224555016, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1302, "grad_norm": 0.9844704270362854, "kl": 0.3054410591721535, "learning_rate": 4.8209198325401815e-06, "loss": 0.0122, "num_tokens": 5627320.0, "reward": 0.76171875, "reward_std": 0.007473442703485489, "rewards//mean": 0.76171875, "rewards//std": 0.01674727164208889, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1304, "grad_norm": 1.0094974040985107, "kl": 0.29557618498802185, "learning_rate": 4.82032966209599e-06, "loss": 0.0118, "num_tokens": 5635952.0, "reward": 0.77386474609375, "reward_std": 0.008985331282019615, "rewards//mean": 0.77386474609375, "rewards//std": 0.028392048552632332, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1306, "grad_norm": 0.8305743336677551, "kl": 0.2749790381640196, "learning_rate": 4.819738557023287e-06, "loss": 0.011, "num_tokens": 5644616.0, "reward": 0.75286865234375, "reward_std": 0.009393520653247833, "rewards//mean": 0.75286865234375, "rewards//std": 0.024232570081949234, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1308, "grad_norm": 0.9685484170913696, "kl": 0.3104813024401665, "learning_rate": 4.819146517560171e-06, "loss": 0.0124, "num_tokens": 5653272.0, "reward": 0.77801513671875, "reward_std": 0.006662518717348576, "rewards//mean": 0.77801513671875, "rewards//std": 0.019912630319595337, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.131, "grad_norm": 1.2605221271514893, "kl": 0.28511145897209644, "learning_rate": 4.818553543945115e-06, "loss": 0.0114, "num_tokens": 5661888.0, "reward": 0.78240966796875, "reward_std": 0.007852917537093163, "rewards//mean": 0.78240966796875, "rewards//std": 0.01976766251027584, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1312, "grad_norm": 0.945095956325531, "kl": 0.2604890577495098, "learning_rate": 4.817959636416969e-06, "loss": 0.0104, "num_tokens": 5670504.0, "reward": 0.73822021484375, "reward_std": 0.00944865308701992, "rewards//mean": 0.73822021484375, "rewards//std": 0.03974169120192528, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1314, "grad_norm": 0.7084172368049622, "kl": 0.24680477194488049, "learning_rate": 4.8173647952149584e-06, "loss": 0.0099, "num_tokens": 5679168.0, "reward": 0.76702880859375, "reward_std": 0.007409450598061085, "rewards//mean": 0.76702880859375, "rewards//std": 0.026313383132219315, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1316, "grad_norm": 0.7127532958984375, "kl": 0.2393797803670168, "learning_rate": 4.816769020578685e-06, "loss": 0.0096, "num_tokens": 5687896.0, "reward": 0.76580810546875, "reward_std": 0.009323619306087494, "rewards//mean": 0.76580810546875, "rewards//std": 0.025117818266153336, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1318, "grad_norm": 0.7750257253646851, "kl": 0.26598621532320976, "learning_rate": 4.816172312748128e-06, "loss": 0.0106, "num_tokens": 5696520.0, "reward": 0.7744140625, "reward_std": 0.008821007795631886, "rewards//mean": 0.7744140625, "rewards//std": 0.015120835043489933, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.132, "grad_norm": 0.6631627082824707, "kl": 0.2553862910717726, "learning_rate": 4.81557467196364e-06, "loss": 0.0102, "num_tokens": 5705160.0, "reward": 0.7291259765625, "reward_std": 0.006548763252794743, "rewards//mean": 0.7291259765625, "rewards//std": 0.022490572184324265, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1322, "grad_norm": 0.5979108810424805, "kl": 0.2634982131421566, "learning_rate": 4.814976098465951e-06, "loss": 0.0105, "num_tokens": 5713848.0, "reward": 0.7772216796875, "reward_std": 0.006953542120754719, "rewards//mean": 0.7772216796875, "rewards//std": 0.017441095784306526, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1324, "grad_norm": 0.6861351728439331, "kl": 0.2535683810710907, "learning_rate": 4.814376592496167e-06, "loss": 0.0101, "num_tokens": 5722440.0, "reward": 0.7781982421875, "reward_std": 0.006485221907496452, "rewards//mean": 0.7781982421875, "rewards//std": 0.028885535895824432, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1326, "grad_norm": 0.6136988997459412, "kl": 0.23714140430092812, "learning_rate": 4.813776154295767e-06, "loss": 0.0095, "num_tokens": 5731056.0, "reward": 0.77496337890625, "reward_std": 0.007635117508471012, "rewards//mean": 0.77496337890625, "rewards//std": 0.0270240418612957, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1328, "grad_norm": 0.7283768653869629, "kl": 0.23214296996593475, "learning_rate": 4.81317478410661e-06, "loss": 0.0093, "num_tokens": 5739704.0, "reward": 0.74163818359375, "reward_std": 0.009544800035655499, "rewards//mean": 0.74163818359375, "rewards//std": 0.030503764748573303, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.133, "grad_norm": 1.0039687156677246, "kl": 0.2293421532958746, "learning_rate": 4.812572482170926e-06, "loss": 0.0092, "num_tokens": 5748272.0, "reward": 0.76214599609375, "reward_std": 0.008817870169878006, "rewards//mean": 0.76214599609375, "rewards//std": 0.02790156379342079, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1332, "grad_norm": 0.7138220071792603, "kl": 0.22582152672111988, "learning_rate": 4.811969248731323e-06, "loss": 0.009, "num_tokens": 5756944.0, "reward": 0.774169921875, "reward_std": 0.009145371615886688, "rewards//mean": 0.774169921875, "rewards//std": 0.024546701461076736, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1334, "grad_norm": 0.6133260726928711, "kl": 0.24057517014443874, "learning_rate": 4.811365084030784e-06, "loss": 0.0096, "num_tokens": 5765464.0, "reward": 0.7408447265625, "reward_std": 0.009205160662531853, "rewards//mean": 0.7408447265625, "rewards//std": 0.03023541159927845, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1336, "grad_norm": 0.6326518058776855, "kl": 0.23078422248363495, "learning_rate": 4.8107599883126634e-06, "loss": 0.0092, "num_tokens": 5774152.0, "reward": 0.76348876953125, "reward_std": 0.010718736797571182, "rewards//mean": 0.76348876953125, "rewards//std": 0.031055085361003876, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1338, "grad_norm": 0.7892494201660156, "kl": 0.2298267874866724, "learning_rate": 4.810153961820697e-06, "loss": 0.0092, "num_tokens": 5782856.0, "reward": 0.76007080078125, "reward_std": 0.00991053506731987, "rewards//mean": 0.76007080078125, "rewards//std": 0.030913403257727623, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.134, "grad_norm": 0.6180135011672974, "kl": 0.21059091202914715, "learning_rate": 4.809547004798991e-06, "loss": 0.0084, "num_tokens": 5791512.0, "reward": 0.7464599609375, "reward_std": 0.011077282950282097, "rewards//mean": 0.7464599609375, "rewards//std": 0.03385930880904198, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1342, "grad_norm": 0.7543632388114929, "kl": 0.2657131403684616, "learning_rate": 4.808939117492028e-06, "loss": 0.0106, "num_tokens": 5800024.0, "reward": 0.73602294921875, "reward_std": 0.009333530440926552, "rewards//mean": 0.73602294921875, "rewards//std": 0.026308204978704453, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1344, "grad_norm": 0.7919978499412537, "kl": 0.2661147303879261, "learning_rate": 4.808330300144664e-06, "loss": 0.0106, "num_tokens": 5808632.0, "reward": 0.75872802734375, "reward_std": 0.009824881330132484, "rewards//mean": 0.75872802734375, "rewards//std": 0.02316456288099289, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1346, "grad_norm": 0.7620828747749329, "kl": 0.25611876510083675, "learning_rate": 4.807720553002132e-06, "loss": 0.0102, "num_tokens": 5817208.0, "reward": 0.7601318359375, "reward_std": 0.009487172588706017, "rewards//mean": 0.7601318359375, "rewards//std": 0.029021471738815308, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1348, "grad_norm": 0.9600464105606079, "kl": 0.24580915831029415, "learning_rate": 4.807109876310037e-06, "loss": 0.0098, "num_tokens": 5826000.0, "reward": 0.7584228515625, "reward_std": 0.008735930547118187, "rewards//mean": 0.7584228515625, "rewards//std": 0.027020471170544624, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.135, "grad_norm": 0.7932077050209045, "kl": 0.251651618629694, "learning_rate": 4.806498270314359e-06, "loss": 0.0101, "num_tokens": 5834584.0, "reward": 0.76025390625, "reward_std": 0.007064810022711754, "rewards//mean": 0.76025390625, "rewards//std": 0.02699608914554119, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1352, "grad_norm": 0.8365455269813538, "kl": 0.2700675167143345, "learning_rate": 4.805885735261454e-06, "loss": 0.0108, "num_tokens": 5843224.0, "reward": 0.7371826171875, "reward_std": 0.010191906243562698, "rewards//mean": 0.7371826171875, "rewards//std": 0.023923013359308243, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1354, "grad_norm": 0.7729324698448181, "kl": 0.26824005506932735, "learning_rate": 4.805272271398051e-06, "loss": 0.0107, "num_tokens": 5851800.0, "reward": 0.70831298828125, "reward_std": 0.008622756227850914, "rewards//mean": 0.70831298828125, "rewards//std": 0.02663075178861618, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1356, "grad_norm": 0.9402274489402771, "kl": 0.2813422940671444, "learning_rate": 4.804657878971252e-06, "loss": 0.0113, "num_tokens": 5860408.0, "reward": 0.74505615234375, "reward_std": 0.007954198867082596, "rewards//mean": 0.74505615234375, "rewards//std": 0.023315679281949997, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1358, "grad_norm": 0.7352375984191895, "kl": 0.26763404347002506, "learning_rate": 4.804042558228535e-06, "loss": 0.0107, "num_tokens": 5869048.0, "reward": 0.7579345703125, "reward_std": 0.006437701638787985, "rewards//mean": 0.7579345703125, "rewards//std": 0.029943620786070824, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.136, "grad_norm": 0.8864990472793579, "kl": 0.3166388310492039, "learning_rate": 4.803426309417752e-06, "loss": 0.0127, "num_tokens": 5877704.0, "reward": 0.75897216796875, "reward_std": 0.007796045392751694, "rewards//mean": 0.75897216796875, "rewards//std": 0.0265179630368948, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1362, "grad_norm": 0.8726670742034912, "kl": 0.28164214082062244, "learning_rate": 4.802809132787125e-06, "loss": 0.0113, "num_tokens": 5886232.0, "reward": 0.74322509765625, "reward_std": 0.007162688300013542, "rewards//mean": 0.74322509765625, "rewards//std": 0.024352213367819786, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1364, "grad_norm": 1.2566941976547241, "kl": 0.3058667443692684, "learning_rate": 4.802191028585257e-06, "loss": 0.0122, "num_tokens": 5894872.0, "reward": 0.736328125, "reward_std": 0.009593142196536064, "rewards//mean": 0.736328125, "rewards//std": 0.04730703681707382, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1366, "grad_norm": 0.901271641254425, "kl": 0.3303819112479687, "learning_rate": 4.801571997061117e-06, "loss": 0.0132, "num_tokens": 5903480.0, "reward": 0.7298583984375, "reward_std": 0.008278006687760353, "rewards//mean": 0.7298583984375, "rewards//std": 0.03159085661172867, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1368, "grad_norm": 0.846440851688385, "kl": 0.2713386472314596, "learning_rate": 4.800952038464051e-06, "loss": 0.0109, "num_tokens": 5912088.0, "reward": 0.753173828125, "reward_std": 0.008396659046411514, "rewards//mean": 0.753173828125, "rewards//std": 0.02986997365951538, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.137, "grad_norm": 0.99683678150177, "kl": 0.29254608042538166, "learning_rate": 4.800331153043781e-06, "loss": 0.0117, "num_tokens": 5920760.0, "reward": 0.7811279296875, "reward_std": 0.006906645372509956, "rewards//mean": 0.7811279296875, "rewards//std": 0.02221698872745037, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1372, "grad_norm": 0.9804807305335999, "kl": 0.32390232756733894, "learning_rate": 4.799709341050397e-06, "loss": 0.013, "num_tokens": 5929320.0, "reward": 0.75372314453125, "reward_std": 0.009741474874317646, "rewards//mean": 0.75372314453125, "rewards//std": 0.022919517010450363, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1374, "grad_norm": 1.048980712890625, "kl": 0.3197479471564293, "learning_rate": 4.799086602734364e-06, "loss": 0.0128, "num_tokens": 5937912.0, "reward": 0.732177734375, "reward_std": 0.011059543117880821, "rewards//mean": 0.732177734375, "rewards//std": 0.03457533195614815, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1376, "grad_norm": 0.9630193114280701, "kl": 0.33098217844963074, "learning_rate": 4.798462938346524e-06, "loss": 0.0132, "num_tokens": 5946624.0, "reward": 0.74591064453125, "reward_std": 0.008718679659068584, "rewards//mean": 0.74591064453125, "rewards//std": 0.0328625813126564, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1378, "grad_norm": 1.1587707996368408, "kl": 0.29454705864191055, "learning_rate": 4.7978383481380865e-06, "loss": 0.0118, "num_tokens": 5955192.0, "reward": 0.7099609375, "reward_std": 0.007809010334312916, "rewards//mean": 0.7099609375, "rewards//std": 0.034053899347782135, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.138, "grad_norm": 0.9946979284286499, "kl": 0.32380471006035805, "learning_rate": 4.797212832360637e-06, "loss": 0.013, "num_tokens": 5963760.0, "reward": 0.767822265625, "reward_std": 0.00826583243906498, "rewards//mean": 0.767822265625, "rewards//std": 0.02232472226023674, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1382, "grad_norm": 1.1503069400787354, "kl": 0.3658827282488346, "learning_rate": 4.796586391266135e-06, "loss": 0.0146, "num_tokens": 5972352.0, "reward": 0.750732421875, "reward_std": 0.0072412192821502686, "rewards//mean": 0.750732421875, "rewards//std": 0.031179197132587433, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1384, "grad_norm": 1.0804966688156128, "kl": 0.3435588367283344, "learning_rate": 4.795959025106907e-06, "loss": 0.0137, "num_tokens": 5980920.0, "reward": 0.75189208984375, "reward_std": 0.009941812604665756, "rewards//mean": 0.75189208984375, "rewards//std": 0.02548695169389248, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1386, "grad_norm": 1.247780442237854, "kl": 0.3512617312371731, "learning_rate": 4.7953307341356595e-06, "loss": 0.0141, "num_tokens": 5989576.0, "reward": 0.765869140625, "reward_std": 0.007969655096530914, "rewards//mean": 0.765869140625, "rewards//std": 0.028762908652424812, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1388, "grad_norm": 1.2001007795333862, "kl": 0.36641981452703476, "learning_rate": 4.794701518605467e-06, "loss": 0.0147, "num_tokens": 5998160.0, "reward": 0.763427734375, "reward_std": 0.0078741405159235, "rewards//mean": 0.763427734375, "rewards//std": 0.024566426873207092, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.139, "grad_norm": 1.1044167280197144, "kl": 0.33926333859562874, "learning_rate": 4.794071378769776e-06, "loss": 0.0136, "num_tokens": 6006776.0, "reward": 0.76068115234375, "reward_std": 0.01240166462957859, "rewards//mean": 0.76068115234375, "rewards//std": 0.03412666544318199, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1392, "grad_norm": 1.0884336233139038, "kl": 0.3415757305920124, "learning_rate": 4.7934403148824085e-06, "loss": 0.0137, "num_tokens": 6015376.0, "reward": 0.7767333984375, "reward_std": 0.00934339314699173, "rewards//mean": 0.7767333984375, "rewards//std": 0.023022731766104698, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1394, "grad_norm": 1.1337672472000122, "kl": 0.3384525291621685, "learning_rate": 4.792808327197556e-06, "loss": 0.0135, "num_tokens": 6023976.0, "reward": 0.75604248046875, "reward_std": 0.011509222909808159, "rewards//mean": 0.75604248046875, "rewards//std": 0.03885067626833916, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1396, "grad_norm": 1.2506954669952393, "kl": 0.3618217594921589, "learning_rate": 4.792175415969786e-06, "loss": 0.0145, "num_tokens": 6032584.0, "reward": 0.78179931640625, "reward_std": 0.007064157165586948, "rewards//mean": 0.78179931640625, "rewards//std": 0.023384397849440575, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1398, "grad_norm": 1.190083384513855, "kl": 0.3704114742577076, "learning_rate": 4.79154158145403e-06, "loss": 0.0148, "num_tokens": 6041200.0, "reward": 0.7354736328125, "reward_std": 0.007111767306923866, "rewards//mean": 0.7354736328125, "rewards//std": 0.03445318341255188, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.14, "grad_norm": 1.0952671766281128, "kl": 0.3880801610648632, "learning_rate": 4.790906823905599e-06, "loss": 0.0155, "num_tokens": 6049776.0, "reward": 0.75994873046875, "reward_std": 0.006981437094509602, "rewards//mean": 0.75994873046875, "rewards//std": 0.02911488339304924, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1402, "grad_norm": 0.9023262858390808, "kl": 0.3351786471903324, "learning_rate": 4.790271143580174e-06, "loss": 0.0134, "num_tokens": 6058352.0, "reward": 0.76885986328125, "reward_std": 0.008710931986570358, "rewards//mean": 0.76885986328125, "rewards//std": 0.03886703774333, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1404, "grad_norm": 1.2910362482070923, "kl": 0.3798423409461975, "learning_rate": 4.789634540733807e-06, "loss": 0.0152, "num_tokens": 6066912.0, "reward": 0.748046875, "reward_std": 0.006378927733749151, "rewards//mean": 0.748046875, "rewards//std": 0.029968129470944405, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1406, "grad_norm": 1.287479043006897, "kl": 0.37538506276905537, "learning_rate": 4.78899701562292e-06, "loss": 0.015, "num_tokens": 6075536.0, "reward": 0.75531005859375, "reward_std": 0.010323834605515003, "rewards//mean": 0.75531005859375, "rewards//std": 0.029662420973181725, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1408, "grad_norm": 1.0325592756271362, "kl": 0.35265588015317917, "learning_rate": 4.788358568504308e-06, "loss": 0.0141, "num_tokens": 6084152.0, "reward": 0.73480224609375, "reward_std": 0.007402766961604357, "rewards//mean": 0.73480224609375, "rewards//std": 0.034619029611349106, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.141, "grad_norm": 1.6814100742340088, "kl": 0.38387027755379677, "learning_rate": 4.78771919963514e-06, "loss": 0.0154, "num_tokens": 6092728.0, "reward": 0.79052734375, "reward_std": 0.006797960493713617, "rewards//mean": 0.79052734375, "rewards//std": 0.020333237946033478, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1412, "grad_norm": 1.1181515455245972, "kl": 0.3311033882200718, "learning_rate": 4.787078909272951e-06, "loss": 0.0132, "num_tokens": 6101480.0, "reward": 0.73321533203125, "reward_std": 0.012039251625537872, "rewards//mean": 0.73321533203125, "rewards//std": 0.04149588197469711, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1414, "grad_norm": 1.1418641805648804, "kl": 0.2598997447639704, "learning_rate": 4.786437697675651e-06, "loss": 0.0104, "num_tokens": 6110208.0, "reward": 0.73748779296875, "reward_std": 0.01649647206068039, "rewards//mean": 0.73748779296875, "rewards//std": 0.04361593350768089, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1416, "grad_norm": 1.1949273347854614, "kl": 0.39064906910061836, "learning_rate": 4.78579556510152e-06, "loss": 0.0156, "num_tokens": 6118832.0, "reward": 0.74700927734375, "reward_std": 0.009526428766548634, "rewards//mean": 0.74700927734375, "rewards//std": 0.02997412718832493, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1418, "grad_norm": 1.232944369316101, "kl": 0.3855067417025566, "learning_rate": 4.785152511809208e-06, "loss": 0.0154, "num_tokens": 6127480.0, "reward": 0.751220703125, "reward_std": 0.0074347625486552715, "rewards//mean": 0.751220703125, "rewards//std": 0.0333704799413681, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.142, "grad_norm": 0.8768677711486816, "kl": 0.3823428861796856, "learning_rate": 4.784508538057738e-06, "loss": 0.0153, "num_tokens": 6136016.0, "reward": 0.734375, "reward_std": 0.009139763191342354, "rewards//mean": 0.734375, "rewards//std": 0.029879095032811165, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1422, "grad_norm": 1.023090124130249, "kl": 0.3618377633392811, "learning_rate": 4.783863644106502e-06, "loss": 0.0145, "num_tokens": 6144688.0, "reward": 0.75811767578125, "reward_std": 0.007518475875258446, "rewards//mean": 0.75811767578125, "rewards//std": 0.02010328881442547, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1424, "grad_norm": 0.9730324149131775, "kl": 0.36088190227746964, "learning_rate": 4.783217830215264e-06, "loss": 0.0144, "num_tokens": 6153288.0, "reward": 0.7520751953125, "reward_std": 0.0069280690513551235, "rewards//mean": 0.7520751953125, "rewards//std": 0.03221152722835541, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1426, "grad_norm": 0.777560830116272, "kl": 0.3026167619973421, "learning_rate": 4.782571096644157e-06, "loss": 0.0121, "num_tokens": 6161936.0, "reward": 0.75244140625, "reward_std": 0.012461979873478413, "rewards//mean": 0.75244140625, "rewards//std": 0.04013592377305031, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1428, "grad_norm": 0.7599678635597229, "kl": 0.27103759720921516, "learning_rate": 4.7819234436536845e-06, "loss": 0.0108, "num_tokens": 6170696.0, "reward": 0.77105712890625, "reward_std": 0.009724288247525692, "rewards//mean": 0.77105712890625, "rewards//std": 0.028811221942305565, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.143, "grad_norm": 1.2590875625610352, "kl": 0.3248850591480732, "learning_rate": 4.781274871504722e-06, "loss": 0.013, "num_tokens": 6179384.0, "reward": 0.7764892578125, "reward_std": 0.005956442095339298, "rewards//mean": 0.7764892578125, "rewards//std": 0.02729027532041073, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1432, "grad_norm": 0.8374090194702148, "kl": 0.3235581796616316, "learning_rate": 4.780625380458513e-06, "loss": 0.0129, "num_tokens": 6187896.0, "reward": 0.75469970703125, "reward_std": 0.008836697787046432, "rewards//mean": 0.75469970703125, "rewards//std": 0.03515404462814331, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1434, "grad_norm": 0.9022362232208252, "kl": 0.3358203247189522, "learning_rate": 4.7799749707766754e-06, "loss": 0.0134, "num_tokens": 6196536.0, "reward": 0.73065185546875, "reward_std": 0.006113333627581596, "rewards//mean": 0.73065185546875, "rewards//std": 0.025127459317445755, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1436, "grad_norm": 1.1146235466003418, "kl": 0.3147228341549635, "learning_rate": 4.779323642721191e-06, "loss": 0.0126, "num_tokens": 6205168.0, "reward": 0.7520751953125, "reward_std": 0.007163808681070805, "rewards//mean": 0.7520751953125, "rewards//std": 0.019917665049433708, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1438, "grad_norm": 0.9956210851669312, "kl": 0.3257326614111662, "learning_rate": 4.778671396554417e-06, "loss": 0.013, "num_tokens": 6213784.0, "reward": 0.76202392578125, "reward_std": 0.009784232825040817, "rewards//mean": 0.76202392578125, "rewards//std": 0.027377430349588394, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.144, "grad_norm": 1.103261947631836, "kl": 0.31043668277561665, "learning_rate": 4.778018232539075e-06, "loss": 0.0124, "num_tokens": 6222432.0, "reward": 0.75640869140625, "reward_std": 0.010043056681752205, "rewards//mean": 0.75640869140625, "rewards//std": 0.026082253083586693, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1442, "grad_norm": 1.2115051746368408, "kl": 0.30454549565911293, "learning_rate": 4.777364150938263e-06, "loss": 0.0122, "num_tokens": 6231232.0, "reward": 0.755859375, "reward_std": 0.01059969887137413, "rewards//mean": 0.755859375, "rewards//std": 0.034273672848939896, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1444, "grad_norm": 0.8916050791740417, "kl": 0.2920965179800987, "learning_rate": 4.776709152015443e-06, "loss": 0.0117, "num_tokens": 6239984.0, "reward": 0.74688720703125, "reward_std": 0.007067927625030279, "rewards//mean": 0.74688720703125, "rewards//std": 0.025057479739189148, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1446, "grad_norm": 0.7612753510475159, "kl": 0.29840802773833275, "learning_rate": 4.776053236034449e-06, "loss": 0.0119, "num_tokens": 6248576.0, "reward": 0.7445068359375, "reward_std": 0.00902190525084734, "rewards//mean": 0.7445068359375, "rewards//std": 0.02381392940878868, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1448, "grad_norm": 1.3189657926559448, "kl": 0.3207559399306774, "learning_rate": 4.775396403259483e-06, "loss": 0.0128, "num_tokens": 6257128.0, "reward": 0.7669677734375, "reward_std": 0.01015978679060936, "rewards//mean": 0.7669677734375, "rewards//std": 0.026062294840812683, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.145, "grad_norm": 0.9157844185829163, "kl": 0.30339373275637627, "learning_rate": 4.774738653955119e-06, "loss": 0.0121, "num_tokens": 6265768.0, "reward": 0.7802734375, "reward_std": 0.009250717237591743, "rewards//mean": 0.7802734375, "rewards//std": 0.026991603896021843, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1452, "grad_norm": 1.1949882507324219, "kl": 0.33243946731090546, "learning_rate": 4.7740799883862966e-06, "loss": 0.0133, "num_tokens": 6274448.0, "reward": 0.7586669921875, "reward_std": 0.01018530037254095, "rewards//mean": 0.7586669921875, "rewards//std": 0.020966242998838425, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1454, "grad_norm": 0.8508477807044983, "kl": 0.33175548166036606, "learning_rate": 4.773420406818327e-06, "loss": 0.0133, "num_tokens": 6283040.0, "reward": 0.75091552734375, "reward_std": 0.007159090135246515, "rewards//mean": 0.75091552734375, "rewards//std": 0.02595542185008526, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1456, "grad_norm": 0.8403224349021912, "kl": 0.30662048049271107, "learning_rate": 4.772759909516889e-06, "loss": 0.0123, "num_tokens": 6291704.0, "reward": 0.7113037109375, "reward_std": 0.011537307873368263, "rewards//mean": 0.7113037109375, "rewards//std": 0.0381256639957428, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1458, "grad_norm": 1.1425209045410156, "kl": 0.31275169365108013, "learning_rate": 4.772098496748031e-06, "loss": 0.0125, "num_tokens": 6300344.0, "reward": 0.75421142578125, "reward_std": 0.00868441816419363, "rewards//mean": 0.75421142578125, "rewards//std": 0.03845828026533127, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.146, "grad_norm": 0.7371503710746765, "kl": 0.2826213352382183, "learning_rate": 4.7714361687781705e-06, "loss": 0.0113, "num_tokens": 6308912.0, "reward": 0.75616455078125, "reward_std": 0.01087634265422821, "rewards//mean": 0.75616455078125, "rewards//std": 0.02745252661406994, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1462, "grad_norm": 1.2550262212753296, "kl": 0.29363943450152874, "learning_rate": 4.770772925874093e-06, "loss": 0.0117, "num_tokens": 6317680.0, "reward": 0.7362060546875, "reward_std": 0.012717410922050476, "rewards//mean": 0.7362060546875, "rewards//std": 0.031156614422798157, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1464, "grad_norm": 1.0804203748703003, "kl": 0.347552876919508, "learning_rate": 4.770108768302953e-06, "loss": 0.0139, "num_tokens": 6326368.0, "reward": 0.79925537109375, "reward_std": 0.00879729725420475, "rewards//mean": 0.79925537109375, "rewards//std": 0.021716952323913574, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1466, "grad_norm": 0.8175789713859558, "kl": 0.2651336081326008, "learning_rate": 4.769443696332272e-06, "loss": 0.0106, "num_tokens": 6335016.0, "reward": 0.75885009765625, "reward_std": 0.010360082611441612, "rewards//mean": 0.75885009765625, "rewards//std": 0.0343741700053215, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1468, "grad_norm": 0.9594478011131287, "kl": 0.32585857063531876, "learning_rate": 4.768777710229941e-06, "loss": 0.013, "num_tokens": 6343744.0, "reward": 0.76837158203125, "reward_std": 0.006379896309226751, "rewards//mean": 0.76837158203125, "rewards//std": 0.02404632233083248, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.147, "grad_norm": 0.9124557971954346, "kl": 0.3457811623811722, "learning_rate": 4.768110810264221e-06, "loss": 0.0138, "num_tokens": 6352456.0, "reward": 0.77996826171875, "reward_std": 0.010944314301013947, "rewards//mean": 0.77996826171875, "rewards//std": 0.025017576292157173, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1472, "grad_norm": 1.0064573287963867, "kl": 0.32626586966216564, "learning_rate": 4.767442996703737e-06, "loss": 0.0131, "num_tokens": 6361208.0, "reward": 0.76898193359375, "reward_std": 0.009524751454591751, "rewards//mean": 0.76898193359375, "rewards//std": 0.016290800645947456, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1474, "grad_norm": 0.8128183484077454, "kl": 0.3099711798131466, "learning_rate": 4.7667742698174855e-06, "loss": 0.0124, "num_tokens": 6369800.0, "reward": 0.759765625, "reward_std": 0.00793082732707262, "rewards//mean": 0.759765625, "rewards//std": 0.02536318637430668, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1476, "grad_norm": 0.9029561281204224, "kl": 0.34095400758087635, "learning_rate": 4.766104629874829e-06, "loss": 0.0136, "num_tokens": 6378480.0, "reward": 0.7481689453125, "reward_std": 0.008078465238213539, "rewards//mean": 0.7481689453125, "rewards//std": 0.03140438720583916, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1478, "grad_norm": 1.104424238204956, "kl": 0.3173263669013977, "learning_rate": 4.765434077145499e-06, "loss": 0.0127, "num_tokens": 6387056.0, "reward": 0.74407958984375, "reward_std": 0.01128837838768959, "rewards//mean": 0.74407958984375, "rewards//std": 0.04236414283514023, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.148, "grad_norm": 1.0675055980682373, "kl": 0.3194386400282383, "learning_rate": 4.764762611899593e-06, "loss": 0.0128, "num_tokens": 6395720.0, "reward": 0.779296875, "reward_std": 0.00692769093438983, "rewards//mean": 0.779296875, "rewards//std": 0.02041051536798477, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1482, "grad_norm": 1.5919986963272095, "kl": 0.3498740680515766, "learning_rate": 4.764090234407578e-06, "loss": 0.014, "num_tokens": 6404408.0, "reward": 0.7606201171875, "reward_std": 0.007645751349627972, "rewards//mean": 0.7606201171875, "rewards//std": 0.017224503681063652, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1484, "grad_norm": 1.439361810684204, "kl": 0.33540985360741615, "learning_rate": 4.763416944940287e-06, "loss": 0.0134, "num_tokens": 6413088.0, "reward": 0.7662353515625, "reward_std": 0.011973317712545395, "rewards//mean": 0.7662353515625, "rewards//std": 0.02613653428852558, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1486, "grad_norm": 0.9124048352241516, "kl": 0.3327603731304407, "learning_rate": 4.762742743768921e-06, "loss": 0.0133, "num_tokens": 6421912.0, "reward": 0.782470703125, "reward_std": 0.00991036370396614, "rewards//mean": 0.782470703125, "rewards//std": 0.026327257975935936, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1488, "grad_norm": 1.1488804817199707, "kl": 0.3750241808593273, "learning_rate": 4.762067631165049e-06, "loss": 0.015, "num_tokens": 6430552.0, "reward": 0.78814697265625, "reward_std": 0.01083114929497242, "rewards//mean": 0.78814697265625, "rewards//std": 0.023038752377033234, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.149, "grad_norm": 1.0288392305374146, "kl": 0.4040727950632572, "learning_rate": 4.761391607400606e-06, "loss": 0.0162, "num_tokens": 6439144.0, "reward": 0.774658203125, "reward_std": 0.008166758343577385, "rewards//mean": 0.774658203125, "rewards//std": 0.01806734874844551, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1492, "grad_norm": 1.1654855012893677, "kl": 0.3817331902682781, "learning_rate": 4.7607146727478935e-06, "loss": 0.0153, "num_tokens": 6447728.0, "reward": 0.7738037109375, "reward_std": 0.00860125944018364, "rewards//mean": 0.7738037109375, "rewards//std": 0.018528563901782036, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1494, "grad_norm": 1.4023302793502808, "kl": 0.3670612499117851, "learning_rate": 4.760036827479582e-06, "loss": 0.0147, "num_tokens": 6456312.0, "reward": 0.76483154296875, "reward_std": 0.007299549877643585, "rewards//mean": 0.76483154296875, "rewards//std": 0.028815951198339462, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1496, "grad_norm": 0.9892433285713196, "kl": 0.3555135168135166, "learning_rate": 4.759358071868705e-06, "loss": 0.0142, "num_tokens": 6464936.0, "reward": 0.78558349609375, "reward_std": 0.007815414108335972, "rewards//mean": 0.78558349609375, "rewards//std": 0.016691023483872414, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1498, "grad_norm": 0.9962619543075562, "kl": 0.33968275785446167, "learning_rate": 4.758678406188668e-06, "loss": 0.0136, "num_tokens": 6473664.0, "reward": 0.73126220703125, "reward_std": 0.007433678954839706, "rewards//mean": 0.73126220703125, "rewards//std": 0.032445359975099564, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.15, "grad_norm": 1.2323095798492432, "kl": 0.3530998080968857, "learning_rate": 4.757997830713239e-06, "loss": 0.0141, "num_tokens": 6482200.0, "reward": 0.74542236328125, "reward_std": 0.01127916481345892, "rewards//mean": 0.74542236328125, "rewards//std": 0.019168945029377937, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1502, "grad_norm": 1.0545332431793213, "kl": 0.34781191498041153, "learning_rate": 4.757316345716554e-06, "loss": 0.0139, "num_tokens": 6490712.0, "reward": 0.74566650390625, "reward_std": 0.009755873121321201, "rewards//mean": 0.74566650390625, "rewards//std": 0.03269031271338463, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1504, "grad_norm": 0.9189953804016113, "kl": 0.3220175448805094, "learning_rate": 4.756633951473114e-06, "loss": 0.0129, "num_tokens": 6499392.0, "reward": 0.7847900390625, "reward_std": 0.008869939483702183, "rewards//mean": 0.7847900390625, "rewards//std": 0.021482614800333977, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1506, "grad_norm": 1.1692512035369873, "kl": 0.33156827092170715, "learning_rate": 4.755950648257789e-06, "loss": 0.0133, "num_tokens": 6508160.0, "reward": 0.77008056640625, "reward_std": 0.011027710512280464, "rewards//mean": 0.77008056640625, "rewards//std": 0.02765524946153164, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1508, "grad_norm": 1.144055724143982, "kl": 0.3382889721542597, "learning_rate": 4.755266436345812e-06, "loss": 0.0135, "num_tokens": 6516832.0, "reward": 0.72998046875, "reward_std": 0.01025819219648838, "rewards//mean": 0.72998046875, "rewards//std": 0.029343342408537865, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.151, "grad_norm": 1.0410165786743164, "kl": 0.3598978705704212, "learning_rate": 4.754581316012785e-06, "loss": 0.0144, "num_tokens": 6525520.0, "reward": 0.7647705078125, "reward_std": 0.007099618669599295, "rewards//mean": 0.7647705078125, "rewards//std": 0.02260068617761135, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1512, "grad_norm": 0.9105063080787659, "kl": 0.30748361349105835, "learning_rate": 4.753895287534673e-06, "loss": 0.0123, "num_tokens": 6534128.0, "reward": 0.7608642578125, "reward_std": 0.011949615553021431, "rewards//mean": 0.7608642578125, "rewards//std": 0.036653582006692886, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1514, "grad_norm": 0.8369845747947693, "kl": 0.3764992021024227, "learning_rate": 4.753208351187809e-06, "loss": 0.0151, "num_tokens": 6542736.0, "reward": 0.7330322265625, "reward_std": 0.0075618005357682705, "rewards//mean": 0.7330322265625, "rewards//std": 0.03163682669401169, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1516, "grad_norm": 0.9112576842308044, "kl": 0.340505413711071, "learning_rate": 4.75252050724889e-06, "loss": 0.0136, "num_tokens": 6551352.0, "reward": 0.75885009765625, "reward_std": 0.008580232039093971, "rewards//mean": 0.75885009765625, "rewards//std": 0.024510828778147697, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1518, "grad_norm": 1.1520575284957886, "kl": 0.3449602983891964, "learning_rate": 4.751831755994981e-06, "loss": 0.0138, "num_tokens": 6560040.0, "reward": 0.7685546875, "reward_std": 0.0092132817953825, "rewards//mean": 0.7685546875, "rewards//std": 0.02560080774128437, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.152, "grad_norm": 1.0891118049621582, "kl": 0.3134535998106003, "learning_rate": 4.75114209770351e-06, "loss": 0.0125, "num_tokens": 6568704.0, "reward": 0.74053955078125, "reward_std": 0.009147681295871735, "rewards//mean": 0.74053955078125, "rewards//std": 0.028252549469470978, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1522, "grad_norm": 0.9451463222503662, "kl": 0.35255878791213036, "learning_rate": 4.75045153265227e-06, "loss": 0.0141, "num_tokens": 6577336.0, "reward": 0.7484130859375, "reward_std": 0.0063913362100720406, "rewards//mean": 0.7484130859375, "rewards//std": 0.02723030373454094, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1524, "grad_norm": 0.8575259447097778, "kl": 0.3487287014722824, "learning_rate": 4.749760061119423e-06, "loss": 0.0139, "num_tokens": 6585944.0, "reward": 0.75885009765625, "reward_std": 0.007511901669204235, "rewards//mean": 0.75885009765625, "rewards//std": 0.031213102862238884, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1526, "grad_norm": 0.9670186042785645, "kl": 0.31115617975592613, "learning_rate": 4.749067683383491e-06, "loss": 0.0124, "num_tokens": 6594568.0, "reward": 0.75286865234375, "reward_std": 0.009954852983355522, "rewards//mean": 0.75286865234375, "rewards//std": 0.027128031477332115, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1528, "grad_norm": 1.3793588876724243, "kl": 0.3482302203774452, "learning_rate": 4.748374399723366e-06, "loss": 0.0139, "num_tokens": 6603064.0, "reward": 0.74713134765625, "reward_std": 0.011073645204305649, "rewards//mean": 0.74713134765625, "rewards//std": 0.028673233464360237, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.153, "grad_norm": 1.212506651878357, "kl": 0.35477422177791595, "learning_rate": 4.747680210418302e-06, "loss": 0.0142, "num_tokens": 6611760.0, "reward": 0.76275634765625, "reward_std": 0.008897896856069565, "rewards//mean": 0.76275634765625, "rewards//std": 0.024282492697238922, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1532, "grad_norm": 1.8687018156051636, "kl": 0.33726025745272636, "learning_rate": 4.746985115747918e-06, "loss": 0.0135, "num_tokens": 6620544.0, "reward": 0.788818359375, "reward_std": 0.010754291899502277, "rewards//mean": 0.788818359375, "rewards//std": 0.027611492201685905, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1534, "grad_norm": 1.1897015571594238, "kl": 0.35634366050362587, "learning_rate": 4.746289115992198e-06, "loss": 0.0143, "num_tokens": 6629192.0, "reward": 0.75067138671875, "reward_std": 0.006821729242801666, "rewards//mean": 0.75067138671875, "rewards//std": 0.023944122716784477, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1536, "grad_norm": 1.6127054691314697, "kl": 0.3192887920886278, "learning_rate": 4.74559221143149e-06, "loss": 0.0128, "num_tokens": 6637832.0, "reward": 0.7333984375, "reward_std": 0.00948795210570097, "rewards//mean": 0.7333984375, "rewards//std": 0.027524733915925026, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1538, "grad_norm": 0.9704777598381042, "kl": 0.3409872241318226, "learning_rate": 4.744894402346508e-06, "loss": 0.0136, "num_tokens": 6646552.0, "reward": 0.785400390625, "reward_std": 0.01048012264072895, "rewards//mean": 0.785400390625, "rewards//std": 0.030144404619932175, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.154, "grad_norm": 1.119908332824707, "kl": 0.36791034415364265, "learning_rate": 4.744195689018331e-06, "loss": 0.0147, "num_tokens": 6655176.0, "reward": 0.7825927734375, "reward_std": 0.0098853949457407, "rewards//mean": 0.7825927734375, "rewards//std": 0.024990104138851166, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1542, "grad_norm": 1.1021844148635864, "kl": 0.36526960879564285, "learning_rate": 4.743496071728396e-06, "loss": 0.0146, "num_tokens": 6663840.0, "reward": 0.7615966796875, "reward_std": 0.005251667462289333, "rewards//mean": 0.7615966796875, "rewards//std": 0.029023557901382446, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1544, "grad_norm": 1.5725772380828857, "kl": 0.38798758387565613, "learning_rate": 4.742795550758514e-06, "loss": 0.0155, "num_tokens": 6672440.0, "reward": 0.7513427734375, "reward_std": 0.005276157986372709, "rewards//mean": 0.7513427734375, "rewards//std": 0.020719308406114578, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1546, "grad_norm": 1.8963240385055542, "kl": 0.3601706586778164, "learning_rate": 4.742094126390851e-06, "loss": 0.0144, "num_tokens": 6680960.0, "reward": 0.761962890625, "reward_std": 0.007767863571643829, "rewards//mean": 0.761962890625, "rewards//std": 0.017892228439450264, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1548, "grad_norm": 0.9243273735046387, "kl": 0.3406798876821995, "learning_rate": 4.7413917989079415e-06, "loss": 0.0136, "num_tokens": 6689760.0, "reward": 0.7685546875, "reward_std": 0.008658873848617077, "rewards//mean": 0.7685546875, "rewards//std": 0.02934746816754341, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.155, "grad_norm": 0.9375861883163452, "kl": 0.34522927552461624, "learning_rate": 4.740688568592685e-06, "loss": 0.0138, "num_tokens": 6698392.0, "reward": 0.77886962890625, "reward_std": 0.008174005895853043, "rewards//mean": 0.77886962890625, "rewards//std": 0.021060163155198097, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1552, "grad_norm": 0.8136488199234009, "kl": 0.35180309042334557, "learning_rate": 4.73998443572834e-06, "loss": 0.0141, "num_tokens": 6707040.0, "reward": 0.7554931640625, "reward_std": 0.008698609657585621, "rewards//mean": 0.7554931640625, "rewards//std": 0.0261689480394125, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1554, "grad_norm": 0.9896641969680786, "kl": 0.33184514194726944, "learning_rate": 4.7392794005985324e-06, "loss": 0.0133, "num_tokens": 6715672.0, "reward": 0.76751708984375, "reward_std": 0.008659705519676208, "rewards//mean": 0.76751708984375, "rewards//std": 0.022086039185523987, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1556, "grad_norm": 0.8349254131317139, "kl": 0.3396892622113228, "learning_rate": 4.7385734634872504e-06, "loss": 0.0136, "num_tokens": 6724312.0, "reward": 0.76275634765625, "reward_std": 0.010972029529511929, "rewards//mean": 0.76275634765625, "rewards//std": 0.024619286879897118, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1558, "grad_norm": 0.7266840934753418, "kl": 0.3403605706989765, "learning_rate": 4.7378666246788444e-06, "loss": 0.0136, "num_tokens": 6732968.0, "reward": 0.7620849609375, "reward_std": 0.011202620342373848, "rewards//mean": 0.7620849609375, "rewards//std": 0.028896017000079155, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.156, "grad_norm": 0.6741737127304077, "kl": 0.30693612433969975, "learning_rate": 4.73715888445803e-06, "loss": 0.0123, "num_tokens": 6741616.0, "reward": 0.7645263671875, "reward_std": 0.01664084941148758, "rewards//mean": 0.7645263671875, "rewards//std": 0.028137991204857826, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1562, "grad_norm": 0.9469835162162781, "kl": 0.33184751495718956, "learning_rate": 4.736450243109885e-06, "loss": 0.0133, "num_tokens": 6750248.0, "reward": 0.772216796875, "reward_std": 0.010948438197374344, "rewards//mean": 0.772216796875, "rewards//std": 0.02752363495528698, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1564, "grad_norm": 1.3513708114624023, "kl": 0.36903392523527145, "learning_rate": 4.735740700919848e-06, "loss": 0.0148, "num_tokens": 6758784.0, "reward": 0.75982666015625, "reward_std": 0.011673888191580772, "rewards//mean": 0.75982666015625, "rewards//std": 0.02905711531639099, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1566, "grad_norm": 1.2254798412322998, "kl": 0.3678886219859123, "learning_rate": 4.7350302581737255e-06, "loss": 0.0147, "num_tokens": 6767448.0, "reward": 0.7747802734375, "reward_std": 0.009641564451158047, "rewards//mean": 0.7747802734375, "rewards//std": 0.019651401787996292, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1568, "grad_norm": 1.1070396900177002, "kl": 0.3434663563966751, "learning_rate": 4.734318915157682e-06, "loss": 0.0137, "num_tokens": 6776224.0, "reward": 0.72930908203125, "reward_std": 0.009073897264897823, "rewards//mean": 0.72930908203125, "rewards//std": 0.04494238644838333, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.157, "grad_norm": 0.925721287727356, "kl": 0.3413317985832691, "learning_rate": 4.7336066721582464e-06, "loss": 0.0137, "num_tokens": 6784832.0, "reward": 0.74957275390625, "reward_std": 0.008924740366637707, "rewards//mean": 0.74957275390625, "rewards//std": 0.03833093121647835, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1572, "grad_norm": 1.4691276550292969, "kl": 0.3254700042307377, "learning_rate": 4.73289352946231e-06, "loss": 0.013, "num_tokens": 6793464.0, "reward": 0.783935546875, "reward_std": 0.009136625565588474, "rewards//mean": 0.783935546875, "rewards//std": 0.021708672866225243, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1574, "grad_norm": 0.8711658120155334, "kl": 0.34013716131448746, "learning_rate": 4.732179487357127e-06, "loss": 0.0136, "num_tokens": 6801992.0, "reward": 0.76971435546875, "reward_std": 0.010332462377846241, "rewards//mean": 0.76971435546875, "rewards//std": 0.028943846002221107, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1576, "grad_norm": 1.2573686838150024, "kl": 0.31912393122911453, "learning_rate": 4.731464546130315e-06, "loss": 0.0128, "num_tokens": 6810688.0, "reward": 0.7406005859375, "reward_std": 0.01056898757815361, "rewards//mean": 0.7406005859375, "rewards//std": 0.026270560920238495, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1578, "grad_norm": 0.6769285798072815, "kl": 0.3092736713588238, "learning_rate": 4.730748706069849e-06, "loss": 0.0124, "num_tokens": 6819336.0, "reward": 0.7562255859375, "reward_std": 0.01258472166955471, "rewards//mean": 0.7562255859375, "rewards//std": 0.038561511784791946, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.158, "grad_norm": 0.7998493313789368, "kl": 0.3514319136738777, "learning_rate": 4.730031967464071e-06, "loss": 0.0141, "num_tokens": 6828088.0, "reward": 0.74859619140625, "reward_std": 0.008010081946849823, "rewards//mean": 0.74859619140625, "rewards//std": 0.026524242013692856, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1582, "grad_norm": 0.934973955154419, "kl": 0.30601639300584793, "learning_rate": 4.729314330601684e-06, "loss": 0.0122, "num_tokens": 6836808.0, "reward": 0.757080078125, "reward_std": 0.013350230641663074, "rewards//mean": 0.757080078125, "rewards//std": 0.020503751933574677, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1584, "grad_norm": 0.9307112097740173, "kl": 0.3210490830242634, "learning_rate": 4.72859579577175e-06, "loss": 0.0128, "num_tokens": 6845400.0, "reward": 0.75616455078125, "reward_std": 0.008527729660272598, "rewards//mean": 0.75616455078125, "rewards//std": 0.020061077550053596, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1586, "grad_norm": 1.2021913528442383, "kl": 0.33378901705145836, "learning_rate": 4.7278763632636974e-06, "loss": 0.0134, "num_tokens": 6854096.0, "reward": 0.72943115234375, "reward_std": 0.005128923803567886, "rewards//mean": 0.72943115234375, "rewards//std": 0.0317058339715004, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1588, "grad_norm": 1.0796374082565308, "kl": 0.32557179778814316, "learning_rate": 4.727156033367312e-06, "loss": 0.013, "num_tokens": 6862656.0, "reward": 0.7237548828125, "reward_std": 0.006958717480301857, "rewards//mean": 0.7237548828125, "rewards//std": 0.01773715205490589, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.159, "grad_norm": 1.2356610298156738, "kl": 0.30212575383484364, "learning_rate": 4.7264348063727415e-06, "loss": 0.0121, "num_tokens": 6871248.0, "reward": 0.763671875, "reward_std": 0.00585189089179039, "rewards//mean": 0.763671875, "rewards//std": 0.024861659854650497, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1592, "grad_norm": 1.584045648574829, "kl": 0.32087236642837524, "learning_rate": 4.725712682570498e-06, "loss": 0.0128, "num_tokens": 6879968.0, "reward": 0.7618408203125, "reward_std": 0.01183655858039856, "rewards//mean": 0.7618408203125, "rewards//std": 0.029248006641864777, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1594, "grad_norm": 1.0374621152877808, "kl": 0.31318943575024605, "learning_rate": 4.724989662251452e-06, "loss": 0.0125, "num_tokens": 6888536.0, "reward": 0.76678466796875, "reward_std": 0.011448527686297894, "rewards//mean": 0.76678466796875, "rewards//std": 0.025842614471912384, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1596, "grad_norm": 1.0205044746398926, "kl": 0.30759022757411003, "learning_rate": 4.724265745706837e-06, "loss": 0.0123, "num_tokens": 6897080.0, "reward": 0.7689208984375, "reward_std": 0.0078608188778162, "rewards//mean": 0.7689208984375, "rewards//std": 0.026348810642957687, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1598, "grad_norm": 2.31490159034729, "kl": 0.3154881428927183, "learning_rate": 4.723540933228245e-06, "loss": 0.0126, "num_tokens": 6905712.0, "reward": 0.753662109375, "reward_std": 0.0058388663455843925, "rewards//mean": 0.753662109375, "rewards//std": 0.029780646786093712, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.16, "grad_norm": 1.1097886562347412, "kl": 0.3300345875322819, "learning_rate": 4.7228152251076295e-06, "loss": 0.0132, "num_tokens": 6914304.0, "reward": 0.77337646484375, "reward_std": 0.009435447864234447, "rewards//mean": 0.77337646484375, "rewards//std": 0.024680698290467262, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1602, "grad_norm": 1.193706750869751, "kl": 0.3689091205596924, "learning_rate": 4.7220886216373095e-06, "loss": 0.0148, "num_tokens": 6922960.0, "reward": 0.78070068359375, "reward_std": 0.007954924367368221, "rewards//mean": 0.78070068359375, "rewards//std": 0.030088046565651894, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1604, "grad_norm": 1.4976460933685303, "kl": 0.34658120200037956, "learning_rate": 4.7213611231099575e-06, "loss": 0.0139, "num_tokens": 6931632.0, "reward": 0.7415771484375, "reward_std": 0.007033228408545256, "rewards//mean": 0.7415771484375, "rewards//std": 0.03241577744483948, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1606, "grad_norm": 1.3477445840835571, "kl": 0.35779566690325737, "learning_rate": 4.7206327298186105e-06, "loss": 0.0143, "num_tokens": 6940272.0, "reward": 0.77032470703125, "reward_std": 0.006013727746903896, "rewards//mean": 0.77032470703125, "rewards//std": 0.021487809717655182, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1608, "grad_norm": 2.233660936355591, "kl": 0.324648205190897, "learning_rate": 4.7199034420566656e-06, "loss": 0.013, "num_tokens": 6948856.0, "reward": 0.74658203125, "reward_std": 0.009060757234692574, "rewards//mean": 0.74658203125, "rewards//std": 0.033411283046007156, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.161, "grad_norm": 1.2979011535644531, "kl": 0.35064204037189484, "learning_rate": 4.7191732601178795e-06, "loss": 0.014, "num_tokens": 6957592.0, "reward": 0.7510986328125, "reward_std": 0.007960843853652477, "rewards//mean": 0.7510986328125, "rewards//std": 0.027572816237807274, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1612, "grad_norm": 1.3037084341049194, "kl": 0.37102117761969566, "learning_rate": 4.71844218429637e-06, "loss": 0.0148, "num_tokens": 6966240.0, "reward": 0.77691650390625, "reward_std": 0.013105214573442936, "rewards//mean": 0.77691650390625, "rewards//std": 0.0265219584107399, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1614, "grad_norm": 1.3451786041259766, "kl": 0.3336266949772835, "learning_rate": 4.717710214886614e-06, "loss": 0.0133, "num_tokens": 6974904.0, "reward": 0.7286376953125, "reward_std": 0.009902372024953365, "rewards//mean": 0.7286376953125, "rewards//std": 0.03538607805967331, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1616, "grad_norm": 1.0671733617782593, "kl": 0.3264966905117035, "learning_rate": 4.716977352183449e-06, "loss": 0.0131, "num_tokens": 6983504.0, "reward": 0.773193359375, "reward_std": 0.007287709973752499, "rewards//mean": 0.773193359375, "rewards//std": 0.025993958115577698, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1618, "grad_norm": 1.8360722064971924, "kl": 0.33740856125950813, "learning_rate": 4.716243596482071e-06, "loss": 0.0135, "num_tokens": 6992216.0, "reward": 0.75079345703125, "reward_std": 0.008213981986045837, "rewards//mean": 0.75079345703125, "rewards//std": 0.028178511187434196, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.162, "grad_norm": 1.5714982748031616, "kl": 0.3707769885659218, "learning_rate": 4.715508948078037e-06, "loss": 0.0148, "num_tokens": 7000816.0, "reward": 0.77569580078125, "reward_std": 0.012640546075999737, "rewards//mean": 0.77569580078125, "rewards//std": 0.034434448927640915, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1622, "grad_norm": 1.2873075008392334, "kl": 0.3405948132276535, "learning_rate": 4.714773407267264e-06, "loss": 0.0136, "num_tokens": 7009480.0, "reward": 0.7650146484375, "reward_std": 0.006831423845142126, "rewards//mean": 0.7650146484375, "rewards//std": 0.020054001361131668, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1624, "grad_norm": 0.8567954301834106, "kl": 0.3825432136654854, "learning_rate": 4.714036974346028e-06, "loss": 0.0153, "num_tokens": 7018136.0, "reward": 0.7357177734375, "reward_std": 0.011918211355805397, "rewards//mean": 0.7357177734375, "rewards//std": 0.028866665437817574, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1626, "grad_norm": 1.2704569101333618, "kl": 0.35488636791706085, "learning_rate": 4.7132996496109625e-06, "loss": 0.0142, "num_tokens": 7026824.0, "reward": 0.76324462890625, "reward_std": 0.010413013398647308, "rewards//mean": 0.76324462890625, "rewards//std": 0.025464370846748352, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1628, "grad_norm": 1.061576247215271, "kl": 0.3901236914098263, "learning_rate": 4.712561433359064e-06, "loss": 0.0156, "num_tokens": 7035488.0, "reward": 0.74591064453125, "reward_std": 0.008115001022815704, "rewards//mean": 0.74591064453125, "rewards//std": 0.02554805390536785, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.163, "grad_norm": 1.8980876207351685, "kl": 0.3521212972700596, "learning_rate": 4.7118223258876845e-06, "loss": 0.0141, "num_tokens": 7044072.0, "reward": 0.75384521484375, "reward_std": 0.007846582680940628, "rewards//mean": 0.75384521484375, "rewards//std": 0.016912657767534256, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1632, "grad_norm": 1.1933066844940186, "kl": 0.38137778267264366, "learning_rate": 4.711082327494536e-06, "loss": 0.0153, "num_tokens": 7052744.0, "reward": 0.76531982421875, "reward_std": 0.007578435353934765, "rewards//mean": 0.76531982421875, "rewards//std": 0.02658751606941223, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1634, "grad_norm": 1.466513991355896, "kl": 0.4100740812718868, "learning_rate": 4.710341438477691e-06, "loss": 0.0164, "num_tokens": 7061384.0, "reward": 0.73834228515625, "reward_std": 0.010529394261538982, "rewards//mean": 0.73834228515625, "rewards//std": 0.03200426325201988, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1636, "grad_norm": 1.2618350982666016, "kl": 0.39261148124933243, "learning_rate": 4.709599659135579e-06, "loss": 0.0157, "num_tokens": 7070008.0, "reward": 0.73333740234375, "reward_std": 0.009211428463459015, "rewards//mean": 0.73333740234375, "rewards//std": 0.03585033491253853, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1638, "grad_norm": 1.817918062210083, "kl": 0.37963099777698517, "learning_rate": 4.708856989766988e-06, "loss": 0.0152, "num_tokens": 7078680.0, "reward": 0.78253173828125, "reward_std": 0.008948778733611107, "rewards//mean": 0.78253173828125, "rewards//std": 0.018895337358117104, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.164, "grad_norm": 1.1052753925323486, "kl": 0.3860554024577141, "learning_rate": 4.708113430671066e-06, "loss": 0.0154, "num_tokens": 7087336.0, "reward": 0.73651123046875, "reward_std": 0.007864532060921192, "rewards//mean": 0.73651123046875, "rewards//std": 0.027699550613760948, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1642, "grad_norm": 1.0943959951400757, "kl": 0.4198205694556236, "learning_rate": 4.707368982147318e-06, "loss": 0.0168, "num_tokens": 7096016.0, "reward": 0.75958251953125, "reward_std": 0.008250880986452103, "rewards//mean": 0.75958251953125, "rewards//std": 0.024159371852874756, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1644, "grad_norm": 1.2998090982437134, "kl": 0.4261195734143257, "learning_rate": 4.706623644495608e-06, "loss": 0.017, "num_tokens": 7104736.0, "reward": 0.71905517578125, "reward_std": 0.007329055108129978, "rewards//mean": 0.71905517578125, "rewards//std": 0.03233412653207779, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1646, "grad_norm": 1.3837676048278809, "kl": 0.416591789573431, "learning_rate": 4.705877418016157e-06, "loss": 0.0167, "num_tokens": 7113368.0, "reward": 0.7564697265625, "reward_std": 0.010825035162270069, "rewards//mean": 0.7564697265625, "rewards//std": 0.029000600799918175, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1648, "grad_norm": 1.2816661596298218, "kl": 0.44941718876361847, "learning_rate": 4.705130303009547e-06, "loss": 0.018, "num_tokens": 7122128.0, "reward": 0.77276611328125, "reward_std": 0.008830229751765728, "rewards//mean": 0.77276611328125, "rewards//std": 0.021638043224811554, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.165, "grad_norm": 1.167736530303955, "kl": 0.41115984693169594, "learning_rate": 4.7043822997767145e-06, "loss": 0.0164, "num_tokens": 7130776.0, "reward": 0.7608642578125, "reward_std": 0.007697493769228458, "rewards//mean": 0.7608642578125, "rewards//std": 0.014951195567846298, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1652, "grad_norm": 1.9024914503097534, "kl": 0.44243358448147774, "learning_rate": 4.703633408618955e-06, "loss": 0.0177, "num_tokens": 7139528.0, "reward": 0.739990234375, "reward_std": 0.012601775117218494, "rewards//mean": 0.739990234375, "rewards//std": 0.02922237664461136, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1654, "grad_norm": 1.651120901107788, "kl": 0.41875192895531654, "learning_rate": 4.702883629837922e-06, "loss": 0.0168, "num_tokens": 7148128.0, "reward": 0.71734619140625, "reward_std": 0.008713518269360065, "rewards//mean": 0.71734619140625, "rewards//std": 0.04558815062046051, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1656, "grad_norm": 3.4639272689819336, "kl": 0.44034529104828835, "learning_rate": 4.7021329637356274e-06, "loss": 0.0176, "num_tokens": 7156728.0, "reward": 0.72979736328125, "reward_std": 0.007449123077094555, "rewards//mean": 0.72979736328125, "rewards//std": 0.03378652408719063, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1658, "grad_norm": 1.6177629232406616, "kl": 0.39477837830781937, "learning_rate": 4.701381410614437e-06, "loss": 0.0158, "num_tokens": 7165408.0, "reward": 0.76959228515625, "reward_std": 0.01015202235430479, "rewards//mean": 0.76959228515625, "rewards//std": 0.029647106304764748, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.166, "grad_norm": 1.7870838642120361, "kl": 0.49362414330244064, "learning_rate": 4.700628970777078e-06, "loss": 0.0197, "num_tokens": 7174016.0, "reward": 0.759765625, "reward_std": 0.011450910940766335, "rewards//mean": 0.759765625, "rewards//std": 0.033861320465803146, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1662, "grad_norm": 1.3444162607192993, "kl": 0.45138998702168465, "learning_rate": 4.699875644526633e-06, "loss": 0.0181, "num_tokens": 7182656.0, "reward": 0.752685546875, "reward_std": 0.009241987019777298, "rewards//mean": 0.752685546875, "rewards//std": 0.02500615082681179, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1664, "grad_norm": 2.9015026092529297, "kl": 0.45517536252737045, "learning_rate": 4.699121432166542e-06, "loss": 0.0182, "num_tokens": 7191360.0, "reward": 0.7528076171875, "reward_std": 0.005533740855753422, "rewards//mean": 0.7528076171875, "rewards//std": 0.033112503588199615, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1666, "grad_norm": 1.1061749458312988, "kl": 0.5402024127542973, "learning_rate": 4.6983663340006e-06, "loss": 0.0216, "num_tokens": 7200080.0, "reward": 0.75238037109375, "reward_std": 0.009564736858010292, "rewards//mean": 0.75238037109375, "rewards//std": 0.019030246883630753, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1668, "grad_norm": 0.9863348603248596, "kl": 0.4983534514904022, "learning_rate": 4.697610350332962e-06, "loss": 0.0199, "num_tokens": 7208816.0, "reward": 0.75982666015625, "reward_std": 0.010304231196641922, "rewards//mean": 0.75982666015625, "rewards//std": 0.026097338646650314, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.167, "grad_norm": 0.8810539841651917, "kl": 0.4796448089182377, "learning_rate": 4.696853481468137e-06, "loss": 0.0192, "num_tokens": 7217488.0, "reward": 0.75048828125, "reward_std": 0.009355641901493073, "rewards//mean": 0.75048828125, "rewards//std": 0.030659254640340805, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1672, "grad_norm": 1.3327339887619019, "kl": 0.48946595191955566, "learning_rate": 4.6960957277109945e-06, "loss": 0.0196, "num_tokens": 7226024.0, "reward": 0.75537109375, "reward_std": 0.009866978041827679, "rewards//mean": 0.75537109375, "rewards//std": 0.025596076622605324, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1674, "grad_norm": 0.9784926772117615, "kl": 0.5766041316092014, "learning_rate": 4.695337089366754e-06, "loss": 0.0231, "num_tokens": 7234672.0, "reward": 0.74761962890625, "reward_std": 0.012241121381521225, "rewards//mean": 0.74761962890625, "rewards//std": 0.03139396011829376, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1676, "grad_norm": 1.5746252536773682, "kl": 0.528806246817112, "learning_rate": 4.694577566740996e-06, "loss": 0.0212, "num_tokens": 7243248.0, "reward": 0.72967529296875, "reward_std": 0.00966464914381504, "rewards//mean": 0.72967529296875, "rewards//std": 0.029709843918681145, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1678, "grad_norm": 1.091158390045166, "kl": 0.5902933292090893, "learning_rate": 4.693817160139657e-06, "loss": 0.0236, "num_tokens": 7252024.0, "reward": 0.73333740234375, "reward_std": 0.008134350180625916, "rewards//mean": 0.73333740234375, "rewards//std": 0.02597873844206333, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.168, "grad_norm": 1.2695715427398682, "kl": 0.5501630380749702, "learning_rate": 4.693055869869029e-06, "loss": 0.022, "num_tokens": 7260632.0, "reward": 0.74896240234375, "reward_std": 0.007687639910727739, "rewards//mean": 0.74896240234375, "rewards//std": 0.0311431884765625, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1682, "grad_norm": 2.8090455532073975, "kl": 0.5814741887152195, "learning_rate": 4.692293696235758e-06, "loss": 0.0233, "num_tokens": 7269232.0, "reward": 0.76068115234375, "reward_std": 0.006853449624031782, "rewards//mean": 0.76068115234375, "rewards//std": 0.019774554297327995, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1684, "grad_norm": 0.8583130836486816, "kl": 0.5908543989062309, "learning_rate": 4.6915306395468485e-06, "loss": 0.0236, "num_tokens": 7277888.0, "reward": 0.75555419921875, "reward_std": 0.009005377069115639, "rewards//mean": 0.75555419921875, "rewards//std": 0.018253128975629807, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1686, "grad_norm": 1.7500971555709839, "kl": 0.5774266049265862, "learning_rate": 4.690766700109659e-06, "loss": 0.0231, "num_tokens": 7286528.0, "reward": 0.7679443359375, "reward_std": 0.010070763528347015, "rewards//mean": 0.7679443359375, "rewards//std": 0.02478085085749626, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1688, "grad_norm": 1.1226806640625, "kl": 0.5556729286909103, "learning_rate": 4.690001878231906e-06, "loss": 0.0222, "num_tokens": 7295168.0, "reward": 0.74505615234375, "reward_std": 0.009786233305931091, "rewards//mean": 0.74505615234375, "rewards//std": 0.01919025555253029, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.169, "grad_norm": 1.5926430225372314, "kl": 0.5301719754934311, "learning_rate": 4.689236174221658e-06, "loss": 0.0212, "num_tokens": 7303800.0, "reward": 0.73565673828125, "reward_std": 0.010708114132285118, "rewards//mean": 0.73565673828125, "rewards//std": 0.017935220152139664, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1692, "grad_norm": 1.3184345960617065, "kl": 0.5430991984903812, "learning_rate": 4.688469588387339e-06, "loss": 0.0217, "num_tokens": 7312504.0, "reward": 0.77593994140625, "reward_std": 0.009601364843547344, "rewards//mean": 0.77593994140625, "rewards//std": 0.026642685756087303, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1694, "grad_norm": 0.8678821325302124, "kl": 0.590129129588604, "learning_rate": 4.687702121037734e-06, "loss": 0.0236, "num_tokens": 7321176.0, "reward": 0.740234375, "reward_std": 0.009357824921607971, "rewards//mean": 0.740234375, "rewards//std": 0.026721050962805748, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1696, "grad_norm": 3.635012149810791, "kl": 0.5143630467355251, "learning_rate": 4.6869337724819745e-06, "loss": 0.0206, "num_tokens": 7329792.0, "reward": 0.7623291015625, "reward_std": 0.010166186839342117, "rewards//mean": 0.7623291015625, "rewards//std": 0.02383934147655964, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1698, "grad_norm": 1.2785440683364868, "kl": 0.5269539132714272, "learning_rate": 4.686164543029554e-06, "loss": 0.0211, "num_tokens": 7338440.0, "reward": 0.774169921875, "reward_std": 0.008573425933718681, "rewards//mean": 0.774169921875, "rewards//std": 0.02073865942656994, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.17, "grad_norm": 1.50819730758667, "kl": 0.49373095482587814, "learning_rate": 4.685394432990316e-06, "loss": 0.0197, "num_tokens": 7347016.0, "reward": 0.756591796875, "reward_std": 0.011000225320458412, "rewards//mean": 0.756591796875, "rewards//std": 0.027347072958946228, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1702, "grad_norm": 0.9949831962585449, "kl": 0.5279292948544025, "learning_rate": 4.684623442674463e-06, "loss": 0.0211, "num_tokens": 7355664.0, "reward": 0.69683837890625, "reward_std": 0.0144025394693017, "rewards//mean": 0.69683837890625, "rewards//std": 0.041202642023563385, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1704, "grad_norm": 1.1295602321624756, "kl": 0.5627134665846825, "learning_rate": 4.683851572392548e-06, "loss": 0.0225, "num_tokens": 7364392.0, "reward": 0.7559814453125, "reward_std": 0.01152932457625866, "rewards//mean": 0.7559814453125, "rewards//std": 0.03178766742348671, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1706, "grad_norm": 1.0643706321716309, "kl": 0.5173224434256554, "learning_rate": 4.68307882245548e-06, "loss": 0.0207, "num_tokens": 7373032.0, "reward": 0.7471923828125, "reward_std": 0.010553266853094101, "rewards//mean": 0.7471923828125, "rewards//std": 0.03074578382074833, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1708, "grad_norm": 0.8686037063598633, "kl": 0.5057759210467339, "learning_rate": 4.682305193174524e-06, "loss": 0.0202, "num_tokens": 7381688.0, "reward": 0.75091552734375, "reward_std": 0.014319094829261303, "rewards//mean": 0.75091552734375, "rewards//std": 0.027645394206047058, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.171, "grad_norm": 2.133981943130493, "kl": 0.5340125001966953, "learning_rate": 4.681530684861298e-06, "loss": 0.0214, "num_tokens": 7390344.0, "reward": 0.77490234375, "reward_std": 0.012180844321846962, "rewards//mean": 0.77490234375, "rewards//std": 0.020534737035632133, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1712, "grad_norm": 1.4045356512069702, "kl": 0.5420391000807285, "learning_rate": 4.680755297827772e-06, "loss": 0.0217, "num_tokens": 7398960.0, "reward": 0.76593017578125, "reward_std": 0.007923927158117294, "rewards//mean": 0.76593017578125, "rewards//std": 0.02088331989943981, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1714, "grad_norm": 0.8970532417297363, "kl": 0.49403341114521027, "learning_rate": 4.6799790323862735e-06, "loss": 0.0198, "num_tokens": 7407632.0, "reward": 0.7232666015625, "reward_std": 0.01003449596464634, "rewards//mean": 0.7232666015625, "rewards//std": 0.029150541871786118, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1716, "grad_norm": 1.2930840253829956, "kl": 0.4648851417005062, "learning_rate": 4.679201888849481e-06, "loss": 0.0186, "num_tokens": 7416224.0, "reward": 0.77532958984375, "reward_std": 0.008281329646706581, "rewards//mean": 0.77532958984375, "rewards//std": 0.02078668773174286, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1718, "grad_norm": 0.9651484489440918, "kl": 0.44804292544722557, "learning_rate": 4.678423867530428e-06, "loss": 0.0179, "num_tokens": 7424872.0, "reward": 0.77166748046875, "reward_std": 0.009435847401618958, "rewards//mean": 0.77166748046875, "rewards//std": 0.024149343371391296, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.172, "grad_norm": 1.372725486755371, "kl": 0.4652128331363201, "learning_rate": 4.677644968742503e-06, "loss": 0.0186, "num_tokens": 7433544.0, "reward": 0.77142333984375, "reward_std": 0.011212656274437904, "rewards//mean": 0.77142333984375, "rewards//std": 0.028442122042179108, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1722, "grad_norm": 1.1838834285736084, "kl": 0.4007790870964527, "learning_rate": 4.676865192799443e-06, "loss": 0.016, "num_tokens": 7442144.0, "reward": 0.75579833984375, "reward_std": 0.008447399362921715, "rewards//mean": 0.75579833984375, "rewards//std": 0.024019237607717514, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1724, "grad_norm": 0.6647836565971375, "kl": 0.45183153450489044, "learning_rate": 4.676084540015345e-06, "loss": 0.0181, "num_tokens": 7450776.0, "reward": 0.75628662109375, "reward_std": 0.009683351032435894, "rewards//mean": 0.75628662109375, "rewards//std": 0.026084575802087784, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1726, "grad_norm": 1.740395426750183, "kl": 0.4510505050420761, "learning_rate": 4.675303010704654e-06, "loss": 0.018, "num_tokens": 7459440.0, "reward": 0.7762451171875, "reward_std": 0.01001917663961649, "rewards//mean": 0.7762451171875, "rewards//std": 0.022178800776600838, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1728, "grad_norm": 0.8341839909553528, "kl": 0.42043209448456764, "learning_rate": 4.674520605182171e-06, "loss": 0.0168, "num_tokens": 7468160.0, "reward": 0.7720947265625, "reward_std": 0.010514896363019943, "rewards//mean": 0.7720947265625, "rewards//std": 0.02007814310491085, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.173, "grad_norm": 0.6752360463142395, "kl": 0.43009642139077187, "learning_rate": 4.673737323763048e-06, "loss": 0.0172, "num_tokens": 7476752.0, "reward": 0.76568603515625, "reward_std": 0.00976257212460041, "rewards//mean": 0.76568603515625, "rewards//std": 0.015445588156580925, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1732, "grad_norm": 0.6788051128387451, "kl": 0.4369927644729614, "learning_rate": 4.672953166762791e-06, "loss": 0.0175, "num_tokens": 7485448.0, "reward": 0.76446533203125, "reward_std": 0.012529331259429455, "rewards//mean": 0.76446533203125, "rewards//std": 0.025926828384399414, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1734, "grad_norm": 0.6571235060691833, "kl": 0.45430251583456993, "learning_rate": 4.672168134497258e-06, "loss": 0.0182, "num_tokens": 7494104.0, "reward": 0.720947265625, "reward_std": 0.015459833666682243, "rewards//mean": 0.720947265625, "rewards//std": 0.031657155603170395, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1736, "grad_norm": 0.6456576585769653, "kl": 0.4099312573671341, "learning_rate": 4.671382227282661e-06, "loss": 0.0164, "num_tokens": 7502752.0, "reward": 0.75372314453125, "reward_std": 0.011794820427894592, "rewards//mean": 0.75372314453125, "rewards//std": 0.02672494389116764, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1738, "grad_norm": 0.8506948947906494, "kl": 0.41736486181616783, "learning_rate": 4.670595445435561e-06, "loss": 0.0167, "num_tokens": 7511400.0, "reward": 0.7132568359375, "reward_std": 0.010492125526070595, "rewards//mean": 0.7132568359375, "rewards//std": 0.03419385850429535, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.174, "grad_norm": 0.5586651563644409, "kl": 0.40891773998737335, "learning_rate": 4.669807789272877e-06, "loss": 0.0164, "num_tokens": 7520040.0, "reward": 0.76055908203125, "reward_std": 0.009399401023983955, "rewards//mean": 0.76055908203125, "rewards//std": 0.025711068883538246, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1742, "grad_norm": 0.8430010080337524, "kl": 0.37954793870449066, "learning_rate": 4.669019259111873e-06, "loss": 0.0152, "num_tokens": 7528672.0, "reward": 0.77191162109375, "reward_std": 0.011711275205016136, "rewards//mean": 0.77191162109375, "rewards//std": 0.022103851661086082, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1744, "grad_norm": 0.6705936789512634, "kl": 0.3731926344335079, "learning_rate": 4.668229855270172e-06, "loss": 0.0149, "num_tokens": 7537480.0, "reward": 0.76348876953125, "reward_std": 0.017561282962560654, "rewards//mean": 0.76348876953125, "rewards//std": 0.03937086835503578, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1746, "grad_norm": 0.5107825994491577, "kl": 0.3830333612859249, "learning_rate": 4.667439578065745e-06, "loss": 0.0153, "num_tokens": 7546136.0, "reward": 0.76861572265625, "reward_std": 0.008930927142500877, "rewards//mean": 0.76861572265625, "rewards//std": 0.0147271528840065, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1748, "grad_norm": 0.6749988794326782, "kl": 0.32537274435162544, "learning_rate": 4.666648427816914e-06, "loss": 0.013, "num_tokens": 7554760.0, "reward": 0.7623291015625, "reward_std": 0.014128495939075947, "rewards//mean": 0.7623291015625, "rewards//std": 0.0285969115793705, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.175, "grad_norm": 0.5768765807151794, "kl": 0.3486756831407547, "learning_rate": 4.665856404842356e-06, "loss": 0.0139, "num_tokens": 7563360.0, "reward": 0.76446533203125, "reward_std": 0.010931688360869884, "rewards//mean": 0.76446533203125, "rewards//std": 0.028379783034324646, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1752, "grad_norm": 0.5427371859550476, "kl": 0.3550872132182121, "learning_rate": 4.665063509461098e-06, "loss": 0.0142, "num_tokens": 7572032.0, "reward": 0.74072265625, "reward_std": 0.008837481960654259, "rewards//mean": 0.74072265625, "rewards//std": 0.029343342408537865, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1754, "grad_norm": 0.607874870300293, "kl": 0.3369791731238365, "learning_rate": 4.664269741992516e-06, "loss": 0.0135, "num_tokens": 7580640.0, "reward": 0.73492431640625, "reward_std": 0.009598618373274803, "rewards//mean": 0.73492431640625, "rewards//std": 0.032740283757448196, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1756, "grad_norm": 0.5573941469192505, "kl": 0.33512082695961, "learning_rate": 4.663475102756341e-06, "loss": 0.0134, "num_tokens": 7589144.0, "reward": 0.721923828125, "reward_std": 0.008135411888360977, "rewards//mean": 0.721923828125, "rewards//std": 0.031764086335897446, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1758, "grad_norm": 0.5206130146980286, "kl": 0.33247267454862595, "learning_rate": 4.662679592072653e-06, "loss": 0.0133, "num_tokens": 7597784.0, "reward": 0.738037109375, "reward_std": 0.011560275219380856, "rewards//mean": 0.738037109375, "rewards//std": 0.02845815010368824, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.176, "grad_norm": 0.4959293305873871, "kl": 0.3310950994491577, "learning_rate": 4.661883210261884e-06, "loss": 0.0132, "num_tokens": 7606448.0, "reward": 0.74853515625, "reward_std": 0.009194498881697655, "rewards//mean": 0.74853515625, "rewards//std": 0.024185124784708023, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1762, "grad_norm": 0.5584758520126343, "kl": 0.31146182119846344, "learning_rate": 4.661085957644817e-06, "loss": 0.0125, "num_tokens": 7615096.0, "reward": 0.7435302734375, "reward_std": 0.011724255979061127, "rewards//mean": 0.7435302734375, "rewards//std": 0.02828180231153965, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1764, "grad_norm": 0.5160636901855469, "kl": 0.31183508411049843, "learning_rate": 4.660287834542585e-06, "loss": 0.0125, "num_tokens": 7623656.0, "reward": 0.75146484375, "reward_std": 0.009842973202466965, "rewards//mean": 0.75146484375, "rewards//std": 0.026406485587358475, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1766, "grad_norm": 0.5011554956436157, "kl": 0.27698101848363876, "learning_rate": 4.659488841276671e-06, "loss": 0.0111, "num_tokens": 7632288.0, "reward": 0.73583984375, "reward_std": 0.00816989317536354, "rewards//mean": 0.73583984375, "rewards//std": 0.03832703083753586, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1768, "grad_norm": 0.5525965690612793, "kl": 0.3069544155150652, "learning_rate": 4.65868897816891e-06, "loss": 0.0123, "num_tokens": 7640848.0, "reward": 0.71710205078125, "reward_std": 0.008834538981318474, "rewards//mean": 0.71710205078125, "rewards//std": 0.024956993758678436, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.177, "grad_norm": 0.7361370325088501, "kl": 0.3224721159785986, "learning_rate": 4.6578882455414865e-06, "loss": 0.0129, "num_tokens": 7649536.0, "reward": 0.72198486328125, "reward_std": 0.01567103900015354, "rewards//mean": 0.72198486328125, "rewards//std": 0.03261938691139221, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1772, "grad_norm": 0.5023349523544312, "kl": 0.29884358681738377, "learning_rate": 4.657086643716937e-06, "loss": 0.012, "num_tokens": 7658176.0, "reward": 0.762939453125, "reward_std": 0.007250561378896236, "rewards//mean": 0.762939453125, "rewards//std": 0.022850144654512405, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1774, "grad_norm": 0.5194371342658997, "kl": 0.28458988294005394, "learning_rate": 4.656284173018144e-06, "loss": 0.0114, "num_tokens": 7666712.0, "reward": 0.76953125, "reward_std": 0.01195722445845604, "rewards//mean": 0.76953125, "rewards//std": 0.024160075932741165, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1776, "grad_norm": 0.46291840076446533, "kl": 0.281252883374691, "learning_rate": 4.655480833768344e-06, "loss": 0.0113, "num_tokens": 7675416.0, "reward": 0.7490234375, "reward_std": 0.009653441607952118, "rewards//mean": 0.7490234375, "rewards//std": 0.04129883274435997, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1778, "grad_norm": 0.5779797434806824, "kl": 0.29215819016098976, "learning_rate": 4.654676626291123e-06, "loss": 0.0117, "num_tokens": 7684048.0, "reward": 0.75439453125, "reward_std": 0.010933919809758663, "rewards//mean": 0.75439453125, "rewards//std": 0.03273756802082062, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.178, "grad_norm": 0.8437680602073669, "kl": 0.2626949865370989, "learning_rate": 4.653871550910414e-06, "loss": 0.0105, "num_tokens": 7692680.0, "reward": 0.7613525390625, "reward_std": 0.010730916634202003, "rewards//mean": 0.7613525390625, "rewards//std": 0.02644515223801136, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1782, "grad_norm": 0.5432496666908264, "kl": 0.2565987464040518, "learning_rate": 4.653065607950502e-06, "loss": 0.0103, "num_tokens": 7701288.0, "reward": 0.7493896484375, "reward_std": 0.013223410584032536, "rewards//mean": 0.7493896484375, "rewards//std": 0.03801432624459267, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1784, "grad_norm": 0.6957722902297974, "kl": 0.25557348132133484, "learning_rate": 4.65225879773602e-06, "loss": 0.0102, "num_tokens": 7709928.0, "reward": 0.76568603515625, "reward_std": 0.008938233368098736, "rewards//mean": 0.76568603515625, "rewards//std": 0.023193299770355225, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1786, "grad_norm": 0.526515781879425, "kl": 0.2758950814604759, "learning_rate": 4.651451120591952e-06, "loss": 0.011, "num_tokens": 7718504.0, "reward": 0.760009765625, "reward_std": 0.008625369518995285, "rewards//mean": 0.760009765625, "rewards//std": 0.0195234976708889, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1788, "grad_norm": 0.6605725884437561, "kl": 0.270997678861022, "learning_rate": 4.650642576843631e-06, "loss": 0.0108, "num_tokens": 7727272.0, "reward": 0.76007080078125, "reward_std": 0.01003330573439598, "rewards//mean": 0.76007080078125, "rewards//std": 0.03080352023243904, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.179, "grad_norm": 0.5448390245437622, "kl": 0.252898957580328, "learning_rate": 4.649833166816736e-06, "loss": 0.0101, "num_tokens": 7736008.0, "reward": 0.73876953125, "reward_std": 0.011466844007372856, "rewards//mean": 0.73876953125, "rewards//std": 0.03744480386376381, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1792, "grad_norm": 0.6046472191810608, "kl": 0.2753715682774782, "learning_rate": 4.649022890837298e-06, "loss": 0.011, "num_tokens": 7744680.0, "reward": 0.749267578125, "reward_std": 0.009591508656740189, "rewards//mean": 0.749267578125, "rewards//std": 0.02788209356367588, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1794, "grad_norm": 0.8163694739341736, "kl": 0.2629574202001095, "learning_rate": 4.648211749231698e-06, "loss": 0.0105, "num_tokens": 7753328.0, "reward": 0.78839111328125, "reward_std": 0.00975382886826992, "rewards//mean": 0.78839111328125, "rewards//std": 0.02201532945036888, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1796, "grad_norm": 0.8590662479400635, "kl": 0.2539634648710489, "learning_rate": 4.6473997423266615e-06, "loss": 0.0102, "num_tokens": 7761912.0, "reward": 0.765380859375, "reward_std": 0.007768705021589994, "rewards//mean": 0.765380859375, "rewards//std": 0.016126545146107674, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1798, "grad_norm": 0.5974591970443726, "kl": 0.26154331117868423, "learning_rate": 4.646586870449266e-06, "loss": 0.0105, "num_tokens": 7770584.0, "reward": 0.761962890625, "reward_std": 0.013042537495493889, "rewards//mean": 0.761962890625, "rewards//std": 0.034868303686380386, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.18, "grad_norm": 0.8781308531761169, "kl": 0.2626851834356785, "learning_rate": 4.645773133926936e-06, "loss": 0.0105, "num_tokens": 7779176.0, "reward": 0.75445556640625, "reward_std": 0.010092251002788544, "rewards//mean": 0.75445556640625, "rewards//std": 0.03166474774479866, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1802, "grad_norm": 0.6971238255500793, "kl": 0.25614192336797714, "learning_rate": 4.644958533087443e-06, "loss": 0.0102, "num_tokens": 7787928.0, "reward": 0.75091552734375, "reward_std": 0.008847690187394619, "rewards//mean": 0.75091552734375, "rewards//std": 0.024505889043211937, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1804, "grad_norm": 0.6750687956809998, "kl": 0.26914981193840504, "learning_rate": 4.64414306825891e-06, "loss": 0.0108, "num_tokens": 7796560.0, "reward": 0.7652587890625, "reward_std": 0.012014053761959076, "rewards//mean": 0.7652587890625, "rewards//std": 0.035356976091861725, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1806, "grad_norm": 0.6785114407539368, "kl": 0.2674534786492586, "learning_rate": 4.643326739769805e-06, "loss": 0.0107, "num_tokens": 7805264.0, "reward": 0.72943115234375, "reward_std": 0.011744974181056023, "rewards//mean": 0.72943115234375, "rewards//std": 0.037007205188274384, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1808, "grad_norm": 0.6502825617790222, "kl": 0.2490905374288559, "learning_rate": 4.642509547948947e-06, "loss": 0.01, "num_tokens": 7813920.0, "reward": 0.7679443359375, "reward_std": 0.009585111401975155, "rewards//mean": 0.7679443359375, "rewards//std": 0.020946016535162926, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.181, "grad_norm": 0.8235167860984802, "kl": 0.27164326049387455, "learning_rate": 4.6416914931254984e-06, "loss": 0.0109, "num_tokens": 7822416.0, "reward": 0.70794677734375, "reward_std": 0.010866380296647549, "rewards//mean": 0.70794677734375, "rewards//std": 0.040611300617456436, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1812, "grad_norm": 1.0692238807678223, "kl": 0.28652898594737053, "learning_rate": 4.640872575628973e-06, "loss": 0.0115, "num_tokens": 7831000.0, "reward": 0.728271484375, "reward_std": 0.01055578701198101, "rewards//mean": 0.728271484375, "rewards//std": 0.03541964292526245, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1814, "grad_norm": 0.9947613477706909, "kl": 0.2775047402828932, "learning_rate": 4.6400527957892295e-06, "loss": 0.0111, "num_tokens": 7839616.0, "reward": 0.76861572265625, "reward_std": 0.01045004278421402, "rewards//mean": 0.76861572265625, "rewards//std": 0.025683382526040077, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1816, "grad_norm": 0.7290713787078857, "kl": 0.27869971096515656, "learning_rate": 4.639232153936476e-06, "loss": 0.0111, "num_tokens": 7848272.0, "reward": 0.78680419921875, "reward_std": 0.007229713257402182, "rewards//mean": 0.78680419921875, "rewards//std": 0.03553522005677223, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1818, "grad_norm": 0.7138441801071167, "kl": 0.27684359066188335, "learning_rate": 4.638410650401267e-06, "loss": 0.0111, "num_tokens": 7856824.0, "reward": 0.75946044921875, "reward_std": 0.009466202929615974, "rewards//mean": 0.75946044921875, "rewards//std": 0.018594926223158836, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.182, "grad_norm": 0.7739296555519104, "kl": 0.2615481149405241, "learning_rate": 4.637588285514504e-06, "loss": 0.0105, "num_tokens": 7865384.0, "reward": 0.7427978515625, "reward_std": 0.008604985661804676, "rewards//mean": 0.7427978515625, "rewards//std": 0.02929145097732544, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1822, "grad_norm": 0.9076600670814514, "kl": 0.2758956626057625, "learning_rate": 4.636765059607434e-06, "loss": 0.011, "num_tokens": 7874016.0, "reward": 0.765380859375, "reward_std": 0.008405257016420364, "rewards//mean": 0.765380859375, "rewards//std": 0.02437838539481163, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1824, "grad_norm": 0.7445156574249268, "kl": 0.2667129561305046, "learning_rate": 4.6359409730116546e-06, "loss": 0.0107, "num_tokens": 7882688.0, "reward": 0.78369140625, "reward_std": 0.0080318758264184, "rewards//mean": 0.78369140625, "rewards//std": 0.023983998224139214, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1826, "grad_norm": 0.8461691737174988, "kl": 0.2566282209008932, "learning_rate": 4.635116026059107e-06, "loss": 0.0103, "num_tokens": 7891408.0, "reward": 0.76092529296875, "reward_std": 0.006388828158378601, "rewards//mean": 0.76092529296875, "rewards//std": 0.024912068620324135, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1828, "grad_norm": 1.0944255590438843, "kl": 0.27355407923460007, "learning_rate": 4.634290219082078e-06, "loss": 0.0109, "num_tokens": 7900024.0, "reward": 0.7568359375, "reward_std": 0.00810672901570797, "rewards//mean": 0.7568359375, "rewards//std": 0.029487434774637222, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.183, "grad_norm": 1.0639708042144775, "kl": 0.28650105744600296, "learning_rate": 4.633463552413205e-06, "loss": 0.0115, "num_tokens": 7908696.0, "reward": 0.76947021484375, "reward_std": 0.006652969866991043, "rewards//mean": 0.76947021484375, "rewards//std": 0.024434128776192665, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1832, "grad_norm": 0.8133704662322998, "kl": 0.2669271398335695, "learning_rate": 4.632636026385468e-06, "loss": 0.0107, "num_tokens": 7917392.0, "reward": 0.7740478515625, "reward_std": 0.011425754986703396, "rewards//mean": 0.7740478515625, "rewards//std": 0.01731915958225727, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1834, "grad_norm": 1.5723562240600586, "kl": 0.27988065406680107, "learning_rate": 4.631807641332195e-06, "loss": 0.0112, "num_tokens": 7926048.0, "reward": 0.734375, "reward_std": 0.005926807411015034, "rewards//mean": 0.734375, "rewards//std": 0.04288693889975548, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1836, "grad_norm": 1.0416011810302734, "kl": 0.3021409697830677, "learning_rate": 4.630978397587058e-06, "loss": 0.0121, "num_tokens": 7935016.0, "reward": 0.77471923828125, "reward_std": 0.00614708149805665, "rewards//mean": 0.77471923828125, "rewards//std": 0.040547508746385574, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1838, "grad_norm": 0.7368637919425964, "kl": 0.2729188334196806, "learning_rate": 4.630148295484078e-06, "loss": 0.0109, "num_tokens": 7943656.0, "reward": 0.74053955078125, "reward_std": 0.007835205644369125, "rewards//mean": 0.74053955078125, "rewards//std": 0.025862522423267365, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.184, "grad_norm": 1.2126497030258179, "kl": 0.279952809214592, "learning_rate": 4.62931733535762e-06, "loss": 0.0112, "num_tokens": 7952288.0, "reward": 0.73394775390625, "reward_std": 0.007915819063782692, "rewards//mean": 0.73394775390625, "rewards//std": 0.030266623944044113, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1842, "grad_norm": 1.5177454948425293, "kl": 0.28335076197981834, "learning_rate": 4.628485517542393e-06, "loss": 0.0113, "num_tokens": 7960904.0, "reward": 0.74371337890625, "reward_std": 0.008045843802392483, "rewards//mean": 0.74371337890625, "rewards//std": 0.03372957557439804, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1844, "grad_norm": 0.8178320527076721, "kl": 0.2918772976845503, "learning_rate": 4.627652842373454e-06, "loss": 0.0117, "num_tokens": 7969416.0, "reward": 0.77789306640625, "reward_std": 0.009983016178011894, "rewards//mean": 0.77789306640625, "rewards//std": 0.024058280512690544, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1846, "grad_norm": 1.0449796915054321, "kl": 0.3009101003408432, "learning_rate": 4.626819310186204e-06, "loss": 0.012, "num_tokens": 7978048.0, "reward": 0.76824951171875, "reward_std": 0.009645262733101845, "rewards//mean": 0.76824951171875, "rewards//std": 0.027419421821832657, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1848, "grad_norm": 0.9068644642829895, "kl": 0.2759493812918663, "learning_rate": 4.625984921316392e-06, "loss": 0.011, "num_tokens": 7986648.0, "reward": 0.76788330078125, "reward_std": 0.011120768263936043, "rewards//mean": 0.76788330078125, "rewards//std": 0.02279633842408657, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.185, "grad_norm": 1.1586679220199585, "kl": 0.2916589751839638, "learning_rate": 4.625149676100107e-06, "loss": 0.0117, "num_tokens": 7995320.0, "reward": 0.76434326171875, "reward_std": 0.0063139949925243855, "rewards//mean": 0.76434326171875, "rewards//std": 0.022053800523281097, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1852, "grad_norm": 0.9030797481536865, "kl": 0.28806450217962265, "learning_rate": 4.624313574873787e-06, "loss": 0.0115, "num_tokens": 8003952.0, "reward": 0.7706298828125, "reward_std": 0.010051582008600235, "rewards//mean": 0.7706298828125, "rewards//std": 0.03246990218758583, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1854, "grad_norm": 0.8641966581344604, "kl": 0.29965170845389366, "learning_rate": 4.623476617974212e-06, "loss": 0.012, "num_tokens": 8012560.0, "reward": 0.76129150390625, "reward_std": 0.006479769945144653, "rewards//mean": 0.76129150390625, "rewards//std": 0.02804011106491089, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1856, "grad_norm": 0.9614176154136658, "kl": 0.30398988723754883, "learning_rate": 4.62263880573851e-06, "loss": 0.0122, "num_tokens": 8021216.0, "reward": 0.77435302734375, "reward_std": 0.00971104484051466, "rewards//mean": 0.77435302734375, "rewards//std": 0.02430741675198078, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1858, "grad_norm": 0.7756906747817993, "kl": 0.30807607248425484, "learning_rate": 4.6218001385041504e-06, "loss": 0.0123, "num_tokens": 8029968.0, "reward": 0.74884033203125, "reward_std": 0.012052865698933601, "rewards//mean": 0.74884033203125, "rewards//std": 0.03312136232852936, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.186, "grad_norm": 1.1806690692901611, "kl": 0.2854270339012146, "learning_rate": 4.6209606166089495e-06, "loss": 0.0114, "num_tokens": 8038632.0, "reward": 0.75872802734375, "reward_std": 0.006012373138219118, "rewards//mean": 0.75872802734375, "rewards//std": 0.02449600212275982, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1862, "grad_norm": 0.8380911350250244, "kl": 0.2782778702676296, "learning_rate": 4.620120240391065e-06, "loss": 0.0111, "num_tokens": 8047248.0, "reward": 0.74365234375, "reward_std": 0.009586180560290813, "rewards//mean": 0.74365234375, "rewards//std": 0.023172486573457718, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1864, "grad_norm": 1.0672872066497803, "kl": 0.29075466096401215, "learning_rate": 4.619279010189002e-06, "loss": 0.0116, "num_tokens": 8055896.0, "reward": 0.737060546875, "reward_std": 0.008078483864665031, "rewards//mean": 0.737060546875, "rewards//std": 0.03260691091418266, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1866, "grad_norm": 0.937512993812561, "kl": 0.2672659792006016, "learning_rate": 4.618436926341607e-06, "loss": 0.0107, "num_tokens": 8064552.0, "reward": 0.77001953125, "reward_std": 0.01028747484087944, "rewards//mean": 0.77001953125, "rewards//std": 0.021871235221624374, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1868, "grad_norm": 0.7125497460365295, "kl": 0.2997877672314644, "learning_rate": 4.617593989188071e-06, "loss": 0.012, "num_tokens": 8073232.0, "reward": 0.7581787109375, "reward_std": 0.009922824800014496, "rewards//mean": 0.7581787109375, "rewards//std": 0.03353586792945862, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.187, "grad_norm": 0.9831026196479797, "kl": 0.30767880380153656, "learning_rate": 4.616750199067929e-06, "loss": 0.0123, "num_tokens": 8081840.0, "reward": 0.7542724609375, "reward_std": 0.012375378049910069, "rewards//mean": 0.7542724609375, "rewards//std": 0.03080284409224987, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1872, "grad_norm": 1.1639822721481323, "kl": 0.3031549211591482, "learning_rate": 4.615905556321061e-06, "loss": 0.0121, "num_tokens": 8090544.0, "reward": 0.75262451171875, "reward_std": 0.01033235713839531, "rewards//mean": 0.75262451171875, "rewards//std": 0.027613069862127304, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1874, "grad_norm": 0.8558388352394104, "kl": 0.30340132489800453, "learning_rate": 4.615060061287688e-06, "loss": 0.0121, "num_tokens": 8099208.0, "reward": 0.7528076171875, "reward_std": 0.013534157536923885, "rewards//mean": 0.7528076171875, "rewards//std": 0.027782833203673363, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1876, "grad_norm": 0.845984160900116, "kl": 0.29561108350753784, "learning_rate": 4.614213714308374e-06, "loss": 0.0118, "num_tokens": 8107856.0, "reward": 0.75341796875, "reward_std": 0.011012168601155281, "rewards//mean": 0.75341796875, "rewards//std": 0.02374039590358734, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1878, "grad_norm": 0.8777411580085754, "kl": 0.29283004254102707, "learning_rate": 4.6133665157240306e-06, "loss": 0.0117, "num_tokens": 8116456.0, "reward": 0.71600341796875, "reward_std": 0.008553056046366692, "rewards//mean": 0.71600341796875, "rewards//std": 0.0435798205435276, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.188, "grad_norm": 0.9265841841697693, "kl": 0.29388684406876564, "learning_rate": 4.612518465875906e-06, "loss": 0.0118, "num_tokens": 8125016.0, "reward": 0.7520751953125, "reward_std": 0.009404158219695091, "rewards//mean": 0.7520751953125, "rewards//std": 0.03676244989037514, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1882, "grad_norm": 0.8138207793235779, "kl": 0.3315567076206207, "learning_rate": 4.611669565105597e-06, "loss": 0.0133, "num_tokens": 8133600.0, "reward": 0.7581787109375, "reward_std": 0.007814617827534676, "rewards//mean": 0.7581787109375, "rewards//std": 0.029054835438728333, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1884, "grad_norm": 0.895754337310791, "kl": 0.2886655665934086, "learning_rate": 4.610819813755038e-06, "loss": 0.0115, "num_tokens": 8142232.0, "reward": 0.7618408203125, "reward_std": 0.008037852123379707, "rewards//mean": 0.7618408203125, "rewards//std": 0.018055196851491928, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1886, "grad_norm": 1.0638179779052734, "kl": 0.33327390998601913, "learning_rate": 4.609969212166512e-06, "loss": 0.0133, "num_tokens": 8150976.0, "reward": 0.7430419921875, "reward_std": 0.008485731668770313, "rewards//mean": 0.7430419921875, "rewards//std": 0.023328416049480438, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1888, "grad_norm": 1.1232788562774658, "kl": 0.31669921800494194, "learning_rate": 4.609117760682639e-06, "loss": 0.0127, "num_tokens": 8159640.0, "reward": 0.7677001953125, "reward_std": 0.010280384682118893, "rewards//mean": 0.7677001953125, "rewards//std": 0.039260219782590866, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.189, "grad_norm": 1.87589430809021, "kl": 0.2973406966775656, "learning_rate": 4.608265459646384e-06, "loss": 0.0119, "num_tokens": 8168288.0, "reward": 0.73162841796875, "reward_std": 0.008025545626878738, "rewards//mean": 0.73162841796875, "rewards//std": 0.028157014399766922, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1892, "grad_norm": 0.7903565168380737, "kl": 0.2874004878103733, "learning_rate": 4.607412309401054e-06, "loss": 0.0115, "num_tokens": 8176928.0, "reward": 0.77008056640625, "reward_std": 0.006889053154736757, "rewards//mean": 0.77008056640625, "rewards//std": 0.02101699262857437, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1894, "grad_norm": 1.0100493431091309, "kl": 0.28360038809478283, "learning_rate": 4.606558310290298e-06, "loss": 0.0113, "num_tokens": 8185472.0, "reward": 0.7728271484375, "reward_std": 0.007974790409207344, "rewards//mean": 0.7728271484375, "rewards//std": 0.021634282544255257, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1896, "grad_norm": 0.9846349358558655, "kl": 0.3387889303267002, "learning_rate": 4.605703462658107e-06, "loss": 0.0136, "num_tokens": 8194096.0, "reward": 0.77178955078125, "reward_std": 0.01358504593372345, "rewards//mean": 0.77178955078125, "rewards//std": 0.02849157713353634, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1898, "grad_norm": 1.747856616973877, "kl": 0.33013427443802357, "learning_rate": 4.604847766848812e-06, "loss": 0.0132, "num_tokens": 8202632.0, "reward": 0.7506103515625, "reward_std": 0.011012842878699303, "rewards//mean": 0.7506103515625, "rewards//std": 0.0204012431204319, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.19, "grad_norm": 0.925475537776947, "kl": 0.31422964110970497, "learning_rate": 4.60399122320709e-06, "loss": 0.0126, "num_tokens": 8211200.0, "reward": 0.76910400390625, "reward_std": 0.009690100327134132, "rewards//mean": 0.76910400390625, "rewards//std": 0.018468312919139862, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1902, "grad_norm": 1.132817268371582, "kl": 0.3214902691543102, "learning_rate": 4.603133832077953e-06, "loss": 0.0129, "num_tokens": 8219872.0, "reward": 0.7730712890625, "reward_std": 0.01109931617975235, "rewards//mean": 0.7730712890625, "rewards//std": 0.030335379764437675, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1904, "grad_norm": 0.914539635181427, "kl": 0.3235311582684517, "learning_rate": 4.602275593806761e-06, "loss": 0.0129, "num_tokens": 8228504.0, "reward": 0.76373291015625, "reward_std": 0.008837835863232613, "rewards//mean": 0.76373291015625, "rewards//std": 0.01813581772148609, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1906, "grad_norm": 0.9955701231956482, "kl": 0.34584128484129906, "learning_rate": 4.601416508739211e-06, "loss": 0.0138, "num_tokens": 8237192.0, "reward": 0.74029541015625, "reward_std": 0.012629736214876175, "rewards//mean": 0.74029541015625, "rewards//std": 0.027743780985474586, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1908, "grad_norm": 0.9932671785354614, "kl": 0.31934236362576485, "learning_rate": 4.600556577221342e-06, "loss": 0.0128, "num_tokens": 8245880.0, "reward": 0.73089599609375, "reward_std": 0.008857986889779568, "rewards//mean": 0.73089599609375, "rewards//std": 0.031467169523239136, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.191, "grad_norm": 1.2234418392181396, "kl": 0.30861231684684753, "learning_rate": 4.599695799599537e-06, "loss": 0.0123, "num_tokens": 8254472.0, "reward": 0.78582763671875, "reward_std": 0.00961345061659813, "rewards//mean": 0.78582763671875, "rewards//std": 0.023318924009799957, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1912, "grad_norm": 1.053141474723816, "kl": 0.3571406714618206, "learning_rate": 4.5988341762205125e-06, "loss": 0.0143, "num_tokens": 8263168.0, "reward": 0.76080322265625, "reward_std": 0.008980470709502697, "rewards//mean": 0.76080322265625, "rewards//std": 0.023195909336209297, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1914, "grad_norm": 1.2716294527053833, "kl": 0.3623424470424652, "learning_rate": 4.5979717074313336e-06, "loss": 0.0145, "num_tokens": 8271784.0, "reward": 0.7208251953125, "reward_std": 0.011484457179903984, "rewards//mean": 0.7208251953125, "rewards//std": 0.04424877464771271, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1916, "grad_norm": 1.2183886766433716, "kl": 0.3608044385910034, "learning_rate": 4.5971083935794026e-06, "loss": 0.0144, "num_tokens": 8280496.0, "reward": 0.74468994140625, "reward_std": 0.00796246062964201, "rewards//mean": 0.74468994140625, "rewards//std": 0.03260313719511032, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1918, "grad_norm": 1.0903003215789795, "kl": 0.3438631668686867, "learning_rate": 4.5962442350124605e-06, "loss": 0.0138, "num_tokens": 8289160.0, "reward": 0.75726318359375, "reward_std": 0.008945617824792862, "rewards//mean": 0.75726318359375, "rewards//std": 0.02822735533118248, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.192, "grad_norm": 1.063489556312561, "kl": 0.3416360281407833, "learning_rate": 4.595379232078592e-06, "loss": 0.0137, "num_tokens": 8297768.0, "reward": 0.72540283203125, "reward_std": 0.010138707235455513, "rewards//mean": 0.72540283203125, "rewards//std": 0.03077893704175949, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1922, "grad_norm": 0.9618769884109497, "kl": 0.3541461080312729, "learning_rate": 4.5945133851262185e-06, "loss": 0.0142, "num_tokens": 8306440.0, "reward": 0.7489013671875, "reward_std": 0.008929513394832611, "rewards//mean": 0.7489013671875, "rewards//std": 0.029630571603775024, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1924, "grad_norm": 1.2882936000823975, "kl": 0.36649367958307266, "learning_rate": 4.593646694504105e-06, "loss": 0.0147, "num_tokens": 8315072.0, "reward": 0.75372314453125, "reward_std": 0.010234087705612183, "rewards//mean": 0.75372314453125, "rewards//std": 0.025395916774868965, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1926, "grad_norm": 1.241235613822937, "kl": 0.3214772678911686, "learning_rate": 4.5927791605613525e-06, "loss": 0.0129, "num_tokens": 8323776.0, "reward": 0.76800537109375, "reward_std": 0.008909153752028942, "rewards//mean": 0.76800537109375, "rewards//std": 0.02892867475748062, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1928, "grad_norm": 1.4344669580459595, "kl": 0.36224448308348656, "learning_rate": 4.591910783647405e-06, "loss": 0.0145, "num_tokens": 8332408.0, "reward": 0.75018310546875, "reward_std": 0.011086471378803253, "rewards//mean": 0.75018310546875, "rewards//std": 0.029143985360860825, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.193, "grad_norm": 1.057570457458496, "kl": 0.3504234068095684, "learning_rate": 4.591041564112043e-06, "loss": 0.014, "num_tokens": 8341096.0, "reward": 0.75848388671875, "reward_std": 0.011695520021021366, "rewards//mean": 0.75848388671875, "rewards//std": 0.023246105760335922, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1932, "grad_norm": 1.2750986814498901, "kl": 0.3357960321009159, "learning_rate": 4.59017150230539e-06, "loss": 0.0134, "num_tokens": 8349760.0, "reward": 0.7686767578125, "reward_std": 0.011024602688848972, "rewards//mean": 0.7686767578125, "rewards//std": 0.02808629721403122, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1934, "grad_norm": 1.308644413948059, "kl": 0.35375194624066353, "learning_rate": 4.589300598577906e-06, "loss": 0.0142, "num_tokens": 8358440.0, "reward": 0.75726318359375, "reward_std": 0.0074037788435816765, "rewards//mean": 0.75726318359375, "rewards//std": 0.027497153729200363, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1936, "grad_norm": 1.059302568435669, "kl": 0.3852734975516796, "learning_rate": 4.58842885328039e-06, "loss": 0.0154, "num_tokens": 8367064.0, "reward": 0.745361328125, "reward_std": 0.006511340849101543, "rewards//mean": 0.745361328125, "rewards//std": 0.03340674936771393, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1938, "grad_norm": 1.2672991752624512, "kl": 0.3788112476468086, "learning_rate": 4.587556266763982e-06, "loss": 0.0152, "num_tokens": 8375712.0, "reward": 0.72161865234375, "reward_std": 0.0076048411428928375, "rewards//mean": 0.72161865234375, "rewards//std": 0.02709229476749897, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.194, "grad_norm": 1.2441980838775635, "kl": 0.34678515046834946, "learning_rate": 4.586682839380159e-06, "loss": 0.0139, "num_tokens": 8384464.0, "reward": 0.75909423828125, "reward_std": 0.011957092210650444, "rewards//mean": 0.75909423828125, "rewards//std": 0.029256997630000114, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1942, "grad_norm": 1.481521725654602, "kl": 0.3327783904969692, "learning_rate": 4.585808571480739e-06, "loss": 0.0133, "num_tokens": 8393096.0, "reward": 0.76458740234375, "reward_std": 0.009498574770987034, "rewards//mean": 0.76458740234375, "rewards//std": 0.02750045619904995, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1944, "grad_norm": 1.0270497798919678, "kl": 0.35905660316348076, "learning_rate": 4.584933463417874e-06, "loss": 0.0144, "num_tokens": 8401712.0, "reward": 0.7547607421875, "reward_std": 0.009442451409995556, "rewards//mean": 0.7547607421875, "rewards//std": 0.02266489528119564, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1946, "grad_norm": 1.9370031356811523, "kl": 0.4017368145287037, "learning_rate": 4.584057515544061e-06, "loss": 0.0161, "num_tokens": 8410496.0, "reward": 0.75067138671875, "reward_std": 0.008788703009486198, "rewards//mean": 0.75067138671875, "rewards//std": 0.026213670149445534, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1948, "grad_norm": 2.0624356269836426, "kl": 0.36438558250665665, "learning_rate": 4.583180728212128e-06, "loss": 0.0146, "num_tokens": 8419064.0, "reward": 0.75830078125, "reward_std": 0.00762117188423872, "rewards//mean": 0.75830078125, "rewards//std": 0.02847197651863098, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.195, "grad_norm": 1.3694345951080322, "kl": 0.3885602429509163, "learning_rate": 4.582303101775249e-06, "loss": 0.0155, "num_tokens": 8427840.0, "reward": 0.76422119140625, "reward_std": 0.009295577183365822, "rewards//mean": 0.76422119140625, "rewards//std": 0.02592390775680542, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1952, "grad_norm": 1.3455674648284912, "kl": 0.3587591424584389, "learning_rate": 4.5814246365869285e-06, "loss": 0.0144, "num_tokens": 8436616.0, "reward": 0.780029296875, "reward_std": 0.00841439887881279, "rewards//mean": 0.780029296875, "rewards//std": 0.022754548117518425, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1954, "grad_norm": 1.3943334817886353, "kl": 0.3845372460782528, "learning_rate": 4.580545333001014e-06, "loss": 0.0154, "num_tokens": 8445216.0, "reward": 0.7724609375, "reward_std": 0.010662311688065529, "rewards//mean": 0.7724609375, "rewards//std": 0.03599030151963234, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1956, "grad_norm": 1.7175418138504028, "kl": 0.3837543651461601, "learning_rate": 4.579665191371687e-06, "loss": 0.0154, "num_tokens": 8453896.0, "reward": 0.777099609375, "reward_std": 0.0080185541883111, "rewards//mean": 0.777099609375, "rewards//std": 0.027151526883244514, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1958, "grad_norm": 1.7660549879074097, "kl": 0.4039181172847748, "learning_rate": 4.578784212053471e-06, "loss": 0.0162, "num_tokens": 8462552.0, "reward": 0.7745361328125, "reward_std": 0.0088932104408741, "rewards//mean": 0.7745361328125, "rewards//std": 0.018208811059594154, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.196, "grad_norm": 1.3596855401992798, "kl": 0.41640326753258705, "learning_rate": 4.577902395401222e-06, "loss": 0.0167, "num_tokens": 8471232.0, "reward": 0.749755859375, "reward_std": 0.012476136907935143, "rewards//mean": 0.749755859375, "rewards//std": 0.0270084235817194, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1962, "grad_norm": 1.7318655252456665, "kl": 0.3891097605228424, "learning_rate": 4.577019741770137e-06, "loss": 0.0156, "num_tokens": 8479856.0, "reward": 0.763427734375, "reward_std": 0.009000815451145172, "rewards//mean": 0.763427734375, "rewards//std": 0.025083519518375397, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1964, "grad_norm": 1.3574923276901245, "kl": 0.43050722032785416, "learning_rate": 4.576136251515748e-06, "loss": 0.0172, "num_tokens": 8488528.0, "reward": 0.745849609375, "reward_std": 0.013025259599089622, "rewards//mean": 0.745849609375, "rewards//std": 0.030216630548238754, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1966, "grad_norm": 1.5071163177490234, "kl": 0.39033103734254837, "learning_rate": 4.575251924993926e-06, "loss": 0.0156, "num_tokens": 8497112.0, "reward": 0.75579833984375, "reward_std": 0.010598527267575264, "rewards//mean": 0.75579833984375, "rewards//std": 0.028314098715782166, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1968, "grad_norm": 1.5211485624313354, "kl": 0.4381902143359184, "learning_rate": 4.574366762560876e-06, "loss": 0.0175, "num_tokens": 8505752.0, "reward": 0.76092529296875, "reward_std": 0.008739031851291656, "rewards//mean": 0.76092529296875, "rewards//std": 0.028815951198339462, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.197, "grad_norm": 1.9539810419082642, "kl": 0.4092373363673687, "learning_rate": 4.573480764573143e-06, "loss": 0.0164, "num_tokens": 8514376.0, "reward": 0.74774169921875, "reward_std": 0.008132066577672958, "rewards//mean": 0.74774169921875, "rewards//std": 0.029592422768473625, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1972, "grad_norm": 1.4879189729690552, "kl": 0.3980576805770397, "learning_rate": 4.572593931387604e-06, "loss": 0.0159, "num_tokens": 8522960.0, "reward": 0.724853515625, "reward_std": 0.0099346823990345, "rewards//mean": 0.724853515625, "rewards//std": 0.03472910076379776, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1974, "grad_norm": 1.3883405923843384, "kl": 0.4651208482682705, "learning_rate": 4.571706263361479e-06, "loss": 0.0186, "num_tokens": 8531496.0, "reward": 0.72979736328125, "reward_std": 0.00940313097089529, "rewards//mean": 0.72979736328125, "rewards//std": 0.033729128539562225, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1976, "grad_norm": 1.4428088665008545, "kl": 0.43703150749206543, "learning_rate": 4.570817760852319e-06, "loss": 0.0175, "num_tokens": 8540232.0, "reward": 0.7557373046875, "reward_std": 0.006712545640766621, "rewards//mean": 0.7557373046875, "rewards//std": 0.029765138402581215, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1978, "grad_norm": 1.7286840677261353, "kl": 0.4548807218670845, "learning_rate": 4.569928424218012e-06, "loss": 0.0182, "num_tokens": 8548920.0, "reward": 0.75628662109375, "reward_std": 0.008855903521180153, "rewards//mean": 0.75628662109375, "rewards//std": 0.029729709029197693, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.198, "grad_norm": 1.4762978553771973, "kl": 0.4822859615087509, "learning_rate": 4.569038253816783e-06, "loss": 0.0193, "num_tokens": 8557688.0, "reward": 0.784912109375, "reward_std": 0.00827457383275032, "rewards//mean": 0.784912109375, "rewards//std": 0.02746196649968624, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1982, "grad_norm": 1.853870153427124, "kl": 0.4191112294793129, "learning_rate": 4.5681472500071935e-06, "loss": 0.0168, "num_tokens": 8566352.0, "reward": 0.75384521484375, "reward_std": 0.0067790113389492035, "rewards//mean": 0.75384521484375, "rewards//std": 0.024154985323548317, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1984, "grad_norm": 1.573944330215454, "kl": 0.42034513130784035, "learning_rate": 4.567255413148139e-06, "loss": 0.0168, "num_tokens": 8575064.0, "reward": 0.7481689453125, "reward_std": 0.009067408740520477, "rewards//mean": 0.7481689453125, "rewards//std": 0.02402908354997635, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1986, "grad_norm": 1.8001471757888794, "kl": 0.4698791652917862, "learning_rate": 4.566362743598851e-06, "loss": 0.0188, "num_tokens": 8583664.0, "reward": 0.7425537109375, "reward_std": 0.012988231144845486, "rewards//mean": 0.7425537109375, "rewards//std": 0.02789810486137867, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1988, "grad_norm": 1.660449743270874, "kl": 0.46960631385445595, "learning_rate": 4.565469241718896e-06, "loss": 0.0188, "num_tokens": 8592344.0, "reward": 0.752685546875, "reward_std": 0.005921890959143639, "rewards//mean": 0.752685546875, "rewards//std": 0.020586274564266205, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.199, "grad_norm": 1.4339450597763062, "kl": 0.4214029908180237, "learning_rate": 4.564574907868179e-06, "loss": 0.0169, "num_tokens": 8601024.0, "reward": 0.7352294921875, "reward_std": 0.007715367246419191, "rewards//mean": 0.7352294921875, "rewards//std": 0.03900335729122162, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1992, "grad_norm": 2.15255069732666, "kl": 0.4501718766987324, "learning_rate": 4.563679742406935e-06, "loss": 0.018, "num_tokens": 8609688.0, "reward": 0.76776123046875, "reward_std": 0.006294669583439827, "rewards//mean": 0.76776123046875, "rewards//std": 0.03227883577346802, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1994, "grad_norm": 1.7035322189331055, "kl": 0.45111533999443054, "learning_rate": 4.562783745695738e-06, "loss": 0.018, "num_tokens": 8618400.0, "reward": 0.76800537109375, "reward_std": 0.0073316022753715515, "rewards//mean": 0.76800537109375, "rewards//std": 0.028362175449728966, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1996, "grad_norm": 1.9999769926071167, "kl": 0.4639809913933277, "learning_rate": 4.561886918095495e-06, "loss": 0.0186, "num_tokens": 8627216.0, "reward": 0.768798828125, "reward_std": 0.011256873607635498, "rewards//mean": 0.768798828125, "rewards//std": 0.024358505383133888, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1998, "grad_norm": 1.600160837173462, "kl": 0.44482381641864777, "learning_rate": 4.560989259967447e-06, "loss": 0.0178, "num_tokens": 8635816.0, "reward": 0.71075439453125, "reward_std": 0.00779071357101202, "rewards//mean": 0.71075439453125, "rewards//std": 0.035272691398859024, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2, "grad_norm": 2.164851665496826, "kl": 0.4851101525127888, "learning_rate": 4.560090771673174e-06, "loss": 0.0194, "num_tokens": 8644496.0, "reward": 0.7564697265625, "reward_std": 0.014028170146048069, "rewards//mean": 0.7564697265625, "rewards//std": 0.03170565515756607, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2002, "grad_norm": 2.1553897857666016, "kl": 0.4857872202992439, "learning_rate": 4.559191453574582e-06, "loss": 0.0194, "num_tokens": 8653088.0, "reward": 0.75579833984375, "reward_std": 0.013866904191672802, "rewards//mean": 0.75579833984375, "rewards//std": 0.0308756735175848, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2004, "grad_norm": 1.4568302631378174, "kl": 0.49880654737353325, "learning_rate": 4.55829130603392e-06, "loss": 0.02, "num_tokens": 8661776.0, "reward": 0.73822021484375, "reward_std": 0.00996166467666626, "rewards//mean": 0.73822021484375, "rewards//std": 0.03278186917304993, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2006, "grad_norm": 1.3147813081741333, "kl": 0.4375142343342304, "learning_rate": 4.557390329413765e-06, "loss": 0.0175, "num_tokens": 8670552.0, "reward": 0.7591552734375, "reward_std": 0.008115453645586967, "rewards//mean": 0.7591552734375, "rewards//std": 0.026604946702718735, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2008, "grad_norm": 2.2151377201080322, "kl": 0.44849271699786186, "learning_rate": 4.556488524077033e-06, "loss": 0.0179, "num_tokens": 8679200.0, "reward": 0.76416015625, "reward_std": 0.013429000973701477, "rewards//mean": 0.76416015625, "rewards//std": 0.03323684632778168, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.201, "grad_norm": 1.6588973999023438, "kl": 0.47609858587384224, "learning_rate": 4.555585890386969e-06, "loss": 0.019, "num_tokens": 8687768.0, "reward": 0.7469482421875, "reward_std": 0.008592274971306324, "rewards//mean": 0.7469482421875, "rewards//std": 0.03778586536645889, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2012, "grad_norm": 1.3879097700119019, "kl": 0.5521229170262814, "learning_rate": 4.554682428707153e-06, "loss": 0.0221, "num_tokens": 8696448.0, "reward": 0.7130126953125, "reward_std": 0.014532892033457756, "rewards//mean": 0.7130126953125, "rewards//std": 0.03750599920749664, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2014, "grad_norm": 3.189234495162964, "kl": 0.39750154688954353, "learning_rate": 4.553778139401501e-06, "loss": 0.0159, "num_tokens": 8705088.0, "reward": 0.76165771484375, "reward_std": 0.006667818874120712, "rewards//mean": 0.76165771484375, "rewards//std": 0.020908674225211143, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2016, "grad_norm": 1.9399793148040771, "kl": 0.4622378796339035, "learning_rate": 4.55287302283426e-06, "loss": 0.0185, "num_tokens": 8713744.0, "reward": 0.7225341796875, "reward_std": 0.009592173621058464, "rewards//mean": 0.7225341796875, "rewards//std": 0.029056919738650322, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2018, "grad_norm": 1.8425531387329102, "kl": 0.5178035721182823, "learning_rate": 4.551967079370011e-06, "loss": 0.0207, "num_tokens": 8722456.0, "reward": 0.77581787109375, "reward_std": 0.007759647443890572, "rewards//mean": 0.77581787109375, "rewards//std": 0.01974237523972988, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.202, "grad_norm": 1.7857837677001953, "kl": 0.474933248013258, "learning_rate": 4.551060309373668e-06, "loss": 0.019, "num_tokens": 8731136.0, "reward": 0.75006103515625, "reward_std": 0.009909458458423615, "rewards//mean": 0.75006103515625, "rewards//std": 0.02495396137237549, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2022, "grad_norm": 1.462279200553894, "kl": 0.5031768977642059, "learning_rate": 4.550152713210478e-06, "loss": 0.0201, "num_tokens": 8739872.0, "reward": 0.7667236328125, "reward_std": 0.011526063084602356, "rewards//mean": 0.7667236328125, "rewards//std": 0.018407251685857773, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2024, "grad_norm": 1.6145386695861816, "kl": 0.4688549339771271, "learning_rate": 4.54924429124602e-06, "loss": 0.0188, "num_tokens": 8748488.0, "reward": 0.76123046875, "reward_std": 0.00945983361452818, "rewards//mean": 0.76123046875, "rewards//std": 0.02404451183974743, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2026, "grad_norm": 1.5888701677322388, "kl": 0.4600496143102646, "learning_rate": 4.5483350438462066e-06, "loss": 0.0184, "num_tokens": 8757128.0, "reward": 0.75433349609375, "reward_std": 0.011228648945689201, "rewards//mean": 0.75433349609375, "rewards//std": 0.02934843674302101, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2028, "grad_norm": 1.9413758516311646, "kl": 0.5216050371527672, "learning_rate": 4.547424971377282e-06, "loss": 0.0209, "num_tokens": 8765896.0, "reward": 0.7310791015625, "reward_std": 0.01061311550438404, "rewards//mean": 0.7310791015625, "rewards//std": 0.03697431460022926, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.203, "grad_norm": 1.4738702774047852, "kl": 0.48252981901168823, "learning_rate": 4.546514074205824e-06, "loss": 0.0193, "num_tokens": 8774560.0, "reward": 0.7593994140625, "reward_std": 0.00866001844406128, "rewards//mean": 0.7593994140625, "rewards//std": 0.024446284398436546, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2032, "grad_norm": 2.6102535724639893, "kl": 0.5051307901740074, "learning_rate": 4.545602352698742e-06, "loss": 0.0202, "num_tokens": 8783240.0, "reward": 0.7625732421875, "reward_std": 0.008874187245965004, "rewards//mean": 0.7625732421875, "rewards//std": 0.02584063820540905, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2034, "grad_norm": 2.0985193252563477, "kl": 0.47539572417736053, "learning_rate": 4.544689807223277e-06, "loss": 0.019, "num_tokens": 8791904.0, "reward": 0.75933837890625, "reward_std": 0.011176460422575474, "rewards//mean": 0.75933837890625, "rewards//std": 0.028962144628167152, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2036, "grad_norm": 2.015169143676758, "kl": 0.49015070125460625, "learning_rate": 4.543776438147002e-06, "loss": 0.0196, "num_tokens": 8800440.0, "reward": 0.75531005859375, "reward_std": 0.008005933836102486, "rewards//mean": 0.75531005859375, "rewards//std": 0.026878006756305695, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2038, "grad_norm": 1.495768666267395, "kl": 0.5235383398830891, "learning_rate": 4.542862245837821e-06, "loss": 0.0209, "num_tokens": 8809040.0, "reward": 0.75042724609375, "reward_std": 0.007751651108264923, "rewards//mean": 0.75042724609375, "rewards//std": 0.030354522168636322, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.204, "grad_norm": 2.0663259029388428, "kl": 0.5331315957009792, "learning_rate": 4.541947230663973e-06, "loss": 0.0213, "num_tokens": 8817712.0, "reward": 0.7557373046875, "reward_std": 0.008524209260940552, "rewards//mean": 0.7557373046875, "rewards//std": 0.03529012203216553, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2042, "grad_norm": 1.3930587768554688, "kl": 0.6218722090125084, "learning_rate": 4.541031392994025e-06, "loss": 0.0249, "num_tokens": 8826296.0, "reward": 0.7581787109375, "reward_std": 0.014035872183740139, "rewards//mean": 0.7581787109375, "rewards//std": 0.029599903151392937, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2044, "grad_norm": 1.7703039646148682, "kl": 0.5408430397510529, "learning_rate": 4.540114733196875e-06, "loss": 0.0216, "num_tokens": 8835024.0, "reward": 0.7681884765625, "reward_std": 0.008408166468143463, "rewards//mean": 0.7681884765625, "rewards//std": 0.027596959844231606, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2046, "grad_norm": 1.5688461065292358, "kl": 0.6304674074053764, "learning_rate": 4.5391972516417545e-06, "loss": 0.0252, "num_tokens": 8843712.0, "reward": 0.72796630859375, "reward_std": 0.007372327148914337, "rewards//mean": 0.72796630859375, "rewards//std": 0.024203818291425705, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2048, "grad_norm": 2.1274490356445312, "kl": 0.5350668281316757, "learning_rate": 4.538278948698226e-06, "loss": 0.0214, "num_tokens": 8852368.0, "reward": 0.74969482421875, "reward_std": 0.009116631001234055, "rewards//mean": 0.74969482421875, "rewards//std": 0.021891212090849876, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.205, "grad_norm": 2.364090919494629, "kl": 0.5825952589511871, "learning_rate": 4.537359824736179e-06, "loss": 0.0233, "num_tokens": 8860928.0, "reward": 0.73431396484375, "reward_std": 0.00878153182566166, "rewards//mean": 0.73431396484375, "rewards//std": 0.02596241980791092, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2052, "grad_norm": 2.546067237854004, "kl": 0.6947181299328804, "learning_rate": 4.53643988012584e-06, "loss": 0.0278, "num_tokens": 8869624.0, "reward": 0.74517822265625, "reward_std": 0.009017807431519032, "rewards//mean": 0.74517822265625, "rewards//std": 0.023814167827367783, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2054, "grad_norm": 1.5907171964645386, "kl": 0.5920578241348267, "learning_rate": 4.53551911523776e-06, "loss": 0.0237, "num_tokens": 8878224.0, "reward": 0.7891845703125, "reward_std": 0.014363161288201809, "rewards//mean": 0.7891845703125, "rewards//std": 0.028450436890125275, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2056, "grad_norm": 2.0224955081939697, "kl": 0.5741788223385811, "learning_rate": 4.534597530442824e-06, "loss": 0.023, "num_tokens": 8886832.0, "reward": 0.77069091796875, "reward_std": 0.00794792640954256, "rewards//mean": 0.77069091796875, "rewards//std": 0.023527026176452637, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2058, "grad_norm": 1.4634764194488525, "kl": 0.6235593780875206, "learning_rate": 4.5336751261122455e-06, "loss": 0.0249, "num_tokens": 8895536.0, "reward": 0.7608642578125, "reward_std": 0.0106462761759758, "rewards//mean": 0.7608642578125, "rewards//std": 0.0243644081056118, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.206, "grad_norm": 1.6618282794952393, "kl": 0.6238065287470818, "learning_rate": 4.5327519026175694e-06, "loss": 0.025, "num_tokens": 8904104.0, "reward": 0.74853515625, "reward_std": 0.012076951563358307, "rewards//mean": 0.74853515625, "rewards//std": 0.03306881710886955, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2062, "grad_norm": 2.0615737438201904, "kl": 0.6506002843379974, "learning_rate": 4.53182786033067e-06, "loss": 0.026, "num_tokens": 8912784.0, "reward": 0.74847412109375, "reward_std": 0.01185664627701044, "rewards//mean": 0.74847412109375, "rewards//std": 0.019876867532730103, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2064, "grad_norm": 1.990052342414856, "kl": 0.6529459059238434, "learning_rate": 4.530902999623752e-06, "loss": 0.0261, "num_tokens": 8921536.0, "reward": 0.779052734375, "reward_std": 0.013718672096729279, "rewards//mean": 0.779052734375, "rewards//std": 0.03291740640997887, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2066, "grad_norm": 2.1141812801361084, "kl": 0.6994459852576256, "learning_rate": 4.529977320869349e-06, "loss": 0.028, "num_tokens": 8930136.0, "reward": 0.72808837890625, "reward_std": 0.008768187835812569, "rewards//mean": 0.72808837890625, "rewards//std": 0.022868605330586433, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2068, "grad_norm": 1.4673584699630737, "kl": 0.6505978107452393, "learning_rate": 4.529050824440323e-06, "loss": 0.026, "num_tokens": 8938864.0, "reward": 0.76885986328125, "reward_std": 0.008683929219841957, "rewards//mean": 0.76885986328125, "rewards//std": 0.02408532239496708, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.207, "grad_norm": 2.2642955780029297, "kl": 0.6501578129827976, "learning_rate": 4.528123510709868e-06, "loss": 0.026, "num_tokens": 8947568.0, "reward": 0.76177978515625, "reward_std": 0.011492680758237839, "rewards//mean": 0.76177978515625, "rewards//std": 0.024983063340187073, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2072, "grad_norm": 1.5178568363189697, "kl": 0.6040456928312778, "learning_rate": 4.527195380051505e-06, "loss": 0.0242, "num_tokens": 8956272.0, "reward": 0.76190185546875, "reward_std": 0.008062414824962616, "rewards//mean": 0.76190185546875, "rewards//std": 0.0196786317974329, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2074, "grad_norm": 1.7156482934951782, "kl": 0.6520369872450829, "learning_rate": 4.526266432839086e-06, "loss": 0.0261, "num_tokens": 8964968.0, "reward": 0.75439453125, "reward_std": 0.00760736595839262, "rewards//mean": 0.75439453125, "rewards//std": 0.028633149340748787, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2076, "grad_norm": 2.9114058017730713, "kl": 0.6687226854264736, "learning_rate": 4.525336669446789e-06, "loss": 0.0267, "num_tokens": 8973680.0, "reward": 0.739501953125, "reward_std": 0.00796244852244854, "rewards//mean": 0.739501953125, "rewards//std": 0.029674729332327843, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2078, "grad_norm": 2.458566904067993, "kl": 0.7211368903517723, "learning_rate": 4.524406090249125e-06, "loss": 0.0288, "num_tokens": 8982352.0, "reward": 0.73370361328125, "reward_std": 0.010489147156476974, "rewards//mean": 0.73370361328125, "rewards//std": 0.02587890811264515, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.208, "grad_norm": 2.4322752952575684, "kl": 0.5815874375402927, "learning_rate": 4.5234746956209295e-06, "loss": 0.0233, "num_tokens": 8991008.0, "reward": 0.72247314453125, "reward_std": 0.010957663878798485, "rewards//mean": 0.72247314453125, "rewards//std": 0.03108237124979496, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2082, "grad_norm": 2.1806724071502686, "kl": 0.7126741707324982, "learning_rate": 4.522542485937369e-06, "loss": 0.0285, "num_tokens": 8999736.0, "reward": 0.7744140625, "reward_std": 0.008776089176535606, "rewards//mean": 0.7744140625, "rewards//std": 0.02531539462506771, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2084, "grad_norm": 1.8548524379730225, "kl": 0.5115164630115032, "learning_rate": 4.521609461573937e-06, "loss": 0.0205, "num_tokens": 9008384.0, "reward": 0.72552490234375, "reward_std": 0.007427786476910114, "rewards//mean": 0.72552490234375, "rewards//std": 0.035353709012269974, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2086, "grad_norm": 1.4681609869003296, "kl": 0.5000422447919846, "learning_rate": 4.520675622906455e-06, "loss": 0.02, "num_tokens": 9017096.0, "reward": 0.74542236328125, "reward_std": 0.006922936532646418, "rewards//mean": 0.74542236328125, "rewards//std": 0.023318924009799957, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2088, "grad_norm": 2.1844120025634766, "kl": 0.5875167511403561, "learning_rate": 4.519740970311074e-06, "loss": 0.0235, "num_tokens": 9025688.0, "reward": 0.7470703125, "reward_std": 0.009474307298660278, "rewards//mean": 0.7470703125, "rewards//std": 0.02059950679540634, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.209, "grad_norm": 1.8049288988113403, "kl": 0.4851870872080326, "learning_rate": 4.518805504164272e-06, "loss": 0.0194, "num_tokens": 9034344.0, "reward": 0.75543212890625, "reward_std": 0.009117459878325462, "rewards//mean": 0.75543212890625, "rewards//std": 0.0383719801902771, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2092, "grad_norm": 1.4463645219802856, "kl": 0.5243805386126041, "learning_rate": 4.517869224842853e-06, "loss": 0.021, "num_tokens": 9043032.0, "reward": 0.743408203125, "reward_std": 0.00784677267074585, "rewards//mean": 0.743408203125, "rewards//std": 0.04251755774021149, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2094, "grad_norm": 1.4782451391220093, "kl": 0.4091626890003681, "learning_rate": 4.516932132723953e-06, "loss": 0.0164, "num_tokens": 9051672.0, "reward": 0.73699951171875, "reward_std": 0.00893807876855135, "rewards//mean": 0.73699951171875, "rewards//std": 0.02734866365790367, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2096, "grad_norm": 1.4482992887496948, "kl": 0.47384709492325783, "learning_rate": 4.515994228185031e-06, "loss": 0.019, "num_tokens": 9060280.0, "reward": 0.721435546875, "reward_std": 0.007866722531616688, "rewards//mean": 0.721435546875, "rewards//std": 0.027856020256876945, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2098, "grad_norm": 1.5931669473648071, "kl": 0.44570937752723694, "learning_rate": 4.5150555116038755e-06, "loss": 0.0178, "num_tokens": 9068992.0, "reward": 0.733642578125, "reward_std": 0.007118706591427326, "rewards//mean": 0.733642578125, "rewards//std": 0.02893083170056343, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.21, "grad_norm": 1.6214878559112549, "kl": 0.4380730539560318, "learning_rate": 4.5141159833586e-06, "loss": 0.0175, "num_tokens": 9077648.0, "reward": 0.76312255859375, "reward_std": 0.008403139188885689, "rewards//mean": 0.76312255859375, "rewards//std": 0.03101167269051075, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2102, "grad_norm": 1.556640625, "kl": 0.5078834593296051, "learning_rate": 4.513175643827647e-06, "loss": 0.0203, "num_tokens": 9086304.0, "reward": 0.76580810546875, "reward_std": 0.01045049075037241, "rewards//mean": 0.76580810546875, "rewards//std": 0.02039187401533127, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2104, "grad_norm": 2.5217156410217285, "kl": 0.5452925600111485, "learning_rate": 4.512234493389785e-06, "loss": 0.0218, "num_tokens": 9094952.0, "reward": 0.770751953125, "reward_std": 0.007368247956037521, "rewards//mean": 0.770751953125, "rewards//std": 0.020373400300741196, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2106, "grad_norm": 1.9990670680999756, "kl": 0.453094445168972, "learning_rate": 4.511292532424111e-06, "loss": 0.0181, "num_tokens": 9103560.0, "reward": 0.7662353515625, "reward_std": 0.008373312652111053, "rewards//mean": 0.7662353515625, "rewards//std": 0.026238271966576576, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2108, "grad_norm": 1.5686399936676025, "kl": 0.511558685451746, "learning_rate": 4.510349761310046e-06, "loss": 0.0205, "num_tokens": 9112192.0, "reward": 0.73602294921875, "reward_std": 0.008193356916308403, "rewards//mean": 0.73602294921875, "rewards//std": 0.04012569040060043, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.211, "grad_norm": 1.6067869663238525, "kl": 0.48906319215893745, "learning_rate": 4.509406180427336e-06, "loss": 0.0196, "num_tokens": 9120840.0, "reward": 0.75677490234375, "reward_std": 0.00814962387084961, "rewards//mean": 0.75677490234375, "rewards//std": 0.01931605488061905, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2112, "grad_norm": 1.767385721206665, "kl": 0.4174470715224743, "learning_rate": 4.508461790156057e-06, "loss": 0.0167, "num_tokens": 9129488.0, "reward": 0.75872802734375, "reward_std": 0.006528071127831936, "rewards//mean": 0.75872802734375, "rewards//std": 0.02753126434981823, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2114, "grad_norm": 1.5596750974655151, "kl": 0.5845408402383327, "learning_rate": 4.5075165908766095e-06, "loss": 0.0234, "num_tokens": 9138040.0, "reward": 0.7518310546875, "reward_std": 0.012730174697935581, "rewards//mean": 0.7518310546875, "rewards//std": 0.03324026241898537, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2116, "grad_norm": 1.9303299188613892, "kl": 0.5514601096510887, "learning_rate": 4.506570582969719e-06, "loss": 0.0221, "num_tokens": 9146648.0, "reward": 0.77783203125, "reward_std": 0.0075635421089828014, "rewards//mean": 0.77783203125, "rewards//std": 0.023545756936073303, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2118, "grad_norm": 3.4938831329345703, "kl": 0.5120391063392162, "learning_rate": 4.505623766816438e-06, "loss": 0.0205, "num_tokens": 9155216.0, "reward": 0.72894287109375, "reward_std": 0.010047493502497673, "rewards//mean": 0.72894287109375, "rewards//std": 0.03355679661035538, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.212, "grad_norm": 2.218789577484131, "kl": 0.7058784067630768, "learning_rate": 4.504676142798143e-06, "loss": 0.0282, "num_tokens": 9163848.0, "reward": 0.756103515625, "reward_std": 0.010491996072232723, "rewards//mean": 0.756103515625, "rewards//std": 0.025179890915751457, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2122, "grad_norm": 2.2615737915039062, "kl": 0.6257460974156857, "learning_rate": 4.503727711296539e-06, "loss": 0.025, "num_tokens": 9172520.0, "reward": 0.76434326171875, "reward_std": 0.010336033999919891, "rewards//mean": 0.76434326171875, "rewards//std": 0.03163557127118111, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2124, "grad_norm": 2.7529118061065674, "kl": 0.7096852511167526, "learning_rate": 4.502778472693651e-06, "loss": 0.0284, "num_tokens": 9181096.0, "reward": 0.75250244140625, "reward_std": 0.010183559730648994, "rewards//mean": 0.75250244140625, "rewards//std": 0.020941952243447304, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2126, "grad_norm": 1.4434173107147217, "kl": 0.6540036387741566, "learning_rate": 4.501828427371834e-06, "loss": 0.0262, "num_tokens": 9189752.0, "reward": 0.724609375, "reward_std": 0.01086492370814085, "rewards//mean": 0.724609375, "rewards//std": 0.04137500375509262, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2128, "grad_norm": 2.392683982849121, "kl": 0.6888159178197384, "learning_rate": 4.500877575713766e-06, "loss": 0.0276, "num_tokens": 9198392.0, "reward": 0.7420654296875, "reward_std": 0.009649857878684998, "rewards//mean": 0.7420654296875, "rewards//std": 0.036854568868875504, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.213, "grad_norm": 2.147503137588501, "kl": 0.6591285467147827, "learning_rate": 4.4999259181024504e-06, "loss": 0.0264, "num_tokens": 9207120.0, "reward": 0.7501220703125, "reward_std": 0.007778090890496969, "rewards//mean": 0.7501220703125, "rewards//std": 0.02138373628258705, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2132, "grad_norm": 1.757538914680481, "kl": 0.6607584804296494, "learning_rate": 4.498973454921213e-06, "loss": 0.0264, "num_tokens": 9215808.0, "reward": 0.76861572265625, "reward_std": 0.00861060805618763, "rewards//mean": 0.76861572265625, "rewards//std": 0.02252848446369171, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2134, "grad_norm": 2.1159214973449707, "kl": 0.9841729775071144, "learning_rate": 4.498020186553707e-06, "loss": 0.0394, "num_tokens": 9224600.0, "reward": 0.75347900390625, "reward_std": 0.010120416060090065, "rewards//mean": 0.75347900390625, "rewards//std": 0.02962769754230976, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2136, "grad_norm": 2.4948527812957764, "kl": 0.69302798807621, "learning_rate": 4.49706611338391e-06, "loss": 0.0277, "num_tokens": 9233272.0, "reward": 0.7554931640625, "reward_std": 0.013088841922581196, "rewards//mean": 0.7554931640625, "rewards//std": 0.03837893158197403, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2138, "grad_norm": 2.9260096549987793, "kl": 0.8028748407959938, "learning_rate": 4.49611123579612e-06, "loss": 0.0321, "num_tokens": 9241864.0, "reward": 0.75360107421875, "reward_std": 0.009532960131764412, "rewards//mean": 0.75360107421875, "rewards//std": 0.028067629784345627, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.214, "grad_norm": 2.427626848220825, "kl": 0.7211885526776314, "learning_rate": 4.495155554174963e-06, "loss": 0.0288, "num_tokens": 9250568.0, "reward": 0.7564697265625, "reward_std": 0.009798595681786537, "rewards//mean": 0.7564697265625, "rewards//std": 0.03941568359732628, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2142, "grad_norm": 2.2091565132141113, "kl": 0.8733653463423252, "learning_rate": 4.494199068905389e-06, "loss": 0.0349, "num_tokens": 9259280.0, "reward": 0.74664306640625, "reward_std": 0.007622402161359787, "rewards//mean": 0.74664306640625, "rewards//std": 0.01949857361614704, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2144, "grad_norm": 3.5099453926086426, "kl": 0.7665241658687592, "learning_rate": 4.493241780372667e-06, "loss": 0.0307, "num_tokens": 9267928.0, "reward": 0.74334716796875, "reward_std": 0.008776900358498096, "rewards//mean": 0.74334716796875, "rewards//std": 0.021906420588493347, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2146, "grad_norm": 2.528672456741333, "kl": 0.8200844153761864, "learning_rate": 4.492283688962395e-06, "loss": 0.0328, "num_tokens": 9276600.0, "reward": 0.73779296875, "reward_std": 0.01100831851363182, "rewards//mean": 0.73779296875, "rewards//std": 0.02725502848625183, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2148, "grad_norm": 3.0090630054473877, "kl": 0.9538272470235825, "learning_rate": 4.491324795060491e-06, "loss": 0.0382, "num_tokens": 9285168.0, "reward": 0.75927734375, "reward_std": 0.015303384512662888, "rewards//mean": 0.75927734375, "rewards//std": 0.037373583763837814, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.215, "grad_norm": 2.718337297439575, "kl": 0.77449831366539, "learning_rate": 4.490365099053198e-06, "loss": 0.031, "num_tokens": 9293880.0, "reward": 0.7220458984375, "reward_std": 0.009562456049025059, "rewards//mean": 0.7220458984375, "rewards//std": 0.03656592220067978, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2152, "grad_norm": 3.3346750736236572, "kl": 0.9878917969763279, "learning_rate": 4.489404601327081e-06, "loss": 0.0395, "num_tokens": 9302696.0, "reward": 0.759765625, "reward_std": 0.007531511131674051, "rewards//mean": 0.759765625, "rewards//std": 0.022851470857858658, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2154, "grad_norm": 2.2453107833862305, "kl": 0.7823777347803116, "learning_rate": 4.488443302269028e-06, "loss": 0.0313, "num_tokens": 9311360.0, "reward": 0.768310546875, "reward_std": 0.009474392980337143, "rewards//mean": 0.768310546875, "rewards//std": 0.02859400026500225, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2156, "grad_norm": 4.676713943481445, "kl": 0.7591917254030704, "learning_rate": 4.487481202266251e-06, "loss": 0.0304, "num_tokens": 9320040.0, "reward": 0.76611328125, "reward_std": 0.008736456744372845, "rewards//mean": 0.76611328125, "rewards//std": 0.025367960333824158, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2158, "grad_norm": 3.272170305252075, "kl": 0.8263779357075691, "learning_rate": 4.4865183017062835e-06, "loss": 0.0331, "num_tokens": 9328624.0, "reward": 0.77459716796875, "reward_std": 0.009189307689666748, "rewards//mean": 0.77459716796875, "rewards//std": 0.02472175657749176, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.216, "grad_norm": 2.3054301738739014, "kl": 0.6133085899055004, "learning_rate": 4.485554600976981e-06, "loss": 0.0245, "num_tokens": 9337216.0, "reward": 0.76776123046875, "reward_std": 0.010974482633173466, "rewards//mean": 0.76776123046875, "rewards//std": 0.02550535649061203, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2162, "grad_norm": 1.5770183801651, "kl": 0.8735728226602077, "learning_rate": 4.484590100466524e-06, "loss": 0.0349, "num_tokens": 9345800.0, "reward": 0.7545166015625, "reward_std": 0.00802420824766159, "rewards//mean": 0.7545166015625, "rewards//std": 0.024122139438986778, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2164, "grad_norm": 2.4211506843566895, "kl": 0.5430635958909988, "learning_rate": 4.483624800563411e-06, "loss": 0.0217, "num_tokens": 9354368.0, "reward": 0.75897216796875, "reward_std": 0.008090725168585777, "rewards//mean": 0.75897216796875, "rewards//std": 0.03878672420978546, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2166, "grad_norm": 3.957136392593384, "kl": 0.7904599532485008, "learning_rate": 4.482658701656465e-06, "loss": 0.0316, "num_tokens": 9363000.0, "reward": 0.77166748046875, "reward_std": 0.010565444827079773, "rewards//mean": 0.77166748046875, "rewards//std": 0.026106037199497223, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2168, "grad_norm": 2.7944271564483643, "kl": 0.6459505222737789, "learning_rate": 4.4816918041348325e-06, "loss": 0.0258, "num_tokens": 9371672.0, "reward": 0.7493896484375, "reward_std": 0.007447043899446726, "rewards//mean": 0.7493896484375, "rewards//std": 0.020660776644945145, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.217, "grad_norm": 2.7777011394500732, "kl": 0.9416023418307304, "learning_rate": 4.4807241083879774e-06, "loss": 0.0377, "num_tokens": 9380344.0, "reward": 0.746826171875, "reward_std": 0.010473907925188541, "rewards//mean": 0.746826171875, "rewards//std": 0.026068393141031265, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2172, "grad_norm": 3.9601447582244873, "kl": 1.124600499868393, "learning_rate": 4.4797556148056884e-06, "loss": 0.045, "num_tokens": 9389040.0, "reward": 0.7542724609375, "reward_std": 0.010002071969211102, "rewards//mean": 0.7542724609375, "rewards//std": 0.023548007011413574, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2174, "grad_norm": 3.473670482635498, "kl": 1.2004450410604477, "learning_rate": 4.478786323778074e-06, "loss": 0.048, "num_tokens": 9397592.0, "reward": 0.744384765625, "reward_std": 0.012125678360462189, "rewards//mean": 0.744384765625, "rewards//std": 0.025015834718942642, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2176, "grad_norm": 2.5515403747558594, "kl": 1.0566519647836685, "learning_rate": 4.477816235695566e-06, "loss": 0.0423, "num_tokens": 9406272.0, "reward": 0.75457763671875, "reward_std": 0.009734675288200378, "rewards//mean": 0.75457763671875, "rewards//std": 0.035451628267765045, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2178, "grad_norm": 2.6502327919006348, "kl": 1.1147084645926952, "learning_rate": 4.476845350948914e-06, "loss": 0.0446, "num_tokens": 9414848.0, "reward": 0.73468017578125, "reward_std": 0.009117727167904377, "rewards//mean": 0.73468017578125, "rewards//std": 0.030995560809969902, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.218, "grad_norm": 5.747105121612549, "kl": 1.1493015959858894, "learning_rate": 4.475873669929192e-06, "loss": 0.046, "num_tokens": 9423448.0, "reward": 0.73974609375, "reward_std": 0.010514539666473866, "rewards//mean": 0.73974609375, "rewards//std": 0.02359713427722454, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2182, "grad_norm": 3.1860344409942627, "kl": 1.0591446198523045, "learning_rate": 4.474901193027791e-06, "loss": 0.0424, "num_tokens": 9432040.0, "reward": 0.76080322265625, "reward_std": 0.009376447647809982, "rewards//mean": 0.76080322265625, "rewards//std": 0.028097275644540787, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2184, "grad_norm": 3.0819036960601807, "kl": 1.032674428075552, "learning_rate": 4.473927920636426e-06, "loss": 0.0413, "num_tokens": 9440632.0, "reward": 0.76348876953125, "reward_std": 0.008818727917969227, "rewards//mean": 0.76348876953125, "rewards//std": 0.025966500863432884, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2186, "grad_norm": 3.2575461864471436, "kl": 1.297365564852953, "learning_rate": 4.472953853147131e-06, "loss": 0.0519, "num_tokens": 9449312.0, "reward": 0.69775390625, "reward_std": 0.008006967604160309, "rewards//mean": 0.69775390625, "rewards//std": 0.032678328454494476, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2188, "grad_norm": 3.0446760654449463, "kl": 0.9028544947504997, "learning_rate": 4.471978990952259e-06, "loss": 0.0361, "num_tokens": 9458032.0, "reward": 0.76397705078125, "reward_std": 0.005998008884489536, "rewards//mean": 0.76397705078125, "rewards//std": 0.027106259018182755, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.219, "grad_norm": 2.368042230606079, "kl": 1.3571302145719528, "learning_rate": 4.471003334444486e-06, "loss": 0.0543, "num_tokens": 9466552.0, "reward": 0.75128173828125, "reward_std": 0.009325332939624786, "rewards//mean": 0.75128173828125, "rewards//std": 0.023530885577201843, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2192, "grad_norm": 3.9383130073547363, "kl": 1.3193939849734306, "learning_rate": 4.470026884016805e-06, "loss": 0.0528, "num_tokens": 9475248.0, "reward": 0.75323486328125, "reward_std": 0.010885990224778652, "rewards//mean": 0.75323486328125, "rewards//std": 0.03377218544483185, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2194, "grad_norm": 2.5032310485839844, "kl": 1.471941266208887, "learning_rate": 4.469049640062532e-06, "loss": 0.0589, "num_tokens": 9483984.0, "reward": 0.76910400390625, "reward_std": 0.01335103064775467, "rewards//mean": 0.76910400390625, "rewards//std": 0.033610884100198746, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2196, "grad_norm": 3.67722225189209, "kl": 1.3545877374708652, "learning_rate": 4.468071602975298e-06, "loss": 0.0542, "num_tokens": 9492560.0, "reward": 0.74169921875, "reward_std": 0.009876557625830173, "rewards//mean": 0.74169921875, "rewards//std": 0.027748214080929756, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2198, "grad_norm": 3.622251272201538, "kl": 1.1348747313022614, "learning_rate": 4.467092773149058e-06, "loss": 0.0454, "num_tokens": 9501136.0, "reward": 0.70098876953125, "reward_std": 0.012704404070973396, "rewards//mean": 0.70098876953125, "rewards//std": 0.0359426885843277, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.22, "grad_norm": 1.904492974281311, "kl": 1.0309885777533054, "learning_rate": 4.466113150978085e-06, "loss": 0.0412, "num_tokens": 9509816.0, "reward": 0.739990234375, "reward_std": 0.008449692279100418, "rewards//mean": 0.739990234375, "rewards//std": 0.028063921257853508, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2202, "grad_norm": 2.4637084007263184, "kl": 1.5536633394658566, "learning_rate": 4.4651327368569695e-06, "loss": 0.0621, "num_tokens": 9518472.0, "reward": 0.756103515625, "reward_std": 0.014750070869922638, "rewards//mean": 0.756103515625, "rewards//std": 0.036689288914203644, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2204, "grad_norm": 3.5647053718566895, "kl": 1.4530429542064667, "learning_rate": 4.464151531180622e-06, "loss": 0.0581, "num_tokens": 9527136.0, "reward": 0.74212646484375, "reward_std": 0.0095286276191473, "rewards//mean": 0.74212646484375, "rewards//std": 0.030745968222618103, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2206, "grad_norm": 1.5957767963409424, "kl": 2.0430374331772327, "learning_rate": 4.463169534344273e-06, "loss": 0.0817, "num_tokens": 9535776.0, "reward": 0.74114990234375, "reward_std": 0.010926088318228722, "rewards//mean": 0.74114990234375, "rewards//std": 0.03685636818408966, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2208, "grad_norm": 2.019646167755127, "kl": 1.6706215292215347, "learning_rate": 4.462186746743471e-06, "loss": 0.0668, "num_tokens": 9544424.0, "reward": 0.73858642578125, "reward_std": 0.010018018074333668, "rewards//mean": 0.73858642578125, "rewards//std": 0.032824788242578506, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.221, "grad_norm": 4.353557109832764, "kl": 1.8241998106241226, "learning_rate": 4.461203168774081e-06, "loss": 0.073, "num_tokens": 9553032.0, "reward": 0.7147216796875, "reward_std": 0.009742958471179008, "rewards//mean": 0.7147216796875, "rewards//std": 0.02567608840763569, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2212, "grad_norm": 2.751556634902954, "kl": 1.6184682920575142, "learning_rate": 4.46021880083229e-06, "loss": 0.0647, "num_tokens": 9561712.0, "reward": 0.77850341796875, "reward_std": 0.011825655587017536, "rewards//mean": 0.77850341796875, "rewards//std": 0.030802536755800247, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2214, "grad_norm": 4.208339691162109, "kl": 2.461951345205307, "learning_rate": 4.4592336433146e-06, "loss": 0.0985, "num_tokens": 9570320.0, "reward": 0.73040771484375, "reward_std": 0.010940341278910637, "rewards//mean": 0.73040771484375, "rewards//std": 0.030341552570462227, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2216, "grad_norm": 2.327195405960083, "kl": 1.835537813603878, "learning_rate": 4.458247696617833e-06, "loss": 0.0734, "num_tokens": 9578848.0, "reward": 0.73785400390625, "reward_std": 0.011808148585259914, "rewards//mean": 0.73785400390625, "rewards//std": 0.03465399518609047, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2218, "grad_norm": 3.2864954471588135, "kl": 2.3479328639805317, "learning_rate": 4.4572609611391275e-06, "loss": 0.0939, "num_tokens": 9587480.0, "reward": 0.72821044921875, "reward_std": 0.008950600400567055, "rewards//mean": 0.72821044921875, "rewards//std": 0.031116442754864693, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.222, "grad_norm": 3.4584908485412598, "kl": 1.7205227054655552, "learning_rate": 4.456273437275941e-06, "loss": 0.0688, "num_tokens": 9596152.0, "reward": 0.7496337890625, "reward_std": 0.00794816855341196, "rewards//mean": 0.7496337890625, "rewards//std": 0.02670036442577839, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2222, "grad_norm": 4.047646522521973, "kl": 2.2220290824770927, "learning_rate": 4.455285125426049e-06, "loss": 0.0889, "num_tokens": 9604800.0, "reward": 0.74237060546875, "reward_std": 0.01314927265048027, "rewards//mean": 0.74237060546875, "rewards//std": 0.03537554293870926, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2224, "grad_norm": 3.592796564102173, "kl": 1.532889299094677, "learning_rate": 4.4542960259875415e-06, "loss": 0.0613, "num_tokens": 9613480.0, "reward": 0.75408935546875, "reward_std": 0.00946657545864582, "rewards//mean": 0.75408935546875, "rewards//std": 0.028742311522364616, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2226, "grad_norm": 3.5160577297210693, "kl": 2.157268762588501, "learning_rate": 4.453306139358828e-06, "loss": 0.0863, "num_tokens": 9622088.0, "reward": 0.72589111328125, "reward_std": 0.012263582088053226, "rewards//mean": 0.72589111328125, "rewards//std": 0.03586595505475998, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2228, "grad_norm": 3.2510933876037598, "kl": 1.9689137376844883, "learning_rate": 4.4523154659386355e-06, "loss": 0.0788, "num_tokens": 9630704.0, "reward": 0.77105712890625, "reward_std": 0.00871447753161192, "rewards//mean": 0.77105712890625, "rewards//std": 0.01891535334289074, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.223, "grad_norm": 3.0722126960754395, "kl": 2.2261190228164196, "learning_rate": 4.451324006126006e-06, "loss": 0.089, "num_tokens": 9639360.0, "reward": 0.77435302734375, "reward_std": 0.010914132930338383, "rewards//mean": 0.77435302734375, "rewards//std": 0.02191816456615925, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2232, "grad_norm": 4.954802989959717, "kl": 1.914122849702835, "learning_rate": 4.4503317603203025e-06, "loss": 0.0766, "num_tokens": 9647976.0, "reward": 0.73638916015625, "reward_std": 0.010175145231187344, "rewards//mean": 0.73638916015625, "rewards//std": 0.029976148158311844, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2234, "grad_norm": 3.7981762886047363, "kl": 2.405631937086582, "learning_rate": 4.449338728921197e-06, "loss": 0.0962, "num_tokens": 9656608.0, "reward": 0.777099609375, "reward_std": 0.010214705020189285, "rewards//mean": 0.777099609375, "rewards//std": 0.026501474902033806, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2236, "grad_norm": 2.028653621673584, "kl": 3.687923341989517, "learning_rate": 4.448344912328686e-06, "loss": 0.1475, "num_tokens": 9665184.0, "reward": 0.754150390625, "reward_std": 0.011877220124006271, "rewards//mean": 0.754150390625, "rewards//std": 0.026235099881887436, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2238, "grad_norm": 9.127527236938477, "kl": 2.709699623286724, "learning_rate": 4.447350310943077e-06, "loss": 0.1084, "num_tokens": 9673856.0, "reward": 0.75897216796875, "reward_std": 0.010300416499376297, "rewards//mean": 0.75897216796875, "rewards//std": 0.029965540394186974, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.224, "grad_norm": 5.725995063781738, "kl": 2.1236205883324146, "learning_rate": 4.4463549251649954e-06, "loss": 0.0849, "num_tokens": 9682448.0, "reward": 0.7354736328125, "reward_std": 0.010432107374072075, "rewards//mean": 0.7354736328125, "rewards//std": 0.03617468476295471, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2242, "grad_norm": 4.4017109870910645, "kl": 2.513170436024666, "learning_rate": 4.445358755395382e-06, "loss": 0.1005, "num_tokens": 9691064.0, "reward": 0.7432861328125, "reward_std": 0.007982677780091763, "rewards//mean": 0.7432861328125, "rewards//std": 0.02155858278274536, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2244, "grad_norm": 3.3632447719573975, "kl": 2.61676362529397, "learning_rate": 4.444361802035495e-06, "loss": 0.1047, "num_tokens": 9699704.0, "reward": 0.76348876953125, "reward_std": 0.009026894345879555, "rewards//mean": 0.76348876953125, "rewards//std": 0.02770829387009144, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2246, "grad_norm": 3.67970871925354, "kl": 3.4878183528780937, "learning_rate": 4.443364065486907e-06, "loss": 0.1395, "num_tokens": 9708296.0, "reward": 0.76715087890625, "reward_std": 0.009723657742142677, "rewards//mean": 0.76715087890625, "rewards//std": 0.03365049511194229, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2248, "grad_norm": 2.621422052383423, "kl": 2.6404803469777107, "learning_rate": 4.442365546151506e-06, "loss": 0.1056, "num_tokens": 9717040.0, "reward": 0.759765625, "reward_std": 0.010206403210759163, "rewards//mean": 0.759765625, "rewards//std": 0.023198600858449936, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.225, "grad_norm": 5.458611011505127, "kl": 3.976851686835289, "learning_rate": 4.441366244431494e-06, "loss": 0.1591, "num_tokens": 9725680.0, "reward": 0.73394775390625, "reward_std": 0.006910150405019522, "rewards//mean": 0.73394775390625, "rewards//std": 0.032489653676748276, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2252, "grad_norm": 4.215648651123047, "kl": 3.9354925006628036, "learning_rate": 4.440366160729393e-06, "loss": 0.1574, "num_tokens": 9734240.0, "reward": 0.76776123046875, "reward_std": 0.010620087385177612, "rewards//mean": 0.76776123046875, "rewards//std": 0.01793353259563446, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2254, "grad_norm": 5.209417343139648, "kl": 2.7744726836681366, "learning_rate": 4.439365295448032e-06, "loss": 0.111, "num_tokens": 9742928.0, "reward": 0.72320556640625, "reward_std": 0.008274361491203308, "rewards//mean": 0.72320556640625, "rewards//std": 0.03745364397764206, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2256, "grad_norm": 4.9473557472229, "kl": 3.558548718690872, "learning_rate": 4.438363648990564e-06, "loss": 0.1423, "num_tokens": 9751496.0, "reward": 0.7760009765625, "reward_std": 0.014670850709080696, "rewards//mean": 0.7760009765625, "rewards//std": 0.026650426909327507, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2258, "grad_norm": 3.0841500759124756, "kl": 4.412683874368668, "learning_rate": 4.437361221760449e-06, "loss": 0.1765, "num_tokens": 9760136.0, "reward": 0.7666015625, "reward_std": 0.01099938340485096, "rewards//mean": 0.7666015625, "rewards//std": 0.026621157303452492, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.226, "grad_norm": 4.618311882019043, "kl": 4.27890357375145, "learning_rate": 4.436358014161466e-06, "loss": 0.1712, "num_tokens": 9768864.0, "reward": 0.7708740234375, "reward_std": 0.0102059505879879, "rewards//mean": 0.7708740234375, "rewards//std": 0.024609215557575226, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2262, "grad_norm": 5.920929908752441, "kl": 3.99948251247406, "learning_rate": 4.435354026597707e-06, "loss": 0.16, "num_tokens": 9777504.0, "reward": 0.72979736328125, "reward_std": 0.010434305295348167, "rewards//mean": 0.72979736328125, "rewards//std": 0.03874845802783966, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2264, "grad_norm": 3.771116256713867, "kl": 4.644639670848846, "learning_rate": 4.434349259473576e-06, "loss": 0.1858, "num_tokens": 9786088.0, "reward": 0.737548828125, "reward_std": 0.012446350418031216, "rewards//mean": 0.737548828125, "rewards//std": 0.028543131425976753, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2266, "grad_norm": 2.684006690979004, "kl": 4.054437205195427, "learning_rate": 4.433343713193796e-06, "loss": 0.1622, "num_tokens": 9794704.0, "reward": 0.7557373046875, "reward_std": 0.012024207971990108, "rewards//mean": 0.7557373046875, "rewards//std": 0.033530451357364655, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2268, "grad_norm": 3.2405855655670166, "kl": 4.336524769663811, "learning_rate": 4.432337388163399e-06, "loss": 0.1735, "num_tokens": 9803360.0, "reward": 0.79571533203125, "reward_std": 0.010724304243922234, "rewards//mean": 0.79571533203125, "rewards//std": 0.019446488469839096, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.227, "grad_norm": 4.249029159545898, "kl": 2.5489603132009506, "learning_rate": 4.431330284787733e-06, "loss": 0.102, "num_tokens": 9812040.0, "reward": 0.76373291015625, "reward_std": 0.011748848482966423, "rewards//mean": 0.76373291015625, "rewards//std": 0.02632833458483219, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2272, "grad_norm": 4.823026657104492, "kl": 3.880403060466051, "learning_rate": 4.430322403472459e-06, "loss": 0.1552, "num_tokens": 9820664.0, "reward": 0.72265625, "reward_std": 0.008320711553096771, "rewards//mean": 0.72265625, "rewards//std": 0.029048843309283257, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2274, "grad_norm": 3.968341588973999, "kl": 3.6469134613871574, "learning_rate": 4.429313744623553e-06, "loss": 0.1459, "num_tokens": 9829264.0, "reward": 0.76092529296875, "reward_std": 0.01215219497680664, "rewards//mean": 0.76092529296875, "rewards//std": 0.0291003230959177, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2276, "grad_norm": 4.590412616729736, "kl": 3.6968624889850616, "learning_rate": 4.4283043086473e-06, "loss": 0.1479, "num_tokens": 9837976.0, "reward": 0.74310302734375, "reward_std": 0.011927234940230846, "rewards//mean": 0.74310302734375, "rewards//std": 0.031441181898117065, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2278, "grad_norm": 4.990357398986816, "kl": 2.4735485799610615, "learning_rate": 4.427294095950303e-06, "loss": 0.0989, "num_tokens": 9846624.0, "reward": 0.74005126953125, "reward_std": 0.008735140785574913, "rewards//mean": 0.74005126953125, "rewards//std": 0.021798353642225266, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.228, "grad_norm": 4.313953399658203, "kl": 2.6444992274045944, "learning_rate": 4.426283106939474e-06, "loss": 0.1058, "num_tokens": 9855208.0, "reward": 0.7606201171875, "reward_std": 0.015206810086965561, "rewards//mean": 0.7606201171875, "rewards//std": 0.032752130180597305, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2282, "grad_norm": 4.421706676483154, "kl": 1.973025131970644, "learning_rate": 4.425271342022039e-06, "loss": 0.0789, "num_tokens": 9863816.0, "reward": 0.789306640625, "reward_std": 0.011878270655870438, "rewards//mean": 0.789306640625, "rewards//std": 0.027620263397693634, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2284, "grad_norm": 3.751870632171631, "kl": 4.020584672689438, "learning_rate": 4.42425880160554e-06, "loss": 0.1608, "num_tokens": 9872392.0, "reward": 0.754638671875, "reward_std": 0.009817682206630707, "rewards//mean": 0.754638671875, "rewards//std": 0.03121025487780571, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2286, "grad_norm": 4.546318531036377, "kl": 2.2935502976179123, "learning_rate": 4.423245486097823e-06, "loss": 0.0917, "num_tokens": 9881024.0, "reward": 0.73712158203125, "reward_std": 0.007509090472012758, "rewards//mean": 0.73712158203125, "rewards//std": 0.03501252084970474, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2288, "grad_norm": 4.78199577331543, "kl": 3.1069531105458736, "learning_rate": 4.4222313959070565e-06, "loss": 0.1243, "num_tokens": 9889648.0, "reward": 0.7454833984375, "reward_std": 0.009114361368119717, "rewards//mean": 0.7454833984375, "rewards//std": 0.02207755483686924, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.229, "grad_norm": 2.1705641746520996, "kl": 2.1756571158766747, "learning_rate": 4.421216531441713e-06, "loss": 0.087, "num_tokens": 9898224.0, "reward": 0.756591796875, "reward_std": 0.00859922170639038, "rewards//mean": 0.756591796875, "rewards//std": 0.022292152047157288, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2292, "grad_norm": 4.177046298980713, "kl": 2.509240873157978, "learning_rate": 4.42020089311058e-06, "loss": 0.1004, "num_tokens": 9906824.0, "reward": 0.7294921875, "reward_std": 0.01396462693810463, "rewards//mean": 0.7294921875, "rewards//std": 0.0340610109269619, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2294, "grad_norm": 4.385286331176758, "kl": 1.938187226653099, "learning_rate": 4.419184481322757e-06, "loss": 0.0775, "num_tokens": 9915560.0, "reward": 0.74761962890625, "reward_std": 0.005977232940495014, "rewards//mean": 0.74761962890625, "rewards//std": 0.02819945476949215, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2296, "grad_norm": 4.2766337394714355, "kl": 1.8286756686866283, "learning_rate": 4.418167296487655e-06, "loss": 0.0731, "num_tokens": 9924232.0, "reward": 0.704833984375, "reward_std": 0.00689849816262722, "rewards//mean": 0.704833984375, "rewards//std": 0.03629103675484657, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2298, "grad_norm": 5.587540626525879, "kl": 3.026585966348648, "learning_rate": 4.417149339014994e-06, "loss": 0.1211, "num_tokens": 9932832.0, "reward": 0.75274658203125, "reward_std": 0.011506790295243263, "rewards//mean": 0.75274658203125, "rewards//std": 0.03763307258486748, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.23, "grad_norm": 3.560687303543091, "kl": 2.663250733166933, "learning_rate": 4.41613060931481e-06, "loss": 0.1065, "num_tokens": 9941384.0, "reward": 0.75836181640625, "reward_std": 0.009568380191922188, "rewards//mean": 0.75836181640625, "rewards//std": 0.02459591068327427, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2302, "grad_norm": 4.819980144500732, "kl": 1.7875125631690025, "learning_rate": 4.415111107797445e-06, "loss": 0.0715, "num_tokens": 9949968.0, "reward": 0.7432861328125, "reward_std": 0.007487800437957048, "rewards//mean": 0.7432861328125, "rewards//std": 0.023493947461247444, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2304, "grad_norm": 6.067734241485596, "kl": 1.6880947761237621, "learning_rate": 4.4140908348735555e-06, "loss": 0.0675, "num_tokens": 9958640.0, "reward": 0.7772216796875, "reward_std": 0.007833748124539852, "rewards//mean": 0.7772216796875, "rewards//std": 0.024339543655514717, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2306, "grad_norm": 4.260062217712402, "kl": 2.7471398562192917, "learning_rate": 4.413069790954106e-06, "loss": 0.1099, "num_tokens": 9967248.0, "reward": 0.76300048828125, "reward_std": 0.010684065520763397, "rewards//mean": 0.76300048828125, "rewards//std": 0.025315918028354645, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2308, "grad_norm": 2.4664273262023926, "kl": 2.3026996180415154, "learning_rate": 4.412047976450373e-06, "loss": 0.0921, "num_tokens": 9975864.0, "reward": 0.75982666015625, "reward_std": 0.011599559336900711, "rewards//mean": 0.75982666015625, "rewards//std": 0.03429481014609337, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.231, "grad_norm": 3.980759382247925, "kl": 2.012610200792551, "learning_rate": 4.411025391773945e-06, "loss": 0.0805, "num_tokens": 9984496.0, "reward": 0.77630615234375, "reward_std": 0.009585712105035782, "rewards//mean": 0.77630615234375, "rewards//std": 0.021868381649255753, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2312, "grad_norm": 4.991798400878906, "kl": 1.7778412401676178, "learning_rate": 4.4100020373367166e-06, "loss": 0.0711, "num_tokens": 9993096.0, "reward": 0.7716064453125, "reward_std": 0.008009029552340508, "rewards//mean": 0.7716064453125, "rewards//std": 0.022477107122540474, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2314, "grad_norm": 4.190371513366699, "kl": 3.0115476846694946, "learning_rate": 4.408977913550897e-06, "loss": 0.1205, "num_tokens": 10001696.0, "reward": 0.74127197265625, "reward_std": 0.012816842645406723, "rewards//mean": 0.74127197265625, "rewards//std": 0.03637681156396866, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2316, "grad_norm": 3.610924243927002, "kl": 2.7081813514232635, "learning_rate": 4.407953020829001e-06, "loss": 0.1083, "num_tokens": 10010312.0, "reward": 0.74725341796875, "reward_std": 0.00960319023579359, "rewards//mean": 0.74725341796875, "rewards//std": 0.027131937444210052, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2318, "grad_norm": 4.699491500854492, "kl": 2.203125298023224, "learning_rate": 4.406927359583857e-06, "loss": 0.0881, "num_tokens": 10018944.0, "reward": 0.7421875, "reward_std": 0.007760289125144482, "rewards//mean": 0.7421875, "rewards//std": 0.02530582621693611, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.232, "grad_norm": 2.1665761470794678, "kl": 1.9041131436824799, "learning_rate": 4.4059009302286e-06, "loss": 0.0762, "num_tokens": 10027520.0, "reward": 0.75537109375, "reward_std": 0.010919666849076748, "rewards//mean": 0.75537109375, "rewards//std": 0.031730711460113525, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2322, "grad_norm": 1.8608490228652954, "kl": 1.9596595019102097, "learning_rate": 4.404873733176678e-06, "loss": 0.0784, "num_tokens": 10036120.0, "reward": 0.75701904296875, "reward_std": 0.008197091519832611, "rewards//mean": 0.75701904296875, "rewards//std": 0.027517516165971756, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2324, "grad_norm": 2.7107038497924805, "kl": 3.2588911950588226, "learning_rate": 4.403845768841842e-06, "loss": 0.1304, "num_tokens": 10044704.0, "reward": 0.73614501953125, "reward_std": 0.011148132383823395, "rewards//mean": 0.73614501953125, "rewards//std": 0.026254061609506607, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2326, "grad_norm": 5.473135471343994, "kl": 1.7970853373408318, "learning_rate": 4.402817037638159e-06, "loss": 0.0719, "num_tokens": 10053336.0, "reward": 0.74560546875, "reward_std": 0.010157153010368347, "rewards//mean": 0.74560546875, "rewards//std": 0.03228311613202095, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2328, "grad_norm": 4.408280372619629, "kl": 2.154853366315365, "learning_rate": 4.40178753998e-06, "loss": 0.0862, "num_tokens": 10061936.0, "reward": 0.7889404296875, "reward_std": 0.006923479959368706, "rewards//mean": 0.7889404296875, "rewards//std": 0.0208213422447443, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.233, "grad_norm": 3.6861178874969482, "kl": 2.0424507558345795, "learning_rate": 4.400757276282048e-06, "loss": 0.0817, "num_tokens": 10070680.0, "reward": 0.76171875, "reward_std": 0.009181361645460129, "rewards//mean": 0.76171875, "rewards//std": 0.029487434774637222, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2332, "grad_norm": 4.863555431365967, "kl": 2.720115792006254, "learning_rate": 4.399726246959293e-06, "loss": 0.1088, "num_tokens": 10079240.0, "reward": 0.74169921875, "reward_std": 0.010839330032467842, "rewards//mean": 0.74169921875, "rewards//std": 0.03312736004590988, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2334, "grad_norm": 3.3969473838806152, "kl": 2.498805146664381, "learning_rate": 4.398694452427032e-06, "loss": 0.1, "num_tokens": 10087896.0, "reward": 0.7501220703125, "reward_std": 0.0076134358532726765, "rewards//mean": 0.7501220703125, "rewards//std": 0.025515221059322357, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2336, "grad_norm": 3.243971347808838, "kl": 1.5546209067106247, "learning_rate": 4.397661893100873e-06, "loss": 0.0622, "num_tokens": 10096480.0, "reward": 0.7674560546875, "reward_std": 0.007484575733542442, "rewards//mean": 0.7674560546875, "rewards//std": 0.03198515996336937, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2338, "grad_norm": 3.015559196472168, "kl": 1.6005559861660004, "learning_rate": 4.39662856939673e-06, "loss": 0.064, "num_tokens": 10105048.0, "reward": 0.79119873046875, "reward_std": 0.009435184299945831, "rewards//mean": 0.79119873046875, "rewards//std": 0.024899912998080254, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.234, "grad_norm": 5.621588706970215, "kl": 3.0888603813946247, "learning_rate": 4.3955944817308265e-06, "loss": 0.1236, "num_tokens": 10113640.0, "reward": 0.724365234375, "reward_std": 0.009651388972997665, "rewards//mean": 0.724365234375, "rewards//std": 0.02690059505403042, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2342, "grad_norm": 5.072406768798828, "kl": 3.304726105183363, "learning_rate": 4.3945596305196925e-06, "loss": 0.1322, "num_tokens": 10122320.0, "reward": 0.78167724609375, "reward_std": 0.014596269465982914, "rewards//mean": 0.78167724609375, "rewards//std": 0.029936732724308968, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2344, "grad_norm": 3.8299102783203125, "kl": 3.773449346423149, "learning_rate": 4.393524016180166e-06, "loss": 0.1509, "num_tokens": 10131064.0, "reward": 0.7257080078125, "reward_std": 0.011931387707591057, "rewards//mean": 0.7257080078125, "rewards//std": 0.014722654595971107, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2346, "grad_norm": 4.724763870239258, "kl": 2.916387215256691, "learning_rate": 4.3924876391293915e-06, "loss": 0.1167, "num_tokens": 10139696.0, "reward": 0.7247314453125, "reward_std": 0.010187920182943344, "rewards//mean": 0.7247314453125, "rewards//std": 0.03517325595021248, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2348, "grad_norm": 105.1701889038086, "kl": 2.793856855481863, "learning_rate": 4.391450499784823e-06, "loss": 0.1118, "num_tokens": 10148320.0, "reward": 0.74896240234375, "reward_std": 0.009012022987008095, "rewards//mean": 0.74896240234375, "rewards//std": 0.028341885656118393, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.235, "grad_norm": 3.390784740447998, "kl": 3.590435441583395, "learning_rate": 4.3904125985642185e-06, "loss": 0.1436, "num_tokens": 10156992.0, "reward": 0.780029296875, "reward_std": 0.009464158676564693, "rewards//mean": 0.780029296875, "rewards//std": 0.03168009966611862, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2352, "grad_norm": 2.0135703086853027, "kl": 2.7865120209753513, "learning_rate": 4.3893739358856465e-06, "loss": 0.1115, "num_tokens": 10165576.0, "reward": 0.76513671875, "reward_std": 0.007272062823176384, "rewards//mean": 0.76513671875, "rewards//std": 0.01552391704171896, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2354, "grad_norm": 6.018725872039795, "kl": 3.294798519462347, "learning_rate": 4.388334512167478e-06, "loss": 0.1318, "num_tokens": 10174256.0, "reward": 0.76361083984375, "reward_std": 0.00906122662127018, "rewards//mean": 0.76361083984375, "rewards//std": 0.019237525761127472, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2356, "grad_norm": 5.187139511108398, "kl": 2.524987954646349, "learning_rate": 4.387294327828394e-06, "loss": 0.101, "num_tokens": 10182920.0, "reward": 0.7310791015625, "reward_std": 0.009908773936331272, "rewards//mean": 0.7310791015625, "rewards//std": 0.03632001578807831, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2358, "grad_norm": 2.9680135250091553, "kl": 3.295103371143341, "learning_rate": 4.386253383287381e-06, "loss": 0.1318, "num_tokens": 10191600.0, "reward": 0.755126953125, "reward_std": 0.013424674049019814, "rewards//mean": 0.755126953125, "rewards//std": 0.030984386801719666, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.236, "grad_norm": 5.012116432189941, "kl": 4.495564699172974, "learning_rate": 4.385211678963731e-06, "loss": 0.1798, "num_tokens": 10200344.0, "reward": 0.72955322265625, "reward_std": 0.010548807680606842, "rewards//mean": 0.72955322265625, "rewards//std": 0.027811355888843536, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2362, "grad_norm": 6.391460418701172, "kl": 3.6062071919441223, "learning_rate": 4.384169215277042e-06, "loss": 0.1442, "num_tokens": 10208960.0, "reward": 0.7391357421875, "reward_std": 0.008173111826181412, "rewards//mean": 0.7391357421875, "rewards//std": 0.04144793003797531, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2364, "grad_norm": 2.744133472442627, "kl": 3.9456580877304077, "learning_rate": 4.383125992647218e-06, "loss": 0.1578, "num_tokens": 10217648.0, "reward": 0.7537841796875, "reward_std": 0.00853528082370758, "rewards//mean": 0.7537841796875, "rewards//std": 0.025572113692760468, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2366, "grad_norm": 4.758715629577637, "kl": 2.147137627005577, "learning_rate": 4.382082011494469e-06, "loss": 0.0859, "num_tokens": 10226272.0, "reward": 0.74664306640625, "reward_std": 0.00753501383587718, "rewards//mean": 0.74664306640625, "rewards//std": 0.026161065325140953, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2368, "grad_norm": 4.894461154937744, "kl": 5.255105219781399, "learning_rate": 4.381037272239311e-06, "loss": 0.2102, "num_tokens": 10234856.0, "reward": 0.7403564453125, "reward_std": 0.012467950582504272, "rewards//mean": 0.7403564453125, "rewards//std": 0.027882907539606094, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.237, "grad_norm": 2.9601051807403564, "kl": 2.413652427494526, "learning_rate": 4.379991775302566e-06, "loss": 0.0965, "num_tokens": 10243440.0, "reward": 0.73626708984375, "reward_std": 0.009169712662696838, "rewards//mean": 0.73626708984375, "rewards//std": 0.026766261085867882, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2372, "grad_norm": 4.733938694000244, "kl": 2.0806146636605263, "learning_rate": 4.3789455211053565e-06, "loss": 0.0832, "num_tokens": 10252096.0, "reward": 0.763671875, "reward_std": 0.01066847424954176, "rewards//mean": 0.763671875, "rewards//std": 0.02607884258031845, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2374, "grad_norm": 3.383695602416992, "kl": 3.815249115228653, "learning_rate": 4.377898510069117e-06, "loss": 0.1526, "num_tokens": 10260736.0, "reward": 0.7398681640625, "reward_std": 0.009789492934942245, "rewards//mean": 0.7398681640625, "rewards//std": 0.025569746270775795, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2376, "grad_norm": 5.162660121917725, "kl": 3.2633428424596786, "learning_rate": 4.376850742615583e-06, "loss": 0.1305, "num_tokens": 10269320.0, "reward": 0.7490234375, "reward_std": 0.012103484012186527, "rewards//mean": 0.7490234375, "rewards//std": 0.03822895511984825, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2378, "grad_norm": 1.695252537727356, "kl": 4.215300500392914, "learning_rate": 4.375802219166794e-06, "loss": 0.1686, "num_tokens": 10278008.0, "reward": 0.763916015625, "reward_std": 0.010597416199743748, "rewards//mean": 0.763916015625, "rewards//std": 0.02357017621397972, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.238, "grad_norm": 2.4450998306274414, "kl": 3.1600623950362206, "learning_rate": 4.374752940145094e-06, "loss": 0.1264, "num_tokens": 10286632.0, "reward": 0.7606201171875, "reward_std": 0.009779067710042, "rewards//mean": 0.7606201171875, "rewards//std": 0.03593958169221878, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2382, "grad_norm": 3.4994444847106934, "kl": 2.5188086219131947, "learning_rate": 4.373702905973136e-06, "loss": 0.1008, "num_tokens": 10295192.0, "reward": 0.73529052734375, "reward_std": 0.007761248853057623, "rewards//mean": 0.73529052734375, "rewards//std": 0.02162894606590271, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2384, "grad_norm": 3.1179685592651367, "kl": 3.280868347734213, "learning_rate": 4.37265211707387e-06, "loss": 0.1312, "num_tokens": 10303872.0, "reward": 0.744384765625, "reward_std": 0.010859925299882889, "rewards//mean": 0.744384765625, "rewards//std": 0.031341902911663055, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2386, "grad_norm": 2.0732879638671875, "kl": 2.526439368724823, "learning_rate": 4.371600573870556e-06, "loss": 0.1011, "num_tokens": 10312528.0, "reward": 0.7584228515625, "reward_std": 0.008850334212183952, "rewards//mean": 0.7584228515625, "rewards//std": 0.02416226826608181, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2388, "grad_norm": 4.669473171234131, "kl": 2.2776339799165726, "learning_rate": 4.370548276786753e-06, "loss": 0.0911, "num_tokens": 10321160.0, "reward": 0.75848388671875, "reward_std": 0.006054052617400885, "rewards//mean": 0.75848388671875, "rewards//std": 0.03758718818426132, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.239, "grad_norm": 2.4176149368286133, "kl": 2.397630028426647, "learning_rate": 4.36949522624633e-06, "loss": 0.0959, "num_tokens": 10329808.0, "reward": 0.76031494140625, "reward_std": 0.01437076460570097, "rewards//mean": 0.76031494140625, "rewards//std": 0.019479932263493538, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2392, "grad_norm": 3.9158265590667725, "kl": 3.1381518095731735, "learning_rate": 4.368441422673453e-06, "loss": 0.1255, "num_tokens": 10338520.0, "reward": 0.72344970703125, "reward_std": 0.0075682769529521465, "rewards//mean": 0.72344970703125, "rewards//std": 0.021801132708787918, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2394, "grad_norm": 6.184590816497803, "kl": 2.642193280160427, "learning_rate": 4.367386866492593e-06, "loss": 0.1057, "num_tokens": 10347160.0, "reward": 0.7406005859375, "reward_std": 0.007553707342594862, "rewards//mean": 0.7406005859375, "rewards//std": 0.036069076508283615, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2396, "grad_norm": 1.7824287414550781, "kl": 1.6157731600105762, "learning_rate": 4.366331558128528e-06, "loss": 0.0646, "num_tokens": 10355760.0, "reward": 0.7764892578125, "reward_std": 0.006442622281610966, "rewards//mean": 0.7764892578125, "rewards//std": 0.03328758850693703, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2398, "grad_norm": 2.978865146636963, "kl": 1.7756621427834034, "learning_rate": 4.3652754980063335e-06, "loss": 0.071, "num_tokens": 10364384.0, "reward": 0.7462158203125, "reward_std": 0.010199598968029022, "rewards//mean": 0.7462158203125, "rewards//std": 0.025410590693354607, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.24, "grad_norm": 1.4905486106872559, "kl": 1.333003405481577, "learning_rate": 4.364218686551392e-06, "loss": 0.0533, "num_tokens": 10372912.0, "reward": 0.76739501953125, "reward_std": 0.008700652047991753, "rewards//mean": 0.76739501953125, "rewards//std": 0.02604101411998272, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2402, "grad_norm": 2.886620283126831, "kl": 1.372915230691433, "learning_rate": 4.363161124189387e-06, "loss": 0.0549, "num_tokens": 10381504.0, "reward": 0.7620849609375, "reward_std": 0.008544465526938438, "rewards//mean": 0.7620849609375, "rewards//std": 0.0251495148986578, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2404, "grad_norm": 2.304074764251709, "kl": 1.5740601047873497, "learning_rate": 4.362102811346304e-06, "loss": 0.063, "num_tokens": 10390168.0, "reward": 0.7490234375, "reward_std": 0.008381001651287079, "rewards//mean": 0.7490234375, "rewards//std": 0.030089115723967552, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2406, "grad_norm": 1.8697845935821533, "kl": 2.7240917161107063, "learning_rate": 4.36104374844843e-06, "loss": 0.109, "num_tokens": 10398736.0, "reward": 0.76446533203125, "reward_std": 0.009950193576514721, "rewards//mean": 0.76446533203125, "rewards//std": 0.019056478515267372, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2408, "grad_norm": 3.6496565341949463, "kl": 0.7774477414786816, "learning_rate": 4.3599839359223575e-06, "loss": 0.0311, "num_tokens": 10407320.0, "reward": 0.69573974609375, "reward_std": 0.005646621808409691, "rewards//mean": 0.69573974609375, "rewards//std": 0.035441804677248, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.241, "grad_norm": 3.0308477878570557, "kl": 0.9583526477217674, "learning_rate": 4.358923374194978e-06, "loss": 0.0383, "num_tokens": 10415912.0, "reward": 0.71502685546875, "reward_std": 0.006192350294440985, "rewards//mean": 0.71502685546875, "rewards//std": 0.035286422818899155, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2412, "grad_norm": 2.3300673961639404, "kl": 1.3085511550307274, "learning_rate": 4.357862063693486e-06, "loss": 0.0523, "num_tokens": 10424592.0, "reward": 0.7635498046875, "reward_std": 0.006916014943271875, "rewards//mean": 0.7635498046875, "rewards//std": 0.03164256736636162, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2414, "grad_norm": 6.495100498199463, "kl": 2.5806922614574432, "learning_rate": 4.356800004845376e-06, "loss": 0.1032, "num_tokens": 10433208.0, "reward": 0.75531005859375, "reward_std": 0.008886190131306648, "rewards//mean": 0.75531005859375, "rewards//std": 0.03212417662143707, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2416, "grad_norm": 2.3843295574188232, "kl": 2.7375226244330406, "learning_rate": 4.355737198078447e-06, "loss": 0.1095, "num_tokens": 10441728.0, "reward": 0.74957275390625, "reward_std": 0.012769874185323715, "rewards//mean": 0.74957275390625, "rewards//std": 0.032084569334983826, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2418, "grad_norm": 3.239755868911743, "kl": 1.5005339868366718, "learning_rate": 4.354673643820796e-06, "loss": 0.06, "num_tokens": 10450336.0, "reward": 0.75830078125, "reward_std": 0.010455173440277576, "rewards//mean": 0.75830078125, "rewards//std": 0.029152879491448402, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.242, "grad_norm": 4.127530574798584, "kl": 1.264933981001377, "learning_rate": 4.353609342500824e-06, "loss": 0.0506, "num_tokens": 10458856.0, "reward": 0.74560546875, "reward_std": 0.009353343397378922, "rewards//mean": 0.74560546875, "rewards//std": 0.02658018469810486, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2422, "grad_norm": 3.013749837875366, "kl": 2.1643280014395714, "learning_rate": 4.352544294547229e-06, "loss": 0.0866, "num_tokens": 10467424.0, "reward": 0.7274169921875, "reward_std": 0.009314079768955708, "rewards//mean": 0.7274169921875, "rewards//std": 0.035403184592723846, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2424, "grad_norm": 2.6905953884124756, "kl": 1.7121267318725586, "learning_rate": 4.351478500389014e-06, "loss": 0.0685, "num_tokens": 10476160.0, "reward": 0.76739501953125, "reward_std": 0.0066375043243169785, "rewards//mean": 0.76739501953125, "rewards//std": 0.028322117403149605, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2426, "grad_norm": 3.789945125579834, "kl": 2.449982389807701, "learning_rate": 4.350411960455482e-06, "loss": 0.098, "num_tokens": 10484800.0, "reward": 0.73846435546875, "reward_std": 0.00773600023239851, "rewards//mean": 0.73846435546875, "rewards//std": 0.02766893059015274, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2428, "grad_norm": 2.2612009048461914, "kl": 1.3458568677306175, "learning_rate": 4.349344675176232e-06, "loss": 0.0538, "num_tokens": 10493448.0, "reward": 0.7857666015625, "reward_std": 0.0043028173968195915, "rewards//mean": 0.7857666015625, "rewards//std": 0.018858950585126877, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.243, "grad_norm": 1.4177792072296143, "kl": 1.9706830494105816, "learning_rate": 4.348276644981169e-06, "loss": 0.0788, "num_tokens": 10502088.0, "reward": 0.775146484375, "reward_std": 0.008794134482741356, "rewards//mean": 0.775146484375, "rewards//std": 0.022226866334676743, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2432, "grad_norm": 3.255463123321533, "kl": 0.8058740049600601, "learning_rate": 4.347207870300495e-06, "loss": 0.0322, "num_tokens": 10510608.0, "reward": 0.767578125, "reward_std": 0.004588867072016001, "rewards//mean": 0.767578125, "rewards//std": 0.013351457193493843, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2434, "grad_norm": 2.18398380279541, "kl": 2.623698953539133, "learning_rate": 4.346138351564711e-06, "loss": 0.1049, "num_tokens": 10519232.0, "reward": 0.75537109375, "reward_std": 0.00650961697101593, "rewards//mean": 0.75537109375, "rewards//std": 0.01992412470281124, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2436, "grad_norm": 3.0979387760162354, "kl": 1.6793304719030857, "learning_rate": 4.3450680892046185e-06, "loss": 0.0672, "num_tokens": 10527872.0, "reward": 0.7581787109375, "reward_std": 0.008162347599864006, "rewards//mean": 0.7581787109375, "rewards//std": 0.024712340906262398, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2438, "grad_norm": 5.948206424713135, "kl": 3.9238513708114624, "learning_rate": 4.343997083651321e-06, "loss": 0.157, "num_tokens": 10536560.0, "reward": 0.71429443359375, "reward_std": 0.0079739298671484, "rewards//mean": 0.71429443359375, "rewards//std": 0.02637314423918724, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.244, "grad_norm": 3.663465738296509, "kl": 3.7845727428793907, "learning_rate": 4.342925335336219e-06, "loss": 0.1514, "num_tokens": 10545168.0, "reward": 0.760986328125, "reward_std": 0.007018606178462505, "rewards//mean": 0.760986328125, "rewards//std": 0.028914082795381546, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2442, "grad_norm": 5.352837562561035, "kl": 3.409624829888344, "learning_rate": 4.341852844691012e-06, "loss": 0.1364, "num_tokens": 10553912.0, "reward": 0.7237548828125, "reward_std": 0.009475414641201496, "rewards//mean": 0.7237548828125, "rewards//std": 0.014186456799507141, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2444, "grad_norm": 3.4909679889678955, "kl": 2.078166324645281, "learning_rate": 4.340779612147701e-06, "loss": 0.0831, "num_tokens": 10562520.0, "reward": 0.7203369140625, "reward_std": 0.008686388842761517, "rewards//mean": 0.7203369140625, "rewards//std": 0.03771689534187317, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2446, "grad_norm": 5.076694965362549, "kl": 2.995825830847025, "learning_rate": 4.33970563813858e-06, "loss": 0.1198, "num_tokens": 10571288.0, "reward": 0.77532958984375, "reward_std": 0.00858029443770647, "rewards//mean": 0.77532958984375, "rewards//std": 0.023156721144914627, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2448, "grad_norm": 2.4503300189971924, "kl": 4.274348825216293, "learning_rate": 4.33863092309625e-06, "loss": 0.171, "num_tokens": 10579888.0, "reward": 0.7513427734375, "reward_std": 0.010105829685926437, "rewards//mean": 0.7513427734375, "rewards//std": 0.023481056094169617, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.245, "grad_norm": 4.700718402862549, "kl": 5.662655092775822, "learning_rate": 4.337555467453603e-06, "loss": 0.2265, "num_tokens": 10588520.0, "reward": 0.7791748046875, "reward_std": 0.012307170778512955, "rewards//mean": 0.7791748046875, "rewards//std": 0.027813328430056572, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2452, "grad_norm": 2.5564656257629395, "kl": 2.2768448293209076, "learning_rate": 4.336479271643833e-06, "loss": 0.0911, "num_tokens": 10597080.0, "reward": 0.76702880859375, "reward_std": 0.009477036073803902, "rewards//mean": 0.76702880859375, "rewards//std": 0.02174760028719902, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2454, "grad_norm": 4.391211986541748, "kl": 3.2486671209335327, "learning_rate": 4.335402336100433e-06, "loss": 0.1299, "num_tokens": 10605704.0, "reward": 0.73675537109375, "reward_std": 0.012329833582043648, "rewards//mean": 0.73675537109375, "rewards//std": 0.02734478935599327, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2456, "grad_norm": 4.762077331542969, "kl": 4.207510143518448, "learning_rate": 4.334324661257191e-06, "loss": 0.1683, "num_tokens": 10614352.0, "reward": 0.72943115234375, "reward_std": 0.008869737386703491, "rewards//mean": 0.72943115234375, "rewards//std": 0.03338765725493431, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2458, "grad_norm": 4.046385288238525, "kl": 4.122078999876976, "learning_rate": 4.3332462475481955e-06, "loss": 0.1649, "num_tokens": 10622968.0, "reward": 0.74481201171875, "reward_std": 0.009776229038834572, "rewards//mean": 0.74481201171875, "rewards//std": 0.031466685235500336, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.246, "grad_norm": 1.7352546453475952, "kl": 5.99093770980835, "learning_rate": 4.33216709540783e-06, "loss": 0.2396, "num_tokens": 10631624.0, "reward": 0.74407958984375, "reward_std": 0.015540502034127712, "rewards//mean": 0.74407958984375, "rewards//std": 0.03540676459670067, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2462, "grad_norm": 2.2929062843322754, "kl": 3.6688244491815567, "learning_rate": 4.331087205270778e-06, "loss": 0.1468, "num_tokens": 10640224.0, "reward": 0.7261962890625, "reward_std": 0.006909031420946121, "rewards//mean": 0.7261962890625, "rewards//std": 0.029917320236563683, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2464, "grad_norm": 4.927531719207764, "kl": 3.9019454047083855, "learning_rate": 4.330006577572018e-06, "loss": 0.1561, "num_tokens": 10648752.0, "reward": 0.72509765625, "reward_std": 0.016851941123604774, "rewards//mean": 0.72509765625, "rewards//std": 0.03653465211391449, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2466, "grad_norm": 4.874295711517334, "kl": 5.441168129444122, "learning_rate": 4.328925212746828e-06, "loss": 0.2176, "num_tokens": 10657368.0, "reward": 0.73565673828125, "reward_std": 0.011625098064541817, "rewards//mean": 0.73565673828125, "rewards//std": 0.0374419204890728, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2468, "grad_norm": 2.493762254714966, "kl": 3.567030318081379, "learning_rate": 4.3278431112307806e-06, "loss": 0.1427, "num_tokens": 10666032.0, "reward": 0.75701904296875, "reward_std": 0.009042746387422085, "rewards//mean": 0.75701904296875, "rewards//std": 0.024795126169919968, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.247, "grad_norm": 8.09304428100586, "kl": 1.560712344944477, "learning_rate": 4.326760273459747e-06, "loss": 0.0624, "num_tokens": 10674592.0, "reward": 0.7799072265625, "reward_std": 0.008474647998809814, "rewards//mean": 0.7799072265625, "rewards//std": 0.03750115633010864, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2472, "grad_norm": 2.4928390979766846, "kl": 3.1284263506531715, "learning_rate": 4.325676699869894e-06, "loss": 0.1251, "num_tokens": 10683232.0, "reward": 0.73699951171875, "reward_std": 0.008873146958649158, "rewards//mean": 0.73699951171875, "rewards//std": 0.028485199436545372, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2474, "grad_norm": 3.692227602005005, "kl": 3.018490541726351, "learning_rate": 4.324592390897684e-06, "loss": 0.1207, "num_tokens": 10691880.0, "reward": 0.775390625, "reward_std": 0.013029628433287144, "rewards//mean": 0.775390625, "rewards//std": 0.02277715690433979, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2476, "grad_norm": 1.8288943767547607, "kl": 3.9356056600809097, "learning_rate": 4.323507346979877e-06, "loss": 0.1574, "num_tokens": 10700504.0, "reward": 0.73504638671875, "reward_std": 0.01615409553050995, "rewards//mean": 0.73504638671875, "rewards//std": 0.03200473263859749, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2478, "grad_norm": 3.646533727645874, "kl": 2.190553665161133, "learning_rate": 4.322421568553529e-06, "loss": 0.0876, "num_tokens": 10709176.0, "reward": 0.77142333984375, "reward_std": 0.009891998954117298, "rewards//mean": 0.77142333984375, "rewards//std": 0.022883163765072823, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.248, "grad_norm": 2.136007070541382, "kl": 3.3589197658002377, "learning_rate": 4.321335056055991e-06, "loss": 0.1344, "num_tokens": 10717912.0, "reward": 0.75238037109375, "reward_std": 0.011641919612884521, "rewards//mean": 0.75238037109375, "rewards//std": 0.033047694712877274, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2482, "grad_norm": 2.3114218711853027, "kl": 2.9506953209638596, "learning_rate": 4.320247809924911e-06, "loss": 0.118, "num_tokens": 10726616.0, "reward": 0.75335693359375, "reward_std": 0.007081642281264067, "rewards//mean": 0.75335693359375, "rewards//std": 0.021540580317378044, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2484, "grad_norm": 2.7048633098602295, "kl": 2.2952828258275986, "learning_rate": 4.31915983059823e-06, "loss": 0.0918, "num_tokens": 10735200.0, "reward": 0.7706298828125, "reward_std": 0.008656525053083897, "rewards//mean": 0.7706298828125, "rewards//std": 0.021826548501849174, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2486, "grad_norm": 5.047454833984375, "kl": 3.8681520149111748, "learning_rate": 4.318071118514187e-06, "loss": 0.1547, "num_tokens": 10743880.0, "reward": 0.73712158203125, "reward_std": 0.00912274420261383, "rewards//mean": 0.73712158203125, "rewards//std": 0.03010263293981552, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2488, "grad_norm": 3.9927420616149902, "kl": 5.105262935161591, "learning_rate": 4.316981674111314e-06, "loss": 0.2042, "num_tokens": 10752560.0, "reward": 0.738525390625, "reward_std": 0.010131753981113434, "rewards//mean": 0.738525390625, "rewards//std": 0.03132644295692444, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.249, "grad_norm": 5.0731024742126465, "kl": 3.1860557794570923, "learning_rate": 4.315891497828442e-06, "loss": 0.1274, "num_tokens": 10761192.0, "reward": 0.75299072265625, "reward_std": 0.00944304745644331, "rewards//mean": 0.75299072265625, "rewards//std": 0.02728382684290409, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2492, "grad_norm": 2.1138155460357666, "kl": 5.489202938973904, "learning_rate": 4.314800590104691e-06, "loss": 0.2196, "num_tokens": 10769904.0, "reward": 0.71630859375, "reward_std": 0.01134289987385273, "rewards//mean": 0.71630859375, "rewards//std": 0.0380670540034771, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2494, "grad_norm": 4.446576118469238, "kl": 5.278612971305847, "learning_rate": 4.313708951379478e-06, "loss": 0.2111, "num_tokens": 10778632.0, "reward": 0.776611328125, "reward_std": 0.0145424734801054, "rewards//mean": 0.776611328125, "rewards//std": 0.026999453082680702, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2496, "grad_norm": 2.254894256591797, "kl": 3.4511141404509544, "learning_rate": 4.312616582092517e-06, "loss": 0.138, "num_tokens": 10787216.0, "reward": 0.75140380859375, "reward_std": 0.009194135665893555, "rewards//mean": 0.75140380859375, "rewards//std": 0.023952340707182884, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2498, "grad_norm": 5.996357440948486, "kl": 3.3174730129539967, "learning_rate": 4.311523482683815e-06, "loss": 0.1327, "num_tokens": 10795792.0, "reward": 0.7266845703125, "reward_std": 0.008818934671580791, "rewards//mean": 0.7266845703125, "rewards//std": 0.025581583380699158, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.25, "grad_norm": 3.19474720954895, "kl": 4.030950590968132, "learning_rate": 4.3104296535936695e-06, "loss": 0.1612, "num_tokens": 10804400.0, "reward": 0.74249267578125, "reward_std": 0.008821751922369003, "rewards//mean": 0.74249267578125, "rewards//std": 0.02215515449643135, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2502, "grad_norm": 3.826550245285034, "kl": 4.042018711566925, "learning_rate": 4.309335095262675e-06, "loss": 0.1617, "num_tokens": 10813208.0, "reward": 0.75897216796875, "reward_std": 0.008911153301596642, "rewards//mean": 0.75897216796875, "rewards//std": 0.026618242263793945, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2504, "grad_norm": 3.967430591583252, "kl": 3.3702562004327774, "learning_rate": 4.308239808131722e-06, "loss": 0.1348, "num_tokens": 10821848.0, "reward": 0.75384521484375, "reward_std": 0.011883918195962906, "rewards//mean": 0.75384521484375, "rewards//std": 0.028539882972836494, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2506, "grad_norm": 7.846375942230225, "kl": 4.817364752292633, "learning_rate": 4.30714379264199e-06, "loss": 0.1927, "num_tokens": 10830376.0, "reward": 0.748291015625, "reward_std": 0.01021143514662981, "rewards//mean": 0.748291015625, "rewards//std": 0.01640942320227623, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2508, "grad_norm": 3.820600748062134, "kl": 5.2034651935100555, "learning_rate": 4.306047049234955e-06, "loss": 0.2081, "num_tokens": 10838992.0, "reward": 0.7554931640625, "reward_std": 0.012208238244056702, "rewards//mean": 0.7554931640625, "rewards//std": 0.027653949335217476, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.251, "grad_norm": 3.7934939861297607, "kl": 5.430062413215637, "learning_rate": 4.3049495783523845e-06, "loss": 0.2172, "num_tokens": 10847728.0, "reward": 0.74920654296875, "reward_std": 0.010621393099427223, "rewards//mean": 0.74920654296875, "rewards//std": 0.026485977694392204, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2512, "grad_norm": 2.1324431896209717, "kl": 3.6935007870197296, "learning_rate": 4.3038513804363395e-06, "loss": 0.1477, "num_tokens": 10856544.0, "reward": 0.7738037109375, "reward_std": 0.011837621219456196, "rewards//mean": 0.7738037109375, "rewards//std": 0.041437700390815735, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2514, "grad_norm": 2.981853723526001, "kl": 4.043869718909264, "learning_rate": 4.302752455929174e-06, "loss": 0.1618, "num_tokens": 10865104.0, "reward": 0.74444580078125, "reward_std": 0.009245900437235832, "rewards//mean": 0.74444580078125, "rewards//std": 0.021741336211562157, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2516, "grad_norm": 3.5046322345733643, "kl": 2.84112012386322, "learning_rate": 4.301652805273535e-06, "loss": 0.1136, "num_tokens": 10873816.0, "reward": 0.76141357421875, "reward_std": 0.013442445546388626, "rewards//mean": 0.76141357421875, "rewards//std": 0.03635391592979431, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2518, "grad_norm": 1.9193782806396484, "kl": 3.2310542166233063, "learning_rate": 4.300552428912361e-06, "loss": 0.1292, "num_tokens": 10882440.0, "reward": 0.78790283203125, "reward_std": 0.009786102920770645, "rewards//mean": 0.78790283203125, "rewards//std": 0.03445466607809067, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.252, "grad_norm": 3.8148343563079834, "kl": 2.8285378366708755, "learning_rate": 4.299451327288884e-06, "loss": 0.1131, "num_tokens": 10891048.0, "reward": 0.76531982421875, "reward_std": 0.01002509519457817, "rewards//mean": 0.76531982421875, "rewards//std": 0.019316839054226875, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2522, "grad_norm": 1.9809776544570923, "kl": 3.711024224758148, "learning_rate": 4.2983495008466285e-06, "loss": 0.1484, "num_tokens": 10899720.0, "reward": 0.75579833984375, "reward_std": 0.00950118899345398, "rewards//mean": 0.75579833984375, "rewards//std": 0.031000932678580284, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2524, "grad_norm": 4.752071380615234, "kl": 2.5392694026231766, "learning_rate": 4.2972469500294085e-06, "loss": 0.1016, "num_tokens": 10908304.0, "reward": 0.767333984375, "reward_std": 0.011380651965737343, "rewards//mean": 0.767333984375, "rewards//std": 0.03704405203461647, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2526, "grad_norm": 2.9224507808685303, "kl": 3.6532381176948547, "learning_rate": 4.296143675281332e-06, "loss": 0.1461, "num_tokens": 10916880.0, "reward": 0.7830810546875, "reward_std": 0.014666005969047546, "rewards//mean": 0.7830810546875, "rewards//std": 0.02305164374411106, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2528, "grad_norm": 3.125930070877075, "kl": 2.9229678958654404, "learning_rate": 4.295039677046797e-06, "loss": 0.1169, "num_tokens": 10925520.0, "reward": 0.7720947265625, "reward_std": 0.015708722174167633, "rewards//mean": 0.7720947265625, "rewards//std": 0.02241506241261959, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.253, "grad_norm": 4.279654026031494, "kl": 1.866917934268713, "learning_rate": 4.293934955770496e-06, "loss": 0.0747, "num_tokens": 10934168.0, "reward": 0.73797607421875, "reward_std": 0.009853868745267391, "rewards//mean": 0.73797607421875, "rewards//std": 0.029637403786182404, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2532, "grad_norm": 3.183422803878784, "kl": 3.347011998295784, "learning_rate": 4.292829511897409e-06, "loss": 0.1339, "num_tokens": 10942832.0, "reward": 0.7520751953125, "reward_std": 0.012213675305247307, "rewards//mean": 0.7520751953125, "rewards//std": 0.03148910775780678, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2534, "grad_norm": 3.4027044773101807, "kl": 3.873841166496277, "learning_rate": 4.291723345872809e-06, "loss": 0.155, "num_tokens": 10951568.0, "reward": 0.7808837890625, "reward_std": 0.015410392545163631, "rewards//mean": 0.7808837890625, "rewards//std": 0.021895792335271835, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2536, "grad_norm": 3.050121545791626, "kl": 3.218788832426071, "learning_rate": 4.2906164581422594e-06, "loss": 0.1288, "num_tokens": 10960248.0, "reward": 0.759521484375, "reward_std": 0.011763526126742363, "rewards//mean": 0.759521484375, "rewards//std": 0.027532432228326797, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2538, "grad_norm": 4.074219226837158, "kl": 2.9686827957630157, "learning_rate": 4.289508849151614e-06, "loss": 0.1187, "num_tokens": 10968904.0, "reward": 0.747802734375, "reward_std": 0.011190950870513916, "rewards//mean": 0.747802734375, "rewards//std": 0.030819794163107872, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.254, "grad_norm": 5.134130001068115, "kl": 2.000323750078678, "learning_rate": 4.28840051934702e-06, "loss": 0.08, "num_tokens": 10977560.0, "reward": 0.76239013671875, "reward_std": 0.008431188762187958, "rewards//mean": 0.76239013671875, "rewards//std": 0.02893652208149433, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2542, "grad_norm": 5.8045172691345215, "kl": 3.0239599496126175, "learning_rate": 4.287291469174909e-06, "loss": 0.121, "num_tokens": 10986264.0, "reward": 0.75628662109375, "reward_std": 0.01057804562151432, "rewards//mean": 0.75628662109375, "rewards//std": 0.020239122211933136, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2544, "grad_norm": 7.345980644226074, "kl": 2.68732538446784, "learning_rate": 4.286181699082008e-06, "loss": 0.1075, "num_tokens": 10994872.0, "reward": 0.74957275390625, "reward_std": 0.013684505596756935, "rewards//mean": 0.74957275390625, "rewards//std": 0.02954583615064621, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2546, "grad_norm": 3.0569097995758057, "kl": 2.9385998994112015, "learning_rate": 4.285071209515334e-06, "loss": 0.1175, "num_tokens": 11003560.0, "reward": 0.76019287109375, "reward_std": 0.010790304280817509, "rewards//mean": 0.76019287109375, "rewards//std": 0.03445422649383545, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2548, "grad_norm": 4.791436672210693, "kl": 3.4138810336589813, "learning_rate": 4.283960000922188e-06, "loss": 0.1366, "num_tokens": 11012160.0, "reward": 0.74920654296875, "reward_std": 0.00981982797384262, "rewards//mean": 0.74920654296875, "rewards//std": 0.026613691821694374, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.255, "grad_norm": 3.7833430767059326, "kl": 3.627719782292843, "learning_rate": 4.282848073750169e-06, "loss": 0.1451, "num_tokens": 11020816.0, "reward": 0.74530029296875, "reward_std": 0.01018170453608036, "rewards//mean": 0.74530029296875, "rewards//std": 0.03128237649798393, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2552, "grad_norm": 4.855155944824219, "kl": 2.824738249182701, "learning_rate": 4.281735428447158e-06, "loss": 0.113, "num_tokens": 11029440.0, "reward": 0.7608642578125, "reward_std": 0.00786005612462759, "rewards//mean": 0.7608642578125, "rewards//std": 0.019306354224681854, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2554, "grad_norm": 2.9816348552703857, "kl": 3.4403585828840733, "learning_rate": 4.280622065461329e-06, "loss": 0.1376, "num_tokens": 11038104.0, "reward": 0.7349853515625, "reward_std": 0.008074142970144749, "rewards//mean": 0.7349853515625, "rewards//std": 0.029742753133177757, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2556, "grad_norm": 2.146547794342041, "kl": 4.239322081208229, "learning_rate": 4.279507985241146e-06, "loss": 0.1696, "num_tokens": 11046760.0, "reward": 0.73681640625, "reward_std": 0.011835831217467785, "rewards//mean": 0.73681640625, "rewards//std": 0.033972010016441345, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2558, "grad_norm": 2.9984006881713867, "kl": 5.082880079746246, "learning_rate": 4.278393188235359e-06, "loss": 0.2033, "num_tokens": 11055496.0, "reward": 0.760498046875, "reward_std": 0.014315815642476082, "rewards//mean": 0.760498046875, "rewards//std": 0.029967118054628372, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.256, "grad_norm": 2.648458480834961, "kl": 5.921925187110901, "learning_rate": 4.277277674893008e-06, "loss": 0.2369, "num_tokens": 11064112.0, "reward": 0.7852783203125, "reward_std": 0.017140207812190056, "rewards//mean": 0.7852783203125, "rewards//std": 0.0324363186955452, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2562, "grad_norm": 2.9077279567718506, "kl": 3.548160642385483, "learning_rate": 4.276161445663423e-06, "loss": 0.1419, "num_tokens": 11072752.0, "reward": 0.7635498046875, "reward_std": 0.01120435819029808, "rewards//mean": 0.7635498046875, "rewards//std": 0.015152337029576302, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2564, "grad_norm": 1.9369903802871704, "kl": 5.24702462553978, "learning_rate": 4.275044500996219e-06, "loss": 0.2099, "num_tokens": 11081424.0, "reward": 0.73468017578125, "reward_std": 0.012804493308067322, "rewards//mean": 0.73468017578125, "rewards//std": 0.04025525972247124, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2566, "grad_norm": 10.704317092895508, "kl": 4.317953795194626, "learning_rate": 4.273926841341303e-06, "loss": 0.1727, "num_tokens": 11089952.0, "reward": 0.76416015625, "reward_std": 0.008273369632661343, "rewards//mean": 0.76416015625, "rewards//std": 0.02051113359630108, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2568, "grad_norm": 2.8477797508239746, "kl": 1.8249661214649677, "learning_rate": 4.272808467148866e-06, "loss": 0.073, "num_tokens": 11098544.0, "reward": 0.79022216796875, "reward_std": 0.00894132535904646, "rewards//mean": 0.79022216796875, "rewards//std": 0.022027703002095222, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.257, "grad_norm": 5.987552642822266, "kl": 3.898840442299843, "learning_rate": 4.271689378869392e-06, "loss": 0.156, "num_tokens": 11107168.0, "reward": 0.760986328125, "reward_std": 0.01270859781652689, "rewards//mean": 0.760986328125, "rewards//std": 0.04118063673377037, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2572, "grad_norm": 3.328040599822998, "kl": 3.38457940146327, "learning_rate": 4.270569576953648e-06, "loss": 0.1354, "num_tokens": 11115752.0, "reward": 0.75103759765625, "reward_std": 0.00925066415220499, "rewards//mean": 0.75103759765625, "rewards//std": 0.025317711755633354, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2574, "grad_norm": 4.253782272338867, "kl": 4.360312137752771, "learning_rate": 4.26944906185269e-06, "loss": 0.1744, "num_tokens": 11124360.0, "reward": 0.77117919921875, "reward_std": 0.011268062517046928, "rewards//mean": 0.77117919921875, "rewards//std": 0.030344048514962196, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2576, "grad_norm": 3.0944712162017822, "kl": 2.501808814704418, "learning_rate": 4.268327834017862e-06, "loss": 0.1001, "num_tokens": 11133016.0, "reward": 0.75909423828125, "reward_std": 0.008227546699345112, "rewards//mean": 0.75909423828125, "rewards//std": 0.03322631120681763, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2578, "grad_norm": 2.4748823642730713, "kl": 4.381277114152908, "learning_rate": 4.267205893900793e-06, "loss": 0.1753, "num_tokens": 11141656.0, "reward": 0.76611328125, "reward_std": 0.009312030859291553, "rewards//mean": 0.76611328125, "rewards//std": 0.02663480117917061, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.258, "grad_norm": 1.3232147693634033, "kl": 5.017284005880356, "learning_rate": 4.266083241953402e-06, "loss": 0.2007, "num_tokens": 11150280.0, "reward": 0.74566650390625, "reward_std": 0.014326486736536026, "rewards//mean": 0.74566650390625, "rewards//std": 0.031243158504366875, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2582, "grad_norm": 1.7679723501205444, "kl": 2.710223685950041, "learning_rate": 4.264959878627891e-06, "loss": 0.1084, "num_tokens": 11158864.0, "reward": 0.7637939453125, "reward_std": 0.011151228100061417, "rewards//mean": 0.7637939453125, "rewards//std": 0.03151986002922058, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2584, "grad_norm": 1.710081696510315, "kl": 4.130572408437729, "learning_rate": 4.263835804376754e-06, "loss": 0.1652, "num_tokens": 11167488.0, "reward": 0.7618408203125, "reward_std": 0.00998884066939354, "rewards//mean": 0.7618408203125, "rewards//std": 0.02733018435537815, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2586, "grad_norm": 4.655247688293457, "kl": 2.1683992221951485, "learning_rate": 4.262711019652764e-06, "loss": 0.0867, "num_tokens": 11176208.0, "reward": 0.76580810546875, "reward_std": 0.009925016202032566, "rewards//mean": 0.76580810546875, "rewards//std": 0.027930304408073425, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2588, "grad_norm": 2.2679500579833984, "kl": 4.151427447795868, "learning_rate": 4.261585524908987e-06, "loss": 0.1661, "num_tokens": 11184800.0, "reward": 0.77313232421875, "reward_std": 0.011193597689270973, "rewards//mean": 0.77313232421875, "rewards//std": 0.02435283549129963, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.259, "grad_norm": 2.118525266647339, "kl": 3.805618390440941, "learning_rate": 4.260459320598771e-06, "loss": 0.1522, "num_tokens": 11193456.0, "reward": 0.7685546875, "reward_std": 0.008602013811469078, "rewards//mean": 0.7685546875, "rewards//std": 0.029355719685554504, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2592, "grad_norm": 3.041141986846924, "kl": 4.329920876771212, "learning_rate": 4.259332407175751e-06, "loss": 0.1732, "num_tokens": 11202144.0, "reward": 0.744384765625, "reward_std": 0.01013479195535183, "rewards//mean": 0.744384765625, "rewards//std": 0.032435618340969086, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2594, "grad_norm": 3.062437057495117, "kl": 3.383439801633358, "learning_rate": 4.258204785093849e-06, "loss": 0.1353, "num_tokens": 11210904.0, "reward": 0.74981689453125, "reward_std": 0.010520460084080696, "rewards//mean": 0.74981689453125, "rewards//std": 0.03763146325945854, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2596, "grad_norm": 2.446861982345581, "kl": 5.294970214366913, "learning_rate": 4.257076454807269e-06, "loss": 0.2118, "num_tokens": 11219616.0, "reward": 0.75341796875, "reward_std": 0.009970618411898613, "rewards//mean": 0.75341796875, "rewards//std": 0.02974504418671131, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2598, "grad_norm": 5.3957624435424805, "kl": 3.5341105349361897, "learning_rate": 4.255947416770503e-06, "loss": 0.1414, "num_tokens": 11228280.0, "reward": 0.7518310546875, "reward_std": 0.007633961737155914, "rewards//mean": 0.7518310546875, "rewards//std": 0.024008914828300476, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.26, "grad_norm": 1.3900035619735718, "kl": 1.8465131372213364, "learning_rate": 4.2548176714383274e-06, "loss": 0.0739, "num_tokens": 11236848.0, "reward": 0.760009765625, "reward_std": 0.008688275702297688, "rewards//mean": 0.760009765625, "rewards//std": 0.02729387953877449, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2602, "grad_norm": 2.175323963165283, "kl": 3.811856836080551, "learning_rate": 4.253687219265803e-06, "loss": 0.1525, "num_tokens": 11245440.0, "reward": 0.762939453125, "reward_std": 0.01383256446570158, "rewards//mean": 0.762939453125, "rewards//std": 0.03719412535429001, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2604, "grad_norm": 2.8014745712280273, "kl": 4.165307141840458, "learning_rate": 4.252556060708277e-06, "loss": 0.1666, "num_tokens": 11254128.0, "reward": 0.7366943359375, "reward_std": 0.009986505843698978, "rewards//mean": 0.7366943359375, "rewards//std": 0.018788181245326996, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2606, "grad_norm": 4.433714866638184, "kl": 4.2140659391880035, "learning_rate": 4.2514241962213794e-06, "loss": 0.1686, "num_tokens": 11262768.0, "reward": 0.77044677734375, "reward_std": 0.010732099413871765, "rewards//mean": 0.77044677734375, "rewards//std": 0.030247613787651062, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2608, "grad_norm": 2.6338164806365967, "kl": 2.6811588779091835, "learning_rate": 4.2502916262610264e-06, "loss": 0.1072, "num_tokens": 11271360.0, "reward": 0.7657470703125, "reward_std": 0.007649531587958336, "rewards//mean": 0.7657470703125, "rewards//std": 0.03038523904979229, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.261, "grad_norm": 2.7484657764434814, "kl": 2.502790428698063, "learning_rate": 4.249158351283414e-06, "loss": 0.1001, "num_tokens": 11279992.0, "reward": 0.75103759765625, "reward_std": 0.007283125072717667, "rewards//mean": 0.75103759765625, "rewards//std": 0.017458772286772728, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2612, "grad_norm": 5.599460601806641, "kl": 2.7730154171586037, "learning_rate": 4.248024371745027e-06, "loss": 0.1109, "num_tokens": 11288736.0, "reward": 0.76611328125, "reward_std": 0.01148528978228569, "rewards//mean": 0.76611328125, "rewards//std": 0.02649804763495922, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2614, "grad_norm": 2.3879177570343018, "kl": 1.965024258941412, "learning_rate": 4.246889688102632e-06, "loss": 0.0786, "num_tokens": 11297368.0, "reward": 0.7843017578125, "reward_std": 0.010564582422375679, "rewards//mean": 0.7843017578125, "rewards//std": 0.033620622009038925, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2616, "grad_norm": 3.1506576538085938, "kl": 2.0847064554691315, "learning_rate": 4.24575430081328e-06, "loss": 0.0834, "num_tokens": 11306032.0, "reward": 0.75799560546875, "reward_std": 0.007507959846407175, "rewards//mean": 0.75799560546875, "rewards//std": 0.028632553294301033, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2618, "grad_norm": 2.753695011138916, "kl": 3.937986433506012, "learning_rate": 4.244618210334305e-06, "loss": 0.1575, "num_tokens": 11314848.0, "reward": 0.74609375, "reward_std": 0.014343004673719406, "rewards//mean": 0.74609375, "rewards//std": 0.032481323927640915, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.262, "grad_norm": 2.715945243835449, "kl": 3.515212755650282, "learning_rate": 4.243481417123323e-06, "loss": 0.1406, "num_tokens": 11323456.0, "reward": 0.7469482421875, "reward_std": 0.007614488713443279, "rewards//mean": 0.7469482421875, "rewards//std": 0.026851875707507133, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2622, "grad_norm": 3.772193193435669, "kl": 2.4844468608498573, "learning_rate": 4.242343921638235e-06, "loss": 0.0994, "num_tokens": 11332048.0, "reward": 0.7435302734375, "reward_std": 0.0071250684559345245, "rewards//mean": 0.7435302734375, "rewards//std": 0.03247549757361412, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2624, "grad_norm": 3.8742988109588623, "kl": 3.0398516207933426, "learning_rate": 4.241205724337223e-06, "loss": 0.1216, "num_tokens": 11340712.0, "reward": 0.74407958984375, "reward_std": 0.006669370923191309, "rewards//mean": 0.74407958984375, "rewards//std": 0.03234676644206047, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2626, "grad_norm": 3.9461119174957275, "kl": 5.437603339552879, "learning_rate": 4.2400668256787534e-06, "loss": 0.2175, "num_tokens": 11349392.0, "reward": 0.74609375, "reward_std": 0.008287254720926285, "rewards//mean": 0.74609375, "rewards//std": 0.02863737754523754, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2628, "grad_norm": 2.1901803016662598, "kl": 4.151571042835712, "learning_rate": 4.238927226121574e-06, "loss": 0.1661, "num_tokens": 11358080.0, "reward": 0.7752685546875, "reward_std": 0.008159643970429897, "rewards//mean": 0.7752685546875, "rewards//std": 0.025938868522644043, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.263, "grad_norm": 2.2251639366149902, "kl": 2.423904772847891, "learning_rate": 4.237786926124718e-06, "loss": 0.097, "num_tokens": 11366696.0, "reward": 0.74005126953125, "reward_std": 0.008766488172113895, "rewards//mean": 0.74005126953125, "rewards//std": 0.029568882659077644, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2632, "grad_norm": 1.9419937133789062, "kl": 5.8494908809661865, "learning_rate": 4.236645926147493e-06, "loss": 0.234, "num_tokens": 11375288.0, "reward": 0.75396728515625, "reward_std": 0.010755406692624092, "rewards//mean": 0.75396728515625, "rewards//std": 0.0257281381636858, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2634, "grad_norm": 4.6405863761901855, "kl": 3.1211317777633667, "learning_rate": 4.235504226649499e-06, "loss": 0.1248, "num_tokens": 11383904.0, "reward": 0.76983642578125, "reward_std": 0.012728290632367134, "rewards//mean": 0.76983642578125, "rewards//std": 0.029051383957266808, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2636, "grad_norm": 1.6132014989852905, "kl": 3.6030864007771015, "learning_rate": 4.234361828090609e-06, "loss": 0.1441, "num_tokens": 11392448.0, "reward": 0.74444580078125, "reward_std": 0.010200903750956059, "rewards//mean": 0.74444580078125, "rewards//std": 0.026262130588293076, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2638, "grad_norm": 3.4071428775787354, "kl": 4.188608806580305, "learning_rate": 4.233218730930983e-06, "loss": 0.1675, "num_tokens": 11401088.0, "reward": 0.7685546875, "reward_std": 0.009457055479288101, "rewards//mean": 0.7685546875, "rewards//std": 0.026602955535054207, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.264, "grad_norm": 3.4685566425323486, "kl": 4.904999792575836, "learning_rate": 4.232074935631059e-06, "loss": 0.1962, "num_tokens": 11409840.0, "reward": 0.7586669921875, "reward_std": 0.012601098045706749, "rewards//mean": 0.7586669921875, "rewards//std": 0.03961335867643356, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2642, "grad_norm": 5.757265567779541, "kl": 2.5900511480867863, "learning_rate": 4.230930442651558e-06, "loss": 0.1036, "num_tokens": 11418512.0, "reward": 0.73785400390625, "reward_std": 0.00880451500415802, "rewards//mean": 0.73785400390625, "rewards//std": 0.026014842092990875, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2644, "grad_norm": 6.304155349731445, "kl": 4.3996729254722595, "learning_rate": 4.229785252453481e-06, "loss": 0.176, "num_tokens": 11427168.0, "reward": 0.74993896484375, "reward_std": 0.007294351700693369, "rewards//mean": 0.74993896484375, "rewards//std": 0.01983875222504139, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2646, "grad_norm": 3.899925470352173, "kl": 5.246616512537003, "learning_rate": 4.228639365498111e-06, "loss": 0.2099, "num_tokens": 11435776.0, "reward": 0.79010009765625, "reward_std": 0.015256617218255997, "rewards//mean": 0.79010009765625, "rewards//std": 0.03370533511042595, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2648, "grad_norm": 4.105578422546387, "kl": 5.843824133276939, "learning_rate": 4.227492782247013e-06, "loss": 0.2338, "num_tokens": 11444424.0, "reward": 0.7498779296875, "reward_std": 0.009800959378480911, "rewards//mean": 0.7498779296875, "rewards//std": 0.03320381045341492, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.265, "grad_norm": 2.6493167877197266, "kl": 3.442916586995125, "learning_rate": 4.226345503162027e-06, "loss": 0.1377, "num_tokens": 11453048.0, "reward": 0.74395751953125, "reward_std": 0.01367289386689663, "rewards//mean": 0.74395751953125, "rewards//std": 0.03723353520035744, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2652, "grad_norm": 4.24553108215332, "kl": 3.149893183261156, "learning_rate": 4.2251975287052804e-06, "loss": 0.126, "num_tokens": 11461712.0, "reward": 0.74560546875, "reward_std": 0.009405618533492088, "rewards//mean": 0.74560546875, "rewards//std": 0.033555950969457626, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2654, "grad_norm": 3.081263542175293, "kl": 3.750328652560711, "learning_rate": 4.224048859339175e-06, "loss": 0.15, "num_tokens": 11470416.0, "reward": 0.742919921875, "reward_std": 0.009758269414305687, "rewards//mean": 0.742919921875, "rewards//std": 0.031070252880454063, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2656, "grad_norm": 3.511194944381714, "kl": 1.9571451395750046, "learning_rate": 4.222899495526396e-06, "loss": 0.0783, "num_tokens": 11479032.0, "reward": 0.75213623046875, "reward_std": 0.005107767879962921, "rewards//mean": 0.75213623046875, "rewards//std": 0.024763360619544983, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2658, "grad_norm": 4.438520431518555, "kl": 1.814648114144802, "learning_rate": 4.221749437729905e-06, "loss": 0.0726, "num_tokens": 11487568.0, "reward": 0.7196044921875, "reward_std": 0.007924595847725868, "rewards//mean": 0.7196044921875, "rewards//std": 0.03290705755352974, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.266, "grad_norm": 3.274216413497925, "kl": 2.0031981766223907, "learning_rate": 4.220598686412946e-06, "loss": 0.0801, "num_tokens": 11496160.0, "reward": 0.73077392578125, "reward_std": 0.006381637416779995, "rewards//mean": 0.73077392578125, "rewards//std": 0.03109697811305523, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2662, "grad_norm": 2.7833571434020996, "kl": 1.7837436683475971, "learning_rate": 4.219447242039043e-06, "loss": 0.0713, "num_tokens": 11504792.0, "reward": 0.7459716796875, "reward_std": 0.009451904334127903, "rewards//mean": 0.7459716796875, "rewards//std": 0.03474370017647743, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2664, "grad_norm": 2.6107046604156494, "kl": 3.1608935967087746, "learning_rate": 4.2182951050719955e-06, "loss": 0.1264, "num_tokens": 11513432.0, "reward": 0.77410888671875, "reward_std": 0.011047592386603355, "rewards//mean": 0.77410888671875, "rewards//std": 0.02158200368285179, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2666, "grad_norm": 2.537487268447876, "kl": 3.4065649397671223, "learning_rate": 4.217142275975886e-06, "loss": 0.1363, "num_tokens": 11522112.0, "reward": 0.7703857421875, "reward_std": 0.0073745474219322205, "rewards//mean": 0.7703857421875, "rewards//std": 0.028120771050453186, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2668, "grad_norm": 3.285963773727417, "kl": 3.966996233910322, "learning_rate": 4.215988755215073e-06, "loss": 0.1587, "num_tokens": 11530648.0, "reward": 0.7227783203125, "reward_std": 0.010263636708259583, "rewards//mean": 0.7227783203125, "rewards//std": 0.039395708590745926, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.267, "grad_norm": 2.269796133041382, "kl": 3.3294670581817627, "learning_rate": 4.214834543254195e-06, "loss": 0.1332, "num_tokens": 11539376.0, "reward": 0.76422119140625, "reward_std": 0.009765438735485077, "rewards//mean": 0.76422119140625, "rewards//std": 0.03795151039958, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2672, "grad_norm": 9.574981689453125, "kl": 5.262131106108427, "learning_rate": 4.2136796405581674e-06, "loss": 0.2105, "num_tokens": 11548024.0, "reward": 0.7398681640625, "reward_std": 0.00819869339466095, "rewards//mean": 0.7398681640625, "rewards//std": 0.02798478677868843, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2674, "grad_norm": 1.6330335140228271, "kl": 5.008244797587395, "learning_rate": 4.212524047592185e-06, "loss": 0.2003, "num_tokens": 11556656.0, "reward": 0.754638671875, "reward_std": 0.016101371496915817, "rewards//mean": 0.754638671875, "rewards//std": 0.026738038286566734, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2676, "grad_norm": 2.8575448989868164, "kl": 3.6490194387733936, "learning_rate": 4.211367764821722e-06, "loss": 0.146, "num_tokens": 11565296.0, "reward": 0.769775390625, "reward_std": 0.010555359534919262, "rewards//mean": 0.769775390625, "rewards//std": 0.026538006961345673, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2678, "grad_norm": 2.8837056159973145, "kl": 4.230678800493479, "learning_rate": 4.210210792712528e-06, "loss": 0.1692, "num_tokens": 11573864.0, "reward": 0.73089599609375, "reward_std": 0.011161139234900475, "rewards//mean": 0.73089599609375, "rewards//std": 0.03550325706601143, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.268, "grad_norm": 3.5721397399902344, "kl": 3.4402781426906586, "learning_rate": 4.209053131730631e-06, "loss": 0.1376, "num_tokens": 11582424.0, "reward": 0.7332763671875, "reward_std": 0.008937069214880466, "rewards//mean": 0.7332763671875, "rewards//std": 0.030949924141168594, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2682, "grad_norm": 2.9140172004699707, "kl": 2.9159003160893917, "learning_rate": 4.207894782342337e-06, "loss": 0.1166, "num_tokens": 11590968.0, "reward": 0.79754638671875, "reward_std": 0.010410540737211704, "rewards//mean": 0.79754638671875, "rewards//std": 0.014391319826245308, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2684, "grad_norm": 2.968250036239624, "kl": 3.466621793806553, "learning_rate": 4.206735745014228e-06, "loss": 0.1387, "num_tokens": 11599640.0, "reward": 0.7711181640625, "reward_std": 0.009236998856067657, "rewards//mean": 0.7711181640625, "rewards//std": 0.023059522733092308, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2686, "grad_norm": 4.437006950378418, "kl": 4.309205047786236, "learning_rate": 4.205576020213166e-06, "loss": 0.1724, "num_tokens": 11608232.0, "reward": 0.742919921875, "reward_std": 0.006830193102359772, "rewards//mean": 0.742919921875, "rewards//std": 0.02776893600821495, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2688, "grad_norm": 3.5652430057525635, "kl": 4.825311344116926, "learning_rate": 4.204415608406287e-06, "loss": 0.193, "num_tokens": 11616920.0, "reward": 0.7645263671875, "reward_std": 0.013408385217189789, "rewards//mean": 0.7645263671875, "rewards//std": 0.04326629266142845, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.269, "grad_norm": 2.5441691875457764, "kl": 3.2838336937129498, "learning_rate": 4.203254510061005e-06, "loss": 0.1314, "num_tokens": 11625560.0, "reward": 0.77581787109375, "reward_std": 0.008431902155280113, "rewards//mean": 0.77581787109375, "rewards//std": 0.02227848209440708, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2692, "grad_norm": 3.966306447982788, "kl": 1.686177983880043, "learning_rate": 4.2020927256450085e-06, "loss": 0.0674, "num_tokens": 11634256.0, "reward": 0.76947021484375, "reward_std": 0.009334820322692394, "rewards//mean": 0.76947021484375, "rewards//std": 0.028768105432391167, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2694, "grad_norm": 9.069491386413574, "kl": 6.372577175498009, "learning_rate": 4.200930255626267e-06, "loss": 0.2549, "num_tokens": 11642840.0, "reward": 0.74139404296875, "reward_std": 0.011707983911037445, "rewards//mean": 0.74139404296875, "rewards//std": 0.03755253553390503, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2696, "grad_norm": 1.9915217161178589, "kl": 4.7117859572172165, "learning_rate": 4.199767100473022e-06, "loss": 0.1885, "num_tokens": 11651408.0, "reward": 0.75244140625, "reward_std": 0.01314420998096466, "rewards//mean": 0.75244140625, "rewards//std": 0.02718384377658367, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2698, "grad_norm": 4.670063018798828, "kl": 2.353268202394247, "learning_rate": 4.198603260653792e-06, "loss": 0.0941, "num_tokens": 11660048.0, "reward": 0.759765625, "reward_std": 0.0073879556730389595, "rewards//mean": 0.759765625, "rewards//std": 0.030820775777101517, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.27, "grad_norm": 2.573442220687866, "kl": 3.380015805363655, "learning_rate": 4.197438736637372e-06, "loss": 0.1352, "num_tokens": 11668728.0, "reward": 0.7315673828125, "reward_std": 0.011428534984588623, "rewards//mean": 0.7315673828125, "rewards//std": 0.034340519458055496, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2702, "grad_norm": 3.2867605686187744, "kl": 3.3109308183193207, "learning_rate": 4.196273528892831e-06, "loss": 0.1324, "num_tokens": 11677376.0, "reward": 0.77362060546875, "reward_std": 0.007852231152355671, "rewards//mean": 0.77362060546875, "rewards//std": 0.021892594173550606, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2704, "grad_norm": 4.476799488067627, "kl": 5.812587082386017, "learning_rate": 4.195107637889515e-06, "loss": 0.2325, "num_tokens": 11686016.0, "reward": 0.76849365234375, "reward_std": 0.013360470533370972, "rewards//mean": 0.76849365234375, "rewards//std": 0.023900460451841354, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2706, "grad_norm": 7.903323173522949, "kl": 3.922054812312126, "learning_rate": 4.193941064097047e-06, "loss": 0.1569, "num_tokens": 11694712.0, "reward": 0.78131103515625, "reward_std": 0.009452028200030327, "rewards//mean": 0.78131103515625, "rewards//std": 0.021150536835193634, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2708, "grad_norm": 2.7123863697052, "kl": 2.828689783811569, "learning_rate": 4.19277380798532e-06, "loss": 0.1131, "num_tokens": 11703384.0, "reward": 0.73760986328125, "reward_std": 0.0076402099803090096, "rewards//mean": 0.73760986328125, "rewards//std": 0.020321974530816078, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.271, "grad_norm": 6.89854097366333, "kl": 3.3424340188503265, "learning_rate": 4.191605870024506e-06, "loss": 0.1337, "num_tokens": 11712016.0, "reward": 0.76055908203125, "reward_std": 0.008322667330503464, "rewards//mean": 0.76055908203125, "rewards//std": 0.029451927170157433, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2712, "grad_norm": 7.377481937408447, "kl": 3.7781619802117348, "learning_rate": 4.190437250685049e-06, "loss": 0.1511, "num_tokens": 11720648.0, "reward": 0.75848388671875, "reward_std": 0.012163233011960983, "rewards//mean": 0.75848388671875, "rewards//std": 0.024867061525583267, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2714, "grad_norm": 3.108691930770874, "kl": 3.1337525248527527, "learning_rate": 4.18926795043767e-06, "loss": 0.1254, "num_tokens": 11729304.0, "reward": 0.73797607421875, "reward_std": 0.016611484810709953, "rewards//mean": 0.73797607421875, "rewards//std": 0.03349674493074417, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2716, "grad_norm": 4.089383125305176, "kl": 2.918143719434738, "learning_rate": 4.188097969753363e-06, "loss": 0.1167, "num_tokens": 11738040.0, "reward": 0.736328125, "reward_std": 0.01285855658352375, "rewards//mean": 0.736328125, "rewards//std": 0.027063295245170593, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2718, "grad_norm": 7.366394519805908, "kl": 4.12845204398036, "learning_rate": 4.186927309103395e-06, "loss": 0.1651, "num_tokens": 11746752.0, "reward": 0.72613525390625, "reward_std": 0.010335814207792282, "rewards//mean": 0.72613525390625, "rewards//std": 0.03753277659416199, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.272, "grad_norm": 8.075370788574219, "kl": 3.325389042496681, "learning_rate": 4.185755968959308e-06, "loss": 0.133, "num_tokens": 11755448.0, "reward": 0.71051025390625, "reward_std": 0.009940668009221554, "rewards//mean": 0.71051025390625, "rewards//std": 0.033255912363529205, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2722, "grad_norm": 4.836149215698242, "kl": 2.1319844275712967, "learning_rate": 4.18458394979292e-06, "loss": 0.0853, "num_tokens": 11764152.0, "reward": 0.75579833984375, "reward_std": 0.008836272172629833, "rewards//mean": 0.75579833984375, "rewards//std": 0.02965119108557701, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2724, "grad_norm": 2.578828811645508, "kl": 2.2262017019093037, "learning_rate": 4.183411252076318e-06, "loss": 0.089, "num_tokens": 11772776.0, "reward": 0.73260498046875, "reward_std": 0.00982172042131424, "rewards//mean": 0.73260498046875, "rewards//std": 0.02809027209877968, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2726, "grad_norm": 4.530190467834473, "kl": 2.148841641843319, "learning_rate": 4.182237876281865e-06, "loss": 0.086, "num_tokens": 11781408.0, "reward": 0.75433349609375, "reward_std": 0.011521046981215477, "rewards//mean": 0.75433349609375, "rewards//std": 0.03465399518609047, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2728, "grad_norm": 2.3338022232055664, "kl": 1.8614264577627182, "learning_rate": 4.181063822882196e-06, "loss": 0.0745, "num_tokens": 11789976.0, "reward": 0.74798583984375, "reward_std": 0.008902592584490776, "rewards//mean": 0.74798583984375, "rewards//std": 0.031141243875026703, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.273, "grad_norm": 1.7685571908950806, "kl": 3.221173480153084, "learning_rate": 4.17988909235022e-06, "loss": 0.1288, "num_tokens": 11798632.0, "reward": 0.80010986328125, "reward_std": 0.008679383434355259, "rewards//mean": 0.80010986328125, "rewards//std": 0.020166441798210144, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2732, "grad_norm": 2.4425153732299805, "kl": 1.2437247931957245, "learning_rate": 4.178713685159119e-06, "loss": 0.0497, "num_tokens": 11807256.0, "reward": 0.75714111328125, "reward_std": 0.005524061620235443, "rewards//mean": 0.75714111328125, "rewards//std": 0.025215260684490204, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2734, "grad_norm": 5.30042839050293, "kl": 1.6850194334983826, "learning_rate": 4.1775376017823465e-06, "loss": 0.0674, "num_tokens": 11815920.0, "reward": 0.75689697265625, "reward_std": 0.010127455927431583, "rewards//mean": 0.75689697265625, "rewards//std": 0.027522467076778412, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2736, "grad_norm": 2.3045921325683594, "kl": 3.0477687679231167, "learning_rate": 4.176360842693629e-06, "loss": 0.1219, "num_tokens": 11824560.0, "reward": 0.7606201171875, "reward_std": 0.012172101065516472, "rewards//mean": 0.7606201171875, "rewards//std": 0.02647947520017624, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2738, "grad_norm": 1.765373706817627, "kl": 3.098535794764757, "learning_rate": 4.175183408366964e-06, "loss": 0.1239, "num_tokens": 11833184.0, "reward": 0.7437744140625, "reward_std": 0.00653023412451148, "rewards//mean": 0.7437744140625, "rewards//std": 0.03012707456946373, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.274, "grad_norm": 4.27874755859375, "kl": 1.4636570736765862, "learning_rate": 4.174005299276622e-06, "loss": 0.0585, "num_tokens": 11841856.0, "reward": 0.75885009765625, "reward_std": 0.009472562000155449, "rewards//mean": 0.75885009765625, "rewards//std": 0.02739456668496132, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2742, "grad_norm": 2.264185905456543, "kl": 1.8829293362796307, "learning_rate": 4.172826515897146e-06, "loss": 0.0753, "num_tokens": 11850632.0, "reward": 0.7652587890625, "reward_std": 0.009618008509278297, "rewards//mean": 0.7652587890625, "rewards//std": 0.028226081281900406, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2744, "grad_norm": 2.8925533294677734, "kl": 1.4519289657473564, "learning_rate": 4.17164705870335e-06, "loss": 0.0581, "num_tokens": 11859328.0, "reward": 0.74530029296875, "reward_std": 0.007334470748901367, "rewards//mean": 0.74530029296875, "rewards//std": 0.02404128573834896, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2746, "grad_norm": 1.741515040397644, "kl": 1.4316293448209763, "learning_rate": 4.1704669281703184e-06, "loss": 0.0573, "num_tokens": 11867920.0, "reward": 0.73822021484375, "reward_std": 0.00794820673763752, "rewards//mean": 0.73822021484375, "rewards//std": 0.03611453250050545, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2748, "grad_norm": 7.717780590057373, "kl": 2.644916817545891, "learning_rate": 4.169286124773406e-06, "loss": 0.1058, "num_tokens": 11876616.0, "reward": 0.752197265625, "reward_std": 0.009906463325023651, "rewards//mean": 0.752197265625, "rewards//std": 0.039222609251737595, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.275, "grad_norm": 2.3161444664001465, "kl": 2.929397441446781, "learning_rate": 4.168104648988245e-06, "loss": 0.1172, "num_tokens": 11885248.0, "reward": 0.77349853515625, "reward_std": 0.012358210049569607, "rewards//mean": 0.77349853515625, "rewards//std": 0.02053464576601982, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2752, "grad_norm": 1.340524435043335, "kl": 4.775512523949146, "learning_rate": 4.16692250129073e-06, "loss": 0.191, "num_tokens": 11893872.0, "reward": 0.75830078125, "reward_std": 0.011295108124613762, "rewards//mean": 0.75830078125, "rewards//std": 0.02474937215447426, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2754, "grad_norm": 2.9450316429138184, "kl": 3.9748855009675026, "learning_rate": 4.16573968215703e-06, "loss": 0.159, "num_tokens": 11902448.0, "reward": 0.763916015625, "reward_std": 0.016226261854171753, "rewards//mean": 0.763916015625, "rewards//std": 0.03417370840907097, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2756, "grad_norm": 2.108694076538086, "kl": 5.713043823838234, "learning_rate": 4.164556192063586e-06, "loss": 0.2285, "num_tokens": 11911064.0, "reward": 0.7889404296875, "reward_std": 0.01659802719950676, "rewards//mean": 0.7889404296875, "rewards//std": 0.026097120717167854, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2758, "grad_norm": 3.298931360244751, "kl": 3.3014607429504395, "learning_rate": 4.163372031487106e-06, "loss": 0.1321, "num_tokens": 11919760.0, "reward": 0.7425537109375, "reward_std": 0.012849746271967888, "rewards//mean": 0.7425537109375, "rewards//std": 0.026845108717679977, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.276, "grad_norm": 5.076264381408691, "kl": 1.7298753894865513, "learning_rate": 4.162187200904572e-06, "loss": 0.0692, "num_tokens": 11928344.0, "reward": 0.7325439453125, "reward_std": 0.007705581374466419, "rewards//mean": 0.7325439453125, "rewards//std": 0.03918612003326416, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2762, "grad_norm": 1.2696235179901123, "kl": 3.7642838321626186, "learning_rate": 4.161001700793231e-06, "loss": 0.1506, "num_tokens": 11936976.0, "reward": 0.774169921875, "reward_std": 0.010245183482766151, "rewards//mean": 0.774169921875, "rewards//std": 0.026801373809576035, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2764, "grad_norm": 2.687866687774658, "kl": 5.330626547336578, "learning_rate": 4.159815531630604e-06, "loss": 0.2132, "num_tokens": 11945576.0, "reward": 0.7545166015625, "reward_std": 0.015803910791873932, "rewards//mean": 0.7545166015625, "rewards//std": 0.026275169104337692, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2766, "grad_norm": 3.905477285385132, "kl": 3.33303764089942, "learning_rate": 4.15862869389448e-06, "loss": 0.1333, "num_tokens": 11954136.0, "reward": 0.741943359375, "reward_std": 0.011026639491319656, "rewards//mean": 0.741943359375, "rewards//std": 0.029072804376482964, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2768, "grad_norm": 4.371704578399658, "kl": 6.943100869655609, "learning_rate": 4.157441188062916e-06, "loss": 0.2777, "num_tokens": 11962824.0, "reward": 0.78131103515625, "reward_std": 0.014462907798588276, "rewards//mean": 0.78131103515625, "rewards//std": 0.027643203735351562, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.277, "grad_norm": 4.684089183807373, "kl": 5.149707347154617, "learning_rate": 4.156253014614239e-06, "loss": 0.206, "num_tokens": 11971504.0, "reward": 0.73699951171875, "reward_std": 0.011260450817644596, "rewards//mean": 0.73699951171875, "rewards//std": 0.0291574876755476, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2772, "grad_norm": 6.892230987548828, "kl": 9.5938261449337, "learning_rate": 4.155064174027047e-06, "loss": 0.3838, "num_tokens": 11980152.0, "reward": 0.75567626953125, "reward_std": 0.010517815127968788, "rewards//mean": 0.75567626953125, "rewards//std": 0.02883065491914749, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2774, "grad_norm": 5.8266496658325195, "kl": 10.388528168201447, "learning_rate": 4.153874666780202e-06, "loss": 0.4155, "num_tokens": 11988744.0, "reward": 0.7640380859375, "reward_std": 0.013916620053350925, "rewards//mean": 0.7640380859375, "rewards//std": 0.02321653813123703, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2776, "grad_norm": 2.295222759246826, "kl": 6.7360789477825165, "learning_rate": 4.152684493352841e-06, "loss": 0.2694, "num_tokens": 11997312.0, "reward": 0.7271728515625, "reward_std": 0.006680123507976532, "rewards//mean": 0.7271728515625, "rewards//std": 0.03459174558520317, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2778, "grad_norm": 6.331578254699707, "kl": 4.2862258069217205, "learning_rate": 4.151493654224362e-06, "loss": 0.1714, "num_tokens": 12005928.0, "reward": 0.76031494140625, "reward_std": 0.00941246747970581, "rewards//mean": 0.76031494140625, "rewards//std": 0.03589843958616257, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.278, "grad_norm": 4.718693733215332, "kl": 8.91860032081604, "learning_rate": 4.150302149874438e-06, "loss": 0.3567, "num_tokens": 12014688.0, "reward": 0.73297119140625, "reward_std": 0.01398102194070816, "rewards//mean": 0.73297119140625, "rewards//std": 0.029605207964777946, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2782, "grad_norm": 6.989559650421143, "kl": 10.01040905714035, "learning_rate": 4.149109980783004e-06, "loss": 0.4004, "num_tokens": 12023352.0, "reward": 0.7213134765625, "reward_std": 0.014433177188038826, "rewards//mean": 0.7213134765625, "rewards//std": 0.031827643513679504, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2784, "grad_norm": 7.343052387237549, "kl": 10.065821886062622, "learning_rate": 4.1479171474302675e-06, "loss": 0.4026, "num_tokens": 12031984.0, "reward": 0.75152587890625, "reward_std": 0.01235811784863472, "rewards//mean": 0.75152587890625, "rewards//std": 0.031254783272743225, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2786, "grad_norm": 10.520308494567871, "kl": 7.5734434723854065, "learning_rate": 4.146723650296701e-06, "loss": 0.3029, "num_tokens": 12040632.0, "reward": 0.7491455078125, "reward_std": 0.011275292374193668, "rewards//mean": 0.7491455078125, "rewards//std": 0.02221153862774372, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2788, "grad_norm": 5.63533878326416, "kl": 9.535266697406769, "learning_rate": 4.145529489863046e-06, "loss": 0.3814, "num_tokens": 12049192.0, "reward": 0.7537841796875, "reward_std": 0.015294350683689117, "rewards//mean": 0.7537841796875, "rewards//std": 0.029536420479416847, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.279, "grad_norm": 3.170309066772461, "kl": 7.687914792448282, "learning_rate": 4.144334666610308e-06, "loss": 0.3075, "num_tokens": 12057776.0, "reward": 0.74725341796875, "reward_std": 0.013552828691899776, "rewards//mean": 0.74725341796875, "rewards//std": 0.018211616203188896, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2792, "grad_norm": 8.551511764526367, "kl": 9.639231234788895, "learning_rate": 4.143139181019764e-06, "loss": 0.3856, "num_tokens": 12066448.0, "reward": 0.75311279296875, "reward_std": 0.009898494929075241, "rewards//mean": 0.75311279296875, "rewards//std": 0.0274822860956192, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2794, "grad_norm": 2.6039316654205322, "kl": 6.706505164504051, "learning_rate": 4.141943033572954e-06, "loss": 0.2683, "num_tokens": 12075056.0, "reward": 0.7708740234375, "reward_std": 0.011211846955120564, "rewards//mean": 0.7708740234375, "rewards//std": 0.028554532676935196, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2796, "grad_norm": 2.737032890319824, "kl": 4.356867238879204, "learning_rate": 4.140746224751686e-06, "loss": 0.1743, "num_tokens": 12083688.0, "reward": 0.76727294921875, "reward_std": 0.013638639822602272, "rewards//mean": 0.76727294921875, "rewards//std": 0.030055327340960503, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2798, "grad_norm": 3.4385592937469482, "kl": 7.211082756519318, "learning_rate": 4.139548755038035e-06, "loss": 0.2884, "num_tokens": 12092360.0, "reward": 0.757568359375, "reward_std": 0.01751622185111046, "rewards//mean": 0.757568359375, "rewards//std": 0.04004756733775139, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.28, "grad_norm": 6.710628986358643, "kl": 1.8269840478897095, "learning_rate": 4.138350624914342e-06, "loss": 0.0731, "num_tokens": 12100960.0, "reward": 0.75732421875, "reward_std": 0.01166498102247715, "rewards//mean": 0.75732421875, "rewards//std": 0.033836279064416885, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2802, "grad_norm": 5.002799987792969, "kl": 4.755123943090439, "learning_rate": 4.137151834863213e-06, "loss": 0.1902, "num_tokens": 12109584.0, "reward": 0.75445556640625, "reward_std": 0.008345297537744045, "rewards//mean": 0.75445556640625, "rewards//std": 0.036364324390888214, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2804, "grad_norm": 2.81632924079895, "kl": 2.7672760225832462, "learning_rate": 4.135952385367521e-06, "loss": 0.1107, "num_tokens": 12118208.0, "reward": 0.7666015625, "reward_std": 0.008363215252757072, "rewards//mean": 0.7666015625, "rewards//std": 0.029157033190131187, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2806, "grad_norm": 2.34147572517395, "kl": 3.2020316123962402, "learning_rate": 4.134752276910403e-06, "loss": 0.1281, "num_tokens": 12126832.0, "reward": 0.777587890625, "reward_std": 0.010320193134248257, "rewards//mean": 0.777587890625, "rewards//std": 0.02478238008916378, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2808, "grad_norm": 2.3778469562530518, "kl": 4.8575649112463, "learning_rate": 4.133551509975264e-06, "loss": 0.1943, "num_tokens": 12135432.0, "reward": 0.715087890625, "reward_std": 0.010134581476449966, "rewards//mean": 0.715087890625, "rewards//std": 0.0270532239228487, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.281, "grad_norm": 3.6884686946868896, "kl": 4.2522831708192825, "learning_rate": 4.132350085045772e-06, "loss": 0.1701, "num_tokens": 12143992.0, "reward": 0.74609375, "reward_std": 0.009654376655817032, "rewards//mean": 0.74609375, "rewards//std": 0.023714875802397728, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2812, "grad_norm": 2.767508029937744, "kl": 6.422431230545044, "learning_rate": 4.131148002605861e-06, "loss": 0.2569, "num_tokens": 12152592.0, "reward": 0.7452392578125, "reward_std": 0.011455094441771507, "rewards//mean": 0.7452392578125, "rewards//std": 0.025859376415610313, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2814, "grad_norm": 4.5503387451171875, "kl": 4.585957881063223, "learning_rate": 4.1299452631397295e-06, "loss": 0.1834, "num_tokens": 12161296.0, "reward": 0.78607177734375, "reward_std": 0.014200650155544281, "rewards//mean": 0.78607177734375, "rewards//std": 0.022420717403292656, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2816, "grad_norm": 2.386054754257202, "kl": 2.8070249557495117, "learning_rate": 4.128741867131841e-06, "loss": 0.1123, "num_tokens": 12169976.0, "reward": 0.71966552734375, "reward_std": 0.010204203426837921, "rewards//mean": 0.71966552734375, "rewards//std": 0.03890557959675789, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2818, "grad_norm": 9.660313606262207, "kl": 2.666976176202297, "learning_rate": 4.127537815066924e-06, "loss": 0.1067, "num_tokens": 12178552.0, "reward": 0.7769775390625, "reward_std": 0.01019267551600933, "rewards//mean": 0.7769775390625, "rewards//std": 0.01639235019683838, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.282, "grad_norm": 6.043813228607178, "kl": 3.4627685546875, "learning_rate": 4.126333107429968e-06, "loss": 0.1385, "num_tokens": 12187224.0, "reward": 0.73907470703125, "reward_std": 0.008391840383410454, "rewards//mean": 0.73907470703125, "rewards//std": 0.035660240799188614, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2822, "grad_norm": 3.730494976043701, "kl": 4.844144374132156, "learning_rate": 4.125127744706232e-06, "loss": 0.1938, "num_tokens": 12195976.0, "reward": 0.7587890625, "reward_std": 0.01774844527244568, "rewards//mean": 0.7587890625, "rewards//std": 0.03709060698747635, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2824, "grad_norm": 1.8565348386764526, "kl": 5.223568886518478, "learning_rate": 4.123921727381234e-06, "loss": 0.2089, "num_tokens": 12204544.0, "reward": 0.7655029296875, "reward_std": 0.014612920582294464, "rewards//mean": 0.7655029296875, "rewards//std": 0.027188019827008247, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2826, "grad_norm": 3.7399351596832275, "kl": 3.176049590110779, "learning_rate": 4.122715055940759e-06, "loss": 0.127, "num_tokens": 12213224.0, "reward": 0.7747802734375, "reward_std": 0.0130354855209589, "rewards//mean": 0.7747802734375, "rewards//std": 0.032602038234472275, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2828, "grad_norm": 3.1119418144226074, "kl": 3.8169267773628235, "learning_rate": 4.121507730870853e-06, "loss": 0.1527, "num_tokens": 12221920.0, "reward": 0.7591552734375, "reward_std": 0.013317324221134186, "rewards//mean": 0.7591552734375, "rewards//std": 0.03049861453473568, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.283, "grad_norm": 2.6083669662475586, "kl": 4.088666200637817, "learning_rate": 4.120299752657828e-06, "loss": 0.1635, "num_tokens": 12230600.0, "reward": 0.76153564453125, "reward_std": 0.013929024338722229, "rewards//mean": 0.76153564453125, "rewards//std": 0.032818328589200974, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2832, "grad_norm": 2.712017059326172, "kl": 4.443383142352104, "learning_rate": 4.119091121788256e-06, "loss": 0.1777, "num_tokens": 12239248.0, "reward": 0.76702880859375, "reward_std": 0.014571541920304298, "rewards//mean": 0.76702880859375, "rewards//std": 0.027066580951213837, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2834, "grad_norm": 5.266393184661865, "kl": 3.608183413743973, "learning_rate": 4.117881838748972e-06, "loss": 0.1443, "num_tokens": 12247824.0, "reward": 0.776611328125, "reward_std": 0.007514579687267542, "rewards//mean": 0.776611328125, "rewards//std": 0.021952755749225616, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2836, "grad_norm": 2.195739984512329, "kl": 4.8992273807525635, "learning_rate": 4.116671904027079e-06, "loss": 0.196, "num_tokens": 12256496.0, "reward": 0.754638671875, "reward_std": 0.01588989607989788, "rewards//mean": 0.754638671875, "rewards//std": 0.03708326071500778, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2838, "grad_norm": 2.2936816215515137, "kl": 3.1164577305316925, "learning_rate": 4.115461318109936e-06, "loss": 0.1247, "num_tokens": 12265152.0, "reward": 0.738037109375, "reward_std": 0.011874731630086899, "rewards//mean": 0.738037109375, "rewards//std": 0.03829146921634674, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.284, "grad_norm": 4.549593448638916, "kl": 5.782680720090866, "learning_rate": 4.114250081485166e-06, "loss": 0.2313, "num_tokens": 12273816.0, "reward": 0.72943115234375, "reward_std": 0.01674017310142517, "rewards//mean": 0.72943115234375, "rewards//std": 0.045438483357429504, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2842, "grad_norm": 6.248671054840088, "kl": 6.382633626461029, "learning_rate": 4.113038194640658e-06, "loss": 0.2553, "num_tokens": 12282528.0, "reward": 0.75457763671875, "reward_std": 0.014640046283602715, "rewards//mean": 0.75457763671875, "rewards//std": 0.03054889105260372, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2844, "grad_norm": 2.987802505493164, "kl": 3.853275403380394, "learning_rate": 4.111825658064557e-06, "loss": 0.1541, "num_tokens": 12291168.0, "reward": 0.72625732421875, "reward_std": 0.01331716775894165, "rewards//mean": 0.72625732421875, "rewards//std": 0.023554034531116486, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2846, "grad_norm": 2.8311638832092285, "kl": 4.002157241106033, "learning_rate": 4.110612472245274e-06, "loss": 0.1601, "num_tokens": 12299824.0, "reward": 0.74591064453125, "reward_std": 0.01376560889184475, "rewards//mean": 0.74591064453125, "rewards//std": 0.03327275067567825, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2848, "grad_norm": 3.822575330734253, "kl": 5.53521266579628, "learning_rate": 4.10939863767148e-06, "loss": 0.2214, "num_tokens": 12308464.0, "reward": 0.76507568359375, "reward_std": 0.011104791425168514, "rewards//mean": 0.76507568359375, "rewards//std": 0.02167508937418461, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.285, "grad_norm": 6.730959415435791, "kl": 6.539466828107834, "learning_rate": 4.108184154832106e-06, "loss": 0.2616, "num_tokens": 12316984.0, "reward": 0.76385498046875, "reward_std": 0.017864635214209557, "rewards//mean": 0.76385498046875, "rewards//std": 0.032703738659620285, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2852, "grad_norm": 3.916592597961426, "kl": 2.76847967505455, "learning_rate": 4.106969024216348e-06, "loss": 0.1107, "num_tokens": 12325648.0, "reward": 0.76458740234375, "reward_std": 0.009689198806881905, "rewards//mean": 0.76458740234375, "rewards//std": 0.02589469589293003, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2854, "grad_norm": 5.645354747772217, "kl": 4.721948862075806, "learning_rate": 4.1057532463136594e-06, "loss": 0.1889, "num_tokens": 12334280.0, "reward": 0.723388671875, "reward_std": 0.006540821865200996, "rewards//mean": 0.723388671875, "rewards//std": 0.025112468749284744, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2856, "grad_norm": 3.636981725692749, "kl": 4.059803009033203, "learning_rate": 4.104536821613755e-06, "loss": 0.1624, "num_tokens": 12342960.0, "reward": 0.7408447265625, "reward_std": 0.011184347793459892, "rewards//mean": 0.7408447265625, "rewards//std": 0.03807798773050308, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2858, "grad_norm": 2.13906192779541, "kl": 4.1590482369065285, "learning_rate": 4.10331975060661e-06, "loss": 0.1664, "num_tokens": 12351680.0, "reward": 0.7867431640625, "reward_std": 0.00847603753209114, "rewards//mean": 0.7867431640625, "rewards//std": 0.0236506387591362, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.286, "grad_norm": 6.974765300750732, "kl": 3.7392688989639282, "learning_rate": 4.102102033782462e-06, "loss": 0.1496, "num_tokens": 12360304.0, "reward": 0.7528076171875, "reward_std": 0.009396059438586235, "rewards//mean": 0.7528076171875, "rewards//std": 0.018168862909078598, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2862, "grad_norm": 2.0292155742645264, "kl": 4.619585245847702, "learning_rate": 4.100883671631806e-06, "loss": 0.1848, "num_tokens": 12368984.0, "reward": 0.71295166015625, "reward_std": 0.012967821210622787, "rewards//mean": 0.71295166015625, "rewards//std": 0.04528765007853508, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2864, "grad_norm": 3.384761095046997, "kl": 4.8401243053376675, "learning_rate": 4.099664664645399e-06, "loss": 0.1936, "num_tokens": 12377728.0, "reward": 0.76751708984375, "reward_std": 0.013084901496767998, "rewards//mean": 0.76751708984375, "rewards//std": 0.032143961638212204, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2866, "grad_norm": 2.8469998836517334, "kl": 2.144933510571718, "learning_rate": 4.098445013314255e-06, "loss": 0.0858, "num_tokens": 12386304.0, "reward": 0.721435546875, "reward_std": 0.00890803150832653, "rewards//mean": 0.721435546875, "rewards//std": 0.03138051927089691, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2868, "grad_norm": 1.8555923700332642, "kl": 3.4760134890675545, "learning_rate": 4.097224718129652e-06, "loss": 0.139, "num_tokens": 12394880.0, "reward": 0.78265380859375, "reward_std": 0.008136549033224583, "rewards//mean": 0.78265380859375, "rewards//std": 0.022971635684370995, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.287, "grad_norm": 3.528858184814453, "kl": 1.7143997587263584, "learning_rate": 4.096003779583123e-06, "loss": 0.0686, "num_tokens": 12403512.0, "reward": 0.73431396484375, "reward_std": 0.007884791120886803, "rewards//mean": 0.73431396484375, "rewards//std": 0.024973364546895027, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2872, "grad_norm": 7.626206874847412, "kl": 1.002179641276598, "learning_rate": 4.094782198166463e-06, "loss": 0.0401, "num_tokens": 12412144.0, "reward": 0.77337646484375, "reward_std": 0.00773048447445035, "rewards//mean": 0.77337646484375, "rewards//std": 0.024154985323548317, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2874, "grad_norm": 3.102672576904297, "kl": 2.111334104090929, "learning_rate": 4.093559974371725e-06, "loss": 0.0845, "num_tokens": 12420840.0, "reward": 0.73486328125, "reward_std": 0.008461733348667622, "rewards//mean": 0.73486328125, "rewards//std": 0.02271859534084797, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2876, "grad_norm": 3.5350985527038574, "kl": 2.589526690542698, "learning_rate": 4.092337108691219e-06, "loss": 0.1036, "num_tokens": 12429520.0, "reward": 0.71734619140625, "reward_std": 0.009457661770284176, "rewards//mean": 0.71734619140625, "rewards//std": 0.03557311370968819, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2878, "grad_norm": 3.9980318546295166, "kl": 2.945921778678894, "learning_rate": 4.091113601617516e-06, "loss": 0.1178, "num_tokens": 12438240.0, "reward": 0.7548828125, "reward_std": 0.009153686463832855, "rewards//mean": 0.7548828125, "rewards//std": 0.03132741153240204, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.288, "grad_norm": 3.164839267730713, "kl": 3.1845623552799225, "learning_rate": 4.0898894536434445e-06, "loss": 0.1274, "num_tokens": 12446904.0, "reward": 0.7215576171875, "reward_std": 0.011324258521199226, "rewards//mean": 0.7215576171875, "rewards//std": 0.025060273706912994, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2882, "grad_norm": 4.822700500488281, "kl": 5.374921750277281, "learning_rate": 4.088664665262091e-06, "loss": 0.215, "num_tokens": 12455888.0, "reward": 0.69476318359375, "reward_std": 0.012959405779838562, "rewards//mean": 0.69476318359375, "rewards//std": 0.050183288753032684, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2884, "grad_norm": 4.282898426055908, "kl": 3.6767929680645466, "learning_rate": 4.0874392369668005e-06, "loss": 0.1471, "num_tokens": 12464520.0, "reward": 0.7403564453125, "reward_std": 0.007202908396720886, "rewards//mean": 0.7403564453125, "rewards//std": 0.025685518980026245, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2886, "grad_norm": 4.100472927093506, "kl": 2.150573879480362, "learning_rate": 4.0862131692511755e-06, "loss": 0.086, "num_tokens": 12473136.0, "reward": 0.774658203125, "reward_std": 0.011053423397243023, "rewards//mean": 0.774658203125, "rewards//std": 0.022497639060020447, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2888, "grad_norm": 1.6823365688323975, "kl": 3.7067526429891586, "learning_rate": 4.084986462609075e-06, "loss": 0.1483, "num_tokens": 12481776.0, "reward": 0.7752685546875, "reward_std": 0.010602373629808426, "rewards//mean": 0.7752685546875, "rewards//std": 0.031086571514606476, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.289, "grad_norm": 3.1297998428344727, "kl": 3.933116242289543, "learning_rate": 4.083759117534617e-06, "loss": 0.1573, "num_tokens": 12490408.0, "reward": 0.762939453125, "reward_std": 0.018668608739972115, "rewards//mean": 0.762939453125, "rewards//std": 0.030168499797582626, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2892, "grad_norm": 1.8383193016052246, "kl": 3.303624078631401, "learning_rate": 4.082531134522176e-06, "loss": 0.1321, "num_tokens": 12498952.0, "reward": 0.72576904296875, "reward_std": 0.007930705323815346, "rewards//mean": 0.72576904296875, "rewards//std": 0.031850166618824005, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2894, "grad_norm": 5.4290595054626465, "kl": 2.0315615199506283, "learning_rate": 4.081302514066384e-06, "loss": 0.0813, "num_tokens": 12507640.0, "reward": 0.7705078125, "reward_std": 0.008449772372841835, "rewards//mean": 0.7705078125, "rewards//std": 0.025515519082546234, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2896, "grad_norm": 3.850499153137207, "kl": 3.280447833240032, "learning_rate": 4.080073256662128e-06, "loss": 0.1312, "num_tokens": 12516336.0, "reward": 0.71759033203125, "reward_std": 0.010510705411434174, "rewards//mean": 0.71759033203125, "rewards//std": 0.029468370601534843, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2898, "grad_norm": 2.049560070037842, "kl": 4.092830911278725, "learning_rate": 4.078843362804553e-06, "loss": 0.1637, "num_tokens": 12524968.0, "reward": 0.78131103515625, "reward_std": 0.013291127979755402, "rewards//mean": 0.78131103515625, "rewards//std": 0.025671591982245445, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.29, "grad_norm": 3.9274814128875732, "kl": 4.187639843672514, "learning_rate": 4.07761283298906e-06, "loss": 0.1675, "num_tokens": 12533680.0, "reward": 0.74127197265625, "reward_std": 0.007046463433653116, "rewards//mean": 0.74127197265625, "rewards//std": 0.03290171176195145, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2902, "grad_norm": 10.677163124084473, "kl": 2.5408588349819183, "learning_rate": 4.076381667711306e-06, "loss": 0.1016, "num_tokens": 12542256.0, "reward": 0.75439453125, "reward_std": 0.007715714164078236, "rewards//mean": 0.75439453125, "rewards//std": 0.0276607908308506, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2904, "grad_norm": 2.444880485534668, "kl": 3.5082188546657562, "learning_rate": 4.075149867467206e-06, "loss": 0.1403, "num_tokens": 12550904.0, "reward": 0.734619140625, "reward_std": 0.006623156368732452, "rewards//mean": 0.734619140625, "rewards//std": 0.02278645895421505, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2906, "grad_norm": 4.066626071929932, "kl": 2.942803204059601, "learning_rate": 4.073917432752927e-06, "loss": 0.1177, "num_tokens": 12559496.0, "reward": 0.7200927734375, "reward_std": 0.01067173108458519, "rewards//mean": 0.7200927734375, "rewards//std": 0.0397643968462944, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2908, "grad_norm": 2.5480294227600098, "kl": 1.5013394728302956, "learning_rate": 4.072684364064895e-06, "loss": 0.0601, "num_tokens": 12568048.0, "reward": 0.737060546875, "reward_std": 0.0059945909306406975, "rewards//mean": 0.737060546875, "rewards//std": 0.023927126079797745, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.291, "grad_norm": 2.469015121459961, "kl": 1.7101297825574875, "learning_rate": 4.071450661899789e-06, "loss": 0.0684, "num_tokens": 12576680.0, "reward": 0.73876953125, "reward_std": 0.008723283186554909, "rewards//mean": 0.73876953125, "rewards//std": 0.02756430394947529, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2912, "grad_norm": 4.397410869598389, "kl": 2.8201215378940105, "learning_rate": 4.070216326754544e-06, "loss": 0.1128, "num_tokens": 12585336.0, "reward": 0.768310546875, "reward_std": 0.008774403482675552, "rewards//mean": 0.768310546875, "rewards//std": 0.03067801147699356, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2914, "grad_norm": 4.051808834075928, "kl": 1.870306123048067, "learning_rate": 4.06898135912635e-06, "loss": 0.0748, "num_tokens": 12593952.0, "reward": 0.7608642578125, "reward_std": 0.00787823460996151, "rewards//mean": 0.7608642578125, "rewards//std": 0.03838997334241867, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2916, "grad_norm": 1.7822526693344116, "kl": 3.9868992529809475, "learning_rate": 4.067745759512654e-06, "loss": 0.1595, "num_tokens": 12602672.0, "reward": 0.73980712890625, "reward_std": 0.007910334505140781, "rewards//mean": 0.73980712890625, "rewards//std": 0.031929440796375275, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2918, "grad_norm": 2.3900465965270996, "kl": 2.855707533657551, "learning_rate": 4.066509528411151e-06, "loss": 0.1142, "num_tokens": 12611336.0, "reward": 0.76983642578125, "reward_std": 0.009503044188022614, "rewards//mean": 0.76983642578125, "rewards//std": 0.03067350760102272, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.292, "grad_norm": 3.7143051624298096, "kl": 2.384503550827503, "learning_rate": 4.065272666319799e-06, "loss": 0.0954, "num_tokens": 12619976.0, "reward": 0.73638916015625, "reward_std": 0.007540501654148102, "rewards//mean": 0.73638916015625, "rewards//std": 0.02687462791800499, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2922, "grad_norm": 3.0975253582000732, "kl": 3.75796877220273, "learning_rate": 4.064035173736804e-06, "loss": 0.1503, "num_tokens": 12628720.0, "reward": 0.77313232421875, "reward_std": 0.012095347046852112, "rewards//mean": 0.77313232421875, "rewards//std": 0.028597639873623848, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2924, "grad_norm": 4.802572250366211, "kl": 2.7877336852252483, "learning_rate": 4.062797051160628e-06, "loss": 0.1115, "num_tokens": 12637344.0, "reward": 0.77984619140625, "reward_std": 0.012111471965909004, "rewards//mean": 0.77984619140625, "rewards//std": 0.03057118132710457, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2926, "grad_norm": 2.6957345008850098, "kl": 3.0143202617764473, "learning_rate": 4.061558299089986e-06, "loss": 0.1206, "num_tokens": 12646096.0, "reward": 0.7481689453125, "reward_std": 0.009511973708868027, "rewards//mean": 0.7481689453125, "rewards//std": 0.03658413141965866, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2928, "grad_norm": 1.673730731010437, "kl": 2.9747151993215084, "learning_rate": 4.060318918023849e-06, "loss": 0.119, "num_tokens": 12654704.0, "reward": 0.748779296875, "reward_std": 0.00958467461168766, "rewards//mean": 0.748779296875, "rewards//std": 0.027204997837543488, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.293, "grad_norm": 2.102818012237549, "kl": 4.271336618810892, "learning_rate": 4.059078908461437e-06, "loss": 0.1709, "num_tokens": 12663376.0, "reward": 0.72265625, "reward_std": 0.008489435538649559, "rewards//mean": 0.72265625, "rewards//std": 0.024861659854650497, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2932, "grad_norm": 3.312582015991211, "kl": 3.166490152478218, "learning_rate": 4.057838270902228e-06, "loss": 0.1267, "num_tokens": 12671992.0, "reward": 0.71142578125, "reward_std": 0.01070701889693737, "rewards//mean": 0.71142578125, "rewards//std": 0.02924412488937378, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2934, "grad_norm": 5.6704020500183105, "kl": 3.8965123780071735, "learning_rate": 4.05659700584595e-06, "loss": 0.1559, "num_tokens": 12680584.0, "reward": 0.74383544921875, "reward_std": 0.011219223029911518, "rewards//mean": 0.74383544921875, "rewards//std": 0.029698632657527924, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2936, "grad_norm": 3.7548398971557617, "kl": 2.4549859389662743, "learning_rate": 4.055355113792584e-06, "loss": 0.0982, "num_tokens": 12689168.0, "reward": 0.7698974609375, "reward_std": 0.011444100178778172, "rewards//mean": 0.7698974609375, "rewards//std": 0.022859087213873863, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2938, "grad_norm": 2.35459566116333, "kl": 3.16232592985034, "learning_rate": 4.054112595242364e-06, "loss": 0.1265, "num_tokens": 12697840.0, "reward": 0.720947265625, "reward_std": 0.007741483394056559, "rewards//mean": 0.720947265625, "rewards//std": 0.03355865553021431, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.294, "grad_norm": 1.829099416732788, "kl": 5.453570522367954, "learning_rate": 4.052869450695776e-06, "loss": 0.2181, "num_tokens": 12706584.0, "reward": 0.77227783203125, "reward_std": 0.013817313127219677, "rewards//mean": 0.77227783203125, "rewards//std": 0.029525848105549812, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2942, "grad_norm": 3.456403970718384, "kl": 4.97819522023201, "learning_rate": 4.05162568065356e-06, "loss": 0.1991, "num_tokens": 12715280.0, "reward": 0.739990234375, "reward_std": 0.008206473663449287, "rewards//mean": 0.739990234375, "rewards//std": 0.029552048072218895, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2944, "grad_norm": 2.921943426132202, "kl": 3.4542504996061325, "learning_rate": 4.050381285616704e-06, "loss": 0.1382, "num_tokens": 12723888.0, "reward": 0.77178955078125, "reward_std": 0.012559525668621063, "rewards//mean": 0.77178955078125, "rewards//std": 0.025437606498599052, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2946, "grad_norm": 5.211219310760498, "kl": 7.651161767542362, "learning_rate": 4.049136266086453e-06, "loss": 0.306, "num_tokens": 12732704.0, "reward": 0.73529052734375, "reward_std": 0.01583229936659336, "rewards//mean": 0.73529052734375, "rewards//std": 0.03543924167752266, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2948, "grad_norm": 3.641861915588379, "kl": 6.323392041027546, "learning_rate": 4.047890622564299e-06, "loss": 0.2529, "num_tokens": 12741424.0, "reward": 0.73907470703125, "reward_std": 0.012137703597545624, "rewards//mean": 0.73907470703125, "rewards//std": 0.0349467396736145, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.295, "grad_norm": 4.579390525817871, "kl": 6.657691057771444, "learning_rate": 4.046644355551986e-06, "loss": 0.2663, "num_tokens": 12749984.0, "reward": 0.75115966796875, "reward_std": 0.011435726657509804, "rewards//mean": 0.75115966796875, "rewards//std": 0.03501943498849869, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2952, "grad_norm": 4.425080299377441, "kl": 5.0811514891684055, "learning_rate": 4.045397465551513e-06, "loss": 0.2032, "num_tokens": 12758592.0, "reward": 0.7628173828125, "reward_std": 0.011434515938162804, "rewards//mean": 0.7628173828125, "rewards//std": 0.028241094201803207, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2954, "grad_norm": 2.684704303741455, "kl": 5.89322429895401, "learning_rate": 4.044149953065126e-06, "loss": 0.2357, "num_tokens": 12767192.0, "reward": 0.739990234375, "reward_std": 0.009567596949636936, "rewards//mean": 0.739990234375, "rewards//std": 0.03218073025345802, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2956, "grad_norm": 5.836953639984131, "kl": 7.975795686244965, "learning_rate": 4.042901818595321e-06, "loss": 0.319, "num_tokens": 12775832.0, "reward": 0.78216552734375, "reward_std": 0.011070560663938522, "rewards//mean": 0.78216552734375, "rewards//std": 0.032531555742025375, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2958, "grad_norm": 3.958627939224243, "kl": 6.592168033123016, "learning_rate": 4.0416530626448495e-06, "loss": 0.2637, "num_tokens": 12784360.0, "reward": 0.75225830078125, "reward_std": 0.012074101716279984, "rewards//mean": 0.75225830078125, "rewards//std": 0.02841496467590332, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.296, "grad_norm": 2.6937179565429688, "kl": 4.202334105968475, "learning_rate": 4.040403685716708e-06, "loss": 0.1681, "num_tokens": 12792968.0, "reward": 0.75714111328125, "reward_std": 0.012901132926344872, "rewards//mean": 0.75714111328125, "rewards//std": 0.02416563592851162, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2962, "grad_norm": 2.5103647708892822, "kl": 6.4869774878025055, "learning_rate": 4.039153688314146e-06, "loss": 0.2595, "num_tokens": 12801640.0, "reward": 0.7418212890625, "reward_std": 0.011771900579333305, "rewards//mean": 0.7418212890625, "rewards//std": 0.025188006460666656, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2964, "grad_norm": 8.799986839294434, "kl": 6.715467281639576, "learning_rate": 4.037903070940663e-06, "loss": 0.2686, "num_tokens": 12810376.0, "reward": 0.75384521484375, "reward_std": 0.009749148041009903, "rewards//mean": 0.75384521484375, "rewards//std": 0.03207229822874069, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2966, "grad_norm": 6.437341213226318, "kl": 7.0128159783780575, "learning_rate": 4.036651834100006e-06, "loss": 0.2805, "num_tokens": 12819056.0, "reward": 0.76837158203125, "reward_std": 0.010643383488059044, "rewards//mean": 0.76837158203125, "rewards//std": 0.027274392545223236, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2968, "grad_norm": 5.2682204246521, "kl": 4.794356510043144, "learning_rate": 4.035399978296175e-06, "loss": 0.1918, "num_tokens": 12827616.0, "reward": 0.73419189453125, "reward_std": 0.00901983492076397, "rewards//mean": 0.73419189453125, "rewards//std": 0.028352031484246254, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.297, "grad_norm": 4.125086784362793, "kl": 5.428143136203289, "learning_rate": 4.034147504033416e-06, "loss": 0.2171, "num_tokens": 12836248.0, "reward": 0.76068115234375, "reward_std": 0.011856527999043465, "rewards//mean": 0.76068115234375, "rewards//std": 0.031360190361738205, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2972, "grad_norm": 6.757791996002197, "kl": 8.915775299072266, "learning_rate": 4.032894411816226e-06, "loss": 0.3566, "num_tokens": 12844976.0, "reward": 0.7401123046875, "reward_std": 0.017824605107307434, "rewards//mean": 0.7401123046875, "rewards//std": 0.025276795029640198, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2974, "grad_norm": 5.1655097007751465, "kl": 5.548712193965912, "learning_rate": 4.03164070214935e-06, "loss": 0.2219, "num_tokens": 12853648.0, "reward": 0.769775390625, "reward_std": 0.011850830167531967, "rewards//mean": 0.769775390625, "rewards//std": 0.032240886241197586, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2976, "grad_norm": 7.906528949737549, "kl": 5.0317773669958115, "learning_rate": 4.030386375537782e-06, "loss": 0.2013, "num_tokens": 12862296.0, "reward": 0.75, "reward_std": 0.007678725756704807, "rewards//mean": 0.75, "rewards//std": 0.018843291327357292, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2978, "grad_norm": 2.670036554336548, "kl": 4.5353162325918674, "learning_rate": 4.029131432486765e-06, "loss": 0.1814, "num_tokens": 12870952.0, "reward": 0.7421875, "reward_std": 0.013053884729743004, "rewards//mean": 0.7421875, "rewards//std": 0.03908729553222656, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.298, "grad_norm": 2.879382848739624, "kl": 4.839878886938095, "learning_rate": 4.02787587350179e-06, "loss": 0.1936, "num_tokens": 12879552.0, "reward": 0.72698974609375, "reward_std": 0.01621217280626297, "rewards//mean": 0.72698974609375, "rewards//std": 0.033919330686330795, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2982, "grad_norm": 2.864288568496704, "kl": 4.396785516291857, "learning_rate": 4.0266196990885955e-06, "loss": 0.1759, "num_tokens": 12888144.0, "reward": 0.75537109375, "reward_std": 0.012992036528885365, "rewards//mean": 0.75537109375, "rewards//std": 0.033061493188142776, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2984, "grad_norm": 3.944153308868408, "kl": 4.8247092217206955, "learning_rate": 4.02536290975317e-06, "loss": 0.193, "num_tokens": 12896776.0, "reward": 0.75018310546875, "reward_std": 0.010169756598770618, "rewards//mean": 0.75018310546875, "rewards//std": 0.030028115957975388, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2986, "grad_norm": 1.7587748765945435, "kl": 4.360402375459671, "learning_rate": 4.024105506001745e-06, "loss": 0.1744, "num_tokens": 12905488.0, "reward": 0.737548828125, "reward_std": 0.011785809881985188, "rewards//mean": 0.737548828125, "rewards//std": 0.022292152047157288, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2988, "grad_norm": 2.2188475131988525, "kl": 2.4948428384959698, "learning_rate": 4.022847488340806e-06, "loss": 0.0998, "num_tokens": 12914184.0, "reward": 0.7332763671875, "reward_std": 0.010249923914670944, "rewards//mean": 0.7332763671875, "rewards//std": 0.03392898291349411, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.299, "grad_norm": 2.3185861110687256, "kl": 3.9336375780403614, "learning_rate": 4.02158885727708e-06, "loss": 0.1573, "num_tokens": 12922832.0, "reward": 0.76898193359375, "reward_std": 0.013835008256137371, "rewards//mean": 0.76898193359375, "rewards//std": 0.030788281932473183, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2992, "grad_norm": 3.1732141971588135, "kl": 4.516754850745201, "learning_rate": 4.020329613317545e-06, "loss": 0.1807, "num_tokens": 12931536.0, "reward": 0.7799072265625, "reward_std": 0.013118177652359009, "rewards//mean": 0.7799072265625, "rewards//std": 0.03535354882478714, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2994, "grad_norm": 4.068804740905762, "kl": 4.382329002022743, "learning_rate": 4.0190697569694235e-06, "loss": 0.1753, "num_tokens": 12940248.0, "reward": 0.7420654296875, "reward_std": 0.01609034463763237, "rewards//mean": 0.7420654296875, "rewards//std": 0.03040117584168911, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2996, "grad_norm": 2.665220260620117, "kl": 4.5021251030266285, "learning_rate": 4.0178092887401845e-06, "loss": 0.1801, "num_tokens": 12948888.0, "reward": 0.77166748046875, "reward_std": 0.01489767711609602, "rewards//mean": 0.77166748046875, "rewards//std": 0.02919795550405979, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2998, "grad_norm": 3.1346280574798584, "kl": 3.683500662446022, "learning_rate": 4.0165482091375466e-06, "loss": 0.1473, "num_tokens": 12957584.0, "reward": 0.71893310546875, "reward_std": 0.01587088592350483, "rewards//mean": 0.71893310546875, "rewards//std": 0.049671806395053864, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3, "grad_norm": 6.0744709968566895, "kl": 3.621046721935272, "learning_rate": 4.015286518669471e-06, "loss": 0.1448, "num_tokens": 12966240.0, "reward": 0.75830078125, "reward_std": 0.011738786473870277, "rewards//mean": 0.75830078125, "rewards//std": 0.026807021349668503, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3002, "grad_norm": 2.5515923500061035, "kl": 3.2715363204479218, "learning_rate": 4.014024217844167e-06, "loss": 0.1309, "num_tokens": 12974816.0, "reward": 0.77301025390625, "reward_std": 0.013088170439004898, "rewards//mean": 0.77301025390625, "rewards//std": 0.03388361260294914, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3004, "grad_norm": 2.723175287246704, "kl": 3.6039856374263763, "learning_rate": 4.012761307170089e-06, "loss": 0.1442, "num_tokens": 12983344.0, "reward": 0.73919677734375, "reward_std": 0.013965699821710587, "rewards//mean": 0.73919677734375, "rewards//std": 0.02615990862250328, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3006, "grad_norm": 2.1565959453582764, "kl": 4.537982240319252, "learning_rate": 4.011497787155938e-06, "loss": 0.1815, "num_tokens": 12992024.0, "reward": 0.724365234375, "reward_std": 0.014301682822406292, "rewards//mean": 0.724365234375, "rewards//std": 0.02575996331870556, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3008, "grad_norm": 3.890669345855713, "kl": 2.665222369134426, "learning_rate": 4.010233658310658e-06, "loss": 0.1066, "num_tokens": 13000600.0, "reward": 0.78326416015625, "reward_std": 0.017758097499608994, "rewards//mean": 0.78326416015625, "rewards//std": 0.031094543635845184, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.301, "grad_norm": 4.000090599060059, "kl": 4.0344742350280285, "learning_rate": 4.008968921143441e-06, "loss": 0.1614, "num_tokens": 13009272.0, "reward": 0.7501220703125, "reward_std": 0.017892658710479736, "rewards//mean": 0.7501220703125, "rewards//std": 0.027787191793322563, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3012, "grad_norm": 3.401834726333618, "kl": 2.8880666121840477, "learning_rate": 4.007703576163724e-06, "loss": 0.1155, "num_tokens": 13017872.0, "reward": 0.73883056640625, "reward_std": 0.013098958879709244, "rewards//mean": 0.73883056640625, "rewards//std": 0.035664912313222885, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3014, "grad_norm": 6.675731658935547, "kl": 1.5570039190351963, "learning_rate": 4.006437623881186e-06, "loss": 0.0623, "num_tokens": 13026464.0, "reward": 0.78387451171875, "reward_std": 0.00885712169110775, "rewards//mean": 0.78387451171875, "rewards//std": 0.02909928187727928, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3016, "grad_norm": 2.726919174194336, "kl": 3.61994319409132, "learning_rate": 4.005171064805754e-06, "loss": 0.1448, "num_tokens": 13035080.0, "reward": 0.74298095703125, "reward_std": 0.013533495366573334, "rewards//mean": 0.74298095703125, "rewards//std": 0.030331075191497803, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3018, "grad_norm": 2.095762252807617, "kl": 4.111942909657955, "learning_rate": 4.003903899447597e-06, "loss": 0.1645, "num_tokens": 13043824.0, "reward": 0.7322998046875, "reward_std": 0.010914807207882404, "rewards//mean": 0.7322998046875, "rewards//std": 0.024319633841514587, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.302, "grad_norm": 2.006782054901123, "kl": 2.7966101355850697, "learning_rate": 4.0026361283171285e-06, "loss": 0.1119, "num_tokens": 13052464.0, "reward": 0.749755859375, "reward_std": 0.009890217334032059, "rewards//mean": 0.749755859375, "rewards//std": 0.029081134125590324, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3022, "grad_norm": 4.149724960327148, "kl": 3.129473563283682, "learning_rate": 4.001367751925008e-06, "loss": 0.1252, "num_tokens": 13061144.0, "reward": 0.76483154296875, "reward_std": 0.009312788024544716, "rewards//mean": 0.76483154296875, "rewards//std": 0.02693033404648304, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3024, "grad_norm": 3.0192975997924805, "kl": 6.2304427325725555, "learning_rate": 4.000098770782136e-06, "loss": 0.2492, "num_tokens": 13069808.0, "reward": 0.722900390625, "reward_std": 0.012762438505887985, "rewards//mean": 0.722900390625, "rewards//std": 0.03394615277647972, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3026, "grad_norm": 1.8836160898208618, "kl": 4.318675339221954, "learning_rate": 3.998829185399659e-06, "loss": 0.1727, "num_tokens": 13078400.0, "reward": 0.759765625, "reward_std": 0.015666011720895767, "rewards//mean": 0.759765625, "rewards//std": 0.027081187814474106, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3028, "grad_norm": 3.436419725418091, "kl": 3.437701318413019, "learning_rate": 3.997558996288965e-06, "loss": 0.1375, "num_tokens": 13086976.0, "reward": 0.7628173828125, "reward_std": 0.015964217483997345, "rewards//mean": 0.7628173828125, "rewards//std": 0.02957329712808132, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.303, "grad_norm": 4.87109899520874, "kl": 2.246835172176361, "learning_rate": 3.996288203961686e-06, "loss": 0.0899, "num_tokens": 13095616.0, "reward": 0.753662109375, "reward_std": 0.006346551701426506, "rewards//mean": 0.753662109375, "rewards//std": 0.034209128469228745, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3032, "grad_norm": 5.91757345199585, "kl": 1.3186541683971882, "learning_rate": 3.995016808929698e-06, "loss": 0.0527, "num_tokens": 13104176.0, "reward": 0.7452392578125, "reward_std": 0.006468596868216991, "rewards//mean": 0.7452392578125, "rewards//std": 0.02670716680586338, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3034, "grad_norm": 2.354980707168579, "kl": 4.29764449596405, "learning_rate": 3.993744811705118e-06, "loss": 0.1719, "num_tokens": 13112776.0, "reward": 0.7476806640625, "reward_std": 0.014127172529697418, "rewards//mean": 0.7476806640625, "rewards//std": 0.03229975700378418, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3036, "grad_norm": 2.8933966159820557, "kl": 4.000209979712963, "learning_rate": 3.992472212800307e-06, "loss": 0.16, "num_tokens": 13121368.0, "reward": 0.7572021484375, "reward_std": 0.01566946879029274, "rewards//mean": 0.7572021484375, "rewards//std": 0.03298608586192131, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3038, "grad_norm": 3.904183864593506, "kl": 4.53306770324707, "learning_rate": 3.991199012727867e-06, "loss": 0.1813, "num_tokens": 13130136.0, "reward": 0.77923583984375, "reward_std": 0.021974995732307434, "rewards//mean": 0.77923583984375, "rewards//std": 0.036756012588739395, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.304, "grad_norm": 4.313360691070557, "kl": 5.013117797672749, "learning_rate": 3.989925212000641e-06, "loss": 0.2005, "num_tokens": 13138768.0, "reward": 0.751953125, "reward_std": 0.009261434897780418, "rewards//mean": 0.751953125, "rewards//std": 0.024646401405334473, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3042, "grad_norm": 2.9139199256896973, "kl": 4.355275988578796, "learning_rate": 3.98865081113172e-06, "loss": 0.1742, "num_tokens": 13147480.0, "reward": 0.7357177734375, "reward_std": 0.008917974308133125, "rewards//mean": 0.7357177734375, "rewards//std": 0.025866400450468063, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3044, "grad_norm": 6.974506855010986, "kl": 7.717843107879162, "learning_rate": 3.98737581063443e-06, "loss": 0.3087, "num_tokens": 13156176.0, "reward": 0.74981689453125, "reward_std": 0.008794200606644154, "rewards//mean": 0.74981689453125, "rewards//std": 0.03488604351878166, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3046, "grad_norm": 7.364071846008301, "kl": 6.765659511089325, "learning_rate": 3.986100211022341e-06, "loss": 0.2706, "num_tokens": 13164800.0, "reward": 0.75054931640625, "reward_std": 0.01643495261669159, "rewards//mean": 0.75054931640625, "rewards//std": 0.03362349420785904, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3048, "grad_norm": 2.3057780265808105, "kl": 6.770255953073502, "learning_rate": 3.984824012809265e-06, "loss": 0.2708, "num_tokens": 13173432.0, "reward": 0.761962890625, "reward_std": 0.014942335896193981, "rewards//mean": 0.761962890625, "rewards//std": 0.032732944935560226, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.305, "grad_norm": 3.885357618331909, "kl": 5.186366826295853, "learning_rate": 3.983547216509254e-06, "loss": 0.2075, "num_tokens": 13182008.0, "reward": 0.76287841796875, "reward_std": 0.01552078127861023, "rewards//mean": 0.76287841796875, "rewards//std": 0.025464966893196106, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3052, "grad_norm": 3.481714963912964, "kl": 5.462835013866425, "learning_rate": 3.982269822636602e-06, "loss": 0.2185, "num_tokens": 13190560.0, "reward": 0.763916015625, "reward_std": 0.009710673242807388, "rewards//mean": 0.763916015625, "rewards//std": 0.023050658404827118, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3054, "grad_norm": 6.869836807250977, "kl": 6.406372100114822, "learning_rate": 3.980991831705842e-06, "loss": 0.2563, "num_tokens": 13199200.0, "reward": 0.7440185546875, "reward_std": 0.01734215021133423, "rewards//mean": 0.7440185546875, "rewards//std": 0.03703812509775162, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3056, "grad_norm": 1.4896296262741089, "kl": 5.196640849113464, "learning_rate": 3.97971324423175e-06, "loss": 0.2079, "num_tokens": 13207840.0, "reward": 0.7427978515625, "reward_std": 0.013974122703075409, "rewards//mean": 0.7427978515625, "rewards//std": 0.028307482600212097, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3058, "grad_norm": 3.3353042602539062, "kl": 6.9838144183158875, "learning_rate": 3.97843406072934e-06, "loss": 0.2794, "num_tokens": 13216408.0, "reward": 0.727294921875, "reward_std": 0.012689370661973953, "rewards//mean": 0.727294921875, "rewards//std": 0.026114806532859802, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.306, "grad_norm": 1.999414324760437, "kl": 5.528324902057648, "learning_rate": 3.977154281713866e-06, "loss": 0.2211, "num_tokens": 13225024.0, "reward": 0.75177001953125, "reward_std": 0.019295403733849525, "rewards//mean": 0.75177001953125, "rewards//std": 0.0349033959209919, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3062, "grad_norm": 3.4351212978363037, "kl": 3.186705332249403, "learning_rate": 3.9758739077008256e-06, "loss": 0.1275, "num_tokens": 13233720.0, "reward": 0.7669677734375, "reward_std": 0.005291126202791929, "rewards//mean": 0.7669677734375, "rewards//std": 0.03159085661172867, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3064, "grad_norm": 3.9898831844329834, "kl": 5.059984661638737, "learning_rate": 3.97459293920595e-06, "loss": 0.2024, "num_tokens": 13242352.0, "reward": 0.777587890625, "reward_std": 0.01322136353701353, "rewards//mean": 0.777587890625, "rewards//std": 0.020325791090726852, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3066, "grad_norm": 2.658869981765747, "kl": 3.0981369502842426, "learning_rate": 3.9733113767452165e-06, "loss": 0.1239, "num_tokens": 13250976.0, "reward": 0.77301025390625, "reward_std": 0.010545320808887482, "rewards//mean": 0.77301025390625, "rewards//std": 0.02952943556010723, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3068, "grad_norm": 1.7661465406417847, "kl": 4.030710890889168, "learning_rate": 3.972029220834836e-06, "loss": 0.1612, "num_tokens": 13259616.0, "reward": 0.7215576171875, "reward_std": 0.01025434397161007, "rewards//mean": 0.7215576171875, "rewards//std": 0.041101712733507156, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.307, "grad_norm": 4.213487148284912, "kl": 2.702775727957487, "learning_rate": 3.970746471991261e-06, "loss": 0.1081, "num_tokens": 13268344.0, "reward": 0.75006103515625, "reward_std": 0.010417811572551727, "rewards//mean": 0.75006103515625, "rewards//std": 0.029630763456225395, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3072, "grad_norm": 4.660945892333984, "kl": 2.347678765654564, "learning_rate": 3.969463130731183e-06, "loss": 0.0939, "num_tokens": 13276984.0, "reward": 0.74090576171875, "reward_std": 0.010253921151161194, "rewards//mean": 0.74090576171875, "rewards//std": 0.025864865630865097, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3074, "grad_norm": 3.6448311805725098, "kl": 4.8562411069869995, "learning_rate": 3.968179197571532e-06, "loss": 0.1942, "num_tokens": 13285624.0, "reward": 0.750732421875, "reward_std": 0.01175819244235754, "rewards//mean": 0.750732421875, "rewards//std": 0.02402813732624054, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3076, "grad_norm": 5.100853443145752, "kl": 5.6830049604177475, "learning_rate": 3.966894673029476e-06, "loss": 0.2273, "num_tokens": 13294312.0, "reward": 0.75482177734375, "reward_std": 0.011804318986833096, "rewards//mean": 0.75482177734375, "rewards//std": 0.027274947613477707, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3078, "grad_norm": 6.658006191253662, "kl": 2.0131037458777428, "learning_rate": 3.965609557622421e-06, "loss": 0.0805, "num_tokens": 13302888.0, "reward": 0.72869873046875, "reward_std": 0.005462951026856899, "rewards//mean": 0.72869873046875, "rewards//std": 0.029206249862909317, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.308, "grad_norm": 5.981410980224609, "kl": 4.004415780305862, "learning_rate": 3.964323851868012e-06, "loss": 0.1602, "num_tokens": 13311616.0, "reward": 0.74542236328125, "reward_std": 0.015397053211927414, "rewards//mean": 0.74542236328125, "rewards//std": 0.031647052615880966, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3082, "grad_norm": 3.382993698120117, "kl": 4.99262972176075, "learning_rate": 3.96303755628413e-06, "loss": 0.1997, "num_tokens": 13320200.0, "reward": 0.7445068359375, "reward_std": 0.013256127946078777, "rewards//mean": 0.7445068359375, "rewards//std": 0.03586706519126892, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3084, "grad_norm": 2.3301899433135986, "kl": 2.7433438673615456, "learning_rate": 3.961750671388894e-06, "loss": 0.1097, "num_tokens": 13328760.0, "reward": 0.731689453125, "reward_std": 0.011386476457118988, "rewards//mean": 0.731689453125, "rewards//std": 0.035412803292274475, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3086, "grad_norm": 2.0895755290985107, "kl": 4.60318935289979, "learning_rate": 3.960463197700664e-06, "loss": 0.1841, "num_tokens": 13337336.0, "reward": 0.77288818359375, "reward_std": 0.015915989875793457, "rewards//mean": 0.77288818359375, "rewards//std": 0.03196924179792404, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3088, "grad_norm": 3.303290367126465, "kl": 5.8464949280023575, "learning_rate": 3.959175135738032e-06, "loss": 0.2339, "num_tokens": 13346008.0, "reward": 0.7752685546875, "reward_std": 0.012009695172309875, "rewards//mean": 0.7752685546875, "rewards//std": 0.027268078178167343, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.309, "grad_norm": 4.608892440795898, "kl": 3.7020562514662743, "learning_rate": 3.95788648601983e-06, "loss": 0.1481, "num_tokens": 13354600.0, "reward": 0.74072265625, "reward_std": 0.006680516991764307, "rewards//mean": 0.74072265625, "rewards//std": 0.031269371509552, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3092, "grad_norm": 3.3569841384887695, "kl": 7.069283738732338, "learning_rate": 3.956597249065126e-06, "loss": 0.2828, "num_tokens": 13363288.0, "reward": 0.77880859375, "reward_std": 0.023287571966648102, "rewards//mean": 0.77880859375, "rewards//std": 0.039380528032779694, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3094, "grad_norm": 3.5504496097564697, "kl": 2.6842099353671074, "learning_rate": 3.955307425393224e-06, "loss": 0.1074, "num_tokens": 13371904.0, "reward": 0.750244140625, "reward_std": 0.01209425088018179, "rewards//mean": 0.750244140625, "rewards//std": 0.02247609570622444, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3096, "grad_norm": 2.746368169784546, "kl": 3.1412529051303864, "learning_rate": 3.954017015523665e-06, "loss": 0.1257, "num_tokens": 13380552.0, "reward": 0.765380859375, "reward_std": 0.01681402698159218, "rewards//mean": 0.765380859375, "rewards//std": 0.031588222831487656, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3098, "grad_norm": 2.5239756107330322, "kl": 4.0731871128082275, "learning_rate": 3.9527260199762266e-06, "loss": 0.1629, "num_tokens": 13389160.0, "reward": 0.78118896484375, "reward_std": 0.01501405332237482, "rewards//mean": 0.78118896484375, "rewards//std": 0.03134667128324509, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.31, "grad_norm": 5.013501167297363, "kl": 6.112649917602539, "learning_rate": 3.95143443927092e-06, "loss": 0.2445, "num_tokens": 13397784.0, "reward": 0.73541259765625, "reward_std": 0.012426617555320263, "rewards//mean": 0.73541259765625, "rewards//std": 0.028075717389583588, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3102, "grad_norm": 3.9181103706359863, "kl": 2.8776444494724274, "learning_rate": 3.950142273927996e-06, "loss": 0.1151, "num_tokens": 13406464.0, "reward": 0.7359619140625, "reward_std": 0.01140708476305008, "rewards//mean": 0.7359619140625, "rewards//std": 0.03696448355913162, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3104, "grad_norm": 6.019432544708252, "kl": 6.184125475585461, "learning_rate": 3.948849524467937e-06, "loss": 0.2474, "num_tokens": 13415112.0, "reward": 0.7496337890625, "reward_std": 0.019330419600009918, "rewards//mean": 0.7496337890625, "rewards//std": 0.04154277965426445, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3106, "grad_norm": 4.136176109313965, "kl": 2.408672623336315, "learning_rate": 3.9475561914114625e-06, "loss": 0.0963, "num_tokens": 13423688.0, "reward": 0.7672119140625, "reward_std": 0.010384060442447662, "rewards//mean": 0.7672119140625, "rewards//std": 0.03652947396039963, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3108, "grad_norm": 3.3018338680267334, "kl": 1.738380428403616, "learning_rate": 3.946262275279528e-06, "loss": 0.0695, "num_tokens": 13432320.0, "reward": 0.75347900390625, "reward_std": 0.007586459629237652, "rewards//mean": 0.75347900390625, "rewards//std": 0.020671306177973747, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.311, "grad_norm": 3.595829725265503, "kl": 3.2437251210212708, "learning_rate": 3.944967776593321e-06, "loss": 0.1297, "num_tokens": 13440992.0, "reward": 0.7257080078125, "reward_std": 0.009898264892399311, "rewards//mean": 0.7257080078125, "rewards//std": 0.04108992591500282, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3112, "grad_norm": 7.152646064758301, "kl": 6.4876477643847466, "learning_rate": 3.9436726958742665e-06, "loss": 0.2595, "num_tokens": 13449704.0, "reward": 0.74200439453125, "reward_std": 0.008648525923490524, "rewards//mean": 0.74200439453125, "rewards//std": 0.029135674238204956, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3114, "grad_norm": 3.793785572052002, "kl": 3.751115173101425, "learning_rate": 3.9423770336440235e-06, "loss": 0.15, "num_tokens": 13458200.0, "reward": 0.7406005859375, "reward_std": 0.013699525967240334, "rewards//mean": 0.7406005859375, "rewards//std": 0.029746824875473976, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3116, "grad_norm": 2.0186686515808105, "kl": 5.793144643306732, "learning_rate": 3.941080790424483e-06, "loss": 0.2317, "num_tokens": 13466832.0, "reward": 0.7652587890625, "reward_std": 0.015554094687104225, "rewards//mean": 0.7652587890625, "rewards//std": 0.029196204617619514, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3118, "grad_norm": 3.7210536003112793, "kl": 5.053372606635094, "learning_rate": 3.939783966737774e-06, "loss": 0.2021, "num_tokens": 13475464.0, "reward": 0.78546142578125, "reward_std": 0.007705686613917351, "rewards//mean": 0.78546142578125, "rewards//std": 0.02623906545341015, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.312, "grad_norm": 4.083262920379639, "kl": 4.4861364141106606, "learning_rate": 3.938486563106254e-06, "loss": 0.1794, "num_tokens": 13484048.0, "reward": 0.75396728515625, "reward_std": 0.020328285172581673, "rewards//mean": 0.75396728515625, "rewards//std": 0.03799057751893997, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3122, "grad_norm": 5.260269641876221, "kl": 9.32509833574295, "learning_rate": 3.937188580052518e-06, "loss": 0.373, "num_tokens": 13492744.0, "reward": 0.73553466796875, "reward_std": 0.017937898635864258, "rewards//mean": 0.73553466796875, "rewards//std": 0.03130946680903435, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3124, "grad_norm": 3.738757610321045, "kl": 6.467083293944597, "learning_rate": 3.935890018099395e-06, "loss": 0.2587, "num_tokens": 13501400.0, "reward": 0.71295166015625, "reward_std": 0.009538974612951279, "rewards//mean": 0.71295166015625, "rewards//std": 0.037447985261678696, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3126, "grad_norm": 3.88601016998291, "kl": 5.527736730873585, "learning_rate": 3.934590877769944e-06, "loss": 0.2211, "num_tokens": 13509960.0, "reward": 0.78350830078125, "reward_std": 0.022511865943670273, "rewards//mean": 0.78350830078125, "rewards//std": 0.030716905370354652, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3128, "grad_norm": 2.4513373374938965, "kl": 5.661901403218508, "learning_rate": 3.933291159587459e-06, "loss": 0.2265, "num_tokens": 13518672.0, "reward": 0.75592041015625, "reward_std": 0.014948931522667408, "rewards//mean": 0.75592041015625, "rewards//std": 0.03431599214673042, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.313, "grad_norm": 3.82584547996521, "kl": 7.15450194478035, "learning_rate": 3.931990864075465e-06, "loss": 0.2862, "num_tokens": 13527360.0, "reward": 0.75384521484375, "reward_std": 0.022366268560290337, "rewards//mean": 0.75384521484375, "rewards//std": 0.03906555473804474, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3132, "grad_norm": 3.618346691131592, "kl": 5.740771688520908, "learning_rate": 3.9306899917577245e-06, "loss": 0.2296, "num_tokens": 13536168.0, "reward": 0.74237060546875, "reward_std": 0.010439842939376831, "rewards//mean": 0.74237060546875, "rewards//std": 0.022449729964137077, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3134, "grad_norm": 4.1069159507751465, "kl": 6.25952322781086, "learning_rate": 3.929388543158225e-06, "loss": 0.2504, "num_tokens": 13544952.0, "reward": 0.76171875, "reward_std": 0.015191497281193733, "rewards//mean": 0.76171875, "rewards//std": 0.03121122531592846, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3136, "grad_norm": 4.204971790313721, "kl": 6.11900432407856, "learning_rate": 3.928086518801192e-06, "loss": 0.2448, "num_tokens": 13553632.0, "reward": 0.73681640625, "reward_std": 0.012009566649794579, "rewards//mean": 0.73681640625, "rewards//std": 0.03180694952607155, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3138, "grad_norm": 4.623878002166748, "kl": 7.148261249065399, "learning_rate": 3.92678391921108e-06, "loss": 0.2859, "num_tokens": 13562288.0, "reward": 0.755859375, "reward_std": 0.011219678446650505, "rewards//mean": 0.755859375, "rewards//std": 0.026300795376300812, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.314, "grad_norm": 3.475017547607422, "kl": 8.332570195198059, "learning_rate": 3.925480744912575e-06, "loss": 0.3333, "num_tokens": 13570992.0, "reward": 0.7159423828125, "reward_std": 0.014727736823260784, "rewards//mean": 0.7159423828125, "rewards//std": 0.03395752236247063, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3142, "grad_norm": 3.202954053878784, "kl": 7.570593744516373, "learning_rate": 3.924176996430597e-06, "loss": 0.3028, "num_tokens": 13579696.0, "reward": 0.76318359375, "reward_std": 0.015310345217585564, "rewards//mean": 0.76318359375, "rewards//std": 0.028060683980584145, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3144, "grad_norm": 4.986186504364014, "kl": 6.580612495541573, "learning_rate": 3.922872674290296e-06, "loss": 0.2632, "num_tokens": 13588248.0, "reward": 0.76751708984375, "reward_std": 0.011059070006012917, "rewards//mean": 0.76751708984375, "rewards//std": 0.027383511886000633, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3146, "grad_norm": 5.8427581787109375, "kl": 3.545838311314583, "learning_rate": 3.921567779017051e-06, "loss": 0.1418, "num_tokens": 13596944.0, "reward": 0.77545166015625, "reward_std": 0.012224341742694378, "rewards//mean": 0.77545166015625, "rewards//std": 0.027735048905014992, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3148, "grad_norm": 4.409073352813721, "kl": 2.9774176329374313, "learning_rate": 3.9202623111364745e-06, "loss": 0.1191, "num_tokens": 13605600.0, "reward": 0.75, "reward_std": 0.012865251861512661, "rewards//mean": 0.75, "rewards//std": 0.030520694330334663, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.315, "grad_norm": 2.302351951599121, "kl": 5.247742265462875, "learning_rate": 3.918956271174409e-06, "loss": 0.2099, "num_tokens": 13614224.0, "reward": 0.7850341796875, "reward_std": 0.017034215852618217, "rewards//mean": 0.7850341796875, "rewards//std": 0.02715236321091652, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3152, "grad_norm": 2.224461317062378, "kl": 4.498161181807518, "learning_rate": 3.917649659656927e-06, "loss": 0.1799, "num_tokens": 13622760.0, "reward": 0.77093505859375, "reward_std": 0.013512791134417057, "rewards//mean": 0.77093505859375, "rewards//std": 0.02717375010251999, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3154, "grad_norm": 3.095242738723755, "kl": 5.918704517185688, "learning_rate": 3.916342477110332e-06, "loss": 0.2367, "num_tokens": 13631560.0, "reward": 0.76318359375, "reward_std": 0.011370647698640823, "rewards//mean": 0.76318359375, "rewards//std": 0.02455286681652069, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3156, "grad_norm": 5.29064416885376, "kl": 2.2055795155465603, "learning_rate": 3.915034724061157e-06, "loss": 0.0882, "num_tokens": 13640152.0, "reward": 0.7669677734375, "reward_std": 0.005272216629236937, "rewards//mean": 0.7669677734375, "rewards//std": 0.023253023624420166, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3158, "grad_norm": 3.1178343296051025, "kl": 5.230402827262878, "learning_rate": 3.913726401036164e-06, "loss": 0.2092, "num_tokens": 13648872.0, "reward": 0.7464599609375, "reward_std": 0.014498105272650719, "rewards//mean": 0.7464599609375, "rewards//std": 0.0289797130972147, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.316, "grad_norm": 2.5020105838775635, "kl": 3.6374882236123085, "learning_rate": 3.912417508562345e-06, "loss": 0.1455, "num_tokens": 13657488.0, "reward": 0.7215576171875, "reward_std": 0.007394429296255112, "rewards//mean": 0.7215576171875, "rewards//std": 0.03922627493739128, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3162, "grad_norm": 4.2352728843688965, "kl": 3.2757504247128963, "learning_rate": 3.911108047166924e-06, "loss": 0.131, "num_tokens": 13666072.0, "reward": 0.7701416015625, "reward_std": 0.013673141598701477, "rewards//mean": 0.7701416015625, "rewards//std": 0.029415220022201538, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3164, "grad_norm": 2.905606746673584, "kl": 4.839938536286354, "learning_rate": 3.909798017377348e-06, "loss": 0.1936, "num_tokens": 13674672.0, "reward": 0.76385498046875, "reward_std": 0.015406662598252296, "rewards//mean": 0.76385498046875, "rewards//std": 0.025704003870487213, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3166, "grad_norm": 6.110284805297852, "kl": 7.631506323814392, "learning_rate": 3.908487419721302e-06, "loss": 0.3053, "num_tokens": 13683352.0, "reward": 0.73760986328125, "reward_std": 0.016895674169063568, "rewards//mean": 0.73760986328125, "rewards//std": 0.04232160001993179, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3168, "grad_norm": 2.3218517303466797, "kl": 4.245824441313744, "learning_rate": 3.90717625472669e-06, "loss": 0.1698, "num_tokens": 13691944.0, "reward": 0.7568359375, "reward_std": 0.01431223377585411, "rewards//mean": 0.7568359375, "rewards//std": 0.03482051566243172, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.317, "grad_norm": 4.101203918457031, "kl": 3.779865499585867, "learning_rate": 3.9058645229216515e-06, "loss": 0.1512, "num_tokens": 13700544.0, "reward": 0.74627685546875, "reward_std": 0.009184526279568672, "rewards//mean": 0.74627685546875, "rewards//std": 0.03622794523835182, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3172, "grad_norm": 3.0328354835510254, "kl": 4.4597286731004715, "learning_rate": 3.90455222483455e-06, "loss": 0.1784, "num_tokens": 13709176.0, "reward": 0.74359130859375, "reward_std": 0.010393932461738586, "rewards//mean": 0.74359130859375, "rewards//std": 0.031190786510705948, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3174, "grad_norm": 2.7058539390563965, "kl": 6.741106703877449, "learning_rate": 3.903239360993982e-06, "loss": 0.2696, "num_tokens": 13717928.0, "reward": 0.7542724609375, "reward_std": 0.014929295517504215, "rewards//mean": 0.7542724609375, "rewards//std": 0.029550766572356224, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3176, "grad_norm": 2.9895646572113037, "kl": 5.48422472178936, "learning_rate": 3.9019259319287666e-06, "loss": 0.2194, "num_tokens": 13726672.0, "reward": 0.7484130859375, "reward_std": 0.011701429262757301, "rewards//mean": 0.7484130859375, "rewards//std": 0.028533319011330605, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3178, "grad_norm": 2.3191111087799072, "kl": 4.559877127408981, "learning_rate": 3.900611938167953e-06, "loss": 0.1824, "num_tokens": 13735360.0, "reward": 0.755126953125, "reward_std": 0.010484208352863789, "rewards//mean": 0.755126953125, "rewards//std": 0.027497222647070885, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.318, "grad_norm": 6.599245071411133, "kl": 3.88273523747921, "learning_rate": 3.899297380240819e-06, "loss": 0.1553, "num_tokens": 13743944.0, "reward": 0.74664306640625, "reward_std": 0.013531392440199852, "rewards//mean": 0.74664306640625, "rewards//std": 0.0372721366584301, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3182, "grad_norm": 3.5313384532928467, "kl": 5.8088071048259735, "learning_rate": 3.897982258676867e-06, "loss": 0.2324, "num_tokens": 13752664.0, "reward": 0.7498779296875, "reward_std": 0.01186527218669653, "rewards//mean": 0.7498779296875, "rewards//std": 0.02706301584839821, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3184, "grad_norm": 4.270092964172363, "kl": 4.018723681569099, "learning_rate": 3.896666574005829e-06, "loss": 0.1607, "num_tokens": 13761248.0, "reward": 0.7471923828125, "reward_std": 0.010826993733644485, "rewards//mean": 0.7471923828125, "rewards//std": 0.03019733913242817, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3186, "grad_norm": 4.29428768157959, "kl": 5.104453679174185, "learning_rate": 3.895350326757662e-06, "loss": 0.2042, "num_tokens": 13769792.0, "reward": 0.69921875, "reward_std": 0.019029954448342323, "rewards//mean": 0.69921875, "rewards//std": 0.05014530569314957, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3188, "grad_norm": 3.655932664871216, "kl": 5.787795152515173, "learning_rate": 3.89403351746255e-06, "loss": 0.2315, "num_tokens": 13778368.0, "reward": 0.74761962890625, "reward_std": 0.013004809617996216, "rewards//mean": 0.74761962890625, "rewards//std": 0.0366281233727932, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.319, "grad_norm": 5.370179176330566, "kl": 6.402551054954529, "learning_rate": 3.892716146650903e-06, "loss": 0.2561, "num_tokens": 13786984.0, "reward": 0.72320556640625, "reward_std": 0.011947907507419586, "rewards//mean": 0.72320556640625, "rewards//std": 0.03352249413728714, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3192, "grad_norm": 6.868188381195068, "kl": 4.2734921127557755, "learning_rate": 3.8913982148533605e-06, "loss": 0.1709, "num_tokens": 13795544.0, "reward": 0.75225830078125, "reward_std": 0.01276173535734415, "rewards//mean": 0.75225830078125, "rewards//std": 0.03729771450161934, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3194, "grad_norm": 4.368999004364014, "kl": 5.60970002412796, "learning_rate": 3.890079722600781e-06, "loss": 0.2244, "num_tokens": 13804232.0, "reward": 0.78375244140625, "reward_std": 0.016638927161693573, "rewards//mean": 0.78375244140625, "rewards//std": 0.026787742972373962, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3196, "grad_norm": 4.236912250518799, "kl": 4.494893729686737, "learning_rate": 3.888760670424257e-06, "loss": 0.1798, "num_tokens": 13812880.0, "reward": 0.69921875, "reward_std": 0.014720442704856396, "rewards//mean": 0.69921875, "rewards//std": 0.03863857313990593, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3198, "grad_norm": 2.420100450515747, "kl": 3.8236324340105057, "learning_rate": 3.887441058855102e-06, "loss": 0.1529, "num_tokens": 13821464.0, "reward": 0.74468994140625, "reward_std": 0.013085152953863144, "rewards//mean": 0.74468994140625, "rewards//std": 0.036580149084329605, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.32, "grad_norm": 5.399158954620361, "kl": 4.3547841757535934, "learning_rate": 3.8861208884248526e-06, "loss": 0.1742, "num_tokens": 13830008.0, "reward": 0.70184326171875, "reward_std": 0.011372125707566738, "rewards//mean": 0.70184326171875, "rewards//std": 0.01639731228351593, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3202, "grad_norm": 1.738990306854248, "kl": 4.246619135141373, "learning_rate": 3.8848001596652765e-06, "loss": 0.1699, "num_tokens": 13838680.0, "reward": 0.7559814453125, "reward_std": 0.017016496509313583, "rewards//mean": 0.7559814453125, "rewards//std": 0.04404578357934952, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3204, "grad_norm": 5.545229911804199, "kl": 2.654592990875244, "learning_rate": 3.88347887310836e-06, "loss": 0.1062, "num_tokens": 13847352.0, "reward": 0.7479248046875, "reward_std": 0.008967110887169838, "rewards//mean": 0.7479248046875, "rewards//std": 0.029642829671502113, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3206, "grad_norm": 2.187619686126709, "kl": 2.4325969852507114, "learning_rate": 3.882157029286321e-06, "loss": 0.0973, "num_tokens": 13855992.0, "reward": 0.74383544921875, "reward_std": 0.010725999251008034, "rewards//mean": 0.74383544921875, "rewards//std": 0.023518014699220657, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3208, "grad_norm": 5.178046226501465, "kl": 2.822666585445404, "learning_rate": 3.880834628731594e-06, "loss": 0.1129, "num_tokens": 13864784.0, "reward": 0.76092529296875, "reward_std": 0.013888237997889519, "rewards//mean": 0.76092529296875, "rewards//std": 0.021589016541838646, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.321, "grad_norm": 5.3757429122924805, "kl": 2.5389484018087387, "learning_rate": 3.8795116719768445e-06, "loss": 0.1016, "num_tokens": 13873328.0, "reward": 0.76055908203125, "reward_std": 0.017088573426008224, "rewards//mean": 0.76055908203125, "rewards//std": 0.0257957112044096, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3212, "grad_norm": 4.751806259155273, "kl": 4.00392509996891, "learning_rate": 3.8781881595549585e-06, "loss": 0.1602, "num_tokens": 13882000.0, "reward": 0.7335205078125, "reward_std": 0.007965567521750927, "rewards//mean": 0.7335205078125, "rewards//std": 0.026887930929660797, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3214, "grad_norm": 3.033128023147583, "kl": 3.491590201854706, "learning_rate": 3.876864091999046e-06, "loss": 0.1397, "num_tokens": 13890640.0, "reward": 0.7528076171875, "reward_std": 0.018129397183656693, "rewards//mean": 0.7528076171875, "rewards//std": 0.03280385211110115, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3216, "grad_norm": 4.442730903625488, "kl": 3.122442625463009, "learning_rate": 3.875539469842443e-06, "loss": 0.1249, "num_tokens": 13899208.0, "reward": 0.74884033203125, "reward_std": 0.009502582252025604, "rewards//mean": 0.74884033203125, "rewards//std": 0.02871912717819214, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3218, "grad_norm": 2.1851048469543457, "kl": 3.5695908069610596, "learning_rate": 3.874214293618706e-06, "loss": 0.1428, "num_tokens": 13907808.0, "reward": 0.7408447265625, "reward_std": 0.012224107049405575, "rewards//mean": 0.7408447265625, "rewards//std": 0.02742086909711361, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.322, "grad_norm": 2.487928628921509, "kl": 4.388616859912872, "learning_rate": 3.872888563861615e-06, "loss": 0.1755, "num_tokens": 13916416.0, "reward": 0.7481689453125, "reward_std": 0.013844320550560951, "rewards//mean": 0.7481689453125, "rewards//std": 0.01585533283650875, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3222, "grad_norm": 4.379281044006348, "kl": 3.136295974254608, "learning_rate": 3.8715622811051754e-06, "loss": 0.1255, "num_tokens": 13925032.0, "reward": 0.70574951171875, "reward_std": 0.009376202709972858, "rewards//mean": 0.70574951171875, "rewards//std": 0.028898831456899643, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3224, "grad_norm": 2.4266488552093506, "kl": 4.521780371665955, "learning_rate": 3.8702354458836124e-06, "loss": 0.1809, "num_tokens": 13933616.0, "reward": 0.74737548828125, "reward_std": 0.011686433106660843, "rewards//mean": 0.74737548828125, "rewards//std": 0.03108724020421505, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3226, "grad_norm": 3.00685453414917, "kl": 4.307077512145042, "learning_rate": 3.868908058731376e-06, "loss": 0.1723, "num_tokens": 13942152.0, "reward": 0.74462890625, "reward_std": 0.011653238907456398, "rewards//mean": 0.74462890625, "rewards//std": 0.029687991365790367, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3228, "grad_norm": 8.057661056518555, "kl": 5.4305098205804825, "learning_rate": 3.867580120183138e-06, "loss": 0.2172, "num_tokens": 13950896.0, "reward": 0.7305908203125, "reward_std": 0.012947263196110725, "rewards//mean": 0.7305908203125, "rewards//std": 0.03372491896152496, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.323, "grad_norm": 1.6902443170547485, "kl": 4.279229156672955, "learning_rate": 3.86625163077379e-06, "loss": 0.1712, "num_tokens": 13959576.0, "reward": 0.7835693359375, "reward_std": 0.01230061799287796, "rewards//mean": 0.7835693359375, "rewards//std": 0.026057647541165352, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3232, "grad_norm": 3.876128911972046, "kl": 3.8744760043919086, "learning_rate": 3.864922591038448e-06, "loss": 0.155, "num_tokens": 13968200.0, "reward": 0.77691650390625, "reward_std": 0.01860947534441948, "rewards//mean": 0.77691650390625, "rewards//std": 0.03039439208805561, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3234, "grad_norm": 3.1879289150238037, "kl": 4.42099130153656, "learning_rate": 3.863593001512451e-06, "loss": 0.1768, "num_tokens": 13976824.0, "reward": 0.70831298828125, "reward_std": 0.012146164663136005, "rewards//mean": 0.70831298828125, "rewards//std": 0.034401021897792816, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3236, "grad_norm": 3.56443190574646, "kl": 3.9698301553726196, "learning_rate": 3.862262862731355e-06, "loss": 0.1588, "num_tokens": 13985536.0, "reward": 0.7567138671875, "reward_std": 0.012045308016240597, "rewards//mean": 0.7567138671875, "rewards//std": 0.036936625838279724, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3238, "grad_norm": 4.307621955871582, "kl": 3.8718565702438354, "learning_rate": 3.860932175230941e-06, "loss": 0.1549, "num_tokens": 13994200.0, "reward": 0.77996826171875, "reward_std": 0.014707080088555813, "rewards//mean": 0.77996826171875, "rewards//std": 0.03207654878497124, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.324, "grad_norm": 3.4598147869110107, "kl": 2.7318001836538315, "learning_rate": 3.85960093954721e-06, "loss": 0.1093, "num_tokens": 14002800.0, "reward": 0.75909423828125, "reward_std": 0.00861070491373539, "rewards//mean": 0.75909423828125, "rewards//std": 0.03279341012239456, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3242, "grad_norm": 5.636101245880127, "kl": 5.405292920768261, "learning_rate": 3.858269156216383e-06, "loss": 0.2162, "num_tokens": 14011320.0, "reward": 0.7354736328125, "reward_std": 0.023118987679481506, "rewards//mean": 0.7354736328125, "rewards//std": 0.0414537712931633, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3244, "grad_norm": 3.2637014389038086, "kl": 4.161361530423164, "learning_rate": 3.8569368257749025e-06, "loss": 0.1665, "num_tokens": 14019920.0, "reward": 0.7459716796875, "reward_std": 0.013144556432962418, "rewards//mean": 0.7459716796875, "rewards//std": 0.02942962571978569, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3246, "grad_norm": 1.4425047636032104, "kl": 3.104191865772009, "learning_rate": 3.855603948759431e-06, "loss": 0.1242, "num_tokens": 14028488.0, "reward": 0.75885009765625, "reward_std": 0.011636601760983467, "rewards//mean": 0.75885009765625, "rewards//std": 0.03197965770959854, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3248, "grad_norm": 4.829543590545654, "kl": 5.193379811942577, "learning_rate": 3.85427052570685e-06, "loss": 0.2077, "num_tokens": 14037240.0, "reward": 0.7396240234375, "reward_std": 0.01381854247301817, "rewards//mean": 0.7396240234375, "rewards//std": 0.035769011825323105, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.325, "grad_norm": 5.206241130828857, "kl": 3.5458853505551815, "learning_rate": 3.8529365571542645e-06, "loss": 0.1418, "num_tokens": 14045912.0, "reward": 0.75347900390625, "reward_std": 0.013382203876972198, "rewards//mean": 0.75347900390625, "rewards//std": 0.030226586386561394, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3252, "grad_norm": 2.5621023178100586, "kl": 5.335387617349625, "learning_rate": 3.8516020436389945e-06, "loss": 0.2134, "num_tokens": 14054608.0, "reward": 0.751708984375, "reward_std": 0.016708549112081528, "rewards//mean": 0.751708984375, "rewards//std": 0.025352440774440765, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3254, "grad_norm": 6.030847072601318, "kl": 0.9881482571363449, "learning_rate": 3.850266985698583e-06, "loss": 0.0395, "num_tokens": 14063248.0, "reward": 0.7255859375, "reward_std": 0.0038378227036446333, "rewards//mean": 0.7255859375, "rewards//std": 0.030177529901266098, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3256, "grad_norm": 3.7459475994110107, "kl": 3.565709799528122, "learning_rate": 3.848931383870792e-06, "loss": 0.1426, "num_tokens": 14071864.0, "reward": 0.76239013671875, "reward_std": 0.01435384526848793, "rewards//mean": 0.76239013671875, "rewards//std": 0.03201230242848396, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3258, "grad_norm": 1.889054775238037, "kl": 4.051982514560223, "learning_rate": 3.8475952386936e-06, "loss": 0.1621, "num_tokens": 14080496.0, "reward": 0.74383544921875, "reward_std": 0.009134771302342415, "rewards//mean": 0.74383544921875, "rewards//std": 0.027566980570554733, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.326, "grad_norm": 3.0458016395568848, "kl": 4.46809147298336, "learning_rate": 3.846258550705207e-06, "loss": 0.1787, "num_tokens": 14089224.0, "reward": 0.77362060546875, "reward_std": 0.0122940419241786, "rewards//mean": 0.77362060546875, "rewards//std": 0.027660174295306206, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3262, "grad_norm": 3.8411903381347656, "kl": 5.15141735970974, "learning_rate": 3.844921320444031e-06, "loss": 0.2061, "num_tokens": 14097808.0, "reward": 0.74505615234375, "reward_std": 0.012656359001994133, "rewards//mean": 0.74505615234375, "rewards//std": 0.03316931426525116, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3264, "grad_norm": 2.385568618774414, "kl": 6.338951587677002, "learning_rate": 3.84358354844871e-06, "loss": 0.2536, "num_tokens": 14106552.0, "reward": 0.7197265625, "reward_std": 0.017925230786204338, "rewards//mean": 0.7197265625, "rewards//std": 0.027533533051609993, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3266, "grad_norm": 4.874102592468262, "kl": 6.989391624927521, "learning_rate": 3.842245235258093e-06, "loss": 0.2796, "num_tokens": 14115120.0, "reward": 0.75848388671875, "reward_std": 0.01101350225508213, "rewards//mean": 0.75848388671875, "rewards//std": 0.0264528077095747, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3268, "grad_norm": 3.1344492435455322, "kl": 4.110005401074886, "learning_rate": 3.840906381411258e-06, "loss": 0.1644, "num_tokens": 14123784.0, "reward": 0.76220703125, "reward_std": 0.016101380810141563, "rewards//mean": 0.76220703125, "rewards//std": 0.03784371167421341, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.327, "grad_norm": 6.19732141494751, "kl": 4.503627173602581, "learning_rate": 3.839566987447492e-06, "loss": 0.1801, "num_tokens": 14132368.0, "reward": 0.7496337890625, "reward_std": 0.01359787117689848, "rewards//mean": 0.7496337890625, "rewards//std": 0.03393076732754707, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3272, "grad_norm": 2.645113468170166, "kl": 5.061479531228542, "learning_rate": 3.838227053906304e-06, "loss": 0.2025, "num_tokens": 14140984.0, "reward": 0.7774658203125, "reward_std": 0.01617370918393135, "rewards//mean": 0.7774658203125, "rewards//std": 0.03584172949194908, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3274, "grad_norm": 5.285517692565918, "kl": 6.474849700927734, "learning_rate": 3.836886581327418e-06, "loss": 0.259, "num_tokens": 14149696.0, "reward": 0.74151611328125, "reward_std": 0.012333615683019161, "rewards//mean": 0.74151611328125, "rewards//std": 0.029393276199698448, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3276, "grad_norm": 2.7533681392669678, "kl": 4.140344597399235, "learning_rate": 3.835545570250778e-06, "loss": 0.1656, "num_tokens": 14158304.0, "reward": 0.74090576171875, "reward_std": 0.01173315104097128, "rewards//mean": 0.74090576171875, "rewards//std": 0.024240689352154732, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3278, "grad_norm": 2.161599636077881, "kl": 2.9898950532078743, "learning_rate": 3.834204021216541e-06, "loss": 0.1196, "num_tokens": 14166912.0, "reward": 0.753173828125, "reward_std": 0.008994994685053825, "rewards//mean": 0.753173828125, "rewards//std": 0.034074340015649796, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.328, "grad_norm": 4.661413192749023, "kl": 3.9061889238655567, "learning_rate": 3.832861934765085e-06, "loss": 0.1562, "num_tokens": 14175480.0, "reward": 0.7685546875, "reward_std": 0.01384653802961111, "rewards//mean": 0.7685546875, "rewards//std": 0.030032716691493988, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3282, "grad_norm": 4.080323219299316, "kl": 3.435735657811165, "learning_rate": 3.8315193114369995e-06, "loss": 0.1374, "num_tokens": 14184088.0, "reward": 0.73822021484375, "reward_std": 0.009468725882470608, "rewards//mean": 0.73822021484375, "rewards//std": 0.033907726407051086, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3284, "grad_norm": 2.05501389503479, "kl": 3.6118517071008682, "learning_rate": 3.830176151773096e-06, "loss": 0.1445, "num_tokens": 14192728.0, "reward": 0.71826171875, "reward_std": 0.008289259858429432, "rewards//mean": 0.71826171875, "rewards//std": 0.03615480288863182, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3286, "grad_norm": 5.5025634765625, "kl": 6.697710808366537, "learning_rate": 3.828832456314397e-06, "loss": 0.2679, "num_tokens": 14201512.0, "reward": 0.7421875, "reward_std": 0.009993165731430054, "rewards//mean": 0.7421875, "rewards//std": 0.032354310154914856, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3288, "grad_norm": 1.891310453414917, "kl": 4.32382919639349, "learning_rate": 3.827488225602144e-06, "loss": 0.173, "num_tokens": 14210072.0, "reward": 0.716796875, "reward_std": 0.010930212214589119, "rewards//mean": 0.716796875, "rewards//std": 0.03533834591507912, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.329, "grad_norm": 1.6423219442367554, "kl": 4.4474758207798, "learning_rate": 3.8261434601777916e-06, "loss": 0.1779, "num_tokens": 14218784.0, "reward": 0.75140380859375, "reward_std": 0.014329606667160988, "rewards//mean": 0.75140380859375, "rewards//std": 0.034767381846904755, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3292, "grad_norm": 1.9795019626617432, "kl": 3.89892740547657, "learning_rate": 3.824798160583012e-06, "loss": 0.156, "num_tokens": 14227544.0, "reward": 0.7667236328125, "reward_std": 0.011394035071134567, "rewards//mean": 0.7667236328125, "rewards//std": 0.028232516720891, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3294, "grad_norm": 3.5120882987976074, "kl": 7.797329902648926, "learning_rate": 3.823452327359693e-06, "loss": 0.3119, "num_tokens": 14236200.0, "reward": 0.73980712890625, "reward_std": 0.015849804505705833, "rewards//mean": 0.73980712890625, "rewards//std": 0.03140167519450188, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3296, "grad_norm": 2.1600303649902344, "kl": 4.468118630349636, "learning_rate": 3.8221059610499336e-06, "loss": 0.1787, "num_tokens": 14244816.0, "reward": 0.74578857421875, "reward_std": 0.011331606656312943, "rewards//mean": 0.74578857421875, "rewards//std": 0.02700330875813961, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3298, "grad_norm": 2.6378049850463867, "kl": 5.479107990860939, "learning_rate": 3.820759062196052e-06, "loss": 0.2192, "num_tokens": 14253456.0, "reward": 0.7490234375, "reward_std": 0.014942120760679245, "rewards//mean": 0.7490234375, "rewards//std": 0.03021763451397419, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.33, "grad_norm": 3.6769237518310547, "kl": 4.725665733218193, "learning_rate": 3.819411631340577e-06, "loss": 0.189, "num_tokens": 14262040.0, "reward": 0.7593994140625, "reward_std": 0.016876935958862305, "rewards//mean": 0.7593994140625, "rewards//std": 0.043116286396980286, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3302, "grad_norm": 2.009699821472168, "kl": 4.328770384192467, "learning_rate": 3.8180636690262565e-06, "loss": 0.1732, "num_tokens": 14270656.0, "reward": 0.75079345703125, "reward_std": 0.010999202728271484, "rewards//mean": 0.75079345703125, "rewards//std": 0.028554201126098633, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3304, "grad_norm": 3.8817496299743652, "kl": 3.6385680735111237, "learning_rate": 3.8167151757960466e-06, "loss": 0.1455, "num_tokens": 14279280.0, "reward": 0.74755859375, "reward_std": 0.011752195656299591, "rewards//mean": 0.74755859375, "rewards//std": 0.03820677474141121, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3306, "grad_norm": 3.7502148151397705, "kl": 4.040041476488113, "learning_rate": 3.815366152193122e-06, "loss": 0.1616, "num_tokens": 14287896.0, "reward": 0.7579345703125, "reward_std": 0.013930363580584526, "rewards//mean": 0.7579345703125, "rewards//std": 0.03517325595021248, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3308, "grad_norm": 4.372478485107422, "kl": 2.4471462815999985, "learning_rate": 3.8140165987608678e-06, "loss": 0.0979, "num_tokens": 14296560.0, "reward": 0.746337890625, "reward_std": 0.010630778037011623, "rewards//mean": 0.746337890625, "rewards//std": 0.022357245907187462, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.331, "grad_norm": 6.621041774749756, "kl": 3.1807579547166824, "learning_rate": 3.812666516042885e-06, "loss": 0.1272, "num_tokens": 14305192.0, "reward": 0.745849609375, "reward_std": 0.019893880933523178, "rewards//mean": 0.745849609375, "rewards//std": 0.028509169816970825, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3312, "grad_norm": 3.284824848175049, "kl": 6.243016645312309, "learning_rate": 3.811315904582986e-06, "loss": 0.2497, "num_tokens": 14313848.0, "reward": 0.75970458984375, "reward_std": 0.015608346089720726, "rewards//mean": 0.75970458984375, "rewards//std": 0.03510448709130287, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3314, "grad_norm": 2.2132184505462646, "kl": 6.238913476467133, "learning_rate": 3.8099647649251984e-06, "loss": 0.2496, "num_tokens": 14322432.0, "reward": 0.75909423828125, "reward_std": 0.012755392119288445, "rewards//mean": 0.75909423828125, "rewards//std": 0.028091350570321083, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3316, "grad_norm": 5.210205554962158, "kl": 3.1331373900175095, "learning_rate": 3.808613097613759e-06, "loss": 0.1253, "num_tokens": 14331064.0, "reward": 0.7742919921875, "reward_std": 0.014839716255664825, "rewards//mean": 0.7742919921875, "rewards//std": 0.02921900898218155, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3318, "grad_norm": 3.672131299972534, "kl": 5.093923807144165, "learning_rate": 3.807260903193122e-06, "loss": 0.2038, "num_tokens": 14339672.0, "reward": 0.75341796875, "reward_std": 0.01626969315111637, "rewards//mean": 0.75341796875, "rewards//std": 0.020225748419761658, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.332, "grad_norm": 6.887454986572266, "kl": 7.690740406513214, "learning_rate": 3.805908182207948e-06, "loss": 0.3076, "num_tokens": 14348360.0, "reward": 0.7261962890625, "reward_std": 0.015995383262634277, "rewards//mean": 0.7261962890625, "rewards//std": 0.04500722140073776, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3322, "grad_norm": 3.3311855792999268, "kl": 4.059452801942825, "learning_rate": 3.804554935203115e-06, "loss": 0.1624, "num_tokens": 14356952.0, "reward": 0.763671875, "reward_std": 0.015422793105244637, "rewards//mean": 0.763671875, "rewards//std": 0.028603527694940567, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3324, "grad_norm": 5.8155927658081055, "kl": 6.019365459680557, "learning_rate": 3.8032011627237105e-06, "loss": 0.2408, "num_tokens": 14365712.0, "reward": 0.72637939453125, "reward_std": 0.012821802869439125, "rewards//mean": 0.72637939453125, "rewards//std": 0.03148496150970459, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3326, "grad_norm": 1.2999500036239624, "kl": 6.043249696493149, "learning_rate": 3.801846865315033e-06, "loss": 0.2417, "num_tokens": 14374352.0, "reward": 0.748779296875, "reward_std": 0.018505418673157692, "rewards//mean": 0.748779296875, "rewards//std": 0.034596338868141174, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3328, "grad_norm": 2.126056671142578, "kl": 5.403929501771927, "learning_rate": 3.8004920435225934e-06, "loss": 0.2162, "num_tokens": 14383008.0, "reward": 0.74444580078125, "reward_std": 0.010174641385674477, "rewards//mean": 0.74444580078125, "rewards//std": 0.03812621161341667, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.333, "grad_norm": 2.1426539421081543, "kl": 4.620162762701511, "learning_rate": 3.7991366978921152e-06, "loss": 0.1848, "num_tokens": 14391616.0, "reward": 0.77471923828125, "reward_std": 0.01206993404775858, "rewards//mean": 0.77471923828125, "rewards//std": 0.0319099985063076, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3332, "grad_norm": 1.7775729894638062, "kl": 4.945215664803982, "learning_rate": 3.7977808289695306e-06, "loss": 0.1978, "num_tokens": 14400200.0, "reward": 0.723876953125, "reward_std": 0.01060444675385952, "rewards//mean": 0.723876953125, "rewards//std": 0.034750018268823624, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3334, "grad_norm": 6.186906814575195, "kl": 7.106831792742014, "learning_rate": 3.796424437300982e-06, "loss": 0.2843, "num_tokens": 14408840.0, "reward": 0.753662109375, "reward_std": 0.014872390776872635, "rewards//mean": 0.753662109375, "rewards//std": 0.04185732826590538, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3336, "grad_norm": 4.686117172241211, "kl": 8.069875091314316, "learning_rate": 3.795067523432826e-06, "loss": 0.3228, "num_tokens": 14417464.0, "reward": 0.75439453125, "reward_std": 0.01766124740242958, "rewards//mean": 0.75439453125, "rewards//std": 0.043199241161346436, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3338, "grad_norm": 2.8668720722198486, "kl": 6.57467494904995, "learning_rate": 3.793710087911626e-06, "loss": 0.263, "num_tokens": 14426064.0, "reward": 0.73858642578125, "reward_std": 0.015704531222581863, "rewards//mean": 0.73858642578125, "rewards//std": 0.036413829773664474, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.334, "grad_norm": 2.0742673873901367, "kl": 4.703320808708668, "learning_rate": 3.7923521312841575e-06, "loss": 0.1881, "num_tokens": 14434632.0, "reward": 0.7501220703125, "reward_std": 0.012169692665338516, "rewards//mean": 0.7501220703125, "rewards//std": 0.031296227127313614, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3342, "grad_norm": 4.068416595458984, "kl": 4.855052947998047, "learning_rate": 3.7909936540974052e-06, "loss": 0.1942, "num_tokens": 14443280.0, "reward": 0.74688720703125, "reward_std": 0.01526482030749321, "rewards//mean": 0.74688720703125, "rewards//std": 0.03730664402246475, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3344, "grad_norm": 3.962437868118286, "kl": 4.3286442421376705, "learning_rate": 3.789634656898563e-06, "loss": 0.1731, "num_tokens": 14451808.0, "reward": 0.725830078125, "reward_std": 0.009717818349599838, "rewards//mean": 0.725830078125, "rewards//std": 0.028729207813739777, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3346, "grad_norm": 2.562312602996826, "kl": 4.548275280743837, "learning_rate": 3.788275140235036e-06, "loss": 0.1819, "num_tokens": 14460544.0, "reward": 0.760986328125, "reward_std": 0.01499534584581852, "rewards//mean": 0.760986328125, "rewards//std": 0.03218073025345802, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3348, "grad_norm": 1.7464113235473633, "kl": 3.8864385299384594, "learning_rate": 3.786915104654436e-06, "loss": 0.1555, "num_tokens": 14469176.0, "reward": 0.7264404296875, "reward_std": 0.01183013804256916, "rewards//mean": 0.7264404296875, "rewards//std": 0.037486620247364044, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.335, "grad_norm": 2.9997799396514893, "kl": 4.791320085525513, "learning_rate": 3.7855545507045856e-06, "loss": 0.1917, "num_tokens": 14477800.0, "reward": 0.74505615234375, "reward_std": 0.01385699212551117, "rewards//mean": 0.74505615234375, "rewards//std": 0.028367511928081512, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3352, "grad_norm": 3.0231175422668457, "kl": 4.074645131826401, "learning_rate": 3.7841934789335167e-06, "loss": 0.163, "num_tokens": 14486440.0, "reward": 0.75250244140625, "reward_std": 0.014676653780043125, "rewards//mean": 0.75250244140625, "rewards//std": 0.03280310332775116, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3354, "grad_norm": 2.389728546142578, "kl": 3.1608514711260796, "learning_rate": 3.7828318898894667e-06, "loss": 0.1264, "num_tokens": 14495040.0, "reward": 0.74493408203125, "reward_std": 0.010391874238848686, "rewards//mean": 0.74493408203125, "rewards//std": 0.029989778995513916, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3356, "grad_norm": 5.194191932678223, "kl": 4.416375134140253, "learning_rate": 3.781469784120886e-06, "loss": 0.1767, "num_tokens": 14503640.0, "reward": 0.76605224609375, "reward_std": 0.009443046525120735, "rewards//mean": 0.76605224609375, "rewards//std": 0.015994472429156303, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3358, "grad_norm": 2.6290111541748047, "kl": 3.4723577797412872, "learning_rate": 3.780107162176429e-06, "loss": 0.1389, "num_tokens": 14512272.0, "reward": 0.7392578125, "reward_std": 0.008590968325734138, "rewards//mean": 0.7392578125, "rewards//std": 0.029651254415512085, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.336, "grad_norm": 5.099137783050537, "kl": 2.362569287419319, "learning_rate": 3.7787440246049606e-06, "loss": 0.0945, "num_tokens": 14520976.0, "reward": 0.74932861328125, "reward_std": 0.009301655925810337, "rewards//mean": 0.74932861328125, "rewards//std": 0.032789718359708786, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3362, "grad_norm": 5.3240132331848145, "kl": 4.027422070503235, "learning_rate": 3.777380371955552e-06, "loss": 0.1611, "num_tokens": 14529696.0, "reward": 0.72552490234375, "reward_std": 0.008042724803090096, "rewards//mean": 0.72552490234375, "rewards//std": 0.03345287963747978, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3364, "grad_norm": 8.833404541015625, "kl": 1.708984363824129, "learning_rate": 3.7760162047774816e-06, "loss": 0.0684, "num_tokens": 14538296.0, "reward": 0.76849365234375, "reward_std": 0.005612165667116642, "rewards//mean": 0.76849365234375, "rewards//std": 0.026127483695745468, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3366, "grad_norm": 3.335836887359619, "kl": 5.177524261176586, "learning_rate": 3.7746515236202374e-06, "loss": 0.2071, "num_tokens": 14546904.0, "reward": 0.77020263671875, "reward_std": 0.010980047285556793, "rewards//mean": 0.77020263671875, "rewards//std": 0.03797822445631027, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3368, "grad_norm": 2.3170461654663086, "kl": 5.046614192426205, "learning_rate": 3.773286329033511e-06, "loss": 0.2019, "num_tokens": 14555512.0, "reward": 0.76690673828125, "reward_std": 0.010927481576800346, "rewards//mean": 0.76690673828125, "rewards//std": 0.029968570917844772, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.337, "grad_norm": 5.258528232574463, "kl": 6.64403460174799, "learning_rate": 3.7719206215672046e-06, "loss": 0.2658, "num_tokens": 14564144.0, "reward": 0.748046875, "reward_std": 0.010956788435578346, "rewards//mean": 0.748046875, "rewards//std": 0.03239171952009201, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3372, "grad_norm": 2.2199952602386475, "kl": 4.297748155891895, "learning_rate": 3.770554401771423e-06, "loss": 0.1719, "num_tokens": 14572752.0, "reward": 0.7547607421875, "reward_std": 0.015585494227707386, "rewards//mean": 0.7547607421875, "rewards//std": 0.03634168207645416, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3374, "grad_norm": 2.8922951221466064, "kl": 6.33044446259737, "learning_rate": 3.769187670196481e-06, "loss": 0.2532, "num_tokens": 14581416.0, "reward": 0.71112060546875, "reward_std": 0.009829193353652954, "rewards//mean": 0.71112060546875, "rewards//std": 0.040382158011198044, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3376, "grad_norm": 4.054256916046143, "kl": 4.580543287098408, "learning_rate": 3.7678204273928965e-06, "loss": 0.1832, "num_tokens": 14590024.0, "reward": 0.76043701171875, "reward_std": 0.013119174167513847, "rewards//mean": 0.76043701171875, "rewards//std": 0.028065472841262817, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3378, "grad_norm": 1.857383370399475, "kl": 3.952820636332035, "learning_rate": 3.766452673911396e-06, "loss": 0.1581, "num_tokens": 14598648.0, "reward": 0.71417236328125, "reward_std": 0.00874350592494011, "rewards//mean": 0.71417236328125, "rewards//std": 0.03770299628376961, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.338, "grad_norm": 7.499246120452881, "kl": 3.812231305986643, "learning_rate": 3.7650844103029093e-06, "loss": 0.1525, "num_tokens": 14607320.0, "reward": 0.74273681640625, "reward_std": 0.009935373440384865, "rewards//mean": 0.74273681640625, "rewards//std": 0.033924687653779984, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3382, "grad_norm": 6.163311958312988, "kl": 6.025201186537743, "learning_rate": 3.7637156371185744e-06, "loss": 0.241, "num_tokens": 14616048.0, "reward": 0.722900390625, "reward_std": 0.008395048789680004, "rewards//mean": 0.722900390625, "rewards//std": 0.02935468964278698, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3384, "grad_norm": 10.611398696899414, "kl": 3.0998025611042976, "learning_rate": 3.7623463549097318e-06, "loss": 0.124, "num_tokens": 14624664.0, "reward": 0.76025390625, "reward_std": 0.01041417196393013, "rewards//mean": 0.76025390625, "rewards//std": 0.02223367616534233, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3386, "grad_norm": 3.999767541885376, "kl": 5.197159826755524, "learning_rate": 3.760976564227928e-06, "loss": 0.2079, "num_tokens": 14633264.0, "reward": 0.76104736328125, "reward_std": 0.01729128137230873, "rewards//mean": 0.76104736328125, "rewards//std": 0.033149685710668564, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3388, "grad_norm": 7.153325080871582, "kl": 5.184502184391022, "learning_rate": 3.759606265624915e-06, "loss": 0.2074, "num_tokens": 14641864.0, "reward": 0.744873046875, "reward_std": 0.011871309950947762, "rewards//mean": 0.744873046875, "rewards//std": 0.027142604812979698, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.339, "grad_norm": 3.847614049911499, "kl": 4.592850685119629, "learning_rate": 3.7582354596526493e-06, "loss": 0.1837, "num_tokens": 14650480.0, "reward": 0.73162841796875, "reward_std": 0.010595773346722126, "rewards//mean": 0.73162841796875, "rewards//std": 0.02654534950852394, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3392, "grad_norm": 6.721877574920654, "kl": 2.935020200908184, "learning_rate": 3.7568641468632898e-06, "loss": 0.1174, "num_tokens": 14659040.0, "reward": 0.78375244140625, "reward_std": 0.00859799887984991, "rewards//mean": 0.78375244140625, "rewards//std": 0.017081869766116142, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3394, "grad_norm": 10.931442260742188, "kl": 5.36339545249939, "learning_rate": 3.7554923278092037e-06, "loss": 0.2145, "num_tokens": 14667672.0, "reward": 0.72100830078125, "reward_std": 0.015824105590581894, "rewards//mean": 0.72100830078125, "rewards//std": 0.026298996061086655, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3396, "grad_norm": 2.385443925857544, "kl": 3.9785833917558193, "learning_rate": 3.754120003042957e-06, "loss": 0.1591, "num_tokens": 14676360.0, "reward": 0.75457763671875, "reward_std": 0.009407286532223225, "rewards//mean": 0.75457763671875, "rewards//std": 0.023700110614299774, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3398, "grad_norm": 5.211919784545898, "kl": 3.959941156208515, "learning_rate": 3.752747173117324e-06, "loss": 0.1584, "num_tokens": 14684928.0, "reward": 0.7530517578125, "reward_std": 0.009212936274707317, "rewards//mean": 0.7530517578125, "rewards//std": 0.030100936070084572, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.34, "grad_norm": 5.103973388671875, "kl": 6.505168374627829, "learning_rate": 3.751373838585278e-06, "loss": 0.2602, "num_tokens": 14693568.0, "reward": 0.7435302734375, "reward_std": 0.012003414332866669, "rewards//mean": 0.7435302734375, "rewards//std": 0.03277245908975601, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3402, "grad_norm": 2.6884186267852783, "kl": 5.494181416928768, "learning_rate": 3.7500000000000005e-06, "loss": 0.2198, "num_tokens": 14702208.0, "reward": 0.734130859375, "reward_std": 0.013946492224931717, "rewards//mean": 0.734130859375, "rewards//std": 0.0407133512198925, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3404, "grad_norm": 3.525949001312256, "kl": 5.294013414531946, "learning_rate": 3.748625657914872e-06, "loss": 0.2118, "num_tokens": 14710888.0, "reward": 0.75311279296875, "reward_std": 0.009299607947468758, "rewards//mean": 0.75311279296875, "rewards//std": 0.029332442209124565, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3406, "grad_norm": 3.9921019077301025, "kl": 7.594709262251854, "learning_rate": 3.747250812883478e-06, "loss": 0.3038, "num_tokens": 14719608.0, "reward": 0.76025390625, "reward_std": 0.010880419053137302, "rewards//mean": 0.76025390625, "rewards//std": 0.03666039556264877, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3408, "grad_norm": 3.2974722385406494, "kl": 2.4764913581311703, "learning_rate": 3.7458754654596063e-06, "loss": 0.0991, "num_tokens": 14728232.0, "reward": 0.7738037109375, "reward_std": 0.009059741161763668, "rewards//mean": 0.7738037109375, "rewards//std": 0.028558773919939995, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.341, "grad_norm": 8.248628616333008, "kl": 3.717349626123905, "learning_rate": 3.744499616197246e-06, "loss": 0.1487, "num_tokens": 14736920.0, "reward": 0.74835205078125, "reward_std": 0.009622173383831978, "rewards//mean": 0.74835205078125, "rewards//std": 0.03214254975318909, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3412, "grad_norm": 2.6434848308563232, "kl": 4.18383377045393, "learning_rate": 3.743123265650589e-06, "loss": 0.1674, "num_tokens": 14745584.0, "reward": 0.7108154296875, "reward_std": 0.013213614001870155, "rewards//mean": 0.7108154296875, "rewards//std": 0.04018096625804901, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3414, "grad_norm": 4.964447021484375, "kl": 5.228684935718775, "learning_rate": 3.7417464143740283e-06, "loss": 0.2091, "num_tokens": 14754312.0, "reward": 0.74615478515625, "reward_std": 0.009133871644735336, "rewards//mean": 0.74615478515625, "rewards//std": 0.024768861010670662, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3416, "grad_norm": 1.8092936277389526, "kl": 3.4348859637975693, "learning_rate": 3.740369062922161e-06, "loss": 0.1374, "num_tokens": 14762928.0, "reward": 0.7322998046875, "reward_std": 0.012559403665363789, "rewards//mean": 0.7322998046875, "rewards//std": 0.03454795852303505, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3418, "grad_norm": 2.9312942028045654, "kl": 2.466802716255188, "learning_rate": 3.738991211849784e-06, "loss": 0.0987, "num_tokens": 14771584.0, "reward": 0.7701416015625, "reward_std": 0.008839068002998829, "rewards//mean": 0.7701416015625, "rewards//std": 0.026704899966716766, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.342, "grad_norm": 3.223961591720581, "kl": 4.80736443400383, "learning_rate": 3.7376128617118943e-06, "loss": 0.1923, "num_tokens": 14780264.0, "reward": 0.763427734375, "reward_std": 0.010787725448608398, "rewards//mean": 0.763427734375, "rewards//std": 0.022722594439983368, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3422, "grad_norm": 4.258539199829102, "kl": 4.878018535673618, "learning_rate": 3.7362340130636926e-06, "loss": 0.1951, "num_tokens": 14789040.0, "reward": 0.7625732421875, "reward_std": 0.009816067293286324, "rewards//mean": 0.7625732421875, "rewards//std": 0.027774114161729813, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3424, "grad_norm": 5.686222076416016, "kl": 4.201293881982565, "learning_rate": 3.7348546664605777e-06, "loss": 0.1681, "num_tokens": 14797600.0, "reward": 0.7322998046875, "reward_std": 0.013725521974265575, "rewards//mean": 0.7322998046875, "rewards//std": 0.043814219534397125, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3426, "grad_norm": 3.2337772846221924, "kl": 4.7717547453939915, "learning_rate": 3.7334748224581507e-06, "loss": 0.1909, "num_tokens": 14806248.0, "reward": 0.73114013671875, "reward_std": 0.01224786601960659, "rewards//mean": 0.73114013671875, "rewards//std": 0.02756478451192379, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3428, "grad_norm": 1.673991322517395, "kl": 6.022139221429825, "learning_rate": 3.732094481612214e-06, "loss": 0.2409, "num_tokens": 14814856.0, "reward": 0.7471923828125, "reward_std": 0.014800229109823704, "rewards//mean": 0.7471923828125, "rewards//std": 0.03160044178366661, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.343, "grad_norm": 3.805436134338379, "kl": 4.2859509550035, "learning_rate": 3.730713644478766e-06, "loss": 0.1714, "num_tokens": 14823456.0, "reward": 0.7476806640625, "reward_std": 0.012304323725402355, "rewards//mean": 0.7476806640625, "rewards//std": 0.03198326751589775, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3432, "grad_norm": 3.5949931144714355, "kl": 4.83143388107419, "learning_rate": 3.72933231161401e-06, "loss": 0.1933, "num_tokens": 14832032.0, "reward": 0.77325439453125, "reward_std": 0.020067133009433746, "rewards//mean": 0.77325439453125, "rewards//std": 0.027405062690377235, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3434, "grad_norm": 2.6658265590667725, "kl": 3.057475320994854, "learning_rate": 3.7279504835743453e-06, "loss": 0.1223, "num_tokens": 14840616.0, "reward": 0.71478271484375, "reward_std": 0.015706703066825867, "rewards//mean": 0.71478271484375, "rewards//std": 0.028429346159100533, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3436, "grad_norm": 2.7450592517852783, "kl": 5.1396384090185165, "learning_rate": 3.726568160916373e-06, "loss": 0.2056, "num_tokens": 14849224.0, "reward": 0.75421142578125, "reward_std": 0.013299671933054924, "rewards//mean": 0.75421142578125, "rewards//std": 0.02364639937877655, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3438, "grad_norm": 4.41118049621582, "kl": 2.3270882442593575, "learning_rate": 3.725185344196892e-06, "loss": 0.0931, "num_tokens": 14857824.0, "reward": 0.73114013671875, "reward_std": 0.006970011163502932, "rewards//mean": 0.73114013671875, "rewards//std": 0.026351897045969963, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.344, "grad_norm": 2.4432942867279053, "kl": 5.949076659977436, "learning_rate": 3.7238020339729015e-06, "loss": 0.238, "num_tokens": 14866520.0, "reward": 0.72149658203125, "reward_std": 0.016600128263235092, "rewards//mean": 0.72149658203125, "rewards//std": 0.03167860582470894, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3442, "grad_norm": 1.688549280166626, "kl": 4.576741896569729, "learning_rate": 3.7224182308015977e-06, "loss": 0.1831, "num_tokens": 14875352.0, "reward": 0.75732421875, "reward_std": 0.010729122906923294, "rewards//mean": 0.75732421875, "rewards//std": 0.024847041815519333, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3444, "grad_norm": 3.874100923538208, "kl": 5.122336074709892, "learning_rate": 3.721033935240376e-06, "loss": 0.2049, "num_tokens": 14883920.0, "reward": 0.7286376953125, "reward_std": 0.014914117753505707, "rewards//mean": 0.7286376953125, "rewards//std": 0.045228660106658936, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3446, "grad_norm": 5.967556476593018, "kl": 3.6220389492809772, "learning_rate": 3.7196491478468322e-06, "loss": 0.1449, "num_tokens": 14892592.0, "reward": 0.74810791015625, "reward_std": 0.015123222023248672, "rewards//mean": 0.74810791015625, "rewards//std": 0.04114602506160736, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3448, "grad_norm": 3.3234126567840576, "kl": 3.314839720726013, "learning_rate": 3.718263869178757e-06, "loss": 0.1326, "num_tokens": 14901192.0, "reward": 0.75201416015625, "reward_std": 0.013073880225419998, "rewards//mean": 0.75201416015625, "rewards//std": 0.035283420234918594, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.345, "grad_norm": 3.4035162925720215, "kl": 5.987032651901245, "learning_rate": 3.716878099794141e-06, "loss": 0.2395, "num_tokens": 14909840.0, "reward": 0.760986328125, "reward_std": 0.017380274832248688, "rewards//mean": 0.760986328125, "rewards//std": 0.03158055618405342, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3452, "grad_norm": 2.7718396186828613, "kl": 5.683594286441803, "learning_rate": 3.715491840251172e-06, "loss": 0.2273, "num_tokens": 14918520.0, "reward": 0.74822998046875, "reward_std": 0.01565616950392723, "rewards//mean": 0.74822998046875, "rewards//std": 0.032637014985084534, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3454, "grad_norm": 2.5319278240203857, "kl": 5.7525694668293, "learning_rate": 3.7141050911082357e-06, "loss": 0.2301, "num_tokens": 14927264.0, "reward": 0.7767333984375, "reward_std": 0.023202013224363327, "rewards//mean": 0.7767333984375, "rewards//std": 0.04547032713890076, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3456, "grad_norm": 3.539255380630493, "kl": 6.959166765213013, "learning_rate": 3.7127178529239126e-06, "loss": 0.2784, "num_tokens": 14935976.0, "reward": 0.74609375, "reward_std": 0.012471548281610012, "rewards//mean": 0.74609375, "rewards//std": 0.026060262694954872, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3458, "grad_norm": 2.1429970264434814, "kl": 4.278083831071854, "learning_rate": 3.7113301262569845e-06, "loss": 0.1711, "num_tokens": 14944624.0, "reward": 0.74652099609375, "reward_std": 0.015222073532640934, "rewards//mean": 0.74652099609375, "rewards//std": 0.03422499820590019, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.346, "grad_norm": 5.58650016784668, "kl": 5.295445293188095, "learning_rate": 3.7099419116664255e-06, "loss": 0.2118, "num_tokens": 14953312.0, "reward": 0.73944091796875, "reward_std": 0.012305302545428276, "rewards//mean": 0.73944091796875, "rewards//std": 0.029187584295868874, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3462, "grad_norm": 1.979343295097351, "kl": 4.374940887093544, "learning_rate": 3.7085532097114098e-06, "loss": 0.175, "num_tokens": 14961880.0, "reward": 0.78643798828125, "reward_std": 0.020684566348791122, "rewards//mean": 0.78643798828125, "rewards//std": 0.0385824590921402, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3464, "grad_norm": 2.581482172012329, "kl": 5.581102505326271, "learning_rate": 3.7071640209513054e-06, "loss": 0.2232, "num_tokens": 14970464.0, "reward": 0.71087646484375, "reward_std": 0.01371211837977171, "rewards//mean": 0.71087646484375, "rewards//std": 0.03494890779256821, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3466, "grad_norm": 5.933839797973633, "kl": 5.285151466727257, "learning_rate": 3.7057743459456786e-06, "loss": 0.2114, "num_tokens": 14979112.0, "reward": 0.75689697265625, "reward_std": 0.012620621360838413, "rewards//mean": 0.75689697265625, "rewards//std": 0.03580639511346817, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3468, "grad_norm": 1.7220008373260498, "kl": 5.160760007798672, "learning_rate": 3.7043841852542884e-06, "loss": 0.2064, "num_tokens": 14987824.0, "reward": 0.75360107421875, "reward_std": 0.015133208595216274, "rewards//mean": 0.75360107421875, "rewards//std": 0.03647364303469658, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.347, "grad_norm": 6.4207634925842285, "kl": 5.469633154571056, "learning_rate": 3.702993539437093e-06, "loss": 0.2188, "num_tokens": 14996432.0, "reward": 0.736572265625, "reward_std": 0.01424567960202694, "rewards//mean": 0.736572265625, "rewards//std": 0.034861356019973755, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3472, "grad_norm": 3.844006299972534, "kl": 5.490287780761719, "learning_rate": 3.7016024090542436e-06, "loss": 0.2196, "num_tokens": 15005096.0, "reward": 0.747314453125, "reward_std": 0.016152838245034218, "rewards//mean": 0.747314453125, "rewards//std": 0.0428466834127903, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3474, "grad_norm": 4.639206886291504, "kl": 5.156442314386368, "learning_rate": 3.7002107946660874e-06, "loss": 0.2063, "num_tokens": 15013688.0, "reward": 0.760986328125, "reward_std": 0.014628496952354908, "rewards//mean": 0.760986328125, "rewards//std": 0.036043256521224976, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3476, "grad_norm": 8.933615684509277, "kl": 3.931763805449009, "learning_rate": 3.6988186968331667e-06, "loss": 0.1573, "num_tokens": 15022464.0, "reward": 0.7386474609375, "reward_std": 0.009903889149427414, "rewards//mean": 0.7386474609375, "rewards//std": 0.027033913880586624, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3478, "grad_norm": 2.975750207901001, "kl": 3.6990005634725094, "learning_rate": 3.6974261161162182e-06, "loss": 0.148, "num_tokens": 15031144.0, "reward": 0.78277587890625, "reward_std": 0.010772471316158772, "rewards//mean": 0.78277587890625, "rewards//std": 0.024055764079093933, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.348, "grad_norm": 2.003229856491089, "kl": 3.1437401063740253, "learning_rate": 3.6960330530761735e-06, "loss": 0.1257, "num_tokens": 15039800.0, "reward": 0.7640380859375, "reward_std": 0.014627081342041492, "rewards//mean": 0.7640380859375, "rewards//std": 0.033952172845602036, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3482, "grad_norm": 5.07307243347168, "kl": 2.237717881798744, "learning_rate": 3.6946395082741582e-06, "loss": 0.0895, "num_tokens": 15048400.0, "reward": 0.749267578125, "reward_std": 0.011729895137250423, "rewards//mean": 0.749267578125, "rewards//std": 0.033681128174066544, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3484, "grad_norm": 3.884446620941162, "kl": 2.175688926130533, "learning_rate": 3.6932454822714915e-06, "loss": 0.087, "num_tokens": 15057024.0, "reward": 0.77960205078125, "reward_std": 0.014215859584510326, "rewards//mean": 0.77960205078125, "rewards//std": 0.0322628878057003, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3486, "grad_norm": 4.360764503479004, "kl": 2.66576112434268, "learning_rate": 3.6918509756296876e-06, "loss": 0.1066, "num_tokens": 15065576.0, "reward": 0.78631591796875, "reward_std": 0.0153195196762681, "rewards//mean": 0.78631591796875, "rewards//std": 0.027105143293738365, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3488, "grad_norm": 4.8437886238098145, "kl": 3.650896217674017, "learning_rate": 3.6904559889104534e-06, "loss": 0.146, "num_tokens": 15074264.0, "reward": 0.723876953125, "reward_std": 0.009737934917211533, "rewards//mean": 0.723876953125, "rewards//std": 0.024258870631456375, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.349, "grad_norm": 2.9325010776519775, "kl": 4.061206843703985, "learning_rate": 3.689060522675689e-06, "loss": 0.1624, "num_tokens": 15082896.0, "reward": 0.740478515625, "reward_std": 0.009307686239480972, "rewards//mean": 0.740478515625, "rewards//std": 0.02712475322186947, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3492, "grad_norm": 2.9927175045013428, "kl": 5.11807481944561, "learning_rate": 3.6876645774874882e-06, "loss": 0.2047, "num_tokens": 15091528.0, "reward": 0.73529052734375, "reward_std": 0.015730053186416626, "rewards//mean": 0.73529052734375, "rewards//std": 0.03456038609147072, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3494, "grad_norm": 3.135446786880493, "kl": 2.894928000867367, "learning_rate": 3.686268153908137e-06, "loss": 0.1158, "num_tokens": 15100120.0, "reward": 0.759765625, "reward_std": 0.014989394694566727, "rewards//mean": 0.759765625, "rewards//std": 0.02434980310499668, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3496, "grad_norm": 2.94649076461792, "kl": 3.066000049933791, "learning_rate": 3.684871252500116e-06, "loss": 0.1226, "num_tokens": 15108744.0, "reward": 0.74530029296875, "reward_std": 0.011397941038012505, "rewards//mean": 0.74530029296875, "rewards//std": 0.041052840650081635, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3498, "grad_norm": 3.3719658851623535, "kl": 2.9050259478390217, "learning_rate": 3.6834738738260955e-06, "loss": 0.1162, "num_tokens": 15117312.0, "reward": 0.73199462890625, "reward_std": 0.011112451553344727, "rewards//mean": 0.73199462890625, "rewards//std": 0.03182306885719299, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.35, "grad_norm": 3.9008166790008545, "kl": 3.823961317539215, "learning_rate": 3.6820760184489413e-06, "loss": 0.153, "num_tokens": 15125920.0, "reward": 0.74420166015625, "reward_std": 0.017376679927110672, "rewards//mean": 0.74420166015625, "rewards//std": 0.03587565943598747, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3502, "grad_norm": 2.7916665077209473, "kl": 2.747652105987072, "learning_rate": 3.6806776869317074e-06, "loss": 0.1099, "num_tokens": 15134488.0, "reward": 0.73651123046875, "reward_std": 0.00787161011248827, "rewards//mean": 0.73651123046875, "rewards//std": 0.035969629883766174, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3504, "grad_norm": 5.605264186859131, "kl": 4.086170427501202, "learning_rate": 3.679278879837642e-06, "loss": 0.1634, "num_tokens": 15143216.0, "reward": 0.76800537109375, "reward_std": 0.023673830553889275, "rewards//mean": 0.76800537109375, "rewards//std": 0.03797224536538124, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3506, "grad_norm": 5.638571262359619, "kl": 4.995779138058424, "learning_rate": 3.6778795977301856e-06, "loss": 0.1998, "num_tokens": 15152040.0, "reward": 0.71624755859375, "reward_std": 0.010780375450849533, "rewards//mean": 0.71624755859375, "rewards//std": 0.03834514692425728, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3508, "grad_norm": 5.868319034576416, "kl": 3.1744868103414774, "learning_rate": 3.676479841172968e-06, "loss": 0.127, "num_tokens": 15160672.0, "reward": 0.732666015625, "reward_std": 0.011098875664174557, "rewards//mean": 0.732666015625, "rewards//std": 0.02834729291498661, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.351, "grad_norm": 3.834345579147339, "kl": 3.4977457765489817, "learning_rate": 3.675079610729811e-06, "loss": 0.1399, "num_tokens": 15169312.0, "reward": 0.767333984375, "reward_std": 0.007949569262564182, "rewards//mean": 0.767333984375, "rewards//std": 0.02478238008916378, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3512, "grad_norm": 1.7872984409332275, "kl": 4.047887921333313, "learning_rate": 3.6736789069647273e-06, "loss": 0.1619, "num_tokens": 15177848.0, "reward": 0.74859619140625, "reward_std": 0.011681117117404938, "rewards//mean": 0.74859619140625, "rewards//std": 0.024787189438939095, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3514, "grad_norm": 7.534319877624512, "kl": 4.001326069235802, "learning_rate": 3.67227773044192e-06, "loss": 0.1601, "num_tokens": 15186416.0, "reward": 0.740966796875, "reward_std": 0.009145493619143963, "rewards//mean": 0.740966796875, "rewards//std": 0.02362149953842163, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3516, "grad_norm": 4.318870544433594, "kl": 5.494570571929216, "learning_rate": 3.670876081725784e-06, "loss": 0.2198, "num_tokens": 15195064.0, "reward": 0.719970703125, "reward_std": 0.012324806302785873, "rewards//mean": 0.719970703125, "rewards//std": 0.033392246812582016, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3518, "grad_norm": 4.4790239334106445, "kl": 3.3677577078342438, "learning_rate": 3.6694739613809015e-06, "loss": 0.1347, "num_tokens": 15203696.0, "reward": 0.74395751953125, "reward_std": 0.0096496706828475, "rewards//mean": 0.74395751953125, "rewards//std": 0.026133855804800987, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.352, "grad_norm": 5.450957775115967, "kl": 5.089045122265816, "learning_rate": 3.6680713699720473e-06, "loss": 0.2036, "num_tokens": 15212416.0, "reward": 0.7520751953125, "reward_std": 0.017107827588915825, "rewards//mean": 0.7520751953125, "rewards//std": 0.041834719479084015, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3522, "grad_norm": 4.189018249511719, "kl": 4.33225255459547, "learning_rate": 3.6666683080641846e-06, "loss": 0.1733, "num_tokens": 15221032.0, "reward": 0.73944091796875, "reward_std": 0.012221330776810646, "rewards//mean": 0.73944091796875, "rewards//std": 0.03076319582760334, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3524, "grad_norm": 1.8082475662231445, "kl": 3.7516178265213966, "learning_rate": 3.665264776222467e-06, "loss": 0.1501, "num_tokens": 15229704.0, "reward": 0.7420654296875, "reward_std": 0.013372143730521202, "rewards//mean": 0.7420654296875, "rewards//std": 0.0374348983168602, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3526, "grad_norm": 2.9922726154327393, "kl": 5.103166032582521, "learning_rate": 3.663860775012238e-06, "loss": 0.2041, "num_tokens": 15238400.0, "reward": 0.73504638671875, "reward_std": 0.015410533174872398, "rewards//mean": 0.73504638671875, "rewards//std": 0.028928151354193687, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3528, "grad_norm": 2.9312973022460938, "kl": 5.721847429871559, "learning_rate": 3.662456304999027e-06, "loss": 0.2289, "num_tokens": 15247072.0, "reward": 0.71917724609375, "reward_std": 0.012336707673966885, "rewards//mean": 0.71917724609375, "rewards//std": 0.03982464224100113, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.353, "grad_norm": 7.2277350425720215, "kl": 4.087280720472336, "learning_rate": 3.661051366748555e-06, "loss": 0.1635, "num_tokens": 15255752.0, "reward": 0.75830078125, "reward_std": 0.007890330627560616, "rewards//mean": 0.75830078125, "rewards//std": 0.031562335789203644, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3532, "grad_norm": 3.805083990097046, "kl": 2.6296266727149487, "learning_rate": 3.659645960826732e-06, "loss": 0.1052, "num_tokens": 15264384.0, "reward": 0.73828125, "reward_std": 0.010128607042133808, "rewards//mean": 0.73828125, "rewards//std": 0.02828849107027054, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3534, "grad_norm": 4.467214107513428, "kl": 3.869078677147627, "learning_rate": 3.658240087799655e-06, "loss": 0.1548, "num_tokens": 15273048.0, "reward": 0.76312255859375, "reward_std": 0.011874467134475708, "rewards//mean": 0.76312255859375, "rewards//std": 0.03139203414320946, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3536, "grad_norm": 4.349751949310303, "kl": 1.5508661270141602, "learning_rate": 3.656833748233608e-06, "loss": 0.062, "num_tokens": 15281632.0, "reward": 0.7506103515625, "reward_std": 0.0069464463740587234, "rewards//mean": 0.7506103515625, "rewards//std": 0.018755925819277763, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3538, "grad_norm": 3.453400135040283, "kl": 2.8583107534796, "learning_rate": 3.6554269426950666e-06, "loss": 0.1143, "num_tokens": 15290240.0, "reward": 0.77581787109375, "reward_std": 0.011044031009078026, "rewards//mean": 0.77581787109375, "rewards//std": 0.029410265386104584, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.354, "grad_norm": 2.3106017112731934, "kl": 3.464058078825474, "learning_rate": 3.6540196717506905e-06, "loss": 0.1386, "num_tokens": 15298944.0, "reward": 0.7286376953125, "reward_std": 0.011430254206061363, "rewards//mean": 0.7286376953125, "rewards//std": 0.03521454706788063, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3542, "grad_norm": 2.8070993423461914, "kl": 3.852851916104555, "learning_rate": 3.6526119359673283e-06, "loss": 0.1541, "num_tokens": 15307616.0, "reward": 0.76361083984375, "reward_std": 0.01346183754503727, "rewards//mean": 0.76361083984375, "rewards//std": 0.02439941093325615, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3544, "grad_norm": 4.07964563369751, "kl": 2.360837832093239, "learning_rate": 3.651203735912017e-06, "loss": 0.0944, "num_tokens": 15316304.0, "reward": 0.7574462890625, "reward_std": 0.007419370114803314, "rewards//mean": 0.7574462890625, "rewards//std": 0.03446196764707565, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3546, "grad_norm": 2.2616543769836426, "kl": 2.2828939333558083, "learning_rate": 3.6497950721519777e-06, "loss": 0.0913, "num_tokens": 15324912.0, "reward": 0.76873779296875, "reward_std": 0.011621924117207527, "rewards//mean": 0.76873779296875, "rewards//std": 0.023950444534420967, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3548, "grad_norm": 2.7810115814208984, "kl": 2.4413389042019844, "learning_rate": 3.648385945254621e-06, "loss": 0.0977, "num_tokens": 15333488.0, "reward": 0.7606201171875, "reward_std": 0.009526427835226059, "rewards//mean": 0.7606201171875, "rewards//std": 0.02639702521264553, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.355, "grad_norm": 1.904547929763794, "kl": 2.75910622254014, "learning_rate": 3.646976355787543e-06, "loss": 0.1104, "num_tokens": 15342088.0, "reward": 0.73529052734375, "reward_std": 0.010882018133997917, "rewards//mean": 0.73529052734375, "rewards//std": 0.04142029955983162, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3552, "grad_norm": 2.709667444229126, "kl": 1.587839487940073, "learning_rate": 3.6455663043185264e-06, "loss": 0.0635, "num_tokens": 15350712.0, "reward": 0.771484375, "reward_std": 0.008864466100931168, "rewards//mean": 0.771484375, "rewards//std": 0.02494918368756771, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3554, "grad_norm": 2.9289634227752686, "kl": 3.1669727340340614, "learning_rate": 3.644155791415539e-06, "loss": 0.1267, "num_tokens": 15359416.0, "reward": 0.75030517578125, "reward_std": 0.013817262835800648, "rewards//mean": 0.75030517578125, "rewards//std": 0.02934172935783863, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3556, "grad_norm": 2.066244602203369, "kl": 4.629983961582184, "learning_rate": 3.642744817646736e-06, "loss": 0.1852, "num_tokens": 15368040.0, "reward": 0.74163818359375, "reward_std": 0.01523747481405735, "rewards//mean": 0.74163818359375, "rewards//std": 0.02928854338824749, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3558, "grad_norm": 2.7459943294525146, "kl": 4.184919893741608, "learning_rate": 3.6413333835804567e-06, "loss": 0.1674, "num_tokens": 15376712.0, "reward": 0.73883056640625, "reward_std": 0.01499744039028883, "rewards//mean": 0.73883056640625, "rewards//std": 0.031140271574258804, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.356, "grad_norm": 2.7346317768096924, "kl": 4.333893194794655, "learning_rate": 3.639921489785227e-06, "loss": 0.1734, "num_tokens": 15385416.0, "reward": 0.7755126953125, "reward_std": 0.012135500088334084, "rewards//mean": 0.7755126953125, "rewards//std": 0.028253955766558647, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3562, "grad_norm": 3.842109203338623, "kl": 1.5773951038718224, "learning_rate": 3.6385091368297582e-06, "loss": 0.0631, "num_tokens": 15394024.0, "reward": 0.779296875, "reward_std": 0.012438731268048286, "rewards//mean": 0.779296875, "rewards//std": 0.033247776329517365, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3564, "grad_norm": 3.1556859016418457, "kl": 1.9987241104245186, "learning_rate": 3.637096325282945e-06, "loss": 0.0799, "num_tokens": 15402640.0, "reward": 0.7601318359375, "reward_std": 0.0080687515437603, "rewards//mean": 0.7601318359375, "rewards//std": 0.024545161053538322, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3566, "grad_norm": 3.3110463619232178, "kl": 5.864794071763754, "learning_rate": 3.6356830557138673e-06, "loss": 0.2346, "num_tokens": 15411344.0, "reward": 0.74346923828125, "reward_std": 0.01121966540813446, "rewards//mean": 0.74346923828125, "rewards//std": 0.03344428166747093, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3568, "grad_norm": 2.392683267593384, "kl": 4.6597472578287125, "learning_rate": 3.6342693286917906e-06, "loss": 0.1864, "num_tokens": 15419992.0, "reward": 0.74530029296875, "reward_std": 0.01295483484864235, "rewards//mean": 0.74530029296875, "rewards//std": 0.028815951198339462, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.357, "grad_norm": 1.7499934434890747, "kl": 2.9964402429759502, "learning_rate": 3.632855144786164e-06, "loss": 0.1199, "num_tokens": 15428584.0, "reward": 0.7786865234375, "reward_std": 0.015550244599580765, "rewards//mean": 0.7786865234375, "rewards//std": 0.03731338679790497, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3572, "grad_norm": 2.696981430053711, "kl": 4.8901857659220695, "learning_rate": 3.631440504566621e-06, "loss": 0.1956, "num_tokens": 15437216.0, "reward": 0.75054931640625, "reward_std": 0.014838481321930885, "rewards//mean": 0.75054931640625, "rewards//std": 0.03428686410188675, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3574, "grad_norm": 2.639160394668579, "kl": 5.581234708428383, "learning_rate": 3.630025408602978e-06, "loss": 0.2232, "num_tokens": 15445824.0, "reward": 0.7586669921875, "reward_std": 0.01156878937035799, "rewards//mean": 0.7586669921875, "rewards//std": 0.03505254164338112, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3576, "grad_norm": 2.9624719619750977, "kl": 4.819942608475685, "learning_rate": 3.6286098574652358e-06, "loss": 0.1928, "num_tokens": 15454392.0, "reward": 0.750732421875, "reward_std": 0.008959786966443062, "rewards//mean": 0.750732421875, "rewards//std": 0.030104205012321472, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3578, "grad_norm": 2.8236441612243652, "kl": 6.329464580863714, "learning_rate": 3.627193851723577e-06, "loss": 0.2532, "num_tokens": 15462984.0, "reward": 0.75628662109375, "reward_std": 0.020676400512456894, "rewards//mean": 0.75628662109375, "rewards//std": 0.03526711091399193, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.358, "grad_norm": 2.2510643005371094, "kl": 6.003470242023468, "learning_rate": 3.6257773919483706e-06, "loss": 0.2401, "num_tokens": 15471616.0, "reward": 0.75030517578125, "reward_std": 0.01057460531592369, "rewards//mean": 0.75030517578125, "rewards//std": 0.029792271554470062, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3582, "grad_norm": 3.4438953399658203, "kl": 8.145086526870728, "learning_rate": 3.624360478710165e-06, "loss": 0.3258, "num_tokens": 15480296.0, "reward": 0.76507568359375, "reward_std": 0.0157212782651186, "rewards//mean": 0.76507568359375, "rewards//std": 0.0307173989713192, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3584, "grad_norm": 3.5120270252227783, "kl": 6.0738122165203094, "learning_rate": 3.622943112579693e-06, "loss": 0.243, "num_tokens": 15488912.0, "reward": 0.697998046875, "reward_std": 0.013998921029269695, "rewards//mean": 0.697998046875, "rewards//std": 0.051389895379543304, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3586, "grad_norm": 4.090607643127441, "kl": 6.289332050830126, "learning_rate": 3.621525294127869e-06, "loss": 0.2516, "num_tokens": 15497536.0, "reward": 0.76507568359375, "reward_std": 0.013399823568761349, "rewards//mean": 0.76507568359375, "rewards//std": 0.02500607632100582, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3588, "grad_norm": 3.356712579727173, "kl": 6.252897277474403, "learning_rate": 3.6201070239257914e-06, "loss": 0.2501, "num_tokens": 15506176.0, "reward": 0.73907470703125, "reward_std": 0.016391899436712265, "rewards//mean": 0.73907470703125, "rewards//std": 0.03545589745044708, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.359, "grad_norm": 5.239433288574219, "kl": 6.6012173891067505, "learning_rate": 3.6186883025447382e-06, "loss": 0.264, "num_tokens": 15514840.0, "reward": 0.76190185546875, "reward_std": 0.015034276060760021, "rewards//mean": 0.76190185546875, "rewards//std": 0.0315694697201252, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3592, "grad_norm": 3.0736522674560547, "kl": 6.395701985806227, "learning_rate": 3.617269130556171e-06, "loss": 0.2558, "num_tokens": 15523432.0, "reward": 0.75262451171875, "reward_std": 0.018401451408863068, "rewards//mean": 0.75262451171875, "rewards//std": 0.029936226084828377, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3594, "grad_norm": 3.075705051422119, "kl": 8.35958057641983, "learning_rate": 3.61584950853173e-06, "loss": 0.3344, "num_tokens": 15532040.0, "reward": 0.74017333984375, "reward_std": 0.02437693625688553, "rewards//mean": 0.74017333984375, "rewards//std": 0.04444006830453873, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3596, "grad_norm": 3.691883087158203, "kl": 5.918193563818932, "learning_rate": 3.6144294370432427e-06, "loss": 0.2367, "num_tokens": 15540624.0, "reward": 0.76068115234375, "reward_std": 0.010484467260539532, "rewards//mean": 0.76068115234375, "rewards//std": 0.02044895477592945, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3598, "grad_norm": 3.4526267051696777, "kl": 4.4888074696063995, "learning_rate": 3.6130089166627102e-06, "loss": 0.1796, "num_tokens": 15549200.0, "reward": 0.7294921875, "reward_std": 0.01579568162560463, "rewards//mean": 0.7294921875, "rewards//std": 0.029405182227492332, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.36, "grad_norm": 5.06784200668335, "kl": 2.624183814972639, "learning_rate": 3.611587947962319e-06, "loss": 0.105, "num_tokens": 15557776.0, "reward": 0.78363037109375, "reward_std": 0.005551291164010763, "rewards//mean": 0.78363037109375, "rewards//std": 0.022815588861703873, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3602, "grad_norm": 4.819762229919434, "kl": 5.197374537587166, "learning_rate": 3.6101665315144357e-06, "loss": 0.2079, "num_tokens": 15566328.0, "reward": 0.737548828125, "reward_std": 0.016123276203870773, "rewards//mean": 0.737548828125, "rewards//std": 0.027620263397693634, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3604, "grad_norm": 4.579552173614502, "kl": 2.3194949105381966, "learning_rate": 3.608744667891606e-06, "loss": 0.0928, "num_tokens": 15575120.0, "reward": 0.792724609375, "reward_std": 0.013149932026863098, "rewards//mean": 0.792724609375, "rewards//std": 0.03211292251944542, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3606, "grad_norm": 5.873890399932861, "kl": 2.44502330198884, "learning_rate": 3.607322357666557e-06, "loss": 0.0978, "num_tokens": 15583712.0, "reward": 0.75494384765625, "reward_std": 0.011212198063731194, "rewards//mean": 0.75494384765625, "rewards//std": 0.02106734924018383, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3608, "grad_norm": 3.2312674522399902, "kl": 5.419662941247225, "learning_rate": 3.6058996014121944e-06, "loss": 0.2168, "num_tokens": 15592344.0, "reward": 0.74896240234375, "reward_std": 0.015298707410693169, "rewards//mean": 0.74896240234375, "rewards//std": 0.03928040713071823, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.361, "grad_norm": 3.9153892993927, "kl": 3.8486675024032593, "learning_rate": 3.6044763997016054e-06, "loss": 0.1539, "num_tokens": 15600928.0, "reward": 0.7572021484375, "reward_std": 0.01603119820356369, "rewards//mean": 0.7572021484375, "rewards//std": 0.029365774244070053, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3612, "grad_norm": 2.7449498176574707, "kl": 3.540650635957718, "learning_rate": 3.6030527531080533e-06, "loss": 0.1416, "num_tokens": 15609536.0, "reward": 0.75421142578125, "reward_std": 0.012893864884972572, "rewards//mean": 0.75421142578125, "rewards//std": 0.028393646702170372, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3614, "grad_norm": 1.919557809829712, "kl": 4.996287688612938, "learning_rate": 3.6016286622049857e-06, "loss": 0.1999, "num_tokens": 15618136.0, "reward": 0.75421142578125, "reward_std": 0.018937092274427414, "rewards//mean": 0.75421142578125, "rewards//std": 0.03936086967587471, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3616, "grad_norm": 2.9231443405151367, "kl": 5.571706473827362, "learning_rate": 3.600204127566023e-06, "loss": 0.2229, "num_tokens": 15626680.0, "reward": 0.74285888671875, "reward_std": 0.010228563100099564, "rewards//mean": 0.74285888671875, "rewards//std": 0.023412218317389488, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3618, "grad_norm": 3.5472300052642822, "kl": 2.701965544372797, "learning_rate": 3.5987791497649705e-06, "loss": 0.1081, "num_tokens": 15635376.0, "reward": 0.77410888671875, "reward_std": 0.01017923466861248, "rewards//mean": 0.77410888671875, "rewards//std": 0.022103166207671165, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.362, "grad_norm": 1.4947911500930786, "kl": 4.934126876294613, "learning_rate": 3.5973537293758076e-06, "loss": 0.1974, "num_tokens": 15644080.0, "reward": 0.7457275390625, "reward_std": 0.014975632540881634, "rewards//mean": 0.7457275390625, "rewards//std": 0.032494135200977325, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3622, "grad_norm": 3.2727203369140625, "kl": 4.251815564930439, "learning_rate": 3.595927866972694e-06, "loss": 0.1701, "num_tokens": 15652968.0, "reward": 0.7340087890625, "reward_std": 0.008255157619714737, "rewards//mean": 0.7340087890625, "rewards//std": 0.036276642233133316, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3624, "grad_norm": 4.858521461486816, "kl": 3.510824877768755, "learning_rate": 3.594501563129966e-06, "loss": 0.1404, "num_tokens": 15661712.0, "reward": 0.76995849609375, "reward_std": 0.01405145600438118, "rewards//mean": 0.76995849609375, "rewards//std": 0.035407621413469315, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3626, "grad_norm": 3.0405054092407227, "kl": 2.0984547287225723, "learning_rate": 3.59307481842214e-06, "loss": 0.0839, "num_tokens": 15670432.0, "reward": 0.7706298828125, "reward_std": 0.01243960577994585, "rewards//mean": 0.7706298828125, "rewards//std": 0.027861183509230614, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3628, "grad_norm": 5.275864124298096, "kl": 3.847918577492237, "learning_rate": 3.5916476334239077e-06, "loss": 0.1539, "num_tokens": 15679056.0, "reward": 0.75201416015625, "reward_std": 0.01918402872979641, "rewards//mean": 0.75201416015625, "rewards//std": 0.03136598318815231, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.363, "grad_norm": 1.858066439628601, "kl": 4.3311394937336445, "learning_rate": 3.5902200087101386e-06, "loss": 0.1732, "num_tokens": 15687624.0, "reward": 0.73974609375, "reward_std": 0.010124302469193935, "rewards//mean": 0.73974609375, "rewards//std": 0.028910942375659943, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3632, "grad_norm": 3.5171544551849365, "kl": 3.418184995651245, "learning_rate": 3.5887919448558813e-06, "loss": 0.1367, "num_tokens": 15696272.0, "reward": 0.772705078125, "reward_std": 0.012927958741784096, "rewards//mean": 0.772705078125, "rewards//std": 0.02821025624871254, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3634, "grad_norm": 3.4542486667633057, "kl": 4.784733880311251, "learning_rate": 3.587363442436358e-06, "loss": 0.1914, "num_tokens": 15704840.0, "reward": 0.751708984375, "reward_std": 0.012502050958573818, "rewards//mean": 0.751708984375, "rewards//std": 0.039780572056770325, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3636, "grad_norm": 2.294384717941284, "kl": 5.057851001620293, "learning_rate": 3.5859345020269702e-06, "loss": 0.2023, "num_tokens": 15713432.0, "reward": 0.76837158203125, "reward_std": 0.027551379054784775, "rewards//mean": 0.76837158203125, "rewards//std": 0.03912247344851494, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3638, "grad_norm": 2.4142749309539795, "kl": 3.4691352248191833, "learning_rate": 3.584505124203295e-06, "loss": 0.1388, "num_tokens": 15722048.0, "reward": 0.7491455078125, "reward_std": 0.008535878732800484, "rewards//mean": 0.7491455078125, "rewards//std": 0.03091077134013176, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.364, "grad_norm": 3.9719607830047607, "kl": 4.603520058095455, "learning_rate": 3.5830753095410857e-06, "loss": 0.1841, "num_tokens": 15730736.0, "reward": 0.73150634765625, "reward_std": 0.009856452234089375, "rewards//mean": 0.73150634765625, "rewards//std": 0.046273015439510345, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3642, "grad_norm": 2.374037742614746, "kl": 5.066146817058325, "learning_rate": 3.581645058616271e-06, "loss": 0.2026, "num_tokens": 15739304.0, "reward": 0.75994873046875, "reward_std": 0.009962935000658035, "rewards//mean": 0.75994873046875, "rewards//std": 0.030313102528452873, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3644, "grad_norm": 3.2533576488494873, "kl": 5.167050890624523, "learning_rate": 3.5802143720049565e-06, "loss": 0.2067, "num_tokens": 15747912.0, "reward": 0.71954345703125, "reward_std": 0.00951002910733223, "rewards//mean": 0.71954345703125, "rewards//std": 0.021465254947543144, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3646, "grad_norm": 2.9645206928253174, "kl": 7.691446557641029, "learning_rate": 3.5787832502834214e-06, "loss": 0.3077, "num_tokens": 15756560.0, "reward": 0.77899169921875, "reward_std": 0.021929670125246048, "rewards//mean": 0.77899169921875, "rewards//std": 0.032769400626420975, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3648, "grad_norm": 3.0390965938568115, "kl": 4.091933220624924, "learning_rate": 3.577351694028123e-06, "loss": 0.1637, "num_tokens": 15765136.0, "reward": 0.748291015625, "reward_std": 0.013302493840456009, "rewards//mean": 0.748291015625, "rewards//std": 0.023703385144472122, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.365, "grad_norm": 3.2863476276397705, "kl": 6.7575271390378475, "learning_rate": 3.57591970381569e-06, "loss": 0.2703, "num_tokens": 15773752.0, "reward": 0.74981689453125, "reward_std": 0.01398731954395771, "rewards//mean": 0.74981689453125, "rewards//std": 0.03910854086279869, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3652, "grad_norm": 3.020939588546753, "kl": 5.125618785619736, "learning_rate": 3.5744872802229296e-06, "loss": 0.205, "num_tokens": 15782448.0, "reward": 0.788330078125, "reward_std": 0.017335157841444016, "rewards//mean": 0.788330078125, "rewards//std": 0.030479997396469116, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3654, "grad_norm": 2.6754069328308105, "kl": 4.280109979212284, "learning_rate": 3.573054423826821e-06, "loss": 0.1712, "num_tokens": 15791048.0, "reward": 0.74969482421875, "reward_std": 0.01368811633437872, "rewards//mean": 0.74969482421875, "rewards//std": 0.02649626135826111, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3656, "grad_norm": 3.3102481365203857, "kl": 3.8247243016958237, "learning_rate": 3.5716211352045194e-06, "loss": 0.153, "num_tokens": 15799808.0, "reward": 0.744873046875, "reward_std": 0.009108740836381912, "rewards//mean": 0.744873046875, "rewards//std": 0.03873170539736748, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3658, "grad_norm": 1.7112064361572266, "kl": 5.7573626190423965, "learning_rate": 3.5701874149333515e-06, "loss": 0.2303, "num_tokens": 15808392.0, "reward": 0.72686767578125, "reward_std": 0.016148768365383148, "rewards//mean": 0.72686767578125, "rewards//std": 0.030562762171030045, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.366, "grad_norm": 5.248967170715332, "kl": 7.782414183020592, "learning_rate": 3.5687532635908216e-06, "loss": 0.3113, "num_tokens": 15817048.0, "reward": 0.73699951171875, "reward_std": 0.01457931287586689, "rewards//mean": 0.73699951171875, "rewards//std": 0.04005470499396324, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3662, "grad_norm": 1.368701457977295, "kl": 6.187209323048592, "learning_rate": 3.5673186817546047e-06, "loss": 0.2475, "num_tokens": 15825664.0, "reward": 0.75128173828125, "reward_std": 0.01789102330803871, "rewards//mean": 0.75128173828125, "rewards//std": 0.03230227530002594, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3664, "grad_norm": 3.28334379196167, "kl": 4.885332573205233, "learning_rate": 3.565883670002551e-06, "loss": 0.1954, "num_tokens": 15834296.0, "reward": 0.72332763671875, "reward_std": 0.009691819548606873, "rewards//mean": 0.72332763671875, "rewards//std": 0.03208786994218826, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3666, "grad_norm": 4.298150062561035, "kl": 4.574653310701251, "learning_rate": 3.564448228912682e-06, "loss": 0.183, "num_tokens": 15842992.0, "reward": 0.7724609375, "reward_std": 0.011794152669608593, "rewards//mean": 0.7724609375, "rewards//std": 0.0350700281560421, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3668, "grad_norm": 5.492315769195557, "kl": 5.024640657007694, "learning_rate": 3.563012359063194e-06, "loss": 0.201, "num_tokens": 15851704.0, "reward": 0.744384765625, "reward_std": 0.012358936481177807, "rewards//mean": 0.744384765625, "rewards//std": 0.032836370170116425, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.367, "grad_norm": 6.968992710113525, "kl": 2.042024552822113, "learning_rate": 3.5615760610324547e-06, "loss": 0.0817, "num_tokens": 15860272.0, "reward": 0.7635498046875, "reward_std": 0.009292231872677803, "rewards//mean": 0.7635498046875, "rewards//std": 0.027603542432188988, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3672, "grad_norm": 1.6963984966278076, "kl": 3.3329254500567913, "learning_rate": 3.560139335399005e-06, "loss": 0.1333, "num_tokens": 15868880.0, "reward": 0.76763916015625, "reward_std": 0.011097281239926815, "rewards//mean": 0.76763916015625, "rewards//std": 0.035269688814878464, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3674, "grad_norm": 2.60526180267334, "kl": 4.534869685769081, "learning_rate": 3.558702182741558e-06, "loss": 0.1814, "num_tokens": 15877640.0, "reward": 0.77899169921875, "reward_std": 0.015996374189853668, "rewards//mean": 0.77899169921875, "rewards//std": 0.034532781690359116, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3676, "grad_norm": 3.5594239234924316, "kl": 3.7886227518320084, "learning_rate": 3.557264603638998e-06, "loss": 0.1515, "num_tokens": 15886312.0, "reward": 0.7479248046875, "reward_std": 0.009803744032979012, "rewards//mean": 0.7479248046875, "rewards//std": 0.028021546080708504, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3678, "grad_norm": 3.8163363933563232, "kl": 5.436121881008148, "learning_rate": 3.555826598670382e-06, "loss": 0.2174, "num_tokens": 15894920.0, "reward": 0.7152099609375, "reward_std": 0.01225278340280056, "rewards//mean": 0.7152099609375, "rewards//std": 0.033543091267347336, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.368, "grad_norm": 2.3282289505004883, "kl": 2.5568104833364487, "learning_rate": 3.5543881684149382e-06, "loss": 0.1023, "num_tokens": 15903472.0, "reward": 0.7777099609375, "reward_std": 0.008531015366315842, "rewards//mean": 0.7777099609375, "rewards//std": 0.020347749814391136, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3682, "grad_norm": 3.855396032333374, "kl": 3.296573679894209, "learning_rate": 3.552949313452067e-06, "loss": 0.1319, "num_tokens": 15912104.0, "reward": 0.78363037109375, "reward_std": 0.00923638790845871, "rewards//mean": 0.78363037109375, "rewards//std": 0.020155178382992744, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3684, "grad_norm": 1.6408907175064087, "kl": 4.005101673305035, "learning_rate": 3.5515100343613375e-06, "loss": 0.1602, "num_tokens": 15920808.0, "reward": 0.72705078125, "reward_std": 0.012908661738038063, "rewards//mean": 0.72705078125, "rewards//std": 0.031500887125730515, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3686, "grad_norm": 2.849701404571533, "kl": 7.655633483082056, "learning_rate": 3.5500703317224926e-06, "loss": 0.3062, "num_tokens": 15929496.0, "reward": 0.736083984375, "reward_std": 0.011314325034618378, "rewards//mean": 0.736083984375, "rewards//std": 0.04414413869380951, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3688, "grad_norm": 3.097137451171875, "kl": 2.899221781641245, "learning_rate": 3.5486302061154433e-06, "loss": 0.116, "num_tokens": 15938136.0, "reward": 0.7603759765625, "reward_std": 0.013852903619408607, "rewards//mean": 0.7603759765625, "rewards//std": 0.027719559147953987, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.369, "grad_norm": 8.47358512878418, "kl": 3.5301214568316936, "learning_rate": 3.5471896581202724e-06, "loss": 0.1412, "num_tokens": 15946768.0, "reward": 0.73834228515625, "reward_std": 0.00988297164440155, "rewards//mean": 0.73834228515625, "rewards//std": 0.029376275837421417, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3692, "grad_norm": 2.9831252098083496, "kl": 5.402761921286583, "learning_rate": 3.5457486883172323e-06, "loss": 0.2161, "num_tokens": 15955440.0, "reward": 0.78125, "reward_std": 0.012065006420016289, "rewards//mean": 0.78125, "rewards//std": 0.03275606036186218, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3694, "grad_norm": 2.937488079071045, "kl": 4.375734597444534, "learning_rate": 3.544307297286746e-06, "loss": 0.175, "num_tokens": 15964048.0, "reward": 0.756103515625, "reward_std": 0.009139911271631718, "rewards//mean": 0.756103515625, "rewards//std": 0.03403878211975098, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3696, "grad_norm": 3.9374587535858154, "kl": 2.3039805740118027, "learning_rate": 3.5428654856094047e-06, "loss": 0.0922, "num_tokens": 15972664.0, "reward": 0.77886962890625, "reward_std": 0.01071839313954115, "rewards//mean": 0.77886962890625, "rewards//std": 0.022932061925530434, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3698, "grad_norm": 8.834426879882812, "kl": 4.310526676476002, "learning_rate": 3.541423253865971e-06, "loss": 0.1724, "num_tokens": 15981304.0, "reward": 0.72637939453125, "reward_std": 0.011293129995465279, "rewards//mean": 0.72637939453125, "rewards//std": 0.02869170717895031, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.37, "grad_norm": 2.413186550140381, "kl": 4.857105448842049, "learning_rate": 3.5399806026373746e-06, "loss": 0.1943, "num_tokens": 15989976.0, "reward": 0.77166748046875, "reward_std": 0.018350526690483093, "rewards//mean": 0.77166748046875, "rewards//std": 0.030796637758612633, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3702, "grad_norm": 5.84638786315918, "kl": 8.403250932693481, "learning_rate": 3.5385375325047167e-06, "loss": 0.3361, "num_tokens": 15998608.0, "reward": 0.77899169921875, "reward_std": 0.01690639927983284, "rewards//mean": 0.77899169921875, "rewards//std": 0.03536441549658775, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3704, "grad_norm": 3.667304039001465, "kl": 4.8515715301036835, "learning_rate": 3.537094044049264e-06, "loss": 0.1941, "num_tokens": 16007184.0, "reward": 0.75848388671875, "reward_std": 0.015843886882066727, "rewards//mean": 0.75848388671875, "rewards//std": 0.037946321070194244, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3706, "grad_norm": 2.2512717247009277, "kl": 3.9215282946825027, "learning_rate": 3.535650137852455e-06, "loss": 0.1569, "num_tokens": 16015936.0, "reward": 0.73260498046875, "reward_std": 0.010412785224616528, "rewards//mean": 0.73260498046875, "rewards//std": 0.02588241547346115, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3708, "grad_norm": 4.194582462310791, "kl": 5.359758798032999, "learning_rate": 3.5342058144958943e-06, "loss": 0.2144, "num_tokens": 16024664.0, "reward": 0.74334716796875, "reward_std": 0.010144805535674095, "rewards//mean": 0.74334716796875, "rewards//std": 0.04029359668493271, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.371, "grad_norm": 2.818694829940796, "kl": 4.2983516454696655, "learning_rate": 3.532761074561355e-06, "loss": 0.1719, "num_tokens": 16033264.0, "reward": 0.72418212890625, "reward_std": 0.013120047748088837, "rewards//mean": 0.72418212890625, "rewards//std": 0.033223122358322144, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3712, "grad_norm": 3.249021053314209, "kl": 4.816079258918762, "learning_rate": 3.531315918630778e-06, "loss": 0.1926, "num_tokens": 16041776.0, "reward": 0.7708740234375, "reward_std": 0.011809059418737888, "rewards//mean": 0.7708740234375, "rewards//std": 0.03111966699361801, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3714, "grad_norm": 4.380799770355225, "kl": 5.807232582941651, "learning_rate": 3.5298703472862725e-06, "loss": 0.2323, "num_tokens": 16050400.0, "reward": 0.73773193359375, "reward_std": 0.01249704509973526, "rewards//mean": 0.73773193359375, "rewards//std": 0.025285402312874794, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3716, "grad_norm": 5.402341842651367, "kl": 6.3560740277171135, "learning_rate": 3.528424361110115e-06, "loss": 0.2542, "num_tokens": 16059080.0, "reward": 0.75872802734375, "reward_std": 0.013268772512674332, "rewards//mean": 0.75872802734375, "rewards//std": 0.03085017018020153, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3718, "grad_norm": 5.18095588684082, "kl": 3.1697274781763554, "learning_rate": 3.526977960684747e-06, "loss": 0.1268, "num_tokens": 16067680.0, "reward": 0.7620849609375, "reward_std": 0.011016818694770336, "rewards//mean": 0.7620849609375, "rewards//std": 0.024965863674879074, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.372, "grad_norm": 1.960307002067566, "kl": 3.7448112219572067, "learning_rate": 3.52553114659278e-06, "loss": 0.1498, "num_tokens": 16076320.0, "reward": 0.75408935546875, "reward_std": 0.020055364817380905, "rewards//mean": 0.75408935546875, "rewards//std": 0.043611764907836914, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3722, "grad_norm": 2.1612024307250977, "kl": 2.4561969861388206, "learning_rate": 3.5240839194169885e-06, "loss": 0.0982, "num_tokens": 16084880.0, "reward": 0.74346923828125, "reward_std": 0.008086617104709148, "rewards//mean": 0.74346923828125, "rewards//std": 0.026503117755055428, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3724, "grad_norm": 2.0569639205932617, "kl": 2.920382661744952, "learning_rate": 3.522636279740318e-06, "loss": 0.1168, "num_tokens": 16093472.0, "reward": 0.74884033203125, "reward_std": 0.012318536639213562, "rewards//mean": 0.74884033203125, "rewards//std": 0.03034305013716221, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3726, "grad_norm": 1.8384166955947876, "kl": 4.195387817919254, "learning_rate": 3.521188228145876e-06, "loss": 0.1678, "num_tokens": 16102144.0, "reward": 0.72625732421875, "reward_std": 0.010663729161024094, "rewards//mean": 0.72625732421875, "rewards//std": 0.037694964557886124, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3728, "grad_norm": 1.8482376337051392, "kl": 4.471856124699116, "learning_rate": 3.5197397652169375e-06, "loss": 0.1789, "num_tokens": 16110744.0, "reward": 0.765625, "reward_std": 0.012883743271231651, "rewards//mean": 0.765625, "rewards//std": 0.03561824560165405, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.373, "grad_norm": 2.544144630432129, "kl": 2.5057146325707436, "learning_rate": 3.518290891536944e-06, "loss": 0.1002, "num_tokens": 16119360.0, "reward": 0.715576171875, "reward_std": 0.011591123417019844, "rewards//mean": 0.715576171875, "rewards//std": 0.035888370126485825, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3732, "grad_norm": 1.916542887687683, "kl": 2.0514227245002985, "learning_rate": 3.516841607689501e-06, "loss": 0.0821, "num_tokens": 16128112.0, "reward": 0.760009765625, "reward_std": 0.00854682456701994, "rewards//mean": 0.760009765625, "rewards//std": 0.03033662587404251, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3734, "grad_norm": 2.489891767501831, "kl": 3.7608317732810974, "learning_rate": 3.5153919142583797e-06, "loss": 0.1504, "num_tokens": 16136808.0, "reward": 0.7542724609375, "reward_std": 0.013767517171800137, "rewards//mean": 0.7542724609375, "rewards//std": 0.03243445232510567, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3736, "grad_norm": 2.0835702419281006, "kl": 2.9631675481796265, "learning_rate": 3.5139418118275174e-06, "loss": 0.1185, "num_tokens": 16145360.0, "reward": 0.75250244140625, "reward_std": 0.008370945230126381, "rewards//mean": 0.75250244140625, "rewards//std": 0.023337742313742638, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3738, "grad_norm": 2.661290407180786, "kl": 3.462757796049118, "learning_rate": 3.5124913009810137e-06, "loss": 0.1385, "num_tokens": 16154056.0, "reward": 0.75592041015625, "reward_std": 0.017960133031010628, "rewards//mean": 0.75592041015625, "rewards//std": 0.035118285566568375, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.374, "grad_norm": 3.9872844219207764, "kl": 3.6407096050679684, "learning_rate": 3.511040382303136e-06, "loss": 0.1456, "num_tokens": 16162616.0, "reward": 0.73175048828125, "reward_std": 0.017535485327243805, "rewards//mean": 0.73175048828125, "rewards//std": 0.04146303981542587, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3742, "grad_norm": 2.196523666381836, "kl": 4.523643150925636, "learning_rate": 3.5095890563783124e-06, "loss": 0.1809, "num_tokens": 16171304.0, "reward": 0.74761962890625, "reward_std": 0.014695564284920692, "rewards//mean": 0.74761962890625, "rewards//std": 0.04339568316936493, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3744, "grad_norm": 1.5978864431381226, "kl": 2.8502535447478294, "learning_rate": 3.508137323791138e-06, "loss": 0.114, "num_tokens": 16179984.0, "reward": 0.76629638671875, "reward_std": 0.009461971931159496, "rewards//mean": 0.76629638671875, "rewards//std": 0.03062807209789753, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3746, "grad_norm": 1.525713324546814, "kl": 4.138502024114132, "learning_rate": 3.50668518512637e-06, "loss": 0.1655, "num_tokens": 16188520.0, "reward": 0.7554931640625, "reward_std": 0.013477467000484467, "rewards//mean": 0.7554931640625, "rewards//std": 0.02439669519662857, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3748, "grad_norm": 9.045687675476074, "kl": 3.021356947720051, "learning_rate": 3.5052326409689296e-06, "loss": 0.1209, "num_tokens": 16197088.0, "reward": 0.7861328125, "reward_std": 0.01325589045882225, "rewards//mean": 0.7861328125, "rewards//std": 0.023302771151065826, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.375, "grad_norm": 4.741652488708496, "kl": 3.4749919213354588, "learning_rate": 3.503779691903902e-06, "loss": 0.139, "num_tokens": 16205768.0, "reward": 0.75018310546875, "reward_std": 0.013380886986851692, "rewards//mean": 0.75018310546875, "rewards//std": 0.03493461012840271, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3752, "grad_norm": 4.919980049133301, "kl": 3.201800972223282, "learning_rate": 3.5023263385165346e-06, "loss": 0.1281, "num_tokens": 16214376.0, "reward": 0.76434326171875, "reward_std": 0.01684371754527092, "rewards//mean": 0.76434326171875, "rewards//std": 0.02683911845088005, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3754, "grad_norm": 1.9610189199447632, "kl": 5.433921009302139, "learning_rate": 3.5008725813922383e-06, "loss": 0.2174, "num_tokens": 16222952.0, "reward": 0.72967529296875, "reward_std": 0.01622331514954567, "rewards//mean": 0.72967529296875, "rewards//std": 0.02814410999417305, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3756, "grad_norm": 3.9688291549682617, "kl": 6.453773874789476, "learning_rate": 3.499418421116585e-06, "loss": 0.2582, "num_tokens": 16231624.0, "reward": 0.7720947265625, "reward_std": 0.018322601914405823, "rewards//mean": 0.7720947265625, "rewards//std": 0.03653941676020622, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3758, "grad_norm": 4.758042335510254, "kl": 7.120992444455624, "learning_rate": 3.4979638582753117e-06, "loss": 0.2848, "num_tokens": 16240272.0, "reward": 0.7615966796875, "reward_std": 0.018252242356538773, "rewards//mean": 0.7615966796875, "rewards//std": 0.039006464183330536, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.376, "grad_norm": 1.8219389915466309, "kl": 5.9574927389621735, "learning_rate": 3.4965088934543153e-06, "loss": 0.2383, "num_tokens": 16248920.0, "reward": 0.73956298828125, "reward_std": 0.013029281049966812, "rewards//mean": 0.73956298828125, "rewards//std": 0.03870154917240143, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3762, "grad_norm": 3.0994060039520264, "kl": 5.270317375659943, "learning_rate": 3.4950535272396564e-06, "loss": 0.2108, "num_tokens": 16257576.0, "reward": 0.7628173828125, "reward_std": 0.017168058082461357, "rewards//mean": 0.7628173828125, "rewards//std": 0.034976452589035034, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3764, "grad_norm": 2.7112081050872803, "kl": 5.4090549647808075, "learning_rate": 3.4935977602175547e-06, "loss": 0.2164, "num_tokens": 16266248.0, "reward": 0.72772216796875, "reward_std": 0.013160137459635735, "rewards//mean": 0.72772216796875, "rewards//std": 0.03105313703417778, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3766, "grad_norm": 2.932710886001587, "kl": 6.998684898018837, "learning_rate": 3.492141592974395e-06, "loss": 0.2799, "num_tokens": 16274880.0, "reward": 0.758544921875, "reward_std": 0.016695940867066383, "rewards//mean": 0.758544921875, "rewards//std": 0.03251020610332489, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3768, "grad_norm": 2.0190110206604004, "kl": 5.505975015461445, "learning_rate": 3.4906850260967197e-06, "loss": 0.2202, "num_tokens": 16283720.0, "reward": 0.77923583984375, "reward_std": 0.018565673381090164, "rewards//mean": 0.77923583984375, "rewards//std": 0.036703262478113174, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.377, "grad_norm": 3.781447172164917, "kl": 6.001296103000641, "learning_rate": 3.4892280601712346e-06, "loss": 0.2401, "num_tokens": 16292424.0, "reward": 0.7652587890625, "reward_std": 0.01625027135014534, "rewards//mean": 0.7652587890625, "rewards//std": 0.032930970191955566, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3772, "grad_norm": 3.3916003704071045, "kl": 6.735005713999271, "learning_rate": 3.4877706957848052e-06, "loss": 0.2694, "num_tokens": 16301016.0, "reward": 0.75189208984375, "reward_std": 0.016989074647426605, "rewards//mean": 0.75189208984375, "rewards//std": 0.03659752383828163, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3774, "grad_norm": 3.1227471828460693, "kl": 4.243958551436663, "learning_rate": 3.486312933524457e-06, "loss": 0.1698, "num_tokens": 16309584.0, "reward": 0.74609375, "reward_std": 0.014929790049791336, "rewards//mean": 0.74609375, "rewards//std": 0.0322718620300293, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3776, "grad_norm": 5.100701332092285, "kl": 8.962956815958023, "learning_rate": 3.4848547739773782e-06, "loss": 0.3585, "num_tokens": 16318192.0, "reward": 0.7662353515625, "reward_std": 0.01798715442419052, "rewards//mean": 0.7662353515625, "rewards//std": 0.04075106978416443, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3778, "grad_norm": 1.948432207107544, "kl": 7.121491968631744, "learning_rate": 3.4833962177309137e-06, "loss": 0.2849, "num_tokens": 16326832.0, "reward": 0.75384521484375, "reward_std": 0.020326197147369385, "rewards//mean": 0.75384521484375, "rewards//std": 0.036101117730140686, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.378, "grad_norm": 1.5205079317092896, "kl": 5.375772934406996, "learning_rate": 3.4819372653725704e-06, "loss": 0.215, "num_tokens": 16335400.0, "reward": 0.73065185546875, "reward_std": 0.012093450874090195, "rewards//mean": 0.73065185546875, "rewards//std": 0.027930304408073425, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3782, "grad_norm": 8.445619583129883, "kl": 8.571057945489883, "learning_rate": 3.480477917490014e-06, "loss": 0.3428, "num_tokens": 16344080.0, "reward": 0.72491455078125, "reward_std": 0.01878506690263748, "rewards//mean": 0.72491455078125, "rewards//std": 0.048758283257484436, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3784, "grad_norm": 4.404247283935547, "kl": 8.082384705543518, "learning_rate": 3.47901817467107e-06, "loss": 0.3233, "num_tokens": 16352640.0, "reward": 0.7354736328125, "reward_std": 0.01779378205537796, "rewards//mean": 0.7354736328125, "rewards//std": 0.040971867740154266, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3786, "grad_norm": 2.8230013847351074, "kl": 4.705892436206341, "learning_rate": 3.4775580375037217e-06, "loss": 0.1882, "num_tokens": 16361280.0, "reward": 0.76385498046875, "reward_std": 0.012706092558801174, "rewards//mean": 0.76385498046875, "rewards//std": 0.03016090951859951, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3788, "grad_norm": 3.489748477935791, "kl": 6.008673928678036, "learning_rate": 3.4760975065761134e-06, "loss": 0.2403, "num_tokens": 16369872.0, "reward": 0.76416015625, "reward_std": 0.02048167772591114, "rewards//mean": 0.76416015625, "rewards//std": 0.034050341695547104, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.379, "grad_norm": 2.9648325443267822, "kl": 7.087110884487629, "learning_rate": 3.4746365824765455e-06, "loss": 0.2835, "num_tokens": 16378512.0, "reward": 0.76336669921875, "reward_std": 0.023405857384204865, "rewards//mean": 0.76336669921875, "rewards//std": 0.0346657857298851, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3792, "grad_norm": 2.0531909465789795, "kl": 4.329822055995464, "learning_rate": 3.4731752657934793e-06, "loss": 0.1732, "num_tokens": 16387160.0, "reward": 0.7408447265625, "reward_std": 0.015022501349449158, "rewards//mean": 0.7408447265625, "rewards//std": 0.034535687416791916, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3794, "grad_norm": 3.303743362426758, "kl": 5.287945941090584, "learning_rate": 3.471713557115532e-06, "loss": 0.2115, "num_tokens": 16395784.0, "reward": 0.7655029296875, "reward_std": 0.010594455525279045, "rewards//mean": 0.7655029296875, "rewards//std": 0.018597062677145004, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3796, "grad_norm": 5.308237552642822, "kl": 6.848035991191864, "learning_rate": 3.4702514570314804e-06, "loss": 0.2739, "num_tokens": 16404432.0, "reward": 0.74969482421875, "reward_std": 0.015441283583641052, "rewards//mean": 0.74969482421875, "rewards//std": 0.029954424127936363, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3798, "grad_norm": 2.084984302520752, "kl": 4.673119902610779, "learning_rate": 3.4687889661302577e-06, "loss": 0.1869, "num_tokens": 16413160.0, "reward": 0.75238037109375, "reward_std": 0.011193355545401573, "rewards//mean": 0.75238037109375, "rewards//std": 0.031083831563591957, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.38, "grad_norm": 3.0067574977874756, "kl": 5.0662331990897655, "learning_rate": 3.4673260850009553e-06, "loss": 0.2026, "num_tokens": 16421768.0, "reward": 0.73284912109375, "reward_std": 0.012430117465555668, "rewards//mean": 0.73284912109375, "rewards//std": 0.041073113679885864, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3802, "grad_norm": 1.8881032466888428, "kl": 5.393092446029186, "learning_rate": 3.4658628142328215e-06, "loss": 0.2157, "num_tokens": 16430400.0, "reward": 0.74066162109375, "reward_std": 0.015088468790054321, "rewards//mean": 0.74066162109375, "rewards//std": 0.037536006420850754, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3804, "grad_norm": 2.7245543003082275, "kl": 5.517898853868246, "learning_rate": 3.464399154415262e-06, "loss": 0.2207, "num_tokens": 16439048.0, "reward": 0.72607421875, "reward_std": 0.01673870161175728, "rewards//mean": 0.72607421875, "rewards//std": 0.03594652935862541, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3806, "grad_norm": 1.5048727989196777, "kl": 2.879439502954483, "learning_rate": 3.462935106137838e-06, "loss": 0.1152, "num_tokens": 16447656.0, "reward": 0.71685791015625, "reward_std": 0.010631646029651165, "rewards//mean": 0.71685791015625, "rewards//std": 0.04081505164504051, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3808, "grad_norm": 2.093132257461548, "kl": 3.8948060050606728, "learning_rate": 3.461470669990269e-06, "loss": 0.1558, "num_tokens": 16456272.0, "reward": 0.753173828125, "reward_std": 0.01659526489675045, "rewards//mean": 0.753173828125, "rewards//std": 0.04280143603682518, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.381, "grad_norm": 3.6071078777313232, "kl": 3.0677939411252737, "learning_rate": 3.4600058465624288e-06, "loss": 0.1227, "num_tokens": 16464952.0, "reward": 0.7894287109375, "reward_std": 0.02114536426961422, "rewards//mean": 0.7894287109375, "rewards//std": 0.040973346680402756, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3812, "grad_norm": 4.182697772979736, "kl": 2.1424148119986057, "learning_rate": 3.4585406364443484e-06, "loss": 0.0857, "num_tokens": 16473592.0, "reward": 0.7684326171875, "reward_std": 0.012420227751135826, "rewards//mean": 0.7684326171875, "rewards//std": 0.029882894828915596, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3814, "grad_norm": 1.9149055480957031, "kl": 2.744543641805649, "learning_rate": 3.457075040226214e-06, "loss": 0.1098, "num_tokens": 16482232.0, "reward": 0.74200439453125, "reward_std": 0.0140980314463377, "rewards//mean": 0.74200439453125, "rewards//std": 0.04119566082954407, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3816, "grad_norm": 1.8885047435760498, "kl": 3.2147334069013596, "learning_rate": 3.455609058498369e-06, "loss": 0.1286, "num_tokens": 16490856.0, "reward": 0.75555419921875, "reward_std": 0.014934702776372433, "rewards//mean": 0.75555419921875, "rewards//std": 0.029763804748654366, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3818, "grad_norm": 1.9719579219818115, "kl": 4.306006565690041, "learning_rate": 3.4541426918513084e-06, "loss": 0.1722, "num_tokens": 16499488.0, "reward": 0.7640380859375, "reward_std": 0.02109804004430771, "rewards//mean": 0.7640380859375, "rewards//std": 0.043679963797330856, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.382, "grad_norm": 2.497551202774048, "kl": 3.643350075930357, "learning_rate": 3.452675940875686e-06, "loss": 0.1457, "num_tokens": 16508064.0, "reward": 0.752685546875, "reward_std": 0.012645190581679344, "rewards//mean": 0.752685546875, "rewards//std": 0.03442087396979332, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3822, "grad_norm": 1.4234007596969604, "kl": 3.557353377342224, "learning_rate": 3.4512088061623077e-06, "loss": 0.1423, "num_tokens": 16516688.0, "reward": 0.71148681640625, "reward_std": 0.01196424663066864, "rewards//mean": 0.71148681640625, "rewards//std": 0.033738549798727036, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3824, "grad_norm": 3.3219685554504395, "kl": 3.2287207171320915, "learning_rate": 3.4497412883021375e-06, "loss": 0.1291, "num_tokens": 16525384.0, "reward": 0.7188720703125, "reward_std": 0.009528012946248055, "rewards//mean": 0.7188720703125, "rewards//std": 0.03219648450613022, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3826, "grad_norm": 2.137319564819336, "kl": 2.7224942967295647, "learning_rate": 3.4482733878862885e-06, "loss": 0.1089, "num_tokens": 16533976.0, "reward": 0.75189208984375, "reward_std": 0.009445683099329472, "rewards//mean": 0.75189208984375, "rewards//std": 0.037057891488075256, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3828, "grad_norm": 2.8006231784820557, "kl": 2.3144882321357727, "learning_rate": 3.4468051055060335e-06, "loss": 0.0926, "num_tokens": 16542632.0, "reward": 0.74774169921875, "reward_std": 0.012704930268228054, "rewards//mean": 0.74774169921875, "rewards//std": 0.0344625748693943, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.383, "grad_norm": 4.8700408935546875, "kl": 2.0346884056925774, "learning_rate": 3.4453364417527944e-06, "loss": 0.0814, "num_tokens": 16551320.0, "reward": 0.79705810546875, "reward_std": 0.011713830754160881, "rewards//mean": 0.79705810546875, "rewards//std": 0.027633894234895706, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3832, "grad_norm": 1.4703590869903564, "kl": 3.6757738888263702, "learning_rate": 3.4438673972181503e-06, "loss": 0.147, "num_tokens": 16559944.0, "reward": 0.7962646484375, "reward_std": 0.015926694497466087, "rewards//mean": 0.7962646484375, "rewards//std": 0.03545616567134857, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3834, "grad_norm": 1.9720031023025513, "kl": 3.3295594193041325, "learning_rate": 3.4423979724938305e-06, "loss": 0.1332, "num_tokens": 16568616.0, "reward": 0.7364501953125, "reward_std": 0.012308485805988312, "rewards//mean": 0.7364501953125, "rewards//std": 0.03911187872290611, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3836, "grad_norm": 1.842607855796814, "kl": 3.905703816562891, "learning_rate": 3.440928168171721e-06, "loss": 0.1562, "num_tokens": 16577184.0, "reward": 0.7596435546875, "reward_std": 0.010651938617229462, "rewards//mean": 0.7596435546875, "rewards//std": 0.022074813023209572, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3838, "grad_norm": 2.650440216064453, "kl": 5.415190115571022, "learning_rate": 3.4394579848438573e-06, "loss": 0.2166, "num_tokens": 16585896.0, "reward": 0.74493408203125, "reward_std": 0.01632854901254177, "rewards//mean": 0.74493408203125, "rewards//std": 0.03577890619635582, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.384, "grad_norm": 5.435526371002197, "kl": 3.5381322354078293, "learning_rate": 3.4379874231024297e-06, "loss": 0.1415, "num_tokens": 16594656.0, "reward": 0.76629638671875, "reward_std": 0.018616091459989548, "rewards//mean": 0.76629638671875, "rewards//std": 0.03130849823355675, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3842, "grad_norm": 2.2209486961364746, "kl": 4.72282537817955, "learning_rate": 3.436516483539781e-06, "loss": 0.1889, "num_tokens": 16603224.0, "reward": 0.7696533203125, "reward_std": 0.020938929170370102, "rewards//mean": 0.7696533203125, "rewards//std": 0.03285181149840355, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3844, "grad_norm": 4.096410751342773, "kl": 3.5788148939609528, "learning_rate": 3.4350451667484035e-06, "loss": 0.1432, "num_tokens": 16611944.0, "reward": 0.748046875, "reward_std": 0.012374822050333023, "rewards//mean": 0.748046875, "rewards//std": 0.02055242285132408, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3846, "grad_norm": 3.164698362350464, "kl": 5.214606191962957, "learning_rate": 3.4335734733209457e-06, "loss": 0.2086, "num_tokens": 16620656.0, "reward": 0.741943359375, "reward_std": 0.016436193138360977, "rewards//mean": 0.741943359375, "rewards//std": 0.02685553953051567, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3848, "grad_norm": 2.2820816040039062, "kl": 4.381569150835276, "learning_rate": 3.4321014038502036e-06, "loss": 0.1753, "num_tokens": 16629280.0, "reward": 0.76171875, "reward_std": 0.011013934388756752, "rewards//mean": 0.76171875, "rewards//std": 0.027471886947751045, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.385, "grad_norm": 1.6073607206344604, "kl": 6.154620438814163, "learning_rate": 3.4306289589291287e-06, "loss": 0.2462, "num_tokens": 16637840.0, "reward": 0.74798583984375, "reward_std": 0.015042074024677277, "rewards//mean": 0.74798583984375, "rewards//std": 0.03336407244205475, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3852, "grad_norm": 1.9529304504394531, "kl": 3.492094773799181, "learning_rate": 3.429156139150819e-06, "loss": 0.1397, "num_tokens": 16646456.0, "reward": 0.748046875, "reward_std": 0.009632089175283909, "rewards//mean": 0.748046875, "rewards//std": 0.02865428850054741, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3854, "grad_norm": 2.2657668590545654, "kl": 4.289997339248657, "learning_rate": 3.4276829451085287e-06, "loss": 0.1716, "num_tokens": 16655072.0, "reward": 0.764404296875, "reward_std": 0.012347443960607052, "rewards//mean": 0.764404296875, "rewards//std": 0.02897266112267971, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3856, "grad_norm": 2.5998361110687256, "kl": 5.387521386146545, "learning_rate": 3.4262093773956583e-06, "loss": 0.2155, "num_tokens": 16663784.0, "reward": 0.81903076171875, "reward_std": 0.0207710862159729, "rewards//mean": 0.81903076171875, "rewards//std": 0.03108724020421505, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3858, "grad_norm": 3.3814163208007812, "kl": 6.411819189786911, "learning_rate": 3.4247354366057618e-06, "loss": 0.2565, "num_tokens": 16672448.0, "reward": 0.72003173828125, "reward_std": 0.012293415144085884, "rewards//mean": 0.72003173828125, "rewards//std": 0.03940853103995323, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.386, "grad_norm": 3.502300977706909, "kl": 3.102776788175106, "learning_rate": 3.4232611233325418e-06, "loss": 0.1241, "num_tokens": 16681040.0, "reward": 0.7447509765625, "reward_std": 0.009086589328944683, "rewards//mean": 0.7447509765625, "rewards//std": 0.025451067835092545, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3862, "grad_norm": 2.651043176651001, "kl": 4.640947103500366, "learning_rate": 3.4217864381698523e-06, "loss": 0.1856, "num_tokens": 16689648.0, "reward": 0.751220703125, "reward_std": 0.02113393135368824, "rewards//mean": 0.751220703125, "rewards//std": 0.03570568561553955, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3864, "grad_norm": 1.8548647165298462, "kl": 5.224403560161591, "learning_rate": 3.4203113817116955e-06, "loss": 0.209, "num_tokens": 16698256.0, "reward": 0.7364501953125, "reward_std": 0.012208163738250732, "rewards//mean": 0.7364501953125, "rewards//std": 0.0356723926961422, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3866, "grad_norm": 3.412083148956299, "kl": 3.2396801114082336, "learning_rate": 3.4188359545522235e-06, "loss": 0.1296, "num_tokens": 16706920.0, "reward": 0.7410888671875, "reward_std": 0.008692034520208836, "rewards//mean": 0.7410888671875, "rewards//std": 0.03972326219081879, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3868, "grad_norm": 4.788543224334717, "kl": 7.125677891075611, "learning_rate": 3.41736015728574e-06, "loss": 0.285, "num_tokens": 16715640.0, "reward": 0.7427978515625, "reward_std": 0.017555654048919678, "rewards//mean": 0.7427978515625, "rewards//std": 0.042090125381946564, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.387, "grad_norm": 2.5973010063171387, "kl": 6.067594081163406, "learning_rate": 3.415883990506694e-06, "loss": 0.2427, "num_tokens": 16724336.0, "reward": 0.73162841796875, "reward_std": 0.013264905661344528, "rewards//mean": 0.73162841796875, "rewards//std": 0.033361803740262985, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3872, "grad_norm": 5.8729567527771, "kl": 6.555339261889458, "learning_rate": 3.414407454809687e-06, "loss": 0.2622, "num_tokens": 16733000.0, "reward": 0.760986328125, "reward_std": 0.0139954574406147, "rewards//mean": 0.760986328125, "rewards//std": 0.03867538273334503, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3874, "grad_norm": 3.09769606590271, "kl": 4.209323026239872, "learning_rate": 3.4129305507894657e-06, "loss": 0.1684, "num_tokens": 16741664.0, "reward": 0.72955322265625, "reward_std": 0.015078463591635227, "rewards//mean": 0.72955322265625, "rewards//std": 0.030319593846797943, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3876, "grad_norm": 1.665726661682129, "kl": 7.071193847805262, "learning_rate": 3.411453279040928e-06, "loss": 0.2828, "num_tokens": 16750360.0, "reward": 0.76202392578125, "reward_std": 0.017482366412878036, "rewards//mean": 0.76202392578125, "rewards//std": 0.03366263955831528, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3878, "grad_norm": 2.9725844860076904, "kl": 3.9335062317550182, "learning_rate": 3.4099756401591182e-06, "loss": 0.1573, "num_tokens": 16758984.0, "reward": 0.74884033203125, "reward_std": 0.012135745957493782, "rewards//mean": 0.74884033203125, "rewards//std": 0.03462995961308479, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.388, "grad_norm": 4.411440849304199, "kl": 7.304619371891022, "learning_rate": 3.40849763473923e-06, "loss": 0.2922, "num_tokens": 16767672.0, "reward": 0.75189208984375, "reward_std": 0.013559894636273384, "rewards//mean": 0.75189208984375, "rewards//std": 0.03499392420053482, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3882, "grad_norm": 2.9029901027679443, "kl": 5.809667458757758, "learning_rate": 3.4070192633766025e-06, "loss": 0.2324, "num_tokens": 16776352.0, "reward": 0.72711181640625, "reward_std": 0.011332918889820576, "rewards//mean": 0.72711181640625, "rewards//std": 0.03323952108621597, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3884, "grad_norm": 5.876077651977539, "kl": 4.206186078488827, "learning_rate": 3.405540526666725e-06, "loss": 0.1682, "num_tokens": 16784968.0, "reward": 0.71258544921875, "reward_std": 0.009074600413441658, "rewards//mean": 0.71258544921875, "rewards//std": 0.04498615115880966, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3886, "grad_norm": 2.1351466178894043, "kl": 4.611094564199448, "learning_rate": 3.4040614252052305e-06, "loss": 0.1844, "num_tokens": 16793592.0, "reward": 0.7763671875, "reward_std": 0.008341692388057709, "rewards//mean": 0.7763671875, "rewards//std": 0.020740119740366936, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3888, "grad_norm": 3.233121633529663, "kl": 3.8039817325770855, "learning_rate": 3.4025819595879033e-06, "loss": 0.1522, "num_tokens": 16802216.0, "reward": 0.71368408203125, "reward_std": 0.010252590291202068, "rewards//mean": 0.71368408203125, "rewards//std": 0.03734233230352402, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.389, "grad_norm": 2.4570229053497314, "kl": 6.640096899122, "learning_rate": 3.40110213041067e-06, "loss": 0.2656, "num_tokens": 16810928.0, "reward": 0.75091552734375, "reward_std": 0.021410658955574036, "rewards//mean": 0.75091552734375, "rewards//std": 0.03321674466133118, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3892, "grad_norm": 1.748984456062317, "kl": 2.7434529047459364, "learning_rate": 3.3996219382696066e-06, "loss": 0.1097, "num_tokens": 16819560.0, "reward": 0.73095703125, "reward_std": 0.00811475794762373, "rewards//mean": 0.73095703125, "rewards//std": 0.039711255580186844, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3894, "grad_norm": 3.1654608249664307, "kl": 6.174835802987218, "learning_rate": 3.3981413837609346e-06, "loss": 0.247, "num_tokens": 16828184.0, "reward": 0.722900390625, "reward_std": 0.012586073949933052, "rewards//mean": 0.722900390625, "rewards//std": 0.041543688625097275, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3896, "grad_norm": 1.6541121006011963, "kl": 4.287666857242584, "learning_rate": 3.3966604674810193e-06, "loss": 0.1715, "num_tokens": 16836816.0, "reward": 0.76953125, "reward_std": 0.013239104300737381, "rewards//mean": 0.76953125, "rewards//std": 0.032346826046705246, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3898, "grad_norm": 4.500350475311279, "kl": 3.0607928596436977, "learning_rate": 3.395179190026376e-06, "loss": 0.1224, "num_tokens": 16845448.0, "reward": 0.71356201171875, "reward_std": 0.007997099310159683, "rewards//mean": 0.71356201171875, "rewards//std": 0.03025011532008648, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.39, "grad_norm": 1.4543135166168213, "kl": 3.3837552797049284, "learning_rate": 3.3936975519936615e-06, "loss": 0.1354, "num_tokens": 16854072.0, "reward": 0.72137451171875, "reward_std": 0.009992299601435661, "rewards//mean": 0.72137451171875, "rewards//std": 0.02938915602862835, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3902, "grad_norm": 4.242565631866455, "kl": 6.289864122867584, "learning_rate": 3.39221555397968e-06, "loss": 0.2516, "num_tokens": 16862752.0, "reward": 0.73614501953125, "reward_std": 0.01112848985940218, "rewards//mean": 0.73614501953125, "rewards//std": 0.04873406141996384, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3904, "grad_norm": 1.786988377571106, "kl": 3.7880978118628263, "learning_rate": 3.3907331965813807e-06, "loss": 0.1515, "num_tokens": 16871304.0, "reward": 0.71099853515625, "reward_std": 0.013732369989156723, "rewards//mean": 0.71099853515625, "rewards//std": 0.05154338851571083, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3906, "grad_norm": 2.6163787841796875, "kl": 3.0958042554557323, "learning_rate": 3.3892504803958547e-06, "loss": 0.1238, "num_tokens": 16879952.0, "reward": 0.74981689453125, "reward_std": 0.014431977644562721, "rewards//mean": 0.74981689453125, "rewards//std": 0.034156374633312225, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3908, "grad_norm": 2.15840482711792, "kl": 3.151893150061369, "learning_rate": 3.387767406020343e-06, "loss": 0.1261, "num_tokens": 16888640.0, "reward": 0.75201416015625, "reward_std": 0.013661868870258331, "rewards//mean": 0.75201416015625, "rewards//std": 0.03452708199620247, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.391, "grad_norm": 3.2238967418670654, "kl": 2.6733709536492825, "learning_rate": 3.386283974052226e-06, "loss": 0.1069, "num_tokens": 16897224.0, "reward": 0.7510986328125, "reward_std": 0.011718818917870522, "rewards//mean": 0.7510986328125, "rewards//std": 0.028412101790308952, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3912, "grad_norm": 6.131359100341797, "kl": 5.6489872969686985, "learning_rate": 3.38480018508903e-06, "loss": 0.226, "num_tokens": 16905904.0, "reward": 0.73504638671875, "reward_std": 0.010370709002017975, "rewards//mean": 0.73504638671875, "rewards//std": 0.03944845870137215, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3914, "grad_norm": 4.380613803863525, "kl": 5.038475599139929, "learning_rate": 3.383316039728426e-06, "loss": 0.2015, "num_tokens": 16914576.0, "reward": 0.740966796875, "reward_std": 0.014563515782356262, "rewards//mean": 0.740966796875, "rewards//std": 0.03847446292638779, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3916, "grad_norm": 2.1094446182250977, "kl": 3.627754829823971, "learning_rate": 3.3818315385682255e-06, "loss": 0.1451, "num_tokens": 16923160.0, "reward": 0.699462890625, "reward_std": 0.006299940869212151, "rewards//mean": 0.699462890625, "rewards//std": 0.04007779806852341, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3918, "grad_norm": 1.2274508476257324, "kl": 4.963891446590424, "learning_rate": 3.380346682206388e-06, "loss": 0.1986, "num_tokens": 16931832.0, "reward": 0.77142333984375, "reward_std": 0.013998337090015411, "rewards//mean": 0.77142333984375, "rewards//std": 0.03204585984349251, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.392, "grad_norm": 3.8137454986572266, "kl": 4.307156074792147, "learning_rate": 3.378861471241011e-06, "loss": 0.1723, "num_tokens": 16940536.0, "reward": 0.73193359375, "reward_std": 0.009923969395458698, "rewards//mean": 0.73193359375, "rewards//std": 0.03092668391764164, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3922, "grad_norm": 5.116244792938232, "kl": 3.902967043220997, "learning_rate": 3.37737590627034e-06, "loss": 0.1561, "num_tokens": 16949200.0, "reward": 0.76031494140625, "reward_std": 0.007396344095468521, "rewards//mean": 0.76031494140625, "rewards//std": 0.02863202430307865, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3924, "grad_norm": 3.4993574619293213, "kl": 5.418645918369293, "learning_rate": 3.3758899878927574e-06, "loss": 0.2167, "num_tokens": 16957848.0, "reward": 0.73822021484375, "reward_std": 0.02024884894490242, "rewards//mean": 0.73822021484375, "rewards//std": 0.04504365473985672, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3926, "grad_norm": 3.8792526721954346, "kl": 5.062670778483152, "learning_rate": 3.3744037167067933e-06, "loss": 0.2025, "num_tokens": 16966464.0, "reward": 0.78497314453125, "reward_std": 0.012406611815094948, "rewards//mean": 0.78497314453125, "rewards//std": 0.027964968234300613, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3928, "grad_norm": 4.911318778991699, "kl": 5.010666340589523, "learning_rate": 3.372917093311116e-06, "loss": 0.2004, "num_tokens": 16975304.0, "reward": 0.7335205078125, "reward_std": 0.015302449464797974, "rewards//mean": 0.7335205078125, "rewards//std": 0.03811772167682648, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.393, "grad_norm": 2.6954574584960938, "kl": 2.4886375702917576, "learning_rate": 3.3714301183045382e-06, "loss": 0.0995, "num_tokens": 16983960.0, "reward": 0.7318115234375, "reward_std": 0.011917466297745705, "rewards//mean": 0.7318115234375, "rewards//std": 0.03101051226258278, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3932, "grad_norm": 4.99193000793457, "kl": 3.6547788605093956, "learning_rate": 3.369942792286013e-06, "loss": 0.1462, "num_tokens": 16992616.0, "reward": 0.77001953125, "reward_std": 0.01970420964062214, "rewards//mean": 0.77001953125, "rewards//std": 0.03072238899767399, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3934, "grad_norm": 4.1066508293151855, "kl": 3.565728075802326, "learning_rate": 3.3684551158546354e-06, "loss": 0.1426, "num_tokens": 17001224.0, "reward": 0.736328125, "reward_std": 0.013639327138662338, "rewards//mean": 0.736328125, "rewards//std": 0.043736934661865234, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3936, "grad_norm": 1.6581199169158936, "kl": 5.227335922420025, "learning_rate": 3.3669670896096406e-06, "loss": 0.2091, "num_tokens": 17009864.0, "reward": 0.762939453125, "reward_std": 0.025431770831346512, "rewards//mean": 0.762939453125, "rewards//std": 0.03552206605672836, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3938, "grad_norm": 1.29312264919281, "kl": 3.5075610540807247, "learning_rate": 3.3654787141504062e-06, "loss": 0.1403, "num_tokens": 17018376.0, "reward": 0.74560546875, "reward_std": 0.014015133492648602, "rewards//mean": 0.74560546875, "rewards//std": 0.03724374622106552, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.394, "grad_norm": 2.2914254665374756, "kl": 4.44266465306282, "learning_rate": 3.3639899900764496e-06, "loss": 0.1777, "num_tokens": 17026976.0, "reward": 0.760009765625, "reward_std": 0.015332072973251343, "rewards//mean": 0.760009765625, "rewards//std": 0.03370269387960434, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3942, "grad_norm": 2.2522525787353516, "kl": 2.9977968111634254, "learning_rate": 3.362500917987427e-06, "loss": 0.1199, "num_tokens": 17035568.0, "reward": 0.76318359375, "reward_std": 0.010481065139174461, "rewards//mean": 0.76318359375, "rewards//std": 0.025812797248363495, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3944, "grad_norm": 2.9316532611846924, "kl": 5.524478197097778, "learning_rate": 3.3610114984831388e-06, "loss": 0.221, "num_tokens": 17044176.0, "reward": 0.72430419921875, "reward_std": 0.015699483454227448, "rewards//mean": 0.72430419921875, "rewards//std": 0.033478666096925735, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3946, "grad_norm": 2.776407480239868, "kl": 5.4891092628240585, "learning_rate": 3.3595217321635217e-06, "loss": 0.2196, "num_tokens": 17052752.0, "reward": 0.74920654296875, "reward_std": 0.012270933017134666, "rewards//mean": 0.74920654296875, "rewards//std": 0.026311656460165977, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3948, "grad_norm": 7.1003265380859375, "kl": 9.166486769914627, "learning_rate": 3.3580316196286534e-06, "loss": 0.3667, "num_tokens": 17061416.0, "reward": 0.74609375, "reward_std": 0.016958273947238922, "rewards//mean": 0.74609375, "rewards//std": 0.04968921095132828, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.395, "grad_norm": 5.936905384063721, "kl": 6.188938867300749, "learning_rate": 3.356541161478751e-06, "loss": 0.2476, "num_tokens": 17070008.0, "reward": 0.747802734375, "reward_std": 0.013229751028120518, "rewards//mean": 0.747802734375, "rewards//std": 0.04056435450911522, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3952, "grad_norm": 2.8440916538238525, "kl": 8.051049835979939, "learning_rate": 3.3550503583141726e-06, "loss": 0.322, "num_tokens": 17078680.0, "reward": 0.74298095703125, "reward_std": 0.020944982767105103, "rewards//mean": 0.74298095703125, "rewards//std": 0.04477771371603012, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3954, "grad_norm": 3.45572566986084, "kl": 6.087337572127581, "learning_rate": 3.353559210735411e-06, "loss": 0.2435, "num_tokens": 17087232.0, "reward": 0.7464599609375, "reward_std": 0.014241542667150497, "rewards//mean": 0.7464599609375, "rewards//std": 0.03533641993999481, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3956, "grad_norm": 2.8491766452789307, "kl": 4.87523203715682, "learning_rate": 3.3520677193431017e-06, "loss": 0.195, "num_tokens": 17095856.0, "reward": 0.75762939453125, "reward_std": 0.011618150398135185, "rewards//mean": 0.75762939453125, "rewards//std": 0.03279618173837662, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3958, "grad_norm": 3.316502094268799, "kl": 8.146923631429672, "learning_rate": 3.3505758847380163e-06, "loss": 0.3259, "num_tokens": 17104456.0, "reward": 0.75042724609375, "reward_std": 0.018657229840755463, "rewards//mean": 0.75042724609375, "rewards//std": 0.03762301430106163, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.396, "grad_norm": 3.6141626834869385, "kl": 3.448485430330038, "learning_rate": 3.3490837075210677e-06, "loss": 0.1379, "num_tokens": 17113160.0, "reward": 0.72515869140625, "reward_std": 0.012684915214776993, "rewards//mean": 0.72515869140625, "rewards//std": 0.0363210067152977, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3962, "grad_norm": 2.0411601066589355, "kl": 4.535993196070194, "learning_rate": 3.3475911882933014e-06, "loss": 0.1814, "num_tokens": 17121936.0, "reward": 0.75189208984375, "reward_std": 0.017858631908893585, "rewards//mean": 0.75189208984375, "rewards//std": 0.04033152386546135, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3964, "grad_norm": 1.6296621561050415, "kl": 5.411390423774719, "learning_rate": 3.346098327655907e-06, "loss": 0.2165, "num_tokens": 17130496.0, "reward": 0.73345947265625, "reward_std": 0.018368493765592575, "rewards//mean": 0.73345947265625, "rewards//std": 0.039547719061374664, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3966, "grad_norm": 1.8533588647842407, "kl": 6.862974241375923, "learning_rate": 3.3446051262102076e-06, "loss": 0.2745, "num_tokens": 17139192.0, "reward": 0.732177734375, "reward_std": 0.019884206354618073, "rewards//mean": 0.732177734375, "rewards//std": 0.04294267296791077, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3968, "grad_norm": 2.7352969646453857, "kl": 5.490067843347788, "learning_rate": 3.343111584557664e-06, "loss": 0.2196, "num_tokens": 17147984.0, "reward": 0.7353515625, "reward_std": 0.014425855129957199, "rewards//mean": 0.7353515625, "rewards//std": 0.036940108984708786, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.397, "grad_norm": 1.6130669116973877, "kl": 2.997511487454176, "learning_rate": 3.341617703299875e-06, "loss": 0.1199, "num_tokens": 17156640.0, "reward": 0.73388671875, "reward_std": 0.01010593120008707, "rewards//mean": 0.73388671875, "rewards//std": 0.034092992544174194, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3972, "grad_norm": 2.4124996662139893, "kl": 7.016627460718155, "learning_rate": 3.3401234830385753e-06, "loss": 0.2807, "num_tokens": 17165376.0, "reward": 0.76837158203125, "reward_std": 0.01488836295902729, "rewards//mean": 0.76837158203125, "rewards//std": 0.02691684104502201, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3974, "grad_norm": 1.942112684249878, "kl": 4.13174913264811, "learning_rate": 3.338628924375638e-06, "loss": 0.1653, "num_tokens": 17174008.0, "reward": 0.7769775390625, "reward_std": 0.010031599551439285, "rewards//mean": 0.7769775390625, "rewards//std": 0.028131533414125443, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3976, "grad_norm": 2.906083345413208, "kl": 4.491090428084135, "learning_rate": 3.3371340279130694e-06, "loss": 0.1796, "num_tokens": 17182720.0, "reward": 0.75482177734375, "reward_std": 0.013333836570382118, "rewards//mean": 0.75482177734375, "rewards//std": 0.034250643104314804, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3978, "grad_norm": 2.9822275638580322, "kl": 1.048874620348215, "learning_rate": 3.335638794253015e-06, "loss": 0.042, "num_tokens": 17191400.0, "reward": 0.741943359375, "reward_std": 0.006062339060008526, "rewards//mean": 0.741943359375, "rewards//std": 0.039228782057762146, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.398, "grad_norm": 2.340306282043457, "kl": 6.928777664899826, "learning_rate": 3.3341432239977537e-06, "loss": 0.2772, "num_tokens": 17200032.0, "reward": 0.745849609375, "reward_std": 0.017975017428398132, "rewards//mean": 0.745849609375, "rewards//std": 0.03504844009876251, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3982, "grad_norm": 2.3421599864959717, "kl": 4.486386768519878, "learning_rate": 3.332647317749702e-06, "loss": 0.1795, "num_tokens": 17208680.0, "reward": 0.7718505859375, "reward_std": 0.020117424428462982, "rewards//mean": 0.7718505859375, "rewards//std": 0.034609246999025345, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3984, "grad_norm": 4.350400447845459, "kl": 5.95016784965992, "learning_rate": 3.33115107611141e-06, "loss": 0.238, "num_tokens": 17217296.0, "reward": 0.74859619140625, "reward_std": 0.013034136965870857, "rewards//mean": 0.74859619140625, "rewards//std": 0.03704727068543434, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3986, "grad_norm": 2.675605297088623, "kl": 5.770400807261467, "learning_rate": 3.329654499685565e-06, "loss": 0.2308, "num_tokens": 17225960.0, "reward": 0.715087890625, "reward_std": 0.017844578251242638, "rewards//mean": 0.715087890625, "rewards//std": 0.03987179696559906, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3988, "grad_norm": 2.7848455905914307, "kl": 3.745978608727455, "learning_rate": 3.3281575890749857e-06, "loss": 0.1498, "num_tokens": 17234600.0, "reward": 0.74237060546875, "reward_std": 0.013618675991892815, "rewards//mean": 0.74237060546875, "rewards//std": 0.03912092372775078, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.399, "grad_norm": 1.9877591133117676, "kl": 3.2222051359713078, "learning_rate": 3.3266603448826286e-06, "loss": 0.1289, "num_tokens": 17243224.0, "reward": 0.7698974609375, "reward_std": 0.010375511832535267, "rewards//mean": 0.7698974609375, "rewards//std": 0.026141168549656868, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3992, "grad_norm": 2.3413331508636475, "kl": 4.302839890122414, "learning_rate": 3.325162767711583e-06, "loss": 0.1721, "num_tokens": 17251840.0, "reward": 0.7252197265625, "reward_std": 0.017944417893886566, "rewards//mean": 0.7252197265625, "rewards//std": 0.03660564124584198, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3994, "grad_norm": 3.165403366088867, "kl": 3.4773399271070957, "learning_rate": 3.3236648581650743e-06, "loss": 0.1391, "num_tokens": 17260456.0, "reward": 0.72833251953125, "reward_std": 0.010450906120240688, "rewards//mean": 0.72833251953125, "rewards//std": 0.03788483887910843, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3996, "grad_norm": 2.163254976272583, "kl": 2.5367021188139915, "learning_rate": 3.3221666168464584e-06, "loss": 0.1015, "num_tokens": 17269256.0, "reward": 0.762451171875, "reward_std": 0.013062435202300549, "rewards//mean": 0.762451171875, "rewards//std": 0.0270084235817194, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3998, "grad_norm": 2.8939337730407715, "kl": 4.409974068403244, "learning_rate": 3.3206680443592283e-06, "loss": 0.1764, "num_tokens": 17277904.0, "reward": 0.7598876953125, "reward_std": 0.020727574825286865, "rewards//mean": 0.7598876953125, "rewards//std": 0.041672300547361374, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.4, "grad_norm": 3.267803907394409, "kl": 4.145012024790049, "learning_rate": 3.319169141307007e-06, "loss": 0.1658, "num_tokens": 17286544.0, "reward": 0.71112060546875, "reward_std": 0.014055994339287281, "rewards//mean": 0.71112060546875, "rewards//std": 0.04312027990818024, "step": 2000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }