diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,19034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0002, + "grad_norm": 0.4551391005516052, + "kl": 0.0006267073404160328, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 8600.0, + "reward": 0.7076416015625, + "reward_std": 0.014151658862829208, + "rewards//mean": 0.7076416015625, + "rewards//std": 0.0565522275865078, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0004, + "grad_norm": 0.4617377817630768, + "kl": 0.0006313717240118422, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.0, + "num_tokens": 17200.0, + "reward": 0.72869873046875, + "reward_std": 0.012252680957317352, + "rewards//mean": 0.72869873046875, + "rewards//std": 0.06718378514051437, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0006, + "grad_norm": 0.47510647773742676, + "kl": 0.0007176821964094415, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0, + "num_tokens": 25872.0, + "reward": 0.7418212890625, + "reward_std": 0.01208210177719593, + "rewards//mean": 0.7418212890625, + "rewards//std": 0.05206342041492462, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0008, + "grad_norm": 0.4658019244670868, + "kl": 0.0007370488237938844, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.0, + "num_tokens": 34600.0, + "reward": 0.70538330078125, + "reward_std": 0.015659615397453308, + "rewards//mean": 0.70538330078125, + "rewards//std": 0.05750183388590813, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.001, + "grad_norm": 0.4073522984981537, + "kl": 0.0006538679663208313, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.0, + "num_tokens": 43304.0, + "reward": 0.71893310546875, + "reward_std": 0.014116497710347176, + "rewards//mean": 0.71893310546875, + "rewards//std": 0.06529892981052399, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0012, + "grad_norm": 0.45524272322654724, + "kl": 0.0006982390259508975, + "learning_rate": 5.000000000000001e-07, + "loss": 0.0, + "num_tokens": 51992.0, + "reward": 0.71209716796875, + "reward_std": 0.017292022705078125, + "rewards//mean": 0.71209716796875, + "rewards//std": 0.060261402279138565, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0014, + "grad_norm": 0.44682666659355164, + "kl": 0.0009362539058201946, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0, + "num_tokens": 60696.0, + "reward": 0.7254638671875, + "reward_std": 0.014938775449991226, + "rewards//mean": 0.7254638671875, + "rewards//std": 0.04342832788825035, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0016, + "grad_norm": 0.453555703163147, + "kl": 0.0006852814112789929, + "learning_rate": 7.000000000000001e-07, + "loss": 0.0, + "num_tokens": 69336.0, + "reward": 0.72991943359375, + "reward_std": 0.01723172701895237, + "rewards//mean": 0.72991943359375, + "rewards//std": 0.05862419679760933, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0018, + "grad_norm": 0.49192875623703003, + "kl": 0.000731307256501168, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0, + "num_tokens": 78008.0, + "reward": 0.70782470703125, + "reward_std": 0.018347103148698807, + "rewards//mean": 0.70782470703125, + "rewards//std": 0.058675043284893036, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.002, + "grad_norm": 0.4813508093357086, + "kl": 0.0007482333021471277, + "learning_rate": 9.000000000000001e-07, + "loss": 0.0, + "num_tokens": 86648.0, + "reward": 0.678955078125, + "reward_std": 0.012709339149296284, + "rewards//mean": 0.678955078125, + "rewards//std": 0.05669618770480156, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0022, + "grad_norm": 0.47774913907051086, + "kl": 0.0006810418926761486, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0, + "num_tokens": 95456.0, + "reward": 0.70257568359375, + "reward_std": 0.016511568799614906, + "rewards//mean": 0.70257568359375, + "rewards//std": 0.05309594050049782, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0024, + "grad_norm": 0.45333075523376465, + "kl": 0.0007765620030113496, + "learning_rate": 1.1e-06, + "loss": 0.0, + "num_tokens": 104112.0, + "reward": 0.701904296875, + "reward_std": 0.014031937345862389, + "rewards//mean": 0.701904296875, + "rewards//std": 0.04883308708667755, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0026, + "grad_norm": 0.4627256989479065, + "kl": 0.0007952243977342732, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0, + "num_tokens": 112728.0, + "reward": 0.7293701171875, + "reward_std": 0.013355368748307228, + "rewards//mean": 0.7293701171875, + "rewards//std": 0.048412635922431946, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0028, + "grad_norm": 0.529376745223999, + "kl": 0.0007697402106714435, + "learning_rate": 1.3e-06, + "loss": 0.0, + "num_tokens": 121352.0, + "reward": 0.75030517578125, + "reward_std": 0.017826953902840614, + "rewards//mean": 0.75030517578125, + "rewards//std": 0.044097088277339935, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.003, + "grad_norm": 0.5399572253227234, + "kl": 0.0007980391455930658, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0, + "num_tokens": 130072.0, + "reward": 0.7301025390625, + "reward_std": 0.016819626092910767, + "rewards//mean": 0.7301025390625, + "rewards//std": 0.06331897526979446, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0032, + "grad_norm": 0.48803454637527466, + "kl": 0.0007127898643375374, + "learning_rate": 1.5e-06, + "loss": 0.0, + "num_tokens": 138680.0, + "reward": 0.72723388671875, + "reward_std": 0.01550104096531868, + "rewards//mean": 0.72723388671875, + "rewards//std": 0.05276963487267494, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0034, + "grad_norm": 0.4796734154224396, + "kl": 0.0007568690125481226, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0, + "num_tokens": 147320.0, + "reward": 0.67901611328125, + "reward_std": 0.012344243004918098, + "rewards//mean": 0.67901611328125, + "rewards//std": 0.05607627332210541, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0036, + "grad_norm": 0.4370866119861603, + "kl": 0.0007120481022866443, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0, + "num_tokens": 155984.0, + "reward": 0.7203369140625, + "reward_std": 0.010229130275547504, + "rewards//mean": 0.7203369140625, + "rewards//std": 0.04977063462138176, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0038, + "grad_norm": 0.47938504815101624, + "kl": 0.0007277187978615984, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0, + "num_tokens": 164608.0, + "reward": 0.702392578125, + "reward_std": 0.020038627088069916, + "rewards//mean": 0.702392578125, + "rewards//std": 0.06601254642009735, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.004, + "grad_norm": 0.5836920142173767, + "kl": 0.000848565723572392, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0, + "num_tokens": 173168.0, + "reward": 0.71051025390625, + "reward_std": 0.016375649720430374, + "rewards//mean": 0.71051025390625, + "rewards//std": 0.0618068166077137, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0042, + "grad_norm": 0.4508132040500641, + "kl": 0.0007047736216918565, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0, + "num_tokens": 181792.0, + "reward": 0.71771240234375, + "reward_std": 0.01673172414302826, + "rewards//mean": 0.71771240234375, + "rewards//std": 0.06672913581132889, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0044, + "grad_norm": 0.4673251509666443, + "kl": 0.0007006596788414754, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0, + "num_tokens": 190392.0, + "reward": 0.72503662109375, + "reward_std": 0.016404539346694946, + "rewards//mean": 0.72503662109375, + "rewards//std": 0.04331747442483902, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0046, + "grad_norm": 0.752646267414093, + "kl": 0.0010592954204184934, + "learning_rate": 2.2e-06, + "loss": 0.0, + "num_tokens": 199080.0, + "reward": 0.70477294921875, + "reward_std": 0.013751041144132614, + "rewards//mean": 0.70477294921875, + "rewards//std": 0.05169031769037247, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0048, + "grad_norm": 0.5306899547576904, + "kl": 0.000849992771691177, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0, + "num_tokens": 207720.0, + "reward": 0.7218017578125, + "reward_std": 0.017207415774464607, + "rewards//mean": 0.7218017578125, + "rewards//std": 0.0644102469086647, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.005, + "grad_norm": 0.44910290837287903, + "kl": 0.0007659951588721015, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0, + "num_tokens": 216296.0, + "reward": 0.70526123046875, + "reward_std": 0.01439366303384304, + "rewards//mean": 0.70526123046875, + "rewards//std": 0.060455020517110825, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0052, + "grad_norm": 0.5068986415863037, + "kl": 0.0008941922133089975, + "learning_rate": 2.5e-06, + "loss": 0.0, + "num_tokens": 224968.0, + "reward": 0.732421875, + "reward_std": 0.017151644453406334, + "rewards//mean": 0.732421875, + "rewards//std": 0.05576201528310776, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0054, + "grad_norm": 0.5920788645744324, + "kl": 0.0007423345305141993, + "learning_rate": 2.6e-06, + "loss": 0.0, + "num_tokens": 233528.0, + "reward": 0.6807861328125, + "reward_std": 0.018578065559267998, + "rewards//mean": 0.6807861328125, + "rewards//std": 0.0869307816028595, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0056, + "grad_norm": 0.5785125494003296, + "kl": 0.0010453350987518206, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0, + "num_tokens": 242280.0, + "reward": 0.68853759765625, + "reward_std": 0.016351919621229172, + "rewards//mean": 0.68853759765625, + "rewards//std": 0.06393762677907944, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0058, + "grad_norm": 0.503767728805542, + "kl": 0.0008367645714315586, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0, + "num_tokens": 250976.0, + "reward": 0.71185302734375, + "reward_std": 0.01879560574889183, + "rewards//mean": 0.71185302734375, + "rewards//std": 0.05082531273365021, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.006, + "grad_norm": 0.5023078918457031, + "kl": 0.0007173106641857885, + "learning_rate": 2.9e-06, + "loss": 0.0, + "num_tokens": 259632.0, + "reward": 0.70556640625, + "reward_std": 0.017621025443077087, + "rewards//mean": 0.70556640625, + "rewards//std": 0.0475139394402504, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0062, + "grad_norm": 0.4851153492927551, + "kl": 0.0007923798766569234, + "learning_rate": 3e-06, + "loss": 0.0, + "num_tokens": 268208.0, + "reward": 0.7398681640625, + "reward_std": 0.01781066693365574, + "rewards//mean": 0.7398681640625, + "rewards//std": 0.05101071298122406, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0064, + "grad_norm": 0.4202479422092438, + "kl": 0.0007000941404839978, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0, + "num_tokens": 276824.0, + "reward": 0.711669921875, + "reward_std": 0.014990320429205894, + "rewards//mean": 0.711669921875, + "rewards//std": 0.05982654541730881, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0066, + "grad_norm": 0.5245676636695862, + "kl": 0.0007394420899800025, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0, + "num_tokens": 285520.0, + "reward": 0.73822021484375, + "reward_std": 0.014528016559779644, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.05750894173979759, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0068, + "grad_norm": 0.41506427526474, + "kl": 0.0006638994673267007, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0, + "num_tokens": 294144.0, + "reward": 0.705078125, + "reward_std": 0.018270526081323624, + "rewards//mean": 0.705078125, + "rewards//std": 0.05205077305436134, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.007, + "grad_norm": 0.6635000705718994, + "kl": 0.0008408781359321438, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0, + "num_tokens": 302888.0, + "reward": 0.7369384765625, + "reward_std": 0.010665340349078178, + "rewards//mean": 0.7369384765625, + "rewards//std": 0.041731830686330795, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0072, + "grad_norm": 0.5000082850456238, + "kl": 0.0007324635953409597, + "learning_rate": 3.5e-06, + "loss": 0.0, + "num_tokens": 311504.0, + "reward": 0.7015380859375, + "reward_std": 0.015993408858776093, + "rewards//mean": 0.7015380859375, + "rewards//std": 0.05921553075313568, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0074, + "grad_norm": 0.4788602292537689, + "kl": 0.0007234790246002376, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0, + "num_tokens": 320208.0, + "reward": 0.71978759765625, + "reward_std": 0.015359117649495602, + "rewards//mean": 0.71978759765625, + "rewards//std": 0.050679761916399, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0076, + "grad_norm": 0.5380129218101501, + "kl": 0.0006790131737943739, + "learning_rate": 3.7e-06, + "loss": 0.0, + "num_tokens": 328904.0, + "reward": 0.71307373046875, + "reward_std": 0.017655834555625916, + "rewards//mean": 0.71307373046875, + "rewards//std": 0.039130210876464844, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0078, + "grad_norm": 0.47228410840034485, + "kl": 0.0006949657399673015, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0, + "num_tokens": 337632.0, + "reward": 0.705322265625, + "reward_std": 0.01650204323232174, + "rewards//mean": 0.705322265625, + "rewards//std": 0.0695849284529686, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.008, + "grad_norm": 0.5206338763237, + "kl": 0.0007481320935767144, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0, + "num_tokens": 346264.0, + "reward": 0.69219970703125, + "reward_std": 0.016963649541139603, + "rewards//mean": 0.69219970703125, + "rewards//std": 0.05523912236094475, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0082, + "grad_norm": 0.49259892106056213, + "kl": 0.0007038385519990698, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0, + "num_tokens": 354912.0, + "reward": 0.71923828125, + "reward_std": 0.018238678574562073, + "rewards//mean": 0.71923828125, + "rewards//std": 0.060513366013765335, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0084, + "grad_norm": 0.42862868309020996, + "kl": 0.0006526454817503691, + "learning_rate": 4.1e-06, + "loss": 0.0, + "num_tokens": 363544.0, + "reward": 0.73712158203125, + "reward_std": 0.012668399140238762, + "rewards//mean": 0.73712158203125, + "rewards//std": 0.05271969735622406, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0086, + "grad_norm": 0.43543925881385803, + "kl": 0.0007449448385159485, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0, + "num_tokens": 372144.0, + "reward": 0.73724365234375, + "reward_std": 0.017016027122735977, + "rewards//mean": 0.73724365234375, + "rewards//std": 0.04933328554034233, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0088, + "grad_norm": 0.5209327340126038, + "kl": 0.0007311399604077451, + "learning_rate": 4.3e-06, + "loss": 0.0, + "num_tokens": 380848.0, + "reward": 0.7359619140625, + "reward_std": 0.010918927378952503, + "rewards//mean": 0.7359619140625, + "rewards//std": 0.0476677305996418, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.009, + "grad_norm": 0.5357726216316223, + "kl": 0.0008118156329146586, + "learning_rate": 4.4e-06, + "loss": 0.0, + "num_tokens": 389504.0, + "reward": 0.66937255859375, + "reward_std": 0.01259728241711855, + "rewards//mean": 0.66937255859375, + "rewards//std": 0.051660146564245224, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0092, + "grad_norm": 0.45110824704170227, + "kl": 0.0006952350158826448, + "learning_rate": 4.5e-06, + "loss": 0.0, + "num_tokens": 398200.0, + "reward": 0.724853515625, + "reward_std": 0.013849527575075626, + "rewards//mean": 0.724853515625, + "rewards//std": 0.0702190026640892, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0094, + "grad_norm": 0.47275251150131226, + "kl": 0.0007834887073840946, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0, + "num_tokens": 406848.0, + "reward": 0.69769287109375, + "reward_std": 0.015200081281363964, + "rewards//mean": 0.69769287109375, + "rewards//std": 0.07077126950025558, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0096, + "grad_norm": 0.42265617847442627, + "kl": 0.000766771991038695, + "learning_rate": 4.7e-06, + "loss": 0.0, + "num_tokens": 415520.0, + "reward": 0.70733642578125, + "reward_std": 0.017255615442991257, + "rewards//mean": 0.70733642578125, + "rewards//std": 0.0646086260676384, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0098, + "grad_norm": 0.46693459153175354, + "kl": 0.0007007038075244054, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0, + "num_tokens": 424088.0, + "reward": 0.7169189453125, + "reward_std": 0.017792128026485443, + "rewards//mean": 0.7169189453125, + "rewards//std": 0.06479453295469284, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.01, + "grad_norm": 0.46468374133110046, + "kl": 0.0007802178952260874, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0, + "num_tokens": 432696.0, + "reward": 0.707275390625, + "reward_std": 0.01039247214794159, + "rewards//mean": 0.707275390625, + "rewards//std": 0.06051086261868477, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0102, + "grad_norm": 0.4285586476325989, + "kl": 0.000754905202484224, + "learning_rate": 5e-06, + "loss": 0.0, + "num_tokens": 441352.0, + "reward": 0.72601318359375, + "reward_std": 0.015443078242242336, + "rewards//mean": 0.72601318359375, + "rewards//std": 0.04845963418483734, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0104, + "grad_norm": 0.46686625480651855, + "kl": 0.00072819792694645, + "learning_rate": 4.9999994965001495e-06, + "loss": 0.0, + "num_tokens": 449960.0, + "reward": 0.7313232421875, + "reward_std": 0.013708039186894894, + "rewards//mean": 0.7313232421875, + "rewards//std": 0.057744208723306656, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0106, + "grad_norm": 0.5042674541473389, + "kl": 0.0007860148616600782, + "learning_rate": 4.999997986000801e-06, + "loss": 0.0, + "num_tokens": 458512.0, + "reward": 0.71722412109375, + "reward_std": 0.015048246830701828, + "rewards//mean": 0.71722412109375, + "rewards//std": 0.05379721522331238, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0108, + "grad_norm": 0.4119553864002228, + "kl": 0.0006910300071467645, + "learning_rate": 4.999995468502563e-06, + "loss": 0.0, + "num_tokens": 467048.0, + "reward": 0.73016357421875, + "reward_std": 0.01429541502147913, + "rewards//mean": 0.73016357421875, + "rewards//std": 0.054377954453229904, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.011, + "grad_norm": 0.4966086745262146, + "kl": 0.0007905790334916674, + "learning_rate": 4.9999919440064484e-06, + "loss": 0.0, + "num_tokens": 475728.0, + "reward": 0.69805908203125, + "reward_std": 0.016646619886159897, + "rewards//mean": 0.69805908203125, + "rewards//std": 0.06764635443687439, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0112, + "grad_norm": 0.4910055100917816, + "kl": 0.0007386217839666642, + "learning_rate": 4.999987412513878e-06, + "loss": 0.0, + "num_tokens": 484360.0, + "reward": 0.72802734375, + "reward_std": 0.015091042965650558, + "rewards//mean": 0.72802734375, + "rewards//std": 0.06086854264140129, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0114, + "grad_norm": 0.4628139138221741, + "kl": 0.0007929667408461682, + "learning_rate": 4.999981874026677e-06, + "loss": 0.0, + "num_tokens": 493000.0, + "reward": 0.72283935546875, + "reward_std": 0.015091566368937492, + "rewards//mean": 0.72283935546875, + "rewards//std": 0.056771133095026016, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0116, + "grad_norm": 0.5012953877449036, + "kl": 0.0007925045429146849, + "learning_rate": 4.9999753285470756e-06, + "loss": 0.0, + "num_tokens": 501632.0, + "reward": 0.71661376953125, + "reward_std": 0.015197820030152798, + "rewards//mean": 0.71661376953125, + "rewards//std": 0.051168158650398254, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0118, + "grad_norm": 0.5567752718925476, + "kl": 0.0009017333650263026, + "learning_rate": 4.9999677760777114e-06, + "loss": 0.0, + "num_tokens": 510288.0, + "reward": 0.72381591796875, + "reward_std": 0.013100363314151764, + "rewards//mean": 0.72381591796875, + "rewards//std": 0.04677506536245346, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.012, + "grad_norm": 0.41019734740257263, + "kl": 0.0007576563366455957, + "learning_rate": 4.999959216621626e-06, + "loss": 0.0, + "num_tokens": 518952.0, + "reward": 0.721435546875, + "reward_std": 0.01749395951628685, + "rewards//mean": 0.721435546875, + "rewards//std": 0.0656224712729454, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0122, + "grad_norm": 0.5433821082115173, + "kl": 0.0012775656068697572, + "learning_rate": 4.999949650182267e-06, + "loss": 0.0001, + "num_tokens": 527520.0, + "reward": 0.71600341796875, + "reward_std": 0.014195160940289497, + "rewards//mean": 0.71600341796875, + "rewards//std": 0.040854718536138535, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0124, + "grad_norm": 0.4710741639137268, + "kl": 0.0007979272995726205, + "learning_rate": 4.999939076763487e-06, + "loss": 0.0, + "num_tokens": 536112.0, + "reward": 0.7099609375, + "reward_std": 0.015660030767321587, + "rewards//mean": 0.7099609375, + "rewards//std": 0.0644836500287056, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0126, + "grad_norm": 0.5351571440696716, + "kl": 0.0008906932343961671, + "learning_rate": 4.999927496369547e-06, + "loss": 0.0, + "num_tokens": 544736.0, + "reward": 0.6954345703125, + "reward_std": 0.01476267259567976, + "rewards//mean": 0.6954345703125, + "rewards//std": 0.04151507467031479, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0128, + "grad_norm": 0.4595547318458557, + "kl": 0.0008543600051780231, + "learning_rate": 4.99991490900511e-06, + "loss": 0.0, + "num_tokens": 553408.0, + "reward": 0.71673583984375, + "reward_std": 0.012672960758209229, + "rewards//mean": 0.71673583984375, + "rewards//std": 0.05019837245345116, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.013, + "grad_norm": 0.4643782079219818, + "kl": 0.0008414792246185243, + "learning_rate": 4.999901314675246e-06, + "loss": 0.0, + "num_tokens": 562064.0, + "reward": 0.71978759765625, + "reward_std": 0.01744549721479416, + "rewards//mean": 0.71978759765625, + "rewards//std": 0.05583685636520386, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0132, + "grad_norm": 0.4610508680343628, + "kl": 0.0008548630212317221, + "learning_rate": 4.999886713385432e-06, + "loss": 0.0, + "num_tokens": 570664.0, + "reward": 0.71099853515625, + "reward_std": 0.016434110701084137, + "rewards//mean": 0.71099853515625, + "rewards//std": 0.06650462746620178, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0134, + "grad_norm": 0.48795291781425476, + "kl": 0.000795883170212619, + "learning_rate": 4.999871105141549e-06, + "loss": 0.0, + "num_tokens": 579360.0, + "reward": 0.7103271484375, + "reward_std": 0.016300387680530548, + "rewards//mean": 0.7103271484375, + "rewards//std": 0.0523579940199852, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0136, + "grad_norm": 0.4637707769870758, + "kl": 0.0008550576094421558, + "learning_rate": 4.9998544899498845e-06, + "loss": 0.0, + "num_tokens": 587992.0, + "reward": 0.7423095703125, + "reward_std": 0.014237109571695328, + "rewards//mean": 0.7423095703125, + "rewards//std": 0.04064841568470001, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0138, + "grad_norm": 0.47836223244667053, + "kl": 0.0008403940519201569, + "learning_rate": 4.999836867817129e-06, + "loss": 0.0, + "num_tokens": 596608.0, + "reward": 0.69384765625, + "reward_std": 0.014926273375749588, + "rewards//mean": 0.69384765625, + "rewards//std": 0.040478430688381195, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.014, + "grad_norm": 0.49338188767433167, + "kl": 0.0009159344772342592, + "learning_rate": 4.9998182387503825e-06, + "loss": 0.0, + "num_tokens": 605248.0, + "reward": 0.70245361328125, + "reward_std": 0.023733031004667282, + "rewards//mean": 0.70245361328125, + "rewards//std": 0.05866420641541481, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0142, + "grad_norm": 0.3988666534423828, + "kl": 0.0008121578212012537, + "learning_rate": 4.999798602757149e-06, + "loss": 0.0, + "num_tokens": 613824.0, + "reward": 0.69830322265625, + "reward_std": 0.013719111680984497, + "rewards//mean": 0.69830322265625, + "rewards//std": 0.056460171937942505, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0144, + "grad_norm": 0.4849626421928406, + "kl": 0.0009415504609933123, + "learning_rate": 4.9997779598453365e-06, + "loss": 0.0, + "num_tokens": 622328.0, + "reward": 0.73828125, + "reward_std": 0.01322929933667183, + "rewards//mean": 0.73828125, + "rewards//std": 0.06599832326173782, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0146, + "grad_norm": 0.5434121489524841, + "kl": 0.0016273690489470027, + "learning_rate": 4.999756310023261e-06, + "loss": 0.0001, + "num_tokens": 630976.0, + "reward": 0.6895751953125, + "reward_std": 0.01436593197286129, + "rewards//mean": 0.6895751953125, + "rewards//std": 0.04783637821674347, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0148, + "grad_norm": 0.4288107454776764, + "kl": 0.0008081157502601855, + "learning_rate": 4.999733653299643e-06, + "loss": 0.0, + "num_tokens": 639624.0, + "reward": 0.73236083984375, + "reward_std": 0.015878882259130478, + "rewards//mean": 0.73236083984375, + "rewards//std": 0.040517259389162064, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.015, + "grad_norm": 0.48302745819091797, + "kl": 0.0009941107928170823, + "learning_rate": 4.9997099896836076e-06, + "loss": 0.0, + "num_tokens": 648320.0, + "reward": 0.69451904296875, + "reward_std": 0.013520974665880203, + "rewards//mean": 0.69451904296875, + "rewards//std": 0.06670349836349487, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0152, + "grad_norm": 0.4562317728996277, + "kl": 0.0008667359288665466, + "learning_rate": 4.999685319184688e-06, + "loss": 0.0, + "num_tokens": 657080.0, + "reward": 0.7342529296875, + "reward_std": 0.013470092788338661, + "rewards//mean": 0.7342529296875, + "rewards//std": 0.047755297273397446, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0154, + "grad_norm": 0.491922527551651, + "kl": 0.000954871931753587, + "learning_rate": 4.999659641812821e-06, + "loss": 0.0, + "num_tokens": 665720.0, + "reward": 0.70245361328125, + "reward_std": 0.015949761494994164, + "rewards//mean": 0.70245361328125, + "rewards//std": 0.07394102215766907, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0156, + "grad_norm": 0.4906434118747711, + "kl": 0.0010538664282648824, + "learning_rate": 4.9996329575783486e-06, + "loss": 0.0, + "num_tokens": 674336.0, + "reward": 0.71417236328125, + "reward_std": 0.016342610120773315, + "rewards//mean": 0.71417236328125, + "rewards//std": 0.05462348833680153, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0158, + "grad_norm": 0.488397479057312, + "kl": 0.0009591389462002553, + "learning_rate": 4.99960526649202e-06, + "loss": 0.0, + "num_tokens": 682968.0, + "reward": 0.68719482421875, + "reward_std": 0.01782170683145523, + "rewards//mean": 0.68719482421875, + "rewards//std": 0.07477407157421112, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.016, + "grad_norm": 0.4663703143596649, + "kl": 0.0009951836473192088, + "learning_rate": 4.999576568564989e-06, + "loss": 0.0, + "num_tokens": 691640.0, + "reward": 0.73382568359375, + "reward_std": 0.011371012777090073, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.06883952021598816, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0162, + "grad_norm": 0.43239349126815796, + "kl": 0.0008809622740955092, + "learning_rate": 4.999546863808815e-06, + "loss": 0.0, + "num_tokens": 700264.0, + "reward": 0.6995849609375, + "reward_std": 0.011680185794830322, + "rewards//mean": 0.6995849609375, + "rewards//std": 0.04613397642970085, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0164, + "grad_norm": 0.446817010641098, + "kl": 0.0009117565277847461, + "learning_rate": 4.999516152235463e-06, + "loss": 0.0, + "num_tokens": 708984.0, + "reward": 0.73577880859375, + "reward_std": 0.010488376021385193, + "rewards//mean": 0.73577880859375, + "rewards//std": 0.05107547715306282, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0166, + "grad_norm": 0.43615901470184326, + "kl": 0.0009348735839012079, + "learning_rate": 4.999484433857305e-06, + "loss": 0.0, + "num_tokens": 717568.0, + "reward": 0.723388671875, + "reward_std": 0.013231704942882061, + "rewards//mean": 0.723388671875, + "rewards//std": 0.0505966916680336, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0168, + "grad_norm": 0.496662974357605, + "kl": 0.0011133336302009411, + "learning_rate": 4.999451708687114e-06, + "loss": 0.0, + "num_tokens": 726304.0, + "reward": 0.7135009765625, + "reward_std": 0.017629370093345642, + "rewards//mean": 0.7135009765625, + "rewards//std": 0.05801933631300926, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.017, + "grad_norm": 0.5092006921768188, + "kl": 0.0010797181967063807, + "learning_rate": 4.999417976738075e-06, + "loss": 0.0, + "num_tokens": 735000.0, + "reward": 0.72796630859375, + "reward_std": 0.011926619336009026, + "rewards//mean": 0.72796630859375, + "rewards//std": 0.06286849081516266, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0172, + "grad_norm": 0.4398786425590515, + "kl": 0.0010526972182560712, + "learning_rate": 4.999383238023773e-06, + "loss": 0.0, + "num_tokens": 743648.0, + "reward": 0.70025634765625, + "reward_std": 0.013448704965412617, + "rewards//mean": 0.70025634765625, + "rewards//std": 0.07408399134874344, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0174, + "grad_norm": 0.5147225260734558, + "kl": 0.0012325849456829019, + "learning_rate": 4.999347492558203e-06, + "loss": 0.0, + "num_tokens": 752416.0, + "reward": 0.71722412109375, + "reward_std": 0.014034003019332886, + "rewards//mean": 0.71722412109375, + "rewards//std": 0.07134390622377396, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0176, + "grad_norm": 0.5070951581001282, + "kl": 0.001082870177924633, + "learning_rate": 4.999310740355761e-06, + "loss": 0.0, + "num_tokens": 761040.0, + "reward": 0.7266845703125, + "reward_std": 0.014604616910219193, + "rewards//mean": 0.7266845703125, + "rewards//std": 0.06855076551437378, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0178, + "grad_norm": 0.4914563298225403, + "kl": 0.0012297793437028304, + "learning_rate": 4.9992729814312514e-06, + "loss": 0.0, + "num_tokens": 769656.0, + "reward": 0.74017333984375, + "reward_std": 0.019316695630550385, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.06556382775306702, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.018, + "grad_norm": 0.4564233124256134, + "kl": 0.0011137609835714102, + "learning_rate": 4.999234215799884e-06, + "loss": 0.0, + "num_tokens": 778248.0, + "reward": 0.697509765625, + "reward_std": 0.015206321142613888, + "rewards//mean": 0.697509765625, + "rewards//std": 0.019185619428753853, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0182, + "grad_norm": 0.5231274366378784, + "kl": 0.0012513935289462097, + "learning_rate": 4.999194443477273e-06, + "loss": 0.0001, + "num_tokens": 786856.0, + "reward": 0.689697265625, + "reward_std": 0.018568219617009163, + "rewards//mean": 0.689697265625, + "rewards//std": 0.08479659259319305, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0184, + "grad_norm": 0.45985209941864014, + "kl": 0.0012780725373886526, + "learning_rate": 4.99915366447944e-06, + "loss": 0.0001, + "num_tokens": 795544.0, + "reward": 0.6876220703125, + "reward_std": 0.016050245612859726, + "rewards//mean": 0.6876220703125, + "rewards//std": 0.06961309164762497, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0186, + "grad_norm": 0.4588201940059662, + "kl": 0.0010780094598885626, + "learning_rate": 4.999111878822809e-06, + "loss": 0.0, + "num_tokens": 804104.0, + "reward": 0.73370361328125, + "reward_std": 0.015261776745319366, + "rewards//mean": 0.73370361328125, + "rewards//std": 0.05237804725766182, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0188, + "grad_norm": 0.4953310489654541, + "kl": 0.0013287549882079475, + "learning_rate": 4.999069086524212e-06, + "loss": 0.0001, + "num_tokens": 812768.0, + "reward": 0.74371337890625, + "reward_std": 0.017101481556892395, + "rewards//mean": 0.74371337890625, + "rewards//std": 0.07072333991527557, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.019, + "grad_norm": 0.5649318099021912, + "kl": 0.0012824844452552497, + "learning_rate": 4.999025287600886e-06, + "loss": 0.0001, + "num_tokens": 821400.0, + "reward": 0.73883056640625, + "reward_std": 0.012025142088532448, + "rewards//mean": 0.73883056640625, + "rewards//std": 0.04969830811023712, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0192, + "grad_norm": 0.6112560629844666, + "kl": 0.0013404397759586573, + "learning_rate": 4.998980482070473e-06, + "loss": 0.0001, + "num_tokens": 829952.0, + "reward": 0.68670654296875, + "reward_std": 0.015257102437317371, + "rewards//mean": 0.68670654296875, + "rewards//std": 0.03606545925140381, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0194, + "grad_norm": 0.5092772841453552, + "kl": 0.0011790767530328594, + "learning_rate": 4.9989346699510215e-06, + "loss": 0.0, + "num_tokens": 838520.0, + "reward": 0.71392822265625, + "reward_std": 0.013484635390341282, + "rewards//mean": 0.71392822265625, + "rewards//std": 0.04560608044266701, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0196, + "grad_norm": 0.4867624044418335, + "kl": 0.001275954389711842, + "learning_rate": 4.9988878512609825e-06, + "loss": 0.0001, + "num_tokens": 847128.0, + "reward": 0.708740234375, + "reward_std": 0.014133438467979431, + "rewards//mean": 0.708740234375, + "rewards//std": 0.0834752693772316, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0198, + "grad_norm": 0.5237188935279846, + "kl": 0.001374387225951068, + "learning_rate": 4.998840026019217e-06, + "loss": 0.0001, + "num_tokens": 855824.0, + "reward": 0.6898193359375, + "reward_std": 0.014914432540535927, + "rewards//mean": 0.6898193359375, + "rewards//std": 0.05682779848575592, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.02, + "grad_norm": 0.4966977834701538, + "kl": 0.0014171117945807055, + "learning_rate": 4.998791194244988e-06, + "loss": 0.0001, + "num_tokens": 864520.0, + "reward": 0.71148681640625, + "reward_std": 0.014294968917965889, + "rewards//mean": 0.71148681640625, + "rewards//std": 0.08842907845973969, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0202, + "grad_norm": 0.9231608510017395, + "kl": 0.0014634677936555818, + "learning_rate": 4.998741355957963e-06, + "loss": 0.0001, + "num_tokens": 873152.0, + "reward": 0.709228515625, + "reward_std": 0.015058237127959728, + "rewards//mean": 0.709228515625, + "rewards//std": 0.07786376029253006, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0204, + "grad_norm": 0.49847280979156494, + "kl": 0.001440991909475997, + "learning_rate": 4.99869051117822e-06, + "loss": 0.0001, + "num_tokens": 881824.0, + "reward": 0.7056884765625, + "reward_std": 0.014790365472435951, + "rewards//mean": 0.7056884765625, + "rewards//std": 0.05627746134996414, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0206, + "grad_norm": 0.5402097702026367, + "kl": 0.0013479594781529158, + "learning_rate": 4.998638659926238e-06, + "loss": 0.0001, + "num_tokens": 890368.0, + "reward": 0.74066162109375, + "reward_std": 0.014553939923644066, + "rewards//mean": 0.74066162109375, + "rewards//std": 0.05418301746249199, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0208, + "grad_norm": 0.5026318430900574, + "kl": 0.001596087298821658, + "learning_rate": 4.998585802222902e-06, + "loss": 0.0001, + "num_tokens": 899128.0, + "reward": 0.74249267578125, + "reward_std": 0.022882359102368355, + "rewards//mean": 0.74249267578125, + "rewards//std": 0.04922054708003998, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.021, + "grad_norm": 0.4722844958305359, + "kl": 0.0016166606947081164, + "learning_rate": 4.9985319380895035e-06, + "loss": 0.0001, + "num_tokens": 907808.0, + "reward": 0.7147216796875, + "reward_std": 0.01173408329486847, + "rewards//mean": 0.7147216796875, + "rewards//std": 0.05263826996088028, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0212, + "grad_norm": 0.5120245218276978, + "kl": 0.0016622742696199566, + "learning_rate": 4.99847706754774e-06, + "loss": 0.0001, + "num_tokens": 916384.0, + "reward": 0.74267578125, + "reward_std": 0.013011830858886242, + "rewards//mean": 0.74267578125, + "rewards//std": 0.04836273938417435, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0214, + "grad_norm": 0.5261712670326233, + "kl": 0.0015574333810945973, + "learning_rate": 4.998421190619712e-06, + "loss": 0.0001, + "num_tokens": 925000.0, + "reward": 0.728759765625, + "reward_std": 0.01469873171299696, + "rewards//mean": 0.728759765625, + "rewards//std": 0.029047802090644836, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0216, + "grad_norm": 0.4722696542739868, + "kl": 0.0016149339353432879, + "learning_rate": 4.998364307327927e-06, + "loss": 0.0001, + "num_tokens": 933680.0, + "reward": 0.72637939453125, + "reward_std": 0.014723455533385277, + "rewards//mean": 0.72637939453125, + "rewards//std": 0.057769205421209335, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0218, + "grad_norm": 0.471443235874176, + "kl": 0.00147363574069459, + "learning_rate": 4.998306417695298e-06, + "loss": 0.0001, + "num_tokens": 942360.0, + "reward": 0.70672607421875, + "reward_std": 0.013419017195701599, + "rewards//mean": 0.70672607421875, + "rewards//std": 0.061058465391397476, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.022, + "grad_norm": 0.6062172651290894, + "kl": 0.0026222791202599183, + "learning_rate": 4.998247521745142e-06, + "loss": 0.0001, + "num_tokens": 951000.0, + "reward": 0.74151611328125, + "reward_std": 0.018515393137931824, + "rewards//mean": 0.74151611328125, + "rewards//std": 0.05745021253824234, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0222, + "grad_norm": 0.6000130772590637, + "kl": 0.0021851205674465746, + "learning_rate": 4.998187619501185e-06, + "loss": 0.0001, + "num_tokens": 959688.0, + "reward": 0.674560546875, + "reward_std": 0.01506463810801506, + "rewards//mean": 0.674560546875, + "rewards//std": 0.07186568528413773, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0224, + "grad_norm": 0.4527592062950134, + "kl": 0.0016580721858190373, + "learning_rate": 4.998126710987552e-06, + "loss": 0.0001, + "num_tokens": 968352.0, + "reward": 0.72381591796875, + "reward_std": 0.014463772997260094, + "rewards//mean": 0.72381591796875, + "rewards//std": 0.05746760219335556, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0226, + "grad_norm": 0.5143523812294006, + "kl": 0.002175911722588353, + "learning_rate": 4.998064796228779e-06, + "loss": 0.0001, + "num_tokens": 976936.0, + "reward": 0.7242431640625, + "reward_std": 0.014481933787465096, + "rewards//mean": 0.7242431640625, + "rewards//std": 0.05567820370197296, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0228, + "grad_norm": 0.48747143149375916, + "kl": 0.0017545891605550423, + "learning_rate": 4.998001875249804e-06, + "loss": 0.0001, + "num_tokens": 985552.0, + "reward": 0.71710205078125, + "reward_std": 0.013459138572216034, + "rewards//mean": 0.71710205078125, + "rewards//std": 0.04908997192978859, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.023, + "grad_norm": 0.4331216514110565, + "kl": 0.0021779875096399337, + "learning_rate": 4.997937948075973e-06, + "loss": 0.0001, + "num_tokens": 994264.0, + "reward": 0.72625732421875, + "reward_std": 0.014490745961666107, + "rewards//mean": 0.72625732421875, + "rewards//std": 0.04755370691418648, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0232, + "grad_norm": 0.5705957412719727, + "kl": 0.002923560852650553, + "learning_rate": 4.997873014733036e-06, + "loss": 0.0001, + "num_tokens": 1002832.0, + "reward": 0.71453857421875, + "reward_std": 0.013323797844350338, + "rewards//mean": 0.71453857421875, + "rewards//std": 0.05091845244169235, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0234, + "grad_norm": 0.45392346382141113, + "kl": 0.0019422014956944622, + "learning_rate": 4.997807075247147e-06, + "loss": 0.0001, + "num_tokens": 1011448.0, + "reward": 0.70611572265625, + "reward_std": 0.015927188098430634, + "rewards//mean": 0.70611572265625, + "rewards//std": 0.04444960504770279, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0236, + "grad_norm": 0.46781232953071594, + "kl": 0.0021620240004267544, + "learning_rate": 4.9977401296448655e-06, + "loss": 0.0001, + "num_tokens": 1020104.0, + "reward": 0.71502685546875, + "reward_std": 0.014722153544425964, + "rewards//mean": 0.71502685546875, + "rewards//std": 0.04867687448859215, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0238, + "grad_norm": 0.46179869771003723, + "kl": 0.0022078527545090765, + "learning_rate": 4.99767217795316e-06, + "loss": 0.0001, + "num_tokens": 1028696.0, + "reward": 0.72418212890625, + "reward_std": 0.011131198145449162, + "rewards//mean": 0.72418212890625, + "rewards//std": 0.0295663233846426, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.024, + "grad_norm": 0.5357843637466431, + "kl": 0.002367166889598593, + "learning_rate": 4.997603220199399e-06, + "loss": 0.0001, + "num_tokens": 1037384.0, + "reward": 0.7073974609375, + "reward_std": 0.013118360191583633, + "rewards//mean": 0.7073974609375, + "rewards//std": 0.05458429455757141, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0242, + "grad_norm": 0.4789181649684906, + "kl": 0.0018334937049075961, + "learning_rate": 4.99753325641136e-06, + "loss": 0.0001, + "num_tokens": 1046176.0, + "reward": 0.72113037109375, + "reward_std": 0.011054251343011856, + "rewards//mean": 0.72113037109375, + "rewards//std": 0.0613209493458271, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0244, + "grad_norm": 0.4643182158470154, + "kl": 0.0028950390114914626, + "learning_rate": 4.997462286617224e-06, + "loss": 0.0001, + "num_tokens": 1054816.0, + "reward": 0.7144775390625, + "reward_std": 0.014479342848062515, + "rewards//mean": 0.7144775390625, + "rewards//std": 0.06932283937931061, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0246, + "grad_norm": 0.45720401406288147, + "kl": 0.002508550591301173, + "learning_rate": 4.997390310845578e-06, + "loss": 0.0001, + "num_tokens": 1063496.0, + "reward": 0.73211669921875, + "reward_std": 0.018719132989645004, + "rewards//mean": 0.73211669921875, + "rewards//std": 0.043240174651145935, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0248, + "grad_norm": 0.4856073260307312, + "kl": 0.005021717923227698, + "learning_rate": 4.997317329125413e-06, + "loss": 0.0002, + "num_tokens": 1072104.0, + "reward": 0.739501953125, + "reward_std": 0.018492162227630615, + "rewards//mean": 0.739501953125, + "rewards//std": 0.05581790953874588, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.025, + "grad_norm": 0.45310959219932556, + "kl": 0.0023723691556369886, + "learning_rate": 4.997243341486126e-06, + "loss": 0.0001, + "num_tokens": 1080752.0, + "reward": 0.7279052734375, + "reward_std": 0.01648058369755745, + "rewards//mean": 0.7279052734375, + "rewards//std": 0.05334783345460892, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0252, + "grad_norm": 0.44085827469825745, + "kl": 0.0022444947535404935, + "learning_rate": 4.997168347957521e-06, + "loss": 0.0001, + "num_tokens": 1089368.0, + "reward": 0.72052001953125, + "reward_std": 0.015059070661664009, + "rewards//mean": 0.72052001953125, + "rewards//std": 0.0704086497426033, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0254, + "grad_norm": 0.47659870982170105, + "kl": 0.002819010740495287, + "learning_rate": 4.997092348569802e-06, + "loss": 0.0001, + "num_tokens": 1097992.0, + "reward": 0.734130859375, + "reward_std": 0.012258566915988922, + "rewards//mean": 0.734130859375, + "rewards//std": 0.03378882259130478, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0256, + "grad_norm": 0.4902152121067047, + "kl": 0.0033210097899427637, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.0001, + "num_tokens": 1106576.0, + "reward": 0.71600341796875, + "reward_std": 0.01741008460521698, + "rewards//mean": 0.71600341796875, + "rewards//std": 0.04795122891664505, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0258, + "grad_norm": 0.467317670583725, + "kl": 0.0020708676311187446, + "learning_rate": 4.996937332339887e-06, + "loss": 0.0001, + "num_tokens": 1115160.0, + "reward": 0.71319580078125, + "reward_std": 0.010551651939749718, + "rewards//mean": 0.71319580078125, + "rewards//std": 0.04744407534599304, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.026, + "grad_norm": 0.5055683851242065, + "kl": 0.0025406221684534103, + "learning_rate": 4.996858315560129e-06, + "loss": 0.0001, + "num_tokens": 1123808.0, + "reward": 0.72503662109375, + "reward_std": 0.015444736927747726, + "rewards//mean": 0.72503662109375, + "rewards//std": 0.06147873401641846, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0262, + "grad_norm": 0.5300055146217346, + "kl": 0.004421208694111556, + "learning_rate": 4.9967782930461405e-06, + "loss": 0.0002, + "num_tokens": 1132528.0, + "reward": 0.71380615234375, + "reward_std": 0.012746734544634819, + "rewards//mean": 0.71380615234375, + "rewards//std": 0.05404258891940117, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0264, + "grad_norm": 0.5882632732391357, + "kl": 0.004176836780970916, + "learning_rate": 4.9966972648301535e-06, + "loss": 0.0002, + "num_tokens": 1141160.0, + "reward": 0.73260498046875, + "reward_std": 0.013695623725652695, + "rewards//mean": 0.73260498046875, + "rewards//std": 0.0439189113676548, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0266, + "grad_norm": 0.5080795288085938, + "kl": 0.0037878667353652418, + "learning_rate": 4.996615230944808e-06, + "loss": 0.0002, + "num_tokens": 1149744.0, + "reward": 0.7181396484375, + "reward_std": 0.021421236917376518, + "rewards//mean": 0.7181396484375, + "rewards//std": 0.053551748394966125, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0268, + "grad_norm": 0.5161112546920776, + "kl": 0.002877964361687191, + "learning_rate": 4.996532191423145e-06, + "loss": 0.0001, + "num_tokens": 1158344.0, + "reward": 0.71826171875, + "reward_std": 0.017237350344657898, + "rewards//mean": 0.71826171875, + "rewards//std": 0.0640067458152771, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.027, + "grad_norm": 0.501823902130127, + "kl": 0.003536474294378422, + "learning_rate": 4.996448146298615e-06, + "loss": 0.0001, + "num_tokens": 1166920.0, + "reward": 0.68341064453125, + "reward_std": 0.01601843349635601, + "rewards//mean": 0.68341064453125, + "rewards//std": 0.041225045919418335, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0272, + "grad_norm": 0.5185138583183289, + "kl": 0.004472654749406502, + "learning_rate": 4.996363095605069e-06, + "loss": 0.0002, + "num_tokens": 1175528.0, + "reward": 0.7242431640625, + "reward_std": 0.018114212900400162, + "rewards//mean": 0.7242431640625, + "rewards//std": 0.05369402840733528, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0274, + "grad_norm": 0.49364981055259705, + "kl": 0.003177185819367878, + "learning_rate": 4.996277039376767e-06, + "loss": 0.0001, + "num_tokens": 1184144.0, + "reward": 0.72088623046875, + "reward_std": 0.01708320900797844, + "rewards//mean": 0.72088623046875, + "rewards//std": 0.054411906749010086, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0276, + "grad_norm": 0.5048626661300659, + "kl": 0.003767449234146625, + "learning_rate": 4.9961899776483725e-06, + "loss": 0.0002, + "num_tokens": 1192832.0, + "reward": 0.72998046875, + "reward_std": 0.013601850718259811, + "rewards//mean": 0.72998046875, + "rewards//std": 0.047309596091508865, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0278, + "grad_norm": 0.5485401153564453, + "kl": 0.005610850581433624, + "learning_rate": 4.996101910454953e-06, + "loss": 0.0002, + "num_tokens": 1201472.0, + "reward": 0.70831298828125, + "reward_std": 0.018774503841996193, + "rewards//mean": 0.70831298828125, + "rewards//std": 0.07297329604625702, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.028, + "grad_norm": 0.5312867164611816, + "kl": 0.004881514410953969, + "learning_rate": 4.996012837831983e-06, + "loss": 0.0002, + "num_tokens": 1210176.0, + "reward": 0.7449951171875, + "reward_std": 0.01512197032570839, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.04492507874965668, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0282, + "grad_norm": 0.7407464981079102, + "kl": 0.005604632286122069, + "learning_rate": 4.9959227598153395e-06, + "loss": 0.0002, + "num_tokens": 1218872.0, + "reward": 0.7197265625, + "reward_std": 0.018171414732933044, + "rewards//mean": 0.7197265625, + "rewards//std": 0.04184069111943245, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0284, + "grad_norm": 0.5207117199897766, + "kl": 0.0038259811990428716, + "learning_rate": 4.995831676441307e-06, + "loss": 0.0002, + "num_tokens": 1227624.0, + "reward": 0.72515869140625, + "reward_std": 0.0129172932356596, + "rewards//mean": 0.72515869140625, + "rewards//std": 0.047344744205474854, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0286, + "grad_norm": 0.4676206111907959, + "kl": 0.004619606072083116, + "learning_rate": 4.995739587746574e-06, + "loss": 0.0002, + "num_tokens": 1236264.0, + "reward": 0.734375, + "reward_std": 0.01602023094892502, + "rewards//mean": 0.734375, + "rewards//std": 0.04370923712849617, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0288, + "grad_norm": 0.4712521731853485, + "kl": 0.005641886862576939, + "learning_rate": 4.995646493768234e-06, + "loss": 0.0002, + "num_tokens": 1244984.0, + "reward": 0.69970703125, + "reward_std": 0.016067516058683395, + "rewards//mean": 0.69970703125, + "rewards//std": 0.07418610900640488, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.029, + "grad_norm": 0.4732939898967743, + "kl": 0.003699015636811964, + "learning_rate": 4.995552394543784e-06, + "loss": 0.0001, + "num_tokens": 1253544.0, + "reward": 0.68414306640625, + "reward_std": 0.017361916601657867, + "rewards//mean": 0.68414306640625, + "rewards//std": 0.05543471500277519, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0292, + "grad_norm": 0.5424770712852478, + "kl": 0.0069044766132719815, + "learning_rate": 4.995457290111129e-06, + "loss": 0.0003, + "num_tokens": 1262272.0, + "reward": 0.74951171875, + "reward_std": 0.014574643224477768, + "rewards//mean": 0.74951171875, + "rewards//std": 0.048895664513111115, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0294, + "grad_norm": 0.5275890231132507, + "kl": 0.004333419870818034, + "learning_rate": 4.995361180508575e-06, + "loss": 0.0002, + "num_tokens": 1270984.0, + "reward": 0.72943115234375, + "reward_std": 0.013785232789814472, + "rewards//mean": 0.72943115234375, + "rewards//std": 0.055875882506370544, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0296, + "grad_norm": 0.5140582919120789, + "kl": 0.006125797764980234, + "learning_rate": 4.995264065774837e-06, + "loss": 0.0002, + "num_tokens": 1279664.0, + "reward": 0.71893310546875, + "reward_std": 0.011785900220274925, + "rewards//mean": 0.71893310546875, + "rewards//std": 0.05134063959121704, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0298, + "grad_norm": 0.5618065595626831, + "kl": 0.006264124327572063, + "learning_rate": 4.99516594594903e-06, + "loss": 0.0003, + "num_tokens": 1288360.0, + "reward": 0.71807861328125, + "reward_std": 0.013035988435149193, + "rewards//mean": 0.71807861328125, + "rewards//std": 0.053489912301301956, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.03, + "grad_norm": 0.502047598361969, + "kl": 0.005594079499132931, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.0002, + "num_tokens": 1297032.0, + "reward": 0.732177734375, + "reward_std": 0.0157606303691864, + "rewards//mean": 0.732177734375, + "rewards//std": 0.03976839408278465, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0302, + "grad_norm": 0.5016326904296875, + "kl": 0.00834939838387072, + "learning_rate": 4.994966691179712e-06, + "loss": 0.0003, + "num_tokens": 1305632.0, + "reward": 0.703125, + "reward_std": 0.014876965433359146, + "rewards//mean": 0.703125, + "rewards//std": 0.04747569188475609, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0304, + "grad_norm": 0.5275666117668152, + "kl": 0.006890498974826187, + "learning_rate": 4.9948655563164585e-06, + "loss": 0.0003, + "num_tokens": 1314272.0, + "reward": 0.71551513671875, + "reward_std": 0.015125017613172531, + "rewards//mean": 0.71551513671875, + "rewards//std": 0.05302604287862778, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0306, + "grad_norm": 0.5499895215034485, + "kl": 0.0106024268316105, + "learning_rate": 4.994763416521658e-06, + "loss": 0.0004, + "num_tokens": 1322960.0, + "reward": 0.74847412109375, + "reward_std": 0.017264600843191147, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.058874648064374924, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0308, + "grad_norm": 0.5384777784347534, + "kl": 0.0059372307441663, + "learning_rate": 4.994660271836452e-06, + "loss": 0.0002, + "num_tokens": 1331608.0, + "reward": 0.73358154296875, + "reward_std": 0.016230447217822075, + "rewards//mean": 0.73358154296875, + "rewards//std": 0.06771613657474518, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.031, + "grad_norm": 0.5435893535614014, + "kl": 0.007853654562495649, + "learning_rate": 4.994556122302387e-06, + "loss": 0.0003, + "num_tokens": 1340352.0, + "reward": 0.7288818359375, + "reward_std": 0.013145558536052704, + "rewards//mean": 0.7288818359375, + "rewards//std": 0.05553446710109711, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0312, + "grad_norm": 0.5240859985351562, + "kl": 0.008158890006598085, + "learning_rate": 4.994450967961413e-06, + "loss": 0.0003, + "num_tokens": 1349024.0, + "reward": 0.72222900390625, + "reward_std": 0.013540119864046574, + "rewards//mean": 0.72222900390625, + "rewards//std": 0.034443680197000504, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0314, + "grad_norm": 0.6642475724220276, + "kl": 0.007359111390542239, + "learning_rate": 4.994344808855888e-06, + "loss": 0.0003, + "num_tokens": 1357544.0, + "reward": 0.71826171875, + "reward_std": 0.018803555518388748, + "rewards//mean": 0.71826171875, + "rewards//std": 0.05610194429755211, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0316, + "grad_norm": 0.52338045835495, + "kl": 0.009215345402481034, + "learning_rate": 4.994237645028573e-06, + "loss": 0.0004, + "num_tokens": 1366224.0, + "reward": 0.72686767578125, + "reward_std": 0.010744665749371052, + "rewards//mean": 0.72686767578125, + "rewards//std": 0.0492648109793663, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0318, + "grad_norm": 0.4993240237236023, + "kl": 0.00795084226410836, + "learning_rate": 4.994129476522632e-06, + "loss": 0.0003, + "num_tokens": 1374848.0, + "reward": 0.72064208984375, + "reward_std": 0.014471987262368202, + "rewards//mean": 0.72064208984375, + "rewards//std": 0.04946933686733246, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.032, + "grad_norm": 0.5304632186889648, + "kl": 0.010611457342747599, + "learning_rate": 4.994020303381636e-06, + "loss": 0.0004, + "num_tokens": 1383456.0, + "reward": 0.6920166015625, + "reward_std": 0.015825100243091583, + "rewards//mean": 0.6920166015625, + "rewards//std": 0.06738641113042831, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0322, + "grad_norm": 0.6055727005004883, + "kl": 0.009950865583959967, + "learning_rate": 4.993910125649561e-06, + "loss": 0.0004, + "num_tokens": 1392112.0, + "reward": 0.7283935546875, + "reward_std": 0.01466367393732071, + "rewards//mean": 0.7283935546875, + "rewards//std": 0.051102034747600555, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0324, + "grad_norm": 0.6653439998626709, + "kl": 0.015546724433079362, + "learning_rate": 4.993798943370785e-06, + "loss": 0.0006, + "num_tokens": 1400784.0, + "reward": 0.7451171875, + "reward_std": 0.014343861490488052, + "rewards//mean": 0.7451171875, + "rewards//std": 0.0543498657643795, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0326, + "grad_norm": 0.5136329531669617, + "kl": 0.009563862986396998, + "learning_rate": 4.993686756590093e-06, + "loss": 0.0004, + "num_tokens": 1409344.0, + "reward": 0.74176025390625, + "reward_std": 0.01264176145195961, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.031986285001039505, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0328, + "grad_norm": 0.6398639678955078, + "kl": 0.008364403067389503, + "learning_rate": 4.993573565352674e-06, + "loss": 0.0003, + "num_tokens": 1417920.0, + "reward": 0.728271484375, + "reward_std": 0.014848753809928894, + "rewards//mean": 0.728271484375, + "rewards//std": 0.043802645057439804, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.033, + "grad_norm": 0.5334981083869934, + "kl": 0.015317061392124742, + "learning_rate": 4.993459369704121e-06, + "loss": 0.0006, + "num_tokens": 1426608.0, + "reward": 0.71307373046875, + "reward_std": 0.014587020501494408, + "rewards//mean": 0.71307373046875, + "rewards//std": 0.05228722095489502, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0332, + "grad_norm": 0.6520506143569946, + "kl": 0.014760783000383526, + "learning_rate": 4.9933441696904315e-06, + "loss": 0.0006, + "num_tokens": 1435216.0, + "reward": 0.72015380859375, + "reward_std": 0.016750846058130264, + "rewards//mean": 0.72015380859375, + "rewards//std": 0.03944116830825806, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0334, + "grad_norm": 0.6047248244285583, + "kl": 0.011187054798938334, + "learning_rate": 4.993227965358008e-06, + "loss": 0.0004, + "num_tokens": 1443832.0, + "reward": 0.68414306640625, + "reward_std": 0.016772452741861343, + "rewards//mean": 0.68414306640625, + "rewards//std": 0.07010506093502045, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0336, + "grad_norm": 0.5380955338478088, + "kl": 0.013987196376547217, + "learning_rate": 4.99311075675366e-06, + "loss": 0.0006, + "num_tokens": 1452488.0, + "reward": 0.73919677734375, + "reward_std": 0.014058486558496952, + "rewards//mean": 0.73919677734375, + "rewards//std": 0.04323422163724899, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0338, + "grad_norm": 0.5614820122718811, + "kl": 0.00901414075633511, + "learning_rate": 4.992992543924597e-06, + "loss": 0.0004, + "num_tokens": 1461184.0, + "reward": 0.73291015625, + "reward_std": 0.015211429446935654, + "rewards//mean": 0.73291015625, + "rewards//std": 0.052048444747924805, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.034, + "grad_norm": 0.5328094363212585, + "kl": 0.013381825701799244, + "learning_rate": 4.992873326918434e-06, + "loss": 0.0005, + "num_tokens": 1469920.0, + "reward": 0.7449951171875, + "reward_std": 0.01563861407339573, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.05957742780447006, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0342, + "grad_norm": 0.568260133266449, + "kl": 0.010022971488069743, + "learning_rate": 4.992753105783194e-06, + "loss": 0.0004, + "num_tokens": 1478560.0, + "reward": 0.718017578125, + "reward_std": 0.01765732280910015, + "rewards//mean": 0.718017578125, + "rewards//std": 0.058326005935668945, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0344, + "grad_norm": 0.6112688183784485, + "kl": 0.011806668480858207, + "learning_rate": 4.992631880567301e-06, + "loss": 0.0005, + "num_tokens": 1487232.0, + "reward": 0.68988037109375, + "reward_std": 0.014870746061205864, + "rewards//mean": 0.68988037109375, + "rewards//std": 0.060817260295152664, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0346, + "grad_norm": 0.5378821492195129, + "kl": 0.01460516860242933, + "learning_rate": 4.992509651319585e-06, + "loss": 0.0006, + "num_tokens": 1495848.0, + "reward": 0.71533203125, + "reward_std": 0.01423371210694313, + "rewards//mean": 0.71533203125, + "rewards//std": 0.07337196916341782, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0348, + "grad_norm": 0.6649944186210632, + "kl": 0.014010886079631746, + "learning_rate": 4.992386418089279e-06, + "loss": 0.0006, + "num_tokens": 1504680.0, + "reward": 0.7298583984375, + "reward_std": 0.012547630816698074, + "rewards//mean": 0.7298583984375, + "rewards//std": 0.052616409957408905, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.035, + "grad_norm": 0.5258428454399109, + "kl": 0.014397241990081966, + "learning_rate": 4.992262180926022e-06, + "loss": 0.0006, + "num_tokens": 1513360.0, + "reward": 0.706787109375, + "reward_std": 0.015542633831501007, + "rewards//mean": 0.706787109375, + "rewards//std": 0.06885714083909988, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0352, + "grad_norm": 0.4891396164894104, + "kl": 0.012740209785988554, + "learning_rate": 4.992136939879857e-06, + "loss": 0.0005, + "num_tokens": 1522112.0, + "reward": 0.73828125, + "reward_std": 0.01267999317497015, + "rewards//mean": 0.73828125, + "rewards//std": 0.04550643637776375, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0354, + "grad_norm": 0.49961039423942566, + "kl": 0.01649987616110593, + "learning_rate": 4.992010695001229e-06, + "loss": 0.0007, + "num_tokens": 1530848.0, + "reward": 0.7008056640625, + "reward_std": 0.012512251734733582, + "rewards//mean": 0.7008056640625, + "rewards//std": 0.06246790289878845, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0356, + "grad_norm": 0.8701191544532776, + "kl": 0.019287034403532743, + "learning_rate": 4.9918834463409925e-06, + "loss": 0.0008, + "num_tokens": 1539528.0, + "reward": 0.70989990234375, + "reward_std": 0.016380542889237404, + "rewards//mean": 0.70989990234375, + "rewards//std": 0.042210932821035385, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0358, + "grad_norm": 0.6032683253288269, + "kl": 0.014312484301626682, + "learning_rate": 4.991755193950401e-06, + "loss": 0.0006, + "num_tokens": 1548192.0, + "reward": 0.70318603515625, + "reward_std": 0.011435788124799728, + "rewards//mean": 0.70318603515625, + "rewards//std": 0.054831214249134064, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.036, + "grad_norm": 0.6240742802619934, + "kl": 0.019559299282263964, + "learning_rate": 4.991625937881117e-06, + "loss": 0.0008, + "num_tokens": 1556856.0, + "reward": 0.733642578125, + "reward_std": 0.015770340338349342, + "rewards//mean": 0.733642578125, + "rewards//std": 0.05675596743822098, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0362, + "grad_norm": 0.6242550611495972, + "kl": 0.014271960069891065, + "learning_rate": 4.991495678185202e-06, + "loss": 0.0006, + "num_tokens": 1565488.0, + "reward": 0.7296142578125, + "reward_std": 0.012346789240837097, + "rewards//mean": 0.7296142578125, + "rewards//std": 0.07252856343984604, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0364, + "grad_norm": 0.6350885629653931, + "kl": 0.018139246094506234, + "learning_rate": 4.991364414915126e-06, + "loss": 0.0007, + "num_tokens": 1574184.0, + "reward": 0.72760009765625, + "reward_std": 0.016285665333271027, + "rewards//mean": 0.72760009765625, + "rewards//std": 0.06219015270471573, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0366, + "grad_norm": 0.640499472618103, + "kl": 0.01825080724665895, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.0007, + "num_tokens": 1582888.0, + "reward": 0.72503662109375, + "reward_std": 0.010994457639753819, + "rewards//mean": 0.72503662109375, + "rewards//std": 0.03939354792237282, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0368, + "grad_norm": 0.5733298659324646, + "kl": 0.022698667307849973, + "learning_rate": 4.991098877864386e-06, + "loss": 0.0009, + "num_tokens": 1591408.0, + "reward": 0.7388916015625, + "reward_std": 0.016481254249811172, + "rewards//mean": 0.7388916015625, + "rewards//std": 0.043859802186489105, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.037, + "grad_norm": 0.54179447889328, + "kl": 0.01430770373553969, + "learning_rate": 4.99096460419068e-06, + "loss": 0.0006, + "num_tokens": 1600088.0, + "reward": 0.76007080078125, + "reward_std": 0.018877511844038963, + "rewards//mean": 0.76007080078125, + "rewards//std": 0.04165972024202347, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0372, + "grad_norm": 0.7062174677848816, + "kl": 0.027250012615695596, + "learning_rate": 4.990829327156729e-06, + "loss": 0.0011, + "num_tokens": 1608712.0, + "reward": 0.715576171875, + "reward_std": 0.012214237824082375, + "rewards//mean": 0.715576171875, + "rewards//std": 0.052231352776288986, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0374, + "grad_norm": 0.6777660846710205, + "kl": 0.02611252712085843, + "learning_rate": 4.990693046817023e-06, + "loss": 0.001, + "num_tokens": 1617400.0, + "reward": 0.7432861328125, + "reward_std": 0.01230183057487011, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.034481290727853775, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0376, + "grad_norm": 0.557637631893158, + "kl": 0.018429984629619867, + "learning_rate": 4.990555763226456e-06, + "loss": 0.0007, + "num_tokens": 1625992.0, + "reward": 0.74969482421875, + "reward_std": 0.01685107871890068, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.045630306005477905, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0378, + "grad_norm": 0.5212416648864746, + "kl": 0.017173759290017188, + "learning_rate": 4.990417476440326e-06, + "loss": 0.0007, + "num_tokens": 1634576.0, + "reward": 0.70892333984375, + "reward_std": 0.014218729920685291, + "rewards//mean": 0.70892333984375, + "rewards//std": 0.0681646466255188, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.038, + "grad_norm": 0.5165665149688721, + "kl": 0.015734603686723858, + "learning_rate": 4.9902781865143326e-06, + "loss": 0.0006, + "num_tokens": 1643200.0, + "reward": 0.733154296875, + "reward_std": 0.014683052897453308, + "rewards//mean": 0.733154296875, + "rewards//std": 0.047851089388132095, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0382, + "grad_norm": 0.683603823184967, + "kl": 0.0229703092481941, + "learning_rate": 4.990137893504585e-06, + "loss": 0.0009, + "num_tokens": 1651848.0, + "reward": 0.70361328125, + "reward_std": 0.014246530830860138, + "rewards//mean": 0.70361328125, + "rewards//std": 0.047676779329776764, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0384, + "grad_norm": 0.5361626148223877, + "kl": 0.02232282201293856, + "learning_rate": 4.989996597467591e-06, + "loss": 0.0009, + "num_tokens": 1660472.0, + "reward": 0.721923828125, + "reward_std": 0.020648304373025894, + "rewards//mean": 0.721923828125, + "rewards//std": 0.050515249371528625, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0386, + "grad_norm": 0.6549078822135925, + "kl": 0.028085972415283322, + "learning_rate": 4.989854298460265e-06, + "loss": 0.0011, + "num_tokens": 1669128.0, + "reward": 0.73724365234375, + "reward_std": 0.015218530781567097, + "rewards//mean": 0.73724365234375, + "rewards//std": 0.043220214545726776, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0388, + "grad_norm": 0.6959535479545593, + "kl": 0.03151222656015307, + "learning_rate": 4.989710996539926e-06, + "loss": 0.0013, + "num_tokens": 1677784.0, + "reward": 0.748291015625, + "reward_std": 0.014268080703914165, + "rewards//mean": 0.748291015625, + "rewards//std": 0.0479370579123497, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.039, + "grad_norm": 0.6917723417282104, + "kl": 0.030924884602427483, + "learning_rate": 4.989566691764296e-06, + "loss": 0.0012, + "num_tokens": 1686448.0, + "reward": 0.7283935546875, + "reward_std": 0.013681046664714813, + "rewards//mean": 0.7283935546875, + "rewards//std": 0.0343722440302372, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0392, + "grad_norm": 0.7647262811660767, + "kl": 0.02689244132488966, + "learning_rate": 4.9894213841914994e-06, + "loss": 0.0011, + "num_tokens": 1695072.0, + "reward": 0.695068359375, + "reward_std": 0.014594485983252525, + "rewards//mean": 0.695068359375, + "rewards//std": 0.037756409496068954, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0394, + "grad_norm": 0.7618040442466736, + "kl": 0.03057428600732237, + "learning_rate": 4.989275073880067e-06, + "loss": 0.0012, + "num_tokens": 1703680.0, + "reward": 0.7230224609375, + "reward_std": 0.018142350018024445, + "rewards//mean": 0.7230224609375, + "rewards//std": 0.05768650770187378, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0396, + "grad_norm": 0.6719552278518677, + "kl": 0.040109737077727914, + "learning_rate": 4.989127760888932e-06, + "loss": 0.0016, + "num_tokens": 1712304.0, + "reward": 0.721923828125, + "reward_std": 0.012543957680463791, + "rewards//mean": 0.721923828125, + "rewards//std": 0.050654102116823196, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0398, + "grad_norm": 0.6652910709381104, + "kl": 0.02444028906757012, + "learning_rate": 4.988979445277433e-06, + "loss": 0.001, + "num_tokens": 1720936.0, + "reward": 0.70831298828125, + "reward_std": 0.01572088897228241, + "rewards//mean": 0.70831298828125, + "rewards//std": 0.03865145146846771, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.04, + "grad_norm": 0.5563658475875854, + "kl": 0.021938784746453166, + "learning_rate": 4.988830127105312e-06, + "loss": 0.0009, + "num_tokens": 1729552.0, + "reward": 0.71014404296875, + "reward_std": 0.01828540489077568, + "rewards//mean": 0.71014404296875, + "rewards//std": 0.038920748978853226, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0402, + "grad_norm": 0.621466338634491, + "kl": 0.04022008215542883, + "learning_rate": 4.988679806432712e-06, + "loss": 0.0016, + "num_tokens": 1738184.0, + "reward": 0.71197509765625, + "reward_std": 0.017939604818820953, + "rewards//mean": 0.71197509765625, + "rewards//std": 0.047174979001283646, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0404, + "grad_norm": 0.7120692729949951, + "kl": 0.03540224314201623, + "learning_rate": 4.988528483320184e-06, + "loss": 0.0014, + "num_tokens": 1746792.0, + "reward": 0.72320556640625, + "reward_std": 0.016589093953371048, + "rewards//mean": 0.72320556640625, + "rewards//std": 0.04446118324995041, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0406, + "grad_norm": 0.6485600471496582, + "kl": 0.02893989998847246, + "learning_rate": 4.9883761578286805e-06, + "loss": 0.0012, + "num_tokens": 1755408.0, + "reward": 0.73529052734375, + "reward_std": 0.015300112776458263, + "rewards//mean": 0.73529052734375, + "rewards//std": 0.038799602538347244, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0408, + "grad_norm": 0.5922731757164001, + "kl": 0.034151540603488684, + "learning_rate": 4.988222830019559e-06, + "loss": 0.0014, + "num_tokens": 1764008.0, + "reward": 0.73095703125, + "reward_std": 0.013814757578074932, + "rewards//mean": 0.73095703125, + "rewards//std": 0.046106573194265366, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.041, + "grad_norm": 0.5785369277000427, + "kl": 0.045372437103651464, + "learning_rate": 4.988068499954578e-06, + "loss": 0.0018, + "num_tokens": 1772688.0, + "reward": 0.73089599609375, + "reward_std": 0.017161503434181213, + "rewards//mean": 0.73089599609375, + "rewards//std": 0.057122811675071716, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0412, + "grad_norm": 0.6962729096412659, + "kl": 0.043618743075057864, + "learning_rate": 4.987913167695904e-06, + "loss": 0.0017, + "num_tokens": 1781256.0, + "reward": 0.7265625, + "reward_std": 0.016600284725427628, + "rewards//mean": 0.7265625, + "rewards//std": 0.045094750821590424, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0414, + "grad_norm": 0.655792772769928, + "kl": 0.030919409124180675, + "learning_rate": 4.987756833306103e-06, + "loss": 0.0012, + "num_tokens": 1789976.0, + "reward": 0.742431640625, + "reward_std": 0.016554612666368484, + "rewards//mean": 0.742431640625, + "rewards//std": 0.06263887137174606, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0416, + "grad_norm": 0.7749574184417725, + "kl": 0.0421929273288697, + "learning_rate": 4.987599496848147e-06, + "loss": 0.0017, + "num_tokens": 1798664.0, + "reward": 0.72479248046875, + "reward_std": 0.014618289656937122, + "rewards//mean": 0.72479248046875, + "rewards//std": 0.05295005068182945, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0418, + "grad_norm": 0.759087324142456, + "kl": 0.04364440473727882, + "learning_rate": 4.987441158385411e-06, + "loss": 0.0017, + "num_tokens": 1807424.0, + "reward": 0.68939208984375, + "reward_std": 0.012192411348223686, + "rewards//mean": 0.68939208984375, + "rewards//std": 0.06599829345941544, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.042, + "grad_norm": 0.6689143776893616, + "kl": 0.05778496875427663, + "learning_rate": 4.987281817981674e-06, + "loss": 0.0023, + "num_tokens": 1816088.0, + "reward": 0.72271728515625, + "reward_std": 0.014015286229550838, + "rewards//mean": 0.72271728515625, + "rewards//std": 0.049290310591459274, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0422, + "grad_norm": 0.666499674320221, + "kl": 0.043112656101584435, + "learning_rate": 4.987121475701118e-06, + "loss": 0.0017, + "num_tokens": 1824672.0, + "reward": 0.69085693359375, + "reward_std": 0.013759355992078781, + "rewards//mean": 0.69085693359375, + "rewards//std": 0.06431391090154648, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0424, + "grad_norm": 0.6885657906532288, + "kl": 0.039523816551081836, + "learning_rate": 4.986960131608329e-06, + "loss": 0.0016, + "num_tokens": 1833248.0, + "reward": 0.76361083984375, + "reward_std": 0.021125439554452896, + "rewards//mean": 0.76361083984375, + "rewards//std": 0.03490383177995682, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0426, + "grad_norm": 0.7056341171264648, + "kl": 0.03723469818942249, + "learning_rate": 4.986797785768296e-06, + "loss": 0.0015, + "num_tokens": 1841808.0, + "reward": 0.73406982421875, + "reward_std": 0.013552498072385788, + "rewards//mean": 0.73406982421875, + "rewards//std": 0.058044735342264175, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0428, + "grad_norm": 0.5871314406394958, + "kl": 0.03060774417826906, + "learning_rate": 4.986634438246413e-06, + "loss": 0.0012, + "num_tokens": 1850392.0, + "reward": 0.7166748046875, + "reward_std": 0.018129628151655197, + "rewards//mean": 0.7166748046875, + "rewards//std": 0.05835960432887077, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.043, + "grad_norm": 0.556708812713623, + "kl": 0.02814829646376893, + "learning_rate": 4.986470089108476e-06, + "loss": 0.0011, + "num_tokens": 1859016.0, + "reward": 0.7529296875, + "reward_std": 0.017662834376096725, + "rewards//mean": 0.7529296875, + "rewards//std": 0.045255593955516815, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0432, + "grad_norm": 0.771795928478241, + "kl": 0.062261566519737244, + "learning_rate": 4.986304738420684e-06, + "loss": 0.0025, + "num_tokens": 1867616.0, + "reward": 0.7291259765625, + "reward_std": 0.009812846779823303, + "rewards//mean": 0.7291259765625, + "rewards//std": 0.02753545716404915, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0434, + "grad_norm": 0.6160389184951782, + "kl": 0.04629370174370706, + "learning_rate": 4.986138386249641e-06, + "loss": 0.0019, + "num_tokens": 1876296.0, + "reward": 0.7247314453125, + "reward_std": 0.018973447382450104, + "rewards//mean": 0.7247314453125, + "rewards//std": 0.04735804721713066, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0436, + "grad_norm": 0.6583981513977051, + "kl": 0.04990657512098551, + "learning_rate": 4.985971032662352e-06, + "loss": 0.002, + "num_tokens": 1884904.0, + "reward": 0.7359619140625, + "reward_std": 0.017147067934274673, + "rewards//mean": 0.7359619140625, + "rewards//std": 0.04669753462076187, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0438, + "grad_norm": 0.8134801983833313, + "kl": 0.0423719760729, + "learning_rate": 4.98580267772623e-06, + "loss": 0.0017, + "num_tokens": 1893616.0, + "reward": 0.72003173828125, + "reward_std": 0.016933666542172432, + "rewards//mean": 0.72003173828125, + "rewards//std": 0.06031915172934532, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.044, + "grad_norm": 0.6741657853126526, + "kl": 0.03965398599393666, + "learning_rate": 4.985633321509086e-06, + "loss": 0.0016, + "num_tokens": 1902160.0, + "reward": 0.68719482421875, + "reward_std": 0.015195935033261776, + "rewards//mean": 0.68719482421875, + "rewards//std": 0.05944652855396271, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0442, + "grad_norm": 0.9493129849433899, + "kl": 0.046657787868753076, + "learning_rate": 4.985462964079137e-06, + "loss": 0.0019, + "num_tokens": 1910880.0, + "reward": 0.71942138671875, + "reward_std": 0.016195297241210938, + "rewards//mean": 0.71942138671875, + "rewards//std": 0.05246121808886528, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0444, + "grad_norm": 0.7087270021438599, + "kl": 0.04575832257978618, + "learning_rate": 4.985291605505004e-06, + "loss": 0.0018, + "num_tokens": 1919640.0, + "reward": 0.702392578125, + "reward_std": 0.0167884211987257, + "rewards//mean": 0.702392578125, + "rewards//std": 0.05223599076271057, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0446, + "grad_norm": 0.6752747297286987, + "kl": 0.04726569773629308, + "learning_rate": 4.9851192458557084e-06, + "loss": 0.0019, + "num_tokens": 1928296.0, + "reward": 0.71807861328125, + "reward_std": 0.01220088079571724, + "rewards//mean": 0.71807861328125, + "rewards//std": 0.037438686937093735, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0448, + "grad_norm": 0.7452352046966553, + "kl": 0.05152314284350723, + "learning_rate": 4.984945885200679e-06, + "loss": 0.0021, + "num_tokens": 1937088.0, + "reward": 0.74041748046875, + "reward_std": 0.013437759131193161, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.040072083473205566, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.045, + "grad_norm": 0.6877680420875549, + "kl": 0.051004784647375345, + "learning_rate": 4.984771523609744e-06, + "loss": 0.002, + "num_tokens": 1945688.0, + "reward": 0.73846435546875, + "reward_std": 0.016483765095472336, + "rewards//mean": 0.73846435546875, + "rewards//std": 0.0488903634250164, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0452, + "grad_norm": 0.7118944525718689, + "kl": 0.05141365760937333, + "learning_rate": 4.9845961611531356e-06, + "loss": 0.0021, + "num_tokens": 1954320.0, + "reward": 0.74041748046875, + "reward_std": 0.015977714210748672, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.05143725872039795, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0454, + "grad_norm": 0.9068376421928406, + "kl": 0.05422702501527965, + "learning_rate": 4.984419797901491e-06, + "loss": 0.0022, + "num_tokens": 1962944.0, + "reward": 0.71044921875, + "reward_std": 0.01507932785898447, + "rewards//mean": 0.71044921875, + "rewards//std": 0.036301881074905396, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0456, + "grad_norm": 0.7399264574050903, + "kl": 0.05034496798180044, + "learning_rate": 4.984242433925849e-06, + "loss": 0.002, + "num_tokens": 1971624.0, + "reward": 0.75054931640625, + "reward_std": 0.015884902328252792, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.06389451771974564, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0458, + "grad_norm": 0.7348447442054749, + "kl": 0.049029057379812, + "learning_rate": 4.984064069297652e-06, + "loss": 0.002, + "num_tokens": 1980280.0, + "reward": 0.697021484375, + "reward_std": 0.015012217685580254, + "rewards//mean": 0.697021484375, + "rewards//std": 0.056743159890174866, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.046, + "grad_norm": 0.7991431355476379, + "kl": 0.07105646608397365, + "learning_rate": 4.983884704088745e-06, + "loss": 0.0028, + "num_tokens": 1988880.0, + "reward": 0.695068359375, + "reward_std": 0.01788986101746559, + "rewards//mean": 0.695068359375, + "rewards//std": 0.06029834970831871, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0462, + "grad_norm": 0.6498074531555176, + "kl": 0.05387417250312865, + "learning_rate": 4.983704338371375e-06, + "loss": 0.0022, + "num_tokens": 1997536.0, + "reward": 0.738037109375, + "reward_std": 0.012314668856561184, + "rewards//mean": 0.738037109375, + "rewards//std": 0.05002864450216293, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0464, + "grad_norm": 0.6092854142189026, + "kl": 0.04524057498201728, + "learning_rate": 4.983522972218196e-06, + "loss": 0.0018, + "num_tokens": 2006280.0, + "reward": 0.7039794921875, + "reward_std": 0.013910917565226555, + "rewards//mean": 0.7039794921875, + "rewards//std": 0.04475763440132141, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0466, + "grad_norm": 0.5306748747825623, + "kl": 0.047080037416890264, + "learning_rate": 4.983340605702261e-06, + "loss": 0.0019, + "num_tokens": 2015040.0, + "reward": 0.7469482421875, + "reward_std": 0.011357417330145836, + "rewards//mean": 0.7469482421875, + "rewards//std": 0.03777945414185524, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0468, + "grad_norm": 0.768358588218689, + "kl": 0.03969805908855051, + "learning_rate": 4.983157238897026e-06, + "loss": 0.0016, + "num_tokens": 2023640.0, + "reward": 0.7327880859375, + "reward_std": 0.015326134860515594, + "rewards//mean": 0.7327880859375, + "rewards//std": 0.04921766370534897, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.047, + "grad_norm": 0.6886104941368103, + "kl": 0.05683159315958619, + "learning_rate": 4.982972871876353e-06, + "loss": 0.0023, + "num_tokens": 2032216.0, + "reward": 0.7303466796875, + "reward_std": 0.010093173943459988, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.046449214220047, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0472, + "grad_norm": 0.5981897711753845, + "kl": 0.04994144011288881, + "learning_rate": 4.982787504714503e-06, + "loss": 0.002, + "num_tokens": 2040856.0, + "reward": 0.739501953125, + "reward_std": 0.018706027418375015, + "rewards//mean": 0.739501953125, + "rewards//std": 0.05437158793210983, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0474, + "grad_norm": 0.7223436832427979, + "kl": 0.06479091383516788, + "learning_rate": 4.982601137486144e-06, + "loss": 0.0026, + "num_tokens": 2049408.0, + "reward": 0.71697998046875, + "reward_std": 0.014704955741763115, + "rewards//mean": 0.71697998046875, + "rewards//std": 0.0494886115193367, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0476, + "grad_norm": 0.6980536580085754, + "kl": 0.04527810198487714, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.0018, + "num_tokens": 2058016.0, + "reward": 0.71221923828125, + "reward_std": 0.01778585836291313, + "rewards//mean": 0.71221923828125, + "rewards//std": 0.05478951334953308, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0478, + "grad_norm": 1.1460226774215698, + "kl": 0.06317855324596167, + "learning_rate": 4.982225403130572e-06, + "loss": 0.0025, + "num_tokens": 2066648.0, + "reward": 0.73974609375, + "reward_std": 0.01631517894566059, + "rewards//mean": 0.73974609375, + "rewards//std": 0.03925115987658501, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.048, + "grad_norm": 0.9314346313476562, + "kl": 0.0637447414919734, + "learning_rate": 4.982036036154706e-06, + "loss": 0.0025, + "num_tokens": 2075336.0, + "reward": 0.71893310546875, + "reward_std": 0.013403713703155518, + "rewards//mean": 0.71893310546875, + "rewards//std": 0.05558551847934723, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0482, + "grad_norm": 0.6399586200714111, + "kl": 0.04521946748718619, + "learning_rate": 4.981845669415022e-06, + "loss": 0.0018, + "num_tokens": 2083952.0, + "reward": 0.710693359375, + "reward_std": 0.017721328884363174, + "rewards//mean": 0.710693359375, + "rewards//std": 0.032037414610385895, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0484, + "grad_norm": 0.7257511019706726, + "kl": 0.04954640497453511, + "learning_rate": 4.981654302988198e-06, + "loss": 0.002, + "num_tokens": 2092624.0, + "reward": 0.756591796875, + "reward_std": 0.014372417703270912, + "rewards//mean": 0.756591796875, + "rewards//std": 0.04212266206741333, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0486, + "grad_norm": 0.7490444183349609, + "kl": 0.06151878600940108, + "learning_rate": 4.9814619369513184e-06, + "loss": 0.0025, + "num_tokens": 2101160.0, + "reward": 0.7039794921875, + "reward_std": 0.01579831726849079, + "rewards//mean": 0.7039794921875, + "rewards//std": 0.06837564706802368, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0488, + "grad_norm": 0.7180415391921997, + "kl": 0.041042948490940034, + "learning_rate": 4.981268571381867e-06, + "loss": 0.0016, + "num_tokens": 2109792.0, + "reward": 0.732421875, + "reward_std": 0.015042596496641636, + "rewards//mean": 0.732421875, + "rewards//std": 0.04142765328288078, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.049, + "grad_norm": 0.7131726145744324, + "kl": 0.04773534648120403, + "learning_rate": 4.981074206357732e-06, + "loss": 0.0019, + "num_tokens": 2118552.0, + "reward": 0.7369384765625, + "reward_std": 0.011528298258781433, + "rewards//mean": 0.7369384765625, + "rewards//std": 0.04968539997935295, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0492, + "grad_norm": 0.775935709476471, + "kl": 0.0707710012793541, + "learning_rate": 4.980878841957203e-06, + "loss": 0.0028, + "num_tokens": 2127248.0, + "reward": 0.69036865234375, + "reward_std": 0.012703044340014458, + "rewards//mean": 0.69036865234375, + "rewards//std": 0.05450669303536415, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0494, + "grad_norm": 0.7658054232597351, + "kl": 0.0640805927105248, + "learning_rate": 4.980682478258973e-06, + "loss": 0.0026, + "num_tokens": 2135896.0, + "reward": 0.72894287109375, + "reward_std": 0.01790812611579895, + "rewards//mean": 0.72894287109375, + "rewards//std": 0.05467694625258446, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0496, + "grad_norm": 0.5831694006919861, + "kl": 0.04178021801635623, + "learning_rate": 4.980485115342138e-06, + "loss": 0.0017, + "num_tokens": 2144536.0, + "reward": 0.70562744140625, + "reward_std": 0.019685041159391403, + "rewards//mean": 0.70562744140625, + "rewards//std": 0.06453383713960648, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0498, + "grad_norm": 0.686213493347168, + "kl": 0.05178135330788791, + "learning_rate": 4.980286753286196e-06, + "loss": 0.0021, + "num_tokens": 2153104.0, + "reward": 0.73675537109375, + "reward_std": 0.015449654310941696, + "rewards//mean": 0.73675537109375, + "rewards//std": 0.022234952077269554, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.05, + "grad_norm": 0.5769968032836914, + "kl": 0.04894346767105162, + "learning_rate": 4.980087392171045e-06, + "loss": 0.002, + "num_tokens": 2161736.0, + "reward": 0.7308349609375, + "reward_std": 0.009863736107945442, + "rewards//mean": 0.7308349609375, + "rewards//std": 0.04522731900215149, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0502, + "grad_norm": 0.6053898930549622, + "kl": 0.04904163209721446, + "learning_rate": 4.9798870320769884e-06, + "loss": 0.002, + "num_tokens": 2170344.0, + "reward": 0.71343994140625, + "reward_std": 0.011166264303028584, + "rewards//mean": 0.71343994140625, + "rewards//std": 0.05631681904196739, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0504, + "grad_norm": 0.7947268486022949, + "kl": 0.0519161622505635, + "learning_rate": 4.979685673084733e-06, + "loss": 0.0021, + "num_tokens": 2179056.0, + "reward": 0.77587890625, + "reward_std": 0.014476396143436432, + "rewards//mean": 0.77587890625, + "rewards//std": 0.032057251781225204, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0506, + "grad_norm": 0.693539559841156, + "kl": 0.05135406623594463, + "learning_rate": 4.979483315275385e-06, + "loss": 0.0021, + "num_tokens": 2187640.0, + "reward": 0.70477294921875, + "reward_std": 0.012753374874591827, + "rewards//mean": 0.70477294921875, + "rewards//std": 0.05521116405725479, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0508, + "grad_norm": 0.6642072796821594, + "kl": 0.07001969963312149, + "learning_rate": 4.979279958730454e-06, + "loss": 0.0028, + "num_tokens": 2196336.0, + "reward": 0.74652099609375, + "reward_std": 0.012642599642276764, + "rewards//mean": 0.74652099609375, + "rewards//std": 0.036172330379486084, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.051, + "grad_norm": 0.7698396444320679, + "kl": 0.05689065787009895, + "learning_rate": 4.979075603531852e-06, + "loss": 0.0023, + "num_tokens": 2205024.0, + "reward": 0.7344970703125, + "reward_std": 0.014306074939668179, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.04307273030281067, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0512, + "grad_norm": 0.6658589839935303, + "kl": 0.04703840473666787, + "learning_rate": 4.978870249761893e-06, + "loss": 0.0019, + "num_tokens": 2213608.0, + "reward": 0.74749755859375, + "reward_std": 0.01808979921042919, + "rewards//mean": 0.74749755859375, + "rewards//std": 0.06493417918682098, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0514, + "grad_norm": 0.5617851614952087, + "kl": 0.04335748380981386, + "learning_rate": 4.978663897503294e-06, + "loss": 0.0017, + "num_tokens": 2222264.0, + "reward": 0.76544189453125, + "reward_std": 0.01435675285756588, + "rewards//mean": 0.76544189453125, + "rewards//std": 0.05317598953843117, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0516, + "grad_norm": 0.7687885761260986, + "kl": 0.06642695516347885, + "learning_rate": 4.978456546839175e-06, + "loss": 0.0027, + "num_tokens": 2230880.0, + "reward": 0.71099853515625, + "reward_std": 0.01172902062535286, + "rewards//mean": 0.71099853515625, + "rewards//std": 0.045694950968027115, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0518, + "grad_norm": 0.747726559638977, + "kl": 0.05708181764930487, + "learning_rate": 4.978248197853053e-06, + "loss": 0.0023, + "num_tokens": 2239608.0, + "reward": 0.76300048828125, + "reward_std": 0.013816908933222294, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.04265507683157921, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.052, + "grad_norm": 0.7513110041618347, + "kl": 0.06573500973172486, + "learning_rate": 4.978038850628855e-06, + "loss": 0.0026, + "num_tokens": 2248296.0, + "reward": 0.73321533203125, + "reward_std": 0.012971704825758934, + "rewards//mean": 0.73321533203125, + "rewards//std": 0.03936932981014252, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0522, + "grad_norm": 0.8820503354072571, + "kl": 0.08888540370389819, + "learning_rate": 4.977828505250903e-06, + "loss": 0.0036, + "num_tokens": 2256976.0, + "reward": 0.71124267578125, + "reward_std": 0.015261702239513397, + "rewards//mean": 0.71124267578125, + "rewards//std": 0.048025041818618774, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0524, + "grad_norm": 0.5689162611961365, + "kl": 0.058710358338430524, + "learning_rate": 4.977617161803927e-06, + "loss": 0.0023, + "num_tokens": 2265672.0, + "reward": 0.766357421875, + "reward_std": 0.013638041913509369, + "rewards//mean": 0.766357421875, + "rewards//std": 0.047856152057647705, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0526, + "grad_norm": 0.7432250380516052, + "kl": 0.05571276368573308, + "learning_rate": 4.977404820373053e-06, + "loss": 0.0022, + "num_tokens": 2274336.0, + "reward": 0.75738525390625, + "reward_std": 0.012658031657338142, + "rewards//mean": 0.75738525390625, + "rewards//std": 0.04126431792974472, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0528, + "grad_norm": 0.5365133881568909, + "kl": 0.06589075829833746, + "learning_rate": 4.977191481043814e-06, + "loss": 0.0026, + "num_tokens": 2282984.0, + "reward": 0.7647705078125, + "reward_std": 0.012238910421729088, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.036798667162656784, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.053, + "grad_norm": 0.7748925089836121, + "kl": 0.06631747842766345, + "learning_rate": 4.976977143902143e-06, + "loss": 0.0027, + "num_tokens": 2291520.0, + "reward": 0.71173095703125, + "reward_std": 0.01299564354121685, + "rewards//mean": 0.71173095703125, + "rewards//std": 0.05950303003191948, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0532, + "grad_norm": 0.6161185503005981, + "kl": 0.07089789980091155, + "learning_rate": 4.976761809034375e-06, + "loss": 0.0028, + "num_tokens": 2300080.0, + "reward": 0.71710205078125, + "reward_std": 0.015387684106826782, + "rewards//mean": 0.71710205078125, + "rewards//std": 0.03893085941672325, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0534, + "grad_norm": 0.91294926404953, + "kl": 0.10131165734492242, + "learning_rate": 4.976545476527246e-06, + "loss": 0.0041, + "num_tokens": 2308656.0, + "reward": 0.72802734375, + "reward_std": 0.01398141123354435, + "rewards//mean": 0.72802734375, + "rewards//std": 0.042606472969055176, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0536, + "grad_norm": 0.7760770320892334, + "kl": 0.07038976019248366, + "learning_rate": 4.976328146467895e-06, + "loss": 0.0028, + "num_tokens": 2317368.0, + "reward": 0.75408935546875, + "reward_std": 0.017939291894435883, + "rewards//mean": 0.75408935546875, + "rewards//std": 0.04169821739196777, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0538, + "grad_norm": 0.7210671305656433, + "kl": 0.07773464918136597, + "learning_rate": 4.976109818943863e-06, + "loss": 0.0031, + "num_tokens": 2325992.0, + "reward": 0.7349853515625, + "reward_std": 0.013170282356441021, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.03823667764663696, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.054, + "grad_norm": 0.7028035521507263, + "kl": 0.0827886825427413, + "learning_rate": 4.975890494043092e-06, + "loss": 0.0033, + "num_tokens": 2334616.0, + "reward": 0.75750732421875, + "reward_std": 0.013806039467453957, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.03447311371564865, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0542, + "grad_norm": 0.6302931904792786, + "kl": 0.07922178274020553, + "learning_rate": 4.975670171853926e-06, + "loss": 0.0032, + "num_tokens": 2343264.0, + "reward": 0.7408447265625, + "reward_std": 0.01199992373585701, + "rewards//mean": 0.7408447265625, + "rewards//std": 0.03118186816573143, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0544, + "grad_norm": 0.5366842746734619, + "kl": 0.053182843839749694, + "learning_rate": 4.975448852465111e-06, + "loss": 0.0021, + "num_tokens": 2351936.0, + "reward": 0.73406982421875, + "reward_std": 0.012117596343159676, + "rewards//mean": 0.73406982421875, + "rewards//std": 0.05836515128612518, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0546, + "grad_norm": 0.7956821918487549, + "kl": 0.10613921284675598, + "learning_rate": 4.975226535965795e-06, + "loss": 0.0042, + "num_tokens": 2360560.0, + "reward": 0.73834228515625, + "reward_std": 0.013946297578513622, + "rewards//mean": 0.73834228515625, + "rewards//std": 0.045477114617824554, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0548, + "grad_norm": 0.825093686580658, + "kl": 0.06953645718749613, + "learning_rate": 4.975003222445525e-06, + "loss": 0.0028, + "num_tokens": 2369160.0, + "reward": 0.69384765625, + "reward_std": 0.0194076094776392, + "rewards//mean": 0.69384765625, + "rewards//std": 0.048462796956300735, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.055, + "grad_norm": 0.7195761799812317, + "kl": 0.06491784797981381, + "learning_rate": 4.974778911994254e-06, + "loss": 0.0026, + "num_tokens": 2377840.0, + "reward": 0.7286376953125, + "reward_std": 0.015999507158994675, + "rewards//mean": 0.7286376953125, + "rewards//std": 0.04492238163948059, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0552, + "grad_norm": 0.6829413175582886, + "kl": 0.06265947036445141, + "learning_rate": 4.974553604702332e-06, + "loss": 0.0025, + "num_tokens": 2386504.0, + "reward": 0.74267578125, + "reward_std": 0.016472984105348587, + "rewards//mean": 0.74267578125, + "rewards//std": 0.03249996155500412, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0554, + "grad_norm": 0.8279444575309753, + "kl": 0.109421216417104, + "learning_rate": 4.974327300660515e-06, + "loss": 0.0044, + "num_tokens": 2395128.0, + "reward": 0.74176025390625, + "reward_std": 0.013287878595292568, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.03989148512482643, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0556, + "grad_norm": 0.8692975640296936, + "kl": 0.13027487508952618, + "learning_rate": 4.974099999959957e-06, + "loss": 0.0052, + "num_tokens": 2403824.0, + "reward": 0.71258544921875, + "reward_std": 0.01673486828804016, + "rewards//mean": 0.71258544921875, + "rewards//std": 0.045876458287239075, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0558, + "grad_norm": 0.6421030759811401, + "kl": 0.075614667031914, + "learning_rate": 4.973871702692215e-06, + "loss": 0.003, + "num_tokens": 2412440.0, + "reward": 0.72467041015625, + "reward_std": 0.015497658401727676, + "rewards//mean": 0.72467041015625, + "rewards//std": 0.04439099133014679, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.056, + "grad_norm": 0.7968896627426147, + "kl": 0.08366691786795855, + "learning_rate": 4.973642408949247e-06, + "loss": 0.0033, + "num_tokens": 2421072.0, + "reward": 0.693115234375, + "reward_std": 0.013288882561028004, + "rewards//mean": 0.693115234375, + "rewards//std": 0.04501882195472717, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0562, + "grad_norm": 0.8614963889122009, + "kl": 0.10154542187228799, + "learning_rate": 4.9734121188234115e-06, + "loss": 0.0041, + "num_tokens": 2429736.0, + "reward": 0.72857666015625, + "reward_std": 0.013583678752183914, + "rewards//mean": 0.72857666015625, + "rewards//std": 0.04098678007721901, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0564, + "grad_norm": 1.0249260663986206, + "kl": 0.10393307078629732, + "learning_rate": 4.973180832407471e-06, + "loss": 0.0042, + "num_tokens": 2438336.0, + "reward": 0.72576904296875, + "reward_std": 0.01790856570005417, + "rewards//mean": 0.72576904296875, + "rewards//std": 0.03053005412220955, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0566, + "grad_norm": 0.8311775922775269, + "kl": 0.11601218301802874, + "learning_rate": 4.972948549794587e-06, + "loss": 0.0046, + "num_tokens": 2446944.0, + "reward": 0.7076416015625, + "reward_std": 0.017834939062595367, + "rewards//mean": 0.7076416015625, + "rewards//std": 0.036632098257541656, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0568, + "grad_norm": 0.8473641872406006, + "kl": 0.12403094908222556, + "learning_rate": 4.972715271078323e-06, + "loss": 0.005, + "num_tokens": 2455528.0, + "reward": 0.748779296875, + "reward_std": 0.014438275247812271, + "rewards//mean": 0.748779296875, + "rewards//std": 0.036623213440179825, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.057, + "grad_norm": 1.0325453281402588, + "kl": 0.15802243910729885, + "learning_rate": 4.972480996352644e-06, + "loss": 0.0063, + "num_tokens": 2464104.0, + "reward": 0.72064208984375, + "reward_std": 0.011821886524558067, + "rewards//mean": 0.72064208984375, + "rewards//std": 0.04506784677505493, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0572, + "grad_norm": 0.8279632329940796, + "kl": 0.14361037500202656, + "learning_rate": 4.9722457257119144e-06, + "loss": 0.0057, + "num_tokens": 2472768.0, + "reward": 0.77099609375, + "reward_std": 0.011357970535755157, + "rewards//mean": 0.77099609375, + "rewards//std": 0.027263915166258812, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0574, + "grad_norm": 0.8311311602592468, + "kl": 0.12657944671809673, + "learning_rate": 4.972009459250903e-06, + "loss": 0.0051, + "num_tokens": 2481432.0, + "reward": 0.72271728515625, + "reward_std": 0.012970637530088425, + "rewards//mean": 0.72271728515625, + "rewards//std": 0.03625509515404701, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0576, + "grad_norm": 1.064893126487732, + "kl": 0.16098652640357614, + "learning_rate": 4.971772197064776e-06, + "loss": 0.0064, + "num_tokens": 2490120.0, + "reward": 0.73114013671875, + "reward_std": 0.013818023726344109, + "rewards//mean": 0.73114013671875, + "rewards//std": 0.03643544018268585, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0578, + "grad_norm": 0.957671046257019, + "kl": 0.19215518794953823, + "learning_rate": 4.971533939249105e-06, + "loss": 0.0077, + "num_tokens": 2498832.0, + "reward": 0.73681640625, + "reward_std": 0.014772996306419373, + "rewards//mean": 0.73681640625, + "rewards//std": 0.04102522134780884, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.058, + "grad_norm": 0.9647096395492554, + "kl": 0.1939010415226221, + "learning_rate": 4.9712946858998576e-06, + "loss": 0.0078, + "num_tokens": 2507544.0, + "reward": 0.75213623046875, + "reward_std": 0.012323443777859211, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.036742422729730606, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0582, + "grad_norm": 0.9463180899620056, + "kl": 0.1474277568049729, + "learning_rate": 4.971054437113406e-06, + "loss": 0.0059, + "num_tokens": 2516200.0, + "reward": 0.74615478515625, + "reward_std": 0.01541908085346222, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.0504126213490963, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0584, + "grad_norm": 1.2873408794403076, + "kl": 0.18494170205667615, + "learning_rate": 4.9708131929865235e-06, + "loss": 0.0074, + "num_tokens": 2524768.0, + "reward": 0.74810791015625, + "reward_std": 0.01592198945581913, + "rewards//mean": 0.74810791015625, + "rewards//std": 0.04342567175626755, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0586, + "grad_norm": 0.9283332228660583, + "kl": 0.15322866081260145, + "learning_rate": 4.970570953616383e-06, + "loss": 0.0061, + "num_tokens": 2533440.0, + "reward": 0.74658203125, + "reward_std": 0.012621916830539703, + "rewards//mean": 0.74658203125, + "rewards//std": 0.03913993015885353, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0588, + "grad_norm": 2.470165967941284, + "kl": 0.21047021076083183, + "learning_rate": 4.970327719100556e-06, + "loss": 0.0084, + "num_tokens": 2542128.0, + "reward": 0.7777099609375, + "reward_std": 0.013461882248520851, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.0339735671877861, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.059, + "grad_norm": 0.9362149238586426, + "kl": 0.14741351851262152, + "learning_rate": 4.970083489537021e-06, + "loss": 0.0059, + "num_tokens": 2550680.0, + "reward": 0.72869873046875, + "reward_std": 0.014110936783254147, + "rewards//mean": 0.72869873046875, + "rewards//std": 0.03162695840001106, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0592, + "grad_norm": 0.8844790458679199, + "kl": 0.1842191582545638, + "learning_rate": 4.96983826502415e-06, + "loss": 0.0074, + "num_tokens": 2559424.0, + "reward": 0.7591552734375, + "reward_std": 0.008938804268836975, + "rewards//mean": 0.7591552734375, + "rewards//std": 0.04523535072803497, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0594, + "grad_norm": 1.2818913459777832, + "kl": 0.27794970013201237, + "learning_rate": 4.969592045660723e-06, + "loss": 0.0111, + "num_tokens": 2568040.0, + "reward": 0.78643798828125, + "reward_std": 0.01272084191441536, + "rewards//mean": 0.78643798828125, + "rewards//std": 0.030282124876976013, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0596, + "grad_norm": 1.2021187543869019, + "kl": 0.19154435116797686, + "learning_rate": 4.969344831545914e-06, + "loss": 0.0077, + "num_tokens": 2576776.0, + "reward": 0.75469970703125, + "reward_std": 0.01579802855849266, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.040667545050382614, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0598, + "grad_norm": 1.1095116138458252, + "kl": 0.2089026332832873, + "learning_rate": 4.969096622779303e-06, + "loss": 0.0084, + "num_tokens": 2585392.0, + "reward": 0.71600341796875, + "reward_std": 0.01274610310792923, + "rewards//mean": 0.71600341796875, + "rewards//std": 0.0431722030043602, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.06, + "grad_norm": 1.059348702430725, + "kl": 0.20234669605270028, + "learning_rate": 4.968847419460867e-06, + "loss": 0.0081, + "num_tokens": 2594032.0, + "reward": 0.7236328125, + "reward_std": 0.013915905728936195, + "rewards//mean": 0.7236328125, + "rewards//std": 0.0312344953417778, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0602, + "grad_norm": 1.2769887447357178, + "kl": 0.22755335364490747, + "learning_rate": 4.968597221690986e-06, + "loss": 0.0091, + "num_tokens": 2602736.0, + "reward": 0.72149658203125, + "reward_std": 0.011407001875340939, + "rewards//mean": 0.72149658203125, + "rewards//std": 0.049202706664800644, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0604, + "grad_norm": 1.2020509243011475, + "kl": 0.24635399505496025, + "learning_rate": 4.96834602957044e-06, + "loss": 0.0099, + "num_tokens": 2611384.0, + "reward": 0.76763916015625, + "reward_std": 0.016048027202486992, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.04085657000541687, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0606, + "grad_norm": 1.436669111251831, + "kl": 0.25481397192925215, + "learning_rate": 4.968093843200407e-06, + "loss": 0.0102, + "num_tokens": 2620152.0, + "reward": 0.73895263671875, + "reward_std": 0.016838543117046356, + "rewards//mean": 0.73895263671875, + "rewards//std": 0.05139780789613724, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0608, + "grad_norm": 1.3443458080291748, + "kl": 0.2639510128647089, + "learning_rate": 4.96784066268247e-06, + "loss": 0.0106, + "num_tokens": 2628816.0, + "reward": 0.745361328125, + "reward_std": 0.014135653153061867, + "rewards//mean": 0.745361328125, + "rewards//std": 0.03560379147529602, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.061, + "grad_norm": 1.522937297821045, + "kl": 0.24147153925150633, + "learning_rate": 4.967586488118609e-06, + "loss": 0.0097, + "num_tokens": 2637496.0, + "reward": 0.7392578125, + "reward_std": 0.01406162977218628, + "rewards//mean": 0.7392578125, + "rewards//std": 0.028467724099755287, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0612, + "grad_norm": 1.3553171157836914, + "kl": 0.24848425947129726, + "learning_rate": 4.967331319611206e-06, + "loss": 0.0099, + "num_tokens": 2646104.0, + "reward": 0.7559814453125, + "reward_std": 0.014312982559204102, + "rewards//mean": 0.7559814453125, + "rewards//std": 0.03511122614145279, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0614, + "grad_norm": 1.5322372913360596, + "kl": 0.25969485752284527, + "learning_rate": 4.9670751572630425e-06, + "loss": 0.0104, + "num_tokens": 2654744.0, + "reward": 0.7608642578125, + "reward_std": 0.013267126865684986, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.027817683294415474, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0616, + "grad_norm": 1.4864404201507568, + "kl": 0.2588785719126463, + "learning_rate": 4.9668180011773e-06, + "loss": 0.0104, + "num_tokens": 2663376.0, + "reward": 0.71484375, + "reward_std": 0.01414964348077774, + "rewards//mean": 0.71484375, + "rewards//std": 0.02796124666929245, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0618, + "grad_norm": 1.291443943977356, + "kl": 0.2382829338312149, + "learning_rate": 4.966559851457562e-06, + "loss": 0.0095, + "num_tokens": 2671952.0, + "reward": 0.7293701171875, + "reward_std": 0.015213550999760628, + "rewards//mean": 0.7293701171875, + "rewards//std": 0.03245498239994049, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.062, + "grad_norm": 1.3344167470932007, + "kl": 0.2496517887338996, + "learning_rate": 4.966300708207811e-06, + "loss": 0.01, + "num_tokens": 2680624.0, + "reward": 0.73394775390625, + "reward_std": 0.01914324052631855, + "rewards//mean": 0.73394775390625, + "rewards//std": 0.04364334046840668, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0622, + "grad_norm": 1.3081969022750854, + "kl": 0.30021103471517563, + "learning_rate": 4.96604057153243e-06, + "loss": 0.012, + "num_tokens": 2689176.0, + "reward": 0.764892578125, + "reward_std": 0.01673274114727974, + "rewards//mean": 0.764892578125, + "rewards//std": 0.040132150053977966, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0624, + "grad_norm": 1.3401750326156616, + "kl": 0.2593008913099766, + "learning_rate": 4.965779441536202e-06, + "loss": 0.0104, + "num_tokens": 2697864.0, + "reward": 0.74420166015625, + "reward_std": 0.01244452502578497, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.03462515026330948, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0626, + "grad_norm": 1.4522110223770142, + "kl": 0.3523233197629452, + "learning_rate": 4.965517318324308e-06, + "loss": 0.0141, + "num_tokens": 2706480.0, + "reward": 0.71630859375, + "reward_std": 0.011550749652087688, + "rewards//mean": 0.71630859375, + "rewards//std": 0.026689305901527405, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0628, + "grad_norm": 1.3593254089355469, + "kl": 0.3362383022904396, + "learning_rate": 4.965254202002334e-06, + "loss": 0.0134, + "num_tokens": 2715056.0, + "reward": 0.75225830078125, + "reward_std": 0.015795797109603882, + "rewards//mean": 0.75225830078125, + "rewards//std": 0.0387437678873539, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.063, + "grad_norm": 1.2711306810379028, + "kl": 0.26307030860334635, + "learning_rate": 4.964990092676263e-06, + "loss": 0.0105, + "num_tokens": 2723632.0, + "reward": 0.74615478515625, + "reward_std": 0.011496458202600479, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.038194045424461365, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0632, + "grad_norm": 1.2173610925674438, + "kl": 0.272955933585763, + "learning_rate": 4.964724990452476e-06, + "loss": 0.0109, + "num_tokens": 2732264.0, + "reward": 0.72149658203125, + "reward_std": 0.012493669055402279, + "rewards//mean": 0.72149658203125, + "rewards//std": 0.036469489336013794, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0634, + "grad_norm": 1.143746256828308, + "kl": 0.24079665448516607, + "learning_rate": 4.9644588954377595e-06, + "loss": 0.0096, + "num_tokens": 2740960.0, + "reward": 0.7584228515625, + "reward_std": 0.015307648107409477, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.03647971153259277, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0636, + "grad_norm": 1.21216881275177, + "kl": 0.25471873860806227, + "learning_rate": 4.964191807739293e-06, + "loss": 0.0102, + "num_tokens": 2749528.0, + "reward": 0.741943359375, + "reward_std": 0.011934969574213028, + "rewards//mean": 0.741943359375, + "rewards//std": 0.02646489255130291, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0638, + "grad_norm": 1.5371971130371094, + "kl": 0.3100880701094866, + "learning_rate": 4.963923727464661e-06, + "loss": 0.0124, + "num_tokens": 2758176.0, + "reward": 0.7532958984375, + "reward_std": 0.011484376154839993, + "rewards//mean": 0.7532958984375, + "rewards//std": 0.031002702191472054, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.064, + "grad_norm": 1.2735233306884766, + "kl": 0.2549311425536871, + "learning_rate": 4.963654654721848e-06, + "loss": 0.0102, + "num_tokens": 2766760.0, + "reward": 0.7633056640625, + "reward_std": 0.01067253015935421, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.03851751983165741, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0642, + "grad_norm": 1.3543078899383545, + "kl": 0.33309008402284235, + "learning_rate": 4.963384589619233e-06, + "loss": 0.0133, + "num_tokens": 2775344.0, + "reward": 0.75543212890625, + "reward_std": 0.015507234260439873, + "rewards//mean": 0.75543212890625, + "rewards//std": 0.03842874616384506, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0644, + "grad_norm": 1.2867757081985474, + "kl": 0.35989778488874435, + "learning_rate": 4.9631135322656e-06, + "loss": 0.0144, + "num_tokens": 2783880.0, + "reward": 0.75213623046875, + "reward_std": 0.013345220126211643, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.0316346138715744, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0646, + "grad_norm": 1.3027052879333496, + "kl": 0.3117766585201025, + "learning_rate": 4.962841482770131e-06, + "loss": 0.0125, + "num_tokens": 2792480.0, + "reward": 0.75042724609375, + "reward_std": 0.021965457126498222, + "rewards//mean": 0.75042724609375, + "rewards//std": 0.03844450041651726, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0648, + "grad_norm": 1.2578948736190796, + "kl": 0.31469903141260147, + "learning_rate": 4.962568441242408e-06, + "loss": 0.0126, + "num_tokens": 2801088.0, + "reward": 0.723388671875, + "reward_std": 0.015950549393892288, + "rewards//mean": 0.723388671875, + "rewards//std": 0.03848705068230629, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.065, + "grad_norm": 1.3456346988677979, + "kl": 0.26953551825135946, + "learning_rate": 4.962294407792411e-06, + "loss": 0.0108, + "num_tokens": 2809768.0, + "reward": 0.747802734375, + "reward_std": 0.015800267457962036, + "rewards//mean": 0.747802734375, + "rewards//std": 0.03215061128139496, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0652, + "grad_norm": 1.6204522848129272, + "kl": 0.34877745993435383, + "learning_rate": 4.962019382530521e-06, + "loss": 0.014, + "num_tokens": 2818328.0, + "reward": 0.756591796875, + "reward_std": 0.017749017104506493, + "rewards//mean": 0.756591796875, + "rewards//std": 0.03980492055416107, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0654, + "grad_norm": 1598.2496337890625, + "kl": 9.96565246488899, + "learning_rate": 4.961743365567517e-06, + "loss": 0.3986, + "num_tokens": 2826984.0, + "reward": 0.71923828125, + "reward_std": 0.014437740668654442, + "rewards//mean": 0.71923828125, + "rewards//std": 0.05055179446935654, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0656, + "grad_norm": 1.0102441310882568, + "kl": 0.30338819324970245, + "learning_rate": 4.961466357014581e-06, + "loss": 0.0121, + "num_tokens": 2835544.0, + "reward": 0.75927734375, + "reward_std": 0.011439365334808826, + "rewards//mean": 0.75927734375, + "rewards//std": 0.02604631707072258, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0658, + "grad_norm": 451.71466064453125, + "kl": 1.1315573640167713, + "learning_rate": 4.961188356983291e-06, + "loss": 0.0453, + "num_tokens": 2844184.0, + "reward": 0.77728271484375, + "reward_std": 0.014329792931675911, + "rewards//mean": 0.77728271484375, + "rewards//std": 0.030076975002884865, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.066, + "grad_norm": 1.7841613292694092, + "kl": 0.32306745275855064, + "learning_rate": 4.960909365585624e-06, + "loss": 0.0129, + "num_tokens": 2852824.0, + "reward": 0.7259521484375, + "reward_std": 0.012303611263632774, + "rewards//mean": 0.7259521484375, + "rewards//std": 0.024766186252236366, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0662, + "grad_norm": 1.2208861112594604, + "kl": 0.35031407698988914, + "learning_rate": 4.960629382933959e-06, + "loss": 0.014, + "num_tokens": 2861568.0, + "reward": 0.75592041015625, + "reward_std": 0.013036654330790043, + "rewards//mean": 0.75592041015625, + "rewards//std": 0.028763895854353905, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0664, + "grad_norm": 45.699703216552734, + "kl": 0.37541868537664413, + "learning_rate": 4.960348409141074e-06, + "loss": 0.015, + "num_tokens": 2870168.0, + "reward": 0.76629638671875, + "reward_std": 0.014367573894560337, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.03430584445595741, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0666, + "grad_norm": 1.0523972511291504, + "kl": 0.2711593806743622, + "learning_rate": 4.960066444320143e-06, + "loss": 0.0108, + "num_tokens": 2878752.0, + "reward": 0.72601318359375, + "reward_std": 0.016535844653844833, + "rewards//mean": 0.72601318359375, + "rewards//std": 0.03955461084842682, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0668, + "grad_norm": 1.1287596225738525, + "kl": 0.27480507269501686, + "learning_rate": 4.959783488584743e-06, + "loss": 0.011, + "num_tokens": 2887504.0, + "reward": 0.7252197265625, + "reward_std": 0.013544876128435135, + "rewards//mean": 0.7252197265625, + "rewards//std": 0.036889057606458664, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.067, + "grad_norm": 1.9436277151107788, + "kl": 0.26933558098971844, + "learning_rate": 4.9594995420488475e-06, + "loss": 0.0108, + "num_tokens": 2896176.0, + "reward": 0.70745849609375, + "reward_std": 0.01362568698823452, + "rewards//mean": 0.70745849609375, + "rewards//std": 0.0338621586561203, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0672, + "grad_norm": 1.3394960165023804, + "kl": 0.2702603470534086, + "learning_rate": 4.959214604826831e-06, + "loss": 0.0108, + "num_tokens": 2904808.0, + "reward": 0.73876953125, + "reward_std": 0.015888595953583717, + "rewards//mean": 0.73876953125, + "rewards//std": 0.03880433365702629, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0674, + "grad_norm": 1.188582181930542, + "kl": 0.3325184481218457, + "learning_rate": 4.958928677033465e-06, + "loss": 0.0133, + "num_tokens": 2913544.0, + "reward": 0.73760986328125, + "reward_std": 0.01286403276026249, + "rewards//mean": 0.73760986328125, + "rewards//std": 0.03628222644329071, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0676, + "grad_norm": 1.0829931497573853, + "kl": 0.2599369240924716, + "learning_rate": 4.9586417587839225e-06, + "loss": 0.0104, + "num_tokens": 2922208.0, + "reward": 0.74237060546875, + "reward_std": 0.009288130328059196, + "rewards//mean": 0.74237060546875, + "rewards//std": 0.0271921269595623, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0678, + "grad_norm": 1.090649127960205, + "kl": 0.35008446872234344, + "learning_rate": 4.958353850193773e-06, + "loss": 0.014, + "num_tokens": 2930904.0, + "reward": 0.7298583984375, + "reward_std": 0.009747210890054703, + "rewards//mean": 0.7298583984375, + "rewards//std": 0.031018322333693504, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.068, + "grad_norm": 1.0257357358932495, + "kl": 0.2736462540924549, + "learning_rate": 4.958064951378988e-06, + "loss": 0.0109, + "num_tokens": 2939600.0, + "reward": 0.74664306640625, + "reward_std": 0.013303879648447037, + "rewards//mean": 0.74664306640625, + "rewards//std": 0.026327185332775116, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0682, + "grad_norm": 1.8254125118255615, + "kl": 0.3379829414188862, + "learning_rate": 4.957775062455933e-06, + "loss": 0.0135, + "num_tokens": 2948432.0, + "reward": 0.74041748046875, + "reward_std": 0.011445442214608192, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.023927679285407066, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0684, + "grad_norm": 1.151408314704895, + "kl": 0.3058789037168026, + "learning_rate": 4.957484183541378e-06, + "loss": 0.0122, + "num_tokens": 2957032.0, + "reward": 0.775390625, + "reward_std": 0.010520260781049728, + "rewards//mean": 0.775390625, + "rewards//std": 0.020039275288581848, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0686, + "grad_norm": 3.0106112957000732, + "kl": 0.330666683614254, + "learning_rate": 4.957192314752487e-06, + "loss": 0.0132, + "num_tokens": 2965680.0, + "reward": 0.7501220703125, + "reward_std": 0.0123628880828619, + "rewards//mean": 0.7501220703125, + "rewards//std": 0.03424517437815666, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0688, + "grad_norm": 9.348411560058594, + "kl": 0.3467661701142788, + "learning_rate": 4.9568994562068265e-06, + "loss": 0.0139, + "num_tokens": 2974304.0, + "reward": 0.7127685546875, + "reward_std": 0.013961128890514374, + "rewards//mean": 0.7127685546875, + "rewards//std": 0.038146305829286575, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.069, + "grad_norm": 1.3383392095565796, + "kl": 0.23608121927827597, + "learning_rate": 4.9566056080223576e-06, + "loss": 0.0094, + "num_tokens": 2982920.0, + "reward": 0.7254638671875, + "reward_std": 0.019042517989873886, + "rewards//mean": 0.7254638671875, + "rewards//std": 0.05562053620815277, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0692, + "grad_norm": 1.1454135179519653, + "kl": 0.3010788504034281, + "learning_rate": 4.9563107703174444e-06, + "loss": 0.012, + "num_tokens": 2991640.0, + "reward": 0.76824951171875, + "reward_std": 0.013647787272930145, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.02736636996269226, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0694, + "grad_norm": 1.3544073104858398, + "kl": 0.34564877301454544, + "learning_rate": 4.956014943210845e-06, + "loss": 0.0138, + "num_tokens": 3000224.0, + "reward": 0.7606201171875, + "reward_std": 0.011027848348021507, + "rewards//mean": 0.7606201171875, + "rewards//std": 0.029086079448461533, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0696, + "grad_norm": 1.1409554481506348, + "kl": 0.29469297640025616, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.0118, + "num_tokens": 3008880.0, + "reward": 0.73162841796875, + "reward_std": 0.017540261149406433, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.03117768093943596, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0698, + "grad_norm": 1.174091100692749, + "kl": 0.34364804439246655, + "learning_rate": 4.9554203212696304e-06, + "loss": 0.0137, + "num_tokens": 3017536.0, + "reward": 0.73602294921875, + "reward_std": 0.0178272295743227, + "rewards//mean": 0.73602294921875, + "rewards//std": 0.038189683109521866, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.07, + "grad_norm": 1.0863193273544312, + "kl": 0.2888754541054368, + "learning_rate": 4.955121526674528e-06, + "loss": 0.0116, + "num_tokens": 3026160.0, + "reward": 0.76239013671875, + "reward_std": 0.015270760282874107, + "rewards//mean": 0.76239013671875, + "rewards//std": 0.04026879370212555, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0702, + "grad_norm": 1.3240951299667358, + "kl": 0.32703710440546274, + "learning_rate": 4.9548217431567665e-06, + "loss": 0.0131, + "num_tokens": 3034792.0, + "reward": 0.77569580078125, + "reward_std": 0.013240109197795391, + "rewards//mean": 0.77569580078125, + "rewards//std": 0.0282268188893795, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0704, + "grad_norm": 1.2549176216125488, + "kl": 0.27426617220044136, + "learning_rate": 4.9545209708371025e-06, + "loss": 0.011, + "num_tokens": 3043432.0, + "reward": 0.75177001953125, + "reward_std": 0.009705094620585442, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.0316346138715744, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0706, + "grad_norm": 1.207445502281189, + "kl": 0.3094801548868418, + "learning_rate": 4.9542192098366835e-06, + "loss": 0.0124, + "num_tokens": 3052008.0, + "reward": 0.76214599609375, + "reward_std": 0.015533574856817722, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.029373183846473694, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0708, + "grad_norm": 0.9336227774620056, + "kl": 0.3114245068281889, + "learning_rate": 4.95391646027706e-06, + "loss": 0.0125, + "num_tokens": 3060680.0, + "reward": 0.7603759765625, + "reward_std": 0.013422933407127857, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.034570734947919846, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.071, + "grad_norm": 0.9703741073608398, + "kl": 0.3360181748867035, + "learning_rate": 4.953612722280181e-06, + "loss": 0.0134, + "num_tokens": 3069312.0, + "reward": 0.7603759765625, + "reward_std": 0.014663382433354855, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.032032448798418045, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0712, + "grad_norm": 0.9895470142364502, + "kl": 0.32697275839746, + "learning_rate": 4.953307995968391e-06, + "loss": 0.0131, + "num_tokens": 3078032.0, + "reward": 0.74468994140625, + "reward_std": 0.010848162695765495, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.025433441624045372, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0714, + "grad_norm": 1.0759690999984741, + "kl": 0.3138846158981323, + "learning_rate": 4.953002281464432e-06, + "loss": 0.0126, + "num_tokens": 3086656.0, + "reward": 0.738037109375, + "reward_std": 0.011200450360774994, + "rewards//mean": 0.738037109375, + "rewards//std": 0.019008060917258263, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0716, + "grad_norm": 1.1179635524749756, + "kl": 0.2949511222541332, + "learning_rate": 4.952695578891449e-06, + "loss": 0.0118, + "num_tokens": 3095184.0, + "reward": 0.74249267578125, + "reward_std": 0.010483279824256897, + "rewards//mean": 0.74249267578125, + "rewards//std": 0.034091606736183167, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0718, + "grad_norm": 1.4193079471588135, + "kl": 0.3405211642384529, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.0136, + "num_tokens": 3103768.0, + "reward": 0.78057861328125, + "reward_std": 0.01512511633336544, + "rewards//mean": 0.78057861328125, + "rewards//std": 0.028378715738654137, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.072, + "grad_norm": 1.239197015762329, + "kl": 0.34275365993380547, + "learning_rate": 4.952079210032962e-06, + "loss": 0.0137, + "num_tokens": 3112464.0, + "reward": 0.75830078125, + "reward_std": 0.00690212519839406, + "rewards//mean": 0.75830078125, + "rewards//std": 0.029177792370319366, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0722, + "grad_norm": 1.2645068168640137, + "kl": 0.22989958012476563, + "learning_rate": 4.951769543995731e-06, + "loss": 0.0092, + "num_tokens": 3121000.0, + "reward": 0.7113037109375, + "reward_std": 0.014688584953546524, + "rewards//mean": 0.7113037109375, + "rewards//std": 0.043239694088697433, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0724, + "grad_norm": 0.9607939720153809, + "kl": 0.2874492518603802, + "learning_rate": 4.951458890386021e-06, + "loss": 0.0115, + "num_tokens": 3129608.0, + "reward": 0.77667236328125, + "reward_std": 0.019359227269887924, + "rewards//mean": 0.77667236328125, + "rewards//std": 0.03485913202166557, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0726, + "grad_norm": 1.1458990573883057, + "kl": 0.3057211823761463, + "learning_rate": 4.951147249328964e-06, + "loss": 0.0122, + "num_tokens": 3138360.0, + "reward": 0.74261474609375, + "reward_std": 0.016560913994908333, + "rewards//mean": 0.74261474609375, + "rewards//std": 0.045068517327308655, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0728, + "grad_norm": 1.1187580823898315, + "kl": 0.3212597221136093, + "learning_rate": 4.950834620950089e-06, + "loss": 0.0129, + "num_tokens": 3146912.0, + "reward": 0.7310791015625, + "reward_std": 0.015014410018920898, + "rewards//mean": 0.7310791015625, + "rewards//std": 0.04567229375243187, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.073, + "grad_norm": 1.4113508462905884, + "kl": 0.35538918152451515, + "learning_rate": 4.9505210053753204e-06, + "loss": 0.0142, + "num_tokens": 3155568.0, + "reward": 0.7164306640625, + "reward_std": 0.013062847778201103, + "rewards//mean": 0.7164306640625, + "rewards//std": 0.049687836319208145, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0732, + "grad_norm": 1.2345703840255737, + "kl": 0.288858812302351, + "learning_rate": 4.950206402730984e-06, + "loss": 0.0116, + "num_tokens": 3164336.0, + "reward": 0.76116943359375, + "reward_std": 0.013638054020702839, + "rewards//mean": 0.76116943359375, + "rewards//std": 0.04553765431046486, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0734, + "grad_norm": 1.2489417791366577, + "kl": 0.3488999232649803, + "learning_rate": 4.949890813143802e-06, + "loss": 0.014, + "num_tokens": 3173144.0, + "reward": 0.7589111328125, + "reward_std": 0.010227528400719166, + "rewards//mean": 0.7589111328125, + "rewards//std": 0.03326575458049774, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0736, + "grad_norm": 1.1428223848342896, + "kl": 0.33989420160651207, + "learning_rate": 4.949574236740893e-06, + "loss": 0.0136, + "num_tokens": 3181768.0, + "reward": 0.73931884765625, + "reward_std": 0.008424145169556141, + "rewards//mean": 0.73931884765625, + "rewards//std": 0.02814464643597603, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0738, + "grad_norm": 1.1611658334732056, + "kl": 0.32859067991375923, + "learning_rate": 4.949256673649774e-06, + "loss": 0.0131, + "num_tokens": 3190400.0, + "reward": 0.71636962890625, + "reward_std": 0.011011059395968914, + "rewards//mean": 0.71636962890625, + "rewards//std": 0.041278988122940063, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.074, + "grad_norm": 1.2848570346832275, + "kl": 0.35491205751895905, + "learning_rate": 4.94893812399836e-06, + "loss": 0.0142, + "num_tokens": 3199032.0, + "reward": 0.77215576171875, + "reward_std": 0.01237148605287075, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.03036997839808464, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0742, + "grad_norm": 1.300308346748352, + "kl": 0.3731778897345066, + "learning_rate": 4.948618587914963e-06, + "loss": 0.0149, + "num_tokens": 3207600.0, + "reward": 0.7557373046875, + "reward_std": 0.01102093979716301, + "rewards//mean": 0.7557373046875, + "rewards//std": 0.025770245119929314, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0744, + "grad_norm": 1.1986355781555176, + "kl": 0.33789339661598206, + "learning_rate": 4.948298065528292e-06, + "loss": 0.0135, + "num_tokens": 3216240.0, + "reward": 0.74041748046875, + "reward_std": 0.012240855023264885, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.03901242837309837, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0746, + "grad_norm": 1.1584309339523315, + "kl": 0.3138528410345316, + "learning_rate": 4.947976556967452e-06, + "loss": 0.0126, + "num_tokens": 3224856.0, + "reward": 0.790283203125, + "reward_std": 0.010637026280164719, + "rewards//mean": 0.790283203125, + "rewards//std": 0.028278857469558716, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0748, + "grad_norm": 1.1529393196105957, + "kl": 0.32554432936012745, + "learning_rate": 4.947654062361949e-06, + "loss": 0.013, + "num_tokens": 3233608.0, + "reward": 0.7681884765625, + "reward_std": 0.016204483807086945, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.03761722892522812, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.075, + "grad_norm": 1.137768030166626, + "kl": 0.3275693580508232, + "learning_rate": 4.9473305818416805e-06, + "loss": 0.0131, + "num_tokens": 3242224.0, + "reward": 0.72662353515625, + "reward_std": 0.012011650949716568, + "rewards//mean": 0.72662353515625, + "rewards//std": 0.04576380178332329, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0752, + "grad_norm": 1.1497950553894043, + "kl": 0.373350128531456, + "learning_rate": 4.947006115536947e-06, + "loss": 0.0149, + "num_tokens": 3250880.0, + "reward": 0.7626953125, + "reward_std": 0.0052624596282839775, + "rewards//mean": 0.7626953125, + "rewards//std": 0.034618210047483444, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0754, + "grad_norm": 1.2146048545837402, + "kl": 0.31162706576287746, + "learning_rate": 4.946680663578443e-06, + "loss": 0.0125, + "num_tokens": 3259592.0, + "reward": 0.7554931640625, + "reward_std": 0.01137151475995779, + "rewards//mean": 0.7554931640625, + "rewards//std": 0.03868066519498825, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0756, + "grad_norm": 1.0558278560638428, + "kl": 0.34958504140377045, + "learning_rate": 4.946354226097261e-06, + "loss": 0.014, + "num_tokens": 3268216.0, + "reward": 0.75439453125, + "reward_std": 0.01355811208486557, + "rewards//mean": 0.75439453125, + "rewards//std": 0.04268030822277069, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0758, + "grad_norm": 1.3659820556640625, + "kl": 0.3736311122775078, + "learning_rate": 4.946026803224888e-06, + "loss": 0.0149, + "num_tokens": 3276832.0, + "reward": 0.77215576171875, + "reward_std": 0.00643074419349432, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.028365911915898323, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.076, + "grad_norm": 1.0803120136260986, + "kl": 0.3101845942437649, + "learning_rate": 4.945698395093212e-06, + "loss": 0.0124, + "num_tokens": 3285440.0, + "reward": 0.7320556640625, + "reward_std": 0.012730730697512627, + "rewards//mean": 0.7320556640625, + "rewards//std": 0.04066479951143265, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0762, + "grad_norm": 1.184386134147644, + "kl": 0.33812105655670166, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.0135, + "num_tokens": 3294056.0, + "reward": 0.765869140625, + "reward_std": 0.010241934098303318, + "rewards//mean": 0.765869140625, + "rewards//std": 0.03962195664644241, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0764, + "grad_norm": 1.0949220657348633, + "kl": 0.32981303334236145, + "learning_rate": 4.9450386235814755e-06, + "loss": 0.0132, + "num_tokens": 3302832.0, + "reward": 0.74432373046875, + "reward_std": 0.011420607566833496, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.036144278943538666, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0766, + "grad_norm": 0.9847798943519592, + "kl": 0.2912776917219162, + "learning_rate": 4.944707260467172e-06, + "loss": 0.0117, + "num_tokens": 3311488.0, + "reward": 0.751953125, + "reward_std": 0.012202553451061249, + "rewards//mean": 0.751953125, + "rewards//std": 0.03254092112183571, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0768, + "grad_norm": 1.0881670713424683, + "kl": 0.3519246131181717, + "learning_rate": 4.944374912625076e-06, + "loss": 0.0141, + "num_tokens": 3320128.0, + "reward": 0.721435546875, + "reward_std": 0.010593841783702374, + "rewards//mean": 0.721435546875, + "rewards//std": 0.04617677628993988, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.077, + "grad_norm": 0.9181442856788635, + "kl": 0.348065834492445, + "learning_rate": 4.944041580189057e-06, + "loss": 0.0139, + "num_tokens": 3328744.0, + "reward": 0.7432861328125, + "reward_std": 0.009224426001310349, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.026073908433318138, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0772, + "grad_norm": 0.9651432037353516, + "kl": 0.2787171872332692, + "learning_rate": 4.943707263293382e-06, + "loss": 0.0111, + "num_tokens": 3337352.0, + "reward": 0.741943359375, + "reward_std": 0.014116060920059681, + "rewards//mean": 0.741943359375, + "rewards//std": 0.03688023239374161, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0774, + "grad_norm": 1.0564086437225342, + "kl": 0.33349500969052315, + "learning_rate": 4.943371962072714e-06, + "loss": 0.0133, + "num_tokens": 3345952.0, + "reward": 0.779052734375, + "reward_std": 0.009006861597299576, + "rewards//mean": 0.779052734375, + "rewards//std": 0.03128775954246521, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0776, + "grad_norm": 0.905786395072937, + "kl": 0.3205620348453522, + "learning_rate": 4.9430356766621114e-06, + "loss": 0.0128, + "num_tokens": 3354552.0, + "reward": 0.7352294921875, + "reward_std": 0.00872873142361641, + "rewards//mean": 0.7352294921875, + "rewards//std": 0.030068732798099518, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0778, + "grad_norm": 1.0599631071090698, + "kl": 0.3847878910601139, + "learning_rate": 4.942698407197031e-06, + "loss": 0.0154, + "num_tokens": 3363192.0, + "reward": 0.741943359375, + "reward_std": 0.008737495169043541, + "rewards//mean": 0.741943359375, + "rewards//std": 0.030614785850048065, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.078, + "grad_norm": 0.9868512749671936, + "kl": 0.32742261700332165, + "learning_rate": 4.942360153813324e-06, + "loss": 0.0131, + "num_tokens": 3371832.0, + "reward": 0.74371337890625, + "reward_std": 0.007218184880912304, + "rewards//mean": 0.74371337890625, + "rewards//std": 0.035629238933324814, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0782, + "grad_norm": 1.0115092992782593, + "kl": 0.3397269584238529, + "learning_rate": 4.9420209166472386e-06, + "loss": 0.0136, + "num_tokens": 3380448.0, + "reward": 0.770751953125, + "reward_std": 0.015268933959305286, + "rewards//mean": 0.770751953125, + "rewards//std": 0.03136507794260979, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0784, + "grad_norm": 0.9326263666152954, + "kl": 0.32895535230636597, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.0132, + "num_tokens": 3389040.0, + "reward": 0.755126953125, + "reward_std": 0.013349458575248718, + "rewards//mean": 0.755126953125, + "rewards//std": 0.031357355415821075, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0786, + "grad_norm": 0.7784397602081299, + "kl": 0.3131994195282459, + "learning_rate": 4.9413394915149094e-06, + "loss": 0.0125, + "num_tokens": 3397736.0, + "reward": 0.748779296875, + "reward_std": 0.011831846088171005, + "rewards//mean": 0.748779296875, + "rewards//std": 0.03131871297955513, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0788, + "grad_norm": 0.9137967824935913, + "kl": 0.3138259369879961, + "learning_rate": 4.940997303823144e-06, + "loss": 0.0126, + "num_tokens": 3406328.0, + "reward": 0.7393798828125, + "reward_std": 0.009359323419630527, + "rewards//mean": 0.7393798828125, + "rewards//std": 0.01415227074176073, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.079, + "grad_norm": 0.8206107020378113, + "kl": 0.2667227676138282, + "learning_rate": 4.940654132897957e-06, + "loss": 0.0107, + "num_tokens": 3414960.0, + "reward": 0.7227783203125, + "reward_std": 0.015779396519064903, + "rewards//mean": 0.7227783203125, + "rewards//std": 0.04207717627286911, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0792, + "grad_norm": 1.1698734760284424, + "kl": 0.3585771173238754, + "learning_rate": 4.940309978877576e-06, + "loss": 0.0143, + "num_tokens": 3423696.0, + "reward": 0.72821044921875, + "reward_std": 0.00786328874528408, + "rewards//mean": 0.72821044921875, + "rewards//std": 0.03962152823805809, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0794, + "grad_norm": 0.9304360151290894, + "kl": 0.2744011953473091, + "learning_rate": 4.939964841900627e-06, + "loss": 0.011, + "num_tokens": 3432248.0, + "reward": 0.74176025390625, + "reward_std": 0.012099739164113998, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.030537491664290428, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0796, + "grad_norm": 0.9549676775932312, + "kl": 0.32337421737611294, + "learning_rate": 4.9396187221061324e-06, + "loss": 0.0129, + "num_tokens": 3440928.0, + "reward": 0.7620849609375, + "reward_std": 0.008526656776666641, + "rewards//mean": 0.7620849609375, + "rewards//std": 0.02168460376560688, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0798, + "grad_norm": 0.8273400664329529, + "kl": 0.30577356554567814, + "learning_rate": 4.939271619633508e-06, + "loss": 0.0122, + "num_tokens": 3449560.0, + "reward": 0.74383544921875, + "reward_std": 0.014349598437547684, + "rewards//mean": 0.74383544921875, + "rewards//std": 0.032983046025037766, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.08, + "grad_norm": 0.9757963418960571, + "kl": 0.322640098631382, + "learning_rate": 4.938923534622567e-06, + "loss": 0.0129, + "num_tokens": 3458264.0, + "reward": 0.761962890625, + "reward_std": 0.011695911176502705, + "rewards//mean": 0.761962890625, + "rewards//std": 0.02991858497262001, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0802, + "grad_norm": 0.8469364643096924, + "kl": 0.2975729079917073, + "learning_rate": 4.938574467213519e-06, + "loss": 0.0119, + "num_tokens": 3466896.0, + "reward": 0.76605224609375, + "reward_std": 0.007424627430737019, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.03303670138120651, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0804, + "grad_norm": 0.8685148358345032, + "kl": 0.3087438400834799, + "learning_rate": 4.938224417546965e-06, + "loss": 0.0123, + "num_tokens": 3475584.0, + "reward": 0.76739501953125, + "reward_std": 0.011819308623671532, + "rewards//mean": 0.76739501953125, + "rewards//std": 0.03834869712591171, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0806, + "grad_norm": 1.04423987865448, + "kl": 0.2927042469382286, + "learning_rate": 4.937873385763909e-06, + "loss": 0.0117, + "num_tokens": 3484184.0, + "reward": 0.77471923828125, + "reward_std": 0.018968980759382248, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.0364985354244709, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0808, + "grad_norm": 0.994342565536499, + "kl": 0.250274121761322, + "learning_rate": 4.9375213720057435e-06, + "loss": 0.01, + "num_tokens": 3492760.0, + "reward": 0.76336669921875, + "reward_std": 0.010855021886527538, + "rewards//mean": 0.76336669921875, + "rewards//std": 0.022775080054998398, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.081, + "grad_norm": 0.8562494516372681, + "kl": 0.241558950394392, + "learning_rate": 4.937168376414261e-06, + "loss": 0.0097, + "num_tokens": 3501344.0, + "reward": 0.7479248046875, + "reward_std": 0.015270461328327656, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.037885088473558426, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0812, + "grad_norm": 0.8906681537628174, + "kl": 0.2702600210905075, + "learning_rate": 4.9368143991316485e-06, + "loss": 0.0108, + "num_tokens": 3510008.0, + "reward": 0.75970458984375, + "reward_std": 0.011862866580486298, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.03386126458644867, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0814, + "grad_norm": 0.83770751953125, + "kl": 0.2678908761590719, + "learning_rate": 4.936459440300487e-06, + "loss": 0.0107, + "num_tokens": 3518752.0, + "reward": 0.7633056640625, + "reward_std": 0.010113585740327835, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.026380963623523712, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0816, + "grad_norm": 0.7807222604751587, + "kl": 0.28538877703249454, + "learning_rate": 4.936103500063755e-06, + "loss": 0.0114, + "num_tokens": 3527400.0, + "reward": 0.7591552734375, + "reward_std": 0.0059563172981143, + "rewards//mean": 0.7591552734375, + "rewards//std": 0.03868379816412926, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0818, + "grad_norm": 0.9765918850898743, + "kl": 0.26162285543978214, + "learning_rate": 4.935746578564825e-06, + "loss": 0.0105, + "num_tokens": 3536064.0, + "reward": 0.7486572265625, + "reward_std": 0.011643504723906517, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.025781990960240364, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.082, + "grad_norm": 0.9713819622993469, + "kl": 0.2695306558161974, + "learning_rate": 4.935388675947463e-06, + "loss": 0.0108, + "num_tokens": 3544736.0, + "reward": 0.71881103515625, + "reward_std": 0.010069970041513443, + "rewards//mean": 0.71881103515625, + "rewards//std": 0.03545119985938072, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0822, + "grad_norm": 0.9451349377632141, + "kl": 0.24540182575583458, + "learning_rate": 4.935029792355834e-06, + "loss": 0.0098, + "num_tokens": 3553424.0, + "reward": 0.73486328125, + "reward_std": 0.014077549800276756, + "rewards//mean": 0.73486328125, + "rewards//std": 0.03547174483537674, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0824, + "grad_norm": 0.7770392298698425, + "kl": 0.2268413919955492, + "learning_rate": 4.934669927934496e-06, + "loss": 0.0091, + "num_tokens": 3562040.0, + "reward": 0.73150634765625, + "reward_std": 0.013487438671290874, + "rewards//mean": 0.73150634765625, + "rewards//std": 0.04288830980658531, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0826, + "grad_norm": 0.674587607383728, + "kl": 0.28416365571320057, + "learning_rate": 4.9343090828284025e-06, + "loss": 0.0114, + "num_tokens": 3570696.0, + "reward": 0.74090576171875, + "reward_std": 0.010635284706950188, + "rewards//mean": 0.74090576171875, + "rewards//std": 0.03285244479775429, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0828, + "grad_norm": 0.8351263403892517, + "kl": 0.2428603582084179, + "learning_rate": 4.933947257182901e-06, + "loss": 0.0097, + "num_tokens": 3579256.0, + "reward": 0.72833251953125, + "reward_std": 0.00818649772554636, + "rewards//mean": 0.72833251953125, + "rewards//std": 0.034561701118946075, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.083, + "grad_norm": 0.9314544200897217, + "kl": 0.26274044439196587, + "learning_rate": 4.933584451143736e-06, + "loss": 0.0105, + "num_tokens": 3587928.0, + "reward": 0.72344970703125, + "reward_std": 0.009486508555710316, + "rewards//mean": 0.72344970703125, + "rewards//std": 0.04276353493332863, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0832, + "grad_norm": 0.8246920704841614, + "kl": 0.2645695861428976, + "learning_rate": 4.933220664857045e-06, + "loss": 0.0106, + "num_tokens": 3596568.0, + "reward": 0.764892578125, + "reward_std": 0.011997243389487267, + "rewards//mean": 0.764892578125, + "rewards//std": 0.02408854104578495, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0834, + "grad_norm": 0.7828191518783569, + "kl": 0.23742634430527687, + "learning_rate": 4.93285589846936e-06, + "loss": 0.0095, + "num_tokens": 3605248.0, + "reward": 0.77264404296875, + "reward_std": 0.010765868239104748, + "rewards//mean": 0.77264404296875, + "rewards//std": 0.02570635825395584, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0836, + "grad_norm": 0.8400963544845581, + "kl": 0.2275092527270317, + "learning_rate": 4.932490152127611e-06, + "loss": 0.0091, + "num_tokens": 3613840.0, + "reward": 0.751708984375, + "reward_std": 0.01651906780898571, + "rewards//mean": 0.751708984375, + "rewards//std": 0.042007505893707275, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0838, + "grad_norm": 0.9282857179641724, + "kl": 0.24514971487224102, + "learning_rate": 4.93212342597912e-06, + "loss": 0.0098, + "num_tokens": 3622464.0, + "reward": 0.7489013671875, + "reward_std": 0.015580926090478897, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.04658199101686478, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.084, + "grad_norm": 0.9025923609733582, + "kl": 0.23584598675370216, + "learning_rate": 4.931755720171603e-06, + "loss": 0.0094, + "num_tokens": 3631032.0, + "reward": 0.7366943359375, + "reward_std": 0.011178325861692429, + "rewards//mean": 0.7366943359375, + "rewards//std": 0.03618305176496506, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0842, + "grad_norm": 0.7268020510673523, + "kl": 0.254648195579648, + "learning_rate": 4.931387034853173e-06, + "loss": 0.0102, + "num_tokens": 3639672.0, + "reward": 0.74639892578125, + "reward_std": 0.009266193956136703, + "rewards//mean": 0.74639892578125, + "rewards//std": 0.03361223638057709, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0844, + "grad_norm": 0.8384131789207458, + "kl": 0.2413597498089075, + "learning_rate": 4.9310173701723365e-06, + "loss": 0.0097, + "num_tokens": 3648336.0, + "reward": 0.75970458984375, + "reward_std": 0.008040700107812881, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.026702843606472015, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0846, + "grad_norm": 0.7907002568244934, + "kl": 0.24139040149748325, + "learning_rate": 4.930646726277994e-06, + "loss": 0.0097, + "num_tokens": 3656896.0, + "reward": 0.7911376953125, + "reward_std": 0.009463133290410042, + "rewards//mean": 0.7911376953125, + "rewards//std": 0.0322941355407238, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0848, + "grad_norm": 0.750485360622406, + "kl": 0.23888413794338703, + "learning_rate": 4.930275103319441e-06, + "loss": 0.0096, + "num_tokens": 3665504.0, + "reward": 0.73126220703125, + "reward_std": 0.013249299488961697, + "rewards//mean": 0.73126220703125, + "rewards//std": 0.024657992646098137, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.085, + "grad_norm": 0.9737104773521423, + "kl": 0.2315953504294157, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.0093, + "num_tokens": 3674080.0, + "reward": 0.7303466796875, + "reward_std": 0.013063129037618637, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.0421634316444397, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0852, + "grad_norm": 0.7097475528717041, + "kl": 0.2388191670179367, + "learning_rate": 4.9295289208088545e-06, + "loss": 0.0096, + "num_tokens": 3682640.0, + "reward": 0.74786376953125, + "reward_std": 0.009970373474061489, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.03215102478861809, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0854, + "grad_norm": 0.78131502866745, + "kl": 0.2351592220366001, + "learning_rate": 4.929154361557384e-06, + "loss": 0.0094, + "num_tokens": 3691304.0, + "reward": 0.77752685546875, + "reward_std": 0.010701300576329231, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.025538571178913116, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0856, + "grad_norm": 0.813309907913208, + "kl": 0.2603413835167885, + "learning_rate": 4.928778823842828e-06, + "loss": 0.0104, + "num_tokens": 3700024.0, + "reward": 0.75299072265625, + "reward_std": 0.007520100101828575, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.03588072210550308, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0858, + "grad_norm": 0.7221551537513733, + "kl": 0.18088798597455025, + "learning_rate": 4.928402307816452e-06, + "loss": 0.0072, + "num_tokens": 3708784.0, + "reward": 0.74078369140625, + "reward_std": 0.01760246977210045, + "rewards//mean": 0.74078369140625, + "rewards//std": 0.04441485553979874, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.086, + "grad_norm": 0.8385155200958252, + "kl": 0.21827073767781258, + "learning_rate": 4.928024813629917e-06, + "loss": 0.0087, + "num_tokens": 3717392.0, + "reward": 0.7694091796875, + "reward_std": 0.009142270311713219, + "rewards//mean": 0.7694091796875, + "rewards//std": 0.026874415576457977, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0862, + "grad_norm": 0.6926012635231018, + "kl": 0.22130563855171204, + "learning_rate": 4.927646341435276e-06, + "loss": 0.0089, + "num_tokens": 3726008.0, + "reward": 0.75091552734375, + "reward_std": 0.010623462498188019, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.028200527653098106, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0864, + "grad_norm": 0.8035763502120972, + "kl": 0.252463411539793, + "learning_rate": 4.92726689138498e-06, + "loss": 0.0101, + "num_tokens": 3734584.0, + "reward": 0.74298095703125, + "reward_std": 0.011688274331390858, + "rewards//mean": 0.74298095703125, + "rewards//std": 0.026449372991919518, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0866, + "grad_norm": 0.6842226982116699, + "kl": 0.23155632801353931, + "learning_rate": 4.92688646363187e-06, + "loss": 0.0093, + "num_tokens": 3743296.0, + "reward": 0.7818603515625, + "reward_std": 0.011189782060682774, + "rewards//mean": 0.7818603515625, + "rewards//std": 0.033827103674411774, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0868, + "grad_norm": 0.6330403089523315, + "kl": 0.21206197701394558, + "learning_rate": 4.926505058329184e-06, + "loss": 0.0085, + "num_tokens": 3752000.0, + "reward": 0.7442626953125, + "reward_std": 0.01008138619363308, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.02858632244169712, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.087, + "grad_norm": 0.877251386642456, + "kl": 0.22307201102375984, + "learning_rate": 4.9261226756305495e-06, + "loss": 0.0089, + "num_tokens": 3760584.0, + "reward": 0.71490478515625, + "reward_std": 0.006664145737886429, + "rewards//mean": 0.71490478515625, + "rewards//std": 0.04637888818979263, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0872, + "grad_norm": 0.6964420676231384, + "kl": 0.2119651883840561, + "learning_rate": 4.925739315689991e-06, + "loss": 0.0085, + "num_tokens": 3769168.0, + "reward": 0.7415771484375, + "reward_std": 0.00940138753503561, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.03519562631845474, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0874, + "grad_norm": 0.737678587436676, + "kl": 0.22590421885252, + "learning_rate": 4.925354978661928e-06, + "loss": 0.009, + "num_tokens": 3777808.0, + "reward": 0.7454833984375, + "reward_std": 0.008858283050358295, + "rewards//mean": 0.7454833984375, + "rewards//std": 0.027649568393826485, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0876, + "grad_norm": 0.6990619897842407, + "kl": 0.21303631737828255, + "learning_rate": 4.924969664701168e-06, + "loss": 0.0085, + "num_tokens": 3786392.0, + "reward": 0.74658203125, + "reward_std": 0.011121334508061409, + "rewards//mean": 0.74658203125, + "rewards//std": 0.025176284834742546, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0878, + "grad_norm": 0.6292837858200073, + "kl": 0.21526087448000908, + "learning_rate": 4.924583373962918e-06, + "loss": 0.0086, + "num_tokens": 3795064.0, + "reward": 0.7734375, + "reward_std": 0.010552143678069115, + "rewards//mean": 0.7734375, + "rewards//std": 0.02722390927374363, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.088, + "grad_norm": 0.6078249216079712, + "kl": 0.24454114213585854, + "learning_rate": 4.924196106602774e-06, + "loss": 0.0098, + "num_tokens": 3803632.0, + "reward": 0.74200439453125, + "reward_std": 0.009583698585629463, + "rewards//mean": 0.74200439453125, + "rewards//std": 0.053606946021318436, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0882, + "grad_norm": 0.7330633997917175, + "kl": 0.2058623656630516, + "learning_rate": 4.9238078627767285e-06, + "loss": 0.0082, + "num_tokens": 3812296.0, + "reward": 0.77801513671875, + "reward_std": 0.01102867629379034, + "rewards//mean": 0.77801513671875, + "rewards//std": 0.026662563905119896, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0884, + "grad_norm": 0.7553256154060364, + "kl": 0.21089842356741428, + "learning_rate": 4.923418642641166e-06, + "loss": 0.0084, + "num_tokens": 3820952.0, + "reward": 0.7528076171875, + "reward_std": 0.011244535446166992, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.039164479821920395, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0886, + "grad_norm": 0.7146939039230347, + "kl": 0.22701359912753105, + "learning_rate": 4.923028446352864e-06, + "loss": 0.0091, + "num_tokens": 3829712.0, + "reward": 0.71051025390625, + "reward_std": 0.016265802085399628, + "rewards//mean": 0.71051025390625, + "rewards//std": 0.048049308359622955, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0888, + "grad_norm": 0.807270884513855, + "kl": 0.25617854483425617, + "learning_rate": 4.922637274068993e-06, + "loss": 0.0102, + "num_tokens": 3838296.0, + "reward": 0.72760009765625, + "reward_std": 0.008829087018966675, + "rewards//mean": 0.72760009765625, + "rewards//std": 0.017555613070726395, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.089, + "grad_norm": 0.8020032048225403, + "kl": 0.2054343856871128, + "learning_rate": 4.9222451259471185e-06, + "loss": 0.0082, + "num_tokens": 3846960.0, + "reward": 0.74591064453125, + "reward_std": 0.008869110606610775, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.029458094388246536, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0892, + "grad_norm": 0.6758546233177185, + "kl": 0.22217164561152458, + "learning_rate": 4.921852002145196e-06, + "loss": 0.0089, + "num_tokens": 3855688.0, + "reward": 0.7484130859375, + "reward_std": 0.008662059903144836, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.0318770706653595, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0894, + "grad_norm": 0.8543888926506042, + "kl": 0.24924145638942719, + "learning_rate": 4.921457902821578e-06, + "loss": 0.01, + "num_tokens": 3864312.0, + "reward": 0.759765625, + "reward_std": 0.017270008102059364, + "rewards//mean": 0.759765625, + "rewards//std": 0.029421651735901833, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0896, + "grad_norm": 0.6530463695526123, + "kl": 0.2602735925465822, + "learning_rate": 4.921062828135006e-06, + "loss": 0.0104, + "num_tokens": 3872992.0, + "reward": 0.75787353515625, + "reward_std": 0.015410242602229118, + "rewards//mean": 0.75787353515625, + "rewards//std": 0.02605554088950157, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0898, + "grad_norm": 0.9578370451927185, + "kl": 0.2606674674898386, + "learning_rate": 4.920666778244616e-06, + "loss": 0.0104, + "num_tokens": 3881712.0, + "reward": 0.77020263671875, + "reward_std": 0.008511267602443695, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.030421772971749306, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.09, + "grad_norm": 0.6777032017707825, + "kl": 0.2329980656504631, + "learning_rate": 4.920269753309937e-06, + "loss": 0.0093, + "num_tokens": 3890312.0, + "reward": 0.7421875, + "reward_std": 0.00921049527823925, + "rewards//mean": 0.7421875, + "rewards//std": 0.031932346522808075, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0902, + "grad_norm": 0.7345518469810486, + "kl": 0.23562784306704998, + "learning_rate": 4.919871753490892e-06, + "loss": 0.0094, + "num_tokens": 3899016.0, + "reward": 0.73236083984375, + "reward_std": 0.009874099865555763, + "rewards//mean": 0.73236083984375, + "rewards//std": 0.04835017770528793, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0904, + "grad_norm": 0.6494887471199036, + "kl": 0.22641596291214228, + "learning_rate": 4.919472778947793e-06, + "loss": 0.0091, + "num_tokens": 3907752.0, + "reward": 0.7728271484375, + "reward_std": 0.008734005503356457, + "rewards//mean": 0.7728271484375, + "rewards//std": 0.019265538081526756, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0906, + "grad_norm": 0.6345562934875488, + "kl": 0.22991832438856363, + "learning_rate": 4.919072829841347e-06, + "loss": 0.0092, + "num_tokens": 3916496.0, + "reward": 0.76580810546875, + "reward_std": 0.009036125615239143, + "rewards//mean": 0.76580810546875, + "rewards//std": 0.03826966881752014, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0908, + "grad_norm": 0.7087083458900452, + "kl": 0.23438394255936146, + "learning_rate": 4.918671906332656e-06, + "loss": 0.0094, + "num_tokens": 3925008.0, + "reward": 0.732666015625, + "reward_std": 0.011662309989333153, + "rewards//mean": 0.732666015625, + "rewards//std": 0.034244511276483536, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.091, + "grad_norm": 0.7872934937477112, + "kl": 0.2746329791843891, + "learning_rate": 4.91827000858321e-06, + "loss": 0.011, + "num_tokens": 3933696.0, + "reward": 0.73626708984375, + "reward_std": 0.007544973865151405, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.03399689868092537, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0912, + "grad_norm": 0.6922155022621155, + "kl": 0.22344142571091652, + "learning_rate": 4.917867136754894e-06, + "loss": 0.0089, + "num_tokens": 3942304.0, + "reward": 0.77667236328125, + "reward_std": 0.013890949077904224, + "rewards//mean": 0.77667236328125, + "rewards//std": 0.032455623149871826, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0914, + "grad_norm": 0.8709951043128967, + "kl": 0.29020215198397636, + "learning_rate": 4.917463291009984e-06, + "loss": 0.0116, + "num_tokens": 3950968.0, + "reward": 0.750244140625, + "reward_std": 0.00925417710095644, + "rewards//mean": 0.750244140625, + "rewards//std": 0.02082025073468685, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0916, + "grad_norm": 0.7772853374481201, + "kl": 0.2596856001764536, + "learning_rate": 4.917058471511149e-06, + "loss": 0.0104, + "num_tokens": 3959640.0, + "reward": 0.74822998046875, + "reward_std": 0.012660522013902664, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.03531729802489281, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0918, + "grad_norm": 0.9631777405738831, + "kl": 0.2647391799837351, + "learning_rate": 4.916652678421451e-06, + "loss": 0.0106, + "num_tokens": 3968256.0, + "reward": 0.76495361328125, + "reward_std": 0.008287420496344566, + "rewards//mean": 0.76495361328125, + "rewards//std": 0.021716255694627762, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.092, + "grad_norm": 0.6951785683631897, + "kl": 0.24628694355487823, + "learning_rate": 4.916245911904344e-06, + "loss": 0.0099, + "num_tokens": 3977024.0, + "reward": 0.772216796875, + "reward_std": 0.012418104335665703, + "rewards//mean": 0.772216796875, + "rewards//std": 0.024899380281567574, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0922, + "grad_norm": 0.7456372976303101, + "kl": 0.25542332231998444, + "learning_rate": 4.9158381721236715e-06, + "loss": 0.0102, + "num_tokens": 3985664.0, + "reward": 0.72442626953125, + "reward_std": 0.011360350996255875, + "rewards//mean": 0.72442626953125, + "rewards//std": 0.03637805953621864, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0924, + "grad_norm": 0.7007219791412354, + "kl": 0.2571563795208931, + "learning_rate": 4.915429459243673e-06, + "loss": 0.0103, + "num_tokens": 3994480.0, + "reward": 0.75836181640625, + "reward_std": 0.00619722343981266, + "rewards//mean": 0.75836181640625, + "rewards//std": 0.025561677291989326, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0926, + "grad_norm": 0.6913952231407166, + "kl": 0.2788440138101578, + "learning_rate": 4.9150197734289764e-06, + "loss": 0.0112, + "num_tokens": 4003112.0, + "reward": 0.7447509765625, + "reward_std": 0.007206355221569538, + "rewards//mean": 0.7447509765625, + "rewards//std": 0.03733934462070465, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0928, + "grad_norm": 0.675417423248291, + "kl": 0.23573023825883865, + "learning_rate": 4.9146091148446055e-06, + "loss": 0.0094, + "num_tokens": 4011720.0, + "reward": 0.7464599609375, + "reward_std": 0.015771940350532532, + "rewards//mean": 0.7464599609375, + "rewards//std": 0.04249957576394081, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.093, + "grad_norm": 0.7302259802818298, + "kl": 0.27733396738767624, + "learning_rate": 4.91419748365597e-06, + "loss": 0.0111, + "num_tokens": 4020384.0, + "reward": 0.734130859375, + "reward_std": 0.007520634680986404, + "rewards//mean": 0.734130859375, + "rewards//std": 0.03848075866699219, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0932, + "grad_norm": 0.7138959169387817, + "kl": 0.2417994262650609, + "learning_rate": 4.9137848800288775e-06, + "loss": 0.0097, + "num_tokens": 4029016.0, + "reward": 0.75616455078125, + "reward_std": 0.010032439604401588, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.02122269943356514, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0934, + "grad_norm": 0.7322536706924438, + "kl": 0.27695489302277565, + "learning_rate": 4.9133713041295235e-06, + "loss": 0.0111, + "num_tokens": 4037688.0, + "reward": 0.7783203125, + "reward_std": 0.009051885455846786, + "rewards//mean": 0.7783203125, + "rewards//std": 0.03194751217961311, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0936, + "grad_norm": 0.8252130746841431, + "kl": 0.26474233716726303, + "learning_rate": 4.912956756124498e-06, + "loss": 0.0106, + "num_tokens": 4046304.0, + "reward": 0.7481689453125, + "reward_std": 0.0064436085522174835, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.0320381224155426, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0938, + "grad_norm": 0.6932944059371948, + "kl": 0.27318440936505795, + "learning_rate": 4.912541236180779e-06, + "loss": 0.0109, + "num_tokens": 4054992.0, + "reward": 0.71624755859375, + "reward_std": 0.009076687507331371, + "rewards//mean": 0.71624755859375, + "rewards//std": 0.03491336852312088, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.094, + "grad_norm": 0.6723066568374634, + "kl": 0.2577223964035511, + "learning_rate": 4.9121247444657384e-06, + "loss": 0.0103, + "num_tokens": 4063624.0, + "reward": 0.7515869140625, + "reward_std": 0.005873112007975578, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.02327904850244522, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0942, + "grad_norm": 0.8410773277282715, + "kl": 0.2808041274547577, + "learning_rate": 4.91170728114714e-06, + "loss": 0.0112, + "num_tokens": 4072256.0, + "reward": 0.76995849609375, + "reward_std": 0.005484414286911488, + "rewards//mean": 0.76995849609375, + "rewards//std": 0.022884486243128777, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0944, + "grad_norm": 0.6858919262886047, + "kl": 0.2676295880228281, + "learning_rate": 4.911288846393136e-06, + "loss": 0.0107, + "num_tokens": 4080920.0, + "reward": 0.742431640625, + "reward_std": 0.012072248384356499, + "rewards//mean": 0.742431640625, + "rewards//std": 0.022215967997908592, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0946, + "grad_norm": 0.6118029356002808, + "kl": 0.21254741679877043, + "learning_rate": 4.910869440372274e-06, + "loss": 0.0085, + "num_tokens": 4089496.0, + "reward": 0.739990234375, + "reward_std": 0.01614592783153057, + "rewards//mean": 0.739990234375, + "rewards//std": 0.03497926890850067, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0948, + "grad_norm": 0.7061100602149963, + "kl": 0.25430491380393505, + "learning_rate": 4.910449063253489e-06, + "loss": 0.0102, + "num_tokens": 4098072.0, + "reward": 0.75494384765625, + "reward_std": 0.01261814869940281, + "rewards//mean": 0.75494384765625, + "rewards//std": 0.03197965770959854, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.095, + "grad_norm": 0.6372392773628235, + "kl": 0.29845254495739937, + "learning_rate": 4.9100277152061105e-06, + "loss": 0.0119, + "num_tokens": 4106672.0, + "reward": 0.7095947265625, + "reward_std": 0.009667182341217995, + "rewards//mean": 0.7095947265625, + "rewards//std": 0.045762356370687485, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0952, + "grad_norm": 0.6701680421829224, + "kl": 0.25746934674680233, + "learning_rate": 4.9096053963998555e-06, + "loss": 0.0103, + "num_tokens": 4115384.0, + "reward": 0.742919921875, + "reward_std": 0.01074405387043953, + "rewards//mean": 0.742919921875, + "rewards//std": 0.027916818857192993, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0954, + "grad_norm": 0.8074530363082886, + "kl": 0.2788533419370651, + "learning_rate": 4.909182107004835e-06, + "loss": 0.0112, + "num_tokens": 4124064.0, + "reward": 0.73681640625, + "reward_std": 0.006065527442842722, + "rewards//mean": 0.73681640625, + "rewards//std": 0.03234307840466499, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0956, + "grad_norm": 0.7189854979515076, + "kl": 0.27882962487637997, + "learning_rate": 4.908757847191551e-06, + "loss": 0.0112, + "num_tokens": 4132704.0, + "reward": 0.74261474609375, + "reward_std": 0.01600596494972706, + "rewards//mean": 0.74261474609375, + "rewards//std": 0.043848197907209396, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0958, + "grad_norm": 0.7320364117622375, + "kl": 0.2482055276632309, + "learning_rate": 4.908332617130893e-06, + "loss": 0.0099, + "num_tokens": 4141304.0, + "reward": 0.76947021484375, + "reward_std": 0.007574299816042185, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.030989211052656174, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.096, + "grad_norm": 0.735087513923645, + "kl": 0.24100211029872298, + "learning_rate": 4.907906416994146e-06, + "loss": 0.0096, + "num_tokens": 4149840.0, + "reward": 0.77349853515625, + "reward_std": 0.013887856155633926, + "rewards//mean": 0.77349853515625, + "rewards//std": 0.03502505645155907, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0962, + "grad_norm": 0.693162739276886, + "kl": 0.27953667007386684, + "learning_rate": 4.907479246952981e-06, + "loss": 0.0112, + "num_tokens": 4158448.0, + "reward": 0.718017578125, + "reward_std": 0.00825615506619215, + "rewards//mean": 0.718017578125, + "rewards//std": 0.03687366470694542, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0964, + "grad_norm": 0.6603836417198181, + "kl": 0.23947274032980204, + "learning_rate": 4.907051107179464e-06, + "loss": 0.0096, + "num_tokens": 4167072.0, + "reward": 0.748046875, + "reward_std": 0.013273896649479866, + "rewards//mean": 0.748046875, + "rewards//std": 0.0389009565114975, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0966, + "grad_norm": 0.6945295929908752, + "kl": 0.2583098318427801, + "learning_rate": 4.9066219978460485e-06, + "loss": 0.0103, + "num_tokens": 4175656.0, + "reward": 0.763916015625, + "reward_std": 0.007394441869109869, + "rewards//mean": 0.763916015625, + "rewards//std": 0.03223337233066559, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0968, + "grad_norm": 0.6192371845245361, + "kl": 0.2828360088169575, + "learning_rate": 4.90619191912558e-06, + "loss": 0.0113, + "num_tokens": 4184232.0, + "reward": 0.7418212890625, + "reward_std": 0.00855693407356739, + "rewards//mean": 0.7418212890625, + "rewards//std": 0.02327904850244522, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.097, + "grad_norm": 0.6635370850563049, + "kl": 0.2756698988378048, + "learning_rate": 4.905760871191295e-06, + "loss": 0.011, + "num_tokens": 4192904.0, + "reward": 0.71905517578125, + "reward_std": 0.007833951152861118, + "rewards//mean": 0.71905517578125, + "rewards//std": 0.04106537625193596, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0972, + "grad_norm": 0.669340968132019, + "kl": 0.2513146288692951, + "learning_rate": 4.9053288542168185e-06, + "loss": 0.0101, + "num_tokens": 4201520.0, + "reward": 0.7288818359375, + "reward_std": 0.011037398129701614, + "rewards//mean": 0.7288818359375, + "rewards//std": 0.022028131410479546, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0974, + "grad_norm": 0.6306434273719788, + "kl": 0.2504331525415182, + "learning_rate": 4.904895868376167e-06, + "loss": 0.01, + "num_tokens": 4210128.0, + "reward": 0.76690673828125, + "reward_std": 0.010853402316570282, + "rewards//mean": 0.76690673828125, + "rewards//std": 0.03223472461104393, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0976, + "grad_norm": 0.6316109299659729, + "kl": 0.24033397436141968, + "learning_rate": 4.904461913843747e-06, + "loss": 0.0096, + "num_tokens": 4218696.0, + "reward": 0.73101806640625, + "reward_std": 0.009801981970667839, + "rewards//mean": 0.73101806640625, + "rewards//std": 0.033093471080064774, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0978, + "grad_norm": 0.692654013633728, + "kl": 0.30100347846746445, + "learning_rate": 4.904026990794356e-06, + "loss": 0.012, + "num_tokens": 4227304.0, + "reward": 0.7596435546875, + "reward_std": 0.009308423846960068, + "rewards//mean": 0.7596435546875, + "rewards//std": 0.032158851623535156, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.098, + "grad_norm": 0.6925479769706726, + "kl": 0.26741546019911766, + "learning_rate": 4.903591099403181e-06, + "loss": 0.0107, + "num_tokens": 4235968.0, + "reward": 0.7447509765625, + "reward_std": 0.01731371134519577, + "rewards//mean": 0.7447509765625, + "rewards//std": 0.035796087235212326, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0982, + "grad_norm": 0.5992981791496277, + "kl": 0.24271851778030396, + "learning_rate": 4.903154239845798e-06, + "loss": 0.0097, + "num_tokens": 4244616.0, + "reward": 0.73974609375, + "reward_std": 0.010727489367127419, + "rewards//mean": 0.73974609375, + "rewards//std": 0.02847197651863098, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0984, + "grad_norm": 0.6184040307998657, + "kl": 0.24116972647607327, + "learning_rate": 4.902716412298174e-06, + "loss": 0.0096, + "num_tokens": 4253168.0, + "reward": 0.72052001953125, + "reward_std": 0.011508103460073471, + "rewards//mean": 0.72052001953125, + "rewards//std": 0.03347956761717796, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0986, + "grad_norm": 0.6333391070365906, + "kl": 0.2607311652973294, + "learning_rate": 4.902277616936667e-06, + "loss": 0.0104, + "num_tokens": 4261768.0, + "reward": 0.7559814453125, + "reward_std": 0.009352114051580429, + "rewards//mean": 0.7559814453125, + "rewards//std": 0.030678750947117805, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0988, + "grad_norm": 0.700835108757019, + "kl": 0.2442559963092208, + "learning_rate": 4.901837853938024e-06, + "loss": 0.0098, + "num_tokens": 4270328.0, + "reward": 0.7232666015625, + "reward_std": 0.011704965494573116, + "rewards//mean": 0.7232666015625, + "rewards//std": 0.039587363600730896, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.099, + "grad_norm": 0.687675952911377, + "kl": 0.28429468162357807, + "learning_rate": 4.90139712347938e-06, + "loss": 0.0114, + "num_tokens": 4279016.0, + "reward": 0.74224853515625, + "reward_std": 0.006493415683507919, + "rewards//mean": 0.74224853515625, + "rewards//std": 0.02292017824947834, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0992, + "grad_norm": 0.6724541187286377, + "kl": 0.2627929784357548, + "learning_rate": 4.900955425738262e-06, + "loss": 0.0105, + "num_tokens": 4287632.0, + "reward": 0.73876953125, + "reward_std": 0.008314857259392738, + "rewards//mean": 0.73876953125, + "rewards//std": 0.032192960381507874, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0994, + "grad_norm": 0.6971147656440735, + "kl": 0.2989068515598774, + "learning_rate": 4.900512760892585e-06, + "loss": 0.012, + "num_tokens": 4296256.0, + "reward": 0.73370361328125, + "reward_std": 0.009461992420256138, + "rewards//mean": 0.73370361328125, + "rewards//std": 0.025719311088323593, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0996, + "grad_norm": 0.7619755864143372, + "kl": 0.28933385387063026, + "learning_rate": 4.900069129120656e-06, + "loss": 0.0116, + "num_tokens": 4304944.0, + "reward": 0.7659912109375, + "reward_std": 0.009017490781843662, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.03981613367795944, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0998, + "grad_norm": 0.7067105174064636, + "kl": 0.2626380883157253, + "learning_rate": 4.899624530601168e-06, + "loss": 0.0105, + "num_tokens": 4313560.0, + "reward": 0.72686767578125, + "reward_std": 0.01192802656441927, + "rewards//mean": 0.72686767578125, + "rewards//std": 0.037255480885505676, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1, + "grad_norm": 0.7758896946907043, + "kl": 0.28313127160072327, + "learning_rate": 4.899178965513206e-06, + "loss": 0.0113, + "num_tokens": 4322208.0, + "reward": 0.76763916015625, + "reward_std": 0.0051316977478563786, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.022271685302257538, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1002, + "grad_norm": 0.7543128132820129, + "kl": 0.29646800085902214, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.0119, + "num_tokens": 4330888.0, + "reward": 0.74884033203125, + "reward_std": 0.007682453375309706, + "rewards//mean": 0.74884033203125, + "rewards//std": 0.027051476761698723, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1004, + "grad_norm": 0.7896819114685059, + "kl": 0.33157695457339287, + "learning_rate": 4.898284936350144e-06, + "loss": 0.0133, + "num_tokens": 4339480.0, + "reward": 0.732666015625, + "reward_std": 0.007414411753416061, + "rewards//mean": 0.732666015625, + "rewards//std": 0.03544698655605316, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1006, + "grad_norm": 0.6674100160598755, + "kl": 0.26767886988818645, + "learning_rate": 4.897836472635159e-06, + "loss": 0.0107, + "num_tokens": 4348208.0, + "reward": 0.71868896484375, + "reward_std": 0.0074168480932712555, + "rewards//mean": 0.71868896484375, + "rewards//std": 0.03925728052854538, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1008, + "grad_norm": 0.7596335411071777, + "kl": 0.2823500316590071, + "learning_rate": 4.89738704307193e-06, + "loss": 0.0113, + "num_tokens": 4356904.0, + "reward": 0.748046875, + "reward_std": 0.005976288113743067, + "rewards//mean": 0.748046875, + "rewards//std": 0.02426011860370636, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.101, + "grad_norm": 0.7461748123168945, + "kl": 0.2936761640012264, + "learning_rate": 4.896936647841485e-06, + "loss": 0.0117, + "num_tokens": 4365528.0, + "reward": 0.75103759765625, + "reward_std": 0.011169584468007088, + "rewards//mean": 0.75103759765625, + "rewards//std": 0.030783364549279213, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1012, + "grad_norm": 0.7608941197395325, + "kl": 0.28775020502507687, + "learning_rate": 4.896485287125247e-06, + "loss": 0.0115, + "num_tokens": 4374120.0, + "reward": 0.73614501953125, + "reward_std": 0.008655678480863571, + "rewards//mean": 0.73614501953125, + "rewards//std": 0.02487071417272091, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1014, + "grad_norm": 0.8823245167732239, + "kl": 0.2823618911206722, + "learning_rate": 4.896032961105021e-06, + "loss": 0.0113, + "num_tokens": 4382808.0, + "reward": 0.75872802734375, + "reward_std": 0.007658984046429396, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.02521706186234951, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1016, + "grad_norm": 0.6119314432144165, + "kl": 0.2920355089008808, + "learning_rate": 4.8955796699630045e-06, + "loss": 0.0117, + "num_tokens": 4391536.0, + "reward": 0.767578125, + "reward_std": 0.006091872230172157, + "rewards//mean": 0.767578125, + "rewards//std": 0.02380661852657795, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1018, + "grad_norm": 0.6585849523544312, + "kl": 0.2887584716081619, + "learning_rate": 4.895125413881783e-06, + "loss": 0.0116, + "num_tokens": 4400152.0, + "reward": 0.748291015625, + "reward_std": 0.005973452236503363, + "rewards//mean": 0.748291015625, + "rewards//std": 0.030804071575403214, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.102, + "grad_norm": 0.7349271178245544, + "kl": 0.27199224196374416, + "learning_rate": 4.894670193044332e-06, + "loss": 0.0109, + "num_tokens": 4408712.0, + "reward": 0.72711181640625, + "reward_std": 0.0059516276232898235, + "rewards//mean": 0.72711181640625, + "rewards//std": 0.03458928316831589, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1022, + "grad_norm": 0.6272158026695251, + "kl": 0.29213687032461166, + "learning_rate": 4.894214007634014e-06, + "loss": 0.0117, + "num_tokens": 4417376.0, + "reward": 0.763427734375, + "reward_std": 0.00552313681691885, + "rewards//mean": 0.763427734375, + "rewards//std": 0.02124631404876709, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1024, + "grad_norm": 0.7491189241409302, + "kl": 0.2788054086267948, + "learning_rate": 4.893756857834579e-06, + "loss": 0.0112, + "num_tokens": 4426096.0, + "reward": 0.75390625, + "reward_std": 0.015128916129469872, + "rewards//mean": 0.75390625, + "rewards//std": 0.04438012093305588, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1026, + "grad_norm": 0.598261833190918, + "kl": 0.2626344934105873, + "learning_rate": 4.893298743830168e-06, + "loss": 0.0105, + "num_tokens": 4434696.0, + "reward": 0.74615478515625, + "reward_std": 0.012900955975055695, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.03896622732281685, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1028, + "grad_norm": 0.6159838438034058, + "kl": 0.2789937425404787, + "learning_rate": 4.89283966580531e-06, + "loss": 0.0112, + "num_tokens": 4443352.0, + "reward": 0.7607421875, + "reward_std": 0.0056966980919241905, + "rewards//mean": 0.7607421875, + "rewards//std": 0.02111050859093666, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.103, + "grad_norm": 0.6811221241950989, + "kl": 0.2887964155524969, + "learning_rate": 4.8923796239449206e-06, + "loss": 0.0116, + "num_tokens": 4452008.0, + "reward": 0.76751708984375, + "reward_std": 0.006497236434370279, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.029380397871136665, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1032, + "grad_norm": 0.5849115252494812, + "kl": 0.2598783317953348, + "learning_rate": 4.891918618434305e-06, + "loss": 0.0104, + "num_tokens": 4460584.0, + "reward": 0.7503662109375, + "reward_std": 0.006470596417784691, + "rewards//mean": 0.7503662109375, + "rewards//std": 0.020818432793021202, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1034, + "grad_norm": 0.774935245513916, + "kl": 0.2854543272405863, + "learning_rate": 4.891456649459156e-06, + "loss": 0.0114, + "num_tokens": 4469208.0, + "reward": 0.76519775390625, + "reward_std": 0.008341135457158089, + "rewards//mean": 0.76519775390625, + "rewards//std": 0.025450101122260094, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1036, + "grad_norm": 0.7881346940994263, + "kl": 0.3002116158604622, + "learning_rate": 4.890993717205553e-06, + "loss": 0.012, + "num_tokens": 4477880.0, + "reward": 0.7275390625, + "reward_std": 0.007611682638525963, + "rewards//mean": 0.7275390625, + "rewards//std": 0.026013750582933426, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1038, + "grad_norm": 0.7925445437431335, + "kl": 0.2746951151639223, + "learning_rate": 4.8905298218599685e-06, + "loss": 0.011, + "num_tokens": 4486520.0, + "reward": 0.753173828125, + "reward_std": 0.00768206175416708, + "rewards//mean": 0.753173828125, + "rewards//std": 0.03539228066802025, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.104, + "grad_norm": 0.649895191192627, + "kl": 0.3011128604412079, + "learning_rate": 4.8900649636092565e-06, + "loss": 0.012, + "num_tokens": 4495064.0, + "reward": 0.76141357421875, + "reward_std": 0.008288520388305187, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.029034705832600594, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1042, + "grad_norm": 0.6826982498168945, + "kl": 0.27436352148652077, + "learning_rate": 4.889599142640663e-06, + "loss": 0.011, + "num_tokens": 4503728.0, + "reward": 0.73187255859375, + "reward_std": 0.007811560295522213, + "rewards//mean": 0.73187255859375, + "rewards//std": 0.031737327575683594, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1044, + "grad_norm": 0.7717922329902649, + "kl": 0.2864828519523144, + "learning_rate": 4.889132359141822e-06, + "loss": 0.0115, + "num_tokens": 4512344.0, + "reward": 0.76220703125, + "reward_std": 0.005440828390419483, + "rewards//mean": 0.76220703125, + "rewards//std": 0.02468077465891838, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1046, + "grad_norm": 0.6896048784255981, + "kl": 0.27172659896314144, + "learning_rate": 4.888664613300751e-06, + "loss": 0.0109, + "num_tokens": 4520896.0, + "reward": 0.74566650390625, + "reward_std": 0.008871195837855339, + "rewards//mean": 0.74566650390625, + "rewards//std": 0.02351093478500843, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1048, + "grad_norm": 0.7394423484802246, + "kl": 0.2659613937139511, + "learning_rate": 4.888195905305859e-06, + "loss": 0.0106, + "num_tokens": 4529480.0, + "reward": 0.73236083984375, + "reward_std": 0.012172890827059746, + "rewards//mean": 0.73236083984375, + "rewards//std": 0.03177260234951973, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.105, + "grad_norm": 0.6426916122436523, + "kl": 0.27845061011612415, + "learning_rate": 4.887726235345943e-06, + "loss": 0.0111, + "num_tokens": 4538064.0, + "reward": 0.76239013671875, + "reward_std": 0.008901185356080532, + "rewards//mean": 0.76239013671875, + "rewards//std": 0.023638714104890823, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1052, + "grad_norm": 0.6519190669059753, + "kl": 0.25384664349257946, + "learning_rate": 4.8872556036101845e-06, + "loss": 0.0102, + "num_tokens": 4546688.0, + "reward": 0.77862548828125, + "reward_std": 0.009897831827402115, + "rewards//mean": 0.77862548828125, + "rewards//std": 0.029281822964549065, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1054, + "grad_norm": 0.7181262969970703, + "kl": 0.26998190581798553, + "learning_rate": 4.886784010288155e-06, + "loss": 0.0108, + "num_tokens": 4555432.0, + "reward": 0.78851318359375, + "reward_std": 0.011629641987383366, + "rewards//mean": 0.78851318359375, + "rewards//std": 0.025476258248090744, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1056, + "grad_norm": 0.6040878295898438, + "kl": 0.28468160331249237, + "learning_rate": 4.886311455569811e-06, + "loss": 0.0114, + "num_tokens": 4564064.0, + "reward": 0.76861572265625, + "reward_std": 0.0060232048854231834, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.024147462099790573, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1058, + "grad_norm": 0.6055912375450134, + "kl": 0.2735178656876087, + "learning_rate": 4.885837939645499e-06, + "loss": 0.0109, + "num_tokens": 4572752.0, + "reward": 0.75860595703125, + "reward_std": 0.005819528363645077, + "rewards//mean": 0.75860595703125, + "rewards//std": 0.025687508285045624, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.106, + "grad_norm": 0.7213678956031799, + "kl": 0.2826583944261074, + "learning_rate": 4.885363462705949e-06, + "loss": 0.0113, + "num_tokens": 4581432.0, + "reward": 0.7806396484375, + "reward_std": 0.005271364934742451, + "rewards//mean": 0.7806396484375, + "rewards//std": 0.024071883410215378, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1062, + "grad_norm": 0.6324445605278015, + "kl": 0.30165957659482956, + "learning_rate": 4.884888024942282e-06, + "loss": 0.0121, + "num_tokens": 4590056.0, + "reward": 0.7506103515625, + "reward_std": 0.0039962828159332275, + "rewards//mean": 0.7506103515625, + "rewards//std": 0.029759034514427185, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1064, + "grad_norm": 0.6357660293579102, + "kl": 0.2692461237311363, + "learning_rate": 4.884411626546004e-06, + "loss": 0.0108, + "num_tokens": 4598776.0, + "reward": 0.73028564453125, + "reward_std": 0.006139049306511879, + "rewards//mean": 0.73028564453125, + "rewards//std": 0.020942674949765205, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1066, + "grad_norm": 0.9847143292427063, + "kl": 0.30338218063116074, + "learning_rate": 4.883934267709007e-06, + "loss": 0.0121, + "num_tokens": 4607464.0, + "reward": 0.77239990234375, + "reward_std": 0.005770155228674412, + "rewards//mean": 0.77239990234375, + "rewards//std": 0.030767623335123062, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1068, + "grad_norm": 0.8446683883666992, + "kl": 0.3181992806494236, + "learning_rate": 4.883455948623574e-06, + "loss": 0.0127, + "num_tokens": 4616104.0, + "reward": 0.7333984375, + "reward_std": 0.009834567084908485, + "rewards//mean": 0.7333984375, + "rewards//std": 0.032324355095624924, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.107, + "grad_norm": 0.7224143743515015, + "kl": 0.2985565960407257, + "learning_rate": 4.882976669482368e-06, + "loss": 0.0119, + "num_tokens": 4624760.0, + "reward": 0.7509765625, + "reward_std": 0.006324178539216518, + "rewards//mean": 0.7509765625, + "rewards//std": 0.030273688957095146, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1072, + "grad_norm": 0.6489189863204956, + "kl": 0.2836180441081524, + "learning_rate": 4.882496430478445e-06, + "loss": 0.0113, + "num_tokens": 4633392.0, + "reward": 0.7471923828125, + "reward_std": 0.00982951931655407, + "rewards//mean": 0.7471923828125, + "rewards//std": 0.027405409142374992, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1074, + "grad_norm": 0.711584210395813, + "kl": 0.2708862889558077, + "learning_rate": 4.882015231805245e-06, + "loss": 0.0108, + "num_tokens": 4642072.0, + "reward": 0.7578125, + "reward_std": 0.008214281871914864, + "rewards//mean": 0.7578125, + "rewards//std": 0.027891865000128746, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1076, + "grad_norm": 0.6661033034324646, + "kl": 0.26852802373468876, + "learning_rate": 4.881533073656594e-06, + "loss": 0.0107, + "num_tokens": 4650720.0, + "reward": 0.75445556640625, + "reward_std": 0.007886864244937897, + "rewards//mean": 0.75445556640625, + "rewards//std": 0.028012022376060486, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1078, + "grad_norm": 0.7064325213432312, + "kl": 0.2860189266502857, + "learning_rate": 4.8810499562267066e-06, + "loss": 0.0114, + "num_tokens": 4659312.0, + "reward": 0.769287109375, + "reward_std": 0.006793664768338203, + "rewards//mean": 0.769287109375, + "rewards//std": 0.02851766347885132, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.108, + "grad_norm": 0.6437175869941711, + "kl": 0.25416005309671164, + "learning_rate": 4.88056587971018e-06, + "loss": 0.0102, + "num_tokens": 4667896.0, + "reward": 0.7423095703125, + "reward_std": 0.007785624824464321, + "rewards//mean": 0.7423095703125, + "rewards//std": 0.028713131323456764, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1082, + "grad_norm": 0.5998038053512573, + "kl": 0.2681788485497236, + "learning_rate": 4.880080844302004e-06, + "loss": 0.0107, + "num_tokens": 4676488.0, + "reward": 0.75537109375, + "reward_std": 0.006692672614008188, + "rewards//mean": 0.75537109375, + "rewards//std": 0.020033232867717743, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1084, + "grad_norm": 0.6596677899360657, + "kl": 0.24427864141762257, + "learning_rate": 4.879594850197548e-06, + "loss": 0.0098, + "num_tokens": 4685128.0, + "reward": 0.7476806640625, + "reward_std": 0.010514110326766968, + "rewards//mean": 0.7476806640625, + "rewards//std": 0.03853638097643852, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1086, + "grad_norm": 0.6608676910400391, + "kl": 0.259847916662693, + "learning_rate": 4.87910789759257e-06, + "loss": 0.0104, + "num_tokens": 4693824.0, + "reward": 0.75299072265625, + "reward_std": 0.005296648014336824, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.048891909420490265, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1088, + "grad_norm": 0.6944065093994141, + "kl": 0.2824724651873112, + "learning_rate": 4.878619986683215e-06, + "loss": 0.0113, + "num_tokens": 4702472.0, + "reward": 0.750244140625, + "reward_std": 0.007567228749394417, + "rewards//mean": 0.750244140625, + "rewards//std": 0.03861897811293602, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.109, + "grad_norm": 0.6676400303840637, + "kl": 0.26609333604574203, + "learning_rate": 4.8781311176660144e-06, + "loss": 0.0106, + "num_tokens": 4711104.0, + "reward": 0.779052734375, + "reward_std": 0.010382162407040596, + "rewards//mean": 0.779052734375, + "rewards//std": 0.02472366951406002, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1092, + "grad_norm": 0.6957513689994812, + "kl": 0.2683636359870434, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.0107, + "num_tokens": 4719688.0, + "reward": 0.71490478515625, + "reward_std": 0.008024358190596104, + "rewards//mean": 0.71490478515625, + "rewards//std": 0.037217672914266586, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1094, + "grad_norm": 0.6461659073829651, + "kl": 0.2612381912767887, + "learning_rate": 4.877150506096127e-06, + "loss": 0.0104, + "num_tokens": 4728272.0, + "reward": 0.7481689453125, + "reward_std": 0.006405083928257227, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.02294633351266384, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1096, + "grad_norm": 0.6994146704673767, + "kl": 0.2628567796200514, + "learning_rate": 4.8766587639384285e-06, + "loss": 0.0105, + "num_tokens": 4736888.0, + "reward": 0.74835205078125, + "reward_std": 0.006025420036166906, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.028795981779694557, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1098, + "grad_norm": 0.5853991508483887, + "kl": 0.25532799772918224, + "learning_rate": 4.876166064462866e-06, + "loss": 0.0102, + "num_tokens": 4745456.0, + "reward": 0.7703857421875, + "reward_std": 0.009814901277422905, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.02585000917315483, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.11, + "grad_norm": 0.6331080198287964, + "kl": 0.2674791272729635, + "learning_rate": 4.8756724078678955e-06, + "loss": 0.0107, + "num_tokens": 4754000.0, + "reward": 0.7891845703125, + "reward_std": 0.01081712357699871, + "rewards//mean": 0.7891845703125, + "rewards//std": 0.023632710799574852, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1102, + "grad_norm": 0.789883553981781, + "kl": 0.259792210534215, + "learning_rate": 4.875177794352364e-06, + "loss": 0.0104, + "num_tokens": 4762808.0, + "reward": 0.7608642578125, + "reward_std": 0.006845149677246809, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.0270137470215559, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1104, + "grad_norm": 0.6203144788742065, + "kl": 0.2684528976678848, + "learning_rate": 4.8746822241155006e-06, + "loss": 0.0107, + "num_tokens": 4771488.0, + "reward": 0.7344970703125, + "reward_std": 0.007759134750813246, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.02165386639535427, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1106, + "grad_norm": 0.8759577870368958, + "kl": 0.2989007495343685, + "learning_rate": 4.874185697356921e-06, + "loss": 0.012, + "num_tokens": 4780144.0, + "reward": 0.7337646484375, + "reward_std": 0.009506863541901112, + "rewards//mean": 0.7337646484375, + "rewards//std": 0.02997797727584839, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1108, + "grad_norm": 0.5836766958236694, + "kl": 0.25084237568080425, + "learning_rate": 4.873688214276628e-06, + "loss": 0.01, + "num_tokens": 4788744.0, + "reward": 0.71270751953125, + "reward_std": 0.008568651042878628, + "rewards//mean": 0.71270751953125, + "rewards//std": 0.03878828510642052, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.111, + "grad_norm": 0.6924554109573364, + "kl": 0.2806801497936249, + "learning_rate": 4.873189775075005e-06, + "loss": 0.0112, + "num_tokens": 4797400.0, + "reward": 0.734375, + "reward_std": 0.0058593968860805035, + "rewards//mean": 0.734375, + "rewards//std": 0.03078145906329155, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1112, + "grad_norm": 0.7575480341911316, + "kl": 0.2851990181952715, + "learning_rate": 4.872690379952824e-06, + "loss": 0.0114, + "num_tokens": 4806144.0, + "reward": 0.79217529296875, + "reward_std": 0.008231771178543568, + "rewards//mean": 0.79217529296875, + "rewards//std": 0.02772795408964157, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1114, + "grad_norm": 0.625200092792511, + "kl": 0.26935046166181564, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.0108, + "num_tokens": 4814792.0, + "reward": 0.762939453125, + "reward_std": 0.00905733834952116, + "rewards//mean": 0.762939453125, + "rewards//std": 0.03055935725569725, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1116, + "grad_norm": 1.0318313837051392, + "kl": 0.31965751200914383, + "learning_rate": 4.871688722751799e-06, + "loss": 0.0128, + "num_tokens": 4823488.0, + "reward": 0.77117919921875, + "reward_std": 0.008040939457714558, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.032330382615327835, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1118, + "grad_norm": 0.6006855964660645, + "kl": 0.24019759241491556, + "learning_rate": 4.8711864610764235e-06, + "loss": 0.0096, + "num_tokens": 4832160.0, + "reward": 0.76800537109375, + "reward_std": 0.00946258008480072, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.03378697484731674, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.112, + "grad_norm": 0.7878683805465698, + "kl": 0.2661712933331728, + "learning_rate": 4.870683244287425e-06, + "loss": 0.0106, + "num_tokens": 4840776.0, + "reward": 0.720458984375, + "reward_std": 0.008840283378958702, + "rewards//mean": 0.720458984375, + "rewards//std": 0.04082028940320015, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1122, + "grad_norm": 0.83661288022995, + "kl": 0.2538266107439995, + "learning_rate": 4.870179072587499e-06, + "loss": 0.0102, + "num_tokens": 4849480.0, + "reward": 0.7720947265625, + "reward_std": 0.009472770616412163, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.025621790438890457, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1124, + "grad_norm": 0.7142016887664795, + "kl": 0.28109513595700264, + "learning_rate": 4.869673946179726e-06, + "loss": 0.0112, + "num_tokens": 4858184.0, + "reward": 0.764404296875, + "reward_std": 0.00784839503467083, + "rewards//mean": 0.764404296875, + "rewards//std": 0.02929687686264515, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1126, + "grad_norm": 0.7756282091140747, + "kl": 0.27597307227551937, + "learning_rate": 4.8691678652675715e-06, + "loss": 0.011, + "num_tokens": 4866856.0, + "reward": 0.76470947265625, + "reward_std": 0.008997954428195953, + "rewards//mean": 0.76470947265625, + "rewards//std": 0.02482014335691929, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1128, + "grad_norm": 0.6697260141372681, + "kl": 0.25615566223859787, + "learning_rate": 4.8686608300548836e-06, + "loss": 0.0102, + "num_tokens": 4875376.0, + "reward": 0.732421875, + "reward_std": 0.007549830712378025, + "rewards//mean": 0.732421875, + "rewards//std": 0.030193578451871872, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.113, + "grad_norm": 0.7746402621269226, + "kl": 0.2652531825006008, + "learning_rate": 4.868152840745896e-06, + "loss": 0.0106, + "num_tokens": 4884008.0, + "reward": 0.7220458984375, + "reward_std": 0.00881132297217846, + "rewards//mean": 0.7220458984375, + "rewards//std": 0.036915309727191925, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1132, + "grad_norm": 0.7145909667015076, + "kl": 0.2629079818725586, + "learning_rate": 4.8676438975452276e-06, + "loss": 0.0105, + "num_tokens": 4892672.0, + "reward": 0.7686767578125, + "reward_std": 0.008167712949216366, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.02817239984869957, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1134, + "grad_norm": 0.7171170711517334, + "kl": 0.22697240114212036, + "learning_rate": 4.86713400065788e-06, + "loss": 0.0091, + "num_tokens": 4901272.0, + "reward": 0.7269287109375, + "reward_std": 0.009677172638475895, + "rewards//mean": 0.7269287109375, + "rewards//std": 0.038479775190353394, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1136, + "grad_norm": 0.6745283603668213, + "kl": 0.24715841561555862, + "learning_rate": 4.866623150289241e-06, + "loss": 0.0099, + "num_tokens": 4909840.0, + "reward": 0.75732421875, + "reward_std": 0.008682534098625183, + "rewards//mean": 0.75732421875, + "rewards//std": 0.024094825610518456, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1138, + "grad_norm": 0.6382898688316345, + "kl": 0.25850758142769337, + "learning_rate": 4.86611134664508e-06, + "loss": 0.0103, + "num_tokens": 4918384.0, + "reward": 0.7513427734375, + "reward_std": 0.0103992260992527, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.020102253183722496, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.114, + "grad_norm": 0.9113550186157227, + "kl": 0.25279413163661957, + "learning_rate": 4.865598589931552e-06, + "loss": 0.0101, + "num_tokens": 4927096.0, + "reward": 0.7747802734375, + "reward_std": 0.007576893083751202, + "rewards//mean": 0.7747802734375, + "rewards//std": 0.033690787851810455, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1142, + "grad_norm": 0.6302274465560913, + "kl": 0.25698636658489704, + "learning_rate": 4.865084880355193e-06, + "loss": 0.0103, + "num_tokens": 4935776.0, + "reward": 0.73907470703125, + "reward_std": 0.007312727626413107, + "rewards//mean": 0.73907470703125, + "rewards//std": 0.026749854907393456, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1144, + "grad_norm": 0.5283216834068298, + "kl": 0.22627251036465168, + "learning_rate": 4.864570218122928e-06, + "loss": 0.0091, + "num_tokens": 4944400.0, + "reward": 0.73974609375, + "reward_std": 0.01004891935735941, + "rewards//mean": 0.73974609375, + "rewards//std": 0.02998024970293045, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1146, + "grad_norm": 0.6809362769126892, + "kl": 0.24691984988749027, + "learning_rate": 4.864054603442063e-06, + "loss": 0.0099, + "num_tokens": 4953000.0, + "reward": 0.75897216796875, + "reward_std": 0.009808078408241272, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.025493483990430832, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1148, + "grad_norm": 0.6428877711296082, + "kl": 0.23521161265671253, + "learning_rate": 4.863538036520285e-06, + "loss": 0.0094, + "num_tokens": 4961696.0, + "reward": 0.7613525390625, + "reward_std": 0.010448819026350975, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.03130976855754852, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.115, + "grad_norm": 0.6935397982597351, + "kl": 0.23570332117378712, + "learning_rate": 4.863020517565669e-06, + "loss": 0.0094, + "num_tokens": 4970296.0, + "reward": 0.743896484375, + "reward_std": 0.007338499650359154, + "rewards//mean": 0.743896484375, + "rewards//std": 0.024899380281567574, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1152, + "grad_norm": 0.6086986660957336, + "kl": 0.2525392398238182, + "learning_rate": 4.862502046786671e-06, + "loss": 0.0101, + "num_tokens": 4979040.0, + "reward": 0.748779296875, + "reward_std": 0.007959538139402866, + "rewards//mean": 0.748779296875, + "rewards//std": 0.04012007638812065, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1154, + "grad_norm": 0.6362279653549194, + "kl": 0.21946440637111664, + "learning_rate": 4.861982624392132e-06, + "loss": 0.0088, + "num_tokens": 4987720.0, + "reward": 0.78765869140625, + "reward_std": 0.009038891643285751, + "rewards//mean": 0.78765869140625, + "rewards//std": 0.020556749776005745, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1156, + "grad_norm": 0.6334317326545715, + "kl": 0.2258777841925621, + "learning_rate": 4.861462250591273e-06, + "loss": 0.009, + "num_tokens": 4996352.0, + "reward": 0.719482421875, + "reward_std": 0.009759314358234406, + "rewards//mean": 0.719482421875, + "rewards//std": 0.03258461877703667, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1158, + "grad_norm": 0.585574746131897, + "kl": 0.2085122261196375, + "learning_rate": 4.860940925593703e-06, + "loss": 0.0083, + "num_tokens": 5005168.0, + "reward": 0.7581787109375, + "reward_std": 0.010835596360266209, + "rewards//mean": 0.7581787109375, + "rewards//std": 0.03846088796854019, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.116, + "grad_norm": 0.6512745022773743, + "kl": 0.2308096420019865, + "learning_rate": 4.86041864960941e-06, + "loss": 0.0092, + "num_tokens": 5013744.0, + "reward": 0.75238037109375, + "reward_std": 0.008994483388960361, + "rewards//mean": 0.75238037109375, + "rewards//std": 0.026278842240571976, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1162, + "grad_norm": 0.5345540642738342, + "kl": 0.2112738210707903, + "learning_rate": 4.859895422848767e-06, + "loss": 0.0085, + "num_tokens": 5022304.0, + "reward": 0.73822021484375, + "reward_std": 0.008295232430100441, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.04076458141207695, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1164, + "grad_norm": 0.602761447429657, + "kl": 0.21912161633372307, + "learning_rate": 4.859371245522531e-06, + "loss": 0.0088, + "num_tokens": 5030944.0, + "reward": 0.703369140625, + "reward_std": 0.00949187483638525, + "rewards//mean": 0.703369140625, + "rewards//std": 0.028560098260641098, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1166, + "grad_norm": 0.6504238247871399, + "kl": 0.23803219199180603, + "learning_rate": 4.8588461178418375e-06, + "loss": 0.0095, + "num_tokens": 5039632.0, + "reward": 0.7572021484375, + "reward_std": 0.00934354867786169, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.02799776755273342, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1168, + "grad_norm": 0.5838614702224731, + "kl": 0.21050356701016426, + "learning_rate": 4.858320040018212e-06, + "loss": 0.0084, + "num_tokens": 5048256.0, + "reward": 0.75494384765625, + "reward_std": 0.009613383561372757, + "rewards//mean": 0.75494384765625, + "rewards//std": 0.021341485902667046, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.117, + "grad_norm": 0.5332762002944946, + "kl": 0.2162165530025959, + "learning_rate": 4.857793012263555e-06, + "loss": 0.0086, + "num_tokens": 5056808.0, + "reward": 0.75299072265625, + "reward_std": 0.00866914913058281, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.030653268098831177, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1172, + "grad_norm": 0.5869634747505188, + "kl": 0.22080680169165134, + "learning_rate": 4.857265034790155e-06, + "loss": 0.0088, + "num_tokens": 5065400.0, + "reward": 0.7698974609375, + "reward_std": 0.009216565638780594, + "rewards//mean": 0.7698974609375, + "rewards//std": 0.03380203619599342, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1174, + "grad_norm": 0.6322459578514099, + "kl": 0.20717273838818073, + "learning_rate": 4.85673610781068e-06, + "loss": 0.0083, + "num_tokens": 5074064.0, + "reward": 0.78057861328125, + "reward_std": 0.010938020423054695, + "rewards//mean": 0.78057861328125, + "rewards//std": 0.028258977457880974, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1176, + "grad_norm": 0.8341087102890015, + "kl": 0.2445808406919241, + "learning_rate": 4.856206231538184e-06, + "loss": 0.0098, + "num_tokens": 5082720.0, + "reward": 0.74163818359375, + "reward_std": 0.009593227878212929, + "rewards//mean": 0.74163818359375, + "rewards//std": 0.02997513860464096, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1178, + "grad_norm": 0.5464284420013428, + "kl": 0.20626290701329708, + "learning_rate": 4.855675406186099e-06, + "loss": 0.0083, + "num_tokens": 5091344.0, + "reward": 0.73687744140625, + "reward_std": 0.013462206348776817, + "rewards//mean": 0.73687744140625, + "rewards//std": 0.03030860796570778, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.118, + "grad_norm": 0.5974122285842896, + "kl": 0.2238724958151579, + "learning_rate": 4.855143631968242e-06, + "loss": 0.009, + "num_tokens": 5100024.0, + "reward": 0.76165771484375, + "reward_std": 0.010216079652309418, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.022633064538240433, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1182, + "grad_norm": 0.5498731732368469, + "kl": 0.20982307940721512, + "learning_rate": 4.854610909098813e-06, + "loss": 0.0084, + "num_tokens": 5108640.0, + "reward": 0.740234375, + "reward_std": 0.011146800592541695, + "rewards//mean": 0.740234375, + "rewards//std": 0.029065513983368874, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1184, + "grad_norm": 0.5892578363418579, + "kl": 0.22463841922581196, + "learning_rate": 4.854077237792389e-06, + "loss": 0.009, + "num_tokens": 5117224.0, + "reward": 0.74560546875, + "reward_std": 0.010708848014473915, + "rewards//mean": 0.74560546875, + "rewards//std": 0.03500435873866081, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1186, + "grad_norm": 0.5446705222129822, + "kl": 0.19729456305503845, + "learning_rate": 4.853542618263937e-06, + "loss": 0.0079, + "num_tokens": 5125776.0, + "reward": 0.7720947265625, + "reward_std": 0.0137183777987957, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.02645888738334179, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1188, + "grad_norm": 0.5078734159469604, + "kl": 0.20582310110330582, + "learning_rate": 4.8530070507288e-06, + "loss": 0.0082, + "num_tokens": 5134408.0, + "reward": 0.7685546875, + "reward_std": 0.008237513713538647, + "rewards//mean": 0.7685546875, + "rewards//std": 0.02214088849723339, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.119, + "grad_norm": 0.6351189017295837, + "kl": 0.2431877087801695, + "learning_rate": 4.852470535402703e-06, + "loss": 0.0097, + "num_tokens": 5143040.0, + "reward": 0.75970458984375, + "reward_std": 0.009093962609767914, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.021931972354650497, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1192, + "grad_norm": 0.8271527886390686, + "kl": 0.23255185037851334, + "learning_rate": 4.851933072501756e-06, + "loss": 0.0093, + "num_tokens": 5151704.0, + "reward": 0.7572021484375, + "reward_std": 0.011184893548488617, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.02380884252488613, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1194, + "grad_norm": 0.7389485239982605, + "kl": 0.21711469255387783, + "learning_rate": 4.851394662242449e-06, + "loss": 0.0087, + "num_tokens": 5160400.0, + "reward": 0.7572021484375, + "reward_std": 0.012733031064271927, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.027341259643435478, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1196, + "grad_norm": 0.7461937665939331, + "kl": 0.2250336967408657, + "learning_rate": 4.850855304841653e-06, + "loss": 0.009, + "num_tokens": 5169024.0, + "reward": 0.75531005859375, + "reward_std": 0.006518941838294268, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.027677135542035103, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1198, + "grad_norm": 0.737876832485199, + "kl": 0.21269006468355656, + "learning_rate": 4.8503150005166225e-06, + "loss": 0.0085, + "num_tokens": 5177672.0, + "reward": 0.76031494140625, + "reward_std": 0.007318540476262569, + "rewards//mean": 0.76031494140625, + "rewards//std": 0.02775578200817108, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.12, + "grad_norm": 0.6356746554374695, + "kl": 0.22319811396300793, + "learning_rate": 4.849773749484989e-06, + "loss": 0.0089, + "num_tokens": 5186400.0, + "reward": 0.72882080078125, + "reward_std": 0.008081294596195221, + "rewards//mean": 0.72882080078125, + "rewards//std": 0.03312958776950836, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1202, + "grad_norm": 0.5195296406745911, + "kl": 0.20459580793976784, + "learning_rate": 4.849231551964771e-06, + "loss": 0.0082, + "num_tokens": 5195048.0, + "reward": 0.73486328125, + "reward_std": 0.009343027137219906, + "rewards//mean": 0.73486328125, + "rewards//std": 0.02382187359035015, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1204, + "grad_norm": 0.6819605827331543, + "kl": 0.22957494854927063, + "learning_rate": 4.848688408174366e-06, + "loss": 0.0092, + "num_tokens": 5203712.0, + "reward": 0.7725830078125, + "reward_std": 0.008309951052069664, + "rewards//mean": 0.7725830078125, + "rewards//std": 0.02304638922214508, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1206, + "grad_norm": 0.6028555631637573, + "kl": 0.2299717701971531, + "learning_rate": 4.84814431833255e-06, + "loss": 0.0092, + "num_tokens": 5212352.0, + "reward": 0.76513671875, + "reward_std": 0.011560057289898396, + "rewards//mean": 0.76513671875, + "rewards//std": 0.026951193809509277, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1208, + "grad_norm": 0.7233465313911438, + "kl": 0.21515335515141487, + "learning_rate": 4.847599282658483e-06, + "loss": 0.0086, + "num_tokens": 5220912.0, + "reward": 0.7650146484375, + "reward_std": 0.009766367264091969, + "rewards//mean": 0.7650146484375, + "rewards//std": 0.03968513756990433, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.121, + "grad_norm": 0.6206122636795044, + "kl": 0.2212205920368433, + "learning_rate": 4.847053301371706e-06, + "loss": 0.0088, + "num_tokens": 5229616.0, + "reward": 0.75567626953125, + "reward_std": 0.010858502238988876, + "rewards//mean": 0.75567626953125, + "rewards//std": 0.02769080549478531, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1212, + "grad_norm": 0.6429035663604736, + "kl": 0.22523421607911587, + "learning_rate": 4.84650637469214e-06, + "loss": 0.009, + "num_tokens": 5238192.0, + "reward": 0.73785400390625, + "reward_std": 0.010889837518334389, + "rewards//mean": 0.73785400390625, + "rewards//std": 0.030354522168636322, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1214, + "grad_norm": 0.7371296286582947, + "kl": 0.22557489946484566, + "learning_rate": 4.845958502840087e-06, + "loss": 0.009, + "num_tokens": 5246720.0, + "reward": 0.7720947265625, + "reward_std": 0.008845320902764797, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.03540147468447685, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1216, + "grad_norm": 0.8345296382904053, + "kl": 0.2478946428745985, + "learning_rate": 4.8454096860362284e-06, + "loss": 0.0099, + "num_tokens": 5255328.0, + "reward": 0.74542236328125, + "reward_std": 0.008497816510498524, + "rewards//mean": 0.74542236328125, + "rewards//std": 0.02872650697827339, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1218, + "grad_norm": 0.9587209224700928, + "kl": 0.22490216977894306, + "learning_rate": 4.8448599245016306e-06, + "loss": 0.009, + "num_tokens": 5263992.0, + "reward": 0.7431640625, + "reward_std": 0.008303426206111908, + "rewards//mean": 0.7431640625, + "rewards//std": 0.033689215779304504, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.122, + "grad_norm": 0.6855178475379944, + "kl": 0.24397451616823673, + "learning_rate": 4.844309218457735e-06, + "loss": 0.0098, + "num_tokens": 5272632.0, + "reward": 0.74456787109375, + "reward_std": 0.007743033580482006, + "rewards//mean": 0.74456787109375, + "rewards//std": 0.02703300304710865, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1222, + "grad_norm": 0.8738477230072021, + "kl": 0.24380822479724884, + "learning_rate": 4.843757568126366e-06, + "loss": 0.0098, + "num_tokens": 5281264.0, + "reward": 0.76068115234375, + "reward_std": 0.008802594617009163, + "rewards//mean": 0.76068115234375, + "rewards//std": 0.02518342435359955, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1224, + "grad_norm": 1.1533160209655762, + "kl": 0.2847021333873272, + "learning_rate": 4.84320497372973e-06, + "loss": 0.0114, + "num_tokens": 5290096.0, + "reward": 0.7354736328125, + "reward_std": 0.009391937404870987, + "rewards//mean": 0.7354736328125, + "rewards//std": 0.03238774463534355, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1226, + "grad_norm": 0.9461967945098877, + "kl": 0.2901354692876339, + "learning_rate": 4.8426514354904096e-06, + "loss": 0.0116, + "num_tokens": 5298720.0, + "reward": 0.75830078125, + "reward_std": 0.01141531765460968, + "rewards//mean": 0.75830078125, + "rewards//std": 0.02832699380815029, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1228, + "grad_norm": 0.8776152729988098, + "kl": 0.2779980804771185, + "learning_rate": 4.842096953631371e-06, + "loss": 0.0111, + "num_tokens": 5307344.0, + "reward": 0.74365234375, + "reward_std": 0.007914327085018158, + "rewards//mean": 0.74365234375, + "rewards//std": 0.03176122531294823, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.123, + "grad_norm": 0.7875434160232544, + "kl": 0.27193955332040787, + "learning_rate": 4.841541528375961e-06, + "loss": 0.0109, + "num_tokens": 5315880.0, + "reward": 0.7650146484375, + "reward_std": 0.011012593284249306, + "rewards//mean": 0.7650146484375, + "rewards//std": 0.03217579424381256, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1232, + "grad_norm": 0.9433616995811462, + "kl": 0.331559956073761, + "learning_rate": 4.840985159947902e-06, + "loss": 0.0133, + "num_tokens": 5324592.0, + "reward": 0.759521484375, + "reward_std": 0.008395890705287457, + "rewards//mean": 0.759521484375, + "rewards//std": 0.025189509615302086, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1234, + "grad_norm": 1.2481539249420166, + "kl": 0.3108705338090658, + "learning_rate": 4.8404278485713005e-06, + "loss": 0.0124, + "num_tokens": 5333288.0, + "reward": 0.75555419921875, + "reward_std": 0.006895901635289192, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.020772846415638924, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1236, + "grad_norm": 1.051727056503296, + "kl": 0.3007947914302349, + "learning_rate": 4.839869594470642e-06, + "loss": 0.012, + "num_tokens": 5341944.0, + "reward": 0.74566650390625, + "reward_std": 0.009581143036484718, + "rewards//mean": 0.74566650390625, + "rewards//std": 0.02763991802930832, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1238, + "grad_norm": 1.1710658073425293, + "kl": 0.3423069082200527, + "learning_rate": 4.839310397870791e-06, + "loss": 0.0137, + "num_tokens": 5350712.0, + "reward": 0.80609130859375, + "reward_std": 0.0071121640503406525, + "rewards//mean": 0.80609130859375, + "rewards//std": 0.022340906783938408, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.124, + "grad_norm": 0.9958809018135071, + "kl": 0.30707372911274433, + "learning_rate": 4.838750258996992e-06, + "loss": 0.0123, + "num_tokens": 5359400.0, + "reward": 0.7572021484375, + "reward_std": 0.008860534057021141, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.029439911246299744, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1242, + "grad_norm": 1.331772804260254, + "kl": 0.3642958104610443, + "learning_rate": 4.838189178074867e-06, + "loss": 0.0146, + "num_tokens": 5368040.0, + "reward": 0.7486572265625, + "reward_std": 0.005241964012384415, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.03716379404067993, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1244, + "grad_norm": 1.1566888093948364, + "kl": 0.32168351113796234, + "learning_rate": 4.837627155330421e-06, + "loss": 0.0129, + "num_tokens": 5376656.0, + "reward": 0.74420166015625, + "reward_std": 0.010309340432286263, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.035138972103595734, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1246, + "grad_norm": 1.2128782272338867, + "kl": 0.3658299557864666, + "learning_rate": 4.837064190990036e-06, + "loss": 0.0146, + "num_tokens": 5385312.0, + "reward": 0.74615478515625, + "reward_std": 0.005186946596950293, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.023983918130397797, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1248, + "grad_norm": 1.2308634519577026, + "kl": 0.35844672471284866, + "learning_rate": 4.836500285280476e-06, + "loss": 0.0143, + "num_tokens": 5393888.0, + "reward": 0.75299072265625, + "reward_std": 0.0063725742511451244, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.028277721256017685, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.125, + "grad_norm": 1.1482653617858887, + "kl": 0.345749881118536, + "learning_rate": 4.83593543842888e-06, + "loss": 0.0138, + "num_tokens": 5402496.0, + "reward": 0.7623291015625, + "reward_std": 0.00810457207262516, + "rewards//mean": 0.7623291015625, + "rewards//std": 0.03183525428175926, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1252, + "grad_norm": 1.252425193786621, + "kl": 0.4149610288441181, + "learning_rate": 4.835369650662767e-06, + "loss": 0.0166, + "num_tokens": 5411160.0, + "reward": 0.7589111328125, + "reward_std": 0.009770114906132221, + "rewards//mean": 0.7589111328125, + "rewards//std": 0.043982502073049545, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1254, + "grad_norm": 1.0089774131774902, + "kl": 0.3325713388621807, + "learning_rate": 4.83480292221004e-06, + "loss": 0.0133, + "num_tokens": 5419800.0, + "reward": 0.78118896484375, + "reward_std": 0.007964782416820526, + "rewards//mean": 0.78118896484375, + "rewards//std": 0.027581803500652313, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1256, + "grad_norm": 1.1872444152832031, + "kl": 0.3795311227440834, + "learning_rate": 4.834235253298973e-06, + "loss": 0.0152, + "num_tokens": 5428536.0, + "reward": 0.74444580078125, + "reward_std": 0.007206289563328028, + "rewards//mean": 0.74444580078125, + "rewards//std": 0.021763604134321213, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1258, + "grad_norm": 1.326137661933899, + "kl": 0.37480510398745537, + "learning_rate": 4.833666644158227e-06, + "loss": 0.015, + "num_tokens": 5437144.0, + "reward": 0.7896728515625, + "reward_std": 0.00821404904127121, + "rewards//mean": 0.7896728515625, + "rewards//std": 0.026238271966576576, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.126, + "grad_norm": 1.089011788368225, + "kl": 0.30840661004185677, + "learning_rate": 4.833097095016835e-06, + "loss": 0.0123, + "num_tokens": 5445856.0, + "reward": 0.76019287109375, + "reward_std": 0.008738137781620026, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.028870007023215294, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1262, + "grad_norm": 1.430968165397644, + "kl": 0.3591831102967262, + "learning_rate": 4.832526606104213e-06, + "loss": 0.0144, + "num_tokens": 5454512.0, + "reward": 0.732421875, + "reward_std": 0.008345866575837135, + "rewards//mean": 0.732421875, + "rewards//std": 0.032008104026317596, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1264, + "grad_norm": 1.2066965103149414, + "kl": 0.35580387338995934, + "learning_rate": 4.831955177650153e-06, + "loss": 0.0142, + "num_tokens": 5463144.0, + "reward": 0.76519775390625, + "reward_std": 0.007233831100165844, + "rewards//mean": 0.76519775390625, + "rewards//std": 0.02703748270869255, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1266, + "grad_norm": 1.33864426612854, + "kl": 0.3583483546972275, + "learning_rate": 4.831382809884826e-06, + "loss": 0.0143, + "num_tokens": 5471768.0, + "reward": 0.751708984375, + "reward_std": 0.006699239369481802, + "rewards//mean": 0.751708984375, + "rewards//std": 0.026021895930171013, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1268, + "grad_norm": 1.0695289373397827, + "kl": 0.331091046333313, + "learning_rate": 4.830809503038781e-06, + "loss": 0.0132, + "num_tokens": 5480384.0, + "reward": 0.7537841796875, + "reward_std": 0.007534695789217949, + "rewards//mean": 0.7537841796875, + "rewards//std": 0.020481223240494728, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.127, + "grad_norm": 1.05119788646698, + "kl": 0.2932460140436888, + "learning_rate": 4.830235257342948e-06, + "loss": 0.0117, + "num_tokens": 5488984.0, + "reward": 0.73675537109375, + "reward_std": 0.009710824117064476, + "rewards//mean": 0.73675537109375, + "rewards//std": 0.02311091497540474, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1272, + "grad_norm": 0.9468859434127808, + "kl": 0.27140727639198303, + "learning_rate": 4.829660073028631e-06, + "loss": 0.0109, + "num_tokens": 5497624.0, + "reward": 0.81549072265625, + "reward_std": 0.006733256857842207, + "rewards//mean": 0.81549072265625, + "rewards//std": 0.017603833228349686, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1274, + "grad_norm": 1.144340991973877, + "kl": 0.3296862170100212, + "learning_rate": 4.829083950327516e-06, + "loss": 0.0132, + "num_tokens": 5506312.0, + "reward": 0.7391357421875, + "reward_std": 0.007396392058581114, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.02667086571455002, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1276, + "grad_norm": 1.0231379270553589, + "kl": 0.32026514038443565, + "learning_rate": 4.828506889471664e-06, + "loss": 0.0128, + "num_tokens": 5514912.0, + "reward": 0.73388671875, + "reward_std": 0.006786069832742214, + "rewards//mean": 0.73388671875, + "rewards//std": 0.017696566879749298, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1278, + "grad_norm": 1.3025801181793213, + "kl": 0.3653467670083046, + "learning_rate": 4.827928890693515e-06, + "loss": 0.0146, + "num_tokens": 5523536.0, + "reward": 0.782958984375, + "reward_std": 0.0090708639472723, + "rewards//mean": 0.782958984375, + "rewards//std": 0.0243386123329401, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.128, + "grad_norm": 1.0168287754058838, + "kl": 0.3438634071499109, + "learning_rate": 4.8273499542258885e-06, + "loss": 0.0138, + "num_tokens": 5532184.0, + "reward": 0.7303466796875, + "reward_std": 0.007754259742796421, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.027673648670315742, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1282, + "grad_norm": 0.9684326648712158, + "kl": 0.3083877395838499, + "learning_rate": 4.826770080301978e-06, + "loss": 0.0123, + "num_tokens": 5540808.0, + "reward": 0.7808837890625, + "reward_std": 0.010280175134539604, + "rewards//mean": 0.7808837890625, + "rewards//std": 0.029550766572356224, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1284, + "grad_norm": 1.223793864250183, + "kl": 0.34201204031705856, + "learning_rate": 4.826189269155357e-06, + "loss": 0.0137, + "num_tokens": 5549448.0, + "reward": 0.7252197265625, + "reward_std": 0.008540410548448563, + "rewards//mean": 0.7252197265625, + "rewards//std": 0.029250076040625572, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1286, + "grad_norm": 1.2303458452224731, + "kl": 0.36477214470505714, + "learning_rate": 4.825607521019978e-06, + "loss": 0.0146, + "num_tokens": 5558080.0, + "reward": 0.7886962890625, + "reward_std": 0.00828670896589756, + "rewards//mean": 0.7886962890625, + "rewards//std": 0.022837886586785316, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1288, + "grad_norm": 1.127670168876648, + "kl": 0.36717789620161057, + "learning_rate": 4.825024836130166e-06, + "loss": 0.0147, + "num_tokens": 5566712.0, + "reward": 0.776611328125, + "reward_std": 0.005144121125340462, + "rewards//mean": 0.776611328125, + "rewards//std": 0.02671085111796856, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.129, + "grad_norm": 1.114711880683899, + "kl": 0.28322512097656727, + "learning_rate": 4.824441214720629e-06, + "loss": 0.0113, + "num_tokens": 5575304.0, + "reward": 0.7742919921875, + "reward_std": 0.011971713975071907, + "rewards//mean": 0.7742919921875, + "rewards//std": 0.034474264830350876, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1292, + "grad_norm": 1.0570855140686035, + "kl": 0.3079027570784092, + "learning_rate": 4.823856657026448e-06, + "loss": 0.0123, + "num_tokens": 5584080.0, + "reward": 0.75958251953125, + "reward_std": 0.00948810763657093, + "rewards//mean": 0.75958251953125, + "rewards//std": 0.031992435455322266, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1294, + "grad_norm": 1.053823471069336, + "kl": 0.33063437044620514, + "learning_rate": 4.823271163283084e-06, + "loss": 0.0132, + "num_tokens": 5592720.0, + "reward": 0.7550048828125, + "reward_std": 0.006211505271494389, + "rewards//mean": 0.7550048828125, + "rewards//std": 0.02459198608994484, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1296, + "grad_norm": 0.8944997787475586, + "kl": 0.2919867653399706, + "learning_rate": 4.822684733726373e-06, + "loss": 0.0117, + "num_tokens": 5601432.0, + "reward": 0.76715087890625, + "reward_std": 0.00828872062265873, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.014392371289432049, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1298, + "grad_norm": 1.134657382965088, + "kl": 0.2569319698959589, + "learning_rate": 4.822097368592529e-06, + "loss": 0.0103, + "num_tokens": 5610040.0, + "reward": 0.723388671875, + "reward_std": 0.009949292987585068, + "rewards//mean": 0.723388671875, + "rewards//std": 0.02867857925593853, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.13, + "grad_norm": 0.8463056087493896, + "kl": 0.29777566343545914, + "learning_rate": 4.821509068118143e-06, + "loss": 0.0119, + "num_tokens": 5618752.0, + "reward": 0.73681640625, + "reward_std": 0.00745723582804203, + "rewards//mean": 0.73681640625, + "rewards//std": 0.026231637224555016, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1302, + "grad_norm": 0.9844704270362854, + "kl": 0.3054410591721535, + "learning_rate": 4.8209198325401815e-06, + "loss": 0.0122, + "num_tokens": 5627320.0, + "reward": 0.76171875, + "reward_std": 0.007473442703485489, + "rewards//mean": 0.76171875, + "rewards//std": 0.01674727164208889, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1304, + "grad_norm": 1.0094974040985107, + "kl": 0.29557618498802185, + "learning_rate": 4.82032966209599e-06, + "loss": 0.0118, + "num_tokens": 5635952.0, + "reward": 0.77386474609375, + "reward_std": 0.008985331282019615, + "rewards//mean": 0.77386474609375, + "rewards//std": 0.028392048552632332, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1306, + "grad_norm": 0.8305743336677551, + "kl": 0.2749790381640196, + "learning_rate": 4.819738557023287e-06, + "loss": 0.011, + "num_tokens": 5644616.0, + "reward": 0.75286865234375, + "reward_std": 0.009393520653247833, + "rewards//mean": 0.75286865234375, + "rewards//std": 0.024232570081949234, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1308, + "grad_norm": 0.9685484170913696, + "kl": 0.3104813024401665, + "learning_rate": 4.819146517560171e-06, + "loss": 0.0124, + "num_tokens": 5653272.0, + "reward": 0.77801513671875, + "reward_std": 0.006662518717348576, + "rewards//mean": 0.77801513671875, + "rewards//std": 0.019912630319595337, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.131, + "grad_norm": 1.2605221271514893, + "kl": 0.28511145897209644, + "learning_rate": 4.818553543945115e-06, + "loss": 0.0114, + "num_tokens": 5661888.0, + "reward": 0.78240966796875, + "reward_std": 0.007852917537093163, + "rewards//mean": 0.78240966796875, + "rewards//std": 0.01976766251027584, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1312, + "grad_norm": 0.945095956325531, + "kl": 0.2604890577495098, + "learning_rate": 4.817959636416969e-06, + "loss": 0.0104, + "num_tokens": 5670504.0, + "reward": 0.73822021484375, + "reward_std": 0.00944865308701992, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.03974169120192528, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1314, + "grad_norm": 0.7084172368049622, + "kl": 0.24680477194488049, + "learning_rate": 4.8173647952149584e-06, + "loss": 0.0099, + "num_tokens": 5679168.0, + "reward": 0.76702880859375, + "reward_std": 0.007409450598061085, + "rewards//mean": 0.76702880859375, + "rewards//std": 0.026313383132219315, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1316, + "grad_norm": 0.7127532958984375, + "kl": 0.2393797803670168, + "learning_rate": 4.816769020578685e-06, + "loss": 0.0096, + "num_tokens": 5687896.0, + "reward": 0.76580810546875, + "reward_std": 0.009323619306087494, + "rewards//mean": 0.76580810546875, + "rewards//std": 0.025117818266153336, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1318, + "grad_norm": 0.7750257253646851, + "kl": 0.26598621532320976, + "learning_rate": 4.816172312748128e-06, + "loss": 0.0106, + "num_tokens": 5696520.0, + "reward": 0.7744140625, + "reward_std": 0.008821007795631886, + "rewards//mean": 0.7744140625, + "rewards//std": 0.015120835043489933, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.132, + "grad_norm": 0.6631627082824707, + "kl": 0.2553862910717726, + "learning_rate": 4.81557467196364e-06, + "loss": 0.0102, + "num_tokens": 5705160.0, + "reward": 0.7291259765625, + "reward_std": 0.006548763252794743, + "rewards//mean": 0.7291259765625, + "rewards//std": 0.022490572184324265, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1322, + "grad_norm": 0.5979108810424805, + "kl": 0.2634982131421566, + "learning_rate": 4.814976098465951e-06, + "loss": 0.0105, + "num_tokens": 5713848.0, + "reward": 0.7772216796875, + "reward_std": 0.006953542120754719, + "rewards//mean": 0.7772216796875, + "rewards//std": 0.017441095784306526, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1324, + "grad_norm": 0.6861351728439331, + "kl": 0.2535683810710907, + "learning_rate": 4.814376592496167e-06, + "loss": 0.0101, + "num_tokens": 5722440.0, + "reward": 0.7781982421875, + "reward_std": 0.006485221907496452, + "rewards//mean": 0.7781982421875, + "rewards//std": 0.028885535895824432, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1326, + "grad_norm": 0.6136988997459412, + "kl": 0.23714140430092812, + "learning_rate": 4.813776154295767e-06, + "loss": 0.0095, + "num_tokens": 5731056.0, + "reward": 0.77496337890625, + "reward_std": 0.007635117508471012, + "rewards//mean": 0.77496337890625, + "rewards//std": 0.0270240418612957, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1328, + "grad_norm": 0.7283768653869629, + "kl": 0.23214296996593475, + "learning_rate": 4.81317478410661e-06, + "loss": 0.0093, + "num_tokens": 5739704.0, + "reward": 0.74163818359375, + "reward_std": 0.009544800035655499, + "rewards//mean": 0.74163818359375, + "rewards//std": 0.030503764748573303, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.133, + "grad_norm": 1.0039687156677246, + "kl": 0.2293421532958746, + "learning_rate": 4.812572482170926e-06, + "loss": 0.0092, + "num_tokens": 5748272.0, + "reward": 0.76214599609375, + "reward_std": 0.008817870169878006, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.02790156379342079, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1332, + "grad_norm": 0.7138220071792603, + "kl": 0.22582152672111988, + "learning_rate": 4.811969248731323e-06, + "loss": 0.009, + "num_tokens": 5756944.0, + "reward": 0.774169921875, + "reward_std": 0.009145371615886688, + "rewards//mean": 0.774169921875, + "rewards//std": 0.024546701461076736, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1334, + "grad_norm": 0.6133260726928711, + "kl": 0.24057517014443874, + "learning_rate": 4.811365084030784e-06, + "loss": 0.0096, + "num_tokens": 5765464.0, + "reward": 0.7408447265625, + "reward_std": 0.009205160662531853, + "rewards//mean": 0.7408447265625, + "rewards//std": 0.03023541159927845, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1336, + "grad_norm": 0.6326518058776855, + "kl": 0.23078422248363495, + "learning_rate": 4.8107599883126634e-06, + "loss": 0.0092, + "num_tokens": 5774152.0, + "reward": 0.76348876953125, + "reward_std": 0.010718736797571182, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.031055085361003876, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1338, + "grad_norm": 0.7892494201660156, + "kl": 0.2298267874866724, + "learning_rate": 4.810153961820697e-06, + "loss": 0.0092, + "num_tokens": 5782856.0, + "reward": 0.76007080078125, + "reward_std": 0.00991053506731987, + "rewards//mean": 0.76007080078125, + "rewards//std": 0.030913403257727623, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.134, + "grad_norm": 0.6180135011672974, + "kl": 0.21059091202914715, + "learning_rate": 4.809547004798991e-06, + "loss": 0.0084, + "num_tokens": 5791512.0, + "reward": 0.7464599609375, + "reward_std": 0.011077282950282097, + "rewards//mean": 0.7464599609375, + "rewards//std": 0.03385930880904198, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1342, + "grad_norm": 0.7543632388114929, + "kl": 0.2657131403684616, + "learning_rate": 4.808939117492028e-06, + "loss": 0.0106, + "num_tokens": 5800024.0, + "reward": 0.73602294921875, + "reward_std": 0.009333530440926552, + "rewards//mean": 0.73602294921875, + "rewards//std": 0.026308204978704453, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1344, + "grad_norm": 0.7919978499412537, + "kl": 0.2661147303879261, + "learning_rate": 4.808330300144664e-06, + "loss": 0.0106, + "num_tokens": 5808632.0, + "reward": 0.75872802734375, + "reward_std": 0.009824881330132484, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.02316456288099289, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1346, + "grad_norm": 0.7620828747749329, + "kl": 0.25611876510083675, + "learning_rate": 4.807720553002132e-06, + "loss": 0.0102, + "num_tokens": 5817208.0, + "reward": 0.7601318359375, + "reward_std": 0.009487172588706017, + "rewards//mean": 0.7601318359375, + "rewards//std": 0.029021471738815308, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1348, + "grad_norm": 0.9600464105606079, + "kl": 0.24580915831029415, + "learning_rate": 4.807109876310037e-06, + "loss": 0.0098, + "num_tokens": 5826000.0, + "reward": 0.7584228515625, + "reward_std": 0.008735930547118187, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.027020471170544624, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.135, + "grad_norm": 0.7932077050209045, + "kl": 0.251651618629694, + "learning_rate": 4.806498270314359e-06, + "loss": 0.0101, + "num_tokens": 5834584.0, + "reward": 0.76025390625, + "reward_std": 0.007064810022711754, + "rewards//mean": 0.76025390625, + "rewards//std": 0.02699608914554119, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1352, + "grad_norm": 0.8365455269813538, + "kl": 0.2700675167143345, + "learning_rate": 4.805885735261454e-06, + "loss": 0.0108, + "num_tokens": 5843224.0, + "reward": 0.7371826171875, + "reward_std": 0.010191906243562698, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.023923013359308243, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1354, + "grad_norm": 0.7729324698448181, + "kl": 0.26824005506932735, + "learning_rate": 4.805272271398051e-06, + "loss": 0.0107, + "num_tokens": 5851800.0, + "reward": 0.70831298828125, + "reward_std": 0.008622756227850914, + "rewards//mean": 0.70831298828125, + "rewards//std": 0.02663075178861618, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1356, + "grad_norm": 0.9402274489402771, + "kl": 0.2813422940671444, + "learning_rate": 4.804657878971252e-06, + "loss": 0.0113, + "num_tokens": 5860408.0, + "reward": 0.74505615234375, + "reward_std": 0.007954198867082596, + "rewards//mean": 0.74505615234375, + "rewards//std": 0.023315679281949997, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1358, + "grad_norm": 0.7352375984191895, + "kl": 0.26763404347002506, + "learning_rate": 4.804042558228535e-06, + "loss": 0.0107, + "num_tokens": 5869048.0, + "reward": 0.7579345703125, + "reward_std": 0.006437701638787985, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.029943620786070824, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.136, + "grad_norm": 0.8864990472793579, + "kl": 0.3166388310492039, + "learning_rate": 4.803426309417752e-06, + "loss": 0.0127, + "num_tokens": 5877704.0, + "reward": 0.75897216796875, + "reward_std": 0.007796045392751694, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.0265179630368948, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1362, + "grad_norm": 0.8726670742034912, + "kl": 0.28164214082062244, + "learning_rate": 4.802809132787125e-06, + "loss": 0.0113, + "num_tokens": 5886232.0, + "reward": 0.74322509765625, + "reward_std": 0.007162688300013542, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.024352213367819786, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1364, + "grad_norm": 1.2566941976547241, + "kl": 0.3058667443692684, + "learning_rate": 4.802191028585257e-06, + "loss": 0.0122, + "num_tokens": 5894872.0, + "reward": 0.736328125, + "reward_std": 0.009593142196536064, + "rewards//mean": 0.736328125, + "rewards//std": 0.04730703681707382, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1366, + "grad_norm": 0.901271641254425, + "kl": 0.3303819112479687, + "learning_rate": 4.801571997061117e-06, + "loss": 0.0132, + "num_tokens": 5903480.0, + "reward": 0.7298583984375, + "reward_std": 0.008278006687760353, + "rewards//mean": 0.7298583984375, + "rewards//std": 0.03159085661172867, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1368, + "grad_norm": 0.846440851688385, + "kl": 0.2713386472314596, + "learning_rate": 4.800952038464051e-06, + "loss": 0.0109, + "num_tokens": 5912088.0, + "reward": 0.753173828125, + "reward_std": 0.008396659046411514, + "rewards//mean": 0.753173828125, + "rewards//std": 0.02986997365951538, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.137, + "grad_norm": 0.99683678150177, + "kl": 0.29254608042538166, + "learning_rate": 4.800331153043781e-06, + "loss": 0.0117, + "num_tokens": 5920760.0, + "reward": 0.7811279296875, + "reward_std": 0.006906645372509956, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.02221698872745037, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1372, + "grad_norm": 0.9804807305335999, + "kl": 0.32390232756733894, + "learning_rate": 4.799709341050397e-06, + "loss": 0.013, + "num_tokens": 5929320.0, + "reward": 0.75372314453125, + "reward_std": 0.009741474874317646, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.022919517010450363, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1374, + "grad_norm": 1.048980712890625, + "kl": 0.3197479471564293, + "learning_rate": 4.799086602734364e-06, + "loss": 0.0128, + "num_tokens": 5937912.0, + "reward": 0.732177734375, + "reward_std": 0.011059543117880821, + "rewards//mean": 0.732177734375, + "rewards//std": 0.03457533195614815, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1376, + "grad_norm": 0.9630193114280701, + "kl": 0.33098217844963074, + "learning_rate": 4.798462938346524e-06, + "loss": 0.0132, + "num_tokens": 5946624.0, + "reward": 0.74591064453125, + "reward_std": 0.008718679659068584, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.0328625813126564, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1378, + "grad_norm": 1.1587707996368408, + "kl": 0.29454705864191055, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.0118, + "num_tokens": 5955192.0, + "reward": 0.7099609375, + "reward_std": 0.007809010334312916, + "rewards//mean": 0.7099609375, + "rewards//std": 0.034053899347782135, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.138, + "grad_norm": 0.9946979284286499, + "kl": 0.32380471006035805, + "learning_rate": 4.797212832360637e-06, + "loss": 0.013, + "num_tokens": 5963760.0, + "reward": 0.767822265625, + "reward_std": 0.00826583243906498, + "rewards//mean": 0.767822265625, + "rewards//std": 0.02232472226023674, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1382, + "grad_norm": 1.1503069400787354, + "kl": 0.3658827282488346, + "learning_rate": 4.796586391266135e-06, + "loss": 0.0146, + "num_tokens": 5972352.0, + "reward": 0.750732421875, + "reward_std": 0.0072412192821502686, + "rewards//mean": 0.750732421875, + "rewards//std": 0.031179197132587433, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1384, + "grad_norm": 1.0804966688156128, + "kl": 0.3435588367283344, + "learning_rate": 4.795959025106907e-06, + "loss": 0.0137, + "num_tokens": 5980920.0, + "reward": 0.75189208984375, + "reward_std": 0.009941812604665756, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.02548695169389248, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1386, + "grad_norm": 1.247780442237854, + "kl": 0.3512617312371731, + "learning_rate": 4.7953307341356595e-06, + "loss": 0.0141, + "num_tokens": 5989576.0, + "reward": 0.765869140625, + "reward_std": 0.007969655096530914, + "rewards//mean": 0.765869140625, + "rewards//std": 0.028762908652424812, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1388, + "grad_norm": 1.2001007795333862, + "kl": 0.36641981452703476, + "learning_rate": 4.794701518605467e-06, + "loss": 0.0147, + "num_tokens": 5998160.0, + "reward": 0.763427734375, + "reward_std": 0.0078741405159235, + "rewards//mean": 0.763427734375, + "rewards//std": 0.024566426873207092, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.139, + "grad_norm": 1.1044167280197144, + "kl": 0.33926333859562874, + "learning_rate": 4.794071378769776e-06, + "loss": 0.0136, + "num_tokens": 6006776.0, + "reward": 0.76068115234375, + "reward_std": 0.01240166462957859, + "rewards//mean": 0.76068115234375, + "rewards//std": 0.03412666544318199, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1392, + "grad_norm": 1.0884336233139038, + "kl": 0.3415757305920124, + "learning_rate": 4.7934403148824085e-06, + "loss": 0.0137, + "num_tokens": 6015376.0, + "reward": 0.7767333984375, + "reward_std": 0.00934339314699173, + "rewards//mean": 0.7767333984375, + "rewards//std": 0.023022731766104698, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1394, + "grad_norm": 1.1337672472000122, + "kl": 0.3384525291621685, + "learning_rate": 4.792808327197556e-06, + "loss": 0.0135, + "num_tokens": 6023976.0, + "reward": 0.75604248046875, + "reward_std": 0.011509222909808159, + "rewards//mean": 0.75604248046875, + "rewards//std": 0.03885067626833916, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1396, + "grad_norm": 1.2506954669952393, + "kl": 0.3618217594921589, + "learning_rate": 4.792175415969786e-06, + "loss": 0.0145, + "num_tokens": 6032584.0, + "reward": 0.78179931640625, + "reward_std": 0.007064157165586948, + "rewards//mean": 0.78179931640625, + "rewards//std": 0.023384397849440575, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1398, + "grad_norm": 1.190083384513855, + "kl": 0.3704114742577076, + "learning_rate": 4.79154158145403e-06, + "loss": 0.0148, + "num_tokens": 6041200.0, + "reward": 0.7354736328125, + "reward_std": 0.007111767306923866, + "rewards//mean": 0.7354736328125, + "rewards//std": 0.03445318341255188, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.14, + "grad_norm": 1.0952671766281128, + "kl": 0.3880801610648632, + "learning_rate": 4.790906823905599e-06, + "loss": 0.0155, + "num_tokens": 6049776.0, + "reward": 0.75994873046875, + "reward_std": 0.006981437094509602, + "rewards//mean": 0.75994873046875, + "rewards//std": 0.02911488339304924, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1402, + "grad_norm": 0.9023262858390808, + "kl": 0.3351786471903324, + "learning_rate": 4.790271143580174e-06, + "loss": 0.0134, + "num_tokens": 6058352.0, + "reward": 0.76885986328125, + "reward_std": 0.008710931986570358, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.03886703774333, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1404, + "grad_norm": 1.2910362482070923, + "kl": 0.3798423409461975, + "learning_rate": 4.789634540733807e-06, + "loss": 0.0152, + "num_tokens": 6066912.0, + "reward": 0.748046875, + "reward_std": 0.006378927733749151, + "rewards//mean": 0.748046875, + "rewards//std": 0.029968129470944405, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1406, + "grad_norm": 1.287479043006897, + "kl": 0.37538506276905537, + "learning_rate": 4.78899701562292e-06, + "loss": 0.015, + "num_tokens": 6075536.0, + "reward": 0.75531005859375, + "reward_std": 0.010323834605515003, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.029662420973181725, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1408, + "grad_norm": 1.0325592756271362, + "kl": 0.35265588015317917, + "learning_rate": 4.788358568504308e-06, + "loss": 0.0141, + "num_tokens": 6084152.0, + "reward": 0.73480224609375, + "reward_std": 0.007402766961604357, + "rewards//mean": 0.73480224609375, + "rewards//std": 0.034619029611349106, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.141, + "grad_norm": 1.6814100742340088, + "kl": 0.38387027755379677, + "learning_rate": 4.78771919963514e-06, + "loss": 0.0154, + "num_tokens": 6092728.0, + "reward": 0.79052734375, + "reward_std": 0.006797960493713617, + "rewards//mean": 0.79052734375, + "rewards//std": 0.020333237946033478, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1412, + "grad_norm": 1.1181515455245972, + "kl": 0.3311033882200718, + "learning_rate": 4.787078909272951e-06, + "loss": 0.0132, + "num_tokens": 6101480.0, + "reward": 0.73321533203125, + "reward_std": 0.012039251625537872, + "rewards//mean": 0.73321533203125, + "rewards//std": 0.04149588197469711, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1414, + "grad_norm": 1.1418641805648804, + "kl": 0.2598997447639704, + "learning_rate": 4.786437697675651e-06, + "loss": 0.0104, + "num_tokens": 6110208.0, + "reward": 0.73748779296875, + "reward_std": 0.01649647206068039, + "rewards//mean": 0.73748779296875, + "rewards//std": 0.04361593350768089, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1416, + "grad_norm": 1.1949273347854614, + "kl": 0.39064906910061836, + "learning_rate": 4.78579556510152e-06, + "loss": 0.0156, + "num_tokens": 6118832.0, + "reward": 0.74700927734375, + "reward_std": 0.009526428766548634, + "rewards//mean": 0.74700927734375, + "rewards//std": 0.02997412718832493, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1418, + "grad_norm": 1.232944369316101, + "kl": 0.3855067417025566, + "learning_rate": 4.785152511809208e-06, + "loss": 0.0154, + "num_tokens": 6127480.0, + "reward": 0.751220703125, + "reward_std": 0.0074347625486552715, + "rewards//mean": 0.751220703125, + "rewards//std": 0.0333704799413681, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.142, + "grad_norm": 0.8768677711486816, + "kl": 0.3823428861796856, + "learning_rate": 4.784508538057738e-06, + "loss": 0.0153, + "num_tokens": 6136016.0, + "reward": 0.734375, + "reward_std": 0.009139763191342354, + "rewards//mean": 0.734375, + "rewards//std": 0.029879095032811165, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1422, + "grad_norm": 1.023090124130249, + "kl": 0.3618377633392811, + "learning_rate": 4.783863644106502e-06, + "loss": 0.0145, + "num_tokens": 6144688.0, + "reward": 0.75811767578125, + "reward_std": 0.007518475875258446, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.02010328881442547, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1424, + "grad_norm": 0.9730324149131775, + "kl": 0.36088190227746964, + "learning_rate": 4.783217830215264e-06, + "loss": 0.0144, + "num_tokens": 6153288.0, + "reward": 0.7520751953125, + "reward_std": 0.0069280690513551235, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.03221152722835541, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1426, + "grad_norm": 0.777560830116272, + "kl": 0.3026167619973421, + "learning_rate": 4.782571096644157e-06, + "loss": 0.0121, + "num_tokens": 6161936.0, + "reward": 0.75244140625, + "reward_std": 0.012461979873478413, + "rewards//mean": 0.75244140625, + "rewards//std": 0.04013592377305031, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1428, + "grad_norm": 0.7599678635597229, + "kl": 0.27103759720921516, + "learning_rate": 4.7819234436536845e-06, + "loss": 0.0108, + "num_tokens": 6170696.0, + "reward": 0.77105712890625, + "reward_std": 0.009724288247525692, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.028811221942305565, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.143, + "grad_norm": 1.2590875625610352, + "kl": 0.3248850591480732, + "learning_rate": 4.781274871504722e-06, + "loss": 0.013, + "num_tokens": 6179384.0, + "reward": 0.7764892578125, + "reward_std": 0.005956442095339298, + "rewards//mean": 0.7764892578125, + "rewards//std": 0.02729027532041073, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1432, + "grad_norm": 0.8374090194702148, + "kl": 0.3235581796616316, + "learning_rate": 4.780625380458513e-06, + "loss": 0.0129, + "num_tokens": 6187896.0, + "reward": 0.75469970703125, + "reward_std": 0.008836697787046432, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.03515404462814331, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1434, + "grad_norm": 0.9022362232208252, + "kl": 0.3358203247189522, + "learning_rate": 4.7799749707766754e-06, + "loss": 0.0134, + "num_tokens": 6196536.0, + "reward": 0.73065185546875, + "reward_std": 0.006113333627581596, + "rewards//mean": 0.73065185546875, + "rewards//std": 0.025127459317445755, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1436, + "grad_norm": 1.1146235466003418, + "kl": 0.3147228341549635, + "learning_rate": 4.779323642721191e-06, + "loss": 0.0126, + "num_tokens": 6205168.0, + "reward": 0.7520751953125, + "reward_std": 0.007163808681070805, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.019917665049433708, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1438, + "grad_norm": 0.9956210851669312, + "kl": 0.3257326614111662, + "learning_rate": 4.778671396554417e-06, + "loss": 0.013, + "num_tokens": 6213784.0, + "reward": 0.76202392578125, + "reward_std": 0.009784232825040817, + "rewards//mean": 0.76202392578125, + "rewards//std": 0.027377430349588394, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.144, + "grad_norm": 1.103261947631836, + "kl": 0.31043668277561665, + "learning_rate": 4.778018232539075e-06, + "loss": 0.0124, + "num_tokens": 6222432.0, + "reward": 0.75640869140625, + "reward_std": 0.010043056681752205, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.026082253083586693, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1442, + "grad_norm": 1.2115051746368408, + "kl": 0.30454549565911293, + "learning_rate": 4.777364150938263e-06, + "loss": 0.0122, + "num_tokens": 6231232.0, + "reward": 0.755859375, + "reward_std": 0.01059969887137413, + "rewards//mean": 0.755859375, + "rewards//std": 0.034273672848939896, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1444, + "grad_norm": 0.8916050791740417, + "kl": 0.2920965179800987, + "learning_rate": 4.776709152015443e-06, + "loss": 0.0117, + "num_tokens": 6239984.0, + "reward": 0.74688720703125, + "reward_std": 0.007067927625030279, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.025057479739189148, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1446, + "grad_norm": 0.7612753510475159, + "kl": 0.29840802773833275, + "learning_rate": 4.776053236034449e-06, + "loss": 0.0119, + "num_tokens": 6248576.0, + "reward": 0.7445068359375, + "reward_std": 0.00902190525084734, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.02381392940878868, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1448, + "grad_norm": 1.3189657926559448, + "kl": 0.3207559399306774, + "learning_rate": 4.775396403259483e-06, + "loss": 0.0128, + "num_tokens": 6257128.0, + "reward": 0.7669677734375, + "reward_std": 0.01015978679060936, + "rewards//mean": 0.7669677734375, + "rewards//std": 0.026062294840812683, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.145, + "grad_norm": 0.9157844185829163, + "kl": 0.30339373275637627, + "learning_rate": 4.774738653955119e-06, + "loss": 0.0121, + "num_tokens": 6265768.0, + "reward": 0.7802734375, + "reward_std": 0.009250717237591743, + "rewards//mean": 0.7802734375, + "rewards//std": 0.026991603896021843, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1452, + "grad_norm": 1.1949882507324219, + "kl": 0.33243946731090546, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.0133, + "num_tokens": 6274448.0, + "reward": 0.7586669921875, + "reward_std": 0.01018530037254095, + "rewards//mean": 0.7586669921875, + "rewards//std": 0.020966242998838425, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1454, + "grad_norm": 0.8508477807044983, + "kl": 0.33175548166036606, + "learning_rate": 4.773420406818327e-06, + "loss": 0.0133, + "num_tokens": 6283040.0, + "reward": 0.75091552734375, + "reward_std": 0.007159090135246515, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.02595542185008526, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1456, + "grad_norm": 0.8403224349021912, + "kl": 0.30662048049271107, + "learning_rate": 4.772759909516889e-06, + "loss": 0.0123, + "num_tokens": 6291704.0, + "reward": 0.7113037109375, + "reward_std": 0.011537307873368263, + "rewards//mean": 0.7113037109375, + "rewards//std": 0.0381256639957428, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1458, + "grad_norm": 1.1425209045410156, + "kl": 0.31275169365108013, + "learning_rate": 4.772098496748031e-06, + "loss": 0.0125, + "num_tokens": 6300344.0, + "reward": 0.75421142578125, + "reward_std": 0.00868441816419363, + "rewards//mean": 0.75421142578125, + "rewards//std": 0.03845828026533127, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.146, + "grad_norm": 0.7371503710746765, + "kl": 0.2826213352382183, + "learning_rate": 4.7714361687781705e-06, + "loss": 0.0113, + "num_tokens": 6308912.0, + "reward": 0.75616455078125, + "reward_std": 0.01087634265422821, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.02745252661406994, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1462, + "grad_norm": 1.2550262212753296, + "kl": 0.29363943450152874, + "learning_rate": 4.770772925874093e-06, + "loss": 0.0117, + "num_tokens": 6317680.0, + "reward": 0.7362060546875, + "reward_std": 0.012717410922050476, + "rewards//mean": 0.7362060546875, + "rewards//std": 0.031156614422798157, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1464, + "grad_norm": 1.0804203748703003, + "kl": 0.347552876919508, + "learning_rate": 4.770108768302953e-06, + "loss": 0.0139, + "num_tokens": 6326368.0, + "reward": 0.79925537109375, + "reward_std": 0.00879729725420475, + "rewards//mean": 0.79925537109375, + "rewards//std": 0.021716952323913574, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1466, + "grad_norm": 0.8175789713859558, + "kl": 0.2651336081326008, + "learning_rate": 4.769443696332272e-06, + "loss": 0.0106, + "num_tokens": 6335016.0, + "reward": 0.75885009765625, + "reward_std": 0.010360082611441612, + "rewards//mean": 0.75885009765625, + "rewards//std": 0.0343741700053215, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1468, + "grad_norm": 0.9594478011131287, + "kl": 0.32585857063531876, + "learning_rate": 4.768777710229941e-06, + "loss": 0.013, + "num_tokens": 6343744.0, + "reward": 0.76837158203125, + "reward_std": 0.006379896309226751, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.02404632233083248, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.147, + "grad_norm": 0.9124557971954346, + "kl": 0.3457811623811722, + "learning_rate": 4.768110810264221e-06, + "loss": 0.0138, + "num_tokens": 6352456.0, + "reward": 0.77996826171875, + "reward_std": 0.010944314301013947, + "rewards//mean": 0.77996826171875, + "rewards//std": 0.025017576292157173, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1472, + "grad_norm": 1.0064573287963867, + "kl": 0.32626586966216564, + "learning_rate": 4.767442996703737e-06, + "loss": 0.0131, + "num_tokens": 6361208.0, + "reward": 0.76898193359375, + "reward_std": 0.009524751454591751, + "rewards//mean": 0.76898193359375, + "rewards//std": 0.016290800645947456, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1474, + "grad_norm": 0.8128183484077454, + "kl": 0.3099711798131466, + "learning_rate": 4.7667742698174855e-06, + "loss": 0.0124, + "num_tokens": 6369800.0, + "reward": 0.759765625, + "reward_std": 0.00793082732707262, + "rewards//mean": 0.759765625, + "rewards//std": 0.02536318637430668, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1476, + "grad_norm": 0.9029561281204224, + "kl": 0.34095400758087635, + "learning_rate": 4.766104629874829e-06, + "loss": 0.0136, + "num_tokens": 6378480.0, + "reward": 0.7481689453125, + "reward_std": 0.008078465238213539, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.03140438720583916, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1478, + "grad_norm": 1.104424238204956, + "kl": 0.3173263669013977, + "learning_rate": 4.765434077145499e-06, + "loss": 0.0127, + "num_tokens": 6387056.0, + "reward": 0.74407958984375, + "reward_std": 0.01128837838768959, + "rewards//mean": 0.74407958984375, + "rewards//std": 0.04236414283514023, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.148, + "grad_norm": 1.0675055980682373, + "kl": 0.3194386400282383, + "learning_rate": 4.764762611899593e-06, + "loss": 0.0128, + "num_tokens": 6395720.0, + "reward": 0.779296875, + "reward_std": 0.00692769093438983, + "rewards//mean": 0.779296875, + "rewards//std": 0.02041051536798477, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1482, + "grad_norm": 1.5919986963272095, + "kl": 0.3498740680515766, + "learning_rate": 4.764090234407578e-06, + "loss": 0.014, + "num_tokens": 6404408.0, + "reward": 0.7606201171875, + "reward_std": 0.007645751349627972, + "rewards//mean": 0.7606201171875, + "rewards//std": 0.017224503681063652, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1484, + "grad_norm": 1.439361810684204, + "kl": 0.33540985360741615, + "learning_rate": 4.763416944940287e-06, + "loss": 0.0134, + "num_tokens": 6413088.0, + "reward": 0.7662353515625, + "reward_std": 0.011973317712545395, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.02613653428852558, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1486, + "grad_norm": 0.9124048352241516, + "kl": 0.3327603731304407, + "learning_rate": 4.762742743768921e-06, + "loss": 0.0133, + "num_tokens": 6421912.0, + "reward": 0.782470703125, + "reward_std": 0.00991036370396614, + "rewards//mean": 0.782470703125, + "rewards//std": 0.026327257975935936, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1488, + "grad_norm": 1.1488804817199707, + "kl": 0.3750241808593273, + "learning_rate": 4.762067631165049e-06, + "loss": 0.015, + "num_tokens": 6430552.0, + "reward": 0.78814697265625, + "reward_std": 0.01083114929497242, + "rewards//mean": 0.78814697265625, + "rewards//std": 0.023038752377033234, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.149, + "grad_norm": 1.0288392305374146, + "kl": 0.4040727950632572, + "learning_rate": 4.761391607400606e-06, + "loss": 0.0162, + "num_tokens": 6439144.0, + "reward": 0.774658203125, + "reward_std": 0.008166758343577385, + "rewards//mean": 0.774658203125, + "rewards//std": 0.01806734874844551, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1492, + "grad_norm": 1.1654855012893677, + "kl": 0.3817331902682781, + "learning_rate": 4.7607146727478935e-06, + "loss": 0.0153, + "num_tokens": 6447728.0, + "reward": 0.7738037109375, + "reward_std": 0.00860125944018364, + "rewards//mean": 0.7738037109375, + "rewards//std": 0.018528563901782036, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1494, + "grad_norm": 1.4023302793502808, + "kl": 0.3670612499117851, + "learning_rate": 4.760036827479582e-06, + "loss": 0.0147, + "num_tokens": 6456312.0, + "reward": 0.76483154296875, + "reward_std": 0.007299549877643585, + "rewards//mean": 0.76483154296875, + "rewards//std": 0.028815951198339462, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1496, + "grad_norm": 0.9892433285713196, + "kl": 0.3555135168135166, + "learning_rate": 4.759358071868705e-06, + "loss": 0.0142, + "num_tokens": 6464936.0, + "reward": 0.78558349609375, + "reward_std": 0.007815414108335972, + "rewards//mean": 0.78558349609375, + "rewards//std": 0.016691023483872414, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1498, + "grad_norm": 0.9962619543075562, + "kl": 0.33968275785446167, + "learning_rate": 4.758678406188668e-06, + "loss": 0.0136, + "num_tokens": 6473664.0, + "reward": 0.73126220703125, + "reward_std": 0.007433678954839706, + "rewards//mean": 0.73126220703125, + "rewards//std": 0.032445359975099564, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.15, + "grad_norm": 1.2323095798492432, + "kl": 0.3530998080968857, + "learning_rate": 4.757997830713239e-06, + "loss": 0.0141, + "num_tokens": 6482200.0, + "reward": 0.74542236328125, + "reward_std": 0.01127916481345892, + "rewards//mean": 0.74542236328125, + "rewards//std": 0.019168945029377937, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1502, + "grad_norm": 1.0545332431793213, + "kl": 0.34781191498041153, + "learning_rate": 4.757316345716554e-06, + "loss": 0.0139, + "num_tokens": 6490712.0, + "reward": 0.74566650390625, + "reward_std": 0.009755873121321201, + "rewards//mean": 0.74566650390625, + "rewards//std": 0.03269031271338463, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1504, + "grad_norm": 0.9189953804016113, + "kl": 0.3220175448805094, + "learning_rate": 4.756633951473114e-06, + "loss": 0.0129, + "num_tokens": 6499392.0, + "reward": 0.7847900390625, + "reward_std": 0.008869939483702183, + "rewards//mean": 0.7847900390625, + "rewards//std": 0.021482614800333977, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1506, + "grad_norm": 1.1692512035369873, + "kl": 0.33156827092170715, + "learning_rate": 4.755950648257789e-06, + "loss": 0.0133, + "num_tokens": 6508160.0, + "reward": 0.77008056640625, + "reward_std": 0.011027710512280464, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.02765524946153164, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1508, + "grad_norm": 1.144055724143982, + "kl": 0.3382889721542597, + "learning_rate": 4.755266436345812e-06, + "loss": 0.0135, + "num_tokens": 6516832.0, + "reward": 0.72998046875, + "reward_std": 0.01025819219648838, + "rewards//mean": 0.72998046875, + "rewards//std": 0.029343342408537865, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.151, + "grad_norm": 1.0410165786743164, + "kl": 0.3598978705704212, + "learning_rate": 4.754581316012785e-06, + "loss": 0.0144, + "num_tokens": 6525520.0, + "reward": 0.7647705078125, + "reward_std": 0.007099618669599295, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.02260068617761135, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1512, + "grad_norm": 0.9105063080787659, + "kl": 0.30748361349105835, + "learning_rate": 4.753895287534673e-06, + "loss": 0.0123, + "num_tokens": 6534128.0, + "reward": 0.7608642578125, + "reward_std": 0.011949615553021431, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.036653582006692886, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1514, + "grad_norm": 0.8369845747947693, + "kl": 0.3764992021024227, + "learning_rate": 4.753208351187809e-06, + "loss": 0.0151, + "num_tokens": 6542736.0, + "reward": 0.7330322265625, + "reward_std": 0.0075618005357682705, + "rewards//mean": 0.7330322265625, + "rewards//std": 0.03163682669401169, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1516, + "grad_norm": 0.9112576842308044, + "kl": 0.340505413711071, + "learning_rate": 4.75252050724889e-06, + "loss": 0.0136, + "num_tokens": 6551352.0, + "reward": 0.75885009765625, + "reward_std": 0.008580232039093971, + "rewards//mean": 0.75885009765625, + "rewards//std": 0.024510828778147697, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1518, + "grad_norm": 1.1520575284957886, + "kl": 0.3449602983891964, + "learning_rate": 4.751831755994981e-06, + "loss": 0.0138, + "num_tokens": 6560040.0, + "reward": 0.7685546875, + "reward_std": 0.0092132817953825, + "rewards//mean": 0.7685546875, + "rewards//std": 0.02560080774128437, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.152, + "grad_norm": 1.0891118049621582, + "kl": 0.3134535998106003, + "learning_rate": 4.75114209770351e-06, + "loss": 0.0125, + "num_tokens": 6568704.0, + "reward": 0.74053955078125, + "reward_std": 0.009147681295871735, + "rewards//mean": 0.74053955078125, + "rewards//std": 0.028252549469470978, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1522, + "grad_norm": 0.9451463222503662, + "kl": 0.35255878791213036, + "learning_rate": 4.75045153265227e-06, + "loss": 0.0141, + "num_tokens": 6577336.0, + "reward": 0.7484130859375, + "reward_std": 0.0063913362100720406, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.02723030373454094, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1524, + "grad_norm": 0.8575259447097778, + "kl": 0.3487287014722824, + "learning_rate": 4.749760061119423e-06, + "loss": 0.0139, + "num_tokens": 6585944.0, + "reward": 0.75885009765625, + "reward_std": 0.007511901669204235, + "rewards//mean": 0.75885009765625, + "rewards//std": 0.031213102862238884, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1526, + "grad_norm": 0.9670186042785645, + "kl": 0.31115617975592613, + "learning_rate": 4.749067683383491e-06, + "loss": 0.0124, + "num_tokens": 6594568.0, + "reward": 0.75286865234375, + "reward_std": 0.009954852983355522, + "rewards//mean": 0.75286865234375, + "rewards//std": 0.027128031477332115, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1528, + "grad_norm": 1.3793588876724243, + "kl": 0.3482302203774452, + "learning_rate": 4.748374399723366e-06, + "loss": 0.0139, + "num_tokens": 6603064.0, + "reward": 0.74713134765625, + "reward_std": 0.011073645204305649, + "rewards//mean": 0.74713134765625, + "rewards//std": 0.028673233464360237, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.153, + "grad_norm": 1.212506651878357, + "kl": 0.35477422177791595, + "learning_rate": 4.747680210418302e-06, + "loss": 0.0142, + "num_tokens": 6611760.0, + "reward": 0.76275634765625, + "reward_std": 0.008897896856069565, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.024282492697238922, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1532, + "grad_norm": 1.8687018156051636, + "kl": 0.33726025745272636, + "learning_rate": 4.746985115747918e-06, + "loss": 0.0135, + "num_tokens": 6620544.0, + "reward": 0.788818359375, + "reward_std": 0.010754291899502277, + "rewards//mean": 0.788818359375, + "rewards//std": 0.027611492201685905, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1534, + "grad_norm": 1.1897015571594238, + "kl": 0.35634366050362587, + "learning_rate": 4.746289115992198e-06, + "loss": 0.0143, + "num_tokens": 6629192.0, + "reward": 0.75067138671875, + "reward_std": 0.006821729242801666, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.023944122716784477, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1536, + "grad_norm": 1.6127054691314697, + "kl": 0.3192887920886278, + "learning_rate": 4.74559221143149e-06, + "loss": 0.0128, + "num_tokens": 6637832.0, + "reward": 0.7333984375, + "reward_std": 0.00948795210570097, + "rewards//mean": 0.7333984375, + "rewards//std": 0.027524733915925026, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1538, + "grad_norm": 0.9704777598381042, + "kl": 0.3409872241318226, + "learning_rate": 4.744894402346508e-06, + "loss": 0.0136, + "num_tokens": 6646552.0, + "reward": 0.785400390625, + "reward_std": 0.01048012264072895, + "rewards//mean": 0.785400390625, + "rewards//std": 0.030144404619932175, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.154, + "grad_norm": 1.119908332824707, + "kl": 0.36791034415364265, + "learning_rate": 4.744195689018331e-06, + "loss": 0.0147, + "num_tokens": 6655176.0, + "reward": 0.7825927734375, + "reward_std": 0.0098853949457407, + "rewards//mean": 0.7825927734375, + "rewards//std": 0.024990104138851166, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1542, + "grad_norm": 1.1021844148635864, + "kl": 0.36526960879564285, + "learning_rate": 4.743496071728396e-06, + "loss": 0.0146, + "num_tokens": 6663840.0, + "reward": 0.7615966796875, + "reward_std": 0.005251667462289333, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.029023557901382446, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1544, + "grad_norm": 1.5725772380828857, + "kl": 0.38798758387565613, + "learning_rate": 4.742795550758514e-06, + "loss": 0.0155, + "num_tokens": 6672440.0, + "reward": 0.7513427734375, + "reward_std": 0.005276157986372709, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.020719308406114578, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1546, + "grad_norm": 1.8963240385055542, + "kl": 0.3601706586778164, + "learning_rate": 4.742094126390851e-06, + "loss": 0.0144, + "num_tokens": 6680960.0, + "reward": 0.761962890625, + "reward_std": 0.007767863571643829, + "rewards//mean": 0.761962890625, + "rewards//std": 0.017892228439450264, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1548, + "grad_norm": 0.9243273735046387, + "kl": 0.3406798876821995, + "learning_rate": 4.7413917989079415e-06, + "loss": 0.0136, + "num_tokens": 6689760.0, + "reward": 0.7685546875, + "reward_std": 0.008658873848617077, + "rewards//mean": 0.7685546875, + "rewards//std": 0.02934746816754341, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.155, + "grad_norm": 0.9375861883163452, + "kl": 0.34522927552461624, + "learning_rate": 4.740688568592685e-06, + "loss": 0.0138, + "num_tokens": 6698392.0, + "reward": 0.77886962890625, + "reward_std": 0.008174005895853043, + "rewards//mean": 0.77886962890625, + "rewards//std": 0.021060163155198097, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1552, + "grad_norm": 0.8136488199234009, + "kl": 0.35180309042334557, + "learning_rate": 4.73998443572834e-06, + "loss": 0.0141, + "num_tokens": 6707040.0, + "reward": 0.7554931640625, + "reward_std": 0.008698609657585621, + "rewards//mean": 0.7554931640625, + "rewards//std": 0.0261689480394125, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1554, + "grad_norm": 0.9896641969680786, + "kl": 0.33184514194726944, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.0133, + "num_tokens": 6715672.0, + "reward": 0.76751708984375, + "reward_std": 0.008659705519676208, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.022086039185523987, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1556, + "grad_norm": 0.8349254131317139, + "kl": 0.3396892622113228, + "learning_rate": 4.7385734634872504e-06, + "loss": 0.0136, + "num_tokens": 6724312.0, + "reward": 0.76275634765625, + "reward_std": 0.010972029529511929, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.024619286879897118, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1558, + "grad_norm": 0.7266840934753418, + "kl": 0.3403605706989765, + "learning_rate": 4.7378666246788444e-06, + "loss": 0.0136, + "num_tokens": 6732968.0, + "reward": 0.7620849609375, + "reward_std": 0.011202620342373848, + "rewards//mean": 0.7620849609375, + "rewards//std": 0.028896017000079155, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.156, + "grad_norm": 0.6741737127304077, + "kl": 0.30693612433969975, + "learning_rate": 4.73715888445803e-06, + "loss": 0.0123, + "num_tokens": 6741616.0, + "reward": 0.7645263671875, + "reward_std": 0.01664084941148758, + "rewards//mean": 0.7645263671875, + "rewards//std": 0.028137991204857826, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1562, + "grad_norm": 0.9469835162162781, + "kl": 0.33184751495718956, + "learning_rate": 4.736450243109885e-06, + "loss": 0.0133, + "num_tokens": 6750248.0, + "reward": 0.772216796875, + "reward_std": 0.010948438197374344, + "rewards//mean": 0.772216796875, + "rewards//std": 0.02752363495528698, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1564, + "grad_norm": 1.3513708114624023, + "kl": 0.36903392523527145, + "learning_rate": 4.735740700919848e-06, + "loss": 0.0148, + "num_tokens": 6758784.0, + "reward": 0.75982666015625, + "reward_std": 0.011673888191580772, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.02905711531639099, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1566, + "grad_norm": 1.2254798412322998, + "kl": 0.3678886219859123, + "learning_rate": 4.7350302581737255e-06, + "loss": 0.0147, + "num_tokens": 6767448.0, + "reward": 0.7747802734375, + "reward_std": 0.009641564451158047, + "rewards//mean": 0.7747802734375, + "rewards//std": 0.019651401787996292, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1568, + "grad_norm": 1.1070396900177002, + "kl": 0.3434663563966751, + "learning_rate": 4.734318915157682e-06, + "loss": 0.0137, + "num_tokens": 6776224.0, + "reward": 0.72930908203125, + "reward_std": 0.009073897264897823, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.04494238644838333, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.157, + "grad_norm": 0.925721287727356, + "kl": 0.3413317985832691, + "learning_rate": 4.7336066721582464e-06, + "loss": 0.0137, + "num_tokens": 6784832.0, + "reward": 0.74957275390625, + "reward_std": 0.008924740366637707, + "rewards//mean": 0.74957275390625, + "rewards//std": 0.03833093121647835, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1572, + "grad_norm": 1.4691276550292969, + "kl": 0.3254700042307377, + "learning_rate": 4.73289352946231e-06, + "loss": 0.013, + "num_tokens": 6793464.0, + "reward": 0.783935546875, + "reward_std": 0.009136625565588474, + "rewards//mean": 0.783935546875, + "rewards//std": 0.021708672866225243, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1574, + "grad_norm": 0.8711658120155334, + "kl": 0.34013716131448746, + "learning_rate": 4.732179487357127e-06, + "loss": 0.0136, + "num_tokens": 6801992.0, + "reward": 0.76971435546875, + "reward_std": 0.010332462377846241, + "rewards//mean": 0.76971435546875, + "rewards//std": 0.028943846002221107, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1576, + "grad_norm": 1.2573686838150024, + "kl": 0.31912393122911453, + "learning_rate": 4.731464546130315e-06, + "loss": 0.0128, + "num_tokens": 6810688.0, + "reward": 0.7406005859375, + "reward_std": 0.01056898757815361, + "rewards//mean": 0.7406005859375, + "rewards//std": 0.026270560920238495, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1578, + "grad_norm": 0.6769285798072815, + "kl": 0.3092736713588238, + "learning_rate": 4.730748706069849e-06, + "loss": 0.0124, + "num_tokens": 6819336.0, + "reward": 0.7562255859375, + "reward_std": 0.01258472166955471, + "rewards//mean": 0.7562255859375, + "rewards//std": 0.038561511784791946, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.158, + "grad_norm": 0.7998493313789368, + "kl": 0.3514319136738777, + "learning_rate": 4.730031967464071e-06, + "loss": 0.0141, + "num_tokens": 6828088.0, + "reward": 0.74859619140625, + "reward_std": 0.008010081946849823, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.026524242013692856, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1582, + "grad_norm": 0.934973955154419, + "kl": 0.30601639300584793, + "learning_rate": 4.729314330601684e-06, + "loss": 0.0122, + "num_tokens": 6836808.0, + "reward": 0.757080078125, + "reward_std": 0.013350230641663074, + "rewards//mean": 0.757080078125, + "rewards//std": 0.020503751933574677, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1584, + "grad_norm": 0.9307112097740173, + "kl": 0.3210490830242634, + "learning_rate": 4.72859579577175e-06, + "loss": 0.0128, + "num_tokens": 6845400.0, + "reward": 0.75616455078125, + "reward_std": 0.008527729660272598, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.020061077550053596, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1586, + "grad_norm": 1.2021913528442383, + "kl": 0.33378901705145836, + "learning_rate": 4.7278763632636974e-06, + "loss": 0.0134, + "num_tokens": 6854096.0, + "reward": 0.72943115234375, + "reward_std": 0.005128923803567886, + "rewards//mean": 0.72943115234375, + "rewards//std": 0.0317058339715004, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1588, + "grad_norm": 1.0796374082565308, + "kl": 0.32557179778814316, + "learning_rate": 4.727156033367312e-06, + "loss": 0.013, + "num_tokens": 6862656.0, + "reward": 0.7237548828125, + "reward_std": 0.006958717480301857, + "rewards//mean": 0.7237548828125, + "rewards//std": 0.01773715205490589, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.159, + "grad_norm": 1.2356610298156738, + "kl": 0.30212575383484364, + "learning_rate": 4.7264348063727415e-06, + "loss": 0.0121, + "num_tokens": 6871248.0, + "reward": 0.763671875, + "reward_std": 0.00585189089179039, + "rewards//mean": 0.763671875, + "rewards//std": 0.024861659854650497, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1592, + "grad_norm": 1.584045648574829, + "kl": 0.32087236642837524, + "learning_rate": 4.725712682570498e-06, + "loss": 0.0128, + "num_tokens": 6879968.0, + "reward": 0.7618408203125, + "reward_std": 0.01183655858039856, + "rewards//mean": 0.7618408203125, + "rewards//std": 0.029248006641864777, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1594, + "grad_norm": 1.0374621152877808, + "kl": 0.31318943575024605, + "learning_rate": 4.724989662251452e-06, + "loss": 0.0125, + "num_tokens": 6888536.0, + "reward": 0.76678466796875, + "reward_std": 0.011448527686297894, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.025842614471912384, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1596, + "grad_norm": 1.0205044746398926, + "kl": 0.30759022757411003, + "learning_rate": 4.724265745706837e-06, + "loss": 0.0123, + "num_tokens": 6897080.0, + "reward": 0.7689208984375, + "reward_std": 0.0078608188778162, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.026348810642957687, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1598, + "grad_norm": 2.31490159034729, + "kl": 0.3154881428927183, + "learning_rate": 4.723540933228245e-06, + "loss": 0.0126, + "num_tokens": 6905712.0, + "reward": 0.753662109375, + "reward_std": 0.0058388663455843925, + "rewards//mean": 0.753662109375, + "rewards//std": 0.029780646786093712, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.16, + "grad_norm": 1.1097886562347412, + "kl": 0.3300345875322819, + "learning_rate": 4.7228152251076295e-06, + "loss": 0.0132, + "num_tokens": 6914304.0, + "reward": 0.77337646484375, + "reward_std": 0.009435447864234447, + "rewards//mean": 0.77337646484375, + "rewards//std": 0.024680698290467262, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1602, + "grad_norm": 1.193706750869751, + "kl": 0.3689091205596924, + "learning_rate": 4.7220886216373095e-06, + "loss": 0.0148, + "num_tokens": 6922960.0, + "reward": 0.78070068359375, + "reward_std": 0.007954924367368221, + "rewards//mean": 0.78070068359375, + "rewards//std": 0.030088046565651894, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1604, + "grad_norm": 1.4976460933685303, + "kl": 0.34658120200037956, + "learning_rate": 4.7213611231099575e-06, + "loss": 0.0139, + "num_tokens": 6931632.0, + "reward": 0.7415771484375, + "reward_std": 0.007033228408545256, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.03241577744483948, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1606, + "grad_norm": 1.3477445840835571, + "kl": 0.35779566690325737, + "learning_rate": 4.7206327298186105e-06, + "loss": 0.0143, + "num_tokens": 6940272.0, + "reward": 0.77032470703125, + "reward_std": 0.006013727746903896, + "rewards//mean": 0.77032470703125, + "rewards//std": 0.021487809717655182, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1608, + "grad_norm": 2.233660936355591, + "kl": 0.324648205190897, + "learning_rate": 4.7199034420566656e-06, + "loss": 0.013, + "num_tokens": 6948856.0, + "reward": 0.74658203125, + "reward_std": 0.009060757234692574, + "rewards//mean": 0.74658203125, + "rewards//std": 0.033411283046007156, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.161, + "grad_norm": 1.2979011535644531, + "kl": 0.35064204037189484, + "learning_rate": 4.7191732601178795e-06, + "loss": 0.014, + "num_tokens": 6957592.0, + "reward": 0.7510986328125, + "reward_std": 0.007960843853652477, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.027572816237807274, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1612, + "grad_norm": 1.3037084341049194, + "kl": 0.37102117761969566, + "learning_rate": 4.71844218429637e-06, + "loss": 0.0148, + "num_tokens": 6966240.0, + "reward": 0.77691650390625, + "reward_std": 0.013105214573442936, + "rewards//mean": 0.77691650390625, + "rewards//std": 0.0265219584107399, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1614, + "grad_norm": 1.3451786041259766, + "kl": 0.3336266949772835, + "learning_rate": 4.717710214886614e-06, + "loss": 0.0133, + "num_tokens": 6974904.0, + "reward": 0.7286376953125, + "reward_std": 0.009902372024953365, + "rewards//mean": 0.7286376953125, + "rewards//std": 0.03538607805967331, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1616, + "grad_norm": 1.0671733617782593, + "kl": 0.3264966905117035, + "learning_rate": 4.716977352183449e-06, + "loss": 0.0131, + "num_tokens": 6983504.0, + "reward": 0.773193359375, + "reward_std": 0.007287709973752499, + "rewards//mean": 0.773193359375, + "rewards//std": 0.025993958115577698, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1618, + "grad_norm": 1.8360722064971924, + "kl": 0.33740856125950813, + "learning_rate": 4.716243596482071e-06, + "loss": 0.0135, + "num_tokens": 6992216.0, + "reward": 0.75079345703125, + "reward_std": 0.008213981986045837, + "rewards//mean": 0.75079345703125, + "rewards//std": 0.028178511187434196, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.162, + "grad_norm": 1.5714982748031616, + "kl": 0.3707769885659218, + "learning_rate": 4.715508948078037e-06, + "loss": 0.0148, + "num_tokens": 7000816.0, + "reward": 0.77569580078125, + "reward_std": 0.012640546075999737, + "rewards//mean": 0.77569580078125, + "rewards//std": 0.034434448927640915, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1622, + "grad_norm": 1.2873075008392334, + "kl": 0.3405948132276535, + "learning_rate": 4.714773407267264e-06, + "loss": 0.0136, + "num_tokens": 7009480.0, + "reward": 0.7650146484375, + "reward_std": 0.006831423845142126, + "rewards//mean": 0.7650146484375, + "rewards//std": 0.020054001361131668, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1624, + "grad_norm": 0.8567954301834106, + "kl": 0.3825432136654854, + "learning_rate": 4.714036974346028e-06, + "loss": 0.0153, + "num_tokens": 7018136.0, + "reward": 0.7357177734375, + "reward_std": 0.011918211355805397, + "rewards//mean": 0.7357177734375, + "rewards//std": 0.028866665437817574, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1626, + "grad_norm": 1.2704569101333618, + "kl": 0.35488636791706085, + "learning_rate": 4.7132996496109625e-06, + "loss": 0.0142, + "num_tokens": 7026824.0, + "reward": 0.76324462890625, + "reward_std": 0.010413013398647308, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.025464370846748352, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1628, + "grad_norm": 1.061576247215271, + "kl": 0.3901236914098263, + "learning_rate": 4.712561433359064e-06, + "loss": 0.0156, + "num_tokens": 7035488.0, + "reward": 0.74591064453125, + "reward_std": 0.008115001022815704, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.02554805390536785, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.163, + "grad_norm": 1.8980876207351685, + "kl": 0.3521212972700596, + "learning_rate": 4.7118223258876845e-06, + "loss": 0.0141, + "num_tokens": 7044072.0, + "reward": 0.75384521484375, + "reward_std": 0.007846582680940628, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.016912657767534256, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1632, + "grad_norm": 1.1933066844940186, + "kl": 0.38137778267264366, + "learning_rate": 4.711082327494536e-06, + "loss": 0.0153, + "num_tokens": 7052744.0, + "reward": 0.76531982421875, + "reward_std": 0.007578435353934765, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.02658751606941223, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1634, + "grad_norm": 1.466513991355896, + "kl": 0.4100740812718868, + "learning_rate": 4.710341438477691e-06, + "loss": 0.0164, + "num_tokens": 7061384.0, + "reward": 0.73834228515625, + "reward_std": 0.010529394261538982, + "rewards//mean": 0.73834228515625, + "rewards//std": 0.03200426325201988, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1636, + "grad_norm": 1.2618350982666016, + "kl": 0.39261148124933243, + "learning_rate": 4.709599659135579e-06, + "loss": 0.0157, + "num_tokens": 7070008.0, + "reward": 0.73333740234375, + "reward_std": 0.009211428463459015, + "rewards//mean": 0.73333740234375, + "rewards//std": 0.03585033491253853, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1638, + "grad_norm": 1.817918062210083, + "kl": 0.37963099777698517, + "learning_rate": 4.708856989766988e-06, + "loss": 0.0152, + "num_tokens": 7078680.0, + "reward": 0.78253173828125, + "reward_std": 0.008948778733611107, + "rewards//mean": 0.78253173828125, + "rewards//std": 0.018895337358117104, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.164, + "grad_norm": 1.1052753925323486, + "kl": 0.3860554024577141, + "learning_rate": 4.708113430671066e-06, + "loss": 0.0154, + "num_tokens": 7087336.0, + "reward": 0.73651123046875, + "reward_std": 0.007864532060921192, + "rewards//mean": 0.73651123046875, + "rewards//std": 0.027699550613760948, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1642, + "grad_norm": 1.0943959951400757, + "kl": 0.4198205694556236, + "learning_rate": 4.707368982147318e-06, + "loss": 0.0168, + "num_tokens": 7096016.0, + "reward": 0.75958251953125, + "reward_std": 0.008250880986452103, + "rewards//mean": 0.75958251953125, + "rewards//std": 0.024159371852874756, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1644, + "grad_norm": 1.2998090982437134, + "kl": 0.4261195734143257, + "learning_rate": 4.706623644495608e-06, + "loss": 0.017, + "num_tokens": 7104736.0, + "reward": 0.71905517578125, + "reward_std": 0.007329055108129978, + "rewards//mean": 0.71905517578125, + "rewards//std": 0.03233412653207779, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1646, + "grad_norm": 1.3837676048278809, + "kl": 0.416591789573431, + "learning_rate": 4.705877418016157e-06, + "loss": 0.0167, + "num_tokens": 7113368.0, + "reward": 0.7564697265625, + "reward_std": 0.010825035162270069, + "rewards//mean": 0.7564697265625, + "rewards//std": 0.029000600799918175, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1648, + "grad_norm": 1.2816661596298218, + "kl": 0.44941718876361847, + "learning_rate": 4.705130303009547e-06, + "loss": 0.018, + "num_tokens": 7122128.0, + "reward": 0.77276611328125, + "reward_std": 0.008830229751765728, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.021638043224811554, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.165, + "grad_norm": 1.167736530303955, + "kl": 0.41115984693169594, + "learning_rate": 4.7043822997767145e-06, + "loss": 0.0164, + "num_tokens": 7130776.0, + "reward": 0.7608642578125, + "reward_std": 0.007697493769228458, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.014951195567846298, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1652, + "grad_norm": 1.9024914503097534, + "kl": 0.44243358448147774, + "learning_rate": 4.703633408618955e-06, + "loss": 0.0177, + "num_tokens": 7139528.0, + "reward": 0.739990234375, + "reward_std": 0.012601775117218494, + "rewards//mean": 0.739990234375, + "rewards//std": 0.02922237664461136, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1654, + "grad_norm": 1.651120901107788, + "kl": 0.41875192895531654, + "learning_rate": 4.702883629837922e-06, + "loss": 0.0168, + "num_tokens": 7148128.0, + "reward": 0.71734619140625, + "reward_std": 0.008713518269360065, + "rewards//mean": 0.71734619140625, + "rewards//std": 0.04558815062046051, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1656, + "grad_norm": 3.4639272689819336, + "kl": 0.44034529104828835, + "learning_rate": 4.7021329637356274e-06, + "loss": 0.0176, + "num_tokens": 7156728.0, + "reward": 0.72979736328125, + "reward_std": 0.007449123077094555, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.03378652408719063, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1658, + "grad_norm": 1.6177629232406616, + "kl": 0.39477837830781937, + "learning_rate": 4.701381410614437e-06, + "loss": 0.0158, + "num_tokens": 7165408.0, + "reward": 0.76959228515625, + "reward_std": 0.01015202235430479, + "rewards//mean": 0.76959228515625, + "rewards//std": 0.029647106304764748, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.166, + "grad_norm": 1.7870838642120361, + "kl": 0.49362414330244064, + "learning_rate": 4.700628970777078e-06, + "loss": 0.0197, + "num_tokens": 7174016.0, + "reward": 0.759765625, + "reward_std": 0.011450910940766335, + "rewards//mean": 0.759765625, + "rewards//std": 0.033861320465803146, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1662, + "grad_norm": 1.3444162607192993, + "kl": 0.45138998702168465, + "learning_rate": 4.699875644526633e-06, + "loss": 0.0181, + "num_tokens": 7182656.0, + "reward": 0.752685546875, + "reward_std": 0.009241987019777298, + "rewards//mean": 0.752685546875, + "rewards//std": 0.02500615082681179, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1664, + "grad_norm": 2.9015026092529297, + "kl": 0.45517536252737045, + "learning_rate": 4.699121432166542e-06, + "loss": 0.0182, + "num_tokens": 7191360.0, + "reward": 0.7528076171875, + "reward_std": 0.005533740855753422, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.033112503588199615, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1666, + "grad_norm": 1.1061749458312988, + "kl": 0.5402024127542973, + "learning_rate": 4.6983663340006e-06, + "loss": 0.0216, + "num_tokens": 7200080.0, + "reward": 0.75238037109375, + "reward_std": 0.009564736858010292, + "rewards//mean": 0.75238037109375, + "rewards//std": 0.019030246883630753, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1668, + "grad_norm": 0.9863348603248596, + "kl": 0.4983534514904022, + "learning_rate": 4.697610350332962e-06, + "loss": 0.0199, + "num_tokens": 7208816.0, + "reward": 0.75982666015625, + "reward_std": 0.010304231196641922, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.026097338646650314, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.167, + "grad_norm": 0.8810539841651917, + "kl": 0.4796448089182377, + "learning_rate": 4.696853481468137e-06, + "loss": 0.0192, + "num_tokens": 7217488.0, + "reward": 0.75048828125, + "reward_std": 0.009355641901493073, + "rewards//mean": 0.75048828125, + "rewards//std": 0.030659254640340805, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1672, + "grad_norm": 1.3327339887619019, + "kl": 0.48946595191955566, + "learning_rate": 4.6960957277109945e-06, + "loss": 0.0196, + "num_tokens": 7226024.0, + "reward": 0.75537109375, + "reward_std": 0.009866978041827679, + "rewards//mean": 0.75537109375, + "rewards//std": 0.025596076622605324, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1674, + "grad_norm": 0.9784926772117615, + "kl": 0.5766041316092014, + "learning_rate": 4.695337089366754e-06, + "loss": 0.0231, + "num_tokens": 7234672.0, + "reward": 0.74761962890625, + "reward_std": 0.012241121381521225, + "rewards//mean": 0.74761962890625, + "rewards//std": 0.03139396011829376, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1676, + "grad_norm": 1.5746252536773682, + "kl": 0.528806246817112, + "learning_rate": 4.694577566740996e-06, + "loss": 0.0212, + "num_tokens": 7243248.0, + "reward": 0.72967529296875, + "reward_std": 0.00966464914381504, + "rewards//mean": 0.72967529296875, + "rewards//std": 0.029709843918681145, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1678, + "grad_norm": 1.091158390045166, + "kl": 0.5902933292090893, + "learning_rate": 4.693817160139657e-06, + "loss": 0.0236, + "num_tokens": 7252024.0, + "reward": 0.73333740234375, + "reward_std": 0.008134350180625916, + "rewards//mean": 0.73333740234375, + "rewards//std": 0.02597873844206333, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.168, + "grad_norm": 1.2695715427398682, + "kl": 0.5501630380749702, + "learning_rate": 4.693055869869029e-06, + "loss": 0.022, + "num_tokens": 7260632.0, + "reward": 0.74896240234375, + "reward_std": 0.007687639910727739, + "rewards//mean": 0.74896240234375, + "rewards//std": 0.0311431884765625, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1682, + "grad_norm": 2.8090455532073975, + "kl": 0.5814741887152195, + "learning_rate": 4.692293696235758e-06, + "loss": 0.0233, + "num_tokens": 7269232.0, + "reward": 0.76068115234375, + "reward_std": 0.006853449624031782, + "rewards//mean": 0.76068115234375, + "rewards//std": 0.019774554297327995, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1684, + "grad_norm": 0.8583130836486816, + "kl": 0.5908543989062309, + "learning_rate": 4.6915306395468485e-06, + "loss": 0.0236, + "num_tokens": 7277888.0, + "reward": 0.75555419921875, + "reward_std": 0.009005377069115639, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.018253128975629807, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1686, + "grad_norm": 1.7500971555709839, + "kl": 0.5774266049265862, + "learning_rate": 4.690766700109659e-06, + "loss": 0.0231, + "num_tokens": 7286528.0, + "reward": 0.7679443359375, + "reward_std": 0.010070763528347015, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.02478085085749626, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1688, + "grad_norm": 1.1226806640625, + "kl": 0.5556729286909103, + "learning_rate": 4.690001878231906e-06, + "loss": 0.0222, + "num_tokens": 7295168.0, + "reward": 0.74505615234375, + "reward_std": 0.009786233305931091, + "rewards//mean": 0.74505615234375, + "rewards//std": 0.01919025555253029, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.169, + "grad_norm": 1.5926430225372314, + "kl": 0.5301719754934311, + "learning_rate": 4.689236174221658e-06, + "loss": 0.0212, + "num_tokens": 7303800.0, + "reward": 0.73565673828125, + "reward_std": 0.010708114132285118, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.017935220152139664, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1692, + "grad_norm": 1.3184345960617065, + "kl": 0.5430991984903812, + "learning_rate": 4.688469588387339e-06, + "loss": 0.0217, + "num_tokens": 7312504.0, + "reward": 0.77593994140625, + "reward_std": 0.009601364843547344, + "rewards//mean": 0.77593994140625, + "rewards//std": 0.026642685756087303, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1694, + "grad_norm": 0.8678821325302124, + "kl": 0.590129129588604, + "learning_rate": 4.687702121037734e-06, + "loss": 0.0236, + "num_tokens": 7321176.0, + "reward": 0.740234375, + "reward_std": 0.009357824921607971, + "rewards//mean": 0.740234375, + "rewards//std": 0.026721050962805748, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1696, + "grad_norm": 3.635012149810791, + "kl": 0.5143630467355251, + "learning_rate": 4.6869337724819745e-06, + "loss": 0.0206, + "num_tokens": 7329792.0, + "reward": 0.7623291015625, + "reward_std": 0.010166186839342117, + "rewards//mean": 0.7623291015625, + "rewards//std": 0.02383934147655964, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1698, + "grad_norm": 1.2785440683364868, + "kl": 0.5269539132714272, + "learning_rate": 4.686164543029554e-06, + "loss": 0.0211, + "num_tokens": 7338440.0, + "reward": 0.774169921875, + "reward_std": 0.008573425933718681, + "rewards//mean": 0.774169921875, + "rewards//std": 0.02073865942656994, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.17, + "grad_norm": 1.50819730758667, + "kl": 0.49373095482587814, + "learning_rate": 4.685394432990316e-06, + "loss": 0.0197, + "num_tokens": 7347016.0, + "reward": 0.756591796875, + "reward_std": 0.011000225320458412, + "rewards//mean": 0.756591796875, + "rewards//std": 0.027347072958946228, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1702, + "grad_norm": 0.9949831962585449, + "kl": 0.5279292948544025, + "learning_rate": 4.684623442674463e-06, + "loss": 0.0211, + "num_tokens": 7355664.0, + "reward": 0.69683837890625, + "reward_std": 0.0144025394693017, + "rewards//mean": 0.69683837890625, + "rewards//std": 0.041202642023563385, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1704, + "grad_norm": 1.1295602321624756, + "kl": 0.5627134665846825, + "learning_rate": 4.683851572392548e-06, + "loss": 0.0225, + "num_tokens": 7364392.0, + "reward": 0.7559814453125, + "reward_std": 0.01152932457625866, + "rewards//mean": 0.7559814453125, + "rewards//std": 0.03178766742348671, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1706, + "grad_norm": 1.0643706321716309, + "kl": 0.5173224434256554, + "learning_rate": 4.68307882245548e-06, + "loss": 0.0207, + "num_tokens": 7373032.0, + "reward": 0.7471923828125, + "reward_std": 0.010553266853094101, + "rewards//mean": 0.7471923828125, + "rewards//std": 0.03074578382074833, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1708, + "grad_norm": 0.8686037063598633, + "kl": 0.5057759210467339, + "learning_rate": 4.682305193174524e-06, + "loss": 0.0202, + "num_tokens": 7381688.0, + "reward": 0.75091552734375, + "reward_std": 0.014319094829261303, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.027645394206047058, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.171, + "grad_norm": 2.133981943130493, + "kl": 0.5340125001966953, + "learning_rate": 4.681530684861298e-06, + "loss": 0.0214, + "num_tokens": 7390344.0, + "reward": 0.77490234375, + "reward_std": 0.012180844321846962, + "rewards//mean": 0.77490234375, + "rewards//std": 0.020534737035632133, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1712, + "grad_norm": 1.4045356512069702, + "kl": 0.5420391000807285, + "learning_rate": 4.680755297827772e-06, + "loss": 0.0217, + "num_tokens": 7398960.0, + "reward": 0.76593017578125, + "reward_std": 0.007923927158117294, + "rewards//mean": 0.76593017578125, + "rewards//std": 0.02088331989943981, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1714, + "grad_norm": 0.8970532417297363, + "kl": 0.49403341114521027, + "learning_rate": 4.6799790323862735e-06, + "loss": 0.0198, + "num_tokens": 7407632.0, + "reward": 0.7232666015625, + "reward_std": 0.01003449596464634, + "rewards//mean": 0.7232666015625, + "rewards//std": 0.029150541871786118, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1716, + "grad_norm": 1.2930840253829956, + "kl": 0.4648851417005062, + "learning_rate": 4.679201888849481e-06, + "loss": 0.0186, + "num_tokens": 7416224.0, + "reward": 0.77532958984375, + "reward_std": 0.008281329646706581, + "rewards//mean": 0.77532958984375, + "rewards//std": 0.02078668773174286, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1718, + "grad_norm": 0.9651484489440918, + "kl": 0.44804292544722557, + "learning_rate": 4.678423867530428e-06, + "loss": 0.0179, + "num_tokens": 7424872.0, + "reward": 0.77166748046875, + "reward_std": 0.009435847401618958, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.024149343371391296, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.172, + "grad_norm": 1.372725486755371, + "kl": 0.4652128331363201, + "learning_rate": 4.677644968742503e-06, + "loss": 0.0186, + "num_tokens": 7433544.0, + "reward": 0.77142333984375, + "reward_std": 0.011212656274437904, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.028442122042179108, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1722, + "grad_norm": 1.1838834285736084, + "kl": 0.4007790870964527, + "learning_rate": 4.676865192799443e-06, + "loss": 0.016, + "num_tokens": 7442144.0, + "reward": 0.75579833984375, + "reward_std": 0.008447399362921715, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.024019237607717514, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1724, + "grad_norm": 0.6647836565971375, + "kl": 0.45183153450489044, + "learning_rate": 4.676084540015345e-06, + "loss": 0.0181, + "num_tokens": 7450776.0, + "reward": 0.75628662109375, + "reward_std": 0.009683351032435894, + "rewards//mean": 0.75628662109375, + "rewards//std": 0.026084575802087784, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1726, + "grad_norm": 1.740395426750183, + "kl": 0.4510505050420761, + "learning_rate": 4.675303010704654e-06, + "loss": 0.018, + "num_tokens": 7459440.0, + "reward": 0.7762451171875, + "reward_std": 0.01001917663961649, + "rewards//mean": 0.7762451171875, + "rewards//std": 0.022178800776600838, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1728, + "grad_norm": 0.8341839909553528, + "kl": 0.42043209448456764, + "learning_rate": 4.674520605182171e-06, + "loss": 0.0168, + "num_tokens": 7468160.0, + "reward": 0.7720947265625, + "reward_std": 0.010514896363019943, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.02007814310491085, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.173, + "grad_norm": 0.6752360463142395, + "kl": 0.43009642139077187, + "learning_rate": 4.673737323763048e-06, + "loss": 0.0172, + "num_tokens": 7476752.0, + "reward": 0.76568603515625, + "reward_std": 0.00976257212460041, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.015445588156580925, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1732, + "grad_norm": 0.6788051128387451, + "kl": 0.4369927644729614, + "learning_rate": 4.672953166762791e-06, + "loss": 0.0175, + "num_tokens": 7485448.0, + "reward": 0.76446533203125, + "reward_std": 0.012529331259429455, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.025926828384399414, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1734, + "grad_norm": 0.6571235060691833, + "kl": 0.45430251583456993, + "learning_rate": 4.672168134497258e-06, + "loss": 0.0182, + "num_tokens": 7494104.0, + "reward": 0.720947265625, + "reward_std": 0.015459833666682243, + "rewards//mean": 0.720947265625, + "rewards//std": 0.031657155603170395, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1736, + "grad_norm": 0.6456576585769653, + "kl": 0.4099312573671341, + "learning_rate": 4.671382227282661e-06, + "loss": 0.0164, + "num_tokens": 7502752.0, + "reward": 0.75372314453125, + "reward_std": 0.011794820427894592, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.02672494389116764, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1738, + "grad_norm": 0.8506948947906494, + "kl": 0.41736486181616783, + "learning_rate": 4.670595445435561e-06, + "loss": 0.0167, + "num_tokens": 7511400.0, + "reward": 0.7132568359375, + "reward_std": 0.010492125526070595, + "rewards//mean": 0.7132568359375, + "rewards//std": 0.03419385850429535, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.174, + "grad_norm": 0.5586651563644409, + "kl": 0.40891773998737335, + "learning_rate": 4.669807789272877e-06, + "loss": 0.0164, + "num_tokens": 7520040.0, + "reward": 0.76055908203125, + "reward_std": 0.009399401023983955, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.025711068883538246, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1742, + "grad_norm": 0.8430010080337524, + "kl": 0.37954793870449066, + "learning_rate": 4.669019259111873e-06, + "loss": 0.0152, + "num_tokens": 7528672.0, + "reward": 0.77191162109375, + "reward_std": 0.011711275205016136, + "rewards//mean": 0.77191162109375, + "rewards//std": 0.022103851661086082, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1744, + "grad_norm": 0.6705936789512634, + "kl": 0.3731926344335079, + "learning_rate": 4.668229855270172e-06, + "loss": 0.0149, + "num_tokens": 7537480.0, + "reward": 0.76348876953125, + "reward_std": 0.017561282962560654, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.03937086835503578, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1746, + "grad_norm": 0.5107825994491577, + "kl": 0.3830333612859249, + "learning_rate": 4.667439578065745e-06, + "loss": 0.0153, + "num_tokens": 7546136.0, + "reward": 0.76861572265625, + "reward_std": 0.008930927142500877, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.0147271528840065, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1748, + "grad_norm": 0.6749988794326782, + "kl": 0.32537274435162544, + "learning_rate": 4.666648427816914e-06, + "loss": 0.013, + "num_tokens": 7554760.0, + "reward": 0.7623291015625, + "reward_std": 0.014128495939075947, + "rewards//mean": 0.7623291015625, + "rewards//std": 0.0285969115793705, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.175, + "grad_norm": 0.5768765807151794, + "kl": 0.3486756831407547, + "learning_rate": 4.665856404842356e-06, + "loss": 0.0139, + "num_tokens": 7563360.0, + "reward": 0.76446533203125, + "reward_std": 0.010931688360869884, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.028379783034324646, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1752, + "grad_norm": 0.5427371859550476, + "kl": 0.3550872132182121, + "learning_rate": 4.665063509461098e-06, + "loss": 0.0142, + "num_tokens": 7572032.0, + "reward": 0.74072265625, + "reward_std": 0.008837481960654259, + "rewards//mean": 0.74072265625, + "rewards//std": 0.029343342408537865, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1754, + "grad_norm": 0.607874870300293, + "kl": 0.3369791731238365, + "learning_rate": 4.664269741992516e-06, + "loss": 0.0135, + "num_tokens": 7580640.0, + "reward": 0.73492431640625, + "reward_std": 0.009598618373274803, + "rewards//mean": 0.73492431640625, + "rewards//std": 0.032740283757448196, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1756, + "grad_norm": 0.5573941469192505, + "kl": 0.33512082695961, + "learning_rate": 4.663475102756341e-06, + "loss": 0.0134, + "num_tokens": 7589144.0, + "reward": 0.721923828125, + "reward_std": 0.008135411888360977, + "rewards//mean": 0.721923828125, + "rewards//std": 0.031764086335897446, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1758, + "grad_norm": 0.5206130146980286, + "kl": 0.33247267454862595, + "learning_rate": 4.662679592072653e-06, + "loss": 0.0133, + "num_tokens": 7597784.0, + "reward": 0.738037109375, + "reward_std": 0.011560275219380856, + "rewards//mean": 0.738037109375, + "rewards//std": 0.02845815010368824, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.176, + "grad_norm": 0.4959293305873871, + "kl": 0.3310950994491577, + "learning_rate": 4.661883210261884e-06, + "loss": 0.0132, + "num_tokens": 7606448.0, + "reward": 0.74853515625, + "reward_std": 0.009194498881697655, + "rewards//mean": 0.74853515625, + "rewards//std": 0.024185124784708023, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1762, + "grad_norm": 0.5584758520126343, + "kl": 0.31146182119846344, + "learning_rate": 4.661085957644817e-06, + "loss": 0.0125, + "num_tokens": 7615096.0, + "reward": 0.7435302734375, + "reward_std": 0.011724255979061127, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.02828180231153965, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1764, + "grad_norm": 0.5160636901855469, + "kl": 0.31183508411049843, + "learning_rate": 4.660287834542585e-06, + "loss": 0.0125, + "num_tokens": 7623656.0, + "reward": 0.75146484375, + "reward_std": 0.009842973202466965, + "rewards//mean": 0.75146484375, + "rewards//std": 0.026406485587358475, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1766, + "grad_norm": 0.5011554956436157, + "kl": 0.27698101848363876, + "learning_rate": 4.659488841276671e-06, + "loss": 0.0111, + "num_tokens": 7632288.0, + "reward": 0.73583984375, + "reward_std": 0.00816989317536354, + "rewards//mean": 0.73583984375, + "rewards//std": 0.03832703083753586, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1768, + "grad_norm": 0.5525965690612793, + "kl": 0.3069544155150652, + "learning_rate": 4.65868897816891e-06, + "loss": 0.0123, + "num_tokens": 7640848.0, + "reward": 0.71710205078125, + "reward_std": 0.008834538981318474, + "rewards//mean": 0.71710205078125, + "rewards//std": 0.024956993758678436, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.177, + "grad_norm": 0.7361370325088501, + "kl": 0.3224721159785986, + "learning_rate": 4.6578882455414865e-06, + "loss": 0.0129, + "num_tokens": 7649536.0, + "reward": 0.72198486328125, + "reward_std": 0.01567103900015354, + "rewards//mean": 0.72198486328125, + "rewards//std": 0.03261938691139221, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1772, + "grad_norm": 0.5023349523544312, + "kl": 0.29884358681738377, + "learning_rate": 4.657086643716937e-06, + "loss": 0.012, + "num_tokens": 7658176.0, + "reward": 0.762939453125, + "reward_std": 0.007250561378896236, + "rewards//mean": 0.762939453125, + "rewards//std": 0.022850144654512405, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1774, + "grad_norm": 0.5194371342658997, + "kl": 0.28458988294005394, + "learning_rate": 4.656284173018144e-06, + "loss": 0.0114, + "num_tokens": 7666712.0, + "reward": 0.76953125, + "reward_std": 0.01195722445845604, + "rewards//mean": 0.76953125, + "rewards//std": 0.024160075932741165, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1776, + "grad_norm": 0.46291840076446533, + "kl": 0.281252883374691, + "learning_rate": 4.655480833768344e-06, + "loss": 0.0113, + "num_tokens": 7675416.0, + "reward": 0.7490234375, + "reward_std": 0.009653441607952118, + "rewards//mean": 0.7490234375, + "rewards//std": 0.04129883274435997, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1778, + "grad_norm": 0.5779797434806824, + "kl": 0.29215819016098976, + "learning_rate": 4.654676626291123e-06, + "loss": 0.0117, + "num_tokens": 7684048.0, + "reward": 0.75439453125, + "reward_std": 0.010933919809758663, + "rewards//mean": 0.75439453125, + "rewards//std": 0.03273756802082062, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.178, + "grad_norm": 0.8437680602073669, + "kl": 0.2626949865370989, + "learning_rate": 4.653871550910414e-06, + "loss": 0.0105, + "num_tokens": 7692680.0, + "reward": 0.7613525390625, + "reward_std": 0.010730916634202003, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.02644515223801136, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1782, + "grad_norm": 0.5432496666908264, + "kl": 0.2565987464040518, + "learning_rate": 4.653065607950502e-06, + "loss": 0.0103, + "num_tokens": 7701288.0, + "reward": 0.7493896484375, + "reward_std": 0.013223410584032536, + "rewards//mean": 0.7493896484375, + "rewards//std": 0.03801432624459267, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1784, + "grad_norm": 0.6957722902297974, + "kl": 0.25557348132133484, + "learning_rate": 4.65225879773602e-06, + "loss": 0.0102, + "num_tokens": 7709928.0, + "reward": 0.76568603515625, + "reward_std": 0.008938233368098736, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.023193299770355225, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1786, + "grad_norm": 0.526515781879425, + "kl": 0.2758950814604759, + "learning_rate": 4.651451120591952e-06, + "loss": 0.011, + "num_tokens": 7718504.0, + "reward": 0.760009765625, + "reward_std": 0.008625369518995285, + "rewards//mean": 0.760009765625, + "rewards//std": 0.0195234976708889, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1788, + "grad_norm": 0.6605725884437561, + "kl": 0.270997678861022, + "learning_rate": 4.650642576843631e-06, + "loss": 0.0108, + "num_tokens": 7727272.0, + "reward": 0.76007080078125, + "reward_std": 0.01003330573439598, + "rewards//mean": 0.76007080078125, + "rewards//std": 0.03080352023243904, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.179, + "grad_norm": 0.5448390245437622, + "kl": 0.252898957580328, + "learning_rate": 4.649833166816736e-06, + "loss": 0.0101, + "num_tokens": 7736008.0, + "reward": 0.73876953125, + "reward_std": 0.011466844007372856, + "rewards//mean": 0.73876953125, + "rewards//std": 0.03744480386376381, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1792, + "grad_norm": 0.6046472191810608, + "kl": 0.2753715682774782, + "learning_rate": 4.649022890837298e-06, + "loss": 0.011, + "num_tokens": 7744680.0, + "reward": 0.749267578125, + "reward_std": 0.009591508656740189, + "rewards//mean": 0.749267578125, + "rewards//std": 0.02788209356367588, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1794, + "grad_norm": 0.8163694739341736, + "kl": 0.2629574202001095, + "learning_rate": 4.648211749231698e-06, + "loss": 0.0105, + "num_tokens": 7753328.0, + "reward": 0.78839111328125, + "reward_std": 0.00975382886826992, + "rewards//mean": 0.78839111328125, + "rewards//std": 0.02201532945036888, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1796, + "grad_norm": 0.8590662479400635, + "kl": 0.2539634648710489, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.0102, + "num_tokens": 7761912.0, + "reward": 0.765380859375, + "reward_std": 0.007768705021589994, + "rewards//mean": 0.765380859375, + "rewards//std": 0.016126545146107674, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1798, + "grad_norm": 0.5974591970443726, + "kl": 0.26154331117868423, + "learning_rate": 4.646586870449266e-06, + "loss": 0.0105, + "num_tokens": 7770584.0, + "reward": 0.761962890625, + "reward_std": 0.013042537495493889, + "rewards//mean": 0.761962890625, + "rewards//std": 0.034868303686380386, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.18, + "grad_norm": 0.8781308531761169, + "kl": 0.2626851834356785, + "learning_rate": 4.645773133926936e-06, + "loss": 0.0105, + "num_tokens": 7779176.0, + "reward": 0.75445556640625, + "reward_std": 0.010092251002788544, + "rewards//mean": 0.75445556640625, + "rewards//std": 0.03166474774479866, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1802, + "grad_norm": 0.6971238255500793, + "kl": 0.25614192336797714, + "learning_rate": 4.644958533087443e-06, + "loss": 0.0102, + "num_tokens": 7787928.0, + "reward": 0.75091552734375, + "reward_std": 0.008847690187394619, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.024505889043211937, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1804, + "grad_norm": 0.6750687956809998, + "kl": 0.26914981193840504, + "learning_rate": 4.64414306825891e-06, + "loss": 0.0108, + "num_tokens": 7796560.0, + "reward": 0.7652587890625, + "reward_std": 0.012014053761959076, + "rewards//mean": 0.7652587890625, + "rewards//std": 0.035356976091861725, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1806, + "grad_norm": 0.6785114407539368, + "kl": 0.2674534786492586, + "learning_rate": 4.643326739769805e-06, + "loss": 0.0107, + "num_tokens": 7805264.0, + "reward": 0.72943115234375, + "reward_std": 0.011744974181056023, + "rewards//mean": 0.72943115234375, + "rewards//std": 0.037007205188274384, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1808, + "grad_norm": 0.6502825617790222, + "kl": 0.2490905374288559, + "learning_rate": 4.642509547948947e-06, + "loss": 0.01, + "num_tokens": 7813920.0, + "reward": 0.7679443359375, + "reward_std": 0.009585111401975155, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.020946016535162926, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.181, + "grad_norm": 0.8235167860984802, + "kl": 0.27164326049387455, + "learning_rate": 4.6416914931254984e-06, + "loss": 0.0109, + "num_tokens": 7822416.0, + "reward": 0.70794677734375, + "reward_std": 0.010866380296647549, + "rewards//mean": 0.70794677734375, + "rewards//std": 0.040611300617456436, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1812, + "grad_norm": 1.0692238807678223, + "kl": 0.28652898594737053, + "learning_rate": 4.640872575628973e-06, + "loss": 0.0115, + "num_tokens": 7831000.0, + "reward": 0.728271484375, + "reward_std": 0.01055578701198101, + "rewards//mean": 0.728271484375, + "rewards//std": 0.03541964292526245, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1814, + "grad_norm": 0.9947613477706909, + "kl": 0.2775047402828932, + "learning_rate": 4.6400527957892295e-06, + "loss": 0.0111, + "num_tokens": 7839616.0, + "reward": 0.76861572265625, + "reward_std": 0.01045004278421402, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.025683382526040077, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1816, + "grad_norm": 0.7290713787078857, + "kl": 0.27869971096515656, + "learning_rate": 4.639232153936476e-06, + "loss": 0.0111, + "num_tokens": 7848272.0, + "reward": 0.78680419921875, + "reward_std": 0.007229713257402182, + "rewards//mean": 0.78680419921875, + "rewards//std": 0.03553522005677223, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1818, + "grad_norm": 0.7138441801071167, + "kl": 0.27684359066188335, + "learning_rate": 4.638410650401267e-06, + "loss": 0.0111, + "num_tokens": 7856824.0, + "reward": 0.75946044921875, + "reward_std": 0.009466202929615974, + "rewards//mean": 0.75946044921875, + "rewards//std": 0.018594926223158836, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.182, + "grad_norm": 0.7739296555519104, + "kl": 0.2615481149405241, + "learning_rate": 4.637588285514504e-06, + "loss": 0.0105, + "num_tokens": 7865384.0, + "reward": 0.7427978515625, + "reward_std": 0.008604985661804676, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.02929145097732544, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1822, + "grad_norm": 0.9076600670814514, + "kl": 0.2758956626057625, + "learning_rate": 4.636765059607434e-06, + "loss": 0.011, + "num_tokens": 7874016.0, + "reward": 0.765380859375, + "reward_std": 0.008405257016420364, + "rewards//mean": 0.765380859375, + "rewards//std": 0.02437838539481163, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1824, + "grad_norm": 0.7445156574249268, + "kl": 0.2667129561305046, + "learning_rate": 4.6359409730116546e-06, + "loss": 0.0107, + "num_tokens": 7882688.0, + "reward": 0.78369140625, + "reward_std": 0.0080318758264184, + "rewards//mean": 0.78369140625, + "rewards//std": 0.023983998224139214, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1826, + "grad_norm": 0.8461691737174988, + "kl": 0.2566282209008932, + "learning_rate": 4.635116026059107e-06, + "loss": 0.0103, + "num_tokens": 7891408.0, + "reward": 0.76092529296875, + "reward_std": 0.006388828158378601, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.024912068620324135, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1828, + "grad_norm": 1.0944255590438843, + "kl": 0.27355407923460007, + "learning_rate": 4.634290219082078e-06, + "loss": 0.0109, + "num_tokens": 7900024.0, + "reward": 0.7568359375, + "reward_std": 0.00810672901570797, + "rewards//mean": 0.7568359375, + "rewards//std": 0.029487434774637222, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.183, + "grad_norm": 1.0639708042144775, + "kl": 0.28650105744600296, + "learning_rate": 4.633463552413205e-06, + "loss": 0.0115, + "num_tokens": 7908696.0, + "reward": 0.76947021484375, + "reward_std": 0.006652969866991043, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.024434128776192665, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1832, + "grad_norm": 0.8133704662322998, + "kl": 0.2669271398335695, + "learning_rate": 4.632636026385468e-06, + "loss": 0.0107, + "num_tokens": 7917392.0, + "reward": 0.7740478515625, + "reward_std": 0.011425754986703396, + "rewards//mean": 0.7740478515625, + "rewards//std": 0.01731915958225727, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1834, + "grad_norm": 1.5723562240600586, + "kl": 0.27988065406680107, + "learning_rate": 4.631807641332195e-06, + "loss": 0.0112, + "num_tokens": 7926048.0, + "reward": 0.734375, + "reward_std": 0.005926807411015034, + "rewards//mean": 0.734375, + "rewards//std": 0.04288693889975548, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1836, + "grad_norm": 1.0416011810302734, + "kl": 0.3021409697830677, + "learning_rate": 4.630978397587058e-06, + "loss": 0.0121, + "num_tokens": 7935016.0, + "reward": 0.77471923828125, + "reward_std": 0.00614708149805665, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.040547508746385574, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1838, + "grad_norm": 0.7368637919425964, + "kl": 0.2729188334196806, + "learning_rate": 4.630148295484078e-06, + "loss": 0.0109, + "num_tokens": 7943656.0, + "reward": 0.74053955078125, + "reward_std": 0.007835205644369125, + "rewards//mean": 0.74053955078125, + "rewards//std": 0.025862522423267365, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.184, + "grad_norm": 1.2126497030258179, + "kl": 0.279952809214592, + "learning_rate": 4.62931733535762e-06, + "loss": 0.0112, + "num_tokens": 7952288.0, + "reward": 0.73394775390625, + "reward_std": 0.007915819063782692, + "rewards//mean": 0.73394775390625, + "rewards//std": 0.030266623944044113, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1842, + "grad_norm": 1.5177454948425293, + "kl": 0.28335076197981834, + "learning_rate": 4.628485517542393e-06, + "loss": 0.0113, + "num_tokens": 7960904.0, + "reward": 0.74371337890625, + "reward_std": 0.008045843802392483, + "rewards//mean": 0.74371337890625, + "rewards//std": 0.03372957557439804, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1844, + "grad_norm": 0.8178320527076721, + "kl": 0.2918772976845503, + "learning_rate": 4.627652842373454e-06, + "loss": 0.0117, + "num_tokens": 7969416.0, + "reward": 0.77789306640625, + "reward_std": 0.009983016178011894, + "rewards//mean": 0.77789306640625, + "rewards//std": 0.024058280512690544, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1846, + "grad_norm": 1.0449796915054321, + "kl": 0.3009101003408432, + "learning_rate": 4.626819310186204e-06, + "loss": 0.012, + "num_tokens": 7978048.0, + "reward": 0.76824951171875, + "reward_std": 0.009645262733101845, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.027419421821832657, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1848, + "grad_norm": 0.9068644642829895, + "kl": 0.2759493812918663, + "learning_rate": 4.625984921316392e-06, + "loss": 0.011, + "num_tokens": 7986648.0, + "reward": 0.76788330078125, + "reward_std": 0.011120768263936043, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.02279633842408657, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.185, + "grad_norm": 1.1586679220199585, + "kl": 0.2916589751839638, + "learning_rate": 4.625149676100107e-06, + "loss": 0.0117, + "num_tokens": 7995320.0, + "reward": 0.76434326171875, + "reward_std": 0.0063139949925243855, + "rewards//mean": 0.76434326171875, + "rewards//std": 0.022053800523281097, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1852, + "grad_norm": 0.9030797481536865, + "kl": 0.28806450217962265, + "learning_rate": 4.624313574873787e-06, + "loss": 0.0115, + "num_tokens": 8003952.0, + "reward": 0.7706298828125, + "reward_std": 0.010051582008600235, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.03246990218758583, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1854, + "grad_norm": 0.8641966581344604, + "kl": 0.29965170845389366, + "learning_rate": 4.623476617974212e-06, + "loss": 0.012, + "num_tokens": 8012560.0, + "reward": 0.76129150390625, + "reward_std": 0.006479769945144653, + "rewards//mean": 0.76129150390625, + "rewards//std": 0.02804011106491089, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1856, + "grad_norm": 0.9614176154136658, + "kl": 0.30398988723754883, + "learning_rate": 4.62263880573851e-06, + "loss": 0.0122, + "num_tokens": 8021216.0, + "reward": 0.77435302734375, + "reward_std": 0.00971104484051466, + "rewards//mean": 0.77435302734375, + "rewards//std": 0.02430741675198078, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1858, + "grad_norm": 0.7756906747817993, + "kl": 0.30807607248425484, + "learning_rate": 4.6218001385041504e-06, + "loss": 0.0123, + "num_tokens": 8029968.0, + "reward": 0.74884033203125, + "reward_std": 0.012052865698933601, + "rewards//mean": 0.74884033203125, + "rewards//std": 0.03312136232852936, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.186, + "grad_norm": 1.1806690692901611, + "kl": 0.2854270339012146, + "learning_rate": 4.6209606166089495e-06, + "loss": 0.0114, + "num_tokens": 8038632.0, + "reward": 0.75872802734375, + "reward_std": 0.006012373138219118, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.02449600212275982, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1862, + "grad_norm": 0.8380911350250244, + "kl": 0.2782778702676296, + "learning_rate": 4.620120240391065e-06, + "loss": 0.0111, + "num_tokens": 8047248.0, + "reward": 0.74365234375, + "reward_std": 0.009586180560290813, + "rewards//mean": 0.74365234375, + "rewards//std": 0.023172486573457718, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1864, + "grad_norm": 1.0672872066497803, + "kl": 0.29075466096401215, + "learning_rate": 4.619279010189002e-06, + "loss": 0.0116, + "num_tokens": 8055896.0, + "reward": 0.737060546875, + "reward_std": 0.008078483864665031, + "rewards//mean": 0.737060546875, + "rewards//std": 0.03260691091418266, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1866, + "grad_norm": 0.937512993812561, + "kl": 0.2672659792006016, + "learning_rate": 4.618436926341607e-06, + "loss": 0.0107, + "num_tokens": 8064552.0, + "reward": 0.77001953125, + "reward_std": 0.01028747484087944, + "rewards//mean": 0.77001953125, + "rewards//std": 0.021871235221624374, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1868, + "grad_norm": 0.7125497460365295, + "kl": 0.2997877672314644, + "learning_rate": 4.617593989188071e-06, + "loss": 0.012, + "num_tokens": 8073232.0, + "reward": 0.7581787109375, + "reward_std": 0.009922824800014496, + "rewards//mean": 0.7581787109375, + "rewards//std": 0.03353586792945862, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.187, + "grad_norm": 0.9831026196479797, + "kl": 0.30767880380153656, + "learning_rate": 4.616750199067929e-06, + "loss": 0.0123, + "num_tokens": 8081840.0, + "reward": 0.7542724609375, + "reward_std": 0.012375378049910069, + "rewards//mean": 0.7542724609375, + "rewards//std": 0.03080284409224987, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1872, + "grad_norm": 1.1639822721481323, + "kl": 0.3031549211591482, + "learning_rate": 4.615905556321061e-06, + "loss": 0.0121, + "num_tokens": 8090544.0, + "reward": 0.75262451171875, + "reward_std": 0.01033235713839531, + "rewards//mean": 0.75262451171875, + "rewards//std": 0.027613069862127304, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1874, + "grad_norm": 0.8558388352394104, + "kl": 0.30340132489800453, + "learning_rate": 4.615060061287688e-06, + "loss": 0.0121, + "num_tokens": 8099208.0, + "reward": 0.7528076171875, + "reward_std": 0.013534157536923885, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.027782833203673363, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1876, + "grad_norm": 0.845984160900116, + "kl": 0.29561108350753784, + "learning_rate": 4.614213714308374e-06, + "loss": 0.0118, + "num_tokens": 8107856.0, + "reward": 0.75341796875, + "reward_std": 0.011012168601155281, + "rewards//mean": 0.75341796875, + "rewards//std": 0.02374039590358734, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1878, + "grad_norm": 0.8777411580085754, + "kl": 0.29283004254102707, + "learning_rate": 4.6133665157240306e-06, + "loss": 0.0117, + "num_tokens": 8116456.0, + "reward": 0.71600341796875, + "reward_std": 0.008553056046366692, + "rewards//mean": 0.71600341796875, + "rewards//std": 0.0435798205435276, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.188, + "grad_norm": 0.9265841841697693, + "kl": 0.29388684406876564, + "learning_rate": 4.612518465875906e-06, + "loss": 0.0118, + "num_tokens": 8125016.0, + "reward": 0.7520751953125, + "reward_std": 0.009404158219695091, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.03676244989037514, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1882, + "grad_norm": 0.8138207793235779, + "kl": 0.3315567076206207, + "learning_rate": 4.611669565105597e-06, + "loss": 0.0133, + "num_tokens": 8133600.0, + "reward": 0.7581787109375, + "reward_std": 0.007814617827534676, + "rewards//mean": 0.7581787109375, + "rewards//std": 0.029054835438728333, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1884, + "grad_norm": 0.895754337310791, + "kl": 0.2886655665934086, + "learning_rate": 4.610819813755038e-06, + "loss": 0.0115, + "num_tokens": 8142232.0, + "reward": 0.7618408203125, + "reward_std": 0.008037852123379707, + "rewards//mean": 0.7618408203125, + "rewards//std": 0.018055196851491928, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1886, + "grad_norm": 1.0638179779052734, + "kl": 0.33327390998601913, + "learning_rate": 4.609969212166512e-06, + "loss": 0.0133, + "num_tokens": 8150976.0, + "reward": 0.7430419921875, + "reward_std": 0.008485731668770313, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.023328416049480438, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1888, + "grad_norm": 1.1232788562774658, + "kl": 0.31669921800494194, + "learning_rate": 4.609117760682639e-06, + "loss": 0.0127, + "num_tokens": 8159640.0, + "reward": 0.7677001953125, + "reward_std": 0.010280384682118893, + "rewards//mean": 0.7677001953125, + "rewards//std": 0.039260219782590866, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.189, + "grad_norm": 1.87589430809021, + "kl": 0.2973406966775656, + "learning_rate": 4.608265459646384e-06, + "loss": 0.0119, + "num_tokens": 8168288.0, + "reward": 0.73162841796875, + "reward_std": 0.008025545626878738, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.028157014399766922, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1892, + "grad_norm": 0.7903565168380737, + "kl": 0.2874004878103733, + "learning_rate": 4.607412309401054e-06, + "loss": 0.0115, + "num_tokens": 8176928.0, + "reward": 0.77008056640625, + "reward_std": 0.006889053154736757, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.02101699262857437, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1894, + "grad_norm": 1.0100493431091309, + "kl": 0.28360038809478283, + "learning_rate": 4.606558310290298e-06, + "loss": 0.0113, + "num_tokens": 8185472.0, + "reward": 0.7728271484375, + "reward_std": 0.007974790409207344, + "rewards//mean": 0.7728271484375, + "rewards//std": 0.021634282544255257, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1896, + "grad_norm": 0.9846349358558655, + "kl": 0.3387889303267002, + "learning_rate": 4.605703462658107e-06, + "loss": 0.0136, + "num_tokens": 8194096.0, + "reward": 0.77178955078125, + "reward_std": 0.01358504593372345, + "rewards//mean": 0.77178955078125, + "rewards//std": 0.02849157713353634, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1898, + "grad_norm": 1.747856616973877, + "kl": 0.33013427443802357, + "learning_rate": 4.604847766848812e-06, + "loss": 0.0132, + "num_tokens": 8202632.0, + "reward": 0.7506103515625, + "reward_std": 0.011012842878699303, + "rewards//mean": 0.7506103515625, + "rewards//std": 0.0204012431204319, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.19, + "grad_norm": 0.925475537776947, + "kl": 0.31422964110970497, + "learning_rate": 4.60399122320709e-06, + "loss": 0.0126, + "num_tokens": 8211200.0, + "reward": 0.76910400390625, + "reward_std": 0.009690100327134132, + "rewards//mean": 0.76910400390625, + "rewards//std": 0.018468312919139862, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1902, + "grad_norm": 1.132817268371582, + "kl": 0.3214902691543102, + "learning_rate": 4.603133832077953e-06, + "loss": 0.0129, + "num_tokens": 8219872.0, + "reward": 0.7730712890625, + "reward_std": 0.01109931617975235, + "rewards//mean": 0.7730712890625, + "rewards//std": 0.030335379764437675, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1904, + "grad_norm": 0.914539635181427, + "kl": 0.3235311582684517, + "learning_rate": 4.602275593806761e-06, + "loss": 0.0129, + "num_tokens": 8228504.0, + "reward": 0.76373291015625, + "reward_std": 0.008837835863232613, + "rewards//mean": 0.76373291015625, + "rewards//std": 0.01813581772148609, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1906, + "grad_norm": 0.9955701231956482, + "kl": 0.34584128484129906, + "learning_rate": 4.601416508739211e-06, + "loss": 0.0138, + "num_tokens": 8237192.0, + "reward": 0.74029541015625, + "reward_std": 0.012629736214876175, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.027743780985474586, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1908, + "grad_norm": 0.9932671785354614, + "kl": 0.31934236362576485, + "learning_rate": 4.600556577221342e-06, + "loss": 0.0128, + "num_tokens": 8245880.0, + "reward": 0.73089599609375, + "reward_std": 0.008857986889779568, + "rewards//mean": 0.73089599609375, + "rewards//std": 0.031467169523239136, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.191, + "grad_norm": 1.2234418392181396, + "kl": 0.30861231684684753, + "learning_rate": 4.599695799599537e-06, + "loss": 0.0123, + "num_tokens": 8254472.0, + "reward": 0.78582763671875, + "reward_std": 0.00961345061659813, + "rewards//mean": 0.78582763671875, + "rewards//std": 0.023318924009799957, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1912, + "grad_norm": 1.053141474723816, + "kl": 0.3571406714618206, + "learning_rate": 4.5988341762205125e-06, + "loss": 0.0143, + "num_tokens": 8263168.0, + "reward": 0.76080322265625, + "reward_std": 0.008980470709502697, + "rewards//mean": 0.76080322265625, + "rewards//std": 0.023195909336209297, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1914, + "grad_norm": 1.2716294527053833, + "kl": 0.3623424470424652, + "learning_rate": 4.5979717074313336e-06, + "loss": 0.0145, + "num_tokens": 8271784.0, + "reward": 0.7208251953125, + "reward_std": 0.011484457179903984, + "rewards//mean": 0.7208251953125, + "rewards//std": 0.04424877464771271, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1916, + "grad_norm": 1.2183886766433716, + "kl": 0.3608044385910034, + "learning_rate": 4.5971083935794026e-06, + "loss": 0.0144, + "num_tokens": 8280496.0, + "reward": 0.74468994140625, + "reward_std": 0.00796246062964201, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.03260313719511032, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1918, + "grad_norm": 1.0903003215789795, + "kl": 0.3438631668686867, + "learning_rate": 4.5962442350124605e-06, + "loss": 0.0138, + "num_tokens": 8289160.0, + "reward": 0.75726318359375, + "reward_std": 0.008945617824792862, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.02822735533118248, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.192, + "grad_norm": 1.063489556312561, + "kl": 0.3416360281407833, + "learning_rate": 4.595379232078592e-06, + "loss": 0.0137, + "num_tokens": 8297768.0, + "reward": 0.72540283203125, + "reward_std": 0.010138707235455513, + "rewards//mean": 0.72540283203125, + "rewards//std": 0.03077893704175949, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1922, + "grad_norm": 0.9618769884109497, + "kl": 0.3541461080312729, + "learning_rate": 4.5945133851262185e-06, + "loss": 0.0142, + "num_tokens": 8306440.0, + "reward": 0.7489013671875, + "reward_std": 0.008929513394832611, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.029630571603775024, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1924, + "grad_norm": 1.2882936000823975, + "kl": 0.36649367958307266, + "learning_rate": 4.593646694504105e-06, + "loss": 0.0147, + "num_tokens": 8315072.0, + "reward": 0.75372314453125, + "reward_std": 0.010234087705612183, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.025395916774868965, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1926, + "grad_norm": 1.241235613822937, + "kl": 0.3214772678911686, + "learning_rate": 4.5927791605613525e-06, + "loss": 0.0129, + "num_tokens": 8323776.0, + "reward": 0.76800537109375, + "reward_std": 0.008909153752028942, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.02892867475748062, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1928, + "grad_norm": 1.4344669580459595, + "kl": 0.36224448308348656, + "learning_rate": 4.591910783647405e-06, + "loss": 0.0145, + "num_tokens": 8332408.0, + "reward": 0.75018310546875, + "reward_std": 0.011086471378803253, + "rewards//mean": 0.75018310546875, + "rewards//std": 0.029143985360860825, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.193, + "grad_norm": 1.057570457458496, + "kl": 0.3504234068095684, + "learning_rate": 4.591041564112043e-06, + "loss": 0.014, + "num_tokens": 8341096.0, + "reward": 0.75848388671875, + "reward_std": 0.011695520021021366, + "rewards//mean": 0.75848388671875, + "rewards//std": 0.023246105760335922, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1932, + "grad_norm": 1.2750986814498901, + "kl": 0.3357960321009159, + "learning_rate": 4.59017150230539e-06, + "loss": 0.0134, + "num_tokens": 8349760.0, + "reward": 0.7686767578125, + "reward_std": 0.011024602688848972, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.02808629721403122, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1934, + "grad_norm": 1.308644413948059, + "kl": 0.35375194624066353, + "learning_rate": 4.589300598577906e-06, + "loss": 0.0142, + "num_tokens": 8358440.0, + "reward": 0.75726318359375, + "reward_std": 0.0074037788435816765, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.027497153729200363, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1936, + "grad_norm": 1.059302568435669, + "kl": 0.3852734975516796, + "learning_rate": 4.58842885328039e-06, + "loss": 0.0154, + "num_tokens": 8367064.0, + "reward": 0.745361328125, + "reward_std": 0.006511340849101543, + "rewards//mean": 0.745361328125, + "rewards//std": 0.03340674936771393, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1938, + "grad_norm": 1.2672991752624512, + "kl": 0.3788112476468086, + "learning_rate": 4.587556266763982e-06, + "loss": 0.0152, + "num_tokens": 8375712.0, + "reward": 0.72161865234375, + "reward_std": 0.0076048411428928375, + "rewards//mean": 0.72161865234375, + "rewards//std": 0.02709229476749897, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.194, + "grad_norm": 1.2441980838775635, + "kl": 0.34678515046834946, + "learning_rate": 4.586682839380159e-06, + "loss": 0.0139, + "num_tokens": 8384464.0, + "reward": 0.75909423828125, + "reward_std": 0.011957092210650444, + "rewards//mean": 0.75909423828125, + "rewards//std": 0.029256997630000114, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1942, + "grad_norm": 1.481521725654602, + "kl": 0.3327783904969692, + "learning_rate": 4.585808571480739e-06, + "loss": 0.0133, + "num_tokens": 8393096.0, + "reward": 0.76458740234375, + "reward_std": 0.009498574770987034, + "rewards//mean": 0.76458740234375, + "rewards//std": 0.02750045619904995, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1944, + "grad_norm": 1.0270497798919678, + "kl": 0.35905660316348076, + "learning_rate": 4.584933463417874e-06, + "loss": 0.0144, + "num_tokens": 8401712.0, + "reward": 0.7547607421875, + "reward_std": 0.009442451409995556, + "rewards//mean": 0.7547607421875, + "rewards//std": 0.02266489528119564, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1946, + "grad_norm": 1.9370031356811523, + "kl": 0.4017368145287037, + "learning_rate": 4.584057515544061e-06, + "loss": 0.0161, + "num_tokens": 8410496.0, + "reward": 0.75067138671875, + "reward_std": 0.008788703009486198, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.026213670149445534, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1948, + "grad_norm": 2.0624356269836426, + "kl": 0.36438558250665665, + "learning_rate": 4.583180728212128e-06, + "loss": 0.0146, + "num_tokens": 8419064.0, + "reward": 0.75830078125, + "reward_std": 0.00762117188423872, + "rewards//mean": 0.75830078125, + "rewards//std": 0.02847197651863098, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.195, + "grad_norm": 1.3694345951080322, + "kl": 0.3885602429509163, + "learning_rate": 4.582303101775249e-06, + "loss": 0.0155, + "num_tokens": 8427840.0, + "reward": 0.76422119140625, + "reward_std": 0.009295577183365822, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.02592390775680542, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1952, + "grad_norm": 1.3455674648284912, + "kl": 0.3587591424584389, + "learning_rate": 4.5814246365869285e-06, + "loss": 0.0144, + "num_tokens": 8436616.0, + "reward": 0.780029296875, + "reward_std": 0.00841439887881279, + "rewards//mean": 0.780029296875, + "rewards//std": 0.022754548117518425, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1954, + "grad_norm": 1.3943334817886353, + "kl": 0.3845372460782528, + "learning_rate": 4.580545333001014e-06, + "loss": 0.0154, + "num_tokens": 8445216.0, + "reward": 0.7724609375, + "reward_std": 0.010662311688065529, + "rewards//mean": 0.7724609375, + "rewards//std": 0.03599030151963234, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1956, + "grad_norm": 1.7175418138504028, + "kl": 0.3837543651461601, + "learning_rate": 4.579665191371687e-06, + "loss": 0.0154, + "num_tokens": 8453896.0, + "reward": 0.777099609375, + "reward_std": 0.0080185541883111, + "rewards//mean": 0.777099609375, + "rewards//std": 0.027151526883244514, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1958, + "grad_norm": 1.7660549879074097, + "kl": 0.4039181172847748, + "learning_rate": 4.578784212053471e-06, + "loss": 0.0162, + "num_tokens": 8462552.0, + "reward": 0.7745361328125, + "reward_std": 0.0088932104408741, + "rewards//mean": 0.7745361328125, + "rewards//std": 0.018208811059594154, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.196, + "grad_norm": 1.3596855401992798, + "kl": 0.41640326753258705, + "learning_rate": 4.577902395401222e-06, + "loss": 0.0167, + "num_tokens": 8471232.0, + "reward": 0.749755859375, + "reward_std": 0.012476136907935143, + "rewards//mean": 0.749755859375, + "rewards//std": 0.0270084235817194, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1962, + "grad_norm": 1.7318655252456665, + "kl": 0.3891097605228424, + "learning_rate": 4.577019741770137e-06, + "loss": 0.0156, + "num_tokens": 8479856.0, + "reward": 0.763427734375, + "reward_std": 0.009000815451145172, + "rewards//mean": 0.763427734375, + "rewards//std": 0.025083519518375397, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1964, + "grad_norm": 1.3574923276901245, + "kl": 0.43050722032785416, + "learning_rate": 4.576136251515748e-06, + "loss": 0.0172, + "num_tokens": 8488528.0, + "reward": 0.745849609375, + "reward_std": 0.013025259599089622, + "rewards//mean": 0.745849609375, + "rewards//std": 0.030216630548238754, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1966, + "grad_norm": 1.5071163177490234, + "kl": 0.39033103734254837, + "learning_rate": 4.575251924993926e-06, + "loss": 0.0156, + "num_tokens": 8497112.0, + "reward": 0.75579833984375, + "reward_std": 0.010598527267575264, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.028314098715782166, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1968, + "grad_norm": 1.5211485624313354, + "kl": 0.4381902143359184, + "learning_rate": 4.574366762560876e-06, + "loss": 0.0175, + "num_tokens": 8505752.0, + "reward": 0.76092529296875, + "reward_std": 0.008739031851291656, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.028815951198339462, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.197, + "grad_norm": 1.9539810419082642, + "kl": 0.4092373363673687, + "learning_rate": 4.573480764573143e-06, + "loss": 0.0164, + "num_tokens": 8514376.0, + "reward": 0.74774169921875, + "reward_std": 0.008132066577672958, + "rewards//mean": 0.74774169921875, + "rewards//std": 0.029592422768473625, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1972, + "grad_norm": 1.4879189729690552, + "kl": 0.3980576805770397, + "learning_rate": 4.572593931387604e-06, + "loss": 0.0159, + "num_tokens": 8522960.0, + "reward": 0.724853515625, + "reward_std": 0.0099346823990345, + "rewards//mean": 0.724853515625, + "rewards//std": 0.03472910076379776, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1974, + "grad_norm": 1.3883405923843384, + "kl": 0.4651208482682705, + "learning_rate": 4.571706263361479e-06, + "loss": 0.0186, + "num_tokens": 8531496.0, + "reward": 0.72979736328125, + "reward_std": 0.00940313097089529, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.033729128539562225, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1976, + "grad_norm": 1.4428088665008545, + "kl": 0.43703150749206543, + "learning_rate": 4.570817760852319e-06, + "loss": 0.0175, + "num_tokens": 8540232.0, + "reward": 0.7557373046875, + "reward_std": 0.006712545640766621, + "rewards//mean": 0.7557373046875, + "rewards//std": 0.029765138402581215, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1978, + "grad_norm": 1.7286840677261353, + "kl": 0.4548807218670845, + "learning_rate": 4.569928424218012e-06, + "loss": 0.0182, + "num_tokens": 8548920.0, + "reward": 0.75628662109375, + "reward_std": 0.008855903521180153, + "rewards//mean": 0.75628662109375, + "rewards//std": 0.029729709029197693, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.198, + "grad_norm": 1.4762978553771973, + "kl": 0.4822859615087509, + "learning_rate": 4.569038253816783e-06, + "loss": 0.0193, + "num_tokens": 8557688.0, + "reward": 0.784912109375, + "reward_std": 0.00827457383275032, + "rewards//mean": 0.784912109375, + "rewards//std": 0.02746196649968624, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1982, + "grad_norm": 1.853870153427124, + "kl": 0.4191112294793129, + "learning_rate": 4.5681472500071935e-06, + "loss": 0.0168, + "num_tokens": 8566352.0, + "reward": 0.75384521484375, + "reward_std": 0.0067790113389492035, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.024154985323548317, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1984, + "grad_norm": 1.573944330215454, + "kl": 0.42034513130784035, + "learning_rate": 4.567255413148139e-06, + "loss": 0.0168, + "num_tokens": 8575064.0, + "reward": 0.7481689453125, + "reward_std": 0.009067408740520477, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.02402908354997635, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1986, + "grad_norm": 1.8001471757888794, + "kl": 0.4698791652917862, + "learning_rate": 4.566362743598851e-06, + "loss": 0.0188, + "num_tokens": 8583664.0, + "reward": 0.7425537109375, + "reward_std": 0.012988231144845486, + "rewards//mean": 0.7425537109375, + "rewards//std": 0.02789810486137867, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1988, + "grad_norm": 1.660449743270874, + "kl": 0.46960631385445595, + "learning_rate": 4.565469241718896e-06, + "loss": 0.0188, + "num_tokens": 8592344.0, + "reward": 0.752685546875, + "reward_std": 0.005921890959143639, + "rewards//mean": 0.752685546875, + "rewards//std": 0.020586274564266205, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.199, + "grad_norm": 1.4339450597763062, + "kl": 0.4214029908180237, + "learning_rate": 4.564574907868179e-06, + "loss": 0.0169, + "num_tokens": 8601024.0, + "reward": 0.7352294921875, + "reward_std": 0.007715367246419191, + "rewards//mean": 0.7352294921875, + "rewards//std": 0.03900335729122162, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1992, + "grad_norm": 2.15255069732666, + "kl": 0.4501718766987324, + "learning_rate": 4.563679742406935e-06, + "loss": 0.018, + "num_tokens": 8609688.0, + "reward": 0.76776123046875, + "reward_std": 0.006294669583439827, + "rewards//mean": 0.76776123046875, + "rewards//std": 0.03227883577346802, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1994, + "grad_norm": 1.7035322189331055, + "kl": 0.45111533999443054, + "learning_rate": 4.562783745695738e-06, + "loss": 0.018, + "num_tokens": 8618400.0, + "reward": 0.76800537109375, + "reward_std": 0.0073316022753715515, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.028362175449728966, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1996, + "grad_norm": 1.9999769926071167, + "kl": 0.4639809913933277, + "learning_rate": 4.561886918095495e-06, + "loss": 0.0186, + "num_tokens": 8627216.0, + "reward": 0.768798828125, + "reward_std": 0.011256873607635498, + "rewards//mean": 0.768798828125, + "rewards//std": 0.024358505383133888, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1998, + "grad_norm": 1.600160837173462, + "kl": 0.44482381641864777, + "learning_rate": 4.560989259967447e-06, + "loss": 0.0178, + "num_tokens": 8635816.0, + "reward": 0.71075439453125, + "reward_std": 0.00779071357101202, + "rewards//mean": 0.71075439453125, + "rewards//std": 0.035272691398859024, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2, + "grad_norm": 2.164851665496826, + "kl": 0.4851101525127888, + "learning_rate": 4.560090771673174e-06, + "loss": 0.0194, + "num_tokens": 8644496.0, + "reward": 0.7564697265625, + "reward_std": 0.014028170146048069, + "rewards//mean": 0.7564697265625, + "rewards//std": 0.03170565515756607, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}