{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9554140127388535, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8057.0, "completions/mean_length": 6127.464599609375, "completions/min_length": 1222.5, "epoch": 0.0031847133757961785, "grad_norm": 0.4097345173358917, "kl": 0.0001392364501953125, "learning_rate": 2.5e-07, "loss": -0.22166921198368073, "memory(GiB)": 142.96, "reward": 0.3214285969734192, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.3214285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4609040319919586, "step": 1, "train_speed(iter/s)": 0.002718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8075.0, "completions/mean_length": 6311.14306640625, "completions/min_length": 1474.0, "epoch": 0.006369426751592357, "grad_norm": 0.19287629425525665, "kl": 0.00012159347534179688, "learning_rate": 5e-07, "loss": -0.09268201887607574, "memory(GiB)": 157.67, "reward": 0.5357142984867096, "reward_std": 0.25552501529455185, "rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4739968925714493, "step": 2, "train_speed(iter/s)": 0.002969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8049.5, "completions/mean_length": 6904.018310546875, "completions/min_length": 1641.0, "epoch": 0.009554140127388535, "grad_norm": 0.19940905272960663, "kl": 4.482269287109375e-05, "learning_rate": 7.5e-07, "loss": -0.1511116474866867, "memory(GiB)": 157.67, "reward": 0.4464285969734192, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 3, "train_speed(iter/s)": 0.003082 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6785714285714286, "completions/max_length": 8045.0, "completions/mean_length": 6924.089599609375, "completions/min_length": 2193.5, "epoch": 0.012738853503184714, "grad_norm": 0.6029608845710754, "kl": 0.00013637542724609375, "learning_rate": 1e-06, "loss": -0.41891923546791077, "memory(GiB)": 157.67, "reward": 0.2500000149011612, "reward_std": 0.2253357544541359, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4389495849609375, "step": 4, "train_speed(iter/s)": 0.003138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8054.0, "completions/mean_length": 7148.3037109375, "completions/min_length": 2388.5, "epoch": 0.01592356687898089, "grad_norm": 0.2799333333969116, "kl": 0.00018405914306640625, "learning_rate": 9.999743248701019e-07, "loss": -0.09833915531635284, "memory(GiB)": 157.67, "reward": 0.3214285969734192, "reward_std": 0.32695358991622925, "rewards/AnswerTagAccuracyORM/mean": 0.3214285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4755948781967163, "step": 5, "train_speed(iter/s)": 0.003168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7857142857142857, "completions/max_length": 8034.5, "completions/mean_length": 7279.303955078125, "completions/min_length": 3276.0, "epoch": 0.01910828025477707, "grad_norm": 0.2061210572719574, "kl": 0.00016450881958007812, "learning_rate": 9.998973021172564e-07, "loss": 0.03570747375488281, "memory(GiB)": 157.75, "reward": 0.3035714477300644, "reward_std": 0.23086077719926834, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.46781930327415466, "step": 6, "train_speed(iter/s)": 0.003171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7857142857142857, "completions/max_length": 8047.5, "completions/mean_length": 7143.1611328125, "completions/min_length": 2512.5, "epoch": 0.022292993630573247, "grad_norm": 0.5545817017555237, "kl": 9.918212890625e-05, "learning_rate": 9.997689396517406e-07, "loss": -0.36179664731025696, "memory(GiB)": 157.86, "reward": 0.2500000149011612, "reward_std": 0.33800362795591354, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4389495849609375, "step": 7, "train_speed(iter/s)": 0.003182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857142, "completions/max_length": 8051.0, "completions/mean_length": 6924.375244140625, "completions/min_length": 1848.0, "epoch": 0.025477707006369428, "grad_norm": 0.3819968104362488, "kl": 0.0002493858337402344, "learning_rate": 9.99589250656446e-07, "loss": -0.23172156512737274, "memory(GiB)": 157.86, "reward": 0.2678571566939354, "reward_std": 0.21981073915958405, "rewards/AnswerTagAccuracyORM/mean": 0.2678571566939354, "rewards/AnswerTagAccuracyORM/std": 0.426847904920578, "step": 8, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8042.5, "completions/mean_length": 6457.82177734375, "completions/min_length": 1266.0, "epoch": 0.028662420382165606, "grad_norm": 0.4406679570674896, "kl": 1.0362602143682022e-41, "learning_rate": 9.993582535855263e-07, "loss": -0.26652470231056213, "memory(GiB)": 157.86, "reward": 0.3928571492433548, "reward_std": 0.2253357619047165, "rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548, "rewards/AnswerTagAccuracyORM/std": 0.4846093952655792, "step": 9, "train_speed(iter/s)": 0.003209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8019.0, "completions/mean_length": 6910.01806640625, "completions/min_length": 1922.5, "epoch": 0.03184713375796178, "grad_norm": 0.22980190813541412, "kl": 0.0003070831298828125, "learning_rate": 9.990759721625005e-07, "loss": 0.014422202482819557, "memory(GiB)": 157.89, "reward": 0.392857164144516, "reward_std": 0.33800362050533295, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4959513247013092, "step": 10, "train_speed(iter/s)": 0.003216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.8035714285714286, "completions/max_length": 8046.0, "completions/mean_length": 7162.714599609375, "completions/min_length": 3054.0, "epoch": 0.03503184713375796, "grad_norm": 0.13719218969345093, "kl": 5.929594451790463e-42, "learning_rate": 9.98742435377817e-07, "loss": -0.0598757378757, "memory(GiB)": 158.35, "reward": 0.2500000186264515, "reward_std": 0.19514648616313934, "rewards/AnswerTagAccuracyORM/mean": 0.2500000186264515, "rewards/AnswerTagAccuracyORM/std": 0.3831089437007904, "step": 11, "train_speed(iter/s)": 0.003205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8062.0, "completions/mean_length": 6935.589599609375, "completions/min_length": 1498.0, "epoch": 0.03821656050955414, "grad_norm": 0.3652593195438385, "kl": 5.194803875951948e-08, "learning_rate": 9.983576774858775e-07, "loss": -0.02829546295106411, "memory(GiB)": 158.35, "reward": 0.2857142984867096, "reward_std": 0.3681928962469101, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.458276703953743, "step": 12, "train_speed(iter/s)": 0.0032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857143, "completions/max_length": 8041.0, "completions/mean_length": 7132.393310546875, "completions/min_length": 2462.0, "epoch": 0.041401273885350316, "grad_norm": 0.2656404376029968, "kl": 0.00013446807861328125, "learning_rate": 9.979217380015173e-07, "loss": -0.09244758635759354, "memory(GiB)": 158.35, "reward": 0.2857142984867096, "reward_std": 0.2253357619047165, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4600437134504318, "step": 13, "train_speed(iter/s)": 0.003197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.8214285714285714, "completions/max_length": 8044.5, "completions/mean_length": 7196.393310546875, "completions/min_length": 1879.0, "epoch": 0.044585987261146494, "grad_norm": 0.5126783847808838, "kl": 0.00011968612670898438, "learning_rate": 9.974346616959475e-07, "loss": -0.23802340030670166, "memory(GiB)": 158.35, "reward": 0.1428571492433548, "reward_std": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/mean": 0.1428571492433548, "rewards/AnswerTagAccuracyORM/std": 0.3524957150220871, "step": 14, "train_speed(iter/s)": 0.003188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857143, "completions/max_length": 8062.5, "completions/mean_length": 7267.26806640625, "completions/min_length": 2604.5, "epoch": 0.04777070063694268, "grad_norm": 0.6457058787345886, "kl": 0.000148773193359375, "learning_rate": 9.968964985921581e-07, "loss": -0.41891902685165405, "memory(GiB)": 158.35, "reward": 0.3750000298023224, "reward_std": 0.3324785977602005, "rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 15, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8051.5, "completions/mean_length": 7219.57177734375, "completions/min_length": 3177.0, "epoch": 0.050955414012738856, "grad_norm": 0.29857584834098816, "kl": 0.0001468658447265625, "learning_rate": 9.963073039597796e-07, "loss": -0.1913096308708191, "memory(GiB)": 158.35, "reward": 0.3035714477300644, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.46781930327415466, "step": 16, "train_speed(iter/s)": 0.003187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857142, "completions/max_length": 8070.0, "completions/mean_length": 7195.3037109375, "completions/min_length": 2742.0, "epoch": 0.054140127388535034, "grad_norm": 0.5090013742446899, "kl": 1.1901227857510671e-41, "learning_rate": 9.956671383094068e-07, "loss": -0.181706503033638, "memory(GiB)": 158.35, "reward": 0.2678571566939354, "reward_std": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/mean": 0.2678571566939354, "rewards/AnswerTagAccuracyORM/std": 0.426847904920578, "step": 17, "train_speed(iter/s)": 0.003177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6785714285714286, "completions/max_length": 8023.5, "completions/mean_length": 6882.964599609375, "completions/min_length": 1486.5, "epoch": 0.05732484076433121, "grad_norm": 0.24539124965667725, "kl": 0.0003590583801269531, "learning_rate": 9.949760673863846e-07, "loss": -0.04751904308795929, "memory(GiB)": 158.35, "reward": 0.3035714477300644, "reward_std": 0.37371790409088135, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4469868242740631, "step": 18, "train_speed(iter/s)": 0.003182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8032.0, "completions/mean_length": 6654.82177734375, "completions/min_length": 1637.0, "epoch": 0.06050955414012739, "grad_norm": 0.2233801782131195, "kl": 0.0003261566162109375, "learning_rate": 9.942341621640557e-07, "loss": -0.11176574230194092, "memory(GiB)": 158.35, "reward": 0.446428582072258, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.5032612681388855, "step": 19, "train_speed(iter/s)": 0.003186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8052.5, "completions/mean_length": 6537.839599609375, "completions/min_length": 1345.0, "epoch": 0.06369426751592357, "grad_norm": 0.2528194189071655, "kl": 0.0002574920654296875, "learning_rate": 9.934414988364722e-07, "loss": -0.1300119161605835, "memory(GiB)": 158.38, "reward": 0.392857164144516, "reward_std": 0.3078143745660782, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4744165241718292, "step": 20, "train_speed(iter/s)": 0.003187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857143, "completions/max_length": 8054.5, "completions/mean_length": 7098.500244140625, "completions/min_length": 2948.5, "epoch": 0.06687898089171974, "grad_norm": 0.35901573300361633, "kl": 0.0002613067626953125, "learning_rate": 9.925981588105694e-07, "loss": 0.0739278644323349, "memory(GiB)": 158.38, "reward": 0.3571428656578064, "reward_std": 0.3078143820166588, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4635152816772461, "step": 21, "train_speed(iter/s)": 0.003186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8043.0, "completions/mean_length": 6802.339599609375, "completions/min_length": 2142.5, "epoch": 0.07006369426751592, "grad_norm": 0.31139233708381653, "kl": 0.0002899169921875, "learning_rate": 9.917042286978063e-07, "loss": -0.23081684112548828, "memory(GiB)": 158.38, "reward": 0.3392857313156128, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4786955863237381, "step": 22, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8046.5, "completions/mean_length": 6078.089599609375, "completions/min_length": 1577.5, "epoch": 0.0732484076433121, "grad_norm": 0.3595212697982788, "kl": 0.06200312077999115, "learning_rate": 9.9075980030527e-07, "loss": -0.26652395725250244, "memory(GiB)": 158.38, "reward": 0.6250000298023224, "reward_std": 0.30228935182094574, "rewards/AnswerTagAccuracyORM/mean": 0.6250000298023224, "rewards/AnswerTagAccuracyORM/std": 0.4750668406486511, "step": 23, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8046.0, "completions/mean_length": 6813.75048828125, "completions/min_length": 1453.0, "epoch": 0.07643312101910828, "grad_norm": 0.3403284549713135, "kl": 0.0003566741943359375, "learning_rate": 9.897649706262473e-07, "loss": -0.11760014295578003, "memory(GiB)": 158.38, "reward": 0.3035714477300644, "reward_std": 0.31333938241004944, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4469868242740631, "step": 24, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8073.5, "completions/mean_length": 6462.9111328125, "completions/min_length": 474.5, "epoch": 0.07961783439490445, "grad_norm": 0.5593795776367188, "kl": 25.92991018295288, "learning_rate": 9.887198418302628e-07, "loss": -0.2458166778087616, "memory(GiB)": 158.38, "reward": 0.4285714477300644, "reward_std": 0.3681928962469101, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.502610981464386, "step": 25, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7678571428571428, "completions/max_length": 8025.0, "completions/mean_length": 7177.714599609375, "completions/min_length": 1826.5, "epoch": 0.08280254777070063, "grad_norm": 0.3172476291656494, "kl": 0.0003204345703125, "learning_rate": 9.87624521252587e-07, "loss": 0.09654846042394638, "memory(GiB)": 158.38, "reward": 0.267857164144516, "reward_std": 0.3214285969734192, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.40946151316165924, "step": 26, "train_speed(iter/s)": 0.003186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 8030.5, "completions/mean_length": 7575.19677734375, "completions/min_length": 5771.5, "epoch": 0.08598726114649681, "grad_norm": 0.24004344642162323, "kl": 0.0003147125244140625, "learning_rate": 9.864791213832125e-07, "loss": -0.0921306237578392, "memory(GiB)": 158.38, "reward": 0.1964285857975483, "reward_std": 0.23086078464984894, "rewards/AnswerTagAccuracyORM/mean": 0.1964285857975483, "rewards/AnswerTagAccuracyORM/std": 0.36893007159233093, "step": 27, "train_speed(iter/s)": 0.003183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7321428571428571, "completions/max_length": 8037.0, "completions/mean_length": 7293.5361328125, "completions/min_length": 3104.0, "epoch": 0.08917197452229299, "grad_norm": 0.46742865443229675, "kl": 0.0008754730224609375, "learning_rate": 9.852837598553008e-07, "loss": -0.4069012999534607, "memory(GiB)": 158.38, "reward": 0.2142857313156128, "reward_std": 0.27762509882450104, "rewards/AnswerTagAccuracyORM/mean": 0.2142857313156128, "rewards/AnswerTagAccuracyORM/std": 0.417855441570282, "step": 28, "train_speed(iter/s)": 0.003179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8046.5, "completions/mean_length": 6678.232421875, "completions/min_length": 2332.5, "epoch": 0.09235668789808917, "grad_norm": 0.4905704855918884, "kl": 1.6553538759069064e-41, "learning_rate": 9.840385594331022e-07, "loss": -0.22333285212516785, "memory(GiB)": 169.42, "reward": 0.2321428656578064, "reward_std": 0.37371791899204254, "rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064, "rewards/AnswerTagAccuracyORM/std": 0.429407000541687, "step": 29, "train_speed(iter/s)": 0.003179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8048.5, "completions/mean_length": 6784.232421875, "completions/min_length": 1839.5, "epoch": 0.09554140127388536, "grad_norm": 0.2499741017818451, "kl": 0.0011577606201171875, "learning_rate": 9.827436479993468e-07, "loss": -0.10406609624624252, "memory(GiB)": 169.42, "reward": 0.3035714477300644, "reward_std": 0.3324786201119423, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4576014578342438, "step": 30, "train_speed(iter/s)": 0.00318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 8053.5, "completions/mean_length": 7322.607666015625, "completions/min_length": 3317.5, "epoch": 0.09872611464968153, "grad_norm": 0.22784624993801117, "kl": 0.0009670257568359375, "learning_rate": 9.813991585421116e-07, "loss": -0.1821945756673813, "memory(GiB)": 169.42, "reward": 0.25, "reward_std": 0.23638580739498138, "rewards/AnswerTagAccuracyORM/mean": 0.25, "rewards/AnswerTagAccuracyORM/std": 0.44095855951309204, "step": 31, "train_speed(iter/s)": 0.003176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7321428571428572, "completions/max_length": 8045.5, "completions/mean_length": 7113.89306640625, "completions/min_length": 2601.0, "epoch": 0.10191082802547771, "grad_norm": 0.27732953429222107, "kl": 0.000583648681640625, "learning_rate": 9.800052291411629e-07, "loss": -0.1299564391374588, "memory(GiB)": 169.42, "reward": 0.2678571492433548, "reward_std": 0.23086076974868774, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050112903118134, "step": 32, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857143, "completions/max_length": 8044.0, "completions/mean_length": 6726.625244140625, "completions/min_length": 1058.0, "epoch": 0.10509554140127389, "grad_norm": 0.4480609595775604, "kl": 0.000911712646484375, "learning_rate": 9.78562002953774e-07, "loss": -0.21174360811710358, "memory(GiB)": 169.42, "reward": 0.2321428656578064, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064, "rewards/AnswerTagAccuracyORM/std": 0.425032377243042, "step": 33, "train_speed(iter/s)": 0.003163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8025.0, "completions/mean_length": 6545.64306640625, "completions/min_length": 1659.0, "epoch": 0.10828025477707007, "grad_norm": 0.2851637005805969, "kl": 0.00090789794921875, "learning_rate": 9.770696282000244e-07, "loss": -0.11861994117498398, "memory(GiB)": 169.42, "reward": 0.3928571492433548, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548, "rewards/AnswerTagAccuracyORM/std": 0.4846093952655792, "step": 34, "train_speed(iter/s)": 0.003162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8052.0, "completions/mean_length": 6577.803955078125, "completions/min_length": 1984.5, "epoch": 0.11146496815286625, "grad_norm": 0.32200559973716736, "kl": 0.00152587890625, "learning_rate": 9.755282581475767e-07, "loss": -0.16413375735282898, "memory(GiB)": 169.42, "reward": 0.4464285969734192, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 35, "train_speed(iter/s)": 0.003164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8069.0, "completions/mean_length": 6215.32177734375, "completions/min_length": 921.0, "epoch": 0.11464968152866242, "grad_norm": 0.3323407471179962, "kl": 0.00124359130859375, "learning_rate": 9.739380510959364e-07, "loss": -0.22452522814273834, "memory(GiB)": 169.42, "reward": 0.5000000298023224, "reward_std": 0.33800363540649414, "rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224, "rewards/AnswerTagAccuracyORM/std": 0.44095855951309204, "step": 36, "train_speed(iter/s)": 0.003167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.8035714285714286, "completions/max_length": 8056.0, "completions/mean_length": 7640.928955078125, "completions/min_length": 4844.0, "epoch": 0.1178343949044586, "grad_norm": 0.22709085047245026, "kl": 9.104936771950499e-42, "learning_rate": 9.722991703601935e-07, "loss": -0.06687425076961517, "memory(GiB)": 169.42, "reward": 0.160714291036129, "reward_std": 0.23086076974868774, "rewards/AnswerTagAccuracyORM/mean": 0.160714291036129, "rewards/AnswerTagAccuracyORM/std": 0.3731846660375595, "step": 37, "train_speed(iter/s)": 0.003163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8069.0, "completions/mean_length": 6600.446533203125, "completions/min_length": 2290.0, "epoch": 0.12101910828025478, "grad_norm": 0.4062687158584595, "kl": 0.002471923828125, "learning_rate": 9.706117842542516e-07, "loss": -0.22931034862995148, "memory(GiB)": 169.42, "reward": 0.3571428656578064, "reward_std": 0.33800362050533295, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.48795004189014435, "step": 38, "train_speed(iter/s)": 0.003164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7678571428571428, "completions/max_length": 8054.5, "completions/mean_length": 7287.62548828125, "completions/min_length": 2747.5, "epoch": 0.12420382165605096, "grad_norm": 0.306149959564209, "kl": 0.000904083251953125, "learning_rate": 9.688760660735402e-07, "loss": -0.11831033229827881, "memory(GiB)": 169.42, "reward": 0.1785714328289032, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.1785714328289032, "rewards/AnswerTagAccuracyORM/std": 0.39002102613449097, "step": 39, "train_speed(iter/s)": 0.003162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7857142857142858, "completions/max_length": 8063.5, "completions/mean_length": 7219.143310546875, "completions/min_length": 3600.0, "epoch": 0.12738853503184713, "grad_norm": 0.29684150218963623, "kl": 0.000675201416015625, "learning_rate": 9.670921940772186e-07, "loss": -0.16423112154006958, "memory(GiB)": 169.42, "reward": 0.3035714477300644, "reward_std": 0.2721000760793686, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4469868242740631, "step": 40, "train_speed(iter/s)": 0.003158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8044.5, "completions/mean_length": 5554.14306640625, "completions/min_length": 769.5, "epoch": 0.1305732484076433, "grad_norm": 0.34710463881492615, "kl": 0.00290679931640625, "learning_rate": 9.652603514698673e-07, "loss": -0.11703464388847351, "memory(GiB)": 169.48, "reward": 0.4642857313156128, "reward_std": 0.379242941737175, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.5065638720989227, "step": 41, "train_speed(iter/s)": 0.003157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.23214285714285715, "completions/max_length": 8039.5, "completions/mean_length": 4597.910888671875, "completions/min_length": 949.0, "epoch": 0.1337579617834395, "grad_norm": 0.33481189608573914, "kl": 0.00357818603515625, "learning_rate": 9.633807263826744e-07, "loss": -0.02082175388932228, "memory(GiB)": 169.48, "reward": 0.5892857313156128, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.5892857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4839591085910797, "step": 42, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 7991.5, "completions/mean_length": 6022.285888671875, "completions/min_length": 1342.0, "epoch": 0.13694267515923567, "grad_norm": 0.321031391620636, "kl": 0.003082275390625, "learning_rate": 9.614535118541125e-07, "loss": -0.06112157553434372, "memory(GiB)": 169.48, "reward": 0.3035714477300644, "reward_std": 0.14838216826319695, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.46781930327415466, "step": 43, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 8059.5, "completions/mean_length": 7382.285888671875, "completions/min_length": 3295.0, "epoch": 0.14012738853503184, "grad_norm": 0.33003029227256775, "kl": 0.00177764892578125, "learning_rate": 9.594789058101153e-07, "loss": -0.22145554423332214, "memory(GiB)": 169.48, "reward": 0.3035714328289032, "reward_std": 0.3324786126613617, "rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4644543081521988, "step": 44, "train_speed(iter/s)": 0.003155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.48214285714285715, "completions/max_length": 8033.0, "completions/mean_length": 5761.893310546875, "completions/min_length": 988.5, "epoch": 0.14331210191082802, "grad_norm": 0.3568911552429199, "kl": 0.00131988525390625, "learning_rate": 9.574571110437496e-07, "loss": -0.1308659315109253, "memory(GiB)": 169.48, "reward": 0.535714328289032, "reward_std": 0.19514648616313934, "rewards/AnswerTagAccuracyORM/mean": 0.535714328289032, "rewards/AnswerTagAccuracyORM/std": 0.5078744888305664, "step": 45, "train_speed(iter/s)": 0.003156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7857142857142858, "completions/max_length": 8067.0, "completions/mean_length": 7363.285888671875, "completions/min_length": 3301.0, "epoch": 0.1464968152866242, "grad_norm": 0.4601195454597473, "kl": 0.0016326904296875, "learning_rate": 9.55388335194388e-07, "loss": -0.4354901909828186, "memory(GiB)": 169.48, "reward": 0.3571428656578064, "reward_std": 0.40943221747875214, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4744165241718292, "step": 46, "train_speed(iter/s)": 0.003157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8056.5, "completions/mean_length": 6383.57177734375, "completions/min_length": 1777.5, "epoch": 0.14968152866242038, "grad_norm": 0.32571953535079956, "kl": 0.00157928466796875, "learning_rate": 9.532727907263859e-07, "loss": -0.08489307761192322, "memory(GiB)": 169.48, "reward": 0.2500000149011612, "reward_std": 0.32695360481739044, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4061589241027832, "step": 47, "train_speed(iter/s)": 0.003158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8029.0, "completions/mean_length": 6672.178955078125, "completions/min_length": 1226.0, "epoch": 0.15286624203821655, "grad_norm": 0.2804900109767914, "kl": 0.001705169677734375, "learning_rate": 9.511106949072587e-07, "loss": -0.07148804515600204, "memory(GiB)": 169.48, "reward": 0.4285714328289032, "reward_std": 0.2142857238650322, "rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4985625892877579, "step": 48, "train_speed(iter/s)": 0.003156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857142, "completions/max_length": 8053.5, "completions/mean_length": 6692.678955078125, "completions/min_length": 2195.5, "epoch": 0.15605095541401273, "grad_norm": 0.43050575256347656, "kl": 0.00308990478515625, "learning_rate": 9.489022697853708e-07, "loss": -0.3393845856189728, "memory(GiB)": 169.48, "reward": 0.3750000298023224, "reward_std": 0.3324786126613617, "rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224, "rewards/AnswerTagAccuracyORM/std": 0.4897737354040146, "step": 49, "train_speed(iter/s)": 0.003156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142857, "completions/max_length": 8043.0, "completions/mean_length": 6784.5537109375, "completions/min_length": 2953.5, "epoch": 0.1592356687898089, "grad_norm": 0.3050151765346527, "kl": 0.00262451171875, "learning_rate": 9.466477421671295e-07, "loss": -0.11103521287441254, "memory(GiB)": 169.48, "reward": 0.23214287497103214, "reward_std": 0.1785714365541935, "rewards/AnswerTagAccuracyORM/mean": 0.23214287497103214, "rewards/AnswerTagAccuracyORM/std": 0.34646742790937424, "step": 50, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.8035714285714286, "completions/max_length": 8053.0, "completions/mean_length": 7592.14306640625, "completions/min_length": 4495.5, "epoch": 0.1624203821656051, "grad_norm": 0.2979692220687866, "kl": 9.197422470595937e-42, "learning_rate": 9.443473435936927e-07, "loss": -0.14950904250144958, "memory(GiB)": 169.48, "reward": 0.1428571492433548, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.1428571492433548, "rewards/AnswerTagAccuracyORM/std": 0.35634833574295044, "step": 51, "train_speed(iter/s)": 0.003147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8060.5, "completions/mean_length": 6415.9111328125, "completions/min_length": 1379.5, "epoch": 0.16560509554140126, "grad_norm": 0.19361363351345062, "kl": 965.6489562988281, "learning_rate": 9.420013103171891e-07, "loss": -0.16895829141139984, "memory(GiB)": 169.48, "reward": 0.4107143133878708, "reward_std": 0.14838215708732605, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 52, "train_speed(iter/s)": 0.003141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8066.5, "completions/mean_length": 6355.839599609375, "completions/min_length": 1968.0, "epoch": 0.16878980891719744, "grad_norm": 0.3892778158187866, "kl": 0.0029144287109375, "learning_rate": 9.396098832764554e-07, "loss": -0.2460786998271942, "memory(GiB)": 169.48, "reward": 0.3392857313156128, "reward_std": 0.30228935182094574, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 53, "train_speed(iter/s)": 0.003138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8032.5, "completions/mean_length": 6231.6787109375, "completions/min_length": 1345.0, "epoch": 0.17197452229299362, "grad_norm": 0.5784780383110046, "kl": 0.00372314453125, "learning_rate": 9.37173308072291e-07, "loss": -0.3872688412666321, "memory(GiB)": 175.0, "reward": 0.3571428805589676, "reward_std": 0.25552503019571304, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48199816048145294, "step": 54, "train_speed(iter/s)": 0.003136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8043.5, "completions/mean_length": 5922.51806640625, "completions/min_length": 920.5, "epoch": 0.1751592356687898, "grad_norm": 0.2570510804653168, "kl": 0.0035492679744493216, "learning_rate": 9.346918349422355e-07, "loss": -0.05529748648405075, "memory(GiB)": 175.0, "reward": 0.3750000149011612, "reward_std": 0.2006715089082718, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4469868391752243, "step": 55, "train_speed(iter/s)": 0.003134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8032.0, "completions/mean_length": 6564.035888671875, "completions/min_length": 1182.0, "epoch": 0.17834394904458598, "grad_norm": 0.526690661907196, "kl": 0.0024261474609375, "learning_rate": 9.321657187348688e-07, "loss": -0.20846779644489288, "memory(GiB)": 175.0, "reward": 0.321428582072258, "reward_std": 0.3681928962469101, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4489477872848511, "step": 56, "train_speed(iter/s)": 0.003129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7857142857142857, "completions/max_length": 8056.0, "completions/mean_length": 7038.2861328125, "completions/min_length": 4610.0, "epoch": 0.18152866242038215, "grad_norm": 0.15928295254707336, "kl": 0.0027923583984375, "learning_rate": 9.295952188836378e-07, "loss": 0.021755440160632133, "memory(GiB)": 175.0, "reward": 0.1250000074505806, "reward_std": 0.14838216453790665, "rewards/AnswerTagAccuracyORM/mean": 0.1250000074505806, "rewards/AnswerTagAccuracyORM/std": 0.33565935492515564, "step": 57, "train_speed(iter/s)": 0.003121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8024.0, "completions/mean_length": 6775.214599609375, "completions/min_length": 1785.0, "epoch": 0.18471337579617833, "grad_norm": 0.5611134767532349, "kl": 7.1264767029788345e-06, "learning_rate": 9.269805993802128e-07, "loss": -0.15547017753124237, "memory(GiB)": 175.0, "reward": 0.3392857313156128, "reward_std": 0.29123931378126144, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4628649652004242, "step": 58, "train_speed(iter/s)": 0.003119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8037.5, "completions/mean_length": 6358.4111328125, "completions/min_length": 1612.5, "epoch": 0.18789808917197454, "grad_norm": 0.15718594193458557, "kl": 0.004180908203125, "learning_rate": 9.243221287473755e-07, "loss": 0.03484642878174782, "memory(GiB)": 175.0, "reward": 0.3571428656578064, "reward_std": 0.11266787722706795, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4744165539741516, "step": 59, "train_speed(iter/s)": 0.003118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142858, "completions/max_length": 8040.0, "completions/mean_length": 6776.6787109375, "completions/min_length": 1845.0, "epoch": 0.1910828025477707, "grad_norm": 0.36074620485305786, "kl": 0.0035858154296875, "learning_rate": 9.216200800114411e-07, "loss": -0.19808490574359894, "memory(GiB)": 175.0, "reward": 0.2142857164144516, "reward_std": 0.19514648616313934, "rewards/AnswerTagAccuracyORM/mean": 0.2142857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4154897928237915, "step": 60, "train_speed(iter/s)": 0.003113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142857, "completions/max_length": 8027.0, "completions/mean_length": 6271.26806640625, "completions/min_length": 1195.5, "epoch": 0.1942675159235669, "grad_norm": 0.39658188819885254, "kl": 1.2364356999970023e-41, "learning_rate": 9.188747306742189e-07, "loss": -0.1974707692861557, "memory(GiB)": 175.0, "reward": 0.321428582072258, "reward_std": 0.2967643216252327, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968627691269, "step": 61, "train_speed(iter/s)": 0.003113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.42857142857142855, "completions/max_length": 8047.5, "completions/mean_length": 5597.178955078125, "completions/min_length": 1193.0, "epoch": 0.19745222929936307, "grad_norm": 0.20000393688678741, "kl": 0.0042724609375, "learning_rate": 9.160863626845119e-07, "loss": -0.044626351445913315, "memory(GiB)": 175.0, "reward": 0.3571428656578064, "reward_std": 0.1539071872830391, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4635152518749237, "step": 62, "train_speed(iter/s)": 0.003113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7142857142857142, "completions/max_length": 8042.5, "completions/mean_length": 6918.053955078125, "completions/min_length": 1768.0, "epoch": 0.20063694267515925, "grad_norm": 0.6508347988128662, "kl": 0.003936767578125, "learning_rate": 9.132552624091619e-07, "loss": -0.21150559186935425, "memory(GiB)": 175.0, "reward": 0.1607142984867096, "reward_std": 0.1785714402794838, "rewards/AnswerTagAccuracyORM/mean": 0.1607142984867096, "rewards/AnswerTagAccuracyORM/std": 0.3664129227399826, "step": 63, "train_speed(iter/s)": 0.003111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6785714285714286, "completions/max_length": 8040.5, "completions/mean_length": 6990.643310546875, "completions/min_length": 2698.5, "epoch": 0.20382165605095542, "grad_norm": 0.44794604182243347, "kl": 0.00531005859375, "learning_rate": 9.103817206036382e-07, "loss": -0.43547752499580383, "memory(GiB)": 175.0, "reward": 0.4107143133878708, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.4839591383934021, "step": 64, "train_speed(iter/s)": 0.003107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7678571428571428, "completions/max_length": 8033.0, "completions/mean_length": 6935.285888671875, "completions/min_length": 1890.5, "epoch": 0.2070063694267516, "grad_norm": 1.0245670080184937, "kl": 0.0034942626953125, "learning_rate": 9.07466032382177e-07, "loss": -0.16633543372154236, "memory(GiB)": 175.0, "reward": 0.321428582072258, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968627691269, "step": 65, "train_speed(iter/s)": 0.003103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8060.5, "completions/mean_length": 6056.26806640625, "completions/min_length": 1797.0, "epoch": 0.21019108280254778, "grad_norm": 0.26687300205230713, "kl": 0.005523681640625, "learning_rate": 9.045084971874737e-07, "loss": -0.17107920348644257, "memory(GiB)": 175.0, "reward": 0.464285746216774, "reward_std": 0.26657507568597794, "rewards/AnswerTagAccuracyORM/mean": 0.464285746216774, "rewards/AnswerTagAccuracyORM/std": 0.4739968925714493, "step": 66, "train_speed(iter/s)": 0.003104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8050.0, "completions/mean_length": 7268.107421875, "completions/min_length": 1853.5, "epoch": 0.21337579617834396, "grad_norm": 0.39480164647102356, "kl": 0.00036018589162267745, "learning_rate": 9.015094187599296e-07, "loss": -0.2682667374610901, "memory(GiB)": 175.0, "reward": 0.2142857238650322, "reward_std": 0.2253357544541359, "rewards/AnswerTagAccuracyORM/mean": 0.2142857238650322, "rewards/AnswerTagAccuracyORM/std": 0.40819603204727173, "step": 67, "train_speed(iter/s)": 0.003105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8069.5, "completions/mean_length": 5839.25048828125, "completions/min_length": 776.0, "epoch": 0.21656050955414013, "grad_norm": 0.37252077460289, "kl": 0.0123291015625, "learning_rate": 8.984691051064574e-07, "loss": -0.14431779086589813, "memory(GiB)": 175.0, "reward": 0.4107142984867096, "reward_std": 0.21981073170900345, "rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096, "rewards/AnswerTagAccuracyORM/std": 0.497912272810936, "step": 68, "train_speed(iter/s)": 0.003108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8044.0, "completions/mean_length": 5993.5361328125, "completions/min_length": 1093.0, "epoch": 0.2197452229299363, "grad_norm": 0.3685239553451538, "kl": 0.01165771484375, "learning_rate": 8.953878684688492e-07, "loss": -0.1928965002298355, "memory(GiB)": 175.0, "reward": 0.321428582072258, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968627691269, "step": 69, "train_speed(iter/s)": 0.003111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8056.0, "completions/mean_length": 6114.82177734375, "completions/min_length": 1662.5, "epoch": 0.2229299363057325, "grad_norm": 0.5651078224182129, "kl": 0.013641357421875, "learning_rate": 8.922660252917087e-07, "loss": -0.3238877058029175, "memory(GiB)": 175.0, "reward": 0.3571428805589676, "reward_std": 0.33800363540649414, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48647116124629974, "step": 70, "train_speed(iter/s)": 0.003113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8029.0, "completions/mean_length": 6308.1787109375, "completions/min_length": 1943.5, "epoch": 0.22611464968152867, "grad_norm": 0.41033416986465454, "kl": 0.0162353515625, "learning_rate": 8.891038961899519e-07, "loss": -0.17454411089420319, "memory(GiB)": 175.0, "reward": 0.3392857387661934, "reward_std": 0.1896214708685875, "rewards/AnswerTagAccuracyORM/mean": 0.3392857387661934, "rewards/AnswerTagAccuracyORM/std": 0.43211139738559723, "step": 71, "train_speed(iter/s)": 0.003115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8051.5, "completions/mean_length": 5688.107177734375, "completions/min_length": 1738.5, "epoch": 0.22929936305732485, "grad_norm": 0.4853333830833435, "kl": 0.015869140625, "learning_rate": 8.859018059158809e-07, "loss": -0.2635350823402405, "memory(GiB)": 175.0, "reward": 0.392857164144516, "reward_std": 0.3792429566383362, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.49173468351364136, "step": 72, "train_speed(iter/s)": 0.003117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428571, "completions/max_length": 8056.0, "completions/mean_length": 6535.660888671875, "completions/min_length": 2464.0, "epoch": 0.23248407643312102, "grad_norm": 0.18295517563819885, "kl": 0.00787353515625, "learning_rate": 8.826600833258307e-07, "loss": -0.05387752130627632, "memory(GiB)": 175.14, "reward": 0.2857142984867096, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.460043728351593, "step": 73, "train_speed(iter/s)": 0.003115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.44642857142857145, "completions/max_length": 8065.5, "completions/mean_length": 5659.46435546875, "completions/min_length": 1074.5, "epoch": 0.2356687898089172, "grad_norm": 0.27235084772109985, "kl": 0.01446533203125, "learning_rate": 8.793790613463954e-07, "loss": -0.07979360222816467, "memory(GiB)": 175.14, "reward": 0.375, "reward_std": 0.23086076974868774, "rewards/AnswerTagAccuracyORM/mean": 0.375, "rewards/AnswerTagAccuracyORM/std": 0.4750668406486511, "step": 74, "train_speed(iter/s)": 0.003117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6785714285714286, "completions/max_length": 8040.5, "completions/mean_length": 6586.518310546875, "completions/min_length": 1586.5, "epoch": 0.23885350318471338, "grad_norm": 0.4448719322681427, "kl": 1.2463148541704923e-41, "learning_rate": 8.760590769402371e-07, "loss": -0.14089421927928925, "memory(GiB)": 175.14, "reward": 0.3571428656578064, "reward_std": 0.27762509882450104, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4635152518749237, "step": 75, "train_speed(iter/s)": 0.003119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 7960.5, "completions/mean_length": 5350.035888671875, "completions/min_length": 1008.5, "epoch": 0.24203821656050956, "grad_norm": 0.3529123067855835, "kl": 0.007568359375, "learning_rate": 8.727004710714798e-07, "loss": -0.06856054067611694, "memory(GiB)": 175.14, "reward": 0.4285714477300644, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.5026109665632248, "step": 76, "train_speed(iter/s)": 0.00312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3928571428571429, "completions/max_length": 8051.5, "completions/mean_length": 5923.339599609375, "completions/min_length": 1552.0, "epoch": 0.24522292993630573, "grad_norm": 0.4466400742530823, "kl": 0.01898193359375, "learning_rate": 8.693035886706907e-07, "loss": -0.28264355659484863, "memory(GiB)": 175.14, "reward": 0.4821428954601288, "reward_std": 0.29123931378126144, "rewards/AnswerTagAccuracyORM/mean": 0.4821428954601288, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 77, "train_speed(iter/s)": 0.003121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8047.5, "completions/mean_length": 6365.14306640625, "completions/min_length": 1254.0, "epoch": 0.2484076433121019, "grad_norm": 0.42635998129844666, "kl": 0.0216064453125, "learning_rate": 8.658687785994578e-07, "loss": -0.2270558625459671, "memory(GiB)": 175.23, "reward": 0.2678571492433548, "reward_std": 0.1896214708685875, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050114393234253, "step": 78, "train_speed(iter/s)": 0.003119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428571, "completions/max_length": 8055.5, "completions/mean_length": 6524.4111328125, "completions/min_length": 2012.5, "epoch": 0.2515923566878981, "grad_norm": 0.2484586536884308, "kl": 0.00823974609375, "learning_rate": 8.623963936145599e-07, "loss": -0.10795820504426956, "memory(GiB)": 175.23, "reward": 0.267857164144516, "reward_std": 0.07695359364151955, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.44672515988349915, "step": 79, "train_speed(iter/s)": 0.00312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8033.0, "completions/mean_length": 5579.375244140625, "completions/min_length": 1700.5, "epoch": 0.25477707006369427, "grad_norm": 0.4129941463470459, "kl": 0.02008056640625, "learning_rate": 8.588867903317394e-07, "loss": -0.17563505470752716, "memory(GiB)": 175.23, "reward": 0.4821428656578064, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064, "rewards/AnswerTagAccuracyORM/std": 0.5085247755050659, "step": 80, "train_speed(iter/s)": 0.003123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8030.5, "completions/mean_length": 5784.410888671875, "completions/min_length": 1787.0, "epoch": 0.25796178343949044, "grad_norm": 0.44289883971214294, "kl": 0.0218505859375, "learning_rate": 8.553403291890767e-07, "loss": -0.2814217805862427, "memory(GiB)": 175.23, "reward": 0.4821428805589676, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.4821428805589676, "rewards/AnswerTagAccuracyORM/std": 0.5006500482559204, "step": 81, "train_speed(iter/s)": 0.003126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.39285714285714285, "completions/max_length": 8028.5, "completions/mean_length": 5141.1787109375, "completions/min_length": 1243.5, "epoch": 0.2611464968152866, "grad_norm": 0.556716799736023, "kl": 0.0203857421875, "learning_rate": 8.51757374409974e-07, "loss": -0.24188844859600067, "memory(GiB)": 175.23, "reward": 0.4285714626312256, "reward_std": 0.2967643439769745, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.49173468351364136, "step": 82, "train_speed(iter/s)": 0.003129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8033.5, "completions/mean_length": 5490.625244140625, "completions/min_length": 1102.5, "epoch": 0.2643312101910828, "grad_norm": 0.43611571192741394, "kl": 0.01397705078125, "learning_rate": 8.481382939657488e-07, "loss": -0.06833790987730026, "memory(GiB)": 175.23, "reward": 0.517857164144516, "reward_std": 0.21981073170900345, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.5006500482559204, "step": 83, "train_speed(iter/s)": 0.003131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8042.5, "completions/mean_length": 6488.946533203125, "completions/min_length": 2348.5, "epoch": 0.267515923566879, "grad_norm": 0.5264121890068054, "kl": 0.0264892578125, "learning_rate": 8.444834595378433e-07, "loss": -0.28356218338012695, "memory(GiB)": 175.23, "reward": 0.3035714477300644, "reward_std": 0.30228933691978455, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.46781930327415466, "step": 84, "train_speed(iter/s)": 0.003133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8023.5, "completions/mean_length": 5915.893310546875, "completions/min_length": 1153.0, "epoch": 0.27070063694267515, "grad_norm": 0.852773129940033, "kl": 0.01702880859375, "learning_rate": 8.407932464796521e-07, "loss": -0.2204403430223465, "memory(GiB)": 175.23, "reward": 0.3392857313156128, "reward_std": 0.29123931378126144, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.47245559096336365, "step": 85, "train_speed(iter/s)": 0.003133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.35714285714285715, "completions/max_length": 8021.5, "completions/mean_length": 5667.4111328125, "completions/min_length": 1922.0, "epoch": 0.27388535031847133, "grad_norm": 0.41208142042160034, "kl": 0.02435302734375, "learning_rate": 8.370680337779736e-07, "loss": -0.09956123679876328, "memory(GiB)": 175.23, "reward": 0.4464285969734192, "reward_std": 0.29123932123184204, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.5059135854244232, "step": 86, "train_speed(iter/s)": 0.003136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142857, "completions/max_length": 8047.0, "completions/mean_length": 6915.0361328125, "completions/min_length": 2560.5, "epoch": 0.2770700636942675, "grad_norm": 0.5504844188690186, "kl": 0.021484375, "learning_rate": 8.333082040140882e-07, "loss": -0.33471736311912537, "memory(GiB)": 175.23, "reward": 0.3750000149011612, "reward_std": 0.2610500305891037, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.49264875054359436, "step": 87, "train_speed(iter/s)": 0.003136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8103.0, "completions/mean_length": 5899.035888671875, "completions/min_length": 1566.0, "epoch": 0.2802547770700637, "grad_norm": 0.6481395959854126, "kl": 0.023193359375, "learning_rate": 8.295141433244659e-07, "loss": -0.3118484616279602, "memory(GiB)": 175.23, "reward": 0.4107143133878708, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.4839591085910797, "step": 88, "train_speed(iter/s)": 0.003138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8064.5, "completions/mean_length": 5719.839599609375, "completions/min_length": 1776.0, "epoch": 0.28343949044585987, "grad_norm": 0.3430127203464508, "kl": 0.01434326171875, "learning_rate": 8.256862413611112e-07, "loss": 0.07326733320951462, "memory(GiB)": 175.55, "reward": 0.4285714328289032, "reward_std": 0.40943220257759094, "rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4985625892877579, "step": 89, "train_speed(iter/s)": 0.003141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8071.0, "completions/mean_length": 6036.107421875, "completions/min_length": 1410.5, "epoch": 0.28662420382165604, "grad_norm": 0.9747743010520935, "kl": 0.0157470703125, "learning_rate": 8.218248912515442e-07, "loss": -0.32189831137657166, "memory(GiB)": 175.55, "reward": 0.4285714477300644, "reward_std": 0.379242941737175, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.48199817538261414, "step": 90, "train_speed(iter/s)": 0.003143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8030.0, "completions/mean_length": 5853.910888671875, "completions/min_length": 1923.5, "epoch": 0.2898089171974522, "grad_norm": 0.6417209506034851, "kl": 0.0321044921875, "learning_rate": 8.179304895584281e-07, "loss": -0.20055779814720154, "memory(GiB)": 175.55, "reward": 0.4285714477300644, "reward_std": 0.40943220257759094, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.502610981464386, "step": 91, "train_speed(iter/s)": 0.003145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8069.0, "completions/mean_length": 6345.964599609375, "completions/min_length": 1514.0, "epoch": 0.2929936305732484, "grad_norm": 0.157789409160614, "kl": 0.01397705078125, "learning_rate": 8.140034362388397e-07, "loss": -0.0897713378071785, "memory(GiB)": 175.55, "reward": 0.4285714328289032, "reward_std": 0.2142857238650322, "rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4985625743865967, "step": 92, "train_speed(iter/s)": 0.003146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8048.0, "completions/mean_length": 6575.268310546875, "completions/min_length": 1400.5, "epoch": 0.2961783439490446, "grad_norm": 0.19858166575431824, "kl": 0.02850341796875, "learning_rate": 8.100441346031958e-07, "loss": -0.004586466588079929, "memory(GiB)": 175.55, "reward": 0.3392857313156128, "reward_std": 0.1785714365541935, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4628649652004242, "step": 93, "train_speed(iter/s)": 0.003146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.48214285714285715, "completions/max_length": 8063.5, "completions/mean_length": 5983.518310546875, "completions/min_length": 1320.5, "epoch": 0.29936305732484075, "grad_norm": 0.5492147207260132, "kl": 796.5401611328125, "learning_rate": 8.060529912738314e-07, "loss": -0.20520265400409698, "memory(GiB)": 175.55, "reward": 0.4464285969734192, "reward_std": 0.31333939731121063, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4979122579097748, "step": 94, "train_speed(iter/s)": 0.003147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7678571428571428, "completions/max_length": 8047.5, "completions/mean_length": 6954.500244140625, "completions/min_length": 2610.0, "epoch": 0.30254777070063693, "grad_norm": 0.38487347960472107, "kl": 1.0918917634018975e-41, "learning_rate": 8.020304161432403e-07, "loss": -0.061524372547864914, "memory(GiB)": 175.55, "reward": 0.1785714402794838, "reward_std": 0.1539071798324585, "rewards/AnswerTagAccuracyORM/mean": 0.1785714402794838, "rewards/AnswerTagAccuracyORM/std": 0.3871018886566162, "step": 95, "train_speed(iter/s)": 0.003145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8047.5, "completions/mean_length": 5583.64306640625, "completions/min_length": 1472.5, "epoch": 0.3057324840764331, "grad_norm": 0.3516985774040222, "kl": 0.018798828125, "learning_rate": 7.979768223319785e-07, "loss": -0.08431195467710495, "memory(GiB)": 175.55, "reward": 0.392857164144516, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4973474591970444, "step": 96, "train_speed(iter/s)": 0.003147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8028.5, "completions/mean_length": 5748.21435546875, "completions/min_length": 1341.0, "epoch": 0.3089171974522293, "grad_norm": 0.4274459779262543, "kl": 0.0357666015625, "learning_rate": 7.938926261462365e-07, "loss": -0.17042537033557892, "memory(GiB)": 175.55, "reward": 0.321428582072258, "reward_std": 0.2253357619047165, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4691530019044876, "step": 97, "train_speed(iter/s)": 0.003149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8063.5, "completions/mean_length": 6201.875244140625, "completions/min_length": 1993.5, "epoch": 0.31210191082802546, "grad_norm": 0.32356998324394226, "kl": 1.5625879175686035e-41, "learning_rate": 7.897782470350849e-07, "loss": -0.13458389043807983, "memory(GiB)": 175.55, "reward": 0.392857164144516, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4959513247013092, "step": 98, "train_speed(iter/s)": 0.003151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8029.5, "completions/mean_length": 6730.339599609375, "completions/min_length": 2250.0, "epoch": 0.31528662420382164, "grad_norm": 0.7448033690452576, "kl": 0.00970458984375, "learning_rate": 7.856341075473961e-07, "loss": -0.19996413588523865, "memory(GiB)": 175.55, "reward": 0.3750000149011612, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4839591085910797, "step": 99, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 8037.5, "completions/mean_length": 5035.035888671875, "completions/min_length": 932.0, "epoch": 0.3184713375796178, "grad_norm": 0.5652780532836914, "kl": 0.04150390625, "learning_rate": 7.814606332884488e-07, "loss": -0.10035921633243561, "memory(GiB)": 175.55, "reward": 0.5714285969734192, "reward_std": 0.32695361226797104, "rewards/AnswerTagAccuracyORM/mean": 0.5714285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4985625892877579, "step": 100, "train_speed(iter/s)": 0.003155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8061.0, "completions/mean_length": 6390.910888671875, "completions/min_length": 1904.0, "epoch": 0.321656050955414, "grad_norm": 0.4127773344516754, "kl": 0.038330078125, "learning_rate": 7.772582528762178e-07, "loss": -0.16605715453624725, "memory(GiB)": 175.55, "reward": 0.4464285969734192, "reward_std": 0.23086078464984894, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4979122579097748, "step": 101, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8040.0, "completions/mean_length": 6349.01806640625, "completions/min_length": 1580.0, "epoch": 0.3248407643312102, "grad_norm": 0.4982652962207794, "kl": 22.047773361206055, "learning_rate": 7.730273978973552e-07, "loss": -0.2726196050643921, "memory(GiB)": 175.55, "reward": 0.4464285969734192, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.49791230261325836, "step": 102, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8051.0, "completions/mean_length": 5684.39306640625, "completions/min_length": 1680.0, "epoch": 0.32802547770700635, "grad_norm": 0.6498633027076721, "kl": 0.082763671875, "learning_rate": 7.687685028628652e-07, "loss": -0.21366629004478455, "memory(GiB)": 175.55, "reward": 0.3750000149011612, "reward_std": 0.1896214783191681, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4839591085910797, "step": 103, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8018.5, "completions/mean_length": 6570.8037109375, "completions/min_length": 2079.0, "epoch": 0.33121019108280253, "grad_norm": 0.7568531632423401, "kl": 0.0177001953125, "learning_rate": 7.644820051634812e-07, "loss": -0.23152737319469452, "memory(GiB)": 175.55, "reward": 0.196428582072258, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.196428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4039382338523865, "step": 104, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8056.0, "completions/mean_length": 5679.3037109375, "completions/min_length": 1807.0, "epoch": 0.3343949044585987, "grad_norm": 0.48169225454330444, "kl": 0.0501708984375, "learning_rate": 7.60168345024744e-07, "loss": -0.10911425948143005, "memory(GiB)": 175.55, "reward": 0.4107143133878708, "reward_std": 0.4149572253227234, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500482559204, "step": 105, "train_speed(iter/s)": 0.003153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8050.5, "completions/mean_length": 6120.9111328125, "completions/min_length": 1163.5, "epoch": 0.3375796178343949, "grad_norm": 0.41283684968948364, "kl": 0.013671875, "learning_rate": 7.558279654617912e-07, "loss": -0.15918892621994019, "memory(GiB)": 175.55, "reward": 0.3571428805589676, "reward_std": 0.307814359664917, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48199817538261414, "step": 106, "train_speed(iter/s)": 0.003153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8047.5, "completions/mean_length": 6480.857421875, "completions/min_length": 1826.5, "epoch": 0.34076433121019106, "grad_norm": 0.5248873829841614, "kl": 0.03424072265625, "learning_rate": 7.514613122338589e-07, "loss": -0.17167718708515167, "memory(GiB)": 175.55, "reward": 0.267857164144516, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.44672515988349915, "step": 107, "train_speed(iter/s)": 0.003154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8055.0, "completions/mean_length": 6560.357421875, "completions/min_length": 1519.0, "epoch": 0.34394904458598724, "grad_norm": 0.7283872365951538, "kl": 0.01104736328125, "learning_rate": 7.470688337985029e-07, "loss": -0.3855530917644501, "memory(GiB)": 175.55, "reward": 0.3035714328289032, "reward_std": 0.21981074661016464, "rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4644543081521988, "step": 108, "train_speed(iter/s)": 0.003153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8042.5, "completions/mean_length": 6183.893310546875, "completions/min_length": 1794.5, "epoch": 0.3471337579617834, "grad_norm": 0.6313491463661194, "kl": 0.0653076171875, "learning_rate": 7.426509812655405e-07, "loss": -0.2580084502696991, "memory(GiB)": 175.55, "reward": 0.2857142984867096, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4436842352151871, "step": 109, "train_speed(iter/s)": 0.003153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6785714285714286, "completions/max_length": 8056.5, "completions/mean_length": 6767.071533203125, "completions/min_length": 1435.5, "epoch": 0.3503184713375796, "grad_norm": 0.45339030027389526, "kl": 0.030517578125, "learning_rate": 7.382082083507225e-07, "loss": -0.16426539421081543, "memory(GiB)": 175.55, "reward": 0.3392857313156128, "reward_std": 0.2721000909805298, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4786956012248993, "step": 110, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8042.0, "completions/mean_length": 6121.696533203125, "completions/min_length": 1248.0, "epoch": 0.3535031847133758, "grad_norm": 0.44927987456321716, "kl": 1.280361533164978, "learning_rate": 7.337409713291355e-07, "loss": -0.032493047416210175, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4817724674940109, "step": 111, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8030.5, "completions/mean_length": 6174.0361328125, "completions/min_length": 1399.0, "epoch": 0.35668789808917195, "grad_norm": 0.451442688703537, "kl": 0.1644287109375, "learning_rate": 7.292497289883432e-07, "loss": 0.022751769050955772, "memory(GiB)": 175.77, "reward": 0.321428582072258, "reward_std": 0.2142857201397419, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4691530168056488, "step": 112, "train_speed(iter/s)": 0.003152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8033.5, "completions/mean_length": 6115.357421875, "completions/min_length": 1842.5, "epoch": 0.35987261146496813, "grad_norm": 0.5630915760993958, "kl": 0.02783203125, "learning_rate": 7.24734942581267e-07, "loss": -0.30773723125457764, "memory(GiB)": 175.77, "reward": 0.3035714402794838, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.3035714402794838, "rewards/AnswerTagAccuracyORM/std": 0.4321114122867584, "step": 113, "train_speed(iter/s)": 0.003153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8043.0, "completions/mean_length": 6186.803955078125, "completions/min_length": 1296.5, "epoch": 0.3630573248407643, "grad_norm": 0.49433982372283936, "kl": 0.0421142578125, "learning_rate": 7.201970757788171e-07, "loss": -0.055917054414749146, "memory(GiB)": 175.77, "reward": 0.3571428656578064, "reward_std": 0.3792429566383362, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4744165241718292, "step": 114, "train_speed(iter/s)": 0.003153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8041.0, "completions/mean_length": 6332.6611328125, "completions/min_length": 1067.0, "epoch": 0.3662420382165605, "grad_norm": 0.2365662157535553, "kl": 0.02978515625, "learning_rate": 7.15636594622272e-07, "loss": 0.03591850399971008, "memory(GiB)": 175.77, "reward": 0.1964285857975483, "reward_std": 0.07695359364151955, "rewards/AnswerTagAccuracyORM/mean": 0.1964285857975483, "rewards/AnswerTagAccuracyORM/std": 0.36893007159233093, "step": 115, "train_speed(iter/s)": 0.003154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428572, "completions/max_length": 8068.0, "completions/mean_length": 5660.196533203125, "completions/min_length": 1355.0, "epoch": 0.36942675159235666, "grad_norm": 0.8137925863265991, "kl": 0.21121002733707428, "learning_rate": 7.110539674754159e-07, "loss": -0.3542415499687195, "memory(GiB)": 175.77, "reward": 0.375, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.375, "rewards/AnswerTagAccuracyORM/std": 0.47506681084632874, "step": 116, "train_speed(iter/s)": 0.003154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8045.0, "completions/mean_length": 5915.482421875, "completions/min_length": 1733.0, "epoch": 0.37261146496815284, "grad_norm": 71.4358139038086, "kl": 18.5830078125, "learning_rate": 7.06449664976438e-07, "loss": -0.01425784919410944, "memory(GiB)": 175.77, "reward": 0.2678571492433548, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050112903118134, "step": 117, "train_speed(iter/s)": 0.003154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 8045.5, "completions/mean_length": 5088.660888671875, "completions/min_length": 908.5, "epoch": 0.37579617834394907, "grad_norm": 0.42832183837890625, "kl": 0.0203857421875, "learning_rate": 7.018241599895973e-07, "loss": -0.0722283348441124, "memory(GiB)": 175.77, "reward": 0.4464285969734192, "reward_std": 0.3324786126613617, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 118, "train_speed(iter/s)": 0.003156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8042.0, "completions/mean_length": 6498.76806640625, "completions/min_length": 2193.0, "epoch": 0.37898089171974525, "grad_norm": 0.5034194588661194, "kl": 0.0457763671875, "learning_rate": 6.971779275566593e-07, "loss": -0.14812861382961273, "memory(GiB)": 175.77, "reward": 0.1785714328289032, "reward_std": 0.2253357470035553, "rewards/AnswerTagAccuracyORM/mean": 0.1785714328289032, "rewards/AnswerTagAccuracyORM/std": 0.37796448171138763, "step": 119, "train_speed(iter/s)": 0.003156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.35714285714285715, "completions/max_length": 8051.0, "completions/mean_length": 4917.14306640625, "completions/min_length": 1183.5, "epoch": 0.3821656050955414, "grad_norm": 0.7444789409637451, "kl": 0.0513916015625, "learning_rate": 6.925114448481088e-07, "loss": -0.26919350028038025, "memory(GiB)": 175.77, "reward": 0.642857164144516, "reward_std": 0.3681929111480713, "rewards/AnswerTagAccuracyORM/mean": 0.642857164144516, "rewards/AnswerTagAccuracyORM/std": 0.48199817538261414, "step": 120, "train_speed(iter/s)": 0.003157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8067.5, "completions/mean_length": 6616.857421875, "completions/min_length": 1642.0, "epoch": 0.3853503184713376, "grad_norm": 0.4549780786037445, "kl": 0.0621337890625, "learning_rate": 6.87825191114145e-07, "loss": -0.012696207500994205, "memory(GiB)": 175.77, "reward": 0.2857142984867096, "reward_std": 0.27762509882450104, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.45290274918079376, "step": 121, "train_speed(iter/s)": 0.003157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8049.5, "completions/mean_length": 6262.821533203125, "completions/min_length": 1352.0, "epoch": 0.3885350318471338, "grad_norm": 0.6874606013298035, "kl": 0.025634765625, "learning_rate": 6.831196476354614e-07, "loss": -0.19306860864162445, "memory(GiB)": 175.77, "reward": 0.4107143133878708, "reward_std": 0.37371791899204254, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 122, "train_speed(iter/s)": 0.003157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3392857142857143, "completions/max_length": 8052.5, "completions/mean_length": 5354.339599609375, "completions/min_length": 1056.5, "epoch": 0.39171974522292996, "grad_norm": 0.5245152711868286, "kl": 0.041748046875, "learning_rate": 6.783952976738178e-07, "loss": -0.24547605216503143, "memory(GiB)": 175.77, "reward": 0.5, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.5, "rewards/AnswerTagAccuracyORM/std": 0.5091750919818878, "step": 123, "train_speed(iter/s)": 0.003158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8028.5, "completions/mean_length": 4598.0179443359375, "completions/min_length": 803.0, "epoch": 0.39490445859872614, "grad_norm": 0.31059756875038147, "kl": 0.03350830078125, "learning_rate": 6.7365262642241e-07, "loss": -0.010546373203396797, "memory(GiB)": 175.77, "reward": 0.5357142984867096, "reward_std": 0.2142857238650322, "rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096, "rewards/AnswerTagAccuracyORM/std": 0.5065638422966003, "step": 124, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8051.0, "completions/mean_length": 6164.375244140625, "completions/min_length": 1348.5, "epoch": 0.3980891719745223, "grad_norm": 0.36853721737861633, "kl": 0.0347900390625, "learning_rate": 6.688921209560403e-07, "loss": -0.15686392784118652, "memory(GiB)": 175.77, "reward": 0.3392857164144516, "reward_std": 0.2610500305891037, "rewards/AnswerTagAccuracyORM/mean": 0.3392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4495980441570282, "step": 125, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8056.5, "completions/mean_length": 6088.44677734375, "completions/min_length": 1482.5, "epoch": 0.4012738853503185, "grad_norm": 0.4430883228778839, "kl": 0.0606689453125, "learning_rate": 6.641142701810931e-07, "loss": -0.24629399180412292, "memory(GiB)": 175.77, "reward": 0.3928571492433548, "reward_std": 0.20619653165340424, "rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548, "rewards/AnswerTagAccuracyORM/std": 0.4846093952655792, "step": 126, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8057.5, "completions/mean_length": 6104.5361328125, "completions/min_length": 2068.5, "epoch": 0.40445859872611467, "grad_norm": 0.6949173808097839, "kl": 0.106201171875, "learning_rate": 6.593195647853258e-07, "loss": -0.3462884724140167, "memory(GiB)": 175.77, "reward": 0.3035714477300644, "reward_std": 0.4039071798324585, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4576014429330826, "step": 127, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8028.5, "completions/mean_length": 5471.26806640625, "completions/min_length": 798.5, "epoch": 0.40764331210191085, "grad_norm": 0.4856749176979065, "kl": 0.0289306640625, "learning_rate": 6.545084971874736e-07, "loss": -0.14708048105239868, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.1896214708685875, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.47245559096336365, "step": 128, "train_speed(iter/s)": 0.003161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8055.0, "completions/mean_length": 5762.50048828125, "completions/min_length": 1270.0, "epoch": 0.410828025477707, "grad_norm": 0.5902029275894165, "kl": 0.03662109375, "learning_rate": 6.496815614866791e-07, "loss": -0.08738569170236588, "memory(GiB)": 175.77, "reward": 0.4821428805589676, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.4821428805589676, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 129, "train_speed(iter/s)": 0.003162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8043.5, "completions/mean_length": 6093.4287109375, "completions/min_length": 1134.0, "epoch": 0.4140127388535032, "grad_norm": 0.3767712712287903, "kl": 0.006173963658511639, "learning_rate": 6.448392534117461e-07, "loss": -0.17554393410682678, "memory(GiB)": 175.77, "reward": 0.4464285969734192, "reward_std": 0.23086077719926834, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 130, "train_speed(iter/s)": 0.003162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8051.0, "completions/mean_length": 6678.94677734375, "completions/min_length": 2177.0, "epoch": 0.4171974522292994, "grad_norm": 0.7001217603683472, "kl": 0.0213623046875, "learning_rate": 6.399820702702304e-07, "loss": -0.2599309980869293, "memory(GiB)": 175.77, "reward": 0.3750000149011612, "reward_std": 0.37371791899204254, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.49264875054359436, "step": 131, "train_speed(iter/s)": 0.003162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142857, "completions/max_length": 8036.5, "completions/mean_length": 6684.57177734375, "completions/min_length": 1943.5, "epoch": 0.42038216560509556, "grad_norm": 0.5899467468261719, "kl": 0.05859375, "learning_rate": 6.351105108973644e-07, "loss": -0.17598707973957062, "memory(GiB)": 175.77, "reward": 0.267857164144516, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.44672515988349915, "step": 132, "train_speed(iter/s)": 0.003162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8050.0, "completions/mean_length": 6337.08935546875, "completions/min_length": 1953.0, "epoch": 0.42356687898089174, "grad_norm": 0.4514944553375244, "kl": 0.0384521484375, "learning_rate": 6.302250756048267e-07, "loss": -0.23234152793884277, "memory(GiB)": 175.77, "reward": 0.3571428656578064, "reward_std": 0.25552502274513245, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4744165241718292, "step": 133, "train_speed(iter/s)": 0.003161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8029.0, "completions/mean_length": 5813.14306640625, "completions/min_length": 780.0, "epoch": 0.4267515923566879, "grad_norm": 0.5305230617523193, "kl": 0.084716796875, "learning_rate": 6.253262661293602e-07, "loss": -0.16682913899421692, "memory(GiB)": 175.77, "reward": 0.4285714626312256, "reward_std": 0.2363857924938202, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.49173471331596375, "step": 134, "train_speed(iter/s)": 0.003161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142858, "completions/max_length": 8067.0, "completions/mean_length": 6596.125244140625, "completions/min_length": 1434.0, "epoch": 0.4299363057324841, "grad_norm": 6475.49609375, "kl": 2048.0140380859375, "learning_rate": 6.204145855812438e-07, "loss": 3.8663876056671143, "memory(GiB)": 175.77, "reward": 0.3571428805589676, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48647116124629974, "step": 135, "train_speed(iter/s)": 0.003161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3392857142857143, "completions/max_length": 8056.0, "completions/mean_length": 4872.14306640625, "completions/min_length": 1017.0, "epoch": 0.43312101910828027, "grad_norm": 0.26369690895080566, "kl": 0.047607421875, "learning_rate": 6.154905383926216e-07, "loss": -0.016700170934200287, "memory(GiB)": 175.77, "reward": 0.517857164144516, "reward_std": 0.21981074661016464, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.5085247755050659, "step": 136, "train_speed(iter/s)": 0.003161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571429, "completions/max_length": 8029.5, "completions/mean_length": 6843.303955078125, "completions/min_length": 2125.5, "epoch": 0.43630573248407645, "grad_norm": 0.48116692900657654, "kl": 0.04315185546875, "learning_rate": 6.105546302656986e-07, "loss": -0.08012282848358154, "memory(GiB)": 175.77, "reward": 0.2142857201397419, "reward_std": 0.2967643365263939, "rewards/AnswerTagAccuracyORM/mean": 0.2142857201397419, "rewards/AnswerTagAccuracyORM/std": 0.37510766088962555, "step": 137, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6785714285714286, "completions/max_length": 8045.5, "completions/mean_length": 6495.821533203125, "completions/min_length": 1893.5, "epoch": 0.4394904458598726, "grad_norm": 0.39181163907051086, "kl": 289.435302734375, "learning_rate": 6.056073681208037e-07, "loss": -0.048386890441179276, "memory(GiB)": 175.77, "reward": 0.3035714477300644, "reward_std": 0.2721000984311104, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4576014429330826, "step": 138, "train_speed(iter/s)": 0.00316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8016.0, "completions/mean_length": 5637.39306640625, "completions/min_length": 1424.5, "epoch": 0.4426751592356688, "grad_norm": 0.2754450738430023, "kl": 0.046875, "learning_rate": 6.0064926004433e-07, "loss": -0.08374255150556564, "memory(GiB)": 175.77, "reward": 0.2678571492433548, "reward_std": 0.1785714365541935, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050114393234253, "step": 139, "train_speed(iter/s)": 0.003161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.48214285714285715, "completions/max_length": 8044.0, "completions/mean_length": 5262.000244140625, "completions/min_length": 1212.5, "epoch": 0.445859872611465, "grad_norm": 0.641979455947876, "kl": 0.07440185546875, "learning_rate": 5.956808152365532e-07, "loss": -0.11813775449991226, "memory(GiB)": 175.77, "reward": 0.3214285746216774, "reward_std": 0.25552503019571304, "rewards/AnswerTagAccuracyORM/mean": 0.3214285746216774, "rewards/AnswerTagAccuracyORM/std": 0.4327617287635803, "step": 140, "train_speed(iter/s)": 0.003163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8020.0, "completions/mean_length": 5747.625244140625, "completions/min_length": 1448.5, "epoch": 0.44904458598726116, "grad_norm": 0.30898842215538025, "kl": 0.0184326171875, "learning_rate": 5.907025439593365e-07, "loss": -0.03316553309559822, "memory(GiB)": 175.77, "reward": 0.4464285969734192, "reward_std": 0.2610500305891037, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 141, "train_speed(iter/s)": 0.003164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8040.5, "completions/mean_length": 5516.482177734375, "completions/min_length": 1085.5, "epoch": 0.45222929936305734, "grad_norm": 0.5380728244781494, "kl": 0.0506591796875, "learning_rate": 5.857149574837268e-07, "loss": -0.25722232460975647, "memory(GiB)": 175.77, "reward": 0.4464285969734192, "reward_std": 0.30228933691978455, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.497912272810936, "step": 142, "train_speed(iter/s)": 0.003165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8030.0, "completions/mean_length": 5218.1611328125, "completions/min_length": 1178.5, "epoch": 0.4554140127388535, "grad_norm": 0.4074670672416687, "kl": 0.0584716796875, "learning_rate": 5.807185680374467e-07, "loss": -0.0464337095618248, "memory(GiB)": 175.77, "reward": 0.3035714328289032, "reward_std": 0.14838216453790665, "rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4644543081521988, "step": 143, "train_speed(iter/s)": 0.003166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8053.5, "completions/mean_length": 6286.125244140625, "completions/min_length": 1153.5, "epoch": 0.4585987261146497, "grad_norm": 0.4895031154155731, "kl": 0.0423583984375, "learning_rate": 5.757138887522883e-07, "loss": -0.06021181866526604, "memory(GiB)": 175.77, "reward": 0.2500000149011612, "reward_std": 0.26657506823539734, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4389495849609375, "step": 144, "train_speed(iter/s)": 0.003165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8067.0, "completions/mean_length": 6192.5537109375, "completions/min_length": 1276.5, "epoch": 0.46178343949044587, "grad_norm": 0.40592557191848755, "kl": 0.0712890625, "learning_rate": 5.707014336114146e-07, "loss": -0.048989661037921906, "memory(GiB)": 175.77, "reward": 0.4285714477300644, "reward_std": 0.27762511372566223, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.5026109665632248, "step": 145, "train_speed(iter/s)": 0.003165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8031.0, "completions/mean_length": 6168.714599609375, "completions/min_length": 1853.0, "epoch": 0.46496815286624205, "grad_norm": 0.4599376618862152, "kl": 0.051025390625, "learning_rate": 5.656817173965732e-07, "loss": -0.06779219955205917, "memory(GiB)": 175.77, "reward": 0.3571428805589676, "reward_std": 0.33800363540649414, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.4489477574825287, "step": 146, "train_speed(iter/s)": 0.003165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8048.0, "completions/mean_length": 6351.232421875, "completions/min_length": 2068.5, "epoch": 0.4681528662420382, "grad_norm": 0.3567911684513092, "kl": 0.0421142578125, "learning_rate": 5.606552556352274e-07, "loss": -0.09004680067300797, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.47245559096336365, "step": 147, "train_speed(iter/s)": 0.003165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8061.5, "completions/mean_length": 6130.57177734375, "completions/min_length": 1451.0, "epoch": 0.4713375796178344, "grad_norm": 0.5385421514511108, "kl": 0.0621337890625, "learning_rate": 5.556225645476118e-07, "loss": -0.12423180043697357, "memory(GiB)": 175.77, "reward": 0.392857164144516, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4959513247013092, "step": 148, "train_speed(iter/s)": 0.003165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 8047.5, "completions/mean_length": 4891.875244140625, "completions/min_length": 1196.5, "epoch": 0.4745222929936306, "grad_norm": 0.39023974537849426, "kl": 0.0255126953125, "learning_rate": 5.505841609937161e-07, "loss": -0.18357349932193756, "memory(GiB)": 175.77, "reward": 0.517857164144516, "reward_std": 0.29123931378126144, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 149, "train_speed(iter/s)": 0.003166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8016.0, "completions/mean_length": 4689.14306640625, "completions/min_length": 833.5, "epoch": 0.47770700636942676, "grad_norm": 0.5281595587730408, "kl": 0.052490234375, "learning_rate": 5.455405624202032e-07, "loss": -0.15326838195323944, "memory(GiB)": 175.77, "reward": 0.3928571492433548, "reward_std": 0.1539071872830391, "rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548, "rewards/AnswerTagAccuracyORM/std": 0.4846093952655792, "step": 150, "train_speed(iter/s)": 0.003167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8023.0, "completions/mean_length": 5550.000244140625, "completions/min_length": 1138.5, "epoch": 0.48089171974522293, "grad_norm": 0.2876298427581787, "kl": 0.03759765625, "learning_rate": 5.404922868072672e-07, "loss": 0.08055908977985382, "memory(GiB)": 175.77, "reward": 0.4107142984867096, "reward_std": 0.14838216453790665, "rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4576014578342438, "step": 151, "train_speed(iter/s)": 0.003168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26785714285714285, "completions/max_length": 8016.0, "completions/mean_length": 3929.5538330078125, "completions/min_length": 986.0, "epoch": 0.4840764331210191, "grad_norm": 1.2029175758361816, "kl": 0.45068359375, "learning_rate": 5.354398526154365e-07, "loss": -0.09824319183826447, "memory(GiB)": 175.77, "reward": 0.5357142984867096, "reward_std": 0.25552502274513245, "rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096, "rewards/AnswerTagAccuracyORM/std": 0.5026109963655472, "step": 152, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8062.0, "completions/mean_length": 5090.232421875, "completions/min_length": 1046.0, "epoch": 0.4872611464968153, "grad_norm": 0.44915616512298584, "kl": 0.0460205078125, "learning_rate": 5.30383778732328e-07, "loss": -0.08847501873970032, "memory(GiB)": 175.77, "reward": 0.4285714477300644, "reward_std": 0.34905366599559784, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.502610981464386, "step": 153, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8054.0, "completions/mean_length": 5964.4287109375, "completions/min_length": 1407.5, "epoch": 0.49044585987261147, "grad_norm": 0.7181938886642456, "kl": 0.0526123046875, "learning_rate": 5.253245844193564e-07, "loss": -0.25980204343795776, "memory(GiB)": 175.77, "reward": 0.392857164144516, "reward_std": 0.4727715849876404, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4959513396024704, "step": 154, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8052.0, "completions/mean_length": 6524.339599609375, "completions/min_length": 1993.0, "epoch": 0.49363057324840764, "grad_norm": 0.6525817513465881, "kl": 0.08203125, "learning_rate": 5.202627892584065e-07, "loss": -0.14491651952266693, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.3324785977602005, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4817724674940109, "step": 155, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8034.5, "completions/mean_length": 5646.178955078125, "completions/min_length": 1162.5, "epoch": 0.4968152866242038, "grad_norm": 0.7706606388092041, "kl": 0.07666015625, "learning_rate": 5.151989130984714e-07, "loss": -0.09087943285703659, "memory(GiB)": 175.77, "reward": 0.2857142984867096, "reward_std": 0.32695358991622925, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.460043728351593, "step": 156, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428572, "completions/max_length": 8046.5, "completions/mean_length": 5837.58935546875, "completions/min_length": 1139.5, "epoch": 0.5, "grad_norm": 0.4703036844730377, "kl": 0.03955078125, "learning_rate": 5.101334760022639e-07, "loss": -0.15095964074134827, "memory(GiB)": 175.77, "reward": 0.3571428805589676, "reward_std": 0.25552502274513245, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48647117614746094, "step": 157, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 8043.5, "completions/mean_length": 5477.0361328125, "completions/min_length": 1285.0, "epoch": 0.5031847133757962, "grad_norm": 0.975563645362854, "kl": 0.197265625, "learning_rate": 5.050669981928055e-07, "loss": -0.09782678633928299, "memory(GiB)": 175.77, "reward": 0.392857164144516, "reward_std": 0.4396214783191681, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4973474591970444, "step": 158, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8048.5, "completions/mean_length": 5391.803955078125, "completions/min_length": 1264.5, "epoch": 0.5063694267515924, "grad_norm": 0.6060774922370911, "kl": 0.044677734375, "learning_rate": 5e-07, "loss": -0.30903252959251404, "memory(GiB)": 175.77, "reward": 0.4821428656578064, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064, "rewards/AnswerTagAccuracyORM/std": 0.5085247755050659, "step": 159, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8036.5, "completions/mean_length": 6244.39306640625, "completions/min_length": 1104.5, "epoch": 0.5095541401273885, "grad_norm": 0.3611801564693451, "kl": 0.032743350418968475, "learning_rate": 4.949330018071946e-07, "loss": -0.14383217692375183, "memory(GiB)": 175.77, "reward": 0.4285714626312256, "reward_std": 0.2253357656300068, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.49173468351364136, "step": 160, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8052.0, "completions/mean_length": 6116.875244140625, "completions/min_length": 1644.5, "epoch": 0.5127388535031847, "grad_norm": 0.3732077479362488, "kl": 0.07177734375, "learning_rate": 4.898665239977362e-07, "loss": 0.008805501274764538, "memory(GiB)": 175.77, "reward": 0.3035714402794838, "reward_std": 0.23086079210042953, "rewards/AnswerTagAccuracyORM/mean": 0.3035714402794838, "rewards/AnswerTagAccuracyORM/std": 0.4321114122867584, "step": 161, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8043.5, "completions/mean_length": 6551.1611328125, "completions/min_length": 2065.5, "epoch": 0.5159235668789809, "grad_norm": 0.40818125009536743, "kl": 0.0394287109375, "learning_rate": 4.848010869015287e-07, "loss": -0.172414168715477, "memory(GiB)": 175.77, "reward": 0.2500000149011612, "reward_std": 0.1428571529686451, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4061589390039444, "step": 162, "train_speed(iter/s)": 0.003168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 8026.5, "completions/mean_length": 5044.535888671875, "completions/min_length": 1356.5, "epoch": 0.5191082802547771, "grad_norm": 1.3621019124984741, "kl": 0.4180908203125, "learning_rate": 4.797372107415935e-07, "loss": -0.2163340151309967, "memory(GiB)": 175.77, "reward": 0.5357142984867096, "reward_std": 0.42048224806785583, "rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096, "rewards/AnswerTagAccuracyORM/std": 0.5065638720989227, "step": 163, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 8046.5, "completions/mean_length": 5025.2861328125, "completions/min_length": 1086.0, "epoch": 0.5222929936305732, "grad_norm": 0.5102728605270386, "kl": 0.0535888671875, "learning_rate": 4.746754155806437e-07, "loss": -0.23832020163536072, "memory(GiB)": 175.77, "reward": 0.517857164144516, "reward_std": 0.2721000909805298, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 164, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8029.5, "completions/mean_length": 5601.375244140625, "completions/min_length": 1904.5, "epoch": 0.5254777070063694, "grad_norm": 0.7859033942222595, "kl": 0.1533203125, "learning_rate": 4.69616221267672e-07, "loss": -0.3384000360965729, "memory(GiB)": 175.77, "reward": 0.4642857313156128, "reward_std": 0.42048226296901703, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4959513396024704, "step": 165, "train_speed(iter/s)": 0.00317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.48214285714285715, "completions/max_length": 8026.0, "completions/mean_length": 5810.482421875, "completions/min_length": 1074.0, "epoch": 0.5286624203821656, "grad_norm": 0.5610930323600769, "kl": 0.06005859375, "learning_rate": 4.645601473845635e-07, "loss": -0.1656591296195984, "memory(GiB)": 175.77, "reward": 0.3571428656578064, "reward_std": 0.307814359664917, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.48795005679130554, "step": 166, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8055.5, "completions/mean_length": 5735.08935546875, "completions/min_length": 1439.0, "epoch": 0.5318471337579618, "grad_norm": 0.912844717502594, "kl": 0.1314697265625, "learning_rate": 4.5950771319273296e-07, "loss": -0.04387956112623215, "memory(GiB)": 175.77, "reward": 0.4107143133878708, "reward_std": 0.21981074661016464, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 167, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8049.0, "completions/mean_length": 6489.285888671875, "completions/min_length": 1574.5, "epoch": 0.535031847133758, "grad_norm": 0.3967674970626831, "kl": 1.4857299551628638e+34, "learning_rate": 4.544594375797968e-07, "loss": 0.07432208955287933, "memory(GiB)": 175.77, "reward": 0.357142873108387, "reward_std": 0.2967643216252327, "rewards/AnswerTagAccuracyORM/mean": 0.357142873108387, "rewards/AnswerTagAccuracyORM/std": 0.43015046417713165, "step": 168, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8061.0, "completions/mean_length": 5971.535888671875, "completions/min_length": 867.0, "epoch": 0.5382165605095541, "grad_norm": 3.6906192302703857, "kl": 0.22985238194814883, "learning_rate": 4.4941583900628393e-07, "loss": -0.2558591365814209, "memory(GiB)": 175.77, "reward": 0.267857164144516, "reward_std": 0.23086077719926834, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.44672515988349915, "step": 169, "train_speed(iter/s)": 0.003169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 8057.5, "completions/mean_length": 4240.500244140625, "completions/min_length": 650.0, "epoch": 0.5414012738853503, "grad_norm": 0.5648459196090698, "kl": 0.0484619140625, "learning_rate": 4.443774354523882e-07, "loss": -0.026298800483345985, "memory(GiB)": 175.77, "reward": 0.5535714626312256, "reward_std": 0.3324786126613617, "rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 170, "train_speed(iter/s)": 0.003171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8025.0, "completions/mean_length": 5686.607421875, "completions/min_length": 1502.0, "epoch": 0.5445859872611465, "grad_norm": 0.7092170715332031, "kl": 0.0477294921875, "learning_rate": 4.3934474436477253e-07, "loss": -0.25052332878112793, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.23086077719926834, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 171, "train_speed(iter/s)": 0.003172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8053.0, "completions/mean_length": 5796.964599609375, "completions/min_length": 1074.0, "epoch": 0.5477707006369427, "grad_norm": 0.448478639125824, "kl": 0.0557861328125, "learning_rate": 4.3431828260342675e-07, "loss": -0.27337488532066345, "memory(GiB)": 175.77, "reward": 0.3571428656578064, "reward_std": 0.27762509882450104, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.4635152518749237, "step": 172, "train_speed(iter/s)": 0.003172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8049.5, "completions/mean_length": 6193.589599609375, "completions/min_length": 1322.0, "epoch": 0.5509554140127388, "grad_norm": 0.4961269795894623, "kl": 0.0238037109375, "learning_rate": 4.292985663885853e-07, "loss": -0.1575016975402832, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.3324786126613617, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4817724674940109, "step": 173, "train_speed(iter/s)": 0.003172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8028.0, "completions/mean_length": 6504.14306640625, "completions/min_length": 1982.0, "epoch": 0.554140127388535, "grad_norm": 0.8586503267288208, "kl": 0.0418701171875, "learning_rate": 4.242861112477118e-07, "loss": -0.26731064915657043, "memory(GiB)": 175.77, "reward": 0.3392857313156128, "reward_std": 0.4149572402238846, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4786956012248993, "step": 174, "train_speed(iter/s)": 0.003172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8055.0, "completions/mean_length": 5515.000244140625, "completions/min_length": 1278.0, "epoch": 0.5573248407643312, "grad_norm": 0.5378698110580444, "kl": 3752.9690551757812, "learning_rate": 4.192814319625533e-07, "loss": -0.10730624943971634, "memory(GiB)": 175.77, "reward": 0.4285714328289032, "reward_std": 0.2967643290758133, "rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4985625743865967, "step": 175, "train_speed(iter/s)": 0.003173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3392857142857143, "completions/max_length": 8059.0, "completions/mean_length": 5019.607421875, "completions/min_length": 1034.0, "epoch": 0.5605095541401274, "grad_norm": 0.39968299865722656, "kl": 0.0423583984375, "learning_rate": 4.1428504251627325e-07, "loss": 0.0431935153901577, "memory(GiB)": 175.77, "reward": 0.535714328289032, "reward_std": 0.19514648616313934, "rewards/AnswerTagAccuracyORM/mean": 0.535714328289032, "rewards/AnswerTagAccuracyORM/std": 0.5078745484352112, "step": 176, "train_speed(iter/s)": 0.003175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.48214285714285715, "completions/max_length": 8054.5, "completions/mean_length": 5915.3037109375, "completions/min_length": 1198.5, "epoch": 0.5636942675159236, "grad_norm": 0.39187169075012207, "kl": 3.19933819770813, "learning_rate": 4.0929745604066343e-07, "loss": -0.09193204343318939, "memory(GiB)": 176.78, "reward": 0.5000000149011612, "reward_std": 0.2967643290758133, "rewards/AnswerTagAccuracyORM/mean": 0.5000000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4973474442958832, "step": 177, "train_speed(iter/s)": 0.003175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3928571428571429, "completions/max_length": 8029.5, "completions/mean_length": 5273.268310546875, "completions/min_length": 1048.5, "epoch": 0.5668789808917197, "grad_norm": 0.6660857796669006, "kl": 0.1064453125, "learning_rate": 4.0431918476344685e-07, "loss": -0.19565197825431824, "memory(GiB)": 176.78, "reward": 0.535714328289032, "reward_std": 0.2967643216252327, "rewards/AnswerTagAccuracyORM/mean": 0.535714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4959513396024704, "step": 178, "train_speed(iter/s)": 0.003177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428571, "completions/max_length": 8059.5, "completions/mean_length": 6514.857421875, "completions/min_length": 1473.0, "epoch": 0.5700636942675159, "grad_norm": 0.698070228099823, "kl": 0.085205078125, "learning_rate": 3.9935073995566987e-07, "loss": -0.09061294049024582, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.25552503019571304, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.460043728351593, "step": 179, "train_speed(iter/s)": 0.003177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8024.0, "completions/mean_length": 5687.553955078125, "completions/min_length": 1122.0, "epoch": 0.5732484076433121, "grad_norm": 0.31134656071662903, "kl": 0.03778076171875, "learning_rate": 3.943926318791963e-07, "loss": -0.14330630004405975, "memory(GiB)": 176.78, "reward": 0.4821428954601288, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.4821428954601288, "rewards/AnswerTagAccuracyORM/std": 0.4817724674940109, "step": 180, "train_speed(iter/s)": 0.003178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8041.5, "completions/mean_length": 6372.910888671875, "completions/min_length": 1543.5, "epoch": 0.5764331210191083, "grad_norm": 0.5216385126113892, "kl": 0.023681640625, "learning_rate": 3.8944536973430156e-07, "loss": -0.11478479206562042, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.18409644439816475, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4582767188549042, "step": 181, "train_speed(iter/s)": 0.003179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8024.5, "completions/mean_length": 6145.232421875, "completions/min_length": 1394.5, "epoch": 0.5796178343949044, "grad_norm": 0.4548920691013336, "kl": 0.0491943359375, "learning_rate": 3.845094616073783e-07, "loss": -0.04961169883608818, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.18409645557403564, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.44368425011634827, "step": 182, "train_speed(iter/s)": 0.003179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.35714285714285715, "completions/max_length": 8025.0, "completions/mean_length": 4988.08935546875, "completions/min_length": 1431.0, "epoch": 0.5828025477707006, "grad_norm": 0.45550552010536194, "kl": 0.0574951171875, "learning_rate": 3.7958541441875627e-07, "loss": -0.0993984192609787, "memory(GiB)": 176.78, "reward": 0.5535714626312256, "reward_std": 0.21981073170900345, "rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 183, "train_speed(iter/s)": 0.003181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8052.5, "completions/mean_length": 6105.107421875, "completions/min_length": 1428.0, "epoch": 0.5859872611464968, "grad_norm": 0.4129337668418884, "kl": 0.03155517578125, "learning_rate": 3.7467373387063964e-07, "loss": -0.18813078105449677, "memory(GiB)": 176.78, "reward": 0.4107142984867096, "reward_std": 0.2610500305891037, "rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096, "rewards/AnswerTagAccuracyORM/std": 0.497912272810936, "step": 184, "train_speed(iter/s)": 0.003181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8026.5, "completions/mean_length": 5963.446533203125, "completions/min_length": 1203.5, "epoch": 0.589171974522293, "grad_norm": 0.4038497805595398, "kl": 0.0673828125, "learning_rate": 3.6977492439517346e-07, "loss": -0.08315330743789673, "memory(GiB)": 176.78, "reward": 0.446428582072258, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4786956012248993, "step": 185, "train_speed(iter/s)": 0.003182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8037.5, "completions/mean_length": 6319.321533203125, "completions/min_length": 1822.0, "epoch": 0.5923566878980892, "grad_norm": 1.8196825981140137, "kl": 0.7462158203125, "learning_rate": 3.648894891026358e-07, "loss": -0.03824207931756973, "memory(GiB)": 176.78, "reward": 0.3035714328289032, "reward_std": 0.1785714402794838, "rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032, "rewards/AnswerTagAccuracyORM/std": 0.4644542932510376, "step": 186, "train_speed(iter/s)": 0.003183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8033.5, "completions/mean_length": 5305.8037109375, "completions/min_length": 942.0, "epoch": 0.5955414012738853, "grad_norm": 0.41588449478149414, "kl": 0.025146484375, "learning_rate": 3.600179297297695e-07, "loss": -0.10517025738954544, "memory(GiB)": 176.78, "reward": 0.517857164144516, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 187, "train_speed(iter/s)": 0.003184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8029.5, "completions/mean_length": 5222.250244140625, "completions/min_length": 1221.5, "epoch": 0.5987261146496815, "grad_norm": 0.42883527278900146, "kl": 0.033935546875, "learning_rate": 3.5516074658825397e-07, "loss": -0.04254484549164772, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.2967643141746521, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.49734747409820557, "step": 188, "train_speed(iter/s)": 0.003185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.35714285714285715, "completions/max_length": 8027.5, "completions/mean_length": 4837.39306640625, "completions/min_length": 1332.0, "epoch": 0.6019108280254777, "grad_norm": 0.6678107380867004, "kl": 0.08056640625, "learning_rate": 3.50318438513321e-07, "loss": -0.23368430137634277, "memory(GiB)": 176.78, "reward": 0.4285714626312256, "reward_std": 0.3681928962469101, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.49173466861248016, "step": 189, "train_speed(iter/s)": 0.003187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8052.0, "completions/mean_length": 6229.303955078125, "completions/min_length": 2022.0, "epoch": 0.6050955414012739, "grad_norm": 0.6864339113235474, "kl": 1.5787729148315552e-41, "learning_rate": 3.454915028125263e-07, "loss": -0.17627106606960297, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.3078143820166588, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.5065638422966003, "step": 190, "train_speed(iter/s)": 0.003187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8023.5, "completions/mean_length": 5785.803955078125, "completions/min_length": 1411.0, "epoch": 0.60828025477707, "grad_norm": 0.4775516092777252, "kl": 0.027587890625, "learning_rate": 3.406804352146742e-07, "loss": -0.1085924282670021, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.1896214708685875, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 191, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8018.0, "completions/mean_length": 6275.1611328125, "completions/min_length": 1364.0, "epoch": 0.6114649681528662, "grad_norm": 0.3889318108558655, "kl": 0.0579833984375, "learning_rate": 3.3588572981890684e-07, "loss": -0.09743942320346832, "memory(GiB)": 176.78, "reward": 0.2321428656578064, "reward_std": 0.14838216453790665, "rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064, "rewards/AnswerTagAccuracyORM/std": 0.429407000541687, "step": 192, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8026.0, "completions/mean_length": 6002.982421875, "completions/min_length": 805.0, "epoch": 0.6146496815286624, "grad_norm": 0.6355292201042175, "kl": 0.0274658203125, "learning_rate": 3.311078790439598e-07, "loss": -0.29961735010147095, "memory(GiB)": 176.78, "reward": 0.4285714626312256, "reward_std": 0.3078143820166588, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.5039526224136353, "step": 193, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8070.0, "completions/mean_length": 6182.6787109375, "completions/min_length": 1595.0, "epoch": 0.6178343949044586, "grad_norm": 0.5008262395858765, "kl": 0.0277099609375, "learning_rate": 3.263473735775899e-07, "loss": -0.17556621134281158, "memory(GiB)": 176.78, "reward": 0.3750000298023224, "reward_std": 0.3243894428014755, "rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 194, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.48214285714285715, "completions/max_length": 8046.5, "completions/mean_length": 6006.875244140625, "completions/min_length": 1325.0, "epoch": 0.6210191082802548, "grad_norm": 0.6817752122879028, "kl": 0.0562744140625, "learning_rate": 3.2160470232618225e-07, "loss": -0.2873086631298065, "memory(GiB)": 176.78, "reward": 0.267857164144516, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.44672515988349915, "step": 195, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8042.5, "completions/mean_length": 5104.1787109375, "completions/min_length": 1014.0, "epoch": 0.6242038216560509, "grad_norm": 0.40815719962120056, "kl": 22.176969528198242, "learning_rate": 3.1688035236453865e-07, "loss": 0.02237345091998577, "memory(GiB)": 176.78, "reward": 0.4464285969734192, "reward_std": 0.23086077719926834, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 196, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8032.0, "completions/mean_length": 6547.00048828125, "completions/min_length": 1434.5, "epoch": 0.6273885350318471, "grad_norm": 0.4397486746311188, "kl": 0.02099609375, "learning_rate": 3.121748088858549e-07, "loss": -0.10549207031726837, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.3681929111480713, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4114224463701248, "step": 197, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8035.0, "completions/mean_length": 5620.51806640625, "completions/min_length": 597.0, "epoch": 0.6305732484076433, "grad_norm": 27.555814743041992, "kl": 2.0523681640625, "learning_rate": 3.0748855515189096e-07, "loss": -0.24800211191177368, "memory(GiB)": 176.78, "reward": 0.464285746216774, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.464285746216774, "rewards/AnswerTagAccuracyORM/std": 0.4739968776702881, "step": 198, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.42857142857142855, "completions/max_length": 8045.5, "completions/mean_length": 5171.607421875, "completions/min_length": 424.0, "epoch": 0.6337579617834395, "grad_norm": 0.4833768606185913, "kl": 1.5786327849851227e-41, "learning_rate": 3.028220724433408e-07, "loss": -0.06498469412326813, "memory(GiB)": 176.78, "reward": 0.375, "reward_std": 0.30228936672210693, "rewards/AnswerTagAccuracyORM/mean": 0.375, "rewards/AnswerTagAccuracyORM/std": 0.4750668406486511, "step": 199, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8049.0, "completions/mean_length": 5697.178955078125, "completions/min_length": 1219.0, "epoch": 0.6369426751592356, "grad_norm": 1.165695071220398, "kl": 1483.6046142578125, "learning_rate": 2.981758400104028e-07, "loss": -0.07902750372886658, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.2967643439769745, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4973474442958832, "step": 200, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8044.0, "completions/mean_length": 6021.26806640625, "completions/min_length": 1541.5, "epoch": 0.6401273885350318, "grad_norm": 0.3752177655696869, "kl": 0.0467529296875, "learning_rate": 2.9355033502356194e-07, "loss": -0.0795094221830368, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.21981073915958405, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.47245559096336365, "step": 201, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571429, "completions/max_length": 8032.5, "completions/mean_length": 6366.946533203125, "completions/min_length": 1789.0, "epoch": 0.643312101910828, "grad_norm": 0.5095097422599792, "kl": 0.042724609375, "learning_rate": 2.8894603252458403e-07, "loss": -0.24165716767311096, "memory(GiB)": 176.78, "reward": 0.2678571492433548, "reward_std": 0.30228933691978455, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050114393234253, "step": 202, "train_speed(iter/s)": 0.003189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8052.5, "completions/mean_length": 6997.57177734375, "completions/min_length": 1818.5, "epoch": 0.6464968152866242, "grad_norm": 0.716253399848938, "kl": 0.0411376953125, "learning_rate": 2.8436340537772794e-07, "loss": -0.3257341980934143, "memory(GiB)": 176.78, "reward": 0.2500000149011612, "reward_std": 0.2967643216252327, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4389495849609375, "step": 203, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8052.0, "completions/mean_length": 6138.303955078125, "completions/min_length": 1044.5, "epoch": 0.6496815286624203, "grad_norm": 0.6485111713409424, "kl": 0.0439453125, "learning_rate": 2.7980292422118277e-07, "loss": -0.20691558718681335, "memory(GiB)": 176.78, "reward": 0.535714328289032, "reward_std": 0.3902929872274399, "rewards/AnswerTagAccuracyORM/mean": 0.535714328289032, "rewards/AnswerTagAccuracyORM/std": 0.5078744888305664, "step": 204, "train_speed(iter/s)": 0.00319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.30357142857142855, "completions/max_length": 8048.0, "completions/mean_length": 5014.732177734375, "completions/min_length": 1183.5, "epoch": 0.6528662420382165, "grad_norm": 0.4565260410308838, "kl": 0.0528564453125, "learning_rate": 2.75265057418733e-07, "loss": -0.19542962312698364, "memory(GiB)": 176.78, "reward": 0.446428582072258, "reward_std": 0.21981074661016464, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.5032612979412079, "step": 205, "train_speed(iter/s)": 0.003192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8044.0, "completions/mean_length": 6365.250244140625, "completions/min_length": 2251.0, "epoch": 0.6560509554140127, "grad_norm": 0.7114933729171753, "kl": 0.169677734375, "learning_rate": 2.70750271011657e-07, "loss": -0.12340293079614639, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4691530168056488, "step": 206, "train_speed(iter/s)": 0.003192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8031.5, "completions/mean_length": 5673.46435546875, "completions/min_length": 1684.0, "epoch": 0.6592356687898089, "grad_norm": 0.5769023299217224, "kl": 0.03271484375, "learning_rate": 2.6625902867086447e-07, "loss": -0.10204778611660004, "memory(GiB)": 176.78, "reward": 0.4821428656578064, "reward_std": 0.2721000798046589, "rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064, "rewards/AnswerTagAccuracyORM/std": 0.5085248351097107, "step": 207, "train_speed(iter/s)": 0.003193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.35714285714285715, "completions/max_length": 8049.0, "completions/mean_length": 5337.607421875, "completions/min_length": 1091.5, "epoch": 0.6624203821656051, "grad_norm": 7.4786057472229, "kl": 4.21484375, "learning_rate": 2.6179179164927754e-07, "loss": 0.045502904802560806, "memory(GiB)": 176.78, "reward": 0.3750000149011612, "reward_std": 0.14838217198848724, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.49264875054359436, "step": 208, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8050.5, "completions/mean_length": 6416.553955078125, "completions/min_length": 2526.0, "epoch": 0.6656050955414012, "grad_norm": 0.5907986760139465, "kl": NaN, "learning_rate": 2.5734901873445956e-07, "loss": -0.22783614695072174, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.32695358991622925, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968925714493, "step": 209, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8041.0, "completions/mean_length": 5512.57177734375, "completions/min_length": 1217.5, "epoch": 0.6687898089171974, "grad_norm": 6.297171115875244, "kl": 1.9635009765625, "learning_rate": 2.529311662014972e-07, "loss": -0.09180951863527298, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.30228934437036514, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4817724674940109, "step": 210, "train_speed(iter/s)": 0.003195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571428, "completions/max_length": 8047.5, "completions/mean_length": 6852.94677734375, "completions/min_length": 2993.5, "epoch": 0.6719745222929936, "grad_norm": 0.5260760188102722, "kl": 0.0394287109375, "learning_rate": 2.485386877661411e-07, "loss": -0.13763374090194702, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.32695360481739044, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.458276703953743, "step": 211, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428572, "completions/max_length": 8028.0, "completions/mean_length": 6248.76806640625, "completions/min_length": 1682.0, "epoch": 0.6751592356687898, "grad_norm": 2.8984975814819336, "kl": 0.337890625, "learning_rate": 2.441720345382089e-07, "loss": -0.07186296582221985, "memory(GiB)": 176.78, "reward": 0.2678571492433548, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050112903118134, "step": 212, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8037.0, "completions/mean_length": 5828.625244140625, "completions/min_length": 1334.0, "epoch": 0.678343949044586, "grad_norm": 47.080162048339844, "kl": 19.67626953125, "learning_rate": 2.3983165497525596e-07, "loss": 0.14914795756340027, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.32695358991622925, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4600437134504318, "step": 213, "train_speed(iter/s)": 0.003193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.39285714285714285, "completions/max_length": 8034.5, "completions/mean_length": 5060.107421875, "completions/min_length": 1518.5, "epoch": 0.6815286624203821, "grad_norm": 0.5471820831298828, "kl": 0.0250244140625, "learning_rate": 2.355179948365189e-07, "loss": -0.1757272332906723, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4959513247013092, "step": 214, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571429, "completions/max_length": 8038.5, "completions/mean_length": 5785.375244140625, "completions/min_length": 1677.0, "epoch": 0.6847133757961783, "grad_norm": 0.6869480013847351, "kl": 0.026123046875, "learning_rate": 2.3123149713713468e-07, "loss": -0.2949236035346985, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.307814359664917, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4609040319919586, "step": 215, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8027.5, "completions/mean_length": 6098.482421875, "completions/min_length": 1733.5, "epoch": 0.6878980891719745, "grad_norm": 0.38128015398979187, "kl": 0.0662841796875, "learning_rate": 2.26972602102645e-07, "loss": 0.0502837672829628, "memory(GiB)": 176.78, "reward": 0.5000000298023224, "reward_std": 0.2253357619047165, "rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224, "rewards/AnswerTagAccuracyORM/std": 0.48795005679130554, "step": 216, "train_speed(iter/s)": 0.003194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8033.5, "completions/mean_length": 6116.053955078125, "completions/min_length": 1466.5, "epoch": 0.6910828025477707, "grad_norm": 0.926658034324646, "kl": 0.076416015625, "learning_rate": 2.2274174712378207e-07, "loss": -0.12618423998355865, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 217, "train_speed(iter/s)": 0.003195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8053.5, "completions/mean_length": 5445.69677734375, "completions/min_length": 853.0, "epoch": 0.6942675159235668, "grad_norm": 0.6047350764274597, "kl": 51.23466873168945, "learning_rate": 2.1853936671155127e-07, "loss": -0.11778053641319275, "memory(GiB)": 176.78, "reward": 0.4285714626312256, "reward_std": 0.49191083014011383, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.49173468351364136, "step": 218, "train_speed(iter/s)": 0.003195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8047.0, "completions/mean_length": 6047.035888671875, "completions/min_length": 1355.5, "epoch": 0.697452229299363, "grad_norm": 0.9835929274559021, "kl": 0.0267333984375, "learning_rate": 2.1436589245260372e-07, "loss": -0.29198572039604187, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.37371790409088135, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.47245559096336365, "step": 219, "train_speed(iter/s)": 0.003195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8063.5, "completions/mean_length": 5899.7861328125, "completions/min_length": 1418.5, "epoch": 0.7006369426751592, "grad_norm": 0.48660025000572205, "kl": 0.0634765625, "learning_rate": 2.1022175296491512e-07, "loss": -0.18401746451854706, "memory(GiB)": 176.78, "reward": 0.4107143133878708, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 220, "train_speed(iter/s)": 0.003195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8063.5, "completions/mean_length": 5425.0003662109375, "completions/min_length": 1516.5, "epoch": 0.7038216560509554, "grad_norm": 0.38574594259262085, "kl": 0.03814697265625, "learning_rate": 2.0610737385376348e-07, "loss": -0.04302213340997696, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.25552501529455185, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4389495849609375, "step": 221, "train_speed(iter/s)": 0.003195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 8049.5, "completions/mean_length": 5389.4287109375, "completions/min_length": 1246.5, "epoch": 0.7070063694267515, "grad_norm": 5124.12841796875, "kl": 274.0233154296875, "learning_rate": 2.0202317766802152e-07, "loss": 0.4420851469039917, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.3078143745660782, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4973474591970444, "step": 222, "train_speed(iter/s)": 0.003196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26785714285714285, "completions/max_length": 8044.5, "completions/mean_length": 4609.19677734375, "completions/min_length": 923.5, "epoch": 0.7101910828025477, "grad_norm": 4.193013668060303, "kl": 2.1361083984375, "learning_rate": 1.9796958385675965e-07, "loss": 0.02818513847887516, "memory(GiB)": 176.78, "reward": 0.6071428954601288, "reward_std": 0.2253357544541359, "rewards/AnswerTagAccuracyORM/mean": 0.6071428954601288, "rewards/AnswerTagAccuracyORM/std": 0.4959513247013092, "step": 223, "train_speed(iter/s)": 0.003197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8045.5, "completions/mean_length": 6820.643310546875, "completions/min_length": 1928.0, "epoch": 0.7133757961783439, "grad_norm": 0.7545607089996338, "kl": 0.048828125, "learning_rate": 1.9394700872616853e-07, "loss": -0.3313080370426178, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.45619654655456543, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.47245559096336365, "step": 224, "train_speed(iter/s)": 0.003197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8044.0, "completions/mean_length": 5816.178955078125, "completions/min_length": 1331.5, "epoch": 0.7165605095541401, "grad_norm": 0.531544029712677, "kl": 0.094482421875, "learning_rate": 1.899558653968042e-07, "loss": -0.11124253273010254, "memory(GiB)": 176.78, "reward": 0.4107143133878708, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500482559204, "step": 225, "train_speed(iter/s)": 0.003197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.39285714285714285, "completions/max_length": 8038.5, "completions/mean_length": 5465.143310546875, "completions/min_length": 1681.0, "epoch": 0.7197452229299363, "grad_norm": 0.45902571082115173, "kl": 0.0625, "learning_rate": 1.8599656376116024e-07, "loss": -0.2800550162792206, "memory(GiB)": 176.78, "reward": 0.5000000149011612, "reward_std": 0.3490536957979202, "rewards/AnswerTagAccuracyORM/mean": 0.5000000149011612, "rewards/AnswerTagAccuracyORM/std": 0.49734747409820557, "step": 226, "train_speed(iter/s)": 0.003198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8061.0, "completions/mean_length": 5977.035888671875, "completions/min_length": 1405.5, "epoch": 0.7229299363057324, "grad_norm": 0.40895432233810425, "kl": 0.06298828125, "learning_rate": 1.820695104415721e-07, "loss": -0.07818439602851868, "memory(GiB)": 176.78, "reward": 0.267857164144516, "reward_std": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.44672515988349915, "step": 227, "train_speed(iter/s)": 0.003199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8046.0, "completions/mean_length": 6236.82177734375, "completions/min_length": 2348.5, "epoch": 0.7261146496815286, "grad_norm": 1.3278292417526245, "kl": 0.188232421875, "learning_rate": 1.7817510874845582e-07, "loss": -0.1289130449295044, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.19514649361371994, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968925714493, "step": 228, "train_speed(iter/s)": 0.003199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8058.0, "completions/mean_length": 6219.535888671875, "completions/min_length": 2286.5, "epoch": 0.7292993630573248, "grad_norm": 2.8997809886932373, "kl": 1.2813720703125, "learning_rate": 1.7431375863888898e-07, "loss": -0.2479753941297531, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.32695358991622925, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4489477574825287, "step": 229, "train_speed(iter/s)": 0.0032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8028.5, "completions/mean_length": 5949.357421875, "completions/min_length": 1292.5, "epoch": 0.732484076433121, "grad_norm": 0.37234172224998474, "kl": 0.020263671875, "learning_rate": 1.7048585667553412e-07, "loss": 0.023813849315047264, "memory(GiB)": 176.78, "reward": 0.375, "reward_std": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/mean": 0.375, "rewards/AnswerTagAccuracyORM/std": 0.4750668406486511, "step": 230, "train_speed(iter/s)": 0.0032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8021.5, "completions/mean_length": 5838.39306640625, "completions/min_length": 1460.0, "epoch": 0.7356687898089171, "grad_norm": 0.3578107953071594, "kl": 0.1229248046875, "learning_rate": 1.6669179598591183e-07, "loss": -0.21507693827152252, "memory(GiB)": 176.78, "reward": 0.5535714626312256, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256, "rewards/AnswerTagAccuracyORM/std": 0.49791228771209717, "step": 231, "train_speed(iter/s)": 0.0032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 8058.5, "completions/mean_length": 4920.39306640625, "completions/min_length": 1596.0, "epoch": 0.7388535031847133, "grad_norm": 0.4056342840194702, "kl": 0.05517578125, "learning_rate": 1.6293196622202632e-07, "loss": 0.024090681225061417, "memory(GiB)": 176.78, "reward": 0.517857164144516, "reward_std": 0.2721000760793686, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 232, "train_speed(iter/s)": 0.003201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8033.5, "completions/mean_length": 5438.732421875, "completions/min_length": 2205.0, "epoch": 0.7420382165605095, "grad_norm": 0.5016524195671082, "kl": 0.0782470703125, "learning_rate": 1.592067535203479e-07, "loss": -0.18710020184516907, "memory(GiB)": 176.78, "reward": 0.517857164144516, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.429407000541687, "step": 233, "train_speed(iter/s)": 0.003202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8018.0, "completions/mean_length": 5327.535888671875, "completions/min_length": 1035.5, "epoch": 0.7452229299363057, "grad_norm": 0.6563963294029236, "kl": 0.0595703125, "learning_rate": 1.555165404621567e-07, "loss": -0.38865458965301514, "memory(GiB)": 176.78, "reward": 0.5, "reward_std": 0.40943218767642975, "rewards/AnswerTagAccuracyORM/mean": 0.5, "rewards/AnswerTagAccuracyORM/std": 0.5091750919818878, "step": 234, "train_speed(iter/s)": 0.003202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6607142857142857, "completions/max_length": 8040.5, "completions/mean_length": 6642.339599609375, "completions/min_length": 1762.5, "epoch": 0.7484076433121019, "grad_norm": 0.815308153629303, "kl": 0.0169677734375, "learning_rate": 1.518617060342513e-07, "loss": -0.25266438722610474, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.3078143820166588, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968925714493, "step": 235, "train_speed(iter/s)": 0.003202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8059.5, "completions/mean_length": 5736.250244140625, "completions/min_length": 998.0, "epoch": 0.7515923566878981, "grad_norm": 0.5217832922935486, "kl": 0.04364013671875, "learning_rate": 1.4824262559002592e-07, "loss": -0.21336083114147186, "memory(GiB)": 176.78, "reward": 0.3750000298023224, "reward_std": 0.30228935927152634, "rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224, "rewards/AnswerTagAccuracyORM/std": 0.4628649652004242, "step": 236, "train_speed(iter/s)": 0.003203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8044.5, "completions/mean_length": 5882.714599609375, "completions/min_length": 1346.5, "epoch": 0.7547770700636943, "grad_norm": 0.5135810971260071, "kl": 0.0560302734375, "learning_rate": 1.4465967081092345e-07, "loss": -0.15495836734771729, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.26657506823539734, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.4582767188549042, "step": 237, "train_speed(iter/s)": 0.003203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6428571428571429, "completions/max_length": 8056.5, "completions/mean_length": 6505.57177734375, "completions/min_length": 1349.0, "epoch": 0.7579617834394905, "grad_norm": 0.20377548038959503, "kl": 1.419865668977121e-41, "learning_rate": 1.4111320966826057e-07, "loss": -0.06175333261489868, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.2253357470035553, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4691530168056488, "step": 238, "train_speed(iter/s)": 0.003203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8044.0, "completions/mean_length": 5583.607421875, "completions/min_length": 1573.5, "epoch": 0.7611464968152867, "grad_norm": 0.7509793639183044, "kl": 0.03759765625, "learning_rate": 1.376036063854401e-07, "loss": -0.21666017174720764, "memory(GiB)": 176.78, "reward": 0.4285714477300644, "reward_std": 0.32695360481739044, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.5026109665632248, "step": 239, "train_speed(iter/s)": 0.003204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.39285714285714285, "completions/max_length": 8051.0, "completions/mean_length": 5945.732421875, "completions/min_length": 1600.0, "epoch": 0.7643312101910829, "grad_norm": 0.36760520935058594, "kl": 0.0548095703125, "learning_rate": 1.3413122140054217e-07, "loss": -0.12507101893424988, "memory(GiB)": 176.78, "reward": 0.3571428656578064, "reward_std": 0.2967643365263939, "rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064, "rewards/AnswerTagAccuracyORM/std": 0.48795004189014435, "step": 240, "train_speed(iter/s)": 0.003205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8023.5, "completions/mean_length": 5641.893310546875, "completions/min_length": 1097.5, "epoch": 0.767515923566879, "grad_norm": 0.5026525259017944, "kl": 0.0391845703125, "learning_rate": 1.3069641132930926e-07, "loss": -0.16692417860031128, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.2967643365263939, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.5065638720989227, "step": 241, "train_speed(iter/s)": 0.003206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8050.0, "completions/mean_length": 6501.6787109375, "completions/min_length": 2805.0, "epoch": 0.7707006369426752, "grad_norm": 0.4156489968299866, "kl": 0.023803617145823353, "learning_rate": 1.272995289285202e-07, "loss": -0.18214215338230133, "memory(GiB)": 176.78, "reward": 0.2857142984867096, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096, "rewards/AnswerTagAccuracyORM/std": 0.45290274918079376, "step": 242, "train_speed(iter/s)": 0.003206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8062.0, "completions/mean_length": 6547.01806640625, "completions/min_length": 1906.5, "epoch": 0.7738853503184714, "grad_norm": 0.29880252480506897, "kl": 0.0003601927019190043, "learning_rate": 1.2394092305976272e-07, "loss": -0.12600275874137878, "memory(GiB)": 176.78, "reward": 0.267857164144516, "reward_std": 0.23086078837513924, "rewards/AnswerTagAccuracyORM/mean": 0.267857164144516, "rewards/AnswerTagAccuracyORM/std": 0.40946151316165924, "step": 243, "train_speed(iter/s)": 0.003206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8044.5, "completions/mean_length": 6625.714599609375, "completions/min_length": 1873.5, "epoch": 0.7770700636942676, "grad_norm": 0.6597678661346436, "kl": 556.0035400390625, "learning_rate": 1.2062093865360457e-07, "loss": -0.10461865365505219, "memory(GiB)": 176.78, "reward": 0.160714291036129, "reward_std": 0.29123931378126144, "rewards/AnswerTagAccuracyORM/mean": 0.160714291036129, "rewards/AnswerTagAccuracyORM/std": 0.3731846809387207, "step": 244, "train_speed(iter/s)": 0.003206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8078.0, "completions/mean_length": 5632.14306640625, "completions/min_length": 2093.0, "epoch": 0.7802547770700637, "grad_norm": 0.4589294493198395, "kl": 0.025390625, "learning_rate": 1.1733991667416926e-07, "loss": 0.09096799790859222, "memory(GiB)": 176.78, "reward": 0.5000000298023224, "reward_std": 0.33800365030765533, "rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224, "rewards/AnswerTagAccuracyORM/std": 0.5078744888305664, "step": 245, "train_speed(iter/s)": 0.003207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8042.5, "completions/mean_length": 5813.6611328125, "completions/min_length": 1675.5, "epoch": 0.7834394904458599, "grad_norm": 0.5432755947113037, "kl": 0.048095703125, "learning_rate": 1.1409819408411897e-07, "loss": -0.0925760492682457, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.25552502274513245, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4582767188549042, "step": 246, "train_speed(iter/s)": 0.003207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8064.0, "completions/mean_length": 5805.607421875, "completions/min_length": 1816.5, "epoch": 0.7866242038216561, "grad_norm": 0.5733675360679626, "kl": 0.028795340098440647, "learning_rate": 1.108961038100481e-07, "loss": 0.0018586559453979135, "memory(GiB)": 176.78, "reward": 0.446428582072258, "reward_std": 0.23086076974868774, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4786955714225769, "step": 247, "train_speed(iter/s)": 0.003208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3392857142857143, "completions/max_length": 8042.0, "completions/mean_length": 5231.625244140625, "completions/min_length": 1361.0, "epoch": 0.7898089171974523, "grad_norm": 0.3587133586406708, "kl": 2.521376371383667, "learning_rate": 1.0773397470829143e-07, "loss": -0.1912791132926941, "memory(GiB)": 176.78, "reward": 0.6071428954601288, "reward_std": 0.34905368089675903, "rewards/AnswerTagAccuracyORM/mean": 0.6071428954601288, "rewards/AnswerTagAccuracyORM/std": 0.4609040319919586, "step": 248, "train_speed(iter/s)": 0.003209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8039.0, "completions/mean_length": 5721.500244140625, "completions/min_length": 1892.0, "epoch": 0.7929936305732485, "grad_norm": 0.48776480555534363, "kl": 0.077392578125, "learning_rate": 1.0461213153115079e-07, "loss": -0.07181628048419952, "memory(GiB)": 176.78, "reward": 0.5000000298023224, "reward_std": 0.34905368089675903, "rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224, "rewards/AnswerTagAccuracyORM/std": 0.5078745186328888, "step": 249, "train_speed(iter/s)": 0.00321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8027.5, "completions/mean_length": 5601.21435546875, "completions/min_length": 1040.0, "epoch": 0.7961783439490446, "grad_norm": 0.2700536251068115, "kl": 0.0224609375, "learning_rate": 1.0153089489354256e-07, "loss": -0.038272541016340256, "memory(GiB)": 176.78, "reward": 0.2500000149011612, "reward_std": 0.11266787722706795, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4061589390039444, "step": 250, "train_speed(iter/s)": 0.003211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8045.0, "completions/mean_length": 6220.410888671875, "completions/min_length": 1321.5, "epoch": 0.7993630573248408, "grad_norm": 0.547614574432373, "kl": 0.04052734375, "learning_rate": 9.849058124007043e-08, "loss": -0.17825300991535187, "memory(GiB)": 176.78, "reward": 0.4107142984867096, "reward_std": 0.37371791899204254, "rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096, "rewards/AnswerTagAccuracyORM/std": 0.497912272810936, "step": 251, "train_speed(iter/s)": 0.003211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 8047.5, "completions/mean_length": 6005.39306640625, "completions/min_length": 1652.0, "epoch": 0.802547770700637, "grad_norm": 0.3931257426738739, "kl": 0.0443115234375, "learning_rate": 9.549150281252632e-08, "loss": 0.03433360904455185, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.2253357470035553, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.5078744888305664, "step": 252, "train_speed(iter/s)": 0.003212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8042.5, "completions/mean_length": 6229.875244140625, "completions/min_length": 1784.5, "epoch": 0.8057324840764332, "grad_norm": 0.9034252166748047, "kl": 0.07275390625, "learning_rate": 9.253396761782306e-08, "loss": 0.06343323737382889, "memory(GiB)": 176.78, "reward": 0.3750000149011612, "reward_std": 0.3324786275625229, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.4839591085910797, "step": 253, "train_speed(iter/s)": 0.003212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 8007.0, "completions/mean_length": 4358.8216552734375, "completions/min_length": 924.5, "epoch": 0.8089171974522293, "grad_norm": 0.36683788895606995, "kl": 0.0548095703125, "learning_rate": 8.961827939636196e-08, "loss": -0.12078238278627396, "memory(GiB)": 176.78, "reward": 0.3750000149011612, "reward_std": 0.21981072798371315, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.49264873564243317, "step": 254, "train_speed(iter/s)": 0.003213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.26785714285714285, "completions/max_length": 8039.5, "completions/mean_length": 4816.250244140625, "completions/min_length": 1190.0, "epoch": 0.8121019108280255, "grad_norm": 0.5869824886322021, "kl": 0.0762939453125, "learning_rate": 8.6744737590838e-08, "loss": -0.1321803778409958, "memory(GiB)": 176.78, "reward": 0.5535714328289032, "reward_std": 0.3324786275625229, "rewards/AnswerTagAccuracyORM/mean": 0.5535714328289032, "rewards/AnswerTagAccuracyORM/std": 0.5032612830400467, "step": 255, "train_speed(iter/s)": 0.003214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.30357142857142855, "completions/max_length": 8047.0, "completions/mean_length": 5286.732421875, "completions/min_length": 1331.5, "epoch": 0.8152866242038217, "grad_norm": 0.562877893447876, "kl": 0.0516357421875, "learning_rate": 8.391363731548811e-08, "loss": -0.20328308641910553, "memory(GiB)": 176.78, "reward": 0.5535714477300644, "reward_std": 0.38476796448230743, "rewards/AnswerTagAccuracyORM/mean": 0.5535714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4786955863237381, "step": 256, "train_speed(iter/s)": 0.003215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8021.0, "completions/mean_length": 5327.553955078125, "completions/min_length": 1074.0, "epoch": 0.8184713375796179, "grad_norm": 0.4446698725223541, "kl": 0.0565185546875, "learning_rate": 8.112526932578117e-08, "loss": -0.039517223834991455, "memory(GiB)": 176.78, "reward": 0.4821428656578064, "reward_std": 0.37371791899204254, "rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064, "rewards/AnswerTagAccuracyORM/std": 0.5085247755050659, "step": 257, "train_speed(iter/s)": 0.003215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8030.5, "completions/mean_length": 5912.285888671875, "completions/min_length": 1241.0, "epoch": 0.821656050955414, "grad_norm": 0.6693341732025146, "kl": 0.130615234375, "learning_rate": 7.837991998855897e-08, "loss": -0.08013840764760971, "memory(GiB)": 176.78, "reward": 0.3214285969734192, "reward_std": 0.2967643216252327, "rewards/AnswerTagAccuracyORM/mean": 0.3214285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4609040319919586, "step": 258, "train_speed(iter/s)": 0.003216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8041.5, "completions/mean_length": 5953.500244140625, "completions/min_length": 1419.0, "epoch": 0.8248407643312102, "grad_norm": 0.6183769702911377, "kl": 0.085205078125, "learning_rate": 7.567787125262449e-08, "loss": -0.2941018044948578, "memory(GiB)": 176.78, "reward": 0.3571428805589676, "reward_std": 0.19514649361371994, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48199816048145294, "step": 259, "train_speed(iter/s)": 0.003216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8020.0, "completions/mean_length": 6054.143310546875, "completions/min_length": 1255.0, "epoch": 0.8280254777070064, "grad_norm": 0.35015594959259033, "kl": 0.024169921875, "learning_rate": 7.301940061978722e-08, "loss": 0.021139614284038544, "memory(GiB)": 176.78, "reward": 0.2321428656578064, "reward_std": 0.29123930633068085, "rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064, "rewards/AnswerTagAccuracyORM/std": 0.425032377243042, "step": 260, "train_speed(iter/s)": 0.003215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 8042.5, "completions/mean_length": 6572.178955078125, "completions/min_length": 2292.5, "epoch": 0.8312101910828026, "grad_norm": 0.41886037588119507, "kl": 0.01409912109375, "learning_rate": 7.040478111636228e-08, "loss": -0.14545117318630219, "memory(GiB)": 176.78, "reward": 0.2678571492433548, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.45050114393234253, "step": 261, "train_speed(iter/s)": 0.003215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8034.5, "completions/mean_length": 5691.9287109375, "completions/min_length": 1192.0, "epoch": 0.8343949044585988, "grad_norm": 1.4889556169509888, "kl": 0.3389892578125, "learning_rate": 6.783428126513125e-08, "loss": -0.09831805527210236, "memory(GiB)": 176.78, "reward": 0.2142857201397419, "reward_std": 0.19514648616313934, "rewards/AnswerTagAccuracyORM/mean": 0.2142857201397419, "rewards/AnswerTagAccuracyORM/std": 0.37510764598846436, "step": 262, "train_speed(iter/s)": 0.003216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8028.0, "completions/mean_length": 5496.607421875, "completions/min_length": 1581.0, "epoch": 0.8375796178343949, "grad_norm": 0.2921965718269348, "kl": 0.0533447265625, "learning_rate": 6.530816505776443e-08, "loss": -0.031957417726516724, "memory(GiB)": 176.78, "reward": 0.4107143133878708, "reward_std": 0.1896214708685875, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.4839591085910797, "step": 263, "train_speed(iter/s)": 0.003217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8054.5, "completions/mean_length": 5091.857421875, "completions/min_length": 1231.0, "epoch": 0.8407643312101911, "grad_norm": 0.24407930672168732, "kl": 0.06640625, "learning_rate": 6.282669192770895e-08, "loss": -0.011408509686589241, "memory(GiB)": 176.78, "reward": 0.5000000149011612, "reward_std": 0.19514649361371994, "rewards/AnswerTagAccuracyORM/mean": 0.5000000149011612, "rewards/AnswerTagAccuracyORM/std": 0.460043728351593, "step": 264, "train_speed(iter/s)": 0.003217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.7321428571428572, "completions/max_length": 8064.0, "completions/mean_length": 7200.107666015625, "completions/min_length": 3825.5, "epoch": 0.8439490445859873, "grad_norm": 0.6117250919342041, "kl": 0.03643798828125, "learning_rate": 6.039011672354455e-08, "loss": -0.25999927520751953, "memory(GiB)": 176.78, "reward": 0.2500000149011612, "reward_std": 0.3078143745660782, "rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612, "rewards/AnswerTagAccuracyORM/std": 0.43280795216560364, "step": 265, "train_speed(iter/s)": 0.003217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 8020.5, "completions/mean_length": 4265.500244140625, "completions/min_length": 979.0, "epoch": 0.8471337579617835, "grad_norm": 8.540395736694336, "kl": 3.001220703125, "learning_rate": 5.799868968281074e-08, "loss": -0.06987228244543076, "memory(GiB)": 176.78, "reward": 0.6250000298023224, "reward_std": 0.14838216453790665, "rewards/AnswerTagAccuracyORM/mean": 0.6250000298023224, "rewards/AnswerTagAccuracyORM/std": 0.4897737503051758, "step": 266, "train_speed(iter/s)": 0.003219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8050.5, "completions/mean_length": 6214.964599609375, "completions/min_length": 1640.0, "epoch": 0.8503184713375797, "grad_norm": 0.4220220446586609, "kl": 0.03466796875, "learning_rate": 5.565265640630723e-08, "loss": -0.0669555515050888, "memory(GiB)": 176.78, "reward": 0.1785714328289032, "reward_std": 0.2253357470035553, "rewards/AnswerTagAccuracyORM/mean": 0.1785714328289032, "rewards/AnswerTagAccuracyORM/std": 0.37796446681022644, "step": 267, "train_speed(iter/s)": 0.003219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 8021.0, "completions/mean_length": 5096.89306640625, "completions/min_length": 1902.5, "epoch": 0.8535031847133758, "grad_norm": 562.29443359375, "kl": 174.029296875, "learning_rate": 5.335225783287051e-08, "loss": 0.30988335609436035, "memory(GiB)": 176.78, "reward": 0.446428582072258, "reward_std": 0.2610500454902649, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.5032612681388855, "step": 268, "train_speed(iter/s)": 0.00322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8059.5, "completions/mean_length": 5627.44677734375, "completions/min_length": 1379.0, "epoch": 0.856687898089172, "grad_norm": 0.6505483984947205, "kl": 0.04296875, "learning_rate": 5.109773021462921e-08, "loss": -0.24035362899303436, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.3435286581516266, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 269, "train_speed(iter/s)": 0.00322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8059.0, "completions/mean_length": 5999.339599609375, "completions/min_length": 2076.0, "epoch": 0.8598726114649682, "grad_norm": 1.334702730178833, "kl": 1.001953125, "learning_rate": 4.888930509274125e-08, "loss": 0.008383408188819885, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.2253357619047165, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968925714493, "step": 270, "train_speed(iter/s)": 0.00322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8038.5, "completions/mean_length": 5342.410888671875, "completions/min_length": 1213.5, "epoch": 0.8630573248407644, "grad_norm": 0.2792721390724182, "kl": 0.041748046875, "learning_rate": 4.6727209273614124e-08, "loss": -0.08284494280815125, "memory(GiB)": 176.78, "reward": 0.4107143133878708, "reward_std": 0.21981074661016464, "rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708, "rewards/AnswerTagAccuracyORM/std": 0.5006500333547592, "step": 271, "train_speed(iter/s)": 0.003221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 8046.0, "completions/mean_length": 5400.964599609375, "completions/min_length": 739.0, "epoch": 0.8662420382165605, "grad_norm": 0.5285795331001282, "kl": 21.917213439941406, "learning_rate": 4.4611664805611794e-08, "loss": -0.2615126073360443, "memory(GiB)": 176.78, "reward": 0.446428582072258, "reward_std": 0.2610500529408455, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.5032612979412079, "step": 272, "train_speed(iter/s)": 0.003221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 8027.5, "completions/mean_length": 5324.553955078125, "completions/min_length": 855.0, "epoch": 0.8694267515923567, "grad_norm": 0.3686632513999939, "kl": 0.06591796875, "learning_rate": 4.2542888956250464e-08, "loss": -0.04334472492337227, "memory(GiB)": 176.78, "reward": 0.5535714626312256, "reward_std": 0.30228933691978455, "rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256, "rewards/AnswerTagAccuracyORM/std": 0.5059135854244232, "step": 273, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8016.5, "completions/mean_length": 5866.160888671875, "completions/min_length": 1242.5, "epoch": 0.8726114649681529, "grad_norm": 0.23201723396778107, "kl": 0.03265380859375, "learning_rate": 4.0521094189884696e-08, "loss": -0.03820869326591492, "memory(GiB)": 176.78, "reward": 0.2678571566939354, "reward_std": 0.14838216453790665, "rewards/AnswerTagAccuracyORM/mean": 0.2678571566939354, "rewards/AnswerTagAccuracyORM/std": 0.4268478900194168, "step": 274, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5892857142857143, "completions/max_length": 8049.5, "completions/mean_length": 6177.125244140625, "completions/min_length": 1355.0, "epoch": 0.8757961783439491, "grad_norm": 0.4513340890407562, "kl": 0.0380859375, "learning_rate": 3.8546488145887624e-08, "loss": -0.08043000847101212, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.3078143745660782, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4489477574825287, "step": 275, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6964285714285714, "completions/max_length": 8028.0, "completions/mean_length": 6920.803955078125, "completions/min_length": 2485.0, "epoch": 0.8789808917197452, "grad_norm": 1.4028353691101074, "kl": 0.6268310546875, "learning_rate": 3.6619273617325695e-08, "loss": -0.16914184391498566, "memory(GiB)": 176.78, "reward": 0.232142873108387, "reward_std": 0.23086076974868774, "rewards/AnswerTagAccuracyORM/mean": 0.232142873108387, "rewards/AnswerTagAccuracyORM/std": 0.4159715920686722, "step": 276, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8043.0, "completions/mean_length": 6354.357421875, "completions/min_length": 1492.5, "epoch": 0.8821656050955414, "grad_norm": 0.5193430185317993, "kl": 1.416993007125255e-41, "learning_rate": 3.473964853013273e-08, "loss": -0.09646855294704437, "memory(GiB)": 176.78, "reward": 0.3035714477300644, "reward_std": 0.30228933691978455, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.46781928837299347, "step": 277, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8023.0, "completions/mean_length": 5800.464599609375, "completions/min_length": 1759.0, "epoch": 0.8853503184713376, "grad_norm": 1.6207529306411743, "kl": 0.4185791015625, "learning_rate": 3.2907805922781476e-08, "loss": -0.2863817811012268, "memory(GiB)": 176.78, "reward": 0.4285714626312256, "reward_std": 0.26657505333423615, "rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256, "rewards/AnswerTagAccuracyORM/std": 0.5039526224136353, "step": 278, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8027.5, "completions/mean_length": 6033.375244140625, "completions/min_length": 1543.0, "epoch": 0.8885350318471338, "grad_norm": 0.6210139989852905, "kl": 0.0206298828125, "learning_rate": 3.1123933926459845e-08, "loss": -0.1716410219669342, "memory(GiB)": 176.78, "reward": 0.446428582072258, "reward_std": 0.3324785977602005, "rewards/AnswerTagAccuracyORM/mean": 0.446428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4786955863237381, "step": 279, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 8034.0, "completions/mean_length": 6295.035888671875, "completions/min_length": 1541.5, "epoch": 0.89171974522293, "grad_norm": 0.38655340671539307, "kl": 0.0438232421875, "learning_rate": 2.9388215745748345e-08, "loss": -0.10300473123788834, "memory(GiB)": 176.78, "reward": 0.2142857238650322, "reward_std": 0.2967643216252327, "rewards/AnswerTagAccuracyORM/mean": 0.2142857238650322, "rewards/AnswerTagAccuracyORM/std": 0.40819603204727173, "step": 280, "train_speed(iter/s)": 0.003222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8059.0, "completions/mean_length": 6408.1611328125, "completions/min_length": 1532.0, "epoch": 0.8949044585987261, "grad_norm": 0.5052747130393982, "kl": 0.039306640625, "learning_rate": 2.7700829639806465e-08, "loss": -0.17020417749881744, "memory(GiB)": 176.78, "reward": 0.321428582072258, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.321428582072258, "rewards/AnswerTagAccuracyORM/std": 0.4739968776702881, "step": 281, "train_speed(iter/s)": 0.003223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8031.5, "completions/mean_length": 5205.26806640625, "completions/min_length": 1111.0, "epoch": 0.8980891719745223, "grad_norm": 0.7017294764518738, "kl": 0.04931640625, "learning_rate": 2.6061948904063658e-08, "loss": -0.0019177369540557265, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.33800363540649414, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.5078744888305664, "step": 282, "train_speed(iter/s)": 0.003223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8032.0, "completions/mean_length": 6105.125244140625, "completions/min_length": 1046.5, "epoch": 0.9012738853503185, "grad_norm": 0.5392616391181946, "kl": 0.0572509765625, "learning_rate": 2.4471741852423233e-08, "loss": -0.040537793189287186, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.33800365030765533, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.49173468351364136, "step": 283, "train_speed(iter/s)": 0.003224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8060.0, "completions/mean_length": 6067.14306640625, "completions/min_length": 745.5, "epoch": 0.9044585987261147, "grad_norm": 0.5731423497200012, "kl": 0.05810546875, "learning_rate": 2.293037179997559e-08, "loss": -0.07903746515512466, "memory(GiB)": 176.78, "reward": 0.4642857313156128, "reward_std": 0.33800363540649414, "rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48647116124629974, "step": 284, "train_speed(iter/s)": 0.003223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8035.0, "completions/mean_length": 5818.035888671875, "completions/min_length": 1439.0, "epoch": 0.9076433121019108, "grad_norm": 0.6655234694480896, "kl": 0.0400390625, "learning_rate": 2.1437997046226008e-08, "loss": -0.2412337064743042, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.2610500380396843, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.48177245259284973, "step": 285, "train_speed(iter/s)": 0.003224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5714285714285714, "completions/max_length": 8061.0, "completions/mean_length": 6306.19677734375, "completions/min_length": 1288.0, "epoch": 0.910828025477707, "grad_norm": 0.8818116188049316, "kl": 0.0340576171875, "learning_rate": 1.9994770858837107e-08, "loss": -0.10960516333580017, "memory(GiB)": 176.78, "reward": 0.3571428805589676, "reward_std": 0.26657506078481674, "rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676, "rewards/AnswerTagAccuracyORM/std": 0.48199817538261414, "step": 286, "train_speed(iter/s)": 0.003224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8053.0, "completions/mean_length": 6440.303955078125, "completions/min_length": 1986.5, "epoch": 0.9140127388535032, "grad_norm": 0.39754489064216614, "kl": 0.03631591796875, "learning_rate": 1.860084145788826e-08, "loss": 0.06631383299827576, "memory(GiB)": 176.78, "reward": 0.2678571492433548, "reward_std": 0.3324785977602005, "rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548, "rewards/AnswerTagAccuracyORM/std": 0.43898552656173706, "step": 287, "train_speed(iter/s)": 0.003224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 8059.0, "completions/mean_length": 5826.82177734375, "completions/min_length": 1675.0, "epoch": 0.9171974522292994, "grad_norm": 0.6225547194480896, "kl": 0.0421142578125, "learning_rate": 1.725635200065323e-08, "loss": -0.3206044137477875, "memory(GiB)": 176.78, "reward": 0.4285714477300644, "reward_std": 0.33800363540649414, "rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644, "rewards/AnswerTagAccuracyORM/std": 0.502610981464386, "step": 288, "train_speed(iter/s)": 0.003224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.39285714285714285, "completions/max_length": 8045.0, "completions/mean_length": 5416.107177734375, "completions/min_length": 1183.5, "epoch": 0.9203821656050956, "grad_norm": 0.47386959195137024, "kl": 0.115966796875, "learning_rate": 1.596144056689791e-08, "loss": -0.10028056055307388, "memory(GiB)": 176.78, "reward": 0.4821428954601288, "reward_std": 0.2610500305891037, "rewards/AnswerTagAccuracyORM/mean": 0.4821428954601288, "rewards/AnswerTagAccuracyORM/std": 0.4817724674940109, "step": 289, "train_speed(iter/s)": 0.003225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5178571428571429, "completions/max_length": 8044.0, "completions/mean_length": 6445.464599609375, "completions/min_length": 1398.5, "epoch": 0.9235668789808917, "grad_norm": 0.49086901545524597, "kl": 0.0467529296875, "learning_rate": 1.4716240144699187e-08, "loss": -0.14612972736358643, "memory(GiB)": 176.78, "reward": 0.2142857313156128, "reward_std": 0.2253357470035553, "rewards/AnswerTagAccuracyORM/mean": 0.2142857313156128, "rewards/AnswerTagAccuracyORM/std": 0.39528264105319977, "step": 290, "train_speed(iter/s)": 0.003225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 8029.5, "completions/mean_length": 5734.375244140625, "completions/min_length": 844.5, "epoch": 0.9267515923566879, "grad_norm": 0.5175443291664124, "kl": 0.0667724609375, "learning_rate": 1.3520878616787523e-08, "loss": 0.018897494301199913, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.3681929111480713, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.4744165241718292, "step": 291, "train_speed(iter/s)": 0.003225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.42857142857142855, "completions/max_length": 8033.0, "completions/mean_length": 5356.482421875, "completions/min_length": 1571.0, "epoch": 0.9299363057324841, "grad_norm": 0.33042165637016296, "kl": 0.0430908203125, "learning_rate": 1.2375478747413015e-08, "loss": 0.04050120711326599, "memory(GiB)": 176.78, "reward": 0.5000000298023224, "reward_std": 0.2967643141746521, "rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224, "rewards/AnswerTagAccuracyORM/std": 0.5078744888305664, "step": 292, "train_speed(iter/s)": 0.003226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 8054.5, "completions/mean_length": 5093.76806640625, "completions/min_length": 1073.5, "epoch": 0.9331210191082803, "grad_norm": 0.8467631936073303, "kl": 0.052978515625, "learning_rate": 1.1280158169737265e-08, "loss": -0.26039645075798035, "memory(GiB)": 176.78, "reward": 0.5535714626312256, "reward_std": 0.3324785977602005, "rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256, "rewards/AnswerTagAccuracyORM/std": 0.5059135556221008, "step": 293, "train_speed(iter/s)": 0.003226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3392857142857143, "completions/max_length": 8050.0, "completions/mean_length": 4675.946533203125, "completions/min_length": 587.0, "epoch": 0.9363057324840764, "grad_norm": 0.6540654301643372, "kl": 0.0511474609375, "learning_rate": 1.0235029373752757e-08, "loss": -0.02355077676475048, "memory(GiB)": 176.78, "reward": 0.4464285969734192, "reward_std": 0.3324786126613617, "rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192, "rewards/AnswerTagAccuracyORM/std": 0.4979122579097748, "step": 294, "train_speed(iter/s)": 0.003227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5535714285714286, "completions/max_length": 8030.5, "completions/mean_length": 6136.69677734375, "completions/min_length": 1404.0, "epoch": 0.9394904458598726, "grad_norm": 0.40905487537384033, "kl": 0.0523681640625, "learning_rate": 9.240199694729944e-09, "loss": 0.014087937772274017, "memory(GiB)": 176.78, "reward": 0.392857164144516, "reward_std": 0.18409644439816475, "rewards/AnswerTagAccuracyORM/mean": 0.392857164144516, "rewards/AnswerTagAccuracyORM/std": 0.49173468351364136, "step": 295, "train_speed(iter/s)": 0.003227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8051.0, "completions/mean_length": 5592.5537109375, "completions/min_length": 1196.0, "epoch": 0.9426751592356688, "grad_norm": 0.3601033091545105, "kl": 0.0228271484375, "learning_rate": 8.295771302193721e-09, "loss": -0.025663327425718307, "memory(GiB)": 176.78, "reward": 0.2857143059372902, "reward_std": 0.11266788095235825, "rewards/AnswerTagAccuracyORM/mean": 0.2857143059372902, "rewards/AnswerTagAccuracyORM/std": 0.43015047907829285, "step": 296, "train_speed(iter/s)": 0.003227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 8053.0, "completions/mean_length": 5372.053955078125, "completions/min_length": 1333.0, "epoch": 0.945859872611465, "grad_norm": 0.5271080732345581, "kl": 0.040283203125, "learning_rate": 7.401841189430657e-09, "loss": -0.12217244505882263, "memory(GiB)": 176.78, "reward": 0.3750000149011612, "reward_std": 0.2721000909805298, "rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612, "rewards/AnswerTagAccuracyORM/std": 0.49264876544475555, "step": 297, "train_speed(iter/s)": 0.003227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 8052.0, "completions/mean_length": 5935.625244140625, "completions/min_length": 1075.5, "epoch": 0.9490445859872612, "grad_norm": 0.5965413451194763, "kl": 0.05285042445757426, "learning_rate": 6.558501163527963e-09, "loss": -0.2230261266231537, "memory(GiB)": 176.78, "reward": 0.3392857313156128, "reward_std": 0.2721000760793686, "rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128, "rewards/AnswerTagAccuracyORM/std": 0.4628649652004242, "step": 298, "train_speed(iter/s)": 0.003227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 8037.0, "completions/mean_length": 5281.518310546875, "completions/min_length": 893.0, "epoch": 0.9522292993630573, "grad_norm": 0.32658663392066956, "kl": 0.07177734375, "learning_rate": 5.765837835944309e-09, "loss": 0.05634969845414162, "memory(GiB)": 176.78, "reward": 0.517857164144516, "reward_std": 0.2610500529408455, "rewards/AnswerTagAccuracyORM/mean": 0.517857164144516, "rewards/AnswerTagAccuracyORM/std": 0.5059135854244232, "step": 299, "train_speed(iter/s)": 0.003226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6071428571428572, "completions/max_length": 8031.0, "completions/mean_length": 6464.035888671875, "completions/min_length": 1501.5, "epoch": 0.9554140127388535, "grad_norm": 0.33558884263038635, "kl": 1.261028488045903e-41, "learning_rate": 5.023932613615445e-09, "loss": -0.09524659812450409, "memory(GiB)": 176.78, "reward": 0.3035714477300644, "reward_std": 0.3435286656022072, "rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644, "rewards/AnswerTagAccuracyORM/std": 0.4576014429330826, "step": 300, "train_speed(iter/s)": 0.003225 } ], "logging_steps": 1, "max_steps": 314, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }