{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.5, "completions/mean_length": 571.30859375, "completions/min_length": 264.5, "epoch": 0.02, "grad_norm": 1.2956373691558838, "kl": 0.0006160736083984375, "learning_rate": 2e-07, "loss": 0.11099594086408615, "memory(GiB)": 18.17, "reward": 0.18179254233837128, "reward_std": 0.021205796860158443, "rewards/MCQ_Reward/mean": 0.18179254233837128, "rewards/MCQ_Reward/std": 0.0575394481420517, "step": 1, "train_speed(iter/s)": 0.017384 }, { "clip_ratio": 0.0, "epoch": 0.04, "grad_norm": 1.2956030368804932, "kl": 0.0006160736083984375, "learning_rate": 4e-07, "loss": 0.11099594086408615, "memory(GiB)": 18.17, "step": 2, "train_speed(iter/s)": 0.033769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/mean_length": 582.2890625, "completions/min_length": 126.5, "epoch": 0.06, "grad_norm": 1.1973260641098022, "kl": 0.00061798095703125, "learning_rate": 6e-07, "loss": 0.09401366859674454, "memory(GiB)": 18.17, "reward": 0.1757229119539261, "reward_std": 0.02308646310120821, "rewards/MCQ_Reward/mean": 0.1757229119539261, "rewards/MCQ_Reward/std": 0.06555243954062462, "step": 3, "train_speed(iter/s)": 0.029478 }, { "clip_ratio": 0.0011098573449999094, "epoch": 0.08, "grad_norm": 1.206025242805481, "kl": 0.0006008148193359375, "learning_rate": 8e-07, "loss": 0.09423406422138214, "memory(GiB)": 18.17, "step": 4, "train_speed(iter/s)": 0.038797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/mean_length": 587.22265625, "completions/min_length": 50.0, "epoch": 0.1, "grad_norm": 1.1425890922546387, "kl": 0.0006389617919921875, "learning_rate": 1e-06, "loss": 0.10835893452167511, "memory(GiB)": 18.17, "reward": 0.20135290175676346, "reward_std": 0.026336468756198883, "rewards/MCQ_Reward/mean": 0.20135290175676346, "rewards/MCQ_Reward/std": 0.04013596661388874, "step": 5, "train_speed(iter/s)": 0.033455 }, { "clip_ratio": 0.000744842371204868, "epoch": 0.12, "grad_norm": 1.1426688432693481, "kl": 0.0006389617919921875, "learning_rate": 9.999899300364532e-07, "loss": 0.10809706896543503, "memory(GiB)": 18.17, "step": 6, "train_speed(iter/s)": 0.039768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/mean_length": 554.33203125, "completions/min_length": 187.5, "epoch": 0.14, "grad_norm": 1.2598297595977783, "kl": 0.000637054443359375, "learning_rate": 9.999597205514296e-07, "loss": 0.10747133195400238, "memory(GiB)": 18.17, "reward": 0.18709591031074524, "reward_std": 0.022870728746056557, "rewards/MCQ_Reward/mean": 0.18709591031074524, "rewards/MCQ_Reward/std": 0.061255430802702904, "step": 7, "train_speed(iter/s)": 0.036272 }, { "clip_ratio": 0.0011600544094108045, "epoch": 0.16, "grad_norm": 1.2500499486923218, "kl": 0.0007114410400390625, "learning_rate": 9.999093727617628e-07, "loss": 0.10704316943883896, "memory(GiB)": 18.17, "step": 8, "train_speed(iter/s)": 0.041177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.5, "completions/mean_length": 562.61328125, "completions/min_length": 231.5, "epoch": 0.18, "grad_norm": 1.4137037992477417, "kl": 0.00092315673828125, "learning_rate": 9.998388886954545e-07, "loss": 0.1194264367222786, "memory(GiB)": 18.17, "reward": 0.20057281106710434, "reward_std": 0.02457202784717083, "rewards/MCQ_Reward/mean": 0.20057281106710434, "rewards/MCQ_Reward/std": 0.0581410713493824, "step": 9, "train_speed(iter/s)": 0.037627 }, { "clip_ratio": 0.0008636733400635421, "epoch": 0.2, "grad_norm": 1.4122164249420166, "kl": 0.001087188720703125, "learning_rate": 9.997482711915925e-07, "loss": 0.11916504055261612, "memory(GiB)": 18.17, "step": 10, "train_speed(iter/s)": 0.041584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/mean_length": 545.3203125, "completions/min_length": 13.0, "epoch": 0.22, "grad_norm": 1.1587789058685303, "kl": 0.001285552978515625, "learning_rate": 9.996375239002368e-07, "loss": 0.06654135137796402, "memory(GiB)": 18.17, "reward": 0.18803076446056366, "reward_std": 0.027116701006889343, "rewards/MCQ_Reward/mean": 0.18803076446056366, "rewards/MCQ_Reward/std": 0.06116201728582382, "step": 11, "train_speed(iter/s)": 0.037797 }, { "clip_ratio": 0.0012727798894047737, "epoch": 0.24, "grad_norm": 1.1393318176269531, "kl": 0.001819610595703125, "learning_rate": 9.995066512822718e-07, "loss": 0.0661393254995346, "memory(GiB)": 18.17, "step": 12, "train_speed(iter/s)": 0.041011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 502.984375, "completions/min_length": 181.5, "epoch": 0.26, "grad_norm": 1.3736039400100708, "kl": 0.00341796875, "learning_rate": 9.99355658609228e-07, "loss": 0.09961968660354614, "memory(GiB)": 18.17, "reward": 0.2046608179807663, "reward_std": 0.02339835651218891, "rewards/MCQ_Reward/mean": 0.2046608179807663, "rewards/MCQ_Reward/std": 0.07441236078739166, "step": 13, "train_speed(iter/s)": 0.038941 }, { "clip_ratio": 0.0013542931410484016, "epoch": 0.28, "grad_norm": 1.341399073600769, "kl": 0.004730224609375, "learning_rate": 9.991845519630676e-07, "loss": 0.09878668189048767, "memory(GiB)": 18.17, "step": 14, "train_speed(iter/s)": 0.041763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 479.08984375, "completions/min_length": 201.5, "epoch": 0.3, "grad_norm": 1.2583457231521606, "kl": 0.005706787109375, "learning_rate": 9.989933382359422e-07, "loss": 0.09561844170093536, "memory(GiB)": 18.17, "reward": 0.23959992825984955, "reward_std": 0.024829759262502193, "rewards/MCQ_Reward/mean": 0.23959992825984955, "rewards/MCQ_Reward/std": 0.059385696426033974, "step": 15, "train_speed(iter/s)": 0.040033 }, { "clip_ratio": 0.0012090829550288618, "epoch": 0.32, "grad_norm": 1.2485970258712769, "kl": 0.0069122314453125, "learning_rate": 9.98782025129912e-07, "loss": 0.09502086043357849, "memory(GiB)": 18.17, "step": 16, "train_speed(iter/s)": 0.042555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/mean_length": 446.19140625, "completions/min_length": 186.5, "epoch": 0.34, "grad_norm": 1.4837766885757446, "kl": 0.0080718994140625, "learning_rate": 9.985506211566386e-07, "loss": 0.11237534880638123, "memory(GiB)": 18.17, "reward": 0.204755961894989, "reward_std": 0.025960725732147694, "rewards/MCQ_Reward/mean": 0.204755961894989, "rewards/MCQ_Reward/std": 0.05882856249809265, "step": 17, "train_speed(iter/s)": 0.041421 }, { "clip_ratio": 0.0012163713108748198, "epoch": 0.36, "grad_norm": 1.4663207530975342, "kl": 0.00933837890625, "learning_rate": 9.982991356370403e-07, "loss": 0.11209464073181152, "memory(GiB)": 18.17, "step": 18, "train_speed(iter/s)": 0.043701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/mean_length": 451.41015625, "completions/min_length": 101.0, "epoch": 0.38, "grad_norm": 1.2070645093917847, "kl": 0.010772705078125, "learning_rate": 9.98027578700917e-07, "loss": 0.0659424215555191, "memory(GiB)": 18.17, "reward": 0.18814751505851746, "reward_std": 0.024471789598464966, "rewards/MCQ_Reward/mean": 0.18814751505851746, "rewards/MCQ_Reward/std": 0.062104713171720505, "step": 19, "train_speed(iter/s)": 0.042657 }, { "clip_ratio": 0.0017630973597988486, "epoch": 0.4, "grad_norm": 1.1632057428359985, "kl": 0.014007568359375, "learning_rate": 9.977359612865422e-07, "loss": 0.0650935024023056, "memory(GiB)": 18.17, "step": 20, "train_speed(iter/s)": 0.044775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 392.30078125, "completions/min_length": 84.0, "epoch": 0.42, "grad_norm": 1.313915491104126, "kl": 0.019775390625, "learning_rate": 9.974242951402235e-07, "loss": 0.07705788314342499, "memory(GiB)": 18.17, "reward": 0.23380683362483978, "reward_std": 0.03150738961994648, "rewards/MCQ_Reward/mean": 0.23380683362483978, "rewards/MCQ_Reward/std": 0.057576023042201996, "step": 21, "train_speed(iter/s)": 0.043224 }, { "clip_ratio": 0.0028022455517202616, "epoch": 0.44, "grad_norm": 1.242121934890747, "kl": 0.02642822265625, "learning_rate": 9.970925928158272e-07, "loss": 0.07613129168748856, "memory(GiB)": 18.17, "step": 22, "train_speed(iter/s)": 0.045118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 355.48828125, "completions/min_length": 144.0, "epoch": 0.46, "grad_norm": 1.3318829536437988, "kl": 0.034423828125, "learning_rate": 9.967408676742751e-07, "loss": 0.07269842177629471, "memory(GiB)": 18.17, "reward": 0.22312550246715546, "reward_std": 0.031231535598635674, "rewards/MCQ_Reward/mean": 0.22312550246715546, "rewards/MCQ_Reward/std": 0.05438939481973648, "step": 23, "train_speed(iter/s)": 0.044616 }, { "clip_ratio": 0.0020711172837764025, "epoch": 0.48, "grad_norm": 1.2974779605865479, "kl": 0.0413818359375, "learning_rate": 9.963691338830042e-07, "loss": 0.07173984497785568, "memory(GiB)": 18.17, "step": 24, "train_speed(iter/s)": 0.046444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.5, "completions/mean_length": 318.5234375, "completions/min_length": 92.0, "epoch": 0.5, "grad_norm": 1.397636890411377, "kl": 0.047119140625, "learning_rate": 9.959774064153975e-07, "loss": 0.03884683549404144, "memory(GiB)": 18.17, "reward": 0.23498350381851196, "reward_std": 0.03053601924329996, "rewards/MCQ_Reward/mean": 0.23498350381851196, "rewards/MCQ_Reward/std": 0.05711263045668602, "step": 25, "train_speed(iter/s)": 0.045888 }, { "clip_ratio": 0.0013737165136262774, "epoch": 0.52, "grad_norm": 1.379469394683838, "kl": 0.052734375, "learning_rate": 9.955657010501806e-07, "loss": 0.038122277706861496, "memory(GiB)": 18.17, "step": 26, "train_speed(iter/s)": 0.047611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.5, "completions/mean_length": 293.42578125, "completions/min_length": 110.5, "epoch": 0.54, "grad_norm": 1.3771414756774902, "kl": 0.0574951171875, "learning_rate": 9.95134034370785e-07, "loss": 0.05064291134476662, "memory(GiB)": 18.17, "reward": 0.257246270775795, "reward_std": 0.03051395993679762, "rewards/MCQ_Reward/mean": 0.257246270775795, "rewards/MCQ_Reward/std": 0.05405682139098644, "step": 27, "train_speed(iter/s)": 0.046967 }, { "clip_ratio": 0.0015082518220879138, "epoch": 0.56, "grad_norm": 1.3394073247909546, "kl": 0.063720703125, "learning_rate": 9.946824237646824e-07, "loss": 0.04972712695598602, "memory(GiB)": 18.17, "step": 28, "train_speed(iter/s)": 0.048554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.5, "completions/mean_length": 259.3515625, "completions/min_length": 76.0, "epoch": 0.58, "grad_norm": 1.4677767753601074, "kl": 0.070556640625, "learning_rate": 9.94210887422681e-07, "loss": -0.01695432886481285, "memory(GiB)": 18.17, "reward": 0.25767549127340317, "reward_std": 0.03901047818362713, "rewards/MCQ_Reward/mean": 0.25767549127340317, "rewards/MCQ_Reward/std": 0.05495491810142994, "step": 29, "train_speed(iter/s)": 0.048377 }, { "clip_ratio": 0.001286374346818775, "epoch": 0.6, "grad_norm": 1.4747378826141357, "kl": 0.076904296875, "learning_rate": 9.93719444338197e-07, "loss": -0.017460569739341736, "memory(GiB)": 18.17, "step": 30, "train_speed(iter/s)": 0.04994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.5, "completions/mean_length": 250.26171875, "completions/min_length": 96.5, "epoch": 0.62, "grad_norm": 1.6029585599899292, "kl": 0.07763671875, "learning_rate": 9.932081143064858e-07, "loss": 0.042436983436346054, "memory(GiB)": 18.17, "reward": 0.23062269389629364, "reward_std": 0.036025889217853546, "rewards/MCQ_Reward/mean": 0.23062269389629364, "rewards/MCQ_Reward/std": 0.0671730749309063, "step": 31, "train_speed(iter/s)": 0.048974 }, { "clip_ratio": 0.00158036028733477, "epoch": 0.64, "grad_norm": 1.5435467958450317, "kl": 0.08349609375, "learning_rate": 9.926769179238464e-07, "loss": 0.04148583859205246, "memory(GiB)": 18.17, "step": 32, "train_speed(iter/s)": 0.050428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.5, "completions/mean_length": 246.3984375, "completions/min_length": 89.0, "epoch": 0.66, "grad_norm": 1.466068983078003, "kl": 0.093994140625, "learning_rate": 9.921258765867919e-07, "loss": 0.008220436982810497, "memory(GiB)": 18.17, "reward": 0.22424693405628204, "reward_std": 0.03309958428144455, "rewards/MCQ_Reward/mean": 0.22424693405628204, "rewards/MCQ_Reward/std": 0.06848622299730778, "step": 33, "train_speed(iter/s)": 0.050299 }, { "clip_ratio": 0.0012578482856042683, "epoch": 0.68, "grad_norm": 1.4434019327163696, "kl": 0.10009765625, "learning_rate": 9.915550124911866e-07, "loss": 0.007482614368200302, "memory(GiB)": 18.17, "step": 34, "train_speed(iter/s)": 0.051722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/mean_length": 226.12109375, "completions/min_length": 47.5, "epoch": 0.7, "grad_norm": 1.529449224472046, "kl": 0.10546875, "learning_rate": 9.909643486313533e-07, "loss": -0.024700753390789032, "memory(GiB)": 18.17, "reward": 0.24431276321411133, "reward_std": 0.03709370456635952, "rewards/MCQ_Reward/mean": 0.24431276321411133, "rewards/MCQ_Reward/std": 0.06565525010228157, "step": 35, "train_speed(iter/s)": 0.051572 }, { "clip_ratio": 0.0013001365587115288, "epoch": 0.72, "grad_norm": 1.524826169013977, "kl": 0.110595703125, "learning_rate": 9.903539087991461e-07, "loss": -0.025061530992388725, "memory(GiB)": 18.17, "step": 36, "train_speed(iter/s)": 0.052951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/mean_length": 206.1328125, "completions/min_length": 63.0, "epoch": 0.74, "grad_norm": 1.5648741722106934, "kl": 0.11474609375, "learning_rate": 9.897237175829926e-07, "loss": -0.010986058972775936, "memory(GiB)": 18.17, "reward": 0.26653096079826355, "reward_std": 0.03736630827188492, "rewards/MCQ_Reward/mean": 0.26653096079826355, "rewards/MCQ_Reward/std": 0.065978042781353, "step": 37, "train_speed(iter/s)": 0.052793 }, { "clip_ratio": 0.0015517690917477012, "epoch": 0.76, "grad_norm": 1.5597436428070068, "kl": 0.122802734375, "learning_rate": 9.890738003669027e-07, "loss": -0.011755033396184444, "memory(GiB)": 18.17, "step": 38, "train_speed(iter/s)": 0.054118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.5, "completions/mean_length": 203.34375, "completions/min_length": 35.5, "epoch": 0.78, "grad_norm": 1.6045058965682983, "kl": 0.125244140625, "learning_rate": 9.884041833294475e-07, "loss": -0.04164643585681915, "memory(GiB)": 18.17, "reward": 0.2605663910508156, "reward_std": 0.03675983473658562, "rewards/MCQ_Reward/mean": 0.2605663910508156, "rewards/MCQ_Reward/std": 0.06591521203517914, "step": 39, "train_speed(iter/s)": 0.054082 }, { "clip_ratio": 0.0013205534196458757, "epoch": 0.8, "grad_norm": 1.608991265296936, "kl": 0.1337890625, "learning_rate": 9.877148934427035e-07, "loss": -0.042494483292102814, "memory(GiB)": 18.17, "step": 40, "train_speed(iter/s)": 0.055369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 189.2734375, "completions/min_length": 60.5, "epoch": 0.82, "grad_norm": 1.8442962169647217, "kl": 0.14208984375, "learning_rate": 9.870059584711668e-07, "loss": -0.07683762162923813, "memory(GiB)": 18.17, "reward": 0.26815178990364075, "reward_std": 0.04410684481263161, "rewards/MCQ_Reward/mean": 0.26815178990364075, "rewards/MCQ_Reward/std": 0.06000189855694771, "step": 41, "train_speed(iter/s)": 0.055022 }, { "clip_ratio": 0.0013334141112864017, "epoch": 0.84, "grad_norm": 1.8422967195510864, "kl": 0.14599609375, "learning_rate": 9.862774069706345e-07, "loss": -0.0775442123413086, "memory(GiB)": 18.17, "step": 42, "train_speed(iter/s)": 0.056271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.5, "completions/mean_length": 185.765625, "completions/min_length": 65.5, "epoch": 0.86, "grad_norm": 1.7880198955535889, "kl": 0.14453125, "learning_rate": 9.85529268287055e-07, "loss": 0.009722323156893253, "memory(GiB)": 18.17, "reward": 0.26024360954761505, "reward_std": 0.04201339744031429, "rewards/MCQ_Reward/mean": 0.26024360954761505, "rewards/MCQ_Reward/std": 0.0699400007724762, "step": 43, "train_speed(iter/s)": 0.056122 }, { "clip_ratio": 0.0013897960307076573, "epoch": 0.88, "grad_norm": 1.7613471746444702, "kl": 0.14599609375, "learning_rate": 9.847615725553455e-07, "loss": 0.008702307008206844, "memory(GiB)": 18.17, "step": 44, "train_speed(iter/s)": 0.057328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.5, "completions/mean_length": 180.44921875, "completions/min_length": 71.5, "epoch": 0.9, "grad_norm": 1.8986045122146606, "kl": 0.16357421875, "learning_rate": 9.83974350698178e-07, "loss": -0.01265439111739397, "memory(GiB)": 18.17, "reward": 0.24561913311481476, "reward_std": 0.041749605908989906, "rewards/MCQ_Reward/mean": 0.24561913311481476, "rewards/MCQ_Reward/std": 0.0692291297018528, "step": 45, "train_speed(iter/s)": 0.057564 }, { "clip_ratio": 0.0017767796525731683, "epoch": 0.92, "grad_norm": 1.8627526760101318, "kl": 0.1669921875, "learning_rate": 9.831676344247342e-07, "loss": -0.013573069125413895, "memory(GiB)": 18.17, "step": 46, "train_speed(iter/s)": 0.058753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.5, "completions/mean_length": 181.046875, "completions/min_length": 58.0, "epoch": 0.94, "grad_norm": 1.8329010009765625, "kl": 0.1689453125, "learning_rate": 9.82341456229428e-07, "loss": -0.009910675697028637, "memory(GiB)": 18.17, "reward": 0.2712182253599167, "reward_std": 0.03875480592250824, "rewards/MCQ_Reward/mean": 0.2712182253599167, "rewards/MCQ_Reward/std": 0.05874207057058811, "step": 47, "train_speed(iter/s)": 0.05881 }, { "clip_ratio": 0.0020254994742572308, "epoch": 0.96, "grad_norm": 1.7636630535125732, "kl": 0.17529296875, "learning_rate": 9.814958493905962e-07, "loss": -0.011010742746293545, "memory(GiB)": 18.17, "step": 48, "train_speed(iter/s)": 0.05997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 198.5, "completions/min_length": 83.0, "epoch": 0.98, "grad_norm": 1.9754475355148315, "kl": 0.15625, "learning_rate": 9.806308479691594e-07, "loss": 0.026388226076960564, "memory(GiB)": 18.17, "reward": 0.2969816029071808, "reward_std": 0.033485451713204384, "rewards/MCQ_Reward/mean": 0.2969816029071808, "rewards/MCQ_Reward/std": 0.06154371425509453, "step": 49, "train_speed(iter/s)": 0.059869 }, { "clip_ratio": 0.002143923775292933, "epoch": 1.0, "grad_norm": 1.9168144464492798, "kl": 0.16455078125, "learning_rate": 9.797464868072486e-07, "loss": 0.025302505120635033, "memory(GiB)": 18.17, "step": 50, "train_speed(iter/s)": 0.060949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.5, "completions/mean_length": 175.86328125, "completions/min_length": 67.5, "epoch": 1.02, "grad_norm": 1.949724793434143, "kl": 0.18359375, "learning_rate": 9.788428015268026e-07, "loss": 0.016914475709199905, "memory(GiB)": 18.17, "reward": 0.28643812239170074, "reward_std": 0.038882166147232056, "rewards/MCQ_Reward/mean": 0.28643812239170074, "rewards/MCQ_Reward/std": 0.05762592889368534, "step": 51, "train_speed(iter/s)": 0.06051 }, { "clip_ratio": 0.0030939964344725013, "epoch": 1.04, "grad_norm": 1.873901128768921, "kl": 0.1962890625, "learning_rate": 9.779198285281326e-07, "loss": 0.015664130449295044, "memory(GiB)": 18.17, "step": 52, "train_speed(iter/s)": 0.061602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/mean_length": 173.26953125, "completions/min_length": 50.5, "epoch": 1.06, "grad_norm": 1.748197317123413, "kl": 0.20361328125, "learning_rate": 9.769776049884563e-07, "loss": -0.012495264410972595, "memory(GiB)": 18.17, "reward": 0.2694673240184784, "reward_std": 0.03306659869849682, "rewards/MCQ_Reward/mean": 0.2694673240184784, "rewards/MCQ_Reward/std": 0.06984242424368858, "step": 53, "train_speed(iter/s)": 0.061749 }, { "clip_ratio": 0.003254209994338453, "epoch": 1.08, "grad_norm": 1.7254936695098877, "kl": 0.22021484375, "learning_rate": 9.760161688604007e-07, "loss": -0.012979630380868912, "memory(GiB)": 18.17, "step": 54, "train_speed(iter/s)": 0.062813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/mean_length": 164.23046875, "completions/min_length": 74.0, "epoch": 1.1, "grad_norm": 1.8942813873291016, "kl": 0.21044921875, "learning_rate": 9.750355588704727e-07, "loss": -0.009442738257348537, "memory(GiB)": 18.17, "reward": 0.29137177765369415, "reward_std": 0.03919493593275547, "rewards/MCQ_Reward/mean": 0.29137177765369415, "rewards/MCQ_Reward/std": 0.055357255041599274, "step": 55, "train_speed(iter/s)": 0.062825 }, { "clip_ratio": 0.0029244048055261374, "epoch": 1.12, "grad_norm": 1.8403282165527344, "kl": 0.2255859375, "learning_rate": 9.740358145174997e-07, "loss": -0.010412258096039295, "memory(GiB)": 18.17, "step": 56, "train_speed(iter/s)": 0.063885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.5, "completions/mean_length": 159.5703125, "completions/min_length": 68.5, "epoch": 1.1400000000000001, "grad_norm": 1.9502640962600708, "kl": 0.24072265625, "learning_rate": 9.730169760710385e-07, "loss": -0.01350313052535057, "memory(GiB)": 18.17, "reward": 0.3086051344871521, "reward_std": 0.036856647580862045, "rewards/MCQ_Reward/mean": 0.3086051344871521, "rewards/MCQ_Reward/std": 0.05716245248913765, "step": 57, "train_speed(iter/s)": 0.064059 }, { "clip_ratio": 0.0026392132276669145, "epoch": 1.16, "grad_norm": 1.8639681339263916, "kl": 0.244140625, "learning_rate": 9.719790845697532e-07, "loss": -0.014377694576978683, "memory(GiB)": 18.17, "step": 58, "train_speed(iter/s)": 0.065093 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 133.83984375, "completions/min_length": 52.5, "epoch": 1.18, "grad_norm": 2.159579038619995, "kl": 0.2607421875, "learning_rate": 9.709221818197623e-07, "loss": -0.03235793486237526, "memory(GiB)": 18.17, "reward": 0.3192738890647888, "reward_std": 0.03647255524992943, "rewards/MCQ_Reward/mean": 0.3192738890647888, "rewards/MCQ_Reward/std": 0.04580973833799362, "step": 59, "train_speed(iter/s)": 0.065376 }, { "clip_ratio": 0.0033569036750122905, "epoch": 1.2, "grad_norm": 2.0858945846557617, "kl": 0.2685546875, "learning_rate": 9.698463103929541e-07, "loss": -0.03384597226977348, "memory(GiB)": 18.17, "step": 60, "train_speed(iter/s)": 0.066397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.5, "completions/mean_length": 152.6640625, "completions/min_length": 54.0, "epoch": 1.22, "grad_norm": 1.9752745628356934, "kl": 0.2509765625, "learning_rate": 9.68751513625273e-07, "loss": -0.012610888108611107, "memory(GiB)": 18.17, "reward": 0.30408790707588196, "reward_std": 0.03896576911211014, "rewards/MCQ_Reward/mean": 0.30408790707588196, "rewards/MCQ_Reward/std": 0.059865519404411316, "step": 61, "train_speed(iter/s)": 0.066047 }, { "clip_ratio": 0.0028306948952376842, "epoch": 1.24, "grad_norm": 1.8911457061767578, "kl": 0.2509765625, "learning_rate": 9.676378356149732e-07, "loss": -0.014004014432430267, "memory(GiB)": 18.17, "step": 62, "train_speed(iter/s)": 0.067044 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.5, "completions/mean_length": 147.6953125, "completions/min_length": 69.0, "epoch": 1.26, "grad_norm": 2.153862953186035, "kl": 0.265625, "learning_rate": 9.665053212208426e-07, "loss": -0.027626825496554375, "memory(GiB)": 18.17, "reward": 0.31602054834365845, "reward_std": 0.03946657292544842, "rewards/MCQ_Reward/mean": 0.31602054834365845, "rewards/MCQ_Reward/std": 0.06625748611986637, "step": 63, "train_speed(iter/s)": 0.067162 }, { "clip_ratio": 0.004200217663310468, "epoch": 1.28, "grad_norm": 2.027595281600952, "kl": 0.2626953125, "learning_rate": 9.653540160603955e-07, "loss": -0.028667613863945007, "memory(GiB)": 18.17, "step": 64, "train_speed(iter/s)": 0.06814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.5, "completions/mean_length": 153.3828125, "completions/min_length": 42.0, "epoch": 1.3, "grad_norm": 2.058096170425415, "kl": 0.26318359375, "learning_rate": 9.641839665080363e-07, "loss": 0.019130591303110123, "memory(GiB)": 18.17, "reward": 0.3058909475803375, "reward_std": 0.03743278048932552, "rewards/MCQ_Reward/mean": 0.3058909475803375, "rewards/MCQ_Reward/std": 0.06633425317704678, "step": 65, "train_speed(iter/s)": 0.068294 }, { "clip_ratio": 0.0030368451261892915, "epoch": 1.32, "grad_norm": 2.0810675621032715, "kl": 0.26708984375, "learning_rate": 9.6299521969319e-07, "loss": 0.01858600787818432, "memory(GiB)": 18.17, "step": 66, "train_speed(iter/s)": 0.069245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.5, "completions/mean_length": 170.65625, "completions/min_length": 70.0, "epoch": 1.34, "grad_norm": 1.9177082777023315, "kl": 0.25390625, "learning_rate": 9.617878234984054e-07, "loss": 0.013776745647192001, "memory(GiB)": 18.17, "reward": 0.32124653458595276, "reward_std": 0.03586815297603607, "rewards/MCQ_Reward/mean": 0.32124653458595276, "rewards/MCQ_Reward/std": 0.05279739946126938, "step": 67, "train_speed(iter/s)": 0.069258 }, { "clip_ratio": 0.003581640077754855, "epoch": 1.3599999999999999, "grad_norm": 1.800355076789856, "kl": 0.271484375, "learning_rate": 9.60561826557425e-07, "loss": 0.01218567043542862, "memory(GiB)": 18.17, "step": 68, "train_speed(iter/s)": 0.070198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.5, "completions/mean_length": 165.45703125, "completions/min_length": 84.5, "epoch": 1.38, "grad_norm": 1.9321861267089844, "kl": 0.2734375, "learning_rate": 9.593172782532267e-07, "loss": -0.06093820929527283, "memory(GiB)": 18.17, "reward": 0.33785562217235565, "reward_std": 0.03626340813934803, "rewards/MCQ_Reward/mean": 0.33785562217235565, "rewards/MCQ_Reward/std": 0.04918426461517811, "step": 69, "train_speed(iter/s)": 0.070079 }, { "clip_ratio": 0.002684593666344881, "epoch": 1.4, "grad_norm": 1.9250681400299072, "kl": 0.2822265625, "learning_rate": 9.580542287160346e-07, "loss": -0.06187870353460312, "memory(GiB)": 18.17, "step": 70, "train_speed(iter/s)": 0.071007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.5, "completions/mean_length": 167.71875, "completions/min_length": 60.0, "epoch": 1.42, "grad_norm": 1.9310671091079712, "kl": 0.26953125, "learning_rate": 9.567727288213004e-07, "loss": -0.03052324429154396, "memory(GiB)": 18.17, "reward": 0.3391506224870682, "reward_std": 0.037205325439572334, "rewards/MCQ_Reward/mean": 0.3391506224870682, "rewards/MCQ_Reward/std": 0.06270403787493706, "step": 71, "train_speed(iter/s)": 0.070595 }, { "clip_ratio": 0.004182511591352522, "epoch": 1.44, "grad_norm": 1.808637261390686, "kl": 0.26953125, "learning_rate": 9.554728301876524e-07, "loss": -0.031438540667295456, "memory(GiB)": 18.17, "step": 72, "train_speed(iter/s)": 0.071499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 171.5859375, "completions/min_length": 73.5, "epoch": 1.46, "grad_norm": 2.1356284618377686, "kl": 0.2666015625, "learning_rate": 9.541545851748185e-07, "loss": 0.06165466085076332, "memory(GiB)": 18.17, "reward": 0.3267658054828644, "reward_std": 0.03793729655444622, "rewards/MCQ_Reward/mean": 0.3267658054828644, "rewards/MCQ_Reward/std": 0.06866181083023548, "step": 73, "train_speed(iter/s)": 0.071359 }, { "clip_ratio": 0.0023740422911942005, "epoch": 1.48, "grad_norm": 2.081942319869995, "kl": 0.2724609375, "learning_rate": 9.528180468815154e-07, "loss": 0.06085401773452759, "memory(GiB)": 18.17, "step": 74, "train_speed(iter/s)": 0.072254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 176.4140625, "completions/min_length": 60.0, "epoch": 1.5, "grad_norm": 1.819736361503601, "kl": 0.291015625, "learning_rate": 9.514632691433106e-07, "loss": 0.041995078325271606, "memory(GiB)": 18.17, "reward": 0.34543414413928986, "reward_std": 0.03658975474536419, "rewards/MCQ_Reward/mean": 0.34543414413928986, "rewards/MCQ_Reward/std": 0.0643342137336731, "step": 75, "train_speed(iter/s)": 0.072103 }, { "clip_ratio": 0.0024005533196032047, "epoch": 1.52, "grad_norm": 1.7825483083724976, "kl": 0.302734375, "learning_rate": 9.500903065304539e-07, "loss": 0.04098404943943024, "memory(GiB)": 18.17, "step": 76, "train_speed(iter/s)": 0.072975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 179.35546875, "completions/min_length": 71.5, "epoch": 1.54, "grad_norm": 1.83073091506958, "kl": 0.2919921875, "learning_rate": 9.486992143456791e-07, "loss": 0.026145532727241516, "memory(GiB)": 18.17, "reward": 0.33697785437107086, "reward_std": 0.033385418355464935, "rewards/MCQ_Reward/mean": 0.33697785437107086, "rewards/MCQ_Reward/std": 0.06162330321967602, "step": 77, "train_speed(iter/s)": 0.072818 }, { "clip_ratio": 0.0029612210346385837, "epoch": 1.56, "grad_norm": 1.7568435668945312, "kl": 0.3046875, "learning_rate": 9.472900486219768e-07, "loss": 0.02535586804151535, "memory(GiB)": 18.17, "step": 78, "train_speed(iter/s)": 0.07364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 181.63671875, "completions/min_length": 86.0, "epoch": 1.58, "grad_norm": 1.763022541999817, "kl": 0.296875, "learning_rate": 9.458628661203366e-07, "loss": -0.016155043616890907, "memory(GiB)": 18.17, "reward": 0.3397578001022339, "reward_std": 0.030555096454918385, "rewards/MCQ_Reward/mean": 0.3397578001022339, "rewards/MCQ_Reward/std": 0.0736413523554802, "step": 79, "train_speed(iter/s)": 0.073639 }, { "clip_ratio": 0.003752505173906684, "epoch": 1.6, "grad_norm": 1.75266695022583, "kl": 0.314453125, "learning_rate": 9.444177243274617e-07, "loss": -0.016932127997279167, "memory(GiB)": 18.17, "step": 80, "train_speed(iter/s)": 0.074482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 173.53515625, "completions/min_length": 82.5, "epoch": 1.62, "grad_norm": 1.813202142715454, "kl": 0.3193359375, "learning_rate": 9.429546814534528e-07, "loss": 0.014175940304994583, "memory(GiB)": 18.17, "reward": 0.35451021790504456, "reward_std": 0.0316955391317606, "rewards/MCQ_Reward/mean": 0.35451021790504456, "rewards/MCQ_Reward/std": 0.058956997469067574, "step": 81, "train_speed(iter/s)": 0.073923 }, { "clip_ratio": 0.003929685335606337, "epoch": 1.6400000000000001, "grad_norm": 1.7315208911895752, "kl": 0.337890625, "learning_rate": 9.414737964294634e-07, "loss": 0.013125661760568619, "memory(GiB)": 18.17, "step": 82, "train_speed(iter/s)": 0.074757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.5, "completions/mean_length": 159.95703125, "completions/min_length": 68.5, "epoch": 1.6600000000000001, "grad_norm": 1.86507248878479, "kl": 0.333984375, "learning_rate": 9.399751289053266e-07, "loss": 0.0190749391913414, "memory(GiB)": 18.17, "reward": 0.32107532024383545, "reward_std": 0.03531700000166893, "rewards/MCQ_Reward/mean": 0.32107532024383545, "rewards/MCQ_Reward/std": 0.06730588898062706, "step": 83, "train_speed(iter/s)": 0.074766 }, { "clip_ratio": 0.005602485965937376, "epoch": 1.6800000000000002, "grad_norm": 1.8452680110931396, "kl": 0.3515625, "learning_rate": 9.384587392471514e-07, "loss": 0.018391648307442665, "memory(GiB)": 18.17, "step": 84, "train_speed(iter/s)": 0.075562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.5, "completions/mean_length": 146.36328125, "completions/min_length": 51.5, "epoch": 1.7, "grad_norm": 2.060523271560669, "kl": 0.3564453125, "learning_rate": 9.369246885348925e-07, "loss": 0.00966290757060051, "memory(GiB)": 18.17, "reward": 0.34230072796344757, "reward_std": 0.03451686259359121, "rewards/MCQ_Reward/mean": 0.34230072796344757, "rewards/MCQ_Reward/std": 0.07506715506315231, "step": 85, "train_speed(iter/s)": 0.075608 }, { "clip_ratio": 0.0025914940051734447, "epoch": 1.72, "grad_norm": 2.089233875274658, "kl": 0.357421875, "learning_rate": 9.353730385598886e-07, "loss": 0.008917246013879776, "memory(GiB)": 18.17, "step": 86, "train_speed(iter/s)": 0.076403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 149.41796875, "completions/min_length": 72.0, "epoch": 1.74, "grad_norm": 2.100825071334839, "kl": 0.3642578125, "learning_rate": 9.338038518223745e-07, "loss": 0.0011688023805618286, "memory(GiB)": 18.17, "reward": 0.29714760184288025, "reward_std": 0.03046888206154108, "rewards/MCQ_Reward/mean": 0.29714760184288025, "rewards/MCQ_Reward/std": 0.0724717304110527, "step": 87, "train_speed(iter/s)": 0.076468 }, { "clip_ratio": 0.0029116831719875336, "epoch": 1.76, "grad_norm": 2.091975688934326, "kl": 0.3740234375, "learning_rate": 9.322171915289633e-07, "loss": 0.0007365690544247627, "memory(GiB)": 18.17, "step": 88, "train_speed(iter/s)": 0.077267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 149.0546875, "completions/min_length": 74.5, "epoch": 1.78, "grad_norm": 2.0660133361816406, "kl": 0.5546875, "learning_rate": 9.306131215901003e-07, "loss": -0.002558637410402298, "memory(GiB)": 18.17, "reward": 0.3453996330499649, "reward_std": 0.030298423022031784, "rewards/MCQ_Reward/mean": 0.3453996330499649, "rewards/MCQ_Reward/std": 0.05576108209788799, "step": 89, "train_speed(iter/s)": 0.07741 }, { "clip_ratio": 0.0030759836081415415, "epoch": 1.8, "grad_norm": 1.9661788940429688, "kl": 0.5439453125, "learning_rate": 9.289917066174885e-07, "loss": -0.003219339996576309, "memory(GiB)": 18.17, "step": 90, "train_speed(iter/s)": 0.078204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/mean_length": 137.28125, "completions/min_length": 57.0, "epoch": 1.8199999999999998, "grad_norm": 2.1432077884674072, "kl": 0.4169921875, "learning_rate": 9.273530119214867e-07, "loss": -0.019994597882032394, "memory(GiB)": 18.17, "reward": 0.3450734615325928, "reward_std": 0.03698188066482544, "rewards/MCQ_Reward/mean": 0.3450734615325928, "rewards/MCQ_Reward/std": 0.06834666058421135, "step": 91, "train_speed(iter/s)": 0.077823 }, { "clip_ratio": 0.006807451136410236, "epoch": 1.8399999999999999, "grad_norm": 2.026726484298706, "kl": 0.4423828125, "learning_rate": 9.256971035084784e-07, "loss": -0.02127775177359581, "memory(GiB)": 18.17, "step": 92, "train_speed(iter/s)": 0.078595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/mean_length": 144.11328125, "completions/min_length": 62.5, "epoch": 1.8599999999999999, "grad_norm": 2.5080695152282715, "kl": 0.44140625, "learning_rate": 9.240240480782129e-07, "loss": 0.038984864950180054, "memory(GiB)": 18.17, "reward": 0.34395235776901245, "reward_std": 0.030767593532800674, "rewards/MCQ_Reward/mean": 0.34395235776901245, "rewards/MCQ_Reward/std": 0.08772432059049606, "step": 93, "train_speed(iter/s)": 0.07864 }, { "clip_ratio": 0.0038948373403400183, "epoch": 1.88, "grad_norm": 2.293992042541504, "kl": 0.466796875, "learning_rate": 9.223339130211192e-07, "loss": 0.03854737430810928, "memory(GiB)": 18.17, "step": 94, "train_speed(iter/s)": 0.0794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/mean_length": 144.3671875, "completions/min_length": 66.5, "epoch": 1.9, "grad_norm": 2.3717093467712402, "kl": 0.4423828125, "learning_rate": 9.206267664155906e-07, "loss": 0.02822975069284439, "memory(GiB)": 18.17, "reward": 0.35692907869815826, "reward_std": 0.033766910433769226, "rewards/MCQ_Reward/mean": 0.35692907869815826, "rewards/MCQ_Reward/std": 0.055017637088894844, "step": 95, "train_speed(iter/s)": 0.079264 }, { "clip_ratio": 0.01540788309648633, "epoch": 1.92, "grad_norm": 2.8082501888275146, "kl": 0.4873046875, "learning_rate": 9.189026770252436e-07, "loss": 0.027400558814406395, "memory(GiB)": 18.17, "step": 96, "train_speed(iter/s)": 0.080015 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 131.2265625, "completions/min_length": 64.0, "epoch": 1.94, "grad_norm": 2.578866481781006, "kl": 0.458984375, "learning_rate": 9.171617142961476e-07, "loss": -0.028647061437368393, "memory(GiB)": 18.17, "reward": 0.35198159515857697, "reward_std": 0.036471933126449585, "rewards/MCQ_Reward/mean": 0.35198159515857697, "rewards/MCQ_Reward/std": 0.09679177403450012, "step": 97, "train_speed(iter/s)": 0.080136 }, { "clip_ratio": 0.007482210174202919, "epoch": 1.96, "grad_norm": 2.6245126724243164, "kl": 0.455078125, "learning_rate": 9.154039483540272e-07, "loss": -0.02990054339170456, "memory(GiB)": 18.17, "step": 98, "train_speed(iter/s)": 0.080877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.5, "completions/mean_length": 140.546875, "completions/min_length": 70.0, "epoch": 1.98, "grad_norm": 2.0212841033935547, "kl": 0.4462890625, "learning_rate": 9.136294500014385e-07, "loss": 0.007645269390195608, "memory(GiB)": 18.17, "reward": 0.3687240034341812, "reward_std": 0.0377286896109581, "rewards/MCQ_Reward/mean": 0.3687240034341812, "rewards/MCQ_Reward/std": 0.09235312044620514, "step": 99, "train_speed(iter/s)": 0.080838 }, { "clip_ratio": 0.004757207585498691, "epoch": 2.0, "grad_norm": 1.9354287385940552, "kl": 0.4638671875, "learning_rate": 9.118382907149163e-07, "loss": 0.006971254944801331, "memory(GiB)": 18.17, "step": 100, "train_speed(iter/s)": 0.08155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.5, "completions/mean_length": 123.4140625, "completions/min_length": 54.0, "epoch": 2.02, "grad_norm": 2.3176586627960205, "kl": 0.4755859375, "learning_rate": 9.100305426420956e-07, "loss": -0.016116395592689514, "memory(GiB)": 18.17, "reward": 0.38898809254169464, "reward_std": 0.038034453988075256, "rewards/MCQ_Reward/mean": 0.38898809254169464, "rewards/MCQ_Reward/std": 0.07776015624403954, "step": 101, "train_speed(iter/s)": 0.081234 }, { "clip_ratio": 0.004006300354376435, "epoch": 2.04, "grad_norm": 2.1871023178100586, "kl": 0.4931640625, "learning_rate": 9.082062785988048e-07, "loss": -0.01703297346830368, "memory(GiB)": 18.17, "step": 102, "train_speed(iter/s)": 0.081962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 113.1484375, "completions/min_length": 56.5, "epoch": 2.06, "grad_norm": 2.5120768547058105, "kl": 0.517578125, "learning_rate": 9.06365572066134e-07, "loss": -0.027387384325265884, "memory(GiB)": 18.17, "reward": 0.357058048248291, "reward_std": 0.031020362861454487, "rewards/MCQ_Reward/mean": 0.357058048248291, "rewards/MCQ_Reward/std": 0.06582547165453434, "step": 103, "train_speed(iter/s)": 0.082061 }, { "clip_ratio": 0.014288442209362984, "epoch": 2.08, "grad_norm": 3.2106845378875732, "kl": 0.5009765625, "learning_rate": 9.045084971874737e-07, "loss": -0.02823379635810852, "memory(GiB)": 18.17, "step": 104, "train_speed(iter/s)": 0.082761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 126.953125, "completions/min_length": 70.0, "epoch": 2.1, "grad_norm": 2.2478950023651123, "kl": 0.48828125, "learning_rate": 9.026351287655293e-07, "loss": 0.02888938970863819, "memory(GiB)": 18.17, "reward": 0.3573220670223236, "reward_std": 0.03388269431889057, "rewards/MCQ_Reward/mean": 0.3573220670223236, "rewards/MCQ_Reward/std": 0.08621830865740776, "step": 105, "train_speed(iter/s)": 0.082851 }, { "clip_ratio": 0.005271225702017546, "epoch": 2.12, "grad_norm": 2.07523250579834, "kl": 0.513671875, "learning_rate": 9.007455422593075e-07, "loss": 0.028001034632325172, "memory(GiB)": 18.17, "step": 106, "train_speed(iter/s)": 0.083561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 143.50390625, "completions/min_length": 62.5, "epoch": 2.14, "grad_norm": 2.149932861328125, "kl": 0.474609375, "learning_rate": 8.988398137810776e-07, "loss": -0.0027789073064923286, "memory(GiB)": 18.17, "reward": 0.37795157730579376, "reward_std": 0.03415030054748058, "rewards/MCQ_Reward/mean": 0.37795157730579376, "rewards/MCQ_Reward/std": 0.07794364914298058, "step": 107, "train_speed(iter/s)": 0.083617 }, { "clip_ratio": 0.008057619212195277, "epoch": 2.16, "grad_norm": 2.7377026081085205, "kl": 0.5078125, "learning_rate": 8.969180200933047e-07, "loss": -0.003491489216685295, "memory(GiB)": 18.17, "step": 108, "train_speed(iter/s)": 0.084274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 133.1875, "completions/min_length": 58.5, "epoch": 2.18, "grad_norm": 2.826488494873047, "kl": 0.5390625, "learning_rate": 8.94980238605558e-07, "loss": 0.02833351120352745, "memory(GiB)": 18.17, "reward": 0.39782722294330597, "reward_std": 0.031135279685258865, "rewards/MCQ_Reward/mean": 0.39782722294330597, "rewards/MCQ_Reward/std": 0.07045348361134529, "step": 109, "train_speed(iter/s)": 0.084336 }, { "clip_ratio": 0.00684792990796268, "epoch": 2.2, "grad_norm": 2.434086322784424, "kl": 0.5703125, "learning_rate": 8.930265473713937e-07, "loss": 0.027658611536026, "memory(GiB)": 18.17, "step": 110, "train_speed(iter/s)": 0.085034 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 131.703125, "completions/min_length": 67.0, "epoch": 2.22, "grad_norm": 2.134516716003418, "kl": 0.48828125, "learning_rate": 8.910570250852096e-07, "loss": 0.006394753232598305, "memory(GiB)": 18.17, "reward": 0.3707956522703171, "reward_std": 0.03248129412531853, "rewards/MCQ_Reward/mean": 0.3707956522703171, "rewards/MCQ_Reward/std": 0.10541465878486633, "step": 111, "train_speed(iter/s)": 0.084685 }, { "clip_ratio": 0.00865771621465683, "epoch": 2.24, "grad_norm": 2.2900125980377197, "kl": 0.513671875, "learning_rate": 8.890717510790762e-07, "loss": 0.00539240799844265, "memory(GiB)": 18.17, "step": 112, "train_speed(iter/s)": 0.085353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.5, "completions/mean_length": 126.9140625, "completions/min_length": 62.0, "epoch": 2.26, "grad_norm": 2.6178812980651855, "kl": 0.546875, "learning_rate": 8.870708053195413e-07, "loss": 0.019267559051513672, "memory(GiB)": 18.17, "reward": 0.3922416865825653, "reward_std": 0.03025819268077612, "rewards/MCQ_Reward/mean": 0.3922416865825653, "rewards/MCQ_Reward/std": 0.08424495533108711, "step": 113, "train_speed(iter/s)": 0.085338 }, { "clip_ratio": 0.006454117828980088, "epoch": 2.2800000000000002, "grad_norm": 2.1509737968444824, "kl": 0.57421875, "learning_rate": 8.850542684044078e-07, "loss": 0.01820582151412964, "memory(GiB)": 18.17, "step": 114, "train_speed(iter/s)": 0.085985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.5, "completions/mean_length": 118.85546875, "completions/min_length": 59.5, "epoch": 2.3, "grad_norm": 2.528681755065918, "kl": 0.525390625, "learning_rate": 8.83022221559489e-07, "loss": 0.008160990662872791, "memory(GiB)": 18.17, "reward": 0.404242143034935, "reward_std": 0.03400178253650665, "rewards/MCQ_Reward/mean": 0.404242143034935, "rewards/MCQ_Reward/std": 0.09943690523505211, "step": 115, "train_speed(iter/s)": 0.086069 }, { "clip_ratio": 0.005366077646613121, "epoch": 2.32, "grad_norm": 2.1966934204101562, "kl": 0.546875, "learning_rate": 8.809747466353355e-07, "loss": 0.007157166488468647, "memory(GiB)": 18.17, "step": 116, "train_speed(iter/s)": 0.086734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 125.3359375, "completions/min_length": 59.5, "epoch": 2.34, "grad_norm": 2.4033124446868896, "kl": 0.537109375, "learning_rate": 8.789119261039384e-07, "loss": 0.017890973016619682, "memory(GiB)": 18.17, "reward": 0.36347851157188416, "reward_std": 0.027591521851718426, "rewards/MCQ_Reward/mean": 0.36347851157188416, "rewards/MCQ_Reward/std": 0.09114562720060349, "step": 117, "train_speed(iter/s)": 0.086687 }, { "clip_ratio": 0.011405623517930508, "epoch": 2.36, "grad_norm": 2.8501975536346436, "kl": 0.587890625, "learning_rate": 8.768338430554082e-07, "loss": 0.016866052523255348, "memory(GiB)": 18.17, "step": 118, "train_speed(iter/s)": 0.08735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 122.23046875, "completions/min_length": 65.0, "epoch": 2.38, "grad_norm": 2.5570151805877686, "kl": 0.5126953125, "learning_rate": 8.74740581194627e-07, "loss": -0.011926580220460892, "memory(GiB)": 18.17, "reward": 0.40480077266693115, "reward_std": 0.03289741463959217, "rewards/MCQ_Reward/mean": 0.40480077266693115, "rewards/MCQ_Reward/std": 0.08261778578162193, "step": 119, "train_speed(iter/s)": 0.087419 }, { "clip_ratio": 0.007963848765939474, "epoch": 2.4, "grad_norm": 2.1802773475646973, "kl": 0.5009765625, "learning_rate": 8.726322248378774e-07, "loss": -0.0127539848908782, "memory(GiB)": 18.17, "step": 120, "train_speed(iter/s)": 0.088053 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 130.2421875, "completions/min_length": 60.5, "epoch": 2.42, "grad_norm": 2.4936065673828125, "kl": 0.537109375, "learning_rate": 8.705088589094458e-07, "loss": 0.008000252768397331, "memory(GiB)": 18.17, "reward": 0.36072438955307007, "reward_std": 0.030319811776280403, "rewards/MCQ_Reward/mean": 0.36072438955307007, "rewards/MCQ_Reward/std": 0.1019350104033947, "step": 121, "train_speed(iter/s)": 0.08768 }, { "clip_ratio": 0.006943409331142902, "epoch": 2.44, "grad_norm": 2.4447567462921143, "kl": 0.544921875, "learning_rate": 8.683705689382024e-07, "loss": 0.0072016119956970215, "memory(GiB)": 18.17, "step": 122, "train_speed(iter/s)": 0.088326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.5, "completions/mean_length": 112.40234375, "completions/min_length": 53.0, "epoch": 2.46, "grad_norm": 2.279759168624878, "kl": 0.55859375, "learning_rate": 8.662174410541554e-07, "loss": 0.00623547937721014, "memory(GiB)": 18.17, "reward": 0.3670702576637268, "reward_std": 0.02890967670828104, "rewards/MCQ_Reward/mean": 0.3670702576637268, "rewards/MCQ_Reward/std": 0.0740283839404583, "step": 123, "train_speed(iter/s)": 0.088484 }, { "clip_ratio": 0.007923177909106016, "epoch": 2.48, "grad_norm": 2.789609909057617, "kl": 0.587890625, "learning_rate": 8.64049561984982e-07, "loss": 0.005373558960855007, "memory(GiB)": 18.17, "step": 124, "train_speed(iter/s)": 0.089133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.5, "completions/mean_length": 124.91796875, "completions/min_length": 73.0, "epoch": 2.5, "grad_norm": 2.2765557765960693, "kl": 0.498046875, "learning_rate": 8.61867019052535e-07, "loss": -0.0031618811190128326, "memory(GiB)": 18.17, "reward": 0.3880574107170105, "reward_std": 0.02767461072653532, "rewards/MCQ_Reward/mean": 0.3880574107170105, "rewards/MCQ_Reward/std": 0.11312882974743843, "step": 125, "train_speed(iter/s)": 0.089217 }, { "clip_ratio": 0.006887951632961631, "epoch": 2.52, "grad_norm": 2.2742230892181396, "kl": 0.509765625, "learning_rate": 8.596699001693255e-07, "loss": -0.004048643633723259, "memory(GiB)": 18.17, "step": 126, "train_speed(iter/s)": 0.089838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.5, "completions/mean_length": 117.484375, "completions/min_length": 56.5, "epoch": 2.54, "grad_norm": 2.340428113937378, "kl": 0.546875, "learning_rate": 8.574582938349817e-07, "loss": -0.009344515390694141, "memory(GiB)": 18.17, "reward": 0.38609637320041656, "reward_std": 0.033216655254364014, "rewards/MCQ_Reward/mean": 0.38609637320041656, "rewards/MCQ_Reward/std": 0.09242032468318939, "step": 127, "train_speed(iter/s)": 0.089914 }, { "clip_ratio": 0.007429210003465414, "epoch": 2.56, "grad_norm": 2.3134751319885254, "kl": 0.57421875, "learning_rate": 8.552322891326844e-07, "loss": -0.010545218363404274, "memory(GiB)": 18.17, "step": 128, "train_speed(iter/s)": 0.090544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/mean_length": 119.9765625, "completions/min_length": 57.0, "epoch": 2.58, "grad_norm": 2.265873670578003, "kl": 0.4931640625, "learning_rate": 8.529919757255781e-07, "loss": -0.007635302376002073, "memory(GiB)": 18.17, "reward": 0.41428878903388977, "reward_std": 0.028425303287804127, "rewards/MCQ_Reward/mean": 0.41428878903388977, "rewards/MCQ_Reward/std": 0.07786687836050987, "step": 129, "train_speed(iter/s)": 0.09048 }, { "clip_ratio": 0.006183756981045008, "epoch": 2.6, "grad_norm": 2.283554792404175, "kl": 0.498046875, "learning_rate": 8.507374438531606e-07, "loss": -0.008446864783763885, "memory(GiB)": 18.17, "step": 130, "train_speed(iter/s)": 0.091107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 119.125, "completions/min_length": 59.0, "epoch": 2.62, "grad_norm": 2.8296353816986084, "kl": 0.525390625, "learning_rate": 8.484687843276468e-07, "loss": 0.003696079831570387, "memory(GiB)": 18.17, "reward": 0.40898391604423523, "reward_std": 0.02961808815598488, "rewards/MCQ_Reward/mean": 0.40898391604423523, "rewards/MCQ_Reward/std": 0.09117832407355309, "step": 131, "train_speed(iter/s)": 0.09081 }, { "clip_ratio": 0.010138689540326595, "epoch": 2.64, "grad_norm": 2.565761089324951, "kl": 0.53515625, "learning_rate": 8.461860885303113e-07, "loss": 0.003048412501811981, "memory(GiB)": 18.17, "step": 132, "train_speed(iter/s)": 0.091425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 129.40234375, "completions/min_length": 70.0, "epoch": 2.66, "grad_norm": 2.344294786453247, "kl": 0.513671875, "learning_rate": 8.438894484078085e-07, "loss": 0.005981519352644682, "memory(GiB)": 18.17, "reward": 0.40958625078201294, "reward_std": 0.027244774624705315, "rewards/MCQ_Reward/mean": 0.40958625078201294, "rewards/MCQ_Reward/std": 0.07108591124415398, "step": 133, "train_speed(iter/s)": 0.091506 }, { "clip_ratio": 0.006955728633329272, "epoch": 2.68, "grad_norm": 2.667799949645996, "kl": 0.50390625, "learning_rate": 8.415789564684673e-07, "loss": 0.0052396636456251144, "memory(GiB)": 18.17, "step": 134, "train_speed(iter/s)": 0.092113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 132.30859375, "completions/min_length": 79.0, "epoch": 2.7, "grad_norm": 2.6722846031188965, "kl": 0.5029296875, "learning_rate": 8.392547057785661e-07, "loss": 0.0176947470754385, "memory(GiB)": 18.17, "reward": 0.39249348640441895, "reward_std": 0.024370728991925716, "rewards/MCQ_Reward/mean": 0.39249348640441895, "rewards/MCQ_Reward/std": 0.10880232974886894, "step": 135, "train_speed(iter/s)": 0.092158 }, { "clip_ratio": 0.009976111352443695, "epoch": 2.7199999999999998, "grad_norm": 2.80319881439209, "kl": 0.548828125, "learning_rate": 8.369167899585839e-07, "loss": 0.01698880083858967, "memory(GiB)": 18.17, "step": 136, "train_speed(iter/s)": 0.092755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 117.91015625, "completions/min_length": 53.5, "epoch": 2.74, "grad_norm": 2.5274980068206787, "kl": 0.5087890625, "learning_rate": 8.34565303179429e-07, "loss": -0.004888280760496855, "memory(GiB)": 18.17, "reward": 0.3668254613876343, "reward_std": 0.02390660159289837, "rewards/MCQ_Reward/mean": 0.3668254613876343, "rewards/MCQ_Reward/std": 0.06858384422957897, "step": 137, "train_speed(iter/s)": 0.092788 }, { "clip_ratio": 0.00792233063839376, "epoch": 2.76, "grad_norm": 2.6973214149475098, "kl": 0.513671875, "learning_rate": 8.322003401586461e-07, "loss": -0.0054510245099663734, "memory(GiB)": 18.17, "step": 138, "train_speed(iter/s)": 0.093386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.5, "completions/mean_length": 128.76953125, "completions/min_length": 74.0, "epoch": 2.7800000000000002, "grad_norm": 2.22070574760437, "kl": 0.4912109375, "learning_rate": 8.298219961566008e-07, "loss": -0.001897591631859541, "memory(GiB)": 18.17, "reward": 0.3943639397621155, "reward_std": 0.021683918312191963, "rewards/MCQ_Reward/mean": 0.3943639397621155, "rewards/MCQ_Reward/std": 0.08081439509987831, "step": 139, "train_speed(iter/s)": 0.093426 }, { "clip_ratio": 0.005092586623504758, "epoch": 2.8, "grad_norm": 2.3254384994506836, "kl": 0.5009765625, "learning_rate": 8.274303669726426e-07, "loss": -0.0023171789944171906, "memory(GiB)": 18.17, "step": 140, "train_speed(iter/s)": 0.094018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 131.94140625, "completions/min_length": 76.0, "epoch": 2.82, "grad_norm": 2.8199474811553955, "kl": 0.513671875, "learning_rate": 8.250255489412462e-07, "loss": 0.03072257712483406, "memory(GiB)": 18.17, "reward": 0.4145784378051758, "reward_std": 0.026746340095996857, "rewards/MCQ_Reward/mean": 0.4145784378051758, "rewards/MCQ_Reward/std": 0.1253884807229042, "step": 141, "train_speed(iter/s)": 0.093563 }, { "clip_ratio": 0.01698949094861746, "epoch": 2.84, "grad_norm": 3.6371665000915527, "kl": 0.5654296875, "learning_rate": 8.226076389281314e-07, "loss": 0.030751001089811325, "memory(GiB)": 18.17, "step": 142, "train_speed(iter/s)": 0.094156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 122.05859375, "completions/min_length": 41.0, "epoch": 2.86, "grad_norm": 3.697355031967163, "kl": 0.529296875, "learning_rate": 8.201767343263611e-07, "loss": 0.001254035159945488, "memory(GiB)": 18.17, "reward": 0.4235128164291382, "reward_std": 0.02945070993155241, "rewards/MCQ_Reward/mean": 0.4235128164291382, "rewards/MCQ_Reward/std": 0.0826257448643446, "step": 143, "train_speed(iter/s)": 0.094158 }, { "clip_ratio": 0.010704205837100744, "epoch": 2.88, "grad_norm": 2.6047918796539307, "kl": 0.556640625, "learning_rate": 8.177329330524181e-07, "loss": 0.0003689592704176903, "memory(GiB)": 18.17, "step": 144, "train_speed(iter/s)": 0.09474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.5, "completions/mean_length": 147.65234375, "completions/min_length": 84.0, "epoch": 2.9, "grad_norm": 2.0444202423095703, "kl": 0.4521484375, "learning_rate": 8.152763335422612e-07, "loss": 0.009064443409442902, "memory(GiB)": 18.17, "reward": 0.38259103894233704, "reward_std": 0.023838728666305542, "rewards/MCQ_Reward/mean": 0.38259103894233704, "rewards/MCQ_Reward/std": 0.0847747940570116, "step": 145, "train_speed(iter/s)": 0.09459 }, { "clip_ratio": 0.013846603687852621, "epoch": 2.92, "grad_norm": 3.0148403644561768, "kl": 0.47265625, "learning_rate": 8.128070347473608e-07, "loss": 0.008937995880842209, "memory(GiB)": 18.17, "step": 146, "train_speed(iter/s)": 0.095167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 131.4609375, "completions/min_length": 58.5, "epoch": 2.94, "grad_norm": 2.3035802841186523, "kl": 0.515625, "learning_rate": 8.103251361307118e-07, "loss": -0.003920593298971653, "memory(GiB)": 18.17, "reward": 0.46591490507125854, "reward_std": 0.02803555503487587, "rewards/MCQ_Reward/mean": 0.46591490507125854, "rewards/MCQ_Reward/std": 0.08151933178305626, "step": 147, "train_speed(iter/s)": 0.095144 }, { "clip_ratio": 0.008604592643678188, "epoch": 2.96, "grad_norm": 3.269644021987915, "kl": 0.498046875, "learning_rate": 8.07830737662829e-07, "loss": -0.004623805172741413, "memory(GiB)": 18.17, "step": 148, "train_speed(iter/s)": 0.095712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/mean_length": 115.5859375, "completions/min_length": 47.5, "epoch": 2.98, "grad_norm": 2.762554883956909, "kl": 0.55859375, "learning_rate": 8.053239398177191e-07, "loss": -0.002270375844091177, "memory(GiB)": 18.17, "reward": 0.40475866198539734, "reward_std": 0.02323055360466242, "rewards/MCQ_Reward/mean": 0.40475866198539734, "rewards/MCQ_Reward/std": 0.11423858627676964, "step": 149, "train_speed(iter/s)": 0.095646 }, { "clip_ratio": 0.005962205119431019, "epoch": 3.0, "grad_norm": 2.495875358581543, "kl": 0.5625, "learning_rate": 8.028048435688333e-07, "loss": -0.0031687067821621895, "memory(GiB)": 18.17, "step": 150, "train_speed(iter/s)": 0.0962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.5, "completions/mean_length": 117.21484375, "completions/min_length": 58.0, "epoch": 3.02, "grad_norm": 3.30179762840271, "kl": 0.572265625, "learning_rate": 8.002735503850015e-07, "loss": -0.0032917922362685204, "memory(GiB)": 18.17, "reward": 0.39226125180721283, "reward_std": 0.025511370040476322, "rewards/MCQ_Reward/mean": 0.39226125180721283, "rewards/MCQ_Reward/std": 0.08468513377010822, "step": 151, "train_speed(iter/s)": 0.095897 }, { "clip_ratio": 0.007298078387975693, "epoch": 3.04, "grad_norm": 2.3152873516082764, "kl": 0.56640625, "learning_rate": 7.97730162226344e-07, "loss": -0.004036391619592905, "memory(GiB)": 18.17, "step": 152, "train_speed(iter/s)": 0.096461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 121.8984375, "completions/min_length": 63.5, "epoch": 3.06, "grad_norm": 2.2318758964538574, "kl": 0.51171875, "learning_rate": 7.951747815401649e-07, "loss": 0.008308425545692444, "memory(GiB)": 18.17, "reward": 0.425733745098114, "reward_std": 0.02289827074855566, "rewards/MCQ_Reward/mean": 0.425733745098114, "rewards/MCQ_Reward/std": 0.12863966077566147, "step": 153, "train_speed(iter/s)": 0.096546 }, { "clip_ratio": 0.009599440731108189, "epoch": 3.08, "grad_norm": 3.2350826263427734, "kl": 0.5009765625, "learning_rate": 7.926075112568258e-07, "loss": 0.00774328364059329, "memory(GiB)": 18.17, "step": 154, "train_speed(iter/s)": 0.0971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.5, "completions/mean_length": 129.765625, "completions/min_length": 63.5, "epoch": 3.1, "grad_norm": 2.8958089351654053, "kl": 0.5146484375, "learning_rate": 7.900284547855991e-07, "loss": 0.005472003482282162, "memory(GiB)": 18.17, "reward": 0.3814770430326462, "reward_std": 0.021100854501128197, "rewards/MCQ_Reward/mean": 0.3814770430326462, "rewards/MCQ_Reward/std": 0.08354593068361282, "step": 155, "train_speed(iter/s)": 0.096733 }, { "clip_ratio": 0.008797692600637674, "epoch": 3.12, "grad_norm": 2.330720901489258, "kl": 0.5107421875, "learning_rate": 7.874377160105036e-07, "loss": 0.00483354227617383, "memory(GiB)": 18.17, "step": 156, "train_speed(iter/s)": 0.097282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/mean_length": 123.1640625, "completions/min_length": 68.0, "epoch": 3.14, "grad_norm": 2.1395411491394043, "kl": 0.515625, "learning_rate": 7.848353992861194e-07, "loss": 0.009709931910037994, "memory(GiB)": 18.17, "reward": 0.4426523745059967, "reward_std": 0.024569914676249027, "rewards/MCQ_Reward/mean": 0.4426523745059967, "rewards/MCQ_Reward/std": 0.10452848672866821, "step": 157, "train_speed(iter/s)": 0.097277 }, { "clip_ratio": 0.008177514653652906, "epoch": 3.16, "grad_norm": 2.8377902507781982, "kl": 0.49609375, "learning_rate": 7.822216094333847e-07, "loss": 0.00888834334909916, "memory(GiB)": 18.17, "step": 158, "train_speed(iter/s)": 0.097824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.5, "completions/mean_length": 121.08203125, "completions/min_length": 59.0, "epoch": 3.18, "grad_norm": 2.439819574356079, "kl": 0.5009765625, "learning_rate": 7.795964517353733e-07, "loss": -0.005721232853829861, "memory(GiB)": 18.17, "reward": 0.4260745346546173, "reward_std": 0.024243751540780067, "rewards/MCQ_Reward/mean": 0.4260745346546173, "rewards/MCQ_Reward/std": 0.08284034207463264, "step": 159, "train_speed(iter/s)": 0.09781 }, { "clip_ratio": 0.006790396990254521, "epoch": 3.2, "grad_norm": 1.9817484617233276, "kl": 0.4970703125, "learning_rate": 7.769600319330552e-07, "loss": -0.006797813344746828, "memory(GiB)": 18.17, "step": 160, "train_speed(iter/s)": 0.098355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.5, "completions/mean_length": 112.234375, "completions/min_length": 54.0, "epoch": 3.22, "grad_norm": 2.4277918338775635, "kl": 0.60546875, "learning_rate": 7.743124562210351e-07, "loss": 0.011250641196966171, "memory(GiB)": 18.17, "reward": 0.4286917597055435, "reward_std": 0.023968255147337914, "rewards/MCQ_Reward/mean": 0.4286917597055435, "rewards/MCQ_Reward/std": 0.08755803853273392, "step": 161, "train_speed(iter/s)": 0.097905 }, { "clip_ratio": 0.008228898979723454, "epoch": 3.24, "grad_norm": 2.4396235942840576, "kl": 0.63671875, "learning_rate": 7.716538312432765e-07, "loss": 0.009992354549467564, "memory(GiB)": 18.17, "step": 162, "train_speed(iter/s)": 0.098438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/mean_length": 128.484375, "completions/min_length": 65.5, "epoch": 3.26, "grad_norm": 2.378303289413452, "kl": 0.4560546875, "learning_rate": 7.689842640888063e-07, "loss": 0.014578643254935741, "memory(GiB)": 18.17, "reward": 0.4368235617876053, "reward_std": 0.024292019195854664, "rewards/MCQ_Reward/mean": 0.4368235617876053, "rewards/MCQ_Reward/std": 0.10128979757428169, "step": 163, "train_speed(iter/s)": 0.098485 }, { "clip_ratio": 0.006144619081169367, "epoch": 3.2800000000000002, "grad_norm": 2.336179733276367, "kl": 0.455078125, "learning_rate": 7.663038622873999e-07, "loss": 0.014264167286455631, "memory(GiB)": 18.17, "step": 164, "train_speed(iter/s)": 0.09902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/mean_length": 127.5546875, "completions/min_length": 68.0, "epoch": 3.3, "grad_norm": 2.3888978958129883, "kl": 0.51953125, "learning_rate": 7.636127338052511e-07, "loss": 0.0008876635693013668, "memory(GiB)": 18.17, "reward": 0.3655773550271988, "reward_std": 0.023151511326432228, "rewards/MCQ_Reward/mean": 0.3655773550271988, "rewards/MCQ_Reward/std": 0.08209535107016563, "step": 165, "train_speed(iter/s)": 0.099067 }, { "clip_ratio": 0.009708862751722336, "epoch": 3.32, "grad_norm": 2.849376678466797, "kl": 0.53515625, "learning_rate": 7.60910987040623e-07, "loss": 0.0005215085111558437, "memory(GiB)": 18.17, "step": 166, "train_speed(iter/s)": 0.099591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 114.09375, "completions/min_length": 68.5, "epoch": 3.34, "grad_norm": 2.3568837642669678, "kl": 0.568359375, "learning_rate": 7.581987308194809e-07, "loss": 0.009412365034222603, "memory(GiB)": 18.17, "reward": 0.38831935822963715, "reward_std": 0.024401471950113773, "rewards/MCQ_Reward/mean": 0.38831935822963715, "rewards/MCQ_Reward/std": 0.07682501710951328, "step": 167, "train_speed(iter/s)": 0.099643 }, { "clip_ratio": 0.009874043520539999, "epoch": 3.36, "grad_norm": 4.141200542449951, "kl": 0.548828125, "learning_rate": 7.554760743911103e-07, "loss": 0.008638818748295307, "memory(GiB)": 18.17, "step": 168, "train_speed(iter/s)": 0.100139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.5, "completions/mean_length": 116.015625, "completions/min_length": 68.0, "epoch": 3.38, "grad_norm": 2.3995447158813477, "kl": 0.5390625, "learning_rate": 7.527431274237149e-07, "loss": 0.009148918092250824, "memory(GiB)": 18.17, "reward": 0.43169474601745605, "reward_std": 0.023636899888515472, "rewards/MCQ_Reward/mean": 0.43169474601745605, "rewards/MCQ_Reward/std": 0.08781928941607475, "step": 169, "train_speed(iter/s)": 0.100207 }, { "clip_ratio": 0.011634313501417637, "epoch": 3.4, "grad_norm": 3.3103132247924805, "kl": 0.580078125, "learning_rate": 7.5e-07, "loss": 0.008654891513288021, "memory(GiB)": 18.17, "step": 170, "train_speed(iter/s)": 0.100725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.5, "completions/mean_length": 116.5859375, "completions/min_length": 61.0, "epoch": 3.42, "grad_norm": 2.4376144409179688, "kl": 0.51171875, "learning_rate": 7.472468026127384e-07, "loss": 0.0037187309935688972, "memory(GiB)": 18.17, "reward": 0.4193449318408966, "reward_std": 0.024272997863590717, "rewards/MCQ_Reward/mean": 0.4193449318408966, "rewards/MCQ_Reward/std": 0.08024471625685692, "step": 171, "train_speed(iter/s)": 0.100337 }, { "clip_ratio": 0.004286584910005331, "epoch": 3.44, "grad_norm": 2.298527479171753, "kl": 0.501953125, "learning_rate": 7.444836461603194e-07, "loss": 0.0035052020102739334, "memory(GiB)": 18.17, "step": 172, "train_speed(iter/s)": 0.10083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/mean_length": 107.78515625, "completions/min_length": 54.0, "epoch": 3.46, "grad_norm": 2.706815004348755, "kl": 0.572265625, "learning_rate": 7.417106419422818e-07, "loss": 0.001836567185819149, "memory(GiB)": 18.17, "reward": 0.4373796284198761, "reward_std": 0.024632513523101807, "rewards/MCQ_Reward/mean": 0.4373796284198761, "rewards/MCQ_Reward/std": 0.10328296199440956, "step": 173, "train_speed(iter/s)": 0.100842 }, { "clip_ratio": 0.00837572431191802, "epoch": 3.48, "grad_norm": 2.7765517234802246, "kl": 0.55859375, "learning_rate": 7.389279016548316e-07, "loss": 0.0008762972429394722, "memory(GiB)": 18.17, "step": 174, "train_speed(iter/s)": 0.10133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.5, "completions/mean_length": 141.59375, "completions/min_length": 93.0, "epoch": 3.5, "grad_norm": 2.0208756923675537, "kl": 0.494140625, "learning_rate": 7.361355373863413e-07, "loss": -0.0017252122052013874, "memory(GiB)": 18.17, "reward": 0.4430805742740631, "reward_std": 0.023134860210120678, "rewards/MCQ_Reward/mean": 0.4430805742740631, "rewards/MCQ_Reward/std": 0.10230642557144165, "step": 175, "train_speed(iter/s)": 0.101269 }, { "clip_ratio": 0.008417821954935789, "epoch": 3.52, "grad_norm": 2.5541892051696777, "kl": 0.498046875, "learning_rate": 7.333336616128369e-07, "loss": -0.0020766020752489567, "memory(GiB)": 18.17, "step": 176, "train_speed(iter/s)": 0.101776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.5, "completions/mean_length": 140.26953125, "completions/min_length": 61.5, "epoch": 3.54, "grad_norm": 2.090574264526367, "kl": 0.455078125, "learning_rate": 7.305223871934656e-07, "loss": -0.004062575753778219, "memory(GiB)": 18.17, "reward": 0.4077337831258774, "reward_std": 0.021388554014265537, "rewards/MCQ_Reward/mean": 0.4077337831258774, "rewards/MCQ_Reward/std": 0.1092216707766056, "step": 177, "train_speed(iter/s)": 0.101717 }, { "clip_ratio": 0.009097482077777386, "epoch": 3.56, "grad_norm": 2.031277894973755, "kl": 0.4638671875, "learning_rate": 7.277018273659516e-07, "loss": -0.005147318355739117, "memory(GiB)": 18.17, "step": 178, "train_speed(iter/s)": 0.102192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.5, "completions/mean_length": 103.77734375, "completions/min_length": 56.0, "epoch": 3.58, "grad_norm": 2.28383731842041, "kl": 0.55078125, "learning_rate": 7.248720957420329e-07, "loss": 0.0054731229320168495, "memory(GiB)": 18.17, "reward": 0.37708504498004913, "reward_std": 0.022474835626780987, "rewards/MCQ_Reward/mean": 0.37708504498004913, "rewards/MCQ_Reward/std": 0.10817139223217964, "step": 179, "train_speed(iter/s)": 0.102207 }, { "clip_ratio": 0.005004609236493707, "epoch": 3.6, "grad_norm": 2.2720046043395996, "kl": 0.552734375, "learning_rate": 7.220333063028871e-07, "loss": 0.004853987134993076, "memory(GiB)": 18.17, "step": 180, "train_speed(iter/s)": 0.10258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/mean_length": 135.375, "completions/min_length": 64.5, "epoch": 3.62, "grad_norm": 2.0278213024139404, "kl": 0.537109375, "learning_rate": 7.191855733945386e-07, "loss": 0.007204895373433828, "memory(GiB)": 18.17, "reward": 0.37996095418930054, "reward_std": 0.024972867220640182, "rewards/MCQ_Reward/mean": 0.37996095418930054, "rewards/MCQ_Reward/std": 0.06211347132921219, "step": 181, "train_speed(iter/s)": 0.102022 }, { "clip_ratio": 0.0050066676922142506, "epoch": 3.64, "grad_norm": 2.026421308517456, "kl": 0.54296875, "learning_rate": 7.163290117232541e-07, "loss": 0.006550833582878113, "memory(GiB)": 18.17, "step": 182, "train_speed(iter/s)": 0.102515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 132.80078125, "completions/min_length": 70.0, "epoch": 3.66, "grad_norm": 2.322474479675293, "kl": 0.4560546875, "learning_rate": 7.134637363509209e-07, "loss": 0.00408747885376215, "memory(GiB)": 18.17, "reward": 0.42590010166168213, "reward_std": 0.02117757499217987, "rewards/MCQ_Reward/mean": 0.42590010166168213, "rewards/MCQ_Reward/std": 0.10450495779514313, "step": 183, "train_speed(iter/s)": 0.102439 }, { "clip_ratio": 0.005717001855373383, "epoch": 3.68, "grad_norm": 2.0725347995758057, "kl": 0.4501953125, "learning_rate": 7.105898626904134e-07, "loss": 0.003590245731174946, "memory(GiB)": 18.17, "step": 184, "train_speed(iter/s)": 0.10291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 107.73828125, "completions/min_length": 67.5, "epoch": 3.7, "grad_norm": 2.94624662399292, "kl": 0.578125, "learning_rate": 7.077075065009433e-07, "loss": -0.0015533820260316133, "memory(GiB)": 18.17, "reward": 0.4082287549972534, "reward_std": 0.023994137533009052, "rewards/MCQ_Reward/mean": 0.4082287549972534, "rewards/MCQ_Reward/std": 0.09996674209833145, "step": 185, "train_speed(iter/s)": 0.102951 }, { "clip_ratio": 0.006125608924776316, "epoch": 3.7199999999999998, "grad_norm": 2.3971669673919678, "kl": 0.572265625, "learning_rate": 7.048167838833976e-07, "loss": -0.0021633533760905266, "memory(GiB)": 18.17, "step": 186, "train_speed(iter/s)": 0.103425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.5, "completions/mean_length": 131.453125, "completions/min_length": 59.0, "epoch": 3.74, "grad_norm": 2.0767407417297363, "kl": 0.513671875, "learning_rate": 7.019178112756625e-07, "loss": 0.005040531512349844, "memory(GiB)": 18.17, "reward": 0.43931877613067627, "reward_std": 0.02542781364172697, "rewards/MCQ_Reward/mean": 0.43931877613067627, "rewards/MCQ_Reward/std": 0.0755577739328146, "step": 187, "train_speed(iter/s)": 0.103367 }, { "clip_ratio": 0.007456609280779958, "epoch": 3.76, "grad_norm": 2.0555458068847656, "kl": 0.513671875, "learning_rate": 6.990107054479312e-07, "loss": 0.004873338155448437, "memory(GiB)": 18.17, "step": 188, "train_speed(iter/s)": 0.103852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/mean_length": 120.015625, "completions/min_length": 56.0, "epoch": 3.7800000000000002, "grad_norm": 2.1511483192443848, "kl": 0.546875, "learning_rate": 6.960955834980027e-07, "loss": -0.007258214056491852, "memory(GiB)": 18.17, "reward": 0.3652060180902481, "reward_std": 0.023877170868217945, "rewards/MCQ_Reward/mean": 0.3652060180902481, "rewards/MCQ_Reward/std": 0.09329301491379738, "step": 189, "train_speed(iter/s)": 0.103851 }, { "clip_ratio": 0.006274498999118805, "epoch": 3.8, "grad_norm": 2.204212188720703, "kl": 0.5546875, "learning_rate": 6.931725628465642e-07, "loss": -0.0077828834764659405, "memory(GiB)": 18.17, "step": 190, "train_speed(iter/s)": 0.104325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.5, "completions/mean_length": 119.02734375, "completions/min_length": 68.0, "epoch": 3.82, "grad_norm": 2.489328384399414, "kl": 0.5625, "learning_rate": 6.902417612324615e-07, "loss": -0.004156440030783415, "memory(GiB)": 18.17, "reward": 0.41069237887859344, "reward_std": 0.02522939257323742, "rewards/MCQ_Reward/mean": 0.41069237887859344, "rewards/MCQ_Reward/std": 0.10438777878880501, "step": 191, "train_speed(iter/s)": 0.103961 }, { "clip_ratio": 0.006902764085680246, "epoch": 3.84, "grad_norm": 2.573939085006714, "kl": 0.53125, "learning_rate": 6.87303296707956e-07, "loss": -0.004263042006641626, "memory(GiB)": 18.17, "step": 192, "train_speed(iter/s)": 0.104434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 119.2109375, "completions/min_length": 63.5, "epoch": 3.86, "grad_norm": 2.4605846405029297, "kl": 0.537109375, "learning_rate": 6.843572876339704e-07, "loss": -0.006107931490987539, "memory(GiB)": 18.17, "reward": 0.41506680846214294, "reward_std": 0.025901762768626213, "rewards/MCQ_Reward/mean": 0.41506680846214294, "rewards/MCQ_Reward/std": 0.11812347918748856, "step": 193, "train_speed(iter/s)": 0.104435 }, { "clip_ratio": 0.006947604939341545, "epoch": 3.88, "grad_norm": 2.9201459884643555, "kl": 0.533203125, "learning_rate": 6.814038526753204e-07, "loss": -0.006667410954833031, "memory(GiB)": 18.17, "step": 194, "train_speed(iter/s)": 0.104911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 123.375, "completions/min_length": 58.5, "epoch": 3.9, "grad_norm": 2.481006145477295, "kl": 0.638671875, "learning_rate": 6.784431107959358e-07, "loss": -0.00256272591650486, "memory(GiB)": 18.17, "reward": 0.4147709757089615, "reward_std": 0.023487260565161705, "rewards/MCQ_Reward/mean": 0.4147709757089615, "rewards/MCQ_Reward/std": 0.08765164762735367, "step": 195, "train_speed(iter/s)": 0.104938 }, { "clip_ratio": 0.00836537522263825, "epoch": 3.92, "grad_norm": 2.211996078491211, "kl": 0.62109375, "learning_rate": 6.754751812540679e-07, "loss": -0.0026485356502234936, "memory(GiB)": 18.17, "step": 196, "train_speed(iter/s)": 0.105375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/mean_length": 113.68359375, "completions/min_length": 58.0, "epoch": 3.94, "grad_norm": 2.5469682216644287, "kl": 0.556640625, "learning_rate": 6.725001835974852e-07, "loss": -0.005141774192452431, "memory(GiB)": 18.17, "reward": 0.39422211050987244, "reward_std": 0.022977779619395733, "rewards/MCQ_Reward/mean": 0.39422211050987244, "rewards/MCQ_Reward/std": 0.09659452736377716, "step": 197, "train_speed(iter/s)": 0.105428 }, { "clip_ratio": 0.007515270030125976, "epoch": 3.96, "grad_norm": 2.603193998336792, "kl": 0.57421875, "learning_rate": 6.695182376586602e-07, "loss": -0.00558980368077755, "memory(GiB)": 18.17, "step": 198, "train_speed(iter/s)": 0.105897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 124.140625, "completions/min_length": 66.5, "epoch": 3.98, "grad_norm": 2.8109734058380127, "kl": 0.5703125, "learning_rate": 6.665294635499403e-07, "loss": -0.008472483605146408, "memory(GiB)": 18.17, "reward": 0.3954710364341736, "reward_std": 0.026893282309174538, "rewards/MCQ_Reward/mean": 0.3954710364341736, "rewards/MCQ_Reward/std": 0.07466300576925278, "step": 199, "train_speed(iter/s)": 0.10569 }, { "clip_ratio": 0.007555491756647825, "epoch": 4.0, "grad_norm": 3.981370687484741, "kl": 0.5625, "learning_rate": 6.635339816587108e-07, "loss": -0.008467345498502254, "memory(GiB)": 18.17, "step": 200, "train_speed(iter/s)": 0.106122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/mean_length": 114.1640625, "completions/min_length": 67.0, "epoch": 4.02, "grad_norm": 3.464586019515991, "kl": 1.001953125, "learning_rate": 6.605319126425453e-07, "loss": 0.010952511802315712, "memory(GiB)": 18.17, "reward": 0.4330308884382248, "reward_std": 0.022406785748898983, "rewards/MCQ_Reward/mean": 0.4330308884382248, "rewards/MCQ_Reward/std": 0.09031685814261436, "step": 201, "train_speed(iter/s)": 0.10573 }, { "clip_ratio": 0.010695958975702524, "epoch": 4.04, "grad_norm": 3.2848002910614014, "kl": 1.3125, "learning_rate": 6.575233774243464e-07, "loss": 0.010859224945306778, "memory(GiB)": 18.17, "step": 202, "train_speed(iter/s)": 0.106187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.5, "completions/mean_length": 115.0625, "completions/min_length": 64.5, "epoch": 4.06, "grad_norm": 2.5354137420654297, "kl": 0.521484375, "learning_rate": 6.545084971874736e-07, "loss": 0.008116345852613449, "memory(GiB)": 18.17, "reward": 0.4043910503387451, "reward_std": 0.023216267116367817, "rewards/MCQ_Reward/mean": 0.4043910503387451, "rewards/MCQ_Reward/std": 0.09529644250869751, "step": 203, "train_speed(iter/s)": 0.106255 }, { "clip_ratio": 0.005409660283476114, "epoch": 4.08, "grad_norm": 2.4091176986694336, "kl": 0.52734375, "learning_rate": 6.514873933708637e-07, "loss": 0.007959958165884018, "memory(GiB)": 18.17, "step": 204, "train_speed(iter/s)": 0.10667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.5, "completions/mean_length": 103.03515625, "completions/min_length": 53.0, "epoch": 4.1, "grad_norm": 2.983665704727173, "kl": 0.62109375, "learning_rate": 6.484601876641375e-07, "loss": -0.014035141095519066, "memory(GiB)": 18.17, "reward": 0.4240594506263733, "reward_std": 0.025937434285879135, "rewards/MCQ_Reward/mean": 0.4240594506263733, "rewards/MCQ_Reward/std": 0.07473786175251007, "step": 205, "train_speed(iter/s)": 0.106723 }, { "clip_ratio": 0.018164899200201035, "epoch": 4.12, "grad_norm": 6.4920454025268555, "kl": 0.5859375, "learning_rate": 6.454270020026995e-07, "loss": -0.013708272948861122, "memory(GiB)": 18.17, "step": 206, "train_speed(iter/s)": 0.107162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/mean_length": 129.375, "completions/min_length": 58.5, "epoch": 4.14, "grad_norm": 2.714660882949829, "kl": 0.5625, "learning_rate": 6.423879585628261e-07, "loss": -0.014167927205562592, "memory(GiB)": 18.17, "reward": 0.396339014172554, "reward_std": 0.02192540653049946, "rewards/MCQ_Reward/mean": 0.396339014172554, "rewards/MCQ_Reward/std": 0.11277944594621658, "step": 207, "train_speed(iter/s)": 0.106875 }, { "clip_ratio": 0.007178165018558502, "epoch": 4.16, "grad_norm": 2.4650375843048096, "kl": 0.560546875, "learning_rate": 6.393431797567439e-07, "loss": -0.014689125120639801, "memory(GiB)": 18.17, "step": 208, "train_speed(iter/s)": 0.107325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 131.01953125, "completions/min_length": 64.5, "epoch": 4.18, "grad_norm": 2.1339519023895264, "kl": 0.58203125, "learning_rate": 6.362927882276989e-07, "loss": -0.017007270827889442, "memory(GiB)": 18.17, "reward": 0.42686355113983154, "reward_std": 0.023915644735097885, "rewards/MCQ_Reward/mean": 0.42686355113983154, "rewards/MCQ_Reward/std": 0.10529575496912003, "step": 209, "train_speed(iter/s)": 0.107141 }, { "clip_ratio": 0.005084275268018246, "epoch": 4.2, "grad_norm": 2.0464680194854736, "kl": 0.59375, "learning_rate": 6.332369068450174e-07, "loss": -0.0175747312605381, "memory(GiB)": 18.17, "step": 210, "train_speed(iter/s)": 0.107586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.5, "completions/mean_length": 116.63671875, "completions/min_length": 61.5, "epoch": 4.22, "grad_norm": 2.4869492053985596, "kl": 0.544921875, "learning_rate": 6.30175658699156e-07, "loss": -0.0016960185021162033, "memory(GiB)": 18.17, "reward": 0.43242450058460236, "reward_std": 0.02396441251039505, "rewards/MCQ_Reward/mean": 0.43242450058460236, "rewards/MCQ_Reward/std": 0.07406600937247276, "step": 211, "train_speed(iter/s)": 0.107182 }, { "clip_ratio": 0.006936221849173307, "epoch": 4.24, "grad_norm": 2.2954320907592773, "kl": 0.5390625, "learning_rate": 6.271091670967436e-07, "loss": -0.001955235842615366, "memory(GiB)": 18.17, "step": 212, "train_speed(iter/s)": 0.10762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.5, "completions/mean_length": 132.296875, "completions/min_length": 90.0, "epoch": 4.26, "grad_norm": 2.5567421913146973, "kl": 0.548828125, "learning_rate": 6.240375555556145e-07, "loss": -0.010683618485927582, "memory(GiB)": 18.17, "reward": 0.3712979108095169, "reward_std": 0.022392110899090767, "rewards/MCQ_Reward/mean": 0.3712979108095169, "rewards/MCQ_Reward/std": 0.0758376233279705, "step": 213, "train_speed(iter/s)": 0.107578 }, { "clip_ratio": 0.01051389379426837, "epoch": 4.28, "grad_norm": 3.9029605388641357, "kl": 0.529296875, "learning_rate": 6.209609477998338e-07, "loss": -0.010750237852334976, "memory(GiB)": 18.17, "step": 214, "train_speed(iter/s)": 0.108018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.5, "completions/mean_length": 117.71875, "completions/min_length": 60.5, "epoch": 4.3, "grad_norm": 2.3913040161132812, "kl": 0.6015625, "learning_rate": 6.178794677547137e-07, "loss": -0.012967615388333797, "memory(GiB)": 18.17, "reward": 0.3914954960346222, "reward_std": 0.021691203117370605, "rewards/MCQ_Reward/mean": 0.3914954960346222, "rewards/MCQ_Reward/std": 0.10047328472137451, "step": 215, "train_speed(iter/s)": 0.108034 }, { "clip_ratio": 0.005430733785033226, "epoch": 4.32, "grad_norm": 2.3732998371124268, "kl": 0.61328125, "learning_rate": 6.147932395418205e-07, "loss": -0.013309886679053307, "memory(GiB)": 18.17, "step": 216, "train_speed(iter/s)": 0.108474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/mean_length": 123.4765625, "completions/min_length": 65.0, "epoch": 4.34, "grad_norm": 2.7147343158721924, "kl": 0.552734375, "learning_rate": 6.117023874739771e-07, "loss": -0.0006074332632124424, "memory(GiB)": 18.17, "reward": 0.4220256060361862, "reward_std": 0.0257421238347888, "rewards/MCQ_Reward/mean": 0.4220256060361862, "rewards/MCQ_Reward/std": 0.12063978612422943, "step": 217, "train_speed(iter/s)": 0.10841 }, { "clip_ratio": 0.006779439281672239, "epoch": 4.36, "grad_norm": 2.3169238567352295, "kl": 0.544921875, "learning_rate": 6.086070360502539e-07, "loss": -0.0006955214776098728, "memory(GiB)": 18.17, "step": 218, "train_speed(iter/s)": 0.108822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.5, "completions/mean_length": 116.0625, "completions/min_length": 53.5, "epoch": 4.38, "grad_norm": 2.7408437728881836, "kl": 0.615234375, "learning_rate": 6.055073099509549e-07, "loss": -0.007178765721619129, "memory(GiB)": 18.17, "reward": 0.41480791568756104, "reward_std": 0.028133179992437363, "rewards/MCQ_Reward/mean": 0.41480791568756104, "rewards/MCQ_Reward/std": 0.1095062680542469, "step": 219, "train_speed(iter/s)": 0.108796 }, { "clip_ratio": 0.007214481011033058, "epoch": 4.4, "grad_norm": 2.457122802734375, "kl": 0.6171875, "learning_rate": 6.024033340325954e-07, "loss": -0.008253653533756733, "memory(GiB)": 18.17, "step": 220, "train_speed(iter/s)": 0.109227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/mean_length": 118.609375, "completions/min_length": 59.0, "epoch": 4.42, "grad_norm": 2.8679587841033936, "kl": 0.568359375, "learning_rate": 5.992952333228726e-07, "loss": 0.013627042062580585, "memory(GiB)": 18.17, "reward": 0.4350634217262268, "reward_std": 0.0218770457431674, "rewards/MCQ_Reward/mean": 0.4350634217262268, "rewards/MCQ_Reward/std": 0.07635831832885742, "step": 221, "train_speed(iter/s)": 0.108811 }, { "clip_ratio": 0.005678659770637751, "epoch": 4.44, "grad_norm": 2.187412738800049, "kl": 0.58203125, "learning_rate": 5.961831330156305e-07, "loss": 0.013213744387030602, "memory(GiB)": 18.17, "step": 222, "train_speed(iter/s)": 0.109221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 125.9765625, "completions/min_length": 48.5, "epoch": 4.46, "grad_norm": 3.5221126079559326, "kl": 0.587890625, "learning_rate": 5.93067158465815e-07, "loss": -0.0011408873833715916, "memory(GiB)": 18.17, "reward": 0.44135691225528717, "reward_std": 0.025366419926285744, "rewards/MCQ_Reward/mean": 0.44135691225528717, "rewards/MCQ_Reward/std": 0.07711124420166016, "step": 223, "train_speed(iter/s)": 0.109176 }, { "clip_ratio": 0.007937990361824632, "epoch": 4.48, "grad_norm": 2.513356924057007, "kl": 0.5703125, "learning_rate": 5.899474351844269e-07, "loss": -0.0011316398158669472, "memory(GiB)": 18.17, "step": 224, "train_speed(iter/s)": 0.109601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 120.234375, "completions/min_length": 54.0, "epoch": 4.5, "grad_norm": 2.853579044342041, "kl": 0.744140625, "learning_rate": 5.868240888334652e-07, "loss": -0.0010898616164922714, "memory(GiB)": 18.17, "reward": 0.41750770807266235, "reward_std": 0.024566995911300182, "rewards/MCQ_Reward/mean": 0.41750770807266235, "rewards/MCQ_Reward/std": 0.09383138827979565, "step": 225, "train_speed(iter/s)": 0.109546 }, { "clip_ratio": 0.012675716076046228, "epoch": 4.52, "grad_norm": 5.211337089538574, "kl": 0.658203125, "learning_rate": 5.836972452208654e-07, "loss": -0.001642034389078617, "memory(GiB)": 18.17, "step": 226, "train_speed(iter/s)": 0.109972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.5, "completions/mean_length": 126.68359375, "completions/min_length": 64.0, "epoch": 4.54, "grad_norm": 2.3116183280944824, "kl": 0.505859375, "learning_rate": 5.805670302954321e-07, "loss": 0.017429981380701065, "memory(GiB)": 18.17, "reward": 0.41671665012836456, "reward_std": 0.02627546712756157, "rewards/MCQ_Reward/mean": 0.41671665012836456, "rewards/MCQ_Reward/std": 0.09354511648416519, "step": 227, "train_speed(iter/s)": 0.109937 }, { "clip_ratio": 0.005898691713809967, "epoch": 4.5600000000000005, "grad_norm": 2.306483507156372, "kl": 0.5087890625, "learning_rate": 5.774335701417662e-07, "loss": 0.016744598746299744, "memory(GiB)": 18.17, "step": 228, "train_speed(iter/s)": 0.110353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.5, "completions/mean_length": 124.5546875, "completions/min_length": 61.0, "epoch": 4.58, "grad_norm": 2.3084402084350586, "kl": 0.552734375, "learning_rate": 5.742969909751858e-07, "loss": -0.009621858596801758, "memory(GiB)": 18.17, "reward": 0.45828977227211, "reward_std": 0.023471640422940254, "rewards/MCQ_Reward/mean": 0.45828977227211, "rewards/MCQ_Reward/std": 0.09269878640770912, "step": 229, "train_speed(iter/s)": 0.110326 }, { "clip_ratio": 0.005610911408439279, "epoch": 4.6, "grad_norm": 2.163801431655884, "kl": 0.552734375, "learning_rate": 5.711574191366427e-07, "loss": -0.010531945154070854, "memory(GiB)": 18.17, "step": 230, "train_speed(iter/s)": 0.110743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.5, "completions/mean_length": 116.93359375, "completions/min_length": 62.5, "epoch": 4.62, "grad_norm": 3.1812872886657715, "kl": 2.26171875, "learning_rate": 5.680149810876322e-07, "loss": 0.006941274274140596, "memory(GiB)": 18.17, "reward": 0.45568907260894775, "reward_std": 0.023496804758906364, "rewards/MCQ_Reward/mean": 0.45568907260894775, "rewards/MCQ_Reward/std": 0.09556515514850616, "step": 231, "train_speed(iter/s)": 0.110377 }, { "clip_ratio": 0.006443677702918649, "epoch": 4.64, "grad_norm": 2.733854293823242, "kl": 2.2734375, "learning_rate": 5.648698034051008e-07, "loss": 0.006462510209530592, "memory(GiB)": 18.17, "step": 232, "train_speed(iter/s)": 0.110787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 133.1015625, "completions/min_length": 70.5, "epoch": 4.66, "grad_norm": 2.4281585216522217, "kl": 0.55859375, "learning_rate": 5.617220127763474e-07, "loss": 0.013438165187835693, "memory(GiB)": 18.17, "reward": 0.43506887555122375, "reward_std": 0.025797616690397263, "rewards/MCQ_Reward/mean": 0.43506887555122375, "rewards/MCQ_Reward/std": 0.09859243780374527, "step": 233, "train_speed(iter/s)": 0.110691 }, { "clip_ratio": 0.0072706313803792, "epoch": 4.68, "grad_norm": 2.526357889175415, "kl": 0.55859375, "learning_rate": 5.585717359939192e-07, "loss": 0.012631012126803398, "memory(GiB)": 18.17, "step": 234, "train_speed(iter/s)": 0.111101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.5, "completions/mean_length": 133.1328125, "completions/min_length": 57.0, "epoch": 4.7, "grad_norm": 2.639338731765747, "kl": 0.552734375, "learning_rate": 5.554190999505055e-07, "loss": -0.008054563775658607, "memory(GiB)": 18.17, "reward": 0.40963128209114075, "reward_std": 0.024876238778233528, "rewards/MCQ_Reward/mean": 0.40963128209114075, "rewards/MCQ_Reward/std": 0.06643268279731274, "step": 235, "train_speed(iter/s)": 0.111027 }, { "clip_ratio": 0.008271400351077318, "epoch": 4.72, "grad_norm": 2.7264564037323, "kl": 0.568359375, "learning_rate": 5.522642316338268e-07, "loss": -0.008453292772173882, "memory(GiB)": 18.17, "step": 236, "train_speed(iter/s)": 0.111434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 123.84765625, "completions/min_length": 65.5, "epoch": 4.74, "grad_norm": 2.405317544937134, "kl": 0.5400390625, "learning_rate": 5.491072581215186e-07, "loss": 0.00114892004057765, "memory(GiB)": 18.17, "reward": 0.4337426722049713, "reward_std": 0.020247386768460274, "rewards/MCQ_Reward/mean": 0.4337426722049713, "rewards/MCQ_Reward/std": 0.07973705604672432, "step": 237, "train_speed(iter/s)": 0.111369 }, { "clip_ratio": 0.006459691561758518, "epoch": 4.76, "grad_norm": 2.8662662506103516, "kl": 0.5400390625, "learning_rate": 5.459483065760138e-07, "loss": 0.0009391154162585735, "memory(GiB)": 18.17, "step": 238, "train_speed(iter/s)": 0.111775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 133.96875, "completions/min_length": 75.0, "epoch": 4.78, "grad_norm": 2.400651216506958, "kl": 0.5078125, "learning_rate": 5.427875042394199e-07, "loss": 0.002962369006127119, "memory(GiB)": 18.17, "reward": 0.4192984253168106, "reward_std": 0.023103597573935986, "rewards/MCQ_Reward/mean": 0.4192984253168106, "rewards/MCQ_Reward/std": 0.08515846729278564, "step": 239, "train_speed(iter/s)": 0.11166 }, { "clip_ratio": 0.00794414198026061, "epoch": 4.8, "grad_norm": 3.1118853092193604, "kl": 0.5029296875, "learning_rate": 5.396249784283942e-07, "loss": 0.0026899795047938824, "memory(GiB)": 18.17, "step": 240, "train_speed(iter/s)": 0.112066 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.5, "completions/mean_length": 114.4609375, "completions/min_length": 47.5, "epoch": 4.82, "grad_norm": 2.5313034057617188, "kl": 0.5390625, "learning_rate": 5.364608565290154e-07, "loss": -0.0074430471286177635, "memory(GiB)": 18.17, "reward": 0.4074428677558899, "reward_std": 0.02112921793013811, "rewards/MCQ_Reward/mean": 0.4074428677558899, "rewards/MCQ_Reward/std": 0.07994595915079117, "step": 241, "train_speed(iter/s)": 0.111745 }, { "clip_ratio": 0.007256179815158248, "epoch": 4.84, "grad_norm": 2.768711566925049, "kl": 0.5625, "learning_rate": 5.33295265991652e-07, "loss": -0.0077315750531852245, "memory(GiB)": 18.17, "step": 242, "train_speed(iter/s)": 0.112147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/mean_length": 115.47265625, "completions/min_length": 67.5, "epoch": 4.86, "grad_norm": 2.561013698577881, "kl": 0.57421875, "learning_rate": 5.301283343258292e-07, "loss": -0.0039140088483691216, "memory(GiB)": 18.17, "reward": 0.42967718839645386, "reward_std": 0.020259867422282696, "rewards/MCQ_Reward/mean": 0.42967718839645386, "rewards/MCQ_Reward/std": 0.09365658834576607, "step": 243, "train_speed(iter/s)": 0.112166 }, { "clip_ratio": 0.008353757206350565, "epoch": 4.88, "grad_norm": 3.9286372661590576, "kl": 0.560546875, "learning_rate": 5.26960189095093e-07, "loss": -0.003905682824552059, "memory(GiB)": 18.17, "step": 244, "train_speed(iter/s)": 0.112566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 130.84375, "completions/min_length": 77.0, "epoch": 4.9, "grad_norm": 2.3792028427124023, "kl": 0.515625, "learning_rate": 5.237909579118712e-07, "loss": 0.0075805773958563805, "memory(GiB)": 18.17, "reward": 0.37578998506069183, "reward_std": 0.022264255210757256, "rewards/MCQ_Reward/mean": 0.37578998506069183, "rewards/MCQ_Reward/std": 0.09643128886818886, "step": 245, "train_speed(iter/s)": 0.112504 }, { "clip_ratio": 0.006022685440257192, "epoch": 4.92, "grad_norm": 2.490131378173828, "kl": 0.501953125, "learning_rate": 5.206207684323335e-07, "loss": 0.007525968365371227, "memory(GiB)": 18.17, "step": 246, "train_speed(iter/s)": 0.112901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.5, "completions/mean_length": 112.49609375, "completions/min_length": 62.5, "epoch": 4.9399999999999995, "grad_norm": 2.270827293395996, "kl": 0.580078125, "learning_rate": 5.174497483512505e-07, "loss": 0.011211629025638103, "memory(GiB)": 18.17, "reward": 0.39156346023082733, "reward_std": 0.02191222459077835, "rewards/MCQ_Reward/mean": 0.39156346023082733, "rewards/MCQ_Reward/std": 0.12107554450631142, "step": 247, "train_speed(iter/s)": 0.112883 }, { "clip_ratio": 0.006176856812089682, "epoch": 4.96, "grad_norm": 2.373053550720215, "kl": 0.57421875, "learning_rate": 5.142780253968481e-07, "loss": 0.010641951113939285, "memory(GiB)": 18.17, "step": 248, "train_speed(iter/s)": 0.11328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 131.14453125, "completions/min_length": 62.5, "epoch": 4.98, "grad_norm": 2.2482690811157227, "kl": 0.525390625, "learning_rate": 5.111057273256647e-07, "loss": 0.0050743343308568, "memory(GiB)": 18.17, "reward": 0.40770605206489563, "reward_std": 0.022150222212076187, "rewards/MCQ_Reward/mean": 0.40770605206489563, "rewards/MCQ_Reward/std": 0.11748149991035461, "step": 249, "train_speed(iter/s)": 0.113183 }, { "clip_ratio": 0.006638662423938513, "epoch": 5.0, "grad_norm": 2.2492520809173584, "kl": 0.5390625, "learning_rate": 5.07932981917404e-07, "loss": 0.004837746266275644, "memory(GiB)": 18.17, "step": 250, "train_speed(iter/s)": 0.113563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 125.765625, "completions/min_length": 68.5, "epoch": 5.02, "grad_norm": 2.556406259536743, "kl": 0.5078125, "learning_rate": 5.047599169697883e-07, "loss": 0.017076797783374786, "memory(GiB)": 18.17, "reward": 0.4466231018304825, "reward_std": 0.0222383551299572, "rewards/MCQ_Reward/mean": 0.4466231018304825, "rewards/MCQ_Reward/std": 0.11308542639017105, "step": 251, "train_speed(iter/s)": 0.113109 }, { "clip_ratio": 0.007436602842062712, "epoch": 5.04, "grad_norm": 2.0482616424560547, "kl": 0.515625, "learning_rate": 5.015866602934111e-07, "loss": 0.01610303670167923, "memory(GiB)": 18.17, "step": 252, "train_speed(iter/s)": 0.113475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/mean_length": 109.3359375, "completions/min_length": 65.0, "epoch": 5.06, "grad_norm": 2.6583385467529297, "kl": 0.6015625, "learning_rate": 4.984133397065888e-07, "loss": 0.005715301260352135, "memory(GiB)": 18.17, "reward": 0.3956441879272461, "reward_std": 0.02386545669287443, "rewards/MCQ_Reward/mean": 0.3956441879272461, "rewards/MCQ_Reward/std": 0.0772719755768776, "step": 253, "train_speed(iter/s)": 0.113471 }, { "clip_ratio": 0.006691478192806244, "epoch": 5.08, "grad_norm": 2.478234052658081, "kl": 0.5859375, "learning_rate": 4.952400830302116e-07, "loss": 0.00553365983068943, "memory(GiB)": 18.17, "step": 254, "train_speed(iter/s)": 0.113858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 144.1796875, "completions/min_length": 78.0, "epoch": 5.1, "grad_norm": 2.308807373046875, "kl": 0.5009765625, "learning_rate": 4.92067018082596e-07, "loss": -0.0058871605433523655, "memory(GiB)": 18.17, "reward": 0.4203776866197586, "reward_std": 0.022159602493047714, "rewards/MCQ_Reward/mean": 0.4203776866197586, "rewards/MCQ_Reward/std": 0.09526496008038521, "step": 255, "train_speed(iter/s)": 0.113761 }, { "clip_ratio": 0.007533560739830136, "epoch": 5.12, "grad_norm": 2.9820773601531982, "kl": 0.4921875, "learning_rate": 4.888942726743353e-07, "loss": -0.006009383127093315, "memory(GiB)": 18.17, "step": 256, "train_speed(iter/s)": 0.114127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.5, "completions/mean_length": 115.0078125, "completions/min_length": 65.0, "epoch": 5.14, "grad_norm": 2.3862602710723877, "kl": 0.57421875, "learning_rate": 4.857219746031519e-07, "loss": -0.010767871513962746, "memory(GiB)": 18.17, "reward": 0.43338486552238464, "reward_std": 0.025110138580203056, "rewards/MCQ_Reward/mean": 0.43338486552238464, "rewards/MCQ_Reward/std": 0.08122389577329159, "step": 257, "train_speed(iter/s)": 0.114083 }, { "clip_ratio": 0.005816203076392412, "epoch": 5.16, "grad_norm": 2.2391088008880615, "kl": 0.57421875, "learning_rate": 4.825502516487496e-07, "loss": -0.011337889358401299, "memory(GiB)": 18.17, "step": 258, "train_speed(iter/s)": 0.11446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/mean_length": 121.109375, "completions/min_length": 68.0, "epoch": 5.18, "grad_norm": 3.2198102474212646, "kl": 0.642578125, "learning_rate": 4.793792315676664e-07, "loss": -0.0017241109162569046, "memory(GiB)": 18.17, "reward": 0.41922956705093384, "reward_std": 0.02394416555762291, "rewards/MCQ_Reward/mean": 0.41922956705093384, "rewards/MCQ_Reward/std": 0.08786309324204922, "step": 259, "train_speed(iter/s)": 0.11433 }, { "clip_ratio": 0.008633819408714771, "epoch": 5.2, "grad_norm": 2.5045688152313232, "kl": 0.611328125, "learning_rate": 4.762090420881288e-07, "loss": -0.0024092746898531914, "memory(GiB)": 18.17, "step": 260, "train_speed(iter/s)": 0.11471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.5, "completions/mean_length": 121.0, "completions/min_length": 59.5, "epoch": 5.22, "grad_norm": 3.3788204193115234, "kl": 0.65625, "learning_rate": 4.7303981090490706e-07, "loss": 0.0016009537503123283, "memory(GiB)": 18.17, "reward": 0.4228467643260956, "reward_std": 0.02382771298289299, "rewards/MCQ_Reward/mean": 0.4228467643260956, "rewards/MCQ_Reward/std": 0.08922314271330833, "step": 261, "train_speed(iter/s)": 0.114325 }, { "clip_ratio": 0.009796116035431623, "epoch": 5.24, "grad_norm": 3.2910051345825195, "kl": 0.603515625, "learning_rate": 4.698716656741708e-07, "loss": 0.0013471171259880066, "memory(GiB)": 18.17, "step": 262, "train_speed(iter/s)": 0.114703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/mean_length": 117.85546875, "completions/min_length": 58.5, "epoch": 5.26, "grad_norm": 3.0833852291107178, "kl": 0.607421875, "learning_rate": 4.66704734008348e-07, "loss": 0.01880352757871151, "memory(GiB)": 18.17, "reward": 0.4038514196872711, "reward_std": 0.024144282564520836, "rewards/MCQ_Reward/mean": 0.4038514196872711, "rewards/MCQ_Reward/std": 0.11032669246196747, "step": 263, "train_speed(iter/s)": 0.114712 }, { "clip_ratio": 0.0071860982570797205, "epoch": 5.28, "grad_norm": 2.223651885986328, "kl": 0.62109375, "learning_rate": 4.6353914347098467e-07, "loss": 0.018028832972049713, "memory(GiB)": 18.17, "step": 264, "train_speed(iter/s)": 0.115068 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/mean_length": 126.16796875, "completions/min_length": 63.0, "epoch": 5.3, "grad_norm": 2.7954585552215576, "kl": 0.521484375, "learning_rate": 4.6037502157160567e-07, "loss": 0.008576348423957825, "memory(GiB)": 18.17, "reward": 0.4126065671443939, "reward_std": 0.02162686362862587, "rewards/MCQ_Reward/mean": 0.4126065671443939, "rewards/MCQ_Reward/std": 0.08540061488747597, "step": 265, "train_speed(iter/s)": 0.115013 }, { "clip_ratio": 0.00956161879003048, "epoch": 5.32, "grad_norm": 4.209680557250977, "kl": 0.544921875, "learning_rate": 4.5721249576058027e-07, "loss": 0.009101202711462975, "memory(GiB)": 18.17, "step": 266, "train_speed(iter/s)": 0.115384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.5, "completions/mean_length": 113.3515625, "completions/min_length": 71.5, "epoch": 5.34, "grad_norm": 2.6387808322906494, "kl": 0.595703125, "learning_rate": 4.540516934239863e-07, "loss": 0.008354030549526215, "memory(GiB)": 18.17, "reward": 0.4057372510433197, "reward_std": 0.025215539149940014, "rewards/MCQ_Reward/mean": 0.4057372510433197, "rewards/MCQ_Reward/std": 0.10797113552689552, "step": 267, "train_speed(iter/s)": 0.115352 }, { "clip_ratio": 0.004749758169054985, "epoch": 5.36, "grad_norm": 2.726827383041382, "kl": 0.59765625, "learning_rate": 4.508927418784814e-07, "loss": 0.008263107389211655, "memory(GiB)": 18.17, "step": 268, "train_speed(iter/s)": 0.115721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/mean_length": 128.75, "completions/min_length": 65.0, "epoch": 5.38, "grad_norm": 2.4489338397979736, "kl": 0.5859375, "learning_rate": 4.477357683661733e-07, "loss": 0.0003694836050271988, "memory(GiB)": 18.17, "reward": 0.39796915650367737, "reward_std": 0.0229190643876791, "rewards/MCQ_Reward/mean": 0.39796915650367737, "rewards/MCQ_Reward/std": 0.06984946131706238, "step": 269, "train_speed(iter/s)": 0.115538 }, { "clip_ratio": 0.0044297389686107635, "epoch": 5.4, "grad_norm": 2.187133312225342, "kl": 0.587890625, "learning_rate": 4.445809000494945e-07, "loss": 6.162561476230621e-06, "memory(GiB)": 18.17, "step": 270, "train_speed(iter/s)": 0.115873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.5, "completions/mean_length": 120.48046875, "completions/min_length": 76.5, "epoch": 5.42, "grad_norm": 2.354365348815918, "kl": 0.595703125, "learning_rate": 4.4142826400608085e-07, "loss": -0.011774084530770779, "memory(GiB)": 18.17, "reward": 0.4731539338827133, "reward_std": 0.025172382593154907, "rewards/MCQ_Reward/mean": 0.4731539338827133, "rewards/MCQ_Reward/std": 0.09358260780572891, "step": 271, "train_speed(iter/s)": 0.115479 }, { "clip_ratio": 0.007754836697131395, "epoch": 5.44, "grad_norm": 2.9754416942596436, "kl": 0.568359375, "learning_rate": 4.382779872236526e-07, "loss": -0.01219811663031578, "memory(GiB)": 18.17, "step": 272, "train_speed(iter/s)": 0.115843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 127.671875, "completions/min_length": 82.0, "epoch": 5.46, "grad_norm": 2.66938853263855, "kl": 0.587890625, "learning_rate": 4.3513019659489906e-07, "loss": -0.01641671359539032, "memory(GiB)": 18.17, "reward": 0.3951749950647354, "reward_std": 0.026222089305520058, "rewards/MCQ_Reward/mean": 0.3951749950647354, "rewards/MCQ_Reward/std": 0.07432432845234871, "step": 273, "train_speed(iter/s)": 0.11581 }, { "clip_ratio": 0.006316621555015445, "epoch": 5.48, "grad_norm": 2.3686916828155518, "kl": 0.595703125, "learning_rate": 4.31985018912368e-07, "loss": -0.01686863601207733, "memory(GiB)": 18.17, "step": 274, "train_speed(iter/s)": 0.116173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 127.2578125, "completions/min_length": 64.5, "epoch": 5.5, "grad_norm": 2.3570117950439453, "kl": 0.5390625, "learning_rate": 4.2884258086335745e-07, "loss": 0.0007358621805906296, "memory(GiB)": 18.17, "reward": 0.44543667137622833, "reward_std": 0.024644173681735992, "rewards/MCQ_Reward/mean": 0.44543667137622833, "rewards/MCQ_Reward/std": 0.09130855649709702, "step": 275, "train_speed(iter/s)": 0.116062 }, { "clip_ratio": 0.009702229872345924, "epoch": 5.52, "grad_norm": 4.230794906616211, "kl": 0.517578125, "learning_rate": 4.257030090248142e-07, "loss": 0.0004968619905412197, "memory(GiB)": 18.17, "step": 276, "train_speed(iter/s)": 0.116424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.5, "completions/mean_length": 124.16796875, "completions/min_length": 66.5, "epoch": 5.54, "grad_norm": 2.1478097438812256, "kl": 0.607421875, "learning_rate": 4.2256642985823387e-07, "loss": 0.012350899167358875, "memory(GiB)": 18.17, "reward": 0.4112658351659775, "reward_std": 0.023498238995671272, "rewards/MCQ_Reward/mean": 0.4112658351659775, "rewards/MCQ_Reward/std": 0.08520639687776566, "step": 277, "train_speed(iter/s)": 0.116375 }, { "clip_ratio": 0.004101653583347797, "epoch": 5.5600000000000005, "grad_norm": 2.062098503112793, "kl": 0.62109375, "learning_rate": 4.19432969704568e-07, "loss": 0.012091840617358685, "memory(GiB)": 18.17, "step": 278, "train_speed(iter/s)": 0.116723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.5, "completions/mean_length": 122.69921875, "completions/min_length": 59.0, "epoch": 5.58, "grad_norm": 2.9315075874328613, "kl": 0.5390625, "learning_rate": 4.1630275477913465e-07, "loss": -0.013242216780781746, "memory(GiB)": 18.17, "reward": 0.39477604627609253, "reward_std": 0.02283278852701187, "rewards/MCQ_Reward/mean": 0.39477604627609253, "rewards/MCQ_Reward/std": 0.09505810588598251, "step": 279, "train_speed(iter/s)": 0.116608 }, { "clip_ratio": 0.006070411531254649, "epoch": 5.6, "grad_norm": 2.2812304496765137, "kl": 0.53515625, "learning_rate": 4.131759111665348e-07, "loss": -0.013854868710041046, "memory(GiB)": 18.17, "step": 280, "train_speed(iter/s)": 0.116971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.5, "completions/mean_length": 129.95703125, "completions/min_length": 60.5, "epoch": 5.62, "grad_norm": 2.015717029571533, "kl": 0.513671875, "learning_rate": 4.1005256481557306e-07, "loss": 0.0003234475152567029, "memory(GiB)": 18.17, "reward": 0.40168674290180206, "reward_std": 0.020120804198086262, "rewards/MCQ_Reward/mean": 0.40168674290180206, "rewards/MCQ_Reward/std": 0.09599081426858902, "step": 281, "train_speed(iter/s)": 0.116542 }, { "clip_ratio": 0.0076590063981711864, "epoch": 5.64, "grad_norm": 2.828334331512451, "kl": 0.5009765625, "learning_rate": 4.0693284153418497e-07, "loss": 0.00015916512347757816, "memory(GiB)": 18.17, "step": 282, "train_speed(iter/s)": 0.116903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.5, "completions/mean_length": 121.16015625, "completions/min_length": 71.5, "epoch": 5.66, "grad_norm": 2.985908269882202, "kl": 0.58203125, "learning_rate": 4.038168669843697e-07, "loss": -0.0021479236893355846, "memory(GiB)": 18.17, "reward": 0.4441321939229965, "reward_std": 0.021154197864234447, "rewards/MCQ_Reward/mean": 0.4441321939229965, "rewards/MCQ_Reward/std": 0.10662735998630524, "step": 283, "train_speed(iter/s)": 0.116806 }, { "clip_ratio": 0.00845325831323862, "epoch": 5.68, "grad_norm": 2.2008328437805176, "kl": 0.5703125, "learning_rate": 4.0070476667712736e-07, "loss": -0.0024233213625848293, "memory(GiB)": 18.17, "step": 284, "train_speed(iter/s)": 0.117157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.5, "completions/mean_length": 131.40625, "completions/min_length": 65.5, "epoch": 5.7, "grad_norm": 2.1404271125793457, "kl": 0.609375, "learning_rate": 3.9759666596740473e-07, "loss": 0.009725593030452728, "memory(GiB)": 18.17, "reward": 0.4451696425676346, "reward_std": 0.02477285359054804, "rewards/MCQ_Reward/mean": 0.4451696425676346, "rewards/MCQ_Reward/std": 0.07242370769381523, "step": 285, "train_speed(iter/s)": 0.117116 }, { "clip_ratio": 0.004681814229115844, "epoch": 5.72, "grad_norm": 2.289313316345215, "kl": 0.61328125, "learning_rate": 3.9449269004904516e-07, "loss": 0.009346994571387768, "memory(GiB)": 18.17, "step": 286, "train_speed(iter/s)": 0.117466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/mean_length": 104.30078125, "completions/min_length": 51.0, "epoch": 5.74, "grad_norm": 2.770270347595215, "kl": 1.189453125, "learning_rate": 3.913929639497462e-07, "loss": 0.009477443993091583, "memory(GiB)": 18.17, "reward": 0.43081943690776825, "reward_std": 0.025431891903281212, "rewards/MCQ_Reward/mean": 0.43081943690776825, "rewards/MCQ_Reward/std": 0.10991119593381882, "step": 287, "train_speed(iter/s)": 0.117471 }, { "clip_ratio": 0.006838085595518351, "epoch": 5.76, "grad_norm": 2.8960061073303223, "kl": 1.087890625, "learning_rate": 3.882976125260229e-07, "loss": 0.008670520968735218, "memory(GiB)": 18.17, "step": 288, "train_speed(iter/s)": 0.117827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.5, "completions/mean_length": 114.71484375, "completions/min_length": 56.5, "epoch": 5.78, "grad_norm": 2.4359030723571777, "kl": 0.552734375, "learning_rate": 3.852067604581794e-07, "loss": 0.006409616209566593, "memory(GiB)": 18.17, "reward": 0.41095563769340515, "reward_std": 0.02436618786305189, "rewards/MCQ_Reward/mean": 0.41095563769340515, "rewards/MCQ_Reward/std": 0.09878598526120186, "step": 289, "train_speed(iter/s)": 0.117814 }, { "clip_ratio": 0.007955410983413458, "epoch": 5.8, "grad_norm": 3.950528383255005, "kl": 0.5390625, "learning_rate": 3.821205322452863e-07, "loss": 0.0066283950582146645, "memory(GiB)": 18.17, "step": 290, "train_speed(iter/s)": 0.118161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.5, "completions/mean_length": 134.59375, "completions/min_length": 63.0, "epoch": 5.82, "grad_norm": 2.4326717853546143, "kl": 0.5263671875, "learning_rate": 3.790390522001662e-07, "loss": 0.002648044377565384, "memory(GiB)": 18.17, "reward": 0.4533398002386093, "reward_std": 0.023892495781183243, "rewards/MCQ_Reward/mean": 0.4533398002386093, "rewards/MCQ_Reward/std": 0.08347899466753006, "step": 291, "train_speed(iter/s)": 0.117724 }, { "clip_ratio": 0.004736665170639753, "epoch": 5.84, "grad_norm": 2.2011497020721436, "kl": 0.541015625, "learning_rate": 3.7596244444438574e-07, "loss": 0.002431286498904228, "memory(GiB)": 18.17, "step": 292, "train_speed(iter/s)": 0.118068 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.5, "completions/mean_length": 117.24609375, "completions/min_length": 63.5, "epoch": 5.86, "grad_norm": 2.58125376701355, "kl": 0.541015625, "learning_rate": 3.728908329032566e-07, "loss": -0.003335139248520136, "memory(GiB)": 18.17, "reward": 0.4097088426351547, "reward_std": 0.022918211296200752, "rewards/MCQ_Reward/mean": 0.4097088426351547, "rewards/MCQ_Reward/std": 0.1199105829000473, "step": 293, "train_speed(iter/s)": 0.118029 }, { "clip_ratio": 0.007036251947283745, "epoch": 5.88, "grad_norm": 2.4533321857452393, "kl": 0.5625, "learning_rate": 3.6982434130084396e-07, "loss": -0.0037924423813819885, "memory(GiB)": 18.17, "step": 294, "train_speed(iter/s)": 0.118366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.5, "completions/mean_length": 127.00390625, "completions/min_length": 75.0, "epoch": 5.9, "grad_norm": 2.2269814014434814, "kl": 0.5, "learning_rate": 3.6676309315498255e-07, "loss": 0.012001181952655315, "memory(GiB)": 18.17, "reward": 0.42691150307655334, "reward_std": 0.021617514081299305, "rewards/MCQ_Reward/mean": 0.42691150307655334, "rewards/MCQ_Reward/std": 0.11347687244415283, "step": 295, "train_speed(iter/s)": 0.11833 }, { "clip_ratio": 0.004536686465144157, "epoch": 5.92, "grad_norm": 2.593670129776001, "kl": 0.513671875, "learning_rate": 3.6370721177230115e-07, "loss": 0.011945893988013268, "memory(GiB)": 18.17, "step": 296, "train_speed(iter/s)": 0.118674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.5, "completions/mean_length": 123.5234375, "completions/min_length": 71.5, "epoch": 5.9399999999999995, "grad_norm": 2.1928629875183105, "kl": 0.4970703125, "learning_rate": 3.6065682024325617e-07, "loss": 0.015498391352593899, "memory(GiB)": 18.17, "reward": 0.41268619894981384, "reward_std": 0.02419480960816145, "rewards/MCQ_Reward/mean": 0.41268619894981384, "rewards/MCQ_Reward/std": 0.09195958822965622, "step": 297, "train_speed(iter/s)": 0.118532 }, { "clip_ratio": 0.0050865779630839825, "epoch": 5.96, "grad_norm": 2.1392431259155273, "kl": 0.494140625, "learning_rate": 3.5761204143717385e-07, "loss": 0.014891544356942177, "memory(GiB)": 18.17, "step": 298, "train_speed(iter/s)": 0.118872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 124.328125, "completions/min_length": 64.5, "epoch": 5.98, "grad_norm": 2.7249698638916016, "kl": 0.880859375, "learning_rate": 3.5457299799730045e-07, "loss": -0.010070513002574444, "memory(GiB)": 18.17, "reward": 0.4588439464569092, "reward_std": 0.029408703558146954, "rewards/MCQ_Reward/mean": 0.4588439464569092, "rewards/MCQ_Reward/std": 0.09774744883179665, "step": 299, "train_speed(iter/s)": 0.118723 }, { "clip_ratio": 0.01025686739012599, "epoch": 6.0, "grad_norm": 3.8231394290924072, "kl": 0.7529296875, "learning_rate": 3.5153981233586274e-07, "loss": -0.009807607159018517, "memory(GiB)": 18.17, "step": 300, "train_speed(iter/s)": 0.119048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.5, "completions/mean_length": 109.19140625, "completions/min_length": 56.0, "epoch": 6.02, "grad_norm": 2.6895663738250732, "kl": 0.599609375, "learning_rate": 3.485126066291364e-07, "loss": -0.010052207857370377, "memory(GiB)": 18.17, "reward": 0.4080576002597809, "reward_std": 0.02562197484076023, "rewards/MCQ_Reward/mean": 0.4080576002597809, "rewards/MCQ_Reward/std": 0.09971121698617935, "step": 301, "train_speed(iter/s)": 0.118697 }, { "clip_ratio": 0.005149862729012966, "epoch": 6.04, "grad_norm": 2.655897855758667, "kl": 0.607421875, "learning_rate": 3.454915028125263e-07, "loss": -0.010359197854995728, "memory(GiB)": 18.17, "step": 302, "train_speed(iter/s)": 0.11903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.5, "completions/mean_length": 117.90234375, "completions/min_length": 56.5, "epoch": 6.06, "grad_norm": 2.423926591873169, "kl": 0.546875, "learning_rate": 3.4247662257565366e-07, "loss": 0.018125958740711212, "memory(GiB)": 18.17, "reward": 0.4407869875431061, "reward_std": 0.025757532566785812, "rewards/MCQ_Reward/mean": 0.4407869875431061, "rewards/MCQ_Reward/std": 0.12692639231681824, "step": 303, "train_speed(iter/s)": 0.118923 }, { "clip_ratio": 0.00550723378546536, "epoch": 6.08, "grad_norm": 2.2029030323028564, "kl": 0.5546875, "learning_rate": 3.394680873574546e-07, "loss": 0.017929650843143463, "memory(GiB)": 18.17, "step": 304, "train_speed(iter/s)": 0.119254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/mean_length": 124.44140625, "completions/min_length": 54.5, "epoch": 6.1, "grad_norm": 2.3613805770874023, "kl": 0.5703125, "learning_rate": 3.3646601834128916e-07, "loss": -0.007877168245613575, "memory(GiB)": 18.17, "reward": 0.49866482615470886, "reward_std": 0.024780258536338806, "rewards/MCQ_Reward/mean": 0.49866482615470886, "rewards/MCQ_Reward/std": 0.07562171667814255, "step": 305, "train_speed(iter/s)": 0.11921 }, { "clip_ratio": 0.004300985252484679, "epoch": 6.12, "grad_norm": 2.1242995262145996, "kl": 0.576171875, "learning_rate": 3.3347053645005965e-07, "loss": -0.008408917114138603, "memory(GiB)": 18.17, "step": 306, "train_speed(iter/s)": 0.119519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.5, "completions/mean_length": 105.72265625, "completions/min_length": 64.5, "epoch": 6.14, "grad_norm": 2.5641608238220215, "kl": 0.560546875, "learning_rate": 3.3048176234133963e-07, "loss": 0.0034052138216793537, "memory(GiB)": 18.17, "reward": 0.3926085978746414, "reward_std": 0.01911616325378418, "rewards/MCQ_Reward/mean": 0.3926085978746414, "rewards/MCQ_Reward/std": 0.06766298227012157, "step": 307, "train_speed(iter/s)": 0.119522 }, { "clip_ratio": 0.007244990672916174, "epoch": 6.16, "grad_norm": 2.7589051723480225, "kl": 0.572265625, "learning_rate": 3.274998164025148e-07, "loss": 0.0031583395320922136, "memory(GiB)": 18.17, "step": 308, "train_speed(iter/s)": 0.119856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 119.86328125, "completions/min_length": 57.0, "epoch": 6.18, "grad_norm": 2.9221317768096924, "kl": 0.611328125, "learning_rate": 3.245248187459323e-07, "loss": -0.019380319863557816, "memory(GiB)": 18.17, "reward": 0.386982798576355, "reward_std": 0.026672961190342903, "rewards/MCQ_Reward/mean": 0.386982798576355, "rewards/MCQ_Reward/std": 0.10517054051160812, "step": 309, "train_speed(iter/s)": 0.119747 }, { "clip_ratio": 0.005416512954980135, "epoch": 6.2, "grad_norm": 2.7965259552001953, "kl": 0.61328125, "learning_rate": 3.215568892040641e-07, "loss": -0.019356630742549896, "memory(GiB)": 18.17, "step": 310, "train_speed(iter/s)": 0.120077 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.5, "completions/mean_length": 118.21484375, "completions/min_length": 57.0, "epoch": 6.22, "grad_norm": 2.8668336868286133, "kl": 0.607421875, "learning_rate": 3.1859614732467954e-07, "loss": -0.013122756965458393, "memory(GiB)": 18.17, "reward": 0.4595968574285507, "reward_std": 0.024624092504382133, "rewards/MCQ_Reward/mean": 0.4595968574285507, "rewards/MCQ_Reward/std": 0.08434771373867989, "step": 311, "train_speed(iter/s)": 0.119696 }, { "clip_ratio": 0.00573662668466568, "epoch": 6.24, "grad_norm": 2.4580280780792236, "kl": 0.609375, "learning_rate": 3.156427123660297e-07, "loss": -0.013560149818658829, "memory(GiB)": 18.17, "step": 312, "train_speed(iter/s)": 0.120023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.5, "completions/mean_length": 121.68359375, "completions/min_length": 73.5, "epoch": 6.26, "grad_norm": 2.6274502277374268, "kl": 0.58984375, "learning_rate": 3.1269670329204393e-07, "loss": 0.0022671520709991455, "memory(GiB)": 18.17, "reward": 0.44664010405540466, "reward_std": 0.024377938359975815, "rewards/MCQ_Reward/mean": 0.44664010405540466, "rewards/MCQ_Reward/std": 0.08575410395860672, "step": 313, "train_speed(iter/s)": 0.119945 }, { "clip_ratio": 0.0052670135628432035, "epoch": 6.28, "grad_norm": 2.753713607788086, "kl": 0.578125, "learning_rate": 3.097582387675385e-07, "loss": 0.0018416689708828926, "memory(GiB)": 18.17, "step": 314, "train_speed(iter/s)": 0.120272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.5, "completions/mean_length": 127.57421875, "completions/min_length": 77.0, "epoch": 6.3, "grad_norm": 2.4003334045410156, "kl": 0.583984375, "learning_rate": 3.068274371534356e-07, "loss": 0.0005114064551889896, "memory(GiB)": 18.17, "reward": 0.44641484320163727, "reward_std": 0.024146192707121372, "rewards/MCQ_Reward/mean": 0.44641484320163727, "rewards/MCQ_Reward/std": 0.08713827468454838, "step": 315, "train_speed(iter/s)": 0.120168 }, { "clip_ratio": 0.008136166725307703, "epoch": 6.32, "grad_norm": 2.3975117206573486, "kl": 0.619140625, "learning_rate": 3.039044165019972e-07, "loss": 0.0004498562775552273, "memory(GiB)": 18.17, "step": 316, "train_speed(iter/s)": 0.120495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.5, "completions/mean_length": 117.9609375, "completions/min_length": 58.5, "epoch": 6.34, "grad_norm": 2.348710060119629, "kl": 0.548828125, "learning_rate": 3.00989294552069e-07, "loss": 0.00850888341665268, "memory(GiB)": 18.17, "reward": 0.42280539870262146, "reward_std": 0.02416596282273531, "rewards/MCQ_Reward/mean": 0.42280539870262146, "rewards/MCQ_Reward/std": 0.0933729000389576, "step": 317, "train_speed(iter/s)": 0.120401 }, { "clip_ratio": 0.005974379135295749, "epoch": 6.36, "grad_norm": 2.630732774734497, "kl": 0.5390625, "learning_rate": 2.9808218872433766e-07, "loss": 0.008482606150209904, "memory(GiB)": 18.17, "step": 318, "train_speed(iter/s)": 0.120723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 123.66796875, "completions/min_length": 75.5, "epoch": 6.38, "grad_norm": 2.1341052055358887, "kl": 0.517578125, "learning_rate": 2.9518321611660234e-07, "loss": -0.0021673766896128654, "memory(GiB)": 18.17, "reward": 0.4051154851913452, "reward_std": 0.020906205289065838, "rewards/MCQ_Reward/mean": 0.4051154851913452, "rewards/MCQ_Reward/std": 0.09874700754880905, "step": 319, "train_speed(iter/s)": 0.12062 }, { "clip_ratio": 0.00719631533138454, "epoch": 6.4, "grad_norm": 3.2350962162017822, "kl": 0.5390625, "learning_rate": 2.922924934990568e-07, "loss": -0.0024176109582185745, "memory(GiB)": 18.17, "step": 320, "train_speed(iter/s)": 0.120919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/mean_length": 117.234375, "completions/min_length": 69.0, "epoch": 6.42, "grad_norm": 74.83729553222656, "kl": 20.791015625, "learning_rate": 2.894101373095867e-07, "loss": 0.04349440336227417, "memory(GiB)": 18.17, "reward": 0.44527527689933777, "reward_std": 0.021908948197960854, "rewards/MCQ_Reward/mean": 0.44527527689933777, "rewards/MCQ_Reward/std": 0.08160104416310787, "step": 321, "train_speed(iter/s)": 0.120602 }, { "clip_ratio": 0.004950069589540362, "epoch": 6.44, "grad_norm": 99.64342498779297, "kl": 26.54296875, "learning_rate": 2.8653626364907914e-07, "loss": 0.04914519935846329, "memory(GiB)": 18.17, "step": 322, "train_speed(iter/s)": 0.120907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 128.45703125, "completions/min_length": 52.5, "epoch": 6.46, "grad_norm": 2.5322988033294678, "kl": 0.529296875, "learning_rate": 2.8367098827674576e-07, "loss": 0.009952299296855927, "memory(GiB)": 18.17, "reward": 0.4740261733531952, "reward_std": 0.023401367478072643, "rewards/MCQ_Reward/mean": 0.4740261733531952, "rewards/MCQ_Reward/std": 0.08106581121683121, "step": 323, "train_speed(iter/s)": 0.12071 }, { "clip_ratio": 0.005782874301075935, "epoch": 6.48, "grad_norm": 2.591923952102661, "kl": 0.53125, "learning_rate": 2.808144266054612e-07, "loss": 0.009899303317070007, "memory(GiB)": 18.17, "step": 324, "train_speed(iter/s)": 0.121029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.5, "completions/mean_length": 133.33203125, "completions/min_length": 81.5, "epoch": 6.5, "grad_norm": 2.113783121109009, "kl": 0.537109375, "learning_rate": 2.779666936971129e-07, "loss": -0.0006487010978162289, "memory(GiB)": 18.17, "reward": 0.39647024869918823, "reward_std": 0.02249709703028202, "rewards/MCQ_Reward/mean": 0.39647024869918823, "rewards/MCQ_Reward/std": 0.0880400650203228, "step": 325, "train_speed(iter/s)": 0.120986 }, { "clip_ratio": 0.006350549403578043, "epoch": 6.52, "grad_norm": 2.4789633750915527, "kl": 0.525390625, "learning_rate": 2.751279042579672e-07, "loss": -0.0002095792442560196, "memory(GiB)": 18.17, "step": 326, "train_speed(iter/s)": 0.121304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 126.234375, "completions/min_length": 54.0, "epoch": 6.54, "grad_norm": 2.4260339736938477, "kl": 0.548828125, "learning_rate": 2.7229817263404864e-07, "loss": -0.0033088945783674717, "memory(GiB)": 18.17, "reward": 0.4554037004709244, "reward_std": 0.02187604457139969, "rewards/MCQ_Reward/mean": 0.4554037004709244, "rewards/MCQ_Reward/std": 0.09804989397525787, "step": 327, "train_speed(iter/s)": 0.121167 }, { "clip_ratio": 0.008008664939552546, "epoch": 6.5600000000000005, "grad_norm": 4.365505695343018, "kl": 0.533203125, "learning_rate": 2.6947761280653447e-07, "loss": -0.00283604022115469, "memory(GiB)": 18.17, "step": 328, "train_speed(iter/s)": 0.121483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/mean_length": 117.9609375, "completions/min_length": 69.5, "epoch": 6.58, "grad_norm": 2.2564356327056885, "kl": 0.5283203125, "learning_rate": 2.6666633838716314e-07, "loss": -0.0077381255105137825, "memory(GiB)": 18.17, "reward": 0.4396722763776779, "reward_std": 0.022700872272253036, "rewards/MCQ_Reward/mean": 0.4396722763776779, "rewards/MCQ_Reward/std": 0.10192850604653358, "step": 329, "train_speed(iter/s)": 0.12143 }, { "clip_ratio": 0.0047557426150888205, "epoch": 6.6, "grad_norm": 2.172281503677368, "kl": 0.5322265625, "learning_rate": 2.638644626136587e-07, "loss": -0.008173219859600067, "memory(GiB)": 18.17, "step": 330, "train_speed(iter/s)": 0.121737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 126.0859375, "completions/min_length": 68.5, "epoch": 6.62, "grad_norm": 2.167248010635376, "kl": 0.4873046875, "learning_rate": 2.610720983451685e-07, "loss": 0.018461888656020164, "memory(GiB)": 18.17, "reward": 0.44843943417072296, "reward_std": 0.02303914539515972, "rewards/MCQ_Reward/mean": 0.44843943417072296, "rewards/MCQ_Reward/std": 0.08497340604662895, "step": 331, "train_speed(iter/s)": 0.121397 }, { "clip_ratio": 0.0052658268250525, "epoch": 6.64, "grad_norm": 2.136260509490967, "kl": 0.4921875, "learning_rate": 2.58289358057718e-07, "loss": 0.01842992939054966, "memory(GiB)": 18.17, "step": 332, "train_speed(iter/s)": 0.121707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.5, "completions/mean_length": 127.5546875, "completions/min_length": 65.5, "epoch": 6.66, "grad_norm": 2.595977306365967, "kl": 0.578125, "learning_rate": 2.555163538396806e-07, "loss": -0.011687211692333221, "memory(GiB)": 18.17, "reward": 0.4103027582168579, "reward_std": 0.02552829496562481, "rewards/MCQ_Reward/mean": 0.4103027582168579, "rewards/MCQ_Reward/std": 0.0971563570201397, "step": 333, "train_speed(iter/s)": 0.1216 }, { "clip_ratio": 0.0067884225863963366, "epoch": 6.68, "grad_norm": 3.2224881649017334, "kl": 0.59765625, "learning_rate": 2.5275319738726165e-07, "loss": -0.011430272832512856, "memory(GiB)": 18.17, "step": 334, "train_speed(iter/s)": 0.121912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/mean_length": 123.2578125, "completions/min_length": 75.0, "epoch": 6.7, "grad_norm": 2.387573480606079, "kl": 0.56640625, "learning_rate": 2.500000000000001e-07, "loss": -0.006422008387744427, "memory(GiB)": 18.17, "reward": 0.4134673774242401, "reward_std": 0.022745592519640923, "rewards/MCQ_Reward/mean": 0.4134673774242401, "rewards/MCQ_Reward/std": 0.10698199272155762, "step": 335, "train_speed(iter/s)": 0.121789 }, { "clip_ratio": 0.007158383261412382, "epoch": 6.72, "grad_norm": 2.7240705490112305, "kl": 0.564453125, "learning_rate": 2.472568725762853e-07, "loss": -0.0065142130479216576, "memory(GiB)": 18.17, "step": 336, "train_speed(iter/s)": 0.122088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.5, "completions/mean_length": 108.890625, "completions/min_length": 63.5, "epoch": 6.74, "grad_norm": 2.2466800212860107, "kl": 0.7421875, "learning_rate": 2.4452392560888976e-07, "loss": -0.00018489733338356018, "memory(GiB)": 18.17, "reward": 0.42812955379486084, "reward_std": 0.0208740271627903, "rewards/MCQ_Reward/mean": 0.42812955379486084, "rewards/MCQ_Reward/std": 0.08048268780112267, "step": 337, "train_speed(iter/s)": 0.12208 }, { "clip_ratio": 0.005281613674014807, "epoch": 6.76, "grad_norm": 2.0434200763702393, "kl": 0.771484375, "learning_rate": 2.418012691805191e-07, "loss": -0.0005159445572644472, "memory(GiB)": 18.17, "step": 338, "train_speed(iter/s)": 0.122388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/mean_length": 117.3984375, "completions/min_length": 65.0, "epoch": 6.78, "grad_norm": 2.669919729232788, "kl": 0.572265625, "learning_rate": 2.390890129593771e-07, "loss": -0.009503326378762722, "memory(GiB)": 18.17, "reward": 0.41273191571235657, "reward_std": 0.023225258104503155, "rewards/MCQ_Reward/mean": 0.41273191571235657, "rewards/MCQ_Reward/std": 0.08152876608073711, "step": 339, "train_speed(iter/s)": 0.122302 }, { "clip_ratio": 0.005108103854581714, "epoch": 6.8, "grad_norm": 2.5069973468780518, "kl": 0.576171875, "learning_rate": 2.3638726619474875e-07, "loss": -0.009927002713084221, "memory(GiB)": 18.17, "step": 340, "train_speed(iter/s)": 0.122605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.5, "completions/mean_length": 121.0703125, "completions/min_length": 66.0, "epoch": 6.82, "grad_norm": 2.5319740772247314, "kl": 0.59765625, "learning_rate": 2.3369613771260005e-07, "loss": 0.004871162120252848, "memory(GiB)": 18.17, "reward": 0.39162860810756683, "reward_std": 0.022268068976700306, "rewards/MCQ_Reward/mean": 0.39162860810756683, "rewards/MCQ_Reward/std": 0.07392172142863274, "step": 341, "train_speed(iter/s)": 0.12225 }, { "clip_ratio": 0.004840584937483072, "epoch": 6.84, "grad_norm": 2.547236204147339, "kl": 0.60546875, "learning_rate": 2.310157359111938e-07, "loss": 0.004931057803332806, "memory(GiB)": 18.17, "step": 342, "train_speed(iter/s)": 0.122534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.5, "completions/mean_length": 124.6328125, "completions/min_length": 67.5, "epoch": 6.86, "grad_norm": 2.610426664352417, "kl": 0.5419921875, "learning_rate": 2.283461687567236e-07, "loss": 0.012133005075156689, "memory(GiB)": 18.17, "reward": 0.38104377686977386, "reward_std": 0.023476887494325638, "rewards/MCQ_Reward/mean": 0.38104377686977386, "rewards/MCQ_Reward/std": 0.13691367208957672, "step": 343, "train_speed(iter/s)": 0.122472 }, { "clip_ratio": 0.005503881955519319, "epoch": 6.88, "grad_norm": 2.517308473587036, "kl": 0.5458984375, "learning_rate": 2.2568754377896515e-07, "loss": 0.012206798419356346, "memory(GiB)": 18.17, "step": 344, "train_speed(iter/s)": 0.122771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 122.8125, "completions/min_length": 54.0, "epoch": 6.9, "grad_norm": 2.268815517425537, "kl": 0.576171875, "learning_rate": 2.2303996806694486e-07, "loss": 0.005438795313239098, "memory(GiB)": 18.17, "reward": 0.41502565145492554, "reward_std": 0.021418385207653046, "rewards/MCQ_Reward/mean": 0.41502565145492554, "rewards/MCQ_Reward/std": 0.09508999437093735, "step": 345, "train_speed(iter/s)": 0.122753 }, { "clip_ratio": 0.005775286350399256, "epoch": 6.92, "grad_norm": 2.83811616897583, "kl": 0.603515625, "learning_rate": 2.2040354826462664e-07, "loss": 0.005799311213195324, "memory(GiB)": 18.17, "step": 346, "train_speed(iter/s)": 0.123049 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.5, "completions/mean_length": 116.91796875, "completions/min_length": 65.5, "epoch": 6.9399999999999995, "grad_norm": 2.334526777267456, "kl": 0.564453125, "learning_rate": 2.177783905666155e-07, "loss": 0.0054929498583078384, "memory(GiB)": 18.17, "reward": 0.39654283225536346, "reward_std": 0.022173049859702587, "rewards/MCQ_Reward/mean": 0.39654283225536346, "rewards/MCQ_Reward/std": 0.09505746513605118, "step": 347, "train_speed(iter/s)": 0.123026 }, { "clip_ratio": 0.0045166281051933765, "epoch": 6.96, "grad_norm": 2.271827220916748, "kl": 0.564453125, "learning_rate": 2.151646007138806e-07, "loss": 0.0055296882055699825, "memory(GiB)": 18.17, "step": 348, "train_speed(iter/s)": 0.123296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 130.65625, "completions/min_length": 77.5, "epoch": 6.98, "grad_norm": 2.0946249961853027, "kl": 0.55859375, "learning_rate": 2.125622839894964e-07, "loss": 0.003636482171714306, "memory(GiB)": 18.17, "reward": 0.43836964666843414, "reward_std": 0.021374424919486046, "rewards/MCQ_Reward/mean": 0.43836964666843414, "rewards/MCQ_Reward/std": 0.06100250408053398, "step": 349, "train_speed(iter/s)": 0.123225 }, { "clip_ratio": 0.0046428050845861435, "epoch": 7.0, "grad_norm": 2.23724365234375, "kl": 0.57421875, "learning_rate": 2.0997154521440097e-07, "loss": 0.004051330033689737, "memory(GiB)": 18.17, "step": 350, "train_speed(iter/s)": 0.123516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 121.21484375, "completions/min_length": 72.0, "epoch": 7.02, "grad_norm": 2.815627336502075, "kl": 0.5703125, "learning_rate": 2.0739248874317438e-07, "loss": -0.019233888015151024, "memory(GiB)": 18.17, "reward": 0.4290418028831482, "reward_std": 0.022210314869880676, "rewards/MCQ_Reward/mean": 0.4290418028831482, "rewards/MCQ_Reward/std": 0.06661852076649666, "step": 351, "train_speed(iter/s)": 0.123139 }, { "clip_ratio": 0.00514651439152658, "epoch": 7.04, "grad_norm": 3.0636136531829834, "kl": 0.576171875, "learning_rate": 2.048252184598352e-07, "loss": -0.01901531219482422, "memory(GiB)": 18.17, "step": 352, "train_speed(iter/s)": 0.12342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 112.85546875, "completions/min_length": 62.5, "epoch": 7.06, "grad_norm": 2.700939178466797, "kl": 0.58203125, "learning_rate": 2.0226983777365603e-07, "loss": -0.007234710268676281, "memory(GiB)": 18.17, "reward": 0.43640220165252686, "reward_std": 0.022726435214281082, "rewards/MCQ_Reward/mean": 0.43640220165252686, "rewards/MCQ_Reward/std": 0.08832718059420586, "step": 353, "train_speed(iter/s)": 0.123424 }, { "clip_ratio": 0.00972440093755722, "epoch": 7.08, "grad_norm": 3.0179059505462646, "kl": 0.564453125, "learning_rate": 1.9972644961499853e-07, "loss": -0.007274748291820288, "memory(GiB)": 18.17, "step": 354, "train_speed(iter/s)": 0.123722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.5, "completions/mean_length": 115.6328125, "completions/min_length": 68.0, "epoch": 7.1, "grad_norm": 2.484236240386963, "kl": 0.619140625, "learning_rate": 1.9719515643116674e-07, "loss": 0.015900151804089546, "memory(GiB)": 18.17, "reward": 0.45114465057849884, "reward_std": 0.024738659150898457, "rewards/MCQ_Reward/mean": 0.45114465057849884, "rewards/MCQ_Reward/std": 0.10900644585490227, "step": 355, "train_speed(iter/s)": 0.123607 }, { "clip_ratio": 0.0064309455920010805, "epoch": 7.12, "grad_norm": 3.852499485015869, "kl": 0.607421875, "learning_rate": 1.9467606018228088e-07, "loss": 0.01630295254290104, "memory(GiB)": 18.17, "step": 356, "train_speed(iter/s)": 0.123891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/mean_length": 128.88671875, "completions/min_length": 74.5, "epoch": 7.14, "grad_norm": 2.455781936645508, "kl": 0.5478515625, "learning_rate": 1.9216926233717084e-07, "loss": -0.00730013195425272, "memory(GiB)": 18.17, "reward": 0.4758221060037613, "reward_std": 0.024665928445756435, "rewards/MCQ_Reward/mean": 0.4758221060037613, "rewards/MCQ_Reward/std": 0.0809130035340786, "step": 357, "train_speed(iter/s)": 0.123852 }, { "clip_ratio": 0.00344535568729043, "epoch": 7.16, "grad_norm": 2.2257754802703857, "kl": 0.5576171875, "learning_rate": 1.8967486386928817e-07, "loss": -0.0074045369401574135, "memory(GiB)": 18.17, "step": 358, "train_speed(iter/s)": 0.124151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 130.06640625, "completions/min_length": 67.5, "epoch": 7.18, "grad_norm": 2.7154037952423096, "kl": 0.51171875, "learning_rate": 1.8719296525263923e-07, "loss": 0.019313501194119453, "memory(GiB)": 18.17, "reward": 0.4561205357313156, "reward_std": 0.023944508284330368, "rewards/MCQ_Reward/mean": 0.4561205357313156, "rewards/MCQ_Reward/std": 0.10000644996762276, "step": 359, "train_speed(iter/s)": 0.124074 }, { "clip_ratio": 0.006082270760089159, "epoch": 7.2, "grad_norm": 2.114431381225586, "kl": 0.5234375, "learning_rate": 1.847236664577389e-07, "loss": 0.01907144859433174, "memory(GiB)": 18.17, "step": 360, "train_speed(iter/s)": 0.124368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.5, "completions/mean_length": 130.765625, "completions/min_length": 79.0, "epoch": 7.22, "grad_norm": 2.2248895168304443, "kl": 0.5390625, "learning_rate": 1.8226706694758193e-07, "loss": 0.012620393186807632, "memory(GiB)": 18.17, "reward": 0.44832468032836914, "reward_std": 0.025768463499844074, "rewards/MCQ_Reward/mean": 0.44832468032836914, "rewards/MCQ_Reward/std": 0.09799568355083466, "step": 361, "train_speed(iter/s)": 0.123928 }, { "clip_ratio": 0.006066091358661652, "epoch": 7.24, "grad_norm": 2.5757896900177, "kl": 0.53515625, "learning_rate": 1.7982326567363886e-07, "loss": 0.013028541579842567, "memory(GiB)": 18.17, "step": 362, "train_speed(iter/s)": 0.124219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.5, "completions/mean_length": 122.546875, "completions/min_length": 50.5, "epoch": 7.26, "grad_norm": 2.2651302814483643, "kl": 0.5322265625, "learning_rate": 1.7739236107186857e-07, "loss": 0.009481780230998993, "memory(GiB)": 18.17, "reward": 0.4318048655986786, "reward_std": 0.022731643170118332, "rewards/MCQ_Reward/mean": 0.4318048655986786, "rewards/MCQ_Reward/std": 0.09833444282412529, "step": 363, "train_speed(iter/s)": 0.124163 }, { "clip_ratio": 0.0038783656200394034, "epoch": 7.28, "grad_norm": 2.2316813468933105, "kl": 0.5302734375, "learning_rate": 1.7497445105875374e-07, "loss": 0.009487325325608253, "memory(GiB)": 18.17, "step": 364, "train_speed(iter/s)": 0.124456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.5, "completions/mean_length": 131.63671875, "completions/min_length": 61.5, "epoch": 7.3, "grad_norm": 2.720024347305298, "kl": 0.5517578125, "learning_rate": 1.725696330273575e-07, "loss": 0.0073198857717216015, "memory(GiB)": 18.17, "reward": 0.4407372921705246, "reward_std": 0.019983571954071522, "rewards/MCQ_Reward/mean": 0.4407372921705246, "rewards/MCQ_Reward/std": 0.07775032892823219, "step": 365, "train_speed(iter/s)": 0.124298 }, { "clip_ratio": 0.005759742809459567, "epoch": 7.32, "grad_norm": 2.4700775146484375, "kl": 0.5556640625, "learning_rate": 1.7017800384339924e-07, "loss": 0.00751863420009613, "memory(GiB)": 18.17, "step": 366, "train_speed(iter/s)": 0.124588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/mean_length": 122.73828125, "completions/min_length": 64.5, "epoch": 7.34, "grad_norm": 2.3976547718048096, "kl": 0.541015625, "learning_rate": 1.6779965984135374e-07, "loss": 0.015993405133485794, "memory(GiB)": 18.17, "reward": 0.41162461042404175, "reward_std": 0.020391933619976044, "rewards/MCQ_Reward/mean": 0.41162461042404175, "rewards/MCQ_Reward/std": 0.0841926857829094, "step": 367, "train_speed(iter/s)": 0.124346 }, { "clip_ratio": 0.005305928410962224, "epoch": 7.36, "grad_norm": 2.444512128829956, "kl": 0.546875, "learning_rate": 1.6543469682057104e-07, "loss": 0.016359636560082436, "memory(GiB)": 18.17, "step": 368, "train_speed(iter/s)": 0.124615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/mean_length": 113.90234375, "completions/min_length": 68.5, "epoch": 7.38, "grad_norm": 3.490565299987793, "kl": 0.57421875, "learning_rate": 1.6308321004141607e-07, "loss": -0.0010942098451778293, "memory(GiB)": 18.17, "reward": 0.38713136315345764, "reward_std": 0.021422830410301685, "rewards/MCQ_Reward/mean": 0.38713136315345764, "rewards/MCQ_Reward/std": 0.10617586970329285, "step": 369, "train_speed(iter/s)": 0.124639 }, { "clip_ratio": 0.005288022803142667, "epoch": 7.4, "grad_norm": 2.881525754928589, "kl": 0.564453125, "learning_rate": 1.6074529422143396e-07, "loss": -0.0009173217695206404, "memory(GiB)": 18.17, "step": 370, "train_speed(iter/s)": 0.124914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.5, "completions/mean_length": 139.4375, "completions/min_length": 87.0, "epoch": 7.42, "grad_norm": 2.1569535732269287, "kl": 0.49609375, "learning_rate": 1.5842104353153285e-07, "loss": 0.014979809522628784, "memory(GiB)": 18.17, "reward": 0.4273018389940262, "reward_std": 0.02148488350212574, "rewards/MCQ_Reward/mean": 0.4273018389940262, "rewards/MCQ_Reward/std": 0.13347461819648743, "step": 371, "train_speed(iter/s)": 0.124503 }, { "clip_ratio": 0.006136654410511255, "epoch": 7.44, "grad_norm": 2.3948974609375, "kl": 0.486328125, "learning_rate": 1.561105515921915e-07, "loss": 0.015109008178114891, "memory(GiB)": 18.17, "step": 372, "train_speed(iter/s)": 0.124788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.5, "completions/mean_length": 117.00390625, "completions/min_length": 69.5, "epoch": 7.46, "grad_norm": 2.3135647773742676, "kl": 0.669921875, "learning_rate": 1.5381391146968863e-07, "loss": 0.006555130705237389, "memory(GiB)": 18.17, "reward": 0.4488084018230438, "reward_std": 0.02006101794540882, "rewards/MCQ_Reward/mean": 0.4488084018230438, "rewards/MCQ_Reward/std": 0.07920502312481403, "step": 373, "train_speed(iter/s)": 0.124722 }, { "clip_ratio": 0.007013680646196008, "epoch": 7.48, "grad_norm": 2.962529420852661, "kl": 0.642578125, "learning_rate": 1.5153121567235333e-07, "loss": 0.006604420021176338, "memory(GiB)": 18.17, "step": 374, "train_speed(iter/s)": 0.125001 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 107.60546875, "completions/min_length": 53.5, "epoch": 7.5, "grad_norm": 2.731383800506592, "kl": 0.576171875, "learning_rate": 1.492625561468393e-07, "loss": -0.005473949480801821, "memory(GiB)": 18.17, "reward": 0.41762372851371765, "reward_std": 0.019964593462646008, "rewards/MCQ_Reward/mean": 0.41762372851371765, "rewards/MCQ_Reward/std": 0.08107879385352135, "step": 375, "train_speed(iter/s)": 0.124937 }, { "clip_ratio": 0.004663396626710892, "epoch": 7.52, "grad_norm": 2.615187406539917, "kl": 0.576171875, "learning_rate": 1.4700802427442178e-07, "loss": -0.005246948450803757, "memory(GiB)": 18.17, "step": 376, "train_speed(iter/s)": 0.125201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.5, "completions/mean_length": 107.15625, "completions/min_length": 50.5, "epoch": 7.54, "grad_norm": 2.796724557876587, "kl": 0.640625, "learning_rate": 1.4476771086731565e-07, "loss": 0.01410718634724617, "memory(GiB)": 18.17, "reward": 0.4095290005207062, "reward_std": 0.02420712448656559, "rewards/MCQ_Reward/mean": 0.4095290005207062, "rewards/MCQ_Reward/std": 0.07465272396802902, "step": 377, "train_speed(iter/s)": 0.125163 }, { "clip_ratio": 0.006976983975619078, "epoch": 7.5600000000000005, "grad_norm": 2.945889711380005, "kl": 0.66015625, "learning_rate": 1.4254170616501827e-07, "loss": 0.014726857654750347, "memory(GiB)": 18.17, "step": 378, "train_speed(iter/s)": 0.125433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/mean_length": 118.50390625, "completions/min_length": 63.5, "epoch": 7.58, "grad_norm": 2.9761271476745605, "kl": 0.607421875, "learning_rate": 1.4033009983067452e-07, "loss": -0.004153972025960684, "memory(GiB)": 18.17, "reward": 0.42119112610816956, "reward_std": 0.02067422866821289, "rewards/MCQ_Reward/mean": 0.42119112610816956, "rewards/MCQ_Reward/std": 0.0681285560131073, "step": 379, "train_speed(iter/s)": 0.125369 }, { "clip_ratio": 0.0061764034908264875, "epoch": 7.6, "grad_norm": 3.6120944023132324, "kl": 0.6171875, "learning_rate": 1.381329809474649e-07, "loss": -0.0035073161125183105, "memory(GiB)": 18.17, "step": 380, "train_speed(iter/s)": 0.125649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 130.67578125, "completions/min_length": 79.0, "epoch": 7.62, "grad_norm": 2.3507981300354004, "kl": 0.5419921875, "learning_rate": 1.3595043801501794e-07, "loss": -0.0032176347449421883, "memory(GiB)": 18.17, "reward": 0.43415170907974243, "reward_std": 0.021646766923367977, "rewards/MCQ_Reward/mean": 0.43415170907974243, "rewards/MCQ_Reward/std": 0.11485166102647781, "step": 381, "train_speed(iter/s)": 0.125308 }, { "clip_ratio": 0.006046550814062357, "epoch": 7.64, "grad_norm": 2.5917809009552, "kl": 0.541015625, "learning_rate": 1.3378255894584462e-07, "loss": -0.0032573172356933355, "memory(GiB)": 18.17, "step": 382, "train_speed(iter/s)": 0.125575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 111.87109375, "completions/min_length": 62.5, "epoch": 7.66, "grad_norm": 3.2898316383361816, "kl": 0.84375, "learning_rate": 1.3162943106179748e-07, "loss": 0.05431316792964935, "memory(GiB)": 25.14, "reward": 0.4442131072282791, "reward_std": 0.02893070410937071, "rewards/MCQ_Reward/mean": 0.4442131072282791, "rewards/MCQ_Reward/std": 0.0882490873336792, "step": 383, "train_speed(iter/s)": 0.124772 }, { "clip_ratio": 0.005024469457566738, "epoch": 7.68, "grad_norm": 3.0035033226013184, "kl": 0.82421875, "learning_rate": 1.2949114109055414e-07, "loss": 0.054804857820272446, "memory(GiB)": 25.14, "step": 384, "train_speed(iter/s)": 0.125047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.5, "completions/mean_length": 125.80078125, "completions/min_length": 67.0, "epoch": 7.7, "grad_norm": 2.8262860774993896, "kl": 0.55078125, "learning_rate": 1.2736777516212267e-07, "loss": -0.006510823965072632, "memory(GiB)": 25.14, "reward": 0.40428027510643005, "reward_std": 0.025332522578537464, "rewards/MCQ_Reward/mean": 0.40428027510643005, "rewards/MCQ_Reward/std": 0.10921913757920265, "step": 385, "train_speed(iter/s)": 0.124957 }, { "clip_ratio": 0.005720158107578754, "epoch": 7.72, "grad_norm": 2.3165252208709717, "kl": 0.54296875, "learning_rate": 1.2525941880537304e-07, "loss": -0.006398671306669712, "memory(GiB)": 25.14, "step": 386, "train_speed(iter/s)": 0.125223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.5, "completions/mean_length": 115.546875, "completions/min_length": 68.5, "epoch": 7.74, "grad_norm": 2.5941028594970703, "kl": 0.650390625, "learning_rate": 1.2316615694459186e-07, "loss": 0.013789664953947067, "memory(GiB)": 25.14, "reward": 0.4454474151134491, "reward_std": 0.02376528736203909, "rewards/MCQ_Reward/mean": 0.4454474151134491, "rewards/MCQ_Reward/std": 0.07124818488955498, "step": 387, "train_speed(iter/s)": 0.125174 }, { "clip_ratio": 0.00573781062848866, "epoch": 7.76, "grad_norm": 2.886561393737793, "kl": 0.634765625, "learning_rate": 1.2108807389606158e-07, "loss": 0.014278584159910679, "memory(GiB)": 25.14, "step": 388, "train_speed(iter/s)": 0.125449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 121.00390625, "completions/min_length": 57.5, "epoch": 7.78, "grad_norm": 2.2996103763580322, "kl": 0.6171875, "learning_rate": 1.1902525336466462e-07, "loss": 0.012145346030592918, "memory(GiB)": 25.14, "reward": 0.42450854182243347, "reward_std": 0.021244493313133717, "rewards/MCQ_Reward/mean": 0.42450854182243347, "rewards/MCQ_Reward/std": 0.09635130688548088, "step": 389, "train_speed(iter/s)": 0.125399 }, { "clip_ratio": 0.005426776595413685, "epoch": 7.8, "grad_norm": 2.1788930892944336, "kl": 0.62890625, "learning_rate": 1.1697777844051104e-07, "loss": 0.011829939670860767, "memory(GiB)": 25.14, "step": 390, "train_speed(iter/s)": 0.125672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 129.03125, "completions/min_length": 70.0, "epoch": 7.82, "grad_norm": 2.2412619590759277, "kl": 0.53515625, "learning_rate": 1.1494573159559212e-07, "loss": 9.762030094861984e-05, "memory(GiB)": 25.14, "reward": 0.4155340790748596, "reward_std": 0.020521354861557484, "rewards/MCQ_Reward/mean": 0.4155340790748596, "rewards/MCQ_Reward/std": 0.12795967236161232, "step": 391, "train_speed(iter/s)": 0.125325 }, { "clip_ratio": 0.005442213034257293, "epoch": 7.84, "grad_norm": 2.445225954055786, "kl": 0.54296875, "learning_rate": 1.1292919468045875e-07, "loss": 0.0006964541971683502, "memory(GiB)": 25.14, "step": 392, "train_speed(iter/s)": 0.125594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.5, "completions/mean_length": 129.45703125, "completions/min_length": 68.5, "epoch": 7.86, "grad_norm": 2.254128932952881, "kl": 0.607421875, "learning_rate": 1.1092824892092373e-07, "loss": -0.010345934890210629, "memory(GiB)": 25.14, "reward": 0.40340456366539, "reward_std": 0.022636689245700836, "rewards/MCQ_Reward/mean": 0.40340456366539, "rewards/MCQ_Reward/std": 0.09724823385477066, "step": 393, "train_speed(iter/s)": 0.125579 }, { "clip_ratio": 0.004930965369567275, "epoch": 7.88, "grad_norm": 2.3455586433410645, "kl": 0.623046875, "learning_rate": 1.0894297491479043e-07, "loss": -0.009814320132136345, "memory(GiB)": 25.14, "step": 394, "train_speed(iter/s)": 0.125852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.5, "completions/mean_length": 122.03125, "completions/min_length": 72.5, "epoch": 7.9, "grad_norm": 2.7601866722106934, "kl": 0.54296875, "learning_rate": 1.0697345262860635e-07, "loss": 0.011853070929646492, "memory(GiB)": 25.14, "reward": 0.44544240832328796, "reward_std": 0.02559925615787506, "rewards/MCQ_Reward/mean": 0.44544240832328796, "rewards/MCQ_Reward/std": 0.09495911747217178, "step": 395, "train_speed(iter/s)": 0.125762 }, { "clip_ratio": 0.004873325582593679, "epoch": 7.92, "grad_norm": 3.1385254859924316, "kl": 0.541015625, "learning_rate": 1.0501976139444191e-07, "loss": 0.01212891936302185, "memory(GiB)": 25.14, "step": 396, "train_speed(iter/s)": 0.126021 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 131.75, "completions/min_length": 80.0, "epoch": 7.9399999999999995, "grad_norm": 2.280336380004883, "kl": 0.59765625, "learning_rate": 1.0308197990669537e-07, "loss": -0.0006723229307681322, "memory(GiB)": 25.14, "reward": 0.3935137987136841, "reward_std": 0.0229948153719306, "rewards/MCQ_Reward/mean": 0.3935137987136841, "rewards/MCQ_Reward/std": 0.09170003235340118, "step": 397, "train_speed(iter/s)": 0.125959 }, { "clip_ratio": 0.009115117136389017, "epoch": 7.96, "grad_norm": 2.6576101779937744, "kl": 0.623046875, "learning_rate": 1.0116018621892236e-07, "loss": -0.0008128315676003695, "memory(GiB)": 25.14, "step": 398, "train_speed(iter/s)": 0.126231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.5, "completions/mean_length": 125.65625, "completions/min_length": 67.0, "epoch": 7.98, "grad_norm": 2.7158310413360596, "kl": 0.58203125, "learning_rate": 9.92544577406923e-08, "loss": 0.006697420962154865, "memory(GiB)": 25.14, "reward": 0.43207649886608124, "reward_std": 0.02400553785264492, "rewards/MCQ_Reward/mean": 0.43207649886608124, "rewards/MCQ_Reward/std": 0.0867740847170353, "step": 399, "train_speed(iter/s)": 0.126178 }, { "clip_ratio": 0.005927033722400665, "epoch": 8.0, "grad_norm": 2.416578769683838, "kl": 0.580078125, "learning_rate": 9.736487123447068e-08, "loss": 0.006666385568678379, "memory(GiB)": 25.14, "step": 400, "train_speed(iter/s)": 0.126428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 128.03515625, "completions/min_length": 68.0, "epoch": 8.02, "grad_norm": 2.4625000953674316, "kl": 0.55078125, "learning_rate": 9.549150281252632e-08, "loss": 0.019197747111320496, "memory(GiB)": 25.14, "reward": 0.41131871938705444, "reward_std": 0.02179474849253893, "rewards/MCQ_Reward/mean": 0.41131871938705444, "rewards/MCQ_Reward/std": 0.0903569795191288, "step": 401, "train_speed(iter/s)": 0.12607 }, { "clip_ratio": 0.004682507831603289, "epoch": 8.04, "grad_norm": 2.4578921794891357, "kl": 0.556640625, "learning_rate": 9.363442793386606e-08, "loss": 0.019492177292704582, "memory(GiB)": 25.14, "step": 402, "train_speed(iter/s)": 0.126333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.5, "completions/mean_length": 124.9453125, "completions/min_length": 65.0, "epoch": 8.06, "grad_norm": 2.380934000015259, "kl": 0.595703125, "learning_rate": 9.179372140119524e-08, "loss": 0.00032033398747444153, "memory(GiB)": 25.14, "reward": 0.45213624835014343, "reward_std": 0.019670803099870682, "rewards/MCQ_Reward/mean": 0.45213624835014343, "rewards/MCQ_Reward/std": 0.05602107755839825, "step": 403, "train_speed(iter/s)": 0.126289 }, { "clip_ratio": 0.005494384560734034, "epoch": 8.08, "grad_norm": 2.2825376987457275, "kl": 0.59765625, "learning_rate": 8.996945735790446e-08, "loss": 0.00025699660181999207, "memory(GiB)": 25.14, "step": 404, "train_speed(iter/s)": 0.126553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 113.30078125, "completions/min_length": 66.0, "epoch": 8.1, "grad_norm": 2.4504525661468506, "kl": 0.65234375, "learning_rate": 8.816170928508365e-08, "loss": 0.005521825514733791, "memory(GiB)": 25.14, "reward": 0.4200716018676758, "reward_std": 0.02163711003959179, "rewards/MCQ_Reward/mean": 0.4200716018676758, "rewards/MCQ_Reward/std": 0.09177059680223465, "step": 405, "train_speed(iter/s)": 0.126487 }, { "clip_ratio": 0.005122944712638855, "epoch": 8.12, "grad_norm": 2.5025854110717773, "kl": 0.65234375, "learning_rate": 8.637054999856147e-08, "loss": 0.005893816705793142, "memory(GiB)": 25.14, "step": 406, "train_speed(iter/s)": 0.126707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 131.84765625, "completions/min_length": 84.0, "epoch": 8.14, "grad_norm": 2.2803900241851807, "kl": 0.677734375, "learning_rate": 8.459605164597267e-08, "loss": 0.002506987191736698, "memory(GiB)": 25.14, "reward": 0.42351874709129333, "reward_std": 0.019920101389288902, "rewards/MCQ_Reward/mean": 0.42351874709129333, "rewards/MCQ_Reward/std": 0.07087348401546478, "step": 407, "train_speed(iter/s)": 0.126629 }, { "clip_ratio": 0.004146608873270452, "epoch": 8.16, "grad_norm": 2.197411060333252, "kl": 0.693359375, "learning_rate": 8.283828570385237e-08, "loss": 0.0028184172697365284, "memory(GiB)": 25.14, "step": 408, "train_speed(iter/s)": 0.126894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.5, "completions/mean_length": 126.35546875, "completions/min_length": 55.0, "epoch": 8.18, "grad_norm": 3.133226156234741, "kl": 0.54296875, "learning_rate": 8.109732297475635e-08, "loss": 0.003347148187458515, "memory(GiB)": 25.14, "reward": 0.4289032816886902, "reward_std": 0.023678142577409744, "rewards/MCQ_Reward/mean": 0.4289032816886902, "rewards/MCQ_Reward/std": 0.08180082961916924, "step": 409, "train_speed(iter/s)": 0.126716 }, { "clip_ratio": 0.004793429281562567, "epoch": 8.2, "grad_norm": 2.647909164428711, "kl": 0.548828125, "learning_rate": 7.937323358440934e-08, "loss": 0.003219081088900566, "memory(GiB)": 25.14, "step": 410, "train_speed(iter/s)": 0.126979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 120.65234375, "completions/min_length": 66.0, "epoch": 8.22, "grad_norm": 2.844910144805908, "kl": 1.08984375, "learning_rate": 7.766608697888094e-08, "loss": 0.00578346848487854, "memory(GiB)": 25.14, "reward": 0.40613003075122833, "reward_std": 0.024234792217612267, "rewards/MCQ_Reward/mean": 0.40613003075122833, "rewards/MCQ_Reward/std": 0.10613492503762245, "step": 411, "train_speed(iter/s)": 0.126628 }, { "clip_ratio": 0.008466396480798721, "epoch": 8.24, "grad_norm": 3.322730779647827, "kl": 1.30859375, "learning_rate": 7.597595192178702e-08, "loss": 0.006200029980391264, "memory(GiB)": 25.14, "step": 412, "train_speed(iter/s)": 0.126892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/mean_length": 120.59375, "completions/min_length": 63.5, "epoch": 8.26, "grad_norm": 3.1121227741241455, "kl": 0.57421875, "learning_rate": 7.430289649152155e-08, "loss": -0.005076010245829821, "memory(GiB)": 25.14, "reward": 0.4349597841501236, "reward_std": 0.022311867214739323, "rewards/MCQ_Reward/mean": 0.4349597841501236, "rewards/MCQ_Reward/std": 0.0992676205933094, "step": 413, "train_speed(iter/s)": 0.126827 }, { "clip_ratio": 0.005325015634298325, "epoch": 8.28, "grad_norm": 3.336932897567749, "kl": 0.5859375, "learning_rate": 7.264698807851327e-08, "loss": -0.004951636306941509, "memory(GiB)": 25.14, "step": 414, "train_speed(iter/s)": 0.127083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 122.34765625, "completions/min_length": 80.0, "epoch": 8.3, "grad_norm": 2.32357120513916, "kl": 0.576171875, "learning_rate": 7.100829338251146e-08, "loss": 0.010018033906817436, "memory(GiB)": 25.14, "reward": 0.46219733357429504, "reward_std": 0.023064136505126953, "rewards/MCQ_Reward/mean": 0.46219733357429504, "rewards/MCQ_Reward/std": 0.10461203381419182, "step": 415, "train_speed(iter/s)": 0.127059 }, { "clip_ratio": 0.004823329858481884, "epoch": 8.32, "grad_norm": 2.399235486984253, "kl": 0.56640625, "learning_rate": 6.938687840989971e-08, "loss": 0.010338631458580494, "memory(GiB)": 25.14, "step": 416, "train_speed(iter/s)": 0.127319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 126.6484375, "completions/min_length": 59.5, "epoch": 8.34, "grad_norm": 2.3096046447753906, "kl": 0.59765625, "learning_rate": 6.778280847103667e-08, "loss": 0.007643429096788168, "memory(GiB)": 25.14, "reward": 0.45115791261196136, "reward_std": 0.026236201636493206, "rewards/MCQ_Reward/mean": 0.45115791261196136, "rewards/MCQ_Reward/std": 0.07101332768797874, "step": 417, "train_speed(iter/s)": 0.127229 }, { "clip_ratio": 0.00613890727981925, "epoch": 8.36, "grad_norm": 2.6392662525177, "kl": 0.599609375, "learning_rate": 6.619614817762536e-08, "loss": 0.00813712365925312, "memory(GiB)": 25.14, "step": 418, "train_speed(iter/s)": 0.127474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.5, "completions/mean_length": 128.6484375, "completions/min_length": 70.5, "epoch": 8.38, "grad_norm": 2.6424126625061035, "kl": 0.5546875, "learning_rate": 6.462696144011148e-08, "loss": 0.01095396839082241, "memory(GiB)": 25.14, "reward": 0.43093007802963257, "reward_std": 0.021352089941501617, "rewards/MCQ_Reward/mean": 0.43093007802963257, "rewards/MCQ_Reward/std": 0.09322765283286572, "step": 419, "train_speed(iter/s)": 0.127401 }, { "clip_ratio": 0.005334047833457589, "epoch": 8.4, "grad_norm": 2.514528751373291, "kl": 0.560546875, "learning_rate": 6.307531146510753e-08, "loss": 0.011139345355331898, "memory(GiB)": 25.14, "step": 420, "train_speed(iter/s)": 0.127655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 121.234375, "completions/min_length": 61.5, "epoch": 8.42, "grad_norm": 2.6931869983673096, "kl": 0.576171875, "learning_rate": 6.154126075284855e-08, "loss": -0.004434285219758749, "memory(GiB)": 25.14, "reward": 0.47386451065540314, "reward_std": 0.02479046955704689, "rewards/MCQ_Reward/mean": 0.47386451065540314, "rewards/MCQ_Reward/std": 0.08362133055925369, "step": 421, "train_speed(iter/s)": 0.127304 }, { "clip_ratio": 0.004985473584383726, "epoch": 8.44, "grad_norm": 2.623483896255493, "kl": 0.5859375, "learning_rate": 6.002487109467347e-08, "loss": -0.004044556524604559, "memory(GiB)": 25.14, "step": 422, "train_speed(iter/s)": 0.12756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/mean_length": 120.3359375, "completions/min_length": 57.0, "epoch": 8.46, "grad_norm": 2.4557580947875977, "kl": 0.54296875, "learning_rate": 5.8526203570536504e-08, "loss": -0.0014804373495280743, "memory(GiB)": 25.14, "reward": 0.38437609374523163, "reward_std": 0.019576413556933403, "rewards/MCQ_Reward/mean": 0.38437609374523163, "rewards/MCQ_Reward/std": 0.08220572769641876, "step": 423, "train_speed(iter/s)": 0.12751 }, { "clip_ratio": 0.005047354847192764, "epoch": 8.48, "grad_norm": 2.414680004119873, "kl": 0.548828125, "learning_rate": 5.70453185465472e-08, "loss": -0.0010703507578000426, "memory(GiB)": 25.14, "step": 424, "train_speed(iter/s)": 0.127763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/mean_length": 109.29296875, "completions/min_length": 59.0, "epoch": 8.5, "grad_norm": 2.3690483570098877, "kl": 0.59375, "learning_rate": 5.5582275672538316e-08, "loss": 0.0056993430480360985, "memory(GiB)": 25.14, "reward": 0.404767170548439, "reward_std": 0.024388392455875874, "rewards/MCQ_Reward/mean": 0.404767170548439, "rewards/MCQ_Reward/std": 0.09245007485151291, "step": 425, "train_speed(iter/s)": 0.127734 }, { "clip_ratio": 0.004816505592316389, "epoch": 8.52, "grad_norm": 2.3456268310546875, "kl": 0.59765625, "learning_rate": 5.4137133879663287e-08, "loss": 0.005467045586556196, "memory(GiB)": 25.14, "step": 426, "train_speed(iter/s)": 0.127977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.5, "completions/mean_length": 131.4375, "completions/min_length": 65.5, "epoch": 8.54, "grad_norm": 2.3816792964935303, "kl": 0.55078125, "learning_rate": 5.270995137802314e-08, "loss": 0.0031818237621337175, "memory(GiB)": 25.14, "reward": 0.38306334614753723, "reward_std": 0.02167375199496746, "rewards/MCQ_Reward/mean": 0.38306334614753723, "rewards/MCQ_Reward/std": 0.12913303077220917, "step": 427, "train_speed(iter/s)": 0.12777 }, { "clip_ratio": 0.005708938697353005, "epoch": 8.56, "grad_norm": 2.7459070682525635, "kl": 0.560546875, "learning_rate": 5.1300785654320886e-08, "loss": 0.0036508457269519567, "memory(GiB)": 25.14, "step": 428, "train_speed(iter/s)": 0.128012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 141.1796875, "completions/min_length": 63.5, "epoch": 8.58, "grad_norm": 2.546011447906494, "kl": 0.560546875, "learning_rate": 4.9909693469546097e-08, "loss": -0.0037225554697215557, "memory(GiB)": 25.14, "reward": 0.4553868919610977, "reward_std": 0.024206943809986115, "rewards/MCQ_Reward/mean": 0.4553868919610977, "rewards/MCQ_Reward/std": 0.10913475230336189, "step": 429, "train_speed(iter/s)": 0.127896 }, { "clip_ratio": 0.005615573842078447, "epoch": 8.6, "grad_norm": 2.4503653049468994, "kl": 0.552734375, "learning_rate": 4.853673085668947e-08, "loss": -0.0035459164064377546, "memory(GiB)": 25.14, "step": 430, "train_speed(iter/s)": 0.128133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.5, "completions/mean_length": 121.25, "completions/min_length": 68.0, "epoch": 8.62, "grad_norm": 2.6130316257476807, "kl": 0.560546875, "learning_rate": 4.718195311848455e-08, "loss": 0.006583400070667267, "memory(GiB)": 25.14, "reward": 0.4170517176389694, "reward_std": 0.022290964610874653, "rewards/MCQ_Reward/mean": 0.4170517176389694, "rewards/MCQ_Reward/std": 0.10183962434530258, "step": 431, "train_speed(iter/s)": 0.12785 }, { "clip_ratio": 0.0055829116608947515, "epoch": 8.64, "grad_norm": 2.6913576126098633, "kl": 0.572265625, "learning_rate": 4.5845414825181394e-08, "loss": 0.006918736733496189, "memory(GiB)": 25.14, "step": 432, "train_speed(iter/s)": 0.128096 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.5, "completions/mean_length": 113.8046875, "completions/min_length": 74.0, "epoch": 8.66, "grad_norm": 2.4241960048675537, "kl": 0.6201171875, "learning_rate": 4.452716981234744e-08, "loss": 0.011290742084383965, "memory(GiB)": 25.14, "reward": 0.4250094145536423, "reward_std": 0.022951221093535423, "rewards/MCQ_Reward/mean": 0.4250094145536423, "rewards/MCQ_Reward/std": 0.10084276273846626, "step": 433, "train_speed(iter/s)": 0.128069 }, { "clip_ratio": 0.005609560292214155, "epoch": 8.68, "grad_norm": 2.5790963172912598, "kl": 0.650390625, "learning_rate": 4.322727117869951e-08, "loss": 0.011948860250413418, "memory(GiB)": 25.14, "step": 434, "train_speed(iter/s)": 0.128291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 126.3515625, "completions/min_length": 83.5, "epoch": 8.7, "grad_norm": 2.430708885192871, "kl": 0.5390625, "learning_rate": 4.19457712839652e-08, "loss": -0.008761925622820854, "memory(GiB)": 25.14, "reward": 0.43507225811481476, "reward_std": 0.024821095168590546, "rewards/MCQ_Reward/mean": 0.43507225811481476, "rewards/MCQ_Reward/std": 0.10436990112066269, "step": 435, "train_speed(iter/s)": 0.128196 }, { "clip_ratio": 0.004881069879047573, "epoch": 8.72, "grad_norm": 2.439311981201172, "kl": 0.5400390625, "learning_rate": 4.068272174677334e-08, "loss": -0.00834021344780922, "memory(GiB)": 25.14, "step": 436, "train_speed(iter/s)": 0.128446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/mean_length": 118.14453125, "completions/min_length": 67.5, "epoch": 8.74, "grad_norm": 2.607220411300659, "kl": 0.619140625, "learning_rate": 3.9438173442575e-08, "loss": 0.005073768552392721, "memory(GiB)": 25.14, "reward": 0.4522544592618942, "reward_std": 0.024327417835593224, "rewards/MCQ_Reward/mean": 0.4522544592618942, "rewards/MCQ_Reward/std": 0.08557374030351639, "step": 437, "train_speed(iter/s)": 0.128414 }, { "clip_ratio": 0.005367731209844351, "epoch": 8.76, "grad_norm": 2.472538709640503, "kl": 0.626953125, "learning_rate": 3.821217650159453e-08, "loss": 0.005441693589091301, "memory(GiB)": 25.14, "step": 438, "train_speed(iter/s)": 0.128664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/mean_length": 117.36328125, "completions/min_length": 65.5, "epoch": 8.78, "grad_norm": 2.8752048015594482, "kl": 0.62109375, "learning_rate": 3.700478030680987e-08, "loss": 0.001543362159281969, "memory(GiB)": 25.14, "reward": 0.44734521210193634, "reward_std": 0.02054190542548895, "rewards/MCQ_Reward/mean": 0.44734521210193634, "rewards/MCQ_Reward/std": 0.09018547832965851, "step": 439, "train_speed(iter/s)": 0.128624 }, { "clip_ratio": 0.006753503577783704, "epoch": 8.8, "grad_norm": 2.822502374649048, "kl": 0.625, "learning_rate": 3.581603349196371e-08, "loss": 0.0017494899220764637, "memory(GiB)": 25.14, "step": 440, "train_speed(iter/s)": 0.128861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.5, "completions/mean_length": 117.40234375, "completions/min_length": 62.0, "epoch": 8.82, "grad_norm": 2.5104751586914062, "kl": 0.59375, "learning_rate": 3.464598393960449e-08, "loss": -0.004553473554551601, "memory(GiB)": 25.14, "reward": 0.39943838119506836, "reward_std": 0.023083772510290146, "rewards/MCQ_Reward/mean": 0.39943838119506836, "rewards/MCQ_Reward/std": 0.08860309049487114, "step": 441, "train_speed(iter/s)": 0.128489 }, { "clip_ratio": 0.00470179901458323, "epoch": 8.84, "grad_norm": 2.480741500854492, "kl": 0.58984375, "learning_rate": 3.349467877915746e-08, "loss": -0.004542327020317316, "memory(GiB)": 25.14, "step": 442, "train_speed(iter/s)": 0.128733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 127.69140625, "completions/min_length": 50.0, "epoch": 8.86, "grad_norm": 2.399143934249878, "kl": 0.607421875, "learning_rate": 3.23621643850267e-08, "loss": -0.004238632973283529, "memory(GiB)": 25.14, "reward": 0.40998475253582, "reward_std": 0.02201936673372984, "rewards/MCQ_Reward/mean": 0.40998475253582, "rewards/MCQ_Reward/std": 0.0800128486007452, "step": 443, "train_speed(iter/s)": 0.128561 }, { "clip_ratio": 0.006211797473952174, "epoch": 8.88, "grad_norm": 2.5745253562927246, "kl": 0.603515625, "learning_rate": 3.124848637472688e-08, "loss": -0.003581822384148836, "memory(GiB)": 25.14, "step": 444, "train_speed(iter/s)": 0.128809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.5, "completions/mean_length": 128.41015625, "completions/min_length": 71.0, "epoch": 8.9, "grad_norm": 2.989118814468384, "kl": 0.6640625, "learning_rate": 3.015368960704584e-08, "loss": 0.0020642182789742947, "memory(GiB)": 25.14, "reward": 0.45626600086688995, "reward_std": 0.022524941712617874, "rewards/MCQ_Reward/mean": 0.45626600086688995, "rewards/MCQ_Reward/std": 0.08293722942471504, "step": 445, "train_speed(iter/s)": 0.128751 }, { "clip_ratio": 0.0053639879915863276, "epoch": 8.92, "grad_norm": 2.226865291595459, "kl": 0.65234375, "learning_rate": 2.907781818023769e-08, "loss": 0.0022344959434121847, "memory(GiB)": 25.14, "step": 446, "train_speed(iter/s)": 0.128997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.5, "completions/mean_length": 114.81640625, "completions/min_length": 69.5, "epoch": 8.94, "grad_norm": 2.5736968517303467, "kl": 0.626953125, "learning_rate": 2.8020915430246706e-08, "loss": 0.00543589424341917, "memory(GiB)": 25.14, "reward": 0.4480299800634384, "reward_std": 0.021618574857711792, "rewards/MCQ_Reward/mean": 0.4480299800634384, "rewards/MCQ_Reward/std": 0.08090543001890182, "step": 447, "train_speed(iter/s)": 0.128968 }, { "clip_ratio": 0.005519783589988947, "epoch": 8.96, "grad_norm": 2.7313241958618164, "kl": 0.62890625, "learning_rate": 2.69830239289614e-08, "loss": 0.005457316525280476, "memory(GiB)": 25.14, "step": 448, "train_speed(iter/s)": 0.12921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 114.08203125, "completions/min_length": 69.5, "epoch": 8.98, "grad_norm": 3.3176426887512207, "kl": 0.658203125, "learning_rate": 2.596418548250029e-08, "loss": -0.006901263725012541, "memory(GiB)": 25.14, "reward": 0.4552987068891525, "reward_std": 0.02576339803636074, "rewards/MCQ_Reward/mean": 0.4552987068891525, "rewards/MCQ_Reward/std": 0.09829828701913357, "step": 449, "train_speed(iter/s)": 0.129186 }, { "clip_ratio": 0.005895850248634815, "epoch": 9.0, "grad_norm": 3.1435494422912598, "kl": 0.65625, "learning_rate": 2.4964441129527335e-08, "loss": -0.006242312025278807, "memory(GiB)": 25.14, "step": 450, "train_speed(iter/s)": 0.129418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.5, "completions/mean_length": 107.38671875, "completions/min_length": 61.0, "epoch": 9.02, "grad_norm": 2.6646904945373535, "kl": 0.60546875, "learning_rate": 2.3983831139599286e-08, "loss": 0.006207154132425785, "memory(GiB)": 25.14, "reward": 0.39446285367012024, "reward_std": 0.022946057841181755, "rewards/MCQ_Reward/mean": 0.39446285367012024, "rewards/MCQ_Reward/std": 0.1063094437122345, "step": 451, "train_speed(iter/s)": 0.129116 }, { "clip_ratio": 0.005521278129890561, "epoch": 9.04, "grad_norm": 2.453953504562378, "kl": 0.619140625, "learning_rate": 2.3022395011543682e-08, "loss": 0.006389847490936518, "memory(GiB)": 25.14, "step": 452, "train_speed(iter/s)": 0.129358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.5, "completions/mean_length": 128.57421875, "completions/min_length": 55.0, "epoch": 9.06, "grad_norm": 2.812540054321289, "kl": 0.580078125, "learning_rate": 2.208017147186736e-08, "loss": -0.005320190917700529, "memory(GiB)": 25.14, "reward": 0.41816772520542145, "reward_std": 0.023720718920230865, "rewards/MCQ_Reward/mean": 0.41816772520542145, "rewards/MCQ_Reward/std": 0.11730682849884033, "step": 453, "train_speed(iter/s)": 0.129235 }, { "clip_ratio": 0.005719892680644989, "epoch": 9.08, "grad_norm": 2.8398780822753906, "kl": 0.578125, "learning_rate": 2.1157198473197413e-08, "loss": -0.004547153599560261, "memory(GiB)": 25.14, "step": 454, "train_speed(iter/s)": 0.129473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/mean_length": 121.10546875, "completions/min_length": 61.0, "epoch": 9.1, "grad_norm": 2.6457087993621826, "kl": 0.623046875, "learning_rate": 2.025351319275137e-08, "loss": 0.006458953022956848, "memory(GiB)": 25.14, "reward": 0.4360807240009308, "reward_std": 0.023424276150763035, "rewards/MCQ_Reward/mean": 0.4360807240009308, "rewards/MCQ_Reward/std": 0.08403830602765083, "step": 455, "train_speed(iter/s)": 0.129418 }, { "clip_ratio": 0.007413617800921202, "epoch": 9.12, "grad_norm": 3.019871473312378, "kl": 0.615234375, "learning_rate": 1.936915203084055e-08, "loss": 0.007484931964427233, "memory(GiB)": 25.14, "step": 456, "train_speed(iter/s)": 0.129657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.5, "completions/mean_length": 115.48828125, "completions/min_length": 62.0, "epoch": 9.14, "grad_norm": 2.869127035140991, "kl": 0.5703125, "learning_rate": 1.8504150609403856e-08, "loss": 0.002277131425216794, "memory(GiB)": 25.14, "reward": 0.42605504393577576, "reward_std": 0.02147796005010605, "rewards/MCQ_Reward/mean": 0.42605504393577576, "rewards/MCQ_Reward/std": 0.09400845319032669, "step": 457, "train_speed(iter/s)": 0.129623 }, { "clip_ratio": 0.00495463190600276, "epoch": 9.16, "grad_norm": 2.7837038040161133, "kl": 0.564453125, "learning_rate": 1.7658543770572186e-08, "loss": 0.0023261206224560738, "memory(GiB)": 25.14, "step": 458, "train_speed(iter/s)": 0.129859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 131.125, "completions/min_length": 63.0, "epoch": 9.18, "grad_norm": 2.4485437870025635, "kl": 0.564453125, "learning_rate": 1.683236557526574e-08, "loss": -0.001264197751879692, "memory(GiB)": 25.14, "reward": 0.43159276247024536, "reward_std": 0.02392040565609932, "rewards/MCQ_Reward/mean": 0.43159276247024536, "rewards/MCQ_Reward/std": 0.10159046202898026, "step": 459, "train_speed(iter/s)": 0.129693 }, { "clip_ratio": 0.004053628304973245, "epoch": 9.2, "grad_norm": 2.3056235313415527, "kl": 0.5625, "learning_rate": 1.6025649301821875e-08, "loss": -0.000987461768090725, "memory(GiB)": 25.14, "step": 460, "train_speed(iter/s)": 0.129933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.5, "completions/mean_length": 113.18359375, "completions/min_length": 65.5, "epoch": 9.22, "grad_norm": 2.3913767337799072, "kl": 0.544921875, "learning_rate": 1.5238427444654367e-08, "loss": 0.012515128590166569, "memory(GiB)": 25.14, "reward": 0.4141518771648407, "reward_std": 0.019386641681194305, "rewards/MCQ_Reward/mean": 0.4141518771648407, "rewards/MCQ_Reward/std": 0.09657716751098633, "step": 461, "train_speed(iter/s)": 0.129665 }, { "clip_ratio": 0.005686681717634201, "epoch": 9.24, "grad_norm": 2.5303232669830322, "kl": 0.544921875, "learning_rate": 1.4470731712944883e-08, "loss": 0.013128566555678844, "memory(GiB)": 25.14, "step": 462, "train_speed(iter/s)": 0.129891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/mean_length": 113.08984375, "completions/min_length": 68.0, "epoch": 9.26, "grad_norm": 2.9452006816864014, "kl": 0.578125, "learning_rate": 1.3722593029365459e-08, "loss": 0.01786494255065918, "memory(GiB)": 25.14, "reward": 0.4347621351480484, "reward_std": 0.023103663697838783, "rewards/MCQ_Reward/mean": 0.4347621351480484, "rewards/MCQ_Reward/std": 0.10107803344726562, "step": 463, "train_speed(iter/s)": 0.129821 }, { "clip_ratio": 0.004837532993406057, "epoch": 9.28, "grad_norm": 3.270838499069214, "kl": 0.576171875, "learning_rate": 1.2994041528833267e-08, "loss": 0.01855536922812462, "memory(GiB)": 25.14, "step": 464, "train_speed(iter/s)": 0.130055 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/mean_length": 130.0703125, "completions/min_length": 61.0, "epoch": 9.3, "grad_norm": 2.5287396907806396, "kl": 0.5703125, "learning_rate": 1.2285106557296476e-08, "loss": -0.009716257452964783, "memory(GiB)": 25.14, "reward": 0.4242394268512726, "reward_std": 0.024817454628646374, "rewards/MCQ_Reward/mean": 0.4242394268512726, "rewards/MCQ_Reward/std": 0.11753027141094208, "step": 465, "train_speed(iter/s)": 0.129996 }, { "clip_ratio": 0.0049513031262904406, "epoch": 9.32, "grad_norm": 2.6941351890563965, "kl": 0.56640625, "learning_rate": 1.1595816670552428e-08, "loss": -0.009578550234436989, "memory(GiB)": 25.14, "step": 466, "train_speed(iter/s)": 0.130232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/mean_length": 120.80859375, "completions/min_length": 79.0, "epoch": 9.34, "grad_norm": 2.4061837196350098, "kl": 0.580078125, "learning_rate": 1.0926199633097154e-08, "loss": 0.009803004562854767, "memory(GiB)": 25.14, "reward": 0.4236748516559601, "reward_std": 0.020633171312510967, "rewards/MCQ_Reward/mean": 0.4236748516559601, "rewards/MCQ_Reward/std": 0.10525783523917198, "step": 467, "train_speed(iter/s)": 0.130202 }, { "clip_ratio": 0.0038570521865040064, "epoch": 9.36, "grad_norm": 2.538754463195801, "kl": 0.576171875, "learning_rate": 1.0276282417007399e-08, "loss": 0.010506462305784225, "memory(GiB)": 25.14, "step": 468, "train_speed(iter/s)": 0.130419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/mean_length": 115.359375, "completions/min_length": 72.5, "epoch": 9.38, "grad_norm": 2.767404317855835, "kl": 0.58203125, "learning_rate": 9.646091200853801e-09, "loss": 0.002447181846946478, "memory(GiB)": 25.14, "reward": 0.4558543264865875, "reward_std": 0.023351009003818035, "rewards/MCQ_Reward/mean": 0.4558543264865875, "rewards/MCQ_Reward/std": 0.10045822337269783, "step": 469, "train_speed(iter/s)": 0.130376 }, { "clip_ratio": 0.003978088265284896, "epoch": 9.4, "grad_norm": 2.3947746753692627, "kl": 0.58984375, "learning_rate": 9.035651368646646e-09, "loss": 0.0025905624497681856, "memory(GiB)": 25.14, "step": 470, "train_speed(iter/s)": 0.130609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 120.05078125, "completions/min_length": 61.0, "epoch": 9.42, "grad_norm": 2.2213082313537598, "kl": 0.595703125, "learning_rate": 8.44498750881345e-09, "loss": 0.022836437448859215, "memory(GiB)": 25.14, "reward": 0.4252375066280365, "reward_std": 0.02044745907187462, "rewards/MCQ_Reward/mean": 0.4252375066280365, "rewards/MCQ_Reward/std": 0.0874844454228878, "step": 471, "train_speed(iter/s)": 0.130308 }, { "clip_ratio": 0.004947596346028149, "epoch": 9.44, "grad_norm": 2.374445676803589, "kl": 0.599609375, "learning_rate": 7.874123413208145e-09, "loss": 0.02313510701060295, "memory(GiB)": 25.14, "step": 472, "train_speed(iter/s)": 0.130541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.5, "completions/mean_length": 122.046875, "completions/min_length": 59.0, "epoch": 9.46, "grad_norm": 2.6664299964904785, "kl": 0.626953125, "learning_rate": 7.323082076153508e-09, "loss": 0.0047410172410309315, "memory(GiB)": 25.14, "reward": 0.42370498180389404, "reward_std": 0.021436103619635105, "rewards/MCQ_Reward/mean": 0.42370498180389404, "rewards/MCQ_Reward/std": 0.11163535714149475, "step": 473, "train_speed(iter/s)": 0.130462 }, { "clip_ratio": 0.005457588471472263, "epoch": 9.48, "grad_norm": 2.7726047039031982, "kl": 0.626953125, "learning_rate": 6.791885693514132e-09, "loss": 0.005159153137356043, "memory(GiB)": 25.14, "step": 474, "train_speed(iter/s)": 0.130692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.5, "completions/mean_length": 136.453125, "completions/min_length": 83.0, "epoch": 9.5, "grad_norm": 2.2565746307373047, "kl": 0.595703125, "learning_rate": 6.280555661802856e-09, "loss": 0.011247138492763042, "memory(GiB)": 25.14, "reward": 0.4296618103981018, "reward_std": 0.021635888144373894, "rewards/MCQ_Reward/mean": 0.4296618103981018, "rewards/MCQ_Reward/std": 0.06789225153625011, "step": 475, "train_speed(iter/s)": 0.130512 }, { "clip_ratio": 0.005767492577433586, "epoch": 9.52, "grad_norm": 2.250284433364868, "kl": 0.6015625, "learning_rate": 5.789112577318789e-09, "loss": 0.011374367401003838, "memory(GiB)": 25.14, "step": 476, "train_speed(iter/s)": 0.130746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.5, "completions/mean_length": 118.5703125, "completions/min_length": 73.5, "epoch": 9.54, "grad_norm": 2.5178654193878174, "kl": 0.728515625, "learning_rate": 5.317576235317756e-09, "loss": 0.007045174017548561, "memory(GiB)": 25.14, "reward": 0.44049952924251556, "reward_std": 0.02334336470812559, "rewards/MCQ_Reward/mean": 0.44049952924251556, "rewards/MCQ_Reward/std": 0.0808117426931858, "step": 477, "train_speed(iter/s)": 0.130671 }, { "clip_ratio": 0.004105736967176199, "epoch": 9.56, "grad_norm": 2.5065832138061523, "kl": 0.6953125, "learning_rate": 4.865965629214819e-09, "loss": 0.007527303881943226, "memory(GiB)": 25.14, "step": 478, "train_speed(iter/s)": 0.130887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 117.73046875, "completions/min_length": 75.0, "epoch": 9.58, "grad_norm": 3.128554105758667, "kl": 0.59765625, "learning_rate": 4.434298949819448e-09, "loss": -0.021542608737945557, "memory(GiB)": 25.14, "reward": 0.4070900082588196, "reward_std": 0.023668975569307804, "rewards/MCQ_Reward/mean": 0.4070900082588196, "rewards/MCQ_Reward/std": 0.08471970073878765, "step": 479, "train_speed(iter/s)": 0.130803 }, { "clip_ratio": 0.00539792119525373, "epoch": 9.6, "grad_norm": 3.067028045654297, "kl": 0.59765625, "learning_rate": 4.022593584602329e-09, "loss": -0.02082860842347145, "memory(GiB)": 25.14, "step": 480, "train_speed(iter/s)": 0.131034 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 130.140625, "completions/min_length": 54.0, "epoch": 9.62, "grad_norm": 2.8921902179718018, "kl": 0.59375, "learning_rate": 3.6308661169957565e-09, "loss": -0.0016225441358983517, "memory(GiB)": 25.14, "reward": 0.42697805166244507, "reward_std": 0.0217811968177557, "rewards/MCQ_Reward/mean": 0.42697805166244507, "rewards/MCQ_Reward/std": 0.0660354271531105, "step": 481, "train_speed(iter/s)": 0.130674 }, { "clip_ratio": 0.007906233426183462, "epoch": 9.64, "grad_norm": 2.9274981021881104, "kl": 0.595703125, "learning_rate": 3.2591323257248894e-09, "loss": -0.0016696015372872353, "memory(GiB)": 25.14, "step": 482, "train_speed(iter/s)": 0.130879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 135.94921875, "completions/min_length": 71.0, "epoch": 9.66, "grad_norm": 2.4433958530426025, "kl": 0.5546875, "learning_rate": 2.9074071841727054e-09, "loss": 0.019563939422369003, "memory(GiB)": 25.14, "reward": 0.42691025137901306, "reward_std": 0.020791654475033283, "rewards/MCQ_Reward/mean": 0.42691025137901306, "rewards/MCQ_Reward/std": 0.0828494131565094, "step": 483, "train_speed(iter/s)": 0.13078 }, { "clip_ratio": 0.004861004883423448, "epoch": 9.68, "grad_norm": 2.2269864082336426, "kl": 0.55859375, "learning_rate": 2.5757048597765395e-09, "loss": 0.019545655697584152, "memory(GiB)": 25.14, "step": 484, "train_speed(iter/s)": 0.131008 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 141.06640625, "completions/min_length": 89.0, "epoch": 9.7, "grad_norm": 2.19620418548584, "kl": 0.513671875, "learning_rate": 2.2640387134577053e-09, "loss": 0.010847845114767551, "memory(GiB)": 25.14, "reward": 0.42219071090221405, "reward_std": 0.022757427766919136, "rewards/MCQ_Reward/mean": 0.42219071090221405, "rewards/MCQ_Reward/std": 0.0853536631911993, "step": 485, "train_speed(iter/s)": 0.130923 }, { "clip_ratio": 0.006320674438029528, "epoch": 9.72, "grad_norm": 2.1190598011016846, "kl": 0.5048828125, "learning_rate": 1.9724212990830936e-09, "loss": 0.010512834414839745, "memory(GiB)": 25.14, "step": 486, "train_speed(iter/s)": 0.13115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.5, "completions/mean_length": 111.921875, "completions/min_length": 63.5, "epoch": 9.74, "grad_norm": 2.5479891300201416, "kl": 0.59765625, "learning_rate": 1.7008643629596864e-09, "loss": -0.008141995407640934, "memory(GiB)": 25.14, "reward": 0.41020119190216064, "reward_std": 0.022871771827340126, "rewards/MCQ_Reward/mean": 0.41020119190216064, "rewards/MCQ_Reward/std": 0.10586465150117874, "step": 487, "train_speed(iter/s)": 0.131123 }, { "clip_ratio": 0.004743925994262099, "epoch": 9.76, "grad_norm": 2.7629165649414062, "kl": 0.591796875, "learning_rate": 1.4493788433612708e-09, "loss": -0.008076684549450874, "memory(GiB)": 25.14, "step": 488, "train_speed(iter/s)": 0.131348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/mean_length": 116.10546875, "completions/min_length": 67.0, "epoch": 9.78, "grad_norm": 2.770082950592041, "kl": 0.576171875, "learning_rate": 1.217974870087901e-09, "loss": 0.010374639183282852, "memory(GiB)": 25.14, "reward": 0.47805055975914, "reward_std": 0.023321266286075115, "rewards/MCQ_Reward/mean": 0.47805055975914, "rewards/MCQ_Reward/std": 0.1008174680173397, "step": 489, "train_speed(iter/s)": 0.131298 }, { "clip_ratio": 0.005443725967779756, "epoch": 9.8, "grad_norm": 2.5658154487609863, "kl": 0.583984375, "learning_rate": 1.0066617640578368e-09, "loss": 0.010389911010861397, "memory(GiB)": 25.14, "step": 490, "train_speed(iter/s)": 0.131523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.5, "completions/mean_length": 128.69921875, "completions/min_length": 71.5, "epoch": 9.82, "grad_norm": 2.3105576038360596, "kl": 0.90625, "learning_rate": 8.154480369321759e-10, "loss": -0.004896960221230984, "memory(GiB)": 25.14, "reward": 0.43206796050071716, "reward_std": 0.02110449317842722, "rewards/MCQ_Reward/mean": 0.43206796050071716, "rewards/MCQ_Reward/std": 0.10026764124631882, "step": 491, "train_speed(iter/s)": 0.13119 }, { "clip_ratio": 0.004017886472865939, "epoch": 9.84, "grad_norm": 2.2543957233428955, "kl": 0.892578125, "learning_rate": 6.443413907720186e-10, "loss": -0.004858216270804405, "memory(GiB)": 25.14, "step": 492, "train_speed(iter/s)": 0.131415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.5, "completions/mean_length": 131.3203125, "completions/min_length": 58.0, "epoch": 9.86, "grad_norm": 2.459817409515381, "kl": 0.5390625, "learning_rate": 4.933487177280482e-10, "loss": 0.0025399066507816315, "memory(GiB)": 25.14, "reward": 0.47691330313682556, "reward_std": 0.022764784283936024, "rewards/MCQ_Reward/mean": 0.47691330313682556, "rewards/MCQ_Reward/std": 0.09778410196304321, "step": 493, "train_speed(iter/s)": 0.131346 }, { "clip_ratio": 0.004864038084633648, "epoch": 9.88, "grad_norm": 2.518949508666992, "kl": 0.537109375, "learning_rate": 3.6247609976319817e-10, "loss": 0.0027223415672779083, "memory(GiB)": 25.14, "step": 494, "train_speed(iter/s)": 0.131569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 113.7421875, "completions/min_length": 57.5, "epoch": 9.9, "grad_norm": 2.7932207584381104, "kl": 0.640625, "learning_rate": 2.517288084074587e-10, "loss": -0.008804459124803543, "memory(GiB)": 25.14, "reward": 0.45272429287433624, "reward_std": 0.02382285613566637, "rewards/MCQ_Reward/mean": 0.45272429287433624, "rewards/MCQ_Reward/std": 0.08811983093619347, "step": 495, "train_speed(iter/s)": 0.13153 }, { "clip_ratio": 0.005316317779943347, "epoch": 9.92, "grad_norm": 2.3468141555786133, "kl": 0.634765625, "learning_rate": 1.6111130454543597e-10, "loss": -0.00884802732616663, "memory(GiB)": 25.14, "step": 496, "train_speed(iter/s)": 0.131752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/mean_length": 111.8515625, "completions/min_length": 57.5, "epoch": 9.94, "grad_norm": 2.973198413848877, "kl": 0.642578125, "learning_rate": 9.06272382371065e-11, "loss": 0.002287194598466158, "memory(GiB)": 25.14, "reward": 0.4001469016075134, "reward_std": 0.0235411636531353, "rewards/MCQ_Reward/mean": 0.4001469016075134, "rewards/MCQ_Reward/std": 0.07189228385686874, "step": 497, "train_speed(iter/s)": 0.131698 }, { "clip_ratio": 0.0034996896283701062, "epoch": 9.96, "grad_norm": 3.0021812915802, "kl": 0.6484375, "learning_rate": 4.0279448570323946e-11, "loss": 0.002919801976531744, "memory(GiB)": 25.14, "step": 498, "train_speed(iter/s)": 0.131924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 135.265625, "completions/min_length": 68.5, "epoch": 9.98, "grad_norm": 2.244234085083008, "kl": 0.55078125, "learning_rate": 1.0069963546743831e-11, "loss": -0.0014414777979254723, "memory(GiB)": 25.14, "reward": 0.46473294496536255, "reward_std": 0.02351410035043955, "rewards/MCQ_Reward/mean": 0.46473294496536255, "rewards/MCQ_Reward/std": 0.06907243467867374, "step": 499, "train_speed(iter/s)": 0.131777 }, { "clip_ratio": 0.0020644072210416198, "epoch": 10.0, "grad_norm": 2.3687548637390137, "kl": 0.55078125, "learning_rate": 0.0, "loss": -0.0014774189330637455, "memory(GiB)": 25.14, "step": 500, "train_speed(iter/s)": 0.131993 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }