Kenya_Challenge_XMed / trainer_state.json
easonqin's picture
Upload folder using huggingface_hub
84bfe79 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 935.5,
"completions/mean_length": 571.30859375,
"completions/min_length": 264.5,
"epoch": 0.02,
"grad_norm": 1.2956373691558838,
"kl": 0.0006160736083984375,
"learning_rate": 2e-07,
"loss": 0.11099594086408615,
"memory(GiB)": 18.17,
"reward": 0.18179254233837128,
"reward_std": 0.021205796860158443,
"rewards/MCQ_Reward/mean": 0.18179254233837128,
"rewards/MCQ_Reward/std": 0.0575394481420517,
"step": 1,
"train_speed(iter/s)": 0.017384
},
{
"clip_ratio": 0.0,
"epoch": 0.04,
"grad_norm": 1.2956030368804932,
"kl": 0.0006160736083984375,
"learning_rate": 4e-07,
"loss": 0.11099594086408615,
"memory(GiB)": 18.17,
"step": 2,
"train_speed(iter/s)": 0.033769
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1004.0,
"completions/mean_length": 582.2890625,
"completions/min_length": 126.5,
"epoch": 0.06,
"grad_norm": 1.1973260641098022,
"kl": 0.00061798095703125,
"learning_rate": 6e-07,
"loss": 0.09401366859674454,
"memory(GiB)": 18.17,
"reward": 0.1757229119539261,
"reward_std": 0.02308646310120821,
"rewards/MCQ_Reward/mean": 0.1757229119539261,
"rewards/MCQ_Reward/std": 0.06555243954062462,
"step": 3,
"train_speed(iter/s)": 0.029478
},
{
"clip_ratio": 0.0011098573449999094,
"epoch": 0.08,
"grad_norm": 1.206025242805481,
"kl": 0.0006008148193359375,
"learning_rate": 8e-07,
"loss": 0.09423406422138214,
"memory(GiB)": 18.17,
"step": 4,
"train_speed(iter/s)": 0.038797
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1025.0,
"completions/mean_length": 587.22265625,
"completions/min_length": 50.0,
"epoch": 0.1,
"grad_norm": 1.1425890922546387,
"kl": 0.0006389617919921875,
"learning_rate": 1e-06,
"loss": 0.10835893452167511,
"memory(GiB)": 18.17,
"reward": 0.20135290175676346,
"reward_std": 0.026336468756198883,
"rewards/MCQ_Reward/mean": 0.20135290175676346,
"rewards/MCQ_Reward/std": 0.04013596661388874,
"step": 5,
"train_speed(iter/s)": 0.033455
},
{
"clip_ratio": 0.000744842371204868,
"epoch": 0.12,
"grad_norm": 1.1426688432693481,
"kl": 0.0006389617919921875,
"learning_rate": 9.999899300364532e-07,
"loss": 0.10809706896543503,
"memory(GiB)": 18.17,
"step": 6,
"train_speed(iter/s)": 0.039768
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 986.0,
"completions/mean_length": 554.33203125,
"completions/min_length": 187.5,
"epoch": 0.14,
"grad_norm": 1.2598297595977783,
"kl": 0.000637054443359375,
"learning_rate": 9.999597205514296e-07,
"loss": 0.10747133195400238,
"memory(GiB)": 18.17,
"reward": 0.18709591031074524,
"reward_std": 0.022870728746056557,
"rewards/MCQ_Reward/mean": 0.18709591031074524,
"rewards/MCQ_Reward/std": 0.061255430802702904,
"step": 7,
"train_speed(iter/s)": 0.036272
},
{
"clip_ratio": 0.0011600544094108045,
"epoch": 0.16,
"grad_norm": 1.2500499486923218,
"kl": 0.0007114410400390625,
"learning_rate": 9.999093727617628e-07,
"loss": 0.10704316943883896,
"memory(GiB)": 18.17,
"step": 8,
"train_speed(iter/s)": 0.041177
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1011.5,
"completions/mean_length": 562.61328125,
"completions/min_length": 231.5,
"epoch": 0.18,
"grad_norm": 1.4137037992477417,
"kl": 0.00092315673828125,
"learning_rate": 9.998388886954545e-07,
"loss": 0.1194264367222786,
"memory(GiB)": 18.17,
"reward": 0.20057281106710434,
"reward_std": 0.02457202784717083,
"rewards/MCQ_Reward/mean": 0.20057281106710434,
"rewards/MCQ_Reward/std": 0.0581410713493824,
"step": 9,
"train_speed(iter/s)": 0.037627
},
{
"clip_ratio": 0.0008636733400635421,
"epoch": 0.2,
"grad_norm": 1.4122164249420166,
"kl": 0.001087188720703125,
"learning_rate": 9.997482711915925e-07,
"loss": 0.11916504055261612,
"memory(GiB)": 18.17,
"step": 10,
"train_speed(iter/s)": 0.041584
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 988.0,
"completions/mean_length": 545.3203125,
"completions/min_length": 13.0,
"epoch": 0.22,
"grad_norm": 1.1587789058685303,
"kl": 0.001285552978515625,
"learning_rate": 9.996375239002368e-07,
"loss": 0.06654135137796402,
"memory(GiB)": 18.17,
"reward": 0.18803076446056366,
"reward_std": 0.027116701006889343,
"rewards/MCQ_Reward/mean": 0.18803076446056366,
"rewards/MCQ_Reward/std": 0.06116201728582382,
"step": 11,
"train_speed(iter/s)": 0.037797
},
{
"clip_ratio": 0.0012727798894047737,
"epoch": 0.24,
"grad_norm": 1.1393318176269531,
"kl": 0.001819610595703125,
"learning_rate": 9.995066512822718e-07,
"loss": 0.0661393254995346,
"memory(GiB)": 18.17,
"step": 12,
"train_speed(iter/s)": 0.041011
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 928.0,
"completions/mean_length": 502.984375,
"completions/min_length": 181.5,
"epoch": 0.26,
"grad_norm": 1.3736039400100708,
"kl": 0.00341796875,
"learning_rate": 9.99355658609228e-07,
"loss": 0.09961968660354614,
"memory(GiB)": 18.17,
"reward": 0.2046608179807663,
"reward_std": 0.02339835651218891,
"rewards/MCQ_Reward/mean": 0.2046608179807663,
"rewards/MCQ_Reward/std": 0.07441236078739166,
"step": 13,
"train_speed(iter/s)": 0.038941
},
{
"clip_ratio": 0.0013542931410484016,
"epoch": 0.28,
"grad_norm": 1.341399073600769,
"kl": 0.004730224609375,
"learning_rate": 9.991845519630676e-07,
"loss": 0.09878668189048767,
"memory(GiB)": 18.17,
"step": 14,
"train_speed(iter/s)": 0.041763
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 939.0,
"completions/mean_length": 479.08984375,
"completions/min_length": 201.5,
"epoch": 0.3,
"grad_norm": 1.2583457231521606,
"kl": 0.005706787109375,
"learning_rate": 9.989933382359422e-07,
"loss": 0.09561844170093536,
"memory(GiB)": 18.17,
"reward": 0.23959992825984955,
"reward_std": 0.024829759262502193,
"rewards/MCQ_Reward/mean": 0.23959992825984955,
"rewards/MCQ_Reward/std": 0.059385696426033974,
"step": 15,
"train_speed(iter/s)": 0.040033
},
{
"clip_ratio": 0.0012090829550288618,
"epoch": 0.32,
"grad_norm": 1.2485970258712769,
"kl": 0.0069122314453125,
"learning_rate": 9.98782025129912e-07,
"loss": 0.09502086043357849,
"memory(GiB)": 18.17,
"step": 16,
"train_speed(iter/s)": 0.042555
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 785.0,
"completions/mean_length": 446.19140625,
"completions/min_length": 186.5,
"epoch": 0.34,
"grad_norm": 1.4837766885757446,
"kl": 0.0080718994140625,
"learning_rate": 9.985506211566386e-07,
"loss": 0.11237534880638123,
"memory(GiB)": 18.17,
"reward": 0.204755961894989,
"reward_std": 0.025960725732147694,
"rewards/MCQ_Reward/mean": 0.204755961894989,
"rewards/MCQ_Reward/std": 0.05882856249809265,
"step": 17,
"train_speed(iter/s)": 0.041421
},
{
"clip_ratio": 0.0012163713108748198,
"epoch": 0.36,
"grad_norm": 1.4663207530975342,
"kl": 0.00933837890625,
"learning_rate": 9.982991356370403e-07,
"loss": 0.11209464073181152,
"memory(GiB)": 18.17,
"step": 18,
"train_speed(iter/s)": 0.043701
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 771.0,
"completions/mean_length": 451.41015625,
"completions/min_length": 101.0,
"epoch": 0.38,
"grad_norm": 1.2070645093917847,
"kl": 0.010772705078125,
"learning_rate": 9.98027578700917e-07,
"loss": 0.0659424215555191,
"memory(GiB)": 18.17,
"reward": 0.18814751505851746,
"reward_std": 0.024471789598464966,
"rewards/MCQ_Reward/mean": 0.18814751505851746,
"rewards/MCQ_Reward/std": 0.062104713171720505,
"step": 19,
"train_speed(iter/s)": 0.042657
},
{
"clip_ratio": 0.0017630973597988486,
"epoch": 0.4,
"grad_norm": 1.1632057428359985,
"kl": 0.014007568359375,
"learning_rate": 9.977359612865422e-07,
"loss": 0.0650935024023056,
"memory(GiB)": 18.17,
"step": 20,
"train_speed(iter/s)": 0.044775
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 808.0,
"completions/mean_length": 392.30078125,
"completions/min_length": 84.0,
"epoch": 0.42,
"grad_norm": 1.313915491104126,
"kl": 0.019775390625,
"learning_rate": 9.974242951402235e-07,
"loss": 0.07705788314342499,
"memory(GiB)": 18.17,
"reward": 0.23380683362483978,
"reward_std": 0.03150738961994648,
"rewards/MCQ_Reward/mean": 0.23380683362483978,
"rewards/MCQ_Reward/std": 0.057576023042201996,
"step": 21,
"train_speed(iter/s)": 0.043224
},
{
"clip_ratio": 0.0028022455517202616,
"epoch": 0.44,
"grad_norm": 1.242121934890747,
"kl": 0.02642822265625,
"learning_rate": 9.970925928158272e-07,
"loss": 0.07613129168748856,
"memory(GiB)": 18.17,
"step": 22,
"train_speed(iter/s)": 0.045118
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 621.0,
"completions/mean_length": 355.48828125,
"completions/min_length": 144.0,
"epoch": 0.46,
"grad_norm": 1.3318829536437988,
"kl": 0.034423828125,
"learning_rate": 9.967408676742751e-07,
"loss": 0.07269842177629471,
"memory(GiB)": 18.17,
"reward": 0.22312550246715546,
"reward_std": 0.031231535598635674,
"rewards/MCQ_Reward/mean": 0.22312550246715546,
"rewards/MCQ_Reward/std": 0.05438939481973648,
"step": 23,
"train_speed(iter/s)": 0.044616
},
{
"clip_ratio": 0.0020711172837764025,
"epoch": 0.48,
"grad_norm": 1.2974779605865479,
"kl": 0.0413818359375,
"learning_rate": 9.963691338830042e-07,
"loss": 0.07173984497785568,
"memory(GiB)": 18.17,
"step": 24,
"train_speed(iter/s)": 0.046444
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 651.5,
"completions/mean_length": 318.5234375,
"completions/min_length": 92.0,
"epoch": 0.5,
"grad_norm": 1.397636890411377,
"kl": 0.047119140625,
"learning_rate": 9.959774064153975e-07,
"loss": 0.03884683549404144,
"memory(GiB)": 18.17,
"reward": 0.23498350381851196,
"reward_std": 0.03053601924329996,
"rewards/MCQ_Reward/mean": 0.23498350381851196,
"rewards/MCQ_Reward/std": 0.05711263045668602,
"step": 25,
"train_speed(iter/s)": 0.045888
},
{
"clip_ratio": 0.0013737165136262774,
"epoch": 0.52,
"grad_norm": 1.379469394683838,
"kl": 0.052734375,
"learning_rate": 9.955657010501806e-07,
"loss": 0.038122277706861496,
"memory(GiB)": 18.17,
"step": 26,
"train_speed(iter/s)": 0.047611
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 611.5,
"completions/mean_length": 293.42578125,
"completions/min_length": 110.5,
"epoch": 0.54,
"grad_norm": 1.3771414756774902,
"kl": 0.0574951171875,
"learning_rate": 9.95134034370785e-07,
"loss": 0.05064291134476662,
"memory(GiB)": 18.17,
"reward": 0.257246270775795,
"reward_std": 0.03051395993679762,
"rewards/MCQ_Reward/mean": 0.257246270775795,
"rewards/MCQ_Reward/std": 0.05405682139098644,
"step": 27,
"train_speed(iter/s)": 0.046967
},
{
"clip_ratio": 0.0015082518220879138,
"epoch": 0.56,
"grad_norm": 1.3394073247909546,
"kl": 0.063720703125,
"learning_rate": 9.946824237646824e-07,
"loss": 0.04972712695598602,
"memory(GiB)": 18.17,
"step": 28,
"train_speed(iter/s)": 0.048554
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 505.5,
"completions/mean_length": 259.3515625,
"completions/min_length": 76.0,
"epoch": 0.58,
"grad_norm": 1.4677767753601074,
"kl": 0.070556640625,
"learning_rate": 9.94210887422681e-07,
"loss": -0.01695432886481285,
"memory(GiB)": 18.17,
"reward": 0.25767549127340317,
"reward_std": 0.03901047818362713,
"rewards/MCQ_Reward/mean": 0.25767549127340317,
"rewards/MCQ_Reward/std": 0.05495491810142994,
"step": 29,
"train_speed(iter/s)": 0.048377
},
{
"clip_ratio": 0.001286374346818775,
"epoch": 0.6,
"grad_norm": 1.4747378826141357,
"kl": 0.076904296875,
"learning_rate": 9.93719444338197e-07,
"loss": -0.017460569739341736,
"memory(GiB)": 18.17,
"step": 30,
"train_speed(iter/s)": 0.04994
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 561.5,
"completions/mean_length": 250.26171875,
"completions/min_length": 96.5,
"epoch": 0.62,
"grad_norm": 1.6029585599899292,
"kl": 0.07763671875,
"learning_rate": 9.932081143064858e-07,
"loss": 0.042436983436346054,
"memory(GiB)": 18.17,
"reward": 0.23062269389629364,
"reward_std": 0.036025889217853546,
"rewards/MCQ_Reward/mean": 0.23062269389629364,
"rewards/MCQ_Reward/std": 0.0671730749309063,
"step": 31,
"train_speed(iter/s)": 0.048974
},
{
"clip_ratio": 0.00158036028733477,
"epoch": 0.64,
"grad_norm": 1.5435467958450317,
"kl": 0.08349609375,
"learning_rate": 9.926769179238464e-07,
"loss": 0.04148583859205246,
"memory(GiB)": 18.17,
"step": 32,
"train_speed(iter/s)": 0.050428
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 522.5,
"completions/mean_length": 246.3984375,
"completions/min_length": 89.0,
"epoch": 0.66,
"grad_norm": 1.466068983078003,
"kl": 0.093994140625,
"learning_rate": 9.921258765867919e-07,
"loss": 0.008220436982810497,
"memory(GiB)": 18.17,
"reward": 0.22424693405628204,
"reward_std": 0.03309958428144455,
"rewards/MCQ_Reward/mean": 0.22424693405628204,
"rewards/MCQ_Reward/std": 0.06848622299730778,
"step": 33,
"train_speed(iter/s)": 0.050299
},
{
"clip_ratio": 0.0012578482856042683,
"epoch": 0.68,
"grad_norm": 1.4434019327163696,
"kl": 0.10009765625,
"learning_rate": 9.915550124911866e-07,
"loss": 0.007482614368200302,
"memory(GiB)": 18.17,
"step": 34,
"train_speed(iter/s)": 0.051722
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 520.0,
"completions/mean_length": 226.12109375,
"completions/min_length": 47.5,
"epoch": 0.7,
"grad_norm": 1.529449224472046,
"kl": 0.10546875,
"learning_rate": 9.909643486313533e-07,
"loss": -0.024700753390789032,
"memory(GiB)": 18.17,
"reward": 0.24431276321411133,
"reward_std": 0.03709370456635952,
"rewards/MCQ_Reward/mean": 0.24431276321411133,
"rewards/MCQ_Reward/std": 0.06565525010228157,
"step": 35,
"train_speed(iter/s)": 0.051572
},
{
"clip_ratio": 0.0013001365587115288,
"epoch": 0.72,
"grad_norm": 1.524826169013977,
"kl": 0.110595703125,
"learning_rate": 9.903539087991461e-07,
"loss": -0.025061530992388725,
"memory(GiB)": 18.17,
"step": 36,
"train_speed(iter/s)": 0.052951
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 461.0,
"completions/mean_length": 206.1328125,
"completions/min_length": 63.0,
"epoch": 0.74,
"grad_norm": 1.5648741722106934,
"kl": 0.11474609375,
"learning_rate": 9.897237175829926e-07,
"loss": -0.010986058972775936,
"memory(GiB)": 18.17,
"reward": 0.26653096079826355,
"reward_std": 0.03736630827188492,
"rewards/MCQ_Reward/mean": 0.26653096079826355,
"rewards/MCQ_Reward/std": 0.065978042781353,
"step": 37,
"train_speed(iter/s)": 0.052793
},
{
"clip_ratio": 0.0015517690917477012,
"epoch": 0.76,
"grad_norm": 1.5597436428070068,
"kl": 0.122802734375,
"learning_rate": 9.890738003669027e-07,
"loss": -0.011755033396184444,
"memory(GiB)": 18.17,
"step": 38,
"train_speed(iter/s)": 0.054118
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.5,
"completions/mean_length": 203.34375,
"completions/min_length": 35.5,
"epoch": 0.78,
"grad_norm": 1.6045058965682983,
"kl": 0.125244140625,
"learning_rate": 9.884041833294475e-07,
"loss": -0.04164643585681915,
"memory(GiB)": 18.17,
"reward": 0.2605663910508156,
"reward_std": 0.03675983473658562,
"rewards/MCQ_Reward/mean": 0.2605663910508156,
"rewards/MCQ_Reward/std": 0.06591521203517914,
"step": 39,
"train_speed(iter/s)": 0.054082
},
{
"clip_ratio": 0.0013205534196458757,
"epoch": 0.8,
"grad_norm": 1.608991265296936,
"kl": 0.1337890625,
"learning_rate": 9.877148934427035e-07,
"loss": -0.042494483292102814,
"memory(GiB)": 18.17,
"step": 40,
"train_speed(iter/s)": 0.055369
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 385.0,
"completions/mean_length": 189.2734375,
"completions/min_length": 60.5,
"epoch": 0.82,
"grad_norm": 1.8442962169647217,
"kl": 0.14208984375,
"learning_rate": 9.870059584711668e-07,
"loss": -0.07683762162923813,
"memory(GiB)": 18.17,
"reward": 0.26815178990364075,
"reward_std": 0.04410684481263161,
"rewards/MCQ_Reward/mean": 0.26815178990364075,
"rewards/MCQ_Reward/std": 0.06000189855694771,
"step": 41,
"train_speed(iter/s)": 0.055022
},
{
"clip_ratio": 0.0013334141112864017,
"epoch": 0.84,
"grad_norm": 1.8422967195510864,
"kl": 0.14599609375,
"learning_rate": 9.862774069706345e-07,
"loss": -0.0775442123413086,
"memory(GiB)": 18.17,
"step": 42,
"train_speed(iter/s)": 0.056271
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.5,
"completions/mean_length": 185.765625,
"completions/min_length": 65.5,
"epoch": 0.86,
"grad_norm": 1.7880198955535889,
"kl": 0.14453125,
"learning_rate": 9.85529268287055e-07,
"loss": 0.009722323156893253,
"memory(GiB)": 18.17,
"reward": 0.26024360954761505,
"reward_std": 0.04201339744031429,
"rewards/MCQ_Reward/mean": 0.26024360954761505,
"rewards/MCQ_Reward/std": 0.0699400007724762,
"step": 43,
"train_speed(iter/s)": 0.056122
},
{
"clip_ratio": 0.0013897960307076573,
"epoch": 0.88,
"grad_norm": 1.7613471746444702,
"kl": 0.14599609375,
"learning_rate": 9.847615725553455e-07,
"loss": 0.008702307008206844,
"memory(GiB)": 18.17,
"step": 44,
"train_speed(iter/s)": 0.057328
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 329.5,
"completions/mean_length": 180.44921875,
"completions/min_length": 71.5,
"epoch": 0.9,
"grad_norm": 1.8986045122146606,
"kl": 0.16357421875,
"learning_rate": 9.83974350698178e-07,
"loss": -0.01265439111739397,
"memory(GiB)": 18.17,
"reward": 0.24561913311481476,
"reward_std": 0.041749605908989906,
"rewards/MCQ_Reward/mean": 0.24561913311481476,
"rewards/MCQ_Reward/std": 0.0692291297018528,
"step": 45,
"train_speed(iter/s)": 0.057564
},
{
"clip_ratio": 0.0017767796525731683,
"epoch": 0.92,
"grad_norm": 1.8627526760101318,
"kl": 0.1669921875,
"learning_rate": 9.831676344247342e-07,
"loss": -0.013573069125413895,
"memory(GiB)": 18.17,
"step": 46,
"train_speed(iter/s)": 0.058753
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 360.5,
"completions/mean_length": 181.046875,
"completions/min_length": 58.0,
"epoch": 0.94,
"grad_norm": 1.8329010009765625,
"kl": 0.1689453125,
"learning_rate": 9.82341456229428e-07,
"loss": -0.009910675697028637,
"memory(GiB)": 18.17,
"reward": 0.2712182253599167,
"reward_std": 0.03875480592250824,
"rewards/MCQ_Reward/mean": 0.2712182253599167,
"rewards/MCQ_Reward/std": 0.05874207057058811,
"step": 47,
"train_speed(iter/s)": 0.05881
},
{
"clip_ratio": 0.0020254994742572308,
"epoch": 0.96,
"grad_norm": 1.7636630535125732,
"kl": 0.17529296875,
"learning_rate": 9.814958493905962e-07,
"loss": -0.011010742746293545,
"memory(GiB)": 18.17,
"step": 48,
"train_speed(iter/s)": 0.05997
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 373.0,
"completions/mean_length": 198.5,
"completions/min_length": 83.0,
"epoch": 0.98,
"grad_norm": 1.9754475355148315,
"kl": 0.15625,
"learning_rate": 9.806308479691594e-07,
"loss": 0.026388226076960564,
"memory(GiB)": 18.17,
"reward": 0.2969816029071808,
"reward_std": 0.033485451713204384,
"rewards/MCQ_Reward/mean": 0.2969816029071808,
"rewards/MCQ_Reward/std": 0.06154371425509453,
"step": 49,
"train_speed(iter/s)": 0.059869
},
{
"clip_ratio": 0.002143923775292933,
"epoch": 1.0,
"grad_norm": 1.9168144464492798,
"kl": 0.16455078125,
"learning_rate": 9.797464868072486e-07,
"loss": 0.025302505120635033,
"memory(GiB)": 18.17,
"step": 50,
"train_speed(iter/s)": 0.060949
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 349.5,
"completions/mean_length": 175.86328125,
"completions/min_length": 67.5,
"epoch": 1.02,
"grad_norm": 1.949724793434143,
"kl": 0.18359375,
"learning_rate": 9.788428015268026e-07,
"loss": 0.016914475709199905,
"memory(GiB)": 18.17,
"reward": 0.28643812239170074,
"reward_std": 0.038882166147232056,
"rewards/MCQ_Reward/mean": 0.28643812239170074,
"rewards/MCQ_Reward/std": 0.05762592889368534,
"step": 51,
"train_speed(iter/s)": 0.06051
},
{
"clip_ratio": 0.0030939964344725013,
"epoch": 1.04,
"grad_norm": 1.873901128768921,
"kl": 0.1962890625,
"learning_rate": 9.779198285281326e-07,
"loss": 0.015664130449295044,
"memory(GiB)": 18.17,
"step": 52,
"train_speed(iter/s)": 0.061602
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 301.0,
"completions/mean_length": 173.26953125,
"completions/min_length": 50.5,
"epoch": 1.06,
"grad_norm": 1.748197317123413,
"kl": 0.20361328125,
"learning_rate": 9.769776049884563e-07,
"loss": -0.012495264410972595,
"memory(GiB)": 18.17,
"reward": 0.2694673240184784,
"reward_std": 0.03306659869849682,
"rewards/MCQ_Reward/mean": 0.2694673240184784,
"rewards/MCQ_Reward/std": 0.06984242424368858,
"step": 53,
"train_speed(iter/s)": 0.061749
},
{
"clip_ratio": 0.003254209994338453,
"epoch": 1.08,
"grad_norm": 1.7254936695098877,
"kl": 0.22021484375,
"learning_rate": 9.760161688604007e-07,
"loss": -0.012979630380868912,
"memory(GiB)": 18.17,
"step": 54,
"train_speed(iter/s)": 0.062813
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 323.0,
"completions/mean_length": 164.23046875,
"completions/min_length": 74.0,
"epoch": 1.1,
"grad_norm": 1.8942813873291016,
"kl": 0.21044921875,
"learning_rate": 9.750355588704727e-07,
"loss": -0.009442738257348537,
"memory(GiB)": 18.17,
"reward": 0.29137177765369415,
"reward_std": 0.03919493593275547,
"rewards/MCQ_Reward/mean": 0.29137177765369415,
"rewards/MCQ_Reward/std": 0.055357255041599274,
"step": 55,
"train_speed(iter/s)": 0.062825
},
{
"clip_ratio": 0.0029244048055261374,
"epoch": 1.12,
"grad_norm": 1.8403282165527344,
"kl": 0.2255859375,
"learning_rate": 9.740358145174997e-07,
"loss": -0.010412258096039295,
"memory(GiB)": 18.17,
"step": 56,
"train_speed(iter/s)": 0.063885
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 291.5,
"completions/mean_length": 159.5703125,
"completions/min_length": 68.5,
"epoch": 1.1400000000000001,
"grad_norm": 1.9502640962600708,
"kl": 0.24072265625,
"learning_rate": 9.730169760710385e-07,
"loss": -0.01350313052535057,
"memory(GiB)": 18.17,
"reward": 0.3086051344871521,
"reward_std": 0.036856647580862045,
"rewards/MCQ_Reward/mean": 0.3086051344871521,
"rewards/MCQ_Reward/std": 0.05716245248913765,
"step": 57,
"train_speed(iter/s)": 0.064059
},
{
"clip_ratio": 0.0026392132276669145,
"epoch": 1.16,
"grad_norm": 1.8639681339263916,
"kl": 0.244140625,
"learning_rate": 9.719790845697532e-07,
"loss": -0.014377694576978683,
"memory(GiB)": 18.17,
"step": 58,
"train_speed(iter/s)": 0.065093
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.0,
"completions/mean_length": 133.83984375,
"completions/min_length": 52.5,
"epoch": 1.18,
"grad_norm": 2.159579038619995,
"kl": 0.2607421875,
"learning_rate": 9.709221818197623e-07,
"loss": -0.03235793486237526,
"memory(GiB)": 18.17,
"reward": 0.3192738890647888,
"reward_std": 0.03647255524992943,
"rewards/MCQ_Reward/mean": 0.3192738890647888,
"rewards/MCQ_Reward/std": 0.04580973833799362,
"step": 59,
"train_speed(iter/s)": 0.065376
},
{
"clip_ratio": 0.0033569036750122905,
"epoch": 1.2,
"grad_norm": 2.0858945846557617,
"kl": 0.2685546875,
"learning_rate": 9.698463103929541e-07,
"loss": -0.03384597226977348,
"memory(GiB)": 18.17,
"step": 60,
"train_speed(iter/s)": 0.066397
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 275.5,
"completions/mean_length": 152.6640625,
"completions/min_length": 54.0,
"epoch": 1.22,
"grad_norm": 1.9752745628356934,
"kl": 0.2509765625,
"learning_rate": 9.68751513625273e-07,
"loss": -0.012610888108611107,
"memory(GiB)": 18.17,
"reward": 0.30408790707588196,
"reward_std": 0.03896576911211014,
"rewards/MCQ_Reward/mean": 0.30408790707588196,
"rewards/MCQ_Reward/std": 0.059865519404411316,
"step": 61,
"train_speed(iter/s)": 0.066047
},
{
"clip_ratio": 0.0028306948952376842,
"epoch": 1.24,
"grad_norm": 1.8911457061767578,
"kl": 0.2509765625,
"learning_rate": 9.676378356149732e-07,
"loss": -0.014004014432430267,
"memory(GiB)": 18.17,
"step": 62,
"train_speed(iter/s)": 0.067044
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 275.5,
"completions/mean_length": 147.6953125,
"completions/min_length": 69.0,
"epoch": 1.26,
"grad_norm": 2.153862953186035,
"kl": 0.265625,
"learning_rate": 9.665053212208426e-07,
"loss": -0.027626825496554375,
"memory(GiB)": 18.17,
"reward": 0.31602054834365845,
"reward_std": 0.03946657292544842,
"rewards/MCQ_Reward/mean": 0.31602054834365845,
"rewards/MCQ_Reward/std": 0.06625748611986637,
"step": 63,
"train_speed(iter/s)": 0.067162
},
{
"clip_ratio": 0.004200217663310468,
"epoch": 1.28,
"grad_norm": 2.027595281600952,
"kl": 0.2626953125,
"learning_rate": 9.653540160603955e-07,
"loss": -0.028667613863945007,
"memory(GiB)": 18.17,
"step": 64,
"train_speed(iter/s)": 0.06814
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 300.5,
"completions/mean_length": 153.3828125,
"completions/min_length": 42.0,
"epoch": 1.3,
"grad_norm": 2.058096170425415,
"kl": 0.26318359375,
"learning_rate": 9.641839665080363e-07,
"loss": 0.019130591303110123,
"memory(GiB)": 18.17,
"reward": 0.3058909475803375,
"reward_std": 0.03743278048932552,
"rewards/MCQ_Reward/mean": 0.3058909475803375,
"rewards/MCQ_Reward/std": 0.06633425317704678,
"step": 65,
"train_speed(iter/s)": 0.068294
},
{
"clip_ratio": 0.0030368451261892915,
"epoch": 1.32,
"grad_norm": 2.0810675621032715,
"kl": 0.26708984375,
"learning_rate": 9.6299521969319e-07,
"loss": 0.01858600787818432,
"memory(GiB)": 18.17,
"step": 66,
"train_speed(iter/s)": 0.069245
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 310.5,
"completions/mean_length": 170.65625,
"completions/min_length": 70.0,
"epoch": 1.34,
"grad_norm": 1.9177082777023315,
"kl": 0.25390625,
"learning_rate": 9.617878234984054e-07,
"loss": 0.013776745647192001,
"memory(GiB)": 18.17,
"reward": 0.32124653458595276,
"reward_std": 0.03586815297603607,
"rewards/MCQ_Reward/mean": 0.32124653458595276,
"rewards/MCQ_Reward/std": 0.05279739946126938,
"step": 67,
"train_speed(iter/s)": 0.069258
},
{
"clip_ratio": 0.003581640077754855,
"epoch": 1.3599999999999999,
"grad_norm": 1.800355076789856,
"kl": 0.271484375,
"learning_rate": 9.60561826557425e-07,
"loss": 0.01218567043542862,
"memory(GiB)": 18.17,
"step": 68,
"train_speed(iter/s)": 0.070198
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 320.5,
"completions/mean_length": 165.45703125,
"completions/min_length": 84.5,
"epoch": 1.38,
"grad_norm": 1.9321861267089844,
"kl": 0.2734375,
"learning_rate": 9.593172782532267e-07,
"loss": -0.06093820929527283,
"memory(GiB)": 18.17,
"reward": 0.33785562217235565,
"reward_std": 0.03626340813934803,
"rewards/MCQ_Reward/mean": 0.33785562217235565,
"rewards/MCQ_Reward/std": 0.04918426461517811,
"step": 69,
"train_speed(iter/s)": 0.070079
},
{
"clip_ratio": 0.002684593666344881,
"epoch": 1.4,
"grad_norm": 1.9250681400299072,
"kl": 0.2822265625,
"learning_rate": 9.580542287160346e-07,
"loss": -0.06187870353460312,
"memory(GiB)": 18.17,
"step": 70,
"train_speed(iter/s)": 0.071007
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.5,
"completions/mean_length": 167.71875,
"completions/min_length": 60.0,
"epoch": 1.42,
"grad_norm": 1.9310671091079712,
"kl": 0.26953125,
"learning_rate": 9.567727288213004e-07,
"loss": -0.03052324429154396,
"memory(GiB)": 18.17,
"reward": 0.3391506224870682,
"reward_std": 0.037205325439572334,
"rewards/MCQ_Reward/mean": 0.3391506224870682,
"rewards/MCQ_Reward/std": 0.06270403787493706,
"step": 71,
"train_speed(iter/s)": 0.070595
},
{
"clip_ratio": 0.004182511591352522,
"epoch": 1.44,
"grad_norm": 1.808637261390686,
"kl": 0.26953125,
"learning_rate": 9.554728301876524e-07,
"loss": -0.031438540667295456,
"memory(GiB)": 18.17,
"step": 72,
"train_speed(iter/s)": 0.071499
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 330.0,
"completions/mean_length": 171.5859375,
"completions/min_length": 73.5,
"epoch": 1.46,
"grad_norm": 2.1356284618377686,
"kl": 0.2666015625,
"learning_rate": 9.541545851748185e-07,
"loss": 0.06165466085076332,
"memory(GiB)": 18.17,
"reward": 0.3267658054828644,
"reward_std": 0.03793729655444622,
"rewards/MCQ_Reward/mean": 0.3267658054828644,
"rewards/MCQ_Reward/std": 0.06866181083023548,
"step": 73,
"train_speed(iter/s)": 0.071359
},
{
"clip_ratio": 0.0023740422911942005,
"epoch": 1.48,
"grad_norm": 2.081942319869995,
"kl": 0.2724609375,
"learning_rate": 9.528180468815154e-07,
"loss": 0.06085401773452759,
"memory(GiB)": 18.17,
"step": 74,
"train_speed(iter/s)": 0.072254
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 388.0,
"completions/mean_length": 176.4140625,
"completions/min_length": 60.0,
"epoch": 1.5,
"grad_norm": 1.819736361503601,
"kl": 0.291015625,
"learning_rate": 9.514632691433106e-07,
"loss": 0.041995078325271606,
"memory(GiB)": 18.17,
"reward": 0.34543414413928986,
"reward_std": 0.03658975474536419,
"rewards/MCQ_Reward/mean": 0.34543414413928986,
"rewards/MCQ_Reward/std": 0.0643342137336731,
"step": 75,
"train_speed(iter/s)": 0.072103
},
{
"clip_ratio": 0.0024005533196032047,
"epoch": 1.52,
"grad_norm": 1.7825483083724976,
"kl": 0.302734375,
"learning_rate": 9.500903065304539e-07,
"loss": 0.04098404943943024,
"memory(GiB)": 18.17,
"step": 76,
"train_speed(iter/s)": 0.072975
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 324.0,
"completions/mean_length": 179.35546875,
"completions/min_length": 71.5,
"epoch": 1.54,
"grad_norm": 1.83073091506958,
"kl": 0.2919921875,
"learning_rate": 9.486992143456791e-07,
"loss": 0.026145532727241516,
"memory(GiB)": 18.17,
"reward": 0.33697785437107086,
"reward_std": 0.033385418355464935,
"rewards/MCQ_Reward/mean": 0.33697785437107086,
"rewards/MCQ_Reward/std": 0.06162330321967602,
"step": 77,
"train_speed(iter/s)": 0.072818
},
{
"clip_ratio": 0.0029612210346385837,
"epoch": 1.56,
"grad_norm": 1.7568435668945312,
"kl": 0.3046875,
"learning_rate": 9.472900486219768e-07,
"loss": 0.02535586804151535,
"memory(GiB)": 18.17,
"step": 78,
"train_speed(iter/s)": 0.07364
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 297.0,
"completions/mean_length": 181.63671875,
"completions/min_length": 86.0,
"epoch": 1.58,
"grad_norm": 1.763022541999817,
"kl": 0.296875,
"learning_rate": 9.458628661203366e-07,
"loss": -0.016155043616890907,
"memory(GiB)": 18.17,
"reward": 0.3397578001022339,
"reward_std": 0.030555096454918385,
"rewards/MCQ_Reward/mean": 0.3397578001022339,
"rewards/MCQ_Reward/std": 0.0736413523554802,
"step": 79,
"train_speed(iter/s)": 0.073639
},
{
"clip_ratio": 0.003752505173906684,
"epoch": 1.6,
"grad_norm": 1.75266695022583,
"kl": 0.314453125,
"learning_rate": 9.444177243274617e-07,
"loss": -0.016932127997279167,
"memory(GiB)": 18.17,
"step": 80,
"train_speed(iter/s)": 0.074482
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 316.0,
"completions/mean_length": 173.53515625,
"completions/min_length": 82.5,
"epoch": 1.62,
"grad_norm": 1.813202142715454,
"kl": 0.3193359375,
"learning_rate": 9.429546814534528e-07,
"loss": 0.014175940304994583,
"memory(GiB)": 18.17,
"reward": 0.35451021790504456,
"reward_std": 0.0316955391317606,
"rewards/MCQ_Reward/mean": 0.35451021790504456,
"rewards/MCQ_Reward/std": 0.058956997469067574,
"step": 81,
"train_speed(iter/s)": 0.073923
},
{
"clip_ratio": 0.003929685335606337,
"epoch": 1.6400000000000001,
"grad_norm": 1.7315208911895752,
"kl": 0.337890625,
"learning_rate": 9.414737964294634e-07,
"loss": 0.013125661760568619,
"memory(GiB)": 18.17,
"step": 82,
"train_speed(iter/s)": 0.074757
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 265.5,
"completions/mean_length": 159.95703125,
"completions/min_length": 68.5,
"epoch": 1.6600000000000001,
"grad_norm": 1.86507248878479,
"kl": 0.333984375,
"learning_rate": 9.399751289053266e-07,
"loss": 0.0190749391913414,
"memory(GiB)": 18.17,
"reward": 0.32107532024383545,
"reward_std": 0.03531700000166893,
"rewards/MCQ_Reward/mean": 0.32107532024383545,
"rewards/MCQ_Reward/std": 0.06730588898062706,
"step": 83,
"train_speed(iter/s)": 0.074766
},
{
"clip_ratio": 0.005602485965937376,
"epoch": 1.6800000000000002,
"grad_norm": 1.8452680110931396,
"kl": 0.3515625,
"learning_rate": 9.384587392471514e-07,
"loss": 0.018391648307442665,
"memory(GiB)": 18.17,
"step": 84,
"train_speed(iter/s)": 0.075562
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 274.5,
"completions/mean_length": 146.36328125,
"completions/min_length": 51.5,
"epoch": 1.7,
"grad_norm": 2.060523271560669,
"kl": 0.3564453125,
"learning_rate": 9.369246885348925e-07,
"loss": 0.00966290757060051,
"memory(GiB)": 18.17,
"reward": 0.34230072796344757,
"reward_std": 0.03451686259359121,
"rewards/MCQ_Reward/mean": 0.34230072796344757,
"rewards/MCQ_Reward/std": 0.07506715506315231,
"step": 85,
"train_speed(iter/s)": 0.075608
},
{
"clip_ratio": 0.0025914940051734447,
"epoch": 1.72,
"grad_norm": 2.089233875274658,
"kl": 0.357421875,
"learning_rate": 9.353730385598886e-07,
"loss": 0.008917246013879776,
"memory(GiB)": 18.17,
"step": 86,
"train_speed(iter/s)": 0.076403
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 251.0,
"completions/mean_length": 149.41796875,
"completions/min_length": 72.0,
"epoch": 1.74,
"grad_norm": 2.100825071334839,
"kl": 0.3642578125,
"learning_rate": 9.338038518223745e-07,
"loss": 0.0011688023805618286,
"memory(GiB)": 18.17,
"reward": 0.29714760184288025,
"reward_std": 0.03046888206154108,
"rewards/MCQ_Reward/mean": 0.29714760184288025,
"rewards/MCQ_Reward/std": 0.0724717304110527,
"step": 87,
"train_speed(iter/s)": 0.076468
},
{
"clip_ratio": 0.0029116831719875336,
"epoch": 1.76,
"grad_norm": 2.091975688934326,
"kl": 0.3740234375,
"learning_rate": 9.322171915289633e-07,
"loss": 0.0007365690544247627,
"memory(GiB)": 18.17,
"step": 88,
"train_speed(iter/s)": 0.077267
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 243.0,
"completions/mean_length": 149.0546875,
"completions/min_length": 74.5,
"epoch": 1.78,
"grad_norm": 2.0660133361816406,
"kl": 0.5546875,
"learning_rate": 9.306131215901003e-07,
"loss": -0.002558637410402298,
"memory(GiB)": 18.17,
"reward": 0.3453996330499649,
"reward_std": 0.030298423022031784,
"rewards/MCQ_Reward/mean": 0.3453996330499649,
"rewards/MCQ_Reward/std": 0.05576108209788799,
"step": 89,
"train_speed(iter/s)": 0.07741
},
{
"clip_ratio": 0.0030759836081415415,
"epoch": 1.8,
"grad_norm": 1.9661788940429688,
"kl": 0.5439453125,
"learning_rate": 9.289917066174885e-07,
"loss": -0.003219339996576309,
"memory(GiB)": 18.17,
"step": 90,
"train_speed(iter/s)": 0.078204
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 279.0,
"completions/mean_length": 137.28125,
"completions/min_length": 57.0,
"epoch": 1.8199999999999998,
"grad_norm": 2.1432077884674072,
"kl": 0.4169921875,
"learning_rate": 9.273530119214867e-07,
"loss": -0.019994597882032394,
"memory(GiB)": 18.17,
"reward": 0.3450734615325928,
"reward_std": 0.03698188066482544,
"rewards/MCQ_Reward/mean": 0.3450734615325928,
"rewards/MCQ_Reward/std": 0.06834666058421135,
"step": 91,
"train_speed(iter/s)": 0.077823
},
{
"clip_ratio": 0.006807451136410236,
"epoch": 1.8399999999999999,
"grad_norm": 2.026726484298706,
"kl": 0.4423828125,
"learning_rate": 9.256971035084784e-07,
"loss": -0.02127775177359581,
"memory(GiB)": 18.17,
"step": 92,
"train_speed(iter/s)": 0.078595
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 258.0,
"completions/mean_length": 144.11328125,
"completions/min_length": 62.5,
"epoch": 1.8599999999999999,
"grad_norm": 2.5080695152282715,
"kl": 0.44140625,
"learning_rate": 9.240240480782129e-07,
"loss": 0.038984864950180054,
"memory(GiB)": 18.17,
"reward": 0.34395235776901245,
"reward_std": 0.030767593532800674,
"rewards/MCQ_Reward/mean": 0.34395235776901245,
"rewards/MCQ_Reward/std": 0.08772432059049606,
"step": 93,
"train_speed(iter/s)": 0.07864
},
{
"clip_ratio": 0.0038948373403400183,
"epoch": 1.88,
"grad_norm": 2.293992042541504,
"kl": 0.466796875,
"learning_rate": 9.223339130211192e-07,
"loss": 0.03854737430810928,
"memory(GiB)": 18.17,
"step": 94,
"train_speed(iter/s)": 0.0794
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 288.0,
"completions/mean_length": 144.3671875,
"completions/min_length": 66.5,
"epoch": 1.9,
"grad_norm": 2.3717093467712402,
"kl": 0.4423828125,
"learning_rate": 9.206267664155906e-07,
"loss": 0.02822975069284439,
"memory(GiB)": 18.17,
"reward": 0.35692907869815826,
"reward_std": 0.033766910433769226,
"rewards/MCQ_Reward/mean": 0.35692907869815826,
"rewards/MCQ_Reward/std": 0.055017637088894844,
"step": 95,
"train_speed(iter/s)": 0.079264
},
{
"clip_ratio": 0.01540788309648633,
"epoch": 1.92,
"grad_norm": 2.8082501888275146,
"kl": 0.4873046875,
"learning_rate": 9.189026770252436e-07,
"loss": 0.027400558814406395,
"memory(GiB)": 18.17,
"step": 96,
"train_speed(iter/s)": 0.080015
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 216.5,
"completions/mean_length": 131.2265625,
"completions/min_length": 64.0,
"epoch": 1.94,
"grad_norm": 2.578866481781006,
"kl": 0.458984375,
"learning_rate": 9.171617142961476e-07,
"loss": -0.028647061437368393,
"memory(GiB)": 18.17,
"reward": 0.35198159515857697,
"reward_std": 0.036471933126449585,
"rewards/MCQ_Reward/mean": 0.35198159515857697,
"rewards/MCQ_Reward/std": 0.09679177403450012,
"step": 97,
"train_speed(iter/s)": 0.080136
},
{
"clip_ratio": 0.007482210174202919,
"epoch": 1.96,
"grad_norm": 2.6245126724243164,
"kl": 0.455078125,
"learning_rate": 9.154039483540272e-07,
"loss": -0.02990054339170456,
"memory(GiB)": 18.17,
"step": 98,
"train_speed(iter/s)": 0.080877
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 254.5,
"completions/mean_length": 140.546875,
"completions/min_length": 70.0,
"epoch": 1.98,
"grad_norm": 2.0212841033935547,
"kl": 0.4462890625,
"learning_rate": 9.136294500014385e-07,
"loss": 0.007645269390195608,
"memory(GiB)": 18.17,
"reward": 0.3687240034341812,
"reward_std": 0.0377286896109581,
"rewards/MCQ_Reward/mean": 0.3687240034341812,
"rewards/MCQ_Reward/std": 0.09235312044620514,
"step": 99,
"train_speed(iter/s)": 0.080838
},
{
"clip_ratio": 0.004757207585498691,
"epoch": 2.0,
"grad_norm": 1.9354287385940552,
"kl": 0.4638671875,
"learning_rate": 9.118382907149163e-07,
"loss": 0.006971254944801331,
"memory(GiB)": 18.17,
"step": 100,
"train_speed(iter/s)": 0.08155
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 252.5,
"completions/mean_length": 123.4140625,
"completions/min_length": 54.0,
"epoch": 2.02,
"grad_norm": 2.3176586627960205,
"kl": 0.4755859375,
"learning_rate": 9.100305426420956e-07,
"loss": -0.016116395592689514,
"memory(GiB)": 18.17,
"reward": 0.38898809254169464,
"reward_std": 0.038034453988075256,
"rewards/MCQ_Reward/mean": 0.38898809254169464,
"rewards/MCQ_Reward/std": 0.07776015624403954,
"step": 101,
"train_speed(iter/s)": 0.081234
},
{
"clip_ratio": 0.004006300354376435,
"epoch": 2.04,
"grad_norm": 2.1871023178100586,
"kl": 0.4931640625,
"learning_rate": 9.082062785988048e-07,
"loss": -0.01703297346830368,
"memory(GiB)": 18.17,
"step": 102,
"train_speed(iter/s)": 0.081962
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/mean_length": 113.1484375,
"completions/min_length": 56.5,
"epoch": 2.06,
"grad_norm": 2.5120768547058105,
"kl": 0.517578125,
"learning_rate": 9.06365572066134e-07,
"loss": -0.027387384325265884,
"memory(GiB)": 18.17,
"reward": 0.357058048248291,
"reward_std": 0.031020362861454487,
"rewards/MCQ_Reward/mean": 0.357058048248291,
"rewards/MCQ_Reward/std": 0.06582547165453434,
"step": 103,
"train_speed(iter/s)": 0.082061
},
{
"clip_ratio": 0.014288442209362984,
"epoch": 2.08,
"grad_norm": 3.2106845378875732,
"kl": 0.5009765625,
"learning_rate": 9.045084971874737e-07,
"loss": -0.02823379635810852,
"memory(GiB)": 18.17,
"step": 104,
"train_speed(iter/s)": 0.082761
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 211.0,
"completions/mean_length": 126.953125,
"completions/min_length": 70.0,
"epoch": 2.1,
"grad_norm": 2.2478950023651123,
"kl": 0.48828125,
"learning_rate": 9.026351287655293e-07,
"loss": 0.02888938970863819,
"memory(GiB)": 18.17,
"reward": 0.3573220670223236,
"reward_std": 0.03388269431889057,
"rewards/MCQ_Reward/mean": 0.3573220670223236,
"rewards/MCQ_Reward/std": 0.08621830865740776,
"step": 105,
"train_speed(iter/s)": 0.082851
},
{
"clip_ratio": 0.005271225702017546,
"epoch": 2.12,
"grad_norm": 2.07523250579834,
"kl": 0.513671875,
"learning_rate": 9.007455422593075e-07,
"loss": 0.028001034632325172,
"memory(GiB)": 18.17,
"step": 106,
"train_speed(iter/s)": 0.083561
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.0,
"completions/mean_length": 143.50390625,
"completions/min_length": 62.5,
"epoch": 2.14,
"grad_norm": 2.149932861328125,
"kl": 0.474609375,
"learning_rate": 8.988398137810776e-07,
"loss": -0.0027789073064923286,
"memory(GiB)": 18.17,
"reward": 0.37795157730579376,
"reward_std": 0.03415030054748058,
"rewards/MCQ_Reward/mean": 0.37795157730579376,
"rewards/MCQ_Reward/std": 0.07794364914298058,
"step": 107,
"train_speed(iter/s)": 0.083617
},
{
"clip_ratio": 0.008057619212195277,
"epoch": 2.16,
"grad_norm": 2.7377026081085205,
"kl": 0.5078125,
"learning_rate": 8.969180200933047e-07,
"loss": -0.003491489216685295,
"memory(GiB)": 18.17,
"step": 108,
"train_speed(iter/s)": 0.084274
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 226.5,
"completions/mean_length": 133.1875,
"completions/min_length": 58.5,
"epoch": 2.18,
"grad_norm": 2.826488494873047,
"kl": 0.5390625,
"learning_rate": 8.94980238605558e-07,
"loss": 0.02833351120352745,
"memory(GiB)": 18.17,
"reward": 0.39782722294330597,
"reward_std": 0.031135279685258865,
"rewards/MCQ_Reward/mean": 0.39782722294330597,
"rewards/MCQ_Reward/std": 0.07045348361134529,
"step": 109,
"train_speed(iter/s)": 0.084336
},
{
"clip_ratio": 0.00684792990796268,
"epoch": 2.2,
"grad_norm": 2.434086322784424,
"kl": 0.5703125,
"learning_rate": 8.930265473713937e-07,
"loss": 0.027658611536026,
"memory(GiB)": 18.17,
"step": 110,
"train_speed(iter/s)": 0.085034
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 216.0,
"completions/mean_length": 131.703125,
"completions/min_length": 67.0,
"epoch": 2.22,
"grad_norm": 2.134516716003418,
"kl": 0.48828125,
"learning_rate": 8.910570250852096e-07,
"loss": 0.006394753232598305,
"memory(GiB)": 18.17,
"reward": 0.3707956522703171,
"reward_std": 0.03248129412531853,
"rewards/MCQ_Reward/mean": 0.3707956522703171,
"rewards/MCQ_Reward/std": 0.10541465878486633,
"step": 111,
"train_speed(iter/s)": 0.084685
},
{
"clip_ratio": 0.00865771621465683,
"epoch": 2.24,
"grad_norm": 2.2900125980377197,
"kl": 0.513671875,
"learning_rate": 8.890717510790762e-07,
"loss": 0.00539240799844265,
"memory(GiB)": 18.17,
"step": 112,
"train_speed(iter/s)": 0.085353
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 264.5,
"completions/mean_length": 126.9140625,
"completions/min_length": 62.0,
"epoch": 2.26,
"grad_norm": 2.6178812980651855,
"kl": 0.546875,
"learning_rate": 8.870708053195413e-07,
"loss": 0.019267559051513672,
"memory(GiB)": 18.17,
"reward": 0.3922416865825653,
"reward_std": 0.03025819268077612,
"rewards/MCQ_Reward/mean": 0.3922416865825653,
"rewards/MCQ_Reward/std": 0.08424495533108711,
"step": 113,
"train_speed(iter/s)": 0.085338
},
{
"clip_ratio": 0.006454117828980088,
"epoch": 2.2800000000000002,
"grad_norm": 2.1509737968444824,
"kl": 0.57421875,
"learning_rate": 8.850542684044078e-07,
"loss": 0.01820582151412964,
"memory(GiB)": 18.17,
"step": 114,
"train_speed(iter/s)": 0.085985
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.5,
"completions/mean_length": 118.85546875,
"completions/min_length": 59.5,
"epoch": 2.3,
"grad_norm": 2.528681755065918,
"kl": 0.525390625,
"learning_rate": 8.83022221559489e-07,
"loss": 0.008160990662872791,
"memory(GiB)": 18.17,
"reward": 0.404242143034935,
"reward_std": 0.03400178253650665,
"rewards/MCQ_Reward/mean": 0.404242143034935,
"rewards/MCQ_Reward/std": 0.09943690523505211,
"step": 115,
"train_speed(iter/s)": 0.086069
},
{
"clip_ratio": 0.005366077646613121,
"epoch": 2.32,
"grad_norm": 2.1966934204101562,
"kl": 0.546875,
"learning_rate": 8.809747466353355e-07,
"loss": 0.007157166488468647,
"memory(GiB)": 18.17,
"step": 116,
"train_speed(iter/s)": 0.086734
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 239.0,
"completions/mean_length": 125.3359375,
"completions/min_length": 59.5,
"epoch": 2.34,
"grad_norm": 2.4033124446868896,
"kl": 0.537109375,
"learning_rate": 8.789119261039384e-07,
"loss": 0.017890973016619682,
"memory(GiB)": 18.17,
"reward": 0.36347851157188416,
"reward_std": 0.027591521851718426,
"rewards/MCQ_Reward/mean": 0.36347851157188416,
"rewards/MCQ_Reward/std": 0.09114562720060349,
"step": 117,
"train_speed(iter/s)": 0.086687
},
{
"clip_ratio": 0.011405623517930508,
"epoch": 2.36,
"grad_norm": 2.8501975536346436,
"kl": 0.587890625,
"learning_rate": 8.768338430554082e-07,
"loss": 0.016866052523255348,
"memory(GiB)": 18.17,
"step": 118,
"train_speed(iter/s)": 0.08735
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.0,
"completions/mean_length": 122.23046875,
"completions/min_length": 65.0,
"epoch": 2.38,
"grad_norm": 2.5570151805877686,
"kl": 0.5126953125,
"learning_rate": 8.74740581194627e-07,
"loss": -0.011926580220460892,
"memory(GiB)": 18.17,
"reward": 0.40480077266693115,
"reward_std": 0.03289741463959217,
"rewards/MCQ_Reward/mean": 0.40480077266693115,
"rewards/MCQ_Reward/std": 0.08261778578162193,
"step": 119,
"train_speed(iter/s)": 0.087419
},
{
"clip_ratio": 0.007963848765939474,
"epoch": 2.4,
"grad_norm": 2.1802773475646973,
"kl": 0.5009765625,
"learning_rate": 8.726322248378774e-07,
"loss": -0.0127539848908782,
"memory(GiB)": 18.17,
"step": 120,
"train_speed(iter/s)": 0.088053
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.0,
"completions/mean_length": 130.2421875,
"completions/min_length": 60.5,
"epoch": 2.42,
"grad_norm": 2.4936065673828125,
"kl": 0.537109375,
"learning_rate": 8.705088589094458e-07,
"loss": 0.008000252768397331,
"memory(GiB)": 18.17,
"reward": 0.36072438955307007,
"reward_std": 0.030319811776280403,
"rewards/MCQ_Reward/mean": 0.36072438955307007,
"rewards/MCQ_Reward/std": 0.1019350104033947,
"step": 121,
"train_speed(iter/s)": 0.08768
},
{
"clip_ratio": 0.006943409331142902,
"epoch": 2.44,
"grad_norm": 2.4447567462921143,
"kl": 0.544921875,
"learning_rate": 8.683705689382024e-07,
"loss": 0.0072016119956970215,
"memory(GiB)": 18.17,
"step": 122,
"train_speed(iter/s)": 0.088326
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 191.5,
"completions/mean_length": 112.40234375,
"completions/min_length": 53.0,
"epoch": 2.46,
"grad_norm": 2.279759168624878,
"kl": 0.55859375,
"learning_rate": 8.662174410541554e-07,
"loss": 0.00623547937721014,
"memory(GiB)": 18.17,
"reward": 0.3670702576637268,
"reward_std": 0.02890967670828104,
"rewards/MCQ_Reward/mean": 0.3670702576637268,
"rewards/MCQ_Reward/std": 0.0740283839404583,
"step": 123,
"train_speed(iter/s)": 0.088484
},
{
"clip_ratio": 0.007923177909106016,
"epoch": 2.48,
"grad_norm": 2.789609909057617,
"kl": 0.587890625,
"learning_rate": 8.64049561984982e-07,
"loss": 0.005373558960855007,
"memory(GiB)": 18.17,
"step": 124,
"train_speed(iter/s)": 0.089133
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.5,
"completions/mean_length": 124.91796875,
"completions/min_length": 73.0,
"epoch": 2.5,
"grad_norm": 2.2765557765960693,
"kl": 0.498046875,
"learning_rate": 8.61867019052535e-07,
"loss": -0.0031618811190128326,
"memory(GiB)": 18.17,
"reward": 0.3880574107170105,
"reward_std": 0.02767461072653532,
"rewards/MCQ_Reward/mean": 0.3880574107170105,
"rewards/MCQ_Reward/std": 0.11312882974743843,
"step": 125,
"train_speed(iter/s)": 0.089217
},
{
"clip_ratio": 0.006887951632961631,
"epoch": 2.52,
"grad_norm": 2.2742230892181396,
"kl": 0.509765625,
"learning_rate": 8.596699001693255e-07,
"loss": -0.004048643633723259,
"memory(GiB)": 18.17,
"step": 126,
"train_speed(iter/s)": 0.089838
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 202.5,
"completions/mean_length": 117.484375,
"completions/min_length": 56.5,
"epoch": 2.54,
"grad_norm": 2.340428113937378,
"kl": 0.546875,
"learning_rate": 8.574582938349817e-07,
"loss": -0.009344515390694141,
"memory(GiB)": 18.17,
"reward": 0.38609637320041656,
"reward_std": 0.033216655254364014,
"rewards/MCQ_Reward/mean": 0.38609637320041656,
"rewards/MCQ_Reward/std": 0.09242032468318939,
"step": 127,
"train_speed(iter/s)": 0.089914
},
{
"clip_ratio": 0.007429210003465414,
"epoch": 2.56,
"grad_norm": 2.3134751319885254,
"kl": 0.57421875,
"learning_rate": 8.552322891326844e-07,
"loss": -0.010545218363404274,
"memory(GiB)": 18.17,
"step": 128,
"train_speed(iter/s)": 0.090544
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 238.0,
"completions/mean_length": 119.9765625,
"completions/min_length": 57.0,
"epoch": 2.58,
"grad_norm": 2.265873670578003,
"kl": 0.4931640625,
"learning_rate": 8.529919757255781e-07,
"loss": -0.007635302376002073,
"memory(GiB)": 18.17,
"reward": 0.41428878903388977,
"reward_std": 0.028425303287804127,
"rewards/MCQ_Reward/mean": 0.41428878903388977,
"rewards/MCQ_Reward/std": 0.07786687836050987,
"step": 129,
"train_speed(iter/s)": 0.09048
},
{
"clip_ratio": 0.006183756981045008,
"epoch": 2.6,
"grad_norm": 2.283554792404175,
"kl": 0.498046875,
"learning_rate": 8.507374438531606e-07,
"loss": -0.008446864783763885,
"memory(GiB)": 18.17,
"step": 130,
"train_speed(iter/s)": 0.091107
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 196.0,
"completions/mean_length": 119.125,
"completions/min_length": 59.0,
"epoch": 2.62,
"grad_norm": 2.8296353816986084,
"kl": 0.525390625,
"learning_rate": 8.484687843276468e-07,
"loss": 0.003696079831570387,
"memory(GiB)": 18.17,
"reward": 0.40898391604423523,
"reward_std": 0.02961808815598488,
"rewards/MCQ_Reward/mean": 0.40898391604423523,
"rewards/MCQ_Reward/std": 0.09117832407355309,
"step": 131,
"train_speed(iter/s)": 0.09081
},
{
"clip_ratio": 0.010138689540326595,
"epoch": 2.64,
"grad_norm": 2.565761089324951,
"kl": 0.53515625,
"learning_rate": 8.461860885303113e-07,
"loss": 0.003048412501811981,
"memory(GiB)": 18.17,
"step": 132,
"train_speed(iter/s)": 0.091425
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 209.0,
"completions/mean_length": 129.40234375,
"completions/min_length": 70.0,
"epoch": 2.66,
"grad_norm": 2.344294786453247,
"kl": 0.513671875,
"learning_rate": 8.438894484078085e-07,
"loss": 0.005981519352644682,
"memory(GiB)": 18.17,
"reward": 0.40958625078201294,
"reward_std": 0.027244774624705315,
"rewards/MCQ_Reward/mean": 0.40958625078201294,
"rewards/MCQ_Reward/std": 0.07108591124415398,
"step": 133,
"train_speed(iter/s)": 0.091506
},
{
"clip_ratio": 0.006955728633329272,
"epoch": 2.68,
"grad_norm": 2.667799949645996,
"kl": 0.50390625,
"learning_rate": 8.415789564684673e-07,
"loss": 0.0052396636456251144,
"memory(GiB)": 18.17,
"step": 134,
"train_speed(iter/s)": 0.092113
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 193.0,
"completions/mean_length": 132.30859375,
"completions/min_length": 79.0,
"epoch": 2.7,
"grad_norm": 2.6722846031188965,
"kl": 0.5029296875,
"learning_rate": 8.392547057785661e-07,
"loss": 0.0176947470754385,
"memory(GiB)": 18.17,
"reward": 0.39249348640441895,
"reward_std": 0.024370728991925716,
"rewards/MCQ_Reward/mean": 0.39249348640441895,
"rewards/MCQ_Reward/std": 0.10880232974886894,
"step": 135,
"train_speed(iter/s)": 0.092158
},
{
"clip_ratio": 0.009976111352443695,
"epoch": 2.7199999999999998,
"grad_norm": 2.80319881439209,
"kl": 0.548828125,
"learning_rate": 8.369167899585839e-07,
"loss": 0.01698880083858967,
"memory(GiB)": 18.17,
"step": 136,
"train_speed(iter/s)": 0.092755
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 186.0,
"completions/mean_length": 117.91015625,
"completions/min_length": 53.5,
"epoch": 2.74,
"grad_norm": 2.5274980068206787,
"kl": 0.5087890625,
"learning_rate": 8.34565303179429e-07,
"loss": -0.004888280760496855,
"memory(GiB)": 18.17,
"reward": 0.3668254613876343,
"reward_std": 0.02390660159289837,
"rewards/MCQ_Reward/mean": 0.3668254613876343,
"rewards/MCQ_Reward/std": 0.06858384422957897,
"step": 137,
"train_speed(iter/s)": 0.092788
},
{
"clip_ratio": 0.00792233063839376,
"epoch": 2.76,
"grad_norm": 2.6973214149475098,
"kl": 0.513671875,
"learning_rate": 8.322003401586461e-07,
"loss": -0.0054510245099663734,
"memory(GiB)": 18.17,
"step": 138,
"train_speed(iter/s)": 0.093386
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.5,
"completions/mean_length": 128.76953125,
"completions/min_length": 74.0,
"epoch": 2.7800000000000002,
"grad_norm": 2.22070574760437,
"kl": 0.4912109375,
"learning_rate": 8.298219961566008e-07,
"loss": -0.001897591631859541,
"memory(GiB)": 18.17,
"reward": 0.3943639397621155,
"reward_std": 0.021683918312191963,
"rewards/MCQ_Reward/mean": 0.3943639397621155,
"rewards/MCQ_Reward/std": 0.08081439509987831,
"step": 139,
"train_speed(iter/s)": 0.093426
},
{
"clip_ratio": 0.005092586623504758,
"epoch": 2.8,
"grad_norm": 2.3254384994506836,
"kl": 0.5009765625,
"learning_rate": 8.274303669726426e-07,
"loss": -0.0023171789944171906,
"memory(GiB)": 18.17,
"step": 140,
"train_speed(iter/s)": 0.094018
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.0,
"completions/mean_length": 131.94140625,
"completions/min_length": 76.0,
"epoch": 2.82,
"grad_norm": 2.8199474811553955,
"kl": 0.513671875,
"learning_rate": 8.250255489412462e-07,
"loss": 0.03072257712483406,
"memory(GiB)": 18.17,
"reward": 0.4145784378051758,
"reward_std": 0.026746340095996857,
"rewards/MCQ_Reward/mean": 0.4145784378051758,
"rewards/MCQ_Reward/std": 0.1253884807229042,
"step": 141,
"train_speed(iter/s)": 0.093563
},
{
"clip_ratio": 0.01698949094861746,
"epoch": 2.84,
"grad_norm": 3.6371665000915527,
"kl": 0.5654296875,
"learning_rate": 8.226076389281314e-07,
"loss": 0.030751001089811325,
"memory(GiB)": 18.17,
"step": 142,
"train_speed(iter/s)": 0.094156
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 222.0,
"completions/mean_length": 122.05859375,
"completions/min_length": 41.0,
"epoch": 2.86,
"grad_norm": 3.697355031967163,
"kl": 0.529296875,
"learning_rate": 8.201767343263611e-07,
"loss": 0.001254035159945488,
"memory(GiB)": 18.17,
"reward": 0.4235128164291382,
"reward_std": 0.02945070993155241,
"rewards/MCQ_Reward/mean": 0.4235128164291382,
"rewards/MCQ_Reward/std": 0.0826257448643446,
"step": 143,
"train_speed(iter/s)": 0.094158
},
{
"clip_ratio": 0.010704205837100744,
"epoch": 2.88,
"grad_norm": 2.6047918796539307,
"kl": 0.556640625,
"learning_rate": 8.177329330524181e-07,
"loss": 0.0003689592704176903,
"memory(GiB)": 18.17,
"step": 144,
"train_speed(iter/s)": 0.09474
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 314.5,
"completions/mean_length": 147.65234375,
"completions/min_length": 84.0,
"epoch": 2.9,
"grad_norm": 2.0444202423095703,
"kl": 0.4521484375,
"learning_rate": 8.152763335422612e-07,
"loss": 0.009064443409442902,
"memory(GiB)": 18.17,
"reward": 0.38259103894233704,
"reward_std": 0.023838728666305542,
"rewards/MCQ_Reward/mean": 0.38259103894233704,
"rewards/MCQ_Reward/std": 0.0847747940570116,
"step": 145,
"train_speed(iter/s)": 0.09459
},
{
"clip_ratio": 0.013846603687852621,
"epoch": 2.92,
"grad_norm": 3.0148403644561768,
"kl": 0.47265625,
"learning_rate": 8.128070347473608e-07,
"loss": 0.008937995880842209,
"memory(GiB)": 18.17,
"step": 146,
"train_speed(iter/s)": 0.095167
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.0,
"completions/mean_length": 131.4609375,
"completions/min_length": 58.5,
"epoch": 2.94,
"grad_norm": 2.3035802841186523,
"kl": 0.515625,
"learning_rate": 8.103251361307118e-07,
"loss": -0.003920593298971653,
"memory(GiB)": 18.17,
"reward": 0.46591490507125854,
"reward_std": 0.02803555503487587,
"rewards/MCQ_Reward/mean": 0.46591490507125854,
"rewards/MCQ_Reward/std": 0.08151933178305626,
"step": 147,
"train_speed(iter/s)": 0.095144
},
{
"clip_ratio": 0.008604592643678188,
"epoch": 2.96,
"grad_norm": 3.269644021987915,
"kl": 0.498046875,
"learning_rate": 8.07830737662829e-07,
"loss": -0.004623805172741413,
"memory(GiB)": 18.17,
"step": 148,
"train_speed(iter/s)": 0.095712
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 229.0,
"completions/mean_length": 115.5859375,
"completions/min_length": 47.5,
"epoch": 2.98,
"grad_norm": 2.762554883956909,
"kl": 0.55859375,
"learning_rate": 8.053239398177191e-07,
"loss": -0.002270375844091177,
"memory(GiB)": 18.17,
"reward": 0.40475866198539734,
"reward_std": 0.02323055360466242,
"rewards/MCQ_Reward/mean": 0.40475866198539734,
"rewards/MCQ_Reward/std": 0.11423858627676964,
"step": 149,
"train_speed(iter/s)": 0.095646
},
{
"clip_ratio": 0.005962205119431019,
"epoch": 3.0,
"grad_norm": 2.495875358581543,
"kl": 0.5625,
"learning_rate": 8.028048435688333e-07,
"loss": -0.0031687067821621895,
"memory(GiB)": 18.17,
"step": 150,
"train_speed(iter/s)": 0.0962
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 167.5,
"completions/mean_length": 117.21484375,
"completions/min_length": 58.0,
"epoch": 3.02,
"grad_norm": 3.30179762840271,
"kl": 0.572265625,
"learning_rate": 8.002735503850015e-07,
"loss": -0.0032917922362685204,
"memory(GiB)": 18.17,
"reward": 0.39226125180721283,
"reward_std": 0.025511370040476322,
"rewards/MCQ_Reward/mean": 0.39226125180721283,
"rewards/MCQ_Reward/std": 0.08468513377010822,
"step": 151,
"train_speed(iter/s)": 0.095897
},
{
"clip_ratio": 0.007298078387975693,
"epoch": 3.04,
"grad_norm": 2.3152873516082764,
"kl": 0.56640625,
"learning_rate": 7.97730162226344e-07,
"loss": -0.004036391619592905,
"memory(GiB)": 18.17,
"step": 152,
"train_speed(iter/s)": 0.096461
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 200.0,
"completions/mean_length": 121.8984375,
"completions/min_length": 63.5,
"epoch": 3.06,
"grad_norm": 2.2318758964538574,
"kl": 0.51171875,
"learning_rate": 7.951747815401649e-07,
"loss": 0.008308425545692444,
"memory(GiB)": 18.17,
"reward": 0.425733745098114,
"reward_std": 0.02289827074855566,
"rewards/MCQ_Reward/mean": 0.425733745098114,
"rewards/MCQ_Reward/std": 0.12863966077566147,
"step": 153,
"train_speed(iter/s)": 0.096546
},
{
"clip_ratio": 0.009599440731108189,
"epoch": 3.08,
"grad_norm": 3.2350826263427734,
"kl": 0.5009765625,
"learning_rate": 7.926075112568258e-07,
"loss": 0.00774328364059329,
"memory(GiB)": 18.17,
"step": 154,
"train_speed(iter/s)": 0.0971
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.5,
"completions/mean_length": 129.765625,
"completions/min_length": 63.5,
"epoch": 3.1,
"grad_norm": 2.8958089351654053,
"kl": 0.5146484375,
"learning_rate": 7.900284547855991e-07,
"loss": 0.005472003482282162,
"memory(GiB)": 18.17,
"reward": 0.3814770430326462,
"reward_std": 0.021100854501128197,
"rewards/MCQ_Reward/mean": 0.3814770430326462,
"rewards/MCQ_Reward/std": 0.08354593068361282,
"step": 155,
"train_speed(iter/s)": 0.096733
},
{
"clip_ratio": 0.008797692600637674,
"epoch": 3.12,
"grad_norm": 2.330720901489258,
"kl": 0.5107421875,
"learning_rate": 7.874377160105036e-07,
"loss": 0.00483354227617383,
"memory(GiB)": 18.17,
"step": 156,
"train_speed(iter/s)": 0.097282
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 208.0,
"completions/mean_length": 123.1640625,
"completions/min_length": 68.0,
"epoch": 3.14,
"grad_norm": 2.1395411491394043,
"kl": 0.515625,
"learning_rate": 7.848353992861194e-07,
"loss": 0.009709931910037994,
"memory(GiB)": 18.17,
"reward": 0.4426523745059967,
"reward_std": 0.024569914676249027,
"rewards/MCQ_Reward/mean": 0.4426523745059967,
"rewards/MCQ_Reward/std": 0.10452848672866821,
"step": 157,
"train_speed(iter/s)": 0.097277
},
{
"clip_ratio": 0.008177514653652906,
"epoch": 3.16,
"grad_norm": 2.8377902507781982,
"kl": 0.49609375,
"learning_rate": 7.822216094333847e-07,
"loss": 0.00888834334909916,
"memory(GiB)": 18.17,
"step": 158,
"train_speed(iter/s)": 0.097824
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 192.5,
"completions/mean_length": 121.08203125,
"completions/min_length": 59.0,
"epoch": 3.18,
"grad_norm": 2.439819574356079,
"kl": 0.5009765625,
"learning_rate": 7.795964517353733e-07,
"loss": -0.005721232853829861,
"memory(GiB)": 18.17,
"reward": 0.4260745346546173,
"reward_std": 0.024243751540780067,
"rewards/MCQ_Reward/mean": 0.4260745346546173,
"rewards/MCQ_Reward/std": 0.08284034207463264,
"step": 159,
"train_speed(iter/s)": 0.09781
},
{
"clip_ratio": 0.006790396990254521,
"epoch": 3.2,
"grad_norm": 1.9817484617233276,
"kl": 0.4970703125,
"learning_rate": 7.769600319330552e-07,
"loss": -0.006797813344746828,
"memory(GiB)": 18.17,
"step": 160,
"train_speed(iter/s)": 0.098355
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 200.5,
"completions/mean_length": 112.234375,
"completions/min_length": 54.0,
"epoch": 3.22,
"grad_norm": 2.4277918338775635,
"kl": 0.60546875,
"learning_rate": 7.743124562210351e-07,
"loss": 0.011250641196966171,
"memory(GiB)": 18.17,
"reward": 0.4286917597055435,
"reward_std": 0.023968255147337914,
"rewards/MCQ_Reward/mean": 0.4286917597055435,
"rewards/MCQ_Reward/std": 0.08755803853273392,
"step": 161,
"train_speed(iter/s)": 0.097905
},
{
"clip_ratio": 0.008228898979723454,
"epoch": 3.24,
"grad_norm": 2.4396235942840576,
"kl": 0.63671875,
"learning_rate": 7.716538312432765e-07,
"loss": 0.009992354549467564,
"memory(GiB)": 18.17,
"step": 162,
"train_speed(iter/s)": 0.098438
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 182.0,
"completions/mean_length": 128.484375,
"completions/min_length": 65.5,
"epoch": 3.26,
"grad_norm": 2.378303289413452,
"kl": 0.4560546875,
"learning_rate": 7.689842640888063e-07,
"loss": 0.014578643254935741,
"memory(GiB)": 18.17,
"reward": 0.4368235617876053,
"reward_std": 0.024292019195854664,
"rewards/MCQ_Reward/mean": 0.4368235617876053,
"rewards/MCQ_Reward/std": 0.10128979757428169,
"step": 163,
"train_speed(iter/s)": 0.098485
},
{
"clip_ratio": 0.006144619081169367,
"epoch": 3.2800000000000002,
"grad_norm": 2.336179733276367,
"kl": 0.455078125,
"learning_rate": 7.663038622873999e-07,
"loss": 0.014264167286455631,
"memory(GiB)": 18.17,
"step": 164,
"train_speed(iter/s)": 0.09902
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 190.0,
"completions/mean_length": 127.5546875,
"completions/min_length": 68.0,
"epoch": 3.3,
"grad_norm": 2.3888978958129883,
"kl": 0.51953125,
"learning_rate": 7.636127338052511e-07,
"loss": 0.0008876635693013668,
"memory(GiB)": 18.17,
"reward": 0.3655773550271988,
"reward_std": 0.023151511326432228,
"rewards/MCQ_Reward/mean": 0.3655773550271988,
"rewards/MCQ_Reward/std": 0.08209535107016563,
"step": 165,
"train_speed(iter/s)": 0.099067
},
{
"clip_ratio": 0.009708862751722336,
"epoch": 3.32,
"grad_norm": 2.849376678466797,
"kl": 0.53515625,
"learning_rate": 7.60910987040623e-07,
"loss": 0.0005215085111558437,
"memory(GiB)": 18.17,
"step": 166,
"train_speed(iter/s)": 0.099591
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 183.0,
"completions/mean_length": 114.09375,
"completions/min_length": 68.5,
"epoch": 3.34,
"grad_norm": 2.3568837642669678,
"kl": 0.568359375,
"learning_rate": 7.581987308194809e-07,
"loss": 0.009412365034222603,
"memory(GiB)": 18.17,
"reward": 0.38831935822963715,
"reward_std": 0.024401471950113773,
"rewards/MCQ_Reward/mean": 0.38831935822963715,
"rewards/MCQ_Reward/std": 0.07682501710951328,
"step": 167,
"train_speed(iter/s)": 0.099643
},
{
"clip_ratio": 0.009874043520539999,
"epoch": 3.36,
"grad_norm": 4.141200542449951,
"kl": 0.548828125,
"learning_rate": 7.554760743911103e-07,
"loss": 0.008638818748295307,
"memory(GiB)": 18.17,
"step": 168,
"train_speed(iter/s)": 0.100139
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 177.5,
"completions/mean_length": 116.015625,
"completions/min_length": 68.0,
"epoch": 3.38,
"grad_norm": 2.3995447158813477,
"kl": 0.5390625,
"learning_rate": 7.527431274237149e-07,
"loss": 0.009148918092250824,
"memory(GiB)": 18.17,
"reward": 0.43169474601745605,
"reward_std": 0.023636899888515472,
"rewards/MCQ_Reward/mean": 0.43169474601745605,
"rewards/MCQ_Reward/std": 0.08781928941607475,
"step": 169,
"train_speed(iter/s)": 0.100207
},
{
"clip_ratio": 0.011634313501417637,
"epoch": 3.4,
"grad_norm": 3.3103132247924805,
"kl": 0.580078125,
"learning_rate": 7.5e-07,
"loss": 0.008654891513288021,
"memory(GiB)": 18.17,
"step": 170,
"train_speed(iter/s)": 0.100725
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 198.5,
"completions/mean_length": 116.5859375,
"completions/min_length": 61.0,
"epoch": 3.42,
"grad_norm": 2.4376144409179688,
"kl": 0.51171875,
"learning_rate": 7.472468026127384e-07,
"loss": 0.0037187309935688972,
"memory(GiB)": 18.17,
"reward": 0.4193449318408966,
"reward_std": 0.024272997863590717,
"rewards/MCQ_Reward/mean": 0.4193449318408966,
"rewards/MCQ_Reward/std": 0.08024471625685692,
"step": 171,
"train_speed(iter/s)": 0.100337
},
{
"clip_ratio": 0.004286584910005331,
"epoch": 3.44,
"grad_norm": 2.298527479171753,
"kl": 0.501953125,
"learning_rate": 7.444836461603194e-07,
"loss": 0.0035052020102739334,
"memory(GiB)": 18.17,
"step": 172,
"train_speed(iter/s)": 0.10083
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 192.0,
"completions/mean_length": 107.78515625,
"completions/min_length": 54.0,
"epoch": 3.46,
"grad_norm": 2.706815004348755,
"kl": 0.572265625,
"learning_rate": 7.417106419422818e-07,
"loss": 0.001836567185819149,
"memory(GiB)": 18.17,
"reward": 0.4373796284198761,
"reward_std": 0.024632513523101807,
"rewards/MCQ_Reward/mean": 0.4373796284198761,
"rewards/MCQ_Reward/std": 0.10328296199440956,
"step": 173,
"train_speed(iter/s)": 0.100842
},
{
"clip_ratio": 0.00837572431191802,
"epoch": 3.48,
"grad_norm": 2.7765517234802246,
"kl": 0.55859375,
"learning_rate": 7.389279016548316e-07,
"loss": 0.0008762972429394722,
"memory(GiB)": 18.17,
"step": 174,
"train_speed(iter/s)": 0.10133
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 235.5,
"completions/mean_length": 141.59375,
"completions/min_length": 93.0,
"epoch": 3.5,
"grad_norm": 2.0208756923675537,
"kl": 0.494140625,
"learning_rate": 7.361355373863413e-07,
"loss": -0.0017252122052013874,
"memory(GiB)": 18.17,
"reward": 0.4430805742740631,
"reward_std": 0.023134860210120678,
"rewards/MCQ_Reward/mean": 0.4430805742740631,
"rewards/MCQ_Reward/std": 0.10230642557144165,
"step": 175,
"train_speed(iter/s)": 0.101269
},
{
"clip_ratio": 0.008417821954935789,
"epoch": 3.52,
"grad_norm": 2.5541892051696777,
"kl": 0.498046875,
"learning_rate": 7.333336616128369e-07,
"loss": -0.0020766020752489567,
"memory(GiB)": 18.17,
"step": 176,
"train_speed(iter/s)": 0.101776
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.5,
"completions/mean_length": 140.26953125,
"completions/min_length": 61.5,
"epoch": 3.54,
"grad_norm": 2.090574264526367,
"kl": 0.455078125,
"learning_rate": 7.305223871934656e-07,
"loss": -0.004062575753778219,
"memory(GiB)": 18.17,
"reward": 0.4077337831258774,
"reward_std": 0.021388554014265537,
"rewards/MCQ_Reward/mean": 0.4077337831258774,
"rewards/MCQ_Reward/std": 0.1092216707766056,
"step": 177,
"train_speed(iter/s)": 0.101717
},
{
"clip_ratio": 0.009097482077777386,
"epoch": 3.56,
"grad_norm": 2.031277894973755,
"kl": 0.4638671875,
"learning_rate": 7.277018273659516e-07,
"loss": -0.005147318355739117,
"memory(GiB)": 18.17,
"step": 178,
"train_speed(iter/s)": 0.102192
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.5,
"completions/mean_length": 103.77734375,
"completions/min_length": 56.0,
"epoch": 3.58,
"grad_norm": 2.28383731842041,
"kl": 0.55078125,
"learning_rate": 7.248720957420329e-07,
"loss": 0.0054731229320168495,
"memory(GiB)": 18.17,
"reward": 0.37708504498004913,
"reward_std": 0.022474835626780987,
"rewards/MCQ_Reward/mean": 0.37708504498004913,
"rewards/MCQ_Reward/std": 0.10817139223217964,
"step": 179,
"train_speed(iter/s)": 0.102207
},
{
"clip_ratio": 0.005004609236493707,
"epoch": 3.6,
"grad_norm": 2.2720046043395996,
"kl": 0.552734375,
"learning_rate": 7.220333063028871e-07,
"loss": 0.004853987134993076,
"memory(GiB)": 18.17,
"step": 180,
"train_speed(iter/s)": 0.10258
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 267.0,
"completions/mean_length": 135.375,
"completions/min_length": 64.5,
"epoch": 3.62,
"grad_norm": 2.0278213024139404,
"kl": 0.537109375,
"learning_rate": 7.191855733945386e-07,
"loss": 0.007204895373433828,
"memory(GiB)": 18.17,
"reward": 0.37996095418930054,
"reward_std": 0.024972867220640182,
"rewards/MCQ_Reward/mean": 0.37996095418930054,
"rewards/MCQ_Reward/std": 0.06211347132921219,
"step": 181,
"train_speed(iter/s)": 0.102022
},
{
"clip_ratio": 0.0050066676922142506,
"epoch": 3.64,
"grad_norm": 2.026421308517456,
"kl": 0.54296875,
"learning_rate": 7.163290117232541e-07,
"loss": 0.006550833582878113,
"memory(GiB)": 18.17,
"step": 182,
"train_speed(iter/s)": 0.102515
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 231.0,
"completions/mean_length": 132.80078125,
"completions/min_length": 70.0,
"epoch": 3.66,
"grad_norm": 2.322474479675293,
"kl": 0.4560546875,
"learning_rate": 7.134637363509209e-07,
"loss": 0.00408747885376215,
"memory(GiB)": 18.17,
"reward": 0.42590010166168213,
"reward_std": 0.02117757499217987,
"rewards/MCQ_Reward/mean": 0.42590010166168213,
"rewards/MCQ_Reward/std": 0.10450495779514313,
"step": 183,
"train_speed(iter/s)": 0.102439
},
{
"clip_ratio": 0.005717001855373383,
"epoch": 3.68,
"grad_norm": 2.0725347995758057,
"kl": 0.4501953125,
"learning_rate": 7.105898626904134e-07,
"loss": 0.003590245731174946,
"memory(GiB)": 18.17,
"step": 184,
"train_speed(iter/s)": 0.10291
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 185.0,
"completions/mean_length": 107.73828125,
"completions/min_length": 67.5,
"epoch": 3.7,
"grad_norm": 2.94624662399292,
"kl": 0.578125,
"learning_rate": 7.077075065009433e-07,
"loss": -0.0015533820260316133,
"memory(GiB)": 18.17,
"reward": 0.4082287549972534,
"reward_std": 0.023994137533009052,
"rewards/MCQ_Reward/mean": 0.4082287549972534,
"rewards/MCQ_Reward/std": 0.09996674209833145,
"step": 185,
"train_speed(iter/s)": 0.102951
},
{
"clip_ratio": 0.006125608924776316,
"epoch": 3.7199999999999998,
"grad_norm": 2.3971669673919678,
"kl": 0.572265625,
"learning_rate": 7.048167838833976e-07,
"loss": -0.0021633533760905266,
"memory(GiB)": 18.17,
"step": 186,
"train_speed(iter/s)": 0.103425
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 227.5,
"completions/mean_length": 131.453125,
"completions/min_length": 59.0,
"epoch": 3.74,
"grad_norm": 2.0767407417297363,
"kl": 0.513671875,
"learning_rate": 7.019178112756625e-07,
"loss": 0.005040531512349844,
"memory(GiB)": 18.17,
"reward": 0.43931877613067627,
"reward_std": 0.02542781364172697,
"rewards/MCQ_Reward/mean": 0.43931877613067627,
"rewards/MCQ_Reward/std": 0.0755577739328146,
"step": 187,
"train_speed(iter/s)": 0.103367
},
{
"clip_ratio": 0.007456609280779958,
"epoch": 3.76,
"grad_norm": 2.0555458068847656,
"kl": 0.513671875,
"learning_rate": 6.990107054479312e-07,
"loss": 0.004873338155448437,
"memory(GiB)": 18.17,
"step": 188,
"train_speed(iter/s)": 0.103852
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 187.0,
"completions/mean_length": 120.015625,
"completions/min_length": 56.0,
"epoch": 3.7800000000000002,
"grad_norm": 2.1511483192443848,
"kl": 0.546875,
"learning_rate": 6.960955834980027e-07,
"loss": -0.007258214056491852,
"memory(GiB)": 18.17,
"reward": 0.3652060180902481,
"reward_std": 0.023877170868217945,
"rewards/MCQ_Reward/mean": 0.3652060180902481,
"rewards/MCQ_Reward/std": 0.09329301491379738,
"step": 189,
"train_speed(iter/s)": 0.103851
},
{
"clip_ratio": 0.006274498999118805,
"epoch": 3.8,
"grad_norm": 2.204212188720703,
"kl": 0.5546875,
"learning_rate": 6.931725628465642e-07,
"loss": -0.0077828834764659405,
"memory(GiB)": 18.17,
"step": 190,
"train_speed(iter/s)": 0.104325
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 179.5,
"completions/mean_length": 119.02734375,
"completions/min_length": 68.0,
"epoch": 3.82,
"grad_norm": 2.489328384399414,
"kl": 0.5625,
"learning_rate": 6.902417612324615e-07,
"loss": -0.004156440030783415,
"memory(GiB)": 18.17,
"reward": 0.41069237887859344,
"reward_std": 0.02522939257323742,
"rewards/MCQ_Reward/mean": 0.41069237887859344,
"rewards/MCQ_Reward/std": 0.10438777878880501,
"step": 191,
"train_speed(iter/s)": 0.103961
},
{
"clip_ratio": 0.006902764085680246,
"epoch": 3.84,
"grad_norm": 2.573939085006714,
"kl": 0.53125,
"learning_rate": 6.87303296707956e-07,
"loss": -0.004263042006641626,
"memory(GiB)": 18.17,
"step": 192,
"train_speed(iter/s)": 0.104434
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/mean_length": 119.2109375,
"completions/min_length": 63.5,
"epoch": 3.86,
"grad_norm": 2.4605846405029297,
"kl": 0.537109375,
"learning_rate": 6.843572876339704e-07,
"loss": -0.006107931490987539,
"memory(GiB)": 18.17,
"reward": 0.41506680846214294,
"reward_std": 0.025901762768626213,
"rewards/MCQ_Reward/mean": 0.41506680846214294,
"rewards/MCQ_Reward/std": 0.11812347918748856,
"step": 193,
"train_speed(iter/s)": 0.104435
},
{
"clip_ratio": 0.006947604939341545,
"epoch": 3.88,
"grad_norm": 2.9201459884643555,
"kl": 0.533203125,
"learning_rate": 6.814038526753204e-07,
"loss": -0.006667410954833031,
"memory(GiB)": 18.17,
"step": 194,
"train_speed(iter/s)": 0.104911
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 204.0,
"completions/mean_length": 123.375,
"completions/min_length": 58.5,
"epoch": 3.9,
"grad_norm": 2.481006145477295,
"kl": 0.638671875,
"learning_rate": 6.784431107959358e-07,
"loss": -0.00256272591650486,
"memory(GiB)": 18.17,
"reward": 0.4147709757089615,
"reward_std": 0.023487260565161705,
"rewards/MCQ_Reward/mean": 0.4147709757089615,
"rewards/MCQ_Reward/std": 0.08765164762735367,
"step": 195,
"train_speed(iter/s)": 0.104938
},
{
"clip_ratio": 0.00836537522263825,
"epoch": 3.92,
"grad_norm": 2.211996078491211,
"kl": 0.62109375,
"learning_rate": 6.754751812540679e-07,
"loss": -0.0026485356502234936,
"memory(GiB)": 18.17,
"step": 196,
"train_speed(iter/s)": 0.105375
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 161.0,
"completions/mean_length": 113.68359375,
"completions/min_length": 58.0,
"epoch": 3.94,
"grad_norm": 2.5469682216644287,
"kl": 0.556640625,
"learning_rate": 6.725001835974852e-07,
"loss": -0.005141774192452431,
"memory(GiB)": 18.17,
"reward": 0.39422211050987244,
"reward_std": 0.022977779619395733,
"rewards/MCQ_Reward/mean": 0.39422211050987244,
"rewards/MCQ_Reward/std": 0.09659452736377716,
"step": 197,
"train_speed(iter/s)": 0.105428
},
{
"clip_ratio": 0.007515270030125976,
"epoch": 3.96,
"grad_norm": 2.603193998336792,
"kl": 0.57421875,
"learning_rate": 6.695182376586602e-07,
"loss": -0.00558980368077755,
"memory(GiB)": 18.17,
"step": 198,
"train_speed(iter/s)": 0.105897
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 251.0,
"completions/mean_length": 124.140625,
"completions/min_length": 66.5,
"epoch": 3.98,
"grad_norm": 2.8109734058380127,
"kl": 0.5703125,
"learning_rate": 6.665294635499403e-07,
"loss": -0.008472483605146408,
"memory(GiB)": 18.17,
"reward": 0.3954710364341736,
"reward_std": 0.026893282309174538,
"rewards/MCQ_Reward/mean": 0.3954710364341736,
"rewards/MCQ_Reward/std": 0.07466300576925278,
"step": 199,
"train_speed(iter/s)": 0.10569
},
{
"clip_ratio": 0.007555491756647825,
"epoch": 4.0,
"grad_norm": 3.981370687484741,
"kl": 0.5625,
"learning_rate": 6.635339816587108e-07,
"loss": -0.008467345498502254,
"memory(GiB)": 18.17,
"step": 200,
"train_speed(iter/s)": 0.106122
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 189.0,
"completions/mean_length": 114.1640625,
"completions/min_length": 67.0,
"epoch": 4.02,
"grad_norm": 3.464586019515991,
"kl": 1.001953125,
"learning_rate": 6.605319126425453e-07,
"loss": 0.010952511802315712,
"memory(GiB)": 18.17,
"reward": 0.4330308884382248,
"reward_std": 0.022406785748898983,
"rewards/MCQ_Reward/mean": 0.4330308884382248,
"rewards/MCQ_Reward/std": 0.09031685814261436,
"step": 201,
"train_speed(iter/s)": 0.10573
},
{
"clip_ratio": 0.010695958975702524,
"epoch": 4.04,
"grad_norm": 3.2848002910614014,
"kl": 1.3125,
"learning_rate": 6.575233774243464e-07,
"loss": 0.010859224945306778,
"memory(GiB)": 18.17,
"step": 202,
"train_speed(iter/s)": 0.106187
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.5,
"completions/mean_length": 115.0625,
"completions/min_length": 64.5,
"epoch": 4.06,
"grad_norm": 2.5354137420654297,
"kl": 0.521484375,
"learning_rate": 6.545084971874736e-07,
"loss": 0.008116345852613449,
"memory(GiB)": 18.17,
"reward": 0.4043910503387451,
"reward_std": 0.023216267116367817,
"rewards/MCQ_Reward/mean": 0.4043910503387451,
"rewards/MCQ_Reward/std": 0.09529644250869751,
"step": 203,
"train_speed(iter/s)": 0.106255
},
{
"clip_ratio": 0.005409660283476114,
"epoch": 4.08,
"grad_norm": 2.4091176986694336,
"kl": 0.52734375,
"learning_rate": 6.514873933708637e-07,
"loss": 0.007959958165884018,
"memory(GiB)": 18.17,
"step": 204,
"train_speed(iter/s)": 0.10667
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 191.5,
"completions/mean_length": 103.03515625,
"completions/min_length": 53.0,
"epoch": 4.1,
"grad_norm": 2.983665704727173,
"kl": 0.62109375,
"learning_rate": 6.484601876641375e-07,
"loss": -0.014035141095519066,
"memory(GiB)": 18.17,
"reward": 0.4240594506263733,
"reward_std": 0.025937434285879135,
"rewards/MCQ_Reward/mean": 0.4240594506263733,
"rewards/MCQ_Reward/std": 0.07473786175251007,
"step": 205,
"train_speed(iter/s)": 0.106723
},
{
"clip_ratio": 0.018164899200201035,
"epoch": 4.12,
"grad_norm": 6.4920454025268555,
"kl": 0.5859375,
"learning_rate": 6.454270020026995e-07,
"loss": -0.013708272948861122,
"memory(GiB)": 18.17,
"step": 206,
"train_speed(iter/s)": 0.107162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 242.0,
"completions/mean_length": 129.375,
"completions/min_length": 58.5,
"epoch": 4.14,
"grad_norm": 2.714660882949829,
"kl": 0.5625,
"learning_rate": 6.423879585628261e-07,
"loss": -0.014167927205562592,
"memory(GiB)": 18.17,
"reward": 0.396339014172554,
"reward_std": 0.02192540653049946,
"rewards/MCQ_Reward/mean": 0.396339014172554,
"rewards/MCQ_Reward/std": 0.11277944594621658,
"step": 207,
"train_speed(iter/s)": 0.106875
},
{
"clip_ratio": 0.007178165018558502,
"epoch": 4.16,
"grad_norm": 2.4650375843048096,
"kl": 0.560546875,
"learning_rate": 6.393431797567439e-07,
"loss": -0.014689125120639801,
"memory(GiB)": 18.17,
"step": 208,
"train_speed(iter/s)": 0.107325
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 255.0,
"completions/mean_length": 131.01953125,
"completions/min_length": 64.5,
"epoch": 4.18,
"grad_norm": 2.1339519023895264,
"kl": 0.58203125,
"learning_rate": 6.362927882276989e-07,
"loss": -0.017007270827889442,
"memory(GiB)": 18.17,
"reward": 0.42686355113983154,
"reward_std": 0.023915644735097885,
"rewards/MCQ_Reward/mean": 0.42686355113983154,
"rewards/MCQ_Reward/std": 0.10529575496912003,
"step": 209,
"train_speed(iter/s)": 0.107141
},
{
"clip_ratio": 0.005084275268018246,
"epoch": 4.2,
"grad_norm": 2.0464680194854736,
"kl": 0.59375,
"learning_rate": 6.332369068450174e-07,
"loss": -0.0175747312605381,
"memory(GiB)": 18.17,
"step": 210,
"train_speed(iter/s)": 0.107586
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 197.5,
"completions/mean_length": 116.63671875,
"completions/min_length": 61.5,
"epoch": 4.22,
"grad_norm": 2.4869492053985596,
"kl": 0.544921875,
"learning_rate": 6.30175658699156e-07,
"loss": -0.0016960185021162033,
"memory(GiB)": 18.17,
"reward": 0.43242450058460236,
"reward_std": 0.02396441251039505,
"rewards/MCQ_Reward/mean": 0.43242450058460236,
"rewards/MCQ_Reward/std": 0.07406600937247276,
"step": 211,
"train_speed(iter/s)": 0.107182
},
{
"clip_ratio": 0.006936221849173307,
"epoch": 4.24,
"grad_norm": 2.2954320907592773,
"kl": 0.5390625,
"learning_rate": 6.271091670967436e-07,
"loss": -0.001955235842615366,
"memory(GiB)": 18.17,
"step": 212,
"train_speed(iter/s)": 0.10762
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.5,
"completions/mean_length": 132.296875,
"completions/min_length": 90.0,
"epoch": 4.26,
"grad_norm": 2.5567421913146973,
"kl": 0.548828125,
"learning_rate": 6.240375555556145e-07,
"loss": -0.010683618485927582,
"memory(GiB)": 18.17,
"reward": 0.3712979108095169,
"reward_std": 0.022392110899090767,
"rewards/MCQ_Reward/mean": 0.3712979108095169,
"rewards/MCQ_Reward/std": 0.0758376233279705,
"step": 213,
"train_speed(iter/s)": 0.107578
},
{
"clip_ratio": 0.01051389379426837,
"epoch": 4.28,
"grad_norm": 3.9029605388641357,
"kl": 0.529296875,
"learning_rate": 6.209609477998338e-07,
"loss": -0.010750237852334976,
"memory(GiB)": 18.17,
"step": 214,
"train_speed(iter/s)": 0.108018
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 181.5,
"completions/mean_length": 117.71875,
"completions/min_length": 60.5,
"epoch": 4.3,
"grad_norm": 2.3913040161132812,
"kl": 0.6015625,
"learning_rate": 6.178794677547137e-07,
"loss": -0.012967615388333797,
"memory(GiB)": 18.17,
"reward": 0.3914954960346222,
"reward_std": 0.021691203117370605,
"rewards/MCQ_Reward/mean": 0.3914954960346222,
"rewards/MCQ_Reward/std": 0.10047328472137451,
"step": 215,
"train_speed(iter/s)": 0.108034
},
{
"clip_ratio": 0.005430733785033226,
"epoch": 4.32,
"grad_norm": 2.3732998371124268,
"kl": 0.61328125,
"learning_rate": 6.147932395418205e-07,
"loss": -0.013309886679053307,
"memory(GiB)": 18.17,
"step": 216,
"train_speed(iter/s)": 0.108474
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 209.5,
"completions/mean_length": 123.4765625,
"completions/min_length": 65.0,
"epoch": 4.34,
"grad_norm": 2.7147343158721924,
"kl": 0.552734375,
"learning_rate": 6.117023874739771e-07,
"loss": -0.0006074332632124424,
"memory(GiB)": 18.17,
"reward": 0.4220256060361862,
"reward_std": 0.0257421238347888,
"rewards/MCQ_Reward/mean": 0.4220256060361862,
"rewards/MCQ_Reward/std": 0.12063978612422943,
"step": 217,
"train_speed(iter/s)": 0.10841
},
{
"clip_ratio": 0.006779439281672239,
"epoch": 4.36,
"grad_norm": 2.3169238567352295,
"kl": 0.544921875,
"learning_rate": 6.086070360502539e-07,
"loss": -0.0006955214776098728,
"memory(GiB)": 18.17,
"step": 218,
"train_speed(iter/s)": 0.108822
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 203.5,
"completions/mean_length": 116.0625,
"completions/min_length": 53.5,
"epoch": 4.38,
"grad_norm": 2.7408437728881836,
"kl": 0.615234375,
"learning_rate": 6.055073099509549e-07,
"loss": -0.007178765721619129,
"memory(GiB)": 18.17,
"reward": 0.41480791568756104,
"reward_std": 0.028133179992437363,
"rewards/MCQ_Reward/mean": 0.41480791568756104,
"rewards/MCQ_Reward/std": 0.1095062680542469,
"step": 219,
"train_speed(iter/s)": 0.108796
},
{
"clip_ratio": 0.007214481011033058,
"epoch": 4.4,
"grad_norm": 2.457122802734375,
"kl": 0.6171875,
"learning_rate": 6.024033340325954e-07,
"loss": -0.008253653533756733,
"memory(GiB)": 18.17,
"step": 220,
"train_speed(iter/s)": 0.109227
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 209.5,
"completions/mean_length": 118.609375,
"completions/min_length": 59.0,
"epoch": 4.42,
"grad_norm": 2.8679587841033936,
"kl": 0.568359375,
"learning_rate": 5.992952333228726e-07,
"loss": 0.013627042062580585,
"memory(GiB)": 18.17,
"reward": 0.4350634217262268,
"reward_std": 0.0218770457431674,
"rewards/MCQ_Reward/mean": 0.4350634217262268,
"rewards/MCQ_Reward/std": 0.07635831832885742,
"step": 221,
"train_speed(iter/s)": 0.108811
},
{
"clip_ratio": 0.005678659770637751,
"epoch": 4.44,
"grad_norm": 2.187412738800049,
"kl": 0.58203125,
"learning_rate": 5.961831330156305e-07,
"loss": 0.013213744387030602,
"memory(GiB)": 18.17,
"step": 222,
"train_speed(iter/s)": 0.109221
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 226.5,
"completions/mean_length": 125.9765625,
"completions/min_length": 48.5,
"epoch": 4.46,
"grad_norm": 3.5221126079559326,
"kl": 0.587890625,
"learning_rate": 5.93067158465815e-07,
"loss": -0.0011408873833715916,
"memory(GiB)": 18.17,
"reward": 0.44135691225528717,
"reward_std": 0.025366419926285744,
"rewards/MCQ_Reward/mean": 0.44135691225528717,
"rewards/MCQ_Reward/std": 0.07711124420166016,
"step": 223,
"train_speed(iter/s)": 0.109176
},
{
"clip_ratio": 0.007937990361824632,
"epoch": 4.48,
"grad_norm": 2.513356924057007,
"kl": 0.5703125,
"learning_rate": 5.899474351844269e-07,
"loss": -0.0011316398158669472,
"memory(GiB)": 18.17,
"step": 224,
"train_speed(iter/s)": 0.109601
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 228.0,
"completions/mean_length": 120.234375,
"completions/min_length": 54.0,
"epoch": 4.5,
"grad_norm": 2.853579044342041,
"kl": 0.744140625,
"learning_rate": 5.868240888334652e-07,
"loss": -0.0010898616164922714,
"memory(GiB)": 18.17,
"reward": 0.41750770807266235,
"reward_std": 0.024566995911300182,
"rewards/MCQ_Reward/mean": 0.41750770807266235,
"rewards/MCQ_Reward/std": 0.09383138827979565,
"step": 225,
"train_speed(iter/s)": 0.109546
},
{
"clip_ratio": 0.012675716076046228,
"epoch": 4.52,
"grad_norm": 5.211337089538574,
"kl": 0.658203125,
"learning_rate": 5.836972452208654e-07,
"loss": -0.001642034389078617,
"memory(GiB)": 18.17,
"step": 226,
"train_speed(iter/s)": 0.109972
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 183.5,
"completions/mean_length": 126.68359375,
"completions/min_length": 64.0,
"epoch": 4.54,
"grad_norm": 2.3116183280944824,
"kl": 0.505859375,
"learning_rate": 5.805670302954321e-07,
"loss": 0.017429981380701065,
"memory(GiB)": 18.17,
"reward": 0.41671665012836456,
"reward_std": 0.02627546712756157,
"rewards/MCQ_Reward/mean": 0.41671665012836456,
"rewards/MCQ_Reward/std": 0.09354511648416519,
"step": 227,
"train_speed(iter/s)": 0.109937
},
{
"clip_ratio": 0.005898691713809967,
"epoch": 4.5600000000000005,
"grad_norm": 2.306483507156372,
"kl": 0.5087890625,
"learning_rate": 5.774335701417662e-07,
"loss": 0.016744598746299744,
"memory(GiB)": 18.17,
"step": 228,
"train_speed(iter/s)": 0.110353
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 213.5,
"completions/mean_length": 124.5546875,
"completions/min_length": 61.0,
"epoch": 4.58,
"grad_norm": 2.3084402084350586,
"kl": 0.552734375,
"learning_rate": 5.742969909751858e-07,
"loss": -0.009621858596801758,
"memory(GiB)": 18.17,
"reward": 0.45828977227211,
"reward_std": 0.023471640422940254,
"rewards/MCQ_Reward/mean": 0.45828977227211,
"rewards/MCQ_Reward/std": 0.09269878640770912,
"step": 229,
"train_speed(iter/s)": 0.110326
},
{
"clip_ratio": 0.005610911408439279,
"epoch": 4.6,
"grad_norm": 2.163801431655884,
"kl": 0.552734375,
"learning_rate": 5.711574191366427e-07,
"loss": -0.010531945154070854,
"memory(GiB)": 18.17,
"step": 230,
"train_speed(iter/s)": 0.110743
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 187.5,
"completions/mean_length": 116.93359375,
"completions/min_length": 62.5,
"epoch": 4.62,
"grad_norm": 3.1812872886657715,
"kl": 2.26171875,
"learning_rate": 5.680149810876322e-07,
"loss": 0.006941274274140596,
"memory(GiB)": 18.17,
"reward": 0.45568907260894775,
"reward_std": 0.023496804758906364,
"rewards/MCQ_Reward/mean": 0.45568907260894775,
"rewards/MCQ_Reward/std": 0.09556515514850616,
"step": 231,
"train_speed(iter/s)": 0.110377
},
{
"clip_ratio": 0.006443677702918649,
"epoch": 4.64,
"grad_norm": 2.733854293823242,
"kl": 2.2734375,
"learning_rate": 5.648698034051008e-07,
"loss": 0.006462510209530592,
"memory(GiB)": 18.17,
"step": 232,
"train_speed(iter/s)": 0.110787
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 233.0,
"completions/mean_length": 133.1015625,
"completions/min_length": 70.5,
"epoch": 4.66,
"grad_norm": 2.4281585216522217,
"kl": 0.55859375,
"learning_rate": 5.617220127763474e-07,
"loss": 0.013438165187835693,
"memory(GiB)": 18.17,
"reward": 0.43506887555122375,
"reward_std": 0.025797616690397263,
"rewards/MCQ_Reward/mean": 0.43506887555122375,
"rewards/MCQ_Reward/std": 0.09859243780374527,
"step": 233,
"train_speed(iter/s)": 0.110691
},
{
"clip_ratio": 0.0072706313803792,
"epoch": 4.68,
"grad_norm": 2.526357889175415,
"kl": 0.55859375,
"learning_rate": 5.585717359939192e-07,
"loss": 0.012631012126803398,
"memory(GiB)": 18.17,
"step": 234,
"train_speed(iter/s)": 0.111101
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 214.5,
"completions/mean_length": 133.1328125,
"completions/min_length": 57.0,
"epoch": 4.7,
"grad_norm": 2.639338731765747,
"kl": 0.552734375,
"learning_rate": 5.554190999505055e-07,
"loss": -0.008054563775658607,
"memory(GiB)": 18.17,
"reward": 0.40963128209114075,
"reward_std": 0.024876238778233528,
"rewards/MCQ_Reward/mean": 0.40963128209114075,
"rewards/MCQ_Reward/std": 0.06643268279731274,
"step": 235,
"train_speed(iter/s)": 0.111027
},
{
"clip_ratio": 0.008271400351077318,
"epoch": 4.72,
"grad_norm": 2.7264564037323,
"kl": 0.568359375,
"learning_rate": 5.522642316338268e-07,
"loss": -0.008453292772173882,
"memory(GiB)": 18.17,
"step": 236,
"train_speed(iter/s)": 0.111434
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 188.0,
"completions/mean_length": 123.84765625,
"completions/min_length": 65.5,
"epoch": 4.74,
"grad_norm": 2.405317544937134,
"kl": 0.5400390625,
"learning_rate": 5.491072581215186e-07,
"loss": 0.00114892004057765,
"memory(GiB)": 18.17,
"reward": 0.4337426722049713,
"reward_std": 0.020247386768460274,
"rewards/MCQ_Reward/mean": 0.4337426722049713,
"rewards/MCQ_Reward/std": 0.07973705604672432,
"step": 237,
"train_speed(iter/s)": 0.111369
},
{
"clip_ratio": 0.006459691561758518,
"epoch": 4.76,
"grad_norm": 2.8662662506103516,
"kl": 0.5400390625,
"learning_rate": 5.459483065760138e-07,
"loss": 0.0009391154162585735,
"memory(GiB)": 18.17,
"step": 238,
"train_speed(iter/s)": 0.111775
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.0,
"completions/mean_length": 133.96875,
"completions/min_length": 75.0,
"epoch": 4.78,
"grad_norm": 2.400651216506958,
"kl": 0.5078125,
"learning_rate": 5.427875042394199e-07,
"loss": 0.002962369006127119,
"memory(GiB)": 18.17,
"reward": 0.4192984253168106,
"reward_std": 0.023103597573935986,
"rewards/MCQ_Reward/mean": 0.4192984253168106,
"rewards/MCQ_Reward/std": 0.08515846729278564,
"step": 239,
"train_speed(iter/s)": 0.11166
},
{
"clip_ratio": 0.00794414198026061,
"epoch": 4.8,
"grad_norm": 3.1118853092193604,
"kl": 0.5029296875,
"learning_rate": 5.396249784283942e-07,
"loss": 0.0026899795047938824,
"memory(GiB)": 18.17,
"step": 240,
"train_speed(iter/s)": 0.112066
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.5,
"completions/mean_length": 114.4609375,
"completions/min_length": 47.5,
"epoch": 4.82,
"grad_norm": 2.5313034057617188,
"kl": 0.5390625,
"learning_rate": 5.364608565290154e-07,
"loss": -0.0074430471286177635,
"memory(GiB)": 18.17,
"reward": 0.4074428677558899,
"reward_std": 0.02112921793013811,
"rewards/MCQ_Reward/mean": 0.4074428677558899,
"rewards/MCQ_Reward/std": 0.07994595915079117,
"step": 241,
"train_speed(iter/s)": 0.111745
},
{
"clip_ratio": 0.007256179815158248,
"epoch": 4.84,
"grad_norm": 2.768711566925049,
"kl": 0.5625,
"learning_rate": 5.33295265991652e-07,
"loss": -0.0077315750531852245,
"memory(GiB)": 18.17,
"step": 242,
"train_speed(iter/s)": 0.112147
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 167.0,
"completions/mean_length": 115.47265625,
"completions/min_length": 67.5,
"epoch": 4.86,
"grad_norm": 2.561013698577881,
"kl": 0.57421875,
"learning_rate": 5.301283343258292e-07,
"loss": -0.0039140088483691216,
"memory(GiB)": 18.17,
"reward": 0.42967718839645386,
"reward_std": 0.020259867422282696,
"rewards/MCQ_Reward/mean": 0.42967718839645386,
"rewards/MCQ_Reward/std": 0.09365658834576607,
"step": 243,
"train_speed(iter/s)": 0.112166
},
{
"clip_ratio": 0.008353757206350565,
"epoch": 4.88,
"grad_norm": 3.9286372661590576,
"kl": 0.560546875,
"learning_rate": 5.26960189095093e-07,
"loss": -0.003905682824552059,
"memory(GiB)": 18.17,
"step": 244,
"train_speed(iter/s)": 0.112566
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 206.0,
"completions/mean_length": 130.84375,
"completions/min_length": 77.0,
"epoch": 4.9,
"grad_norm": 2.3792028427124023,
"kl": 0.515625,
"learning_rate": 5.237909579118712e-07,
"loss": 0.0075805773958563805,
"memory(GiB)": 18.17,
"reward": 0.37578998506069183,
"reward_std": 0.022264255210757256,
"rewards/MCQ_Reward/mean": 0.37578998506069183,
"rewards/MCQ_Reward/std": 0.09643128886818886,
"step": 245,
"train_speed(iter/s)": 0.112504
},
{
"clip_ratio": 0.006022685440257192,
"epoch": 4.92,
"grad_norm": 2.490131378173828,
"kl": 0.501953125,
"learning_rate": 5.206207684323335e-07,
"loss": 0.007525968365371227,
"memory(GiB)": 18.17,
"step": 246,
"train_speed(iter/s)": 0.112901
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 177.5,
"completions/mean_length": 112.49609375,
"completions/min_length": 62.5,
"epoch": 4.9399999999999995,
"grad_norm": 2.270827293395996,
"kl": 0.580078125,
"learning_rate": 5.174497483512505e-07,
"loss": 0.011211629025638103,
"memory(GiB)": 18.17,
"reward": 0.39156346023082733,
"reward_std": 0.02191222459077835,
"rewards/MCQ_Reward/mean": 0.39156346023082733,
"rewards/MCQ_Reward/std": 0.12107554450631142,
"step": 247,
"train_speed(iter/s)": 0.112883
},
{
"clip_ratio": 0.006176856812089682,
"epoch": 4.96,
"grad_norm": 2.373053550720215,
"kl": 0.57421875,
"learning_rate": 5.142780253968481e-07,
"loss": 0.010641951113939285,
"memory(GiB)": 18.17,
"step": 248,
"train_speed(iter/s)": 0.11328
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 204.0,
"completions/mean_length": 131.14453125,
"completions/min_length": 62.5,
"epoch": 4.98,
"grad_norm": 2.2482690811157227,
"kl": 0.525390625,
"learning_rate": 5.111057273256647e-07,
"loss": 0.0050743343308568,
"memory(GiB)": 18.17,
"reward": 0.40770605206489563,
"reward_std": 0.022150222212076187,
"rewards/MCQ_Reward/mean": 0.40770605206489563,
"rewards/MCQ_Reward/std": 0.11748149991035461,
"step": 249,
"train_speed(iter/s)": 0.113183
},
{
"clip_ratio": 0.006638662423938513,
"epoch": 5.0,
"grad_norm": 2.2492520809173584,
"kl": 0.5390625,
"learning_rate": 5.07932981917404e-07,
"loss": 0.004837746266275644,
"memory(GiB)": 18.17,
"step": 250,
"train_speed(iter/s)": 0.113563
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 214.0,
"completions/mean_length": 125.765625,
"completions/min_length": 68.5,
"epoch": 5.02,
"grad_norm": 2.556406259536743,
"kl": 0.5078125,
"learning_rate": 5.047599169697883e-07,
"loss": 0.017076797783374786,
"memory(GiB)": 18.17,
"reward": 0.4466231018304825,
"reward_std": 0.0222383551299572,
"rewards/MCQ_Reward/mean": 0.4466231018304825,
"rewards/MCQ_Reward/std": 0.11308542639017105,
"step": 251,
"train_speed(iter/s)": 0.113109
},
{
"clip_ratio": 0.007436602842062712,
"epoch": 5.04,
"grad_norm": 2.0482616424560547,
"kl": 0.515625,
"learning_rate": 5.015866602934111e-07,
"loss": 0.01610303670167923,
"memory(GiB)": 18.17,
"step": 252,
"train_speed(iter/s)": 0.113475
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 168.0,
"completions/mean_length": 109.3359375,
"completions/min_length": 65.0,
"epoch": 5.06,
"grad_norm": 2.6583385467529297,
"kl": 0.6015625,
"learning_rate": 4.984133397065888e-07,
"loss": 0.005715301260352135,
"memory(GiB)": 18.17,
"reward": 0.3956441879272461,
"reward_std": 0.02386545669287443,
"rewards/MCQ_Reward/mean": 0.3956441879272461,
"rewards/MCQ_Reward/std": 0.0772719755768776,
"step": 253,
"train_speed(iter/s)": 0.113471
},
{
"clip_ratio": 0.006691478192806244,
"epoch": 5.08,
"grad_norm": 2.478234052658081,
"kl": 0.5859375,
"learning_rate": 4.952400830302116e-07,
"loss": 0.00553365983068943,
"memory(GiB)": 18.17,
"step": 254,
"train_speed(iter/s)": 0.113858
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 235.0,
"completions/mean_length": 144.1796875,
"completions/min_length": 78.0,
"epoch": 5.1,
"grad_norm": 2.308807373046875,
"kl": 0.5009765625,
"learning_rate": 4.92067018082596e-07,
"loss": -0.0058871605433523655,
"memory(GiB)": 18.17,
"reward": 0.4203776866197586,
"reward_std": 0.022159602493047714,
"rewards/MCQ_Reward/mean": 0.4203776866197586,
"rewards/MCQ_Reward/std": 0.09526496008038521,
"step": 255,
"train_speed(iter/s)": 0.113761
},
{
"clip_ratio": 0.007533560739830136,
"epoch": 5.12,
"grad_norm": 2.9820773601531982,
"kl": 0.4921875,
"learning_rate": 4.888942726743353e-07,
"loss": -0.006009383127093315,
"memory(GiB)": 18.17,
"step": 256,
"train_speed(iter/s)": 0.114127
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 189.5,
"completions/mean_length": 115.0078125,
"completions/min_length": 65.0,
"epoch": 5.14,
"grad_norm": 2.3862602710723877,
"kl": 0.57421875,
"learning_rate": 4.857219746031519e-07,
"loss": -0.010767871513962746,
"memory(GiB)": 18.17,
"reward": 0.43338486552238464,
"reward_std": 0.025110138580203056,
"rewards/MCQ_Reward/mean": 0.43338486552238464,
"rewards/MCQ_Reward/std": 0.08122389577329159,
"step": 257,
"train_speed(iter/s)": 0.114083
},
{
"clip_ratio": 0.005816203076392412,
"epoch": 5.16,
"grad_norm": 2.2391088008880615,
"kl": 0.57421875,
"learning_rate": 4.825502516487496e-07,
"loss": -0.011337889358401299,
"memory(GiB)": 18.17,
"step": 258,
"train_speed(iter/s)": 0.11446
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 241.0,
"completions/mean_length": 121.109375,
"completions/min_length": 68.0,
"epoch": 5.18,
"grad_norm": 3.2198102474212646,
"kl": 0.642578125,
"learning_rate": 4.793792315676664e-07,
"loss": -0.0017241109162569046,
"memory(GiB)": 18.17,
"reward": 0.41922956705093384,
"reward_std": 0.02394416555762291,
"rewards/MCQ_Reward/mean": 0.41922956705093384,
"rewards/MCQ_Reward/std": 0.08786309324204922,
"step": 259,
"train_speed(iter/s)": 0.11433
},
{
"clip_ratio": 0.008633819408714771,
"epoch": 5.2,
"grad_norm": 2.5045688152313232,
"kl": 0.611328125,
"learning_rate": 4.762090420881288e-07,
"loss": -0.0024092746898531914,
"memory(GiB)": 18.17,
"step": 260,
"train_speed(iter/s)": 0.11471
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 203.5,
"completions/mean_length": 121.0,
"completions/min_length": 59.5,
"epoch": 5.22,
"grad_norm": 3.3788204193115234,
"kl": 0.65625,
"learning_rate": 4.7303981090490706e-07,
"loss": 0.0016009537503123283,
"memory(GiB)": 18.17,
"reward": 0.4228467643260956,
"reward_std": 0.02382771298289299,
"rewards/MCQ_Reward/mean": 0.4228467643260956,
"rewards/MCQ_Reward/std": 0.08922314271330833,
"step": 261,
"train_speed(iter/s)": 0.114325
},
{
"clip_ratio": 0.009796116035431623,
"epoch": 5.24,
"grad_norm": 3.2910051345825195,
"kl": 0.603515625,
"learning_rate": 4.698716656741708e-07,
"loss": 0.0013471171259880066,
"memory(GiB)": 18.17,
"step": 262,
"train_speed(iter/s)": 0.114703
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 178.0,
"completions/mean_length": 117.85546875,
"completions/min_length": 58.5,
"epoch": 5.26,
"grad_norm": 3.0833852291107178,
"kl": 0.607421875,
"learning_rate": 4.66704734008348e-07,
"loss": 0.01880352757871151,
"memory(GiB)": 18.17,
"reward": 0.4038514196872711,
"reward_std": 0.024144282564520836,
"rewards/MCQ_Reward/mean": 0.4038514196872711,
"rewards/MCQ_Reward/std": 0.11032669246196747,
"step": 263,
"train_speed(iter/s)": 0.114712
},
{
"clip_ratio": 0.0071860982570797205,
"epoch": 5.28,
"grad_norm": 2.223651885986328,
"kl": 0.62109375,
"learning_rate": 4.6353914347098467e-07,
"loss": 0.018028832972049713,
"memory(GiB)": 18.17,
"step": 264,
"train_speed(iter/s)": 0.115068
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 178.0,
"completions/mean_length": 126.16796875,
"completions/min_length": 63.0,
"epoch": 5.3,
"grad_norm": 2.7954585552215576,
"kl": 0.521484375,
"learning_rate": 4.6037502157160567e-07,
"loss": 0.008576348423957825,
"memory(GiB)": 18.17,
"reward": 0.4126065671443939,
"reward_std": 0.02162686362862587,
"rewards/MCQ_Reward/mean": 0.4126065671443939,
"rewards/MCQ_Reward/std": 0.08540061488747597,
"step": 265,
"train_speed(iter/s)": 0.115013
},
{
"clip_ratio": 0.00956161879003048,
"epoch": 5.32,
"grad_norm": 4.209680557250977,
"kl": 0.544921875,
"learning_rate": 4.5721249576058027e-07,
"loss": 0.009101202711462975,
"memory(GiB)": 18.17,
"step": 266,
"train_speed(iter/s)": 0.115384
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 181.5,
"completions/mean_length": 113.3515625,
"completions/min_length": 71.5,
"epoch": 5.34,
"grad_norm": 2.6387808322906494,
"kl": 0.595703125,
"learning_rate": 4.540516934239863e-07,
"loss": 0.008354030549526215,
"memory(GiB)": 18.17,
"reward": 0.4057372510433197,
"reward_std": 0.025215539149940014,
"rewards/MCQ_Reward/mean": 0.4057372510433197,
"rewards/MCQ_Reward/std": 0.10797113552689552,
"step": 267,
"train_speed(iter/s)": 0.115352
},
{
"clip_ratio": 0.004749758169054985,
"epoch": 5.36,
"grad_norm": 2.726827383041382,
"kl": 0.59765625,
"learning_rate": 4.508927418784814e-07,
"loss": 0.008263107389211655,
"memory(GiB)": 18.17,
"step": 268,
"train_speed(iter/s)": 0.115721
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 253.0,
"completions/mean_length": 128.75,
"completions/min_length": 65.0,
"epoch": 5.38,
"grad_norm": 2.4489338397979736,
"kl": 0.5859375,
"learning_rate": 4.477357683661733e-07,
"loss": 0.0003694836050271988,
"memory(GiB)": 18.17,
"reward": 0.39796915650367737,
"reward_std": 0.0229190643876791,
"rewards/MCQ_Reward/mean": 0.39796915650367737,
"rewards/MCQ_Reward/std": 0.06984946131706238,
"step": 269,
"train_speed(iter/s)": 0.115538
},
{
"clip_ratio": 0.0044297389686107635,
"epoch": 5.4,
"grad_norm": 2.187133312225342,
"kl": 0.587890625,
"learning_rate": 4.445809000494945e-07,
"loss": 6.162561476230621e-06,
"memory(GiB)": 18.17,
"step": 270,
"train_speed(iter/s)": 0.115873
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 207.5,
"completions/mean_length": 120.48046875,
"completions/min_length": 76.5,
"epoch": 5.42,
"grad_norm": 2.354365348815918,
"kl": 0.595703125,
"learning_rate": 4.4142826400608085e-07,
"loss": -0.011774084530770779,
"memory(GiB)": 18.17,
"reward": 0.4731539338827133,
"reward_std": 0.025172382593154907,
"rewards/MCQ_Reward/mean": 0.4731539338827133,
"rewards/MCQ_Reward/std": 0.09358260780572891,
"step": 271,
"train_speed(iter/s)": 0.115479
},
{
"clip_ratio": 0.007754836697131395,
"epoch": 5.44,
"grad_norm": 2.9754416942596436,
"kl": 0.568359375,
"learning_rate": 4.382779872236526e-07,
"loss": -0.01219811663031578,
"memory(GiB)": 18.17,
"step": 272,
"train_speed(iter/s)": 0.115843
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 193.0,
"completions/mean_length": 127.671875,
"completions/min_length": 82.0,
"epoch": 5.46,
"grad_norm": 2.66938853263855,
"kl": 0.587890625,
"learning_rate": 4.3513019659489906e-07,
"loss": -0.01641671359539032,
"memory(GiB)": 18.17,
"reward": 0.3951749950647354,
"reward_std": 0.026222089305520058,
"rewards/MCQ_Reward/mean": 0.3951749950647354,
"rewards/MCQ_Reward/std": 0.07432432845234871,
"step": 273,
"train_speed(iter/s)": 0.11581
},
{
"clip_ratio": 0.006316621555015445,
"epoch": 5.48,
"grad_norm": 2.3686916828155518,
"kl": 0.595703125,
"learning_rate": 4.31985018912368e-07,
"loss": -0.01686863601207733,
"memory(GiB)": 18.17,
"step": 274,
"train_speed(iter/s)": 0.116173
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.0,
"completions/mean_length": 127.2578125,
"completions/min_length": 64.5,
"epoch": 5.5,
"grad_norm": 2.3570117950439453,
"kl": 0.5390625,
"learning_rate": 4.2884258086335745e-07,
"loss": 0.0007358621805906296,
"memory(GiB)": 18.17,
"reward": 0.44543667137622833,
"reward_std": 0.024644173681735992,
"rewards/MCQ_Reward/mean": 0.44543667137622833,
"rewards/MCQ_Reward/std": 0.09130855649709702,
"step": 275,
"train_speed(iter/s)": 0.116062
},
{
"clip_ratio": 0.009702229872345924,
"epoch": 5.52,
"grad_norm": 4.230794906616211,
"kl": 0.517578125,
"learning_rate": 4.257030090248142e-07,
"loss": 0.0004968619905412197,
"memory(GiB)": 18.17,
"step": 276,
"train_speed(iter/s)": 0.116424
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 188.5,
"completions/mean_length": 124.16796875,
"completions/min_length": 66.5,
"epoch": 5.54,
"grad_norm": 2.1478097438812256,
"kl": 0.607421875,
"learning_rate": 4.2256642985823387e-07,
"loss": 0.012350899167358875,
"memory(GiB)": 18.17,
"reward": 0.4112658351659775,
"reward_std": 0.023498238995671272,
"rewards/MCQ_Reward/mean": 0.4112658351659775,
"rewards/MCQ_Reward/std": 0.08520639687776566,
"step": 277,
"train_speed(iter/s)": 0.116375
},
{
"clip_ratio": 0.004101653583347797,
"epoch": 5.5600000000000005,
"grad_norm": 2.062098503112793,
"kl": 0.62109375,
"learning_rate": 4.19432969704568e-07,
"loss": 0.012091840617358685,
"memory(GiB)": 18.17,
"step": 278,
"train_speed(iter/s)": 0.116723
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 252.5,
"completions/mean_length": 122.69921875,
"completions/min_length": 59.0,
"epoch": 5.58,
"grad_norm": 2.9315075874328613,
"kl": 0.5390625,
"learning_rate": 4.1630275477913465e-07,
"loss": -0.013242216780781746,
"memory(GiB)": 18.17,
"reward": 0.39477604627609253,
"reward_std": 0.02283278852701187,
"rewards/MCQ_Reward/mean": 0.39477604627609253,
"rewards/MCQ_Reward/std": 0.09505810588598251,
"step": 279,
"train_speed(iter/s)": 0.116608
},
{
"clip_ratio": 0.006070411531254649,
"epoch": 5.6,
"grad_norm": 2.2812304496765137,
"kl": 0.53515625,
"learning_rate": 4.131759111665348e-07,
"loss": -0.013854868710041046,
"memory(GiB)": 18.17,
"step": 280,
"train_speed(iter/s)": 0.116971
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 206.5,
"completions/mean_length": 129.95703125,
"completions/min_length": 60.5,
"epoch": 5.62,
"grad_norm": 2.015717029571533,
"kl": 0.513671875,
"learning_rate": 4.1005256481557306e-07,
"loss": 0.0003234475152567029,
"memory(GiB)": 18.17,
"reward": 0.40168674290180206,
"reward_std": 0.020120804198086262,
"rewards/MCQ_Reward/mean": 0.40168674290180206,
"rewards/MCQ_Reward/std": 0.09599081426858902,
"step": 281,
"train_speed(iter/s)": 0.116542
},
{
"clip_ratio": 0.0076590063981711864,
"epoch": 5.64,
"grad_norm": 2.828334331512451,
"kl": 0.5009765625,
"learning_rate": 4.0693284153418497e-07,
"loss": 0.00015916512347757816,
"memory(GiB)": 18.17,
"step": 282,
"train_speed(iter/s)": 0.116903
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 201.5,
"completions/mean_length": 121.16015625,
"completions/min_length": 71.5,
"epoch": 5.66,
"grad_norm": 2.985908269882202,
"kl": 0.58203125,
"learning_rate": 4.038168669843697e-07,
"loss": -0.0021479236893355846,
"memory(GiB)": 18.17,
"reward": 0.4441321939229965,
"reward_std": 0.021154197864234447,
"rewards/MCQ_Reward/mean": 0.4441321939229965,
"rewards/MCQ_Reward/std": 0.10662735998630524,
"step": 283,
"train_speed(iter/s)": 0.116806
},
{
"clip_ratio": 0.00845325831323862,
"epoch": 5.68,
"grad_norm": 2.2008328437805176,
"kl": 0.5703125,
"learning_rate": 4.0070476667712736e-07,
"loss": -0.0024233213625848293,
"memory(GiB)": 18.17,
"step": 284,
"train_speed(iter/s)": 0.117157
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 192.5,
"completions/mean_length": 131.40625,
"completions/min_length": 65.5,
"epoch": 5.7,
"grad_norm": 2.1404271125793457,
"kl": 0.609375,
"learning_rate": 3.9759666596740473e-07,
"loss": 0.009725593030452728,
"memory(GiB)": 18.17,
"reward": 0.4451696425676346,
"reward_std": 0.02477285359054804,
"rewards/MCQ_Reward/mean": 0.4451696425676346,
"rewards/MCQ_Reward/std": 0.07242370769381523,
"step": 285,
"train_speed(iter/s)": 0.117116
},
{
"clip_ratio": 0.004681814229115844,
"epoch": 5.72,
"grad_norm": 2.289313316345215,
"kl": 0.61328125,
"learning_rate": 3.9449269004904516e-07,
"loss": 0.009346994571387768,
"memory(GiB)": 18.17,
"step": 286,
"train_speed(iter/s)": 0.117466
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 160.0,
"completions/mean_length": 104.30078125,
"completions/min_length": 51.0,
"epoch": 5.74,
"grad_norm": 2.770270347595215,
"kl": 1.189453125,
"learning_rate": 3.913929639497462e-07,
"loss": 0.009477443993091583,
"memory(GiB)": 18.17,
"reward": 0.43081943690776825,
"reward_std": 0.025431891903281212,
"rewards/MCQ_Reward/mean": 0.43081943690776825,
"rewards/MCQ_Reward/std": 0.10991119593381882,
"step": 287,
"train_speed(iter/s)": 0.117471
},
{
"clip_ratio": 0.006838085595518351,
"epoch": 5.76,
"grad_norm": 2.8960061073303223,
"kl": 1.087890625,
"learning_rate": 3.882976125260229e-07,
"loss": 0.008670520968735218,
"memory(GiB)": 18.17,
"step": 288,
"train_speed(iter/s)": 0.117827
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 172.5,
"completions/mean_length": 114.71484375,
"completions/min_length": 56.5,
"epoch": 5.78,
"grad_norm": 2.4359030723571777,
"kl": 0.552734375,
"learning_rate": 3.852067604581794e-07,
"loss": 0.006409616209566593,
"memory(GiB)": 18.17,
"reward": 0.41095563769340515,
"reward_std": 0.02436618786305189,
"rewards/MCQ_Reward/mean": 0.41095563769340515,
"rewards/MCQ_Reward/std": 0.09878598526120186,
"step": 289,
"train_speed(iter/s)": 0.117814
},
{
"clip_ratio": 0.007955410983413458,
"epoch": 5.8,
"grad_norm": 3.950528383255005,
"kl": 0.5390625,
"learning_rate": 3.821205322452863e-07,
"loss": 0.0066283950582146645,
"memory(GiB)": 18.17,
"step": 290,
"train_speed(iter/s)": 0.118161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 225.5,
"completions/mean_length": 134.59375,
"completions/min_length": 63.0,
"epoch": 5.82,
"grad_norm": 2.4326717853546143,
"kl": 0.5263671875,
"learning_rate": 3.790390522001662e-07,
"loss": 0.002648044377565384,
"memory(GiB)": 18.17,
"reward": 0.4533398002386093,
"reward_std": 0.023892495781183243,
"rewards/MCQ_Reward/mean": 0.4533398002386093,
"rewards/MCQ_Reward/std": 0.08347899466753006,
"step": 291,
"train_speed(iter/s)": 0.117724
},
{
"clip_ratio": 0.004736665170639753,
"epoch": 5.84,
"grad_norm": 2.2011497020721436,
"kl": 0.541015625,
"learning_rate": 3.7596244444438574e-07,
"loss": 0.002431286498904228,
"memory(GiB)": 18.17,
"step": 292,
"train_speed(iter/s)": 0.118068
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 191.5,
"completions/mean_length": 117.24609375,
"completions/min_length": 63.5,
"epoch": 5.86,
"grad_norm": 2.58125376701355,
"kl": 0.541015625,
"learning_rate": 3.728908329032566e-07,
"loss": -0.003335139248520136,
"memory(GiB)": 18.17,
"reward": 0.4097088426351547,
"reward_std": 0.022918211296200752,
"rewards/MCQ_Reward/mean": 0.4097088426351547,
"rewards/MCQ_Reward/std": 0.1199105829000473,
"step": 293,
"train_speed(iter/s)": 0.118029
},
{
"clip_ratio": 0.007036251947283745,
"epoch": 5.88,
"grad_norm": 2.4533321857452393,
"kl": 0.5625,
"learning_rate": 3.6982434130084396e-07,
"loss": -0.0037924423813819885,
"memory(GiB)": 18.17,
"step": 294,
"train_speed(iter/s)": 0.118366
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 180.5,
"completions/mean_length": 127.00390625,
"completions/min_length": 75.0,
"epoch": 5.9,
"grad_norm": 2.2269814014434814,
"kl": 0.5,
"learning_rate": 3.6676309315498255e-07,
"loss": 0.012001181952655315,
"memory(GiB)": 18.17,
"reward": 0.42691150307655334,
"reward_std": 0.021617514081299305,
"rewards/MCQ_Reward/mean": 0.42691150307655334,
"rewards/MCQ_Reward/std": 0.11347687244415283,
"step": 295,
"train_speed(iter/s)": 0.11833
},
{
"clip_ratio": 0.004536686465144157,
"epoch": 5.92,
"grad_norm": 2.593670129776001,
"kl": 0.513671875,
"learning_rate": 3.6370721177230115e-07,
"loss": 0.011945893988013268,
"memory(GiB)": 18.17,
"step": 296,
"train_speed(iter/s)": 0.118674
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 231.5,
"completions/mean_length": 123.5234375,
"completions/min_length": 71.5,
"epoch": 5.9399999999999995,
"grad_norm": 2.1928629875183105,
"kl": 0.4970703125,
"learning_rate": 3.6065682024325617e-07,
"loss": 0.015498391352593899,
"memory(GiB)": 18.17,
"reward": 0.41268619894981384,
"reward_std": 0.02419480960816145,
"rewards/MCQ_Reward/mean": 0.41268619894981384,
"rewards/MCQ_Reward/std": 0.09195958822965622,
"step": 297,
"train_speed(iter/s)": 0.118532
},
{
"clip_ratio": 0.0050865779630839825,
"epoch": 5.96,
"grad_norm": 2.1392431259155273,
"kl": 0.494140625,
"learning_rate": 3.5761204143717385e-07,
"loss": 0.014891544356942177,
"memory(GiB)": 18.17,
"step": 298,
"train_speed(iter/s)": 0.118872
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 226.5,
"completions/mean_length": 124.328125,
"completions/min_length": 64.5,
"epoch": 5.98,
"grad_norm": 2.7249698638916016,
"kl": 0.880859375,
"learning_rate": 3.5457299799730045e-07,
"loss": -0.010070513002574444,
"memory(GiB)": 18.17,
"reward": 0.4588439464569092,
"reward_std": 0.029408703558146954,
"rewards/MCQ_Reward/mean": 0.4588439464569092,
"rewards/MCQ_Reward/std": 0.09774744883179665,
"step": 299,
"train_speed(iter/s)": 0.118723
},
{
"clip_ratio": 0.01025686739012599,
"epoch": 6.0,
"grad_norm": 3.8231394290924072,
"kl": 0.7529296875,
"learning_rate": 3.5153981233586274e-07,
"loss": -0.009807607159018517,
"memory(GiB)": 18.17,
"step": 300,
"train_speed(iter/s)": 0.119048
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 185.5,
"completions/mean_length": 109.19140625,
"completions/min_length": 56.0,
"epoch": 6.02,
"grad_norm": 2.6895663738250732,
"kl": 0.599609375,
"learning_rate": 3.485126066291364e-07,
"loss": -0.010052207857370377,
"memory(GiB)": 18.17,
"reward": 0.4080576002597809,
"reward_std": 0.02562197484076023,
"rewards/MCQ_Reward/mean": 0.4080576002597809,
"rewards/MCQ_Reward/std": 0.09971121698617935,
"step": 301,
"train_speed(iter/s)": 0.118697
},
{
"clip_ratio": 0.005149862729012966,
"epoch": 6.04,
"grad_norm": 2.655897855758667,
"kl": 0.607421875,
"learning_rate": 3.454915028125263e-07,
"loss": -0.010359197854995728,
"memory(GiB)": 18.17,
"step": 302,
"train_speed(iter/s)": 0.11903
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 179.5,
"completions/mean_length": 117.90234375,
"completions/min_length": 56.5,
"epoch": 6.06,
"grad_norm": 2.423926591873169,
"kl": 0.546875,
"learning_rate": 3.4247662257565366e-07,
"loss": 0.018125958740711212,
"memory(GiB)": 18.17,
"reward": 0.4407869875431061,
"reward_std": 0.025757532566785812,
"rewards/MCQ_Reward/mean": 0.4407869875431061,
"rewards/MCQ_Reward/std": 0.12692639231681824,
"step": 303,
"train_speed(iter/s)": 0.118923
},
{
"clip_ratio": 0.00550723378546536,
"epoch": 6.08,
"grad_norm": 2.2029030323028564,
"kl": 0.5546875,
"learning_rate": 3.394680873574546e-07,
"loss": 0.017929650843143463,
"memory(GiB)": 18.17,
"step": 304,
"train_speed(iter/s)": 0.119254
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 195.0,
"completions/mean_length": 124.44140625,
"completions/min_length": 54.5,
"epoch": 6.1,
"grad_norm": 2.3613805770874023,
"kl": 0.5703125,
"learning_rate": 3.3646601834128916e-07,
"loss": -0.007877168245613575,
"memory(GiB)": 18.17,
"reward": 0.49866482615470886,
"reward_std": 0.024780258536338806,
"rewards/MCQ_Reward/mean": 0.49866482615470886,
"rewards/MCQ_Reward/std": 0.07562171667814255,
"step": 305,
"train_speed(iter/s)": 0.11921
},
{
"clip_ratio": 0.004300985252484679,
"epoch": 6.12,
"grad_norm": 2.1242995262145996,
"kl": 0.576171875,
"learning_rate": 3.3347053645005965e-07,
"loss": -0.008408917114138603,
"memory(GiB)": 18.17,
"step": 306,
"train_speed(iter/s)": 0.119519
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 159.5,
"completions/mean_length": 105.72265625,
"completions/min_length": 64.5,
"epoch": 6.14,
"grad_norm": 2.5641608238220215,
"kl": 0.560546875,
"learning_rate": 3.3048176234133963e-07,
"loss": 0.0034052138216793537,
"memory(GiB)": 18.17,
"reward": 0.3926085978746414,
"reward_std": 0.01911616325378418,
"rewards/MCQ_Reward/mean": 0.3926085978746414,
"rewards/MCQ_Reward/std": 0.06766298227012157,
"step": 307,
"train_speed(iter/s)": 0.119522
},
{
"clip_ratio": 0.007244990672916174,
"epoch": 6.16,
"grad_norm": 2.7589051723480225,
"kl": 0.572265625,
"learning_rate": 3.274998164025148e-07,
"loss": 0.0031583395320922136,
"memory(GiB)": 18.17,
"step": 308,
"train_speed(iter/s)": 0.119856
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.0,
"completions/mean_length": 119.86328125,
"completions/min_length": 57.0,
"epoch": 6.18,
"grad_norm": 2.9221317768096924,
"kl": 0.611328125,
"learning_rate": 3.245248187459323e-07,
"loss": -0.019380319863557816,
"memory(GiB)": 18.17,
"reward": 0.386982798576355,
"reward_std": 0.026672961190342903,
"rewards/MCQ_Reward/mean": 0.386982798576355,
"rewards/MCQ_Reward/std": 0.10517054051160812,
"step": 309,
"train_speed(iter/s)": 0.119747
},
{
"clip_ratio": 0.005416512954980135,
"epoch": 6.2,
"grad_norm": 2.7965259552001953,
"kl": 0.61328125,
"learning_rate": 3.215568892040641e-07,
"loss": -0.019356630742549896,
"memory(GiB)": 18.17,
"step": 310,
"train_speed(iter/s)": 0.120077
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 200.5,
"completions/mean_length": 118.21484375,
"completions/min_length": 57.0,
"epoch": 6.22,
"grad_norm": 2.8668336868286133,
"kl": 0.607421875,
"learning_rate": 3.1859614732467954e-07,
"loss": -0.013122756965458393,
"memory(GiB)": 18.17,
"reward": 0.4595968574285507,
"reward_std": 0.024624092504382133,
"rewards/MCQ_Reward/mean": 0.4595968574285507,
"rewards/MCQ_Reward/std": 0.08434771373867989,
"step": 311,
"train_speed(iter/s)": 0.119696
},
{
"clip_ratio": 0.00573662668466568,
"epoch": 6.24,
"grad_norm": 2.4580280780792236,
"kl": 0.609375,
"learning_rate": 3.156427123660297e-07,
"loss": -0.013560149818658829,
"memory(GiB)": 18.17,
"step": 312,
"train_speed(iter/s)": 0.120023
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 202.5,
"completions/mean_length": 121.68359375,
"completions/min_length": 73.5,
"epoch": 6.26,
"grad_norm": 2.6274502277374268,
"kl": 0.58984375,
"learning_rate": 3.1269670329204393e-07,
"loss": 0.0022671520709991455,
"memory(GiB)": 18.17,
"reward": 0.44664010405540466,
"reward_std": 0.024377938359975815,
"rewards/MCQ_Reward/mean": 0.44664010405540466,
"rewards/MCQ_Reward/std": 0.08575410395860672,
"step": 313,
"train_speed(iter/s)": 0.119945
},
{
"clip_ratio": 0.0052670135628432035,
"epoch": 6.28,
"grad_norm": 2.753713607788086,
"kl": 0.578125,
"learning_rate": 3.097582387675385e-07,
"loss": 0.0018416689708828926,
"memory(GiB)": 18.17,
"step": 314,
"train_speed(iter/s)": 0.120272
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 207.5,
"completions/mean_length": 127.57421875,
"completions/min_length": 77.0,
"epoch": 6.3,
"grad_norm": 2.4003334045410156,
"kl": 0.583984375,
"learning_rate": 3.068274371534356e-07,
"loss": 0.0005114064551889896,
"memory(GiB)": 18.17,
"reward": 0.44641484320163727,
"reward_std": 0.024146192707121372,
"rewards/MCQ_Reward/mean": 0.44641484320163727,
"rewards/MCQ_Reward/std": 0.08713827468454838,
"step": 315,
"train_speed(iter/s)": 0.120168
},
{
"clip_ratio": 0.008136166725307703,
"epoch": 6.32,
"grad_norm": 2.3975117206573486,
"kl": 0.619140625,
"learning_rate": 3.039044165019972e-07,
"loss": 0.0004498562775552273,
"memory(GiB)": 18.17,
"step": 316,
"train_speed(iter/s)": 0.120495
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 196.5,
"completions/mean_length": 117.9609375,
"completions/min_length": 58.5,
"epoch": 6.34,
"grad_norm": 2.348710060119629,
"kl": 0.548828125,
"learning_rate": 3.00989294552069e-07,
"loss": 0.00850888341665268,
"memory(GiB)": 18.17,
"reward": 0.42280539870262146,
"reward_std": 0.02416596282273531,
"rewards/MCQ_Reward/mean": 0.42280539870262146,
"rewards/MCQ_Reward/std": 0.0933729000389576,
"step": 317,
"train_speed(iter/s)": 0.120401
},
{
"clip_ratio": 0.005974379135295749,
"epoch": 6.36,
"grad_norm": 2.630732774734497,
"kl": 0.5390625,
"learning_rate": 2.9808218872433766e-07,
"loss": 0.008482606150209904,
"memory(GiB)": 18.17,
"step": 318,
"train_speed(iter/s)": 0.120723
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 200.0,
"completions/mean_length": 123.66796875,
"completions/min_length": 75.5,
"epoch": 6.38,
"grad_norm": 2.1341052055358887,
"kl": 0.517578125,
"learning_rate": 2.9518321611660234e-07,
"loss": -0.0021673766896128654,
"memory(GiB)": 18.17,
"reward": 0.4051154851913452,
"reward_std": 0.020906205289065838,
"rewards/MCQ_Reward/mean": 0.4051154851913452,
"rewards/MCQ_Reward/std": 0.09874700754880905,
"step": 319,
"train_speed(iter/s)": 0.12062
},
{
"clip_ratio": 0.00719631533138454,
"epoch": 6.4,
"grad_norm": 3.2350962162017822,
"kl": 0.5390625,
"learning_rate": 2.922924934990568e-07,
"loss": -0.0024176109582185745,
"memory(GiB)": 18.17,
"step": 320,
"train_speed(iter/s)": 0.120919
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 170.0,
"completions/mean_length": 117.234375,
"completions/min_length": 69.0,
"epoch": 6.42,
"grad_norm": 74.83729553222656,
"kl": 20.791015625,
"learning_rate": 2.894101373095867e-07,
"loss": 0.04349440336227417,
"memory(GiB)": 18.17,
"reward": 0.44527527689933777,
"reward_std": 0.021908948197960854,
"rewards/MCQ_Reward/mean": 0.44527527689933777,
"rewards/MCQ_Reward/std": 0.08160104416310787,
"step": 321,
"train_speed(iter/s)": 0.120602
},
{
"clip_ratio": 0.004950069589540362,
"epoch": 6.44,
"grad_norm": 99.64342498779297,
"kl": 26.54296875,
"learning_rate": 2.8653626364907914e-07,
"loss": 0.04914519935846329,
"memory(GiB)": 18.17,
"step": 322,
"train_speed(iter/s)": 0.120907
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 232.0,
"completions/mean_length": 128.45703125,
"completions/min_length": 52.5,
"epoch": 6.46,
"grad_norm": 2.5322988033294678,
"kl": 0.529296875,
"learning_rate": 2.8367098827674576e-07,
"loss": 0.009952299296855927,
"memory(GiB)": 18.17,
"reward": 0.4740261733531952,
"reward_std": 0.023401367478072643,
"rewards/MCQ_Reward/mean": 0.4740261733531952,
"rewards/MCQ_Reward/std": 0.08106581121683121,
"step": 323,
"train_speed(iter/s)": 0.12071
},
{
"clip_ratio": 0.005782874301075935,
"epoch": 6.48,
"grad_norm": 2.591923952102661,
"kl": 0.53125,
"learning_rate": 2.808144266054612e-07,
"loss": 0.009899303317070007,
"memory(GiB)": 18.17,
"step": 324,
"train_speed(iter/s)": 0.121029
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 202.5,
"completions/mean_length": 133.33203125,
"completions/min_length": 81.5,
"epoch": 6.5,
"grad_norm": 2.113783121109009,
"kl": 0.537109375,
"learning_rate": 2.779666936971129e-07,
"loss": -0.0006487010978162289,
"memory(GiB)": 18.17,
"reward": 0.39647024869918823,
"reward_std": 0.02249709703028202,
"rewards/MCQ_Reward/mean": 0.39647024869918823,
"rewards/MCQ_Reward/std": 0.0880400650203228,
"step": 325,
"train_speed(iter/s)": 0.120986
},
{
"clip_ratio": 0.006350549403578043,
"epoch": 6.52,
"grad_norm": 2.4789633750915527,
"kl": 0.525390625,
"learning_rate": 2.751279042579672e-07,
"loss": -0.0002095792442560196,
"memory(GiB)": 18.17,
"step": 326,
"train_speed(iter/s)": 0.121304
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/mean_length": 126.234375,
"completions/min_length": 54.0,
"epoch": 6.54,
"grad_norm": 2.4260339736938477,
"kl": 0.548828125,
"learning_rate": 2.7229817263404864e-07,
"loss": -0.0033088945783674717,
"memory(GiB)": 18.17,
"reward": 0.4554037004709244,
"reward_std": 0.02187604457139969,
"rewards/MCQ_Reward/mean": 0.4554037004709244,
"rewards/MCQ_Reward/std": 0.09804989397525787,
"step": 327,
"train_speed(iter/s)": 0.121167
},
{
"clip_ratio": 0.008008664939552546,
"epoch": 6.5600000000000005,
"grad_norm": 4.365505695343018,
"kl": 0.533203125,
"learning_rate": 2.6947761280653447e-07,
"loss": -0.00283604022115469,
"memory(GiB)": 18.17,
"step": 328,
"train_speed(iter/s)": 0.121483
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 180.0,
"completions/mean_length": 117.9609375,
"completions/min_length": 69.5,
"epoch": 6.58,
"grad_norm": 2.2564356327056885,
"kl": 0.5283203125,
"learning_rate": 2.6666633838716314e-07,
"loss": -0.0077381255105137825,
"memory(GiB)": 18.17,
"reward": 0.4396722763776779,
"reward_std": 0.022700872272253036,
"rewards/MCQ_Reward/mean": 0.4396722763776779,
"rewards/MCQ_Reward/std": 0.10192850604653358,
"step": 329,
"train_speed(iter/s)": 0.12143
},
{
"clip_ratio": 0.0047557426150888205,
"epoch": 6.6,
"grad_norm": 2.172281503677368,
"kl": 0.5322265625,
"learning_rate": 2.638644626136587e-07,
"loss": -0.008173219859600067,
"memory(GiB)": 18.17,
"step": 330,
"train_speed(iter/s)": 0.121737
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 183.0,
"completions/mean_length": 126.0859375,
"completions/min_length": 68.5,
"epoch": 6.62,
"grad_norm": 2.167248010635376,
"kl": 0.4873046875,
"learning_rate": 2.610720983451685e-07,
"loss": 0.018461888656020164,
"memory(GiB)": 18.17,
"reward": 0.44843943417072296,
"reward_std": 0.02303914539515972,
"rewards/MCQ_Reward/mean": 0.44843943417072296,
"rewards/MCQ_Reward/std": 0.08497340604662895,
"step": 331,
"train_speed(iter/s)": 0.121397
},
{
"clip_ratio": 0.0052658268250525,
"epoch": 6.64,
"grad_norm": 2.136260509490967,
"kl": 0.4921875,
"learning_rate": 2.58289358057718e-07,
"loss": 0.01842992939054966,
"memory(GiB)": 18.17,
"step": 332,
"train_speed(iter/s)": 0.121707
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 225.5,
"completions/mean_length": 127.5546875,
"completions/min_length": 65.5,
"epoch": 6.66,
"grad_norm": 2.595977306365967,
"kl": 0.578125,
"learning_rate": 2.555163538396806e-07,
"loss": -0.011687211692333221,
"memory(GiB)": 18.17,
"reward": 0.4103027582168579,
"reward_std": 0.02552829496562481,
"rewards/MCQ_Reward/mean": 0.4103027582168579,
"rewards/MCQ_Reward/std": 0.0971563570201397,
"step": 333,
"train_speed(iter/s)": 0.1216
},
{
"clip_ratio": 0.0067884225863963366,
"epoch": 6.68,
"grad_norm": 3.2224881649017334,
"kl": 0.59765625,
"learning_rate": 2.5275319738726165e-07,
"loss": -0.011430272832512856,
"memory(GiB)": 18.17,
"step": 334,
"train_speed(iter/s)": 0.121912
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 220.0,
"completions/mean_length": 123.2578125,
"completions/min_length": 75.0,
"epoch": 6.7,
"grad_norm": 2.387573480606079,
"kl": 0.56640625,
"learning_rate": 2.500000000000001e-07,
"loss": -0.006422008387744427,
"memory(GiB)": 18.17,
"reward": 0.4134673774242401,
"reward_std": 0.022745592519640923,
"rewards/MCQ_Reward/mean": 0.4134673774242401,
"rewards/MCQ_Reward/std": 0.10698199272155762,
"step": 335,
"train_speed(iter/s)": 0.121789
},
{
"clip_ratio": 0.007158383261412382,
"epoch": 6.72,
"grad_norm": 2.7240705490112305,
"kl": 0.564453125,
"learning_rate": 2.472568725762853e-07,
"loss": -0.0065142130479216576,
"memory(GiB)": 18.17,
"step": 336,
"train_speed(iter/s)": 0.122088
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 153.5,
"completions/mean_length": 108.890625,
"completions/min_length": 63.5,
"epoch": 6.74,
"grad_norm": 2.2466800212860107,
"kl": 0.7421875,
"learning_rate": 2.4452392560888976e-07,
"loss": -0.00018489733338356018,
"memory(GiB)": 18.17,
"reward": 0.42812955379486084,
"reward_std": 0.0208740271627903,
"rewards/MCQ_Reward/mean": 0.42812955379486084,
"rewards/MCQ_Reward/std": 0.08048268780112267,
"step": 337,
"train_speed(iter/s)": 0.12208
},
{
"clip_ratio": 0.005281613674014807,
"epoch": 6.76,
"grad_norm": 2.0434200763702393,
"kl": 0.771484375,
"learning_rate": 2.418012691805191e-07,
"loss": -0.0005159445572644472,
"memory(GiB)": 18.17,
"step": 338,
"train_speed(iter/s)": 0.122388
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 202.0,
"completions/mean_length": 117.3984375,
"completions/min_length": 65.0,
"epoch": 6.78,
"grad_norm": 2.669919729232788,
"kl": 0.572265625,
"learning_rate": 2.390890129593771e-07,
"loss": -0.009503326378762722,
"memory(GiB)": 18.17,
"reward": 0.41273191571235657,
"reward_std": 0.023225258104503155,
"rewards/MCQ_Reward/mean": 0.41273191571235657,
"rewards/MCQ_Reward/std": 0.08152876608073711,
"step": 339,
"train_speed(iter/s)": 0.122302
},
{
"clip_ratio": 0.005108103854581714,
"epoch": 6.8,
"grad_norm": 2.5069973468780518,
"kl": 0.576171875,
"learning_rate": 2.3638726619474875e-07,
"loss": -0.009927002713084221,
"memory(GiB)": 18.17,
"step": 340,
"train_speed(iter/s)": 0.122605
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 205.5,
"completions/mean_length": 121.0703125,
"completions/min_length": 66.0,
"epoch": 6.82,
"grad_norm": 2.5319740772247314,
"kl": 0.59765625,
"learning_rate": 2.3369613771260005e-07,
"loss": 0.004871162120252848,
"memory(GiB)": 18.17,
"reward": 0.39162860810756683,
"reward_std": 0.022268068976700306,
"rewards/MCQ_Reward/mean": 0.39162860810756683,
"rewards/MCQ_Reward/std": 0.07392172142863274,
"step": 341,
"train_speed(iter/s)": 0.12225
},
{
"clip_ratio": 0.004840584937483072,
"epoch": 6.84,
"grad_norm": 2.547236204147339,
"kl": 0.60546875,
"learning_rate": 2.310157359111938e-07,
"loss": 0.004931057803332806,
"memory(GiB)": 18.17,
"step": 342,
"train_speed(iter/s)": 0.122534
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 201.5,
"completions/mean_length": 124.6328125,
"completions/min_length": 67.5,
"epoch": 6.86,
"grad_norm": 2.610426664352417,
"kl": 0.5419921875,
"learning_rate": 2.283461687567236e-07,
"loss": 0.012133005075156689,
"memory(GiB)": 18.17,
"reward": 0.38104377686977386,
"reward_std": 0.023476887494325638,
"rewards/MCQ_Reward/mean": 0.38104377686977386,
"rewards/MCQ_Reward/std": 0.13691367208957672,
"step": 343,
"train_speed(iter/s)": 0.122472
},
{
"clip_ratio": 0.005503881955519319,
"epoch": 6.88,
"grad_norm": 2.517308473587036,
"kl": 0.5458984375,
"learning_rate": 2.2568754377896515e-07,
"loss": 0.012206798419356346,
"memory(GiB)": 18.17,
"step": 344,
"train_speed(iter/s)": 0.122771
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 191.0,
"completions/mean_length": 122.8125,
"completions/min_length": 54.0,
"epoch": 6.9,
"grad_norm": 2.268815517425537,
"kl": 0.576171875,
"learning_rate": 2.2303996806694486e-07,
"loss": 0.005438795313239098,
"memory(GiB)": 18.17,
"reward": 0.41502565145492554,
"reward_std": 0.021418385207653046,
"rewards/MCQ_Reward/mean": 0.41502565145492554,
"rewards/MCQ_Reward/std": 0.09508999437093735,
"step": 345,
"train_speed(iter/s)": 0.122753
},
{
"clip_ratio": 0.005775286350399256,
"epoch": 6.92,
"grad_norm": 2.83811616897583,
"kl": 0.603515625,
"learning_rate": 2.2040354826462664e-07,
"loss": 0.005799311213195324,
"memory(GiB)": 18.17,
"step": 346,
"train_speed(iter/s)": 0.123049
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 164.5,
"completions/mean_length": 116.91796875,
"completions/min_length": 65.5,
"epoch": 6.9399999999999995,
"grad_norm": 2.334526777267456,
"kl": 0.564453125,
"learning_rate": 2.177783905666155e-07,
"loss": 0.0054929498583078384,
"memory(GiB)": 18.17,
"reward": 0.39654283225536346,
"reward_std": 0.022173049859702587,
"rewards/MCQ_Reward/mean": 0.39654283225536346,
"rewards/MCQ_Reward/std": 0.09505746513605118,
"step": 347,
"train_speed(iter/s)": 0.123026
},
{
"clip_ratio": 0.0045166281051933765,
"epoch": 6.96,
"grad_norm": 2.271827220916748,
"kl": 0.564453125,
"learning_rate": 2.151646007138806e-07,
"loss": 0.0055296882055699825,
"memory(GiB)": 18.17,
"step": 348,
"train_speed(iter/s)": 0.123296
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 219.0,
"completions/mean_length": 130.65625,
"completions/min_length": 77.5,
"epoch": 6.98,
"grad_norm": 2.0946249961853027,
"kl": 0.55859375,
"learning_rate": 2.125622839894964e-07,
"loss": 0.003636482171714306,
"memory(GiB)": 18.17,
"reward": 0.43836964666843414,
"reward_std": 0.021374424919486046,
"rewards/MCQ_Reward/mean": 0.43836964666843414,
"rewards/MCQ_Reward/std": 0.06100250408053398,
"step": 349,
"train_speed(iter/s)": 0.123225
},
{
"clip_ratio": 0.0046428050845861435,
"epoch": 7.0,
"grad_norm": 2.23724365234375,
"kl": 0.57421875,
"learning_rate": 2.0997154521440097e-07,
"loss": 0.004051330033689737,
"memory(GiB)": 18.17,
"step": 350,
"train_speed(iter/s)": 0.123516
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 188.0,
"completions/mean_length": 121.21484375,
"completions/min_length": 72.0,
"epoch": 7.02,
"grad_norm": 2.815627336502075,
"kl": 0.5703125,
"learning_rate": 2.0739248874317438e-07,
"loss": -0.019233888015151024,
"memory(GiB)": 18.17,
"reward": 0.4290418028831482,
"reward_std": 0.022210314869880676,
"rewards/MCQ_Reward/mean": 0.4290418028831482,
"rewards/MCQ_Reward/std": 0.06661852076649666,
"step": 351,
"train_speed(iter/s)": 0.123139
},
{
"clip_ratio": 0.00514651439152658,
"epoch": 7.04,
"grad_norm": 3.0636136531829834,
"kl": 0.576171875,
"learning_rate": 2.048252184598352e-07,
"loss": -0.01901531219482422,
"memory(GiB)": 18.17,
"step": 352,
"train_speed(iter/s)": 0.12342
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 169.0,
"completions/mean_length": 112.85546875,
"completions/min_length": 62.5,
"epoch": 7.06,
"grad_norm": 2.700939178466797,
"kl": 0.58203125,
"learning_rate": 2.0226983777365603e-07,
"loss": -0.007234710268676281,
"memory(GiB)": 18.17,
"reward": 0.43640220165252686,
"reward_std": 0.022726435214281082,
"rewards/MCQ_Reward/mean": 0.43640220165252686,
"rewards/MCQ_Reward/std": 0.08832718059420586,
"step": 353,
"train_speed(iter/s)": 0.123424
},
{
"clip_ratio": 0.00972440093755722,
"epoch": 7.08,
"grad_norm": 3.0179059505462646,
"kl": 0.564453125,
"learning_rate": 1.9972644961499853e-07,
"loss": -0.007274748291820288,
"memory(GiB)": 18.17,
"step": 354,
"train_speed(iter/s)": 0.123722
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.5,
"completions/mean_length": 115.6328125,
"completions/min_length": 68.0,
"epoch": 7.1,
"grad_norm": 2.484236240386963,
"kl": 0.619140625,
"learning_rate": 1.9719515643116674e-07,
"loss": 0.015900151804089546,
"memory(GiB)": 18.17,
"reward": 0.45114465057849884,
"reward_std": 0.024738659150898457,
"rewards/MCQ_Reward/mean": 0.45114465057849884,
"rewards/MCQ_Reward/std": 0.10900644585490227,
"step": 355,
"train_speed(iter/s)": 0.123607
},
{
"clip_ratio": 0.0064309455920010805,
"epoch": 7.12,
"grad_norm": 3.852499485015869,
"kl": 0.607421875,
"learning_rate": 1.9467606018228088e-07,
"loss": 0.01630295254290104,
"memory(GiB)": 18.17,
"step": 356,
"train_speed(iter/s)": 0.123891
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 195.0,
"completions/mean_length": 128.88671875,
"completions/min_length": 74.5,
"epoch": 7.14,
"grad_norm": 2.455781936645508,
"kl": 0.5478515625,
"learning_rate": 1.9216926233717084e-07,
"loss": -0.00730013195425272,
"memory(GiB)": 18.17,
"reward": 0.4758221060037613,
"reward_std": 0.024665928445756435,
"rewards/MCQ_Reward/mean": 0.4758221060037613,
"rewards/MCQ_Reward/std": 0.0809130035340786,
"step": 357,
"train_speed(iter/s)": 0.123852
},
{
"clip_ratio": 0.00344535568729043,
"epoch": 7.16,
"grad_norm": 2.2257754802703857,
"kl": 0.5576171875,
"learning_rate": 1.8967486386928817e-07,
"loss": -0.0074045369401574135,
"memory(GiB)": 18.17,
"step": 358,
"train_speed(iter/s)": 0.124151
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 217.0,
"completions/mean_length": 130.06640625,
"completions/min_length": 67.5,
"epoch": 7.18,
"grad_norm": 2.7154037952423096,
"kl": 0.51171875,
"learning_rate": 1.8719296525263923e-07,
"loss": 0.019313501194119453,
"memory(GiB)": 18.17,
"reward": 0.4561205357313156,
"reward_std": 0.023944508284330368,
"rewards/MCQ_Reward/mean": 0.4561205357313156,
"rewards/MCQ_Reward/std": 0.10000644996762276,
"step": 359,
"train_speed(iter/s)": 0.124074
},
{
"clip_ratio": 0.006082270760089159,
"epoch": 7.2,
"grad_norm": 2.114431381225586,
"kl": 0.5234375,
"learning_rate": 1.847236664577389e-07,
"loss": 0.01907144859433174,
"memory(GiB)": 18.17,
"step": 360,
"train_speed(iter/s)": 0.124368
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 223.5,
"completions/mean_length": 130.765625,
"completions/min_length": 79.0,
"epoch": 7.22,
"grad_norm": 2.2248895168304443,
"kl": 0.5390625,
"learning_rate": 1.8226706694758193e-07,
"loss": 0.012620393186807632,
"memory(GiB)": 18.17,
"reward": 0.44832468032836914,
"reward_std": 0.025768463499844074,
"rewards/MCQ_Reward/mean": 0.44832468032836914,
"rewards/MCQ_Reward/std": 0.09799568355083466,
"step": 361,
"train_speed(iter/s)": 0.123928
},
{
"clip_ratio": 0.006066091358661652,
"epoch": 7.24,
"grad_norm": 2.5757896900177,
"kl": 0.53515625,
"learning_rate": 1.7982326567363886e-07,
"loss": 0.013028541579842567,
"memory(GiB)": 18.17,
"step": 362,
"train_speed(iter/s)": 0.124219
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.5,
"completions/mean_length": 122.546875,
"completions/min_length": 50.5,
"epoch": 7.26,
"grad_norm": 2.2651302814483643,
"kl": 0.5322265625,
"learning_rate": 1.7739236107186857e-07,
"loss": 0.009481780230998993,
"memory(GiB)": 18.17,
"reward": 0.4318048655986786,
"reward_std": 0.022731643170118332,
"rewards/MCQ_Reward/mean": 0.4318048655986786,
"rewards/MCQ_Reward/std": 0.09833444282412529,
"step": 363,
"train_speed(iter/s)": 0.124163
},
{
"clip_ratio": 0.0038783656200394034,
"epoch": 7.28,
"grad_norm": 2.2316813468933105,
"kl": 0.5302734375,
"learning_rate": 1.7497445105875374e-07,
"loss": 0.009487325325608253,
"memory(GiB)": 18.17,
"step": 364,
"train_speed(iter/s)": 0.124456
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.5,
"completions/mean_length": 131.63671875,
"completions/min_length": 61.5,
"epoch": 7.3,
"grad_norm": 2.720024347305298,
"kl": 0.5517578125,
"learning_rate": 1.725696330273575e-07,
"loss": 0.0073198857717216015,
"memory(GiB)": 18.17,
"reward": 0.4407372921705246,
"reward_std": 0.019983571954071522,
"rewards/MCQ_Reward/mean": 0.4407372921705246,
"rewards/MCQ_Reward/std": 0.07775032892823219,
"step": 365,
"train_speed(iter/s)": 0.124298
},
{
"clip_ratio": 0.005759742809459567,
"epoch": 7.32,
"grad_norm": 2.4700775146484375,
"kl": 0.5556640625,
"learning_rate": 1.7017800384339924e-07,
"loss": 0.00751863420009613,
"memory(GiB)": 18.17,
"step": 366,
"train_speed(iter/s)": 0.124588
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 240.5,
"completions/mean_length": 122.73828125,
"completions/min_length": 64.5,
"epoch": 7.34,
"grad_norm": 2.3976547718048096,
"kl": 0.541015625,
"learning_rate": 1.6779965984135374e-07,
"loss": 0.015993405133485794,
"memory(GiB)": 18.17,
"reward": 0.41162461042404175,
"reward_std": 0.020391933619976044,
"rewards/MCQ_Reward/mean": 0.41162461042404175,
"rewards/MCQ_Reward/std": 0.0841926857829094,
"step": 367,
"train_speed(iter/s)": 0.124346
},
{
"clip_ratio": 0.005305928410962224,
"epoch": 7.36,
"grad_norm": 2.444512128829956,
"kl": 0.546875,
"learning_rate": 1.6543469682057104e-07,
"loss": 0.016359636560082436,
"memory(GiB)": 18.17,
"step": 368,
"train_speed(iter/s)": 0.124615
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 156.0,
"completions/mean_length": 113.90234375,
"completions/min_length": 68.5,
"epoch": 7.38,
"grad_norm": 3.490565299987793,
"kl": 0.57421875,
"learning_rate": 1.6308321004141607e-07,
"loss": -0.0010942098451778293,
"memory(GiB)": 18.17,
"reward": 0.38713136315345764,
"reward_std": 0.021422830410301685,
"rewards/MCQ_Reward/mean": 0.38713136315345764,
"rewards/MCQ_Reward/std": 0.10617586970329285,
"step": 369,
"train_speed(iter/s)": 0.124639
},
{
"clip_ratio": 0.005288022803142667,
"epoch": 7.4,
"grad_norm": 2.881525754928589,
"kl": 0.564453125,
"learning_rate": 1.6074529422143396e-07,
"loss": -0.0009173217695206404,
"memory(GiB)": 18.17,
"step": 370,
"train_speed(iter/s)": 0.124914
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 238.5,
"completions/mean_length": 139.4375,
"completions/min_length": 87.0,
"epoch": 7.42,
"grad_norm": 2.1569535732269287,
"kl": 0.49609375,
"learning_rate": 1.5842104353153285e-07,
"loss": 0.014979809522628784,
"memory(GiB)": 18.17,
"reward": 0.4273018389940262,
"reward_std": 0.02148488350212574,
"rewards/MCQ_Reward/mean": 0.4273018389940262,
"rewards/MCQ_Reward/std": 0.13347461819648743,
"step": 371,
"train_speed(iter/s)": 0.124503
},
{
"clip_ratio": 0.006136654410511255,
"epoch": 7.44,
"grad_norm": 2.3948974609375,
"kl": 0.486328125,
"learning_rate": 1.561105515921915e-07,
"loss": 0.015109008178114891,
"memory(GiB)": 18.17,
"step": 372,
"train_speed(iter/s)": 0.124788
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.5,
"completions/mean_length": 117.00390625,
"completions/min_length": 69.5,
"epoch": 7.46,
"grad_norm": 2.3135647773742676,
"kl": 0.669921875,
"learning_rate": 1.5381391146968863e-07,
"loss": 0.006555130705237389,
"memory(GiB)": 18.17,
"reward": 0.4488084018230438,
"reward_std": 0.02006101794540882,
"rewards/MCQ_Reward/mean": 0.4488084018230438,
"rewards/MCQ_Reward/std": 0.07920502312481403,
"step": 373,
"train_speed(iter/s)": 0.124722
},
{
"clip_ratio": 0.007013680646196008,
"epoch": 7.48,
"grad_norm": 2.962529420852661,
"kl": 0.642578125,
"learning_rate": 1.5153121567235333e-07,
"loss": 0.006604420021176338,
"memory(GiB)": 18.17,
"step": 374,
"train_speed(iter/s)": 0.125001
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 169.0,
"completions/mean_length": 107.60546875,
"completions/min_length": 53.5,
"epoch": 7.5,
"grad_norm": 2.731383800506592,
"kl": 0.576171875,
"learning_rate": 1.492625561468393e-07,
"loss": -0.005473949480801821,
"memory(GiB)": 18.17,
"reward": 0.41762372851371765,
"reward_std": 0.019964593462646008,
"rewards/MCQ_Reward/mean": 0.41762372851371765,
"rewards/MCQ_Reward/std": 0.08107879385352135,
"step": 375,
"train_speed(iter/s)": 0.124937
},
{
"clip_ratio": 0.004663396626710892,
"epoch": 7.52,
"grad_norm": 2.615187406539917,
"kl": 0.576171875,
"learning_rate": 1.4700802427442178e-07,
"loss": -0.005246948450803757,
"memory(GiB)": 18.17,
"step": 376,
"train_speed(iter/s)": 0.125201
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.5,
"completions/mean_length": 107.15625,
"completions/min_length": 50.5,
"epoch": 7.54,
"grad_norm": 2.796724557876587,
"kl": 0.640625,
"learning_rate": 1.4476771086731565e-07,
"loss": 0.01410718634724617,
"memory(GiB)": 18.17,
"reward": 0.4095290005207062,
"reward_std": 0.02420712448656559,
"rewards/MCQ_Reward/mean": 0.4095290005207062,
"rewards/MCQ_Reward/std": 0.07465272396802902,
"step": 377,
"train_speed(iter/s)": 0.125163
},
{
"clip_ratio": 0.006976983975619078,
"epoch": 7.5600000000000005,
"grad_norm": 2.945889711380005,
"kl": 0.66015625,
"learning_rate": 1.4254170616501827e-07,
"loss": 0.014726857654750347,
"memory(GiB)": 18.17,
"step": 378,
"train_speed(iter/s)": 0.125433
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 174.0,
"completions/mean_length": 118.50390625,
"completions/min_length": 63.5,
"epoch": 7.58,
"grad_norm": 2.9761271476745605,
"kl": 0.607421875,
"learning_rate": 1.4033009983067452e-07,
"loss": -0.004153972025960684,
"memory(GiB)": 18.17,
"reward": 0.42119112610816956,
"reward_std": 0.02067422866821289,
"rewards/MCQ_Reward/mean": 0.42119112610816956,
"rewards/MCQ_Reward/std": 0.0681285560131073,
"step": 379,
"train_speed(iter/s)": 0.125369
},
{
"clip_ratio": 0.0061764034908264875,
"epoch": 7.6,
"grad_norm": 3.6120944023132324,
"kl": 0.6171875,
"learning_rate": 1.381329809474649e-07,
"loss": -0.0035073161125183105,
"memory(GiB)": 18.17,
"step": 380,
"train_speed(iter/s)": 0.125649
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.0,
"completions/mean_length": 130.67578125,
"completions/min_length": 79.0,
"epoch": 7.62,
"grad_norm": 2.3507981300354004,
"kl": 0.5419921875,
"learning_rate": 1.3595043801501794e-07,
"loss": -0.0032176347449421883,
"memory(GiB)": 18.17,
"reward": 0.43415170907974243,
"reward_std": 0.021646766923367977,
"rewards/MCQ_Reward/mean": 0.43415170907974243,
"rewards/MCQ_Reward/std": 0.11485166102647781,
"step": 381,
"train_speed(iter/s)": 0.125308
},
{
"clip_ratio": 0.006046550814062357,
"epoch": 7.64,
"grad_norm": 2.5917809009552,
"kl": 0.541015625,
"learning_rate": 1.3378255894584462e-07,
"loss": -0.0032573172356933355,
"memory(GiB)": 18.17,
"step": 382,
"train_speed(iter/s)": 0.125575
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 591.0,
"completions/mean_length": 111.87109375,
"completions/min_length": 62.5,
"epoch": 7.66,
"grad_norm": 3.2898316383361816,
"kl": 0.84375,
"learning_rate": 1.3162943106179748e-07,
"loss": 0.05431316792964935,
"memory(GiB)": 25.14,
"reward": 0.4442131072282791,
"reward_std": 0.02893070410937071,
"rewards/MCQ_Reward/mean": 0.4442131072282791,
"rewards/MCQ_Reward/std": 0.0882490873336792,
"step": 383,
"train_speed(iter/s)": 0.124772
},
{
"clip_ratio": 0.005024469457566738,
"epoch": 7.68,
"grad_norm": 3.0035033226013184,
"kl": 0.82421875,
"learning_rate": 1.2949114109055414e-07,
"loss": 0.054804857820272446,
"memory(GiB)": 25.14,
"step": 384,
"train_speed(iter/s)": 0.125047
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 236.5,
"completions/mean_length": 125.80078125,
"completions/min_length": 67.0,
"epoch": 7.7,
"grad_norm": 2.8262860774993896,
"kl": 0.55078125,
"learning_rate": 1.2736777516212267e-07,
"loss": -0.006510823965072632,
"memory(GiB)": 25.14,
"reward": 0.40428027510643005,
"reward_std": 0.025332522578537464,
"rewards/MCQ_Reward/mean": 0.40428027510643005,
"rewards/MCQ_Reward/std": 0.10921913757920265,
"step": 385,
"train_speed(iter/s)": 0.124957
},
{
"clip_ratio": 0.005720158107578754,
"epoch": 7.72,
"grad_norm": 2.3165252208709717,
"kl": 0.54296875,
"learning_rate": 1.2525941880537304e-07,
"loss": -0.006398671306669712,
"memory(GiB)": 25.14,
"step": 386,
"train_speed(iter/s)": 0.125223
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 171.5,
"completions/mean_length": 115.546875,
"completions/min_length": 68.5,
"epoch": 7.74,
"grad_norm": 2.5941028594970703,
"kl": 0.650390625,
"learning_rate": 1.2316615694459186e-07,
"loss": 0.013789664953947067,
"memory(GiB)": 25.14,
"reward": 0.4454474151134491,
"reward_std": 0.02376528736203909,
"rewards/MCQ_Reward/mean": 0.4454474151134491,
"rewards/MCQ_Reward/std": 0.07124818488955498,
"step": 387,
"train_speed(iter/s)": 0.125174
},
{
"clip_ratio": 0.00573781062848866,
"epoch": 7.76,
"grad_norm": 2.886561393737793,
"kl": 0.634765625,
"learning_rate": 1.2108807389606158e-07,
"loss": 0.014278584159910679,
"memory(GiB)": 25.14,
"step": 388,
"train_speed(iter/s)": 0.125449
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 183.0,
"completions/mean_length": 121.00390625,
"completions/min_length": 57.5,
"epoch": 7.78,
"grad_norm": 2.2996103763580322,
"kl": 0.6171875,
"learning_rate": 1.1902525336466462e-07,
"loss": 0.012145346030592918,
"memory(GiB)": 25.14,
"reward": 0.42450854182243347,
"reward_std": 0.021244493313133717,
"rewards/MCQ_Reward/mean": 0.42450854182243347,
"rewards/MCQ_Reward/std": 0.09635130688548088,
"step": 389,
"train_speed(iter/s)": 0.125399
},
{
"clip_ratio": 0.005426776595413685,
"epoch": 7.8,
"grad_norm": 2.1788930892944336,
"kl": 0.62890625,
"learning_rate": 1.1697777844051104e-07,
"loss": 0.011829939670860767,
"memory(GiB)": 25.14,
"step": 390,
"train_speed(iter/s)": 0.125672
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 216.5,
"completions/mean_length": 129.03125,
"completions/min_length": 70.0,
"epoch": 7.82,
"grad_norm": 2.2412619590759277,
"kl": 0.53515625,
"learning_rate": 1.1494573159559212e-07,
"loss": 9.762030094861984e-05,
"memory(GiB)": 25.14,
"reward": 0.4155340790748596,
"reward_std": 0.020521354861557484,
"rewards/MCQ_Reward/mean": 0.4155340790748596,
"rewards/MCQ_Reward/std": 0.12795967236161232,
"step": 391,
"train_speed(iter/s)": 0.125325
},
{
"clip_ratio": 0.005442213034257293,
"epoch": 7.84,
"grad_norm": 2.445225954055786,
"kl": 0.54296875,
"learning_rate": 1.1292919468045875e-07,
"loss": 0.0006964541971683502,
"memory(GiB)": 25.14,
"step": 392,
"train_speed(iter/s)": 0.125594
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 180.5,
"completions/mean_length": 129.45703125,
"completions/min_length": 68.5,
"epoch": 7.86,
"grad_norm": 2.254128932952881,
"kl": 0.607421875,
"learning_rate": 1.1092824892092373e-07,
"loss": -0.010345934890210629,
"memory(GiB)": 25.14,
"reward": 0.40340456366539,
"reward_std": 0.022636689245700836,
"rewards/MCQ_Reward/mean": 0.40340456366539,
"rewards/MCQ_Reward/std": 0.09724823385477066,
"step": 393,
"train_speed(iter/s)": 0.125579
},
{
"clip_ratio": 0.004930965369567275,
"epoch": 7.88,
"grad_norm": 2.3455586433410645,
"kl": 0.623046875,
"learning_rate": 1.0894297491479043e-07,
"loss": -0.009814320132136345,
"memory(GiB)": 25.14,
"step": 394,
"train_speed(iter/s)": 0.125852
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 211.5,
"completions/mean_length": 122.03125,
"completions/min_length": 72.5,
"epoch": 7.9,
"grad_norm": 2.7601866722106934,
"kl": 0.54296875,
"learning_rate": 1.0697345262860635e-07,
"loss": 0.011853070929646492,
"memory(GiB)": 25.14,
"reward": 0.44544240832328796,
"reward_std": 0.02559925615787506,
"rewards/MCQ_Reward/mean": 0.44544240832328796,
"rewards/MCQ_Reward/std": 0.09495911747217178,
"step": 395,
"train_speed(iter/s)": 0.125762
},
{
"clip_ratio": 0.004873325582593679,
"epoch": 7.92,
"grad_norm": 3.1385254859924316,
"kl": 0.541015625,
"learning_rate": 1.0501976139444191e-07,
"loss": 0.01212891936302185,
"memory(GiB)": 25.14,
"step": 396,
"train_speed(iter/s)": 0.126021
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 186.0,
"completions/mean_length": 131.75,
"completions/min_length": 80.0,
"epoch": 7.9399999999999995,
"grad_norm": 2.280336380004883,
"kl": 0.59765625,
"learning_rate": 1.0308197990669537e-07,
"loss": -0.0006723229307681322,
"memory(GiB)": 25.14,
"reward": 0.3935137987136841,
"reward_std": 0.0229948153719306,
"rewards/MCQ_Reward/mean": 0.3935137987136841,
"rewards/MCQ_Reward/std": 0.09170003235340118,
"step": 397,
"train_speed(iter/s)": 0.125959
},
{
"clip_ratio": 0.009115117136389017,
"epoch": 7.96,
"grad_norm": 2.6576101779937744,
"kl": 0.623046875,
"learning_rate": 1.0116018621892236e-07,
"loss": -0.0008128315676003695,
"memory(GiB)": 25.14,
"step": 398,
"train_speed(iter/s)": 0.126231
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 195.5,
"completions/mean_length": 125.65625,
"completions/min_length": 67.0,
"epoch": 7.98,
"grad_norm": 2.7158310413360596,
"kl": 0.58203125,
"learning_rate": 9.92544577406923e-08,
"loss": 0.006697420962154865,
"memory(GiB)": 25.14,
"reward": 0.43207649886608124,
"reward_std": 0.02400553785264492,
"rewards/MCQ_Reward/mean": 0.43207649886608124,
"rewards/MCQ_Reward/std": 0.0867740847170353,
"step": 399,
"train_speed(iter/s)": 0.126178
},
{
"clip_ratio": 0.005927033722400665,
"epoch": 8.0,
"grad_norm": 2.416578769683838,
"kl": 0.580078125,
"learning_rate": 9.736487123447068e-08,
"loss": 0.006666385568678379,
"memory(GiB)": 25.14,
"step": 400,
"train_speed(iter/s)": 0.126428
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 210.0,
"completions/mean_length": 128.03515625,
"completions/min_length": 68.0,
"epoch": 8.02,
"grad_norm": 2.4625000953674316,
"kl": 0.55078125,
"learning_rate": 9.549150281252632e-08,
"loss": 0.019197747111320496,
"memory(GiB)": 25.14,
"reward": 0.41131871938705444,
"reward_std": 0.02179474849253893,
"rewards/MCQ_Reward/mean": 0.41131871938705444,
"rewards/MCQ_Reward/std": 0.0903569795191288,
"step": 401,
"train_speed(iter/s)": 0.12607
},
{
"clip_ratio": 0.004682507831603289,
"epoch": 8.04,
"grad_norm": 2.4578921794891357,
"kl": 0.556640625,
"learning_rate": 9.363442793386606e-08,
"loss": 0.019492177292704582,
"memory(GiB)": 25.14,
"step": 402,
"train_speed(iter/s)": 0.126333
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 179.5,
"completions/mean_length": 124.9453125,
"completions/min_length": 65.0,
"epoch": 8.06,
"grad_norm": 2.380934000015259,
"kl": 0.595703125,
"learning_rate": 9.179372140119524e-08,
"loss": 0.00032033398747444153,
"memory(GiB)": 25.14,
"reward": 0.45213624835014343,
"reward_std": 0.019670803099870682,
"rewards/MCQ_Reward/mean": 0.45213624835014343,
"rewards/MCQ_Reward/std": 0.05602107755839825,
"step": 403,
"train_speed(iter/s)": 0.126289
},
{
"clip_ratio": 0.005494384560734034,
"epoch": 8.08,
"grad_norm": 2.2825376987457275,
"kl": 0.59765625,
"learning_rate": 8.996945735790446e-08,
"loss": 0.00025699660181999207,
"memory(GiB)": 25.14,
"step": 404,
"train_speed(iter/s)": 0.126553
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 193.0,
"completions/mean_length": 113.30078125,
"completions/min_length": 66.0,
"epoch": 8.1,
"grad_norm": 2.4504525661468506,
"kl": 0.65234375,
"learning_rate": 8.816170928508365e-08,
"loss": 0.005521825514733791,
"memory(GiB)": 25.14,
"reward": 0.4200716018676758,
"reward_std": 0.02163711003959179,
"rewards/MCQ_Reward/mean": 0.4200716018676758,
"rewards/MCQ_Reward/std": 0.09177059680223465,
"step": 405,
"train_speed(iter/s)": 0.126487
},
{
"clip_ratio": 0.005122944712638855,
"epoch": 8.12,
"grad_norm": 2.5025854110717773,
"kl": 0.65234375,
"learning_rate": 8.637054999856147e-08,
"loss": 0.005893816705793142,
"memory(GiB)": 25.14,
"step": 406,
"train_speed(iter/s)": 0.126707
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 206.0,
"completions/mean_length": 131.84765625,
"completions/min_length": 84.0,
"epoch": 8.14,
"grad_norm": 2.2803900241851807,
"kl": 0.677734375,
"learning_rate": 8.459605164597267e-08,
"loss": 0.002506987191736698,
"memory(GiB)": 25.14,
"reward": 0.42351874709129333,
"reward_std": 0.019920101389288902,
"rewards/MCQ_Reward/mean": 0.42351874709129333,
"rewards/MCQ_Reward/std": 0.07087348401546478,
"step": 407,
"train_speed(iter/s)": 0.126629
},
{
"clip_ratio": 0.004146608873270452,
"epoch": 8.16,
"grad_norm": 2.197411060333252,
"kl": 0.693359375,
"learning_rate": 8.283828570385237e-08,
"loss": 0.0028184172697365284,
"memory(GiB)": 25.14,
"step": 408,
"train_speed(iter/s)": 0.126894
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 246.5,
"completions/mean_length": 126.35546875,
"completions/min_length": 55.0,
"epoch": 8.18,
"grad_norm": 3.133226156234741,
"kl": 0.54296875,
"learning_rate": 8.109732297475635e-08,
"loss": 0.003347148187458515,
"memory(GiB)": 25.14,
"reward": 0.4289032816886902,
"reward_std": 0.023678142577409744,
"rewards/MCQ_Reward/mean": 0.4289032816886902,
"rewards/MCQ_Reward/std": 0.08180082961916924,
"step": 409,
"train_speed(iter/s)": 0.126716
},
{
"clip_ratio": 0.004793429281562567,
"epoch": 8.2,
"grad_norm": 2.647909164428711,
"kl": 0.548828125,
"learning_rate": 7.937323358440934e-08,
"loss": 0.003219081088900566,
"memory(GiB)": 25.14,
"step": 410,
"train_speed(iter/s)": 0.126979
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 200.0,
"completions/mean_length": 120.65234375,
"completions/min_length": 66.0,
"epoch": 8.22,
"grad_norm": 2.844910144805908,
"kl": 1.08984375,
"learning_rate": 7.766608697888094e-08,
"loss": 0.00578346848487854,
"memory(GiB)": 25.14,
"reward": 0.40613003075122833,
"reward_std": 0.024234792217612267,
"rewards/MCQ_Reward/mean": 0.40613003075122833,
"rewards/MCQ_Reward/std": 0.10613492503762245,
"step": 411,
"train_speed(iter/s)": 0.126628
},
{
"clip_ratio": 0.008466396480798721,
"epoch": 8.24,
"grad_norm": 3.322730779647827,
"kl": 1.30859375,
"learning_rate": 7.597595192178702e-08,
"loss": 0.006200029980391264,
"memory(GiB)": 25.14,
"step": 412,
"train_speed(iter/s)": 0.126892
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 181.0,
"completions/mean_length": 120.59375,
"completions/min_length": 63.5,
"epoch": 8.26,
"grad_norm": 3.1121227741241455,
"kl": 0.57421875,
"learning_rate": 7.430289649152155e-08,
"loss": -0.005076010245829821,
"memory(GiB)": 25.14,
"reward": 0.4349597841501236,
"reward_std": 0.022311867214739323,
"rewards/MCQ_Reward/mean": 0.4349597841501236,
"rewards/MCQ_Reward/std": 0.0992676205933094,
"step": 413,
"train_speed(iter/s)": 0.126827
},
{
"clip_ratio": 0.005325015634298325,
"epoch": 8.28,
"grad_norm": 3.336932897567749,
"kl": 0.5859375,
"learning_rate": 7.264698807851327e-08,
"loss": -0.004951636306941509,
"memory(GiB)": 25.14,
"step": 414,
"train_speed(iter/s)": 0.127083
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.0,
"completions/mean_length": 122.34765625,
"completions/min_length": 80.0,
"epoch": 8.3,
"grad_norm": 2.32357120513916,
"kl": 0.576171875,
"learning_rate": 7.100829338251146e-08,
"loss": 0.010018033906817436,
"memory(GiB)": 25.14,
"reward": 0.46219733357429504,
"reward_std": 0.023064136505126953,
"rewards/MCQ_Reward/mean": 0.46219733357429504,
"rewards/MCQ_Reward/std": 0.10461203381419182,
"step": 415,
"train_speed(iter/s)": 0.127059
},
{
"clip_ratio": 0.004823329858481884,
"epoch": 8.32,
"grad_norm": 2.399235486984253,
"kl": 0.56640625,
"learning_rate": 6.938687840989971e-08,
"loss": 0.010338631458580494,
"memory(GiB)": 25.14,
"step": 416,
"train_speed(iter/s)": 0.127319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 205.0,
"completions/mean_length": 126.6484375,
"completions/min_length": 59.5,
"epoch": 8.34,
"grad_norm": 2.3096046447753906,
"kl": 0.59765625,
"learning_rate": 6.778280847103667e-08,
"loss": 0.007643429096788168,
"memory(GiB)": 25.14,
"reward": 0.45115791261196136,
"reward_std": 0.026236201636493206,
"rewards/MCQ_Reward/mean": 0.45115791261196136,
"rewards/MCQ_Reward/std": 0.07101332768797874,
"step": 417,
"train_speed(iter/s)": 0.127229
},
{
"clip_ratio": 0.00613890727981925,
"epoch": 8.36,
"grad_norm": 2.6392662525177,
"kl": 0.599609375,
"learning_rate": 6.619614817762536e-08,
"loss": 0.00813712365925312,
"memory(GiB)": 25.14,
"step": 418,
"train_speed(iter/s)": 0.127474
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 203.5,
"completions/mean_length": 128.6484375,
"completions/min_length": 70.5,
"epoch": 8.38,
"grad_norm": 2.6424126625061035,
"kl": 0.5546875,
"learning_rate": 6.462696144011148e-08,
"loss": 0.01095396839082241,
"memory(GiB)": 25.14,
"reward": 0.43093007802963257,
"reward_std": 0.021352089941501617,
"rewards/MCQ_Reward/mean": 0.43093007802963257,
"rewards/MCQ_Reward/std": 0.09322765283286572,
"step": 419,
"train_speed(iter/s)": 0.127401
},
{
"clip_ratio": 0.005334047833457589,
"epoch": 8.4,
"grad_norm": 2.514528751373291,
"kl": 0.560546875,
"learning_rate": 6.307531146510753e-08,
"loss": 0.011139345355331898,
"memory(GiB)": 25.14,
"step": 420,
"train_speed(iter/s)": 0.127655
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 198.0,
"completions/mean_length": 121.234375,
"completions/min_length": 61.5,
"epoch": 8.42,
"grad_norm": 2.6931869983673096,
"kl": 0.576171875,
"learning_rate": 6.154126075284855e-08,
"loss": -0.004434285219758749,
"memory(GiB)": 25.14,
"reward": 0.47386451065540314,
"reward_std": 0.02479046955704689,
"rewards/MCQ_Reward/mean": 0.47386451065540314,
"rewards/MCQ_Reward/std": 0.08362133055925369,
"step": 421,
"train_speed(iter/s)": 0.127304
},
{
"clip_ratio": 0.004985473584383726,
"epoch": 8.44,
"grad_norm": 2.623483896255493,
"kl": 0.5859375,
"learning_rate": 6.002487109467347e-08,
"loss": -0.004044556524604559,
"memory(GiB)": 25.14,
"step": 422,
"train_speed(iter/s)": 0.12756
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 167.0,
"completions/mean_length": 120.3359375,
"completions/min_length": 57.0,
"epoch": 8.46,
"grad_norm": 2.4557580947875977,
"kl": 0.54296875,
"learning_rate": 5.8526203570536504e-08,
"loss": -0.0014804373495280743,
"memory(GiB)": 25.14,
"reward": 0.38437609374523163,
"reward_std": 0.019576413556933403,
"rewards/MCQ_Reward/mean": 0.38437609374523163,
"rewards/MCQ_Reward/std": 0.08220572769641876,
"step": 423,
"train_speed(iter/s)": 0.12751
},
{
"clip_ratio": 0.005047354847192764,
"epoch": 8.48,
"grad_norm": 2.414680004119873,
"kl": 0.548828125,
"learning_rate": 5.70453185465472e-08,
"loss": -0.0010703507578000426,
"memory(GiB)": 25.14,
"step": 424,
"train_speed(iter/s)": 0.127763
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 171.0,
"completions/mean_length": 109.29296875,
"completions/min_length": 59.0,
"epoch": 8.5,
"grad_norm": 2.3690483570098877,
"kl": 0.59375,
"learning_rate": 5.5582275672538316e-08,
"loss": 0.0056993430480360985,
"memory(GiB)": 25.14,
"reward": 0.404767170548439,
"reward_std": 0.024388392455875874,
"rewards/MCQ_Reward/mean": 0.404767170548439,
"rewards/MCQ_Reward/std": 0.09245007485151291,
"step": 425,
"train_speed(iter/s)": 0.127734
},
{
"clip_ratio": 0.004816505592316389,
"epoch": 8.52,
"grad_norm": 2.3456268310546875,
"kl": 0.59765625,
"learning_rate": 5.4137133879663287e-08,
"loss": 0.005467045586556196,
"memory(GiB)": 25.14,
"step": 426,
"train_speed(iter/s)": 0.127977
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 259.5,
"completions/mean_length": 131.4375,
"completions/min_length": 65.5,
"epoch": 8.54,
"grad_norm": 2.3816792964935303,
"kl": 0.55078125,
"learning_rate": 5.270995137802314e-08,
"loss": 0.0031818237621337175,
"memory(GiB)": 25.14,
"reward": 0.38306334614753723,
"reward_std": 0.02167375199496746,
"rewards/MCQ_Reward/mean": 0.38306334614753723,
"rewards/MCQ_Reward/std": 0.12913303077220917,
"step": 427,
"train_speed(iter/s)": 0.12777
},
{
"clip_ratio": 0.005708938697353005,
"epoch": 8.56,
"grad_norm": 2.7459070682525635,
"kl": 0.560546875,
"learning_rate": 5.1300785654320886e-08,
"loss": 0.0036508457269519567,
"memory(GiB)": 25.14,
"step": 428,
"train_speed(iter/s)": 0.128012
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 216.5,
"completions/mean_length": 141.1796875,
"completions/min_length": 63.5,
"epoch": 8.58,
"grad_norm": 2.546011447906494,
"kl": 0.560546875,
"learning_rate": 4.9909693469546097e-08,
"loss": -0.0037225554697215557,
"memory(GiB)": 25.14,
"reward": 0.4553868919610977,
"reward_std": 0.024206943809986115,
"rewards/MCQ_Reward/mean": 0.4553868919610977,
"rewards/MCQ_Reward/std": 0.10913475230336189,
"step": 429,
"train_speed(iter/s)": 0.127896
},
{
"clip_ratio": 0.005615573842078447,
"epoch": 8.6,
"grad_norm": 2.4503653049468994,
"kl": 0.552734375,
"learning_rate": 4.853673085668947e-08,
"loss": -0.0035459164064377546,
"memory(GiB)": 25.14,
"step": 430,
"train_speed(iter/s)": 0.128133
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 170.5,
"completions/mean_length": 121.25,
"completions/min_length": 68.0,
"epoch": 8.62,
"grad_norm": 2.6130316257476807,
"kl": 0.560546875,
"learning_rate": 4.718195311848455e-08,
"loss": 0.006583400070667267,
"memory(GiB)": 25.14,
"reward": 0.4170517176389694,
"reward_std": 0.022290964610874653,
"rewards/MCQ_Reward/mean": 0.4170517176389694,
"rewards/MCQ_Reward/std": 0.10183962434530258,
"step": 431,
"train_speed(iter/s)": 0.12785
},
{
"clip_ratio": 0.0055829116608947515,
"epoch": 8.64,
"grad_norm": 2.6913576126098633,
"kl": 0.572265625,
"learning_rate": 4.5845414825181394e-08,
"loss": 0.006918736733496189,
"memory(GiB)": 25.14,
"step": 432,
"train_speed(iter/s)": 0.128096
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 185.5,
"completions/mean_length": 113.8046875,
"completions/min_length": 74.0,
"epoch": 8.66,
"grad_norm": 2.4241960048675537,
"kl": 0.6201171875,
"learning_rate": 4.452716981234744e-08,
"loss": 0.011290742084383965,
"memory(GiB)": 25.14,
"reward": 0.4250094145536423,
"reward_std": 0.022951221093535423,
"rewards/MCQ_Reward/mean": 0.4250094145536423,
"rewards/MCQ_Reward/std": 0.10084276273846626,
"step": 433,
"train_speed(iter/s)": 0.128069
},
{
"clip_ratio": 0.005609560292214155,
"epoch": 8.68,
"grad_norm": 2.5790963172912598,
"kl": 0.650390625,
"learning_rate": 4.322727117869951e-08,
"loss": 0.011948860250413418,
"memory(GiB)": 25.14,
"step": 434,
"train_speed(iter/s)": 0.128291
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 196.0,
"completions/mean_length": 126.3515625,
"completions/min_length": 83.5,
"epoch": 8.7,
"grad_norm": 2.430708885192871,
"kl": 0.5390625,
"learning_rate": 4.19457712839652e-08,
"loss": -0.008761925622820854,
"memory(GiB)": 25.14,
"reward": 0.43507225811481476,
"reward_std": 0.024821095168590546,
"rewards/MCQ_Reward/mean": 0.43507225811481476,
"rewards/MCQ_Reward/std": 0.10436990112066269,
"step": 435,
"train_speed(iter/s)": 0.128196
},
{
"clip_ratio": 0.004881069879047573,
"epoch": 8.72,
"grad_norm": 2.439311981201172,
"kl": 0.5400390625,
"learning_rate": 4.068272174677334e-08,
"loss": -0.00834021344780922,
"memory(GiB)": 25.14,
"step": 436,
"train_speed(iter/s)": 0.128446
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 169.0,
"completions/mean_length": 118.14453125,
"completions/min_length": 67.5,
"epoch": 8.74,
"grad_norm": 2.607220411300659,
"kl": 0.619140625,
"learning_rate": 3.9438173442575e-08,
"loss": 0.005073768552392721,
"memory(GiB)": 25.14,
"reward": 0.4522544592618942,
"reward_std": 0.024327417835593224,
"rewards/MCQ_Reward/mean": 0.4522544592618942,
"rewards/MCQ_Reward/std": 0.08557374030351639,
"step": 437,
"train_speed(iter/s)": 0.128414
},
{
"clip_ratio": 0.005367731209844351,
"epoch": 8.76,
"grad_norm": 2.472538709640503,
"kl": 0.626953125,
"learning_rate": 3.821217650159453e-08,
"loss": 0.005441693589091301,
"memory(GiB)": 25.14,
"step": 438,
"train_speed(iter/s)": 0.128664
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 177.0,
"completions/mean_length": 117.36328125,
"completions/min_length": 65.5,
"epoch": 8.78,
"grad_norm": 2.8752048015594482,
"kl": 0.62109375,
"learning_rate": 3.700478030680987e-08,
"loss": 0.001543362159281969,
"memory(GiB)": 25.14,
"reward": 0.44734521210193634,
"reward_std": 0.02054190542548895,
"rewards/MCQ_Reward/mean": 0.44734521210193634,
"rewards/MCQ_Reward/std": 0.09018547832965851,
"step": 439,
"train_speed(iter/s)": 0.128624
},
{
"clip_ratio": 0.006753503577783704,
"epoch": 8.8,
"grad_norm": 2.822502374649048,
"kl": 0.625,
"learning_rate": 3.581603349196371e-08,
"loss": 0.0017494899220764637,
"memory(GiB)": 25.14,
"step": 440,
"train_speed(iter/s)": 0.128861
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 217.5,
"completions/mean_length": 117.40234375,
"completions/min_length": 62.0,
"epoch": 8.82,
"grad_norm": 2.5104751586914062,
"kl": 0.59375,
"learning_rate": 3.464598393960449e-08,
"loss": -0.004553473554551601,
"memory(GiB)": 25.14,
"reward": 0.39943838119506836,
"reward_std": 0.023083772510290146,
"rewards/MCQ_Reward/mean": 0.39943838119506836,
"rewards/MCQ_Reward/std": 0.08860309049487114,
"step": 441,
"train_speed(iter/s)": 0.128489
},
{
"clip_ratio": 0.00470179901458323,
"epoch": 8.84,
"grad_norm": 2.480741500854492,
"kl": 0.58984375,
"learning_rate": 3.349467877915746e-08,
"loss": -0.004542327020317316,
"memory(GiB)": 25.14,
"step": 442,
"train_speed(iter/s)": 0.128733
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 254.0,
"completions/mean_length": 127.69140625,
"completions/min_length": 50.0,
"epoch": 8.86,
"grad_norm": 2.399143934249878,
"kl": 0.607421875,
"learning_rate": 3.23621643850267e-08,
"loss": -0.004238632973283529,
"memory(GiB)": 25.14,
"reward": 0.40998475253582,
"reward_std": 0.02201936673372984,
"rewards/MCQ_Reward/mean": 0.40998475253582,
"rewards/MCQ_Reward/std": 0.0800128486007452,
"step": 443,
"train_speed(iter/s)": 0.128561
},
{
"clip_ratio": 0.006211797473952174,
"epoch": 8.88,
"grad_norm": 2.5745253562927246,
"kl": 0.603515625,
"learning_rate": 3.124848637472688e-08,
"loss": -0.003581822384148836,
"memory(GiB)": 25.14,
"step": 444,
"train_speed(iter/s)": 0.128809
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 187.5,
"completions/mean_length": 128.41015625,
"completions/min_length": 71.0,
"epoch": 8.9,
"grad_norm": 2.989118814468384,
"kl": 0.6640625,
"learning_rate": 3.015368960704584e-08,
"loss": 0.0020642182789742947,
"memory(GiB)": 25.14,
"reward": 0.45626600086688995,
"reward_std": 0.022524941712617874,
"rewards/MCQ_Reward/mean": 0.45626600086688995,
"rewards/MCQ_Reward/std": 0.08293722942471504,
"step": 445,
"train_speed(iter/s)": 0.128751
},
{
"clip_ratio": 0.0053639879915863276,
"epoch": 8.92,
"grad_norm": 2.226865291595459,
"kl": 0.65234375,
"learning_rate": 2.907781818023769e-08,
"loss": 0.0022344959434121847,
"memory(GiB)": 25.14,
"step": 446,
"train_speed(iter/s)": 0.128997
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 198.5,
"completions/mean_length": 114.81640625,
"completions/min_length": 69.5,
"epoch": 8.94,
"grad_norm": 2.5736968517303467,
"kl": 0.626953125,
"learning_rate": 2.8020915430246706e-08,
"loss": 0.00543589424341917,
"memory(GiB)": 25.14,
"reward": 0.4480299800634384,
"reward_std": 0.021618574857711792,
"rewards/MCQ_Reward/mean": 0.4480299800634384,
"rewards/MCQ_Reward/std": 0.08090543001890182,
"step": 447,
"train_speed(iter/s)": 0.128968
},
{
"clip_ratio": 0.005519783589988947,
"epoch": 8.96,
"grad_norm": 2.7313241958618164,
"kl": 0.62890625,
"learning_rate": 2.69830239289614e-08,
"loss": 0.005457316525280476,
"memory(GiB)": 25.14,
"step": 448,
"train_speed(iter/s)": 0.12921
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 163.0,
"completions/mean_length": 114.08203125,
"completions/min_length": 69.5,
"epoch": 8.98,
"grad_norm": 3.3176426887512207,
"kl": 0.658203125,
"learning_rate": 2.596418548250029e-08,
"loss": -0.006901263725012541,
"memory(GiB)": 25.14,
"reward": 0.4552987068891525,
"reward_std": 0.02576339803636074,
"rewards/MCQ_Reward/mean": 0.4552987068891525,
"rewards/MCQ_Reward/std": 0.09829828701913357,
"step": 449,
"train_speed(iter/s)": 0.129186
},
{
"clip_ratio": 0.005895850248634815,
"epoch": 9.0,
"grad_norm": 3.1435494422912598,
"kl": 0.65625,
"learning_rate": 2.4964441129527335e-08,
"loss": -0.006242312025278807,
"memory(GiB)": 25.14,
"step": 450,
"train_speed(iter/s)": 0.129418
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.5,
"completions/mean_length": 107.38671875,
"completions/min_length": 61.0,
"epoch": 9.02,
"grad_norm": 2.6646904945373535,
"kl": 0.60546875,
"learning_rate": 2.3983831139599286e-08,
"loss": 0.006207154132425785,
"memory(GiB)": 25.14,
"reward": 0.39446285367012024,
"reward_std": 0.022946057841181755,
"rewards/MCQ_Reward/mean": 0.39446285367012024,
"rewards/MCQ_Reward/std": 0.1063094437122345,
"step": 451,
"train_speed(iter/s)": 0.129116
},
{
"clip_ratio": 0.005521278129890561,
"epoch": 9.04,
"grad_norm": 2.453953504562378,
"kl": 0.619140625,
"learning_rate": 2.3022395011543682e-08,
"loss": 0.006389847490936518,
"memory(GiB)": 25.14,
"step": 452,
"train_speed(iter/s)": 0.129358
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 210.5,
"completions/mean_length": 128.57421875,
"completions/min_length": 55.0,
"epoch": 9.06,
"grad_norm": 2.812540054321289,
"kl": 0.580078125,
"learning_rate": 2.208017147186736e-08,
"loss": -0.005320190917700529,
"memory(GiB)": 25.14,
"reward": 0.41816772520542145,
"reward_std": 0.023720718920230865,
"rewards/MCQ_Reward/mean": 0.41816772520542145,
"rewards/MCQ_Reward/std": 0.11730682849884033,
"step": 453,
"train_speed(iter/s)": 0.129235
},
{
"clip_ratio": 0.005719892680644989,
"epoch": 9.08,
"grad_norm": 2.8398780822753906,
"kl": 0.578125,
"learning_rate": 2.1157198473197413e-08,
"loss": -0.004547153599560261,
"memory(GiB)": 25.14,
"step": 454,
"train_speed(iter/s)": 0.129473
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 209.5,
"completions/mean_length": 121.10546875,
"completions/min_length": 61.0,
"epoch": 9.1,
"grad_norm": 2.6457087993621826,
"kl": 0.623046875,
"learning_rate": 2.025351319275137e-08,
"loss": 0.006458953022956848,
"memory(GiB)": 25.14,
"reward": 0.4360807240009308,
"reward_std": 0.023424276150763035,
"rewards/MCQ_Reward/mean": 0.4360807240009308,
"rewards/MCQ_Reward/std": 0.08403830602765083,
"step": 455,
"train_speed(iter/s)": 0.129418
},
{
"clip_ratio": 0.007413617800921202,
"epoch": 9.12,
"grad_norm": 3.019871473312378,
"kl": 0.615234375,
"learning_rate": 1.936915203084055e-08,
"loss": 0.007484931964427233,
"memory(GiB)": 25.14,
"step": 456,
"train_speed(iter/s)": 0.129657
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 180.5,
"completions/mean_length": 115.48828125,
"completions/min_length": 62.0,
"epoch": 9.14,
"grad_norm": 2.869127035140991,
"kl": 0.5703125,
"learning_rate": 1.8504150609403856e-08,
"loss": 0.002277131425216794,
"memory(GiB)": 25.14,
"reward": 0.42605504393577576,
"reward_std": 0.02147796005010605,
"rewards/MCQ_Reward/mean": 0.42605504393577576,
"rewards/MCQ_Reward/std": 0.09400845319032669,
"step": 457,
"train_speed(iter/s)": 0.129623
},
{
"clip_ratio": 0.00495463190600276,
"epoch": 9.16,
"grad_norm": 2.7837038040161133,
"kl": 0.564453125,
"learning_rate": 1.7658543770572186e-08,
"loss": 0.0023261206224560738,
"memory(GiB)": 25.14,
"step": 458,
"train_speed(iter/s)": 0.129859
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 226.5,
"completions/mean_length": 131.125,
"completions/min_length": 63.0,
"epoch": 9.18,
"grad_norm": 2.4485437870025635,
"kl": 0.564453125,
"learning_rate": 1.683236557526574e-08,
"loss": -0.001264197751879692,
"memory(GiB)": 25.14,
"reward": 0.43159276247024536,
"reward_std": 0.02392040565609932,
"rewards/MCQ_Reward/mean": 0.43159276247024536,
"rewards/MCQ_Reward/std": 0.10159046202898026,
"step": 459,
"train_speed(iter/s)": 0.129693
},
{
"clip_ratio": 0.004053628304973245,
"epoch": 9.2,
"grad_norm": 2.3056235313415527,
"kl": 0.5625,
"learning_rate": 1.6025649301821875e-08,
"loss": -0.000987461768090725,
"memory(GiB)": 25.14,
"step": 460,
"train_speed(iter/s)": 0.129933
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 181.5,
"completions/mean_length": 113.18359375,
"completions/min_length": 65.5,
"epoch": 9.22,
"grad_norm": 2.3913767337799072,
"kl": 0.544921875,
"learning_rate": 1.5238427444654367e-08,
"loss": 0.012515128590166569,
"memory(GiB)": 25.14,
"reward": 0.4141518771648407,
"reward_std": 0.019386641681194305,
"rewards/MCQ_Reward/mean": 0.4141518771648407,
"rewards/MCQ_Reward/std": 0.09657716751098633,
"step": 461,
"train_speed(iter/s)": 0.129665
},
{
"clip_ratio": 0.005686681717634201,
"epoch": 9.24,
"grad_norm": 2.5303232669830322,
"kl": 0.544921875,
"learning_rate": 1.4470731712944883e-08,
"loss": 0.013128566555678844,
"memory(GiB)": 25.14,
"step": 462,
"train_speed(iter/s)": 0.129891
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.0,
"completions/mean_length": 113.08984375,
"completions/min_length": 68.0,
"epoch": 9.26,
"grad_norm": 2.9452006816864014,
"kl": 0.578125,
"learning_rate": 1.3722593029365459e-08,
"loss": 0.01786494255065918,
"memory(GiB)": 25.14,
"reward": 0.4347621351480484,
"reward_std": 0.023103663697838783,
"rewards/MCQ_Reward/mean": 0.4347621351480484,
"rewards/MCQ_Reward/std": 0.10107803344726562,
"step": 463,
"train_speed(iter/s)": 0.129821
},
{
"clip_ratio": 0.004837532993406057,
"epoch": 9.28,
"grad_norm": 3.270838499069214,
"kl": 0.576171875,
"learning_rate": 1.2994041528833267e-08,
"loss": 0.01855536922812462,
"memory(GiB)": 25.14,
"step": 464,
"train_speed(iter/s)": 0.130055
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 197.0,
"completions/mean_length": 130.0703125,
"completions/min_length": 61.0,
"epoch": 9.3,
"grad_norm": 2.5287396907806396,
"kl": 0.5703125,
"learning_rate": 1.2285106557296476e-08,
"loss": -0.009716257452964783,
"memory(GiB)": 25.14,
"reward": 0.4242394268512726,
"reward_std": 0.024817454628646374,
"rewards/MCQ_Reward/mean": 0.4242394268512726,
"rewards/MCQ_Reward/std": 0.11753027141094208,
"step": 465,
"train_speed(iter/s)": 0.129996
},
{
"clip_ratio": 0.0049513031262904406,
"epoch": 9.32,
"grad_norm": 2.6941351890563965,
"kl": 0.56640625,
"learning_rate": 1.1595816670552428e-08,
"loss": -0.009578550234436989,
"memory(GiB)": 25.14,
"step": 466,
"train_speed(iter/s)": 0.130232
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 176.0,
"completions/mean_length": 120.80859375,
"completions/min_length": 79.0,
"epoch": 9.34,
"grad_norm": 2.4061837196350098,
"kl": 0.580078125,
"learning_rate": 1.0926199633097154e-08,
"loss": 0.009803004562854767,
"memory(GiB)": 25.14,
"reward": 0.4236748516559601,
"reward_std": 0.020633171312510967,
"rewards/MCQ_Reward/mean": 0.4236748516559601,
"rewards/MCQ_Reward/std": 0.10525783523917198,
"step": 467,
"train_speed(iter/s)": 0.130202
},
{
"clip_ratio": 0.0038570521865040064,
"epoch": 9.36,
"grad_norm": 2.538754463195801,
"kl": 0.576171875,
"learning_rate": 1.0276282417007399e-08,
"loss": 0.010506462305784225,
"memory(GiB)": 25.14,
"step": 468,
"train_speed(iter/s)": 0.130419
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 163.0,
"completions/mean_length": 115.359375,
"completions/min_length": 72.5,
"epoch": 9.38,
"grad_norm": 2.767404317855835,
"kl": 0.58203125,
"learning_rate": 9.646091200853801e-09,
"loss": 0.002447181846946478,
"memory(GiB)": 25.14,
"reward": 0.4558543264865875,
"reward_std": 0.023351009003818035,
"rewards/MCQ_Reward/mean": 0.4558543264865875,
"rewards/MCQ_Reward/std": 0.10045822337269783,
"step": 469,
"train_speed(iter/s)": 0.130376
},
{
"clip_ratio": 0.003978088265284896,
"epoch": 9.4,
"grad_norm": 2.3947746753692627,
"kl": 0.58984375,
"learning_rate": 9.035651368646646e-09,
"loss": 0.0025905624497681856,
"memory(GiB)": 25.14,
"step": 470,
"train_speed(iter/s)": 0.130609
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/mean_length": 120.05078125,
"completions/min_length": 61.0,
"epoch": 9.42,
"grad_norm": 2.2213082313537598,
"kl": 0.595703125,
"learning_rate": 8.44498750881345e-09,
"loss": 0.022836437448859215,
"memory(GiB)": 25.14,
"reward": 0.4252375066280365,
"reward_std": 0.02044745907187462,
"rewards/MCQ_Reward/mean": 0.4252375066280365,
"rewards/MCQ_Reward/std": 0.0874844454228878,
"step": 471,
"train_speed(iter/s)": 0.130308
},
{
"clip_ratio": 0.004947596346028149,
"epoch": 9.44,
"grad_norm": 2.374445676803589,
"kl": 0.599609375,
"learning_rate": 7.874123413208145e-09,
"loss": 0.02313510701060295,
"memory(GiB)": 25.14,
"step": 472,
"train_speed(iter/s)": 0.130541
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 202.5,
"completions/mean_length": 122.046875,
"completions/min_length": 59.0,
"epoch": 9.46,
"grad_norm": 2.6664299964904785,
"kl": 0.626953125,
"learning_rate": 7.323082076153508e-09,
"loss": 0.0047410172410309315,
"memory(GiB)": 25.14,
"reward": 0.42370498180389404,
"reward_std": 0.021436103619635105,
"rewards/MCQ_Reward/mean": 0.42370498180389404,
"rewards/MCQ_Reward/std": 0.11163535714149475,
"step": 473,
"train_speed(iter/s)": 0.130462
},
{
"clip_ratio": 0.005457588471472263,
"epoch": 9.48,
"grad_norm": 2.7726047039031982,
"kl": 0.626953125,
"learning_rate": 6.791885693514132e-09,
"loss": 0.005159153137356043,
"memory(GiB)": 25.14,
"step": 474,
"train_speed(iter/s)": 0.130692
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.5,
"completions/mean_length": 136.453125,
"completions/min_length": 83.0,
"epoch": 9.5,
"grad_norm": 2.2565746307373047,
"kl": 0.595703125,
"learning_rate": 6.280555661802856e-09,
"loss": 0.011247138492763042,
"memory(GiB)": 25.14,
"reward": 0.4296618103981018,
"reward_std": 0.021635888144373894,
"rewards/MCQ_Reward/mean": 0.4296618103981018,
"rewards/MCQ_Reward/std": 0.06789225153625011,
"step": 475,
"train_speed(iter/s)": 0.130512
},
{
"clip_ratio": 0.005767492577433586,
"epoch": 9.52,
"grad_norm": 2.250284433364868,
"kl": 0.6015625,
"learning_rate": 5.789112577318789e-09,
"loss": 0.011374367401003838,
"memory(GiB)": 25.14,
"step": 476,
"train_speed(iter/s)": 0.130746
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 185.5,
"completions/mean_length": 118.5703125,
"completions/min_length": 73.5,
"epoch": 9.54,
"grad_norm": 2.5178654193878174,
"kl": 0.728515625,
"learning_rate": 5.317576235317756e-09,
"loss": 0.007045174017548561,
"memory(GiB)": 25.14,
"reward": 0.44049952924251556,
"reward_std": 0.02334336470812559,
"rewards/MCQ_Reward/mean": 0.44049952924251556,
"rewards/MCQ_Reward/std": 0.0808117426931858,
"step": 477,
"train_speed(iter/s)": 0.130671
},
{
"clip_ratio": 0.004105736967176199,
"epoch": 9.56,
"grad_norm": 2.5065832138061523,
"kl": 0.6953125,
"learning_rate": 4.865965629214819e-09,
"loss": 0.007527303881943226,
"memory(GiB)": 25.14,
"step": 478,
"train_speed(iter/s)": 0.130887
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 191.0,
"completions/mean_length": 117.73046875,
"completions/min_length": 75.0,
"epoch": 9.58,
"grad_norm": 3.128554105758667,
"kl": 0.59765625,
"learning_rate": 4.434298949819448e-09,
"loss": -0.021542608737945557,
"memory(GiB)": 25.14,
"reward": 0.4070900082588196,
"reward_std": 0.023668975569307804,
"rewards/MCQ_Reward/mean": 0.4070900082588196,
"rewards/MCQ_Reward/std": 0.08471970073878765,
"step": 479,
"train_speed(iter/s)": 0.130803
},
{
"clip_ratio": 0.00539792119525373,
"epoch": 9.6,
"grad_norm": 3.067028045654297,
"kl": 0.59765625,
"learning_rate": 4.022593584602329e-09,
"loss": -0.02082860842347145,
"memory(GiB)": 25.14,
"step": 480,
"train_speed(iter/s)": 0.131034
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 251.0,
"completions/mean_length": 130.140625,
"completions/min_length": 54.0,
"epoch": 9.62,
"grad_norm": 2.8921902179718018,
"kl": 0.59375,
"learning_rate": 3.6308661169957565e-09,
"loss": -0.0016225441358983517,
"memory(GiB)": 25.14,
"reward": 0.42697805166244507,
"reward_std": 0.0217811968177557,
"rewards/MCQ_Reward/mean": 0.42697805166244507,
"rewards/MCQ_Reward/std": 0.0660354271531105,
"step": 481,
"train_speed(iter/s)": 0.130674
},
{
"clip_ratio": 0.007906233426183462,
"epoch": 9.64,
"grad_norm": 2.9274981021881104,
"kl": 0.595703125,
"learning_rate": 3.2591323257248894e-09,
"loss": -0.0016696015372872353,
"memory(GiB)": 25.14,
"step": 482,
"train_speed(iter/s)": 0.130879
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 222.0,
"completions/mean_length": 135.94921875,
"completions/min_length": 71.0,
"epoch": 9.66,
"grad_norm": 2.4433958530426025,
"kl": 0.5546875,
"learning_rate": 2.9074071841727054e-09,
"loss": 0.019563939422369003,
"memory(GiB)": 25.14,
"reward": 0.42691025137901306,
"reward_std": 0.020791654475033283,
"rewards/MCQ_Reward/mean": 0.42691025137901306,
"rewards/MCQ_Reward/std": 0.0828494131565094,
"step": 483,
"train_speed(iter/s)": 0.13078
},
{
"clip_ratio": 0.004861004883423448,
"epoch": 9.68,
"grad_norm": 2.2269864082336426,
"kl": 0.55859375,
"learning_rate": 2.5757048597765395e-09,
"loss": 0.019545655697584152,
"memory(GiB)": 25.14,
"step": 484,
"train_speed(iter/s)": 0.131008
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 199.0,
"completions/mean_length": 141.06640625,
"completions/min_length": 89.0,
"epoch": 9.7,
"grad_norm": 2.19620418548584,
"kl": 0.513671875,
"learning_rate": 2.2640387134577053e-09,
"loss": 0.010847845114767551,
"memory(GiB)": 25.14,
"reward": 0.42219071090221405,
"reward_std": 0.022757427766919136,
"rewards/MCQ_Reward/mean": 0.42219071090221405,
"rewards/MCQ_Reward/std": 0.0853536631911993,
"step": 485,
"train_speed(iter/s)": 0.130923
},
{
"clip_ratio": 0.006320674438029528,
"epoch": 9.72,
"grad_norm": 2.1190598011016846,
"kl": 0.5048828125,
"learning_rate": 1.9724212990830936e-09,
"loss": 0.010512834414839745,
"memory(GiB)": 25.14,
"step": 486,
"train_speed(iter/s)": 0.13115
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 180.5,
"completions/mean_length": 111.921875,
"completions/min_length": 63.5,
"epoch": 9.74,
"grad_norm": 2.5479891300201416,
"kl": 0.59765625,
"learning_rate": 1.7008643629596864e-09,
"loss": -0.008141995407640934,
"memory(GiB)": 25.14,
"reward": 0.41020119190216064,
"reward_std": 0.022871771827340126,
"rewards/MCQ_Reward/mean": 0.41020119190216064,
"rewards/MCQ_Reward/std": 0.10586465150117874,
"step": 487,
"train_speed(iter/s)": 0.131123
},
{
"clip_ratio": 0.004743925994262099,
"epoch": 9.76,
"grad_norm": 2.7629165649414062,
"kl": 0.591796875,
"learning_rate": 1.4493788433612708e-09,
"loss": -0.008076684549450874,
"memory(GiB)": 25.14,
"step": 488,
"train_speed(iter/s)": 0.131348
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 177.0,
"completions/mean_length": 116.10546875,
"completions/min_length": 67.0,
"epoch": 9.78,
"grad_norm": 2.770082950592041,
"kl": 0.576171875,
"learning_rate": 1.217974870087901e-09,
"loss": 0.010374639183282852,
"memory(GiB)": 25.14,
"reward": 0.47805055975914,
"reward_std": 0.023321266286075115,
"rewards/MCQ_Reward/mean": 0.47805055975914,
"rewards/MCQ_Reward/std": 0.1008174680173397,
"step": 489,
"train_speed(iter/s)": 0.131298
},
{
"clip_ratio": 0.005443725967779756,
"epoch": 9.8,
"grad_norm": 2.5658154487609863,
"kl": 0.583984375,
"learning_rate": 1.0066617640578368e-09,
"loss": 0.010389911010861397,
"memory(GiB)": 25.14,
"step": 490,
"train_speed(iter/s)": 0.131523
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 181.5,
"completions/mean_length": 128.69921875,
"completions/min_length": 71.5,
"epoch": 9.82,
"grad_norm": 2.3105576038360596,
"kl": 0.90625,
"learning_rate": 8.154480369321759e-10,
"loss": -0.004896960221230984,
"memory(GiB)": 25.14,
"reward": 0.43206796050071716,
"reward_std": 0.02110449317842722,
"rewards/MCQ_Reward/mean": 0.43206796050071716,
"rewards/MCQ_Reward/std": 0.10026764124631882,
"step": 491,
"train_speed(iter/s)": 0.13119
},
{
"clip_ratio": 0.004017886472865939,
"epoch": 9.84,
"grad_norm": 2.2543957233428955,
"kl": 0.892578125,
"learning_rate": 6.443413907720186e-10,
"loss": -0.004858216270804405,
"memory(GiB)": 25.14,
"step": 492,
"train_speed(iter/s)": 0.131415
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 221.5,
"completions/mean_length": 131.3203125,
"completions/min_length": 58.0,
"epoch": 9.86,
"grad_norm": 2.459817409515381,
"kl": 0.5390625,
"learning_rate": 4.933487177280482e-10,
"loss": 0.0025399066507816315,
"memory(GiB)": 25.14,
"reward": 0.47691330313682556,
"reward_std": 0.022764784283936024,
"rewards/MCQ_Reward/mean": 0.47691330313682556,
"rewards/MCQ_Reward/std": 0.09778410196304321,
"step": 493,
"train_speed(iter/s)": 0.131346
},
{
"clip_ratio": 0.004864038084633648,
"epoch": 9.88,
"grad_norm": 2.518949508666992,
"kl": 0.537109375,
"learning_rate": 3.6247609976319817e-10,
"loss": 0.0027223415672779083,
"memory(GiB)": 25.14,
"step": 494,
"train_speed(iter/s)": 0.131569
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 184.0,
"completions/mean_length": 113.7421875,
"completions/min_length": 57.5,
"epoch": 9.9,
"grad_norm": 2.7932207584381104,
"kl": 0.640625,
"learning_rate": 2.517288084074587e-10,
"loss": -0.008804459124803543,
"memory(GiB)": 25.14,
"reward": 0.45272429287433624,
"reward_std": 0.02382285613566637,
"rewards/MCQ_Reward/mean": 0.45272429287433624,
"rewards/MCQ_Reward/std": 0.08811983093619347,
"step": 495,
"train_speed(iter/s)": 0.13153
},
{
"clip_ratio": 0.005316317779943347,
"epoch": 9.92,
"grad_norm": 2.3468141555786133,
"kl": 0.634765625,
"learning_rate": 1.6111130454543597e-10,
"loss": -0.00884802732616663,
"memory(GiB)": 25.14,
"step": 496,
"train_speed(iter/s)": 0.131752
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 175.0,
"completions/mean_length": 111.8515625,
"completions/min_length": 57.5,
"epoch": 9.94,
"grad_norm": 2.973198413848877,
"kl": 0.642578125,
"learning_rate": 9.06272382371065e-11,
"loss": 0.002287194598466158,
"memory(GiB)": 25.14,
"reward": 0.4001469016075134,
"reward_std": 0.0235411636531353,
"rewards/MCQ_Reward/mean": 0.4001469016075134,
"rewards/MCQ_Reward/std": 0.07189228385686874,
"step": 497,
"train_speed(iter/s)": 0.131698
},
{
"clip_ratio": 0.0034996896283701062,
"epoch": 9.96,
"grad_norm": 3.0021812915802,
"kl": 0.6484375,
"learning_rate": 4.0279448570323946e-11,
"loss": 0.002919801976531744,
"memory(GiB)": 25.14,
"step": 498,
"train_speed(iter/s)": 0.131924
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 225.0,
"completions/mean_length": 135.265625,
"completions/min_length": 68.5,
"epoch": 9.98,
"grad_norm": 2.244234085083008,
"kl": 0.55078125,
"learning_rate": 1.0069963546743831e-11,
"loss": -0.0014414777979254723,
"memory(GiB)": 25.14,
"reward": 0.46473294496536255,
"reward_std": 0.02351410035043955,
"rewards/MCQ_Reward/mean": 0.46473294496536255,
"rewards/MCQ_Reward/std": 0.06907243467867374,
"step": 499,
"train_speed(iter/s)": 0.131777
},
{
"clip_ratio": 0.0020644072210416198,
"epoch": 10.0,
"grad_norm": 2.3687548637390137,
"kl": 0.55078125,
"learning_rate": 0.0,
"loss": -0.0014774189330637455,
"memory(GiB)": 25.14,
"step": 500,
"train_speed(iter/s)": 0.131993
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}