TBAC-VLR1-3B / trainer_state.json
oulinyu's picture
Upload folder using huggingface_hub
ed917e0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9554140127388535,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8057.0,
"completions/mean_length": 6127.464599609375,
"completions/min_length": 1222.5,
"epoch": 0.0031847133757961785,
"grad_norm": 0.4097345173358917,
"kl": 0.0001392364501953125,
"learning_rate": 2.5e-07,
"loss": -0.22166921198368073,
"memory(GiB)": 142.96,
"reward": 0.3214285969734192,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.3214285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4609040319919586,
"step": 1,
"train_speed(iter/s)": 0.002718
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8075.0,
"completions/mean_length": 6311.14306640625,
"completions/min_length": 1474.0,
"epoch": 0.006369426751592357,
"grad_norm": 0.19287629425525665,
"kl": 0.00012159347534179688,
"learning_rate": 5e-07,
"loss": -0.09268201887607574,
"memory(GiB)": 157.67,
"reward": 0.5357142984867096,
"reward_std": 0.25552501529455185,
"rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4739968925714493,
"step": 2,
"train_speed(iter/s)": 0.002969
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8049.5,
"completions/mean_length": 6904.018310546875,
"completions/min_length": 1641.0,
"epoch": 0.009554140127388535,
"grad_norm": 0.19940905272960663,
"kl": 4.482269287109375e-05,
"learning_rate": 7.5e-07,
"loss": -0.1511116474866867,
"memory(GiB)": 157.67,
"reward": 0.4464285969734192,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 3,
"train_speed(iter/s)": 0.003082
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6785714285714286,
"completions/max_length": 8045.0,
"completions/mean_length": 6924.089599609375,
"completions/min_length": 2193.5,
"epoch": 0.012738853503184714,
"grad_norm": 0.6029608845710754,
"kl": 0.00013637542724609375,
"learning_rate": 1e-06,
"loss": -0.41891923546791077,
"memory(GiB)": 157.67,
"reward": 0.2500000149011612,
"reward_std": 0.2253357544541359,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4389495849609375,
"step": 4,
"train_speed(iter/s)": 0.003138
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8054.0,
"completions/mean_length": 7148.3037109375,
"completions/min_length": 2388.5,
"epoch": 0.01592356687898089,
"grad_norm": 0.2799333333969116,
"kl": 0.00018405914306640625,
"learning_rate": 9.999743248701019e-07,
"loss": -0.09833915531635284,
"memory(GiB)": 157.67,
"reward": 0.3214285969734192,
"reward_std": 0.32695358991622925,
"rewards/AnswerTagAccuracyORM/mean": 0.3214285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4755948781967163,
"step": 5,
"train_speed(iter/s)": 0.003168
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7857142857142857,
"completions/max_length": 8034.5,
"completions/mean_length": 7279.303955078125,
"completions/min_length": 3276.0,
"epoch": 0.01910828025477707,
"grad_norm": 0.2061210572719574,
"kl": 0.00016450881958007812,
"learning_rate": 9.998973021172564e-07,
"loss": 0.03570747375488281,
"memory(GiB)": 157.75,
"reward": 0.3035714477300644,
"reward_std": 0.23086077719926834,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.46781930327415466,
"step": 6,
"train_speed(iter/s)": 0.003171
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7857142857142857,
"completions/max_length": 8047.5,
"completions/mean_length": 7143.1611328125,
"completions/min_length": 2512.5,
"epoch": 0.022292993630573247,
"grad_norm": 0.5545817017555237,
"kl": 9.918212890625e-05,
"learning_rate": 9.997689396517406e-07,
"loss": -0.36179664731025696,
"memory(GiB)": 157.86,
"reward": 0.2500000149011612,
"reward_std": 0.33800362795591354,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4389495849609375,
"step": 7,
"train_speed(iter/s)": 0.003182
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857142,
"completions/max_length": 8051.0,
"completions/mean_length": 6924.375244140625,
"completions/min_length": 1848.0,
"epoch": 0.025477707006369428,
"grad_norm": 0.3819968104362488,
"kl": 0.0002493858337402344,
"learning_rate": 9.99589250656446e-07,
"loss": -0.23172156512737274,
"memory(GiB)": 157.86,
"reward": 0.2678571566939354,
"reward_std": 0.21981073915958405,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571566939354,
"rewards/AnswerTagAccuracyORM/std": 0.426847904920578,
"step": 8,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8042.5,
"completions/mean_length": 6457.82177734375,
"completions/min_length": 1266.0,
"epoch": 0.028662420382165606,
"grad_norm": 0.4406679570674896,
"kl": 1.0362602143682022e-41,
"learning_rate": 9.993582535855263e-07,
"loss": -0.26652470231056213,
"memory(GiB)": 157.86,
"reward": 0.3928571492433548,
"reward_std": 0.2253357619047165,
"rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.4846093952655792,
"step": 9,
"train_speed(iter/s)": 0.003209
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8019.0,
"completions/mean_length": 6910.01806640625,
"completions/min_length": 1922.5,
"epoch": 0.03184713375796178,
"grad_norm": 0.22980190813541412,
"kl": 0.0003070831298828125,
"learning_rate": 9.990759721625005e-07,
"loss": 0.014422202482819557,
"memory(GiB)": 157.89,
"reward": 0.392857164144516,
"reward_std": 0.33800362050533295,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4959513247013092,
"step": 10,
"train_speed(iter/s)": 0.003216
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.8035714285714286,
"completions/max_length": 8046.0,
"completions/mean_length": 7162.714599609375,
"completions/min_length": 3054.0,
"epoch": 0.03503184713375796,
"grad_norm": 0.13719218969345093,
"kl": 5.929594451790463e-42,
"learning_rate": 9.98742435377817e-07,
"loss": -0.0598757378757,
"memory(GiB)": 158.35,
"reward": 0.2500000186264515,
"reward_std": 0.19514648616313934,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000186264515,
"rewards/AnswerTagAccuracyORM/std": 0.3831089437007904,
"step": 11,
"train_speed(iter/s)": 0.003205
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8062.0,
"completions/mean_length": 6935.589599609375,
"completions/min_length": 1498.0,
"epoch": 0.03821656050955414,
"grad_norm": 0.3652593195438385,
"kl": 5.194803875951948e-08,
"learning_rate": 9.983576774858775e-07,
"loss": -0.02829546295106411,
"memory(GiB)": 158.35,
"reward": 0.2857142984867096,
"reward_std": 0.3681928962469101,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.458276703953743,
"step": 12,
"train_speed(iter/s)": 0.0032
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857143,
"completions/max_length": 8041.0,
"completions/mean_length": 7132.393310546875,
"completions/min_length": 2462.0,
"epoch": 0.041401273885350316,
"grad_norm": 0.2656404376029968,
"kl": 0.00013446807861328125,
"learning_rate": 9.979217380015173e-07,
"loss": -0.09244758635759354,
"memory(GiB)": 158.35,
"reward": 0.2857142984867096,
"reward_std": 0.2253357619047165,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4600437134504318,
"step": 13,
"train_speed(iter/s)": 0.003197
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.8214285714285714,
"completions/max_length": 8044.5,
"completions/mean_length": 7196.393310546875,
"completions/min_length": 1879.0,
"epoch": 0.044585987261146494,
"grad_norm": 0.5126783847808838,
"kl": 0.00011968612670898438,
"learning_rate": 9.974346616959475e-07,
"loss": -0.23802340030670166,
"memory(GiB)": 158.35,
"reward": 0.1428571492433548,
"reward_std": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/mean": 0.1428571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.3524957150220871,
"step": 14,
"train_speed(iter/s)": 0.003188
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857143,
"completions/max_length": 8062.5,
"completions/mean_length": 7267.26806640625,
"completions/min_length": 2604.5,
"epoch": 0.04777070063694268,
"grad_norm": 0.6457058787345886,
"kl": 0.000148773193359375,
"learning_rate": 9.968964985921581e-07,
"loss": -0.41891902685165405,
"memory(GiB)": 158.35,
"reward": 0.3750000298023224,
"reward_std": 0.3324785977602005,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 15,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8051.5,
"completions/mean_length": 7219.57177734375,
"completions/min_length": 3177.0,
"epoch": 0.050955414012738856,
"grad_norm": 0.29857584834098816,
"kl": 0.0001468658447265625,
"learning_rate": 9.963073039597796e-07,
"loss": -0.1913096308708191,
"memory(GiB)": 158.35,
"reward": 0.3035714477300644,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.46781930327415466,
"step": 16,
"train_speed(iter/s)": 0.003187
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857142,
"completions/max_length": 8070.0,
"completions/mean_length": 7195.3037109375,
"completions/min_length": 2742.0,
"epoch": 0.054140127388535034,
"grad_norm": 0.5090013742446899,
"kl": 1.1901227857510671e-41,
"learning_rate": 9.956671383094068e-07,
"loss": -0.181706503033638,
"memory(GiB)": 158.35,
"reward": 0.2678571566939354,
"reward_std": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571566939354,
"rewards/AnswerTagAccuracyORM/std": 0.426847904920578,
"step": 17,
"train_speed(iter/s)": 0.003177
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6785714285714286,
"completions/max_length": 8023.5,
"completions/mean_length": 6882.964599609375,
"completions/min_length": 1486.5,
"epoch": 0.05732484076433121,
"grad_norm": 0.24539124965667725,
"kl": 0.0003590583801269531,
"learning_rate": 9.949760673863846e-07,
"loss": -0.04751904308795929,
"memory(GiB)": 158.35,
"reward": 0.3035714477300644,
"reward_std": 0.37371790409088135,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4469868242740631,
"step": 18,
"train_speed(iter/s)": 0.003182
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8032.0,
"completions/mean_length": 6654.82177734375,
"completions/min_length": 1637.0,
"epoch": 0.06050955414012739,
"grad_norm": 0.2233801782131195,
"kl": 0.0003261566162109375,
"learning_rate": 9.942341621640557e-07,
"loss": -0.11176574230194092,
"memory(GiB)": 158.35,
"reward": 0.446428582072258,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.5032612681388855,
"step": 19,
"train_speed(iter/s)": 0.003186
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8052.5,
"completions/mean_length": 6537.839599609375,
"completions/min_length": 1345.0,
"epoch": 0.06369426751592357,
"grad_norm": 0.2528194189071655,
"kl": 0.0002574920654296875,
"learning_rate": 9.934414988364722e-07,
"loss": -0.1300119161605835,
"memory(GiB)": 158.38,
"reward": 0.392857164144516,
"reward_std": 0.3078143745660782,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4744165241718292,
"step": 20,
"train_speed(iter/s)": 0.003187
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857143,
"completions/max_length": 8054.5,
"completions/mean_length": 7098.500244140625,
"completions/min_length": 2948.5,
"epoch": 0.06687898089171974,
"grad_norm": 0.35901573300361633,
"kl": 0.0002613067626953125,
"learning_rate": 9.925981588105694e-07,
"loss": 0.0739278644323349,
"memory(GiB)": 158.38,
"reward": 0.3571428656578064,
"reward_std": 0.3078143820166588,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4635152816772461,
"step": 21,
"train_speed(iter/s)": 0.003186
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8043.0,
"completions/mean_length": 6802.339599609375,
"completions/min_length": 2142.5,
"epoch": 0.07006369426751592,
"grad_norm": 0.31139233708381653,
"kl": 0.0002899169921875,
"learning_rate": 9.917042286978063e-07,
"loss": -0.23081684112548828,
"memory(GiB)": 158.38,
"reward": 0.3392857313156128,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4786955863237381,
"step": 22,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8046.5,
"completions/mean_length": 6078.089599609375,
"completions/min_length": 1577.5,
"epoch": 0.0732484076433121,
"grad_norm": 0.3595212697982788,
"kl": 0.06200312077999115,
"learning_rate": 9.9075980030527e-07,
"loss": -0.26652395725250244,
"memory(GiB)": 158.38,
"reward": 0.6250000298023224,
"reward_std": 0.30228935182094574,
"rewards/AnswerTagAccuracyORM/mean": 0.6250000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.4750668406486511,
"step": 23,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8046.0,
"completions/mean_length": 6813.75048828125,
"completions/min_length": 1453.0,
"epoch": 0.07643312101910828,
"grad_norm": 0.3403284549713135,
"kl": 0.0003566741943359375,
"learning_rate": 9.897649706262473e-07,
"loss": -0.11760014295578003,
"memory(GiB)": 158.38,
"reward": 0.3035714477300644,
"reward_std": 0.31333938241004944,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4469868242740631,
"step": 24,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8073.5,
"completions/mean_length": 6462.9111328125,
"completions/min_length": 474.5,
"epoch": 0.07961783439490445,
"grad_norm": 0.5593795776367188,
"kl": 25.92991018295288,
"learning_rate": 9.887198418302628e-07,
"loss": -0.2458166778087616,
"memory(GiB)": 158.38,
"reward": 0.4285714477300644,
"reward_std": 0.3681928962469101,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.502610981464386,
"step": 25,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7678571428571428,
"completions/max_length": 8025.0,
"completions/mean_length": 7177.714599609375,
"completions/min_length": 1826.5,
"epoch": 0.08280254777070063,
"grad_norm": 0.3172476291656494,
"kl": 0.0003204345703125,
"learning_rate": 9.87624521252587e-07,
"loss": 0.09654846042394638,
"memory(GiB)": 158.38,
"reward": 0.267857164144516,
"reward_std": 0.3214285969734192,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.40946151316165924,
"step": 26,
"train_speed(iter/s)": 0.003186
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 8030.5,
"completions/mean_length": 7575.19677734375,
"completions/min_length": 5771.5,
"epoch": 0.08598726114649681,
"grad_norm": 0.24004344642162323,
"kl": 0.0003147125244140625,
"learning_rate": 9.864791213832125e-07,
"loss": -0.0921306237578392,
"memory(GiB)": 158.38,
"reward": 0.1964285857975483,
"reward_std": 0.23086078464984894,
"rewards/AnswerTagAccuracyORM/mean": 0.1964285857975483,
"rewards/AnswerTagAccuracyORM/std": 0.36893007159233093,
"step": 27,
"train_speed(iter/s)": 0.003183
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7321428571428571,
"completions/max_length": 8037.0,
"completions/mean_length": 7293.5361328125,
"completions/min_length": 3104.0,
"epoch": 0.08917197452229299,
"grad_norm": 0.46742865443229675,
"kl": 0.0008754730224609375,
"learning_rate": 9.852837598553008e-07,
"loss": -0.4069012999534607,
"memory(GiB)": 158.38,
"reward": 0.2142857313156128,
"reward_std": 0.27762509882450104,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.417855441570282,
"step": 28,
"train_speed(iter/s)": 0.003179
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8046.5,
"completions/mean_length": 6678.232421875,
"completions/min_length": 2332.5,
"epoch": 0.09235668789808917,
"grad_norm": 0.4905704855918884,
"kl": 1.6553538759069064e-41,
"learning_rate": 9.840385594331022e-07,
"loss": -0.22333285212516785,
"memory(GiB)": 169.42,
"reward": 0.2321428656578064,
"reward_std": 0.37371791899204254,
"rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.429407000541687,
"step": 29,
"train_speed(iter/s)": 0.003179
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8048.5,
"completions/mean_length": 6784.232421875,
"completions/min_length": 1839.5,
"epoch": 0.09554140127388536,
"grad_norm": 0.2499741017818451,
"kl": 0.0011577606201171875,
"learning_rate": 9.827436479993468e-07,
"loss": -0.10406609624624252,
"memory(GiB)": 169.42,
"reward": 0.3035714477300644,
"reward_std": 0.3324786201119423,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4576014578342438,
"step": 30,
"train_speed(iter/s)": 0.00318
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 8053.5,
"completions/mean_length": 7322.607666015625,
"completions/min_length": 3317.5,
"epoch": 0.09872611464968153,
"grad_norm": 0.22784624993801117,
"kl": 0.0009670257568359375,
"learning_rate": 9.813991585421116e-07,
"loss": -0.1821945756673813,
"memory(GiB)": 169.42,
"reward": 0.25,
"reward_std": 0.23638580739498138,
"rewards/AnswerTagAccuracyORM/mean": 0.25,
"rewards/AnswerTagAccuracyORM/std": 0.44095855951309204,
"step": 31,
"train_speed(iter/s)": 0.003176
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7321428571428572,
"completions/max_length": 8045.5,
"completions/mean_length": 7113.89306640625,
"completions/min_length": 2601.0,
"epoch": 0.10191082802547771,
"grad_norm": 0.27732953429222107,
"kl": 0.000583648681640625,
"learning_rate": 9.800052291411629e-07,
"loss": -0.1299564391374588,
"memory(GiB)": 169.42,
"reward": 0.2678571492433548,
"reward_std": 0.23086076974868774,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050112903118134,
"step": 32,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857143,
"completions/max_length": 8044.0,
"completions/mean_length": 6726.625244140625,
"completions/min_length": 1058.0,
"epoch": 0.10509554140127389,
"grad_norm": 0.4480609595775604,
"kl": 0.000911712646484375,
"learning_rate": 9.78562002953774e-07,
"loss": -0.21174360811710358,
"memory(GiB)": 169.42,
"reward": 0.2321428656578064,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.425032377243042,
"step": 33,
"train_speed(iter/s)": 0.003163
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8025.0,
"completions/mean_length": 6545.64306640625,
"completions/min_length": 1659.0,
"epoch": 0.10828025477707007,
"grad_norm": 0.2851637005805969,
"kl": 0.00090789794921875,
"learning_rate": 9.770696282000244e-07,
"loss": -0.11861994117498398,
"memory(GiB)": 169.42,
"reward": 0.3928571492433548,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.4846093952655792,
"step": 34,
"train_speed(iter/s)": 0.003162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8052.0,
"completions/mean_length": 6577.803955078125,
"completions/min_length": 1984.5,
"epoch": 0.11146496815286625,
"grad_norm": 0.32200559973716736,
"kl": 0.00152587890625,
"learning_rate": 9.755282581475767e-07,
"loss": -0.16413375735282898,
"memory(GiB)": 169.42,
"reward": 0.4464285969734192,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 35,
"train_speed(iter/s)": 0.003164
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8069.0,
"completions/mean_length": 6215.32177734375,
"completions/min_length": 921.0,
"epoch": 0.11464968152866242,
"grad_norm": 0.3323407471179962,
"kl": 0.00124359130859375,
"learning_rate": 9.739380510959364e-07,
"loss": -0.22452522814273834,
"memory(GiB)": 169.42,
"reward": 0.5000000298023224,
"reward_std": 0.33800363540649414,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.44095855951309204,
"step": 36,
"train_speed(iter/s)": 0.003167
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.8035714285714286,
"completions/max_length": 8056.0,
"completions/mean_length": 7640.928955078125,
"completions/min_length": 4844.0,
"epoch": 0.1178343949044586,
"grad_norm": 0.22709085047245026,
"kl": 9.104936771950499e-42,
"learning_rate": 9.722991703601935e-07,
"loss": -0.06687425076961517,
"memory(GiB)": 169.42,
"reward": 0.160714291036129,
"reward_std": 0.23086076974868774,
"rewards/AnswerTagAccuracyORM/mean": 0.160714291036129,
"rewards/AnswerTagAccuracyORM/std": 0.3731846660375595,
"step": 37,
"train_speed(iter/s)": 0.003163
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8069.0,
"completions/mean_length": 6600.446533203125,
"completions/min_length": 2290.0,
"epoch": 0.12101910828025478,
"grad_norm": 0.4062687158584595,
"kl": 0.002471923828125,
"learning_rate": 9.706117842542516e-07,
"loss": -0.22931034862995148,
"memory(GiB)": 169.42,
"reward": 0.3571428656578064,
"reward_std": 0.33800362050533295,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.48795004189014435,
"step": 38,
"train_speed(iter/s)": 0.003164
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7678571428571428,
"completions/max_length": 8054.5,
"completions/mean_length": 7287.62548828125,
"completions/min_length": 2747.5,
"epoch": 0.12420382165605096,
"grad_norm": 0.306149959564209,
"kl": 0.000904083251953125,
"learning_rate": 9.688760660735402e-07,
"loss": -0.11831033229827881,
"memory(GiB)": 169.42,
"reward": 0.1785714328289032,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.1785714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.39002102613449097,
"step": 39,
"train_speed(iter/s)": 0.003162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7857142857142858,
"completions/max_length": 8063.5,
"completions/mean_length": 7219.143310546875,
"completions/min_length": 3600.0,
"epoch": 0.12738853503184713,
"grad_norm": 0.29684150218963623,
"kl": 0.000675201416015625,
"learning_rate": 9.670921940772186e-07,
"loss": -0.16423112154006958,
"memory(GiB)": 169.42,
"reward": 0.3035714477300644,
"reward_std": 0.2721000760793686,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4469868242740631,
"step": 40,
"train_speed(iter/s)": 0.003158
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8044.5,
"completions/mean_length": 5554.14306640625,
"completions/min_length": 769.5,
"epoch": 0.1305732484076433,
"grad_norm": 0.34710463881492615,
"kl": 0.00290679931640625,
"learning_rate": 9.652603514698673e-07,
"loss": -0.11703464388847351,
"memory(GiB)": 169.48,
"reward": 0.4642857313156128,
"reward_std": 0.379242941737175,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.5065638720989227,
"step": 41,
"train_speed(iter/s)": 0.003157
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.23214285714285715,
"completions/max_length": 8039.5,
"completions/mean_length": 4597.910888671875,
"completions/min_length": 949.0,
"epoch": 0.1337579617834395,
"grad_norm": 0.33481189608573914,
"kl": 0.00357818603515625,
"learning_rate": 9.633807263826744e-07,
"loss": -0.02082175388932228,
"memory(GiB)": 169.48,
"reward": 0.5892857313156128,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.5892857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4839591085910797,
"step": 42,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 7991.5,
"completions/mean_length": 6022.285888671875,
"completions/min_length": 1342.0,
"epoch": 0.13694267515923567,
"grad_norm": 0.321031391620636,
"kl": 0.003082275390625,
"learning_rate": 9.614535118541125e-07,
"loss": -0.06112157553434372,
"memory(GiB)": 169.48,
"reward": 0.3035714477300644,
"reward_std": 0.14838216826319695,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.46781930327415466,
"step": 43,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 8059.5,
"completions/mean_length": 7382.285888671875,
"completions/min_length": 3295.0,
"epoch": 0.14012738853503184,
"grad_norm": 0.33003029227256775,
"kl": 0.00177764892578125,
"learning_rate": 9.594789058101153e-07,
"loss": -0.22145554423332214,
"memory(GiB)": 169.48,
"reward": 0.3035714328289032,
"reward_std": 0.3324786126613617,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4644543081521988,
"step": 44,
"train_speed(iter/s)": 0.003155
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.48214285714285715,
"completions/max_length": 8033.0,
"completions/mean_length": 5761.893310546875,
"completions/min_length": 988.5,
"epoch": 0.14331210191082802,
"grad_norm": 0.3568911552429199,
"kl": 0.00131988525390625,
"learning_rate": 9.574571110437496e-07,
"loss": -0.1308659315109253,
"memory(GiB)": 169.48,
"reward": 0.535714328289032,
"reward_std": 0.19514648616313934,
"rewards/AnswerTagAccuracyORM/mean": 0.535714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.5078744888305664,
"step": 45,
"train_speed(iter/s)": 0.003156
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7857142857142858,
"completions/max_length": 8067.0,
"completions/mean_length": 7363.285888671875,
"completions/min_length": 3301.0,
"epoch": 0.1464968152866242,
"grad_norm": 0.4601195454597473,
"kl": 0.0016326904296875,
"learning_rate": 9.55388335194388e-07,
"loss": -0.4354901909828186,
"memory(GiB)": 169.48,
"reward": 0.3571428656578064,
"reward_std": 0.40943221747875214,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4744165241718292,
"step": 46,
"train_speed(iter/s)": 0.003157
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8056.5,
"completions/mean_length": 6383.57177734375,
"completions/min_length": 1777.5,
"epoch": 0.14968152866242038,
"grad_norm": 0.32571953535079956,
"kl": 0.00157928466796875,
"learning_rate": 9.532727907263859e-07,
"loss": -0.08489307761192322,
"memory(GiB)": 169.48,
"reward": 0.2500000149011612,
"reward_std": 0.32695360481739044,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4061589241027832,
"step": 47,
"train_speed(iter/s)": 0.003158
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8029.0,
"completions/mean_length": 6672.178955078125,
"completions/min_length": 1226.0,
"epoch": 0.15286624203821655,
"grad_norm": 0.2804900109767914,
"kl": 0.001705169677734375,
"learning_rate": 9.511106949072587e-07,
"loss": -0.07148804515600204,
"memory(GiB)": 169.48,
"reward": 0.4285714328289032,
"reward_std": 0.2142857238650322,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4985625892877579,
"step": 48,
"train_speed(iter/s)": 0.003156
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857142,
"completions/max_length": 8053.5,
"completions/mean_length": 6692.678955078125,
"completions/min_length": 2195.5,
"epoch": 0.15605095541401273,
"grad_norm": 0.43050575256347656,
"kl": 0.00308990478515625,
"learning_rate": 9.489022697853708e-07,
"loss": -0.3393845856189728,
"memory(GiB)": 169.48,
"reward": 0.3750000298023224,
"reward_std": 0.3324786126613617,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.4897737354040146,
"step": 49,
"train_speed(iter/s)": 0.003156
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142857,
"completions/max_length": 8043.0,
"completions/mean_length": 6784.5537109375,
"completions/min_length": 2953.5,
"epoch": 0.1592356687898089,
"grad_norm": 0.3050151765346527,
"kl": 0.00262451171875,
"learning_rate": 9.466477421671295e-07,
"loss": -0.11103521287441254,
"memory(GiB)": 169.48,
"reward": 0.23214287497103214,
"reward_std": 0.1785714365541935,
"rewards/AnswerTagAccuracyORM/mean": 0.23214287497103214,
"rewards/AnswerTagAccuracyORM/std": 0.34646742790937424,
"step": 50,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.8035714285714286,
"completions/max_length": 8053.0,
"completions/mean_length": 7592.14306640625,
"completions/min_length": 4495.5,
"epoch": 0.1624203821656051,
"grad_norm": 0.2979692220687866,
"kl": 9.197422470595937e-42,
"learning_rate": 9.443473435936927e-07,
"loss": -0.14950904250144958,
"memory(GiB)": 169.48,
"reward": 0.1428571492433548,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.1428571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.35634833574295044,
"step": 51,
"train_speed(iter/s)": 0.003147
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8060.5,
"completions/mean_length": 6415.9111328125,
"completions/min_length": 1379.5,
"epoch": 0.16560509554140126,
"grad_norm": 0.19361363351345062,
"kl": 965.6489562988281,
"learning_rate": 9.420013103171891e-07,
"loss": -0.16895829141139984,
"memory(GiB)": 169.48,
"reward": 0.4107143133878708,
"reward_std": 0.14838215708732605,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 52,
"train_speed(iter/s)": 0.003141
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8066.5,
"completions/mean_length": 6355.839599609375,
"completions/min_length": 1968.0,
"epoch": 0.16878980891719744,
"grad_norm": 0.3892778158187866,
"kl": 0.0029144287109375,
"learning_rate": 9.396098832764554e-07,
"loss": -0.2460786998271942,
"memory(GiB)": 169.48,
"reward": 0.3392857313156128,
"reward_std": 0.30228935182094574,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 53,
"train_speed(iter/s)": 0.003138
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8032.5,
"completions/mean_length": 6231.6787109375,
"completions/min_length": 1345.0,
"epoch": 0.17197452229299362,
"grad_norm": 0.5784780383110046,
"kl": 0.00372314453125,
"learning_rate": 9.37173308072291e-07,
"loss": -0.3872688412666321,
"memory(GiB)": 175.0,
"reward": 0.3571428805589676,
"reward_std": 0.25552503019571304,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48199816048145294,
"step": 54,
"train_speed(iter/s)": 0.003136
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8043.5,
"completions/mean_length": 5922.51806640625,
"completions/min_length": 920.5,
"epoch": 0.1751592356687898,
"grad_norm": 0.2570510804653168,
"kl": 0.0035492679744493216,
"learning_rate": 9.346918349422355e-07,
"loss": -0.05529748648405075,
"memory(GiB)": 175.0,
"reward": 0.3750000149011612,
"reward_std": 0.2006715089082718,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4469868391752243,
"step": 55,
"train_speed(iter/s)": 0.003134
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8032.0,
"completions/mean_length": 6564.035888671875,
"completions/min_length": 1182.0,
"epoch": 0.17834394904458598,
"grad_norm": 0.526690661907196,
"kl": 0.0024261474609375,
"learning_rate": 9.321657187348688e-07,
"loss": -0.20846779644489288,
"memory(GiB)": 175.0,
"reward": 0.321428582072258,
"reward_std": 0.3681928962469101,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4489477872848511,
"step": 56,
"train_speed(iter/s)": 0.003129
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7857142857142857,
"completions/max_length": 8056.0,
"completions/mean_length": 7038.2861328125,
"completions/min_length": 4610.0,
"epoch": 0.18152866242038215,
"grad_norm": 0.15928295254707336,
"kl": 0.0027923583984375,
"learning_rate": 9.295952188836378e-07,
"loss": 0.021755440160632133,
"memory(GiB)": 175.0,
"reward": 0.1250000074505806,
"reward_std": 0.14838216453790665,
"rewards/AnswerTagAccuracyORM/mean": 0.1250000074505806,
"rewards/AnswerTagAccuracyORM/std": 0.33565935492515564,
"step": 57,
"train_speed(iter/s)": 0.003121
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8024.0,
"completions/mean_length": 6775.214599609375,
"completions/min_length": 1785.0,
"epoch": 0.18471337579617833,
"grad_norm": 0.5611134767532349,
"kl": 7.1264767029788345e-06,
"learning_rate": 9.269805993802128e-07,
"loss": -0.15547017753124237,
"memory(GiB)": 175.0,
"reward": 0.3392857313156128,
"reward_std": 0.29123931378126144,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4628649652004242,
"step": 58,
"train_speed(iter/s)": 0.003119
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8037.5,
"completions/mean_length": 6358.4111328125,
"completions/min_length": 1612.5,
"epoch": 0.18789808917197454,
"grad_norm": 0.15718594193458557,
"kl": 0.004180908203125,
"learning_rate": 9.243221287473755e-07,
"loss": 0.03484642878174782,
"memory(GiB)": 175.0,
"reward": 0.3571428656578064,
"reward_std": 0.11266787722706795,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4744165539741516,
"step": 59,
"train_speed(iter/s)": 0.003118
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142858,
"completions/max_length": 8040.0,
"completions/mean_length": 6776.6787109375,
"completions/min_length": 1845.0,
"epoch": 0.1910828025477707,
"grad_norm": 0.36074620485305786,
"kl": 0.0035858154296875,
"learning_rate": 9.216200800114411e-07,
"loss": -0.19808490574359894,
"memory(GiB)": 175.0,
"reward": 0.2142857164144516,
"reward_std": 0.19514648616313934,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4154897928237915,
"step": 60,
"train_speed(iter/s)": 0.003113
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142857,
"completions/max_length": 8027.0,
"completions/mean_length": 6271.26806640625,
"completions/min_length": 1195.5,
"epoch": 0.1942675159235669,
"grad_norm": 0.39658188819885254,
"kl": 1.2364356999970023e-41,
"learning_rate": 9.188747306742189e-07,
"loss": -0.1974707692861557,
"memory(GiB)": 175.0,
"reward": 0.321428582072258,
"reward_std": 0.2967643216252327,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968627691269,
"step": 61,
"train_speed(iter/s)": 0.003113
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.42857142857142855,
"completions/max_length": 8047.5,
"completions/mean_length": 5597.178955078125,
"completions/min_length": 1193.0,
"epoch": 0.19745222929936307,
"grad_norm": 0.20000393688678741,
"kl": 0.0042724609375,
"learning_rate": 9.160863626845119e-07,
"loss": -0.044626351445913315,
"memory(GiB)": 175.0,
"reward": 0.3571428656578064,
"reward_std": 0.1539071872830391,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4635152518749237,
"step": 62,
"train_speed(iter/s)": 0.003113
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7142857142857142,
"completions/max_length": 8042.5,
"completions/mean_length": 6918.053955078125,
"completions/min_length": 1768.0,
"epoch": 0.20063694267515925,
"grad_norm": 0.6508347988128662,
"kl": 0.003936767578125,
"learning_rate": 9.132552624091619e-07,
"loss": -0.21150559186935425,
"memory(GiB)": 175.0,
"reward": 0.1607142984867096,
"reward_std": 0.1785714402794838,
"rewards/AnswerTagAccuracyORM/mean": 0.1607142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.3664129227399826,
"step": 63,
"train_speed(iter/s)": 0.003111
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6785714285714286,
"completions/max_length": 8040.5,
"completions/mean_length": 6990.643310546875,
"completions/min_length": 2698.5,
"epoch": 0.20382165605095542,
"grad_norm": 0.44794604182243347,
"kl": 0.00531005859375,
"learning_rate": 9.103817206036382e-07,
"loss": -0.43547752499580383,
"memory(GiB)": 175.0,
"reward": 0.4107143133878708,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.4839591383934021,
"step": 64,
"train_speed(iter/s)": 0.003107
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7678571428571428,
"completions/max_length": 8033.0,
"completions/mean_length": 6935.285888671875,
"completions/min_length": 1890.5,
"epoch": 0.2070063694267516,
"grad_norm": 1.0245670080184937,
"kl": 0.0034942626953125,
"learning_rate": 9.07466032382177e-07,
"loss": -0.16633543372154236,
"memory(GiB)": 175.0,
"reward": 0.321428582072258,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968627691269,
"step": 65,
"train_speed(iter/s)": 0.003103
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8060.5,
"completions/mean_length": 6056.26806640625,
"completions/min_length": 1797.0,
"epoch": 0.21019108280254778,
"grad_norm": 0.26687300205230713,
"kl": 0.005523681640625,
"learning_rate": 9.045084971874737e-07,
"loss": -0.17107920348644257,
"memory(GiB)": 175.0,
"reward": 0.464285746216774,
"reward_std": 0.26657507568597794,
"rewards/AnswerTagAccuracyORM/mean": 0.464285746216774,
"rewards/AnswerTagAccuracyORM/std": 0.4739968925714493,
"step": 66,
"train_speed(iter/s)": 0.003104
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8050.0,
"completions/mean_length": 7268.107421875,
"completions/min_length": 1853.5,
"epoch": 0.21337579617834396,
"grad_norm": 0.39480164647102356,
"kl": 0.00036018589162267745,
"learning_rate": 9.015094187599296e-07,
"loss": -0.2682667374610901,
"memory(GiB)": 175.0,
"reward": 0.2142857238650322,
"reward_std": 0.2253357544541359,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857238650322,
"rewards/AnswerTagAccuracyORM/std": 0.40819603204727173,
"step": 67,
"train_speed(iter/s)": 0.003105
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8069.5,
"completions/mean_length": 5839.25048828125,
"completions/min_length": 776.0,
"epoch": 0.21656050955414013,
"grad_norm": 0.37252077460289,
"kl": 0.0123291015625,
"learning_rate": 8.984691051064574e-07,
"loss": -0.14431779086589813,
"memory(GiB)": 175.0,
"reward": 0.4107142984867096,
"reward_std": 0.21981073170900345,
"rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.497912272810936,
"step": 68,
"train_speed(iter/s)": 0.003108
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8044.0,
"completions/mean_length": 5993.5361328125,
"completions/min_length": 1093.0,
"epoch": 0.2197452229299363,
"grad_norm": 0.3685239553451538,
"kl": 0.01165771484375,
"learning_rate": 8.953878684688492e-07,
"loss": -0.1928965002298355,
"memory(GiB)": 175.0,
"reward": 0.321428582072258,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968627691269,
"step": 69,
"train_speed(iter/s)": 0.003111
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8056.0,
"completions/mean_length": 6114.82177734375,
"completions/min_length": 1662.5,
"epoch": 0.2229299363057325,
"grad_norm": 0.5651078224182129,
"kl": 0.013641357421875,
"learning_rate": 8.922660252917087e-07,
"loss": -0.3238877058029175,
"memory(GiB)": 175.0,
"reward": 0.3571428805589676,
"reward_std": 0.33800363540649414,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48647116124629974,
"step": 70,
"train_speed(iter/s)": 0.003113
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8029.0,
"completions/mean_length": 6308.1787109375,
"completions/min_length": 1943.5,
"epoch": 0.22611464968152867,
"grad_norm": 0.41033416986465454,
"kl": 0.0162353515625,
"learning_rate": 8.891038961899519e-07,
"loss": -0.17454411089420319,
"memory(GiB)": 175.0,
"reward": 0.3392857387661934,
"reward_std": 0.1896214708685875,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857387661934,
"rewards/AnswerTagAccuracyORM/std": 0.43211139738559723,
"step": 71,
"train_speed(iter/s)": 0.003115
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8051.5,
"completions/mean_length": 5688.107177734375,
"completions/min_length": 1738.5,
"epoch": 0.22929936305732485,
"grad_norm": 0.4853333830833435,
"kl": 0.015869140625,
"learning_rate": 8.859018059158809e-07,
"loss": -0.2635350823402405,
"memory(GiB)": 175.0,
"reward": 0.392857164144516,
"reward_std": 0.3792429566383362,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.49173468351364136,
"step": 72,
"train_speed(iter/s)": 0.003117
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428571,
"completions/max_length": 8056.0,
"completions/mean_length": 6535.660888671875,
"completions/min_length": 2464.0,
"epoch": 0.23248407643312102,
"grad_norm": 0.18295517563819885,
"kl": 0.00787353515625,
"learning_rate": 8.826600833258307e-07,
"loss": -0.05387752130627632,
"memory(GiB)": 175.14,
"reward": 0.2857142984867096,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.460043728351593,
"step": 73,
"train_speed(iter/s)": 0.003115
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.44642857142857145,
"completions/max_length": 8065.5,
"completions/mean_length": 5659.46435546875,
"completions/min_length": 1074.5,
"epoch": 0.2356687898089172,
"grad_norm": 0.27235084772109985,
"kl": 0.01446533203125,
"learning_rate": 8.793790613463954e-07,
"loss": -0.07979360222816467,
"memory(GiB)": 175.14,
"reward": 0.375,
"reward_std": 0.23086076974868774,
"rewards/AnswerTagAccuracyORM/mean": 0.375,
"rewards/AnswerTagAccuracyORM/std": 0.4750668406486511,
"step": 74,
"train_speed(iter/s)": 0.003117
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6785714285714286,
"completions/max_length": 8040.5,
"completions/mean_length": 6586.518310546875,
"completions/min_length": 1586.5,
"epoch": 0.23885350318471338,
"grad_norm": 0.4448719322681427,
"kl": 1.2463148541704923e-41,
"learning_rate": 8.760590769402371e-07,
"loss": -0.14089421927928925,
"memory(GiB)": 175.14,
"reward": 0.3571428656578064,
"reward_std": 0.27762509882450104,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4635152518749237,
"step": 75,
"train_speed(iter/s)": 0.003119
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 7960.5,
"completions/mean_length": 5350.035888671875,
"completions/min_length": 1008.5,
"epoch": 0.24203821656050956,
"grad_norm": 0.3529123067855835,
"kl": 0.007568359375,
"learning_rate": 8.727004710714798e-07,
"loss": -0.06856054067611694,
"memory(GiB)": 175.14,
"reward": 0.4285714477300644,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.5026109665632248,
"step": 76,
"train_speed(iter/s)": 0.00312
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3928571428571429,
"completions/max_length": 8051.5,
"completions/mean_length": 5923.339599609375,
"completions/min_length": 1552.0,
"epoch": 0.24522292993630573,
"grad_norm": 0.4466400742530823,
"kl": 0.01898193359375,
"learning_rate": 8.693035886706907e-07,
"loss": -0.28264355659484863,
"memory(GiB)": 175.14,
"reward": 0.4821428954601288,
"reward_std": 0.29123931378126144,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428954601288,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 77,
"train_speed(iter/s)": 0.003121
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8047.5,
"completions/mean_length": 6365.14306640625,
"completions/min_length": 1254.0,
"epoch": 0.2484076433121019,
"grad_norm": 0.42635998129844666,
"kl": 0.0216064453125,
"learning_rate": 8.658687785994578e-07,
"loss": -0.2270558625459671,
"memory(GiB)": 175.23,
"reward": 0.2678571492433548,
"reward_std": 0.1896214708685875,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050114393234253,
"step": 78,
"train_speed(iter/s)": 0.003119
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428571,
"completions/max_length": 8055.5,
"completions/mean_length": 6524.4111328125,
"completions/min_length": 2012.5,
"epoch": 0.2515923566878981,
"grad_norm": 0.2484586536884308,
"kl": 0.00823974609375,
"learning_rate": 8.623963936145599e-07,
"loss": -0.10795820504426956,
"memory(GiB)": 175.23,
"reward": 0.267857164144516,
"reward_std": 0.07695359364151955,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.44672515988349915,
"step": 79,
"train_speed(iter/s)": 0.00312
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8033.0,
"completions/mean_length": 5579.375244140625,
"completions/min_length": 1700.5,
"epoch": 0.25477707006369427,
"grad_norm": 0.4129941463470459,
"kl": 0.02008056640625,
"learning_rate": 8.588867903317394e-07,
"loss": -0.17563505470752716,
"memory(GiB)": 175.23,
"reward": 0.4821428656578064,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.5085247755050659,
"step": 80,
"train_speed(iter/s)": 0.003123
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8030.5,
"completions/mean_length": 5784.410888671875,
"completions/min_length": 1787.0,
"epoch": 0.25796178343949044,
"grad_norm": 0.44289883971214294,
"kl": 0.0218505859375,
"learning_rate": 8.553403291890767e-07,
"loss": -0.2814217805862427,
"memory(GiB)": 175.23,
"reward": 0.4821428805589676,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.5006500482559204,
"step": 81,
"train_speed(iter/s)": 0.003126
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.39285714285714285,
"completions/max_length": 8028.5,
"completions/mean_length": 5141.1787109375,
"completions/min_length": 1243.5,
"epoch": 0.2611464968152866,
"grad_norm": 0.556716799736023,
"kl": 0.0203857421875,
"learning_rate": 8.51757374409974e-07,
"loss": -0.24188844859600067,
"memory(GiB)": 175.23,
"reward": 0.4285714626312256,
"reward_std": 0.2967643439769745,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.49173468351364136,
"step": 82,
"train_speed(iter/s)": 0.003129
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8033.5,
"completions/mean_length": 5490.625244140625,
"completions/min_length": 1102.5,
"epoch": 0.2643312101910828,
"grad_norm": 0.43611571192741394,
"kl": 0.01397705078125,
"learning_rate": 8.481382939657488e-07,
"loss": -0.06833790987730026,
"memory(GiB)": 175.23,
"reward": 0.517857164144516,
"reward_std": 0.21981073170900345,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.5006500482559204,
"step": 83,
"train_speed(iter/s)": 0.003131
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8042.5,
"completions/mean_length": 6488.946533203125,
"completions/min_length": 2348.5,
"epoch": 0.267515923566879,
"grad_norm": 0.5264121890068054,
"kl": 0.0264892578125,
"learning_rate": 8.444834595378433e-07,
"loss": -0.28356218338012695,
"memory(GiB)": 175.23,
"reward": 0.3035714477300644,
"reward_std": 0.30228933691978455,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.46781930327415466,
"step": 84,
"train_speed(iter/s)": 0.003133
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8023.5,
"completions/mean_length": 5915.893310546875,
"completions/min_length": 1153.0,
"epoch": 0.27070063694267515,
"grad_norm": 0.852773129940033,
"kl": 0.01702880859375,
"learning_rate": 8.407932464796521e-07,
"loss": -0.2204403430223465,
"memory(GiB)": 175.23,
"reward": 0.3392857313156128,
"reward_std": 0.29123931378126144,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.47245559096336365,
"step": 85,
"train_speed(iter/s)": 0.003133
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.35714285714285715,
"completions/max_length": 8021.5,
"completions/mean_length": 5667.4111328125,
"completions/min_length": 1922.0,
"epoch": 0.27388535031847133,
"grad_norm": 0.41208142042160034,
"kl": 0.02435302734375,
"learning_rate": 8.370680337779736e-07,
"loss": -0.09956123679876328,
"memory(GiB)": 175.23,
"reward": 0.4464285969734192,
"reward_std": 0.29123932123184204,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.5059135854244232,
"step": 86,
"train_speed(iter/s)": 0.003136
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142857,
"completions/max_length": 8047.0,
"completions/mean_length": 6915.0361328125,
"completions/min_length": 2560.5,
"epoch": 0.2770700636942675,
"grad_norm": 0.5504844188690186,
"kl": 0.021484375,
"learning_rate": 8.333082040140882e-07,
"loss": -0.33471736311912537,
"memory(GiB)": 175.23,
"reward": 0.3750000149011612,
"reward_std": 0.2610500305891037,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.49264875054359436,
"step": 87,
"train_speed(iter/s)": 0.003136
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8103.0,
"completions/mean_length": 5899.035888671875,
"completions/min_length": 1566.0,
"epoch": 0.2802547770700637,
"grad_norm": 0.6481395959854126,
"kl": 0.023193359375,
"learning_rate": 8.295141433244659e-07,
"loss": -0.3118484616279602,
"memory(GiB)": 175.23,
"reward": 0.4107143133878708,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.4839591085910797,
"step": 88,
"train_speed(iter/s)": 0.003138
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8064.5,
"completions/mean_length": 5719.839599609375,
"completions/min_length": 1776.0,
"epoch": 0.28343949044585987,
"grad_norm": 0.3430127203464508,
"kl": 0.01434326171875,
"learning_rate": 8.256862413611112e-07,
"loss": 0.07326733320951462,
"memory(GiB)": 175.55,
"reward": 0.4285714328289032,
"reward_std": 0.40943220257759094,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4985625892877579,
"step": 89,
"train_speed(iter/s)": 0.003141
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8071.0,
"completions/mean_length": 6036.107421875,
"completions/min_length": 1410.5,
"epoch": 0.28662420382165604,
"grad_norm": 0.9747743010520935,
"kl": 0.0157470703125,
"learning_rate": 8.218248912515442e-07,
"loss": -0.32189831137657166,
"memory(GiB)": 175.55,
"reward": 0.4285714477300644,
"reward_std": 0.379242941737175,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.48199817538261414,
"step": 90,
"train_speed(iter/s)": 0.003143
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8030.0,
"completions/mean_length": 5853.910888671875,
"completions/min_length": 1923.5,
"epoch": 0.2898089171974522,
"grad_norm": 0.6417209506034851,
"kl": 0.0321044921875,
"learning_rate": 8.179304895584281e-07,
"loss": -0.20055779814720154,
"memory(GiB)": 175.55,
"reward": 0.4285714477300644,
"reward_std": 0.40943220257759094,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.502610981464386,
"step": 91,
"train_speed(iter/s)": 0.003145
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8069.0,
"completions/mean_length": 6345.964599609375,
"completions/min_length": 1514.0,
"epoch": 0.2929936305732484,
"grad_norm": 0.157789409160614,
"kl": 0.01397705078125,
"learning_rate": 8.140034362388397e-07,
"loss": -0.0897713378071785,
"memory(GiB)": 175.55,
"reward": 0.4285714328289032,
"reward_std": 0.2142857238650322,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4985625743865967,
"step": 92,
"train_speed(iter/s)": 0.003146
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8048.0,
"completions/mean_length": 6575.268310546875,
"completions/min_length": 1400.5,
"epoch": 0.2961783439490446,
"grad_norm": 0.19858166575431824,
"kl": 0.02850341796875,
"learning_rate": 8.100441346031958e-07,
"loss": -0.004586466588079929,
"memory(GiB)": 175.55,
"reward": 0.3392857313156128,
"reward_std": 0.1785714365541935,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4628649652004242,
"step": 93,
"train_speed(iter/s)": 0.003146
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.48214285714285715,
"completions/max_length": 8063.5,
"completions/mean_length": 5983.518310546875,
"completions/min_length": 1320.5,
"epoch": 0.29936305732484075,
"grad_norm": 0.5492147207260132,
"kl": 796.5401611328125,
"learning_rate": 8.060529912738314e-07,
"loss": -0.20520265400409698,
"memory(GiB)": 175.55,
"reward": 0.4464285969734192,
"reward_std": 0.31333939731121063,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4979122579097748,
"step": 94,
"train_speed(iter/s)": 0.003147
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7678571428571428,
"completions/max_length": 8047.5,
"completions/mean_length": 6954.500244140625,
"completions/min_length": 2610.0,
"epoch": 0.30254777070063693,
"grad_norm": 0.38487347960472107,
"kl": 1.0918917634018975e-41,
"learning_rate": 8.020304161432403e-07,
"loss": -0.061524372547864914,
"memory(GiB)": 175.55,
"reward": 0.1785714402794838,
"reward_std": 0.1539071798324585,
"rewards/AnswerTagAccuracyORM/mean": 0.1785714402794838,
"rewards/AnswerTagAccuracyORM/std": 0.3871018886566162,
"step": 95,
"train_speed(iter/s)": 0.003145
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8047.5,
"completions/mean_length": 5583.64306640625,
"completions/min_length": 1472.5,
"epoch": 0.3057324840764331,
"grad_norm": 0.3516985774040222,
"kl": 0.018798828125,
"learning_rate": 7.979768223319785e-07,
"loss": -0.08431195467710495,
"memory(GiB)": 175.55,
"reward": 0.392857164144516,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4973474591970444,
"step": 96,
"train_speed(iter/s)": 0.003147
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8028.5,
"completions/mean_length": 5748.21435546875,
"completions/min_length": 1341.0,
"epoch": 0.3089171974522293,
"grad_norm": 0.4274459779262543,
"kl": 0.0357666015625,
"learning_rate": 7.938926261462365e-07,
"loss": -0.17042537033557892,
"memory(GiB)": 175.55,
"reward": 0.321428582072258,
"reward_std": 0.2253357619047165,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4691530019044876,
"step": 97,
"train_speed(iter/s)": 0.003149
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8063.5,
"completions/mean_length": 6201.875244140625,
"completions/min_length": 1993.5,
"epoch": 0.31210191082802546,
"grad_norm": 0.32356998324394226,
"kl": 1.5625879175686035e-41,
"learning_rate": 7.897782470350849e-07,
"loss": -0.13458389043807983,
"memory(GiB)": 175.55,
"reward": 0.392857164144516,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4959513247013092,
"step": 98,
"train_speed(iter/s)": 0.003151
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8029.5,
"completions/mean_length": 6730.339599609375,
"completions/min_length": 2250.0,
"epoch": 0.31528662420382164,
"grad_norm": 0.7448033690452576,
"kl": 0.00970458984375,
"learning_rate": 7.856341075473961e-07,
"loss": -0.19996413588523865,
"memory(GiB)": 175.55,
"reward": 0.3750000149011612,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4839591085910797,
"step": 99,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3214285714285714,
"completions/max_length": 8037.5,
"completions/mean_length": 5035.035888671875,
"completions/min_length": 932.0,
"epoch": 0.3184713375796178,
"grad_norm": 0.5652780532836914,
"kl": 0.04150390625,
"learning_rate": 7.814606332884488e-07,
"loss": -0.10035921633243561,
"memory(GiB)": 175.55,
"reward": 0.5714285969734192,
"reward_std": 0.32695361226797104,
"rewards/AnswerTagAccuracyORM/mean": 0.5714285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4985625892877579,
"step": 100,
"train_speed(iter/s)": 0.003155
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8061.0,
"completions/mean_length": 6390.910888671875,
"completions/min_length": 1904.0,
"epoch": 0.321656050955414,
"grad_norm": 0.4127773344516754,
"kl": 0.038330078125,
"learning_rate": 7.772582528762178e-07,
"loss": -0.16605715453624725,
"memory(GiB)": 175.55,
"reward": 0.4464285969734192,
"reward_std": 0.23086078464984894,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4979122579097748,
"step": 101,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8040.0,
"completions/mean_length": 6349.01806640625,
"completions/min_length": 1580.0,
"epoch": 0.3248407643312102,
"grad_norm": 0.4982652962207794,
"kl": 22.047773361206055,
"learning_rate": 7.730273978973552e-07,
"loss": -0.2726196050643921,
"memory(GiB)": 175.55,
"reward": 0.4464285969734192,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.49791230261325836,
"step": 102,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8051.0,
"completions/mean_length": 5684.39306640625,
"completions/min_length": 1680.0,
"epoch": 0.32802547770700635,
"grad_norm": 0.6498633027076721,
"kl": 0.082763671875,
"learning_rate": 7.687685028628652e-07,
"loss": -0.21366629004478455,
"memory(GiB)": 175.55,
"reward": 0.3750000149011612,
"reward_std": 0.1896214783191681,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4839591085910797,
"step": 103,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8018.5,
"completions/mean_length": 6570.8037109375,
"completions/min_length": 2079.0,
"epoch": 0.33121019108280253,
"grad_norm": 0.7568531632423401,
"kl": 0.0177001953125,
"learning_rate": 7.644820051634812e-07,
"loss": -0.23152737319469452,
"memory(GiB)": 175.55,
"reward": 0.196428582072258,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.196428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4039382338523865,
"step": 104,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8056.0,
"completions/mean_length": 5679.3037109375,
"completions/min_length": 1807.0,
"epoch": 0.3343949044585987,
"grad_norm": 0.48169225454330444,
"kl": 0.0501708984375,
"learning_rate": 7.60168345024744e-07,
"loss": -0.10911425948143005,
"memory(GiB)": 175.55,
"reward": 0.4107143133878708,
"reward_std": 0.4149572253227234,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500482559204,
"step": 105,
"train_speed(iter/s)": 0.003153
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8050.5,
"completions/mean_length": 6120.9111328125,
"completions/min_length": 1163.5,
"epoch": 0.3375796178343949,
"grad_norm": 0.41283684968948364,
"kl": 0.013671875,
"learning_rate": 7.558279654617912e-07,
"loss": -0.15918892621994019,
"memory(GiB)": 175.55,
"reward": 0.3571428805589676,
"reward_std": 0.307814359664917,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48199817538261414,
"step": 106,
"train_speed(iter/s)": 0.003153
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8047.5,
"completions/mean_length": 6480.857421875,
"completions/min_length": 1826.5,
"epoch": 0.34076433121019106,
"grad_norm": 0.5248873829841614,
"kl": 0.03424072265625,
"learning_rate": 7.514613122338589e-07,
"loss": -0.17167718708515167,
"memory(GiB)": 175.55,
"reward": 0.267857164144516,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.44672515988349915,
"step": 107,
"train_speed(iter/s)": 0.003154
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8055.0,
"completions/mean_length": 6560.357421875,
"completions/min_length": 1519.0,
"epoch": 0.34394904458598724,
"grad_norm": 0.7283872365951538,
"kl": 0.01104736328125,
"learning_rate": 7.470688337985029e-07,
"loss": -0.3855530917644501,
"memory(GiB)": 175.55,
"reward": 0.3035714328289032,
"reward_std": 0.21981074661016464,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4644543081521988,
"step": 108,
"train_speed(iter/s)": 0.003153
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8042.5,
"completions/mean_length": 6183.893310546875,
"completions/min_length": 1794.5,
"epoch": 0.3471337579617834,
"grad_norm": 0.6313491463661194,
"kl": 0.0653076171875,
"learning_rate": 7.426509812655405e-07,
"loss": -0.2580084502696991,
"memory(GiB)": 175.55,
"reward": 0.2857142984867096,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4436842352151871,
"step": 109,
"train_speed(iter/s)": 0.003153
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6785714285714286,
"completions/max_length": 8056.5,
"completions/mean_length": 6767.071533203125,
"completions/min_length": 1435.5,
"epoch": 0.3503184713375796,
"grad_norm": 0.45339030027389526,
"kl": 0.030517578125,
"learning_rate": 7.382082083507225e-07,
"loss": -0.16426539421081543,
"memory(GiB)": 175.55,
"reward": 0.3392857313156128,
"reward_std": 0.2721000909805298,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4786956012248993,
"step": 110,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8042.0,
"completions/mean_length": 6121.696533203125,
"completions/min_length": 1248.0,
"epoch": 0.3535031847133758,
"grad_norm": 0.44927987456321716,
"kl": 1.280361533164978,
"learning_rate": 7.337409713291355e-07,
"loss": -0.032493047416210175,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4817724674940109,
"step": 111,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8030.5,
"completions/mean_length": 6174.0361328125,
"completions/min_length": 1399.0,
"epoch": 0.35668789808917195,
"grad_norm": 0.451442688703537,
"kl": 0.1644287109375,
"learning_rate": 7.292497289883432e-07,
"loss": 0.022751769050955772,
"memory(GiB)": 175.77,
"reward": 0.321428582072258,
"reward_std": 0.2142857201397419,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4691530168056488,
"step": 112,
"train_speed(iter/s)": 0.003152
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8033.5,
"completions/mean_length": 6115.357421875,
"completions/min_length": 1842.5,
"epoch": 0.35987261146496813,
"grad_norm": 0.5630915760993958,
"kl": 0.02783203125,
"learning_rate": 7.24734942581267e-07,
"loss": -0.30773723125457764,
"memory(GiB)": 175.77,
"reward": 0.3035714402794838,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714402794838,
"rewards/AnswerTagAccuracyORM/std": 0.4321114122867584,
"step": 113,
"train_speed(iter/s)": 0.003153
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8043.0,
"completions/mean_length": 6186.803955078125,
"completions/min_length": 1296.5,
"epoch": 0.3630573248407643,
"grad_norm": 0.49433982372283936,
"kl": 0.0421142578125,
"learning_rate": 7.201970757788171e-07,
"loss": -0.055917054414749146,
"memory(GiB)": 175.77,
"reward": 0.3571428656578064,
"reward_std": 0.3792429566383362,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4744165241718292,
"step": 114,
"train_speed(iter/s)": 0.003153
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8041.0,
"completions/mean_length": 6332.6611328125,
"completions/min_length": 1067.0,
"epoch": 0.3662420382165605,
"grad_norm": 0.2365662157535553,
"kl": 0.02978515625,
"learning_rate": 7.15636594622272e-07,
"loss": 0.03591850399971008,
"memory(GiB)": 175.77,
"reward": 0.1964285857975483,
"reward_std": 0.07695359364151955,
"rewards/AnswerTagAccuracyORM/mean": 0.1964285857975483,
"rewards/AnswerTagAccuracyORM/std": 0.36893007159233093,
"step": 115,
"train_speed(iter/s)": 0.003154
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428572,
"completions/max_length": 8068.0,
"completions/mean_length": 5660.196533203125,
"completions/min_length": 1355.0,
"epoch": 0.36942675159235666,
"grad_norm": 0.8137925863265991,
"kl": 0.21121002733707428,
"learning_rate": 7.110539674754159e-07,
"loss": -0.3542415499687195,
"memory(GiB)": 175.77,
"reward": 0.375,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.375,
"rewards/AnswerTagAccuracyORM/std": 0.47506681084632874,
"step": 116,
"train_speed(iter/s)": 0.003154
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8045.0,
"completions/mean_length": 5915.482421875,
"completions/min_length": 1733.0,
"epoch": 0.37261146496815284,
"grad_norm": 71.4358139038086,
"kl": 18.5830078125,
"learning_rate": 7.06449664976438e-07,
"loss": -0.01425784919410944,
"memory(GiB)": 175.77,
"reward": 0.2678571492433548,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050112903118134,
"step": 117,
"train_speed(iter/s)": 0.003154
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3214285714285714,
"completions/max_length": 8045.5,
"completions/mean_length": 5088.660888671875,
"completions/min_length": 908.5,
"epoch": 0.37579617834394907,
"grad_norm": 0.42832183837890625,
"kl": 0.0203857421875,
"learning_rate": 7.018241599895973e-07,
"loss": -0.0722283348441124,
"memory(GiB)": 175.77,
"reward": 0.4464285969734192,
"reward_std": 0.3324786126613617,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 118,
"train_speed(iter/s)": 0.003156
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8042.0,
"completions/mean_length": 6498.76806640625,
"completions/min_length": 2193.0,
"epoch": 0.37898089171974525,
"grad_norm": 0.5034194588661194,
"kl": 0.0457763671875,
"learning_rate": 6.971779275566593e-07,
"loss": -0.14812861382961273,
"memory(GiB)": 175.77,
"reward": 0.1785714328289032,
"reward_std": 0.2253357470035553,
"rewards/AnswerTagAccuracyORM/mean": 0.1785714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.37796448171138763,
"step": 119,
"train_speed(iter/s)": 0.003156
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.35714285714285715,
"completions/max_length": 8051.0,
"completions/mean_length": 4917.14306640625,
"completions/min_length": 1183.5,
"epoch": 0.3821656050955414,
"grad_norm": 0.7444789409637451,
"kl": 0.0513916015625,
"learning_rate": 6.925114448481088e-07,
"loss": -0.26919350028038025,
"memory(GiB)": 175.77,
"reward": 0.642857164144516,
"reward_std": 0.3681929111480713,
"rewards/AnswerTagAccuracyORM/mean": 0.642857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.48199817538261414,
"step": 120,
"train_speed(iter/s)": 0.003157
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8067.5,
"completions/mean_length": 6616.857421875,
"completions/min_length": 1642.0,
"epoch": 0.3853503184713376,
"grad_norm": 0.4549780786037445,
"kl": 0.0621337890625,
"learning_rate": 6.87825191114145e-07,
"loss": -0.012696207500994205,
"memory(GiB)": 175.77,
"reward": 0.2857142984867096,
"reward_std": 0.27762509882450104,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.45290274918079376,
"step": 121,
"train_speed(iter/s)": 0.003157
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8049.5,
"completions/mean_length": 6262.821533203125,
"completions/min_length": 1352.0,
"epoch": 0.3885350318471338,
"grad_norm": 0.6874606013298035,
"kl": 0.025634765625,
"learning_rate": 6.831196476354614e-07,
"loss": -0.19306860864162445,
"memory(GiB)": 175.77,
"reward": 0.4107143133878708,
"reward_std": 0.37371791899204254,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 122,
"train_speed(iter/s)": 0.003157
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 8052.5,
"completions/mean_length": 5354.339599609375,
"completions/min_length": 1056.5,
"epoch": 0.39171974522292996,
"grad_norm": 0.5245152711868286,
"kl": 0.041748046875,
"learning_rate": 6.783952976738178e-07,
"loss": -0.24547605216503143,
"memory(GiB)": 175.77,
"reward": 0.5,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.5,
"rewards/AnswerTagAccuracyORM/std": 0.5091750919818878,
"step": 123,
"train_speed(iter/s)": 0.003158
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8028.5,
"completions/mean_length": 4598.0179443359375,
"completions/min_length": 803.0,
"epoch": 0.39490445859872614,
"grad_norm": 0.31059756875038147,
"kl": 0.03350830078125,
"learning_rate": 6.7365262642241e-07,
"loss": -0.010546373203396797,
"memory(GiB)": 175.77,
"reward": 0.5357142984867096,
"reward_std": 0.2142857238650322,
"rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.5065638422966003,
"step": 124,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8051.0,
"completions/mean_length": 6164.375244140625,
"completions/min_length": 1348.5,
"epoch": 0.3980891719745223,
"grad_norm": 0.36853721737861633,
"kl": 0.0347900390625,
"learning_rate": 6.688921209560403e-07,
"loss": -0.15686392784118652,
"memory(GiB)": 175.77,
"reward": 0.3392857164144516,
"reward_std": 0.2610500305891037,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4495980441570282,
"step": 125,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8056.5,
"completions/mean_length": 6088.44677734375,
"completions/min_length": 1482.5,
"epoch": 0.4012738853503185,
"grad_norm": 0.4430883228778839,
"kl": 0.0606689453125,
"learning_rate": 6.641142701810931e-07,
"loss": -0.24629399180412292,
"memory(GiB)": 175.77,
"reward": 0.3928571492433548,
"reward_std": 0.20619653165340424,
"rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.4846093952655792,
"step": 126,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8057.5,
"completions/mean_length": 6104.5361328125,
"completions/min_length": 2068.5,
"epoch": 0.40445859872611467,
"grad_norm": 0.6949173808097839,
"kl": 0.106201171875,
"learning_rate": 6.593195647853258e-07,
"loss": -0.3462884724140167,
"memory(GiB)": 175.77,
"reward": 0.3035714477300644,
"reward_std": 0.4039071798324585,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4576014429330826,
"step": 127,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8028.5,
"completions/mean_length": 5471.26806640625,
"completions/min_length": 798.5,
"epoch": 0.40764331210191085,
"grad_norm": 0.4856749176979065,
"kl": 0.0289306640625,
"learning_rate": 6.545084971874736e-07,
"loss": -0.14708048105239868,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.1896214708685875,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.47245559096336365,
"step": 128,
"train_speed(iter/s)": 0.003161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8055.0,
"completions/mean_length": 5762.50048828125,
"completions/min_length": 1270.0,
"epoch": 0.410828025477707,
"grad_norm": 0.5902029275894165,
"kl": 0.03662109375,
"learning_rate": 6.496815614866791e-07,
"loss": -0.08738569170236588,
"memory(GiB)": 175.77,
"reward": 0.4821428805589676,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 129,
"train_speed(iter/s)": 0.003162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8043.5,
"completions/mean_length": 6093.4287109375,
"completions/min_length": 1134.0,
"epoch": 0.4140127388535032,
"grad_norm": 0.3767712712287903,
"kl": 0.006173963658511639,
"learning_rate": 6.448392534117461e-07,
"loss": -0.17554393410682678,
"memory(GiB)": 175.77,
"reward": 0.4464285969734192,
"reward_std": 0.23086077719926834,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 130,
"train_speed(iter/s)": 0.003162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8051.0,
"completions/mean_length": 6678.94677734375,
"completions/min_length": 2177.0,
"epoch": 0.4171974522292994,
"grad_norm": 0.7001217603683472,
"kl": 0.0213623046875,
"learning_rate": 6.399820702702304e-07,
"loss": -0.2599309980869293,
"memory(GiB)": 175.77,
"reward": 0.3750000149011612,
"reward_std": 0.37371791899204254,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.49264875054359436,
"step": 131,
"train_speed(iter/s)": 0.003162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142857,
"completions/max_length": 8036.5,
"completions/mean_length": 6684.57177734375,
"completions/min_length": 1943.5,
"epoch": 0.42038216560509556,
"grad_norm": 0.5899467468261719,
"kl": 0.05859375,
"learning_rate": 6.351105108973644e-07,
"loss": -0.17598707973957062,
"memory(GiB)": 175.77,
"reward": 0.267857164144516,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.44672515988349915,
"step": 132,
"train_speed(iter/s)": 0.003162
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8050.0,
"completions/mean_length": 6337.08935546875,
"completions/min_length": 1953.0,
"epoch": 0.42356687898089174,
"grad_norm": 0.4514944553375244,
"kl": 0.0384521484375,
"learning_rate": 6.302250756048267e-07,
"loss": -0.23234152793884277,
"memory(GiB)": 175.77,
"reward": 0.3571428656578064,
"reward_std": 0.25552502274513245,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4744165241718292,
"step": 133,
"train_speed(iter/s)": 0.003161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8029.0,
"completions/mean_length": 5813.14306640625,
"completions/min_length": 780.0,
"epoch": 0.4267515923566879,
"grad_norm": 0.5305230617523193,
"kl": 0.084716796875,
"learning_rate": 6.253262661293602e-07,
"loss": -0.16682913899421692,
"memory(GiB)": 175.77,
"reward": 0.4285714626312256,
"reward_std": 0.2363857924938202,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.49173471331596375,
"step": 134,
"train_speed(iter/s)": 0.003161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142858,
"completions/max_length": 8067.0,
"completions/mean_length": 6596.125244140625,
"completions/min_length": 1434.0,
"epoch": 0.4299363057324841,
"grad_norm": 6475.49609375,
"kl": 2048.0140380859375,
"learning_rate": 6.204145855812438e-07,
"loss": 3.8663876056671143,
"memory(GiB)": 175.77,
"reward": 0.3571428805589676,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48647116124629974,
"step": 135,
"train_speed(iter/s)": 0.003161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 8056.0,
"completions/mean_length": 4872.14306640625,
"completions/min_length": 1017.0,
"epoch": 0.43312101910828027,
"grad_norm": 0.26369690895080566,
"kl": 0.047607421875,
"learning_rate": 6.154905383926216e-07,
"loss": -0.016700170934200287,
"memory(GiB)": 175.77,
"reward": 0.517857164144516,
"reward_std": 0.21981074661016464,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.5085247755050659,
"step": 136,
"train_speed(iter/s)": 0.003161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571429,
"completions/max_length": 8029.5,
"completions/mean_length": 6843.303955078125,
"completions/min_length": 2125.5,
"epoch": 0.43630573248407645,
"grad_norm": 0.48116692900657654,
"kl": 0.04315185546875,
"learning_rate": 6.105546302656986e-07,
"loss": -0.08012282848358154,
"memory(GiB)": 175.77,
"reward": 0.2142857201397419,
"reward_std": 0.2967643365263939,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857201397419,
"rewards/AnswerTagAccuracyORM/std": 0.37510766088962555,
"step": 137,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6785714285714286,
"completions/max_length": 8045.5,
"completions/mean_length": 6495.821533203125,
"completions/min_length": 1893.5,
"epoch": 0.4394904458598726,
"grad_norm": 0.39181163907051086,
"kl": 289.435302734375,
"learning_rate": 6.056073681208037e-07,
"loss": -0.048386890441179276,
"memory(GiB)": 175.77,
"reward": 0.3035714477300644,
"reward_std": 0.2721000984311104,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4576014429330826,
"step": 138,
"train_speed(iter/s)": 0.00316
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8016.0,
"completions/mean_length": 5637.39306640625,
"completions/min_length": 1424.5,
"epoch": 0.4426751592356688,
"grad_norm": 0.2754450738430023,
"kl": 0.046875,
"learning_rate": 6.0064926004433e-07,
"loss": -0.08374255150556564,
"memory(GiB)": 175.77,
"reward": 0.2678571492433548,
"reward_std": 0.1785714365541935,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050114393234253,
"step": 139,
"train_speed(iter/s)": 0.003161
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.48214285714285715,
"completions/max_length": 8044.0,
"completions/mean_length": 5262.000244140625,
"completions/min_length": 1212.5,
"epoch": 0.445859872611465,
"grad_norm": 0.641979455947876,
"kl": 0.07440185546875,
"learning_rate": 5.956808152365532e-07,
"loss": -0.11813775449991226,
"memory(GiB)": 175.77,
"reward": 0.3214285746216774,
"reward_std": 0.25552503019571304,
"rewards/AnswerTagAccuracyORM/mean": 0.3214285746216774,
"rewards/AnswerTagAccuracyORM/std": 0.4327617287635803,
"step": 140,
"train_speed(iter/s)": 0.003163
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8020.0,
"completions/mean_length": 5747.625244140625,
"completions/min_length": 1448.5,
"epoch": 0.44904458598726116,
"grad_norm": 0.30898842215538025,
"kl": 0.0184326171875,
"learning_rate": 5.907025439593365e-07,
"loss": -0.03316553309559822,
"memory(GiB)": 175.77,
"reward": 0.4464285969734192,
"reward_std": 0.2610500305891037,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 141,
"train_speed(iter/s)": 0.003164
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8040.5,
"completions/mean_length": 5516.482177734375,
"completions/min_length": 1085.5,
"epoch": 0.45222929936305734,
"grad_norm": 0.5380728244781494,
"kl": 0.0506591796875,
"learning_rate": 5.857149574837268e-07,
"loss": -0.25722232460975647,
"memory(GiB)": 175.77,
"reward": 0.4464285969734192,
"reward_std": 0.30228933691978455,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.497912272810936,
"step": 142,
"train_speed(iter/s)": 0.003165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8030.0,
"completions/mean_length": 5218.1611328125,
"completions/min_length": 1178.5,
"epoch": 0.4554140127388535,
"grad_norm": 0.4074670672416687,
"kl": 0.0584716796875,
"learning_rate": 5.807185680374467e-07,
"loss": -0.0464337095618248,
"memory(GiB)": 175.77,
"reward": 0.3035714328289032,
"reward_std": 0.14838216453790665,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4644543081521988,
"step": 143,
"train_speed(iter/s)": 0.003166
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8053.5,
"completions/mean_length": 6286.125244140625,
"completions/min_length": 1153.5,
"epoch": 0.4585987261146497,
"grad_norm": 0.4895031154155731,
"kl": 0.0423583984375,
"learning_rate": 5.757138887522883e-07,
"loss": -0.06021181866526604,
"memory(GiB)": 175.77,
"reward": 0.2500000149011612,
"reward_std": 0.26657506823539734,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4389495849609375,
"step": 144,
"train_speed(iter/s)": 0.003165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8067.0,
"completions/mean_length": 6192.5537109375,
"completions/min_length": 1276.5,
"epoch": 0.46178343949044587,
"grad_norm": 0.40592557191848755,
"kl": 0.0712890625,
"learning_rate": 5.707014336114146e-07,
"loss": -0.048989661037921906,
"memory(GiB)": 175.77,
"reward": 0.4285714477300644,
"reward_std": 0.27762511372566223,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.5026109665632248,
"step": 145,
"train_speed(iter/s)": 0.003165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8031.0,
"completions/mean_length": 6168.714599609375,
"completions/min_length": 1853.0,
"epoch": 0.46496815286624205,
"grad_norm": 0.4599376618862152,
"kl": 0.051025390625,
"learning_rate": 5.656817173965732e-07,
"loss": -0.06779219955205917,
"memory(GiB)": 175.77,
"reward": 0.3571428805589676,
"reward_std": 0.33800363540649414,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.4489477574825287,
"step": 146,
"train_speed(iter/s)": 0.003165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8048.0,
"completions/mean_length": 6351.232421875,
"completions/min_length": 2068.5,
"epoch": 0.4681528662420382,
"grad_norm": 0.3567911684513092,
"kl": 0.0421142578125,
"learning_rate": 5.606552556352274e-07,
"loss": -0.09004680067300797,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.47245559096336365,
"step": 147,
"train_speed(iter/s)": 0.003165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8061.5,
"completions/mean_length": 6130.57177734375,
"completions/min_length": 1451.0,
"epoch": 0.4713375796178344,
"grad_norm": 0.5385421514511108,
"kl": 0.0621337890625,
"learning_rate": 5.556225645476118e-07,
"loss": -0.12423180043697357,
"memory(GiB)": 175.77,
"reward": 0.392857164144516,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4959513247013092,
"step": 148,
"train_speed(iter/s)": 0.003165
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3035714285714286,
"completions/max_length": 8047.5,
"completions/mean_length": 4891.875244140625,
"completions/min_length": 1196.5,
"epoch": 0.4745222929936306,
"grad_norm": 0.39023974537849426,
"kl": 0.0255126953125,
"learning_rate": 5.505841609937161e-07,
"loss": -0.18357349932193756,
"memory(GiB)": 175.77,
"reward": 0.517857164144516,
"reward_std": 0.29123931378126144,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 149,
"train_speed(iter/s)": 0.003166
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8016.0,
"completions/mean_length": 4689.14306640625,
"completions/min_length": 833.5,
"epoch": 0.47770700636942676,
"grad_norm": 0.5281595587730408,
"kl": 0.052490234375,
"learning_rate": 5.455405624202032e-07,
"loss": -0.15326838195323944,
"memory(GiB)": 175.77,
"reward": 0.3928571492433548,
"reward_std": 0.1539071872830391,
"rewards/AnswerTagAccuracyORM/mean": 0.3928571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.4846093952655792,
"step": 150,
"train_speed(iter/s)": 0.003167
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8023.0,
"completions/mean_length": 5550.000244140625,
"completions/min_length": 1138.5,
"epoch": 0.48089171974522293,
"grad_norm": 0.2876298427581787,
"kl": 0.03759765625,
"learning_rate": 5.404922868072672e-07,
"loss": 0.08055908977985382,
"memory(GiB)": 175.77,
"reward": 0.4107142984867096,
"reward_std": 0.14838216453790665,
"rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4576014578342438,
"step": 151,
"train_speed(iter/s)": 0.003168
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.26785714285714285,
"completions/max_length": 8016.0,
"completions/mean_length": 3929.5538330078125,
"completions/min_length": 986.0,
"epoch": 0.4840764331210191,
"grad_norm": 1.2029175758361816,
"kl": 0.45068359375,
"learning_rate": 5.354398526154365e-07,
"loss": -0.09824319183826447,
"memory(GiB)": 175.77,
"reward": 0.5357142984867096,
"reward_std": 0.25552502274513245,
"rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.5026109963655472,
"step": 152,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8062.0,
"completions/mean_length": 5090.232421875,
"completions/min_length": 1046.0,
"epoch": 0.4872611464968153,
"grad_norm": 0.44915616512298584,
"kl": 0.0460205078125,
"learning_rate": 5.30383778732328e-07,
"loss": -0.08847501873970032,
"memory(GiB)": 175.77,
"reward": 0.4285714477300644,
"reward_std": 0.34905366599559784,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.502610981464386,
"step": 153,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8054.0,
"completions/mean_length": 5964.4287109375,
"completions/min_length": 1407.5,
"epoch": 0.49044585987261147,
"grad_norm": 0.7181938886642456,
"kl": 0.0526123046875,
"learning_rate": 5.253245844193564e-07,
"loss": -0.25980204343795776,
"memory(GiB)": 175.77,
"reward": 0.392857164144516,
"reward_std": 0.4727715849876404,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4959513396024704,
"step": 154,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8052.0,
"completions/mean_length": 6524.339599609375,
"completions/min_length": 1993.0,
"epoch": 0.49363057324840764,
"grad_norm": 0.6525817513465881,
"kl": 0.08203125,
"learning_rate": 5.202627892584065e-07,
"loss": -0.14491651952266693,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.3324785977602005,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4817724674940109,
"step": 155,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8034.5,
"completions/mean_length": 5646.178955078125,
"completions/min_length": 1162.5,
"epoch": 0.4968152866242038,
"grad_norm": 0.7706606388092041,
"kl": 0.07666015625,
"learning_rate": 5.151989130984714e-07,
"loss": -0.09087943285703659,
"memory(GiB)": 175.77,
"reward": 0.2857142984867096,
"reward_std": 0.32695358991622925,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.460043728351593,
"step": 156,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428572,
"completions/max_length": 8046.5,
"completions/mean_length": 5837.58935546875,
"completions/min_length": 1139.5,
"epoch": 0.5,
"grad_norm": 0.4703036844730377,
"kl": 0.03955078125,
"learning_rate": 5.101334760022639e-07,
"loss": -0.15095964074134827,
"memory(GiB)": 175.77,
"reward": 0.3571428805589676,
"reward_std": 0.25552502274513245,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48647117614746094,
"step": 157,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3214285714285714,
"completions/max_length": 8043.5,
"completions/mean_length": 5477.0361328125,
"completions/min_length": 1285.0,
"epoch": 0.5031847133757962,
"grad_norm": 0.975563645362854,
"kl": 0.197265625,
"learning_rate": 5.050669981928055e-07,
"loss": -0.09782678633928299,
"memory(GiB)": 175.77,
"reward": 0.392857164144516,
"reward_std": 0.4396214783191681,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4973474591970444,
"step": 158,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8048.5,
"completions/mean_length": 5391.803955078125,
"completions/min_length": 1264.5,
"epoch": 0.5063694267515924,
"grad_norm": 0.6060774922370911,
"kl": 0.044677734375,
"learning_rate": 5e-07,
"loss": -0.30903252959251404,
"memory(GiB)": 175.77,
"reward": 0.4821428656578064,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.5085247755050659,
"step": 159,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8036.5,
"completions/mean_length": 6244.39306640625,
"completions/min_length": 1104.5,
"epoch": 0.5095541401273885,
"grad_norm": 0.3611801564693451,
"kl": 0.032743350418968475,
"learning_rate": 4.949330018071946e-07,
"loss": -0.14383217692375183,
"memory(GiB)": 175.77,
"reward": 0.4285714626312256,
"reward_std": 0.2253357656300068,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.49173468351364136,
"step": 160,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8052.0,
"completions/mean_length": 6116.875244140625,
"completions/min_length": 1644.5,
"epoch": 0.5127388535031847,
"grad_norm": 0.3732077479362488,
"kl": 0.07177734375,
"learning_rate": 4.898665239977362e-07,
"loss": 0.008805501274764538,
"memory(GiB)": 175.77,
"reward": 0.3035714402794838,
"reward_std": 0.23086079210042953,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714402794838,
"rewards/AnswerTagAccuracyORM/std": 0.4321114122867584,
"step": 161,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8043.5,
"completions/mean_length": 6551.1611328125,
"completions/min_length": 2065.5,
"epoch": 0.5159235668789809,
"grad_norm": 0.40818125009536743,
"kl": 0.0394287109375,
"learning_rate": 4.848010869015287e-07,
"loss": -0.172414168715477,
"memory(GiB)": 175.77,
"reward": 0.2500000149011612,
"reward_std": 0.1428571529686451,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4061589390039444,
"step": 162,
"train_speed(iter/s)": 0.003168
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 8026.5,
"completions/mean_length": 5044.535888671875,
"completions/min_length": 1356.5,
"epoch": 0.5191082802547771,
"grad_norm": 1.3621019124984741,
"kl": 0.4180908203125,
"learning_rate": 4.797372107415935e-07,
"loss": -0.2163340151309967,
"memory(GiB)": 175.77,
"reward": 0.5357142984867096,
"reward_std": 0.42048224806785583,
"rewards/AnswerTagAccuracyORM/mean": 0.5357142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.5065638720989227,
"step": 163,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 8046.5,
"completions/mean_length": 5025.2861328125,
"completions/min_length": 1086.0,
"epoch": 0.5222929936305732,
"grad_norm": 0.5102728605270386,
"kl": 0.0535888671875,
"learning_rate": 4.746754155806437e-07,
"loss": -0.23832020163536072,
"memory(GiB)": 175.77,
"reward": 0.517857164144516,
"reward_std": 0.2721000909805298,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 164,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8029.5,
"completions/mean_length": 5601.375244140625,
"completions/min_length": 1904.5,
"epoch": 0.5254777070063694,
"grad_norm": 0.7859033942222595,
"kl": 0.1533203125,
"learning_rate": 4.69616221267672e-07,
"loss": -0.3384000360965729,
"memory(GiB)": 175.77,
"reward": 0.4642857313156128,
"reward_std": 0.42048226296901703,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4959513396024704,
"step": 165,
"train_speed(iter/s)": 0.00317
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.48214285714285715,
"completions/max_length": 8026.0,
"completions/mean_length": 5810.482421875,
"completions/min_length": 1074.0,
"epoch": 0.5286624203821656,
"grad_norm": 0.5610930323600769,
"kl": 0.06005859375,
"learning_rate": 4.645601473845635e-07,
"loss": -0.1656591296195984,
"memory(GiB)": 175.77,
"reward": 0.3571428656578064,
"reward_std": 0.307814359664917,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.48795005679130554,
"step": 166,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8055.5,
"completions/mean_length": 5735.08935546875,
"completions/min_length": 1439.0,
"epoch": 0.5318471337579618,
"grad_norm": 0.912844717502594,
"kl": 0.1314697265625,
"learning_rate": 4.5950771319273296e-07,
"loss": -0.04387956112623215,
"memory(GiB)": 175.77,
"reward": 0.4107143133878708,
"reward_std": 0.21981074661016464,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 167,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8049.0,
"completions/mean_length": 6489.285888671875,
"completions/min_length": 1574.5,
"epoch": 0.535031847133758,
"grad_norm": 0.3967674970626831,
"kl": 1.4857299551628638e+34,
"learning_rate": 4.544594375797968e-07,
"loss": 0.07432208955287933,
"memory(GiB)": 175.77,
"reward": 0.357142873108387,
"reward_std": 0.2967643216252327,
"rewards/AnswerTagAccuracyORM/mean": 0.357142873108387,
"rewards/AnswerTagAccuracyORM/std": 0.43015046417713165,
"step": 168,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8061.0,
"completions/mean_length": 5971.535888671875,
"completions/min_length": 867.0,
"epoch": 0.5382165605095541,
"grad_norm": 3.6906192302703857,
"kl": 0.22985238194814883,
"learning_rate": 4.4941583900628393e-07,
"loss": -0.2558591365814209,
"memory(GiB)": 175.77,
"reward": 0.267857164144516,
"reward_std": 0.23086077719926834,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.44672515988349915,
"step": 169,
"train_speed(iter/s)": 0.003169
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 8057.5,
"completions/mean_length": 4240.500244140625,
"completions/min_length": 650.0,
"epoch": 0.5414012738853503,
"grad_norm": 0.5648459196090698,
"kl": 0.0484619140625,
"learning_rate": 4.443774354523882e-07,
"loss": -0.026298800483345985,
"memory(GiB)": 175.77,
"reward": 0.5535714626312256,
"reward_std": 0.3324786126613617,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 170,
"train_speed(iter/s)": 0.003171
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8025.0,
"completions/mean_length": 5686.607421875,
"completions/min_length": 1502.0,
"epoch": 0.5445859872611465,
"grad_norm": 0.7092170715332031,
"kl": 0.0477294921875,
"learning_rate": 4.3934474436477253e-07,
"loss": -0.25052332878112793,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.23086077719926834,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 171,
"train_speed(iter/s)": 0.003172
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8053.0,
"completions/mean_length": 5796.964599609375,
"completions/min_length": 1074.0,
"epoch": 0.5477707006369427,
"grad_norm": 0.448478639125824,
"kl": 0.0557861328125,
"learning_rate": 4.3431828260342675e-07,
"loss": -0.27337488532066345,
"memory(GiB)": 175.77,
"reward": 0.3571428656578064,
"reward_std": 0.27762509882450104,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.4635152518749237,
"step": 172,
"train_speed(iter/s)": 0.003172
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8049.5,
"completions/mean_length": 6193.589599609375,
"completions/min_length": 1322.0,
"epoch": 0.5509554140127388,
"grad_norm": 0.4961269795894623,
"kl": 0.0238037109375,
"learning_rate": 4.292985663885853e-07,
"loss": -0.1575016975402832,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.3324786126613617,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4817724674940109,
"step": 173,
"train_speed(iter/s)": 0.003172
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8028.0,
"completions/mean_length": 6504.14306640625,
"completions/min_length": 1982.0,
"epoch": 0.554140127388535,
"grad_norm": 0.8586503267288208,
"kl": 0.0418701171875,
"learning_rate": 4.242861112477118e-07,
"loss": -0.26731064915657043,
"memory(GiB)": 175.77,
"reward": 0.3392857313156128,
"reward_std": 0.4149572402238846,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4786956012248993,
"step": 174,
"train_speed(iter/s)": 0.003172
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8055.0,
"completions/mean_length": 5515.000244140625,
"completions/min_length": 1278.0,
"epoch": 0.5573248407643312,
"grad_norm": 0.5378698110580444,
"kl": 3752.9690551757812,
"learning_rate": 4.192814319625533e-07,
"loss": -0.10730624943971634,
"memory(GiB)": 175.77,
"reward": 0.4285714328289032,
"reward_std": 0.2967643290758133,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4985625743865967,
"step": 175,
"train_speed(iter/s)": 0.003173
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 8059.0,
"completions/mean_length": 5019.607421875,
"completions/min_length": 1034.0,
"epoch": 0.5605095541401274,
"grad_norm": 0.39968299865722656,
"kl": 0.0423583984375,
"learning_rate": 4.1428504251627325e-07,
"loss": 0.0431935153901577,
"memory(GiB)": 175.77,
"reward": 0.535714328289032,
"reward_std": 0.19514648616313934,
"rewards/AnswerTagAccuracyORM/mean": 0.535714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.5078745484352112,
"step": 176,
"train_speed(iter/s)": 0.003175
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.48214285714285715,
"completions/max_length": 8054.5,
"completions/mean_length": 5915.3037109375,
"completions/min_length": 1198.5,
"epoch": 0.5636942675159236,
"grad_norm": 0.39187169075012207,
"kl": 3.19933819770813,
"learning_rate": 4.0929745604066343e-07,
"loss": -0.09193204343318939,
"memory(GiB)": 176.78,
"reward": 0.5000000149011612,
"reward_std": 0.2967643290758133,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4973474442958832,
"step": 177,
"train_speed(iter/s)": 0.003175
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3928571428571429,
"completions/max_length": 8029.5,
"completions/mean_length": 5273.268310546875,
"completions/min_length": 1048.5,
"epoch": 0.5668789808917197,
"grad_norm": 0.6660857796669006,
"kl": 0.1064453125,
"learning_rate": 4.0431918476344685e-07,
"loss": -0.19565197825431824,
"memory(GiB)": 176.78,
"reward": 0.535714328289032,
"reward_std": 0.2967643216252327,
"rewards/AnswerTagAccuracyORM/mean": 0.535714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4959513396024704,
"step": 178,
"train_speed(iter/s)": 0.003177
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428571,
"completions/max_length": 8059.5,
"completions/mean_length": 6514.857421875,
"completions/min_length": 1473.0,
"epoch": 0.5700636942675159,
"grad_norm": 0.698070228099823,
"kl": 0.085205078125,
"learning_rate": 3.9935073995566987e-07,
"loss": -0.09061294049024582,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.25552503019571304,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.460043728351593,
"step": 179,
"train_speed(iter/s)": 0.003177
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8024.0,
"completions/mean_length": 5687.553955078125,
"completions/min_length": 1122.0,
"epoch": 0.5732484076433121,
"grad_norm": 0.31134656071662903,
"kl": 0.03778076171875,
"learning_rate": 3.943926318791963e-07,
"loss": -0.14330630004405975,
"memory(GiB)": 176.78,
"reward": 0.4821428954601288,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428954601288,
"rewards/AnswerTagAccuracyORM/std": 0.4817724674940109,
"step": 180,
"train_speed(iter/s)": 0.003178
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8041.5,
"completions/mean_length": 6372.910888671875,
"completions/min_length": 1543.5,
"epoch": 0.5764331210191083,
"grad_norm": 0.5216385126113892,
"kl": 0.023681640625,
"learning_rate": 3.8944536973430156e-07,
"loss": -0.11478479206562042,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.18409644439816475,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4582767188549042,
"step": 181,
"train_speed(iter/s)": 0.003179
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8024.5,
"completions/mean_length": 6145.232421875,
"completions/min_length": 1394.5,
"epoch": 0.5796178343949044,
"grad_norm": 0.4548920691013336,
"kl": 0.0491943359375,
"learning_rate": 3.845094616073783e-07,
"loss": -0.04961169883608818,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.18409645557403564,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.44368425011634827,
"step": 182,
"train_speed(iter/s)": 0.003179
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.35714285714285715,
"completions/max_length": 8025.0,
"completions/mean_length": 4988.08935546875,
"completions/min_length": 1431.0,
"epoch": 0.5828025477707006,
"grad_norm": 0.45550552010536194,
"kl": 0.0574951171875,
"learning_rate": 3.7958541441875627e-07,
"loss": -0.0993984192609787,
"memory(GiB)": 176.78,
"reward": 0.5535714626312256,
"reward_std": 0.21981073170900345,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 183,
"train_speed(iter/s)": 0.003181
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8052.5,
"completions/mean_length": 6105.107421875,
"completions/min_length": 1428.0,
"epoch": 0.5859872611464968,
"grad_norm": 0.4129337668418884,
"kl": 0.03155517578125,
"learning_rate": 3.7467373387063964e-07,
"loss": -0.18813078105449677,
"memory(GiB)": 176.78,
"reward": 0.4107142984867096,
"reward_std": 0.2610500305891037,
"rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.497912272810936,
"step": 184,
"train_speed(iter/s)": 0.003181
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8026.5,
"completions/mean_length": 5963.446533203125,
"completions/min_length": 1203.5,
"epoch": 0.589171974522293,
"grad_norm": 0.4038497805595398,
"kl": 0.0673828125,
"learning_rate": 3.6977492439517346e-07,
"loss": -0.08315330743789673,
"memory(GiB)": 176.78,
"reward": 0.446428582072258,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4786956012248993,
"step": 185,
"train_speed(iter/s)": 0.003182
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8037.5,
"completions/mean_length": 6319.321533203125,
"completions/min_length": 1822.0,
"epoch": 0.5923566878980892,
"grad_norm": 1.8196825981140137,
"kl": 0.7462158203125,
"learning_rate": 3.648894891026358e-07,
"loss": -0.03824207931756973,
"memory(GiB)": 176.78,
"reward": 0.3035714328289032,
"reward_std": 0.1785714402794838,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.4644542932510376,
"step": 186,
"train_speed(iter/s)": 0.003183
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8033.5,
"completions/mean_length": 5305.8037109375,
"completions/min_length": 942.0,
"epoch": 0.5955414012738853,
"grad_norm": 0.41588449478149414,
"kl": 0.025146484375,
"learning_rate": 3.600179297297695e-07,
"loss": -0.10517025738954544,
"memory(GiB)": 176.78,
"reward": 0.517857164144516,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 187,
"train_speed(iter/s)": 0.003184
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8029.5,
"completions/mean_length": 5222.250244140625,
"completions/min_length": 1221.5,
"epoch": 0.5987261146496815,
"grad_norm": 0.42883527278900146,
"kl": 0.033935546875,
"learning_rate": 3.5516074658825397e-07,
"loss": -0.04254484549164772,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.2967643141746521,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.49734747409820557,
"step": 188,
"train_speed(iter/s)": 0.003185
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.35714285714285715,
"completions/max_length": 8027.5,
"completions/mean_length": 4837.39306640625,
"completions/min_length": 1332.0,
"epoch": 0.6019108280254777,
"grad_norm": 0.6678107380867004,
"kl": 0.08056640625,
"learning_rate": 3.50318438513321e-07,
"loss": -0.23368430137634277,
"memory(GiB)": 176.78,
"reward": 0.4285714626312256,
"reward_std": 0.3681928962469101,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.49173466861248016,
"step": 189,
"train_speed(iter/s)": 0.003187
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8052.0,
"completions/mean_length": 6229.303955078125,
"completions/min_length": 2022.0,
"epoch": 0.6050955414012739,
"grad_norm": 0.6864339113235474,
"kl": 1.5787729148315552e-41,
"learning_rate": 3.454915028125263e-07,
"loss": -0.17627106606960297,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.3078143820166588,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.5065638422966003,
"step": 190,
"train_speed(iter/s)": 0.003187
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8023.5,
"completions/mean_length": 5785.803955078125,
"completions/min_length": 1411.0,
"epoch": 0.60828025477707,
"grad_norm": 0.4775516092777252,
"kl": 0.027587890625,
"learning_rate": 3.406804352146742e-07,
"loss": -0.1085924282670021,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.1896214708685875,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 191,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8018.0,
"completions/mean_length": 6275.1611328125,
"completions/min_length": 1364.0,
"epoch": 0.6114649681528662,
"grad_norm": 0.3889318108558655,
"kl": 0.0579833984375,
"learning_rate": 3.3588572981890684e-07,
"loss": -0.09743942320346832,
"memory(GiB)": 176.78,
"reward": 0.2321428656578064,
"reward_std": 0.14838216453790665,
"rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.429407000541687,
"step": 192,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8026.0,
"completions/mean_length": 6002.982421875,
"completions/min_length": 805.0,
"epoch": 0.6146496815286624,
"grad_norm": 0.6355292201042175,
"kl": 0.0274658203125,
"learning_rate": 3.311078790439598e-07,
"loss": -0.29961735010147095,
"memory(GiB)": 176.78,
"reward": 0.4285714626312256,
"reward_std": 0.3078143820166588,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.5039526224136353,
"step": 193,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8070.0,
"completions/mean_length": 6182.6787109375,
"completions/min_length": 1595.0,
"epoch": 0.6178343949044586,
"grad_norm": 0.5008262395858765,
"kl": 0.0277099609375,
"learning_rate": 3.263473735775899e-07,
"loss": -0.17556621134281158,
"memory(GiB)": 176.78,
"reward": 0.3750000298023224,
"reward_std": 0.3243894428014755,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 194,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.48214285714285715,
"completions/max_length": 8046.5,
"completions/mean_length": 6006.875244140625,
"completions/min_length": 1325.0,
"epoch": 0.6210191082802548,
"grad_norm": 0.6817752122879028,
"kl": 0.0562744140625,
"learning_rate": 3.2160470232618225e-07,
"loss": -0.2873086631298065,
"memory(GiB)": 176.78,
"reward": 0.267857164144516,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.44672515988349915,
"step": 195,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8042.5,
"completions/mean_length": 5104.1787109375,
"completions/min_length": 1014.0,
"epoch": 0.6242038216560509,
"grad_norm": 0.40815719962120056,
"kl": 22.176969528198242,
"learning_rate": 3.1688035236453865e-07,
"loss": 0.02237345091998577,
"memory(GiB)": 176.78,
"reward": 0.4464285969734192,
"reward_std": 0.23086077719926834,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 196,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8032.0,
"completions/mean_length": 6547.00048828125,
"completions/min_length": 1434.5,
"epoch": 0.6273885350318471,
"grad_norm": 0.4397486746311188,
"kl": 0.02099609375,
"learning_rate": 3.121748088858549e-07,
"loss": -0.10549207031726837,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.3681929111480713,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4114224463701248,
"step": 197,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8035.0,
"completions/mean_length": 5620.51806640625,
"completions/min_length": 597.0,
"epoch": 0.6305732484076433,
"grad_norm": 27.555814743041992,
"kl": 2.0523681640625,
"learning_rate": 3.0748855515189096e-07,
"loss": -0.24800211191177368,
"memory(GiB)": 176.78,
"reward": 0.464285746216774,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.464285746216774,
"rewards/AnswerTagAccuracyORM/std": 0.4739968776702881,
"step": 198,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.42857142857142855,
"completions/max_length": 8045.5,
"completions/mean_length": 5171.607421875,
"completions/min_length": 424.0,
"epoch": 0.6337579617834395,
"grad_norm": 0.4833768606185913,
"kl": 1.5786327849851227e-41,
"learning_rate": 3.028220724433408e-07,
"loss": -0.06498469412326813,
"memory(GiB)": 176.78,
"reward": 0.375,
"reward_std": 0.30228936672210693,
"rewards/AnswerTagAccuracyORM/mean": 0.375,
"rewards/AnswerTagAccuracyORM/std": 0.4750668406486511,
"step": 199,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8049.0,
"completions/mean_length": 5697.178955078125,
"completions/min_length": 1219.0,
"epoch": 0.6369426751592356,
"grad_norm": 1.165695071220398,
"kl": 1483.6046142578125,
"learning_rate": 2.981758400104028e-07,
"loss": -0.07902750372886658,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.2967643439769745,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4973474442958832,
"step": 200,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8044.0,
"completions/mean_length": 6021.26806640625,
"completions/min_length": 1541.5,
"epoch": 0.6401273885350318,
"grad_norm": 0.3752177655696869,
"kl": 0.0467529296875,
"learning_rate": 2.9355033502356194e-07,
"loss": -0.0795094221830368,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.21981073915958405,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.47245559096336365,
"step": 201,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571429,
"completions/max_length": 8032.5,
"completions/mean_length": 6366.946533203125,
"completions/min_length": 1789.0,
"epoch": 0.643312101910828,
"grad_norm": 0.5095097422599792,
"kl": 0.042724609375,
"learning_rate": 2.8894603252458403e-07,
"loss": -0.24165716767311096,
"memory(GiB)": 176.78,
"reward": 0.2678571492433548,
"reward_std": 0.30228933691978455,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050114393234253,
"step": 202,
"train_speed(iter/s)": 0.003189
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8052.5,
"completions/mean_length": 6997.57177734375,
"completions/min_length": 1818.5,
"epoch": 0.6464968152866242,
"grad_norm": 0.716253399848938,
"kl": 0.0411376953125,
"learning_rate": 2.8436340537772794e-07,
"loss": -0.3257341980934143,
"memory(GiB)": 176.78,
"reward": 0.2500000149011612,
"reward_std": 0.2967643216252327,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4389495849609375,
"step": 203,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8052.0,
"completions/mean_length": 6138.303955078125,
"completions/min_length": 1044.5,
"epoch": 0.6496815286624203,
"grad_norm": 0.6485111713409424,
"kl": 0.0439453125,
"learning_rate": 2.7980292422118277e-07,
"loss": -0.20691558718681335,
"memory(GiB)": 176.78,
"reward": 0.535714328289032,
"reward_std": 0.3902929872274399,
"rewards/AnswerTagAccuracyORM/mean": 0.535714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.5078744888305664,
"step": 204,
"train_speed(iter/s)": 0.00319
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.30357142857142855,
"completions/max_length": 8048.0,
"completions/mean_length": 5014.732177734375,
"completions/min_length": 1183.5,
"epoch": 0.6528662420382165,
"grad_norm": 0.4565260410308838,
"kl": 0.0528564453125,
"learning_rate": 2.75265057418733e-07,
"loss": -0.19542962312698364,
"memory(GiB)": 176.78,
"reward": 0.446428582072258,
"reward_std": 0.21981074661016464,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.5032612979412079,
"step": 205,
"train_speed(iter/s)": 0.003192
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8044.0,
"completions/mean_length": 6365.250244140625,
"completions/min_length": 2251.0,
"epoch": 0.6560509554140127,
"grad_norm": 0.7114933729171753,
"kl": 0.169677734375,
"learning_rate": 2.70750271011657e-07,
"loss": -0.12340293079614639,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4691530168056488,
"step": 206,
"train_speed(iter/s)": 0.003192
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8031.5,
"completions/mean_length": 5673.46435546875,
"completions/min_length": 1684.0,
"epoch": 0.6592356687898089,
"grad_norm": 0.5769023299217224,
"kl": 0.03271484375,
"learning_rate": 2.6625902867086447e-07,
"loss": -0.10204778611660004,
"memory(GiB)": 176.78,
"reward": 0.4821428656578064,
"reward_std": 0.2721000798046589,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.5085248351097107,
"step": 207,
"train_speed(iter/s)": 0.003193
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.35714285714285715,
"completions/max_length": 8049.0,
"completions/mean_length": 5337.607421875,
"completions/min_length": 1091.5,
"epoch": 0.6624203821656051,
"grad_norm": 7.4786057472229,
"kl": 4.21484375,
"learning_rate": 2.6179179164927754e-07,
"loss": 0.045502904802560806,
"memory(GiB)": 176.78,
"reward": 0.3750000149011612,
"reward_std": 0.14838217198848724,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.49264875054359436,
"step": 208,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8050.5,
"completions/mean_length": 6416.553955078125,
"completions/min_length": 2526.0,
"epoch": 0.6656050955414012,
"grad_norm": 0.5907986760139465,
"kl": NaN,
"learning_rate": 2.5734901873445956e-07,
"loss": -0.22783614695072174,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.32695358991622925,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968925714493,
"step": 209,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8041.0,
"completions/mean_length": 5512.57177734375,
"completions/min_length": 1217.5,
"epoch": 0.6687898089171974,
"grad_norm": 6.297171115875244,
"kl": 1.9635009765625,
"learning_rate": 2.529311662014972e-07,
"loss": -0.09180951863527298,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.30228934437036514,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4817724674940109,
"step": 210,
"train_speed(iter/s)": 0.003195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571428,
"completions/max_length": 8047.5,
"completions/mean_length": 6852.94677734375,
"completions/min_length": 2993.5,
"epoch": 0.6719745222929936,
"grad_norm": 0.5260760188102722,
"kl": 0.0394287109375,
"learning_rate": 2.485386877661411e-07,
"loss": -0.13763374090194702,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.32695360481739044,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.458276703953743,
"step": 211,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428572,
"completions/max_length": 8028.0,
"completions/mean_length": 6248.76806640625,
"completions/min_length": 1682.0,
"epoch": 0.6751592356687898,
"grad_norm": 2.8984975814819336,
"kl": 0.337890625,
"learning_rate": 2.441720345382089e-07,
"loss": -0.07186296582221985,
"memory(GiB)": 176.78,
"reward": 0.2678571492433548,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050112903118134,
"step": 212,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8037.0,
"completions/mean_length": 5828.625244140625,
"completions/min_length": 1334.0,
"epoch": 0.678343949044586,
"grad_norm": 47.080162048339844,
"kl": 19.67626953125,
"learning_rate": 2.3983165497525596e-07,
"loss": 0.14914795756340027,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.32695358991622925,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4600437134504318,
"step": 213,
"train_speed(iter/s)": 0.003193
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.39285714285714285,
"completions/max_length": 8034.5,
"completions/mean_length": 5060.107421875,
"completions/min_length": 1518.5,
"epoch": 0.6815286624203821,
"grad_norm": 0.5471820831298828,
"kl": 0.0250244140625,
"learning_rate": 2.355179948365189e-07,
"loss": -0.1757272332906723,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4959513247013092,
"step": 214,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571429,
"completions/max_length": 8038.5,
"completions/mean_length": 5785.375244140625,
"completions/min_length": 1677.0,
"epoch": 0.6847133757961783,
"grad_norm": 0.6869480013847351,
"kl": 0.026123046875,
"learning_rate": 2.3123149713713468e-07,
"loss": -0.2949236035346985,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.307814359664917,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4609040319919586,
"step": 215,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8027.5,
"completions/mean_length": 6098.482421875,
"completions/min_length": 1733.5,
"epoch": 0.6878980891719745,
"grad_norm": 0.38128015398979187,
"kl": 0.0662841796875,
"learning_rate": 2.26972602102645e-07,
"loss": 0.0502837672829628,
"memory(GiB)": 176.78,
"reward": 0.5000000298023224,
"reward_std": 0.2253357619047165,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.48795005679130554,
"step": 216,
"train_speed(iter/s)": 0.003194
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8033.5,
"completions/mean_length": 6116.053955078125,
"completions/min_length": 1466.5,
"epoch": 0.6910828025477707,
"grad_norm": 0.926658034324646,
"kl": 0.076416015625,
"learning_rate": 2.2274174712378207e-07,
"loss": -0.12618423998355865,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 217,
"train_speed(iter/s)": 0.003195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8053.5,
"completions/mean_length": 5445.69677734375,
"completions/min_length": 853.0,
"epoch": 0.6942675159235668,
"grad_norm": 0.6047350764274597,
"kl": 51.23466873168945,
"learning_rate": 2.1853936671155127e-07,
"loss": -0.11778053641319275,
"memory(GiB)": 176.78,
"reward": 0.4285714626312256,
"reward_std": 0.49191083014011383,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.49173468351364136,
"step": 218,
"train_speed(iter/s)": 0.003195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8047.0,
"completions/mean_length": 6047.035888671875,
"completions/min_length": 1355.5,
"epoch": 0.697452229299363,
"grad_norm": 0.9835929274559021,
"kl": 0.0267333984375,
"learning_rate": 2.1436589245260372e-07,
"loss": -0.29198572039604187,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.37371790409088135,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.47245559096336365,
"step": 219,
"train_speed(iter/s)": 0.003195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8063.5,
"completions/mean_length": 5899.7861328125,
"completions/min_length": 1418.5,
"epoch": 0.7006369426751592,
"grad_norm": 0.48660025000572205,
"kl": 0.0634765625,
"learning_rate": 2.1022175296491512e-07,
"loss": -0.18401746451854706,
"memory(GiB)": 176.78,
"reward": 0.4107143133878708,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 220,
"train_speed(iter/s)": 0.003195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8063.5,
"completions/mean_length": 5425.0003662109375,
"completions/min_length": 1516.5,
"epoch": 0.7038216560509554,
"grad_norm": 0.38574594259262085,
"kl": 0.03814697265625,
"learning_rate": 2.0610737385376348e-07,
"loss": -0.04302213340997696,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.25552501529455185,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4389495849609375,
"step": 221,
"train_speed(iter/s)": 0.003195
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3035714285714286,
"completions/max_length": 8049.5,
"completions/mean_length": 5389.4287109375,
"completions/min_length": 1246.5,
"epoch": 0.7070063694267515,
"grad_norm": 5124.12841796875,
"kl": 274.0233154296875,
"learning_rate": 2.0202317766802152e-07,
"loss": 0.4420851469039917,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.3078143745660782,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4973474591970444,
"step": 222,
"train_speed(iter/s)": 0.003196
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.26785714285714285,
"completions/max_length": 8044.5,
"completions/mean_length": 4609.19677734375,
"completions/min_length": 923.5,
"epoch": 0.7101910828025477,
"grad_norm": 4.193013668060303,
"kl": 2.1361083984375,
"learning_rate": 1.9796958385675965e-07,
"loss": 0.02818513847887516,
"memory(GiB)": 176.78,
"reward": 0.6071428954601288,
"reward_std": 0.2253357544541359,
"rewards/AnswerTagAccuracyORM/mean": 0.6071428954601288,
"rewards/AnswerTagAccuracyORM/std": 0.4959513247013092,
"step": 223,
"train_speed(iter/s)": 0.003197
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8045.5,
"completions/mean_length": 6820.643310546875,
"completions/min_length": 1928.0,
"epoch": 0.7133757961783439,
"grad_norm": 0.7545607089996338,
"kl": 0.048828125,
"learning_rate": 1.9394700872616853e-07,
"loss": -0.3313080370426178,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.45619654655456543,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.47245559096336365,
"step": 224,
"train_speed(iter/s)": 0.003197
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8044.0,
"completions/mean_length": 5816.178955078125,
"completions/min_length": 1331.5,
"epoch": 0.7165605095541401,
"grad_norm": 0.531544029712677,
"kl": 0.094482421875,
"learning_rate": 1.899558653968042e-07,
"loss": -0.11124253273010254,
"memory(GiB)": 176.78,
"reward": 0.4107143133878708,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500482559204,
"step": 225,
"train_speed(iter/s)": 0.003197
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.39285714285714285,
"completions/max_length": 8038.5,
"completions/mean_length": 5465.143310546875,
"completions/min_length": 1681.0,
"epoch": 0.7197452229299363,
"grad_norm": 0.45902571082115173,
"kl": 0.0625,
"learning_rate": 1.8599656376116024e-07,
"loss": -0.2800550162792206,
"memory(GiB)": 176.78,
"reward": 0.5000000149011612,
"reward_std": 0.3490536957979202,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.49734747409820557,
"step": 226,
"train_speed(iter/s)": 0.003198
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8061.0,
"completions/mean_length": 5977.035888671875,
"completions/min_length": 1405.5,
"epoch": 0.7229299363057324,
"grad_norm": 0.40895432233810425,
"kl": 0.06298828125,
"learning_rate": 1.820695104415721e-07,
"loss": -0.07818439602851868,
"memory(GiB)": 176.78,
"reward": 0.267857164144516,
"reward_std": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.44672515988349915,
"step": 227,
"train_speed(iter/s)": 0.003199
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8046.0,
"completions/mean_length": 6236.82177734375,
"completions/min_length": 2348.5,
"epoch": 0.7261146496815286,
"grad_norm": 1.3278292417526245,
"kl": 0.188232421875,
"learning_rate": 1.7817510874845582e-07,
"loss": -0.1289130449295044,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.19514649361371994,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968925714493,
"step": 228,
"train_speed(iter/s)": 0.003199
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8058.0,
"completions/mean_length": 6219.535888671875,
"completions/min_length": 2286.5,
"epoch": 0.7292993630573248,
"grad_norm": 2.8997809886932373,
"kl": 1.2813720703125,
"learning_rate": 1.7431375863888898e-07,
"loss": -0.2479753941297531,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.32695358991622925,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4489477574825287,
"step": 229,
"train_speed(iter/s)": 0.0032
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8028.5,
"completions/mean_length": 5949.357421875,
"completions/min_length": 1292.5,
"epoch": 0.732484076433121,
"grad_norm": 0.37234172224998474,
"kl": 0.020263671875,
"learning_rate": 1.7048585667553412e-07,
"loss": 0.023813849315047264,
"memory(GiB)": 176.78,
"reward": 0.375,
"reward_std": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/mean": 0.375,
"rewards/AnswerTagAccuracyORM/std": 0.4750668406486511,
"step": 230,
"train_speed(iter/s)": 0.0032
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8021.5,
"completions/mean_length": 5838.39306640625,
"completions/min_length": 1460.0,
"epoch": 0.7356687898089171,
"grad_norm": 0.3578107953071594,
"kl": 0.1229248046875,
"learning_rate": 1.6669179598591183e-07,
"loss": -0.21507693827152252,
"memory(GiB)": 176.78,
"reward": 0.5535714626312256,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.49791228771209717,
"step": 231,
"train_speed(iter/s)": 0.0032
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3035714285714286,
"completions/max_length": 8058.5,
"completions/mean_length": 4920.39306640625,
"completions/min_length": 1596.0,
"epoch": 0.7388535031847133,
"grad_norm": 0.4056342840194702,
"kl": 0.05517578125,
"learning_rate": 1.6293196622202632e-07,
"loss": 0.024090681225061417,
"memory(GiB)": 176.78,
"reward": 0.517857164144516,
"reward_std": 0.2721000760793686,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 232,
"train_speed(iter/s)": 0.003201
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8033.5,
"completions/mean_length": 5438.732421875,
"completions/min_length": 2205.0,
"epoch": 0.7420382165605095,
"grad_norm": 0.5016524195671082,
"kl": 0.0782470703125,
"learning_rate": 1.592067535203479e-07,
"loss": -0.18710020184516907,
"memory(GiB)": 176.78,
"reward": 0.517857164144516,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.429407000541687,
"step": 233,
"train_speed(iter/s)": 0.003202
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8018.0,
"completions/mean_length": 5327.535888671875,
"completions/min_length": 1035.5,
"epoch": 0.7452229299363057,
"grad_norm": 0.6563963294029236,
"kl": 0.0595703125,
"learning_rate": 1.555165404621567e-07,
"loss": -0.38865458965301514,
"memory(GiB)": 176.78,
"reward": 0.5,
"reward_std": 0.40943218767642975,
"rewards/AnswerTagAccuracyORM/mean": 0.5,
"rewards/AnswerTagAccuracyORM/std": 0.5091750919818878,
"step": 234,
"train_speed(iter/s)": 0.003202
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6607142857142857,
"completions/max_length": 8040.5,
"completions/mean_length": 6642.339599609375,
"completions/min_length": 1762.5,
"epoch": 0.7484076433121019,
"grad_norm": 0.815308153629303,
"kl": 0.0169677734375,
"learning_rate": 1.518617060342513e-07,
"loss": -0.25266438722610474,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.3078143820166588,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968925714493,
"step": 235,
"train_speed(iter/s)": 0.003202
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8059.5,
"completions/mean_length": 5736.250244140625,
"completions/min_length": 998.0,
"epoch": 0.7515923566878981,
"grad_norm": 0.5217832922935486,
"kl": 0.04364013671875,
"learning_rate": 1.4824262559002592e-07,
"loss": -0.21336083114147186,
"memory(GiB)": 176.78,
"reward": 0.3750000298023224,
"reward_std": 0.30228935927152634,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.4628649652004242,
"step": 236,
"train_speed(iter/s)": 0.003203
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8044.5,
"completions/mean_length": 5882.714599609375,
"completions/min_length": 1346.5,
"epoch": 0.7547770700636943,
"grad_norm": 0.5135810971260071,
"kl": 0.0560302734375,
"learning_rate": 1.4465967081092345e-07,
"loss": -0.15495836734771729,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.26657506823539734,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.4582767188549042,
"step": 237,
"train_speed(iter/s)": 0.003203
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6428571428571429,
"completions/max_length": 8056.5,
"completions/mean_length": 6505.57177734375,
"completions/min_length": 1349.0,
"epoch": 0.7579617834394905,
"grad_norm": 0.20377548038959503,
"kl": 1.419865668977121e-41,
"learning_rate": 1.4111320966826057e-07,
"loss": -0.06175333261489868,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.2253357470035553,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4691530168056488,
"step": 238,
"train_speed(iter/s)": 0.003203
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8044.0,
"completions/mean_length": 5583.607421875,
"completions/min_length": 1573.5,
"epoch": 0.7611464968152867,
"grad_norm": 0.7509793639183044,
"kl": 0.03759765625,
"learning_rate": 1.376036063854401e-07,
"loss": -0.21666017174720764,
"memory(GiB)": 176.78,
"reward": 0.4285714477300644,
"reward_std": 0.32695360481739044,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.5026109665632248,
"step": 239,
"train_speed(iter/s)": 0.003204
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.39285714285714285,
"completions/max_length": 8051.0,
"completions/mean_length": 5945.732421875,
"completions/min_length": 1600.0,
"epoch": 0.7643312101910829,
"grad_norm": 0.36760520935058594,
"kl": 0.0548095703125,
"learning_rate": 1.3413122140054217e-07,
"loss": -0.12507101893424988,
"memory(GiB)": 176.78,
"reward": 0.3571428656578064,
"reward_std": 0.2967643365263939,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.48795004189014435,
"step": 240,
"train_speed(iter/s)": 0.003205
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8023.5,
"completions/mean_length": 5641.893310546875,
"completions/min_length": 1097.5,
"epoch": 0.767515923566879,
"grad_norm": 0.5026525259017944,
"kl": 0.0391845703125,
"learning_rate": 1.3069641132930926e-07,
"loss": -0.16692417860031128,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.2967643365263939,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.5065638720989227,
"step": 241,
"train_speed(iter/s)": 0.003206
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8050.0,
"completions/mean_length": 6501.6787109375,
"completions/min_length": 2805.0,
"epoch": 0.7707006369426752,
"grad_norm": 0.4156489968299866,
"kl": 0.023803617145823353,
"learning_rate": 1.272995289285202e-07,
"loss": -0.18214215338230133,
"memory(GiB)": 176.78,
"reward": 0.2857142984867096,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.2857142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.45290274918079376,
"step": 242,
"train_speed(iter/s)": 0.003206
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8062.0,
"completions/mean_length": 6547.01806640625,
"completions/min_length": 1906.5,
"epoch": 0.7738853503184714,
"grad_norm": 0.29880252480506897,
"kl": 0.0003601927019190043,
"learning_rate": 1.2394092305976272e-07,
"loss": -0.12600275874137878,
"memory(GiB)": 176.78,
"reward": 0.267857164144516,
"reward_std": 0.23086078837513924,
"rewards/AnswerTagAccuracyORM/mean": 0.267857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.40946151316165924,
"step": 243,
"train_speed(iter/s)": 0.003206
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8044.5,
"completions/mean_length": 6625.714599609375,
"completions/min_length": 1873.5,
"epoch": 0.7770700636942676,
"grad_norm": 0.6597678661346436,
"kl": 556.0035400390625,
"learning_rate": 1.2062093865360457e-07,
"loss": -0.10461865365505219,
"memory(GiB)": 176.78,
"reward": 0.160714291036129,
"reward_std": 0.29123931378126144,
"rewards/AnswerTagAccuracyORM/mean": 0.160714291036129,
"rewards/AnswerTagAccuracyORM/std": 0.3731846809387207,
"step": 244,
"train_speed(iter/s)": 0.003206
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8078.0,
"completions/mean_length": 5632.14306640625,
"completions/min_length": 2093.0,
"epoch": 0.7802547770700637,
"grad_norm": 0.4589294493198395,
"kl": 0.025390625,
"learning_rate": 1.1733991667416926e-07,
"loss": 0.09096799790859222,
"memory(GiB)": 176.78,
"reward": 0.5000000298023224,
"reward_std": 0.33800365030765533,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.5078744888305664,
"step": 245,
"train_speed(iter/s)": 0.003207
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8042.5,
"completions/mean_length": 5813.6611328125,
"completions/min_length": 1675.5,
"epoch": 0.7834394904458599,
"grad_norm": 0.5432755947113037,
"kl": 0.048095703125,
"learning_rate": 1.1409819408411897e-07,
"loss": -0.0925760492682457,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.25552502274513245,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4582767188549042,
"step": 246,
"train_speed(iter/s)": 0.003207
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8064.0,
"completions/mean_length": 5805.607421875,
"completions/min_length": 1816.5,
"epoch": 0.7866242038216561,
"grad_norm": 0.5733675360679626,
"kl": 0.028795340098440647,
"learning_rate": 1.108961038100481e-07,
"loss": 0.0018586559453979135,
"memory(GiB)": 176.78,
"reward": 0.446428582072258,
"reward_std": 0.23086076974868774,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4786955714225769,
"step": 247,
"train_speed(iter/s)": 0.003208
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 8042.0,
"completions/mean_length": 5231.625244140625,
"completions/min_length": 1361.0,
"epoch": 0.7898089171974523,
"grad_norm": 0.3587133586406708,
"kl": 2.521376371383667,
"learning_rate": 1.0773397470829143e-07,
"loss": -0.1912791132926941,
"memory(GiB)": 176.78,
"reward": 0.6071428954601288,
"reward_std": 0.34905368089675903,
"rewards/AnswerTagAccuracyORM/mean": 0.6071428954601288,
"rewards/AnswerTagAccuracyORM/std": 0.4609040319919586,
"step": 248,
"train_speed(iter/s)": 0.003209
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8039.0,
"completions/mean_length": 5721.500244140625,
"completions/min_length": 1892.0,
"epoch": 0.7929936305732485,
"grad_norm": 0.48776480555534363,
"kl": 0.077392578125,
"learning_rate": 1.0461213153115079e-07,
"loss": -0.07181628048419952,
"memory(GiB)": 176.78,
"reward": 0.5000000298023224,
"reward_std": 0.34905368089675903,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.5078745186328888,
"step": 249,
"train_speed(iter/s)": 0.00321
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8027.5,
"completions/mean_length": 5601.21435546875,
"completions/min_length": 1040.0,
"epoch": 0.7961783439490446,
"grad_norm": 0.2700536251068115,
"kl": 0.0224609375,
"learning_rate": 1.0153089489354256e-07,
"loss": -0.038272541016340256,
"memory(GiB)": 176.78,
"reward": 0.2500000149011612,
"reward_std": 0.11266787722706795,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4061589390039444,
"step": 250,
"train_speed(iter/s)": 0.003211
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8045.0,
"completions/mean_length": 6220.410888671875,
"completions/min_length": 1321.5,
"epoch": 0.7993630573248408,
"grad_norm": 0.547614574432373,
"kl": 0.04052734375,
"learning_rate": 9.849058124007043e-08,
"loss": -0.17825300991535187,
"memory(GiB)": 176.78,
"reward": 0.4107142984867096,
"reward_std": 0.37371791899204254,
"rewards/AnswerTagAccuracyORM/mean": 0.4107142984867096,
"rewards/AnswerTagAccuracyORM/std": 0.497912272810936,
"step": 251,
"train_speed(iter/s)": 0.003211
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4464285714285714,
"completions/max_length": 8047.5,
"completions/mean_length": 6005.39306640625,
"completions/min_length": 1652.0,
"epoch": 0.802547770700637,
"grad_norm": 0.3931257426738739,
"kl": 0.0443115234375,
"learning_rate": 9.549150281252632e-08,
"loss": 0.03433360904455185,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.2253357470035553,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.5078744888305664,
"step": 252,
"train_speed(iter/s)": 0.003212
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8042.5,
"completions/mean_length": 6229.875244140625,
"completions/min_length": 1784.5,
"epoch": 0.8057324840764332,
"grad_norm": 0.9034252166748047,
"kl": 0.07275390625,
"learning_rate": 9.253396761782306e-08,
"loss": 0.06343323737382889,
"memory(GiB)": 176.78,
"reward": 0.3750000149011612,
"reward_std": 0.3324786275625229,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.4839591085910797,
"step": 253,
"train_speed(iter/s)": 0.003212
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3214285714285714,
"completions/max_length": 8007.0,
"completions/mean_length": 4358.8216552734375,
"completions/min_length": 924.5,
"epoch": 0.8089171974522293,
"grad_norm": 0.36683788895606995,
"kl": 0.0548095703125,
"learning_rate": 8.961827939636196e-08,
"loss": -0.12078238278627396,
"memory(GiB)": 176.78,
"reward": 0.3750000149011612,
"reward_std": 0.21981072798371315,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.49264873564243317,
"step": 254,
"train_speed(iter/s)": 0.003213
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.26785714285714285,
"completions/max_length": 8039.5,
"completions/mean_length": 4816.250244140625,
"completions/min_length": 1190.0,
"epoch": 0.8121019108280255,
"grad_norm": 0.5869824886322021,
"kl": 0.0762939453125,
"learning_rate": 8.6744737590838e-08,
"loss": -0.1321803778409958,
"memory(GiB)": 176.78,
"reward": 0.5535714328289032,
"reward_std": 0.3324786275625229,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.5032612830400467,
"step": 255,
"train_speed(iter/s)": 0.003214
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.30357142857142855,
"completions/max_length": 8047.0,
"completions/mean_length": 5286.732421875,
"completions/min_length": 1331.5,
"epoch": 0.8152866242038217,
"grad_norm": 0.562877893447876,
"kl": 0.0516357421875,
"learning_rate": 8.391363731548811e-08,
"loss": -0.20328308641910553,
"memory(GiB)": 176.78,
"reward": 0.5535714477300644,
"reward_std": 0.38476796448230743,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4786955863237381,
"step": 256,
"train_speed(iter/s)": 0.003215
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8021.0,
"completions/mean_length": 5327.553955078125,
"completions/min_length": 1074.0,
"epoch": 0.8184713375796179,
"grad_norm": 0.4446698725223541,
"kl": 0.0565185546875,
"learning_rate": 8.112526932578117e-08,
"loss": -0.039517223834991455,
"memory(GiB)": 176.78,
"reward": 0.4821428656578064,
"reward_std": 0.37371791899204254,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.5085247755050659,
"step": 257,
"train_speed(iter/s)": 0.003215
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8030.5,
"completions/mean_length": 5912.285888671875,
"completions/min_length": 1241.0,
"epoch": 0.821656050955414,
"grad_norm": 0.6693341732025146,
"kl": 0.130615234375,
"learning_rate": 7.837991998855897e-08,
"loss": -0.08013840764760971,
"memory(GiB)": 176.78,
"reward": 0.3214285969734192,
"reward_std": 0.2967643216252327,
"rewards/AnswerTagAccuracyORM/mean": 0.3214285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4609040319919586,
"step": 258,
"train_speed(iter/s)": 0.003216
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8041.5,
"completions/mean_length": 5953.500244140625,
"completions/min_length": 1419.0,
"epoch": 0.8248407643312102,
"grad_norm": 0.6183769702911377,
"kl": 0.085205078125,
"learning_rate": 7.567787125262449e-08,
"loss": -0.2941018044948578,
"memory(GiB)": 176.78,
"reward": 0.3571428805589676,
"reward_std": 0.19514649361371994,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48199816048145294,
"step": 259,
"train_speed(iter/s)": 0.003216
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8020.0,
"completions/mean_length": 6054.143310546875,
"completions/min_length": 1255.0,
"epoch": 0.8280254777070064,
"grad_norm": 0.35015594959259033,
"kl": 0.024169921875,
"learning_rate": 7.301940061978722e-08,
"loss": 0.021139614284038544,
"memory(GiB)": 176.78,
"reward": 0.2321428656578064,
"reward_std": 0.29123930633068085,
"rewards/AnswerTagAccuracyORM/mean": 0.2321428656578064,
"rewards/AnswerTagAccuracyORM/std": 0.425032377243042,
"step": 260,
"train_speed(iter/s)": 0.003215
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 8042.5,
"completions/mean_length": 6572.178955078125,
"completions/min_length": 2292.5,
"epoch": 0.8312101910828026,
"grad_norm": 0.41886037588119507,
"kl": 0.01409912109375,
"learning_rate": 7.040478111636228e-08,
"loss": -0.14545117318630219,
"memory(GiB)": 176.78,
"reward": 0.2678571492433548,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.45050114393234253,
"step": 261,
"train_speed(iter/s)": 0.003215
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8034.5,
"completions/mean_length": 5691.9287109375,
"completions/min_length": 1192.0,
"epoch": 0.8343949044585988,
"grad_norm": 1.4889556169509888,
"kl": 0.3389892578125,
"learning_rate": 6.783428126513125e-08,
"loss": -0.09831805527210236,
"memory(GiB)": 176.78,
"reward": 0.2142857201397419,
"reward_std": 0.19514648616313934,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857201397419,
"rewards/AnswerTagAccuracyORM/std": 0.37510764598846436,
"step": 262,
"train_speed(iter/s)": 0.003216
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8028.0,
"completions/mean_length": 5496.607421875,
"completions/min_length": 1581.0,
"epoch": 0.8375796178343949,
"grad_norm": 0.2921965718269348,
"kl": 0.0533447265625,
"learning_rate": 6.530816505776443e-08,
"loss": -0.031957417726516724,
"memory(GiB)": 176.78,
"reward": 0.4107143133878708,
"reward_std": 0.1896214708685875,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.4839591085910797,
"step": 263,
"train_speed(iter/s)": 0.003217
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8054.5,
"completions/mean_length": 5091.857421875,
"completions/min_length": 1231.0,
"epoch": 0.8407643312101911,
"grad_norm": 0.24407930672168732,
"kl": 0.06640625,
"learning_rate": 6.282669192770895e-08,
"loss": -0.011408509686589241,
"memory(GiB)": 176.78,
"reward": 0.5000000149011612,
"reward_std": 0.19514649361371994,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.460043728351593,
"step": 264,
"train_speed(iter/s)": 0.003217
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.7321428571428572,
"completions/max_length": 8064.0,
"completions/mean_length": 7200.107666015625,
"completions/min_length": 3825.5,
"epoch": 0.8439490445859873,
"grad_norm": 0.6117250919342041,
"kl": 0.03643798828125,
"learning_rate": 6.039011672354455e-08,
"loss": -0.25999927520751953,
"memory(GiB)": 176.78,
"reward": 0.2500000149011612,
"reward_std": 0.3078143745660782,
"rewards/AnswerTagAccuracyORM/mean": 0.2500000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.43280795216560364,
"step": 265,
"train_speed(iter/s)": 0.003217
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 8020.5,
"completions/mean_length": 4265.500244140625,
"completions/min_length": 979.0,
"epoch": 0.8471337579617835,
"grad_norm": 8.540395736694336,
"kl": 3.001220703125,
"learning_rate": 5.799868968281074e-08,
"loss": -0.06987228244543076,
"memory(GiB)": 176.78,
"reward": 0.6250000298023224,
"reward_std": 0.14838216453790665,
"rewards/AnswerTagAccuracyORM/mean": 0.6250000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.4897737503051758,
"step": 266,
"train_speed(iter/s)": 0.003219
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8050.5,
"completions/mean_length": 6214.964599609375,
"completions/min_length": 1640.0,
"epoch": 0.8503184713375797,
"grad_norm": 0.4220220446586609,
"kl": 0.03466796875,
"learning_rate": 5.565265640630723e-08,
"loss": -0.0669555515050888,
"memory(GiB)": 176.78,
"reward": 0.1785714328289032,
"reward_std": 0.2253357470035553,
"rewards/AnswerTagAccuracyORM/mean": 0.1785714328289032,
"rewards/AnswerTagAccuracyORM/std": 0.37796446681022644,
"step": 267,
"train_speed(iter/s)": 0.003219
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 8021.0,
"completions/mean_length": 5096.89306640625,
"completions/min_length": 1902.5,
"epoch": 0.8535031847133758,
"grad_norm": 562.29443359375,
"kl": 174.029296875,
"learning_rate": 5.335225783287051e-08,
"loss": 0.30988335609436035,
"memory(GiB)": 176.78,
"reward": 0.446428582072258,
"reward_std": 0.2610500454902649,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.5032612681388855,
"step": 268,
"train_speed(iter/s)": 0.00322
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8059.5,
"completions/mean_length": 5627.44677734375,
"completions/min_length": 1379.0,
"epoch": 0.856687898089172,
"grad_norm": 0.6505483984947205,
"kl": 0.04296875,
"learning_rate": 5.109773021462921e-08,
"loss": -0.24035362899303436,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.3435286581516266,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 269,
"train_speed(iter/s)": 0.00322
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8059.0,
"completions/mean_length": 5999.339599609375,
"completions/min_length": 2076.0,
"epoch": 0.8598726114649682,
"grad_norm": 1.334702730178833,
"kl": 1.001953125,
"learning_rate": 4.888930509274125e-08,
"loss": 0.008383408188819885,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.2253357619047165,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968925714493,
"step": 270,
"train_speed(iter/s)": 0.00322
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8038.5,
"completions/mean_length": 5342.410888671875,
"completions/min_length": 1213.5,
"epoch": 0.8630573248407644,
"grad_norm": 0.2792721390724182,
"kl": 0.041748046875,
"learning_rate": 4.6727209273614124e-08,
"loss": -0.08284494280815125,
"memory(GiB)": 176.78,
"reward": 0.4107143133878708,
"reward_std": 0.21981074661016464,
"rewards/AnswerTagAccuracyORM/mean": 0.4107143133878708,
"rewards/AnswerTagAccuracyORM/std": 0.5006500333547592,
"step": 271,
"train_speed(iter/s)": 0.003221
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4821428571428571,
"completions/max_length": 8046.0,
"completions/mean_length": 5400.964599609375,
"completions/min_length": 739.0,
"epoch": 0.8662420382165605,
"grad_norm": 0.5285795331001282,
"kl": 21.917213439941406,
"learning_rate": 4.4611664805611794e-08,
"loss": -0.2615126073360443,
"memory(GiB)": 176.78,
"reward": 0.446428582072258,
"reward_std": 0.2610500529408455,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.5032612979412079,
"step": 272,
"train_speed(iter/s)": 0.003221
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4285714285714286,
"completions/max_length": 8027.5,
"completions/mean_length": 5324.553955078125,
"completions/min_length": 855.0,
"epoch": 0.8694267515923567,
"grad_norm": 0.3686632513999939,
"kl": 0.06591796875,
"learning_rate": 4.2542888956250464e-08,
"loss": -0.04334472492337227,
"memory(GiB)": 176.78,
"reward": 0.5535714626312256,
"reward_std": 0.30228933691978455,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.5059135854244232,
"step": 273,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8016.5,
"completions/mean_length": 5866.160888671875,
"completions/min_length": 1242.5,
"epoch": 0.8726114649681529,
"grad_norm": 0.23201723396778107,
"kl": 0.03265380859375,
"learning_rate": 4.0521094189884696e-08,
"loss": -0.03820869326591492,
"memory(GiB)": 176.78,
"reward": 0.2678571566939354,
"reward_std": 0.14838216453790665,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571566939354,
"rewards/AnswerTagAccuracyORM/std": 0.4268478900194168,
"step": 274,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5892857142857143,
"completions/max_length": 8049.5,
"completions/mean_length": 6177.125244140625,
"completions/min_length": 1355.0,
"epoch": 0.8757961783439491,
"grad_norm": 0.4513340890407562,
"kl": 0.0380859375,
"learning_rate": 3.8546488145887624e-08,
"loss": -0.08043000847101212,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.3078143745660782,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4489477574825287,
"step": 275,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6964285714285714,
"completions/max_length": 8028.0,
"completions/mean_length": 6920.803955078125,
"completions/min_length": 2485.0,
"epoch": 0.8789808917197452,
"grad_norm": 1.4028353691101074,
"kl": 0.6268310546875,
"learning_rate": 3.6619273617325695e-08,
"loss": -0.16914184391498566,
"memory(GiB)": 176.78,
"reward": 0.232142873108387,
"reward_std": 0.23086076974868774,
"rewards/AnswerTagAccuracyORM/mean": 0.232142873108387,
"rewards/AnswerTagAccuracyORM/std": 0.4159715920686722,
"step": 276,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8043.0,
"completions/mean_length": 6354.357421875,
"completions/min_length": 1492.5,
"epoch": 0.8821656050955414,
"grad_norm": 0.5193430185317993,
"kl": 1.416993007125255e-41,
"learning_rate": 3.473964853013273e-08,
"loss": -0.09646855294704437,
"memory(GiB)": 176.78,
"reward": 0.3035714477300644,
"reward_std": 0.30228933691978455,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.46781928837299347,
"step": 277,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8023.0,
"completions/mean_length": 5800.464599609375,
"completions/min_length": 1759.0,
"epoch": 0.8853503184713376,
"grad_norm": 1.6207529306411743,
"kl": 0.4185791015625,
"learning_rate": 3.2907805922781476e-08,
"loss": -0.2863817811012268,
"memory(GiB)": 176.78,
"reward": 0.4285714626312256,
"reward_std": 0.26657505333423615,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.5039526224136353,
"step": 278,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8027.5,
"completions/mean_length": 6033.375244140625,
"completions/min_length": 1543.0,
"epoch": 0.8885350318471338,
"grad_norm": 0.6210139989852905,
"kl": 0.0206298828125,
"learning_rate": 3.1123933926459845e-08,
"loss": -0.1716410219669342,
"memory(GiB)": 176.78,
"reward": 0.446428582072258,
"reward_std": 0.3324785977602005,
"rewards/AnswerTagAccuracyORM/mean": 0.446428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4786955863237381,
"step": 279,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571428,
"completions/max_length": 8034.0,
"completions/mean_length": 6295.035888671875,
"completions/min_length": 1541.5,
"epoch": 0.89171974522293,
"grad_norm": 0.38655340671539307,
"kl": 0.0438232421875,
"learning_rate": 2.9388215745748345e-08,
"loss": -0.10300473123788834,
"memory(GiB)": 176.78,
"reward": 0.2142857238650322,
"reward_std": 0.2967643216252327,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857238650322,
"rewards/AnswerTagAccuracyORM/std": 0.40819603204727173,
"step": 280,
"train_speed(iter/s)": 0.003222
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8059.0,
"completions/mean_length": 6408.1611328125,
"completions/min_length": 1532.0,
"epoch": 0.8949044585987261,
"grad_norm": 0.5052747130393982,
"kl": 0.039306640625,
"learning_rate": 2.7700829639806465e-08,
"loss": -0.17020417749881744,
"memory(GiB)": 176.78,
"reward": 0.321428582072258,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.321428582072258,
"rewards/AnswerTagAccuracyORM/std": 0.4739968776702881,
"step": 281,
"train_speed(iter/s)": 0.003223
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8031.5,
"completions/mean_length": 5205.26806640625,
"completions/min_length": 1111.0,
"epoch": 0.8980891719745223,
"grad_norm": 0.7017294764518738,
"kl": 0.04931640625,
"learning_rate": 2.6061948904063658e-08,
"loss": -0.0019177369540557265,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.33800363540649414,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.5078744888305664,
"step": 282,
"train_speed(iter/s)": 0.003223
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8032.0,
"completions/mean_length": 6105.125244140625,
"completions/min_length": 1046.5,
"epoch": 0.9012738853503185,
"grad_norm": 0.5392616391181946,
"kl": 0.0572509765625,
"learning_rate": 2.4471741852423233e-08,
"loss": -0.040537793189287186,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.33800365030765533,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.49173468351364136,
"step": 283,
"train_speed(iter/s)": 0.003224
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8060.0,
"completions/mean_length": 6067.14306640625,
"completions/min_length": 745.5,
"epoch": 0.9044585987261147,
"grad_norm": 0.5731423497200012,
"kl": 0.05810546875,
"learning_rate": 2.293037179997559e-08,
"loss": -0.07903746515512466,
"memory(GiB)": 176.78,
"reward": 0.4642857313156128,
"reward_std": 0.33800363540649414,
"rewards/AnswerTagAccuracyORM/mean": 0.4642857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48647116124629974,
"step": 284,
"train_speed(iter/s)": 0.003223
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8035.0,
"completions/mean_length": 5818.035888671875,
"completions/min_length": 1439.0,
"epoch": 0.9076433121019108,
"grad_norm": 0.6655234694480896,
"kl": 0.0400390625,
"learning_rate": 2.1437997046226008e-08,
"loss": -0.2412337064743042,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.2610500380396843,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.48177245259284973,
"step": 285,
"train_speed(iter/s)": 0.003224
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5714285714285714,
"completions/max_length": 8061.0,
"completions/mean_length": 6306.19677734375,
"completions/min_length": 1288.0,
"epoch": 0.910828025477707,
"grad_norm": 0.8818116188049316,
"kl": 0.0340576171875,
"learning_rate": 1.9994770858837107e-08,
"loss": -0.10960516333580017,
"memory(GiB)": 176.78,
"reward": 0.3571428805589676,
"reward_std": 0.26657506078481674,
"rewards/AnswerTagAccuracyORM/mean": 0.3571428805589676,
"rewards/AnswerTagAccuracyORM/std": 0.48199817538261414,
"step": 286,
"train_speed(iter/s)": 0.003224
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8053.0,
"completions/mean_length": 6440.303955078125,
"completions/min_length": 1986.5,
"epoch": 0.9140127388535032,
"grad_norm": 0.39754489064216614,
"kl": 0.03631591796875,
"learning_rate": 1.860084145788826e-08,
"loss": 0.06631383299827576,
"memory(GiB)": 176.78,
"reward": 0.2678571492433548,
"reward_std": 0.3324785977602005,
"rewards/AnswerTagAccuracyORM/mean": 0.2678571492433548,
"rewards/AnswerTagAccuracyORM/std": 0.43898552656173706,
"step": 287,
"train_speed(iter/s)": 0.003224
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4642857142857143,
"completions/max_length": 8059.0,
"completions/mean_length": 5826.82177734375,
"completions/min_length": 1675.0,
"epoch": 0.9171974522292994,
"grad_norm": 0.6225547194480896,
"kl": 0.0421142578125,
"learning_rate": 1.725635200065323e-08,
"loss": -0.3206044137477875,
"memory(GiB)": 176.78,
"reward": 0.4285714477300644,
"reward_std": 0.33800363540649414,
"rewards/AnswerTagAccuracyORM/mean": 0.4285714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.502610981464386,
"step": 288,
"train_speed(iter/s)": 0.003224
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.39285714285714285,
"completions/max_length": 8045.0,
"completions/mean_length": 5416.107177734375,
"completions/min_length": 1183.5,
"epoch": 0.9203821656050956,
"grad_norm": 0.47386959195137024,
"kl": 0.115966796875,
"learning_rate": 1.596144056689791e-08,
"loss": -0.10028056055307388,
"memory(GiB)": 176.78,
"reward": 0.4821428954601288,
"reward_std": 0.2610500305891037,
"rewards/AnswerTagAccuracyORM/mean": 0.4821428954601288,
"rewards/AnswerTagAccuracyORM/std": 0.4817724674940109,
"step": 289,
"train_speed(iter/s)": 0.003225
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5178571428571429,
"completions/max_length": 8044.0,
"completions/mean_length": 6445.464599609375,
"completions/min_length": 1398.5,
"epoch": 0.9235668789808917,
"grad_norm": 0.49086901545524597,
"kl": 0.0467529296875,
"learning_rate": 1.4716240144699187e-08,
"loss": -0.14612972736358643,
"memory(GiB)": 176.78,
"reward": 0.2142857313156128,
"reward_std": 0.2253357470035553,
"rewards/AnswerTagAccuracyORM/mean": 0.2142857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.39528264105319977,
"step": 290,
"train_speed(iter/s)": 0.003225
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 8029.5,
"completions/mean_length": 5734.375244140625,
"completions/min_length": 844.5,
"epoch": 0.9267515923566879,
"grad_norm": 0.5175443291664124,
"kl": 0.0667724609375,
"learning_rate": 1.3520878616787523e-08,
"loss": 0.018897494301199913,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.3681929111480713,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.4744165241718292,
"step": 291,
"train_speed(iter/s)": 0.003225
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.42857142857142855,
"completions/max_length": 8033.0,
"completions/mean_length": 5356.482421875,
"completions/min_length": 1571.0,
"epoch": 0.9299363057324841,
"grad_norm": 0.33042165637016296,
"kl": 0.0430908203125,
"learning_rate": 1.2375478747413015e-08,
"loss": 0.04050120711326599,
"memory(GiB)": 176.78,
"reward": 0.5000000298023224,
"reward_std": 0.2967643141746521,
"rewards/AnswerTagAccuracyORM/mean": 0.5000000298023224,
"rewards/AnswerTagAccuracyORM/std": 0.5078744888305664,
"step": 292,
"train_speed(iter/s)": 0.003226
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3214285714285714,
"completions/max_length": 8054.5,
"completions/mean_length": 5093.76806640625,
"completions/min_length": 1073.5,
"epoch": 0.9331210191082803,
"grad_norm": 0.8467631936073303,
"kl": 0.052978515625,
"learning_rate": 1.1280158169737265e-08,
"loss": -0.26039645075798035,
"memory(GiB)": 176.78,
"reward": 0.5535714626312256,
"reward_std": 0.3324785977602005,
"rewards/AnswerTagAccuracyORM/mean": 0.5535714626312256,
"rewards/AnswerTagAccuracyORM/std": 0.5059135556221008,
"step": 293,
"train_speed(iter/s)": 0.003226
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 8050.0,
"completions/mean_length": 4675.946533203125,
"completions/min_length": 587.0,
"epoch": 0.9363057324840764,
"grad_norm": 0.6540654301643372,
"kl": 0.0511474609375,
"learning_rate": 1.0235029373752757e-08,
"loss": -0.02355077676475048,
"memory(GiB)": 176.78,
"reward": 0.4464285969734192,
"reward_std": 0.3324786126613617,
"rewards/AnswerTagAccuracyORM/mean": 0.4464285969734192,
"rewards/AnswerTagAccuracyORM/std": 0.4979122579097748,
"step": 294,
"train_speed(iter/s)": 0.003227
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5535714285714286,
"completions/max_length": 8030.5,
"completions/mean_length": 6136.69677734375,
"completions/min_length": 1404.0,
"epoch": 0.9394904458598726,
"grad_norm": 0.40905487537384033,
"kl": 0.0523681640625,
"learning_rate": 9.240199694729944e-09,
"loss": 0.014087937772274017,
"memory(GiB)": 176.78,
"reward": 0.392857164144516,
"reward_std": 0.18409644439816475,
"rewards/AnswerTagAccuracyORM/mean": 0.392857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.49173468351364136,
"step": 295,
"train_speed(iter/s)": 0.003227
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8051.0,
"completions/mean_length": 5592.5537109375,
"completions/min_length": 1196.0,
"epoch": 0.9426751592356688,
"grad_norm": 0.3601033091545105,
"kl": 0.0228271484375,
"learning_rate": 8.295771302193721e-09,
"loss": -0.025663327425718307,
"memory(GiB)": 176.78,
"reward": 0.2857143059372902,
"reward_std": 0.11266788095235825,
"rewards/AnswerTagAccuracyORM/mean": 0.2857143059372902,
"rewards/AnswerTagAccuracyORM/std": 0.43015047907829285,
"step": 296,
"train_speed(iter/s)": 0.003227
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.4107142857142857,
"completions/max_length": 8053.0,
"completions/mean_length": 5372.053955078125,
"completions/min_length": 1333.0,
"epoch": 0.945859872611465,
"grad_norm": 0.5271080732345581,
"kl": 0.040283203125,
"learning_rate": 7.401841189430657e-09,
"loss": -0.12217244505882263,
"memory(GiB)": 176.78,
"reward": 0.3750000149011612,
"reward_std": 0.2721000909805298,
"rewards/AnswerTagAccuracyORM/mean": 0.3750000149011612,
"rewards/AnswerTagAccuracyORM/std": 0.49264876544475555,
"step": 297,
"train_speed(iter/s)": 0.003227
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.5357142857142857,
"completions/max_length": 8052.0,
"completions/mean_length": 5935.625244140625,
"completions/min_length": 1075.5,
"epoch": 0.9490445859872612,
"grad_norm": 0.5965413451194763,
"kl": 0.05285042445757426,
"learning_rate": 6.558501163527963e-09,
"loss": -0.2230261266231537,
"memory(GiB)": 176.78,
"reward": 0.3392857313156128,
"reward_std": 0.2721000760793686,
"rewards/AnswerTagAccuracyORM/mean": 0.3392857313156128,
"rewards/AnswerTagAccuracyORM/std": 0.4628649652004242,
"step": 298,
"train_speed(iter/s)": 0.003227
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 8037.0,
"completions/mean_length": 5281.518310546875,
"completions/min_length": 893.0,
"epoch": 0.9522292993630573,
"grad_norm": 0.32658663392066956,
"kl": 0.07177734375,
"learning_rate": 5.765837835944309e-09,
"loss": 0.05634969845414162,
"memory(GiB)": 176.78,
"reward": 0.517857164144516,
"reward_std": 0.2610500529408455,
"rewards/AnswerTagAccuracyORM/mean": 0.517857164144516,
"rewards/AnswerTagAccuracyORM/std": 0.5059135854244232,
"step": 299,
"train_speed(iter/s)": 0.003226
},
{
"clip_ratio": 0.0,
"completions/clipped_ratio": 0.6071428571428572,
"completions/max_length": 8031.0,
"completions/mean_length": 6464.035888671875,
"completions/min_length": 1501.5,
"epoch": 0.9554140127388535,
"grad_norm": 0.33558884263038635,
"kl": 1.261028488045903e-41,
"learning_rate": 5.023932613615445e-09,
"loss": -0.09524659812450409,
"memory(GiB)": 176.78,
"reward": 0.3035714477300644,
"reward_std": 0.3435286656022072,
"rewards/AnswerTagAccuracyORM/mean": 0.3035714477300644,
"rewards/AnswerTagAccuracyORM/std": 0.4576014429330826,
"step": 300,
"train_speed(iter/s)": 0.003225
}
],
"logging_steps": 1,
"max_steps": 314,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}