{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005961251862891207,
"grad_norm": 83.69901275634766,
"learning_rate": 0.0,
"loss": 2.889,
"step": 1
},
{
"epoch": 0.011922503725782414,
"grad_norm": 101.080810546875,
"learning_rate": 4.5454545454545457e-07,
"loss": 2.8534,
"step": 2
},
{
"epoch": 0.01788375558867362,
"grad_norm": 133.3448028564453,
"learning_rate": 9.090909090909091e-07,
"loss": 2.7436,
"step": 3
},
{
"epoch": 0.02384500745156483,
"grad_norm": 181.99168395996094,
"learning_rate": 1.3636363636363636e-06,
"loss": 2.9621,
"step": 4
},
{
"epoch": 0.029806259314456036,
"grad_norm": 7933.1826171875,
"learning_rate": 1.8181818181818183e-06,
"loss": 2.7936,
"step": 5
},
{
"epoch": 0.03576751117734724,
"grad_norm": 97.71904754638672,
"learning_rate": 2.2727272727272728e-06,
"loss": 2.8072,
"step": 6
},
{
"epoch": 0.041728763040238454,
"grad_norm": 275.55499267578125,
"learning_rate": 2.7272727272727272e-06,
"loss": 2.8261,
"step": 7
},
{
"epoch": 0.04769001490312966,
"grad_norm": 64.11652374267578,
"learning_rate": 3.181818181818182e-06,
"loss": 2.7701,
"step": 8
},
{
"epoch": 0.05365126676602087,
"grad_norm": 73.77549743652344,
"learning_rate": 3.6363636363636366e-06,
"loss": 2.5822,
"step": 9
},
{
"epoch": 0.05961251862891207,
"grad_norm": 69.88496398925781,
"learning_rate": 4.0909090909090915e-06,
"loss": 2.2658,
"step": 10
},
{
"epoch": 0.06557377049180328,
"grad_norm": 34.03704833984375,
"learning_rate": 4.5454545454545455e-06,
"loss": 2.0155,
"step": 11
},
{
"epoch": 0.07153502235469449,
"grad_norm": 47.30327224731445,
"learning_rate": 5e-06,
"loss": 1.9482,
"step": 12
},
{
"epoch": 0.07749627421758569,
"grad_norm": 36.03778076171875,
"learning_rate": 4.9998832008573975e-06,
"loss": 1.9698,
"step": 13
},
{
"epoch": 0.08345752608047691,
"grad_norm": 28.477731704711914,
"learning_rate": 4.999532814343219e-06,
"loss": 1.9087,
"step": 14
},
{
"epoch": 0.08941877794336811,
"grad_norm": 20.54388427734375,
"learning_rate": 4.998948873197342e-06,
"loss": 2.0298,
"step": 15
},
{
"epoch": 0.09538002980625931,
"grad_norm": 21.707738876342773,
"learning_rate": 4.998131431982826e-06,
"loss": 1.8872,
"step": 16
},
{
"epoch": 0.10134128166915052,
"grad_norm": 16.112577438354492,
"learning_rate": 4.9970805670808174e-06,
"loss": 2.0104,
"step": 17
},
{
"epoch": 0.10730253353204174,
"grad_norm": 14.217989921569824,
"learning_rate": 4.995796376683411e-06,
"loss": 1.9303,
"step": 18
},
{
"epoch": 0.11326378539493294,
"grad_norm": 15.643560409545898,
"learning_rate": 4.994278980784478e-06,
"loss": 1.9199,
"step": 19
},
{
"epoch": 0.11922503725782414,
"grad_norm": 17.744998931884766,
"learning_rate": 4.992528521168449e-06,
"loss": 1.9722,
"step": 20
},
{
"epoch": 0.12518628912071536,
"grad_norm": 13.886046409606934,
"learning_rate": 4.990545161397073e-06,
"loss": 1.8426,
"step": 21
},
{
"epoch": 0.13114754098360656,
"grad_norm": 11.871990203857422,
"learning_rate": 4.988329086794122e-06,
"loss": 1.9522,
"step": 22
},
{
"epoch": 0.13710879284649777,
"grad_norm": 13.753679275512695,
"learning_rate": 4.98588050442809e-06,
"loss": 1.9687,
"step": 23
},
{
"epoch": 0.14307004470938897,
"grad_norm": 17.359724044799805,
"learning_rate": 4.983199643092833e-06,
"loss": 1.9103,
"step": 24
},
{
"epoch": 0.14903129657228018,
"grad_norm": 12.743254661560059,
"learning_rate": 4.980286753286196e-06,
"loss": 1.8854,
"step": 25
},
{
"epoch": 0.15499254843517138,
"grad_norm": 13.894977569580078,
"learning_rate": 4.977142107186602e-06,
"loss": 1.8205,
"step": 26
},
{
"epoch": 0.16095380029806258,
"grad_norm": 45.56089401245117,
"learning_rate": 4.973765998627628e-06,
"loss": 1.8602,
"step": 27
},
{
"epoch": 0.16691505216095381,
"grad_norm": 12.6051664352417,
"learning_rate": 4.970158743070542e-06,
"loss": 1.7526,
"step": 28
},
{
"epoch": 0.17287630402384502,
"grad_norm": 15.922001838684082,
"learning_rate": 4.966320677574828e-06,
"loss": 1.7807,
"step": 29
},
{
"epoch": 0.17883755588673622,
"grad_norm": 17.59123992919922,
"learning_rate": 4.9622521607666936e-06,
"loss": 1.8304,
"step": 30
},
{
"epoch": 0.18479880774962743,
"grad_norm": 35.71995162963867,
"learning_rate": 4.957953572805558e-06,
"loss": 1.8205,
"step": 31
},
{
"epoch": 0.19076005961251863,
"grad_norm": 14.873682975769043,
"learning_rate": 4.953425315348534e-06,
"loss": 1.7274,
"step": 32
},
{
"epoch": 0.19672131147540983,
"grad_norm": 11.887964248657227,
"learning_rate": 4.94866781151289e-06,
"loss": 1.8059,
"step": 33
},
{
"epoch": 0.20268256333830104,
"grad_norm": 20.716062545776367,
"learning_rate": 4.943681505836523e-06,
"loss": 1.7239,
"step": 34
},
{
"epoch": 0.20864381520119224,
"grad_norm": 16.12997817993164,
"learning_rate": 4.938466864236413e-06,
"loss": 1.6606,
"step": 35
},
{
"epoch": 0.21460506706408347,
"grad_norm": 16.572912216186523,
"learning_rate": 4.933024373965097e-06,
"loss": 1.7082,
"step": 36
},
{
"epoch": 0.22056631892697467,
"grad_norm": 15.159687042236328,
"learning_rate": 4.927354543565131e-06,
"loss": 1.5943,
"step": 37
},
{
"epoch": 0.22652757078986588,
"grad_norm": 24.082124710083008,
"learning_rate": 4.921457902821578e-06,
"loss": 1.6194,
"step": 38
},
{
"epoch": 0.23248882265275708,
"grad_norm": 14.97282600402832,
"learning_rate": 4.915335002712506e-06,
"loss": 1.6057,
"step": 39
},
{
"epoch": 0.23845007451564829,
"grad_norm": 21.004867553710938,
"learning_rate": 4.9089864153575016e-06,
"loss": 1.7454,
"step": 40
},
{
"epoch": 0.2444113263785395,
"grad_norm": 31.531965255737305,
"learning_rate": 4.902412733964212e-06,
"loss": 1.5509,
"step": 41
},
{
"epoch": 0.2503725782414307,
"grad_norm": 120.07006072998047,
"learning_rate": 4.895614572772916e-06,
"loss": 1.6287,
"step": 42
},
{
"epoch": 0.2563338301043219,
"grad_norm": 17.399808883666992,
"learning_rate": 4.888592566999134e-06,
"loss": 1.583,
"step": 43
},
{
"epoch": 0.26229508196721313,
"grad_norm": 17.383283615112305,
"learning_rate": 4.88134737277427e-06,
"loss": 1.6961,
"step": 44
},
{
"epoch": 0.26825633383010433,
"grad_norm": 17.128171920776367,
"learning_rate": 4.873879667084301e-06,
"loss": 1.4649,
"step": 45
},
{
"epoch": 0.27421758569299554,
"grad_norm": 42.8609619140625,
"learning_rate": 4.866190147706525e-06,
"loss": 1.5739,
"step": 46
},
{
"epoch": 0.28017883755588674,
"grad_norm": 31.926063537597656,
"learning_rate": 4.858279533144358e-06,
"loss": 1.565,
"step": 47
},
{
"epoch": 0.28614008941877794,
"grad_norm": 23.29640769958496,
"learning_rate": 4.8501485625602e-06,
"loss": 1.5657,
"step": 48
},
{
"epoch": 0.29210134128166915,
"grad_norm": 25.43850326538086,
"learning_rate": 4.841797995706362e-06,
"loss": 1.5481,
"step": 49
},
{
"epoch": 0.29806259314456035,
"grad_norm": 40.31404495239258,
"learning_rate": 4.833228612854088e-06,
"loss": 1.404,
"step": 50
},
{
"epoch": 0.30402384500745155,
"grad_norm": 30.751497268676758,
"learning_rate": 4.824441214720629e-06,
"loss": 1.7065,
"step": 51
},
{
"epoch": 0.30998509687034276,
"grad_norm": 21.12566375732422,
"learning_rate": 4.815436622394442e-06,
"loss": 1.4373,
"step": 52
},
{
"epoch": 0.31594634873323396,
"grad_norm": 41.19392776489258,
"learning_rate": 4.806215677258456e-06,
"loss": 1.4945,
"step": 53
},
{
"epoch": 0.32190760059612517,
"grad_norm": 32.416107177734375,
"learning_rate": 4.796779240911461e-06,
"loss": 1.4667,
"step": 54
},
{
"epoch": 0.32786885245901637,
"grad_norm": 19.345081329345703,
"learning_rate": 4.787128195087596e-06,
"loss": 1.4775,
"step": 55
},
{
"epoch": 0.33383010432190763,
"grad_norm": 27.58094024658203,
"learning_rate": 4.777263441573963e-06,
"loss": 1.4283,
"step": 56
},
{
"epoch": 0.33979135618479883,
"grad_norm": 23.647533416748047,
"learning_rate": 4.7671859021263635e-06,
"loss": 1.4977,
"step": 57
},
{
"epoch": 0.34575260804769004,
"grad_norm": 33.37125015258789,
"learning_rate": 4.756896518383173e-06,
"loss": 1.4604,
"step": 58
},
{
"epoch": 0.35171385991058124,
"grad_norm": 24.486753463745117,
"learning_rate": 4.746396251777348e-06,
"loss": 1.5416,
"step": 59
},
{
"epoch": 0.35767511177347244,
"grad_norm": 21.387479782104492,
"learning_rate": 4.7356860834466e-06,
"loss": 1.4861,
"step": 60
},
{
"epoch": 0.36363636363636365,
"grad_norm": 16.769519805908203,
"learning_rate": 4.72476701414171e-06,
"loss": 1.3392,
"step": 61
},
{
"epoch": 0.36959761549925485,
"grad_norm": 28.92209815979004,
"learning_rate": 4.7136400641330245e-06,
"loss": 1.4868,
"step": 62
},
{
"epoch": 0.37555886736214605,
"grad_norm": 29.672182083129883,
"learning_rate": 4.702306273115122e-06,
"loss": 1.405,
"step": 63
},
{
"epoch": 0.38152011922503726,
"grad_norm": 23.519418716430664,
"learning_rate": 4.690766700109659e-06,
"loss": 1.4502,
"step": 64
},
{
"epoch": 0.38748137108792846,
"grad_norm": 46.69130325317383,
"learning_rate": 4.679022423366424e-06,
"loss": 1.4917,
"step": 65
},
{
"epoch": 0.39344262295081966,
"grad_norm": 14.234804153442383,
"learning_rate": 4.667074540262577e-06,
"loss": 1.4556,
"step": 66
},
{
"epoch": 0.39940387481371087,
"grad_norm": 50.061729431152344,
"learning_rate": 4.654924167200124e-06,
"loss": 1.5412,
"step": 67
},
{
"epoch": 0.40536512667660207,
"grad_norm": 26.193071365356445,
"learning_rate": 4.6425724395015865e-06,
"loss": 1.4453,
"step": 68
},
{
"epoch": 0.4113263785394933,
"grad_norm": 150.2034454345703,
"learning_rate": 4.63002051130393e-06,
"loss": 1.4456,
"step": 69
},
{
"epoch": 0.4172876304023845,
"grad_norm": 30.46050262451172,
"learning_rate": 4.617269555450715e-06,
"loss": 1.4075,
"step": 70
},
{
"epoch": 0.4232488822652757,
"grad_norm": 27.013071060180664,
"learning_rate": 4.604320763382512e-06,
"loss": 1.3264,
"step": 71
},
{
"epoch": 0.42921013412816694,
"grad_norm": 29.879329681396484,
"learning_rate": 4.591175345025567e-06,
"loss": 1.3855,
"step": 72
},
{
"epoch": 0.43517138599105815,
"grad_norm": 20.89598274230957,
"learning_rate": 4.5778345286787575e-06,
"loss": 1.4808,
"step": 73
},
{
"epoch": 0.44113263785394935,
"grad_norm": 24.181364059448242,
"learning_rate": 4.56429956089881e-06,
"loss": 1.5116,
"step": 74
},
{
"epoch": 0.44709388971684055,
"grad_norm": 16.009437561035156,
"learning_rate": 4.550571706383833e-06,
"loss": 1.3564,
"step": 75
},
{
"epoch": 0.45305514157973176,
"grad_norm": 22.36256217956543,
"learning_rate": 4.536652247855133e-06,
"loss": 1.4962,
"step": 76
},
{
"epoch": 0.45901639344262296,
"grad_norm": 20.505599975585938,
"learning_rate": 4.522542485937369e-06,
"loss": 1.3784,
"step": 77
},
{
"epoch": 0.46497764530551416,
"grad_norm": 16.868972778320312,
"learning_rate": 4.508243739037016e-06,
"loss": 1.4388,
"step": 78
},
{
"epoch": 0.47093889716840537,
"grad_norm": 31.546695709228516,
"learning_rate": 4.4937573432191766e-06,
"loss": 1.4437,
"step": 79
},
{
"epoch": 0.47690014903129657,
"grad_norm": 23.937522888183594,
"learning_rate": 4.47908465208274e-06,
"loss": 1.3848,
"step": 80
},
{
"epoch": 0.4828614008941878,
"grad_norm": 26.49115753173828,
"learning_rate": 4.464227036633901e-06,
"loss": 1.3967,
"step": 81
},
{
"epoch": 0.488822652757079,
"grad_norm": 28.350820541381836,
"learning_rate": 4.449185885158056e-06,
"loss": 1.4275,
"step": 82
},
{
"epoch": 0.4947839046199702,
"grad_norm": 332.1685485839844,
"learning_rate": 4.433962603090083e-06,
"loss": 1.2602,
"step": 83
},
{
"epoch": 0.5007451564828614,
"grad_norm": 22.370702743530273,
"learning_rate": 4.418558612883016e-06,
"loss": 1.3573,
"step": 84
},
{
"epoch": 0.5067064083457526,
"grad_norm": 17.595949172973633,
"learning_rate": 4.402975353875134e-06,
"loss": 1.304,
"step": 85
},
{
"epoch": 0.5126676602086438,
"grad_norm": 36.13862228393555,
"learning_rate": 4.3872142821554695e-06,
"loss": 1.3921,
"step": 86
},
{
"epoch": 0.518628912071535,
"grad_norm": 21.56857681274414,
"learning_rate": 4.3712768704277535e-06,
"loss": 1.3921,
"step": 87
},
{
"epoch": 0.5245901639344263,
"grad_norm": 12.474554061889648,
"learning_rate": 4.355164607872806e-06,
"loss": 1.3618,
"step": 88
},
{
"epoch": 0.5305514157973175,
"grad_norm": 19.896780014038086,
"learning_rate": 4.338879000009389e-06,
"loss": 1.2586,
"step": 89
},
{
"epoch": 0.5365126676602087,
"grad_norm": 29.72560691833496,
"learning_rate": 4.322421568553529e-06,
"loss": 1.1915,
"step": 90
},
{
"epoch": 0.5424739195230999,
"grad_norm": 30.1799259185791,
"learning_rate": 4.305793851276335e-06,
"loss": 1.4017,
"step": 91
},
{
"epoch": 0.5484351713859911,
"grad_norm": 17.576276779174805,
"learning_rate": 4.288997401860303e-06,
"loss": 1.2945,
"step": 92
},
{
"epoch": 0.5543964232488823,
"grad_norm": 13.149911880493164,
"learning_rate": 4.272033789754146e-06,
"loss": 1.2814,
"step": 93
},
{
"epoch": 0.5603576751117735,
"grad_norm": 77.57002258300781,
"learning_rate": 4.254904600026143e-06,
"loss": 1.3343,
"step": 94
},
{
"epoch": 0.5663189269746647,
"grad_norm": 20.16973114013672,
"learning_rate": 4.2376114332160325e-06,
"loss": 1.2908,
"step": 95
},
{
"epoch": 0.5722801788375559,
"grad_norm": 13.326008796691895,
"learning_rate": 4.220155905185461e-06,
"loss": 1.2593,
"step": 96
},
{
"epoch": 0.5782414307004471,
"grad_norm": 311.2110290527344,
"learning_rate": 4.202539646966993e-06,
"loss": 1.3667,
"step": 97
},
{
"epoch": 0.5842026825633383,
"grad_norm": 15.870148658752441,
"learning_rate": 4.184764304611715e-06,
"loss": 1.4308,
"step": 98
},
{
"epoch": 0.5901639344262295,
"grad_norm": 31.768085479736328,
"learning_rate": 4.166831539035423e-06,
"loss": 1.3856,
"step": 99
},
{
"epoch": 0.5961251862891207,
"grad_norm": 17.089149475097656,
"learning_rate": 4.148743025863432e-06,
"loss": 1.3226,
"step": 100
},
{
"epoch": 0.6020864381520119,
"grad_norm": 29.87621307373047,
"learning_rate": 4.130500455274005e-06,
"loss": 1.3646,
"step": 101
},
{
"epoch": 0.6080476900149031,
"grad_norm": 42.509464263916016,
"learning_rate": 4.112105531840427e-06,
"loss": 1.2531,
"step": 102
},
{
"epoch": 0.6140089418777943,
"grad_norm": 40.10889434814453,
"learning_rate": 4.093559974371725e-06,
"loss": 1.3317,
"step": 103
},
{
"epoch": 0.6199701937406855,
"grad_norm": 22.96758270263672,
"learning_rate": 4.074865515752068e-06,
"loss": 1.2678,
"step": 104
},
{
"epoch": 0.6259314456035767,
"grad_norm": 41.362876892089844,
"learning_rate": 4.056023902778846e-06,
"loss": 1.2246,
"step": 105
},
{
"epoch": 0.6318926974664679,
"grad_norm": 17.242877960205078,
"learning_rate": 4.037036895999453e-06,
"loss": 1.3034,
"step": 106
},
{
"epoch": 0.6378539493293591,
"grad_norm": 28.293981552124023,
"learning_rate": 4.017906269546778e-06,
"loss": 1.4299,
"step": 107
},
{
"epoch": 0.6438152011922503,
"grad_norm": 26.767963409423828,
"learning_rate": 3.9986338109734354e-06,
"loss": 1.341,
"step": 108
},
{
"epoch": 0.6497764530551415,
"grad_norm": 15.613986015319824,
"learning_rate": 3.979221321084734e-06,
"loss": 1.2554,
"step": 109
},
{
"epoch": 0.6557377049180327,
"grad_norm": 20.716270446777344,
"learning_rate": 3.959670613770414e-06,
"loss": 1.3222,
"step": 110
},
{
"epoch": 0.6616989567809239,
"grad_norm": 9.510931015014648,
"learning_rate": 3.939983515835157e-06,
"loss": 1.2529,
"step": 111
},
{
"epoch": 0.6676602086438153,
"grad_norm": 23.321115493774414,
"learning_rate": 3.92016186682789e-06,
"loss": 1.2977,
"step": 112
},
{
"epoch": 0.6736214605067065,
"grad_norm": 57.38838577270508,
"learning_rate": 3.900207518869901e-06,
"loss": 1.3167,
"step": 113
},
{
"epoch": 0.6795827123695977,
"grad_norm": 22.515439987182617,
"learning_rate": 3.880122336481774e-06,
"loss": 1.2809,
"step": 114
},
{
"epoch": 0.6855439642324889,
"grad_norm": 12.670492172241211,
"learning_rate": 3.859908196409177e-06,
"loss": 1.2513,
"step": 115
},
{
"epoch": 0.6915052160953801,
"grad_norm": 18.576292037963867,
"learning_rate": 3.839566987447492e-06,
"loss": 1.3572,
"step": 116
},
{
"epoch": 0.6974664679582713,
"grad_norm": 22.152652740478516,
"learning_rate": 3.819100610265332e-06,
"loss": 1.2051,
"step": 117
},
{
"epoch": 0.7034277198211625,
"grad_norm": 12.753473281860352,
"learning_rate": 3.7985109772269435e-06,
"loss": 1.275,
"step": 118
},
{
"epoch": 0.7093889716840537,
"grad_norm": 23.44584846496582,
"learning_rate": 3.777800012213514e-06,
"loss": 1.3305,
"step": 119
},
{
"epoch": 0.7153502235469449,
"grad_norm": 18.46709442138672,
"learning_rate": 3.756969650443408e-06,
"loss": 1.2177,
"step": 120
},
{
"epoch": 0.7213114754098361,
"grad_norm": 18.8616886138916,
"learning_rate": 3.7360218382913426e-06,
"loss": 1.3302,
"step": 121
},
{
"epoch": 0.7272727272727273,
"grad_norm": 39.76200485229492,
"learning_rate": 3.714958533106515e-06,
"loss": 1.3266,
"step": 122
},
{
"epoch": 0.7332339791356185,
"grad_norm": 17.528858184814453,
"learning_rate": 3.6937817030297164e-06,
"loss": 1.2974,
"step": 123
},
{
"epoch": 0.7391952309985097,
"grad_norm": 29.327545166015625,
"learning_rate": 3.672493326809422e-06,
"loss": 1.2878,
"step": 124
},
{
"epoch": 0.7451564828614009,
"grad_norm": 14.248793601989746,
"learning_rate": 3.651095393616904e-06,
"loss": 1.2808,
"step": 125
},
{
"epoch": 0.7511177347242921,
"grad_norm": 14.539497375488281,
"learning_rate": 3.629589902860363e-06,
"loss": 1.2453,
"step": 126
},
{
"epoch": 0.7570789865871833,
"grad_norm": 15.273882865905762,
"learning_rate": 3.607978863998104e-06,
"loss": 1.3493,
"step": 127
},
{
"epoch": 0.7630402384500745,
"grad_norm": 14.30077075958252,
"learning_rate": 3.586264296350775e-06,
"loss": 1.2065,
"step": 128
},
{
"epoch": 0.7690014903129657,
"grad_norm": 25.420032501220703,
"learning_rate": 3.564448228912682e-06,
"loss": 1.3278,
"step": 129
},
{
"epoch": 0.7749627421758569,
"grad_norm": 18.863452911376953,
"learning_rate": 3.5425327001622034e-06,
"loss": 1.1876,
"step": 130
},
{
"epoch": 0.7809239940387481,
"grad_norm": 18.1572322845459,
"learning_rate": 3.520519757871313e-06,
"loss": 1.2363,
"step": 131
},
{
"epoch": 0.7868852459016393,
"grad_norm": 21.26305389404297,
"learning_rate": 3.4984114589142388e-06,
"loss": 1.2117,
"step": 132
},
{
"epoch": 0.7928464977645305,
"grad_norm": 34.8569221496582,
"learning_rate": 3.476209869075273e-06,
"loss": 1.2962,
"step": 133
},
{
"epoch": 0.7988077496274217,
"grad_norm": 11.790558815002441,
"learning_rate": 3.4539170628557383e-06,
"loss": 1.2112,
"step": 134
},
{
"epoch": 0.8047690014903129,
"grad_norm": 71.28162384033203,
"learning_rate": 3.4315351232801597e-06,
"loss": 1.1849,
"step": 135
},
{
"epoch": 0.8107302533532041,
"grad_norm": 17.917343139648438,
"learning_rate": 3.409066141701618e-06,
"loss": 1.2936,
"step": 136
},
{
"epoch": 0.8166915052160953,
"grad_norm": 14.670162200927734,
"learning_rate": 3.386512217606339e-06,
"loss": 1.3622,
"step": 137
},
{
"epoch": 0.8226527570789866,
"grad_norm": 135.63331604003906,
"learning_rate": 3.3638754584175222e-06,
"loss": 1.2687,
"step": 138
},
{
"epoch": 0.8286140089418778,
"grad_norm": 20.351055145263672,
"learning_rate": 3.3411579792984178e-06,
"loss": 1.2849,
"step": 139
},
{
"epoch": 0.834575260804769,
"grad_norm": 15.531908988952637,
"learning_rate": 3.318361902954692e-06,
"loss": 1.2378,
"step": 140
},
{
"epoch": 0.8405365126676602,
"grad_norm": 92.7786865234375,
"learning_rate": 3.295489359436083e-06,
"loss": 1.3383,
"step": 141
},
{
"epoch": 0.8464977645305514,
"grad_norm": 17.087692260742188,
"learning_rate": 3.272542485937369e-06,
"loss": 1.2544,
"step": 142
},
{
"epoch": 0.8524590163934426,
"grad_norm": 21.718421936035156,
"learning_rate": 3.249523426598669e-06,
"loss": 1.2632,
"step": 143
},
{
"epoch": 0.8584202682563339,
"grad_norm": 15.318682670593262,
"learning_rate": 3.2264343323050985e-06,
"loss": 1.1569,
"step": 144
},
{
"epoch": 0.8643815201192251,
"grad_norm": 23.35086441040039,
"learning_rate": 3.2032773604857915e-06,
"loss": 1.1956,
"step": 145
},
{
"epoch": 0.8703427719821163,
"grad_norm": 40.38860321044922,
"learning_rate": 3.1800546749123108e-06,
"loss": 1.2296,
"step": 146
},
{
"epoch": 0.8763040238450075,
"grad_norm": 90.90715026855469,
"learning_rate": 3.1567684454964674e-06,
"loss": 1.1541,
"step": 147
},
{
"epoch": 0.8822652757078987,
"grad_norm": 21.368518829345703,
"learning_rate": 3.133420848087566e-06,
"loss": 1.3271,
"step": 148
},
{
"epoch": 0.8882265275707899,
"grad_norm": 88.14790344238281,
"learning_rate": 3.110014064269094e-06,
"loss": 1.2658,
"step": 149
},
{
"epoch": 0.8941877794336811,
"grad_norm": 21.05089569091797,
"learning_rate": 3.0865502811548755e-06,
"loss": 1.1987,
"step": 150
},
{
"epoch": 0.9001490312965723,
"grad_norm": 16.833553314208984,
"learning_rate": 3.0630316911847112e-06,
"loss": 1.2963,
"step": 151
},
{
"epoch": 0.9061102831594635,
"grad_norm": 13.26325511932373,
"learning_rate": 3.039460491919516e-06,
"loss": 1.2032,
"step": 152
},
{
"epoch": 0.9120715350223547,
"grad_norm": 20.50226402282715,
"learning_rate": 3.015838885835981e-06,
"loss": 1.1907,
"step": 153
},
{
"epoch": 0.9180327868852459,
"grad_norm": 14.766148567199707,
"learning_rate": 2.992169080120776e-06,
"loss": 1.2646,
"step": 154
},
{
"epoch": 0.9239940387481371,
"grad_norm": 25.122909545898438,
"learning_rate": 2.9684532864643123e-06,
"loss": 1.256,
"step": 155
},
{
"epoch": 0.9299552906110283,
"grad_norm": 26.095359802246094,
"learning_rate": 2.944693720854081e-06,
"loss": 1.2392,
"step": 156
},
{
"epoch": 0.9359165424739195,
"grad_norm": 9.14566421508789,
"learning_rate": 2.920892603367596e-06,
"loss": 1.1698,
"step": 157
},
{
"epoch": 0.9418777943368107,
"grad_norm": 28.01599884033203,
"learning_rate": 2.897052157964952e-06,
"loss": 1.2073,
"step": 158
},
{
"epoch": 0.9478390461997019,
"grad_norm": 27.741567611694336,
"learning_rate": 2.8731746122810105e-06,
"loss": 1.243,
"step": 159
},
{
"epoch": 0.9538002980625931,
"grad_norm": 14.678361892700195,
"learning_rate": 2.8492621974172653e-06,
"loss": 1.1756,
"step": 160
},
{
"epoch": 0.9597615499254843,
"grad_norm": 15.269275665283203,
"learning_rate": 2.8253171477333585e-06,
"loss": 1.1728,
"step": 161
},
{
"epoch": 0.9657228017883756,
"grad_norm": 64.93920135498047,
"learning_rate": 2.8013417006383078e-06,
"loss": 1.2727,
"step": 162
},
{
"epoch": 0.9716840536512668,
"grad_norm": 15.335084915161133,
"learning_rate": 2.7773380963814454e-06,
"loss": 1.2749,
"step": 163
},
{
"epoch": 0.977645305514158,
"grad_norm": 37.09837341308594,
"learning_rate": 2.7533085778430884e-06,
"loss": 1.1588,
"step": 164
},
{
"epoch": 0.9836065573770492,
"grad_norm": 130.5791778564453,
"learning_rate": 2.729255390324966e-06,
"loss": 1.2001,
"step": 165
},
{
"epoch": 0.9895678092399404,
"grad_norm": 50.33445358276367,
"learning_rate": 2.7051807813404213e-06,
"loss": 1.2772,
"step": 166
},
{
"epoch": 0.9955290611028316,
"grad_norm": 32.990840911865234,
"learning_rate": 2.6810870004044065e-06,
"loss": 1.2903,
"step": 167
},
{
"epoch": 1.0,
"grad_norm": 26.173009872436523,
"learning_rate": 2.6569762988232838e-06,
"loss": 0.8979,
"step": 168
},
{
"epoch": 1.0059612518628913,
"grad_norm": 12.552504539489746,
"learning_rate": 2.632850929484472e-06,
"loss": 1.0755,
"step": 169
},
{
"epoch": 1.0119225037257824,
"grad_norm": 17.015302658081055,
"learning_rate": 2.6087131466459344e-06,
"loss": 1.2362,
"step": 170
},
{
"epoch": 1.0178837555886737,
"grad_norm": 49.570556640625,
"learning_rate": 2.5845652057255414e-06,
"loss": 1.2128,
"step": 171
},
{
"epoch": 1.0238450074515648,
"grad_norm": 42.604183197021484,
"learning_rate": 2.560409363090331e-06,
"loss": 1.1702,
"step": 172
},
{
"epoch": 1.0298062593144561,
"grad_norm": 20.264986038208008,
"learning_rate": 2.536247875845669e-06,
"loss": 1.1781,
"step": 173
},
{
"epoch": 1.0357675111773472,
"grad_norm": 12.800230979919434,
"learning_rate": 2.5120830016243515e-06,
"loss": 1.1768,
"step": 174
},
{
"epoch": 1.0417287630402385,
"grad_norm": 23.823047637939453,
"learning_rate": 2.4879169983756498e-06,
"loss": 1.2566,
"step": 175
},
{
"epoch": 1.0476900149031296,
"grad_norm": 21.49330711364746,
"learning_rate": 2.4637521241543315e-06,
"loss": 1.084,
"step": 176
},
{
"epoch": 1.053651266766021,
"grad_norm": 23.708070755004883,
"learning_rate": 2.43959063690967e-06,
"loss": 1.1973,
"step": 177
},
{
"epoch": 1.059612518628912,
"grad_norm": 14.244556427001953,
"learning_rate": 2.415434794274459e-06,
"loss": 1.1781,
"step": 178
},
{
"epoch": 1.0655737704918034,
"grad_norm": 18.469478607177734,
"learning_rate": 2.3912868533540665e-06,
"loss": 1.0891,
"step": 179
},
{
"epoch": 1.0715350223546944,
"grad_norm": 13.801828384399414,
"learning_rate": 2.3671490705155285e-06,
"loss": 1.1838,
"step": 180
},
{
"epoch": 1.0774962742175858,
"grad_norm": 97.42760467529297,
"learning_rate": 2.3430237011767166e-06,
"loss": 1.1887,
"step": 181
},
{
"epoch": 1.0834575260804769,
"grad_norm": 22.47690200805664,
"learning_rate": 2.3189129995955944e-06,
"loss": 1.2309,
"step": 182
},
{
"epoch": 1.0894187779433682,
"grad_norm": 82.70649719238281,
"learning_rate": 2.2948192186595787e-06,
"loss": 1.2702,
"step": 183
},
{
"epoch": 1.0953800298062593,
"grad_norm": 143.4379425048828,
"learning_rate": 2.2707446096750345e-06,
"loss": 1.2171,
"step": 184
},
{
"epoch": 1.1013412816691506,
"grad_norm": 52.544227600097656,
"learning_rate": 2.246691422156913e-06,
"loss": 1.2009,
"step": 185
},
{
"epoch": 1.1073025335320417,
"grad_norm": 41.363922119140625,
"learning_rate": 2.222661903618556e-06,
"loss": 1.1588,
"step": 186
},
{
"epoch": 1.113263785394933,
"grad_norm": 12.45494556427002,
"learning_rate": 2.1986582993616926e-06,
"loss": 1.2078,
"step": 187
},
{
"epoch": 1.119225037257824,
"grad_norm": 15.55370807647705,
"learning_rate": 2.1746828522666423e-06,
"loss": 1.1447,
"step": 188
},
{
"epoch": 1.1251862891207154,
"grad_norm": 16.289379119873047,
"learning_rate": 2.1507378025827355e-06,
"loss": 1.2212,
"step": 189
},
{
"epoch": 1.1311475409836065,
"grad_norm": 20.157041549682617,
"learning_rate": 2.12682538771899e-06,
"loss": 1.0832,
"step": 190
},
{
"epoch": 1.1371087928464978,
"grad_norm": 43.083984375,
"learning_rate": 2.1029478420350493e-06,
"loss": 1.1558,
"step": 191
},
{
"epoch": 1.1430700447093889,
"grad_norm": 43.636539459228516,
"learning_rate": 2.079107396632404e-06,
"loss": 1.2275,
"step": 192
},
{
"epoch": 1.1490312965722802,
"grad_norm": 13.149125099182129,
"learning_rate": 2.0553062791459193e-06,
"loss": 1.0831,
"step": 193
},
{
"epoch": 1.1549925484351713,
"grad_norm": 17.64810562133789,
"learning_rate": 2.031546713535688e-06,
"loss": 1.2873,
"step": 194
},
{
"epoch": 1.1609538002980626,
"grad_norm": 23.816091537475586,
"learning_rate": 2.007830919879225e-06,
"loss": 1.1185,
"step": 195
},
{
"epoch": 1.1669150521609537,
"grad_norm": 23.4075984954834,
"learning_rate": 1.9841611141640205e-06,
"loss": 1.1648,
"step": 196
},
{
"epoch": 1.172876304023845,
"grad_norm": 75.27896881103516,
"learning_rate": 1.960539508080485e-06,
"loss": 1.1569,
"step": 197
},
{
"epoch": 1.1788375558867363,
"grad_norm": 27.79124641418457,
"learning_rate": 1.936968308815289e-06,
"loss": 1.1506,
"step": 198
},
{
"epoch": 1.1847988077496274,
"grad_norm": 88.08030700683594,
"learning_rate": 1.913449718845125e-06,
"loss": 1.1021,
"step": 199
},
{
"epoch": 1.1907600596125185,
"grad_norm": 20.72780990600586,
"learning_rate": 1.8899859357309064e-06,
"loss": 1.1408,
"step": 200
},
{
"epoch": 1.1967213114754098,
"grad_norm": 34.17177963256836,
"learning_rate": 1.8665791519124344e-06,
"loss": 1.2407,
"step": 201
},
{
"epoch": 1.2026825633383011,
"grad_norm": 16.424531936645508,
"learning_rate": 1.8432315545035328e-06,
"loss": 1.2134,
"step": 202
},
{
"epoch": 1.2086438152011922,
"grad_norm": 68.55332946777344,
"learning_rate": 1.8199453250876894e-06,
"loss": 1.1283,
"step": 203
},
{
"epoch": 1.2146050670640836,
"grad_norm": 18.79085922241211,
"learning_rate": 1.796722639514209e-06,
"loss": 1.1635,
"step": 204
},
{
"epoch": 1.2205663189269746,
"grad_norm": 18.540996551513672,
"learning_rate": 1.7735656676949028e-06,
"loss": 1.304,
"step": 205
},
{
"epoch": 1.226527570789866,
"grad_norm": 24.513444900512695,
"learning_rate": 1.7504765734013323e-06,
"loss": 1.2594,
"step": 206
},
{
"epoch": 1.232488822652757,
"grad_norm": 15.741020202636719,
"learning_rate": 1.7274575140626318e-06,
"loss": 1.1887,
"step": 207
},
{
"epoch": 1.2384500745156484,
"grad_norm": 14.231233596801758,
"learning_rate": 1.7045106405639175e-06,
"loss": 1.2081,
"step": 208
},
{
"epoch": 1.2444113263785395,
"grad_norm": 10.00545883178711,
"learning_rate": 1.6816380970453084e-06,
"loss": 1.1334,
"step": 209
},
{
"epoch": 1.2503725782414308,
"grad_norm": 39.42339324951172,
"learning_rate": 1.6588420207015826e-06,
"loss": 1.2011,
"step": 210
},
{
"epoch": 1.2563338301043219,
"grad_norm": 12.922039985656738,
"learning_rate": 1.6361245415824784e-06,
"loss": 1.1481,
"step": 211
},
{
"epoch": 1.2622950819672132,
"grad_norm": 11.270161628723145,
"learning_rate": 1.613487782393661e-06,
"loss": 1.2256,
"step": 212
},
{
"epoch": 1.2682563338301043,
"grad_norm": 47.40531921386719,
"learning_rate": 1.5909338582983825e-06,
"loss": 1.1946,
"step": 213
},
{
"epoch": 1.2742175856929956,
"grad_norm": 22.412290573120117,
"learning_rate": 1.5684648767198412e-06,
"loss": 1.1862,
"step": 214
},
{
"epoch": 1.2801788375558867,
"grad_norm": 13.300796508789062,
"learning_rate": 1.5460829371442626e-06,
"loss": 1.2043,
"step": 215
},
{
"epoch": 1.286140089418778,
"grad_norm": 30.526369094848633,
"learning_rate": 1.5237901309247282e-06,
"loss": 1.1886,
"step": 216
},
{
"epoch": 1.292101341281669,
"grad_norm": 28.1364803314209,
"learning_rate": 1.5015885410857617e-06,
"loss": 1.1393,
"step": 217
},
{
"epoch": 1.2980625931445604,
"grad_norm": 16.774173736572266,
"learning_rate": 1.4794802421286881e-06,
"loss": 1.0952,
"step": 218
},
{
"epoch": 1.3040238450074515,
"grad_norm": 11.247906684875488,
"learning_rate": 1.457467299837797e-06,
"loss": 1.118,
"step": 219
},
{
"epoch": 1.3099850968703428,
"grad_norm": 27.284690856933594,
"learning_rate": 1.4355517710873184e-06,
"loss": 1.2864,
"step": 220
},
{
"epoch": 1.315946348733234,
"grad_norm": 16.30120849609375,
"learning_rate": 1.4137357036492255e-06,
"loss": 1.1174,
"step": 221
},
{
"epoch": 1.3219076005961252,
"grad_norm": 21.61345672607422,
"learning_rate": 1.3920211360018971e-06,
"loss": 1.2607,
"step": 222
},
{
"epoch": 1.3278688524590163,
"grad_norm": 82.09076690673828,
"learning_rate": 1.3704100971396378e-06,
"loss": 1.0947,
"step": 223
},
{
"epoch": 1.3338301043219076,
"grad_norm": 28.745494842529297,
"learning_rate": 1.3489046063830974e-06,
"loss": 1.1263,
"step": 224
},
{
"epoch": 1.339791356184799,
"grad_norm": 20.771093368530273,
"learning_rate": 1.327506673190579e-06,
"loss": 1.163,
"step": 225
},
{
"epoch": 1.34575260804769,
"grad_norm": 16.300779342651367,
"learning_rate": 1.306218296970284e-06,
"loss": 1.15,
"step": 226
},
{
"epoch": 1.3517138599105811,
"grad_norm": 16.083730697631836,
"learning_rate": 1.285041466893485e-06,
"loss": 1.2341,
"step": 227
},
{
"epoch": 1.3576751117734724,
"grad_norm": 70.427978515625,
"learning_rate": 1.2639781617086589e-06,
"loss": 1.0874,
"step": 228
},
{
"epoch": 1.3636363636363638,
"grad_norm": 16.239727020263672,
"learning_rate": 1.2430303495565928e-06,
"loss": 1.1787,
"step": 229
},
{
"epoch": 1.3695976154992549,
"grad_norm": 20.107654571533203,
"learning_rate": 1.222199987786487e-06,
"loss": 1.0678,
"step": 230
},
{
"epoch": 1.375558867362146,
"grad_norm": 18.42952537536621,
"learning_rate": 1.201489022773057e-06,
"loss": 1.1223,
"step": 231
},
{
"epoch": 1.3815201192250373,
"grad_norm": 20.96135711669922,
"learning_rate": 1.1808993897346679e-06,
"loss": 1.1385,
"step": 232
},
{
"epoch": 1.3874813710879286,
"grad_norm": 35.41992950439453,
"learning_rate": 1.160433012552508e-06,
"loss": 1.1785,
"step": 233
},
{
"epoch": 1.3934426229508197,
"grad_norm": 15.66014575958252,
"learning_rate": 1.1400918035908238e-06,
"loss": 1.1337,
"step": 234
},
{
"epoch": 1.3994038748137108,
"grad_norm": 11.301997184753418,
"learning_rate": 1.1198776635182273e-06,
"loss": 1.1967,
"step": 235
},
{
"epoch": 1.405365126676602,
"grad_norm": 20.638965606689453,
"learning_rate": 1.0997924811301008e-06,
"loss": 1.2153,
"step": 236
},
{
"epoch": 1.4113263785394934,
"grad_norm": 28.17490005493164,
"learning_rate": 1.079838133172111e-06,
"loss": 1.1678,
"step": 237
},
{
"epoch": 1.4172876304023845,
"grad_norm": 23.852008819580078,
"learning_rate": 1.0600164841648435e-06,
"loss": 1.1183,
"step": 238
},
{
"epoch": 1.4232488822652756,
"grad_norm": 15.352867126464844,
"learning_rate": 1.0403293862295863e-06,
"loss": 1.2151,
"step": 239
},
{
"epoch": 1.4292101341281669,
"grad_norm": 26.934675216674805,
"learning_rate": 1.0207786789152672e-06,
"loss": 1.1222,
"step": 240
},
{
"epoch": 1.4351713859910582,
"grad_norm": 10.703940391540527,
"learning_rate": 1.0013661890265656e-06,
"loss": 1.0719,
"step": 241
},
{
"epoch": 1.4411326378539493,
"grad_norm": 12.302316665649414,
"learning_rate": 9.820937304532221e-07,
"loss": 1.2103,
"step": 242
},
{
"epoch": 1.4470938897168406,
"grad_norm": 16.27391815185547,
"learning_rate": 9.629631040005469e-07,
"loss": 1.1064,
"step": 243
},
{
"epoch": 1.4530551415797317,
"grad_norm": 16.081787109375,
"learning_rate": 9.439760972211545e-07,
"loss": 1.1514,
"step": 244
},
{
"epoch": 1.459016393442623,
"grad_norm": 16.725788116455078,
"learning_rate": 9.251344842479332e-07,
"loss": 1.1508,
"step": 245
},
{
"epoch": 1.464977645305514,
"grad_norm": 43.19313049316406,
"learning_rate": 9.064400256282757e-07,
"loss": 1.205,
"step": 246
},
{
"epoch": 1.4709388971684054,
"grad_norm": 44.95701599121094,
"learning_rate": 8.878944681595742e-07,
"loss": 1.1407,
"step": 247
},
{
"epoch": 1.4769001490312965,
"grad_norm": 16.098857879638672,
"learning_rate": 8.694995447259955e-07,
"loss": 1.1705,
"step": 248
},
{
"epoch": 1.4828614008941878,
"grad_norm": 21.242740631103516,
"learning_rate": 8.512569741365692e-07,
"loss": 1.1271,
"step": 249
},
{
"epoch": 1.488822652757079,
"grad_norm": 44.7149772644043,
"learning_rate": 8.331684609645779e-07,
"loss": 1.1659,
"step": 250
},
{
"epoch": 1.4947839046199702,
"grad_norm": 27.816669464111328,
"learning_rate": 8.152356953882857e-07,
"loss": 1.1576,
"step": 251
},
{
"epoch": 1.5007451564828616,
"grad_norm": 28.994232177734375,
"learning_rate": 7.974603530330069e-07,
"loss": 1.104,
"step": 252
},
{
"epoch": 1.5067064083457526,
"grad_norm": 277.7655334472656,
"learning_rate": 7.7984409481454e-07,
"loss": 1.1457,
"step": 253
},
{
"epoch": 1.5126676602086437,
"grad_norm": 21.334651947021484,
"learning_rate": 7.623885667839686e-07,
"loss": 1.0349,
"step": 254
},
{
"epoch": 1.518628912071535,
"grad_norm": 29.977994918823242,
"learning_rate": 7.450953999738584e-07,
"loss": 1.1867,
"step": 255
},
{
"epoch": 1.5245901639344264,
"grad_norm": 13.20008659362793,
"learning_rate": 7.279662102458551e-07,
"loss": 1.1297,
"step": 256
},
{
"epoch": 1.5305514157973175,
"grad_norm": 22.25104522705078,
"learning_rate": 7.110025981396976e-07,
"loss": 1.1136,
"step": 257
},
{
"epoch": 1.5365126676602086,
"grad_norm": 75.48139190673828,
"learning_rate": 6.942061487236654e-07,
"loss": 1.1347,
"step": 258
},
{
"epoch": 1.5424739195230999,
"grad_norm": 13.588129997253418,
"learning_rate": 6.775784314464717e-07,
"loss": 1.1234,
"step": 259
},
{
"epoch": 1.5484351713859912,
"grad_norm": 17.303014755249023,
"learning_rate": 6.611209999906124e-07,
"loss": 1.1218,
"step": 260
},
{
"epoch": 1.5543964232488823,
"grad_norm": 13.967643737792969,
"learning_rate": 6.448353921271949e-07,
"loss": 1.297,
"step": 261
},
{
"epoch": 1.5603576751117734,
"grad_norm": 14.436738014221191,
"learning_rate": 6.28723129572247e-07,
"loss": 1.1563,
"step": 262
},
{
"epoch": 1.5663189269746647,
"grad_norm": 18.954809188842773,
"learning_rate": 6.12785717844531e-07,
"loss": 1.1938,
"step": 263
},
{
"epoch": 1.572280178837556,
"grad_norm": 13.521712303161621,
"learning_rate": 5.970246461248668e-07,
"loss": 1.1681,
"step": 264
},
{
"epoch": 1.578241430700447,
"grad_norm": 22.596717834472656,
"learning_rate": 5.814413871169844e-07,
"loss": 1.1362,
"step": 265
},
{
"epoch": 1.5842026825633382,
"grad_norm": 23.537677764892578,
"learning_rate": 5.660373969099178e-07,
"loss": 1.249,
"step": 266
},
{
"epoch": 1.5901639344262295,
"grad_norm": 41.0632209777832,
"learning_rate": 5.508141148419443e-07,
"loss": 1.2773,
"step": 267
},
{
"epoch": 1.5961251862891208,
"grad_norm": 11.993062973022461,
"learning_rate": 5.357729633660999e-07,
"loss": 1.0553,
"step": 268
},
{
"epoch": 1.602086438152012,
"grad_norm": 16.83928871154785,
"learning_rate": 5.209153479172607e-07,
"loss": 1.19,
"step": 269
},
{
"epoch": 1.608047690014903,
"grad_norm": 13.438224792480469,
"learning_rate": 5.062426567808237e-07,
"loss": 1.1166,
"step": 270
},
{
"epoch": 1.6140089418777943,
"grad_norm": 12.740294456481934,
"learning_rate": 4.917562609629847e-07,
"loss": 1.0806,
"step": 271
},
{
"epoch": 1.6199701937406856,
"grad_norm": 40.68648910522461,
"learning_rate": 4.774575140626317e-07,
"loss": 1.278,
"step": 272
},
{
"epoch": 1.6259314456035767,
"grad_norm": 45.1893310546875,
"learning_rate": 4.6334775214486786e-07,
"loss": 1.1741,
"step": 273
},
{
"epoch": 1.6318926974664678,
"grad_norm": 11.675634384155273,
"learning_rate": 4.494282936161681e-07,
"loss": 1.1918,
"step": 274
},
{
"epoch": 1.6378539493293591,
"grad_norm": 18.91298484802246,
"learning_rate": 4.3570043910118986e-07,
"loss": 1.2231,
"step": 275
},
{
"epoch": 1.6438152011922504,
"grad_norm": 14.937652587890625,
"learning_rate": 4.221654713212431e-07,
"loss": 1.1785,
"step": 276
},
{
"epoch": 1.6497764530551415,
"grad_norm": 12.2354097366333,
"learning_rate": 4.088246549744332e-07,
"loss": 1.0918,
"step": 277
},
{
"epoch": 1.6557377049180326,
"grad_norm": 17.386564254760742,
"learning_rate": 3.956792366174894e-07,
"loss": 1.1152,
"step": 278
},
{
"epoch": 1.661698956780924,
"grad_norm": 13.0751314163208,
"learning_rate": 3.8273044454928547e-07,
"loss": 1.1894,
"step": 279
},
{
"epoch": 1.6676602086438153,
"grad_norm": 40.181907653808594,
"learning_rate": 3.699794886960706e-07,
"loss": 1.206,
"step": 280
},
{
"epoch": 1.6736214605067063,
"grad_norm": 16.61675453186035,
"learning_rate": 3.5742756049841397e-07,
"loss": 1.1038,
"step": 281
},
{
"epoch": 1.6795827123695977,
"grad_norm": 20.62757682800293,
"learning_rate": 3.450758327998768e-07,
"loss": 1.12,
"step": 282
},
{
"epoch": 1.685543964232489,
"grad_norm": 11.517589569091797,
"learning_rate": 3.329254597374232e-07,
"loss": 1.1531,
"step": 283
},
{
"epoch": 1.69150521609538,
"grad_norm": 15.695626258850098,
"learning_rate": 3.209775766335771e-07,
"loss": 1.1176,
"step": 284
},
{
"epoch": 1.6974664679582712,
"grad_norm": 13.518576622009277,
"learning_rate": 3.092332998903416e-07,
"loss": 1.1052,
"step": 285
},
{
"epoch": 1.7034277198211625,
"grad_norm": 26.3131160736084,
"learning_rate": 2.976937268848787e-07,
"loss": 1.1914,
"step": 286
},
{
"epoch": 1.7093889716840538,
"grad_norm": 8.992045402526855,
"learning_rate": 2.8635993586697555e-07,
"loss": 1.1349,
"step": 287
},
{
"epoch": 1.7153502235469449,
"grad_norm": 15.58350944519043,
"learning_rate": 2.752329858582906e-07,
"loss": 1.1917,
"step": 288
},
{
"epoch": 1.721311475409836,
"grad_norm": 19.328996658325195,
"learning_rate": 2.643139165534009e-07,
"loss": 1.1012,
"step": 289
},
{
"epoch": 1.7272727272727273,
"grad_norm": 53.852378845214844,
"learning_rate": 2.5360374822265276e-07,
"loss": 1.1354,
"step": 290
},
{
"epoch": 1.7332339791356186,
"grad_norm": 21.8743896484375,
"learning_rate": 2.431034816168279e-07,
"loss": 1.2384,
"step": 291
},
{
"epoch": 1.7391952309985097,
"grad_norm": 17.44244956970215,
"learning_rate": 2.3281409787363652e-07,
"loss": 1.1438,
"step": 292
},
{
"epoch": 1.7451564828614008,
"grad_norm": 11.067893981933594,
"learning_rate": 2.227365584260377e-07,
"loss": 1.0889,
"step": 293
},
{
"epoch": 1.751117734724292,
"grad_norm": 28.883140563964844,
"learning_rate": 2.1287180491240455e-07,
"loss": 1.0806,
"step": 294
},
{
"epoch": 1.7570789865871834,
"grad_norm": 15.901183128356934,
"learning_rate": 2.0322075908853934e-07,
"loss": 1.1334,
"step": 295
},
{
"epoch": 1.7630402384500745,
"grad_norm": 36.24775695800781,
"learning_rate": 1.9378432274154424e-07,
"loss": 1.2407,
"step": 296
},
{
"epoch": 1.7690014903129656,
"grad_norm": 20.610580444335938,
"learning_rate": 1.8456337760555915e-07,
"loss": 1.2004,
"step": 297
},
{
"epoch": 1.774962742175857,
"grad_norm": 25.599275588989258,
"learning_rate": 1.7555878527937164e-07,
"loss": 1.2447,
"step": 298
},
{
"epoch": 1.7809239940387482,
"grad_norm": 27.45682716369629,
"learning_rate": 1.6677138714591313e-07,
"loss": 1.1359,
"step": 299
},
{
"epoch": 1.7868852459016393,
"grad_norm": 9.919366836547852,
"learning_rate": 1.5820200429363775e-07,
"loss": 1.1541,
"step": 300
},
{
"epoch": 1.7928464977645304,
"grad_norm": 25.252059936523438,
"learning_rate": 1.498514374398008e-07,
"loss": 1.1065,
"step": 301
},
{
"epoch": 1.7988077496274217,
"grad_norm": 10.612046241760254,
"learning_rate": 1.417204668556421e-07,
"loss": 1.0322,
"step": 302
},
{
"epoch": 1.804769001490313,
"grad_norm": 29.110383987426758,
"learning_rate": 1.3380985229347555e-07,
"loss": 1.0643,
"step": 303
},
{
"epoch": 1.8107302533532041,
"grad_norm": 28.44003677368164,
"learning_rate": 1.2612033291569985e-07,
"loss": 1.197,
"step": 304
},
{
"epoch": 1.8166915052160952,
"grad_norm": 16.720949172973633,
"learning_rate": 1.1865262722573073e-07,
"loss": 1.1588,
"step": 305
},
{
"epoch": 1.8226527570789866,
"grad_norm": 24.2500057220459,
"learning_rate": 1.1140743300086603e-07,
"loss": 1.1925,
"step": 306
},
{
"epoch": 1.8286140089418779,
"grad_norm": 29.950042724609375,
"learning_rate": 1.0438542722708444e-07,
"loss": 1.2223,
"step": 307
},
{
"epoch": 1.834575260804769,
"grad_norm": 13.418863296508789,
"learning_rate": 9.758726603578932e-08,
"loss": 1.0422,
"step": 308
},
{
"epoch": 1.84053651266766,
"grad_norm": 31.109628677368164,
"learning_rate": 9.101358464249921e-08,
"loss": 1.1159,
"step": 309
},
{
"epoch": 1.8464977645305514,
"grad_norm": 13.534027099609375,
"learning_rate": 8.466499728749411e-08,
"loss": 1.2084,
"step": 310
},
{
"epoch": 1.8524590163934427,
"grad_norm": 17.113624572753906,
"learning_rate": 7.854209717842231e-08,
"loss": 1.125,
"step": 311
},
{
"epoch": 1.8584202682563338,
"grad_norm": 11.193347930908203,
"learning_rate": 7.264545643486997e-08,
"loss": 1.03,
"step": 312
},
{
"epoch": 1.864381520119225,
"grad_norm": 90.95343017578125,
"learning_rate": 6.697562603490387e-08,
"loss": 1.1461,
"step": 313
},
{
"epoch": 1.8703427719821164,
"grad_norm": 9.864035606384277,
"learning_rate": 6.153313576358705e-08,
"loss": 1.1409,
"step": 314
},
{
"epoch": 1.8763040238450075,
"grad_norm": 9.691558837890625,
"learning_rate": 5.6318494163477564e-08,
"loss": 1.2217,
"step": 315
},
{
"epoch": 1.8822652757078986,
"grad_norm": 14.702143669128418,
"learning_rate": 5.133218848711013e-08,
"loss": 1.0934,
"step": 316
},
{
"epoch": 1.88822652757079,
"grad_norm": 17.254133224487305,
"learning_rate": 4.657468465146642e-08,
"loss": 1.1534,
"step": 317
},
{
"epoch": 1.8941877794336812,
"grad_norm": 60.02861022949219,
"learning_rate": 4.20464271944418e-08,
"loss": 1.2898,
"step": 318
},
{
"epoch": 1.9001490312965723,
"grad_norm": 16.171571731567383,
"learning_rate": 3.774783923330694e-08,
"loss": 1.0887,
"step": 319
},
{
"epoch": 1.9061102831594634,
"grad_norm": 11.587321281433105,
"learning_rate": 3.3679322425172466e-08,
"loss": 1.0143,
"step": 320
},
{
"epoch": 1.9120715350223547,
"grad_norm": 28.936967849731445,
"learning_rate": 2.984125692945872e-08,
"loss": 1.2466,
"step": 321
},
{
"epoch": 1.918032786885246,
"grad_norm": 11.389568328857422,
"learning_rate": 2.6234001372372196e-08,
"loss": 1.1364,
"step": 322
},
{
"epoch": 1.9239940387481371,
"grad_norm": 11.347761154174805,
"learning_rate": 2.2857892813398785e-08,
"loss": 1.0225,
"step": 323
},
{
"epoch": 1.9299552906110282,
"grad_norm": 17.69095802307129,
"learning_rate": 1.9713246713805588e-08,
"loss": 1.0806,
"step": 324
},
{
"epoch": 1.9359165424739195,
"grad_norm": 18.81931495666504,
"learning_rate": 1.680035690716758e-08,
"loss": 1.1148,
"step": 325
},
{
"epoch": 1.9418777943368108,
"grad_norm": 14.300539016723633,
"learning_rate": 1.411949557191039e-08,
"loss": 1.1152,
"step": 326
},
{
"epoch": 1.947839046199702,
"grad_norm": 14.622347831726074,
"learning_rate": 1.1670913205878431e-08,
"loss": 1.0456,
"step": 327
},
{
"epoch": 1.953800298062593,
"grad_norm": 15.9490327835083,
"learning_rate": 9.454838602928341e-09,
"loss": 1.0858,
"step": 328
},
{
"epoch": 1.9597615499254843,
"grad_norm": 17.17402458190918,
"learning_rate": 7.471478831550804e-09,
"loss": 1.0958,
"step": 329
},
{
"epoch": 1.9657228017883757,
"grad_norm": 15.29993724822998,
"learning_rate": 5.721019215522428e-09,
"loss": 1.0968,
"step": 330
},
{
"epoch": 1.9716840536512668,
"grad_norm": 20.376798629760742,
"learning_rate": 4.2036233165893006e-09,
"loss": 1.1139,
"step": 331
},
{
"epoch": 1.9776453055141578,
"grad_norm": 38.81945037841797,
"learning_rate": 2.919432919183396e-09,
"loss": 1.1274,
"step": 332
},
{
"epoch": 1.9836065573770492,
"grad_norm": 54.10487747192383,
"learning_rate": 1.8685680171745547e-09,
"loss": 1.1034,
"step": 333
},
{
"epoch": 1.9895678092399405,
"grad_norm": 18.45663070678711,
"learning_rate": 1.051126802658342e-09,
"loss": 1.0882,
"step": 334
},
{
"epoch": 1.9955290611028316,
"grad_norm": 22.40350341796875,
"learning_rate": 4.671856567811661e-10,
"loss": 1.2242,
"step": 335
},
{
"epoch": 2.0,
"grad_norm": 11.405494689941406,
"learning_rate": 1.167991426032078e-10,
"loss": 0.8141,
"step": 336
}
],
"logging_steps": 1.0,
"max_steps": 336,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.533791144373125e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}