{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.10004486316733961,
"eval_steps": 500,
"global_step": 669,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00014954389113204725,
"grad_norm": 35.95169344376715,
"learning_rate": 4.975124378109453e-08,
"loss": 1.1911,
"step": 1
},
{
"epoch": 0.0002990877822640945,
"grad_norm": 29.047342527504238,
"learning_rate": 9.950248756218906e-08,
"loss": 1.4707,
"step": 2
},
{
"epoch": 0.00044863167339614175,
"grad_norm": 24.718727160032117,
"learning_rate": 1.4925373134328358e-07,
"loss": 0.9534,
"step": 3
},
{
"epoch": 0.000598175564528189,
"grad_norm": 32.87218994198639,
"learning_rate": 1.9900497512437812e-07,
"loss": 1.2192,
"step": 4
},
{
"epoch": 0.0007477194556602363,
"grad_norm": 25.398344980222138,
"learning_rate": 2.4875621890547267e-07,
"loss": 1.1835,
"step": 5
},
{
"epoch": 0.0008972633467922835,
"grad_norm": 30.48079389364258,
"learning_rate": 2.9850746268656716e-07,
"loss": 1.0024,
"step": 6
},
{
"epoch": 0.0010468072379243307,
"grad_norm": 27.780032565686206,
"learning_rate": 3.4825870646766175e-07,
"loss": 1.1796,
"step": 7
},
{
"epoch": 0.001196351129056378,
"grad_norm": 33.19634259772052,
"learning_rate": 3.9800995024875624e-07,
"loss": 0.9585,
"step": 8
},
{
"epoch": 0.0013458950201884253,
"grad_norm": 32.92097675417938,
"learning_rate": 4.4776119402985074e-07,
"loss": 1.1831,
"step": 9
},
{
"epoch": 0.0014954389113204726,
"grad_norm": 31.267461918177617,
"learning_rate": 4.975124378109453e-07,
"loss": 0.9208,
"step": 10
},
{
"epoch": 0.0016449828024525197,
"grad_norm": 31.652990928454088,
"learning_rate": 5.472636815920398e-07,
"loss": 0.8882,
"step": 11
},
{
"epoch": 0.001794526693584567,
"grad_norm": 33.800482625732165,
"learning_rate": 5.970149253731343e-07,
"loss": 1.2138,
"step": 12
},
{
"epoch": 0.0019440705847166143,
"grad_norm": 30.753216086819556,
"learning_rate": 6.467661691542289e-07,
"loss": 0.9896,
"step": 13
},
{
"epoch": 0.0020936144758486614,
"grad_norm": 32.57679525538582,
"learning_rate": 6.965174129353235e-07,
"loss": 0.9195,
"step": 14
},
{
"epoch": 0.0022431583669807087,
"grad_norm": 25.334089702892793,
"learning_rate": 7.462686567164179e-07,
"loss": 0.7515,
"step": 15
},
{
"epoch": 0.002392702258112756,
"grad_norm": 22.2961872211284,
"learning_rate": 7.960199004975125e-07,
"loss": 0.6638,
"step": 16
},
{
"epoch": 0.0025422461492448033,
"grad_norm": 24.245556768411276,
"learning_rate": 8.457711442786071e-07,
"loss": 0.7704,
"step": 17
},
{
"epoch": 0.0026917900403768506,
"grad_norm": 19.23412917202397,
"learning_rate": 8.955223880597015e-07,
"loss": 0.7354,
"step": 18
},
{
"epoch": 0.002841333931508898,
"grad_norm": 18.58051317424024,
"learning_rate": 9.452736318407961e-07,
"loss": 0.5749,
"step": 19
},
{
"epoch": 0.0029908778226409452,
"grad_norm": 11.242228896944281,
"learning_rate": 9.950248756218907e-07,
"loss": 0.4914,
"step": 20
},
{
"epoch": 0.0031404217137729925,
"grad_norm": 11.163527479225325,
"learning_rate": 1.044776119402985e-06,
"loss": 0.5823,
"step": 21
},
{
"epoch": 0.0032899656049050394,
"grad_norm": 9.100766388616314,
"learning_rate": 1.0945273631840796e-06,
"loss": 0.6887,
"step": 22
},
{
"epoch": 0.0034395094960370867,
"grad_norm": 9.371427313022828,
"learning_rate": 1.1442786069651742e-06,
"loss": 0.3365,
"step": 23
},
{
"epoch": 0.003589053387169134,
"grad_norm": 6.591365654298028,
"learning_rate": 1.1940298507462686e-06,
"loss": 0.4092,
"step": 24
},
{
"epoch": 0.0037385972783011813,
"grad_norm": 6.692920733889971,
"learning_rate": 1.2437810945273632e-06,
"loss": 0.4459,
"step": 25
},
{
"epoch": 0.0038881411694332286,
"grad_norm": 6.609492289627464,
"learning_rate": 1.2935323383084578e-06,
"loss": 0.4577,
"step": 26
},
{
"epoch": 0.004037685060565276,
"grad_norm": 4.9115623336358,
"learning_rate": 1.3432835820895524e-06,
"loss": 0.5349,
"step": 27
},
{
"epoch": 0.004187228951697323,
"grad_norm": 5.117676678055004,
"learning_rate": 1.393034825870647e-06,
"loss": 0.5483,
"step": 28
},
{
"epoch": 0.0043367728428293706,
"grad_norm": 5.263481949191207,
"learning_rate": 1.4427860696517414e-06,
"loss": 0.5991,
"step": 29
},
{
"epoch": 0.004486316733961417,
"grad_norm": 6.131569220022702,
"learning_rate": 1.4925373134328358e-06,
"loss": 0.3908,
"step": 30
},
{
"epoch": 0.004635860625093465,
"grad_norm": 5.928579435490833,
"learning_rate": 1.5422885572139304e-06,
"loss": 0.2084,
"step": 31
},
{
"epoch": 0.004785404516225512,
"grad_norm": 5.916757088180695,
"learning_rate": 1.592039800995025e-06,
"loss": 0.3858,
"step": 32
},
{
"epoch": 0.00493494840735756,
"grad_norm": 8.20423570651997,
"learning_rate": 1.6417910447761196e-06,
"loss": 0.2901,
"step": 33
},
{
"epoch": 0.005084492298489607,
"grad_norm": 8.219360009824356,
"learning_rate": 1.6915422885572142e-06,
"loss": 0.3919,
"step": 34
},
{
"epoch": 0.005234036189621654,
"grad_norm": 5.998450714995048,
"learning_rate": 1.7412935323383088e-06,
"loss": 0.2445,
"step": 35
},
{
"epoch": 0.005383580080753701,
"grad_norm": 4.267389037528284,
"learning_rate": 1.791044776119403e-06,
"loss": 0.2062,
"step": 36
},
{
"epoch": 0.005533123971885748,
"grad_norm": 5.463746992191978,
"learning_rate": 1.8407960199004975e-06,
"loss": 0.5357,
"step": 37
},
{
"epoch": 0.005682667863017796,
"grad_norm": 4.306281637510176,
"learning_rate": 1.8905472636815921e-06,
"loss": 0.1867,
"step": 38
},
{
"epoch": 0.005832211754149843,
"grad_norm": 6.551059942168939,
"learning_rate": 1.9402985074626867e-06,
"loss": 0.5944,
"step": 39
},
{
"epoch": 0.0059817556452818905,
"grad_norm": 6.110559490141819,
"learning_rate": 1.9900497512437813e-06,
"loss": 0.6173,
"step": 40
},
{
"epoch": 0.006131299536413937,
"grad_norm": 4.577457366278138,
"learning_rate": 2.0398009950248755e-06,
"loss": 0.3634,
"step": 41
},
{
"epoch": 0.006280843427545985,
"grad_norm": 6.020057986889502,
"learning_rate": 2.08955223880597e-06,
"loss": 0.5398,
"step": 42
},
{
"epoch": 0.006430387318678032,
"grad_norm": 12.119213807947853,
"learning_rate": 2.1393034825870647e-06,
"loss": 0.2376,
"step": 43
},
{
"epoch": 0.006579931209810079,
"grad_norm": 4.977979102095054,
"learning_rate": 2.1890547263681593e-06,
"loss": 0.2455,
"step": 44
},
{
"epoch": 0.006729475100942127,
"grad_norm": 3.4274663141099166,
"learning_rate": 2.238805970149254e-06,
"loss": 0.2356,
"step": 45
},
{
"epoch": 0.0068790189920741734,
"grad_norm": 4.552279062958819,
"learning_rate": 2.2885572139303485e-06,
"loss": 0.1681,
"step": 46
},
{
"epoch": 0.007028562883206221,
"grad_norm": 2.9323320786902496,
"learning_rate": 2.338308457711443e-06,
"loss": 0.2303,
"step": 47
},
{
"epoch": 0.007178106774338268,
"grad_norm": 4.623033466327724,
"learning_rate": 2.3880597014925373e-06,
"loss": 0.2404,
"step": 48
},
{
"epoch": 0.007327650665470316,
"grad_norm": 5.05007020882628,
"learning_rate": 2.437810945273632e-06,
"loss": 0.4128,
"step": 49
},
{
"epoch": 0.007477194556602363,
"grad_norm": 2.5237349934200273,
"learning_rate": 2.4875621890547264e-06,
"loss": 0.2196,
"step": 50
},
{
"epoch": 0.00762673844773441,
"grad_norm": 3.7483142878646594,
"learning_rate": 2.537313432835821e-06,
"loss": 0.1725,
"step": 51
},
{
"epoch": 0.007776282338866457,
"grad_norm": 4.032155563605261,
"learning_rate": 2.5870646766169156e-06,
"loss": 0.3821,
"step": 52
},
{
"epoch": 0.007925826229998505,
"grad_norm": 3.7782327104964333,
"learning_rate": 2.6368159203980102e-06,
"loss": 0.2207,
"step": 53
},
{
"epoch": 0.008075370121130552,
"grad_norm": 4.816720331969929,
"learning_rate": 2.686567164179105e-06,
"loss": 0.2265,
"step": 54
},
{
"epoch": 0.008224914012262599,
"grad_norm": 2.8481845548797478,
"learning_rate": 2.736318407960199e-06,
"loss": 0.2174,
"step": 55
},
{
"epoch": 0.008374457903394646,
"grad_norm": 4.501151176073331,
"learning_rate": 2.786069651741294e-06,
"loss": 0.2306,
"step": 56
},
{
"epoch": 0.008524001794526694,
"grad_norm": 4.326693136186164,
"learning_rate": 2.835820895522388e-06,
"loss": 0.4023,
"step": 57
},
{
"epoch": 0.008673545685658741,
"grad_norm": 4.061925818141106,
"learning_rate": 2.885572139303483e-06,
"loss": 0.7602,
"step": 58
},
{
"epoch": 0.008823089576790788,
"grad_norm": 6.144988240043741,
"learning_rate": 2.9353233830845774e-06,
"loss": 0.4451,
"step": 59
},
{
"epoch": 0.008972633467922835,
"grad_norm": 4.985549166627373,
"learning_rate": 2.9850746268656716e-06,
"loss": 0.4621,
"step": 60
},
{
"epoch": 0.009122177359054883,
"grad_norm": 3.192079125281125,
"learning_rate": 3.0348258706467666e-06,
"loss": 0.3694,
"step": 61
},
{
"epoch": 0.00927172125018693,
"grad_norm": 4.653619400771914,
"learning_rate": 3.0845771144278608e-06,
"loss": 0.2416,
"step": 62
},
{
"epoch": 0.009421265141318977,
"grad_norm": 3.4214006556775156,
"learning_rate": 3.1343283582089558e-06,
"loss": 0.4755,
"step": 63
},
{
"epoch": 0.009570809032451024,
"grad_norm": 3.0809019894250613,
"learning_rate": 3.18407960199005e-06,
"loss": 0.4154,
"step": 64
},
{
"epoch": 0.009720352923583071,
"grad_norm": 4.190290076677796,
"learning_rate": 3.233830845771145e-06,
"loss": 0.4362,
"step": 65
},
{
"epoch": 0.00986989681471512,
"grad_norm": 3.1777725686355356,
"learning_rate": 3.283582089552239e-06,
"loss": 0.3635,
"step": 66
},
{
"epoch": 0.010019440705847166,
"grad_norm": 2.592442539170553,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1739,
"step": 67
},
{
"epoch": 0.010168984596979213,
"grad_norm": 4.610893839801018,
"learning_rate": 3.3830845771144283e-06,
"loss": 0.3845,
"step": 68
},
{
"epoch": 0.01031852848811126,
"grad_norm": 2.941030939381248,
"learning_rate": 3.4328358208955225e-06,
"loss": 0.226,
"step": 69
},
{
"epoch": 0.010468072379243309,
"grad_norm": 2.641062959772403,
"learning_rate": 3.4825870646766175e-06,
"loss": 0.2083,
"step": 70
},
{
"epoch": 0.010617616270375356,
"grad_norm": 4.573399002022637,
"learning_rate": 3.5323383084577117e-06,
"loss": 0.3639,
"step": 71
},
{
"epoch": 0.010767160161507403,
"grad_norm": 3.811597787697304,
"learning_rate": 3.582089552238806e-06,
"loss": 0.2046,
"step": 72
},
{
"epoch": 0.01091670405263945,
"grad_norm": 7.593654702612937,
"learning_rate": 3.631840796019901e-06,
"loss": 0.3831,
"step": 73
},
{
"epoch": 0.011066247943771496,
"grad_norm": 2.6372126137968013,
"learning_rate": 3.681592039800995e-06,
"loss": 0.2155,
"step": 74
},
{
"epoch": 0.011215791834903545,
"grad_norm": 3.401033168780161,
"learning_rate": 3.73134328358209e-06,
"loss": 0.2439,
"step": 75
},
{
"epoch": 0.011365335726035592,
"grad_norm": 2.8172647382036047,
"learning_rate": 3.7810945273631843e-06,
"loss": 0.1614,
"step": 76
},
{
"epoch": 0.011514879617167639,
"grad_norm": 3.525793180439174,
"learning_rate": 3.8308457711442784e-06,
"loss": 0.2176,
"step": 77
},
{
"epoch": 0.011664423508299685,
"grad_norm": 2.4029805525684527,
"learning_rate": 3.8805970149253735e-06,
"loss": 0.1893,
"step": 78
},
{
"epoch": 0.011813967399431732,
"grad_norm": 5.727795685387504,
"learning_rate": 3.930348258706468e-06,
"loss": 0.5702,
"step": 79
},
{
"epoch": 0.011963511290563781,
"grad_norm": 4.021893784746645,
"learning_rate": 3.980099502487563e-06,
"loss": 0.4027,
"step": 80
},
{
"epoch": 0.012113055181695828,
"grad_norm": 2.7773808558650535,
"learning_rate": 4.029850746268657e-06,
"loss": 0.2963,
"step": 81
},
{
"epoch": 0.012262599072827875,
"grad_norm": 3.4349426033049992,
"learning_rate": 4.079601990049751e-06,
"loss": 0.2211,
"step": 82
},
{
"epoch": 0.012412142963959922,
"grad_norm": 4.127258766074891,
"learning_rate": 4.129353233830846e-06,
"loss": 0.2516,
"step": 83
},
{
"epoch": 0.01256168685509197,
"grad_norm": 3.551977981988865,
"learning_rate": 4.17910447761194e-06,
"loss": 0.2206,
"step": 84
},
{
"epoch": 0.012711230746224017,
"grad_norm": 2.988554589230421,
"learning_rate": 4.228855721393035e-06,
"loss": 0.366,
"step": 85
},
{
"epoch": 0.012860774637356064,
"grad_norm": 3.256233912334862,
"learning_rate": 4.278606965174129e-06,
"loss": 0.341,
"step": 86
},
{
"epoch": 0.01301031852848811,
"grad_norm": 3.917242635149468,
"learning_rate": 4.3283582089552236e-06,
"loss": 0.281,
"step": 87
},
{
"epoch": 0.013159862419620158,
"grad_norm": 3.8372869351661247,
"learning_rate": 4.378109452736319e-06,
"loss": 0.1933,
"step": 88
},
{
"epoch": 0.013309406310752206,
"grad_norm": 4.03192980896834,
"learning_rate": 4.427860696517413e-06,
"loss": 0.184,
"step": 89
},
{
"epoch": 0.013458950201884253,
"grad_norm": 4.944440623197377,
"learning_rate": 4.477611940298508e-06,
"loss": 0.2406,
"step": 90
},
{
"epoch": 0.0136084940930163,
"grad_norm": 3.2771345760625916,
"learning_rate": 4.527363184079602e-06,
"loss": 0.3635,
"step": 91
},
{
"epoch": 0.013758037984148347,
"grad_norm": 2.5552685161479913,
"learning_rate": 4.577114427860697e-06,
"loss": 0.3581,
"step": 92
},
{
"epoch": 0.013907581875280395,
"grad_norm": 3.825258197515859,
"learning_rate": 4.626865671641791e-06,
"loss": 0.2157,
"step": 93
},
{
"epoch": 0.014057125766412442,
"grad_norm": 3.820006828326968,
"learning_rate": 4.676616915422886e-06,
"loss": 0.401,
"step": 94
},
{
"epoch": 0.01420666965754449,
"grad_norm": 3.4269639891084056,
"learning_rate": 4.72636815920398e-06,
"loss": 0.21,
"step": 95
},
{
"epoch": 0.014356213548676536,
"grad_norm": 3.614177044324435,
"learning_rate": 4.7761194029850745e-06,
"loss": 0.2305,
"step": 96
},
{
"epoch": 0.014505757439808583,
"grad_norm": 2.8474787904051633,
"learning_rate": 4.8258706467661695e-06,
"loss": 0.2002,
"step": 97
},
{
"epoch": 0.014655301330940632,
"grad_norm": 3.1529185682156333,
"learning_rate": 4.875621890547264e-06,
"loss": 0.3126,
"step": 98
},
{
"epoch": 0.014804845222072678,
"grad_norm": 2.805579699726101,
"learning_rate": 4.925373134328359e-06,
"loss": 0.3977,
"step": 99
},
{
"epoch": 0.014954389113204725,
"grad_norm": 2.5072872378288134,
"learning_rate": 4.975124378109453e-06,
"loss": 0.1986,
"step": 100
},
{
"epoch": 0.015103933004336772,
"grad_norm": 2.8773082972301816,
"learning_rate": 5.024875621890548e-06,
"loss": 0.2421,
"step": 101
},
{
"epoch": 0.01525347689546882,
"grad_norm": 2.3650776175631765,
"learning_rate": 5.074626865671642e-06,
"loss": 0.1864,
"step": 102
},
{
"epoch": 0.015403020786600868,
"grad_norm": 4.721891286027898,
"learning_rate": 5.124378109452737e-06,
"loss": 0.2939,
"step": 103
},
{
"epoch": 0.015552564677732915,
"grad_norm": 2.6753396233648705,
"learning_rate": 5.174129353233831e-06,
"loss": 0.2558,
"step": 104
},
{
"epoch": 0.01570210856886496,
"grad_norm": 3.149876968312327,
"learning_rate": 5.2238805970149255e-06,
"loss": 0.3405,
"step": 105
},
{
"epoch": 0.01585165245999701,
"grad_norm": 1.6322197066205648,
"learning_rate": 5.2736318407960205e-06,
"loss": 0.1453,
"step": 106
},
{
"epoch": 0.016001196351129055,
"grad_norm": 3.3492234789043236,
"learning_rate": 5.323383084577115e-06,
"loss": 0.404,
"step": 107
},
{
"epoch": 0.016150740242261104,
"grad_norm": 2.2518951047915157,
"learning_rate": 5.37313432835821e-06,
"loss": 0.2278,
"step": 108
},
{
"epoch": 0.016300284133393152,
"grad_norm": 3.0471913491370404,
"learning_rate": 5.422885572139304e-06,
"loss": 0.265,
"step": 109
},
{
"epoch": 0.016449828024525198,
"grad_norm": 1.6928519222295142,
"learning_rate": 5.472636815920398e-06,
"loss": 0.2169,
"step": 110
},
{
"epoch": 0.016599371915657246,
"grad_norm": 3.265018826674296,
"learning_rate": 5.522388059701493e-06,
"loss": 0.429,
"step": 111
},
{
"epoch": 0.01674891580678929,
"grad_norm": 2.637671664378066,
"learning_rate": 5.572139303482588e-06,
"loss": 0.2762,
"step": 112
},
{
"epoch": 0.01689845969792134,
"grad_norm": 3.1617986987096134,
"learning_rate": 5.621890547263682e-06,
"loss": 0.4272,
"step": 113
},
{
"epoch": 0.01704800358905339,
"grad_norm": 3.0132316717807175,
"learning_rate": 5.671641791044776e-06,
"loss": 0.3644,
"step": 114
},
{
"epoch": 0.017197547480185434,
"grad_norm": 2.2850314864309813,
"learning_rate": 5.721393034825871e-06,
"loss": 0.1967,
"step": 115
},
{
"epoch": 0.017347091371317482,
"grad_norm": 3.0835871860462314,
"learning_rate": 5.771144278606966e-06,
"loss": 0.2322,
"step": 116
},
{
"epoch": 0.017496635262449527,
"grad_norm": 3.5275796788122893,
"learning_rate": 5.820895522388061e-06,
"loss": 0.3543,
"step": 117
},
{
"epoch": 0.017646179153581576,
"grad_norm": 3.1301356173345494,
"learning_rate": 5.870646766169155e-06,
"loss": 0.5064,
"step": 118
},
{
"epoch": 0.017795723044713625,
"grad_norm": 3.9689250366780313,
"learning_rate": 5.920398009950249e-06,
"loss": 0.8428,
"step": 119
},
{
"epoch": 0.01794526693584567,
"grad_norm": 2.6992548320472984,
"learning_rate": 5.970149253731343e-06,
"loss": 0.2727,
"step": 120
},
{
"epoch": 0.01809481082697772,
"grad_norm": 2.8823271138601414,
"learning_rate": 6.019900497512439e-06,
"loss": 0.3301,
"step": 121
},
{
"epoch": 0.018244354718109767,
"grad_norm": 2.652199321292131,
"learning_rate": 6.069651741293533e-06,
"loss": 0.234,
"step": 122
},
{
"epoch": 0.018393898609241812,
"grad_norm": 4.008459949806747,
"learning_rate": 6.119402985074627e-06,
"loss": 0.5713,
"step": 123
},
{
"epoch": 0.01854344250037386,
"grad_norm": 2.8867543983581236,
"learning_rate": 6.1691542288557215e-06,
"loss": 0.2146,
"step": 124
},
{
"epoch": 0.018692986391505906,
"grad_norm": 2.379666412119815,
"learning_rate": 6.218905472636816e-06,
"loss": 0.3812,
"step": 125
},
{
"epoch": 0.018842530282637954,
"grad_norm": 2.8364015730213716,
"learning_rate": 6.2686567164179116e-06,
"loss": 0.3729,
"step": 126
},
{
"epoch": 0.018992074173770003,
"grad_norm": 2.9731590306978957,
"learning_rate": 6.318407960199006e-06,
"loss": 0.3922,
"step": 127
},
{
"epoch": 0.019141618064902048,
"grad_norm": 2.431931443805707,
"learning_rate": 6.3681592039801e-06,
"loss": 0.2316,
"step": 128
},
{
"epoch": 0.019291161956034097,
"grad_norm": 2.5964092588685594,
"learning_rate": 6.417910447761194e-06,
"loss": 0.2129,
"step": 129
},
{
"epoch": 0.019440705847166142,
"grad_norm": 4.241711858566103,
"learning_rate": 6.46766169154229e-06,
"loss": 0.2677,
"step": 130
},
{
"epoch": 0.01959024973829819,
"grad_norm": 3.743763522090278,
"learning_rate": 6.517412935323384e-06,
"loss": 0.7324,
"step": 131
},
{
"epoch": 0.01973979362943024,
"grad_norm": 2.325325226468886,
"learning_rate": 6.567164179104478e-06,
"loss": 0.2282,
"step": 132
},
{
"epoch": 0.019889337520562284,
"grad_norm": 2.187485810642544,
"learning_rate": 6.6169154228855725e-06,
"loss": 0.3479,
"step": 133
},
{
"epoch": 0.020038881411694333,
"grad_norm": 2.555235252803596,
"learning_rate": 6.666666666666667e-06,
"loss": 0.3084,
"step": 134
},
{
"epoch": 0.020188425302826378,
"grad_norm": 2.1409254211343405,
"learning_rate": 6.7164179104477625e-06,
"loss": 0.2413,
"step": 135
},
{
"epoch": 0.020337969193958427,
"grad_norm": 2.9475030013466292,
"learning_rate": 6.766169154228857e-06,
"loss": 0.5899,
"step": 136
},
{
"epoch": 0.020487513085090475,
"grad_norm": 3.161190387153201,
"learning_rate": 6.815920398009951e-06,
"loss": 0.2722,
"step": 137
},
{
"epoch": 0.02063705697622252,
"grad_norm": 3.4231688087143786,
"learning_rate": 6.865671641791045e-06,
"loss": 0.25,
"step": 138
},
{
"epoch": 0.02078660086735457,
"grad_norm": 2.891852432700459,
"learning_rate": 6.915422885572139e-06,
"loss": 0.5206,
"step": 139
},
{
"epoch": 0.020936144758486618,
"grad_norm": 2.4149596821734645,
"learning_rate": 6.965174129353235e-06,
"loss": 0.2792,
"step": 140
},
{
"epoch": 0.021085688649618663,
"grad_norm": 2.737327253049286,
"learning_rate": 7.014925373134329e-06,
"loss": 0.1785,
"step": 141
},
{
"epoch": 0.02123523254075071,
"grad_norm": 2.271710572333297,
"learning_rate": 7.064676616915423e-06,
"loss": 0.2216,
"step": 142
},
{
"epoch": 0.021384776431882756,
"grad_norm": 3.123818135886555,
"learning_rate": 7.114427860696518e-06,
"loss": 0.5292,
"step": 143
},
{
"epoch": 0.021534320323014805,
"grad_norm": 3.4353230085188775,
"learning_rate": 7.164179104477612e-06,
"loss": 0.257,
"step": 144
},
{
"epoch": 0.021683864214146854,
"grad_norm": 3.292198842322858,
"learning_rate": 7.213930348258708e-06,
"loss": 0.4413,
"step": 145
},
{
"epoch": 0.0218334081052789,
"grad_norm": 2.408669543365234,
"learning_rate": 7.263681592039802e-06,
"loss": 0.4034,
"step": 146
},
{
"epoch": 0.021982951996410947,
"grad_norm": 2.918318139010717,
"learning_rate": 7.313432835820896e-06,
"loss": 0.1789,
"step": 147
},
{
"epoch": 0.022132495887542993,
"grad_norm": 2.016064943310167,
"learning_rate": 7.36318407960199e-06,
"loss": 0.2454,
"step": 148
},
{
"epoch": 0.02228203977867504,
"grad_norm": 3.375282717272202,
"learning_rate": 7.412935323383084e-06,
"loss": 0.5047,
"step": 149
},
{
"epoch": 0.02243158366980709,
"grad_norm": 2.747548142801912,
"learning_rate": 7.46268656716418e-06,
"loss": 0.3193,
"step": 150
},
{
"epoch": 0.022581127560939135,
"grad_norm": 5.014531999850111,
"learning_rate": 7.512437810945274e-06,
"loss": 0.5367,
"step": 151
},
{
"epoch": 0.022730671452071183,
"grad_norm": 1.7396197448467992,
"learning_rate": 7.5621890547263685e-06,
"loss": 0.1602,
"step": 152
},
{
"epoch": 0.02288021534320323,
"grad_norm": 3.9271159318267452,
"learning_rate": 7.611940298507463e-06,
"loss": 0.2763,
"step": 153
},
{
"epoch": 0.023029759234335277,
"grad_norm": 2.093726492507833,
"learning_rate": 7.661691542288557e-06,
"loss": 0.169,
"step": 154
},
{
"epoch": 0.023179303125467326,
"grad_norm": 1.5357011381308088,
"learning_rate": 7.711442786069654e-06,
"loss": 0.1619,
"step": 155
},
{
"epoch": 0.02332884701659937,
"grad_norm": 2.3824458230974863,
"learning_rate": 7.761194029850747e-06,
"loss": 0.2094,
"step": 156
},
{
"epoch": 0.02347839090773142,
"grad_norm": 2.8236663879690784,
"learning_rate": 7.810945273631842e-06,
"loss": 0.3426,
"step": 157
},
{
"epoch": 0.023627934798863465,
"grad_norm": 3.1375695638809815,
"learning_rate": 7.860696517412935e-06,
"loss": 0.5518,
"step": 158
},
{
"epoch": 0.023777478689995513,
"grad_norm": 3.2182906468856105,
"learning_rate": 7.91044776119403e-06,
"loss": 0.1995,
"step": 159
},
{
"epoch": 0.023927022581127562,
"grad_norm": 14.749841980168513,
"learning_rate": 7.960199004975125e-06,
"loss": 0.5578,
"step": 160
},
{
"epoch": 0.024076566472259607,
"grad_norm": 3.0100123201004045,
"learning_rate": 8.00995024875622e-06,
"loss": 0.5091,
"step": 161
},
{
"epoch": 0.024226110363391656,
"grad_norm": 3.5091520525666433,
"learning_rate": 8.059701492537314e-06,
"loss": 0.5357,
"step": 162
},
{
"epoch": 0.024375654254523704,
"grad_norm": 2.934851375582722,
"learning_rate": 8.109452736318409e-06,
"loss": 0.2267,
"step": 163
},
{
"epoch": 0.02452519814565575,
"grad_norm": 2.5911339240383544,
"learning_rate": 8.159203980099502e-06,
"loss": 0.1782,
"step": 164
},
{
"epoch": 0.024674742036787798,
"grad_norm": 2.847206263316536,
"learning_rate": 8.208955223880599e-06,
"loss": 0.2252,
"step": 165
},
{
"epoch": 0.024824285927919843,
"grad_norm": 3.5380431553535976,
"learning_rate": 8.258706467661692e-06,
"loss": 0.4295,
"step": 166
},
{
"epoch": 0.024973829819051892,
"grad_norm": 3.150492354924513,
"learning_rate": 8.308457711442787e-06,
"loss": 0.3276,
"step": 167
},
{
"epoch": 0.02512337371018394,
"grad_norm": 3.114695975436696,
"learning_rate": 8.35820895522388e-06,
"loss": 0.5181,
"step": 168
},
{
"epoch": 0.025272917601315985,
"grad_norm": 2.6180846619509355,
"learning_rate": 8.407960199004975e-06,
"loss": 0.2577,
"step": 169
},
{
"epoch": 0.025422461492448034,
"grad_norm": 1.859950631659999,
"learning_rate": 8.45771144278607e-06,
"loss": 0.1838,
"step": 170
},
{
"epoch": 0.02557200538358008,
"grad_norm": 4.092195798232618,
"learning_rate": 8.507462686567165e-06,
"loss": 0.2676,
"step": 171
},
{
"epoch": 0.025721549274712128,
"grad_norm": 2.0820308098425766,
"learning_rate": 8.557213930348259e-06,
"loss": 0.2528,
"step": 172
},
{
"epoch": 0.025871093165844176,
"grad_norm": 2.8153771201369087,
"learning_rate": 8.606965174129354e-06,
"loss": 0.3374,
"step": 173
},
{
"epoch": 0.02602063705697622,
"grad_norm": 2.6417342231989114,
"learning_rate": 8.656716417910447e-06,
"loss": 0.4309,
"step": 174
},
{
"epoch": 0.02617018094810827,
"grad_norm": 3.3553357791865825,
"learning_rate": 8.706467661691544e-06,
"loss": 0.279,
"step": 175
},
{
"epoch": 0.026319724839240315,
"grad_norm": 2.5896987414147707,
"learning_rate": 8.756218905472637e-06,
"loss": 0.2505,
"step": 176
},
{
"epoch": 0.026469268730372364,
"grad_norm": 15.917959164107543,
"learning_rate": 8.805970149253732e-06,
"loss": 0.3903,
"step": 177
},
{
"epoch": 0.026618812621504413,
"grad_norm": 1.897502276352634,
"learning_rate": 8.855721393034826e-06,
"loss": 0.3051,
"step": 178
},
{
"epoch": 0.026768356512636458,
"grad_norm": 3.498345426750877,
"learning_rate": 8.905472636815922e-06,
"loss": 0.8122,
"step": 179
},
{
"epoch": 0.026917900403768506,
"grad_norm": 3.2270107650642297,
"learning_rate": 8.955223880597016e-06,
"loss": 0.2312,
"step": 180
},
{
"epoch": 0.027067444294900555,
"grad_norm": 2.373617987334166,
"learning_rate": 9.00497512437811e-06,
"loss": 0.3553,
"step": 181
},
{
"epoch": 0.0272169881860326,
"grad_norm": 2.022495433415561,
"learning_rate": 9.054726368159204e-06,
"loss": 0.3372,
"step": 182
},
{
"epoch": 0.02736653207716465,
"grad_norm": 2.471303542690233,
"learning_rate": 9.104477611940299e-06,
"loss": 0.2764,
"step": 183
},
{
"epoch": 0.027516075968296694,
"grad_norm": 2.170550660433261,
"learning_rate": 9.154228855721394e-06,
"loss": 0.2429,
"step": 184
},
{
"epoch": 0.027665619859428742,
"grad_norm": 1.7750572924031363,
"learning_rate": 9.203980099502489e-06,
"loss": 0.1749,
"step": 185
},
{
"epoch": 0.02781516375056079,
"grad_norm": 1.9803173977955488,
"learning_rate": 9.253731343283582e-06,
"loss": 0.3061,
"step": 186
},
{
"epoch": 0.027964707641692836,
"grad_norm": 2.686793479118654,
"learning_rate": 9.303482587064677e-06,
"loss": 0.2704,
"step": 187
},
{
"epoch": 0.028114251532824885,
"grad_norm": 3.0095995560762088,
"learning_rate": 9.353233830845772e-06,
"loss": 0.3935,
"step": 188
},
{
"epoch": 0.02826379542395693,
"grad_norm": 3.296780241377357,
"learning_rate": 9.402985074626867e-06,
"loss": 0.4349,
"step": 189
},
{
"epoch": 0.02841333931508898,
"grad_norm": 2.0473844316492262,
"learning_rate": 9.45273631840796e-06,
"loss": 0.3594,
"step": 190
},
{
"epoch": 0.028562883206221027,
"grad_norm": 2.6746439974295986,
"learning_rate": 9.502487562189056e-06,
"loss": 0.2507,
"step": 191
},
{
"epoch": 0.028712427097353072,
"grad_norm": 2.171372767224107,
"learning_rate": 9.552238805970149e-06,
"loss": 0.4442,
"step": 192
},
{
"epoch": 0.02886197098848512,
"grad_norm": 3.412610878033882,
"learning_rate": 9.601990049751244e-06,
"loss": 0.5065,
"step": 193
},
{
"epoch": 0.029011514879617166,
"grad_norm": 2.5249672849820843,
"learning_rate": 9.651741293532339e-06,
"loss": 0.2775,
"step": 194
},
{
"epoch": 0.029161058770749215,
"grad_norm": 1.9244063665371054,
"learning_rate": 9.701492537313434e-06,
"loss": 0.2501,
"step": 195
},
{
"epoch": 0.029310602661881263,
"grad_norm": 2.2928756876943788,
"learning_rate": 9.751243781094527e-06,
"loss": 0.391,
"step": 196
},
{
"epoch": 0.02946014655301331,
"grad_norm": 3.2090175671059464,
"learning_rate": 9.800995024875622e-06,
"loss": 0.355,
"step": 197
},
{
"epoch": 0.029609690444145357,
"grad_norm": 2.564275054094989,
"learning_rate": 9.850746268656717e-06,
"loss": 0.3824,
"step": 198
},
{
"epoch": 0.029759234335277406,
"grad_norm": 2.2612313847384473,
"learning_rate": 9.900497512437812e-06,
"loss": 0.255,
"step": 199
},
{
"epoch": 0.02990877822640945,
"grad_norm": 2.867410801811384,
"learning_rate": 9.950248756218906e-06,
"loss": 0.2321,
"step": 200
},
{
"epoch": 0.0300583221175415,
"grad_norm": 2.7017080308625316,
"learning_rate": 1e-05,
"loss": 0.5355,
"step": 201
},
{
"epoch": 0.030207866008673544,
"grad_norm": 1.7563631058650533,
"learning_rate": 9.999999413475907e-06,
"loss": 0.2366,
"step": 202
},
{
"epoch": 0.030357409899805593,
"grad_norm": 2.7923486514729134,
"learning_rate": 9.999997653903764e-06,
"loss": 0.5735,
"step": 203
},
{
"epoch": 0.03050695379093764,
"grad_norm": 2.5477270678585935,
"learning_rate": 9.999994721283985e-06,
"loss": 0.2316,
"step": 204
},
{
"epoch": 0.030656497682069687,
"grad_norm": 1.6435827637040603,
"learning_rate": 9.99999061561726e-06,
"loss": 0.1958,
"step": 205
},
{
"epoch": 0.030806041573201735,
"grad_norm": 4.225438559077688,
"learning_rate": 9.999985336904546e-06,
"loss": 0.6052,
"step": 206
},
{
"epoch": 0.03095558546433378,
"grad_norm": 2.384218907777814,
"learning_rate": 9.999978885147086e-06,
"loss": 0.382,
"step": 207
},
{
"epoch": 0.03110512935546583,
"grad_norm": 3.082533240684358,
"learning_rate": 9.999971260346394e-06,
"loss": 0.4615,
"step": 208
},
{
"epoch": 0.03125467324659788,
"grad_norm": 2.126341746782405,
"learning_rate": 9.999962462504259e-06,
"loss": 0.3489,
"step": 209
},
{
"epoch": 0.03140421713772992,
"grad_norm": 2.3157719584793974,
"learning_rate": 9.99995249162274e-06,
"loss": 0.351,
"step": 210
},
{
"epoch": 0.03155376102886197,
"grad_norm": 3.2569828989709046,
"learning_rate": 9.999941347704183e-06,
"loss": 0.5452,
"step": 211
},
{
"epoch": 0.03170330491999402,
"grad_norm": 2.4010549422177747,
"learning_rate": 9.999929030751199e-06,
"loss": 0.5511,
"step": 212
},
{
"epoch": 0.031852848811126065,
"grad_norm": 2.2021354319659956,
"learning_rate": 9.999915540766679e-06,
"loss": 0.409,
"step": 213
},
{
"epoch": 0.03200239270225811,
"grad_norm": 2.7467598032746467,
"learning_rate": 9.999900877753786e-06,
"loss": 0.2769,
"step": 214
},
{
"epoch": 0.03215193659339016,
"grad_norm": 2.250991470386846,
"learning_rate": 9.99988504171596e-06,
"loss": 0.4243,
"step": 215
},
{
"epoch": 0.03230148048452221,
"grad_norm": 7.389570164962262,
"learning_rate": 9.999868032656921e-06,
"loss": 0.5661,
"step": 216
},
{
"epoch": 0.03245102437565425,
"grad_norm": 2.3232325152419904,
"learning_rate": 9.999849850580653e-06,
"loss": 0.3622,
"step": 217
},
{
"epoch": 0.032600568266786305,
"grad_norm": 2.8448629192721153,
"learning_rate": 9.999830495491425e-06,
"loss": 0.5013,
"step": 218
},
{
"epoch": 0.03275011215791835,
"grad_norm": 1.9203985094095042,
"learning_rate": 9.99980996739378e-06,
"loss": 0.2597,
"step": 219
},
{
"epoch": 0.032899656049050395,
"grad_norm": 2.1343351176097705,
"learning_rate": 9.99978826629253e-06,
"loss": 0.333,
"step": 220
},
{
"epoch": 0.03304919994018244,
"grad_norm": 2.675496675158128,
"learning_rate": 9.999765392192766e-06,
"loss": 0.4679,
"step": 221
},
{
"epoch": 0.03319874383131449,
"grad_norm": 2.954897252892918,
"learning_rate": 9.99974134509986e-06,
"loss": 0.5779,
"step": 222
},
{
"epoch": 0.03334828772244654,
"grad_norm": 3.164155125145253,
"learning_rate": 9.999716125019448e-06,
"loss": 0.5192,
"step": 223
},
{
"epoch": 0.03349783161357858,
"grad_norm": 2.9422429580445377,
"learning_rate": 9.99968973195745e-06,
"loss": 0.3514,
"step": 224
},
{
"epoch": 0.033647375504710635,
"grad_norm": 2.016818218277119,
"learning_rate": 9.999662165920056e-06,
"loss": 0.3657,
"step": 225
},
{
"epoch": 0.03379691939584268,
"grad_norm": 2.805692301474297,
"learning_rate": 9.999633426913733e-06,
"loss": 0.1912,
"step": 226
},
{
"epoch": 0.033946463286974725,
"grad_norm": 2.205403428118743,
"learning_rate": 9.999603514945227e-06,
"loss": 0.234,
"step": 227
},
{
"epoch": 0.03409600717810678,
"grad_norm": 2.013271573198516,
"learning_rate": 9.999572430021553e-06,
"loss": 0.464,
"step": 228
},
{
"epoch": 0.03424555106923882,
"grad_norm": 3.033803346792209,
"learning_rate": 9.999540172150005e-06,
"loss": 0.2599,
"step": 229
},
{
"epoch": 0.03439509496037087,
"grad_norm": 2.854186400231596,
"learning_rate": 9.99950674133815e-06,
"loss": 0.6431,
"step": 230
},
{
"epoch": 0.03454463885150292,
"grad_norm": 2.162434347622467,
"learning_rate": 9.999472137593829e-06,
"loss": 0.4779,
"step": 231
},
{
"epoch": 0.034694182742634964,
"grad_norm": 1.4691335020169023,
"learning_rate": 9.999436360925165e-06,
"loss": 0.1827,
"step": 232
},
{
"epoch": 0.03484372663376701,
"grad_norm": 1.6955188606947214,
"learning_rate": 9.99939941134055e-06,
"loss": 0.2336,
"step": 233
},
{
"epoch": 0.034993270524899055,
"grad_norm": 2.0710606069082167,
"learning_rate": 9.99936128884865e-06,
"loss": 0.3671,
"step": 234
},
{
"epoch": 0.03514281441603111,
"grad_norm": 2.128464465717484,
"learning_rate": 9.999321993458411e-06,
"loss": 0.2928,
"step": 235
},
{
"epoch": 0.03529235830716315,
"grad_norm": 1.9685227247781487,
"learning_rate": 9.999281525179054e-06,
"loss": 0.185,
"step": 236
},
{
"epoch": 0.0354419021982952,
"grad_norm": 2.3203573768463115,
"learning_rate": 9.99923988402007e-06,
"loss": 0.3733,
"step": 237
},
{
"epoch": 0.03559144608942725,
"grad_norm": 2.2161639851963457,
"learning_rate": 9.99919706999123e-06,
"loss": 0.4,
"step": 238
},
{
"epoch": 0.035740989980559294,
"grad_norm": 1.551687214387557,
"learning_rate": 9.99915308310258e-06,
"loss": 0.1723,
"step": 239
},
{
"epoch": 0.03589053387169134,
"grad_norm": 1.9544776771870587,
"learning_rate": 9.999107923364436e-06,
"loss": 0.2587,
"step": 240
},
{
"epoch": 0.03604007776282339,
"grad_norm": 2.1986380601508375,
"learning_rate": 9.999061590787394e-06,
"loss": 0.544,
"step": 241
},
{
"epoch": 0.03618962165395544,
"grad_norm": 2.5816888510040457,
"learning_rate": 9.999014085382326e-06,
"loss": 0.4619,
"step": 242
},
{
"epoch": 0.03633916554508748,
"grad_norm": 1.8291845348661409,
"learning_rate": 9.998965407160377e-06,
"loss": 0.2052,
"step": 243
},
{
"epoch": 0.036488709436219534,
"grad_norm": 3.167062575704647,
"learning_rate": 9.998915556132966e-06,
"loss": 0.6123,
"step": 244
},
{
"epoch": 0.03663825332735158,
"grad_norm": 1.8628898225455814,
"learning_rate": 9.99886453231179e-06,
"loss": 0.3634,
"step": 245
},
{
"epoch": 0.036787797218483624,
"grad_norm": 1.7903762911789451,
"learning_rate": 9.998812335708818e-06,
"loss": 0.2162,
"step": 246
},
{
"epoch": 0.03693734110961567,
"grad_norm": 1.3282642487848175,
"learning_rate": 9.998758966336296e-06,
"loss": 0.1875,
"step": 247
},
{
"epoch": 0.03708688500074772,
"grad_norm": 1.8364953512469955,
"learning_rate": 9.998704424206747e-06,
"loss": 0.208,
"step": 248
},
{
"epoch": 0.037236428891879766,
"grad_norm": 1.3941303606582691,
"learning_rate": 9.998648709332965e-06,
"loss": 0.1737,
"step": 249
},
{
"epoch": 0.03738597278301181,
"grad_norm": 1.7239196409011197,
"learning_rate": 9.998591821728022e-06,
"loss": 0.2339,
"step": 250
},
{
"epoch": 0.037535516674143864,
"grad_norm": 2.623262386600702,
"learning_rate": 9.998533761405265e-06,
"loss": 0.3988,
"step": 251
},
{
"epoch": 0.03768506056527591,
"grad_norm": 3.0417113736320354,
"learning_rate": 9.998474528378315e-06,
"loss": 0.3998,
"step": 252
},
{
"epoch": 0.037834604456407954,
"grad_norm": 2.3389769972346532,
"learning_rate": 9.998414122661066e-06,
"loss": 0.2157,
"step": 253
},
{
"epoch": 0.037984148347540006,
"grad_norm": 2.776666496961099,
"learning_rate": 9.998352544267696e-06,
"loss": 0.5598,
"step": 254
},
{
"epoch": 0.03813369223867205,
"grad_norm": 2.1472401976055746,
"learning_rate": 9.998289793212645e-06,
"loss": 0.2375,
"step": 255
},
{
"epoch": 0.038283236129804096,
"grad_norm": 2.258529852719024,
"learning_rate": 9.99822586951064e-06,
"loss": 0.257,
"step": 256
},
{
"epoch": 0.03843278002093614,
"grad_norm": 2.234662282588329,
"learning_rate": 9.998160773176676e-06,
"loss": 0.2513,
"step": 257
},
{
"epoch": 0.038582323912068194,
"grad_norm": 1.557075634748184,
"learning_rate": 9.998094504226025e-06,
"loss": 0.2154,
"step": 258
},
{
"epoch": 0.03873186780320024,
"grad_norm": 1.2782097805836874,
"learning_rate": 9.998027062674236e-06,
"loss": 0.1997,
"step": 259
},
{
"epoch": 0.038881411694332284,
"grad_norm": 1.5754692941437902,
"learning_rate": 9.997958448537129e-06,
"loss": 0.2271,
"step": 260
},
{
"epoch": 0.039030955585464336,
"grad_norm": 2.3273358127526516,
"learning_rate": 9.997888661830803e-06,
"loss": 0.4129,
"step": 261
},
{
"epoch": 0.03918049947659638,
"grad_norm": 2.5932478274973705,
"learning_rate": 9.997817702571631e-06,
"loss": 0.2762,
"step": 262
},
{
"epoch": 0.039330043367728426,
"grad_norm": 1.7415819067090217,
"learning_rate": 9.99774557077626e-06,
"loss": 0.2677,
"step": 263
},
{
"epoch": 0.03947958725886048,
"grad_norm": 2.1983315861883974,
"learning_rate": 9.997672266461613e-06,
"loss": 0.3412,
"step": 264
},
{
"epoch": 0.03962913114999252,
"grad_norm": 2.8445138272257666,
"learning_rate": 9.997597789644889e-06,
"loss": 0.3471,
"step": 265
},
{
"epoch": 0.03977867504112457,
"grad_norm": 2.6658347323464575,
"learning_rate": 9.997522140343558e-06,
"loss": 0.3785,
"step": 266
},
{
"epoch": 0.03992821893225662,
"grad_norm": 1.2913669477506569,
"learning_rate": 9.997445318575371e-06,
"loss": 0.2089,
"step": 267
},
{
"epoch": 0.040077762823388666,
"grad_norm": 2.440102551085522,
"learning_rate": 9.99736732435835e-06,
"loss": 0.5639,
"step": 268
},
{
"epoch": 0.04022730671452071,
"grad_norm": 2.252623935384866,
"learning_rate": 9.997288157710795e-06,
"loss": 0.447,
"step": 269
},
{
"epoch": 0.040376850605652756,
"grad_norm": 1.9038309319538977,
"learning_rate": 9.997207818651273e-06,
"loss": 0.2784,
"step": 270
},
{
"epoch": 0.04052639449678481,
"grad_norm": 2.05316637395224,
"learning_rate": 9.99712630719864e-06,
"loss": 0.3874,
"step": 271
},
{
"epoch": 0.04067593838791685,
"grad_norm": 4.663034399257074,
"learning_rate": 9.997043623372016e-06,
"loss": 0.3558,
"step": 272
},
{
"epoch": 0.0408254822790489,
"grad_norm": 2.0324793909935375,
"learning_rate": 9.996959767190799e-06,
"loss": 0.3884,
"step": 273
},
{
"epoch": 0.04097502617018095,
"grad_norm": 2.1897027573531003,
"learning_rate": 9.996874738674663e-06,
"loss": 0.2372,
"step": 274
},
{
"epoch": 0.041124570061312996,
"grad_norm": 1.9410471939157525,
"learning_rate": 9.996788537843558e-06,
"loss": 0.3478,
"step": 275
},
{
"epoch": 0.04127411395244504,
"grad_norm": 3.650983914269082,
"learning_rate": 9.996701164717704e-06,
"loss": 0.4213,
"step": 276
},
{
"epoch": 0.04142365784357709,
"grad_norm": 3.067988013237884,
"learning_rate": 9.996612619317602e-06,
"loss": 0.7209,
"step": 277
},
{
"epoch": 0.04157320173470914,
"grad_norm": 2.5863303551652033,
"learning_rate": 9.996522901664028e-06,
"loss": 0.5418,
"step": 278
},
{
"epoch": 0.04172274562584118,
"grad_norm": 2.1885641779249476,
"learning_rate": 9.996432011778026e-06,
"loss": 0.371,
"step": 279
},
{
"epoch": 0.041872289516973235,
"grad_norm": 2.398824728854803,
"learning_rate": 9.99633994968092e-06,
"loss": 0.5508,
"step": 280
},
{
"epoch": 0.04202183340810528,
"grad_norm": 1.5732032420608302,
"learning_rate": 9.996246715394314e-06,
"loss": 0.2468,
"step": 281
},
{
"epoch": 0.042171377299237325,
"grad_norm": 2.8532279807617944,
"learning_rate": 9.996152308940075e-06,
"loss": 0.5503,
"step": 282
},
{
"epoch": 0.04232092119036937,
"grad_norm": 2.4502727303222733,
"learning_rate": 9.996056730340356e-06,
"loss": 0.4046,
"step": 283
},
{
"epoch": 0.04247046508150142,
"grad_norm": 1.9272098426705169,
"learning_rate": 9.995959979617578e-06,
"loss": 0.3906,
"step": 284
},
{
"epoch": 0.04262000897263347,
"grad_norm": 2.290690335549339,
"learning_rate": 9.995862056794441e-06,
"loss": 0.2464,
"step": 285
},
{
"epoch": 0.04276955286376551,
"grad_norm": 1.656564250859485,
"learning_rate": 9.99576296189392e-06,
"loss": 0.1996,
"step": 286
},
{
"epoch": 0.042919096754897565,
"grad_norm": 2.1259148220336965,
"learning_rate": 9.995662694939262e-06,
"loss": 0.3994,
"step": 287
},
{
"epoch": 0.04306864064602961,
"grad_norm": 2.286901143642134,
"learning_rate": 9.99556125595399e-06,
"loss": 0.4047,
"step": 288
},
{
"epoch": 0.043218184537161655,
"grad_norm": 1.3559455912309712,
"learning_rate": 9.995458644961902e-06,
"loss": 0.2228,
"step": 289
},
{
"epoch": 0.04336772842829371,
"grad_norm": 2.285750924681825,
"learning_rate": 9.995354861987075e-06,
"loss": 0.2367,
"step": 290
},
{
"epoch": 0.04351727231942575,
"grad_norm": 1.923824453592428,
"learning_rate": 9.995249907053854e-06,
"loss": 0.3951,
"step": 291
},
{
"epoch": 0.0436668162105578,
"grad_norm": 1.968047953500074,
"learning_rate": 9.995143780186865e-06,
"loss": 0.2149,
"step": 292
},
{
"epoch": 0.04381636010168984,
"grad_norm": 2.3975790519132074,
"learning_rate": 9.995036481411005e-06,
"loss": 0.5312,
"step": 293
},
{
"epoch": 0.043965903992821895,
"grad_norm": 1.9664546058841197,
"learning_rate": 9.994928010751447e-06,
"loss": 0.4832,
"step": 294
},
{
"epoch": 0.04411544788395394,
"grad_norm": 2.1609011533249785,
"learning_rate": 9.994818368233639e-06,
"loss": 0.571,
"step": 295
},
{
"epoch": 0.044264991775085985,
"grad_norm": 1.2099666806993736,
"learning_rate": 9.994707553883305e-06,
"loss": 0.1801,
"step": 296
},
{
"epoch": 0.04441453566621804,
"grad_norm": 1.8811137964659612,
"learning_rate": 9.994595567726444e-06,
"loss": 0.2708,
"step": 297
},
{
"epoch": 0.04456407955735008,
"grad_norm": 1.6387011737954997,
"learning_rate": 9.994482409789329e-06,
"loss": 0.245,
"step": 298
},
{
"epoch": 0.04471362344848213,
"grad_norm": 2.4061797367092486,
"learning_rate": 9.994368080098505e-06,
"loss": 0.204,
"step": 299
},
{
"epoch": 0.04486316733961418,
"grad_norm": 2.555264958903577,
"learning_rate": 9.994252578680796e-06,
"loss": 0.5251,
"step": 300
},
{
"epoch": 0.045012711230746225,
"grad_norm": 3.1965886018503897,
"learning_rate": 9.994135905563302e-06,
"loss": 0.4353,
"step": 301
},
{
"epoch": 0.04516225512187827,
"grad_norm": 2.390530599961774,
"learning_rate": 9.994018060773396e-06,
"loss": 0.4199,
"step": 302
},
{
"epoch": 0.04531179901301032,
"grad_norm": 2.694731420269419,
"learning_rate": 9.993899044338722e-06,
"loss": 0.4029,
"step": 303
},
{
"epoch": 0.04546134290414237,
"grad_norm": 2.5518583518075437,
"learning_rate": 9.993778856287205e-06,
"loss": 0.3712,
"step": 304
},
{
"epoch": 0.04561088679527441,
"grad_norm": 1.958382495979976,
"learning_rate": 9.99365749664704e-06,
"loss": 0.3617,
"step": 305
},
{
"epoch": 0.04576043068640646,
"grad_norm": 2.299652220902115,
"learning_rate": 9.993534965446701e-06,
"loss": 0.4059,
"step": 306
},
{
"epoch": 0.04590997457753851,
"grad_norm": 4.086258301258261,
"learning_rate": 9.993411262714934e-06,
"loss": 0.2774,
"step": 307
},
{
"epoch": 0.046059518468670554,
"grad_norm": 2.0081624141767156,
"learning_rate": 9.993286388480763e-06,
"loss": 0.2724,
"step": 308
},
{
"epoch": 0.0462090623598026,
"grad_norm": 2.388037596587926,
"learning_rate": 9.993160342773483e-06,
"loss": 0.2706,
"step": 309
},
{
"epoch": 0.04635860625093465,
"grad_norm": 1.5868739255084185,
"learning_rate": 9.993033125622665e-06,
"loss": 0.256,
"step": 310
},
{
"epoch": 0.0465081501420667,
"grad_norm": 1.8286822342955051,
"learning_rate": 9.992904737058157e-06,
"loss": 0.209,
"step": 311
},
{
"epoch": 0.04665769403319874,
"grad_norm": 2.2060332987484306,
"learning_rate": 9.992775177110078e-06,
"loss": 0.4253,
"step": 312
},
{
"epoch": 0.046807237924330794,
"grad_norm": 1.39628419375001,
"learning_rate": 9.992644445808826e-06,
"loss": 0.1693,
"step": 313
},
{
"epoch": 0.04695678181546284,
"grad_norm": 1.5668060198088787,
"learning_rate": 9.99251254318507e-06,
"loss": 0.24,
"step": 314
},
{
"epoch": 0.047106325706594884,
"grad_norm": 1.998270389587923,
"learning_rate": 9.992379469269758e-06,
"loss": 0.2519,
"step": 315
},
{
"epoch": 0.04725586959772693,
"grad_norm": 1.9609810436779118,
"learning_rate": 9.99224522409411e-06,
"loss": 0.2023,
"step": 316
},
{
"epoch": 0.04740541348885898,
"grad_norm": 1.4580736241239847,
"learning_rate": 9.992109807689619e-06,
"loss": 0.2387,
"step": 317
},
{
"epoch": 0.04755495737999103,
"grad_norm": 2.710681694340303,
"learning_rate": 9.991973220088057e-06,
"loss": 0.6738,
"step": 318
},
{
"epoch": 0.04770450127112307,
"grad_norm": 1.2469776099691643,
"learning_rate": 9.991835461321466e-06,
"loss": 0.2013,
"step": 319
},
{
"epoch": 0.047854045162255124,
"grad_norm": 2.128896128779159,
"learning_rate": 9.99169653142217e-06,
"loss": 0.3432,
"step": 320
},
{
"epoch": 0.04800358905338717,
"grad_norm": 1.6053097848087672,
"learning_rate": 9.991556430422759e-06,
"loss": 0.2301,
"step": 321
},
{
"epoch": 0.048153132944519214,
"grad_norm": 1.7774787600035602,
"learning_rate": 9.991415158356106e-06,
"loss": 0.2535,
"step": 322
},
{
"epoch": 0.048302676835651266,
"grad_norm": 1.449815289318445,
"learning_rate": 9.991272715255351e-06,
"loss": 0.1878,
"step": 323
},
{
"epoch": 0.04845222072678331,
"grad_norm": 1.5118547669168991,
"learning_rate": 9.991129101153916e-06,
"loss": 0.3186,
"step": 324
},
{
"epoch": 0.048601764617915356,
"grad_norm": 1.461388444407636,
"learning_rate": 9.99098431608549e-06,
"loss": 0.1747,
"step": 325
},
{
"epoch": 0.04875130850904741,
"grad_norm": 2.3912366570769974,
"learning_rate": 9.990838360084045e-06,
"loss": 0.5325,
"step": 326
},
{
"epoch": 0.048900852400179454,
"grad_norm": 2.5611474084390937,
"learning_rate": 9.990691233183823e-06,
"loss": 0.2606,
"step": 327
},
{
"epoch": 0.0490503962913115,
"grad_norm": 2.21899436894442,
"learning_rate": 9.990542935419341e-06,
"loss": 0.4253,
"step": 328
},
{
"epoch": 0.049199940182443544,
"grad_norm": 1.6883179263006298,
"learning_rate": 9.99039346682539e-06,
"loss": 0.1768,
"step": 329
},
{
"epoch": 0.049349484073575596,
"grad_norm": 3.2358870266119006,
"learning_rate": 9.990242827437036e-06,
"loss": 0.7866,
"step": 330
},
{
"epoch": 0.04949902796470764,
"grad_norm": 2.0627143054944153,
"learning_rate": 9.990091017289623e-06,
"loss": 0.3286,
"step": 331
},
{
"epoch": 0.049648571855839686,
"grad_norm": 2.1246533005850523,
"learning_rate": 9.989938036418766e-06,
"loss": 0.2716,
"step": 332
},
{
"epoch": 0.04979811574697174,
"grad_norm": 2.6250279686209828,
"learning_rate": 9.989783884860355e-06,
"loss": 0.5058,
"step": 333
},
{
"epoch": 0.049947659638103784,
"grad_norm": 2.3409062617647627,
"learning_rate": 9.989628562650558e-06,
"loss": 0.2589,
"step": 334
},
{
"epoch": 0.05009720352923583,
"grad_norm": 1.835901073337933,
"learning_rate": 9.989472069825811e-06,
"loss": 0.3493,
"step": 335
},
{
"epoch": 0.05024674742036788,
"grad_norm": 2.2454393810241298,
"learning_rate": 9.989314406422835e-06,
"loss": 0.4113,
"step": 336
},
{
"epoch": 0.050396291311499926,
"grad_norm": 2.2906853778474674,
"learning_rate": 9.989155572478611e-06,
"loss": 0.5289,
"step": 337
},
{
"epoch": 0.05054583520263197,
"grad_norm": 2.3899442476389665,
"learning_rate": 9.98899556803041e-06,
"loss": 0.2174,
"step": 338
},
{
"epoch": 0.05069537909376402,
"grad_norm": 1.3681982854338133,
"learning_rate": 9.988834393115768e-06,
"loss": 0.2021,
"step": 339
},
{
"epoch": 0.05084492298489607,
"grad_norm": 1.5118760155287632,
"learning_rate": 9.988672047772497e-06,
"loss": 0.1927,
"step": 340
},
{
"epoch": 0.05099446687602811,
"grad_norm": 2.1144895431001105,
"learning_rate": 9.988508532038685e-06,
"loss": 0.3325,
"step": 341
},
{
"epoch": 0.05114401076716016,
"grad_norm": 1.8616803287346595,
"learning_rate": 9.988343845952697e-06,
"loss": 0.3018,
"step": 342
},
{
"epoch": 0.05129355465829221,
"grad_norm": 2.787967616575242,
"learning_rate": 9.988177989553167e-06,
"loss": 0.4641,
"step": 343
},
{
"epoch": 0.051443098549424256,
"grad_norm": 2.2905797584406242,
"learning_rate": 9.98801096287901e-06,
"loss": 0.5336,
"step": 344
},
{
"epoch": 0.0515926424405563,
"grad_norm": 1.769311364935245,
"learning_rate": 9.987842765969408e-06,
"loss": 0.2843,
"step": 345
},
{
"epoch": 0.05174218633168835,
"grad_norm": 1.7122732613639495,
"learning_rate": 9.987673398863824e-06,
"loss": 0.2272,
"step": 346
},
{
"epoch": 0.0518917302228204,
"grad_norm": 2.328359950454365,
"learning_rate": 9.987502861601991e-06,
"loss": 0.2645,
"step": 347
},
{
"epoch": 0.05204127411395244,
"grad_norm": 2.208277642399548,
"learning_rate": 9.987331154223922e-06,
"loss": 0.5877,
"step": 348
},
{
"epoch": 0.052190818005084495,
"grad_norm": 2.154817789687723,
"learning_rate": 9.9871582767699e-06,
"loss": 0.3414,
"step": 349
},
{
"epoch": 0.05234036189621654,
"grad_norm": 2.0510314098551814,
"learning_rate": 9.986984229280483e-06,
"loss": 0.3981,
"step": 350
},
{
"epoch": 0.052489905787348586,
"grad_norm": 2.346735661125246,
"learning_rate": 9.986809011796503e-06,
"loss": 0.6596,
"step": 351
},
{
"epoch": 0.05263944967848063,
"grad_norm": 1.641693244293744,
"learning_rate": 9.98663262435907e-06,
"loss": 0.3657,
"step": 352
},
{
"epoch": 0.05278899356961268,
"grad_norm": 2.240226359797858,
"learning_rate": 9.986455067009566e-06,
"loss": 0.3706,
"step": 353
},
{
"epoch": 0.05293853746074473,
"grad_norm": 2.3791485993411357,
"learning_rate": 9.986276339789648e-06,
"loss": 0.5428,
"step": 354
},
{
"epoch": 0.05308808135187677,
"grad_norm": 1.7806897327965683,
"learning_rate": 9.986096442741241e-06,
"loss": 0.2336,
"step": 355
},
{
"epoch": 0.053237625243008825,
"grad_norm": 1.8563417208131827,
"learning_rate": 9.98591537590656e-06,
"loss": 0.2129,
"step": 356
},
{
"epoch": 0.05338716913414087,
"grad_norm": 2.2115041121315895,
"learning_rate": 9.98573313932808e-06,
"loss": 0.5232,
"step": 357
},
{
"epoch": 0.053536713025272915,
"grad_norm": 1.3693709893910027,
"learning_rate": 9.985549733048556e-06,
"loss": 0.3524,
"step": 358
},
{
"epoch": 0.05368625691640497,
"grad_norm": 2.033727598383455,
"learning_rate": 9.985365157111017e-06,
"loss": 0.3987,
"step": 359
},
{
"epoch": 0.05383580080753701,
"grad_norm": 2.3258255541409505,
"learning_rate": 9.985179411558767e-06,
"loss": 0.5489,
"step": 360
},
{
"epoch": 0.05398534469866906,
"grad_norm": 2.0805855861837057,
"learning_rate": 9.984992496435383e-06,
"loss": 0.3982,
"step": 361
},
{
"epoch": 0.05413488858980111,
"grad_norm": 1.4938394292792039,
"learning_rate": 9.984804411784717e-06,
"loss": 0.2279,
"step": 362
},
{
"epoch": 0.054284432480933155,
"grad_norm": 1.935765339737269,
"learning_rate": 9.984615157650896e-06,
"loss": 0.2208,
"step": 363
},
{
"epoch": 0.0544339763720652,
"grad_norm": 2.294825440673555,
"learning_rate": 9.98442473407832e-06,
"loss": 0.4006,
"step": 364
},
{
"epoch": 0.054583520263197245,
"grad_norm": 1.7404498428206792,
"learning_rate": 9.984233141111663e-06,
"loss": 0.3859,
"step": 365
},
{
"epoch": 0.0547330641543293,
"grad_norm": 2.382616866788976,
"learning_rate": 9.984040378795879e-06,
"loss": 0.5393,
"step": 366
},
{
"epoch": 0.05488260804546134,
"grad_norm": 2.121310368782044,
"learning_rate": 9.983846447176186e-06,
"loss": 0.3808,
"step": 367
},
{
"epoch": 0.05503215193659339,
"grad_norm": 1.4327836947551182,
"learning_rate": 9.983651346298089e-06,
"loss": 0.21,
"step": 368
},
{
"epoch": 0.05518169582772544,
"grad_norm": 1.8551217286702022,
"learning_rate": 9.983455076207353e-06,
"loss": 0.3611,
"step": 369
},
{
"epoch": 0.055331239718857485,
"grad_norm": 1.1962615317465979,
"learning_rate": 9.983257636950032e-06,
"loss": 0.1632,
"step": 370
},
{
"epoch": 0.05548078360998953,
"grad_norm": 2.210937603202386,
"learning_rate": 9.983059028572443e-06,
"loss": 0.2054,
"step": 371
},
{
"epoch": 0.05563032750112158,
"grad_norm": 1.3676870965949202,
"learning_rate": 9.982859251121183e-06,
"loss": 0.2257,
"step": 372
},
{
"epoch": 0.05577987139225363,
"grad_norm": 1.877238753038072,
"learning_rate": 9.98265830464312e-06,
"loss": 0.3069,
"step": 373
},
{
"epoch": 0.05592941528338567,
"grad_norm": 2.6215120058588743,
"learning_rate": 9.9824561891854e-06,
"loss": 0.3812,
"step": 374
},
{
"epoch": 0.056078959174517724,
"grad_norm": 1.5353869053774183,
"learning_rate": 9.982252904795437e-06,
"loss": 0.3038,
"step": 375
},
{
"epoch": 0.05622850306564977,
"grad_norm": 1.5387274188562523,
"learning_rate": 9.98204845152093e-06,
"loss": 0.1784,
"step": 376
},
{
"epoch": 0.056378046956781815,
"grad_norm": 2.3221296907492444,
"learning_rate": 9.981842829409842e-06,
"loss": 0.4253,
"step": 377
},
{
"epoch": 0.05652759084791386,
"grad_norm": 1.8464138105889263,
"learning_rate": 9.981636038510414e-06,
"loss": 0.2137,
"step": 378
},
{
"epoch": 0.05667713473904591,
"grad_norm": 1.9213502252741161,
"learning_rate": 9.98142807887116e-06,
"loss": 0.2652,
"step": 379
},
{
"epoch": 0.05682667863017796,
"grad_norm": 1.7697460473662174,
"learning_rate": 9.981218950540874e-06,
"loss": 0.2525,
"step": 380
},
{
"epoch": 0.05697622252131,
"grad_norm": 2.001502054151958,
"learning_rate": 9.981008653568613e-06,
"loss": 0.3749,
"step": 381
},
{
"epoch": 0.057125766412442054,
"grad_norm": 1.7507480997796745,
"learning_rate": 9.98079718800372e-06,
"loss": 0.3293,
"step": 382
},
{
"epoch": 0.0572753103035741,
"grad_norm": 1.8995856376763527,
"learning_rate": 9.980584553895805e-06,
"loss": 0.2595,
"step": 383
},
{
"epoch": 0.057424854194706144,
"grad_norm": 1.6960817341003291,
"learning_rate": 9.980370751294754e-06,
"loss": 0.3214,
"step": 384
},
{
"epoch": 0.057574398085838197,
"grad_norm": 2.747620756274178,
"learning_rate": 9.980155780250728e-06,
"loss": 0.4678,
"step": 385
},
{
"epoch": 0.05772394197697024,
"grad_norm": 1.429295181164985,
"learning_rate": 9.979939640814158e-06,
"loss": 0.3417,
"step": 386
},
{
"epoch": 0.05787348586810229,
"grad_norm": 1.546941524577904,
"learning_rate": 9.979722333035757e-06,
"loss": 0.3017,
"step": 387
},
{
"epoch": 0.05802302975923433,
"grad_norm": 2.3243262803022753,
"learning_rate": 9.979503856966504e-06,
"loss": 0.3906,
"step": 388
},
{
"epoch": 0.058172573650366384,
"grad_norm": 1.5367077444523152,
"learning_rate": 9.979284212657658e-06,
"loss": 0.2735,
"step": 389
},
{
"epoch": 0.05832211754149843,
"grad_norm": 1.0259751361449947,
"learning_rate": 9.979063400160747e-06,
"loss": 0.1788,
"step": 390
},
{
"epoch": 0.058471661432630474,
"grad_norm": 1.7811616961442123,
"learning_rate": 9.97884141952758e-06,
"loss": 0.2071,
"step": 391
},
{
"epoch": 0.058621205323762526,
"grad_norm": 2.347009922116326,
"learning_rate": 9.978618270810229e-06,
"loss": 0.4248,
"step": 392
},
{
"epoch": 0.05877074921489457,
"grad_norm": 1.3076474084417338,
"learning_rate": 9.978393954061052e-06,
"loss": 0.1771,
"step": 393
},
{
"epoch": 0.05892029310602662,
"grad_norm": 2.4165379692755455,
"learning_rate": 9.978168469332677e-06,
"loss": 0.4913,
"step": 394
},
{
"epoch": 0.05906983699715867,
"grad_norm": 1.6584516839965744,
"learning_rate": 9.977941816678e-06,
"loss": 0.2292,
"step": 395
},
{
"epoch": 0.059219380888290714,
"grad_norm": 1.3323879687206615,
"learning_rate": 9.9777139961502e-06,
"loss": 0.2042,
"step": 396
},
{
"epoch": 0.05936892477942276,
"grad_norm": 1.242996863833067,
"learning_rate": 9.977485007802725e-06,
"loss": 0.1759,
"step": 397
},
{
"epoch": 0.05951846867055481,
"grad_norm": 2.0289613301318057,
"learning_rate": 9.977254851689297e-06,
"loss": 0.3391,
"step": 398
},
{
"epoch": 0.059668012561686856,
"grad_norm": 1.7111890076718022,
"learning_rate": 9.977023527863913e-06,
"loss": 0.318,
"step": 399
},
{
"epoch": 0.0598175564528189,
"grad_norm": 2.360289838407607,
"learning_rate": 9.976791036380844e-06,
"loss": 0.7436,
"step": 400
},
{
"epoch": 0.059967100343950946,
"grad_norm": 1.6556682149662436,
"learning_rate": 9.976557377294634e-06,
"loss": 0.3579,
"step": 401
},
{
"epoch": 0.060116644235083,
"grad_norm": 1.9472299876725607,
"learning_rate": 9.976322550660103e-06,
"loss": 0.3939,
"step": 402
},
{
"epoch": 0.060266188126215044,
"grad_norm": 1.2625006623785717,
"learning_rate": 9.976086556532343e-06,
"loss": 0.1777,
"step": 403
},
{
"epoch": 0.06041573201734709,
"grad_norm": 2.142440158571368,
"learning_rate": 9.975849394966721e-06,
"loss": 0.4728,
"step": 404
},
{
"epoch": 0.06056527590847914,
"grad_norm": 1.3109446375337697,
"learning_rate": 9.975611066018876e-06,
"loss": 0.2035,
"step": 405
},
{
"epoch": 0.060714819799611186,
"grad_norm": 1.473069250695052,
"learning_rate": 9.975371569744723e-06,
"loss": 0.2502,
"step": 406
},
{
"epoch": 0.06086436369074323,
"grad_norm": 1.4147256960977963,
"learning_rate": 9.975130906200453e-06,
"loss": 0.1861,
"step": 407
},
{
"epoch": 0.06101390758187528,
"grad_norm": 1.5107559691714745,
"learning_rate": 9.97488907544252e-06,
"loss": 0.2309,
"step": 408
},
{
"epoch": 0.06116345147300733,
"grad_norm": 1.5467720756101462,
"learning_rate": 9.97464607752767e-06,
"loss": 0.235,
"step": 409
},
{
"epoch": 0.061312995364139374,
"grad_norm": 1.2901444374034334,
"learning_rate": 9.974401912512905e-06,
"loss": 0.1877,
"step": 410
},
{
"epoch": 0.061462539255271426,
"grad_norm": 1.8751659558285558,
"learning_rate": 9.974156580455512e-06,
"loss": 0.2941,
"step": 411
},
{
"epoch": 0.06161208314640347,
"grad_norm": 1.2187366523072891,
"learning_rate": 9.973910081413048e-06,
"loss": 0.2,
"step": 412
},
{
"epoch": 0.061761627037535516,
"grad_norm": 2.56665763030278,
"learning_rate": 9.973662415443342e-06,
"loss": 0.4259,
"step": 413
},
{
"epoch": 0.06191117092866756,
"grad_norm": 1.5201509236946156,
"learning_rate": 9.973413582604502e-06,
"loss": 0.2098,
"step": 414
},
{
"epoch": 0.06206071481979961,
"grad_norm": 2.2299268067487183,
"learning_rate": 9.973163582954903e-06,
"loss": 0.5054,
"step": 415
},
{
"epoch": 0.06221025871093166,
"grad_norm": 2.195400724979985,
"learning_rate": 9.972912416553202e-06,
"loss": 0.3856,
"step": 416
},
{
"epoch": 0.0623598026020637,
"grad_norm": 2.3196273331545876,
"learning_rate": 9.972660083458321e-06,
"loss": 0.5608,
"step": 417
},
{
"epoch": 0.06250934649319576,
"grad_norm": 1.6815269422927719,
"learning_rate": 9.97240658372946e-06,
"loss": 0.3682,
"step": 418
},
{
"epoch": 0.0626588903843278,
"grad_norm": 1.7582779956751238,
"learning_rate": 9.972151917426095e-06,
"loss": 0.2256,
"step": 419
},
{
"epoch": 0.06280843427545985,
"grad_norm": 1.9523974169697056,
"learning_rate": 9.97189608460797e-06,
"loss": 0.2303,
"step": 420
},
{
"epoch": 0.06295797816659189,
"grad_norm": 2.120409254412015,
"learning_rate": 9.97163908533511e-06,
"loss": 0.2198,
"step": 421
},
{
"epoch": 0.06310752205772394,
"grad_norm": 1.7213130956608376,
"learning_rate": 9.971380919667806e-06,
"loss": 0.3355,
"step": 422
},
{
"epoch": 0.063257065948856,
"grad_norm": 1.6609701125154137,
"learning_rate": 9.971121587666627e-06,
"loss": 0.2354,
"step": 423
},
{
"epoch": 0.06340660983998804,
"grad_norm": 1.2809919353271448,
"learning_rate": 9.970861089392415e-06,
"loss": 0.2043,
"step": 424
},
{
"epoch": 0.06355615373112009,
"grad_norm": 1.137987748410028,
"learning_rate": 9.970599424906285e-06,
"loss": 0.1714,
"step": 425
},
{
"epoch": 0.06370569762225213,
"grad_norm": 2.241505455994119,
"learning_rate": 9.970336594269627e-06,
"loss": 0.559,
"step": 426
},
{
"epoch": 0.06385524151338418,
"grad_norm": 1.8145782296174282,
"learning_rate": 9.970072597544102e-06,
"loss": 0.4695,
"step": 427
},
{
"epoch": 0.06400478540451622,
"grad_norm": 2.6609160560733924,
"learning_rate": 9.96980743479165e-06,
"loss": 0.3927,
"step": 428
},
{
"epoch": 0.06415432929564828,
"grad_norm": 1.5902127205656447,
"learning_rate": 9.969541106074477e-06,
"loss": 0.3221,
"step": 429
},
{
"epoch": 0.06430387318678032,
"grad_norm": 1.354440824254012,
"learning_rate": 9.969273611455066e-06,
"loss": 0.1982,
"step": 430
},
{
"epoch": 0.06445341707791237,
"grad_norm": 2.1796464676908682,
"learning_rate": 9.969004950996175e-06,
"loss": 0.5947,
"step": 431
},
{
"epoch": 0.06460296096904442,
"grad_norm": 1.6772295444343943,
"learning_rate": 9.968735124760834e-06,
"loss": 0.3567,
"step": 432
},
{
"epoch": 0.06475250486017646,
"grad_norm": 2.326608368656497,
"learning_rate": 9.968464132812348e-06,
"loss": 0.3934,
"step": 433
},
{
"epoch": 0.0649020487513085,
"grad_norm": 1.9737750855760885,
"learning_rate": 9.968191975214293e-06,
"loss": 0.3936,
"step": 434
},
{
"epoch": 0.06505159264244055,
"grad_norm": 2.09687169461338,
"learning_rate": 9.967918652030522e-06,
"loss": 0.3644,
"step": 435
},
{
"epoch": 0.06520113653357261,
"grad_norm": 2.1122151786614967,
"learning_rate": 9.967644163325157e-06,
"loss": 0.2169,
"step": 436
},
{
"epoch": 0.06535068042470465,
"grad_norm": 1.8368706867911107,
"learning_rate": 9.967368509162595e-06,
"loss": 0.3956,
"step": 437
},
{
"epoch": 0.0655002243158367,
"grad_norm": 1.7823169737575542,
"learning_rate": 9.96709168960751e-06,
"loss": 0.232,
"step": 438
},
{
"epoch": 0.06564976820696874,
"grad_norm": 2.1565508943507194,
"learning_rate": 9.966813704724844e-06,
"loss": 0.2228,
"step": 439
},
{
"epoch": 0.06579931209810079,
"grad_norm": 2.2075342060994414,
"learning_rate": 9.966534554579816e-06,
"loss": 0.204,
"step": 440
},
{
"epoch": 0.06594885598923284,
"grad_norm": 2.0929887441012602,
"learning_rate": 9.966254239237917e-06,
"loss": 0.3946,
"step": 441
},
{
"epoch": 0.06609839988036488,
"grad_norm": 2.0382287962872834,
"learning_rate": 9.965972758764912e-06,
"loss": 0.4633,
"step": 442
},
{
"epoch": 0.06624794377149694,
"grad_norm": 1.2772439274586147,
"learning_rate": 9.96569011322684e-06,
"loss": 0.1784,
"step": 443
},
{
"epoch": 0.06639748766262898,
"grad_norm": 1.1024457344648066,
"learning_rate": 9.965406302690011e-06,
"loss": 0.1625,
"step": 444
},
{
"epoch": 0.06654703155376103,
"grad_norm": 1.2184559623271476,
"learning_rate": 9.965121327221007e-06,
"loss": 0.1959,
"step": 445
},
{
"epoch": 0.06669657544489307,
"grad_norm": 1.9215235980087064,
"learning_rate": 9.964835186886692e-06,
"loss": 0.2493,
"step": 446
},
{
"epoch": 0.06684611933602512,
"grad_norm": 2.1443052954533974,
"learning_rate": 9.964547881754194e-06,
"loss": 0.3611,
"step": 447
},
{
"epoch": 0.06699566322715717,
"grad_norm": 2.6967138020110712,
"learning_rate": 9.964259411890918e-06,
"loss": 0.5427,
"step": 448
},
{
"epoch": 0.06714520711828922,
"grad_norm": 1.688779610685555,
"learning_rate": 9.96396977736454e-06,
"loss": 0.2569,
"step": 449
},
{
"epoch": 0.06729475100942127,
"grad_norm": 2.1241026975378694,
"learning_rate": 9.963678978243014e-06,
"loss": 0.3863,
"step": 450
},
{
"epoch": 0.06744429490055331,
"grad_norm": 1.9388647656441462,
"learning_rate": 9.96338701459456e-06,
"loss": 0.2726,
"step": 451
},
{
"epoch": 0.06759383879168536,
"grad_norm": 1.4657993620125664,
"learning_rate": 9.963093886487683e-06,
"loss": 0.2338,
"step": 452
},
{
"epoch": 0.0677433826828174,
"grad_norm": 2.307173509923502,
"learning_rate": 9.962799593991146e-06,
"loss": 0.8039,
"step": 453
},
{
"epoch": 0.06789292657394945,
"grad_norm": 1.2669540134016812,
"learning_rate": 9.962504137173997e-06,
"loss": 0.169,
"step": 454
},
{
"epoch": 0.0680424704650815,
"grad_norm": 1.5981790001004936,
"learning_rate": 9.962207516105552e-06,
"loss": 0.2019,
"step": 455
},
{
"epoch": 0.06819201435621355,
"grad_norm": 1.740837427237262,
"learning_rate": 9.9619097308554e-06,
"loss": 0.2116,
"step": 456
},
{
"epoch": 0.0683415582473456,
"grad_norm": 1.9511590671787182,
"learning_rate": 9.961610781493407e-06,
"loss": 0.2611,
"step": 457
},
{
"epoch": 0.06849110213847764,
"grad_norm": 1.9814713665794252,
"learning_rate": 9.961310668089708e-06,
"loss": 0.3714,
"step": 458
},
{
"epoch": 0.06864064602960969,
"grad_norm": 2.755804773731971,
"learning_rate": 9.96100939071471e-06,
"loss": 0.5178,
"step": 459
},
{
"epoch": 0.06879018992074173,
"grad_norm": 2.5378159735000225,
"learning_rate": 9.960706949439101e-06,
"loss": 0.7334,
"step": 460
},
{
"epoch": 0.06893973381187378,
"grad_norm": 2.3557582569765003,
"learning_rate": 9.960403344333832e-06,
"loss": 0.5763,
"step": 461
},
{
"epoch": 0.06908927770300584,
"grad_norm": 1.6501148783544786,
"learning_rate": 9.960098575470131e-06,
"loss": 0.3681,
"step": 462
},
{
"epoch": 0.06923882159413788,
"grad_norm": 1.3521314881367383,
"learning_rate": 9.959792642919505e-06,
"loss": 0.216,
"step": 463
},
{
"epoch": 0.06938836548526993,
"grad_norm": 1.9967115308447656,
"learning_rate": 9.959485546753724e-06,
"loss": 0.4411,
"step": 464
},
{
"epoch": 0.06953790937640197,
"grad_norm": 1.6934835527025132,
"learning_rate": 9.959177287044839e-06,
"loss": 0.3013,
"step": 465
},
{
"epoch": 0.06968745326753402,
"grad_norm": 2.1881268216288703,
"learning_rate": 9.958867863865168e-06,
"loss": 0.386,
"step": 466
},
{
"epoch": 0.06983699715866606,
"grad_norm": 1.746249573857031,
"learning_rate": 9.958557277287307e-06,
"loss": 0.3486,
"step": 467
},
{
"epoch": 0.06998654104979811,
"grad_norm": 1.3309239290400467,
"learning_rate": 9.958245527384118e-06,
"loss": 0.2512,
"step": 468
},
{
"epoch": 0.07013608494093017,
"grad_norm": 1.780095751208227,
"learning_rate": 9.957932614228746e-06,
"loss": 0.3579,
"step": 469
},
{
"epoch": 0.07028562883206221,
"grad_norm": 2.058627302052003,
"learning_rate": 9.957618537894602e-06,
"loss": 0.2234,
"step": 470
},
{
"epoch": 0.07043517272319426,
"grad_norm": 2.1643867800571286,
"learning_rate": 9.95730329845537e-06,
"loss": 0.2658,
"step": 471
},
{
"epoch": 0.0705847166143263,
"grad_norm": 1.9162877246393155,
"learning_rate": 9.956986895985009e-06,
"loss": 0.3514,
"step": 472
},
{
"epoch": 0.07073426050545835,
"grad_norm": 2.0198300655217474,
"learning_rate": 9.95666933055775e-06,
"loss": 0.4191,
"step": 473
},
{
"epoch": 0.0708838043965904,
"grad_norm": 1.8174642496449622,
"learning_rate": 9.956350602248095e-06,
"loss": 0.1802,
"step": 474
},
{
"epoch": 0.07103334828772245,
"grad_norm": 1.7641599345266465,
"learning_rate": 9.956030711130824e-06,
"loss": 0.2181,
"step": 475
},
{
"epoch": 0.0711828921788545,
"grad_norm": 1.5149058769435404,
"learning_rate": 9.955709657280985e-06,
"loss": 0.2068,
"step": 476
},
{
"epoch": 0.07133243606998654,
"grad_norm": 2.14267612952952,
"learning_rate": 9.955387440773902e-06,
"loss": 0.2799,
"step": 477
},
{
"epoch": 0.07148197996111859,
"grad_norm": 1.8794948861297893,
"learning_rate": 9.955064061685166e-06,
"loss": 0.3437,
"step": 478
},
{
"epoch": 0.07163152385225063,
"grad_norm": 1.595856928796192,
"learning_rate": 9.954739520090649e-06,
"loss": 0.1741,
"step": 479
},
{
"epoch": 0.07178106774338268,
"grad_norm": 1.4775459266699813,
"learning_rate": 9.95441381606649e-06,
"loss": 0.2009,
"step": 480
},
{
"epoch": 0.07193061163451472,
"grad_norm": 1.4624583034603231,
"learning_rate": 9.954086949689102e-06,
"loss": 0.2413,
"step": 481
},
{
"epoch": 0.07208015552564678,
"grad_norm": 1.5685428117813849,
"learning_rate": 9.953758921035171e-06,
"loss": 0.2381,
"step": 482
},
{
"epoch": 0.07222969941677883,
"grad_norm": 2.0490413587537524,
"learning_rate": 9.953429730181653e-06,
"loss": 0.4092,
"step": 483
},
{
"epoch": 0.07237924330791087,
"grad_norm": 2.605633491672469,
"learning_rate": 9.953099377205786e-06,
"loss": 0.56,
"step": 484
},
{
"epoch": 0.07252878719904292,
"grad_norm": 1.6836189923086853,
"learning_rate": 9.952767862185071e-06,
"loss": 0.3514,
"step": 485
},
{
"epoch": 0.07267833109017496,
"grad_norm": 2.165692386982445,
"learning_rate": 9.952435185197281e-06,
"loss": 0.4363,
"step": 486
},
{
"epoch": 0.07282787498130701,
"grad_norm": 2.328987566639375,
"learning_rate": 9.952101346320471e-06,
"loss": 0.5953,
"step": 487
},
{
"epoch": 0.07297741887243907,
"grad_norm": 1.857109300243422,
"learning_rate": 9.951766345632957e-06,
"loss": 0.4125,
"step": 488
},
{
"epoch": 0.07312696276357111,
"grad_norm": 1.780608988332075,
"learning_rate": 9.951430183213338e-06,
"loss": 0.2793,
"step": 489
},
{
"epoch": 0.07327650665470316,
"grad_norm": 1.2718866410706833,
"learning_rate": 9.951092859140479e-06,
"loss": 0.1878,
"step": 490
},
{
"epoch": 0.0734260505458352,
"grad_norm": 1.389385388824981,
"learning_rate": 9.95075437349352e-06,
"loss": 0.1922,
"step": 491
},
{
"epoch": 0.07357559443696725,
"grad_norm": 1.2364018773804621,
"learning_rate": 9.950414726351873e-06,
"loss": 0.1972,
"step": 492
},
{
"epoch": 0.0737251383280993,
"grad_norm": 1.6438922682719497,
"learning_rate": 9.95007391779522e-06,
"loss": 0.3835,
"step": 493
},
{
"epoch": 0.07387468221923134,
"grad_norm": 1.9223258334837023,
"learning_rate": 9.949731947903523e-06,
"loss": 0.5421,
"step": 494
},
{
"epoch": 0.0740242261103634,
"grad_norm": 2.1294087718057955,
"learning_rate": 9.949388816757009e-06,
"loss": 0.6584,
"step": 495
},
{
"epoch": 0.07417377000149544,
"grad_norm": 1.9620720670123732,
"learning_rate": 9.949044524436178e-06,
"loss": 0.3427,
"step": 496
},
{
"epoch": 0.07432331389262749,
"grad_norm": 1.8767982308843718,
"learning_rate": 9.948699071021806e-06,
"loss": 0.2221,
"step": 497
},
{
"epoch": 0.07447285778375953,
"grad_norm": 1.5717369659821445,
"learning_rate": 9.948352456594938e-06,
"loss": 0.3915,
"step": 498
},
{
"epoch": 0.07462240167489158,
"grad_norm": 1.9105988284269253,
"learning_rate": 9.948004681236896e-06,
"loss": 0.4049,
"step": 499
},
{
"epoch": 0.07477194556602362,
"grad_norm": 2.051255434710168,
"learning_rate": 9.94765574502927e-06,
"loss": 0.263,
"step": 500
},
{
"epoch": 0.07492148945715567,
"grad_norm": 1.1727115808022262,
"learning_rate": 9.947305648053924e-06,
"loss": 0.2061,
"step": 501
},
{
"epoch": 0.07507103334828773,
"grad_norm": 2.3851218898633566,
"learning_rate": 9.946954390392995e-06,
"loss": 0.3587,
"step": 502
},
{
"epoch": 0.07522057723941977,
"grad_norm": 2.668333899893354,
"learning_rate": 9.94660197212889e-06,
"loss": 0.279,
"step": 503
},
{
"epoch": 0.07537012113055182,
"grad_norm": 2.324044177768054,
"learning_rate": 9.946248393344289e-06,
"loss": 0.5219,
"step": 504
},
{
"epoch": 0.07551966502168386,
"grad_norm": 2.252535927387564,
"learning_rate": 9.945893654122147e-06,
"loss": 0.4462,
"step": 505
},
{
"epoch": 0.07566920891281591,
"grad_norm": 1.2553962948323492,
"learning_rate": 9.945537754545689e-06,
"loss": 0.1829,
"step": 506
},
{
"epoch": 0.07581875280394795,
"grad_norm": 2.009514792075129,
"learning_rate": 9.94518069469841e-06,
"loss": 0.334,
"step": 507
},
{
"epoch": 0.07596829669508001,
"grad_norm": 1.7045023449590413,
"learning_rate": 9.944822474664082e-06,
"loss": 0.3202,
"step": 508
},
{
"epoch": 0.07611784058621206,
"grad_norm": 1.0508191419172128,
"learning_rate": 9.944463094526747e-06,
"loss": 0.205,
"step": 509
},
{
"epoch": 0.0762673844773441,
"grad_norm": 1.6097293192900886,
"learning_rate": 9.944102554370718e-06,
"loss": 0.2324,
"step": 510
},
{
"epoch": 0.07641692836847615,
"grad_norm": 1.9399148366487866,
"learning_rate": 9.943740854280582e-06,
"loss": 0.4526,
"step": 511
},
{
"epoch": 0.07656647225960819,
"grad_norm": 2.0362256511499335,
"learning_rate": 9.943377994341197e-06,
"loss": 0.3979,
"step": 512
},
{
"epoch": 0.07671601615074024,
"grad_norm": 1.5296316888698338,
"learning_rate": 9.943013974637693e-06,
"loss": 0.3789,
"step": 513
},
{
"epoch": 0.07686556004187228,
"grad_norm": 1.496691000675503,
"learning_rate": 9.942648795255473e-06,
"loss": 0.2497,
"step": 514
},
{
"epoch": 0.07701510393300434,
"grad_norm": 1.4146486247851384,
"learning_rate": 9.942282456280212e-06,
"loss": 0.3088,
"step": 515
},
{
"epoch": 0.07716464782413639,
"grad_norm": 1.3671722765483707,
"learning_rate": 9.941914957797855e-06,
"loss": 0.2076,
"step": 516
},
{
"epoch": 0.07731419171526843,
"grad_norm": 1.8485057563465108,
"learning_rate": 9.941546299894623e-06,
"loss": 0.3676,
"step": 517
},
{
"epoch": 0.07746373560640048,
"grad_norm": 2.0438588429845255,
"learning_rate": 9.941176482657005e-06,
"loss": 0.4905,
"step": 518
},
{
"epoch": 0.07761327949753252,
"grad_norm": 1.3215533906334498,
"learning_rate": 9.940805506171765e-06,
"loss": 0.2028,
"step": 519
},
{
"epoch": 0.07776282338866457,
"grad_norm": 2.499241081917891,
"learning_rate": 9.940433370525937e-06,
"loss": 0.4323,
"step": 520
},
{
"epoch": 0.07791236727979663,
"grad_norm": 1.4654220634749195,
"learning_rate": 9.940060075806827e-06,
"loss": 0.1928,
"step": 521
},
{
"epoch": 0.07806191117092867,
"grad_norm": 2.32501667334618,
"learning_rate": 9.939685622102013e-06,
"loss": 0.6039,
"step": 522
},
{
"epoch": 0.07821145506206072,
"grad_norm": 2.0353313744113644,
"learning_rate": 9.939310009499348e-06,
"loss": 0.434,
"step": 523
},
{
"epoch": 0.07836099895319276,
"grad_norm": 1.5916248439200642,
"learning_rate": 9.938933238086952e-06,
"loss": 0.2484,
"step": 524
},
{
"epoch": 0.07851054284432481,
"grad_norm": 1.510761606083,
"learning_rate": 9.938555307953221e-06,
"loss": 0.2761,
"step": 525
},
{
"epoch": 0.07866008673545685,
"grad_norm": 1.6041562012438388,
"learning_rate": 9.93817621918682e-06,
"loss": 0.3032,
"step": 526
},
{
"epoch": 0.0788096306265889,
"grad_norm": 1.5831322947558841,
"learning_rate": 9.937795971876686e-06,
"loss": 0.3486,
"step": 527
},
{
"epoch": 0.07895917451772096,
"grad_norm": 2.2247878916503856,
"learning_rate": 9.93741456611203e-06,
"loss": 0.4087,
"step": 528
},
{
"epoch": 0.079108718408853,
"grad_norm": 2.152252638423622,
"learning_rate": 9.937032001982334e-06,
"loss": 0.5629,
"step": 529
},
{
"epoch": 0.07925826229998505,
"grad_norm": 2.0483514105705525,
"learning_rate": 9.93664827957735e-06,
"loss": 0.5279,
"step": 530
},
{
"epoch": 0.07940780619111709,
"grad_norm": 1.2448870158155207,
"learning_rate": 9.936263398987103e-06,
"loss": 0.3744,
"step": 531
},
{
"epoch": 0.07955735008224914,
"grad_norm": 0.9489762178863248,
"learning_rate": 9.93587736030189e-06,
"loss": 0.1631,
"step": 532
},
{
"epoch": 0.07970689397338118,
"grad_norm": 1.3545590640653586,
"learning_rate": 9.935490163612279e-06,
"loss": 0.1975,
"step": 533
},
{
"epoch": 0.07985643786451324,
"grad_norm": 1.3663228011672384,
"learning_rate": 9.93510180900911e-06,
"loss": 0.184,
"step": 534
},
{
"epoch": 0.08000598175564529,
"grad_norm": 1.5768436668872405,
"learning_rate": 9.934712296583497e-06,
"loss": 0.3183,
"step": 535
},
{
"epoch": 0.08015552564677733,
"grad_norm": 1.926347057489139,
"learning_rate": 9.93432162642682e-06,
"loss": 0.3305,
"step": 536
},
{
"epoch": 0.08030506953790938,
"grad_norm": 2.0791782850566474,
"learning_rate": 9.933929798630738e-06,
"loss": 0.5009,
"step": 537
},
{
"epoch": 0.08045461342904142,
"grad_norm": 2.1023331544425523,
"learning_rate": 9.933536813287172e-06,
"loss": 0.4292,
"step": 538
},
{
"epoch": 0.08060415732017347,
"grad_norm": 2.8605361415271493,
"learning_rate": 9.933142670488324e-06,
"loss": 0.2666,
"step": 539
},
{
"epoch": 0.08075370121130551,
"grad_norm": 2.7087693572573968,
"learning_rate": 9.932747370326664e-06,
"loss": 0.2544,
"step": 540
},
{
"epoch": 0.08090324510243757,
"grad_norm": 1.5804074183588281,
"learning_rate": 9.932350912894932e-06,
"loss": 0.2089,
"step": 541
},
{
"epoch": 0.08105278899356962,
"grad_norm": 1.6448934387271092,
"learning_rate": 9.931953298286141e-06,
"loss": 0.181,
"step": 542
},
{
"epoch": 0.08120233288470166,
"grad_norm": 1.373017928034036,
"learning_rate": 9.931554526593576e-06,
"loss": 0.3218,
"step": 543
},
{
"epoch": 0.0813518767758337,
"grad_norm": 1.4895748889012388,
"learning_rate": 9.931154597910791e-06,
"loss": 0.2472,
"step": 544
},
{
"epoch": 0.08150142066696575,
"grad_norm": 2.064608760225509,
"learning_rate": 9.930753512331615e-06,
"loss": 0.3765,
"step": 545
},
{
"epoch": 0.0816509645580978,
"grad_norm": 1.6526846905937504,
"learning_rate": 9.930351269950144e-06,
"loss": 0.3177,
"step": 546
},
{
"epoch": 0.08180050844922986,
"grad_norm": 2.047798829134187,
"learning_rate": 9.92994787086075e-06,
"loss": 0.3192,
"step": 547
},
{
"epoch": 0.0819500523403619,
"grad_norm": 2.122394373762569,
"learning_rate": 9.929543315158073e-06,
"loss": 0.5554,
"step": 548
},
{
"epoch": 0.08209959623149395,
"grad_norm": 2.311960518258969,
"learning_rate": 9.929137602937028e-06,
"loss": 0.3797,
"step": 549
},
{
"epoch": 0.08224914012262599,
"grad_norm": 1.8449832380251867,
"learning_rate": 9.928730734292797e-06,
"loss": 0.3894,
"step": 550
},
{
"epoch": 0.08239868401375804,
"grad_norm": 1.995255157883457,
"learning_rate": 9.928322709320834e-06,
"loss": 0.3925,
"step": 551
},
{
"epoch": 0.08254822790489008,
"grad_norm": 2.755405061449222,
"learning_rate": 9.92791352811687e-06,
"loss": 0.6899,
"step": 552
},
{
"epoch": 0.08269777179602213,
"grad_norm": 1.2254981142470793,
"learning_rate": 9.9275031907769e-06,
"loss": 0.2225,
"step": 553
},
{
"epoch": 0.08284731568715419,
"grad_norm": 1.9323036995913243,
"learning_rate": 9.927091697397192e-06,
"loss": 0.3865,
"step": 554
},
{
"epoch": 0.08299685957828623,
"grad_norm": 2.0962863974348593,
"learning_rate": 9.926679048074289e-06,
"loss": 0.4,
"step": 555
},
{
"epoch": 0.08314640346941828,
"grad_norm": 1.5847691098448267,
"learning_rate": 9.926265242904998e-06,
"loss": 0.247,
"step": 556
},
{
"epoch": 0.08329594736055032,
"grad_norm": 2.5967594290859903,
"learning_rate": 9.925850281986408e-06,
"loss": 0.2083,
"step": 557
},
{
"epoch": 0.08344549125168237,
"grad_norm": 2.0426826933231226,
"learning_rate": 9.925434165415868e-06,
"loss": 0.449,
"step": 558
},
{
"epoch": 0.08359503514281441,
"grad_norm": 1.7693278888452375,
"learning_rate": 9.925016893291007e-06,
"loss": 0.2789,
"step": 559
},
{
"epoch": 0.08374457903394647,
"grad_norm": 1.6227416269049326,
"learning_rate": 9.924598465709717e-06,
"loss": 0.2209,
"step": 560
},
{
"epoch": 0.08389412292507852,
"grad_norm": 1.7055307729140163,
"learning_rate": 9.924178882770166e-06,
"loss": 0.3554,
"step": 561
},
{
"epoch": 0.08404366681621056,
"grad_norm": 1.9245436136675982,
"learning_rate": 9.923758144570792e-06,
"loss": 0.5343,
"step": 562
},
{
"epoch": 0.0841932107073426,
"grad_norm": 1.3916186974123048,
"learning_rate": 9.923336251210306e-06,
"loss": 0.2328,
"step": 563
},
{
"epoch": 0.08434275459847465,
"grad_norm": 1.8724253939088875,
"learning_rate": 9.92291320278769e-06,
"loss": 0.2691,
"step": 564
},
{
"epoch": 0.0844922984896067,
"grad_norm": 1.545927153493535,
"learning_rate": 9.922488999402191e-06,
"loss": 0.2049,
"step": 565
},
{
"epoch": 0.08464184238073874,
"grad_norm": 2.216312298348258,
"learning_rate": 9.922063641153332e-06,
"loss": 0.5844,
"step": 566
},
{
"epoch": 0.0847913862718708,
"grad_norm": 1.2444734652143745,
"learning_rate": 9.921637128140909e-06,
"loss": 0.2872,
"step": 567
},
{
"epoch": 0.08494093016300285,
"grad_norm": 2.133851301389792,
"learning_rate": 9.921209460464983e-06,
"loss": 0.2418,
"step": 568
},
{
"epoch": 0.08509047405413489,
"grad_norm": 1.5462263702909163,
"learning_rate": 9.92078063822589e-06,
"loss": 0.3438,
"step": 569
},
{
"epoch": 0.08524001794526694,
"grad_norm": 2.341879963295622,
"learning_rate": 9.920350661524237e-06,
"loss": 0.5783,
"step": 570
},
{
"epoch": 0.08538956183639898,
"grad_norm": 1.7633187330163729,
"learning_rate": 9.919919530460899e-06,
"loss": 0.3503,
"step": 571
},
{
"epoch": 0.08553910572753103,
"grad_norm": 2.1676160714531107,
"learning_rate": 9.919487245137024e-06,
"loss": 0.2098,
"step": 572
},
{
"epoch": 0.08568864961866307,
"grad_norm": 2.198855334486466,
"learning_rate": 9.919053805654029e-06,
"loss": 0.3876,
"step": 573
},
{
"epoch": 0.08583819350979513,
"grad_norm": 1.821472616891953,
"learning_rate": 9.918619212113607e-06,
"loss": 0.391,
"step": 574
},
{
"epoch": 0.08598773740092717,
"grad_norm": 1.4553776733520012,
"learning_rate": 9.918183464617714e-06,
"loss": 0.2032,
"step": 575
},
{
"epoch": 0.08613728129205922,
"grad_norm": 1.5817735791823646,
"learning_rate": 9.917746563268581e-06,
"loss": 0.2658,
"step": 576
},
{
"epoch": 0.08628682518319127,
"grad_norm": 2.255323258805483,
"learning_rate": 9.917308508168712e-06,
"loss": 0.39,
"step": 577
},
{
"epoch": 0.08643636907432331,
"grad_norm": 1.699175902078527,
"learning_rate": 9.916869299420875e-06,
"loss": 0.1906,
"step": 578
},
{
"epoch": 0.08658591296545536,
"grad_norm": 1.5572993513277051,
"learning_rate": 9.916428937128117e-06,
"loss": 0.3438,
"step": 579
},
{
"epoch": 0.08673545685658741,
"grad_norm": 1.5095119263162684,
"learning_rate": 9.915987421393747e-06,
"loss": 0.272,
"step": 580
},
{
"epoch": 0.08688500074771946,
"grad_norm": 2.8137128440101735,
"learning_rate": 9.91554475232135e-06,
"loss": 0.3833,
"step": 581
},
{
"epoch": 0.0870345446388515,
"grad_norm": 1.845156278788705,
"learning_rate": 9.915100930014786e-06,
"loss": 0.4658,
"step": 582
},
{
"epoch": 0.08718408852998355,
"grad_norm": 1.7624433765379017,
"learning_rate": 9.914655954578171e-06,
"loss": 0.3968,
"step": 583
},
{
"epoch": 0.0873336324211156,
"grad_norm": 1.7915618837196812,
"learning_rate": 9.914209826115906e-06,
"loss": 0.4901,
"step": 584
},
{
"epoch": 0.08748317631224764,
"grad_norm": 1.8335500777788887,
"learning_rate": 9.913762544732654e-06,
"loss": 0.249,
"step": 585
},
{
"epoch": 0.08763272020337969,
"grad_norm": 1.5116580783389033,
"learning_rate": 9.913314110533355e-06,
"loss": 0.3999,
"step": 586
},
{
"epoch": 0.08778226409451174,
"grad_norm": 1.9828537343745032,
"learning_rate": 9.912864523623214e-06,
"loss": 0.4153,
"step": 587
},
{
"epoch": 0.08793180798564379,
"grad_norm": 1.6056147158647165,
"learning_rate": 9.912413784107709e-06,
"loss": 0.357,
"step": 588
},
{
"epoch": 0.08808135187677583,
"grad_norm": 1.7642170812152784,
"learning_rate": 9.911961892092587e-06,
"loss": 0.3425,
"step": 589
},
{
"epoch": 0.08823089576790788,
"grad_norm": 1.925307511563271,
"learning_rate": 9.911508847683867e-06,
"loss": 0.4476,
"step": 590
},
{
"epoch": 0.08838043965903992,
"grad_norm": 1.9824372539957273,
"learning_rate": 9.911054650987837e-06,
"loss": 0.4597,
"step": 591
},
{
"epoch": 0.08852998355017197,
"grad_norm": 1.5805088418089035,
"learning_rate": 9.910599302111057e-06,
"loss": 0.1935,
"step": 592
},
{
"epoch": 0.08867952744130403,
"grad_norm": 2.157404890931188,
"learning_rate": 9.910142801160355e-06,
"loss": 0.3443,
"step": 593
},
{
"epoch": 0.08882907133243607,
"grad_norm": 2.094900000445731,
"learning_rate": 9.909685148242831e-06,
"loss": 0.404,
"step": 594
},
{
"epoch": 0.08897861522356812,
"grad_norm": 2.336415519412793,
"learning_rate": 9.909226343465856e-06,
"loss": 0.6382,
"step": 595
},
{
"epoch": 0.08912815911470016,
"grad_norm": 2.0552137049182497,
"learning_rate": 9.908766386937067e-06,
"loss": 0.3908,
"step": 596
},
{
"epoch": 0.08927770300583221,
"grad_norm": 1.1564393734179468,
"learning_rate": 9.908305278764376e-06,
"loss": 0.2457,
"step": 597
},
{
"epoch": 0.08942724689696425,
"grad_norm": 1.8704284289450437,
"learning_rate": 9.907843019055966e-06,
"loss": 0.3604,
"step": 598
},
{
"epoch": 0.0895767907880963,
"grad_norm": 1.295042190600909,
"learning_rate": 9.907379607920281e-06,
"loss": 0.2075,
"step": 599
},
{
"epoch": 0.08972633467922836,
"grad_norm": 1.8305770820800886,
"learning_rate": 9.90691504546605e-06,
"loss": 0.2698,
"step": 600
},
{
"epoch": 0.0898758785703604,
"grad_norm": 1.7240290275544472,
"learning_rate": 9.906449331802256e-06,
"loss": 0.2504,
"step": 601
},
{
"epoch": 0.09002542246149245,
"grad_norm": 1.0036789417827203,
"learning_rate": 9.905982467038167e-06,
"loss": 0.195,
"step": 602
},
{
"epoch": 0.0901749663526245,
"grad_norm": 1.6777253578130231,
"learning_rate": 9.905514451283308e-06,
"loss": 0.2436,
"step": 603
},
{
"epoch": 0.09032451024375654,
"grad_norm": 1.9190873052270145,
"learning_rate": 9.905045284647483e-06,
"loss": 0.4006,
"step": 604
},
{
"epoch": 0.09047405413488858,
"grad_norm": 1.77001911452716,
"learning_rate": 9.904574967240764e-06,
"loss": 0.3703,
"step": 605
},
{
"epoch": 0.09062359802602064,
"grad_norm": 1.3114492277508998,
"learning_rate": 9.904103499173487e-06,
"loss": 0.2323,
"step": 606
},
{
"epoch": 0.09077314191715269,
"grad_norm": 1.6694643051834908,
"learning_rate": 9.90363088055627e-06,
"loss": 0.2881,
"step": 607
},
{
"epoch": 0.09092268580828473,
"grad_norm": 1.4448454411512122,
"learning_rate": 9.903157111499988e-06,
"loss": 0.2341,
"step": 608
},
{
"epoch": 0.09107222969941678,
"grad_norm": 1.8302982894061834,
"learning_rate": 9.902682192115795e-06,
"loss": 0.3497,
"step": 609
},
{
"epoch": 0.09122177359054882,
"grad_norm": 1.4089802820999182,
"learning_rate": 9.902206122515113e-06,
"loss": 0.1565,
"step": 610
},
{
"epoch": 0.09137131748168087,
"grad_norm": 2.275670976517465,
"learning_rate": 9.901728902809627e-06,
"loss": 0.482,
"step": 611
},
{
"epoch": 0.09152086137281291,
"grad_norm": 2.3916744409549997,
"learning_rate": 9.901250533111301e-06,
"loss": 0.539,
"step": 612
},
{
"epoch": 0.09167040526394497,
"grad_norm": 1.110965438282227,
"learning_rate": 9.900771013532367e-06,
"loss": 0.2257,
"step": 613
},
{
"epoch": 0.09181994915507702,
"grad_norm": 1.6169969209154105,
"learning_rate": 9.900290344185321e-06,
"loss": 0.2316,
"step": 614
},
{
"epoch": 0.09196949304620906,
"grad_norm": 1.390950490331229,
"learning_rate": 9.899808525182935e-06,
"loss": 0.1735,
"step": 615
},
{
"epoch": 0.09211903693734111,
"grad_norm": 1.26641152514348,
"learning_rate": 9.899325556638247e-06,
"loss": 0.2269,
"step": 616
},
{
"epoch": 0.09226858082847315,
"grad_norm": 1.107259968960053,
"learning_rate": 9.898841438664568e-06,
"loss": 0.2082,
"step": 617
},
{
"epoch": 0.0924181247196052,
"grad_norm": 1.6779136428714192,
"learning_rate": 9.898356171375473e-06,
"loss": 0.3744,
"step": 618
},
{
"epoch": 0.09256766861073726,
"grad_norm": 1.8012739115801626,
"learning_rate": 9.897869754884816e-06,
"loss": 0.2438,
"step": 619
},
{
"epoch": 0.0927172125018693,
"grad_norm": 1.6400812519548655,
"learning_rate": 9.89738218930671e-06,
"loss": 0.3692,
"step": 620
},
{
"epoch": 0.09286675639300135,
"grad_norm": 2.7659374426954972,
"learning_rate": 9.896893474755547e-06,
"loss": 0.5873,
"step": 621
},
{
"epoch": 0.0930163002841334,
"grad_norm": 3.020452608035097,
"learning_rate": 9.89640361134598e-06,
"loss": 0.4177,
"step": 622
},
{
"epoch": 0.09316584417526544,
"grad_norm": 1.4907614824403637,
"learning_rate": 9.895912599192937e-06,
"loss": 0.2516,
"step": 623
},
{
"epoch": 0.09331538806639748,
"grad_norm": 1.6636615032724535,
"learning_rate": 9.895420438411616e-06,
"loss": 0.1935,
"step": 624
},
{
"epoch": 0.09346493195752953,
"grad_norm": 1.9719905447621995,
"learning_rate": 9.89492712911748e-06,
"loss": 0.2135,
"step": 625
},
{
"epoch": 0.09361447584866159,
"grad_norm": 1.3681787330772102,
"learning_rate": 9.894432671426264e-06,
"loss": 0.208,
"step": 626
},
{
"epoch": 0.09376401973979363,
"grad_norm": 2.0793649946453043,
"learning_rate": 9.893937065453976e-06,
"loss": 0.3719,
"step": 627
},
{
"epoch": 0.09391356363092568,
"grad_norm": 1.685584025343787,
"learning_rate": 9.893440311316887e-06,
"loss": 0.2164,
"step": 628
},
{
"epoch": 0.09406310752205772,
"grad_norm": 1.2145425693019332,
"learning_rate": 9.892942409131541e-06,
"loss": 0.1725,
"step": 629
},
{
"epoch": 0.09421265141318977,
"grad_norm": 1.1438517718036314,
"learning_rate": 9.892443359014752e-06,
"loss": 0.2367,
"step": 630
},
{
"epoch": 0.09436219530432181,
"grad_norm": 1.4416913213257094,
"learning_rate": 9.8919431610836e-06,
"loss": 0.2254,
"step": 631
},
{
"epoch": 0.09451173919545386,
"grad_norm": 1.2656296241346114,
"learning_rate": 9.891441815455436e-06,
"loss": 0.2485,
"step": 632
},
{
"epoch": 0.09466128308658592,
"grad_norm": 1.4276056880724206,
"learning_rate": 9.890939322247881e-06,
"loss": 0.1908,
"step": 633
},
{
"epoch": 0.09481082697771796,
"grad_norm": 1.8185771152087218,
"learning_rate": 9.890435681578827e-06,
"loss": 0.2096,
"step": 634
},
{
"epoch": 0.09496037086885001,
"grad_norm": 1.2794518689910337,
"learning_rate": 9.88993089356643e-06,
"loss": 0.2394,
"step": 635
},
{
"epoch": 0.09510991475998205,
"grad_norm": 2.0227594086297738,
"learning_rate": 9.88942495832912e-06,
"loss": 0.59,
"step": 636
},
{
"epoch": 0.0952594586511141,
"grad_norm": 1.3323082817593526,
"learning_rate": 9.888917875985593e-06,
"loss": 0.2073,
"step": 637
},
{
"epoch": 0.09540900254224614,
"grad_norm": 1.7884206661676574,
"learning_rate": 9.888409646654818e-06,
"loss": 0.3897,
"step": 638
},
{
"epoch": 0.0955585464333782,
"grad_norm": 2.124144136353745,
"learning_rate": 9.887900270456025e-06,
"loss": 0.5683,
"step": 639
},
{
"epoch": 0.09570809032451025,
"grad_norm": 1.4793433841619534,
"learning_rate": 9.887389747508725e-06,
"loss": 0.3727,
"step": 640
},
{
"epoch": 0.09585763421564229,
"grad_norm": 1.0661747667222115,
"learning_rate": 9.88687807793269e-06,
"loss": 0.1983,
"step": 641
},
{
"epoch": 0.09600717810677434,
"grad_norm": 1.615153009655538,
"learning_rate": 9.886365261847957e-06,
"loss": 0.3675,
"step": 642
},
{
"epoch": 0.09615672199790638,
"grad_norm": 1.4963878387365324,
"learning_rate": 9.885851299374844e-06,
"loss": 0.1805,
"step": 643
},
{
"epoch": 0.09630626588903843,
"grad_norm": 1.8529323065992462,
"learning_rate": 9.88533619063393e-06,
"loss": 0.391,
"step": 644
},
{
"epoch": 0.09645580978017047,
"grad_norm": 2.4764246014732145,
"learning_rate": 9.884819935746063e-06,
"loss": 0.2605,
"step": 645
},
{
"epoch": 0.09660535367130253,
"grad_norm": 1.904672440883197,
"learning_rate": 9.884302534832361e-06,
"loss": 0.3935,
"step": 646
},
{
"epoch": 0.09675489756243458,
"grad_norm": 1.9431435460380113,
"learning_rate": 9.883783988014216e-06,
"loss": 0.2092,
"step": 647
},
{
"epoch": 0.09690444145356662,
"grad_norm": 2.0946695671241553,
"learning_rate": 9.883264295413278e-06,
"loss": 0.3957,
"step": 648
},
{
"epoch": 0.09705398534469867,
"grad_norm": 1.0944344711946927,
"learning_rate": 9.882743457151476e-06,
"loss": 0.202,
"step": 649
},
{
"epoch": 0.09720352923583071,
"grad_norm": 1.5147259026498003,
"learning_rate": 9.882221473351e-06,
"loss": 0.3029,
"step": 650
},
{
"epoch": 0.09735307312696276,
"grad_norm": 1.3452835965457643,
"learning_rate": 9.881698344134316e-06,
"loss": 0.2159,
"step": 651
},
{
"epoch": 0.09750261701809482,
"grad_norm": 1.7952640402406481,
"learning_rate": 9.881174069624155e-06,
"loss": 0.4006,
"step": 652
},
{
"epoch": 0.09765216090922686,
"grad_norm": 2.468540255171398,
"learning_rate": 9.880648649943515e-06,
"loss": 0.4393,
"step": 653
},
{
"epoch": 0.09780170480035891,
"grad_norm": 1.5332585075726441,
"learning_rate": 9.880122085215664e-06,
"loss": 0.2401,
"step": 654
},
{
"epoch": 0.09795124869149095,
"grad_norm": 1.5882881108110953,
"learning_rate": 9.87959437556414e-06,
"loss": 0.2078,
"step": 655
},
{
"epoch": 0.098100792582623,
"grad_norm": 1.7962702189497488,
"learning_rate": 9.87906552111275e-06,
"loss": 0.4793,
"step": 656
},
{
"epoch": 0.09825033647375504,
"grad_norm": 1.860004859316795,
"learning_rate": 9.878535521985568e-06,
"loss": 0.2388,
"step": 657
},
{
"epoch": 0.09839988036488709,
"grad_norm": 1.9861019609665855,
"learning_rate": 9.878004378306934e-06,
"loss": 0.3721,
"step": 658
},
{
"epoch": 0.09854942425601915,
"grad_norm": 1.5404208138898199,
"learning_rate": 9.877472090201463e-06,
"loss": 0.3534,
"step": 659
},
{
"epoch": 0.09869896814715119,
"grad_norm": 3.0119825067072306,
"learning_rate": 9.876938657794036e-06,
"loss": 0.6732,
"step": 660
},
{
"epoch": 0.09884851203828324,
"grad_norm": 1.5069735817087104,
"learning_rate": 9.876404081209796e-06,
"loss": 0.4004,
"step": 661
},
{
"epoch": 0.09899805592941528,
"grad_norm": 1.6856753387650372,
"learning_rate": 9.875868360574164e-06,
"loss": 0.2942,
"step": 662
},
{
"epoch": 0.09914759982054733,
"grad_norm": 1.6896901311725145,
"learning_rate": 9.875331496012822e-06,
"loss": 0.239,
"step": 663
},
{
"epoch": 0.09929714371167937,
"grad_norm": 2.2770505228904225,
"learning_rate": 9.87479348765173e-06,
"loss": 0.4755,
"step": 664
},
{
"epoch": 0.09944668760281143,
"grad_norm": 1.9016485099179228,
"learning_rate": 9.874254335617102e-06,
"loss": 0.4645,
"step": 665
},
{
"epoch": 0.09959623149394348,
"grad_norm": 1.6638896812103354,
"learning_rate": 9.873714040035434e-06,
"loss": 0.2512,
"step": 666
},
{
"epoch": 0.09974577538507552,
"grad_norm": 1.7233554952000107,
"learning_rate": 9.873172601033482e-06,
"loss": 0.3958,
"step": 667
},
{
"epoch": 0.09989531927620757,
"grad_norm": 1.7250170911584946,
"learning_rate": 9.872630018738271e-06,
"loss": 0.3115,
"step": 668
},
{
"epoch": 0.10004486316733961,
"grad_norm": 1.8843746906489027,
"learning_rate": 9.872086293277101e-06,
"loss": 0.3789,
"step": 669
}
],
"logging_steps": 1.0,
"max_steps": 6687,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 669,
"total_flos": 41613770170368.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}