9b-136 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
de6e7f3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3564,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016835016835016834,
"grad_norm": 11.373913764953613,
"learning_rate": 1.1173184357541899e-08,
"loss": 1.7052544355392456,
"step": 2
},
{
"epoch": 0.003367003367003367,
"grad_norm": 7.921729564666748,
"learning_rate": 3.3519553072625695e-08,
"loss": 1.2431142330169678,
"step": 4
},
{
"epoch": 0.005050505050505051,
"grad_norm": 10.424066543579102,
"learning_rate": 5.586592178770949e-08,
"loss": 1.6871027946472168,
"step": 6
},
{
"epoch": 0.006734006734006734,
"grad_norm": 12.496847152709961,
"learning_rate": 7.82122905027933e-08,
"loss": 1.605969786643982,
"step": 8
},
{
"epoch": 0.008417508417508417,
"grad_norm": 41.12433624267578,
"learning_rate": 1.005586592178771e-07,
"loss": 4.507528781890869,
"step": 10
},
{
"epoch": 0.010101010101010102,
"grad_norm": 5.525810718536377,
"learning_rate": 1.2290502793296089e-07,
"loss": 1.978538990020752,
"step": 12
},
{
"epoch": 0.011784511784511785,
"grad_norm": 5.537006855010986,
"learning_rate": 1.452513966480447e-07,
"loss": 1.6731345653533936,
"step": 14
},
{
"epoch": 0.013468013468013467,
"grad_norm": 5.548595905303955,
"learning_rate": 1.6759776536312846e-07,
"loss": 1.6586148738861084,
"step": 16
},
{
"epoch": 0.015151515151515152,
"grad_norm": 30.940366744995117,
"learning_rate": 1.8994413407821228e-07,
"loss": 2.7178502082824707,
"step": 18
},
{
"epoch": 0.016835016835016835,
"grad_norm": 13.579764366149902,
"learning_rate": 2.122905027932961e-07,
"loss": 1.946686029434204,
"step": 20
},
{
"epoch": 0.018518518518518517,
"grad_norm": 27.936044692993164,
"learning_rate": 2.3463687150837988e-07,
"loss": 1.845343828201294,
"step": 22
},
{
"epoch": 0.020202020202020204,
"grad_norm": 15.411078453063965,
"learning_rate": 2.5698324022346367e-07,
"loss": 3.498995780944824,
"step": 24
},
{
"epoch": 0.021885521885521887,
"grad_norm": 14.533377647399902,
"learning_rate": 2.7932960893854745e-07,
"loss": 2.7915127277374268,
"step": 26
},
{
"epoch": 0.02356902356902357,
"grad_norm": 14.302261352539062,
"learning_rate": 3.016759776536313e-07,
"loss": 2.1403520107269287,
"step": 28
},
{
"epoch": 0.025252525252525252,
"grad_norm": 128.7482147216797,
"learning_rate": 3.240223463687151e-07,
"loss": 3.224329948425293,
"step": 30
},
{
"epoch": 0.026936026936026935,
"grad_norm": 19.047321319580078,
"learning_rate": 3.4636871508379887e-07,
"loss": 1.7786729335784912,
"step": 32
},
{
"epoch": 0.02861952861952862,
"grad_norm": 34.735931396484375,
"learning_rate": 3.6871508379888266e-07,
"loss": 3.5843183994293213,
"step": 34
},
{
"epoch": 0.030303030303030304,
"grad_norm": 14.215960502624512,
"learning_rate": 3.9106145251396645e-07,
"loss": 1.935349464416504,
"step": 36
},
{
"epoch": 0.03198653198653199,
"grad_norm": 19.77899742126465,
"learning_rate": 4.134078212290503e-07,
"loss": 1.4627070426940918,
"step": 38
},
{
"epoch": 0.03367003367003367,
"grad_norm": 11.175507545471191,
"learning_rate": 4.35754189944134e-07,
"loss": 2.240666389465332,
"step": 40
},
{
"epoch": 0.03535353535353535,
"grad_norm": 12.57774543762207,
"learning_rate": 4.5810055865921786e-07,
"loss": 2.6894779205322266,
"step": 42
},
{
"epoch": 0.037037037037037035,
"grad_norm": 10.773889541625977,
"learning_rate": 4.804469273743016e-07,
"loss": 2.1831870079040527,
"step": 44
},
{
"epoch": 0.03872053872053872,
"grad_norm": 19.548690795898438,
"learning_rate": 5.027932960893855e-07,
"loss": 1.439545750617981,
"step": 46
},
{
"epoch": 0.04040404040404041,
"grad_norm": 36.42060089111328,
"learning_rate": 5.251396648044693e-07,
"loss": 2.12111759185791,
"step": 48
},
{
"epoch": 0.04208754208754209,
"grad_norm": 21.412006378173828,
"learning_rate": 5.474860335195531e-07,
"loss": 1.8557084798812866,
"step": 50
},
{
"epoch": 0.04377104377104377,
"grad_norm": 34.71672821044922,
"learning_rate": 5.698324022346367e-07,
"loss": 1.9305258989334106,
"step": 52
},
{
"epoch": 0.045454545454545456,
"grad_norm": 9.617766380310059,
"learning_rate": 5.921787709497206e-07,
"loss": 1.8030924797058105,
"step": 54
},
{
"epoch": 0.04713804713804714,
"grad_norm": 7.223271369934082,
"learning_rate": 6.145251396648044e-07,
"loss": 1.5451292991638184,
"step": 56
},
{
"epoch": 0.04882154882154882,
"grad_norm": 23.102691650390625,
"learning_rate": 6.368715083798882e-07,
"loss": 1.7897560596466064,
"step": 58
},
{
"epoch": 0.050505050505050504,
"grad_norm": 5.478814125061035,
"learning_rate": 6.59217877094972e-07,
"loss": 1.5537652969360352,
"step": 60
},
{
"epoch": 0.05218855218855219,
"grad_norm": 24.925573348999023,
"learning_rate": 6.815642458100558e-07,
"loss": 1.5619089603424072,
"step": 62
},
{
"epoch": 0.05387205387205387,
"grad_norm": 6.516916751861572,
"learning_rate": 7.039106145251397e-07,
"loss": 1.3759074211120605,
"step": 64
},
{
"epoch": 0.05555555555555555,
"grad_norm": 15.737017631530762,
"learning_rate": 7.262569832402235e-07,
"loss": 1.7463831901550293,
"step": 66
},
{
"epoch": 0.05723905723905724,
"grad_norm": 8.381811141967773,
"learning_rate": 7.486033519553073e-07,
"loss": 1.5691605806350708,
"step": 68
},
{
"epoch": 0.058922558922558925,
"grad_norm": 4.55023193359375,
"learning_rate": 7.709497206703909e-07,
"loss": 1.477018117904663,
"step": 70
},
{
"epoch": 0.06060606060606061,
"grad_norm": 10.475610733032227,
"learning_rate": 7.932960893854748e-07,
"loss": 1.7785859107971191,
"step": 72
},
{
"epoch": 0.06228956228956229,
"grad_norm": 3.722770929336548,
"learning_rate": 8.156424581005586e-07,
"loss": 1.5958442687988281,
"step": 74
},
{
"epoch": 0.06397306397306397,
"grad_norm": 10.343371391296387,
"learning_rate": 8.379888268156424e-07,
"loss": 1.0664887428283691,
"step": 76
},
{
"epoch": 0.06565656565656566,
"grad_norm": 4.329504489898682,
"learning_rate": 8.603351955307262e-07,
"loss": 1.624394178390503,
"step": 78
},
{
"epoch": 0.06734006734006734,
"grad_norm": 3.1892311573028564,
"learning_rate": 8.8268156424581e-07,
"loss": 1.4226157665252686,
"step": 80
},
{
"epoch": 0.06902356902356903,
"grad_norm": 4.654183387756348,
"learning_rate": 9.050279329608939e-07,
"loss": 1.5613698959350586,
"step": 82
},
{
"epoch": 0.0707070707070707,
"grad_norm": 11.606276512145996,
"learning_rate": 9.273743016759777e-07,
"loss": 1.3501516580581665,
"step": 84
},
{
"epoch": 0.0723905723905724,
"grad_norm": 16.68027114868164,
"learning_rate": 9.497206703910615e-07,
"loss": 1.4939382076263428,
"step": 86
},
{
"epoch": 0.07407407407407407,
"grad_norm": 6.979609489440918,
"learning_rate": 9.720670391061452e-07,
"loss": 1.3073322772979736,
"step": 88
},
{
"epoch": 0.07575757575757576,
"grad_norm": 14.462837219238281,
"learning_rate": 9.94413407821229e-07,
"loss": 1.4457292556762695,
"step": 90
},
{
"epoch": 0.07744107744107744,
"grad_norm": 28.974502563476562,
"learning_rate": 1.0167597765363128e-06,
"loss": 1.1620988845825195,
"step": 92
},
{
"epoch": 0.07912457912457913,
"grad_norm": 4.1516923904418945,
"learning_rate": 1.0391061452513965e-06,
"loss": 1.238929271697998,
"step": 94
},
{
"epoch": 0.08080808080808081,
"grad_norm": 4.485579490661621,
"learning_rate": 1.0614525139664804e-06,
"loss": 1.2894108295440674,
"step": 96
},
{
"epoch": 0.08249158249158249,
"grad_norm": 14.860766410827637,
"learning_rate": 1.0837988826815643e-06,
"loss": 1.233134150505066,
"step": 98
},
{
"epoch": 0.08417508417508418,
"grad_norm": 3.1615281105041504,
"learning_rate": 1.106145251396648e-06,
"loss": 1.21131432056427,
"step": 100
},
{
"epoch": 0.08585858585858586,
"grad_norm": 6.936952114105225,
"learning_rate": 1.1284916201117319e-06,
"loss": 1.0955811738967896,
"step": 102
},
{
"epoch": 0.08754208754208755,
"grad_norm": 4.402707576751709,
"learning_rate": 1.1508379888268155e-06,
"loss": 0.8408428430557251,
"step": 104
},
{
"epoch": 0.08922558922558922,
"grad_norm": 26.573022842407227,
"learning_rate": 1.1731843575418994e-06,
"loss": 0.9416179656982422,
"step": 106
},
{
"epoch": 0.09090909090909091,
"grad_norm": 3.3700878620147705,
"learning_rate": 1.1955307262569831e-06,
"loss": 1.2525057792663574,
"step": 108
},
{
"epoch": 0.09259259259259259,
"grad_norm": 6.59627103805542,
"learning_rate": 1.217877094972067e-06,
"loss": 0.8143967986106873,
"step": 110
},
{
"epoch": 0.09427609427609428,
"grad_norm": 6.249094486236572,
"learning_rate": 1.2402234636871507e-06,
"loss": 1.253350853919983,
"step": 112
},
{
"epoch": 0.09595959595959595,
"grad_norm": 14.433463096618652,
"learning_rate": 1.2625698324022344e-06,
"loss": 1.0653541088104248,
"step": 114
},
{
"epoch": 0.09764309764309764,
"grad_norm": 3.963433265686035,
"learning_rate": 1.2849162011173185e-06,
"loss": 0.7410316467285156,
"step": 116
},
{
"epoch": 0.09932659932659933,
"grad_norm": 3.3435471057891846,
"learning_rate": 1.3072625698324022e-06,
"loss": 1.144932746887207,
"step": 118
},
{
"epoch": 0.10101010101010101,
"grad_norm": 3.1995925903320312,
"learning_rate": 1.329608938547486e-06,
"loss": 0.9971737861633301,
"step": 120
},
{
"epoch": 0.1026936026936027,
"grad_norm": 4.980305194854736,
"learning_rate": 1.3519553072625697e-06,
"loss": 1.1758251190185547,
"step": 122
},
{
"epoch": 0.10437710437710437,
"grad_norm": 6.944910049438477,
"learning_rate": 1.3743016759776536e-06,
"loss": 0.8010753393173218,
"step": 124
},
{
"epoch": 0.10606060606060606,
"grad_norm": 5.405940055847168,
"learning_rate": 1.3966480446927373e-06,
"loss": 0.708318829536438,
"step": 126
},
{
"epoch": 0.10774410774410774,
"grad_norm": 24.046825408935547,
"learning_rate": 1.4189944134078212e-06,
"loss": 1.0953171253204346,
"step": 128
},
{
"epoch": 0.10942760942760943,
"grad_norm": 9.63823127746582,
"learning_rate": 1.441340782122905e-06,
"loss": 0.9741929173469543,
"step": 130
},
{
"epoch": 0.1111111111111111,
"grad_norm": 15.6827974319458,
"learning_rate": 1.4636871508379886e-06,
"loss": 0.9290119409561157,
"step": 132
},
{
"epoch": 0.1127946127946128,
"grad_norm": 4.126307010650635,
"learning_rate": 1.4860335195530727e-06,
"loss": 1.1921985149383545,
"step": 134
},
{
"epoch": 0.11447811447811448,
"grad_norm": 26.01188087463379,
"learning_rate": 1.5083798882681564e-06,
"loss": 1.1901376247406006,
"step": 136
},
{
"epoch": 0.11616161616161616,
"grad_norm": 4.109427452087402,
"learning_rate": 1.5307262569832403e-06,
"loss": 1.5455617904663086,
"step": 138
},
{
"epoch": 0.11784511784511785,
"grad_norm": 5.346724987030029,
"learning_rate": 1.553072625698324e-06,
"loss": 1.1195063591003418,
"step": 140
},
{
"epoch": 0.11952861952861953,
"grad_norm": 4.396357536315918,
"learning_rate": 1.5754189944134078e-06,
"loss": 1.060058832168579,
"step": 142
},
{
"epoch": 0.12121212121212122,
"grad_norm": 102.16704559326172,
"learning_rate": 1.5977653631284915e-06,
"loss": 0.9710292816162109,
"step": 144
},
{
"epoch": 0.12289562289562289,
"grad_norm": 3.3568778038024902,
"learning_rate": 1.6201117318435752e-06,
"loss": 1.1372500658035278,
"step": 146
},
{
"epoch": 0.12457912457912458,
"grad_norm": 4.527273178100586,
"learning_rate": 1.642458100558659e-06,
"loss": 0.991235077381134,
"step": 148
},
{
"epoch": 0.12626262626262627,
"grad_norm": 26.466514587402344,
"learning_rate": 1.6648044692737428e-06,
"loss": 1.206244945526123,
"step": 150
},
{
"epoch": 0.12794612794612795,
"grad_norm": 14.26403522491455,
"learning_rate": 1.6871508379888269e-06,
"loss": 0.972631573677063,
"step": 152
},
{
"epoch": 0.12962962962962962,
"grad_norm": 22.82804298400879,
"learning_rate": 1.7094972067039106e-06,
"loss": 1.2080013751983643,
"step": 154
},
{
"epoch": 0.13131313131313133,
"grad_norm": 15.690414428710938,
"learning_rate": 1.7318435754189945e-06,
"loss": 1.0757750272750854,
"step": 156
},
{
"epoch": 0.132996632996633,
"grad_norm": 4.205806732177734,
"learning_rate": 1.7541899441340781e-06,
"loss": 1.054223656654358,
"step": 158
},
{
"epoch": 0.13468013468013468,
"grad_norm": 5.085096836090088,
"learning_rate": 1.776536312849162e-06,
"loss": 1.1487317085266113,
"step": 160
},
{
"epoch": 0.13636363636363635,
"grad_norm": 2.9503731727600098,
"learning_rate": 1.7988826815642457e-06,
"loss": 1.0323597192764282,
"step": 162
},
{
"epoch": 0.13804713804713806,
"grad_norm": 15.863119125366211,
"learning_rate": 1.8212290502793294e-06,
"loss": 0.9738507866859436,
"step": 164
},
{
"epoch": 0.13973063973063973,
"grad_norm": 8.238275527954102,
"learning_rate": 1.8435754189944133e-06,
"loss": 0.9224099516868591,
"step": 166
},
{
"epoch": 0.1414141414141414,
"grad_norm": 3.9588847160339355,
"learning_rate": 1.865921787709497e-06,
"loss": 1.1515543460845947,
"step": 168
},
{
"epoch": 0.14309764309764308,
"grad_norm": 8.48491382598877,
"learning_rate": 1.8882681564245809e-06,
"loss": 1.013866662979126,
"step": 170
},
{
"epoch": 0.1447811447811448,
"grad_norm": 13.511845588684082,
"learning_rate": 1.9106145251396648e-06,
"loss": 1.0169274806976318,
"step": 172
},
{
"epoch": 0.14646464646464646,
"grad_norm": 3.410078287124634,
"learning_rate": 1.9329608938547484e-06,
"loss": 0.7148650288581848,
"step": 174
},
{
"epoch": 0.14814814814814814,
"grad_norm": 6.603743553161621,
"learning_rate": 1.9553072625698325e-06,
"loss": 1.0830409526824951,
"step": 176
},
{
"epoch": 0.14983164983164984,
"grad_norm": 3.631049871444702,
"learning_rate": 1.9776536312849162e-06,
"loss": 1.2707182168960571,
"step": 178
},
{
"epoch": 0.15151515151515152,
"grad_norm": 4.926392555236816,
"learning_rate": 2e-06,
"loss": 1.0230215787887573,
"step": 180
},
{
"epoch": 0.1531986531986532,
"grad_norm": 3.419835329055786,
"learning_rate": 1.9999984495606584e-06,
"loss": 1.4019644260406494,
"step": 182
},
{
"epoch": 0.15488215488215487,
"grad_norm": 91.24452209472656,
"learning_rate": 1.999993798247977e-06,
"loss": 0.9929481744766235,
"step": 184
},
{
"epoch": 0.15656565656565657,
"grad_norm": 11.83352279663086,
"learning_rate": 1.99998604607798e-06,
"loss": 1.1366297006607056,
"step": 186
},
{
"epoch": 0.15824915824915825,
"grad_norm": 8.025732040405273,
"learning_rate": 1.9999751930773778e-06,
"loss": 0.9216547608375549,
"step": 188
},
{
"epoch": 0.15993265993265993,
"grad_norm": 23.363126754760742,
"learning_rate": 1.999961239283563e-06,
"loss": 0.7902772426605225,
"step": 190
},
{
"epoch": 0.16161616161616163,
"grad_norm": 5.2719621658325195,
"learning_rate": 1.999944184744613e-06,
"loss": 1.3453216552734375,
"step": 192
},
{
"epoch": 0.1632996632996633,
"grad_norm": 12.32300090789795,
"learning_rate": 1.999924029519287e-06,
"loss": 1.2495100498199463,
"step": 194
},
{
"epoch": 0.16498316498316498,
"grad_norm": 3.889246940612793,
"learning_rate": 1.9999007736770295e-06,
"loss": 1.0745317935943604,
"step": 196
},
{
"epoch": 0.16666666666666666,
"grad_norm": 3.305817127227783,
"learning_rate": 1.9998744172979654e-06,
"loss": 1.173724889755249,
"step": 198
},
{
"epoch": 0.16835016835016836,
"grad_norm": 17.490114212036133,
"learning_rate": 1.9998449604729044e-06,
"loss": 0.8745306730270386,
"step": 200
},
{
"epoch": 0.17003367003367004,
"grad_norm": 23.266372680664062,
"learning_rate": 1.9998124033033366e-06,
"loss": 0.8984509706497192,
"step": 202
},
{
"epoch": 0.1717171717171717,
"grad_norm": 3.1825051307678223,
"learning_rate": 1.9997767459014363e-06,
"loss": 1.029420018196106,
"step": 204
},
{
"epoch": 0.1734006734006734,
"grad_norm": 6.860640048980713,
"learning_rate": 1.9997379883900572e-06,
"loss": 1.0008872747421265,
"step": 206
},
{
"epoch": 0.1750841750841751,
"grad_norm": 3.333308458328247,
"learning_rate": 1.999696130902736e-06,
"loss": 1.2087559700012207,
"step": 208
},
{
"epoch": 0.17676767676767677,
"grad_norm": 14.786520957946777,
"learning_rate": 1.9996511735836895e-06,
"loss": 0.7541635036468506,
"step": 210
},
{
"epoch": 0.17845117845117844,
"grad_norm": 12.577203750610352,
"learning_rate": 1.999603116587814e-06,
"loss": 0.8843977451324463,
"step": 212
},
{
"epoch": 0.18013468013468015,
"grad_norm": 3.1734931468963623,
"learning_rate": 1.9995519600806863e-06,
"loss": 1.3309192657470703,
"step": 214
},
{
"epoch": 0.18181818181818182,
"grad_norm": 3.05130672454834,
"learning_rate": 1.999497704238562e-06,
"loss": 0.837327241897583,
"step": 216
},
{
"epoch": 0.1835016835016835,
"grad_norm": 17.063798904418945,
"learning_rate": 1.9994403492483755e-06,
"loss": 0.6734769344329834,
"step": 218
},
{
"epoch": 0.18518518518518517,
"grad_norm": 3.6013314723968506,
"learning_rate": 1.999379895307739e-06,
"loss": 1.3345911502838135,
"step": 220
},
{
"epoch": 0.18686868686868688,
"grad_norm": 12.002358436584473,
"learning_rate": 1.999316342624941e-06,
"loss": 0.9222342371940613,
"step": 222
},
{
"epoch": 0.18855218855218855,
"grad_norm": 6.592970848083496,
"learning_rate": 1.999249691418948e-06,
"loss": 1.2030017375946045,
"step": 224
},
{
"epoch": 0.19023569023569023,
"grad_norm": 5.709416389465332,
"learning_rate": 1.999179941919401e-06,
"loss": 0.9448881149291992,
"step": 226
},
{
"epoch": 0.1919191919191919,
"grad_norm": 15.942882537841797,
"learning_rate": 1.999107094366617e-06,
"loss": 1.201434850692749,
"step": 228
},
{
"epoch": 0.1936026936026936,
"grad_norm": 7.546412467956543,
"learning_rate": 1.9990311490115858e-06,
"loss": 1.3325837850570679,
"step": 230
},
{
"epoch": 0.19528619528619529,
"grad_norm": 8.624425888061523,
"learning_rate": 1.9989521061159715e-06,
"loss": 1.0627577304840088,
"step": 232
},
{
"epoch": 0.19696969696969696,
"grad_norm": 205.3386688232422,
"learning_rate": 1.9988699659521098e-06,
"loss": 1.1965469121932983,
"step": 234
},
{
"epoch": 0.19865319865319866,
"grad_norm": 3.602259874343872,
"learning_rate": 1.9987847288030083e-06,
"loss": 0.9878703355789185,
"step": 236
},
{
"epoch": 0.20033670033670034,
"grad_norm": 3.7958240509033203,
"learning_rate": 1.998696394962345e-06,
"loss": 1.1146423816680908,
"step": 238
},
{
"epoch": 0.20202020202020202,
"grad_norm": 5.15401029586792,
"learning_rate": 1.998604964734467e-06,
"loss": 0.9246745109558105,
"step": 240
},
{
"epoch": 0.2037037037037037,
"grad_norm": 7.467074394226074,
"learning_rate": 1.99851043843439e-06,
"loss": 1.1710363626480103,
"step": 242
},
{
"epoch": 0.2053872053872054,
"grad_norm": 5.450447082519531,
"learning_rate": 1.9984128163877964e-06,
"loss": 0.99492347240448,
"step": 244
},
{
"epoch": 0.20707070707070707,
"grad_norm": 13.562344551086426,
"learning_rate": 1.998312098931036e-06,
"loss": 0.6625456809997559,
"step": 246
},
{
"epoch": 0.20875420875420875,
"grad_norm": 13.973015785217285,
"learning_rate": 1.998208286411122e-06,
"loss": 1.2143261432647705,
"step": 248
},
{
"epoch": 0.21043771043771045,
"grad_norm": 4.950899124145508,
"learning_rate": 1.9981013791857327e-06,
"loss": 1.0001804828643799,
"step": 250
},
{
"epoch": 0.21212121212121213,
"grad_norm": 8.739319801330566,
"learning_rate": 1.997991377623209e-06,
"loss": 0.9055366516113281,
"step": 252
},
{
"epoch": 0.2138047138047138,
"grad_norm": 20.48418426513672,
"learning_rate": 1.9978782821025513e-06,
"loss": 1.0460654497146606,
"step": 254
},
{
"epoch": 0.21548821548821548,
"grad_norm": 3.14092755317688,
"learning_rate": 1.9977620930134223e-06,
"loss": 1.1977128982543945,
"step": 256
},
{
"epoch": 0.21717171717171718,
"grad_norm": 3.1732327938079834,
"learning_rate": 1.9976428107561415e-06,
"loss": 0.8446206450462341,
"step": 258
},
{
"epoch": 0.21885521885521886,
"grad_norm": 4.083011150360107,
"learning_rate": 1.997520435741687e-06,
"loss": 1.0371161699295044,
"step": 260
},
{
"epoch": 0.22053872053872053,
"grad_norm": 18.663959503173828,
"learning_rate": 1.9973949683916927e-06,
"loss": 1.0510563850402832,
"step": 262
},
{
"epoch": 0.2222222222222222,
"grad_norm": 7.912849426269531,
"learning_rate": 1.9972664091384454e-06,
"loss": 1.1071124076843262,
"step": 264
},
{
"epoch": 0.2239057239057239,
"grad_norm": 8.017518997192383,
"learning_rate": 1.997134758424886e-06,
"loss": 1.1996357440948486,
"step": 266
},
{
"epoch": 0.2255892255892256,
"grad_norm": 8.789745330810547,
"learning_rate": 1.9970000167046075e-06,
"loss": 0.6464065313339233,
"step": 268
},
{
"epoch": 0.22727272727272727,
"grad_norm": 2.892493963241577,
"learning_rate": 1.996862184441851e-06,
"loss": 0.9799895882606506,
"step": 270
},
{
"epoch": 0.22895622895622897,
"grad_norm": 15.957945823669434,
"learning_rate": 1.9967212621115065e-06,
"loss": 1.310072898864746,
"step": 272
},
{
"epoch": 0.23063973063973064,
"grad_norm": 15.204549789428711,
"learning_rate": 1.996577250199111e-06,
"loss": 1.1751708984375,
"step": 274
},
{
"epoch": 0.23232323232323232,
"grad_norm": 13.538248062133789,
"learning_rate": 1.9964301492008464e-06,
"loss": 0.9009586572647095,
"step": 276
},
{
"epoch": 0.234006734006734,
"grad_norm": 3.318108558654785,
"learning_rate": 1.996279959623537e-06,
"loss": 1.160229206085205,
"step": 278
},
{
"epoch": 0.2356902356902357,
"grad_norm": 19.02191925048828,
"learning_rate": 1.9961266819846495e-06,
"loss": 0.9633986353874207,
"step": 280
},
{
"epoch": 0.23737373737373738,
"grad_norm": 2.9140703678131104,
"learning_rate": 1.9959703168122897e-06,
"loss": 0.9368491172790527,
"step": 282
},
{
"epoch": 0.23905723905723905,
"grad_norm": 6.261467933654785,
"learning_rate": 1.995810864645202e-06,
"loss": 1.2744100093841553,
"step": 284
},
{
"epoch": 0.24074074074074073,
"grad_norm": 10.625744819641113,
"learning_rate": 1.995648326032765e-06,
"loss": 0.93353271484375,
"step": 286
},
{
"epoch": 0.24242424242424243,
"grad_norm": 4.416279315948486,
"learning_rate": 1.9954827015349937e-06,
"loss": 0.8594992160797119,
"step": 288
},
{
"epoch": 0.2441077441077441,
"grad_norm": 12.509293556213379,
"learning_rate": 1.9953139917225333e-06,
"loss": 1.1634539365768433,
"step": 290
},
{
"epoch": 0.24579124579124578,
"grad_norm": 6.1216559410095215,
"learning_rate": 1.995142197176661e-06,
"loss": 0.6774170398712158,
"step": 292
},
{
"epoch": 0.2474747474747475,
"grad_norm": 11.188633918762207,
"learning_rate": 1.9949673184892803e-06,
"loss": 1.2763657569885254,
"step": 294
},
{
"epoch": 0.24915824915824916,
"grad_norm": 14.471898078918457,
"learning_rate": 1.9947893562629227e-06,
"loss": 1.0749788284301758,
"step": 296
},
{
"epoch": 0.25084175084175087,
"grad_norm": 4.776523113250732,
"learning_rate": 1.9946083111107425e-06,
"loss": 0.6309652328491211,
"step": 298
},
{
"epoch": 0.25252525252525254,
"grad_norm": 13.419336318969727,
"learning_rate": 1.9944241836565167e-06,
"loss": 0.7867164611816406,
"step": 300
},
{
"epoch": 0.2542087542087542,
"grad_norm": 13.28558349609375,
"learning_rate": 1.9942369745346417e-06,
"loss": 1.074715256690979,
"step": 302
},
{
"epoch": 0.2558922558922559,
"grad_norm": 13.307300567626953,
"learning_rate": 1.9940466843901318e-06,
"loss": 0.909276008605957,
"step": 304
},
{
"epoch": 0.25757575757575757,
"grad_norm": 3.531205415725708,
"learning_rate": 1.9938533138786163e-06,
"loss": 1.3488116264343262,
"step": 306
},
{
"epoch": 0.25925925925925924,
"grad_norm": 18.24886131286621,
"learning_rate": 1.9936568636663383e-06,
"loss": 1.1045993566513062,
"step": 308
},
{
"epoch": 0.2609427609427609,
"grad_norm": 8.636100769042969,
"learning_rate": 1.9934573344301514e-06,
"loss": 1.0755970478057861,
"step": 310
},
{
"epoch": 0.26262626262626265,
"grad_norm": 21.606901168823242,
"learning_rate": 1.993254726857518e-06,
"loss": 1.192507266998291,
"step": 312
},
{
"epoch": 0.26430976430976433,
"grad_norm": 3.9044668674468994,
"learning_rate": 1.9930490416465057e-06,
"loss": 1.0830907821655273,
"step": 314
},
{
"epoch": 0.265993265993266,
"grad_norm": 4.083668231964111,
"learning_rate": 1.992840279505787e-06,
"loss": 1.2332277297973633,
"step": 316
},
{
"epoch": 0.2676767676767677,
"grad_norm": 2.9494712352752686,
"learning_rate": 1.9926284411546355e-06,
"loss": 1.0134263038635254,
"step": 318
},
{
"epoch": 0.26936026936026936,
"grad_norm": 15.981578826904297,
"learning_rate": 1.9924135273229235e-06,
"loss": 0.7042160034179688,
"step": 320
},
{
"epoch": 0.27104377104377103,
"grad_norm": 3.9529871940612793,
"learning_rate": 1.9921955387511195e-06,
"loss": 0.9744091033935547,
"step": 322
},
{
"epoch": 0.2727272727272727,
"grad_norm": 4.732446193695068,
"learning_rate": 1.991974476190285e-06,
"loss": 1.2661027908325195,
"step": 324
},
{
"epoch": 0.27441077441077444,
"grad_norm": 3.1209988594055176,
"learning_rate": 1.9917503404020747e-06,
"loss": 1.0432727336883545,
"step": 326
},
{
"epoch": 0.2760942760942761,
"grad_norm": 2.297736644744873,
"learning_rate": 1.9915231321587305e-06,
"loss": 0.7997782230377197,
"step": 328
},
{
"epoch": 0.2777777777777778,
"grad_norm": 3.8393845558166504,
"learning_rate": 1.99129285224308e-06,
"loss": 0.8995693922042847,
"step": 330
},
{
"epoch": 0.27946127946127947,
"grad_norm": 5.347554683685303,
"learning_rate": 1.9910595014485347e-06,
"loss": 1.094329595565796,
"step": 332
},
{
"epoch": 0.28114478114478114,
"grad_norm": 9.933974266052246,
"learning_rate": 1.990823080579086e-06,
"loss": 1.0780812501907349,
"step": 334
},
{
"epoch": 0.2828282828282828,
"grad_norm": 6.838101863861084,
"learning_rate": 1.990583590449303e-06,
"loss": 1.0089993476867676,
"step": 336
},
{
"epoch": 0.2845117845117845,
"grad_norm": 2.9299662113189697,
"learning_rate": 1.990341031884331e-06,
"loss": 1.188491702079773,
"step": 338
},
{
"epoch": 0.28619528619528617,
"grad_norm": 11.96368408203125,
"learning_rate": 1.9900954057198856e-06,
"loss": 0.9743690490722656,
"step": 340
},
{
"epoch": 0.2878787878787879,
"grad_norm": 10.01843547821045,
"learning_rate": 1.989846712802252e-06,
"loss": 1.091504454612732,
"step": 342
},
{
"epoch": 0.2895622895622896,
"grad_norm": 11.638251304626465,
"learning_rate": 1.9895949539882827e-06,
"loss": 0.8539205193519592,
"step": 344
},
{
"epoch": 0.29124579124579125,
"grad_norm": 4.232053279876709,
"learning_rate": 1.9893401301453926e-06,
"loss": 1.1060683727264404,
"step": 346
},
{
"epoch": 0.29292929292929293,
"grad_norm": 4.821753025054932,
"learning_rate": 1.989082242151556e-06,
"loss": 1.054430365562439,
"step": 348
},
{
"epoch": 0.2946127946127946,
"grad_norm": 13.823630332946777,
"learning_rate": 1.988821290895307e-06,
"loss": 0.7408787608146667,
"step": 350
},
{
"epoch": 0.2962962962962963,
"grad_norm": 4.8311028480529785,
"learning_rate": 1.988557277275732e-06,
"loss": 0.5961899757385254,
"step": 352
},
{
"epoch": 0.29797979797979796,
"grad_norm": 6.172464847564697,
"learning_rate": 1.9882902022024683e-06,
"loss": 1.0046343803405762,
"step": 354
},
{
"epoch": 0.2996632996632997,
"grad_norm": 8.489828109741211,
"learning_rate": 1.9880200665957026e-06,
"loss": 1.0770823955535889,
"step": 356
},
{
"epoch": 0.30134680134680136,
"grad_norm": 20.501022338867188,
"learning_rate": 1.9877468713861656e-06,
"loss": 0.9369664192199707,
"step": 358
},
{
"epoch": 0.30303030303030304,
"grad_norm": 3.7023608684539795,
"learning_rate": 1.98747061751513e-06,
"loss": 0.813412070274353,
"step": 360
},
{
"epoch": 0.3047138047138047,
"grad_norm": 3.5265374183654785,
"learning_rate": 1.987191305934406e-06,
"loss": 0.9706151485443115,
"step": 362
},
{
"epoch": 0.3063973063973064,
"grad_norm": 2.9974145889282227,
"learning_rate": 1.98690893760634e-06,
"loss": 1.2250468730926514,
"step": 364
},
{
"epoch": 0.30808080808080807,
"grad_norm": 6.445283889770508,
"learning_rate": 1.9866235135038095e-06,
"loss": 0.8330235481262207,
"step": 366
},
{
"epoch": 0.30976430976430974,
"grad_norm": 3.45355224609375,
"learning_rate": 1.986335034610221e-06,
"loss": 0.8574585318565369,
"step": 368
},
{
"epoch": 0.3114478114478115,
"grad_norm": 4.574197769165039,
"learning_rate": 1.9860435019195054e-06,
"loss": 1.0763049125671387,
"step": 370
},
{
"epoch": 0.31313131313131315,
"grad_norm": 5.326190948486328,
"learning_rate": 1.9857489164361147e-06,
"loss": 1.2134881019592285,
"step": 372
},
{
"epoch": 0.3148148148148148,
"grad_norm": 18.84362030029297,
"learning_rate": 1.9854512791750214e-06,
"loss": 0.6605836153030396,
"step": 374
},
{
"epoch": 0.3164983164983165,
"grad_norm": 5.314328193664551,
"learning_rate": 1.9851505911617097e-06,
"loss": 0.9535898566246033,
"step": 376
},
{
"epoch": 0.3181818181818182,
"grad_norm": 4.1279168128967285,
"learning_rate": 1.984846853432177e-06,
"loss": 1.2825720310211182,
"step": 378
},
{
"epoch": 0.31986531986531985,
"grad_norm": 8.377116203308105,
"learning_rate": 1.9845400670329275e-06,
"loss": 0.734359860420227,
"step": 380
},
{
"epoch": 0.32154882154882153,
"grad_norm": 10.643378257751465,
"learning_rate": 1.98423023302097e-06,
"loss": 1.0042654275894165,
"step": 382
},
{
"epoch": 0.32323232323232326,
"grad_norm": 7.596747875213623,
"learning_rate": 1.9839173524638115e-06,
"loss": 1.110269546508789,
"step": 384
},
{
"epoch": 0.32491582491582494,
"grad_norm": 6.244058132171631,
"learning_rate": 1.9836014264394587e-06,
"loss": 0.7185302972793579,
"step": 386
},
{
"epoch": 0.3265993265993266,
"grad_norm": 6.148385524749756,
"learning_rate": 1.9832824560364093e-06,
"loss": 0.9159483909606934,
"step": 388
},
{
"epoch": 0.3282828282828283,
"grad_norm": 183.32968139648438,
"learning_rate": 1.98296044235365e-06,
"loss": 1.0852017402648926,
"step": 390
},
{
"epoch": 0.32996632996632996,
"grad_norm": 8.698363304138184,
"learning_rate": 1.9826353865006538e-06,
"loss": 0.7871326208114624,
"step": 392
},
{
"epoch": 0.33164983164983164,
"grad_norm": 3.863551139831543,
"learning_rate": 1.9823072895973748e-06,
"loss": 1.3192460536956787,
"step": 394
},
{
"epoch": 0.3333333333333333,
"grad_norm": 13.84194564819336,
"learning_rate": 1.981976152774245e-06,
"loss": 1.158171534538269,
"step": 396
},
{
"epoch": 0.335016835016835,
"grad_norm": 18.21632194519043,
"learning_rate": 1.98164197717217e-06,
"loss": 0.7753697037696838,
"step": 398
},
{
"epoch": 0.3367003367003367,
"grad_norm": 5.396472930908203,
"learning_rate": 1.9813047639425253e-06,
"loss": 0.9357776641845703,
"step": 400
},
{
"epoch": 0.3383838383838384,
"grad_norm": 9.759978294372559,
"learning_rate": 1.9809645142471528e-06,
"loss": 0.9591242074966431,
"step": 402
},
{
"epoch": 0.3400673400673401,
"grad_norm": 6.960322380065918,
"learning_rate": 1.980621229258355e-06,
"loss": 0.9946481585502625,
"step": 404
},
{
"epoch": 0.34175084175084175,
"grad_norm": 3.891620635986328,
"learning_rate": 1.9802749101588942e-06,
"loss": 1.068068265914917,
"step": 406
},
{
"epoch": 0.3434343434343434,
"grad_norm": 8.289826393127441,
"learning_rate": 1.9799255581419844e-06,
"loss": 0.9243034720420837,
"step": 408
},
{
"epoch": 0.3451178451178451,
"grad_norm": 16.256540298461914,
"learning_rate": 1.9795731744112908e-06,
"loss": 0.5437488555908203,
"step": 410
},
{
"epoch": 0.3468013468013468,
"grad_norm": 3.7120189666748047,
"learning_rate": 1.9792177601809234e-06,
"loss": 0.8597297668457031,
"step": 412
},
{
"epoch": 0.3484848484848485,
"grad_norm": 9.203973770141602,
"learning_rate": 1.9788593166754343e-06,
"loss": 0.914923906326294,
"step": 414
},
{
"epoch": 0.3501683501683502,
"grad_norm": 15.325188636779785,
"learning_rate": 1.9784978451298115e-06,
"loss": 1.1473793983459473,
"step": 416
},
{
"epoch": 0.35185185185185186,
"grad_norm": 9.513066291809082,
"learning_rate": 1.9781333467894773e-06,
"loss": 0.7187636494636536,
"step": 418
},
{
"epoch": 0.35353535353535354,
"grad_norm": 11.042696952819824,
"learning_rate": 1.9777658229102807e-06,
"loss": 0.8753368258476257,
"step": 420
},
{
"epoch": 0.3552188552188552,
"grad_norm": 19.48780059814453,
"learning_rate": 1.9773952747584976e-06,
"loss": 1.1081957817077637,
"step": 422
},
{
"epoch": 0.3569023569023569,
"grad_norm": 7.343861103057861,
"learning_rate": 1.9770217036108212e-06,
"loss": 0.5900806188583374,
"step": 424
},
{
"epoch": 0.35858585858585856,
"grad_norm": 13.56103801727295,
"learning_rate": 1.9766451107543614e-06,
"loss": 1.0243406295776367,
"step": 426
},
{
"epoch": 0.3602693602693603,
"grad_norm": 5.026733875274658,
"learning_rate": 1.9762654974866396e-06,
"loss": 0.7951416969299316,
"step": 428
},
{
"epoch": 0.36195286195286197,
"grad_norm": 30.160125732421875,
"learning_rate": 1.975882865115583e-06,
"loss": 1.226210117340088,
"step": 430
},
{
"epoch": 0.36363636363636365,
"grad_norm": 3.663498640060425,
"learning_rate": 1.9754972149595204e-06,
"loss": 0.954987645149231,
"step": 432
},
{
"epoch": 0.3653198653198653,
"grad_norm": 3.128584146499634,
"learning_rate": 1.97510854834718e-06,
"loss": 0.8251101970672607,
"step": 434
},
{
"epoch": 0.367003367003367,
"grad_norm": 2.8099863529205322,
"learning_rate": 1.9747168666176813e-06,
"loss": 0.9983630180358887,
"step": 436
},
{
"epoch": 0.3686868686868687,
"grad_norm": 9.544251441955566,
"learning_rate": 1.9743221711205323e-06,
"loss": 1.074230432510376,
"step": 438
},
{
"epoch": 0.37037037037037035,
"grad_norm": 3.466240406036377,
"learning_rate": 1.9739244632156256e-06,
"loss": 0.9089052677154541,
"step": 440
},
{
"epoch": 0.3720538720538721,
"grad_norm": 3.713217258453369,
"learning_rate": 1.973523744273232e-06,
"loss": 0.9246188402175903,
"step": 442
},
{
"epoch": 0.37373737373737376,
"grad_norm": 4.951727867126465,
"learning_rate": 1.973120015673997e-06,
"loss": 0.7823818922042847,
"step": 444
},
{
"epoch": 0.37542087542087543,
"grad_norm": 14.46354866027832,
"learning_rate": 1.9727132788089354e-06,
"loss": 0.6286232471466064,
"step": 446
},
{
"epoch": 0.3771043771043771,
"grad_norm": 3.3994994163513184,
"learning_rate": 1.972303535079427e-06,
"loss": 1.116566777229309,
"step": 448
},
{
"epoch": 0.3787878787878788,
"grad_norm": 13.856016159057617,
"learning_rate": 1.971890785897211e-06,
"loss": 1.0384341478347778,
"step": 450
},
{
"epoch": 0.38047138047138046,
"grad_norm": 5.349656581878662,
"learning_rate": 1.9714750326843825e-06,
"loss": 0.7245984077453613,
"step": 452
},
{
"epoch": 0.38215488215488214,
"grad_norm": 16.106748580932617,
"learning_rate": 1.9710562768733857e-06,
"loss": 0.9850279688835144,
"step": 454
},
{
"epoch": 0.3838383838383838,
"grad_norm": 2.889192819595337,
"learning_rate": 1.9706345199070107e-06,
"loss": 0.7854516506195068,
"step": 456
},
{
"epoch": 0.38552188552188554,
"grad_norm": 4.588443756103516,
"learning_rate": 1.970209763238388e-06,
"loss": 0.9596130847930908,
"step": 458
},
{
"epoch": 0.3872053872053872,
"grad_norm": 2.0430006980895996,
"learning_rate": 1.969782008330983e-06,
"loss": 1.125518560409546,
"step": 460
},
{
"epoch": 0.3888888888888889,
"grad_norm": 3.873711347579956,
"learning_rate": 1.969351256658591e-06,
"loss": 0.8866109848022461,
"step": 462
},
{
"epoch": 0.39057239057239057,
"grad_norm": 3.8733267784118652,
"learning_rate": 1.968917509705333e-06,
"loss": 0.8248393535614014,
"step": 464
},
{
"epoch": 0.39225589225589225,
"grad_norm": 10.369402885437012,
"learning_rate": 1.9684807689656497e-06,
"loss": 0.8977053165435791,
"step": 466
},
{
"epoch": 0.3939393939393939,
"grad_norm": 1.9317212104797363,
"learning_rate": 1.9680410359442972e-06,
"loss": 0.9425126314163208,
"step": 468
},
{
"epoch": 0.3956228956228956,
"grad_norm": 3.9970741271972656,
"learning_rate": 1.9675983121563397e-06,
"loss": 0.9490628242492676,
"step": 470
},
{
"epoch": 0.39730639730639733,
"grad_norm": 4.277144908905029,
"learning_rate": 1.9671525991271478e-06,
"loss": 0.7922143340110779,
"step": 472
},
{
"epoch": 0.398989898989899,
"grad_norm": 6.69656229019165,
"learning_rate": 1.9667038983923902e-06,
"loss": 0.9853019714355469,
"step": 474
},
{
"epoch": 0.4006734006734007,
"grad_norm": 10.086434364318848,
"learning_rate": 1.9662522114980296e-06,
"loss": 0.7648198008537292,
"step": 476
},
{
"epoch": 0.40235690235690236,
"grad_norm": 75.10104370117188,
"learning_rate": 1.965797540000318e-06,
"loss": 0.9607178568840027,
"step": 478
},
{
"epoch": 0.40404040404040403,
"grad_norm": 13.168256759643555,
"learning_rate": 1.9653398854657887e-06,
"loss": 1.0317054986953735,
"step": 480
},
{
"epoch": 0.4057239057239057,
"grad_norm": 7.137551784515381,
"learning_rate": 1.9648792494712553e-06,
"loss": 1.0325589179992676,
"step": 482
},
{
"epoch": 0.4074074074074074,
"grad_norm": 7.48604679107666,
"learning_rate": 1.9644156336038024e-06,
"loss": 0.838646411895752,
"step": 484
},
{
"epoch": 0.4090909090909091,
"grad_norm": 9.154224395751953,
"learning_rate": 1.9639490394607813e-06,
"loss": 0.8662800192832947,
"step": 486
},
{
"epoch": 0.4107744107744108,
"grad_norm": 5.478043556213379,
"learning_rate": 1.9634794686498055e-06,
"loss": 1.0649371147155762,
"step": 488
},
{
"epoch": 0.41245791245791247,
"grad_norm": 4.0281901359558105,
"learning_rate": 1.9630069227887444e-06,
"loss": 1.1006402969360352,
"step": 490
},
{
"epoch": 0.41414141414141414,
"grad_norm": 11.18668270111084,
"learning_rate": 1.9625314035057167e-06,
"loss": 1.0519776344299316,
"step": 492
},
{
"epoch": 0.4158249158249158,
"grad_norm": 6.470438003540039,
"learning_rate": 1.9620529124390863e-06,
"loss": 0.9292422533035278,
"step": 494
},
{
"epoch": 0.4175084175084175,
"grad_norm": 2.4250965118408203,
"learning_rate": 1.9615714512374567e-06,
"loss": 1.0612026453018188,
"step": 496
},
{
"epoch": 0.41919191919191917,
"grad_norm": 4.183928489685059,
"learning_rate": 1.9610870215596643e-06,
"loss": 1.081310510635376,
"step": 498
},
{
"epoch": 0.4208754208754209,
"grad_norm": 15.612129211425781,
"learning_rate": 1.960599625074773e-06,
"loss": 0.8103876709938049,
"step": 500
},
{
"epoch": 0.4225589225589226,
"grad_norm": 7.831202983856201,
"learning_rate": 1.9601092634620687e-06,
"loss": 0.633713960647583,
"step": 502
},
{
"epoch": 0.42424242424242425,
"grad_norm": 7.164036750793457,
"learning_rate": 1.9596159384110535e-06,
"loss": 0.8758570551872253,
"step": 504
},
{
"epoch": 0.42592592592592593,
"grad_norm": 10.661258697509766,
"learning_rate": 1.95911965162144e-06,
"loss": 0.9336118698120117,
"step": 506
},
{
"epoch": 0.4276094276094276,
"grad_norm": 9.44550895690918,
"learning_rate": 1.958620404803145e-06,
"loss": 1.2653324604034424,
"step": 508
},
{
"epoch": 0.4292929292929293,
"grad_norm": 6.048154354095459,
"learning_rate": 1.9581181996762834e-06,
"loss": 1.0118142366409302,
"step": 510
},
{
"epoch": 0.43097643097643096,
"grad_norm": 2.742072105407715,
"learning_rate": 1.9576130379711634e-06,
"loss": 1.170724630355835,
"step": 512
},
{
"epoch": 0.43265993265993263,
"grad_norm": 5.8300089836120605,
"learning_rate": 1.95710492142828e-06,
"loss": 1.1076912879943848,
"step": 514
},
{
"epoch": 0.43434343434343436,
"grad_norm": 5.84092903137207,
"learning_rate": 1.956593851798308e-06,
"loss": 1.0066879987716675,
"step": 516
},
{
"epoch": 0.43602693602693604,
"grad_norm": 2.712181568145752,
"learning_rate": 1.9560798308420974e-06,
"loss": 1.0203490257263184,
"step": 518
},
{
"epoch": 0.4377104377104377,
"grad_norm": 10.564250946044922,
"learning_rate": 1.955562860330667e-06,
"loss": 0.8946080207824707,
"step": 520
},
{
"epoch": 0.4393939393939394,
"grad_norm": 3.632702589035034,
"learning_rate": 1.9550429420451973e-06,
"loss": 0.7722439169883728,
"step": 522
},
{
"epoch": 0.44107744107744107,
"grad_norm": 5.241938591003418,
"learning_rate": 1.954520077777026e-06,
"loss": 1.09357750415802,
"step": 524
},
{
"epoch": 0.44276094276094274,
"grad_norm": 12.312281608581543,
"learning_rate": 1.9539942693276405e-06,
"loss": 0.7668850421905518,
"step": 526
},
{
"epoch": 0.4444444444444444,
"grad_norm": 3.640024423599243,
"learning_rate": 1.9534655185086717e-06,
"loss": 1.1953470706939697,
"step": 528
},
{
"epoch": 0.44612794612794615,
"grad_norm": 6.970150470733643,
"learning_rate": 1.9529338271418886e-06,
"loss": 0.7888380289077759,
"step": 530
},
{
"epoch": 0.4478114478114478,
"grad_norm": 5.498988151550293,
"learning_rate": 1.952399197059192e-06,
"loss": 0.8151825070381165,
"step": 532
},
{
"epoch": 0.4494949494949495,
"grad_norm": 4.296585559844971,
"learning_rate": 1.9518616301026077e-06,
"loss": 0.9414000511169434,
"step": 534
},
{
"epoch": 0.4511784511784512,
"grad_norm": 3.7714059352874756,
"learning_rate": 1.9513211281242795e-06,
"loss": 1.2206546068191528,
"step": 536
},
{
"epoch": 0.45286195286195285,
"grad_norm": 5.960073471069336,
"learning_rate": 1.9507776929864643e-06,
"loss": 1.045861840248108,
"step": 538
},
{
"epoch": 0.45454545454545453,
"grad_norm": 4.743275165557861,
"learning_rate": 1.950231326561525e-06,
"loss": 0.8734741806983948,
"step": 540
},
{
"epoch": 0.4562289562289562,
"grad_norm": 4.74852180480957,
"learning_rate": 1.9496820307319237e-06,
"loss": 1.0024454593658447,
"step": 542
},
{
"epoch": 0.45791245791245794,
"grad_norm": 3.7979114055633545,
"learning_rate": 1.9491298073902157e-06,
"loss": 1.1115365028381348,
"step": 544
},
{
"epoch": 0.4595959595959596,
"grad_norm": 11.823755264282227,
"learning_rate": 1.9485746584390426e-06,
"loss": 1.154505729675293,
"step": 546
},
{
"epoch": 0.4612794612794613,
"grad_norm": 4.7486090660095215,
"learning_rate": 1.948016585791127e-06,
"loss": 1.3286551237106323,
"step": 548
},
{
"epoch": 0.46296296296296297,
"grad_norm": 4.785913467407227,
"learning_rate": 1.9474555913692627e-06,
"loss": 0.8783373832702637,
"step": 550
},
{
"epoch": 0.46464646464646464,
"grad_norm": 23.855112075805664,
"learning_rate": 1.946891677106312e-06,
"loss": 0.8687731027603149,
"step": 552
},
{
"epoch": 0.4663299663299663,
"grad_norm": 4.286966323852539,
"learning_rate": 1.946324844945197e-06,
"loss": 1.031162977218628,
"step": 554
},
{
"epoch": 0.468013468013468,
"grad_norm": 2.7228028774261475,
"learning_rate": 1.9457550968388928e-06,
"loss": 0.7218068242073059,
"step": 556
},
{
"epoch": 0.4696969696969697,
"grad_norm": 4.717339038848877,
"learning_rate": 1.9451824347504213e-06,
"loss": 1.17518949508667,
"step": 558
},
{
"epoch": 0.4713804713804714,
"grad_norm": 13.350486755371094,
"learning_rate": 1.944606860652845e-06,
"loss": 0.4006010890007019,
"step": 560
},
{
"epoch": 0.4730639730639731,
"grad_norm": 3.6367268562316895,
"learning_rate": 1.944028376529258e-06,
"loss": 0.5863475799560547,
"step": 562
},
{
"epoch": 0.47474747474747475,
"grad_norm": 2.8083655834198,
"learning_rate": 1.943446984372782e-06,
"loss": 1.2751696109771729,
"step": 564
},
{
"epoch": 0.4764309764309764,
"grad_norm": 6.395586967468262,
"learning_rate": 1.942862686186557e-06,
"loss": 1.1098227500915527,
"step": 566
},
{
"epoch": 0.4781144781144781,
"grad_norm": 10.825981140136719,
"learning_rate": 1.9422754839837366e-06,
"loss": 0.4494704604148865,
"step": 568
},
{
"epoch": 0.4797979797979798,
"grad_norm": 32.269683837890625,
"learning_rate": 1.9416853797874797e-06,
"loss": 1.0807325839996338,
"step": 570
},
{
"epoch": 0.48148148148148145,
"grad_norm": 20.739370346069336,
"learning_rate": 1.941092375630943e-06,
"loss": 0.6750832796096802,
"step": 572
},
{
"epoch": 0.4831649831649832,
"grad_norm": 11.944796562194824,
"learning_rate": 1.9404964735572754e-06,
"loss": 0.9658522605895996,
"step": 574
},
{
"epoch": 0.48484848484848486,
"grad_norm": 8.797262191772461,
"learning_rate": 1.939897675619611e-06,
"loss": 0.8590230941772461,
"step": 576
},
{
"epoch": 0.48653198653198654,
"grad_norm": 9.839401245117188,
"learning_rate": 1.9392959838810597e-06,
"loss": 1.0677263736724854,
"step": 578
},
{
"epoch": 0.4882154882154882,
"grad_norm": 8.498374938964844,
"learning_rate": 1.9386914004147034e-06,
"loss": 0.7860367298126221,
"step": 580
},
{
"epoch": 0.4898989898989899,
"grad_norm": 6.413960933685303,
"learning_rate": 1.938083927303586e-06,
"loss": 1.3258328437805176,
"step": 582
},
{
"epoch": 0.49158249158249157,
"grad_norm": 2.855747699737549,
"learning_rate": 1.937473566640708e-06,
"loss": 0.9856802821159363,
"step": 584
},
{
"epoch": 0.49326599326599324,
"grad_norm": 5.174104690551758,
"learning_rate": 1.9368603205290196e-06,
"loss": 0.8397727012634277,
"step": 586
},
{
"epoch": 0.494949494949495,
"grad_norm": 3.8909213542938232,
"learning_rate": 1.9362441910814105e-06,
"loss": 0.6163880825042725,
"step": 588
},
{
"epoch": 0.49663299663299665,
"grad_norm": 4.697168350219727,
"learning_rate": 1.935625180420706e-06,
"loss": 1.087604284286499,
"step": 590
},
{
"epoch": 0.4983164983164983,
"grad_norm": 2.995621681213379,
"learning_rate": 1.935003290679659e-06,
"loss": 1.0904521942138672,
"step": 592
},
{
"epoch": 0.5,
"grad_norm": 13.57467269897461,
"learning_rate": 1.934378524000941e-06,
"loss": 0.8232730031013489,
"step": 594
},
{
"epoch": 0.5016835016835017,
"grad_norm": 6.692266464233398,
"learning_rate": 1.933750882537136e-06,
"loss": 0.9355677366256714,
"step": 596
},
{
"epoch": 0.5033670033670034,
"grad_norm": 2.513978958129883,
"learning_rate": 1.9331203684507333e-06,
"loss": 1.284334421157837,
"step": 598
},
{
"epoch": 0.5050505050505051,
"grad_norm": 23.219905853271484,
"learning_rate": 1.9324869839141184e-06,
"loss": 0.7689567804336548,
"step": 600
},
{
"epoch": 0.5067340067340067,
"grad_norm": 16.52220344543457,
"learning_rate": 1.9318507311095686e-06,
"loss": 1.0293747186660767,
"step": 602
},
{
"epoch": 0.5084175084175084,
"grad_norm": 10.209641456604004,
"learning_rate": 1.9312116122292414e-06,
"loss": 0.9961596727371216,
"step": 604
},
{
"epoch": 0.51010101010101,
"grad_norm": 3.1684632301330566,
"learning_rate": 1.9305696294751707e-06,
"loss": 1.0693247318267822,
"step": 606
},
{
"epoch": 0.5117845117845118,
"grad_norm": 15.678545951843262,
"learning_rate": 1.9299247850592575e-06,
"loss": 0.5298241376876831,
"step": 608
},
{
"epoch": 0.5134680134680135,
"grad_norm": 3.837263822555542,
"learning_rate": 1.9292770812032626e-06,
"loss": 0.9167294502258301,
"step": 610
},
{
"epoch": 0.5151515151515151,
"grad_norm": 8.804614067077637,
"learning_rate": 1.9286265201387966e-06,
"loss": 0.8463789224624634,
"step": 612
},
{
"epoch": 0.5168350168350169,
"grad_norm": 6.136633396148682,
"learning_rate": 1.9279731041073177e-06,
"loss": 0.6948338747024536,
"step": 614
},
{
"epoch": 0.5185185185185185,
"grad_norm": 5.291085720062256,
"learning_rate": 1.9273168353601185e-06,
"loss": 1.080240249633789,
"step": 616
},
{
"epoch": 0.5202020202020202,
"grad_norm": 5.781073093414307,
"learning_rate": 1.9266577161583207e-06,
"loss": 1.0078164339065552,
"step": 618
},
{
"epoch": 0.5218855218855218,
"grad_norm": 4.246747970581055,
"learning_rate": 1.925995748772868e-06,
"loss": 0.9573478102684021,
"step": 620
},
{
"epoch": 0.5235690235690236,
"grad_norm": 6.759246349334717,
"learning_rate": 1.925330935484516e-06,
"loss": 1.0398313999176025,
"step": 622
},
{
"epoch": 0.5252525252525253,
"grad_norm": 2.2948110103607178,
"learning_rate": 1.9246632785838263e-06,
"loss": 0.7390921711921692,
"step": 624
},
{
"epoch": 0.5269360269360269,
"grad_norm": 9.203880310058594,
"learning_rate": 1.9239927803711578e-06,
"loss": 0.9215421676635742,
"step": 626
},
{
"epoch": 0.5286195286195287,
"grad_norm": 18.581615447998047,
"learning_rate": 1.923319443156659e-06,
"loss": 0.8367900252342224,
"step": 628
},
{
"epoch": 0.5303030303030303,
"grad_norm": 16.1141357421875,
"learning_rate": 1.92264326926026e-06,
"loss": 0.7088955640792847,
"step": 630
},
{
"epoch": 0.531986531986532,
"grad_norm": 4.339905738830566,
"learning_rate": 1.9219642610116647e-06,
"loss": 1.1045582294464111,
"step": 632
},
{
"epoch": 0.5336700336700336,
"grad_norm": 5.655019760131836,
"learning_rate": 1.9212824207503415e-06,
"loss": 0.9011019468307495,
"step": 634
},
{
"epoch": 0.5353535353535354,
"grad_norm": 18.707368850708008,
"learning_rate": 1.920597750825517e-06,
"loss": 0.7971285581588745,
"step": 636
},
{
"epoch": 0.5370370370370371,
"grad_norm": 15.80146312713623,
"learning_rate": 1.919910253596168e-06,
"loss": 0.9591305255889893,
"step": 638
},
{
"epoch": 0.5387205387205387,
"grad_norm": 4.86085319519043,
"learning_rate": 1.919219931431011e-06,
"loss": 0.810368537902832,
"step": 640
},
{
"epoch": 0.5404040404040404,
"grad_norm": 6.4632792472839355,
"learning_rate": 1.918526786708497e-06,
"loss": 0.9356435537338257,
"step": 642
},
{
"epoch": 0.5420875420875421,
"grad_norm": 16.430055618286133,
"learning_rate": 1.9178308218168e-06,
"loss": 0.8751171231269836,
"step": 644
},
{
"epoch": 0.5437710437710438,
"grad_norm": 8.275667190551758,
"learning_rate": 1.9171320391538132e-06,
"loss": 0.8758902549743652,
"step": 646
},
{
"epoch": 0.5454545454545454,
"grad_norm": 3.1901955604553223,
"learning_rate": 1.9164304411271364e-06,
"loss": 0.9705331325531006,
"step": 648
},
{
"epoch": 0.5471380471380471,
"grad_norm": 16.389245986938477,
"learning_rate": 1.9157260301540697e-06,
"loss": 1.0938405990600586,
"step": 650
},
{
"epoch": 0.5488215488215489,
"grad_norm": 7.667538642883301,
"learning_rate": 1.9150188086616055e-06,
"loss": 1.0371794700622559,
"step": 652
},
{
"epoch": 0.5505050505050505,
"grad_norm": 19.274045944213867,
"learning_rate": 1.91430877908642e-06,
"loss": 1.0635974407196045,
"step": 654
},
{
"epoch": 0.5521885521885522,
"grad_norm": 11.602453231811523,
"learning_rate": 1.9135959438748626e-06,
"loss": 0.8951305747032166,
"step": 656
},
{
"epoch": 0.5538720538720538,
"grad_norm": 5.208285331726074,
"learning_rate": 1.9128803054829515e-06,
"loss": 0.7661327719688416,
"step": 658
},
{
"epoch": 0.5555555555555556,
"grad_norm": 13.546456336975098,
"learning_rate": 1.912161866376362e-06,
"loss": 0.7583224177360535,
"step": 660
},
{
"epoch": 0.5572390572390572,
"grad_norm": 3.013401508331299,
"learning_rate": 1.9114406290304186e-06,
"loss": 1.0516358613967896,
"step": 662
},
{
"epoch": 0.5589225589225589,
"grad_norm": 7.981349468231201,
"learning_rate": 1.910716595930088e-06,
"loss": 1.0741899013519287,
"step": 664
},
{
"epoch": 0.5606060606060606,
"grad_norm": 5.936778545379639,
"learning_rate": 1.9099897695699684e-06,
"loss": 0.49627187848091125,
"step": 666
},
{
"epoch": 0.5622895622895623,
"grad_norm": 13.402975082397461,
"learning_rate": 1.9092601524542828e-06,
"loss": 0.6627441644668579,
"step": 668
},
{
"epoch": 0.563973063973064,
"grad_norm": 44.2243537902832,
"learning_rate": 1.9085277470968692e-06,
"loss": 1.0360723733901978,
"step": 670
},
{
"epoch": 0.5656565656565656,
"grad_norm": 40.057151794433594,
"learning_rate": 1.907792556021171e-06,
"loss": 0.6004194021224976,
"step": 672
},
{
"epoch": 0.5673400673400674,
"grad_norm": 10.742400169372559,
"learning_rate": 1.9070545817602328e-06,
"loss": 0.5512696504592896,
"step": 674
},
{
"epoch": 0.569023569023569,
"grad_norm": 4.548379898071289,
"learning_rate": 1.9063138268566851e-06,
"loss": 0.6692613959312439,
"step": 676
},
{
"epoch": 0.5707070707070707,
"grad_norm": 5.4406023025512695,
"learning_rate": 1.9055702938627407e-06,
"loss": 1.1743131875991821,
"step": 678
},
{
"epoch": 0.5723905723905723,
"grad_norm": 11.110426902770996,
"learning_rate": 1.9048239853401833e-06,
"loss": 0.41852569580078125,
"step": 680
},
{
"epoch": 0.5740740740740741,
"grad_norm": 8.052200317382812,
"learning_rate": 1.9040749038603602e-06,
"loss": 1.0331244468688965,
"step": 682
},
{
"epoch": 0.5757575757575758,
"grad_norm": 9.13505744934082,
"learning_rate": 1.9033230520041719e-06,
"loss": 1.1170430183410645,
"step": 684
},
{
"epoch": 0.5774410774410774,
"grad_norm": 5.407991886138916,
"learning_rate": 1.9025684323620645e-06,
"loss": 1.0954296588897705,
"step": 686
},
{
"epoch": 0.5791245791245792,
"grad_norm": 4.380704879760742,
"learning_rate": 1.9018110475340203e-06,
"loss": 0.8225352168083191,
"step": 688
},
{
"epoch": 0.5808080808080808,
"grad_norm": 10.951150894165039,
"learning_rate": 1.9010509001295485e-06,
"loss": 0.7188082337379456,
"step": 690
},
{
"epoch": 0.5824915824915825,
"grad_norm": 3.9585494995117188,
"learning_rate": 1.9002879927676767e-06,
"loss": 0.8001824617385864,
"step": 692
},
{
"epoch": 0.5841750841750841,
"grad_norm": 3.551115036010742,
"learning_rate": 1.8995223280769424e-06,
"loss": 0.9616529941558838,
"step": 694
},
{
"epoch": 0.5858585858585859,
"grad_norm": 6.161308765411377,
"learning_rate": 1.8987539086953819e-06,
"loss": 0.8874322772026062,
"step": 696
},
{
"epoch": 0.5875420875420876,
"grad_norm": 10.766314506530762,
"learning_rate": 1.8979827372705233e-06,
"loss": 0.8692164421081543,
"step": 698
},
{
"epoch": 0.5892255892255892,
"grad_norm": 7.8653035163879395,
"learning_rate": 1.8972088164593771e-06,
"loss": 0.8069002032279968,
"step": 700
},
{
"epoch": 0.5909090909090909,
"grad_norm": 7.134982585906982,
"learning_rate": 1.896432148928426e-06,
"loss": 0.9260559678077698,
"step": 702
},
{
"epoch": 0.5925925925925926,
"grad_norm": 6.079588890075684,
"learning_rate": 1.895652737353616e-06,
"loss": 1.0575344562530518,
"step": 704
},
{
"epoch": 0.5942760942760943,
"grad_norm": 28.564146041870117,
"learning_rate": 1.8948705844203482e-06,
"loss": 0.9762513041496277,
"step": 706
},
{
"epoch": 0.5959595959595959,
"grad_norm": 9.878491401672363,
"learning_rate": 1.8940856928234689e-06,
"loss": 0.7743998765945435,
"step": 708
},
{
"epoch": 0.5976430976430976,
"grad_norm": 6.6208720207214355,
"learning_rate": 1.8932980652672597e-06,
"loss": 0.8060773015022278,
"step": 710
},
{
"epoch": 0.5993265993265994,
"grad_norm": 6.425124168395996,
"learning_rate": 1.8925077044654288e-06,
"loss": 1.1068170070648193,
"step": 712
},
{
"epoch": 0.601010101010101,
"grad_norm": 2.99337100982666,
"learning_rate": 1.8917146131411015e-06,
"loss": 1.0512995719909668,
"step": 714
},
{
"epoch": 0.6026936026936027,
"grad_norm": 3.8051576614379883,
"learning_rate": 1.8909187940268115e-06,
"loss": 0.7426064610481262,
"step": 716
},
{
"epoch": 0.6043771043771043,
"grad_norm": 6.566201686859131,
"learning_rate": 1.89012024986449e-06,
"loss": 0.894334614276886,
"step": 718
},
{
"epoch": 0.6060606060606061,
"grad_norm": 8.337869644165039,
"learning_rate": 1.8893189834054586e-06,
"loss": 0.9385843276977539,
"step": 720
},
{
"epoch": 0.6077441077441077,
"grad_norm": 9.33846664428711,
"learning_rate": 1.8885149974104164e-06,
"loss": 0.9482979774475098,
"step": 722
},
{
"epoch": 0.6094276094276094,
"grad_norm": 3.8621480464935303,
"learning_rate": 1.8877082946494339e-06,
"loss": 0.8786056041717529,
"step": 724
},
{
"epoch": 0.6111111111111112,
"grad_norm": 19.156356811523438,
"learning_rate": 1.8868988779019414e-06,
"loss": 0.9990079402923584,
"step": 726
},
{
"epoch": 0.6127946127946128,
"grad_norm": 27.229507446289062,
"learning_rate": 1.8860867499567203e-06,
"loss": 0.908332347869873,
"step": 728
},
{
"epoch": 0.6144781144781145,
"grad_norm": 11.623302459716797,
"learning_rate": 1.885271913611893e-06,
"loss": 1.1277103424072266,
"step": 730
},
{
"epoch": 0.6161616161616161,
"grad_norm": 3.196768283843994,
"learning_rate": 1.8844543716749134e-06,
"loss": 1.0839519500732422,
"step": 732
},
{
"epoch": 0.6178451178451179,
"grad_norm": 3.471727132797241,
"learning_rate": 1.8836341269625578e-06,
"loss": 0.7715842723846436,
"step": 734
},
{
"epoch": 0.6195286195286195,
"grad_norm": 8.554580688476562,
"learning_rate": 1.882811182300914e-06,
"loss": 0.7822331190109253,
"step": 736
},
{
"epoch": 0.6212121212121212,
"grad_norm": 22.184911727905273,
"learning_rate": 1.881985540525373e-06,
"loss": 0.6754369139671326,
"step": 738
},
{
"epoch": 0.622895622895623,
"grad_norm": 5.2334442138671875,
"learning_rate": 1.8811572044806178e-06,
"loss": 1.2211134433746338,
"step": 740
},
{
"epoch": 0.6245791245791246,
"grad_norm": 5.914177417755127,
"learning_rate": 1.8803261770206149e-06,
"loss": 0.9921356439590454,
"step": 742
},
{
"epoch": 0.6262626262626263,
"grad_norm": 6.881519794464111,
"learning_rate": 1.8794924610086031e-06,
"loss": 1.1868412494659424,
"step": 744
},
{
"epoch": 0.6279461279461279,
"grad_norm": 3.5606613159179688,
"learning_rate": 1.8786560593170854e-06,
"loss": 0.9340991377830505,
"step": 746
},
{
"epoch": 0.6296296296296297,
"grad_norm": 81.61597442626953,
"learning_rate": 1.877816974827817e-06,
"loss": 1.1839344501495361,
"step": 748
},
{
"epoch": 0.6313131313131313,
"grad_norm": 2.52506685256958,
"learning_rate": 1.8769752104317973e-06,
"loss": 1.280696153640747,
"step": 750
},
{
"epoch": 0.632996632996633,
"grad_norm": 6.496135711669922,
"learning_rate": 1.8761307690292589e-06,
"loss": 0.7088183164596558,
"step": 752
},
{
"epoch": 0.6346801346801347,
"grad_norm": 20.767459869384766,
"learning_rate": 1.875283653529658e-06,
"loss": 0.9602365493774414,
"step": 754
},
{
"epoch": 0.6363636363636364,
"grad_norm": 3.436274290084839,
"learning_rate": 1.874433866851663e-06,
"loss": 0.7587154507637024,
"step": 756
},
{
"epoch": 0.6380471380471381,
"grad_norm": 6.604635238647461,
"learning_rate": 1.8735814119231475e-06,
"loss": 0.8278650641441345,
"step": 758
},
{
"epoch": 0.6397306397306397,
"grad_norm": 17.961626052856445,
"learning_rate": 1.872726291681177e-06,
"loss": 0.6165801286697388,
"step": 760
},
{
"epoch": 0.6414141414141414,
"grad_norm": 4.451328754425049,
"learning_rate": 1.8718685090720004e-06,
"loss": 0.4456964433193207,
"step": 762
},
{
"epoch": 0.6430976430976431,
"grad_norm": 4.893067359924316,
"learning_rate": 1.8710080670510402e-06,
"loss": 0.9912799000740051,
"step": 764
},
{
"epoch": 0.6447811447811448,
"grad_norm": 9.001324653625488,
"learning_rate": 1.8701449685828806e-06,
"loss": 1.0763907432556152,
"step": 766
},
{
"epoch": 0.6464646464646465,
"grad_norm": 10.884461402893066,
"learning_rate": 1.8692792166412595e-06,
"loss": 0.761760950088501,
"step": 768
},
{
"epoch": 0.6481481481481481,
"grad_norm": 7.378164768218994,
"learning_rate": 1.8684108142090562e-06,
"loss": 0.7692549824714661,
"step": 770
},
{
"epoch": 0.6498316498316499,
"grad_norm": 3.8816888332366943,
"learning_rate": 1.8675397642782827e-06,
"loss": 0.7803175449371338,
"step": 772
},
{
"epoch": 0.6515151515151515,
"grad_norm": 28.640594482421875,
"learning_rate": 1.8666660698500726e-06,
"loss": 0.7042616009712219,
"step": 774
},
{
"epoch": 0.6531986531986532,
"grad_norm": 3.964298725128174,
"learning_rate": 1.8657897339346707e-06,
"loss": 0.9174256920814514,
"step": 776
},
{
"epoch": 0.6548821548821548,
"grad_norm": 2.9591541290283203,
"learning_rate": 1.8649107595514226e-06,
"loss": 1.040077805519104,
"step": 778
},
{
"epoch": 0.6565656565656566,
"grad_norm": 15.031349182128906,
"learning_rate": 1.8640291497287654e-06,
"loss": 0.9099994897842407,
"step": 780
},
{
"epoch": 0.6582491582491582,
"grad_norm": 15.89492416381836,
"learning_rate": 1.8631449075042156e-06,
"loss": 1.0717145204544067,
"step": 782
},
{
"epoch": 0.6599326599326599,
"grad_norm": 5.403634071350098,
"learning_rate": 1.8622580359243601e-06,
"loss": 0.984376847743988,
"step": 784
},
{
"epoch": 0.6616161616161617,
"grad_norm": 12.673766136169434,
"learning_rate": 1.8613685380448441e-06,
"loss": 1.05198073387146,
"step": 786
},
{
"epoch": 0.6632996632996633,
"grad_norm": 14.643843650817871,
"learning_rate": 1.8604764169303626e-06,
"loss": 0.8343431949615479,
"step": 788
},
{
"epoch": 0.664983164983165,
"grad_norm": 3.391157627105713,
"learning_rate": 1.8595816756546477e-06,
"loss": 0.935477614402771,
"step": 790
},
{
"epoch": 0.6666666666666666,
"grad_norm": 10.033073425292969,
"learning_rate": 1.8586843173004598e-06,
"loss": 0.9675720930099487,
"step": 792
},
{
"epoch": 0.6683501683501684,
"grad_norm": 3.087076187133789,
"learning_rate": 1.8577843449595763e-06,
"loss": 0.6215054392814636,
"step": 794
},
{
"epoch": 0.67003367003367,
"grad_norm": 2.3780627250671387,
"learning_rate": 1.85688176173278e-06,
"loss": 0.9712251424789429,
"step": 796
},
{
"epoch": 0.6717171717171717,
"grad_norm": 5.441427230834961,
"learning_rate": 1.8559765707298502e-06,
"loss": 0.993064820766449,
"step": 798
},
{
"epoch": 0.6734006734006734,
"grad_norm": 3.6938350200653076,
"learning_rate": 1.8550687750695509e-06,
"loss": 0.6260876655578613,
"step": 800
},
{
"epoch": 0.6750841750841751,
"grad_norm": 2.9936280250549316,
"learning_rate": 1.8541583778796196e-06,
"loss": 0.9794340133666992,
"step": 802
},
{
"epoch": 0.6767676767676768,
"grad_norm": 11.732361793518066,
"learning_rate": 1.8532453822967584e-06,
"loss": 0.7467688322067261,
"step": 804
},
{
"epoch": 0.6784511784511784,
"grad_norm": 14.32625675201416,
"learning_rate": 1.8523297914666207e-06,
"loss": 0.6042066812515259,
"step": 806
},
{
"epoch": 0.6801346801346801,
"grad_norm": 8.490279197692871,
"learning_rate": 1.8514116085438027e-06,
"loss": 0.9197585582733154,
"step": 808
},
{
"epoch": 0.6818181818181818,
"grad_norm": 8.056469917297363,
"learning_rate": 1.8504908366918302e-06,
"loss": 0.9674583077430725,
"step": 810
},
{
"epoch": 0.6835016835016835,
"grad_norm": 3.1704888343811035,
"learning_rate": 1.84956747908315e-06,
"loss": 1.158250331878662,
"step": 812
},
{
"epoch": 0.6851851851851852,
"grad_norm": 21.666156768798828,
"learning_rate": 1.8486415388991173e-06,
"loss": 0.5964489579200745,
"step": 814
},
{
"epoch": 0.6868686868686869,
"grad_norm": 2.8897705078125,
"learning_rate": 1.8477130193299863e-06,
"loss": 1.0845026969909668,
"step": 816
},
{
"epoch": 0.6885521885521886,
"grad_norm": 4.6460371017456055,
"learning_rate": 1.846781923574897e-06,
"loss": 0.7914435863494873,
"step": 818
},
{
"epoch": 0.6902356902356902,
"grad_norm": 15.396445274353027,
"learning_rate": 1.8458482548418661e-06,
"loss": 0.6972349882125854,
"step": 820
},
{
"epoch": 0.6919191919191919,
"grad_norm": 12.507894515991211,
"learning_rate": 1.8449120163477753e-06,
"loss": 0.7580819129943848,
"step": 822
},
{
"epoch": 0.6936026936026936,
"grad_norm": 3.151318073272705,
"learning_rate": 1.8439732113183607e-06,
"loss": 0.8469318151473999,
"step": 824
},
{
"epoch": 0.6952861952861953,
"grad_norm": 7.642462730407715,
"learning_rate": 1.8430318429881997e-06,
"loss": 0.8898569941520691,
"step": 826
},
{
"epoch": 0.696969696969697,
"grad_norm": 3.9848973751068115,
"learning_rate": 1.8420879146007025e-06,
"loss": 0.7908803224563599,
"step": 828
},
{
"epoch": 0.6986531986531986,
"grad_norm": 3.608306884765625,
"learning_rate": 1.8411414294081003e-06,
"loss": 1.208510398864746,
"step": 830
},
{
"epoch": 0.7003367003367004,
"grad_norm": 5.017977237701416,
"learning_rate": 1.8401923906714321e-06,
"loss": 0.8827351331710815,
"step": 832
},
{
"epoch": 0.702020202020202,
"grad_norm": 6.648691177368164,
"learning_rate": 1.8392408016605358e-06,
"loss": 0.6782714128494263,
"step": 834
},
{
"epoch": 0.7037037037037037,
"grad_norm": 2.3364169597625732,
"learning_rate": 1.8382866656540361e-06,
"loss": 0.8600856065750122,
"step": 836
},
{
"epoch": 0.7053872053872053,
"grad_norm": 8.938956260681152,
"learning_rate": 1.8373299859393326e-06,
"loss": 0.598077654838562,
"step": 838
},
{
"epoch": 0.7070707070707071,
"grad_norm": 2.977544069290161,
"learning_rate": 1.8363707658125905e-06,
"loss": 1.239319920539856,
"step": 840
},
{
"epoch": 0.7087542087542088,
"grad_norm": 8.318215370178223,
"learning_rate": 1.8354090085787252e-06,
"loss": 1.1046662330627441,
"step": 842
},
{
"epoch": 0.7104377104377104,
"grad_norm": 11.01289176940918,
"learning_rate": 1.8344447175513965e-06,
"loss": 1.0052223205566406,
"step": 844
},
{
"epoch": 0.7121212121212122,
"grad_norm": 4.2134199142456055,
"learning_rate": 1.8334778960529916e-06,
"loss": 0.8582904934883118,
"step": 846
},
{
"epoch": 0.7138047138047138,
"grad_norm": 12.371885299682617,
"learning_rate": 1.8325085474146178e-06,
"loss": 0.7332583665847778,
"step": 848
},
{
"epoch": 0.7154882154882155,
"grad_norm": 17.55687713623047,
"learning_rate": 1.8315366749760892e-06,
"loss": 0.8967425227165222,
"step": 850
},
{
"epoch": 0.7171717171717171,
"grad_norm": 8.929709434509277,
"learning_rate": 1.8305622820859153e-06,
"loss": 0.7431824207305908,
"step": 852
},
{
"epoch": 0.7188552188552189,
"grad_norm": 8.052350044250488,
"learning_rate": 1.829585372101289e-06,
"loss": 0.74913489818573,
"step": 854
},
{
"epoch": 0.7205387205387206,
"grad_norm": 7.0438432693481445,
"learning_rate": 1.828605948388077e-06,
"loss": 1.1222918033599854,
"step": 856
},
{
"epoch": 0.7222222222222222,
"grad_norm": 4.850925445556641,
"learning_rate": 1.8276240143208054e-06,
"loss": 0.7487032413482666,
"step": 858
},
{
"epoch": 0.7239057239057239,
"grad_norm": 4.052372932434082,
"learning_rate": 1.8266395732826508e-06,
"loss": 0.9676373600959778,
"step": 860
},
{
"epoch": 0.7255892255892256,
"grad_norm": 3.9550697803497314,
"learning_rate": 1.8256526286654264e-06,
"loss": 1.170372724533081,
"step": 862
},
{
"epoch": 0.7272727272727273,
"grad_norm": 5.656938076019287,
"learning_rate": 1.824663183869572e-06,
"loss": 0.9866449236869812,
"step": 864
},
{
"epoch": 0.7289562289562289,
"grad_norm": 38.397705078125,
"learning_rate": 1.8236712423041408e-06,
"loss": 0.9790170192718506,
"step": 866
},
{
"epoch": 0.7306397306397306,
"grad_norm": 5.537583827972412,
"learning_rate": 1.822676807386789e-06,
"loss": 1.2290745973587036,
"step": 868
},
{
"epoch": 0.7323232323232324,
"grad_norm": 5.6285080909729,
"learning_rate": 1.8216798825437635e-06,
"loss": 1.1579557657241821,
"step": 870
},
{
"epoch": 0.734006734006734,
"grad_norm": 6.440390586853027,
"learning_rate": 1.8206804712098903e-06,
"loss": 1.0755215883255005,
"step": 872
},
{
"epoch": 0.7356902356902357,
"grad_norm": 3.566018581390381,
"learning_rate": 1.819678576828561e-06,
"loss": 1.0724159479141235,
"step": 874
},
{
"epoch": 0.7373737373737373,
"grad_norm": 76.55033111572266,
"learning_rate": 1.8186742028517237e-06,
"loss": 0.8843256235122681,
"step": 876
},
{
"epoch": 0.7390572390572391,
"grad_norm": 12.517910957336426,
"learning_rate": 1.8176673527398694e-06,
"loss": 0.6147758960723877,
"step": 878
},
{
"epoch": 0.7407407407407407,
"grad_norm": 16.583293914794922,
"learning_rate": 1.8166580299620202e-06,
"loss": 0.6138923764228821,
"step": 880
},
{
"epoch": 0.7424242424242424,
"grad_norm": 2.747283935546875,
"learning_rate": 1.815646237995718e-06,
"loss": 1.1428195238113403,
"step": 882
},
{
"epoch": 0.7441077441077442,
"grad_norm": 92.89835357666016,
"learning_rate": 1.814631980327012e-06,
"loss": 1.0840024948120117,
"step": 884
},
{
"epoch": 0.7457912457912458,
"grad_norm": 4.928184509277344,
"learning_rate": 1.813615260450446e-06,
"loss": 0.646350622177124,
"step": 886
},
{
"epoch": 0.7474747474747475,
"grad_norm": 2.7117934226989746,
"learning_rate": 1.8125960818690485e-06,
"loss": 0.991912841796875,
"step": 888
},
{
"epoch": 0.7491582491582491,
"grad_norm": 17.214120864868164,
"learning_rate": 1.811574448094318e-06,
"loss": 0.8976044058799744,
"step": 890
},
{
"epoch": 0.7508417508417509,
"grad_norm": 27.415754318237305,
"learning_rate": 1.8105503626462129e-06,
"loss": 0.9429522752761841,
"step": 892
},
{
"epoch": 0.7525252525252525,
"grad_norm": 7.359311580657959,
"learning_rate": 1.8095238290531385e-06,
"loss": 0.7071723937988281,
"step": 894
},
{
"epoch": 0.7542087542087542,
"grad_norm": 8.892601013183594,
"learning_rate": 1.8084948508519346e-06,
"loss": 0.8216047286987305,
"step": 896
},
{
"epoch": 0.7558922558922558,
"grad_norm": 62.057533264160156,
"learning_rate": 1.8074634315878644e-06,
"loss": 0.6230831146240234,
"step": 898
},
{
"epoch": 0.7575757575757576,
"grad_norm": 2.9089205265045166,
"learning_rate": 1.8064295748146014e-06,
"loss": 0.8760740160942078,
"step": 900
},
{
"epoch": 0.7592592592592593,
"grad_norm": 14.121993064880371,
"learning_rate": 1.8053932840942175e-06,
"loss": 0.6401835680007935,
"step": 902
},
{
"epoch": 0.7609427609427609,
"grad_norm": 5.602692604064941,
"learning_rate": 1.8043545629971689e-06,
"loss": 1.1890406608581543,
"step": 904
},
{
"epoch": 0.7626262626262627,
"grad_norm": 3.653724431991577,
"learning_rate": 1.8033134151022881e-06,
"loss": 0.8872392177581787,
"step": 906
},
{
"epoch": 0.7643097643097643,
"grad_norm": 5.650278568267822,
"learning_rate": 1.8022698439967673e-06,
"loss": 0.8761744499206543,
"step": 908
},
{
"epoch": 0.765993265993266,
"grad_norm": 3.7810301780700684,
"learning_rate": 1.8012238532761476e-06,
"loss": 0.8327740430831909,
"step": 910
},
{
"epoch": 0.7676767676767676,
"grad_norm": 16.808286666870117,
"learning_rate": 1.8001754465443078e-06,
"loss": 0.9591882228851318,
"step": 912
},
{
"epoch": 0.7693602693602694,
"grad_norm": 8.369492530822754,
"learning_rate": 1.79912462741345e-06,
"loss": 0.8368163704872131,
"step": 914
},
{
"epoch": 0.7710437710437711,
"grad_norm": 8.313328742980957,
"learning_rate": 1.798071399504088e-06,
"loss": 0.9555743336677551,
"step": 916
},
{
"epoch": 0.7727272727272727,
"grad_norm": 4.798566818237305,
"learning_rate": 1.7970157664450357e-06,
"loss": 0.6112362146377563,
"step": 918
},
{
"epoch": 0.7744107744107744,
"grad_norm": 18.712345123291016,
"learning_rate": 1.7959577318733925e-06,
"loss": 0.5020445585250854,
"step": 920
},
{
"epoch": 0.7760942760942761,
"grad_norm": 2.20595383644104,
"learning_rate": 1.7948972994345328e-06,
"loss": 0.6102715134620667,
"step": 922
},
{
"epoch": 0.7777777777777778,
"grad_norm": 6.520366191864014,
"learning_rate": 1.7938344727820928e-06,
"loss": 0.9018456935882568,
"step": 924
},
{
"epoch": 0.7794612794612794,
"grad_norm": 12.06176471710205,
"learning_rate": 1.7927692555779577e-06,
"loss": 1.130429744720459,
"step": 926
},
{
"epoch": 0.7811447811447811,
"grad_norm": 4.479389190673828,
"learning_rate": 1.791701651492248e-06,
"loss": 0.46166184544563293,
"step": 928
},
{
"epoch": 0.7828282828282829,
"grad_norm": 9.225821495056152,
"learning_rate": 1.7906316642033099e-06,
"loss": 1.3147855997085571,
"step": 930
},
{
"epoch": 0.7845117845117845,
"grad_norm": 13.229998588562012,
"learning_rate": 1.7895592973976998e-06,
"loss": 0.8350358605384827,
"step": 932
},
{
"epoch": 0.7861952861952862,
"grad_norm": 2.252268075942993,
"learning_rate": 1.7884845547701721e-06,
"loss": 0.991974949836731,
"step": 934
},
{
"epoch": 0.7878787878787878,
"grad_norm": 2.7008936405181885,
"learning_rate": 1.7874074400236677e-06,
"loss": 0.8550293445587158,
"step": 936
},
{
"epoch": 0.7895622895622896,
"grad_norm": 2.743255376815796,
"learning_rate": 1.7863279568692999e-06,
"loss": 0.8677815198898315,
"step": 938
},
{
"epoch": 0.7912457912457912,
"grad_norm": 5.70646858215332,
"learning_rate": 1.7852461090263422e-06,
"loss": 0.9757652282714844,
"step": 940
},
{
"epoch": 0.7929292929292929,
"grad_norm": 4.7707200050354,
"learning_rate": 1.7841619002222164e-06,
"loss": 0.4027637541294098,
"step": 942
},
{
"epoch": 0.7946127946127947,
"grad_norm": 2.5659232139587402,
"learning_rate": 1.7830753341924768e-06,
"loss": 0.8958191275596619,
"step": 944
},
{
"epoch": 0.7962962962962963,
"grad_norm": 5.1869049072265625,
"learning_rate": 1.781986414680802e-06,
"loss": 0.8641246557235718,
"step": 946
},
{
"epoch": 0.797979797979798,
"grad_norm": 2.7495296001434326,
"learning_rate": 1.7808951454389761e-06,
"loss": 1.0250309705734253,
"step": 948
},
{
"epoch": 0.7996632996632996,
"grad_norm": 8.12884521484375,
"learning_rate": 1.7798015302268826e-06,
"loss": 0.8447544574737549,
"step": 950
},
{
"epoch": 0.8013468013468014,
"grad_norm": 9.825166702270508,
"learning_rate": 1.7787055728124853e-06,
"loss": 0.44982272386550903,
"step": 952
},
{
"epoch": 0.803030303030303,
"grad_norm": 2.7511558532714844,
"learning_rate": 1.777607276971818e-06,
"loss": 0.934439480304718,
"step": 954
},
{
"epoch": 0.8047138047138047,
"grad_norm": 10.230318069458008,
"learning_rate": 1.7765066464889729e-06,
"loss": 0.9457552433013916,
"step": 956
},
{
"epoch": 0.8063973063973064,
"grad_norm": 11.444622039794922,
"learning_rate": 1.775403685156085e-06,
"loss": 1.083388090133667,
"step": 958
},
{
"epoch": 0.8080808080808081,
"grad_norm": 6.961023330688477,
"learning_rate": 1.77429839677332e-06,
"loss": 0.6390881538391113,
"step": 960
},
{
"epoch": 0.8097643097643098,
"grad_norm": 3.448756217956543,
"learning_rate": 1.773190785148861e-06,
"loss": 0.7549522519111633,
"step": 962
},
{
"epoch": 0.8114478114478114,
"grad_norm": 9.252376556396484,
"learning_rate": 1.7720808540988965e-06,
"loss": 0.6879374980926514,
"step": 964
},
{
"epoch": 0.8131313131313131,
"grad_norm": 2.4772350788116455,
"learning_rate": 1.770968607447606e-06,
"loss": 0.9675562977790833,
"step": 966
},
{
"epoch": 0.8148148148148148,
"grad_norm": 4.749292850494385,
"learning_rate": 1.7698540490271475e-06,
"loss": 1.174008846282959,
"step": 968
},
{
"epoch": 0.8164983164983165,
"grad_norm": 2.8017964363098145,
"learning_rate": 1.7687371826776432e-06,
"loss": 0.9735618829727173,
"step": 970
},
{
"epoch": 0.8181818181818182,
"grad_norm": 26.424652099609375,
"learning_rate": 1.7676180122471677e-06,
"loss": 0.9349749088287354,
"step": 972
},
{
"epoch": 0.8198653198653199,
"grad_norm": 3.5407838821411133,
"learning_rate": 1.7664965415917342e-06,
"loss": 0.7211604714393616,
"step": 974
},
{
"epoch": 0.8215488215488216,
"grad_norm": 4.120766639709473,
"learning_rate": 1.765372774575281e-06,
"loss": 0.9185746908187866,
"step": 976
},
{
"epoch": 0.8232323232323232,
"grad_norm": 2.634417772293091,
"learning_rate": 1.764246715069658e-06,
"loss": 1.179499626159668,
"step": 978
},
{
"epoch": 0.8249158249158249,
"grad_norm": 4.83583927154541,
"learning_rate": 1.7631183669546146e-06,
"loss": 1.140142798423767,
"step": 980
},
{
"epoch": 0.8265993265993266,
"grad_norm": 4.506636142730713,
"learning_rate": 1.761987734117784e-06,
"loss": 1.0069242715835571,
"step": 982
},
{
"epoch": 0.8282828282828283,
"grad_norm": 4.123355388641357,
"learning_rate": 1.7608548204546724e-06,
"loss": 1.0207629203796387,
"step": 984
},
{
"epoch": 0.82996632996633,
"grad_norm": 16.72430419921875,
"learning_rate": 1.7597196298686446e-06,
"loss": 0.9050367474555969,
"step": 986
},
{
"epoch": 0.8316498316498316,
"grad_norm": 19.312665939331055,
"learning_rate": 1.7585821662709088e-06,
"loss": 0.8223767280578613,
"step": 988
},
{
"epoch": 0.8333333333333334,
"grad_norm": 13.517312049865723,
"learning_rate": 1.7574424335805066e-06,
"loss": 0.8045912384986877,
"step": 990
},
{
"epoch": 0.835016835016835,
"grad_norm": 10.205414772033691,
"learning_rate": 1.7563004357242962e-06,
"loss": 0.6719659566879272,
"step": 992
},
{
"epoch": 0.8367003367003367,
"grad_norm": 2.9161360263824463,
"learning_rate": 1.755156176636941e-06,
"loss": 0.9085012674331665,
"step": 994
},
{
"epoch": 0.8383838383838383,
"grad_norm": 6.5189714431762695,
"learning_rate": 1.7540096602608946e-06,
"loss": 0.6452804803848267,
"step": 996
},
{
"epoch": 0.8400673400673401,
"grad_norm": 12.399802207946777,
"learning_rate": 1.7528608905463881e-06,
"loss": 0.8944587707519531,
"step": 998
},
{
"epoch": 0.8417508417508418,
"grad_norm": 2.180464029312134,
"learning_rate": 1.7517098714514175e-06,
"loss": 1.0595688819885254,
"step": 1000
},
{
"epoch": 0.8434343434343434,
"grad_norm": 2.6750636100769043,
"learning_rate": 1.7505566069417272e-06,
"loss": 0.7289663553237915,
"step": 1002
},
{
"epoch": 0.8451178451178452,
"grad_norm": 3.7006335258483887,
"learning_rate": 1.749401100990799e-06,
"loss": 0.6673641204833984,
"step": 1004
},
{
"epoch": 0.8468013468013468,
"grad_norm": 5.355027675628662,
"learning_rate": 1.748243357579837e-06,
"loss": 1.0645616054534912,
"step": 1006
},
{
"epoch": 0.8484848484848485,
"grad_norm": 3.5920186042785645,
"learning_rate": 1.747083380697754e-06,
"loss": 0.5822446346282959,
"step": 1008
},
{
"epoch": 0.8501683501683501,
"grad_norm": 11.027798652648926,
"learning_rate": 1.7459211743411589e-06,
"loss": 0.9186769723892212,
"step": 1010
},
{
"epoch": 0.8518518518518519,
"grad_norm": 12.822773933410645,
"learning_rate": 1.7447567425143413e-06,
"loss": 0.8671125769615173,
"step": 1012
},
{
"epoch": 0.8535353535353535,
"grad_norm": 6.343443393707275,
"learning_rate": 1.7435900892292593e-06,
"loss": 0.7743659019470215,
"step": 1014
},
{
"epoch": 0.8552188552188552,
"grad_norm": 19.62537384033203,
"learning_rate": 1.7424212185055236e-06,
"loss": 0.627282977104187,
"step": 1016
},
{
"epoch": 0.8569023569023569,
"grad_norm": 11.67722225189209,
"learning_rate": 1.7412501343703858e-06,
"loss": 0.9576413631439209,
"step": 1018
},
{
"epoch": 0.8585858585858586,
"grad_norm": 4.532960414886475,
"learning_rate": 1.740076840858724e-06,
"loss": 1.2340772151947021,
"step": 1020
},
{
"epoch": 0.8602693602693603,
"grad_norm": 6.776996612548828,
"learning_rate": 1.7389013420130278e-06,
"loss": 0.8961556553840637,
"step": 1022
},
{
"epoch": 0.8619528619528619,
"grad_norm": 5.019154071807861,
"learning_rate": 1.7377236418833855e-06,
"loss": 0.9290032386779785,
"step": 1024
},
{
"epoch": 0.8636363636363636,
"grad_norm": 6.809414863586426,
"learning_rate": 1.736543744527469e-06,
"loss": 0.8829033374786377,
"step": 1026
},
{
"epoch": 0.8653198653198653,
"grad_norm": 7.587875843048096,
"learning_rate": 1.7353616540105214e-06,
"loss": 0.950920581817627,
"step": 1028
},
{
"epoch": 0.867003367003367,
"grad_norm": 2.2208216190338135,
"learning_rate": 1.7341773744053423e-06,
"loss": 0.621329128742218,
"step": 1030
},
{
"epoch": 0.8686868686868687,
"grad_norm": 2.647271156311035,
"learning_rate": 1.7329909097922726e-06,
"loss": 0.8295049667358398,
"step": 1032
},
{
"epoch": 0.8703703703703703,
"grad_norm": 4.676586151123047,
"learning_rate": 1.7318022642591826e-06,
"loss": 0.9272868633270264,
"step": 1034
},
{
"epoch": 0.8720538720538721,
"grad_norm": 2.2150022983551025,
"learning_rate": 1.730611441901456e-06,
"loss": 0.9140334129333496,
"step": 1036
},
{
"epoch": 0.8737373737373737,
"grad_norm": 6.269838809967041,
"learning_rate": 1.7294184468219768e-06,
"loss": 1.0908087491989136,
"step": 1038
},
{
"epoch": 0.8754208754208754,
"grad_norm": 7.025053024291992,
"learning_rate": 1.728223283131116e-06,
"loss": 1.0729374885559082,
"step": 1040
},
{
"epoch": 0.877104377104377,
"grad_norm": 21.954816818237305,
"learning_rate": 1.727025954946714e-06,
"loss": 0.9535812139511108,
"step": 1042
},
{
"epoch": 0.8787878787878788,
"grad_norm": 11.953475952148438,
"learning_rate": 1.7258264663940706e-06,
"loss": 1.0482563972473145,
"step": 1044
},
{
"epoch": 0.8804713804713805,
"grad_norm": 2.643186092376709,
"learning_rate": 1.724624821605929e-06,
"loss": 1.0523911714553833,
"step": 1046
},
{
"epoch": 0.8821548821548821,
"grad_norm": 21.0428409576416,
"learning_rate": 1.7234210247224608e-06,
"loss": 0.9557990431785583,
"step": 1048
},
{
"epoch": 0.8838383838383839,
"grad_norm": 13.664984703063965,
"learning_rate": 1.7222150798912527e-06,
"loss": 0.7585754990577698,
"step": 1050
},
{
"epoch": 0.8855218855218855,
"grad_norm": 7.943265438079834,
"learning_rate": 1.7210069912672924e-06,
"loss": 0.9970930218696594,
"step": 1052
},
{
"epoch": 0.8872053872053872,
"grad_norm": 13.632953643798828,
"learning_rate": 1.7197967630129533e-06,
"loss": 0.5471928715705872,
"step": 1054
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.960538625717163,
"learning_rate": 1.7185843992979805e-06,
"loss": 0.9481421113014221,
"step": 1056
},
{
"epoch": 0.8905723905723906,
"grad_norm": 2.4345412254333496,
"learning_rate": 1.7173699042994778e-06,
"loss": 0.9089041948318481,
"step": 1058
},
{
"epoch": 0.8922558922558923,
"grad_norm": 8.801026344299316,
"learning_rate": 1.716153282201891e-06,
"loss": 0.958892822265625,
"step": 1060
},
{
"epoch": 0.8939393939393939,
"grad_norm": 6.052116870880127,
"learning_rate": 1.7149345371969958e-06,
"loss": 0.8855940699577332,
"step": 1062
},
{
"epoch": 0.8956228956228957,
"grad_norm": 29.812705993652344,
"learning_rate": 1.7137136734838809e-06,
"loss": 0.8104236125946045,
"step": 1064
},
{
"epoch": 0.8973063973063973,
"grad_norm": 7.219144344329834,
"learning_rate": 1.7124906952689354e-06,
"loss": 1.0544826984405518,
"step": 1066
},
{
"epoch": 0.898989898989899,
"grad_norm": 8.607142448425293,
"learning_rate": 1.7112656067658345e-06,
"loss": 0.7836295366287231,
"step": 1068
},
{
"epoch": 0.9006734006734006,
"grad_norm": 3.9157323837280273,
"learning_rate": 1.7100384121955229e-06,
"loss": 0.9466323852539062,
"step": 1070
},
{
"epoch": 0.9023569023569024,
"grad_norm": 3.7519919872283936,
"learning_rate": 1.7088091157862026e-06,
"loss": 1.1859047412872314,
"step": 1072
},
{
"epoch": 0.9040404040404041,
"grad_norm": 22.836341857910156,
"learning_rate": 1.7075777217733169e-06,
"loss": 0.8282434344291687,
"step": 1074
},
{
"epoch": 0.9057239057239057,
"grad_norm": 7.615798473358154,
"learning_rate": 1.7063442343995361e-06,
"loss": 0.4293259382247925,
"step": 1076
},
{
"epoch": 0.9074074074074074,
"grad_norm": 14.713326454162598,
"learning_rate": 1.7051086579147436e-06,
"loss": 1.0748696327209473,
"step": 1078
},
{
"epoch": 0.9090909090909091,
"grad_norm": 6.053645610809326,
"learning_rate": 1.7038709965760198e-06,
"loss": 0.9073866605758667,
"step": 1080
},
{
"epoch": 0.9107744107744108,
"grad_norm": 3.388359785079956,
"learning_rate": 1.7026312546476292e-06,
"loss": 0.9109467267990112,
"step": 1082
},
{
"epoch": 0.9124579124579124,
"grad_norm": 9.549911499023438,
"learning_rate": 1.701389436401004e-06,
"loss": 0.697003960609436,
"step": 1084
},
{
"epoch": 0.9141414141414141,
"grad_norm": 3.0203182697296143,
"learning_rate": 1.700145546114731e-06,
"loss": 1.1409720182418823,
"step": 1086
},
{
"epoch": 0.9158249158249159,
"grad_norm": 8.64733600616455,
"learning_rate": 1.698899588074535e-06,
"loss": 0.8965491056442261,
"step": 1088
},
{
"epoch": 0.9175084175084175,
"grad_norm": 2.4151153564453125,
"learning_rate": 1.6976515665732663e-06,
"loss": 0.9052882790565491,
"step": 1090
},
{
"epoch": 0.9191919191919192,
"grad_norm": 6.9435224533081055,
"learning_rate": 1.6964014859108837e-06,
"loss": 1.0003384351730347,
"step": 1092
},
{
"epoch": 0.9208754208754208,
"grad_norm": 4.513472557067871,
"learning_rate": 1.6951493503944414e-06,
"loss": 0.8998319506645203,
"step": 1094
},
{
"epoch": 0.9225589225589226,
"grad_norm": 11.51063060760498,
"learning_rate": 1.693895164338073e-06,
"loss": 0.7377707958221436,
"step": 1096
},
{
"epoch": 0.9242424242424242,
"grad_norm": 6.038638591766357,
"learning_rate": 1.6926389320629768e-06,
"loss": 0.3615678548812866,
"step": 1098
},
{
"epoch": 0.9259259259259259,
"grad_norm": 6.45628023147583,
"learning_rate": 1.6913806578974016e-06,
"loss": 0.9533661603927612,
"step": 1100
},
{
"epoch": 0.9276094276094277,
"grad_norm": 11.960179328918457,
"learning_rate": 1.690120346176632e-06,
"loss": 0.5207856893539429,
"step": 1102
},
{
"epoch": 0.9292929292929293,
"grad_norm": 5.242334842681885,
"learning_rate": 1.6888580012429717e-06,
"loss": 1.1098419427871704,
"step": 1104
},
{
"epoch": 0.930976430976431,
"grad_norm": 2.832732677459717,
"learning_rate": 1.68759362744573e-06,
"loss": 1.0050939321517944,
"step": 1106
},
{
"epoch": 0.9326599326599326,
"grad_norm": 16.379804611206055,
"learning_rate": 1.686327229141207e-06,
"loss": 0.7864120602607727,
"step": 1108
},
{
"epoch": 0.9343434343434344,
"grad_norm": 6.306436538696289,
"learning_rate": 1.6850588106926773e-06,
"loss": 1.20371413230896,
"step": 1110
},
{
"epoch": 0.936026936026936,
"grad_norm": 5.394667625427246,
"learning_rate": 1.6837883764703765e-06,
"loss": 1.1867024898529053,
"step": 1112
},
{
"epoch": 0.9377104377104377,
"grad_norm": 4.2957305908203125,
"learning_rate": 1.6825159308514847e-06,
"loss": 1.0403316020965576,
"step": 1114
},
{
"epoch": 0.9393939393939394,
"grad_norm": 3.2342448234558105,
"learning_rate": 1.6812414782201127e-06,
"loss": 1.1196048259735107,
"step": 1116
},
{
"epoch": 0.9410774410774411,
"grad_norm": 4.326461315155029,
"learning_rate": 1.6799650229672862e-06,
"loss": 0.9937688708305359,
"step": 1118
},
{
"epoch": 0.9427609427609428,
"grad_norm": 8.076350212097168,
"learning_rate": 1.6786865694909301e-06,
"loss": 1.2609586715698242,
"step": 1120
},
{
"epoch": 0.9444444444444444,
"grad_norm": 2.848473310470581,
"learning_rate": 1.6774061221958552e-06,
"loss": 0.7693970203399658,
"step": 1122
},
{
"epoch": 0.9461279461279462,
"grad_norm": 21.549283981323242,
"learning_rate": 1.6761236854937406e-06,
"loss": 0.8295996189117432,
"step": 1124
},
{
"epoch": 0.9478114478114478,
"grad_norm": 3.2013320922851562,
"learning_rate": 1.674839263803121e-06,
"loss": 0.8039145469665527,
"step": 1126
},
{
"epoch": 0.9494949494949495,
"grad_norm": 4.9571099281311035,
"learning_rate": 1.6735528615493686e-06,
"loss": 0.9634122848510742,
"step": 1128
},
{
"epoch": 0.9511784511784511,
"grad_norm": 16.527570724487305,
"learning_rate": 1.6722644831646815e-06,
"loss": 0.79341059923172,
"step": 1130
},
{
"epoch": 0.9528619528619529,
"grad_norm": 2.471346855163574,
"learning_rate": 1.6709741330880644e-06,
"loss": 0.9218388795852661,
"step": 1132
},
{
"epoch": 0.9545454545454546,
"grad_norm": 30.464435577392578,
"learning_rate": 1.6696818157653172e-06,
"loss": 0.946638286113739,
"step": 1134
},
{
"epoch": 0.9562289562289562,
"grad_norm": 6.8406453132629395,
"learning_rate": 1.6683875356490157e-06,
"loss": 0.8108268976211548,
"step": 1136
},
{
"epoch": 0.9579124579124579,
"grad_norm": 5.6103620529174805,
"learning_rate": 1.6670912971985002e-06,
"loss": 0.6951830387115479,
"step": 1138
},
{
"epoch": 0.9595959595959596,
"grad_norm": 4.111386299133301,
"learning_rate": 1.6657931048798576e-06,
"loss": 0.5389662384986877,
"step": 1140
},
{
"epoch": 0.9612794612794613,
"grad_norm": 5.01594352722168,
"learning_rate": 1.6644929631659061e-06,
"loss": 0.8873554468154907,
"step": 1142
},
{
"epoch": 0.9629629629629629,
"grad_norm": 5.6005096435546875,
"learning_rate": 1.6631908765361818e-06,
"loss": 0.5947662591934204,
"step": 1144
},
{
"epoch": 0.9646464646464646,
"grad_norm": 4.118565082550049,
"learning_rate": 1.6618868494769202e-06,
"loss": 0.8753615617752075,
"step": 1146
},
{
"epoch": 0.9663299663299664,
"grad_norm": 10.705119132995605,
"learning_rate": 1.6605808864810437e-06,
"loss": 0.7432312965393066,
"step": 1148
},
{
"epoch": 0.968013468013468,
"grad_norm": 6.360631465911865,
"learning_rate": 1.6592729920481443e-06,
"loss": 0.9374081492424011,
"step": 1150
},
{
"epoch": 0.9696969696969697,
"grad_norm": 22.604328155517578,
"learning_rate": 1.6579631706844683e-06,
"loss": 0.5783393383026123,
"step": 1152
},
{
"epoch": 0.9713804713804713,
"grad_norm": 10.371187210083008,
"learning_rate": 1.6566514269029015e-06,
"loss": 0.8774973750114441,
"step": 1154
},
{
"epoch": 0.9730639730639731,
"grad_norm": 2.2685441970825195,
"learning_rate": 1.6553377652229536e-06,
"loss": 0.5517897605895996,
"step": 1156
},
{
"epoch": 0.9747474747474747,
"grad_norm": 15.745230674743652,
"learning_rate": 1.6540221901707413e-06,
"loss": 0.9311755895614624,
"step": 1158
},
{
"epoch": 0.9764309764309764,
"grad_norm": 6.642886161804199,
"learning_rate": 1.6527047062789743e-06,
"loss": 0.4048464298248291,
"step": 1160
},
{
"epoch": 0.9781144781144782,
"grad_norm": 23.364538192749023,
"learning_rate": 1.6513853180869391e-06,
"loss": 1.0577645301818848,
"step": 1162
},
{
"epoch": 0.9797979797979798,
"grad_norm": 2.7986645698547363,
"learning_rate": 1.6500640301404832e-06,
"loss": 0.6768155694007874,
"step": 1164
},
{
"epoch": 0.9814814814814815,
"grad_norm": 4.8387131690979,
"learning_rate": 1.6487408469919992e-06,
"loss": 0.7736034393310547,
"step": 1166
},
{
"epoch": 0.9831649831649831,
"grad_norm": 4.39155387878418,
"learning_rate": 1.6474157732004101e-06,
"loss": 0.7835286855697632,
"step": 1168
},
{
"epoch": 0.9848484848484849,
"grad_norm": 4.676360607147217,
"learning_rate": 1.6460888133311526e-06,
"loss": 0.8302567005157471,
"step": 1170
},
{
"epoch": 0.9865319865319865,
"grad_norm": 3.651604413986206,
"learning_rate": 1.6447599719561616e-06,
"loss": 0.6171858310699463,
"step": 1172
},
{
"epoch": 0.9882154882154882,
"grad_norm": 3.5588345527648926,
"learning_rate": 1.6434292536538547e-06,
"loss": 0.8998767137527466,
"step": 1174
},
{
"epoch": 0.98989898989899,
"grad_norm": 11.439290046691895,
"learning_rate": 1.6420966630091168e-06,
"loss": 0.41087231040000916,
"step": 1176
},
{
"epoch": 0.9915824915824916,
"grad_norm": 11.601485252380371,
"learning_rate": 1.6407622046132831e-06,
"loss": 1.0380841493606567,
"step": 1178
},
{
"epoch": 0.9932659932659933,
"grad_norm": 7.792235374450684,
"learning_rate": 1.6394258830641243e-06,
"loss": 0.43105313181877136,
"step": 1180
},
{
"epoch": 0.9949494949494949,
"grad_norm": 21.001230239868164,
"learning_rate": 1.6380877029658303e-06,
"loss": 0.8770669102668762,
"step": 1182
},
{
"epoch": 0.9966329966329966,
"grad_norm": 13.98222827911377,
"learning_rate": 1.6367476689289947e-06,
"loss": 0.9919424057006836,
"step": 1184
},
{
"epoch": 0.9983164983164983,
"grad_norm": 36.48440933227539,
"learning_rate": 1.6354057855705984e-06,
"loss": 0.6105228066444397,
"step": 1186
},
{
"epoch": 1.0,
"grad_norm": 9.162494659423828,
"learning_rate": 1.6340620575139947e-06,
"loss": 0.7021905183792114,
"step": 1188
},
{
"epoch": 1.0016835016835017,
"grad_norm": 6.610725402832031,
"learning_rate": 1.6327164893888913e-06,
"loss": 0.3793674111366272,
"step": 1190
},
{
"epoch": 1.0033670033670035,
"grad_norm": 6.908663272857666,
"learning_rate": 1.6313690858313374e-06,
"loss": 0.39230918884277344,
"step": 1192
},
{
"epoch": 1.005050505050505,
"grad_norm": 2.9396955966949463,
"learning_rate": 1.6300198514837045e-06,
"loss": 1.0317349433898926,
"step": 1194
},
{
"epoch": 1.0067340067340067,
"grad_norm": 12.543563842773438,
"learning_rate": 1.6286687909946732e-06,
"loss": 0.8607063293457031,
"step": 1196
},
{
"epoch": 1.0084175084175084,
"grad_norm": 3.976856231689453,
"learning_rate": 1.6273159090192152e-06,
"loss": 0.9927105903625488,
"step": 1198
},
{
"epoch": 1.0101010101010102,
"grad_norm": 7.6159348487854,
"learning_rate": 1.6259612102185778e-06,
"loss": 1.056520938873291,
"step": 1200
},
{
"epoch": 1.0117845117845117,
"grad_norm": 13.293722152709961,
"learning_rate": 1.6246046992602685e-06,
"loss": 0.9043182134628296,
"step": 1202
},
{
"epoch": 1.0134680134680134,
"grad_norm": 7.976161003112793,
"learning_rate": 1.6232463808180385e-06,
"loss": 0.8953118920326233,
"step": 1204
},
{
"epoch": 1.0151515151515151,
"grad_norm": 14.81564998626709,
"learning_rate": 1.6218862595718664e-06,
"loss": 1.0292134284973145,
"step": 1206
},
{
"epoch": 1.0168350168350169,
"grad_norm": 8.188558578491211,
"learning_rate": 1.620524340207942e-06,
"loss": 0.5569553375244141,
"step": 1208
},
{
"epoch": 1.0185185185185186,
"grad_norm": 7.264322757720947,
"learning_rate": 1.6191606274186504e-06,
"loss": 0.5535443425178528,
"step": 1210
},
{
"epoch": 1.02020202020202,
"grad_norm": 5.589961528778076,
"learning_rate": 1.6177951259025562e-06,
"loss": 0.5485536456108093,
"step": 1212
},
{
"epoch": 1.0218855218855218,
"grad_norm": 6.598013401031494,
"learning_rate": 1.6164278403643867e-06,
"loss": 0.7326016426086426,
"step": 1214
},
{
"epoch": 1.0235690235690236,
"grad_norm": 2.7756152153015137,
"learning_rate": 1.6150587755150158e-06,
"loss": 0.4036520719528198,
"step": 1216
},
{
"epoch": 1.0252525252525253,
"grad_norm": 12.781232833862305,
"learning_rate": 1.6136879360714478e-06,
"loss": 0.8799995183944702,
"step": 1218
},
{
"epoch": 1.026936026936027,
"grad_norm": 2.7762389183044434,
"learning_rate": 1.612315326756802e-06,
"loss": 0.7381196022033691,
"step": 1220
},
{
"epoch": 1.0286195286195285,
"grad_norm": 11.140121459960938,
"learning_rate": 1.6109409523002942e-06,
"loss": 0.8362076282501221,
"step": 1222
},
{
"epoch": 1.0303030303030303,
"grad_norm": 2.88662052154541,
"learning_rate": 1.6095648174372231e-06,
"loss": 1.0976812839508057,
"step": 1224
},
{
"epoch": 1.031986531986532,
"grad_norm": 3.13314151763916,
"learning_rate": 1.6081869269089522e-06,
"loss": 0.709804892539978,
"step": 1226
},
{
"epoch": 1.0336700336700337,
"grad_norm": 5.350557327270508,
"learning_rate": 1.606807285462894e-06,
"loss": 0.8039405941963196,
"step": 1228
},
{
"epoch": 1.0353535353535352,
"grad_norm": 2.7725930213928223,
"learning_rate": 1.6054258978524943e-06,
"loss": 0.8068400025367737,
"step": 1230
},
{
"epoch": 1.037037037037037,
"grad_norm": 29.508012771606445,
"learning_rate": 1.6040427688372143e-06,
"loss": 0.47366365790367126,
"step": 1232
},
{
"epoch": 1.0387205387205387,
"grad_norm": 1.3913285732269287,
"learning_rate": 1.602657903182515e-06,
"loss": 0.7617353796958923,
"step": 1234
},
{
"epoch": 1.0404040404040404,
"grad_norm": 4.6602630615234375,
"learning_rate": 1.6012713056598423e-06,
"loss": 0.7685100436210632,
"step": 1236
},
{
"epoch": 1.0420875420875422,
"grad_norm": 3.482510805130005,
"learning_rate": 1.599882981046607e-06,
"loss": 0.512657105922699,
"step": 1238
},
{
"epoch": 1.0437710437710437,
"grad_norm": 3.340650796890259,
"learning_rate": 1.5984929341261724e-06,
"loss": 1.0025690793991089,
"step": 1240
},
{
"epoch": 1.0454545454545454,
"grad_norm": 5.567379474639893,
"learning_rate": 1.5971011696878342e-06,
"loss": 0.9806394577026367,
"step": 1242
},
{
"epoch": 1.0471380471380471,
"grad_norm": 7.478330612182617,
"learning_rate": 1.5957076925268072e-06,
"loss": 0.6606462001800537,
"step": 1244
},
{
"epoch": 1.0488215488215489,
"grad_norm": 5.497067451477051,
"learning_rate": 1.5943125074442064e-06,
"loss": 0.6403665542602539,
"step": 1246
},
{
"epoch": 1.0505050505050506,
"grad_norm": 5.195033550262451,
"learning_rate": 1.5929156192470313e-06,
"loss": 0.8676759004592896,
"step": 1248
},
{
"epoch": 1.0521885521885521,
"grad_norm": 3.5050344467163086,
"learning_rate": 1.5915170327481491e-06,
"loss": 0.7130298614501953,
"step": 1250
},
{
"epoch": 1.0538720538720538,
"grad_norm": 6.229882717132568,
"learning_rate": 1.5901167527662796e-06,
"loss": 0.6191893815994263,
"step": 1252
},
{
"epoch": 1.0555555555555556,
"grad_norm": 3.6591920852661133,
"learning_rate": 1.5887147841259758e-06,
"loss": 0.9453639388084412,
"step": 1254
},
{
"epoch": 1.0572390572390573,
"grad_norm": 8.242814064025879,
"learning_rate": 1.5873111316576102e-06,
"loss": 0.711391270160675,
"step": 1256
},
{
"epoch": 1.0589225589225588,
"grad_norm": 9.809550285339355,
"learning_rate": 1.5859058001973555e-06,
"loss": 0.5224330425262451,
"step": 1258
},
{
"epoch": 1.0606060606060606,
"grad_norm": 8.671676635742188,
"learning_rate": 1.5844987945871701e-06,
"loss": 0.736186146736145,
"step": 1260
},
{
"epoch": 1.0622895622895623,
"grad_norm": 8.753976821899414,
"learning_rate": 1.5830901196747805e-06,
"loss": 0.632482647895813,
"step": 1262
},
{
"epoch": 1.063973063973064,
"grad_norm": 3.3778975009918213,
"learning_rate": 1.5816797803136647e-06,
"loss": 0.7275056838989258,
"step": 1264
},
{
"epoch": 1.0656565656565657,
"grad_norm": 6.493520736694336,
"learning_rate": 1.5802677813630348e-06,
"loss": 0.7164782285690308,
"step": 1266
},
{
"epoch": 1.0673400673400673,
"grad_norm": 12.627816200256348,
"learning_rate": 1.5788541276878212e-06,
"loss": 0.5824927687644958,
"step": 1268
},
{
"epoch": 1.069023569023569,
"grad_norm": 7.747696876525879,
"learning_rate": 1.577438824158656e-06,
"loss": 0.5714269876480103,
"step": 1270
},
{
"epoch": 1.0707070707070707,
"grad_norm": 12.949309349060059,
"learning_rate": 1.5760218756518548e-06,
"loss": 0.7176691293716431,
"step": 1272
},
{
"epoch": 1.0723905723905724,
"grad_norm": 6.077565670013428,
"learning_rate": 1.5746032870494022e-06,
"loss": 0.4697990417480469,
"step": 1274
},
{
"epoch": 1.074074074074074,
"grad_norm": 4.4054155349731445,
"learning_rate": 1.5731830632389322e-06,
"loss": 0.6759170293807983,
"step": 1276
},
{
"epoch": 1.0757575757575757,
"grad_norm": 62.43513488769531,
"learning_rate": 1.5717612091137137e-06,
"loss": 0.9693543910980225,
"step": 1278
},
{
"epoch": 1.0774410774410774,
"grad_norm": 27.173269271850586,
"learning_rate": 1.570337729572632e-06,
"loss": 0.4767664670944214,
"step": 1280
},
{
"epoch": 1.0791245791245792,
"grad_norm": 6.065430164337158,
"learning_rate": 1.5689126295201738e-06,
"loss": 0.33717769384384155,
"step": 1282
},
{
"epoch": 1.0808080808080809,
"grad_norm": 5.10385799407959,
"learning_rate": 1.5674859138664076e-06,
"loss": 0.9727071523666382,
"step": 1284
},
{
"epoch": 1.0824915824915824,
"grad_norm": 4.059802055358887,
"learning_rate": 1.5660575875269696e-06,
"loss": 0.7808531522750854,
"step": 1286
},
{
"epoch": 1.0841750841750841,
"grad_norm": 3.3735897541046143,
"learning_rate": 1.5646276554230454e-06,
"loss": 0.5864525437355042,
"step": 1288
},
{
"epoch": 1.0858585858585859,
"grad_norm": 3.3175692558288574,
"learning_rate": 1.563196122481352e-06,
"loss": 0.6308066844940186,
"step": 1290
},
{
"epoch": 1.0875420875420876,
"grad_norm": 8.797651290893555,
"learning_rate": 1.5617629936341225e-06,
"loss": 1.049008846282959,
"step": 1292
},
{
"epoch": 1.0892255892255893,
"grad_norm": 7.429879188537598,
"learning_rate": 1.5603282738190898e-06,
"loss": 0.766440749168396,
"step": 1294
},
{
"epoch": 1.0909090909090908,
"grad_norm": 14.650995254516602,
"learning_rate": 1.5588919679794668e-06,
"loss": 0.5494952201843262,
"step": 1296
},
{
"epoch": 1.0925925925925926,
"grad_norm": 7.310492515563965,
"learning_rate": 1.5574540810639312e-06,
"loss": 0.5477076768875122,
"step": 1298
},
{
"epoch": 1.0942760942760943,
"grad_norm": 21.442401885986328,
"learning_rate": 1.556014618026609e-06,
"loss": 0.6048269271850586,
"step": 1300
},
{
"epoch": 1.095959595959596,
"grad_norm": 78.25362396240234,
"learning_rate": 1.5545735838270556e-06,
"loss": 0.5611992478370667,
"step": 1302
},
{
"epoch": 1.0976430976430978,
"grad_norm": 7.619815826416016,
"learning_rate": 1.5531309834302403e-06,
"loss": 0.5441624522209167,
"step": 1304
},
{
"epoch": 1.0993265993265993,
"grad_norm": 26.699399948120117,
"learning_rate": 1.5516868218065283e-06,
"loss": 0.5887436866760254,
"step": 1306
},
{
"epoch": 1.101010101010101,
"grad_norm": 15.65885066986084,
"learning_rate": 1.5502411039316642e-06,
"loss": 0.5249545574188232,
"step": 1308
},
{
"epoch": 1.1026936026936027,
"grad_norm": 25.263103485107422,
"learning_rate": 1.5487938347867542e-06,
"loss": 0.36874455213546753,
"step": 1310
},
{
"epoch": 1.1043771043771045,
"grad_norm": 9.12649917602539,
"learning_rate": 1.5473450193582498e-06,
"loss": 1.1010559797286987,
"step": 1312
},
{
"epoch": 1.106060606060606,
"grad_norm": 23.143815994262695,
"learning_rate": 1.5458946626379293e-06,
"loss": 0.8757441639900208,
"step": 1314
},
{
"epoch": 1.1077441077441077,
"grad_norm": 11.386807441711426,
"learning_rate": 1.5444427696228822e-06,
"loss": 0.8766863346099854,
"step": 1316
},
{
"epoch": 1.1094276094276094,
"grad_norm": 5.802887439727783,
"learning_rate": 1.5429893453154906e-06,
"loss": 0.8725073337554932,
"step": 1318
},
{
"epoch": 1.1111111111111112,
"grad_norm": 5.350346088409424,
"learning_rate": 1.5415343947234132e-06,
"loss": 0.5795699954032898,
"step": 1320
},
{
"epoch": 1.112794612794613,
"grad_norm": 15.783977508544922,
"learning_rate": 1.5400779228595663e-06,
"loss": 0.8113459348678589,
"step": 1322
},
{
"epoch": 1.1144781144781144,
"grad_norm": 9.137958526611328,
"learning_rate": 1.538619934742109e-06,
"loss": 0.46189528703689575,
"step": 1324
},
{
"epoch": 1.1161616161616161,
"grad_norm": 5.9258527755737305,
"learning_rate": 1.5371604353944235e-06,
"loss": 0.8045957684516907,
"step": 1326
},
{
"epoch": 1.1178451178451179,
"grad_norm": 2.5547056198120117,
"learning_rate": 1.5356994298450989e-06,
"loss": 0.6314079165458679,
"step": 1328
},
{
"epoch": 1.1195286195286196,
"grad_norm": 6.180763244628906,
"learning_rate": 1.5342369231279145e-06,
"loss": 0.9923676252365112,
"step": 1330
},
{
"epoch": 1.121212121212121,
"grad_norm": 10.539793968200684,
"learning_rate": 1.5327729202818212e-06,
"loss": 0.6905699372291565,
"step": 1332
},
{
"epoch": 1.1228956228956228,
"grad_norm": 3.815638780593872,
"learning_rate": 1.5313074263509242e-06,
"loss": 1.0867717266082764,
"step": 1334
},
{
"epoch": 1.1245791245791246,
"grad_norm": 7.576748847961426,
"learning_rate": 1.5298404463844675e-06,
"loss": 0.5058388113975525,
"step": 1336
},
{
"epoch": 1.1262626262626263,
"grad_norm": 6.077386856079102,
"learning_rate": 1.5283719854368142e-06,
"loss": 0.6739003658294678,
"step": 1338
},
{
"epoch": 1.127946127946128,
"grad_norm": 18.228174209594727,
"learning_rate": 1.5269020485674299e-06,
"loss": 0.5296186208724976,
"step": 1340
},
{
"epoch": 1.1296296296296295,
"grad_norm": 7.708940029144287,
"learning_rate": 1.5254306408408657e-06,
"loss": 0.8153047561645508,
"step": 1342
},
{
"epoch": 1.1313131313131313,
"grad_norm": 3.31766414642334,
"learning_rate": 1.5239577673267401e-06,
"loss": 1.0957720279693604,
"step": 1344
},
{
"epoch": 1.132996632996633,
"grad_norm": 5.293587684631348,
"learning_rate": 1.5224834330997222e-06,
"loss": 0.9039838314056396,
"step": 1346
},
{
"epoch": 1.1346801346801347,
"grad_norm": 3.792046070098877,
"learning_rate": 1.5210076432395138e-06,
"loss": 0.6438568234443665,
"step": 1348
},
{
"epoch": 1.1363636363636362,
"grad_norm": 7.245974063873291,
"learning_rate": 1.5195304028308324e-06,
"loss": 0.4882217049598694,
"step": 1350
},
{
"epoch": 1.138047138047138,
"grad_norm": 26.42631721496582,
"learning_rate": 1.5180517169633914e-06,
"loss": 0.2949609160423279,
"step": 1352
},
{
"epoch": 1.1397306397306397,
"grad_norm": 3.006683111190796,
"learning_rate": 1.5165715907318874e-06,
"loss": 1.0205047130584717,
"step": 1354
},
{
"epoch": 1.1414141414141414,
"grad_norm": 3.6523959636688232,
"learning_rate": 1.5150900292359775e-06,
"loss": 1.0392919778823853,
"step": 1356
},
{
"epoch": 1.1430976430976432,
"grad_norm": 4.214179992675781,
"learning_rate": 1.513607037580264e-06,
"loss": 0.6601721048355103,
"step": 1358
},
{
"epoch": 1.144781144781145,
"grad_norm": 10.945768356323242,
"learning_rate": 1.5121226208742771e-06,
"loss": 0.6551761627197266,
"step": 1360
},
{
"epoch": 1.1464646464646464,
"grad_norm": 3.450727701187134,
"learning_rate": 1.5106367842324578e-06,
"loss": 0.8425558805465698,
"step": 1362
},
{
"epoch": 1.1481481481481481,
"grad_norm": 13.319304466247559,
"learning_rate": 1.5091495327741375e-06,
"loss": 0.8309493064880371,
"step": 1364
},
{
"epoch": 1.1498316498316499,
"grad_norm": 3.239384889602661,
"learning_rate": 1.507660871623524e-06,
"loss": 0.6987888813018799,
"step": 1366
},
{
"epoch": 1.1515151515151516,
"grad_norm": 9.262398719787598,
"learning_rate": 1.5061708059096807e-06,
"loss": 0.7337237596511841,
"step": 1368
},
{
"epoch": 1.1531986531986531,
"grad_norm": 3.1679928302764893,
"learning_rate": 1.5046793407665114e-06,
"loss": 1.047074794769287,
"step": 1370
},
{
"epoch": 1.1548821548821548,
"grad_norm": 5.058619976043701,
"learning_rate": 1.503186481332741e-06,
"loss": 1.0454055070877075,
"step": 1372
},
{
"epoch": 1.1565656565656566,
"grad_norm": 9.18127155303955,
"learning_rate": 1.5016922327518986e-06,
"loss": 0.38407066464424133,
"step": 1374
},
{
"epoch": 1.1582491582491583,
"grad_norm": 6.960140228271484,
"learning_rate": 1.5001966001722986e-06,
"loss": 0.38796305656433105,
"step": 1376
},
{
"epoch": 1.15993265993266,
"grad_norm": 18.356365203857422,
"learning_rate": 1.4986995887470248e-06,
"loss": 0.8758000135421753,
"step": 1378
},
{
"epoch": 1.1616161616161615,
"grad_norm": 2.5531139373779297,
"learning_rate": 1.497201203633912e-06,
"loss": 0.6682250499725342,
"step": 1380
},
{
"epoch": 1.1632996632996633,
"grad_norm": 18.195405960083008,
"learning_rate": 1.4957014499955265e-06,
"loss": 0.5331791639328003,
"step": 1382
},
{
"epoch": 1.164983164983165,
"grad_norm": 4.818270683288574,
"learning_rate": 1.4942003329991513e-06,
"loss": 0.3785390257835388,
"step": 1384
},
{
"epoch": 1.1666666666666667,
"grad_norm": 66.33992767333984,
"learning_rate": 1.492697857816766e-06,
"loss": 0.48905232548713684,
"step": 1386
},
{
"epoch": 1.1683501683501682,
"grad_norm": 6.675547122955322,
"learning_rate": 1.491194029625029e-06,
"loss": 0.5575925707817078,
"step": 1388
},
{
"epoch": 1.17003367003367,
"grad_norm": 2.9333407878875732,
"learning_rate": 1.489688853605262e-06,
"loss": 0.8529257774353027,
"step": 1390
},
{
"epoch": 1.1717171717171717,
"grad_norm": 14.85582447052002,
"learning_rate": 1.4881823349434296e-06,
"loss": 0.8529238104820251,
"step": 1392
},
{
"epoch": 1.1734006734006734,
"grad_norm": 4.551332473754883,
"learning_rate": 1.4866744788301226e-06,
"loss": 0.6012097597122192,
"step": 1394
},
{
"epoch": 1.1750841750841752,
"grad_norm": 5.803267955780029,
"learning_rate": 1.485165290460539e-06,
"loss": 0.5330957770347595,
"step": 1396
},
{
"epoch": 1.1767676767676767,
"grad_norm": 4.956878185272217,
"learning_rate": 1.4836547750344688e-06,
"loss": 0.7069591283798218,
"step": 1398
},
{
"epoch": 1.1784511784511784,
"grad_norm": 11.88759708404541,
"learning_rate": 1.4821429377562725e-06,
"loss": 0.4460894763469696,
"step": 1400
},
{
"epoch": 1.1801346801346801,
"grad_norm": 3.5958197116851807,
"learning_rate": 1.4806297838348653e-06,
"loss": 0.909576952457428,
"step": 1402
},
{
"epoch": 1.1818181818181819,
"grad_norm": 4.076791286468506,
"learning_rate": 1.4791153184837e-06,
"loss": 0.6851646900177002,
"step": 1404
},
{
"epoch": 1.1835016835016834,
"grad_norm": 8.969018936157227,
"learning_rate": 1.4775995469207467e-06,
"loss": 0.7221487760543823,
"step": 1406
},
{
"epoch": 1.1851851851851851,
"grad_norm": 24.653610229492188,
"learning_rate": 1.476082474368476e-06,
"loss": 1.0442817211151123,
"step": 1408
},
{
"epoch": 1.1868686868686869,
"grad_norm": 6.7254557609558105,
"learning_rate": 1.4745641060538407e-06,
"loss": 0.6711673140525818,
"step": 1410
},
{
"epoch": 1.1885521885521886,
"grad_norm": 38.141719818115234,
"learning_rate": 1.4730444472082597e-06,
"loss": 0.6712204217910767,
"step": 1412
},
{
"epoch": 1.1902356902356903,
"grad_norm": 5.311680793762207,
"learning_rate": 1.471523503067596e-06,
"loss": 0.7601330280303955,
"step": 1414
},
{
"epoch": 1.1919191919191918,
"grad_norm": 6.686192512512207,
"learning_rate": 1.4700012788721431e-06,
"loss": 0.6655834913253784,
"step": 1416
},
{
"epoch": 1.1936026936026936,
"grad_norm": 12.520559310913086,
"learning_rate": 1.4684777798666028e-06,
"loss": 1.0070924758911133,
"step": 1418
},
{
"epoch": 1.1952861952861953,
"grad_norm": 39.29856491088867,
"learning_rate": 1.4669530113000712e-06,
"loss": 0.8293688297271729,
"step": 1420
},
{
"epoch": 1.196969696969697,
"grad_norm": 5.298742294311523,
"learning_rate": 1.465426978426017e-06,
"loss": 0.7399046421051025,
"step": 1422
},
{
"epoch": 1.1986531986531987,
"grad_norm": 4.998674392700195,
"learning_rate": 1.4638996865022658e-06,
"loss": 0.5819299221038818,
"step": 1424
},
{
"epoch": 1.2003367003367003,
"grad_norm": 19.531993865966797,
"learning_rate": 1.4623711407909802e-06,
"loss": 0.8090528845787048,
"step": 1426
},
{
"epoch": 1.202020202020202,
"grad_norm": 5.534289836883545,
"learning_rate": 1.4608413465586444e-06,
"loss": 0.4998140335083008,
"step": 1428
},
{
"epoch": 1.2037037037037037,
"grad_norm": 4.479226589202881,
"learning_rate": 1.4593103090760426e-06,
"loss": 0.8749973177909851,
"step": 1430
},
{
"epoch": 1.2053872053872055,
"grad_norm": 6.119904518127441,
"learning_rate": 1.4577780336182429e-06,
"loss": 0.6631636619567871,
"step": 1432
},
{
"epoch": 1.2070707070707072,
"grad_norm": 6.20470666885376,
"learning_rate": 1.4562445254645793e-06,
"loss": 1.0941792726516724,
"step": 1434
},
{
"epoch": 1.2087542087542087,
"grad_norm": 13.419809341430664,
"learning_rate": 1.4547097898986332e-06,
"loss": 0.5603539347648621,
"step": 1436
},
{
"epoch": 1.2104377104377104,
"grad_norm": 10.74496841430664,
"learning_rate": 1.453173832208213e-06,
"loss": 0.3947031497955322,
"step": 1438
},
{
"epoch": 1.2121212121212122,
"grad_norm": 2.647723436355591,
"learning_rate": 1.4516366576853406e-06,
"loss": 0.3918086886405945,
"step": 1440
},
{
"epoch": 1.2138047138047139,
"grad_norm": 7.783057689666748,
"learning_rate": 1.450098271626228e-06,
"loss": 0.6404916048049927,
"step": 1442
},
{
"epoch": 1.2154882154882154,
"grad_norm": 7.518592834472656,
"learning_rate": 1.448558679331263e-06,
"loss": 0.8621898889541626,
"step": 1444
},
{
"epoch": 1.2171717171717171,
"grad_norm": 15.241488456726074,
"learning_rate": 1.4470178861049886e-06,
"loss": 0.8157280683517456,
"step": 1446
},
{
"epoch": 1.2188552188552189,
"grad_norm": 5.622246742248535,
"learning_rate": 1.4454758972560863e-06,
"loss": 0.6764127612113953,
"step": 1448
},
{
"epoch": 1.2205387205387206,
"grad_norm": 3.0841257572174072,
"learning_rate": 1.4439327180973556e-06,
"loss": 0.8733148574829102,
"step": 1450
},
{
"epoch": 1.2222222222222223,
"grad_norm": 4.749155521392822,
"learning_rate": 1.4423883539456987e-06,
"loss": 0.828094482421875,
"step": 1452
},
{
"epoch": 1.2239057239057238,
"grad_norm": 14.270376205444336,
"learning_rate": 1.4408428101220997e-06,
"loss": 0.5771759152412415,
"step": 1454
},
{
"epoch": 1.2255892255892256,
"grad_norm": 4.161510467529297,
"learning_rate": 1.439296091951607e-06,
"loss": 0.8248889446258545,
"step": 1456
},
{
"epoch": 1.2272727272727273,
"grad_norm": 7.337621212005615,
"learning_rate": 1.4377482047633162e-06,
"loss": 0.8380516767501831,
"step": 1458
},
{
"epoch": 1.228956228956229,
"grad_norm": 15.451786041259766,
"learning_rate": 1.4361991538903495e-06,
"loss": 0.9264905452728271,
"step": 1460
},
{
"epoch": 1.2306397306397305,
"grad_norm": 17.90766143798828,
"learning_rate": 1.4346489446698388e-06,
"loss": 0.616461455821991,
"step": 1462
},
{
"epoch": 1.2323232323232323,
"grad_norm": 4.267929553985596,
"learning_rate": 1.4330975824429076e-06,
"loss": 0.587724506855011,
"step": 1464
},
{
"epoch": 1.234006734006734,
"grad_norm": 3.7121894359588623,
"learning_rate": 1.4315450725546516e-06,
"loss": 0.7742079496383667,
"step": 1466
},
{
"epoch": 1.2356902356902357,
"grad_norm": 3.0019185543060303,
"learning_rate": 1.42999142035412e-06,
"loss": 0.8585535287857056,
"step": 1468
},
{
"epoch": 1.2373737373737375,
"grad_norm": 2.829047441482544,
"learning_rate": 1.4284366311942985e-06,
"loss": 1.0342047214508057,
"step": 1470
},
{
"epoch": 1.239057239057239,
"grad_norm": 8.36631965637207,
"learning_rate": 1.42688071043209e-06,
"loss": 0.5781531929969788,
"step": 1472
},
{
"epoch": 1.2407407407407407,
"grad_norm": 11.143059730529785,
"learning_rate": 1.4253236634282964e-06,
"loss": 0.6396032571792603,
"step": 1474
},
{
"epoch": 1.2424242424242424,
"grad_norm": 26.655942916870117,
"learning_rate": 1.4237654955475997e-06,
"loss": 0.4640727639198303,
"step": 1476
},
{
"epoch": 1.2441077441077442,
"grad_norm": 6.614319801330566,
"learning_rate": 1.4222062121585438e-06,
"loss": 0.6802918910980225,
"step": 1478
},
{
"epoch": 1.2457912457912457,
"grad_norm": 3.256394863128662,
"learning_rate": 1.4206458186335158e-06,
"loss": 0.666190505027771,
"step": 1480
},
{
"epoch": 1.2474747474747474,
"grad_norm": 4.943792819976807,
"learning_rate": 1.4190843203487285e-06,
"loss": 0.7142783403396606,
"step": 1482
},
{
"epoch": 1.2491582491582491,
"grad_norm": 3.698286771774292,
"learning_rate": 1.4175217226842e-06,
"loss": 0.3970263600349426,
"step": 1484
},
{
"epoch": 1.2508417508417509,
"grad_norm": 8.15507984161377,
"learning_rate": 1.4159580310237368e-06,
"loss": 0.5399370193481445,
"step": 1486
},
{
"epoch": 1.2525252525252526,
"grad_norm": 12.810306549072266,
"learning_rate": 1.414393250754915e-06,
"loss": 0.6834887266159058,
"step": 1488
},
{
"epoch": 1.2542087542087543,
"grad_norm": 5.88965368270874,
"learning_rate": 1.4128273872690608e-06,
"loss": 0.6449817419052124,
"step": 1490
},
{
"epoch": 1.2558922558922558,
"grad_norm": 3.2324328422546387,
"learning_rate": 1.4112604459612326e-06,
"loss": 0.7542852759361267,
"step": 1492
},
{
"epoch": 1.2575757575757576,
"grad_norm": 30.748018264770508,
"learning_rate": 1.4096924322302025e-06,
"loss": 0.7624866962432861,
"step": 1494
},
{
"epoch": 1.2592592592592593,
"grad_norm": 6.311125755310059,
"learning_rate": 1.4081233514784377e-06,
"loss": 0.6044232845306396,
"step": 1496
},
{
"epoch": 1.2609427609427608,
"grad_norm": 2.3243467807769775,
"learning_rate": 1.4065532091120815e-06,
"loss": 0.8974160552024841,
"step": 1498
},
{
"epoch": 1.2626262626262625,
"grad_norm": 7.767407417297363,
"learning_rate": 1.4049820105409354e-06,
"loss": 1.017437219619751,
"step": 1500
},
{
"epoch": 1.2643097643097643,
"grad_norm": 79.28764343261719,
"learning_rate": 1.4034097611784388e-06,
"loss": 0.5455498695373535,
"step": 1502
},
{
"epoch": 1.265993265993266,
"grad_norm": 7.436858654022217,
"learning_rate": 1.4018364664416531e-06,
"loss": 0.7246487140655518,
"step": 1504
},
{
"epoch": 1.2676767676767677,
"grad_norm": 3.221330165863037,
"learning_rate": 1.4002621317512402e-06,
"loss": 1.0642752647399902,
"step": 1506
},
{
"epoch": 1.2693602693602695,
"grad_norm": 2.4483256340026855,
"learning_rate": 1.3986867625314453e-06,
"loss": 1.104174256324768,
"step": 1508
},
{
"epoch": 1.271043771043771,
"grad_norm": 16.08315086364746,
"learning_rate": 1.397110364210079e-06,
"loss": 0.5644181966781616,
"step": 1510
},
{
"epoch": 1.2727272727272727,
"grad_norm": 26.60236930847168,
"learning_rate": 1.395532942218496e-06,
"loss": 0.5067999362945557,
"step": 1512
},
{
"epoch": 1.2744107744107744,
"grad_norm": 4.2767558097839355,
"learning_rate": 1.393954501991579e-06,
"loss": 0.5825619697570801,
"step": 1514
},
{
"epoch": 1.2760942760942762,
"grad_norm": 5.0948896408081055,
"learning_rate": 1.3923750489677192e-06,
"loss": 0.7657870054244995,
"step": 1516
},
{
"epoch": 1.2777777777777777,
"grad_norm": 6.115753650665283,
"learning_rate": 1.3907945885887963e-06,
"loss": 0.6665242910385132,
"step": 1518
},
{
"epoch": 1.2794612794612794,
"grad_norm": 3.168313980102539,
"learning_rate": 1.389213126300161e-06,
"loss": 0.8947120904922485,
"step": 1520
},
{
"epoch": 1.2811447811447811,
"grad_norm": 6.6659746170043945,
"learning_rate": 1.3876306675506176e-06,
"loss": 0.5565755367279053,
"step": 1522
},
{
"epoch": 1.2828282828282829,
"grad_norm": 10.685264587402344,
"learning_rate": 1.3860472177924008e-06,
"loss": 0.5323166847229004,
"step": 1524
},
{
"epoch": 1.2845117845117846,
"grad_norm": 3.2777657508850098,
"learning_rate": 1.3844627824811623e-06,
"loss": 0.7731577157974243,
"step": 1526
},
{
"epoch": 1.2861952861952861,
"grad_norm": 4.757735729217529,
"learning_rate": 1.3828773670759476e-06,
"loss": 0.6660727262496948,
"step": 1528
},
{
"epoch": 1.2878787878787878,
"grad_norm": 6.414804458618164,
"learning_rate": 1.3812909770391808e-06,
"loss": 0.2846236228942871,
"step": 1530
},
{
"epoch": 1.2895622895622896,
"grad_norm": 4.582427978515625,
"learning_rate": 1.3797036178366422e-06,
"loss": 0.7430540919303894,
"step": 1532
},
{
"epoch": 1.2912457912457913,
"grad_norm": 20.26249122619629,
"learning_rate": 1.3781152949374526e-06,
"loss": 0.9778026938438416,
"step": 1534
},
{
"epoch": 1.2929292929292928,
"grad_norm": 5.21143913269043,
"learning_rate": 1.3765260138140523e-06,
"loss": 0.9354510307312012,
"step": 1536
},
{
"epoch": 1.2946127946127945,
"grad_norm": 7.789968013763428,
"learning_rate": 1.3749357799421846e-06,
"loss": 0.6247372627258301,
"step": 1538
},
{
"epoch": 1.2962962962962963,
"grad_norm": 13.731108665466309,
"learning_rate": 1.3733445988008729e-06,
"loss": 0.6366062164306641,
"step": 1540
},
{
"epoch": 1.297979797979798,
"grad_norm": 13.730175018310547,
"learning_rate": 1.3717524758724065e-06,
"loss": 0.6833373308181763,
"step": 1542
},
{
"epoch": 1.2996632996632997,
"grad_norm": 10.050169944763184,
"learning_rate": 1.3701594166423182e-06,
"loss": 0.8749772310256958,
"step": 1544
},
{
"epoch": 1.3013468013468015,
"grad_norm": 37.633522033691406,
"learning_rate": 1.3685654265993682e-06,
"loss": 0.7598909139633179,
"step": 1546
},
{
"epoch": 1.303030303030303,
"grad_norm": 5.5588178634643555,
"learning_rate": 1.366970511235522e-06,
"loss": 0.8211129903793335,
"step": 1548
},
{
"epoch": 1.3047138047138047,
"grad_norm": 6.061704158782959,
"learning_rate": 1.3653746760459345e-06,
"loss": 0.5478522777557373,
"step": 1550
},
{
"epoch": 1.3063973063973064,
"grad_norm": 5.632637977600098,
"learning_rate": 1.3637779265289299e-06,
"loss": 0.8678094148635864,
"step": 1552
},
{
"epoch": 1.308080808080808,
"grad_norm": 7.475294589996338,
"learning_rate": 1.3621802681859812e-06,
"loss": 0.9599659442901611,
"step": 1554
},
{
"epoch": 1.3097643097643097,
"grad_norm": 3.182800769805908,
"learning_rate": 1.3605817065216944e-06,
"loss": 0.9627713561058044,
"step": 1556
},
{
"epoch": 1.3114478114478114,
"grad_norm": 7.048341274261475,
"learning_rate": 1.3589822470437864e-06,
"loss": 0.8731982707977295,
"step": 1558
},
{
"epoch": 1.3131313131313131,
"grad_norm": 12.228373527526855,
"learning_rate": 1.3573818952630683e-06,
"loss": 0.3814980089664459,
"step": 1560
},
{
"epoch": 1.3148148148148149,
"grad_norm": 5.152705192565918,
"learning_rate": 1.3557806566934256e-06,
"loss": 0.47562462091445923,
"step": 1562
},
{
"epoch": 1.3164983164983166,
"grad_norm": 12.943581581115723,
"learning_rate": 1.354178536851799e-06,
"loss": 0.5296528935432434,
"step": 1564
},
{
"epoch": 1.3181818181818181,
"grad_norm": 3.833484172821045,
"learning_rate": 1.3525755412581645e-06,
"loss": 1.0292046070098877,
"step": 1566
},
{
"epoch": 1.3198653198653199,
"grad_norm": 9.532318115234375,
"learning_rate": 1.3509716754355174e-06,
"loss": 0.4947565197944641,
"step": 1568
},
{
"epoch": 1.3215488215488216,
"grad_norm": 6.8037848472595215,
"learning_rate": 1.34936694490985e-06,
"loss": 0.897117018699646,
"step": 1570
},
{
"epoch": 1.3232323232323233,
"grad_norm": 4.932839393615723,
"learning_rate": 1.3477613552101344e-06,
"loss": 0.738558292388916,
"step": 1572
},
{
"epoch": 1.3249158249158248,
"grad_norm": 4.227520942687988,
"learning_rate": 1.3461549118683023e-06,
"loss": 0.6831085681915283,
"step": 1574
},
{
"epoch": 1.3265993265993266,
"grad_norm": 4.703937530517578,
"learning_rate": 1.344547620419227e-06,
"loss": 0.931479811668396,
"step": 1576
},
{
"epoch": 1.3282828282828283,
"grad_norm": 8.815512657165527,
"learning_rate": 1.3429394864007037e-06,
"loss": 0.6243126392364502,
"step": 1578
},
{
"epoch": 1.32996632996633,
"grad_norm": 14.775157928466797,
"learning_rate": 1.3413305153534313e-06,
"loss": 0.5434067249298096,
"step": 1580
},
{
"epoch": 1.3316498316498318,
"grad_norm": 4.071495056152344,
"learning_rate": 1.3397207128209916e-06,
"loss": 0.62471604347229,
"step": 1582
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.9714295864105225,
"learning_rate": 1.3381100843498315e-06,
"loss": 0.9411803483963013,
"step": 1584
},
{
"epoch": 1.335016835016835,
"grad_norm": 7.909718990325928,
"learning_rate": 1.3364986354892442e-06,
"loss": 0.7755764722824097,
"step": 1586
},
{
"epoch": 1.3367003367003367,
"grad_norm": 9.560751914978027,
"learning_rate": 1.3348863717913485e-06,
"loss": 0.4694201350212097,
"step": 1588
},
{
"epoch": 1.3383838383838385,
"grad_norm": 4.796677589416504,
"learning_rate": 1.3332732988110717e-06,
"loss": 0.6505795121192932,
"step": 1590
},
{
"epoch": 1.34006734006734,
"grad_norm": 13.761187553405762,
"learning_rate": 1.3316594221061293e-06,
"loss": 0.5099287033081055,
"step": 1592
},
{
"epoch": 1.3417508417508417,
"grad_norm": 3.4837796688079834,
"learning_rate": 1.3300447472370047e-06,
"loss": 0.9218275547027588,
"step": 1594
},
{
"epoch": 1.3434343434343434,
"grad_norm": 7.638758659362793,
"learning_rate": 1.3284292797669325e-06,
"loss": 0.343423992395401,
"step": 1596
},
{
"epoch": 1.3451178451178452,
"grad_norm": 3.7108771800994873,
"learning_rate": 1.326813025261878e-06,
"loss": 0.8066189289093018,
"step": 1598
},
{
"epoch": 1.3468013468013469,
"grad_norm": 5.8035359382629395,
"learning_rate": 1.3251959892905183e-06,
"loss": 0.7118152976036072,
"step": 1600
},
{
"epoch": 1.3484848484848486,
"grad_norm": 3.8060877323150635,
"learning_rate": 1.3235781774242221e-06,
"loss": 0.64288330078125,
"step": 1602
},
{
"epoch": 1.3501683501683501,
"grad_norm": 23.318649291992188,
"learning_rate": 1.321959595237032e-06,
"loss": 0.7593903541564941,
"step": 1604
},
{
"epoch": 1.3518518518518519,
"grad_norm": 6.9713640213012695,
"learning_rate": 1.3203402483056457e-06,
"loss": 1.0495635271072388,
"step": 1606
},
{
"epoch": 1.3535353535353536,
"grad_norm": 3.930389642715454,
"learning_rate": 1.3187201422093937e-06,
"loss": 0.8280398845672607,
"step": 1608
},
{
"epoch": 1.355218855218855,
"grad_norm": 5.5319743156433105,
"learning_rate": 1.3170992825302231e-06,
"loss": 0.37589627504348755,
"step": 1610
},
{
"epoch": 1.3569023569023568,
"grad_norm": 15.358514785766602,
"learning_rate": 1.315477674852678e-06,
"loss": 0.9352704286575317,
"step": 1612
},
{
"epoch": 1.3585858585858586,
"grad_norm": 10.822661399841309,
"learning_rate": 1.3138553247638793e-06,
"loss": 0.6205300092697144,
"step": 1614
},
{
"epoch": 1.3602693602693603,
"grad_norm": 8.775157928466797,
"learning_rate": 1.3122322378535052e-06,
"loss": 0.6584144830703735,
"step": 1616
},
{
"epoch": 1.361952861952862,
"grad_norm": 8.134145736694336,
"learning_rate": 1.310608419713773e-06,
"loss": 0.8533636927604675,
"step": 1618
},
{
"epoch": 1.3636363636363638,
"grad_norm": 3.848430871963501,
"learning_rate": 1.3089838759394198e-06,
"loss": 0.6382489204406738,
"step": 1620
},
{
"epoch": 1.3653198653198653,
"grad_norm": 5.8508620262146,
"learning_rate": 1.3073586121276824e-06,
"loss": 0.947349488735199,
"step": 1622
},
{
"epoch": 1.367003367003367,
"grad_norm": 2.4821629524230957,
"learning_rate": 1.3057326338782782e-06,
"loss": 0.8861122131347656,
"step": 1624
},
{
"epoch": 1.3686868686868687,
"grad_norm": 14.790640830993652,
"learning_rate": 1.3041059467933864e-06,
"loss": 0.6823830604553223,
"step": 1626
},
{
"epoch": 1.3703703703703702,
"grad_norm": 3.717794418334961,
"learning_rate": 1.3024785564776287e-06,
"loss": 0.8171314001083374,
"step": 1628
},
{
"epoch": 1.372053872053872,
"grad_norm": 5.684549331665039,
"learning_rate": 1.3008504685380493e-06,
"loss": 0.8313175439834595,
"step": 1630
},
{
"epoch": 1.3737373737373737,
"grad_norm": 4.290356159210205,
"learning_rate": 1.2992216885840964e-06,
"loss": 1.0408048629760742,
"step": 1632
},
{
"epoch": 1.3754208754208754,
"grad_norm": 12.80916690826416,
"learning_rate": 1.297592222227602e-06,
"loss": 0.613922655582428,
"step": 1634
},
{
"epoch": 1.3771043771043772,
"grad_norm": 5.713363170623779,
"learning_rate": 1.2959620750827637e-06,
"loss": 0.34947091341018677,
"step": 1636
},
{
"epoch": 1.378787878787879,
"grad_norm": 5.641543388366699,
"learning_rate": 1.2943312527661236e-06,
"loss": 0.5458937883377075,
"step": 1638
},
{
"epoch": 1.3804713804713804,
"grad_norm": 2.729052782058716,
"learning_rate": 1.2926997608965515e-06,
"loss": 0.6660902500152588,
"step": 1640
},
{
"epoch": 1.3821548821548821,
"grad_norm": 3.4759159088134766,
"learning_rate": 1.2910676050952232e-06,
"loss": 0.9125963449478149,
"step": 1642
},
{
"epoch": 1.3838383838383839,
"grad_norm": 3.339698314666748,
"learning_rate": 1.2894347909856021e-06,
"loss": 1.1126599311828613,
"step": 1644
},
{
"epoch": 1.3855218855218856,
"grad_norm": 9.030343055725098,
"learning_rate": 1.2878013241934195e-06,
"loss": 0.7313506603240967,
"step": 1646
},
{
"epoch": 1.387205387205387,
"grad_norm": 2.8494951725006104,
"learning_rate": 1.2861672103466564e-06,
"loss": 0.9350987672805786,
"step": 1648
},
{
"epoch": 1.3888888888888888,
"grad_norm": 4.453482151031494,
"learning_rate": 1.284532455075522e-06,
"loss": 0.8794913291931152,
"step": 1650
},
{
"epoch": 1.3905723905723906,
"grad_norm": 3.846998691558838,
"learning_rate": 1.2828970640124361e-06,
"loss": 1.1854183673858643,
"step": 1652
},
{
"epoch": 1.3922558922558923,
"grad_norm": 4.283193588256836,
"learning_rate": 1.281261042792009e-06,
"loss": 0.9548810720443726,
"step": 1654
},
{
"epoch": 1.393939393939394,
"grad_norm": 44.709163665771484,
"learning_rate": 1.2796243970510232e-06,
"loss": 0.5343578457832336,
"step": 1656
},
{
"epoch": 1.3956228956228955,
"grad_norm": 6.563719272613525,
"learning_rate": 1.2779871324284106e-06,
"loss": 0.6447005271911621,
"step": 1658
},
{
"epoch": 1.3973063973063973,
"grad_norm": 31.160367965698242,
"learning_rate": 1.2763492545652373e-06,
"loss": 0.9052919149398804,
"step": 1660
},
{
"epoch": 1.398989898989899,
"grad_norm": 22.833118438720703,
"learning_rate": 1.2747107691046815e-06,
"loss": 0.7731602191925049,
"step": 1662
},
{
"epoch": 1.4006734006734007,
"grad_norm": 8.412581443786621,
"learning_rate": 1.2730716816920151e-06,
"loss": 0.711165189743042,
"step": 1664
},
{
"epoch": 1.4023569023569022,
"grad_norm": 4.195555686950684,
"learning_rate": 1.271431997974584e-06,
"loss": 0.4324186444282532,
"step": 1666
},
{
"epoch": 1.404040404040404,
"grad_norm": 5.849745273590088,
"learning_rate": 1.2697917236017886e-06,
"loss": 0.780827522277832,
"step": 1668
},
{
"epoch": 1.4057239057239057,
"grad_norm": 15.396544456481934,
"learning_rate": 1.2681508642250637e-06,
"loss": 0.7758296728134155,
"step": 1670
},
{
"epoch": 1.4074074074074074,
"grad_norm": 2.463407516479492,
"learning_rate": 1.266509425497861e-06,
"loss": 0.7455316781997681,
"step": 1672
},
{
"epoch": 1.4090909090909092,
"grad_norm": 4.74429988861084,
"learning_rate": 1.2648674130756271e-06,
"loss": 1.1411914825439453,
"step": 1674
},
{
"epoch": 1.410774410774411,
"grad_norm": 23.014007568359375,
"learning_rate": 1.2632248326157854e-06,
"loss": 0.43792814016342163,
"step": 1676
},
{
"epoch": 1.4124579124579124,
"grad_norm": 7.774737358093262,
"learning_rate": 1.2615816897777176e-06,
"loss": 0.9449222087860107,
"step": 1678
},
{
"epoch": 1.4141414141414141,
"grad_norm": 73.73876190185547,
"learning_rate": 1.2599379902227419e-06,
"loss": 0.9584387540817261,
"step": 1680
},
{
"epoch": 1.4158249158249159,
"grad_norm": 2.815396785736084,
"learning_rate": 1.258293739614094e-06,
"loss": 0.6266515254974365,
"step": 1682
},
{
"epoch": 1.4175084175084174,
"grad_norm": 7.27461051940918,
"learning_rate": 1.2566489436169101e-06,
"loss": 0.4212794899940491,
"step": 1684
},
{
"epoch": 1.4191919191919191,
"grad_norm": 14.794193267822266,
"learning_rate": 1.255003607898204e-06,
"loss": 0.5568593740463257,
"step": 1686
},
{
"epoch": 1.4208754208754208,
"grad_norm": 3.4901039600372314,
"learning_rate": 1.2533577381268495e-06,
"loss": 1.1170185804367065,
"step": 1688
},
{
"epoch": 1.4225589225589226,
"grad_norm": 23.822872161865234,
"learning_rate": 1.2517113399735608e-06,
"loss": 0.5119540691375732,
"step": 1690
},
{
"epoch": 1.4242424242424243,
"grad_norm": 4.004513263702393,
"learning_rate": 1.250064419110872e-06,
"loss": 0.5368912220001221,
"step": 1692
},
{
"epoch": 1.425925925925926,
"grad_norm": 4.135901927947998,
"learning_rate": 1.2484169812131184e-06,
"loss": 0.44615352153778076,
"step": 1694
},
{
"epoch": 1.4276094276094276,
"grad_norm": 3.496605396270752,
"learning_rate": 1.246769031956417e-06,
"loss": 1.1422553062438965,
"step": 1696
},
{
"epoch": 1.4292929292929293,
"grad_norm": 15.598981857299805,
"learning_rate": 1.245120577018646e-06,
"loss": 1.0423638820648193,
"step": 1698
},
{
"epoch": 1.430976430976431,
"grad_norm": 5.298037052154541,
"learning_rate": 1.2434716220794265e-06,
"loss": 0.7629603147506714,
"step": 1700
},
{
"epoch": 1.4326599326599325,
"grad_norm": 8.531468391418457,
"learning_rate": 1.2418221728201023e-06,
"loss": 0.8187654614448547,
"step": 1702
},
{
"epoch": 1.4343434343434343,
"grad_norm": 4.175232410430908,
"learning_rate": 1.2401722349237198e-06,
"loss": 0.28517311811447144,
"step": 1704
},
{
"epoch": 1.436026936026936,
"grad_norm": 9.86863899230957,
"learning_rate": 1.238521814075009e-06,
"loss": 0.46237754821777344,
"step": 1706
},
{
"epoch": 1.4377104377104377,
"grad_norm": 13.564935684204102,
"learning_rate": 1.236870915960365e-06,
"loss": 0.958651065826416,
"step": 1708
},
{
"epoch": 1.4393939393939394,
"grad_norm": 6.472082614898682,
"learning_rate": 1.2352195462678257e-06,
"loss": 1.0340254306793213,
"step": 1710
},
{
"epoch": 1.4410774410774412,
"grad_norm": 17.20500373840332,
"learning_rate": 1.2335677106870546e-06,
"loss": 0.9463751316070557,
"step": 1712
},
{
"epoch": 1.4427609427609427,
"grad_norm": 3.069565773010254,
"learning_rate": 1.2319154149093202e-06,
"loss": 0.8773708939552307,
"step": 1714
},
{
"epoch": 1.4444444444444444,
"grad_norm": 65.60623168945312,
"learning_rate": 1.2302626646274773e-06,
"loss": 0.9023821353912354,
"step": 1716
},
{
"epoch": 1.4461279461279462,
"grad_norm": 3.0062930583953857,
"learning_rate": 1.228609465535946e-06,
"loss": 0.7161345481872559,
"step": 1718
},
{
"epoch": 1.4478114478114479,
"grad_norm": 7.399210453033447,
"learning_rate": 1.2269558233306918e-06,
"loss": 0.711788535118103,
"step": 1720
},
{
"epoch": 1.4494949494949494,
"grad_norm": 4.035950660705566,
"learning_rate": 1.2253017437092088e-06,
"loss": 0.5917500257492065,
"step": 1722
},
{
"epoch": 1.4511784511784511,
"grad_norm": 3.7948551177978516,
"learning_rate": 1.2236472323704971e-06,
"loss": 0.7458564043045044,
"step": 1724
},
{
"epoch": 1.4528619528619529,
"grad_norm": 4.743600845336914,
"learning_rate": 1.221992295015044e-06,
"loss": 0.8199291229248047,
"step": 1726
},
{
"epoch": 1.4545454545454546,
"grad_norm": 7.403223514556885,
"learning_rate": 1.2203369373448053e-06,
"loss": 0.651489794254303,
"step": 1728
},
{
"epoch": 1.4562289562289563,
"grad_norm": 9.762937545776367,
"learning_rate": 1.2186811650631847e-06,
"loss": 0.9804219007492065,
"step": 1730
},
{
"epoch": 1.457912457912458,
"grad_norm": 4.465795516967773,
"learning_rate": 1.217024983875014e-06,
"loss": 1.19962739944458,
"step": 1732
},
{
"epoch": 1.4595959595959596,
"grad_norm": 3.2770626544952393,
"learning_rate": 1.2153683994865354e-06,
"loss": 0.8254581093788147,
"step": 1734
},
{
"epoch": 1.4612794612794613,
"grad_norm": 7.466728687286377,
"learning_rate": 1.213711417605378e-06,
"loss": 0.6395374536514282,
"step": 1736
},
{
"epoch": 1.462962962962963,
"grad_norm": 5.3473920822143555,
"learning_rate": 1.2120540439405418e-06,
"loss": 0.5120725631713867,
"step": 1738
},
{
"epoch": 1.4646464646464645,
"grad_norm": 5.0326924324035645,
"learning_rate": 1.2103962842023765e-06,
"loss": 1.0951206684112549,
"step": 1740
},
{
"epoch": 1.4663299663299663,
"grad_norm": 3.968535900115967,
"learning_rate": 1.2087381441025624e-06,
"loss": 0.5963525772094727,
"step": 1742
},
{
"epoch": 1.468013468013468,
"grad_norm": 14.208403587341309,
"learning_rate": 1.2070796293540887e-06,
"loss": 0.5282841324806213,
"step": 1744
},
{
"epoch": 1.4696969696969697,
"grad_norm": 3.4273574352264404,
"learning_rate": 1.2054207456712377e-06,
"loss": 0.9493914246559143,
"step": 1746
},
{
"epoch": 1.4713804713804715,
"grad_norm": 6.042728424072266,
"learning_rate": 1.2037614987695609e-06,
"loss": 0.9857927560806274,
"step": 1748
},
{
"epoch": 1.4730639730639732,
"grad_norm": 13.786114692687988,
"learning_rate": 1.2021018943658623e-06,
"loss": 0.7202122211456299,
"step": 1750
},
{
"epoch": 1.4747474747474747,
"grad_norm": 7.52271842956543,
"learning_rate": 1.2004419381781779e-06,
"loss": 0.6241959929466248,
"step": 1752
},
{
"epoch": 1.4764309764309764,
"grad_norm": 7.334821701049805,
"learning_rate": 1.1987816359257543e-06,
"loss": 0.6670255661010742,
"step": 1754
},
{
"epoch": 1.4781144781144782,
"grad_norm": 8.069925308227539,
"learning_rate": 1.1971209933290318e-06,
"loss": 0.8243575096130371,
"step": 1756
},
{
"epoch": 1.4797979797979797,
"grad_norm": 18.11806297302246,
"learning_rate": 1.1954600161096226e-06,
"loss": 0.5894988775253296,
"step": 1758
},
{
"epoch": 1.4814814814814814,
"grad_norm": 5.887277126312256,
"learning_rate": 1.1937987099902927e-06,
"loss": 0.7406305074691772,
"step": 1760
},
{
"epoch": 1.4831649831649831,
"grad_norm": 4.559276103973389,
"learning_rate": 1.19213708069494e-06,
"loss": 0.9007562398910522,
"step": 1762
},
{
"epoch": 1.4848484848484849,
"grad_norm": 3.148066759109497,
"learning_rate": 1.190475133948577e-06,
"loss": 0.8763662576675415,
"step": 1764
},
{
"epoch": 1.4865319865319866,
"grad_norm": 4.318105697631836,
"learning_rate": 1.1888128754773092e-06,
"loss": 0.586820125579834,
"step": 1766
},
{
"epoch": 1.4882154882154883,
"grad_norm": 8.943533897399902,
"learning_rate": 1.1871503110083167e-06,
"loss": 0.7152913808822632,
"step": 1768
},
{
"epoch": 1.4898989898989898,
"grad_norm": 4.288205146789551,
"learning_rate": 1.1854874462698337e-06,
"loss": 0.9644764065742493,
"step": 1770
},
{
"epoch": 1.4915824915824916,
"grad_norm": 8.917333602905273,
"learning_rate": 1.1838242869911285e-06,
"loss": 0.26478564739227295,
"step": 1772
},
{
"epoch": 1.4932659932659933,
"grad_norm": 2.5780608654022217,
"learning_rate": 1.182160838902485e-06,
"loss": 0.6720756888389587,
"step": 1774
},
{
"epoch": 1.494949494949495,
"grad_norm": 4.799160480499268,
"learning_rate": 1.1804971077351818e-06,
"loss": 0.703216016292572,
"step": 1776
},
{
"epoch": 1.4966329966329965,
"grad_norm": 6.038239002227783,
"learning_rate": 1.1788330992214724e-06,
"loss": 0.8697667121887207,
"step": 1778
},
{
"epoch": 1.4983164983164983,
"grad_norm": 7.712295055389404,
"learning_rate": 1.1771688190945664e-06,
"loss": 0.8953297138214111,
"step": 1780
},
{
"epoch": 1.5,
"grad_norm": 13.718062400817871,
"learning_rate": 1.1755042730886093e-06,
"loss": 0.5260931253433228,
"step": 1782
},
{
"epoch": 1.5016835016835017,
"grad_norm": 2.981576681137085,
"learning_rate": 1.1738394669386621e-06,
"loss": 1.1269118785858154,
"step": 1784
},
{
"epoch": 1.5033670033670035,
"grad_norm": 6.8433427810668945,
"learning_rate": 1.172174406380683e-06,
"loss": 0.7218701839447021,
"step": 1786
},
{
"epoch": 1.5050505050505052,
"grad_norm": 4.18980073928833,
"learning_rate": 1.170509097151506e-06,
"loss": 0.9302811622619629,
"step": 1788
},
{
"epoch": 1.5067340067340067,
"grad_norm": 16.984750747680664,
"learning_rate": 1.168843544988822e-06,
"loss": 0.5803855657577515,
"step": 1790
},
{
"epoch": 1.5084175084175084,
"grad_norm": 7.404435157775879,
"learning_rate": 1.1671777556311587e-06,
"loss": 0.5785191059112549,
"step": 1792
},
{
"epoch": 1.51010101010101,
"grad_norm": 9.237391471862793,
"learning_rate": 1.1655117348178619e-06,
"loss": 0.8854154348373413,
"step": 1794
},
{
"epoch": 1.5117845117845117,
"grad_norm": 15.445114135742188,
"learning_rate": 1.163845488289074e-06,
"loss": 0.8979889154434204,
"step": 1796
},
{
"epoch": 1.5134680134680134,
"grad_norm": 13.821887016296387,
"learning_rate": 1.1621790217857153e-06,
"loss": 0.9836833477020264,
"step": 1798
},
{
"epoch": 1.5151515151515151,
"grad_norm": 3.357462167739868,
"learning_rate": 1.1605123410494643e-06,
"loss": 0.6817135810852051,
"step": 1800
},
{
"epoch": 1.5168350168350169,
"grad_norm": 3.545977830886841,
"learning_rate": 1.1588454518227375e-06,
"loss": 1.0103018283843994,
"step": 1802
},
{
"epoch": 1.5185185185185186,
"grad_norm": 6.9810333251953125,
"learning_rate": 1.157178359848669e-06,
"loss": 0.8972345590591431,
"step": 1804
},
{
"epoch": 1.5202020202020203,
"grad_norm": 4.857126235961914,
"learning_rate": 1.155511070871093e-06,
"loss": 0.8954426050186157,
"step": 1806
},
{
"epoch": 1.5218855218855218,
"grad_norm": 8.7957763671875,
"learning_rate": 1.1538435906345213e-06,
"loss": 0.7016856670379639,
"step": 1808
},
{
"epoch": 1.5235690235690236,
"grad_norm": 6.070329189300537,
"learning_rate": 1.1521759248841237e-06,
"loss": 0.6799755096435547,
"step": 1810
},
{
"epoch": 1.5252525252525253,
"grad_norm": 1.5235867500305176,
"learning_rate": 1.1505080793657124e-06,
"loss": 0.2342766374349594,
"step": 1812
},
{
"epoch": 1.5269360269360268,
"grad_norm": 3.382413864135742,
"learning_rate": 1.1488400598257157e-06,
"loss": 1.052855134010315,
"step": 1814
},
{
"epoch": 1.5286195286195285,
"grad_norm": 27.871402740478516,
"learning_rate": 1.1471718720111629e-06,
"loss": 0.783697247505188,
"step": 1816
},
{
"epoch": 1.5303030303030303,
"grad_norm": 61.98154067993164,
"learning_rate": 1.1455035216696634e-06,
"loss": 0.8607441186904907,
"step": 1818
},
{
"epoch": 1.531986531986532,
"grad_norm": 5.8887763023376465,
"learning_rate": 1.1438350145493853e-06,
"loss": 0.7033579349517822,
"step": 1820
},
{
"epoch": 1.5336700336700337,
"grad_norm": 5.281957149505615,
"learning_rate": 1.1421663563990383e-06,
"loss": 0.6749075651168823,
"step": 1822
},
{
"epoch": 1.5353535353535355,
"grad_norm": 5.970940589904785,
"learning_rate": 1.1404975529678515e-06,
"loss": 0.9477555751800537,
"step": 1824
},
{
"epoch": 1.5370370370370372,
"grad_norm": 2.8316867351531982,
"learning_rate": 1.1388286100055555e-06,
"loss": 0.8103057742118835,
"step": 1826
},
{
"epoch": 1.5387205387205387,
"grad_norm": 3.575162410736084,
"learning_rate": 1.1371595332623601e-06,
"loss": 0.9152002334594727,
"step": 1828
},
{
"epoch": 1.5404040404040404,
"grad_norm": 8.131978988647461,
"learning_rate": 1.1354903284889377e-06,
"loss": 0.6978881359100342,
"step": 1830
},
{
"epoch": 1.542087542087542,
"grad_norm": 6.332693099975586,
"learning_rate": 1.133821001436401e-06,
"loss": 0.6509323120117188,
"step": 1832
},
{
"epoch": 1.5437710437710437,
"grad_norm": 2.458233594894409,
"learning_rate": 1.1321515578562835e-06,
"loss": 1.04543936252594,
"step": 1834
},
{
"epoch": 1.5454545454545454,
"grad_norm": 270.1246032714844,
"learning_rate": 1.1304820035005211e-06,
"loss": 0.9024485349655151,
"step": 1836
},
{
"epoch": 1.5471380471380471,
"grad_norm": 4.0058183670043945,
"learning_rate": 1.1288123441214315e-06,
"loss": 0.4209427535533905,
"step": 1838
},
{
"epoch": 1.5488215488215489,
"grad_norm": 4.2291083335876465,
"learning_rate": 1.1271425854716931e-06,
"loss": 0.6784233450889587,
"step": 1840
},
{
"epoch": 1.5505050505050506,
"grad_norm": 15.050762176513672,
"learning_rate": 1.125472733304327e-06,
"loss": 0.5746853351593018,
"step": 1842
},
{
"epoch": 1.5521885521885523,
"grad_norm": 9.32972526550293,
"learning_rate": 1.1238027933726776e-06,
"loss": 0.41324469447135925,
"step": 1844
},
{
"epoch": 1.5538720538720538,
"grad_norm": 3.370657205581665,
"learning_rate": 1.122132771430389e-06,
"loss": 0.9156204462051392,
"step": 1846
},
{
"epoch": 1.5555555555555556,
"grad_norm": 4.2874650955200195,
"learning_rate": 1.1204626732313907e-06,
"loss": 0.9899235367774963,
"step": 1848
},
{
"epoch": 1.557239057239057,
"grad_norm": 3.843651294708252,
"learning_rate": 1.1187925045298732e-06,
"loss": 0.8029769659042358,
"step": 1850
},
{
"epoch": 1.5589225589225588,
"grad_norm": 3.7915287017822266,
"learning_rate": 1.1171222710802704e-06,
"loss": 0.9333086013793945,
"step": 1852
},
{
"epoch": 1.5606060606060606,
"grad_norm": 17.623516082763672,
"learning_rate": 1.1154519786372392e-06,
"loss": 0.5394339561462402,
"step": 1854
},
{
"epoch": 1.5622895622895623,
"grad_norm": 2.851343870162964,
"learning_rate": 1.1137816329556403e-06,
"loss": 0.617688775062561,
"step": 1856
},
{
"epoch": 1.563973063973064,
"grad_norm": 3.1740288734436035,
"learning_rate": 1.112111239790517e-06,
"loss": 0.902677059173584,
"step": 1858
},
{
"epoch": 1.5656565656565657,
"grad_norm": 7.28153133392334,
"learning_rate": 1.1104408048970765e-06,
"loss": 0.3739192485809326,
"step": 1860
},
{
"epoch": 1.5673400673400675,
"grad_norm": 3.828963279724121,
"learning_rate": 1.1087703340306707e-06,
"loss": 0.9757977724075317,
"step": 1862
},
{
"epoch": 1.569023569023569,
"grad_norm": 4.035392761230469,
"learning_rate": 1.1070998329467738e-06,
"loss": 0.33518415689468384,
"step": 1864
},
{
"epoch": 1.5707070707070707,
"grad_norm": 7.576591968536377,
"learning_rate": 1.1054293074009646e-06,
"loss": 0.9643778800964355,
"step": 1866
},
{
"epoch": 1.5723905723905722,
"grad_norm": 10.830273628234863,
"learning_rate": 1.1037587631489077e-06,
"loss": 0.6072518825531006,
"step": 1868
},
{
"epoch": 1.574074074074074,
"grad_norm": 2.8351891040802,
"learning_rate": 1.1020882059463297e-06,
"loss": 0.8100966215133667,
"step": 1870
},
{
"epoch": 1.5757575757575757,
"grad_norm": 2.7790122032165527,
"learning_rate": 1.1004176415490036e-06,
"loss": 0.7995985746383667,
"step": 1872
},
{
"epoch": 1.5774410774410774,
"grad_norm": 3.3753979206085205,
"learning_rate": 1.0987470757127267e-06,
"loss": 0.8837331533432007,
"step": 1874
},
{
"epoch": 1.5791245791245792,
"grad_norm": 12.539671897888184,
"learning_rate": 1.0970765141933012e-06,
"loss": 0.5485697388648987,
"step": 1876
},
{
"epoch": 1.5808080808080809,
"grad_norm": 2.4989864826202393,
"learning_rate": 1.0954059627465144e-06,
"loss": 1.1583393812179565,
"step": 1878
},
{
"epoch": 1.5824915824915826,
"grad_norm": 6.108792304992676,
"learning_rate": 1.093735427128119e-06,
"loss": 0.7429193258285522,
"step": 1880
},
{
"epoch": 1.5841750841750841,
"grad_norm": 5.460334300994873,
"learning_rate": 1.092064913093813e-06,
"loss": 0.4204625189304352,
"step": 1882
},
{
"epoch": 1.5858585858585859,
"grad_norm": 19.09606170654297,
"learning_rate": 1.09039442639922e-06,
"loss": 0.5326663255691528,
"step": 1884
},
{
"epoch": 1.5875420875420876,
"grad_norm": 12.058549880981445,
"learning_rate": 1.0887239727998697e-06,
"loss": 0.6357114315032959,
"step": 1886
},
{
"epoch": 1.589225589225589,
"grad_norm": 14.583388328552246,
"learning_rate": 1.0870535580511778e-06,
"loss": 1.0033700466156006,
"step": 1888
},
{
"epoch": 1.5909090909090908,
"grad_norm": 3.857271671295166,
"learning_rate": 1.0853831879084254e-06,
"loss": 0.19512847065925598,
"step": 1890
},
{
"epoch": 1.5925925925925926,
"grad_norm": 8.160994529724121,
"learning_rate": 1.0837128681267409e-06,
"loss": 1.0877628326416016,
"step": 1892
},
{
"epoch": 1.5942760942760943,
"grad_norm": 4.857079982757568,
"learning_rate": 1.082042604461079e-06,
"loss": 0.7703442573547363,
"step": 1894
},
{
"epoch": 1.595959595959596,
"grad_norm": 22.629634857177734,
"learning_rate": 1.0803724026662e-06,
"loss": 0.9460948705673218,
"step": 1896
},
{
"epoch": 1.5976430976430978,
"grad_norm": 12.860857963562012,
"learning_rate": 1.0787022684966524e-06,
"loss": 0.8795516490936279,
"step": 1898
},
{
"epoch": 1.5993265993265995,
"grad_norm": 7.2853193283081055,
"learning_rate": 1.0770322077067512e-06,
"loss": 0.8695672750473022,
"step": 1900
},
{
"epoch": 1.601010101010101,
"grad_norm": 3.358490467071533,
"learning_rate": 1.0753622260505582e-06,
"loss": 0.8867776393890381,
"step": 1902
},
{
"epoch": 1.6026936026936027,
"grad_norm": 4.54321813583374,
"learning_rate": 1.0736923292818631e-06,
"loss": 0.730638861656189,
"step": 1904
},
{
"epoch": 1.6043771043771042,
"grad_norm": 2.6725199222564697,
"learning_rate": 1.0720225231541629e-06,
"loss": 1.1262996196746826,
"step": 1906
},
{
"epoch": 1.606060606060606,
"grad_norm": 4.424936771392822,
"learning_rate": 1.0703528134206418e-06,
"loss": 0.9013878703117371,
"step": 1908
},
{
"epoch": 1.6077441077441077,
"grad_norm": 4.459665775299072,
"learning_rate": 1.0686832058341534e-06,
"loss": 0.5786502957344055,
"step": 1910
},
{
"epoch": 1.6094276094276094,
"grad_norm": 7.450462341308594,
"learning_rate": 1.0670137061471972e-06,
"loss": 0.5591634511947632,
"step": 1912
},
{
"epoch": 1.6111111111111112,
"grad_norm": 6.1925764083862305,
"learning_rate": 1.0653443201119026e-06,
"loss": 0.7897850275039673,
"step": 1914
},
{
"epoch": 1.612794612794613,
"grad_norm": 14.83733081817627,
"learning_rate": 1.063675053480007e-06,
"loss": 0.6848697066307068,
"step": 1916
},
{
"epoch": 1.6144781144781146,
"grad_norm": 7.251978397369385,
"learning_rate": 1.0620059120028363e-06,
"loss": 0.6231127977371216,
"step": 1918
},
{
"epoch": 1.6161616161616161,
"grad_norm": 6.919361591339111,
"learning_rate": 1.0603369014312848e-06,
"loss": 0.665825605392456,
"step": 1920
},
{
"epoch": 1.6178451178451179,
"grad_norm": 5.165210247039795,
"learning_rate": 1.0586680275157966e-06,
"loss": 0.9070066809654236,
"step": 1922
},
{
"epoch": 1.6195286195286194,
"grad_norm": 14.22563648223877,
"learning_rate": 1.0569992960063445e-06,
"loss": 0.6462626457214355,
"step": 1924
},
{
"epoch": 1.621212121212121,
"grad_norm": 10.909793853759766,
"learning_rate": 1.0553307126524105e-06,
"loss": 0.6075209379196167,
"step": 1926
},
{
"epoch": 1.6228956228956228,
"grad_norm": 2.4525344371795654,
"learning_rate": 1.0536622832029663e-06,
"loss": 0.7241764068603516,
"step": 1928
},
{
"epoch": 1.6245791245791246,
"grad_norm": 7.981225967407227,
"learning_rate": 1.0519940134064535e-06,
"loss": 0.7813702821731567,
"step": 1930
},
{
"epoch": 1.6262626262626263,
"grad_norm": 3.924685478210449,
"learning_rate": 1.0503259090107635e-06,
"loss": 0.6770836114883423,
"step": 1932
},
{
"epoch": 1.627946127946128,
"grad_norm": 6.139669895172119,
"learning_rate": 1.0486579757632177e-06,
"loss": 0.9623356461524963,
"step": 1934
},
{
"epoch": 1.6296296296296298,
"grad_norm": 12.121989250183105,
"learning_rate": 1.046990219410548e-06,
"loss": 0.9487285614013672,
"step": 1936
},
{
"epoch": 1.6313131313131313,
"grad_norm": 3.9633893966674805,
"learning_rate": 1.0453226456988766e-06,
"loss": 1.0289177894592285,
"step": 1938
},
{
"epoch": 1.632996632996633,
"grad_norm": 7.765763759613037,
"learning_rate": 1.0436552603736967e-06,
"loss": 0.8020685315132141,
"step": 1940
},
{
"epoch": 1.6346801346801347,
"grad_norm": 4.463337421417236,
"learning_rate": 1.0419880691798526e-06,
"loss": 1.010524868965149,
"step": 1942
},
{
"epoch": 1.6363636363636362,
"grad_norm": 29.483732223510742,
"learning_rate": 1.040321077861519e-06,
"loss": 0.7623812556266785,
"step": 1944
},
{
"epoch": 1.638047138047138,
"grad_norm": 3.125913619995117,
"learning_rate": 1.0386542921621824e-06,
"loss": 0.41824889183044434,
"step": 1946
},
{
"epoch": 1.6397306397306397,
"grad_norm": 6.553778648376465,
"learning_rate": 1.036987717824621e-06,
"loss": 0.9804911613464355,
"step": 1948
},
{
"epoch": 1.6414141414141414,
"grad_norm": 3.5837337970733643,
"learning_rate": 1.0353213605908854e-06,
"loss": 0.999625563621521,
"step": 1950
},
{
"epoch": 1.6430976430976432,
"grad_norm": 6.643466949462891,
"learning_rate": 1.0336552262022756e-06,
"loss": 0.49242016673088074,
"step": 1952
},
{
"epoch": 1.644781144781145,
"grad_norm": 5.533985614776611,
"learning_rate": 1.0319893203993276e-06,
"loss": 0.39796119928359985,
"step": 1954
},
{
"epoch": 1.6464646464646466,
"grad_norm": 15.642714500427246,
"learning_rate": 1.0303236489217863e-06,
"loss": 0.22867411375045776,
"step": 1956
},
{
"epoch": 1.6481481481481481,
"grad_norm": 6.918363571166992,
"learning_rate": 1.0286582175085913e-06,
"loss": 0.6615217924118042,
"step": 1958
},
{
"epoch": 1.6498316498316499,
"grad_norm": 9.181211471557617,
"learning_rate": 1.0269930318978552e-06,
"loss": 0.7599420547485352,
"step": 1960
},
{
"epoch": 1.6515151515151514,
"grad_norm": 7.992411136627197,
"learning_rate": 1.0253280978268421e-06,
"loss": 0.6117727756500244,
"step": 1962
},
{
"epoch": 1.6531986531986531,
"grad_norm": 6.661476135253906,
"learning_rate": 1.0236634210319507e-06,
"loss": 0.446529746055603,
"step": 1964
},
{
"epoch": 1.6548821548821548,
"grad_norm": 7.98351526260376,
"learning_rate": 1.0219990072486938e-06,
"loss": 0.5448979139328003,
"step": 1966
},
{
"epoch": 1.6565656565656566,
"grad_norm": 3.0457189083099365,
"learning_rate": 1.020334862211676e-06,
"loss": 0.8045427799224854,
"step": 1968
},
{
"epoch": 1.6582491582491583,
"grad_norm": 11.37780475616455,
"learning_rate": 1.0186709916545775e-06,
"loss": 0.7433644533157349,
"step": 1970
},
{
"epoch": 1.65993265993266,
"grad_norm": 18.77171516418457,
"learning_rate": 1.0170074013101329e-06,
"loss": 0.9492733478546143,
"step": 1972
},
{
"epoch": 1.6616161616161618,
"grad_norm": 2.508883237838745,
"learning_rate": 1.0153440969101103e-06,
"loss": 0.7720388174057007,
"step": 1974
},
{
"epoch": 1.6632996632996633,
"grad_norm": 24.94049644470215,
"learning_rate": 1.0136810841852937e-06,
"loss": 0.6722294688224792,
"step": 1976
},
{
"epoch": 1.664983164983165,
"grad_norm": 24.168481826782227,
"learning_rate": 1.0120183688654616e-06,
"loss": 0.6629032492637634,
"step": 1978
},
{
"epoch": 1.6666666666666665,
"grad_norm": 6.165626525878906,
"learning_rate": 1.0103559566793679e-06,
"loss": 0.7866932153701782,
"step": 1980
},
{
"epoch": 1.6683501683501682,
"grad_norm": 10.85080337524414,
"learning_rate": 1.0086938533547213e-06,
"loss": 0.5067884922027588,
"step": 1982
},
{
"epoch": 1.67003367003367,
"grad_norm": 6.6474199295043945,
"learning_rate": 1.0070320646181684e-06,
"loss": 0.3992816209793091,
"step": 1984
},
{
"epoch": 1.6717171717171717,
"grad_norm": 3.2397751808166504,
"learning_rate": 1.0053705961952697e-06,
"loss": 0.9870185256004333,
"step": 1986
},
{
"epoch": 1.6734006734006734,
"grad_norm": 4.640201568603516,
"learning_rate": 1.0037094538104832e-06,
"loss": 0.916529655456543,
"step": 1988
},
{
"epoch": 1.6750841750841752,
"grad_norm": 2.8144161701202393,
"learning_rate": 1.002048643187143e-06,
"loss": 0.5871807932853699,
"step": 1990
},
{
"epoch": 1.676767676767677,
"grad_norm": 2.0470471382141113,
"learning_rate": 1.0003881700474415e-06,
"loss": 1.0817761421203613,
"step": 1992
},
{
"epoch": 1.6784511784511784,
"grad_norm": 13.735745429992676,
"learning_rate": 9.987280401124063e-07,
"loss": 0.6647434234619141,
"step": 1994
},
{
"epoch": 1.6801346801346801,
"grad_norm": 8.261211395263672,
"learning_rate": 9.970682591018842e-07,
"loss": 0.6305195689201355,
"step": 1996
},
{
"epoch": 1.6818181818181817,
"grad_norm": 2.91133713722229,
"learning_rate": 9.95408832734519e-07,
"loss": 0.9166790246963501,
"step": 1998
},
{
"epoch": 1.6835016835016834,
"grad_norm": 12.362771987915039,
"learning_rate": 9.937497667277322e-07,
"loss": 0.7117506861686707,
"step": 2000
},
{
"epoch": 1.6851851851851851,
"grad_norm": 6.4341349601745605,
"learning_rate": 9.92091066797705e-07,
"loss": 0.5901815891265869,
"step": 2002
},
{
"epoch": 1.6868686868686869,
"grad_norm": 6.667015075683594,
"learning_rate": 9.904327386593563e-07,
"loss": 0.6358145475387573,
"step": 2004
},
{
"epoch": 1.6885521885521886,
"grad_norm": 5.651986122131348,
"learning_rate": 9.887747880263236e-07,
"loss": 0.6100403666496277,
"step": 2006
},
{
"epoch": 1.6902356902356903,
"grad_norm": 2.491840124130249,
"learning_rate": 9.871172206109458e-07,
"loss": 0.9090219736099243,
"step": 2008
},
{
"epoch": 1.691919191919192,
"grad_norm": 5.323090553283691,
"learning_rate": 9.854600421242396e-07,
"loss": 0.464111328125,
"step": 2010
},
{
"epoch": 1.6936026936026936,
"grad_norm": 5.4663496017456055,
"learning_rate": 9.838032582758814e-07,
"loss": 0.7845708727836609,
"step": 2012
},
{
"epoch": 1.6952861952861953,
"grad_norm": 5.896030902862549,
"learning_rate": 9.821468747741893e-07,
"loss": 0.5561348795890808,
"step": 2014
},
{
"epoch": 1.696969696969697,
"grad_norm": 11.155455589294434,
"learning_rate": 9.804908973261012e-07,
"loss": 0.7063945531845093,
"step": 2016
},
{
"epoch": 1.6986531986531985,
"grad_norm": 12.239091873168945,
"learning_rate": 9.788353316371562e-07,
"loss": 0.7154542803764343,
"step": 2018
},
{
"epoch": 1.7003367003367003,
"grad_norm": 7.434993743896484,
"learning_rate": 9.771801834114748e-07,
"loss": 0.6947083473205566,
"step": 2020
},
{
"epoch": 1.702020202020202,
"grad_norm": 8.770514488220215,
"learning_rate": 9.755254583517394e-07,
"loss": 0.998549222946167,
"step": 2022
},
{
"epoch": 1.7037037037037037,
"grad_norm": 3.318065643310547,
"learning_rate": 9.738711621591733e-07,
"loss": 0.7664910554885864,
"step": 2024
},
{
"epoch": 1.7053872053872055,
"grad_norm": 3.665529489517212,
"learning_rate": 9.722173005335235e-07,
"loss": 0.8967854976654053,
"step": 2026
},
{
"epoch": 1.7070707070707072,
"grad_norm": 4.913332939147949,
"learning_rate": 9.705638791730391e-07,
"loss": 1.01124906539917,
"step": 2028
},
{
"epoch": 1.708754208754209,
"grad_norm": 13.111969947814941,
"learning_rate": 9.689109037744522e-07,
"loss": 0.5944876670837402,
"step": 2030
},
{
"epoch": 1.7104377104377104,
"grad_norm": 4.983813762664795,
"learning_rate": 9.672583800329585e-07,
"loss": 0.3871064782142639,
"step": 2032
},
{
"epoch": 1.7121212121212122,
"grad_norm": 3.5434679985046387,
"learning_rate": 9.65606313642198e-07,
"loss": 0.8279162645339966,
"step": 2034
},
{
"epoch": 1.7138047138047137,
"grad_norm": 4.728488922119141,
"learning_rate": 9.63954710294234e-07,
"loss": 0.7765666246414185,
"step": 2036
},
{
"epoch": 1.7154882154882154,
"grad_norm": 9.771429061889648,
"learning_rate": 9.623035756795352e-07,
"loss": 0.38172125816345215,
"step": 2038
},
{
"epoch": 1.7171717171717171,
"grad_norm": 6.764921188354492,
"learning_rate": 9.606529154869556e-07,
"loss": 0.4684080481529236,
"step": 2040
},
{
"epoch": 1.7188552188552189,
"grad_norm": 3.6262731552124023,
"learning_rate": 9.590027354037134e-07,
"loss": 0.8603177070617676,
"step": 2042
},
{
"epoch": 1.7205387205387206,
"grad_norm": 3.9443676471710205,
"learning_rate": 9.573530411153732e-07,
"loss": 0.8025220632553101,
"step": 2044
},
{
"epoch": 1.7222222222222223,
"grad_norm": 3.0719077587127686,
"learning_rate": 9.557038383058265e-07,
"loss": 1.0896143913269043,
"step": 2046
},
{
"epoch": 1.723905723905724,
"grad_norm": 2.531261920928955,
"learning_rate": 9.540551326572709e-07,
"loss": 0.9985212087631226,
"step": 2048
},
{
"epoch": 1.7255892255892256,
"grad_norm": 12.073436737060547,
"learning_rate": 9.524069298501902e-07,
"loss": 0.3480485677719116,
"step": 2050
},
{
"epoch": 1.7272727272727273,
"grad_norm": 3.418630599975586,
"learning_rate": 9.507592355633376e-07,
"loss": 1.0522449016571045,
"step": 2052
},
{
"epoch": 1.7289562289562288,
"grad_norm": 8.016219139099121,
"learning_rate": 9.491120554737126e-07,
"loss": 0.8546870350837708,
"step": 2054
},
{
"epoch": 1.7306397306397305,
"grad_norm": 8.340877532958984,
"learning_rate": 9.474653952565439e-07,
"loss": 0.7133148908615112,
"step": 2056
},
{
"epoch": 1.7323232323232323,
"grad_norm": 7.340385437011719,
"learning_rate": 9.458192605852691e-07,
"loss": 1.0671539306640625,
"step": 2058
},
{
"epoch": 1.734006734006734,
"grad_norm": 3.097386121749878,
"learning_rate": 9.441736571315142e-07,
"loss": 0.6089422702789307,
"step": 2060
},
{
"epoch": 1.7356902356902357,
"grad_norm": 24.186203002929688,
"learning_rate": 9.425285905650755e-07,
"loss": 1.0324299335479736,
"step": 2062
},
{
"epoch": 1.7373737373737375,
"grad_norm": 6.780990123748779,
"learning_rate": 9.408840665538999e-07,
"loss": 0.6091172099113464,
"step": 2064
},
{
"epoch": 1.7390572390572392,
"grad_norm": 3.00539231300354,
"learning_rate": 9.392400907640645e-07,
"loss": 0.6669168472290039,
"step": 2066
},
{
"epoch": 1.7407407407407407,
"grad_norm": 9.17961311340332,
"learning_rate": 9.375966688597572e-07,
"loss": 0.8047370314598083,
"step": 2068
},
{
"epoch": 1.7424242424242424,
"grad_norm": 3.1698920726776123,
"learning_rate": 9.359538065032586e-07,
"loss": 0.6602023839950562,
"step": 2070
},
{
"epoch": 1.7441077441077442,
"grad_norm": 30.58012580871582,
"learning_rate": 9.343115093549203e-07,
"loss": 0.6329094171524048,
"step": 2072
},
{
"epoch": 1.7457912457912457,
"grad_norm": 4.666545867919922,
"learning_rate": 9.32669783073147e-07,
"loss": 0.5041278600692749,
"step": 2074
},
{
"epoch": 1.7474747474747474,
"grad_norm": 9.805131912231445,
"learning_rate": 9.310286333143767e-07,
"loss": 1.0198402404785156,
"step": 2076
},
{
"epoch": 1.7491582491582491,
"grad_norm": 11.345261573791504,
"learning_rate": 9.293880657330604e-07,
"loss": 0.7572150230407715,
"step": 2078
},
{
"epoch": 1.7508417508417509,
"grad_norm": 5.581562519073486,
"learning_rate": 9.277480859816444e-07,
"loss": 0.9102179408073425,
"step": 2080
},
{
"epoch": 1.7525252525252526,
"grad_norm": 2.126108169555664,
"learning_rate": 9.261086997105487e-07,
"loss": 0.5880842208862305,
"step": 2082
},
{
"epoch": 1.7542087542087543,
"grad_norm": 2.7663979530334473,
"learning_rate": 9.244699125681485e-07,
"loss": 1.1094093322753906,
"step": 2084
},
{
"epoch": 1.7558922558922558,
"grad_norm": 11.278059959411621,
"learning_rate": 9.228317302007556e-07,
"loss": 0.7268582582473755,
"step": 2086
},
{
"epoch": 1.7575757575757576,
"grad_norm": 9.18853759765625,
"learning_rate": 9.211941582525968e-07,
"loss": 0.44798004627227783,
"step": 2088
},
{
"epoch": 1.7592592592592593,
"grad_norm": 11.152181625366211,
"learning_rate": 9.195572023657969e-07,
"loss": 0.4857521653175354,
"step": 2090
},
{
"epoch": 1.7609427609427608,
"grad_norm": 6.81666374206543,
"learning_rate": 9.179208681803579e-07,
"loss": 0.510983943939209,
"step": 2092
},
{
"epoch": 1.7626262626262625,
"grad_norm": 6.450544834136963,
"learning_rate": 9.162851613341389e-07,
"loss": 0.4013763964176178,
"step": 2094
},
{
"epoch": 1.7643097643097643,
"grad_norm": 4.122218608856201,
"learning_rate": 9.146500874628391e-07,
"loss": 0.6035534143447876,
"step": 2096
},
{
"epoch": 1.765993265993266,
"grad_norm": 7.275836944580078,
"learning_rate": 9.130156521999757e-07,
"loss": 0.9859648942947388,
"step": 2098
},
{
"epoch": 1.7676767676767677,
"grad_norm": 3.9273769855499268,
"learning_rate": 9.113818611768654e-07,
"loss": 0.918908953666687,
"step": 2100
},
{
"epoch": 1.7693602693602695,
"grad_norm": 5.415125846862793,
"learning_rate": 9.097487200226059e-07,
"loss": 0.8446367979049683,
"step": 2102
},
{
"epoch": 1.7710437710437712,
"grad_norm": 15.805021286010742,
"learning_rate": 9.081162343640561e-07,
"loss": 0.5240712761878967,
"step": 2104
},
{
"epoch": 1.7727272727272727,
"grad_norm": 5.227410316467285,
"learning_rate": 9.064844098258153e-07,
"loss": 0.6734915375709534,
"step": 2106
},
{
"epoch": 1.7744107744107744,
"grad_norm": 9.779236793518066,
"learning_rate": 9.048532520302061e-07,
"loss": 0.8648114204406738,
"step": 2108
},
{
"epoch": 1.776094276094276,
"grad_norm": 9.622480392456055,
"learning_rate": 9.032227665972534e-07,
"loss": 0.4809529781341553,
"step": 2110
},
{
"epoch": 1.7777777777777777,
"grad_norm": 3.092237949371338,
"learning_rate": 9.015929591446651e-07,
"loss": 0.775432288646698,
"step": 2112
},
{
"epoch": 1.7794612794612794,
"grad_norm": 6.282991409301758,
"learning_rate": 8.999638352878142e-07,
"loss": 0.8989666700363159,
"step": 2114
},
{
"epoch": 1.7811447811447811,
"grad_norm": 2.8331105709075928,
"learning_rate": 8.983354006397177e-07,
"loss": 0.9354023933410645,
"step": 2116
},
{
"epoch": 1.7828282828282829,
"grad_norm": 3.6972124576568604,
"learning_rate": 8.96707660811018e-07,
"loss": 0.8982851505279541,
"step": 2118
},
{
"epoch": 1.7845117845117846,
"grad_norm": 7.385217189788818,
"learning_rate": 8.950806214099638e-07,
"loss": 0.6171048879623413,
"step": 2120
},
{
"epoch": 1.7861952861952863,
"grad_norm": 6.615528106689453,
"learning_rate": 8.934542880423903e-07,
"loss": 0.5291919708251953,
"step": 2122
},
{
"epoch": 1.7878787878787878,
"grad_norm": 4.079862117767334,
"learning_rate": 8.918286663117005e-07,
"loss": 0.7172562479972839,
"step": 2124
},
{
"epoch": 1.7895622895622896,
"grad_norm": 3.52138352394104,
"learning_rate": 8.902037618188449e-07,
"loss": 0.6790080666542053,
"step": 2126
},
{
"epoch": 1.791245791245791,
"grad_norm": 3.611370325088501,
"learning_rate": 8.885795801623035e-07,
"loss": 0.6517022848129272,
"step": 2128
},
{
"epoch": 1.7929292929292928,
"grad_norm": 14.185620307922363,
"learning_rate": 8.869561269380652e-07,
"loss": 0.6533136367797852,
"step": 2130
},
{
"epoch": 1.7946127946127945,
"grad_norm": 4.437119007110596,
"learning_rate": 8.853334077396098e-07,
"loss": 0.5168370008468628,
"step": 2132
},
{
"epoch": 1.7962962962962963,
"grad_norm": 2.8739631175994873,
"learning_rate": 8.837114281578872e-07,
"loss": 0.6581718921661377,
"step": 2134
},
{
"epoch": 1.797979797979798,
"grad_norm": 6.71103572845459,
"learning_rate": 8.820901937813003e-07,
"loss": 0.3342350125312805,
"step": 2136
},
{
"epoch": 1.7996632996632997,
"grad_norm": 4.6629486083984375,
"learning_rate": 8.804697101956828e-07,
"loss": 0.9553017616271973,
"step": 2138
},
{
"epoch": 1.8013468013468015,
"grad_norm": 3.458785057067871,
"learning_rate": 8.78849982984283e-07,
"loss": 0.7399221658706665,
"step": 2140
},
{
"epoch": 1.803030303030303,
"grad_norm": 6.880527973175049,
"learning_rate": 8.772310177277427e-07,
"loss": 0.7662659287452698,
"step": 2142
},
{
"epoch": 1.8047138047138047,
"grad_norm": 4.792196273803711,
"learning_rate": 8.756128200040782e-07,
"loss": 0.6991869211196899,
"step": 2144
},
{
"epoch": 1.8063973063973064,
"grad_norm": 20.593137741088867,
"learning_rate": 8.739953953886614e-07,
"loss": 0.8479831218719482,
"step": 2146
},
{
"epoch": 1.808080808080808,
"grad_norm": 5.395805358886719,
"learning_rate": 8.72378749454201e-07,
"loss": 0.8385607004165649,
"step": 2148
},
{
"epoch": 1.8097643097643097,
"grad_norm": 2.312955379486084,
"learning_rate": 8.707628877707221e-07,
"loss": 0.9476625919342041,
"step": 2150
},
{
"epoch": 1.8114478114478114,
"grad_norm": 4.342343807220459,
"learning_rate": 8.691478159055483e-07,
"loss": 0.9815539121627808,
"step": 2152
},
{
"epoch": 1.8131313131313131,
"grad_norm": 7.069920063018799,
"learning_rate": 8.675335394232819e-07,
"loss": 0.7816078066825867,
"step": 2154
},
{
"epoch": 1.8148148148148149,
"grad_norm": 3.6072463989257812,
"learning_rate": 8.659200638857845e-07,
"loss": 0.642024576663971,
"step": 2156
},
{
"epoch": 1.8164983164983166,
"grad_norm": 3.091968297958374,
"learning_rate": 8.643073948521576e-07,
"loss": 0.4574873447418213,
"step": 2158
},
{
"epoch": 1.8181818181818183,
"grad_norm": 3.328583002090454,
"learning_rate": 8.626955378787256e-07,
"loss": 0.8696750402450562,
"step": 2160
},
{
"epoch": 1.8198653198653199,
"grad_norm": 10.013894081115723,
"learning_rate": 8.610844985190127e-07,
"loss": 0.6890912652015686,
"step": 2162
},
{
"epoch": 1.8215488215488216,
"grad_norm": 9.785984992980957,
"learning_rate": 8.594742823237287e-07,
"loss": 0.8184359073638916,
"step": 2164
},
{
"epoch": 1.823232323232323,
"grad_norm": 6.415750026702881,
"learning_rate": 8.578648948407452e-07,
"loss": 0.9016733169555664,
"step": 2166
},
{
"epoch": 1.8249158249158248,
"grad_norm": 4.460061073303223,
"learning_rate": 8.562563416150794e-07,
"loss": 0.8134877681732178,
"step": 2168
},
{
"epoch": 1.8265993265993266,
"grad_norm": 9.919402122497559,
"learning_rate": 8.546486281888739e-07,
"loss": 0.5249311923980713,
"step": 2170
},
{
"epoch": 1.8282828282828283,
"grad_norm": 4.25754976272583,
"learning_rate": 8.53041760101378e-07,
"loss": 0.7299934029579163,
"step": 2172
},
{
"epoch": 1.82996632996633,
"grad_norm": 5.09484338760376,
"learning_rate": 8.51435742888928e-07,
"loss": 0.3953469395637512,
"step": 2174
},
{
"epoch": 1.8316498316498318,
"grad_norm": 3.4278955459594727,
"learning_rate": 8.498305820849296e-07,
"loss": 0.4628002643585205,
"step": 2176
},
{
"epoch": 1.8333333333333335,
"grad_norm": 9.368400573730469,
"learning_rate": 8.482262832198365e-07,
"loss": 0.6508548259735107,
"step": 2178
},
{
"epoch": 1.835016835016835,
"grad_norm": 2.587501287460327,
"learning_rate": 8.46622851821134e-07,
"loss": 0.8062055110931396,
"step": 2180
},
{
"epoch": 1.8367003367003367,
"grad_norm": 19.423526763916016,
"learning_rate": 8.450202934133174e-07,
"loss": 0.395694375038147,
"step": 2182
},
{
"epoch": 1.8383838383838382,
"grad_norm": 9.420888900756836,
"learning_rate": 8.434186135178749e-07,
"loss": 0.9303032159805298,
"step": 2184
},
{
"epoch": 1.84006734006734,
"grad_norm": 3.11016845703125,
"learning_rate": 8.418178176532674e-07,
"loss": 0.9512186050415039,
"step": 2186
},
{
"epoch": 1.8417508417508417,
"grad_norm": 3.171823501586914,
"learning_rate": 8.402179113349106e-07,
"loss": 0.8358129262924194,
"step": 2188
},
{
"epoch": 1.8434343434343434,
"grad_norm": 20.7672176361084,
"learning_rate": 8.386189000751544e-07,
"loss": 0.4782221019268036,
"step": 2190
},
{
"epoch": 1.8451178451178452,
"grad_norm": 16.821311950683594,
"learning_rate": 8.370207893832661e-07,
"loss": 0.7589244246482849,
"step": 2192
},
{
"epoch": 1.8468013468013469,
"grad_norm": 13.138861656188965,
"learning_rate": 8.354235847654092e-07,
"loss": 0.5737025141716003,
"step": 2194
},
{
"epoch": 1.8484848484848486,
"grad_norm": 7.118038177490234,
"learning_rate": 8.338272917246252e-07,
"loss": 0.7278249263763428,
"step": 2196
},
{
"epoch": 1.8501683501683501,
"grad_norm": 10.954305648803711,
"learning_rate": 8.322319157608158e-07,
"loss": 0.4181557893753052,
"step": 2198
},
{
"epoch": 1.8518518518518519,
"grad_norm": 2.7884762287139893,
"learning_rate": 8.306374623707222e-07,
"loss": 0.9623106718063354,
"step": 2200
},
{
"epoch": 1.8535353535353534,
"grad_norm": 20.612829208374023,
"learning_rate": 8.29043937047907e-07,
"loss": 0.6952165365219116,
"step": 2202
},
{
"epoch": 1.855218855218855,
"grad_norm": 4.281187534332275,
"learning_rate": 8.274513452827361e-07,
"loss": 0.5947088003158569,
"step": 2204
},
{
"epoch": 1.8569023569023568,
"grad_norm": 4.014023780822754,
"learning_rate": 8.258596925623578e-07,
"loss": 0.8658819198608398,
"step": 2206
},
{
"epoch": 1.8585858585858586,
"grad_norm": 5.036103248596191,
"learning_rate": 8.242689843706852e-07,
"loss": 0.7244065999984741,
"step": 2208
},
{
"epoch": 1.8602693602693603,
"grad_norm": 10.339949607849121,
"learning_rate": 8.226792261883777e-07,
"loss": 0.28258228302001953,
"step": 2210
},
{
"epoch": 1.861952861952862,
"grad_norm": 3.753382921218872,
"learning_rate": 8.210904234928213e-07,
"loss": 0.7527827620506287,
"step": 2212
},
{
"epoch": 1.8636363636363638,
"grad_norm": 6.2754082679748535,
"learning_rate": 8.195025817581092e-07,
"loss": 1.0558643341064453,
"step": 2214
},
{
"epoch": 1.8653198653198653,
"grad_norm": 26.839725494384766,
"learning_rate": 8.179157064550246e-07,
"loss": 0.30213648080825806,
"step": 2216
},
{
"epoch": 1.867003367003367,
"grad_norm": 9.034235954284668,
"learning_rate": 8.163298030510208e-07,
"loss": 0.5420745611190796,
"step": 2218
},
{
"epoch": 1.8686868686868687,
"grad_norm": 11.360336303710938,
"learning_rate": 8.147448770102019e-07,
"loss": 0.4777377247810364,
"step": 2220
},
{
"epoch": 1.8703703703703702,
"grad_norm": 20.048816680908203,
"learning_rate": 8.131609337933054e-07,
"loss": 0.6607373952865601,
"step": 2222
},
{
"epoch": 1.872053872053872,
"grad_norm": 4.080456733703613,
"learning_rate": 8.115779788576818e-07,
"loss": 0.9611594676971436,
"step": 2224
},
{
"epoch": 1.8737373737373737,
"grad_norm": 6.5773539543151855,
"learning_rate": 8.099960176572768e-07,
"loss": 0.5292639136314392,
"step": 2226
},
{
"epoch": 1.8754208754208754,
"grad_norm": 8.111262321472168,
"learning_rate": 8.08415055642613e-07,
"loss": 0.4228924512863159,
"step": 2228
},
{
"epoch": 1.8771043771043772,
"grad_norm": 7.139439105987549,
"learning_rate": 8.068350982607693e-07,
"loss": 1.036095380783081,
"step": 2230
},
{
"epoch": 1.878787878787879,
"grad_norm": 5.3863019943237305,
"learning_rate": 8.052561509553633e-07,
"loss": 0.879308819770813,
"step": 2232
},
{
"epoch": 1.8804713804713806,
"grad_norm": 2.3317646980285645,
"learning_rate": 8.03678219166533e-07,
"loss": 0.4804232120513916,
"step": 2234
},
{
"epoch": 1.8821548821548821,
"grad_norm": 7.66561222076416,
"learning_rate": 8.021013083309181e-07,
"loss": 0.6847870349884033,
"step": 2236
},
{
"epoch": 1.8838383838383839,
"grad_norm": 20.43767738342285,
"learning_rate": 8.005254238816392e-07,
"loss": 0.9153972864151001,
"step": 2238
},
{
"epoch": 1.8855218855218854,
"grad_norm": 4.095572471618652,
"learning_rate": 7.989505712482814e-07,
"loss": 1.1261423826217651,
"step": 2240
},
{
"epoch": 1.887205387205387,
"grad_norm": 3.0072097778320312,
"learning_rate": 7.973767558568749e-07,
"loss": 0.9912216663360596,
"step": 2242
},
{
"epoch": 1.8888888888888888,
"grad_norm": 5.47834587097168,
"learning_rate": 7.95803983129876e-07,
"loss": 0.914950430393219,
"step": 2244
},
{
"epoch": 1.8905723905723906,
"grad_norm": 2.8906898498535156,
"learning_rate": 7.942322584861476e-07,
"loss": 0.9614880681037903,
"step": 2246
},
{
"epoch": 1.8922558922558923,
"grad_norm": 6.5934977531433105,
"learning_rate": 7.926615873409435e-07,
"loss": 0.9108870029449463,
"step": 2248
},
{
"epoch": 1.893939393939394,
"grad_norm": 3.955982208251953,
"learning_rate": 7.910919751058863e-07,
"loss": 0.9415953755378723,
"step": 2250
},
{
"epoch": 1.8956228956228958,
"grad_norm": 3.4299967288970947,
"learning_rate": 7.895234271889502e-07,
"loss": 1.1199214458465576,
"step": 2252
},
{
"epoch": 1.8973063973063973,
"grad_norm": 2.3956785202026367,
"learning_rate": 7.879559489944431e-07,
"loss": 0.8545496463775635,
"step": 2254
},
{
"epoch": 1.898989898989899,
"grad_norm": 5.623586654663086,
"learning_rate": 7.86389545922987e-07,
"loss": 0.8165835738182068,
"step": 2256
},
{
"epoch": 1.9006734006734005,
"grad_norm": 3.9756457805633545,
"learning_rate": 7.848242233714992e-07,
"loss": 0.9491643905639648,
"step": 2258
},
{
"epoch": 1.9023569023569022,
"grad_norm": 12.658402442932129,
"learning_rate": 7.832599867331751e-07,
"loss": 0.7046935558319092,
"step": 2260
},
{
"epoch": 1.904040404040404,
"grad_norm": 3.2188074588775635,
"learning_rate": 7.816968413974676e-07,
"loss": 0.7821506261825562,
"step": 2262
},
{
"epoch": 1.9057239057239057,
"grad_norm": 6.156993865966797,
"learning_rate": 7.801347927500701e-07,
"loss": 0.4521103501319885,
"step": 2264
},
{
"epoch": 1.9074074074074074,
"grad_norm": 7.991714000701904,
"learning_rate": 7.785738461728975e-07,
"loss": 0.7530079483985901,
"step": 2266
},
{
"epoch": 1.9090909090909092,
"grad_norm": 5.461221694946289,
"learning_rate": 7.770140070440679e-07,
"loss": 0.6550673842430115,
"step": 2268
},
{
"epoch": 1.910774410774411,
"grad_norm": 4.9177446365356445,
"learning_rate": 7.754552807378827e-07,
"loss": 0.8085366487503052,
"step": 2270
},
{
"epoch": 1.9124579124579124,
"grad_norm": 5.982177257537842,
"learning_rate": 7.738976726248105e-07,
"loss": 0.8757312893867493,
"step": 2272
},
{
"epoch": 1.9141414141414141,
"grad_norm": 5.69901704788208,
"learning_rate": 7.723411880714663e-07,
"loss": 0.6707878112792969,
"step": 2274
},
{
"epoch": 1.9158249158249159,
"grad_norm": 5.210788249969482,
"learning_rate": 7.707858324405945e-07,
"loss": 0.8715642690658569,
"step": 2276
},
{
"epoch": 1.9175084175084174,
"grad_norm": 9.772908210754395,
"learning_rate": 7.692316110910495e-07,
"loss": 0.5358242392539978,
"step": 2278
},
{
"epoch": 1.9191919191919191,
"grad_norm": 10.29883861541748,
"learning_rate": 7.676785293777779e-07,
"loss": 0.18194249272346497,
"step": 2280
},
{
"epoch": 1.9208754208754208,
"grad_norm": 12.38522720336914,
"learning_rate": 7.661265926517997e-07,
"loss": 0.9799966812133789,
"step": 2282
},
{
"epoch": 1.9225589225589226,
"grad_norm": 10.77762222290039,
"learning_rate": 7.6457580626019e-07,
"loss": 0.4065392315387726,
"step": 2284
},
{
"epoch": 1.9242424242424243,
"grad_norm": 4.503013610839844,
"learning_rate": 7.630261755460598e-07,
"loss": 0.6107114553451538,
"step": 2286
},
{
"epoch": 1.925925925925926,
"grad_norm": 4.1006574630737305,
"learning_rate": 7.614777058485398e-07,
"loss": 0.9525327086448669,
"step": 2288
},
{
"epoch": 1.9276094276094278,
"grad_norm": 4.675087928771973,
"learning_rate": 7.59930402502759e-07,
"loss": 0.64920973777771,
"step": 2290
},
{
"epoch": 1.9292929292929293,
"grad_norm": 10.305427551269531,
"learning_rate": 7.58384270839829e-07,
"loss": 0.4203697741031647,
"step": 2292
},
{
"epoch": 1.930976430976431,
"grad_norm": 10.108484268188477,
"learning_rate": 7.568393161868234e-07,
"loss": 0.8978174924850464,
"step": 2294
},
{
"epoch": 1.9326599326599325,
"grad_norm": 9.093255996704102,
"learning_rate": 7.552955438667612e-07,
"loss": 0.7504777908325195,
"step": 2296
},
{
"epoch": 1.9343434343434343,
"grad_norm": 2.7709944248199463,
"learning_rate": 7.537529591985879e-07,
"loss": 0.7725180983543396,
"step": 2298
},
{
"epoch": 1.936026936026936,
"grad_norm": 18.586732864379883,
"learning_rate": 7.522115674971564e-07,
"loss": 0.5804815292358398,
"step": 2300
},
{
"epoch": 1.9377104377104377,
"grad_norm": 5.98298978805542,
"learning_rate": 7.506713740732098e-07,
"loss": 1.1325410604476929,
"step": 2302
},
{
"epoch": 1.9393939393939394,
"grad_norm": 2.994622230529785,
"learning_rate": 7.491323842333626e-07,
"loss": 0.9246529340744019,
"step": 2304
},
{
"epoch": 1.9410774410774412,
"grad_norm": 10.680237770080566,
"learning_rate": 7.47594603280082e-07,
"loss": 0.2776586413383484,
"step": 2306
},
{
"epoch": 1.942760942760943,
"grad_norm": 5.663994312286377,
"learning_rate": 7.460580365116704e-07,
"loss": 0.7812565565109253,
"step": 2308
},
{
"epoch": 1.9444444444444444,
"grad_norm": 5.120817184448242,
"learning_rate": 7.445226892222476e-07,
"loss": 1.0193424224853516,
"step": 2310
},
{
"epoch": 1.9461279461279462,
"grad_norm": 23.06011962890625,
"learning_rate": 7.429885667017301e-07,
"loss": 0.9074631929397583,
"step": 2312
},
{
"epoch": 1.9478114478114477,
"grad_norm": 27.372034072875977,
"learning_rate": 7.41455674235816e-07,
"loss": 0.860990583896637,
"step": 2314
},
{
"epoch": 1.9494949494949494,
"grad_norm": 4.709370136260986,
"learning_rate": 7.399240171059649e-07,
"loss": 0.6999090313911438,
"step": 2316
},
{
"epoch": 1.9511784511784511,
"grad_norm": 3.648000955581665,
"learning_rate": 7.383936005893798e-07,
"loss": 0.8313673138618469,
"step": 2318
},
{
"epoch": 1.9528619528619529,
"grad_norm": 6.1949005126953125,
"learning_rate": 7.368644299589894e-07,
"loss": 0.8585817217826843,
"step": 2320
},
{
"epoch": 1.9545454545454546,
"grad_norm": 6.129204273223877,
"learning_rate": 7.353365104834304e-07,
"loss": 0.9358435869216919,
"step": 2322
},
{
"epoch": 1.9562289562289563,
"grad_norm": 5.967504501342773,
"learning_rate": 7.338098474270277e-07,
"loss": 0.6934836506843567,
"step": 2324
},
{
"epoch": 1.957912457912458,
"grad_norm": 8.291871070861816,
"learning_rate": 7.322844460497783e-07,
"loss": 0.4362953305244446,
"step": 2326
},
{
"epoch": 1.9595959595959596,
"grad_norm": 4.457334995269775,
"learning_rate": 7.307603116073317e-07,
"loss": 1.026896595954895,
"step": 2328
},
{
"epoch": 1.9612794612794613,
"grad_norm": 34.517372131347656,
"learning_rate": 7.292374493509725e-07,
"loss": 0.9922385811805725,
"step": 2330
},
{
"epoch": 1.9629629629629628,
"grad_norm": 5.860324859619141,
"learning_rate": 7.277158645276014e-07,
"loss": 0.9369185566902161,
"step": 2332
},
{
"epoch": 1.9646464646464645,
"grad_norm": 6.046477317810059,
"learning_rate": 7.261955623797189e-07,
"loss": 0.9571334719657898,
"step": 2334
},
{
"epoch": 1.9663299663299663,
"grad_norm": 7.769930362701416,
"learning_rate": 7.246765481454056e-07,
"loss": 0.8826982975006104,
"step": 2336
},
{
"epoch": 1.968013468013468,
"grad_norm": 6.431835651397705,
"learning_rate": 7.23158827058304e-07,
"loss": 0.9630632400512695,
"step": 2338
},
{
"epoch": 1.9696969696969697,
"grad_norm": 22.74308967590332,
"learning_rate": 7.216424043476022e-07,
"loss": 0.3798217177391052,
"step": 2340
},
{
"epoch": 1.9713804713804715,
"grad_norm": 4.635124683380127,
"learning_rate": 7.20127285238015e-07,
"loss": 0.6755929589271545,
"step": 2342
},
{
"epoch": 1.9730639730639732,
"grad_norm": 11.939600944519043,
"learning_rate": 7.186134749497645e-07,
"loss": 0.4677308201789856,
"step": 2344
},
{
"epoch": 1.9747474747474747,
"grad_norm": 7.168182849884033,
"learning_rate": 7.171009786985642e-07,
"loss": 0.7687026858329773,
"step": 2346
},
{
"epoch": 1.9764309764309764,
"grad_norm": 8.714031219482422,
"learning_rate": 7.155898016956008e-07,
"loss": 0.6937582492828369,
"step": 2348
},
{
"epoch": 1.9781144781144782,
"grad_norm": 12.572547912597656,
"learning_rate": 7.14079949147514e-07,
"loss": 0.6481941342353821,
"step": 2350
},
{
"epoch": 1.9797979797979797,
"grad_norm": 11.015668869018555,
"learning_rate": 7.125714262563814e-07,
"loss": 0.5940038561820984,
"step": 2352
},
{
"epoch": 1.9814814814814814,
"grad_norm": 2.312359571456909,
"learning_rate": 7.110642382196996e-07,
"loss": 0.3644195795059204,
"step": 2354
},
{
"epoch": 1.9831649831649831,
"grad_norm": 2.8572607040405273,
"learning_rate": 7.095583902303648e-07,
"loss": 0.9964379668235779,
"step": 2356
},
{
"epoch": 1.9848484848484849,
"grad_norm": 3.200378179550171,
"learning_rate": 7.080538874766573e-07,
"loss": 0.901992917060852,
"step": 2358
},
{
"epoch": 1.9865319865319866,
"grad_norm": 3.659830093383789,
"learning_rate": 7.06550735142222e-07,
"loss": 0.8655633926391602,
"step": 2360
},
{
"epoch": 1.9882154882154883,
"grad_norm": 4.574953079223633,
"learning_rate": 7.050489384060512e-07,
"loss": 0.6048173904418945,
"step": 2362
},
{
"epoch": 1.98989898989899,
"grad_norm": 2.2669944763183594,
"learning_rate": 7.035485024424666e-07,
"loss": 0.8642423152923584,
"step": 2364
},
{
"epoch": 1.9915824915824916,
"grad_norm": 9.807171821594238,
"learning_rate": 7.020494324211017e-07,
"loss": 0.8357862234115601,
"step": 2366
},
{
"epoch": 1.9932659932659933,
"grad_norm": 4.8804097175598145,
"learning_rate": 7.005517335068827e-07,
"loss": 0.9583761692047119,
"step": 2368
},
{
"epoch": 1.9949494949494948,
"grad_norm": 3.2299656867980957,
"learning_rate": 6.99055410860013e-07,
"loss": 0.3349935710430145,
"step": 2370
},
{
"epoch": 1.9966329966329965,
"grad_norm": 10.934320449829102,
"learning_rate": 6.975604696359542e-07,
"loss": 0.4770701825618744,
"step": 2372
},
{
"epoch": 1.9983164983164983,
"grad_norm": 4.283078193664551,
"learning_rate": 6.960669149854068e-07,
"loss": 0.8760964870452881,
"step": 2374
},
{
"epoch": 2.0,
"grad_norm": 2.9110515117645264,
"learning_rate": 6.945747520542955e-07,
"loss": 0.8032587766647339,
"step": 2376
},
{
"epoch": 2.0016835016835017,
"grad_norm": 13.453629493713379,
"learning_rate": 6.930839859837496e-07,
"loss": 0.5529247522354126,
"step": 2378
},
{
"epoch": 2.0033670033670035,
"grad_norm": 2.993082284927368,
"learning_rate": 6.915946219100852e-07,
"loss": 1.0674469470977783,
"step": 2380
},
{
"epoch": 2.005050505050505,
"grad_norm": 9.452933311462402,
"learning_rate": 6.901066649647887e-07,
"loss": 0.5305376052856445,
"step": 2382
},
{
"epoch": 2.006734006734007,
"grad_norm": 10.649518013000488,
"learning_rate": 6.886201202744972e-07,
"loss": 0.4740598499774933,
"step": 2384
},
{
"epoch": 2.008417508417508,
"grad_norm": 2.3154711723327637,
"learning_rate": 6.871349929609826e-07,
"loss": 0.6172109842300415,
"step": 2386
},
{
"epoch": 2.01010101010101,
"grad_norm": 4.7962565422058105,
"learning_rate": 6.856512881411343e-07,
"loss": 0.751620352268219,
"step": 2388
},
{
"epoch": 2.0117845117845117,
"grad_norm": 3.493546485900879,
"learning_rate": 6.841690109269386e-07,
"loss": 0.9236295223236084,
"step": 2390
},
{
"epoch": 2.0134680134680134,
"grad_norm": 4.757162094116211,
"learning_rate": 6.826881664254646e-07,
"loss": 0.5604578852653503,
"step": 2392
},
{
"epoch": 2.015151515151515,
"grad_norm": 15.590490341186523,
"learning_rate": 6.812087597388452e-07,
"loss": 0.656000018119812,
"step": 2394
},
{
"epoch": 2.016835016835017,
"grad_norm": 3.1684648990631104,
"learning_rate": 6.79730795964258e-07,
"loss": 0.7373712062835693,
"step": 2396
},
{
"epoch": 2.0185185185185186,
"grad_norm": 4.949743270874023,
"learning_rate": 6.782542801939105e-07,
"loss": 0.7128652930259705,
"step": 2398
},
{
"epoch": 2.0202020202020203,
"grad_norm": 6.615425109863281,
"learning_rate": 6.767792175150211e-07,
"loss": 0.5110639333724976,
"step": 2400
},
{
"epoch": 2.021885521885522,
"grad_norm": 29.939123153686523,
"learning_rate": 6.753056130098009e-07,
"loss": 0.3357080817222595,
"step": 2402
},
{
"epoch": 2.0235690235690234,
"grad_norm": 7.256524562835693,
"learning_rate": 6.738334717554373e-07,
"loss": 0.803414523601532,
"step": 2404
},
{
"epoch": 2.025252525252525,
"grad_norm": 8.12669563293457,
"learning_rate": 6.723627988240772e-07,
"loss": 0.6509519815444946,
"step": 2406
},
{
"epoch": 2.026936026936027,
"grad_norm": 2.850172519683838,
"learning_rate": 6.708935992828068e-07,
"loss": 0.6972189545631409,
"step": 2408
},
{
"epoch": 2.0286195286195285,
"grad_norm": 4.508718967437744,
"learning_rate": 6.694258781936369e-07,
"loss": 0.607012152671814,
"step": 2410
},
{
"epoch": 2.0303030303030303,
"grad_norm": 6.282621383666992,
"learning_rate": 6.679596406134844e-07,
"loss": 0.8239716291427612,
"step": 2412
},
{
"epoch": 2.031986531986532,
"grad_norm": 4.128354549407959,
"learning_rate": 6.664948915941546e-07,
"loss": 0.6955975294113159,
"step": 2414
},
{
"epoch": 2.0336700336700337,
"grad_norm": 2.551084518432617,
"learning_rate": 6.65031636182324e-07,
"loss": 0.7895976305007935,
"step": 2416
},
{
"epoch": 2.0353535353535355,
"grad_norm": 5.181878089904785,
"learning_rate": 6.635698794195237e-07,
"loss": 0.5881921052932739,
"step": 2418
},
{
"epoch": 2.037037037037037,
"grad_norm": 15.786396980285645,
"learning_rate": 6.621096263421202e-07,
"loss": 0.3898243308067322,
"step": 2420
},
{
"epoch": 2.038720538720539,
"grad_norm": 5.842708587646484,
"learning_rate": 6.606508819813001e-07,
"loss": 0.7089550495147705,
"step": 2422
},
{
"epoch": 2.04040404040404,
"grad_norm": 48.35086441040039,
"learning_rate": 6.591936513630514e-07,
"loss": 0.17687079310417175,
"step": 2424
},
{
"epoch": 2.042087542087542,
"grad_norm": 7.372962951660156,
"learning_rate": 6.577379395081466e-07,
"loss": 0.33852899074554443,
"step": 2426
},
{
"epoch": 2.0437710437710437,
"grad_norm": 35.252044677734375,
"learning_rate": 6.562837514321258e-07,
"loss": 0.5263517498970032,
"step": 2428
},
{
"epoch": 2.0454545454545454,
"grad_norm": 2.461886405944824,
"learning_rate": 6.548310921452784e-07,
"loss": 0.7057082653045654,
"step": 2430
},
{
"epoch": 2.047138047138047,
"grad_norm": 3.156841516494751,
"learning_rate": 6.533799666526275e-07,
"loss": 0.6170644760131836,
"step": 2432
},
{
"epoch": 2.048821548821549,
"grad_norm": 3.1701977252960205,
"learning_rate": 6.519303799539104e-07,
"loss": 0.7602715492248535,
"step": 2434
},
{
"epoch": 2.0505050505050506,
"grad_norm": 3.952972412109375,
"learning_rate": 6.504823370435633e-07,
"loss": 1.1037501096725464,
"step": 2436
},
{
"epoch": 2.0521885521885523,
"grad_norm": 3.426377773284912,
"learning_rate": 6.490358429107038e-07,
"loss": 0.6811984181404114,
"step": 2438
},
{
"epoch": 2.053872053872054,
"grad_norm": 11.86534309387207,
"learning_rate": 6.47590902539112e-07,
"loss": 0.7863556146621704,
"step": 2440
},
{
"epoch": 2.0555555555555554,
"grad_norm": 8.28430461883545,
"learning_rate": 6.461475209072161e-07,
"loss": 0.6948744654655457,
"step": 2442
},
{
"epoch": 2.057239057239057,
"grad_norm": 2.8476991653442383,
"learning_rate": 6.44705702988073e-07,
"loss": 0.7017114162445068,
"step": 2444
},
{
"epoch": 2.058922558922559,
"grad_norm": 5.588902950286865,
"learning_rate": 6.432654537493518e-07,
"loss": 0.8929611444473267,
"step": 2446
},
{
"epoch": 2.0606060606060606,
"grad_norm": 3.2887089252471924,
"learning_rate": 6.418267781533173e-07,
"loss": 0.79296875,
"step": 2448
},
{
"epoch": 2.0622895622895623,
"grad_norm": 14.845014572143555,
"learning_rate": 6.403896811568124e-07,
"loss": 0.5820084810256958,
"step": 2450
},
{
"epoch": 2.063973063973064,
"grad_norm": 6.622726917266846,
"learning_rate": 6.389541677112407e-07,
"loss": 0.8630738258361816,
"step": 2452
},
{
"epoch": 2.0656565656565657,
"grad_norm": 8.550455093383789,
"learning_rate": 6.375202427625505e-07,
"loss": 0.6654762625694275,
"step": 2454
},
{
"epoch": 2.0673400673400675,
"grad_norm": 2.6550607681274414,
"learning_rate": 6.360879112512159e-07,
"loss": 0.7484475374221802,
"step": 2456
},
{
"epoch": 2.069023569023569,
"grad_norm": 3.6882874965667725,
"learning_rate": 6.346571781122218e-07,
"loss": 0.709972620010376,
"step": 2458
},
{
"epoch": 2.0707070707070705,
"grad_norm": 10.880833625793457,
"learning_rate": 6.332280482750466e-07,
"loss": 0.5114179849624634,
"step": 2460
},
{
"epoch": 2.0723905723905722,
"grad_norm": 7.7855000495910645,
"learning_rate": 6.318005266636428e-07,
"loss": 0.5731675028800964,
"step": 2462
},
{
"epoch": 2.074074074074074,
"grad_norm": 9.688587188720703,
"learning_rate": 6.303746181964234e-07,
"loss": 0.5561926364898682,
"step": 2464
},
{
"epoch": 2.0757575757575757,
"grad_norm": 10.193296432495117,
"learning_rate": 6.289503277862438e-07,
"loss": 0.7371481657028198,
"step": 2466
},
{
"epoch": 2.0774410774410774,
"grad_norm": 5.605756759643555,
"learning_rate": 6.275276603403824e-07,
"loss": 0.5109883546829224,
"step": 2468
},
{
"epoch": 2.079124579124579,
"grad_norm": 6.081256866455078,
"learning_rate": 6.26106620760528e-07,
"loss": 0.9331031441688538,
"step": 2470
},
{
"epoch": 2.080808080808081,
"grad_norm": 5.508481979370117,
"learning_rate": 6.246872139427602e-07,
"loss": 0.9123448133468628,
"step": 2472
},
{
"epoch": 2.0824915824915826,
"grad_norm": 4.696747779846191,
"learning_rate": 6.232694447775316e-07,
"loss": 0.4582900106906891,
"step": 2474
},
{
"epoch": 2.0841750841750843,
"grad_norm": 8.642160415649414,
"learning_rate": 6.218533181496541e-07,
"loss": 0.5799881815910339,
"step": 2476
},
{
"epoch": 2.0858585858585856,
"grad_norm": 4.685534954071045,
"learning_rate": 6.204388389382804e-07,
"loss": 0.7565197944641113,
"step": 2478
},
{
"epoch": 2.0875420875420874,
"grad_norm": 3.0699758529663086,
"learning_rate": 6.190260120168855e-07,
"loss": 0.6127052307128906,
"step": 2480
},
{
"epoch": 2.089225589225589,
"grad_norm": 2.890374183654785,
"learning_rate": 6.17614842253253e-07,
"loss": 0.6200038194656372,
"step": 2482
},
{
"epoch": 2.090909090909091,
"grad_norm": 5.803356647491455,
"learning_rate": 6.162053345094569e-07,
"loss": 1.080254316329956,
"step": 2484
},
{
"epoch": 2.0925925925925926,
"grad_norm": 6.378223419189453,
"learning_rate": 6.147974936418436e-07,
"loss": 0.5638513565063477,
"step": 2486
},
{
"epoch": 2.0942760942760943,
"grad_norm": 2.6295933723449707,
"learning_rate": 6.133913245010181e-07,
"loss": 0.5809881687164307,
"step": 2488
},
{
"epoch": 2.095959595959596,
"grad_norm": 9.536388397216797,
"learning_rate": 6.119868319318244e-07,
"loss": 0.7412412166595459,
"step": 2490
},
{
"epoch": 2.0976430976430978,
"grad_norm": 6.749050140380859,
"learning_rate": 6.105840207733302e-07,
"loss": 0.8024865388870239,
"step": 2492
},
{
"epoch": 2.0993265993265995,
"grad_norm": 3.2662672996520996,
"learning_rate": 6.091828958588101e-07,
"loss": 0.49432703852653503,
"step": 2494
},
{
"epoch": 2.101010101010101,
"grad_norm": 4.484532356262207,
"learning_rate": 6.077834620157296e-07,
"loss": 0.7933484315872192,
"step": 2496
},
{
"epoch": 2.1026936026936025,
"grad_norm": 2.866675853729248,
"learning_rate": 6.063857240657264e-07,
"loss": 0.4336718022823334,
"step": 2498
},
{
"epoch": 2.1043771043771042,
"grad_norm": 5.029768943786621,
"learning_rate": 6.049896868245962e-07,
"loss": 0.5639874339103699,
"step": 2500
},
{
"epoch": 2.106060606060606,
"grad_norm": 4.42257833480835,
"learning_rate": 6.035953551022748e-07,
"loss": 0.9859836101531982,
"step": 2502
},
{
"epoch": 2.1077441077441077,
"grad_norm": 25.789899826049805,
"learning_rate": 6.022027337028212e-07,
"loss": 0.8477144241333008,
"step": 2504
},
{
"epoch": 2.1094276094276094,
"grad_norm": 27.71114158630371,
"learning_rate": 6.008118274244025e-07,
"loss": 0.8800366520881653,
"step": 2506
},
{
"epoch": 2.111111111111111,
"grad_norm": 5.043661594390869,
"learning_rate": 5.994226410592762e-07,
"loss": 0.40974220633506775,
"step": 2508
},
{
"epoch": 2.112794612794613,
"grad_norm": 3.724855661392212,
"learning_rate": 5.980351793937734e-07,
"loss": 0.5578930377960205,
"step": 2510
},
{
"epoch": 2.1144781144781146,
"grad_norm": 7.778206825256348,
"learning_rate": 5.966494472082832e-07,
"loss": 0.6988534927368164,
"step": 2512
},
{
"epoch": 2.1161616161616164,
"grad_norm": 9.13245964050293,
"learning_rate": 5.952654492772369e-07,
"loss": 0.38724464178085327,
"step": 2514
},
{
"epoch": 2.1178451178451176,
"grad_norm": 5.150360584259033,
"learning_rate": 5.938831903690887e-07,
"loss": 0.8136914968490601,
"step": 2516
},
{
"epoch": 2.1195286195286194,
"grad_norm": 8.077790260314941,
"learning_rate": 5.925026752463027e-07,
"loss": 0.13099154829978943,
"step": 2518
},
{
"epoch": 2.121212121212121,
"grad_norm": 14.749094009399414,
"learning_rate": 5.911239086653345e-07,
"loss": 0.33465084433555603,
"step": 2520
},
{
"epoch": 2.122895622895623,
"grad_norm": 3.972292184829712,
"learning_rate": 5.89746895376614e-07,
"loss": 0.2251596450805664,
"step": 2522
},
{
"epoch": 2.1245791245791246,
"grad_norm": 3.6862993240356445,
"learning_rate": 5.883716401245329e-07,
"loss": 0.41063302755355835,
"step": 2524
},
{
"epoch": 2.1262626262626263,
"grad_norm": 13.53211498260498,
"learning_rate": 5.869981476474235e-07,
"loss": 0.32705599069595337,
"step": 2526
},
{
"epoch": 2.127946127946128,
"grad_norm": 11.80972671508789,
"learning_rate": 5.856264226775451e-07,
"loss": 0.28738293051719666,
"step": 2528
},
{
"epoch": 2.1296296296296298,
"grad_norm": 4.907763481140137,
"learning_rate": 5.842564699410676e-07,
"loss": 0.5695469379425049,
"step": 2530
},
{
"epoch": 2.1313131313131315,
"grad_norm": 7.322058200836182,
"learning_rate": 5.828882941580548e-07,
"loss": 0.7862983345985413,
"step": 2532
},
{
"epoch": 2.1329966329966332,
"grad_norm": 3.169811725616455,
"learning_rate": 5.815219000424475e-07,
"loss": 0.32265302538871765,
"step": 2534
},
{
"epoch": 2.1346801346801345,
"grad_norm": 4.123760223388672,
"learning_rate": 5.801572923020486e-07,
"loss": 0.6733647584915161,
"step": 2536
},
{
"epoch": 2.1363636363636362,
"grad_norm": 10.175186157226562,
"learning_rate": 5.787944756385061e-07,
"loss": 0.34301066398620605,
"step": 2538
},
{
"epoch": 2.138047138047138,
"grad_norm": 0.8496463894844055,
"learning_rate": 5.774334547472963e-07,
"loss": 0.31534287333488464,
"step": 2540
},
{
"epoch": 2.1397306397306397,
"grad_norm": 4.556532382965088,
"learning_rate": 5.760742343177091e-07,
"loss": 0.6951263546943665,
"step": 2542
},
{
"epoch": 2.1414141414141414,
"grad_norm": 2.392409086227417,
"learning_rate": 5.747168190328313e-07,
"loss": 0.09168624877929688,
"step": 2544
},
{
"epoch": 2.143097643097643,
"grad_norm": 2.1044692993164062,
"learning_rate": 5.73361213569529e-07,
"loss": 0.34088313579559326,
"step": 2546
},
{
"epoch": 2.144781144781145,
"grad_norm": 12.998042106628418,
"learning_rate": 5.720074225984335e-07,
"loss": 0.6928970813751221,
"step": 2548
},
{
"epoch": 2.1464646464646466,
"grad_norm": 3.534303903579712,
"learning_rate": 5.706554507839247e-07,
"loss": 0.8698376417160034,
"step": 2550
},
{
"epoch": 2.148148148148148,
"grad_norm": 3.9357972145080566,
"learning_rate": 5.693053027841139e-07,
"loss": 0.5156476497650146,
"step": 2552
},
{
"epoch": 2.1498316498316496,
"grad_norm": 12.438335418701172,
"learning_rate": 5.679569832508294e-07,
"loss": 0.14811789989471436,
"step": 2554
},
{
"epoch": 2.1515151515151514,
"grad_norm": 8.0103759765625,
"learning_rate": 5.666104968295993e-07,
"loss": 0.4402310848236084,
"step": 2556
},
{
"epoch": 2.153198653198653,
"grad_norm": 3.672968864440918,
"learning_rate": 5.652658481596355e-07,
"loss": 0.6228591203689575,
"step": 2558
},
{
"epoch": 2.154882154882155,
"grad_norm": 6.100817680358887,
"learning_rate": 5.639230418738186e-07,
"loss": 0.3809899091720581,
"step": 2560
},
{
"epoch": 2.1565656565656566,
"grad_norm": 25.523374557495117,
"learning_rate": 5.625820825986818e-07,
"loss": 0.4754774570465088,
"step": 2562
},
{
"epoch": 2.1582491582491583,
"grad_norm": 4.202336311340332,
"learning_rate": 5.61242974954393e-07,
"loss": 0.7122776508331299,
"step": 2564
},
{
"epoch": 2.15993265993266,
"grad_norm": 16.867658615112305,
"learning_rate": 5.599057235547422e-07,
"loss": 0.45209017395973206,
"step": 2566
},
{
"epoch": 2.1616161616161618,
"grad_norm": 5.021929740905762,
"learning_rate": 5.585703330071232e-07,
"loss": 0.3703120946884155,
"step": 2568
},
{
"epoch": 2.1632996632996635,
"grad_norm": 3.3957135677337646,
"learning_rate": 5.572368079125177e-07,
"loss": 0.8958742618560791,
"step": 2570
},
{
"epoch": 2.164983164983165,
"grad_norm": 3.049757957458496,
"learning_rate": 5.559051528654812e-07,
"loss": 1.0562491416931152,
"step": 2572
},
{
"epoch": 2.1666666666666665,
"grad_norm": 6.365866184234619,
"learning_rate": 5.545753724541259e-07,
"loss": 0.7664850950241089,
"step": 2574
},
{
"epoch": 2.1683501683501682,
"grad_norm": 3.9597971439361572,
"learning_rate": 5.532474712601041e-07,
"loss": 0.2349638044834137,
"step": 2576
},
{
"epoch": 2.17003367003367,
"grad_norm": 14.629343032836914,
"learning_rate": 5.519214538585945e-07,
"loss": 0.5862404108047485,
"step": 2578
},
{
"epoch": 2.1717171717171717,
"grad_norm": 13.472465515136719,
"learning_rate": 5.505973248182854e-07,
"loss": 0.25796785950660706,
"step": 2580
},
{
"epoch": 2.1734006734006734,
"grad_norm": 4.650449275970459,
"learning_rate": 5.492750887013576e-07,
"loss": 0.40474733710289,
"step": 2582
},
{
"epoch": 2.175084175084175,
"grad_norm": 4.238655090332031,
"learning_rate": 5.479547500634716e-07,
"loss": 0.25570929050445557,
"step": 2584
},
{
"epoch": 2.176767676767677,
"grad_norm": 9.685871124267578,
"learning_rate": 5.466363134537495e-07,
"loss": 0.582108736038208,
"step": 2586
},
{
"epoch": 2.1784511784511786,
"grad_norm": 20.41779899597168,
"learning_rate": 5.453197834147596e-07,
"loss": 0.5546954274177551,
"step": 2588
},
{
"epoch": 2.18013468013468,
"grad_norm": 89.8573226928711,
"learning_rate": 5.440051644825024e-07,
"loss": 0.6109448671340942,
"step": 2590
},
{
"epoch": 2.1818181818181817,
"grad_norm": 9.055795669555664,
"learning_rate": 5.426924611863932e-07,
"loss": 0.4381883144378662,
"step": 2592
},
{
"epoch": 2.1835016835016834,
"grad_norm": 7.171759605407715,
"learning_rate": 5.413816780492464e-07,
"loss": 0.28566718101501465,
"step": 2594
},
{
"epoch": 2.185185185185185,
"grad_norm": 5.162403583526611,
"learning_rate": 5.400728195872627e-07,
"loss": 0.6839703321456909,
"step": 2596
},
{
"epoch": 2.186868686868687,
"grad_norm": 4.578564643859863,
"learning_rate": 5.387658903100093e-07,
"loss": 0.7969393134117126,
"step": 2598
},
{
"epoch": 2.1885521885521886,
"grad_norm": 3.3671751022338867,
"learning_rate": 5.374608947204078e-07,
"loss": 0.5756024122238159,
"step": 2600
},
{
"epoch": 2.1902356902356903,
"grad_norm": 3.339944362640381,
"learning_rate": 5.361578373147173e-07,
"loss": 0.8270890116691589,
"step": 2602
},
{
"epoch": 2.191919191919192,
"grad_norm": 61.960235595703125,
"learning_rate": 5.348567225825182e-07,
"loss": 0.7463648319244385,
"step": 2604
},
{
"epoch": 2.1936026936026938,
"grad_norm": 12.145258903503418,
"learning_rate": 5.335575550066987e-07,
"loss": 0.3755905032157898,
"step": 2606
},
{
"epoch": 2.1952861952861955,
"grad_norm": 4.23495626449585,
"learning_rate": 5.322603390634379e-07,
"loss": 0.828824520111084,
"step": 2608
},
{
"epoch": 2.196969696969697,
"grad_norm": 5.706808090209961,
"learning_rate": 5.3096507922219e-07,
"loss": 0.7120569944381714,
"step": 2610
},
{
"epoch": 2.1986531986531985,
"grad_norm": 7.548922538757324,
"learning_rate": 5.296717799456703e-07,
"loss": 0.2670977711677551,
"step": 2612
},
{
"epoch": 2.2003367003367003,
"grad_norm": 6.819214820861816,
"learning_rate": 5.283804456898393e-07,
"loss": 0.7222539782524109,
"step": 2614
},
{
"epoch": 2.202020202020202,
"grad_norm": 6.466555595397949,
"learning_rate": 5.270910809038866e-07,
"loss": 0.5107656717300415,
"step": 2616
},
{
"epoch": 2.2037037037037037,
"grad_norm": 9.062774658203125,
"learning_rate": 5.258036900302162e-07,
"loss": 0.44302040338516235,
"step": 2618
},
{
"epoch": 2.2053872053872055,
"grad_norm": 3.68121600151062,
"learning_rate": 5.245182775044319e-07,
"loss": 0.28953254222869873,
"step": 2620
},
{
"epoch": 2.207070707070707,
"grad_norm": 4.225932598114014,
"learning_rate": 5.2323484775532e-07,
"loss": 0.5604819655418396,
"step": 2622
},
{
"epoch": 2.208754208754209,
"grad_norm": 6.57682466506958,
"learning_rate": 5.219534052048364e-07,
"loss": 0.4838787317276001,
"step": 2624
},
{
"epoch": 2.2104377104377106,
"grad_norm": 5.847450256347656,
"learning_rate": 5.206739542680903e-07,
"loss": 0.41042160987854004,
"step": 2626
},
{
"epoch": 2.212121212121212,
"grad_norm": 10.914462089538574,
"learning_rate": 5.193964993533275e-07,
"loss": 0.5403867959976196,
"step": 2628
},
{
"epoch": 2.2138047138047137,
"grad_norm": 8.292633056640625,
"learning_rate": 5.181210448619185e-07,
"loss": 0.25527873635292053,
"step": 2630
},
{
"epoch": 2.2154882154882154,
"grad_norm": 18.88636016845703,
"learning_rate": 5.168475951883405e-07,
"loss": 0.404461145401001,
"step": 2632
},
{
"epoch": 2.217171717171717,
"grad_norm": 3.0683631896972656,
"learning_rate": 5.155761547201631e-07,
"loss": 0.07407370954751968,
"step": 2634
},
{
"epoch": 2.218855218855219,
"grad_norm": 3.333080291748047,
"learning_rate": 5.143067278380339e-07,
"loss": 0.7165415287017822,
"step": 2636
},
{
"epoch": 2.2205387205387206,
"grad_norm": 11.401552200317383,
"learning_rate": 5.13039318915663e-07,
"loss": 1.0603926181793213,
"step": 2638
},
{
"epoch": 2.2222222222222223,
"grad_norm": 7.289011001586914,
"learning_rate": 5.117739323198067e-07,
"loss": 0.997651219367981,
"step": 2640
},
{
"epoch": 2.223905723905724,
"grad_norm": 4.159246444702148,
"learning_rate": 5.105105724102547e-07,
"loss": 0.6530795097351074,
"step": 2642
},
{
"epoch": 2.225589225589226,
"grad_norm": 30.293039321899414,
"learning_rate": 5.092492435398137e-07,
"loss": 0.6192750930786133,
"step": 2644
},
{
"epoch": 2.227272727272727,
"grad_norm": 13.535540580749512,
"learning_rate": 5.079899500542917e-07,
"loss": 0.5436962246894836,
"step": 2646
},
{
"epoch": 2.228956228956229,
"grad_norm": 5.441864967346191,
"learning_rate": 5.067326962924848e-07,
"loss": 0.2577816843986511,
"step": 2648
},
{
"epoch": 2.2306397306397305,
"grad_norm": 8.864923477172852,
"learning_rate": 5.054774865861617e-07,
"loss": 0.9602568745613098,
"step": 2650
},
{
"epoch": 2.2323232323232323,
"grad_norm": 14.644983291625977,
"learning_rate": 5.042243252600475e-07,
"loss": 0.5225367546081543,
"step": 2652
},
{
"epoch": 2.234006734006734,
"grad_norm": 17.72758674621582,
"learning_rate": 5.029732166318106e-07,
"loss": 0.47632715106010437,
"step": 2654
},
{
"epoch": 2.2356902356902357,
"grad_norm": 3.0727274417877197,
"learning_rate": 5.017241650120462e-07,
"loss": 0.5418964624404907,
"step": 2656
},
{
"epoch": 2.2373737373737375,
"grad_norm": 12.295948028564453,
"learning_rate": 5.004771747042631e-07,
"loss": 0.8024328351020813,
"step": 2658
},
{
"epoch": 2.239057239057239,
"grad_norm": 10.540696144104004,
"learning_rate": 4.992322500048673e-07,
"loss": 0.5871691703796387,
"step": 2660
},
{
"epoch": 2.240740740740741,
"grad_norm": 3.301222324371338,
"learning_rate": 4.979893952031483e-07,
"loss": 0.7337244153022766,
"step": 2662
},
{
"epoch": 2.242424242424242,
"grad_norm": 18.132505416870117,
"learning_rate": 4.96748614581264e-07,
"loss": 0.3517826795578003,
"step": 2664
},
{
"epoch": 2.244107744107744,
"grad_norm": 5.087287902832031,
"learning_rate": 4.955099124142251e-07,
"loss": 0.7348419427871704,
"step": 2666
},
{
"epoch": 2.2457912457912457,
"grad_norm": 5.434046268463135,
"learning_rate": 4.942732929698827e-07,
"loss": 0.5416382551193237,
"step": 2668
},
{
"epoch": 2.2474747474747474,
"grad_norm": 5.668000221252441,
"learning_rate": 4.930387605089104e-07,
"loss": 0.44201749563217163,
"step": 2670
},
{
"epoch": 2.249158249158249,
"grad_norm": 2.4525139331817627,
"learning_rate": 4.918063192847921e-07,
"loss": 0.34817391633987427,
"step": 2672
},
{
"epoch": 2.250841750841751,
"grad_norm": 10.748351097106934,
"learning_rate": 4.905759735438068e-07,
"loss": 0.6200217008590698,
"step": 2674
},
{
"epoch": 2.2525252525252526,
"grad_norm": 4.222598075866699,
"learning_rate": 4.893477275250127e-07,
"loss": 0.7119044065475464,
"step": 2676
},
{
"epoch": 2.2542087542087543,
"grad_norm": 3.8408939838409424,
"learning_rate": 4.881215854602342e-07,
"loss": 0.4421549141407013,
"step": 2678
},
{
"epoch": 2.255892255892256,
"grad_norm": 2.2825546264648438,
"learning_rate": 4.868975515740471e-07,
"loss": 0.835530161857605,
"step": 2680
},
{
"epoch": 2.257575757575758,
"grad_norm": 11.838665962219238,
"learning_rate": 4.856756300837625e-07,
"loss": 0.19798390567302704,
"step": 2682
},
{
"epoch": 2.259259259259259,
"grad_norm": 25.079456329345703,
"learning_rate": 4.844558251994146e-07,
"loss": 0.1048535406589508,
"step": 2684
},
{
"epoch": 2.260942760942761,
"grad_norm": 4.039831638336182,
"learning_rate": 4.832381411237444e-07,
"loss": 0.604271650314331,
"step": 2686
},
{
"epoch": 2.2626262626262625,
"grad_norm": 4.378790855407715,
"learning_rate": 4.820225820521855e-07,
"loss": 0.36290663480758667,
"step": 2688
},
{
"epoch": 2.2643097643097643,
"grad_norm": 4.032955169677734,
"learning_rate": 4.808091521728506e-07,
"loss": 0.8970327377319336,
"step": 2690
},
{
"epoch": 2.265993265993266,
"grad_norm": 6.259299278259277,
"learning_rate": 4.795978556665165e-07,
"loss": 0.8129058480262756,
"step": 2692
},
{
"epoch": 2.2676767676767677,
"grad_norm": 4.226785182952881,
"learning_rate": 4.783886967066088e-07,
"loss": 0.653793454170227,
"step": 2694
},
{
"epoch": 2.2693602693602695,
"grad_norm": 8.080623626708984,
"learning_rate": 4.77181679459189e-07,
"loss": 0.5345746874809265,
"step": 2696
},
{
"epoch": 2.271043771043771,
"grad_norm": 2.8720853328704834,
"learning_rate": 4.759768080829399e-07,
"loss": 0.638217568397522,
"step": 2698
},
{
"epoch": 2.2727272727272725,
"grad_norm": 5.371377944946289,
"learning_rate": 4.747740867291497e-07,
"loss": 0.7549663782119751,
"step": 2700
},
{
"epoch": 2.274410774410774,
"grad_norm": 2.809866428375244,
"learning_rate": 4.7357351954169973e-07,
"loss": 0.5037040114402771,
"step": 2702
},
{
"epoch": 2.276094276094276,
"grad_norm": 11.470369338989258,
"learning_rate": 4.7237511065704933e-07,
"loss": 0.8505884408950806,
"step": 2704
},
{
"epoch": 2.2777777777777777,
"grad_norm": 5.015624523162842,
"learning_rate": 4.7117886420422094e-07,
"loss": 0.9292435050010681,
"step": 2706
},
{
"epoch": 2.2794612794612794,
"grad_norm": 3.195216655731201,
"learning_rate": 4.6998478430478714e-07,
"loss": 0.4456526041030884,
"step": 2708
},
{
"epoch": 2.281144781144781,
"grad_norm": 7.715219497680664,
"learning_rate": 4.6879287507285596e-07,
"loss": 0.49354496598243713,
"step": 2710
},
{
"epoch": 2.282828282828283,
"grad_norm": 10.351372718811035,
"learning_rate": 4.676031406150555e-07,
"loss": 0.517022430896759,
"step": 2712
},
{
"epoch": 2.2845117845117846,
"grad_norm": 4.449305534362793,
"learning_rate": 4.66415585030522e-07,
"loss": 0.42631667852401733,
"step": 2714
},
{
"epoch": 2.2861952861952863,
"grad_norm": 21.76262855529785,
"learning_rate": 4.6523021241088416e-07,
"loss": 0.7113944292068481,
"step": 2716
},
{
"epoch": 2.287878787878788,
"grad_norm": 37.7462272644043,
"learning_rate": 4.6404702684024905e-07,
"loss": 0.5162969827651978,
"step": 2718
},
{
"epoch": 2.28956228956229,
"grad_norm": 4.822917938232422,
"learning_rate": 4.628660323951891e-07,
"loss": 0.5146564841270447,
"step": 2720
},
{
"epoch": 2.291245791245791,
"grad_norm": 2.2735533714294434,
"learning_rate": 4.616872331447272e-07,
"loss": 0.6732128262519836,
"step": 2722
},
{
"epoch": 2.292929292929293,
"grad_norm": 3.959578514099121,
"learning_rate": 4.605106331503223e-07,
"loss": 0.6910574436187744,
"step": 2724
},
{
"epoch": 2.2946127946127945,
"grad_norm": 6.245284080505371,
"learning_rate": 4.5933623646585683e-07,
"loss": 0.6672347784042358,
"step": 2726
},
{
"epoch": 2.2962962962962963,
"grad_norm": 18.67147445678711,
"learning_rate": 4.581640471376215e-07,
"loss": 0.509329617023468,
"step": 2728
},
{
"epoch": 2.297979797979798,
"grad_norm": 5.631857395172119,
"learning_rate": 4.5699406920430155e-07,
"loss": 0.9162227511405945,
"step": 2730
},
{
"epoch": 2.2996632996632997,
"grad_norm": 4.981385707855225,
"learning_rate": 4.5582630669696324e-07,
"loss": 0.46352601051330566,
"step": 2732
},
{
"epoch": 2.3013468013468015,
"grad_norm": 11.902592658996582,
"learning_rate": 4.5466076363904e-07,
"loss": 0.44609200954437256,
"step": 2734
},
{
"epoch": 2.303030303030303,
"grad_norm": 7.000277042388916,
"learning_rate": 4.5349744404631785e-07,
"loss": 0.38603392243385315,
"step": 2736
},
{
"epoch": 2.3047138047138045,
"grad_norm": 19.020755767822266,
"learning_rate": 4.5233635192692206e-07,
"loss": 0.5370512008666992,
"step": 2738
},
{
"epoch": 2.3063973063973062,
"grad_norm": 9.254744529724121,
"learning_rate": 4.511774912813043e-07,
"loss": 0.35465237498283386,
"step": 2740
},
{
"epoch": 2.308080808080808,
"grad_norm": 2.5461535453796387,
"learning_rate": 4.5002086610222626e-07,
"loss": 0.7493946552276611,
"step": 2742
},
{
"epoch": 2.3097643097643097,
"grad_norm": 7.801723003387451,
"learning_rate": 4.488664803747487e-07,
"loss": 0.7291615009307861,
"step": 2744
},
{
"epoch": 2.3114478114478114,
"grad_norm": 4.71798849105835,
"learning_rate": 4.4771433807621644e-07,
"loss": 0.8265661001205444,
"step": 2746
},
{
"epoch": 2.313131313131313,
"grad_norm": 11.469908714294434,
"learning_rate": 4.4656444317624397e-07,
"loss": 0.6443151831626892,
"step": 2748
},
{
"epoch": 2.314814814814815,
"grad_norm": 0.9388121962547302,
"learning_rate": 4.454167996367032e-07,
"loss": 0.0978798121213913,
"step": 2750
},
{
"epoch": 2.3164983164983166,
"grad_norm": 7.400945663452148,
"learning_rate": 4.442714114117092e-07,
"loss": 0.2580530345439911,
"step": 2752
},
{
"epoch": 2.3181818181818183,
"grad_norm": 3.6424386501312256,
"learning_rate": 4.4312828244760613e-07,
"loss": 0.46834707260131836,
"step": 2754
},
{
"epoch": 2.31986531986532,
"grad_norm": 10.415234565734863,
"learning_rate": 4.4198741668295425e-07,
"loss": 0.900390625,
"step": 2756
},
{
"epoch": 2.3215488215488214,
"grad_norm": 2.8194925785064697,
"learning_rate": 4.4084881804851644e-07,
"loss": 0.6006342172622681,
"step": 2758
},
{
"epoch": 2.323232323232323,
"grad_norm": 9.550015449523926,
"learning_rate": 4.397124904672437e-07,
"loss": 0.7037711143493652,
"step": 2760
},
{
"epoch": 2.324915824915825,
"grad_norm": 5.865845203399658,
"learning_rate": 4.3857843785426263e-07,
"loss": 0.4606119990348816,
"step": 2762
},
{
"epoch": 2.3265993265993266,
"grad_norm": 9.260407447814941,
"learning_rate": 4.374466641168622e-07,
"loss": 0.9028510451316833,
"step": 2764
},
{
"epoch": 2.3282828282828283,
"grad_norm": 30.487369537353516,
"learning_rate": 4.363171731544786e-07,
"loss": 0.6837437152862549,
"step": 2766
},
{
"epoch": 2.32996632996633,
"grad_norm": 3.4019174575805664,
"learning_rate": 4.351899688586834e-07,
"loss": 0.5506434440612793,
"step": 2768
},
{
"epoch": 2.3316498316498318,
"grad_norm": 9.221944808959961,
"learning_rate": 4.3406505511317025e-07,
"loss": 0.6231704354286194,
"step": 2770
},
{
"epoch": 2.3333333333333335,
"grad_norm": 5.134349346160889,
"learning_rate": 4.329424357937397e-07,
"loss": 0.5775326490402222,
"step": 2772
},
{
"epoch": 2.3350168350168348,
"grad_norm": 3.2986905574798584,
"learning_rate": 4.318221147682879e-07,
"loss": 0.6728795766830444,
"step": 2774
},
{
"epoch": 2.3367003367003365,
"grad_norm": 7.071535587310791,
"learning_rate": 4.307040958967924e-07,
"loss": 0.7195960879325867,
"step": 2776
},
{
"epoch": 2.3383838383838382,
"grad_norm": 6.33209228515625,
"learning_rate": 4.2958838303129817e-07,
"loss": 0.3605208098888397,
"step": 2778
},
{
"epoch": 2.34006734006734,
"grad_norm": 15.394960403442383,
"learning_rate": 4.2847498001590573e-07,
"loss": 0.6560809016227722,
"step": 2780
},
{
"epoch": 2.3417508417508417,
"grad_norm": 5.364711761474609,
"learning_rate": 4.273638906867573e-07,
"loss": 0.5723754167556763,
"step": 2782
},
{
"epoch": 2.3434343434343434,
"grad_norm": 4.554681301116943,
"learning_rate": 4.2625511887202225e-07,
"loss": 0.786733090877533,
"step": 2784
},
{
"epoch": 2.345117845117845,
"grad_norm": 5.919230937957764,
"learning_rate": 4.2514866839188657e-07,
"loss": 0.5187538862228394,
"step": 2786
},
{
"epoch": 2.346801346801347,
"grad_norm": 2.8754208087921143,
"learning_rate": 4.2404454305853796e-07,
"loss": 0.9200822114944458,
"step": 2788
},
{
"epoch": 2.3484848484848486,
"grad_norm": 4.2973833084106445,
"learning_rate": 4.229427466761522e-07,
"loss": 0.7082578539848328,
"step": 2790
},
{
"epoch": 2.3501683501683504,
"grad_norm": 2.8982136249542236,
"learning_rate": 4.2184328304088164e-07,
"loss": 0.5452355146408081,
"step": 2792
},
{
"epoch": 2.351851851851852,
"grad_norm": 10.917097091674805,
"learning_rate": 4.2074615594084146e-07,
"loss": 0.5780555009841919,
"step": 2794
},
{
"epoch": 2.3535353535353534,
"grad_norm": 4.399576187133789,
"learning_rate": 4.1965136915609543e-07,
"loss": 0.9775782823562622,
"step": 2796
},
{
"epoch": 2.355218855218855,
"grad_norm": 3.9406611919403076,
"learning_rate": 4.1855892645864513e-07,
"loss": 0.4702543616294861,
"step": 2798
},
{
"epoch": 2.356902356902357,
"grad_norm": 2.8284730911254883,
"learning_rate": 4.1746883161241555e-07,
"loss": 1.041868805885315,
"step": 2800
},
{
"epoch": 2.3585858585858586,
"grad_norm": 2.9816761016845703,
"learning_rate": 4.1638108837324137e-07,
"loss": 0.8972384333610535,
"step": 2802
},
{
"epoch": 2.3602693602693603,
"grad_norm": 4.195338249206543,
"learning_rate": 4.152957004888563e-07,
"loss": 0.8051435947418213,
"step": 2804
},
{
"epoch": 2.361952861952862,
"grad_norm": 7.884792804718018,
"learning_rate": 4.142126716988784e-07,
"loss": 0.805417001247406,
"step": 2806
},
{
"epoch": 2.3636363636363638,
"grad_norm": 3.174224853515625,
"learning_rate": 4.131320057347969e-07,
"loss": 0.7631466388702393,
"step": 2808
},
{
"epoch": 2.3653198653198655,
"grad_norm": 2.2100088596343994,
"learning_rate": 4.120537063199612e-07,
"loss": 0.9656248688697815,
"step": 2810
},
{
"epoch": 2.3670033670033668,
"grad_norm": 21.951086044311523,
"learning_rate": 4.109777771695663e-07,
"loss": 0.6510505676269531,
"step": 2812
},
{
"epoch": 2.3686868686868685,
"grad_norm": 4.415777683258057,
"learning_rate": 4.0990422199064103e-07,
"loss": 0.5992385745048523,
"step": 2814
},
{
"epoch": 2.3703703703703702,
"grad_norm": 4.938045024871826,
"learning_rate": 4.0883304448203477e-07,
"loss": 0.6755191087722778,
"step": 2816
},
{
"epoch": 2.372053872053872,
"grad_norm": 5.014671325683594,
"learning_rate": 4.077642483344044e-07,
"loss": 0.6416581869125366,
"step": 2818
},
{
"epoch": 2.3737373737373737,
"grad_norm": 3.0677618980407715,
"learning_rate": 4.066978372302025e-07,
"loss": 0.7114299535751343,
"step": 2820
},
{
"epoch": 2.3754208754208754,
"grad_norm": 5.499224662780762,
"learning_rate": 4.056338148436643e-07,
"loss": 0.38672173023223877,
"step": 2822
},
{
"epoch": 2.377104377104377,
"grad_norm": 3.9416239261627197,
"learning_rate": 4.0457218484079414e-07,
"loss": 0.9695321321487427,
"step": 2824
},
{
"epoch": 2.378787878787879,
"grad_norm": 4.72567892074585,
"learning_rate": 4.035129508793542e-07,
"loss": 0.899653971195221,
"step": 2826
},
{
"epoch": 2.3804713804713806,
"grad_norm": 4.175594806671143,
"learning_rate": 4.024561166088516e-07,
"loss": 0.4069860577583313,
"step": 2828
},
{
"epoch": 2.3821548821548824,
"grad_norm": 12.212733268737793,
"learning_rate": 4.0140168567052447e-07,
"loss": 0.90252685546875,
"step": 2830
},
{
"epoch": 2.3838383838383836,
"grad_norm": 10.1971435546875,
"learning_rate": 4.003496616973312e-07,
"loss": 0.6742314100265503,
"step": 2832
},
{
"epoch": 2.3855218855218854,
"grad_norm": 19.07830238342285,
"learning_rate": 3.9930004831393757e-07,
"loss": 0.5178687572479248,
"step": 2834
},
{
"epoch": 2.387205387205387,
"grad_norm": 5.426108360290527,
"learning_rate": 3.982528491367025e-07,
"loss": 0.5686367154121399,
"step": 2836
},
{
"epoch": 2.388888888888889,
"grad_norm": 15.152667045593262,
"learning_rate": 3.9720806777366817e-07,
"loss": 0.4284480810165405,
"step": 2838
},
{
"epoch": 2.3905723905723906,
"grad_norm": 3.7981669902801514,
"learning_rate": 3.961657078245462e-07,
"loss": 0.7795579433441162,
"step": 2840
},
{
"epoch": 2.3922558922558923,
"grad_norm": 2.7446529865264893,
"learning_rate": 3.9512577288070487e-07,
"loss": 0.3763793110847473,
"step": 2842
},
{
"epoch": 2.393939393939394,
"grad_norm": 2.8617823123931885,
"learning_rate": 3.940882665251576e-07,
"loss": 0.9840795993804932,
"step": 2844
},
{
"epoch": 2.3956228956228958,
"grad_norm": 3.311777114868164,
"learning_rate": 3.930531923325506e-07,
"loss": 0.7532452344894409,
"step": 2846
},
{
"epoch": 2.3973063973063975,
"grad_norm": 7.39417839050293,
"learning_rate": 3.920205538691497e-07,
"loss": 0.9117331504821777,
"step": 2848
},
{
"epoch": 2.398989898989899,
"grad_norm": 2.8873496055603027,
"learning_rate": 3.9099035469282906e-07,
"loss": 0.7445226907730103,
"step": 2850
},
{
"epoch": 2.4006734006734005,
"grad_norm": 5.140913486480713,
"learning_rate": 3.8996259835305835e-07,
"loss": 0.3813757598400116,
"step": 2852
},
{
"epoch": 2.4023569023569022,
"grad_norm": 18.368505477905273,
"learning_rate": 3.8893728839089035e-07,
"loss": 0.589090883731842,
"step": 2854
},
{
"epoch": 2.404040404040404,
"grad_norm": 7.0607709884643555,
"learning_rate": 3.879144283389495e-07,
"loss": 0.5158854126930237,
"step": 2856
},
{
"epoch": 2.4057239057239057,
"grad_norm": 6.402346134185791,
"learning_rate": 3.8689402172141915e-07,
"loss": 0.6101418733596802,
"step": 2858
},
{
"epoch": 2.4074074074074074,
"grad_norm": 11.600252151489258,
"learning_rate": 3.8587607205402916e-07,
"loss": 0.3425447642803192,
"step": 2860
},
{
"epoch": 2.409090909090909,
"grad_norm": 3.169504165649414,
"learning_rate": 3.848605828440444e-07,
"loss": 0.7518799901008606,
"step": 2862
},
{
"epoch": 2.410774410774411,
"grad_norm": 6.338188171386719,
"learning_rate": 3.8384755759025313e-07,
"loss": 0.4169810712337494,
"step": 2864
},
{
"epoch": 2.4124579124579126,
"grad_norm": 4.593759536743164,
"learning_rate": 3.828369997829528e-07,
"loss": 0.6622034907341003,
"step": 2866
},
{
"epoch": 2.4141414141414144,
"grad_norm": 10.378397941589355,
"learning_rate": 3.818289129039405e-07,
"loss": 0.7845497131347656,
"step": 2868
},
{
"epoch": 2.4158249158249157,
"grad_norm": 2.801703453063965,
"learning_rate": 3.808233004264997e-07,
"loss": 0.5676144361495972,
"step": 2870
},
{
"epoch": 2.4175084175084174,
"grad_norm": 3.49591064453125,
"learning_rate": 3.79820165815389e-07,
"loss": 0.4738210439682007,
"step": 2872
},
{
"epoch": 2.419191919191919,
"grad_norm": 3.7410953044891357,
"learning_rate": 3.788195125268284e-07,
"loss": 0.8427296876907349,
"step": 2874
},
{
"epoch": 2.420875420875421,
"grad_norm": 5.019288063049316,
"learning_rate": 3.7782134400848995e-07,
"loss": 0.7298943996429443,
"step": 2876
},
{
"epoch": 2.4225589225589226,
"grad_norm": 3.775413751602173,
"learning_rate": 3.768256636994843e-07,
"loss": 0.4356338381767273,
"step": 2878
},
{
"epoch": 2.4242424242424243,
"grad_norm": 2.9583945274353027,
"learning_rate": 3.7583247503034864e-07,
"loss": 0.7260875701904297,
"step": 2880
},
{
"epoch": 2.425925925925926,
"grad_norm": 3.2975947856903076,
"learning_rate": 3.7484178142303625e-07,
"loss": 0.5450549721717834,
"step": 2882
},
{
"epoch": 2.4276094276094278,
"grad_norm": 16.18134307861328,
"learning_rate": 3.738535862909031e-07,
"loss": 0.4824645519256592,
"step": 2884
},
{
"epoch": 2.429292929292929,
"grad_norm": 5.209835529327393,
"learning_rate": 3.7286789303869735e-07,
"loss": 0.4984836280345917,
"step": 2886
},
{
"epoch": 2.430976430976431,
"grad_norm": 9.006096839904785,
"learning_rate": 3.7188470506254744e-07,
"loss": 0.6126713156700134,
"step": 2888
},
{
"epoch": 2.4326599326599325,
"grad_norm": 2.905740261077881,
"learning_rate": 3.7090402574994885e-07,
"loss": 0.5302858352661133,
"step": 2890
},
{
"epoch": 2.4343434343434343,
"grad_norm": 7.235422134399414,
"learning_rate": 3.699258584797548e-07,
"loss": 0.5883275270462036,
"step": 2892
},
{
"epoch": 2.436026936026936,
"grad_norm": 4.7563157081604,
"learning_rate": 3.6895020662216326e-07,
"loss": 0.8630578517913818,
"step": 2894
},
{
"epoch": 2.4377104377104377,
"grad_norm": 3.8442506790161133,
"learning_rate": 3.679770735387052e-07,
"loss": 0.720264732837677,
"step": 2896
},
{
"epoch": 2.4393939393939394,
"grad_norm": 6.493531703948975,
"learning_rate": 3.6700646258223343e-07,
"loss": 0.6094503998756409,
"step": 2898
},
{
"epoch": 2.441077441077441,
"grad_norm": 24.394699096679688,
"learning_rate": 3.6603837709691153e-07,
"loss": 0.40544137358665466,
"step": 2900
},
{
"epoch": 2.442760942760943,
"grad_norm": 4.592130661010742,
"learning_rate": 3.6507282041820085e-07,
"loss": 0.8314005136489868,
"step": 2902
},
{
"epoch": 2.4444444444444446,
"grad_norm": 9.8695707321167,
"learning_rate": 3.641097958728506e-07,
"loss": 0.49147939682006836,
"step": 2904
},
{
"epoch": 2.4461279461279464,
"grad_norm": 6.742786407470703,
"learning_rate": 3.631493067788858e-07,
"loss": 0.34731265902519226,
"step": 2906
},
{
"epoch": 2.4478114478114477,
"grad_norm": 7.511764049530029,
"learning_rate": 3.6219135644559506e-07,
"loss": 0.5173161029815674,
"step": 2908
},
{
"epoch": 2.4494949494949494,
"grad_norm": 3.2894692420959473,
"learning_rate": 3.6123594817352046e-07,
"loss": 0.6695667505264282,
"step": 2910
},
{
"epoch": 2.451178451178451,
"grad_norm": 5.603763103485107,
"learning_rate": 3.602830852544458e-07,
"loss": 0.4327901005744934,
"step": 2912
},
{
"epoch": 2.452861952861953,
"grad_norm": 3.399629592895508,
"learning_rate": 3.593327709713844e-07,
"loss": 0.7913680672645569,
"step": 2914
},
{
"epoch": 2.4545454545454546,
"grad_norm": 3.867079257965088,
"learning_rate": 3.5838500859856893e-07,
"loss": 0.6534749865531921,
"step": 2916
},
{
"epoch": 2.4562289562289563,
"grad_norm": 1.4564638137817383,
"learning_rate": 3.5743980140143975e-07,
"loss": 0.19182810187339783,
"step": 2918
},
{
"epoch": 2.457912457912458,
"grad_norm": 4.126720905303955,
"learning_rate": 3.5649715263663297e-07,
"loss": 0.8050523996353149,
"step": 2920
},
{
"epoch": 2.45959595959596,
"grad_norm": 518.9237670898438,
"learning_rate": 3.5555706555197043e-07,
"loss": 0.3782300353050232,
"step": 2922
},
{
"epoch": 2.461279461279461,
"grad_norm": 4.448193073272705,
"learning_rate": 3.5461954338644795e-07,
"loss": 0.316059410572052,
"step": 2924
},
{
"epoch": 2.462962962962963,
"grad_norm": 3.345587730407715,
"learning_rate": 3.536845893702234e-07,
"loss": 0.5723974704742432,
"step": 2926
},
{
"epoch": 2.4646464646464645,
"grad_norm": 8.732227325439453,
"learning_rate": 3.527522067246068e-07,
"loss": 0.5091125965118408,
"step": 2928
},
{
"epoch": 2.4663299663299663,
"grad_norm": 3.8187427520751953,
"learning_rate": 3.518223986620491e-07,
"loss": 0.3073745667934418,
"step": 2930
},
{
"epoch": 2.468013468013468,
"grad_norm": 8.199573516845703,
"learning_rate": 3.5089516838612986e-07,
"loss": 0.6242831945419312,
"step": 2932
},
{
"epoch": 2.4696969696969697,
"grad_norm": 6.898658752441406,
"learning_rate": 3.499705190915476e-07,
"loss": 0.627583384513855,
"step": 2934
},
{
"epoch": 2.4713804713804715,
"grad_norm": 7.565421104431152,
"learning_rate": 3.4904845396410854e-07,
"loss": 0.43692106008529663,
"step": 2936
},
{
"epoch": 2.473063973063973,
"grad_norm": 3.025193691253662,
"learning_rate": 3.4812897618071445e-07,
"loss": 0.5572280883789062,
"step": 2938
},
{
"epoch": 2.474747474747475,
"grad_norm": 8.582428932189941,
"learning_rate": 3.472120889093536e-07,
"loss": 0.5607247352600098,
"step": 2940
},
{
"epoch": 2.4764309764309766,
"grad_norm": 5.5012640953063965,
"learning_rate": 3.462977953090884e-07,
"loss": 0.3747951090335846,
"step": 2942
},
{
"epoch": 2.478114478114478,
"grad_norm": 4.519533634185791,
"learning_rate": 3.453860985300446e-07,
"loss": 0.43182575702667236,
"step": 2944
},
{
"epoch": 2.4797979797979797,
"grad_norm": 2.845407247543335,
"learning_rate": 3.4447700171340164e-07,
"loss": 0.9047005772590637,
"step": 2946
},
{
"epoch": 2.4814814814814814,
"grad_norm": 2.432866334915161,
"learning_rate": 3.4357050799138053e-07,
"loss": 0.938655436038971,
"step": 2948
},
{
"epoch": 2.483164983164983,
"grad_norm": 3.2918946743011475,
"learning_rate": 3.4266662048723337e-07,
"loss": 1.013432502746582,
"step": 2950
},
{
"epoch": 2.484848484848485,
"grad_norm": 18.99071502685547,
"learning_rate": 3.417653423152329e-07,
"loss": 0.8985989093780518,
"step": 2952
},
{
"epoch": 2.4865319865319866,
"grad_norm": 6.151244163513184,
"learning_rate": 3.4086667658066186e-07,
"loss": 0.5609415769577026,
"step": 2954
},
{
"epoch": 2.4882154882154883,
"grad_norm": 8.208552360534668,
"learning_rate": 3.3997062637980167e-07,
"loss": 0.8369396924972534,
"step": 2956
},
{
"epoch": 2.48989898989899,
"grad_norm": 6.0119853019714355,
"learning_rate": 3.390771947999224e-07,
"loss": 0.5242006182670593,
"step": 2958
},
{
"epoch": 2.4915824915824913,
"grad_norm": 7.873940467834473,
"learning_rate": 3.381863849192718e-07,
"loss": 0.8243865370750427,
"step": 2960
},
{
"epoch": 2.493265993265993,
"grad_norm": 7.8693742752075195,
"learning_rate": 3.3729819980706444e-07,
"loss": 0.5058671832084656,
"step": 2962
},
{
"epoch": 2.494949494949495,
"grad_norm": 2.891031503677368,
"learning_rate": 3.364126425234719e-07,
"loss": 0.7412878274917603,
"step": 2964
},
{
"epoch": 2.4966329966329965,
"grad_norm": 18.90471649169922,
"learning_rate": 3.3552971611961187e-07,
"loss": 0.5835074186325073,
"step": 2966
},
{
"epoch": 2.4983164983164983,
"grad_norm": 3.6547908782958984,
"learning_rate": 3.34649423637537e-07,
"loss": 0.8192091584205627,
"step": 2968
},
{
"epoch": 2.5,
"grad_norm": 11.999411582946777,
"learning_rate": 3.337717681102253e-07,
"loss": 0.8428059816360474,
"step": 2970
},
{
"epoch": 2.5016835016835017,
"grad_norm": 5.940135955810547,
"learning_rate": 3.328967525615697e-07,
"loss": 0.39063435792922974,
"step": 2972
},
{
"epoch": 2.5033670033670035,
"grad_norm": 23.767696380615234,
"learning_rate": 3.3202438000636634e-07,
"loss": 0.47806400060653687,
"step": 2974
},
{
"epoch": 2.505050505050505,
"grad_norm": 6.031237602233887,
"learning_rate": 3.311546534503061e-07,
"loss": 0.6802424788475037,
"step": 2976
},
{
"epoch": 2.506734006734007,
"grad_norm": 25.405719757080078,
"learning_rate": 3.3028757588996303e-07,
"loss": 0.38681331276893616,
"step": 2978
},
{
"epoch": 2.5084175084175087,
"grad_norm": 6.533238887786865,
"learning_rate": 3.294231503127839e-07,
"loss": 0.7302665710449219,
"step": 2980
},
{
"epoch": 2.51010101010101,
"grad_norm": 6.384099006652832,
"learning_rate": 3.2856137969707847e-07,
"loss": 0.7972818613052368,
"step": 2982
},
{
"epoch": 2.5117845117845117,
"grad_norm": 6.987396240234375,
"learning_rate": 3.277022670120095e-07,
"loss": 0.39771410822868347,
"step": 2984
},
{
"epoch": 2.5134680134680134,
"grad_norm": 16.85350227355957,
"learning_rate": 3.268458152175813e-07,
"loss": 0.7731115818023682,
"step": 2986
},
{
"epoch": 2.515151515151515,
"grad_norm": 4.062409400939941,
"learning_rate": 3.2599202726463084e-07,
"loss": 0.5933781862258911,
"step": 2988
},
{
"epoch": 2.516835016835017,
"grad_norm": 7.16248083114624,
"learning_rate": 3.2514090609481683e-07,
"loss": 0.09502522647380829,
"step": 2990
},
{
"epoch": 2.5185185185185186,
"grad_norm": 4.739719867706299,
"learning_rate": 3.2429245464060965e-07,
"loss": 0.8891875147819519,
"step": 2992
},
{
"epoch": 2.5202020202020203,
"grad_norm": 6.538869857788086,
"learning_rate": 3.234466758252818e-07,
"loss": 0.5735270977020264,
"step": 2994
},
{
"epoch": 2.5218855218855216,
"grad_norm": 5.069677352905273,
"learning_rate": 3.2260357256289715e-07,
"loss": 0.7090741395950317,
"step": 2996
},
{
"epoch": 2.5235690235690234,
"grad_norm": 4.84168004989624,
"learning_rate": 3.217631477583009e-07,
"loss": 0.5537684559822083,
"step": 2998
},
{
"epoch": 2.525252525252525,
"grad_norm": 4.053093910217285,
"learning_rate": 3.2092540430711044e-07,
"loss": 0.5045433044433594,
"step": 3000
},
{
"epoch": 2.526936026936027,
"grad_norm": 6.442458152770996,
"learning_rate": 3.200903450957044e-07,
"loss": 0.4958549439907074,
"step": 3002
},
{
"epoch": 2.5286195286195285,
"grad_norm": 4.950314521789551,
"learning_rate": 3.192579730012129e-07,
"loss": 0.9713015556335449,
"step": 3004
},
{
"epoch": 2.5303030303030303,
"grad_norm": 32.4094123840332,
"learning_rate": 3.184282908915081e-07,
"loss": 0.7774836421012878,
"step": 3006
},
{
"epoch": 2.531986531986532,
"grad_norm": 8.05980396270752,
"learning_rate": 3.1760130162519427e-07,
"loss": 0.6949951648712158,
"step": 3008
},
{
"epoch": 2.5336700336700337,
"grad_norm": 6.453157901763916,
"learning_rate": 3.16777008051597e-07,
"loss": 0.2635032832622528,
"step": 3010
},
{
"epoch": 2.5353535353535355,
"grad_norm": 8.72614860534668,
"learning_rate": 3.159554130107546e-07,
"loss": 0.7169020771980286,
"step": 3012
},
{
"epoch": 2.537037037037037,
"grad_norm": 2.803579807281494,
"learning_rate": 3.1513651933340797e-07,
"loss": 0.6434400677680969,
"step": 3014
},
{
"epoch": 2.538720538720539,
"grad_norm": 2.3139851093292236,
"learning_rate": 3.143203298409899e-07,
"loss": 0.522533655166626,
"step": 3016
},
{
"epoch": 2.5404040404040407,
"grad_norm": 7.905545711517334,
"learning_rate": 3.1350684734561676e-07,
"loss": 0.8724677562713623,
"step": 3018
},
{
"epoch": 2.542087542087542,
"grad_norm": 3.6152162551879883,
"learning_rate": 3.126960746500784e-07,
"loss": 0.6959270238876343,
"step": 3020
},
{
"epoch": 2.5437710437710437,
"grad_norm": 14.063467025756836,
"learning_rate": 3.118880145478274e-07,
"loss": 0.7995277643203735,
"step": 3022
},
{
"epoch": 2.5454545454545454,
"grad_norm": 3.315876007080078,
"learning_rate": 3.110826698229711e-07,
"loss": 0.9624471664428711,
"step": 3024
},
{
"epoch": 2.547138047138047,
"grad_norm": 11.101134300231934,
"learning_rate": 3.102800432502607e-07,
"loss": 0.22170954942703247,
"step": 3026
},
{
"epoch": 2.548821548821549,
"grad_norm": 6.456979751586914,
"learning_rate": 3.0948013759508274e-07,
"loss": 0.5246233344078064,
"step": 3028
},
{
"epoch": 2.5505050505050506,
"grad_norm": 3.3993847370147705,
"learning_rate": 3.0868295561344874e-07,
"loss": 0.4475906491279602,
"step": 3030
},
{
"epoch": 2.5521885521885523,
"grad_norm": 1.6112107038497925,
"learning_rate": 3.078885000519858e-07,
"loss": 0.4590218961238861,
"step": 3032
},
{
"epoch": 2.5538720538720536,
"grad_norm": 14.948426246643066,
"learning_rate": 3.0709677364792767e-07,
"loss": 0.8541072607040405,
"step": 3034
},
{
"epoch": 2.5555555555555554,
"grad_norm": 3.668416976928711,
"learning_rate": 3.0630777912910533e-07,
"loss": 0.9300471544265747,
"step": 3036
},
{
"epoch": 2.557239057239057,
"grad_norm": 4.241018772125244,
"learning_rate": 3.0552151921393633e-07,
"loss": 0.6171663999557495,
"step": 3038
},
{
"epoch": 2.558922558922559,
"grad_norm": 6.009745121002197,
"learning_rate": 3.0473799661141707e-07,
"loss": 0.865818977355957,
"step": 3040
},
{
"epoch": 2.5606060606060606,
"grad_norm": 12.198860168457031,
"learning_rate": 3.0395721402111286e-07,
"loss": 0.6238538026809692,
"step": 3042
},
{
"epoch": 2.5622895622895623,
"grad_norm": 7.544912338256836,
"learning_rate": 3.031791741331478e-07,
"loss": 0.778638482093811,
"step": 3044
},
{
"epoch": 2.563973063973064,
"grad_norm": 6.367477893829346,
"learning_rate": 3.0240387962819695e-07,
"loss": 0.6787006855010986,
"step": 3046
},
{
"epoch": 2.5656565656565657,
"grad_norm": 3.4499287605285645,
"learning_rate": 3.016313331774762e-07,
"loss": 0.8738001585006714,
"step": 3048
},
{
"epoch": 2.5673400673400675,
"grad_norm": 2.4657773971557617,
"learning_rate": 3.008615374427329e-07,
"loss": 0.3498271703720093,
"step": 3050
},
{
"epoch": 2.569023569023569,
"grad_norm": 5.700794696807861,
"learning_rate": 3.000944950762373e-07,
"loss": 0.9484968185424805,
"step": 3052
},
{
"epoch": 2.570707070707071,
"grad_norm": 18.000146865844727,
"learning_rate": 2.993302087207732e-07,
"loss": 0.0691433697938919,
"step": 3054
},
{
"epoch": 2.5723905723905722,
"grad_norm": 16.131559371948242,
"learning_rate": 2.985686810096285e-07,
"loss": 0.6116932034492493,
"step": 3056
},
{
"epoch": 2.574074074074074,
"grad_norm": 5.901321887969971,
"learning_rate": 2.978099145665867e-07,
"loss": 0.3154261112213135,
"step": 3058
},
{
"epoch": 2.5757575757575757,
"grad_norm": 4.957643508911133,
"learning_rate": 2.970539120059174e-07,
"loss": 0.6580586433410645,
"step": 3060
},
{
"epoch": 2.5774410774410774,
"grad_norm": 5.373193264007568,
"learning_rate": 2.963006759323676e-07,
"loss": 0.6125509142875671,
"step": 3062
},
{
"epoch": 2.579124579124579,
"grad_norm": 5.6912522315979,
"learning_rate": 2.955502089411523e-07,
"loss": 0.4061823785305023,
"step": 3064
},
{
"epoch": 2.580808080808081,
"grad_norm": 10.119878768920898,
"learning_rate": 2.9480251361794656e-07,
"loss": 0.5432108044624329,
"step": 3066
},
{
"epoch": 2.5824915824915826,
"grad_norm": 5.718217372894287,
"learning_rate": 2.940575925388746e-07,
"loss": 0.2773892879486084,
"step": 3068
},
{
"epoch": 2.584175084175084,
"grad_norm": 7.640798091888428,
"learning_rate": 2.933154482705035e-07,
"loss": 0.08487945795059204,
"step": 3070
},
{
"epoch": 2.5858585858585856,
"grad_norm": 6.283509731292725,
"learning_rate": 2.925760833698327e-07,
"loss": 0.41717803478240967,
"step": 3072
},
{
"epoch": 2.5875420875420874,
"grad_norm": 3.45359468460083,
"learning_rate": 2.9183950038428475e-07,
"loss": 0.9503785371780396,
"step": 3074
},
{
"epoch": 2.589225589225589,
"grad_norm": 4.694600582122803,
"learning_rate": 2.9110570185169834e-07,
"loss": 0.3452813923358917,
"step": 3076
},
{
"epoch": 2.590909090909091,
"grad_norm": 3.6043646335601807,
"learning_rate": 2.903746903003184e-07,
"loss": 0.8001734614372253,
"step": 3078
},
{
"epoch": 2.5925925925925926,
"grad_norm": 5.150274753570557,
"learning_rate": 2.896464682487866e-07,
"loss": 0.6741084456443787,
"step": 3080
},
{
"epoch": 2.5942760942760943,
"grad_norm": 6.488956928253174,
"learning_rate": 2.8892103820613487e-07,
"loss": 0.9191502332687378,
"step": 3082
},
{
"epoch": 2.595959595959596,
"grad_norm": 6.83146333694458,
"learning_rate": 2.88198402671775e-07,
"loss": 0.5582960844039917,
"step": 3084
},
{
"epoch": 2.5976430976430978,
"grad_norm": 5.457592964172363,
"learning_rate": 2.874785641354901e-07,
"loss": 0.5779297947883606,
"step": 3086
},
{
"epoch": 2.5993265993265995,
"grad_norm": 3.332746744155884,
"learning_rate": 2.867615250774269e-07,
"loss": 0.7671989798545837,
"step": 3088
},
{
"epoch": 2.601010101010101,
"grad_norm": 3.9494850635528564,
"learning_rate": 2.860472879680869e-07,
"loss": 0.8642760515213013,
"step": 3090
},
{
"epoch": 2.602693602693603,
"grad_norm": 4.3518757820129395,
"learning_rate": 2.8533585526831726e-07,
"loss": 0.6304323673248291,
"step": 3092
},
{
"epoch": 2.6043771043771042,
"grad_norm": 6.350977897644043,
"learning_rate": 2.8462722942930286e-07,
"loss": 0.4931812286376953,
"step": 3094
},
{
"epoch": 2.606060606060606,
"grad_norm": 3.1723833084106445,
"learning_rate": 2.8392141289255806e-07,
"loss": 0.6241375207901001,
"step": 3096
},
{
"epoch": 2.6077441077441077,
"grad_norm": 6.107673168182373,
"learning_rate": 2.8321840808991775e-07,
"loss": 0.5527880191802979,
"step": 3098
},
{
"epoch": 2.6094276094276094,
"grad_norm": 5.577755928039551,
"learning_rate": 2.8251821744352933e-07,
"loss": 0.6250026226043701,
"step": 3100
},
{
"epoch": 2.611111111111111,
"grad_norm": 7.03651762008667,
"learning_rate": 2.8182084336584423e-07,
"loss": 0.5582347512245178,
"step": 3102
},
{
"epoch": 2.612794612794613,
"grad_norm": 2.2495877742767334,
"learning_rate": 2.8112628825960926e-07,
"loss": 0.791733980178833,
"step": 3104
},
{
"epoch": 2.6144781144781146,
"grad_norm": 17.0977725982666,
"learning_rate": 2.804345545178594e-07,
"loss": 0.7450399398803711,
"step": 3106
},
{
"epoch": 2.616161616161616,
"grad_norm": 4.711960792541504,
"learning_rate": 2.7974564452390833e-07,
"loss": 0.17849119007587433,
"step": 3108
},
{
"epoch": 2.6178451178451176,
"grad_norm": 10.859472274780273,
"learning_rate": 2.790595606513406e-07,
"loss": 0.7354204654693604,
"step": 3110
},
{
"epoch": 2.6195286195286194,
"grad_norm": 3.239361047744751,
"learning_rate": 2.78376305264004e-07,
"loss": 0.41245055198669434,
"step": 3112
},
{
"epoch": 2.621212121212121,
"grad_norm": 2.8065528869628906,
"learning_rate": 2.776958807160011e-07,
"loss": 0.37273505330085754,
"step": 3114
},
{
"epoch": 2.622895622895623,
"grad_norm": 4.473266124725342,
"learning_rate": 2.7701828935168026e-07,
"loss": 0.8599231243133545,
"step": 3116
},
{
"epoch": 2.6245791245791246,
"grad_norm": 7.686254501342773,
"learning_rate": 2.763435335056291e-07,
"loss": 0.9832479953765869,
"step": 3118
},
{
"epoch": 2.6262626262626263,
"grad_norm": 2.1346304416656494,
"learning_rate": 2.756716155026656e-07,
"loss": 0.5217673778533936,
"step": 3120
},
{
"epoch": 2.627946127946128,
"grad_norm": 3.8724348545074463,
"learning_rate": 2.750025376578295e-07,
"loss": 0.8622322082519531,
"step": 3122
},
{
"epoch": 2.6296296296296298,
"grad_norm": 2.7656431198120117,
"learning_rate": 2.743363022763758e-07,
"loss": 0.8336771726608276,
"step": 3124
},
{
"epoch": 2.6313131313131315,
"grad_norm": 2.994492769241333,
"learning_rate": 2.7367291165376593e-07,
"loss": 0.5954484939575195,
"step": 3126
},
{
"epoch": 2.6329966329966332,
"grad_norm": 6.633072376251221,
"learning_rate": 2.7301236807565925e-07,
"loss": 0.8022388219833374,
"step": 3128
},
{
"epoch": 2.634680134680135,
"grad_norm": 9.094773292541504,
"learning_rate": 2.7235467381790654e-07,
"loss": 0.5048923492431641,
"step": 3130
},
{
"epoch": 2.6363636363636362,
"grad_norm": 5.657838821411133,
"learning_rate": 2.716998311465415e-07,
"loss": 0.2697800397872925,
"step": 3132
},
{
"epoch": 2.638047138047138,
"grad_norm": 4.260385513305664,
"learning_rate": 2.710478423177722e-07,
"loss": 0.8560886383056641,
"step": 3134
},
{
"epoch": 2.6397306397306397,
"grad_norm": 5.333981513977051,
"learning_rate": 2.7039870957797464e-07,
"loss": 0.7351222038269043,
"step": 3136
},
{
"epoch": 2.6414141414141414,
"grad_norm": 8.460240364074707,
"learning_rate": 2.697524351636844e-07,
"loss": 0.41435521841049194,
"step": 3138
},
{
"epoch": 2.643097643097643,
"grad_norm": 4.321287155151367,
"learning_rate": 2.691090213015886e-07,
"loss": 0.9173501133918762,
"step": 3140
},
{
"epoch": 2.644781144781145,
"grad_norm": 4.3384857177734375,
"learning_rate": 2.6846847020851884e-07,
"loss": 0.5904110670089722,
"step": 3142
},
{
"epoch": 2.6464646464646466,
"grad_norm": 6.099468231201172,
"learning_rate": 2.678307840914431e-07,
"loss": 0.8097279071807861,
"step": 3144
},
{
"epoch": 2.648148148148148,
"grad_norm": 3.9057722091674805,
"learning_rate": 2.6719596514745826e-07,
"loss": 0.8938575983047485,
"step": 3146
},
{
"epoch": 2.6498316498316496,
"grad_norm": 8.309523582458496,
"learning_rate": 2.665640155637828e-07,
"loss": 0.5425578355789185,
"step": 3148
},
{
"epoch": 2.6515151515151514,
"grad_norm": 3.0026330947875977,
"learning_rate": 2.659349375177489e-07,
"loss": 0.8360292911529541,
"step": 3150
},
{
"epoch": 2.653198653198653,
"grad_norm": 5.204579830169678,
"learning_rate": 2.6530873317679515e-07,
"loss": 0.2029864341020584,
"step": 3152
},
{
"epoch": 2.654882154882155,
"grad_norm": 23.3417911529541,
"learning_rate": 2.6468540469845895e-07,
"loss": 0.9556988477706909,
"step": 3154
},
{
"epoch": 2.6565656565656566,
"grad_norm": 13.595047950744629,
"learning_rate": 2.640649542303693e-07,
"loss": 0.5114415884017944,
"step": 3156
},
{
"epoch": 2.6582491582491583,
"grad_norm": 6.162187576293945,
"learning_rate": 2.634473839102389e-07,
"loss": 0.39493846893310547,
"step": 3158
},
{
"epoch": 2.65993265993266,
"grad_norm": 43.08856964111328,
"learning_rate": 2.6283269586585737e-07,
"loss": 0.5446680784225464,
"step": 3160
},
{
"epoch": 2.6616161616161618,
"grad_norm": 11.108345031738281,
"learning_rate": 2.6222089221508404e-07,
"loss": 0.6248540282249451,
"step": 3162
},
{
"epoch": 2.6632996632996635,
"grad_norm": 4.680754661560059,
"learning_rate": 2.6161197506583944e-07,
"loss": 0.8368432521820068,
"step": 3164
},
{
"epoch": 2.6649831649831652,
"grad_norm": 7.473052978515625,
"learning_rate": 2.610059465160995e-07,
"loss": 0.619489312171936,
"step": 3166
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.3733127117156982,
"learning_rate": 2.6040280865388773e-07,
"loss": 0.7894487380981445,
"step": 3168
},
{
"epoch": 2.6683501683501682,
"grad_norm": 1.7357522249221802,
"learning_rate": 2.5980256355726744e-07,
"loss": 0.5782526135444641,
"step": 3170
},
{
"epoch": 2.67003367003367,
"grad_norm": 7.880289554595947,
"learning_rate": 2.5920521329433606e-07,
"loss": 1.0222315788269043,
"step": 3172
},
{
"epoch": 2.6717171717171717,
"grad_norm": 3.272036075592041,
"learning_rate": 2.586107599232164e-07,
"loss": 0.9073632955551147,
"step": 3174
},
{
"epoch": 2.6734006734006734,
"grad_norm": 3.847628355026245,
"learning_rate": 2.5801920549205023e-07,
"loss": 0.46630191802978516,
"step": 3176
},
{
"epoch": 2.675084175084175,
"grad_norm": 2.5537798404693604,
"learning_rate": 2.5743055203899167e-07,
"loss": 0.9780217409133911,
"step": 3178
},
{
"epoch": 2.676767676767677,
"grad_norm": 4.765364170074463,
"learning_rate": 2.568448015921996e-07,
"loss": 0.639081597328186,
"step": 3180
},
{
"epoch": 2.678451178451178,
"grad_norm": 5.098658084869385,
"learning_rate": 2.562619561698306e-07,
"loss": 0.7984585762023926,
"step": 3182
},
{
"epoch": 2.68013468013468,
"grad_norm": 2.4715800285339355,
"learning_rate": 2.556820177800324e-07,
"loss": 0.9407286643981934,
"step": 3184
},
{
"epoch": 2.6818181818181817,
"grad_norm": 2.711570978164673,
"learning_rate": 2.551049884209371e-07,
"loss": 0.8115611672401428,
"step": 3186
},
{
"epoch": 2.6835016835016834,
"grad_norm": 9.145926475524902,
"learning_rate": 2.5453087008065307e-07,
"loss": 0.7339519262313843,
"step": 3188
},
{
"epoch": 2.685185185185185,
"grad_norm": 1.2086787223815918,
"learning_rate": 2.5395966473725994e-07,
"loss": 0.49706321954727173,
"step": 3190
},
{
"epoch": 2.686868686868687,
"grad_norm": 14.16477108001709,
"learning_rate": 2.5339137435880043e-07,
"loss": 0.6397048234939575,
"step": 3192
},
{
"epoch": 2.6885521885521886,
"grad_norm": 3.3142552375793457,
"learning_rate": 2.5282600090327383e-07,
"loss": 0.7652658820152283,
"step": 3194
},
{
"epoch": 2.6902356902356903,
"grad_norm": 19.05327606201172,
"learning_rate": 2.5226354631862966e-07,
"loss": 0.6125460863113403,
"step": 3196
},
{
"epoch": 2.691919191919192,
"grad_norm": 4.221333026885986,
"learning_rate": 2.517040125427608e-07,
"loss": 0.7383702397346497,
"step": 3198
},
{
"epoch": 2.6936026936026938,
"grad_norm": 2.8563621044158936,
"learning_rate": 2.511474015034964e-07,
"loss": 0.8494305610656738,
"step": 3200
},
{
"epoch": 2.6952861952861955,
"grad_norm": 3.877546548843384,
"learning_rate": 2.5059371511859557e-07,
"loss": 0.6800326108932495,
"step": 3202
},
{
"epoch": 2.6969696969696972,
"grad_norm": 3.861481189727783,
"learning_rate": 2.50042955295741e-07,
"loss": 0.6918296813964844,
"step": 3204
},
{
"epoch": 2.6986531986531985,
"grad_norm": 9.997620582580566,
"learning_rate": 2.494951239325321e-07,
"loss": 0.6519820094108582,
"step": 3206
},
{
"epoch": 2.7003367003367003,
"grad_norm": 4.166572093963623,
"learning_rate": 2.489502229164781e-07,
"loss": 0.5281827449798584,
"step": 3208
},
{
"epoch": 2.702020202020202,
"grad_norm": 4.448598384857178,
"learning_rate": 2.4840825412499274e-07,
"loss": 0.8719410300254822,
"step": 3210
},
{
"epoch": 2.7037037037037037,
"grad_norm": 4.639568328857422,
"learning_rate": 2.478692194253861e-07,
"loss": 0.5532783269882202,
"step": 3212
},
{
"epoch": 2.7053872053872055,
"grad_norm": 8.537738800048828,
"learning_rate": 2.473331206748597e-07,
"loss": 0.5865626931190491,
"step": 3214
},
{
"epoch": 2.707070707070707,
"grad_norm": 10.096135139465332,
"learning_rate": 2.467999597204996e-07,
"loss": 0.2805863618850708,
"step": 3216
},
{
"epoch": 2.708754208754209,
"grad_norm": 6.932223320007324,
"learning_rate": 2.462697383992691e-07,
"loss": 0.7335485219955444,
"step": 3218
},
{
"epoch": 2.71043771043771,
"grad_norm": 12.214366912841797,
"learning_rate": 2.457424585380041e-07,
"loss": 0.3276599943637848,
"step": 3220
},
{
"epoch": 2.712121212121212,
"grad_norm": 10.359675407409668,
"learning_rate": 2.4521812195340544e-07,
"loss": 0.672775149345398,
"step": 3222
},
{
"epoch": 2.7138047138047137,
"grad_norm": 27.647464752197266,
"learning_rate": 2.4469673045203333e-07,
"loss": 0.40836215019226074,
"step": 3224
},
{
"epoch": 2.7154882154882154,
"grad_norm": 15.687188148498535,
"learning_rate": 2.441782858303007e-07,
"loss": 0.4133344888687134,
"step": 3226
},
{
"epoch": 2.717171717171717,
"grad_norm": 17.905902862548828,
"learning_rate": 2.436627898744678e-07,
"loss": 0.7267272472381592,
"step": 3228
},
{
"epoch": 2.718855218855219,
"grad_norm": 9.417744636535645,
"learning_rate": 2.4315024436063464e-07,
"loss": 0.42516928911209106,
"step": 3230
},
{
"epoch": 2.7205387205387206,
"grad_norm": 8.572908401489258,
"learning_rate": 2.4264065105473637e-07,
"loss": 0.768959641456604,
"step": 3232
},
{
"epoch": 2.7222222222222223,
"grad_norm": 1.9153132438659668,
"learning_rate": 2.4213401171253656e-07,
"loss": 0.6403470039367676,
"step": 3234
},
{
"epoch": 2.723905723905724,
"grad_norm": 5.261312484741211,
"learning_rate": 2.416303280796206e-07,
"loss": 0.7732399106025696,
"step": 3236
},
{
"epoch": 2.725589225589226,
"grad_norm": 3.5602827072143555,
"learning_rate": 2.411296018913907e-07,
"loss": 0.7329007387161255,
"step": 3238
},
{
"epoch": 2.7272727272727275,
"grad_norm": 3.6793055534362793,
"learning_rate": 2.406318348730592e-07,
"loss": 0.7464162111282349,
"step": 3240
},
{
"epoch": 2.728956228956229,
"grad_norm": 2.7270774841308594,
"learning_rate": 2.401370287396428e-07,
"loss": 0.7636083364486694,
"step": 3242
},
{
"epoch": 2.7306397306397305,
"grad_norm": 4.971183776855469,
"learning_rate": 2.396451851959571e-07,
"loss": 0.599960207939148,
"step": 3244
},
{
"epoch": 2.7323232323232323,
"grad_norm": 4.194789886474609,
"learning_rate": 2.391563059366099e-07,
"loss": 0.7824025750160217,
"step": 3246
},
{
"epoch": 2.734006734006734,
"grad_norm": 5.917283535003662,
"learning_rate": 2.3867039264599587e-07,
"loss": 0.8408564329147339,
"step": 3248
},
{
"epoch": 2.7356902356902357,
"grad_norm": 3.7883689403533936,
"learning_rate": 2.3818744699829105e-07,
"loss": 0.6503514051437378,
"step": 3250
},
{
"epoch": 2.7373737373737375,
"grad_norm": 6.666152000427246,
"learning_rate": 2.3770747065744594e-07,
"loss": 0.3846713900566101,
"step": 3252
},
{
"epoch": 2.739057239057239,
"grad_norm": 4.073997497558594,
"learning_rate": 2.3723046527718137e-07,
"loss": 0.5147488713264465,
"step": 3254
},
{
"epoch": 2.7407407407407405,
"grad_norm": 6.8026018142700195,
"learning_rate": 2.367564325009815e-07,
"loss": 0.5139864087104797,
"step": 3256
},
{
"epoch": 2.742424242424242,
"grad_norm": 2.5795681476593018,
"learning_rate": 2.362853739620885e-07,
"loss": 0.5290718078613281,
"step": 3258
},
{
"epoch": 2.744107744107744,
"grad_norm": 14.904226303100586,
"learning_rate": 2.3581729128349745e-07,
"loss": 0.3965787887573242,
"step": 3260
},
{
"epoch": 2.7457912457912457,
"grad_norm": 5.50350284576416,
"learning_rate": 2.3535218607795013e-07,
"loss": 0.6484100222587585,
"step": 3262
},
{
"epoch": 2.7474747474747474,
"grad_norm": 5.252780437469482,
"learning_rate": 2.3489005994792948e-07,
"loss": 0.8430534601211548,
"step": 3264
},
{
"epoch": 2.749158249158249,
"grad_norm": 7.023755073547363,
"learning_rate": 2.3443091448565454e-07,
"loss": 0.957166314125061,
"step": 3266
},
{
"epoch": 2.750841750841751,
"grad_norm": 11.244546890258789,
"learning_rate": 2.339747512730749e-07,
"loss": 0.3728073835372925,
"step": 3268
},
{
"epoch": 2.7525252525252526,
"grad_norm": 3.2135775089263916,
"learning_rate": 2.3352157188186424e-07,
"loss": 0.9523381590843201,
"step": 3270
},
{
"epoch": 2.7542087542087543,
"grad_norm": 7.215963840484619,
"learning_rate": 2.3307137787341667e-07,
"loss": 0.4420832395553589,
"step": 3272
},
{
"epoch": 2.755892255892256,
"grad_norm": 2.81378436088562,
"learning_rate": 2.3262417079883986e-07,
"loss": 0.660933792591095,
"step": 3274
},
{
"epoch": 2.757575757575758,
"grad_norm": 127.56824493408203,
"learning_rate": 2.3217995219895016e-07,
"loss": 0.3062414228916168,
"step": 3276
},
{
"epoch": 2.7592592592592595,
"grad_norm": 0.698665201663971,
"learning_rate": 2.317387236042678e-07,
"loss": 0.021941782906651497,
"step": 3278
},
{
"epoch": 2.760942760942761,
"grad_norm": 4.418609619140625,
"learning_rate": 2.313004865350109e-07,
"loss": 1.040034532546997,
"step": 3280
},
{
"epoch": 2.7626262626262625,
"grad_norm": 3.401939868927002,
"learning_rate": 2.3086524250109045e-07,
"loss": 1.0358326435089111,
"step": 3282
},
{
"epoch": 2.7643097643097643,
"grad_norm": 18.86932945251465,
"learning_rate": 2.3043299300210528e-07,
"loss": 0.23045207560062408,
"step": 3284
},
{
"epoch": 2.765993265993266,
"grad_norm": 3.0848443508148193,
"learning_rate": 2.30003739527337e-07,
"loss": 0.7953276038169861,
"step": 3286
},
{
"epoch": 2.7676767676767677,
"grad_norm": 4.258274078369141,
"learning_rate": 2.2957748355574408e-07,
"loss": 0.7808912396430969,
"step": 3288
},
{
"epoch": 2.7693602693602695,
"grad_norm": 8.350629806518555,
"learning_rate": 2.2915422655595795e-07,
"loss": 0.2024976909160614,
"step": 3290
},
{
"epoch": 2.771043771043771,
"grad_norm": 3.212890386581421,
"learning_rate": 2.287339699862771e-07,
"loss": 0.9757770299911499,
"step": 3292
},
{
"epoch": 2.7727272727272725,
"grad_norm": 4.119185447692871,
"learning_rate": 2.2831671529466205e-07,
"loss": 0.8145531415939331,
"step": 3294
},
{
"epoch": 2.774410774410774,
"grad_norm": 4.300760269165039,
"learning_rate": 2.2790246391873086e-07,
"loss": 0.8364596366882324,
"step": 3296
},
{
"epoch": 2.776094276094276,
"grad_norm": 5.6328630447387695,
"learning_rate": 2.2749121728575393e-07,
"loss": 0.2111830711364746,
"step": 3298
},
{
"epoch": 2.7777777777777777,
"grad_norm": 6.152875900268555,
"learning_rate": 2.2708297681264874e-07,
"loss": 0.4531656801700592,
"step": 3300
},
{
"epoch": 2.7794612794612794,
"grad_norm": 6.0950164794921875,
"learning_rate": 2.2667774390597562e-07,
"loss": 0.486369788646698,
"step": 3302
},
{
"epoch": 2.781144781144781,
"grad_norm": 12.233784675598145,
"learning_rate": 2.2627551996193247e-07,
"loss": 0.4338839054107666,
"step": 3304
},
{
"epoch": 2.782828282828283,
"grad_norm": 11.843306541442871,
"learning_rate": 2.2587630636634985e-07,
"loss": 0.7146729230880737,
"step": 3306
},
{
"epoch": 2.7845117845117846,
"grad_norm": 26.314231872558594,
"learning_rate": 2.2548010449468676e-07,
"loss": 0.426150381565094,
"step": 3308
},
{
"epoch": 2.7861952861952863,
"grad_norm": 5.808564186096191,
"learning_rate": 2.2508691571202528e-07,
"loss": 0.6131501793861389,
"step": 3310
},
{
"epoch": 2.787878787878788,
"grad_norm": 4.843730926513672,
"learning_rate": 2.2469674137306627e-07,
"loss": 0.4474066197872162,
"step": 3312
},
{
"epoch": 2.78956228956229,
"grad_norm": 5.842626571655273,
"learning_rate": 2.2430958282212414e-07,
"loss": 0.676105260848999,
"step": 3314
},
{
"epoch": 2.791245791245791,
"grad_norm": 10.79865550994873,
"learning_rate": 2.239254413931236e-07,
"loss": 0.9383071660995483,
"step": 3316
},
{
"epoch": 2.792929292929293,
"grad_norm": 2.2393341064453125,
"learning_rate": 2.2354431840959307e-07,
"loss": 0.7455552220344543,
"step": 3318
},
{
"epoch": 2.7946127946127945,
"grad_norm": 5.729065895080566,
"learning_rate": 2.2316621518466167e-07,
"loss": 0.28741055727005005,
"step": 3320
},
{
"epoch": 2.7962962962962963,
"grad_norm": 9.186633110046387,
"learning_rate": 2.227911330210542e-07,
"loss": 0.6114668250083923,
"step": 3322
},
{
"epoch": 2.797979797979798,
"grad_norm": 12.35034465789795,
"learning_rate": 2.2241907321108638e-07,
"loss": 0.6540449857711792,
"step": 3324
},
{
"epoch": 2.7996632996632997,
"grad_norm": 2.6777584552764893,
"learning_rate": 2.22050037036661e-07,
"loss": 0.30680525302886963,
"step": 3326
},
{
"epoch": 2.8013468013468015,
"grad_norm": 3.350935697555542,
"learning_rate": 2.216840257692628e-07,
"loss": 0.7153966426849365,
"step": 3328
},
{
"epoch": 2.8030303030303028,
"grad_norm": 2.8656368255615234,
"learning_rate": 2.213210406699547e-07,
"loss": 0.7619553804397583,
"step": 3330
},
{
"epoch": 2.8047138047138045,
"grad_norm": 7.474374294281006,
"learning_rate": 2.209610829893729e-07,
"loss": 0.5717604160308838,
"step": 3332
},
{
"epoch": 2.8063973063973062,
"grad_norm": 8.4893798828125,
"learning_rate": 2.2060415396772337e-07,
"loss": 0.5182145833969116,
"step": 3334
},
{
"epoch": 2.808080808080808,
"grad_norm": 8.64901065826416,
"learning_rate": 2.2025025483477654e-07,
"loss": 0.5500608682632446,
"step": 3336
},
{
"epoch": 2.8097643097643097,
"grad_norm": 2.9587368965148926,
"learning_rate": 2.1989938680986382e-07,
"loss": 0.2802525758743286,
"step": 3338
},
{
"epoch": 2.8114478114478114,
"grad_norm": 7.318872928619385,
"learning_rate": 2.1955155110187344e-07,
"loss": 0.6136119365692139,
"step": 3340
},
{
"epoch": 2.813131313131313,
"grad_norm": 7.030915260314941,
"learning_rate": 2.1920674890924545e-07,
"loss": 0.7545953989028931,
"step": 3342
},
{
"epoch": 2.814814814814815,
"grad_norm": 2.9126713275909424,
"learning_rate": 2.1886498141996858e-07,
"loss": 0.33089566230773926,
"step": 3344
},
{
"epoch": 2.8164983164983166,
"grad_norm": 2.292778968811035,
"learning_rate": 2.185262498115759e-07,
"loss": 0.820242166519165,
"step": 3346
},
{
"epoch": 2.8181818181818183,
"grad_norm": 9.872072219848633,
"learning_rate": 2.1819055525113995e-07,
"loss": 0.4794435501098633,
"step": 3348
},
{
"epoch": 2.81986531986532,
"grad_norm": 6.807747840881348,
"learning_rate": 2.178578988952698e-07,
"loss": 0.8766056299209595,
"step": 3350
},
{
"epoch": 2.821548821548822,
"grad_norm": 11.850113868713379,
"learning_rate": 2.1752828189010677e-07,
"loss": 0.8210408687591553,
"step": 3352
},
{
"epoch": 2.823232323232323,
"grad_norm": 4.237025260925293,
"learning_rate": 2.1720170537132003e-07,
"loss": 0.7889919281005859,
"step": 3354
},
{
"epoch": 2.824915824915825,
"grad_norm": 6.600332736968994,
"learning_rate": 2.16878170464103e-07,
"loss": 0.7373786568641663,
"step": 3356
},
{
"epoch": 2.8265993265993266,
"grad_norm": 3.782309055328369,
"learning_rate": 2.1655767828316967e-07,
"loss": 0.4632776975631714,
"step": 3358
},
{
"epoch": 2.8282828282828283,
"grad_norm": 20.347566604614258,
"learning_rate": 2.1624022993275042e-07,
"loss": 0.47924166917800903,
"step": 3360
},
{
"epoch": 2.82996632996633,
"grad_norm": 3.760439872741699,
"learning_rate": 2.1592582650658838e-07,
"loss": 0.5661218166351318,
"step": 3362
},
{
"epoch": 2.8316498316498318,
"grad_norm": 12.392730712890625,
"learning_rate": 2.1561446908793575e-07,
"loss": 0.5744220018386841,
"step": 3364
},
{
"epoch": 2.8333333333333335,
"grad_norm": 9.636838912963867,
"learning_rate": 2.1530615874954978e-07,
"loss": 0.4627985954284668,
"step": 3366
},
{
"epoch": 2.8350168350168348,
"grad_norm": 21.72933578491211,
"learning_rate": 2.1500089655368913e-07,
"loss": 0.4576794505119324,
"step": 3368
},
{
"epoch": 2.8367003367003365,
"grad_norm": 7.211141586303711,
"learning_rate": 2.146986835521108e-07,
"loss": 0.8104113340377808,
"step": 3370
},
{
"epoch": 2.8383838383838382,
"grad_norm": 3.049208879470825,
"learning_rate": 2.143995207860655e-07,
"loss": 0.6803615093231201,
"step": 3372
},
{
"epoch": 2.84006734006734,
"grad_norm": 15.541363716125488,
"learning_rate": 2.1410340928629483e-07,
"loss": 0.2819385230541229,
"step": 3374
},
{
"epoch": 2.8417508417508417,
"grad_norm": 3.854581832885742,
"learning_rate": 2.138103500730278e-07,
"loss": 0.8866885900497437,
"step": 3376
},
{
"epoch": 2.8434343434343434,
"grad_norm": 2.881070613861084,
"learning_rate": 2.1352034415597635e-07,
"loss": 0.7249988317489624,
"step": 3378
},
{
"epoch": 2.845117845117845,
"grad_norm": 2.772418260574341,
"learning_rate": 2.1323339253433309e-07,
"loss": 0.5438086986541748,
"step": 3380
},
{
"epoch": 2.846801346801347,
"grad_norm": 5.94671106338501,
"learning_rate": 2.1294949619676717e-07,
"loss": 0.5575168132781982,
"step": 3382
},
{
"epoch": 2.8484848484848486,
"grad_norm": 10.924814224243164,
"learning_rate": 2.1266865612142064e-07,
"loss": 0.5616028308868408,
"step": 3384
},
{
"epoch": 2.8501683501683504,
"grad_norm": 4.334954261779785,
"learning_rate": 2.1239087327590582e-07,
"loss": 0.7617322206497192,
"step": 3386
},
{
"epoch": 2.851851851851852,
"grad_norm": 1.0559417009353638,
"learning_rate": 2.121161486173017e-07,
"loss": 0.7200487852096558,
"step": 3388
},
{
"epoch": 2.8535353535353534,
"grad_norm": 8.445873260498047,
"learning_rate": 2.1184448309215015e-07,
"loss": 0.4146542549133301,
"step": 3390
},
{
"epoch": 2.855218855218855,
"grad_norm": 3.8039331436157227,
"learning_rate": 2.1157587763645322e-07,
"loss": 0.46166175603866577,
"step": 3392
},
{
"epoch": 2.856902356902357,
"grad_norm": 6.415493488311768,
"learning_rate": 2.113103331756698e-07,
"loss": 0.930475652217865,
"step": 3394
},
{
"epoch": 2.8585858585858586,
"grad_norm": 3.632256507873535,
"learning_rate": 2.110478506247122e-07,
"loss": 0.9054207801818848,
"step": 3396
},
{
"epoch": 2.8602693602693603,
"grad_norm": 4.30327844619751,
"learning_rate": 2.1078843088794325e-07,
"loss": 0.4588157534599304,
"step": 3398
},
{
"epoch": 2.861952861952862,
"grad_norm": 7.749840259552002,
"learning_rate": 2.105320748591732e-07,
"loss": 0.3445073962211609,
"step": 3400
},
{
"epoch": 2.8636363636363638,
"grad_norm": 12.756885528564453,
"learning_rate": 2.1027878342165624e-07,
"loss": 0.4542715847492218,
"step": 3402
},
{
"epoch": 2.865319865319865,
"grad_norm": 4.2234296798706055,
"learning_rate": 2.1002855744808815e-07,
"loss": 0.38249820470809937,
"step": 3404
},
{
"epoch": 2.8670033670033668,
"grad_norm": 11.025925636291504,
"learning_rate": 2.0978139780060257e-07,
"loss": 0.7736653089523315,
"step": 3406
},
{
"epoch": 2.8686868686868685,
"grad_norm": 6.31485652923584,
"learning_rate": 2.0953730533076862e-07,
"loss": 0.30026775598526,
"step": 3408
},
{
"epoch": 2.8703703703703702,
"grad_norm": 4.0879034996032715,
"learning_rate": 2.0929628087958734e-07,
"loss": 0.7915642261505127,
"step": 3410
},
{
"epoch": 2.872053872053872,
"grad_norm": 8.910355567932129,
"learning_rate": 2.0905832527748953e-07,
"loss": 0.4548564851284027,
"step": 3412
},
{
"epoch": 2.8737373737373737,
"grad_norm": 4.792451858520508,
"learning_rate": 2.0882343934433236e-07,
"loss": 0.6330816745758057,
"step": 3414
},
{
"epoch": 2.8754208754208754,
"grad_norm": 6.679534912109375,
"learning_rate": 2.085916238893966e-07,
"loss": 0.17160841822624207,
"step": 3416
},
{
"epoch": 2.877104377104377,
"grad_norm": 4.708609104156494,
"learning_rate": 2.0836287971138418e-07,
"loss": 0.6133572459220886,
"step": 3418
},
{
"epoch": 2.878787878787879,
"grad_norm": 2.8028249740600586,
"learning_rate": 2.0813720759841492e-07,
"loss": 0.37677788734436035,
"step": 3420
},
{
"epoch": 2.8804713804713806,
"grad_norm": 17.95976448059082,
"learning_rate": 2.0791460832802423e-07,
"loss": 0.6834679841995239,
"step": 3422
},
{
"epoch": 2.8821548821548824,
"grad_norm": 1.99964439868927,
"learning_rate": 2.0769508266716027e-07,
"loss": 0.5820834636688232,
"step": 3424
},
{
"epoch": 2.883838383838384,
"grad_norm": 4.93143367767334,
"learning_rate": 2.0747863137218126e-07,
"loss": 0.6087404489517212,
"step": 3426
},
{
"epoch": 2.8855218855218854,
"grad_norm": 4.417807102203369,
"learning_rate": 2.0726525518885308e-07,
"loss": 0.5436590909957886,
"step": 3428
},
{
"epoch": 2.887205387205387,
"grad_norm": 17.931697845458984,
"learning_rate": 2.0705495485234653e-07,
"loss": 0.28521019220352173,
"step": 3430
},
{
"epoch": 2.888888888888889,
"grad_norm": 4.5258026123046875,
"learning_rate": 2.0684773108723455e-07,
"loss": 0.5188443660736084,
"step": 3432
},
{
"epoch": 2.8905723905723906,
"grad_norm": 7.992106914520264,
"learning_rate": 2.0664358460749018e-07,
"loss": 0.2710973620414734,
"step": 3434
},
{
"epoch": 2.8922558922558923,
"grad_norm": 2.4972705841064453,
"learning_rate": 2.064425161164842e-07,
"loss": 0.9403241872787476,
"step": 3436
},
{
"epoch": 2.893939393939394,
"grad_norm": 7.593927383422852,
"learning_rate": 2.0624452630698195e-07,
"loss": 0.8685269355773926,
"step": 3438
},
{
"epoch": 2.8956228956228958,
"grad_norm": 5.5332746505737305,
"learning_rate": 2.0604961586114163e-07,
"loss": 0.7080799341201782,
"step": 3440
},
{
"epoch": 2.897306397306397,
"grad_norm": 4.279024600982666,
"learning_rate": 2.0585778545051195e-07,
"loss": 0.9225847721099854,
"step": 3442
},
{
"epoch": 2.898989898989899,
"grad_norm": 7.960180282592773,
"learning_rate": 2.0566903573602913e-07,
"loss": 0.26514777541160583,
"step": 3444
},
{
"epoch": 2.9006734006734005,
"grad_norm": 52.408592224121094,
"learning_rate": 2.0548336736801548e-07,
"loss": 0.5182454586029053,
"step": 3446
},
{
"epoch": 2.9023569023569022,
"grad_norm": 3.880129098892212,
"learning_rate": 2.0530078098617668e-07,
"loss": 1.0010104179382324,
"step": 3448
},
{
"epoch": 2.904040404040404,
"grad_norm": 5.750271320343018,
"learning_rate": 2.0512127721959954e-07,
"loss": 0.23654749989509583,
"step": 3450
},
{
"epoch": 2.9057239057239057,
"grad_norm": 4.4567551612854,
"learning_rate": 2.0494485668675003e-07,
"loss": 0.6079249382019043,
"step": 3452
},
{
"epoch": 2.9074074074074074,
"grad_norm": 13.503162384033203,
"learning_rate": 2.0477151999547137e-07,
"loss": 0.5366786122322083,
"step": 3454
},
{
"epoch": 2.909090909090909,
"grad_norm": 3.5950307846069336,
"learning_rate": 2.0460126774298115e-07,
"loss": 0.9563678503036499,
"step": 3456
},
{
"epoch": 2.910774410774411,
"grad_norm": 2.127427339553833,
"learning_rate": 2.044341005158701e-07,
"loss": 0.7329115867614746,
"step": 3458
},
{
"epoch": 2.9124579124579126,
"grad_norm": 10.821589469909668,
"learning_rate": 2.042700188900996e-07,
"loss": 0.9082905054092407,
"step": 3460
},
{
"epoch": 2.9141414141414144,
"grad_norm": 11.092399597167969,
"learning_rate": 2.0410902343099998e-07,
"loss": 1.0648142099380493,
"step": 3462
},
{
"epoch": 2.915824915824916,
"grad_norm": 8.53269100189209,
"learning_rate": 2.039511146932683e-07,
"loss": 0.6280519962310791,
"step": 3464
},
{
"epoch": 2.9175084175084174,
"grad_norm": 4.54081916809082,
"learning_rate": 2.0379629322096658e-07,
"loss": 0.9411839246749878,
"step": 3466
},
{
"epoch": 2.919191919191919,
"grad_norm": 3.7969729900360107,
"learning_rate": 2.036445595475199e-07,
"loss": 0.5461298823356628,
"step": 3468
},
{
"epoch": 2.920875420875421,
"grad_norm": 2.2279632091522217,
"learning_rate": 2.0349591419571473e-07,
"loss": 0.0855223536491394,
"step": 3470
},
{
"epoch": 2.9225589225589226,
"grad_norm": 8.34626293182373,
"learning_rate": 2.0335035767769674e-07,
"loss": 0.6720945835113525,
"step": 3472
},
{
"epoch": 2.9242424242424243,
"grad_norm": 4.789892673492432,
"learning_rate": 2.032078904949694e-07,
"loss": 0.6181377172470093,
"step": 3474
},
{
"epoch": 2.925925925925926,
"grad_norm": 4.624399662017822,
"learning_rate": 2.0306851313839217e-07,
"loss": 0.25879359245300293,
"step": 3476
},
{
"epoch": 2.9276094276094278,
"grad_norm": 6.712757587432861,
"learning_rate": 2.0293222608817862e-07,
"loss": 0.7951024770736694,
"step": 3478
},
{
"epoch": 2.929292929292929,
"grad_norm": 4.2503814697265625,
"learning_rate": 2.0279902981389491e-07,
"loss": 0.4090489447116852,
"step": 3480
},
{
"epoch": 2.930976430976431,
"grad_norm": 4.199467182159424,
"learning_rate": 2.026689247744584e-07,
"loss": 0.7058537602424622,
"step": 3482
},
{
"epoch": 2.9326599326599325,
"grad_norm": 2.017397165298462,
"learning_rate": 2.0254191141813563e-07,
"loss": 0.4949754476547241,
"step": 3484
},
{
"epoch": 2.9343434343434343,
"grad_norm": 2.5312118530273438,
"learning_rate": 2.0241799018254102e-07,
"loss": 0.6103169322013855,
"step": 3486
},
{
"epoch": 2.936026936026936,
"grad_norm": 7.283255577087402,
"learning_rate": 2.0229716149463543e-07,
"loss": 0.5724541544914246,
"step": 3488
},
{
"epoch": 2.9377104377104377,
"grad_norm": 15.510021209716797,
"learning_rate": 2.0217942577072447e-07,
"loss": 0.5570365190505981,
"step": 3490
},
{
"epoch": 2.9393939393939394,
"grad_norm": 15.865419387817383,
"learning_rate": 2.0206478341645734e-07,
"loss": 0.8093217611312866,
"step": 3492
},
{
"epoch": 2.941077441077441,
"grad_norm": 16.84939956665039,
"learning_rate": 2.0195323482682508e-07,
"loss": 0.40408650040626526,
"step": 3494
},
{
"epoch": 2.942760942760943,
"grad_norm": 2.694458246231079,
"learning_rate": 2.0184478038615948e-07,
"loss": 0.6976212859153748,
"step": 3496
},
{
"epoch": 2.9444444444444446,
"grad_norm": 6.089773654937744,
"learning_rate": 2.0173942046813191e-07,
"loss": 0.30283308029174805,
"step": 3498
},
{
"epoch": 2.9461279461279464,
"grad_norm": 17.606487274169922,
"learning_rate": 2.016371554357515e-07,
"loss": 0.6129805445671082,
"step": 3500
},
{
"epoch": 2.9478114478114477,
"grad_norm": 48.08317565917969,
"learning_rate": 2.015379856413643e-07,
"loss": 0.6700767278671265,
"step": 3502
},
{
"epoch": 2.9494949494949494,
"grad_norm": 10.773337364196777,
"learning_rate": 2.01441911426652e-07,
"loss": 0.32376813888549805,
"step": 3504
},
{
"epoch": 2.951178451178451,
"grad_norm": 1.6822550296783447,
"learning_rate": 2.013489331226307e-07,
"loss": 0.6684743762016296,
"step": 3506
},
{
"epoch": 2.952861952861953,
"grad_norm": 4.8438568115234375,
"learning_rate": 2.0125905104964978e-07,
"loss": 0.846743106842041,
"step": 3508
},
{
"epoch": 2.9545454545454546,
"grad_norm": 5.908998012542725,
"learning_rate": 2.0117226551739068e-07,
"loss": 0.6087542772293091,
"step": 3510
},
{
"epoch": 2.9562289562289563,
"grad_norm": 7.448733329772949,
"learning_rate": 2.0108857682486629e-07,
"loss": 0.8167439103126526,
"step": 3512
},
{
"epoch": 2.957912457912458,
"grad_norm": 9.953859329223633,
"learning_rate": 2.0100798526041927e-07,
"loss": 0.304475873708725,
"step": 3514
},
{
"epoch": 2.9595959595959593,
"grad_norm": 5.336069107055664,
"learning_rate": 2.009304911017215e-07,
"loss": 0.8450760841369629,
"step": 3516
},
{
"epoch": 2.961279461279461,
"grad_norm": 3.322150707244873,
"learning_rate": 2.0085609461577295e-07,
"loss": 0.8154351711273193,
"step": 3518
},
{
"epoch": 2.962962962962963,
"grad_norm": 7.335842132568359,
"learning_rate": 2.0078479605890064e-07,
"loss": 0.35378673672676086,
"step": 3520
},
{
"epoch": 2.9646464646464645,
"grad_norm": 4.2547783851623535,
"learning_rate": 2.007165956767584e-07,
"loss": 0.6887914538383484,
"step": 3522
},
{
"epoch": 2.9663299663299663,
"grad_norm": 3.4846153259277344,
"learning_rate": 2.00651493704325e-07,
"loss": 0.22204965353012085,
"step": 3524
},
{
"epoch": 2.968013468013468,
"grad_norm": 20.680572509765625,
"learning_rate": 2.0058949036590426e-07,
"loss": 0.8485254645347595,
"step": 3526
},
{
"epoch": 2.9696969696969697,
"grad_norm": 3.527207851409912,
"learning_rate": 2.0053058587512378e-07,
"loss": 0.7592622637748718,
"step": 3528
},
{
"epoch": 2.9713804713804715,
"grad_norm": 4.903465270996094,
"learning_rate": 2.0047478043493418e-07,
"loss": 0.7468944191932678,
"step": 3530
},
{
"epoch": 2.973063973063973,
"grad_norm": 6.085175514221191,
"learning_rate": 2.004220742376088e-07,
"loss": 0.6274712681770325,
"step": 3532
},
{
"epoch": 2.974747474747475,
"grad_norm": 13.613375663757324,
"learning_rate": 2.0037246746474277e-07,
"loss": 0.19880472123622894,
"step": 3534
},
{
"epoch": 2.9764309764309766,
"grad_norm": 3.277733325958252,
"learning_rate": 2.0032596028725204e-07,
"loss": 0.8517122268676758,
"step": 3536
},
{
"epoch": 2.9781144781144784,
"grad_norm": 9.69018268585205,
"learning_rate": 2.0028255286537355e-07,
"loss": 0.4260925352573395,
"step": 3538
},
{
"epoch": 2.9797979797979797,
"grad_norm": 3.108520269393921,
"learning_rate": 2.0024224534866408e-07,
"loss": 0.9670834541320801,
"step": 3540
},
{
"epoch": 2.9814814814814814,
"grad_norm": 3.3656985759735107,
"learning_rate": 2.0020503787599998e-07,
"loss": 0.8684190511703491,
"step": 3542
},
{
"epoch": 2.983164983164983,
"grad_norm": 5.216827392578125,
"learning_rate": 2.001709305755767e-07,
"loss": 0.4294402599334717,
"step": 3544
},
{
"epoch": 2.984848484848485,
"grad_norm": 3.578760862350464,
"learning_rate": 2.0013992356490827e-07,
"loss": 0.8262860178947449,
"step": 3546
},
{
"epoch": 2.9865319865319866,
"grad_norm": 6.799862861633301,
"learning_rate": 2.0011201695082687e-07,
"loss": 0.39053958654403687,
"step": 3548
},
{
"epoch": 2.9882154882154883,
"grad_norm": 8.427506446838379,
"learning_rate": 2.0008721082948243e-07,
"loss": 0.2766346037387848,
"step": 3550
},
{
"epoch": 2.98989898989899,
"grad_norm": 4.960444927215576,
"learning_rate": 2.0006550528634258e-07,
"loss": 0.5050246715545654,
"step": 3552
},
{
"epoch": 2.9915824915824913,
"grad_norm": 18.282289505004883,
"learning_rate": 2.00046900396192e-07,
"loss": 0.8541325926780701,
"step": 3554
},
{
"epoch": 2.993265993265993,
"grad_norm": 3.258129358291626,
"learning_rate": 2.0003139622313241e-07,
"loss": 0.7546226978302002,
"step": 3556
},
{
"epoch": 2.994949494949495,
"grad_norm": 3.466796398162842,
"learning_rate": 2.0001899282058216e-07,
"loss": 0.6056807041168213,
"step": 3558
},
{
"epoch": 2.9966329966329965,
"grad_norm": 4.726839542388916,
"learning_rate": 2.000096902312762e-07,
"loss": 0.3962956964969635,
"step": 3560
},
{
"epoch": 2.9983164983164983,
"grad_norm": 5.164308071136475,
"learning_rate": 2.0000348848726586e-07,
"loss": 0.5580795407295227,
"step": 3562
},
{
"epoch": 3.0,
"grad_norm": 9.059016227722168,
"learning_rate": 2.0000038760991877e-07,
"loss": 0.46740537881851196,
"step": 3564
},
{
"epoch": 3.0,
"step": 3564,
"total_flos": 4.2988160857187287e+18,
"train_loss": 0.7857236749096433,
"train_runtime": 6229.2125,
"train_samples_per_second": 9.154,
"train_steps_per_second": 0.572
}
],
"logging_steps": 2,
"max_steps": 3564,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.2988160857187287e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}