Text Classification
Safetensors
GLiClass
text classification
nli
sentiment analysis
gliclass-instruct-large-v1.0 / trainer_state.json
Ihor's picture
Upload folder using huggingface_hub
0ef78e5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4063565778971047,
"eval_steps": 500,
"global_step": 70000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005805093969958639,
"grad_norm": 0.7240252494812012,
"learning_rate": 5.746459252379847e-07,
"loss": 0.1079,
"step": 100
},
{
"epoch": 0.0011610187939917278,
"grad_norm": 5.649069786071777,
"learning_rate": 1.1550963547713026e-06,
"loss": 0.1034,
"step": 200
},
{
"epoch": 0.0017415281909875916,
"grad_norm": 2.677654504776001,
"learning_rate": 1.7355467843046206e-06,
"loss": 0.0933,
"step": 300
},
{
"epoch": 0.0023220375879834556,
"grad_norm": 1.4429970979690552,
"learning_rate": 2.3159972138379382e-06,
"loss": 0.0998,
"step": 400
},
{
"epoch": 0.0029025469849793192,
"grad_norm": 1.217147707939148,
"learning_rate": 2.8964476433712563e-06,
"loss": 0.0968,
"step": 500
},
{
"epoch": 0.0034830563819751833,
"grad_norm": 0.10677797347307205,
"learning_rate": 3.476898072904574e-06,
"loss": 0.0903,
"step": 600
},
{
"epoch": 0.004063565778971047,
"grad_norm": 18.24258804321289,
"learning_rate": 4.0573485024378915e-06,
"loss": 0.0898,
"step": 700
},
{
"epoch": 0.004644075175966911,
"grad_norm": 0.49379172921180725,
"learning_rate": 4.63779893197121e-06,
"loss": 0.0741,
"step": 800
},
{
"epoch": 0.0052245845729627744,
"grad_norm": 0.35244712233543396,
"learning_rate": 5.218249361504528e-06,
"loss": 0.0691,
"step": 900
},
{
"epoch": 0.0058050939699586385,
"grad_norm": 1.6648738384246826,
"learning_rate": 5.798699791037845e-06,
"loss": 0.0652,
"step": 1000
},
{
"epoch": 0.0063856033669545025,
"grad_norm": 1.3582569360733032,
"learning_rate": 6.379150220571163e-06,
"loss": 0.0632,
"step": 1100
},
{
"epoch": 0.0069661127639503665,
"grad_norm": 8.560323715209961,
"learning_rate": 6.9596006501044805e-06,
"loss": 0.0573,
"step": 1200
},
{
"epoch": 0.0075466221609462305,
"grad_norm": 1.7866237163543701,
"learning_rate": 7.5400510796378e-06,
"loss": 0.0517,
"step": 1300
},
{
"epoch": 0.008127131557942095,
"grad_norm": 2.1602494716644287,
"learning_rate": 8.120501509171117e-06,
"loss": 0.0531,
"step": 1400
},
{
"epoch": 0.008707640954937958,
"grad_norm": 3.2753777503967285,
"learning_rate": 8.700951938704436e-06,
"loss": 0.0446,
"step": 1500
},
{
"epoch": 0.009288150351933823,
"grad_norm": 7.379251003265381,
"learning_rate": 9.281402368237753e-06,
"loss": 0.0415,
"step": 1600
},
{
"epoch": 0.009868659748929686,
"grad_norm": 2.0278890132904053,
"learning_rate": 9.861852797771071e-06,
"loss": 0.0471,
"step": 1700
},
{
"epoch": 0.010449169145925549,
"grad_norm": 1.9143238067626953,
"learning_rate": 1.0442303227304388e-05,
"loss": 0.0431,
"step": 1800
},
{
"epoch": 0.011029678542921414,
"grad_norm": 3.3644015789031982,
"learning_rate": 1.1022753656837706e-05,
"loss": 0.0388,
"step": 1900
},
{
"epoch": 0.011610187939917277,
"grad_norm": 5.762856960296631,
"learning_rate": 1.1603204086371025e-05,
"loss": 0.0387,
"step": 2000
},
{
"epoch": 0.012190697336913142,
"grad_norm": 3.5617690086364746,
"learning_rate": 1.2183654515904343e-05,
"loss": 0.0437,
"step": 2100
},
{
"epoch": 0.012771206733909005,
"grad_norm": 4.118985176086426,
"learning_rate": 1.276410494543766e-05,
"loss": 0.0368,
"step": 2200
},
{
"epoch": 0.01335171613090487,
"grad_norm": 0.7795373201370239,
"learning_rate": 1.3344555374970977e-05,
"loss": 0.0441,
"step": 2300
},
{
"epoch": 0.013932225527900733,
"grad_norm": 6.839756011962891,
"learning_rate": 1.3925005804504295e-05,
"loss": 0.0362,
"step": 2400
},
{
"epoch": 0.014512734924896596,
"grad_norm": 0.0,
"learning_rate": 1.4505456234037616e-05,
"loss": 0.0355,
"step": 2500
},
{
"epoch": 0.015093244321892461,
"grad_norm": 0.7248427271842957,
"learning_rate": 1.508590666357093e-05,
"loss": 0.0352,
"step": 2600
},
{
"epoch": 0.015673753718888326,
"grad_norm": 1.2530128955841064,
"learning_rate": 1.566635709310425e-05,
"loss": 0.0337,
"step": 2700
},
{
"epoch": 0.01625426311588419,
"grad_norm": 1.8551280498504639,
"learning_rate": 1.6246807522637568e-05,
"loss": 0.0392,
"step": 2800
},
{
"epoch": 0.016834772512880052,
"grad_norm": 1.7844129800796509,
"learning_rate": 1.6827257952170884e-05,
"loss": 0.0327,
"step": 2900
},
{
"epoch": 0.017415281909875915,
"grad_norm": 1.0492717027664185,
"learning_rate": 1.74077083817042e-05,
"loss": 0.0326,
"step": 3000
},
{
"epoch": 0.01799579130687178,
"grad_norm": 1.5272563695907593,
"learning_rate": 1.798815881123752e-05,
"loss": 0.0363,
"step": 3100
},
{
"epoch": 0.018576300703867645,
"grad_norm": 1.028568983078003,
"learning_rate": 1.856860924077084e-05,
"loss": 0.0326,
"step": 3200
},
{
"epoch": 0.01915681010086351,
"grad_norm": 3.0390264987945557,
"learning_rate": 1.9149059670304155e-05,
"loss": 0.0327,
"step": 3300
},
{
"epoch": 0.01973731949785937,
"grad_norm": 0.0,
"learning_rate": 1.9729510099837475e-05,
"loss": 0.034,
"step": 3400
},
{
"epoch": 0.020317828894855235,
"grad_norm": 3.5412075519561768,
"learning_rate": 2.0309960529370792e-05,
"loss": 0.0301,
"step": 3500
},
{
"epoch": 0.020898338291851098,
"grad_norm": 63.408573150634766,
"learning_rate": 2.0890410958904112e-05,
"loss": 0.0391,
"step": 3600
},
{
"epoch": 0.021478847688846964,
"grad_norm": 2.8030202388763428,
"learning_rate": 2.147086138843743e-05,
"loss": 0.0269,
"step": 3700
},
{
"epoch": 0.022059357085842828,
"grad_norm": 9.339249610900879,
"learning_rate": 2.2051311817970746e-05,
"loss": 0.0322,
"step": 3800
},
{
"epoch": 0.02263986648283869,
"grad_norm": 1.6153712272644043,
"learning_rate": 2.2631762247504066e-05,
"loss": 0.0347,
"step": 3900
},
{
"epoch": 0.023220375879834554,
"grad_norm": 2.073003053665161,
"learning_rate": 2.3212212677037383e-05,
"loss": 0.0252,
"step": 4000
},
{
"epoch": 0.023800885276830417,
"grad_norm": 1.535114049911499,
"learning_rate": 2.37926631065707e-05,
"loss": 0.0319,
"step": 4100
},
{
"epoch": 0.024381394673826284,
"grad_norm": 2.764871120452881,
"learning_rate": 2.4373113536104016e-05,
"loss": 0.0316,
"step": 4200
},
{
"epoch": 0.024961904070822147,
"grad_norm": 1.169055700302124,
"learning_rate": 2.4953563965637336e-05,
"loss": 0.0289,
"step": 4300
},
{
"epoch": 0.02554241346781801,
"grad_norm": 0.0,
"learning_rate": 2.5534014395170653e-05,
"loss": 0.0301,
"step": 4400
},
{
"epoch": 0.026122922864813873,
"grad_norm": 5.054708957672119,
"learning_rate": 2.611446482470397e-05,
"loss": 0.0276,
"step": 4500
},
{
"epoch": 0.02670343226180974,
"grad_norm": 3.1561946868896484,
"learning_rate": 2.669491525423729e-05,
"loss": 0.0284,
"step": 4600
},
{
"epoch": 0.027283941658805603,
"grad_norm": 1.3254655599594116,
"learning_rate": 2.727536568377061e-05,
"loss": 0.029,
"step": 4700
},
{
"epoch": 0.027864451055801466,
"grad_norm": 15.622490882873535,
"learning_rate": 2.7855816113303924e-05,
"loss": 0.0319,
"step": 4800
},
{
"epoch": 0.02844496045279733,
"grad_norm": 11.673898696899414,
"learning_rate": 2.8436266542837244e-05,
"loss": 0.0311,
"step": 4900
},
{
"epoch": 0.029025469849793192,
"grad_norm": 3.8026087284088135,
"learning_rate": 2.901671697237056e-05,
"loss": 0.0273,
"step": 5000
},
{
"epoch": 0.02960597924678906,
"grad_norm": 0.24046263098716736,
"learning_rate": 2.959716740190388e-05,
"loss": 0.0283,
"step": 5100
},
{
"epoch": 0.030186488643784922,
"grad_norm": 3.2492616176605225,
"learning_rate": 3.01776178314372e-05,
"loss": 0.0288,
"step": 5200
},
{
"epoch": 0.030766998040780785,
"grad_norm": 5.840951919555664,
"learning_rate": 3.075806826097051e-05,
"loss": 0.0267,
"step": 5300
},
{
"epoch": 0.03134750743777665,
"grad_norm": 4.141648769378662,
"learning_rate": 3.1338518690503834e-05,
"loss": 0.031,
"step": 5400
},
{
"epoch": 0.03192801683477251,
"grad_norm": 0.6267948746681213,
"learning_rate": 3.191896912003715e-05,
"loss": 0.027,
"step": 5500
},
{
"epoch": 0.03250852623176838,
"grad_norm": 4.212204933166504,
"learning_rate": 3.249941954957047e-05,
"loss": 0.0312,
"step": 5600
},
{
"epoch": 0.03308903562876424,
"grad_norm": 9.002190589904785,
"learning_rate": 3.3079869979103785e-05,
"loss": 0.0328,
"step": 5700
},
{
"epoch": 0.033669545025760104,
"grad_norm": 5.377740383148193,
"learning_rate": 3.36603204086371e-05,
"loss": 0.0289,
"step": 5800
},
{
"epoch": 0.03425005442275597,
"grad_norm": 4.514215469360352,
"learning_rate": 3.4240770838170425e-05,
"loss": 0.0288,
"step": 5900
},
{
"epoch": 0.03483056381975183,
"grad_norm": 11.520332336425781,
"learning_rate": 3.482122126770374e-05,
"loss": 0.0315,
"step": 6000
},
{
"epoch": 0.0354110732167477,
"grad_norm": 2.2946815490722656,
"learning_rate": 3.540167169723706e-05,
"loss": 0.0289,
"step": 6100
},
{
"epoch": 0.03599158261374356,
"grad_norm": 2.24802565574646,
"learning_rate": 3.5982122126770375e-05,
"loss": 0.0233,
"step": 6200
},
{
"epoch": 0.036572092010739424,
"grad_norm": 0.0,
"learning_rate": 3.656257255630369e-05,
"loss": 0.0258,
"step": 6300
},
{
"epoch": 0.03715260140773529,
"grad_norm": 0.0,
"learning_rate": 3.714302298583701e-05,
"loss": 0.0279,
"step": 6400
},
{
"epoch": 0.03773311080473115,
"grad_norm": 0.2940079867839813,
"learning_rate": 3.772347341537033e-05,
"loss": 0.0239,
"step": 6500
},
{
"epoch": 0.03831362020172702,
"grad_norm": 1.4886199235916138,
"learning_rate": 3.830392384490365e-05,
"loss": 0.0284,
"step": 6600
},
{
"epoch": 0.038894129598722876,
"grad_norm": 2.378464698791504,
"learning_rate": 3.8884374274436966e-05,
"loss": 0.0292,
"step": 6700
},
{
"epoch": 0.03947463899571874,
"grad_norm": 1.4737799167633057,
"learning_rate": 3.946482470397028e-05,
"loss": 0.0252,
"step": 6800
},
{
"epoch": 0.04005514839271461,
"grad_norm": 1.7435976266860962,
"learning_rate": 4.00452751335036e-05,
"loss": 0.0252,
"step": 6900
},
{
"epoch": 0.04063565778971047,
"grad_norm": 3.012014627456665,
"learning_rate": 4.062572556303692e-05,
"loss": 0.0341,
"step": 7000
},
{
"epoch": 0.041216167186706336,
"grad_norm": 1.463402509689331,
"learning_rate": 4.120617599257023e-05,
"loss": 0.0242,
"step": 7100
},
{
"epoch": 0.041796676583702196,
"grad_norm": 2.936508893966675,
"learning_rate": 4.178662642210355e-05,
"loss": 0.0303,
"step": 7200
},
{
"epoch": 0.04237718598069806,
"grad_norm": 2.3473055362701416,
"learning_rate": 4.2367076851636874e-05,
"loss": 0.0268,
"step": 7300
},
{
"epoch": 0.04295769537769393,
"grad_norm": 3.3722922801971436,
"learning_rate": 4.294752728117019e-05,
"loss": 0.0212,
"step": 7400
},
{
"epoch": 0.04353820477468979,
"grad_norm": 3.0405075550079346,
"learning_rate": 4.352797771070351e-05,
"loss": 0.0277,
"step": 7500
},
{
"epoch": 0.044118714171685655,
"grad_norm": 1.044071912765503,
"learning_rate": 4.4108428140236824e-05,
"loss": 0.0225,
"step": 7600
},
{
"epoch": 0.044699223568681515,
"grad_norm": 1.0134261846542358,
"learning_rate": 4.468887856977014e-05,
"loss": 0.0253,
"step": 7700
},
{
"epoch": 0.04527973296567738,
"grad_norm": 9.729911804199219,
"learning_rate": 4.5269328999303464e-05,
"loss": 0.0323,
"step": 7800
},
{
"epoch": 0.04586024236267325,
"grad_norm": 2.1204724311828613,
"learning_rate": 4.584977942883678e-05,
"loss": 0.0279,
"step": 7900
},
{
"epoch": 0.04644075175966911,
"grad_norm": 9.481526374816895,
"learning_rate": 4.64302298583701e-05,
"loss": 0.031,
"step": 8000
},
{
"epoch": 0.047021261156664974,
"grad_norm": 0.6424680948257446,
"learning_rate": 4.7010680287903415e-05,
"loss": 0.0278,
"step": 8100
},
{
"epoch": 0.047601770553660834,
"grad_norm": 2.0485119819641113,
"learning_rate": 4.759113071743673e-05,
"loss": 0.0261,
"step": 8200
},
{
"epoch": 0.0481822799506567,
"grad_norm": 2.485046148300171,
"learning_rate": 4.8171581146970055e-05,
"loss": 0.0257,
"step": 8300
},
{
"epoch": 0.04876278934765257,
"grad_norm": 1.421764612197876,
"learning_rate": 4.875203157650337e-05,
"loss": 0.0312,
"step": 8400
},
{
"epoch": 0.04934329874464843,
"grad_norm": 1.3517789840698242,
"learning_rate": 4.933248200603668e-05,
"loss": 0.0325,
"step": 8500
},
{
"epoch": 0.049923808141644294,
"grad_norm": 0.42899224162101746,
"learning_rate": 4.9912932435570005e-05,
"loss": 0.0225,
"step": 8600
},
{
"epoch": 0.05050431753864015,
"grad_norm": 1.7937917709350586,
"learning_rate": 4.9999966717127464e-05,
"loss": 0.0289,
"step": 8700
},
{
"epoch": 0.05108482693563602,
"grad_norm": 0.9099732041358948,
"learning_rate": 4.9999842338357364e-05,
"loss": 0.0277,
"step": 8800
},
{
"epoch": 0.051665336332631887,
"grad_norm": 1.6560391187667847,
"learning_rate": 4.999962582765702e-05,
"loss": 0.0218,
"step": 8900
},
{
"epoch": 0.052245845729627746,
"grad_norm": 1.163989543914795,
"learning_rate": 4.999931718582432e-05,
"loss": 0.0225,
"step": 9000
},
{
"epoch": 0.05282635512662361,
"grad_norm": 7.1312971115112305,
"learning_rate": 4.9998916413996715e-05,
"loss": 0.0301,
"step": 9100
},
{
"epoch": 0.05340686452361948,
"grad_norm": 10.158438682556152,
"learning_rate": 4.999842351365117e-05,
"loss": 0.0241,
"step": 9200
},
{
"epoch": 0.05398737392061534,
"grad_norm": 0.19601650536060333,
"learning_rate": 4.999783848660417e-05,
"loss": 0.026,
"step": 9300
},
{
"epoch": 0.054567883317611206,
"grad_norm": 0.716726541519165,
"learning_rate": 4.999716133501171e-05,
"loss": 0.0247,
"step": 9400
},
{
"epoch": 0.055148392714607065,
"grad_norm": 2.3324790000915527,
"learning_rate": 4.99963920613693e-05,
"loss": 0.0237,
"step": 9500
},
{
"epoch": 0.05572890211160293,
"grad_norm": 5.88576078414917,
"learning_rate": 4.9995530668511946e-05,
"loss": 0.0213,
"step": 9600
},
{
"epoch": 0.0563094115085988,
"grad_norm": 0.17054684460163116,
"learning_rate": 4.9994577159614144e-05,
"loss": 0.0225,
"step": 9700
},
{
"epoch": 0.05688992090559466,
"grad_norm": 1.9155703783035278,
"learning_rate": 4.9993531538189854e-05,
"loss": 0.0292,
"step": 9800
},
{
"epoch": 0.057470430302590525,
"grad_norm": 0.0,
"learning_rate": 4.99923938080925e-05,
"loss": 0.025,
"step": 9900
},
{
"epoch": 0.058050939699586385,
"grad_norm": 3.8306055068969727,
"learning_rate": 4.999116397351497e-05,
"loss": 0.0257,
"step": 10000
},
{
"epoch": 0.05863144909658225,
"grad_norm": 2.8682026863098145,
"learning_rate": 4.998984203898957e-05,
"loss": 0.0276,
"step": 10100
},
{
"epoch": 0.05921195849357812,
"grad_norm": 1.6766282320022583,
"learning_rate": 4.9988428009388026e-05,
"loss": 0.0261,
"step": 10200
},
{
"epoch": 0.05979246789057398,
"grad_norm": 1.1323601007461548,
"learning_rate": 4.998692188992147e-05,
"loss": 0.0219,
"step": 10300
},
{
"epoch": 0.060372977287569844,
"grad_norm": 1.3007240295410156,
"learning_rate": 4.998532368614038e-05,
"loss": 0.0321,
"step": 10400
},
{
"epoch": 0.060953486684565704,
"grad_norm": 0.4534103572368622,
"learning_rate": 4.998363340393465e-05,
"loss": 0.0271,
"step": 10500
},
{
"epoch": 0.06153399608156157,
"grad_norm": 1.0366442203521729,
"learning_rate": 4.9981851049533446e-05,
"loss": 0.0212,
"step": 10600
},
{
"epoch": 0.06211450547855744,
"grad_norm": 0.7673536539077759,
"learning_rate": 4.9979976629505305e-05,
"loss": 0.0214,
"step": 10700
},
{
"epoch": 0.0626950148755533,
"grad_norm": 6.375418186187744,
"learning_rate": 4.9978010150758016e-05,
"loss": 0.0241,
"step": 10800
},
{
"epoch": 0.06327552427254916,
"grad_norm": 2.6672778129577637,
"learning_rate": 4.9975951620538644e-05,
"loss": 0.023,
"step": 10900
},
{
"epoch": 0.06385603366954502,
"grad_norm": 0.0,
"learning_rate": 4.9973801046433494e-05,
"loss": 0.0238,
"step": 11000
},
{
"epoch": 0.06443654306654088,
"grad_norm": 0.47239989042282104,
"learning_rate": 4.997155843636808e-05,
"loss": 0.023,
"step": 11100
},
{
"epoch": 0.06501705246353676,
"grad_norm": 2.6296310424804688,
"learning_rate": 4.996922379860708e-05,
"loss": 0.0247,
"step": 11200
},
{
"epoch": 0.06559756186053262,
"grad_norm": 1.4962682723999023,
"learning_rate": 4.996679714175436e-05,
"loss": 0.0217,
"step": 11300
},
{
"epoch": 0.06617807125752848,
"grad_norm": 1.51057767868042,
"learning_rate": 4.996427847475286e-05,
"loss": 0.0249,
"step": 11400
},
{
"epoch": 0.06675858065452435,
"grad_norm": 0.9044837951660156,
"learning_rate": 4.9961667806884625e-05,
"loss": 0.0274,
"step": 11500
},
{
"epoch": 0.06733909005152021,
"grad_norm": 1.1930240392684937,
"learning_rate": 4.9958965147770764e-05,
"loss": 0.0249,
"step": 11600
},
{
"epoch": 0.06791959944851607,
"grad_norm": 0.13868218660354614,
"learning_rate": 4.995617050737138e-05,
"loss": 0.0285,
"step": 11700
},
{
"epoch": 0.06850010884551194,
"grad_norm": 1.4061717987060547,
"learning_rate": 4.995328389598556e-05,
"loss": 0.0235,
"step": 11800
},
{
"epoch": 0.0690806182425078,
"grad_norm": 3.641000509262085,
"learning_rate": 4.995030532425134e-05,
"loss": 0.0179,
"step": 11900
},
{
"epoch": 0.06966112763950366,
"grad_norm": 0.7060804963111877,
"learning_rate": 4.994723480314565e-05,
"loss": 0.0205,
"step": 12000
},
{
"epoch": 0.07024163703649954,
"grad_norm": 0.5786570310592651,
"learning_rate": 4.994407234398427e-05,
"loss": 0.0236,
"step": 12100
},
{
"epoch": 0.0708221464334954,
"grad_norm": 1.2352372407913208,
"learning_rate": 4.994081795842183e-05,
"loss": 0.0217,
"step": 12200
},
{
"epoch": 0.07140265583049125,
"grad_norm": 4.269515037536621,
"learning_rate": 4.9937471658451715e-05,
"loss": 0.0207,
"step": 12300
},
{
"epoch": 0.07198316522748711,
"grad_norm": 0.17409491539001465,
"learning_rate": 4.9934033456406035e-05,
"loss": 0.0186,
"step": 12400
},
{
"epoch": 0.07256367462448299,
"grad_norm": 1.2208822965621948,
"learning_rate": 4.993050336495562e-05,
"loss": 0.0261,
"step": 12500
},
{
"epoch": 0.07314418402147885,
"grad_norm": 0.0,
"learning_rate": 4.9926881397109896e-05,
"loss": 0.0285,
"step": 12600
},
{
"epoch": 0.07372469341847471,
"grad_norm": 7.06033992767334,
"learning_rate": 4.99231675662169e-05,
"loss": 0.0202,
"step": 12700
},
{
"epoch": 0.07430520281547058,
"grad_norm": 0.0,
"learning_rate": 4.9919361885963234e-05,
"loss": 0.0276,
"step": 12800
},
{
"epoch": 0.07488571221246644,
"grad_norm": 0.406585693359375,
"learning_rate": 4.991546437037396e-05,
"loss": 0.02,
"step": 12900
},
{
"epoch": 0.0754662216094623,
"grad_norm": 0.26512107253074646,
"learning_rate": 4.9911475033812596e-05,
"loss": 0.0234,
"step": 13000
},
{
"epoch": 0.07604673100645817,
"grad_norm": 1.349731206893921,
"learning_rate": 4.990739389098105e-05,
"loss": 0.027,
"step": 13100
},
{
"epoch": 0.07662724040345403,
"grad_norm": 4.345367908477783,
"learning_rate": 4.990322095691956e-05,
"loss": 0.02,
"step": 13200
},
{
"epoch": 0.07720774980044989,
"grad_norm": 0.672780454158783,
"learning_rate": 4.9898956247006636e-05,
"loss": 0.0199,
"step": 13300
},
{
"epoch": 0.07778825919744575,
"grad_norm": 0.8305972218513489,
"learning_rate": 4.9894599776959015e-05,
"loss": 0.0217,
"step": 13400
},
{
"epoch": 0.07836876859444163,
"grad_norm": 0.0,
"learning_rate": 4.9890151562831606e-05,
"loss": 0.0245,
"step": 13500
},
{
"epoch": 0.07894927799143749,
"grad_norm": 0.9174224734306335,
"learning_rate": 4.9885611621017403e-05,
"loss": 0.0184,
"step": 13600
},
{
"epoch": 0.07952978738843335,
"grad_norm": 5.607458114624023,
"learning_rate": 4.988097996824746e-05,
"loss": 0.0196,
"step": 13700
},
{
"epoch": 0.08011029678542922,
"grad_norm": 0.9925137162208557,
"learning_rate": 4.987625662159083e-05,
"loss": 0.021,
"step": 13800
},
{
"epoch": 0.08069080618242508,
"grad_norm": 5.234767436981201,
"learning_rate": 4.987144159845443e-05,
"loss": 0.0226,
"step": 13900
},
{
"epoch": 0.08127131557942094,
"grad_norm": 0.462223082780838,
"learning_rate": 4.986653491658309e-05,
"loss": 0.0201,
"step": 14000
},
{
"epoch": 0.08185182497641681,
"grad_norm": 0.5672245025634766,
"learning_rate": 4.986153659405939e-05,
"loss": 0.0173,
"step": 14100
},
{
"epoch": 0.08243233437341267,
"grad_norm": 7.8867011070251465,
"learning_rate": 4.985644664930367e-05,
"loss": 0.0173,
"step": 14200
},
{
"epoch": 0.08301284377040853,
"grad_norm": 0.7449125647544861,
"learning_rate": 4.9851265101073886e-05,
"loss": 0.024,
"step": 14300
},
{
"epoch": 0.08359335316740439,
"grad_norm": 0.16529901325702667,
"learning_rate": 4.984599196846562e-05,
"loss": 0.0227,
"step": 14400
},
{
"epoch": 0.08417386256440026,
"grad_norm": 11.49720287322998,
"learning_rate": 4.9840627270911934e-05,
"loss": 0.0232,
"step": 14500
},
{
"epoch": 0.08475437196139612,
"grad_norm": 1.724212884902954,
"learning_rate": 4.9835171028183355e-05,
"loss": 0.0222,
"step": 14600
},
{
"epoch": 0.08533488135839198,
"grad_norm": 0.8492684364318848,
"learning_rate": 4.982962326038778e-05,
"loss": 0.0202,
"step": 14700
},
{
"epoch": 0.08591539075538786,
"grad_norm": 1.6152923107147217,
"learning_rate": 4.9823983987970396e-05,
"loss": 0.0195,
"step": 14800
},
{
"epoch": 0.08649590015238372,
"grad_norm": 0.52412348985672,
"learning_rate": 4.981825323171362e-05,
"loss": 0.0206,
"step": 14900
},
{
"epoch": 0.08707640954937958,
"grad_norm": 0.41687363386154175,
"learning_rate": 4.9812431012737006e-05,
"loss": 0.023,
"step": 15000
},
{
"epoch": 0.08765691894637545,
"grad_norm": 1.0384039878845215,
"learning_rate": 4.9806517352497184e-05,
"loss": 0.0244,
"step": 15100
},
{
"epoch": 0.08823742834337131,
"grad_norm": 5.219573974609375,
"learning_rate": 4.980051227278777e-05,
"loss": 0.0209,
"step": 15200
},
{
"epoch": 0.08881793774036717,
"grad_norm": 2.8465518951416016,
"learning_rate": 4.979441579573928e-05,
"loss": 0.0253,
"step": 15300
},
{
"epoch": 0.08939844713736303,
"grad_norm": 7.259213924407959,
"learning_rate": 4.978822794381908e-05,
"loss": 0.0246,
"step": 15400
},
{
"epoch": 0.0899789565343589,
"grad_norm": 1.2073447704315186,
"learning_rate": 4.978194873983124e-05,
"loss": 0.0168,
"step": 15500
},
{
"epoch": 0.09055946593135476,
"grad_norm": 3.7044851779937744,
"learning_rate": 4.977557820691653e-05,
"loss": 0.0188,
"step": 15600
},
{
"epoch": 0.09113997532835062,
"grad_norm": 0.4414062798023224,
"learning_rate": 4.976911636855227e-05,
"loss": 0.0224,
"step": 15700
},
{
"epoch": 0.0917204847253465,
"grad_norm": 2.013897657394409,
"learning_rate": 4.976256324855227e-05,
"loss": 0.0198,
"step": 15800
},
{
"epoch": 0.09230099412234236,
"grad_norm": 0.45843204855918884,
"learning_rate": 4.975591887106677e-05,
"loss": 0.0176,
"step": 15900
},
{
"epoch": 0.09288150351933822,
"grad_norm": 1.0656216144561768,
"learning_rate": 4.9749183260582274e-05,
"loss": 0.0249,
"step": 16000
},
{
"epoch": 0.09346201291633409,
"grad_norm": 0.3733506500720978,
"learning_rate": 4.9742356441921544e-05,
"loss": 0.0203,
"step": 16100
},
{
"epoch": 0.09404252231332995,
"grad_norm": 0.9329887628555298,
"learning_rate": 4.973543844024345e-05,
"loss": 0.0218,
"step": 16200
},
{
"epoch": 0.09462303171032581,
"grad_norm": 4.0852251052856445,
"learning_rate": 4.972842928104291e-05,
"loss": 0.027,
"step": 16300
},
{
"epoch": 0.09520354110732167,
"grad_norm": 0.3162221610546112,
"learning_rate": 4.9721328990150776e-05,
"loss": 0.0225,
"step": 16400
},
{
"epoch": 0.09578405050431754,
"grad_norm": 0.2578160762786865,
"learning_rate": 4.971413759373376e-05,
"loss": 0.0176,
"step": 16500
},
{
"epoch": 0.0963645599013134,
"grad_norm": 0.3880905210971832,
"learning_rate": 4.970685511829432e-05,
"loss": 0.0183,
"step": 16600
},
{
"epoch": 0.09694506929830926,
"grad_norm": 1.3224152326583862,
"learning_rate": 4.969948159067056e-05,
"loss": 0.0202,
"step": 16700
},
{
"epoch": 0.09752557869530513,
"grad_norm": 1.7293118238449097,
"learning_rate": 4.969201703803614e-05,
"loss": 0.0234,
"step": 16800
},
{
"epoch": 0.098106088092301,
"grad_norm": 1.8660351037979126,
"learning_rate": 4.9684461487900195e-05,
"loss": 0.0207,
"step": 16900
},
{
"epoch": 0.09868659748929685,
"grad_norm": 2.0726287364959717,
"learning_rate": 4.967681496810719e-05,
"loss": 0.0218,
"step": 17000
},
{
"epoch": 0.09926710688629273,
"grad_norm": 6.986308574676514,
"learning_rate": 4.966907750683684e-05,
"loss": 0.0194,
"step": 17100
},
{
"epoch": 0.09984761628328859,
"grad_norm": 0.9702723622322083,
"learning_rate": 4.966124913260402e-05,
"loss": 0.022,
"step": 17200
},
{
"epoch": 0.10042812568028445,
"grad_norm": 0.1596653163433075,
"learning_rate": 4.9653329874258647e-05,
"loss": 0.0195,
"step": 17300
},
{
"epoch": 0.1010086350772803,
"grad_norm": 4.516726970672607,
"learning_rate": 4.964531976098556e-05,
"loss": 0.0216,
"step": 17400
},
{
"epoch": 0.10158914447427618,
"grad_norm": 3.8601644039154053,
"learning_rate": 4.9637218822304446e-05,
"loss": 0.0211,
"step": 17500
},
{
"epoch": 0.10216965387127204,
"grad_norm": 0.3691064715385437,
"learning_rate": 4.962902708806968e-05,
"loss": 0.0237,
"step": 17600
},
{
"epoch": 0.1027501632682679,
"grad_norm": 1.635680913925171,
"learning_rate": 4.9620744588470256e-05,
"loss": 0.0229,
"step": 17700
},
{
"epoch": 0.10333067266526377,
"grad_norm": 0.3783847391605377,
"learning_rate": 4.9612371354029706e-05,
"loss": 0.0167,
"step": 17800
},
{
"epoch": 0.10391118206225963,
"grad_norm": 1.172295093536377,
"learning_rate": 4.96039074156059e-05,
"loss": 0.0217,
"step": 17900
},
{
"epoch": 0.10449169145925549,
"grad_norm": 8.094454765319824,
"learning_rate": 4.959535280439098e-05,
"loss": 0.019,
"step": 18000
},
{
"epoch": 0.10507220085625137,
"grad_norm": 0.4028318524360657,
"learning_rate": 4.958670755191127e-05,
"loss": 0.0234,
"step": 18100
},
{
"epoch": 0.10565271025324723,
"grad_norm": 0.5673860907554626,
"learning_rate": 4.9577971690027136e-05,
"loss": 0.0214,
"step": 18200
},
{
"epoch": 0.10623321965024309,
"grad_norm": 0.0952591523528099,
"learning_rate": 4.956914525093283e-05,
"loss": 0.0195,
"step": 18300
},
{
"epoch": 0.10681372904723896,
"grad_norm": 0.11975416541099548,
"learning_rate": 4.9560228267156445e-05,
"loss": 0.0214,
"step": 18400
},
{
"epoch": 0.10739423844423482,
"grad_norm": 0.31427842378616333,
"learning_rate": 4.955122077155974e-05,
"loss": 0.0204,
"step": 18500
},
{
"epoch": 0.10797474784123068,
"grad_norm": 0.2761117219924927,
"learning_rate": 4.9542122797338054e-05,
"loss": 0.018,
"step": 18600
},
{
"epoch": 0.10855525723822654,
"grad_norm": 0.0,
"learning_rate": 4.953293437802014e-05,
"loss": 0.0203,
"step": 18700
},
{
"epoch": 0.10913576663522241,
"grad_norm": 1.229196548461914,
"learning_rate": 4.9523655547468095e-05,
"loss": 0.0209,
"step": 18800
},
{
"epoch": 0.10971627603221827,
"grad_norm": 1.3233908414840698,
"learning_rate": 4.951428633987719e-05,
"loss": 0.0192,
"step": 18900
},
{
"epoch": 0.11029678542921413,
"grad_norm": 4.7784857749938965,
"learning_rate": 4.950482678977577e-05,
"loss": 0.021,
"step": 19000
},
{
"epoch": 0.11087729482621,
"grad_norm": 0.059221718460321426,
"learning_rate": 4.949527693202513e-05,
"loss": 0.0232,
"step": 19100
},
{
"epoch": 0.11145780422320586,
"grad_norm": 1.453174352645874,
"learning_rate": 4.9485636801819356e-05,
"loss": 0.0222,
"step": 19200
},
{
"epoch": 0.11203831362020172,
"grad_norm": 0.19113394618034363,
"learning_rate": 4.947590643468523e-05,
"loss": 0.0213,
"step": 19300
},
{
"epoch": 0.1126188230171976,
"grad_norm": 0.06702837347984314,
"learning_rate": 4.946608586648206e-05,
"loss": 0.0262,
"step": 19400
},
{
"epoch": 0.11319933241419346,
"grad_norm": 0.7900282144546509,
"learning_rate": 4.945617513340162e-05,
"loss": 0.0179,
"step": 19500
},
{
"epoch": 0.11377984181118932,
"grad_norm": 0.9422081112861633,
"learning_rate": 4.944617427196792e-05,
"loss": 0.0179,
"step": 19600
},
{
"epoch": 0.11436035120818518,
"grad_norm": 6.721597194671631,
"learning_rate": 4.9436083319037134e-05,
"loss": 0.0228,
"step": 19700
},
{
"epoch": 0.11494086060518105,
"grad_norm": 2.548957109451294,
"learning_rate": 4.942590231179747e-05,
"loss": 0.0208,
"step": 19800
},
{
"epoch": 0.11552137000217691,
"grad_norm": 2.1897802352905273,
"learning_rate": 4.9415631287768995e-05,
"loss": 0.0293,
"step": 19900
},
{
"epoch": 0.11610187939917277,
"grad_norm": 1.0778768062591553,
"learning_rate": 4.9405270284803516e-05,
"loss": 0.0205,
"step": 20000
},
{
"epoch": 0.11668238879616864,
"grad_norm": 0.8228683471679688,
"learning_rate": 4.939481934108444e-05,
"loss": 0.0182,
"step": 20100
},
{
"epoch": 0.1172628981931645,
"grad_norm": 0.5803897976875305,
"learning_rate": 4.938427849512664e-05,
"loss": 0.0253,
"step": 20200
},
{
"epoch": 0.11784340759016036,
"grad_norm": 1.7605079412460327,
"learning_rate": 4.93736477857763e-05,
"loss": 0.022,
"step": 20300
},
{
"epoch": 0.11842391698715624,
"grad_norm": 6.337480068206787,
"learning_rate": 4.9362927252210764e-05,
"loss": 0.0167,
"step": 20400
},
{
"epoch": 0.1190044263841521,
"grad_norm": 1.6917483806610107,
"learning_rate": 4.935211693393844e-05,
"loss": 0.0197,
"step": 20500
},
{
"epoch": 0.11958493578114796,
"grad_norm": 3.9351799488067627,
"learning_rate": 4.934121687079859e-05,
"loss": 0.024,
"step": 20600
},
{
"epoch": 0.12016544517814381,
"grad_norm": 1.328538417816162,
"learning_rate": 4.933022710296121e-05,
"loss": 0.0215,
"step": 20700
},
{
"epoch": 0.12074595457513969,
"grad_norm": 0.6960548758506775,
"learning_rate": 4.931914767092692e-05,
"loss": 0.0214,
"step": 20800
},
{
"epoch": 0.12132646397213555,
"grad_norm": 3.212674140930176,
"learning_rate": 4.930797861552674e-05,
"loss": 0.0201,
"step": 20900
},
{
"epoch": 0.12190697336913141,
"grad_norm": 0.33600953221321106,
"learning_rate": 4.929671997792199e-05,
"loss": 0.0188,
"step": 21000
},
{
"epoch": 0.12248748276612728,
"grad_norm": 0.37020212411880493,
"learning_rate": 4.928537179960415e-05,
"loss": 0.0172,
"step": 21100
},
{
"epoch": 0.12306799216312314,
"grad_norm": 1.471659541130066,
"learning_rate": 4.927393412239465e-05,
"loss": 0.022,
"step": 21200
},
{
"epoch": 0.123648501560119,
"grad_norm": 0.34243813157081604,
"learning_rate": 4.9262406988444773e-05,
"loss": 0.0186,
"step": 21300
},
{
"epoch": 0.12422901095711487,
"grad_norm": 1.0350617170333862,
"learning_rate": 4.9250790440235487e-05,
"loss": 0.0192,
"step": 21400
},
{
"epoch": 0.12480952035411073,
"grad_norm": 0.2393186092376709,
"learning_rate": 4.923908452057723e-05,
"loss": 0.0202,
"step": 21500
},
{
"epoch": 0.1253900297511066,
"grad_norm": 0.8566457629203796,
"learning_rate": 4.9227289272609855e-05,
"loss": 0.0225,
"step": 21600
},
{
"epoch": 0.12597053914810247,
"grad_norm": 3.1956393718719482,
"learning_rate": 4.92154047398024e-05,
"loss": 0.0238,
"step": 21700
},
{
"epoch": 0.12655104854509833,
"grad_norm": 2.6811676025390625,
"learning_rate": 4.920343096595291e-05,
"loss": 0.0225,
"step": 21800
},
{
"epoch": 0.1271315579420942,
"grad_norm": 0.4638591706752777,
"learning_rate": 4.9191367995188376e-05,
"loss": 0.018,
"step": 21900
},
{
"epoch": 0.12771206733909005,
"grad_norm": 1.0029135942459106,
"learning_rate": 4.917921587196444e-05,
"loss": 0.0282,
"step": 22000
},
{
"epoch": 0.1282925767360859,
"grad_norm": 1.0247883796691895,
"learning_rate": 4.916697464106535e-05,
"loss": 0.0196,
"step": 22100
},
{
"epoch": 0.12887308613308177,
"grad_norm": 1.2868847846984863,
"learning_rate": 4.915464434760369e-05,
"loss": 0.0239,
"step": 22200
},
{
"epoch": 0.12945359553007765,
"grad_norm": 1.3176194429397583,
"learning_rate": 4.914222503702033e-05,
"loss": 0.0174,
"step": 22300
},
{
"epoch": 0.1300341049270735,
"grad_norm": 0.4307650029659271,
"learning_rate": 4.912971675508414e-05,
"loss": 0.0205,
"step": 22400
},
{
"epoch": 0.13061461432406937,
"grad_norm": 0.6782764196395874,
"learning_rate": 4.911711954789191e-05,
"loss": 0.0155,
"step": 22500
},
{
"epoch": 0.13119512372106523,
"grad_norm": 4.9244866371154785,
"learning_rate": 4.910443346186812e-05,
"loss": 0.0216,
"step": 22600
},
{
"epoch": 0.1317756331180611,
"grad_norm": 0.7279213666915894,
"learning_rate": 4.9091658543764816e-05,
"loss": 0.0192,
"step": 22700
},
{
"epoch": 0.13235614251505695,
"grad_norm": 0.7150142192840576,
"learning_rate": 4.9078794840661415e-05,
"loss": 0.023,
"step": 22800
},
{
"epoch": 0.13293665191205284,
"grad_norm": 0.3685489594936371,
"learning_rate": 4.906584239996451e-05,
"loss": 0.022,
"step": 22900
},
{
"epoch": 0.1335171613090487,
"grad_norm": 2.9621119499206543,
"learning_rate": 4.905280126940775e-05,
"loss": 0.0172,
"step": 23000
},
{
"epoch": 0.13409767070604456,
"grad_norm": 3.5731096267700195,
"learning_rate": 4.9039671497051623e-05,
"loss": 0.0197,
"step": 23100
},
{
"epoch": 0.13467818010304042,
"grad_norm": 0.0,
"learning_rate": 4.902645313128327e-05,
"loss": 0.0168,
"step": 23200
},
{
"epoch": 0.13525868950003628,
"grad_norm": 0.6063820123672485,
"learning_rate": 4.901314622081635e-05,
"loss": 0.0178,
"step": 23300
},
{
"epoch": 0.13583919889703214,
"grad_norm": 1.5601248741149902,
"learning_rate": 4.8999750814690825e-05,
"loss": 0.0153,
"step": 23400
},
{
"epoch": 0.136419708294028,
"grad_norm": 1.3011990785598755,
"learning_rate": 4.89862669622728e-05,
"loss": 0.0182,
"step": 23500
},
{
"epoch": 0.13700021769102388,
"grad_norm": 2.8852713108062744,
"learning_rate": 4.897269471325431e-05,
"loss": 0.0187,
"step": 23600
},
{
"epoch": 0.13758072708801974,
"grad_norm": 0.9321713447570801,
"learning_rate": 4.895903411765317e-05,
"loss": 0.0163,
"step": 23700
},
{
"epoch": 0.1381612364850156,
"grad_norm": 1.3897167444229126,
"learning_rate": 4.894528522581279e-05,
"loss": 0.0255,
"step": 23800
},
{
"epoch": 0.13874174588201146,
"grad_norm": 0.32952451705932617,
"learning_rate": 4.893144808840196e-05,
"loss": 0.0206,
"step": 23900
},
{
"epoch": 0.13932225527900732,
"grad_norm": 0.41645577549934387,
"learning_rate": 4.891752275641468e-05,
"loss": 0.0187,
"step": 24000
},
{
"epoch": 0.13990276467600318,
"grad_norm": 1.9071933031082153,
"learning_rate": 4.890350928117e-05,
"loss": 0.0189,
"step": 24100
},
{
"epoch": 0.14048327407299907,
"grad_norm": 11.354212760925293,
"learning_rate": 4.888940771431178e-05,
"loss": 0.0193,
"step": 24200
},
{
"epoch": 0.14106378346999493,
"grad_norm": 0.22728995978832245,
"learning_rate": 4.887521810780853e-05,
"loss": 0.0197,
"step": 24300
},
{
"epoch": 0.1416442928669908,
"grad_norm": 0.6692954897880554,
"learning_rate": 4.88609405139532e-05,
"loss": 0.0211,
"step": 24400
},
{
"epoch": 0.14222480226398665,
"grad_norm": 0.3103114664554596,
"learning_rate": 4.884657498536304e-05,
"loss": 0.0171,
"step": 24500
},
{
"epoch": 0.1428053116609825,
"grad_norm": 0.7131453156471252,
"learning_rate": 4.8832121574979314e-05,
"loss": 0.0171,
"step": 24600
},
{
"epoch": 0.14338582105797837,
"grad_norm": 0.8742627501487732,
"learning_rate": 4.88175803360672e-05,
"loss": 0.0171,
"step": 24700
},
{
"epoch": 0.14396633045497423,
"grad_norm": 1.464080810546875,
"learning_rate": 4.880295132221552e-05,
"loss": 0.0217,
"step": 24800
},
{
"epoch": 0.14454683985197012,
"grad_norm": 0.1914157271385193,
"learning_rate": 4.87882345873366e-05,
"loss": 0.0226,
"step": 24900
},
{
"epoch": 0.14512734924896598,
"grad_norm": 0.7907546162605286,
"learning_rate": 4.877343018566601e-05,
"loss": 0.014,
"step": 25000
},
{
"epoch": 0.14570785864596184,
"grad_norm": 0.7815316319465637,
"learning_rate": 4.875853817176243e-05,
"loss": 0.0208,
"step": 25100
},
{
"epoch": 0.1462883680429577,
"grad_norm": 0.8793790340423584,
"learning_rate": 4.87435586005074e-05,
"loss": 0.0188,
"step": 25200
},
{
"epoch": 0.14686887743995355,
"grad_norm": 0.35067594051361084,
"learning_rate": 4.872849152710515e-05,
"loss": 0.0247,
"step": 25300
},
{
"epoch": 0.14744938683694941,
"grad_norm": 0.6180335283279419,
"learning_rate": 4.871333700708236e-05,
"loss": 0.0202,
"step": 25400
},
{
"epoch": 0.14802989623394527,
"grad_norm": 1.0985257625579834,
"learning_rate": 4.8698095096288e-05,
"loss": 0.0197,
"step": 25500
},
{
"epoch": 0.14861040563094116,
"grad_norm": 0.47718220949172974,
"learning_rate": 4.8682765850893085e-05,
"loss": 0.019,
"step": 25600
},
{
"epoch": 0.14919091502793702,
"grad_norm": 3.357231616973877,
"learning_rate": 4.866734932739049e-05,
"loss": 0.021,
"step": 25700
},
{
"epoch": 0.14977142442493288,
"grad_norm": 0.9318442940711975,
"learning_rate": 4.865184558259474e-05,
"loss": 0.0185,
"step": 25800
},
{
"epoch": 0.15035193382192874,
"grad_norm": 0.1340111941099167,
"learning_rate": 4.863625467364179e-05,
"loss": 0.0164,
"step": 25900
},
{
"epoch": 0.1509324432189246,
"grad_norm": 1.3237409591674805,
"learning_rate": 4.862057665798883e-05,
"loss": 0.0195,
"step": 26000
},
{
"epoch": 0.15151295261592046,
"grad_norm": 0.23375193774700165,
"learning_rate": 4.860481159341405e-05,
"loss": 0.0169,
"step": 26100
},
{
"epoch": 0.15209346201291635,
"grad_norm": 3.856459617614746,
"learning_rate": 4.858895953801644e-05,
"loss": 0.0181,
"step": 26200
},
{
"epoch": 0.1526739714099122,
"grad_norm": 0.30206841230392456,
"learning_rate": 4.8573020550215606e-05,
"loss": 0.0203,
"step": 26300
},
{
"epoch": 0.15325448080690807,
"grad_norm": 0.32139310240745544,
"learning_rate": 4.855699468875151e-05,
"loss": 0.0153,
"step": 26400
},
{
"epoch": 0.15383499020390393,
"grad_norm": 0.3401035964488983,
"learning_rate": 4.854088201268425e-05,
"loss": 0.02,
"step": 26500
},
{
"epoch": 0.15441549960089979,
"grad_norm": 1.1033488512039185,
"learning_rate": 4.852468258139388e-05,
"loss": 0.019,
"step": 26600
},
{
"epoch": 0.15499600899789565,
"grad_norm": 0.26150408387184143,
"learning_rate": 4.8508396454580174e-05,
"loss": 0.0217,
"step": 26700
},
{
"epoch": 0.1555765183948915,
"grad_norm": 1.0009557008743286,
"learning_rate": 4.849202369226241e-05,
"loss": 0.0178,
"step": 26800
},
{
"epoch": 0.1561570277918874,
"grad_norm": 0.949277400970459,
"learning_rate": 4.8475564354779135e-05,
"loss": 0.021,
"step": 26900
},
{
"epoch": 0.15673753718888325,
"grad_norm": 0.5669822692871094,
"learning_rate": 4.845901850278794e-05,
"loss": 0.0203,
"step": 27000
},
{
"epoch": 0.1573180465858791,
"grad_norm": 0.8850467205047607,
"learning_rate": 4.844238619726528e-05,
"loss": 0.0161,
"step": 27100
},
{
"epoch": 0.15789855598287497,
"grad_norm": 1.5471025705337524,
"learning_rate": 4.842566749950618e-05,
"loss": 0.0169,
"step": 27200
},
{
"epoch": 0.15847906537987083,
"grad_norm": 0.35896119475364685,
"learning_rate": 4.8408862471124075e-05,
"loss": 0.0232,
"step": 27300
},
{
"epoch": 0.1590595747768667,
"grad_norm": 1.772006630897522,
"learning_rate": 4.839197117405053e-05,
"loss": 0.0195,
"step": 27400
},
{
"epoch": 0.15964008417386255,
"grad_norm": 0.5070587396621704,
"learning_rate": 4.837499367053508e-05,
"loss": 0.0159,
"step": 27500
},
{
"epoch": 0.16022059357085844,
"grad_norm": 0.4093703329563141,
"learning_rate": 4.835793002314489e-05,
"loss": 0.018,
"step": 27600
},
{
"epoch": 0.1608011029678543,
"grad_norm": 0.20872582495212555,
"learning_rate": 4.8340780294764655e-05,
"loss": 0.0206,
"step": 27700
},
{
"epoch": 0.16138161236485016,
"grad_norm": 1.1325550079345703,
"learning_rate": 4.8323544548596256e-05,
"loss": 0.0179,
"step": 27800
},
{
"epoch": 0.16196212176184602,
"grad_norm": 0.0,
"learning_rate": 4.8306222848158615e-05,
"loss": 0.0198,
"step": 27900
},
{
"epoch": 0.16254263115884188,
"grad_norm": 0.10438452661037445,
"learning_rate": 4.828881525728739e-05,
"loss": 0.0246,
"step": 28000
},
{
"epoch": 0.16312314055583774,
"grad_norm": 0.5154972076416016,
"learning_rate": 4.827132184013479e-05,
"loss": 0.0203,
"step": 28100
},
{
"epoch": 0.16370364995283362,
"grad_norm": 0.43772637844085693,
"learning_rate": 4.825374266116931e-05,
"loss": 0.0152,
"step": 28200
},
{
"epoch": 0.16428415934982948,
"grad_norm": 0.9545837640762329,
"learning_rate": 4.82360777851755e-05,
"loss": 0.0193,
"step": 28300
},
{
"epoch": 0.16486466874682534,
"grad_norm": 0.4668686091899872,
"learning_rate": 4.821832727725375e-05,
"loss": 0.0163,
"step": 28400
},
{
"epoch": 0.1654451781438212,
"grad_norm": 1.1754403114318848,
"learning_rate": 4.8200491202819995e-05,
"loss": 0.018,
"step": 28500
},
{
"epoch": 0.16602568754081706,
"grad_norm": 0.6412404179573059,
"learning_rate": 4.8182569627605556e-05,
"loss": 0.015,
"step": 28600
},
{
"epoch": 0.16660619693781292,
"grad_norm": 3.5581717491149902,
"learning_rate": 4.81645626176568e-05,
"loss": 0.0128,
"step": 28700
},
{
"epoch": 0.16718670633480878,
"grad_norm": 0.8452811241149902,
"learning_rate": 4.814647023933497e-05,
"loss": 0.021,
"step": 28800
},
{
"epoch": 0.16776721573180467,
"grad_norm": 0.7324305772781372,
"learning_rate": 4.812829255931592e-05,
"loss": 0.0228,
"step": 28900
},
{
"epoch": 0.16834772512880053,
"grad_norm": 2.6767971515655518,
"learning_rate": 4.811002964458987e-05,
"loss": 0.0194,
"step": 29000
},
{
"epoch": 0.1689282345257964,
"grad_norm": 1.0238885879516602,
"learning_rate": 4.809168156246113e-05,
"loss": 0.0145,
"step": 29100
},
{
"epoch": 0.16950874392279225,
"grad_norm": 0.48919227719306946,
"learning_rate": 4.807324838054792e-05,
"loss": 0.0199,
"step": 29200
},
{
"epoch": 0.1700892533197881,
"grad_norm": 1.0482101440429688,
"learning_rate": 4.8054730166782035e-05,
"loss": 0.0204,
"step": 29300
},
{
"epoch": 0.17066976271678397,
"grad_norm": 0.22599902749061584,
"learning_rate": 4.8036126989408666e-05,
"loss": 0.0197,
"step": 29400
},
{
"epoch": 0.17125027211377986,
"grad_norm": 0.2554647922515869,
"learning_rate": 4.80174389169861e-05,
"loss": 0.0178,
"step": 29500
},
{
"epoch": 0.17183078151077572,
"grad_norm": 1.0132629871368408,
"learning_rate": 4.7998666018385506e-05,
"loss": 0.0172,
"step": 29600
},
{
"epoch": 0.17241129090777157,
"grad_norm": 3.19964599609375,
"learning_rate": 4.7979808362790655e-05,
"loss": 0.0183,
"step": 29700
},
{
"epoch": 0.17299180030476743,
"grad_norm": 0.5451412200927734,
"learning_rate": 4.796086601969768e-05,
"loss": 0.0189,
"step": 29800
},
{
"epoch": 0.1735723097017633,
"grad_norm": 0.18623612821102142,
"learning_rate": 4.7941839058914796e-05,
"loss": 0.0165,
"step": 29900
},
{
"epoch": 0.17415281909875915,
"grad_norm": 1.2114591598510742,
"learning_rate": 4.792272755056207e-05,
"loss": 0.0185,
"step": 30000
},
{
"epoch": 0.174733328495755,
"grad_norm": 0.8469420075416565,
"learning_rate": 4.790353156507117e-05,
"loss": 0.0191,
"step": 30100
},
{
"epoch": 0.1753138378927509,
"grad_norm": 1.4895416498184204,
"learning_rate": 4.7884251173185045e-05,
"loss": 0.0202,
"step": 30200
},
{
"epoch": 0.17589434728974676,
"grad_norm": 0.26737430691719055,
"learning_rate": 4.786488644595775e-05,
"loss": 0.0154,
"step": 30300
},
{
"epoch": 0.17647485668674262,
"grad_norm": 1.4356130361557007,
"learning_rate": 4.7845437454754116e-05,
"loss": 0.0164,
"step": 30400
},
{
"epoch": 0.17705536608373848,
"grad_norm": 0.28000500798225403,
"learning_rate": 4.782590427124952e-05,
"loss": 0.0158,
"step": 30500
},
{
"epoch": 0.17763587548073434,
"grad_norm": 2.8763926029205322,
"learning_rate": 4.7806286967429606e-05,
"loss": 0.0182,
"step": 30600
},
{
"epoch": 0.1782163848777302,
"grad_norm": 1.897760272026062,
"learning_rate": 4.778658561559004e-05,
"loss": 0.0255,
"step": 30700
},
{
"epoch": 0.17879689427472606,
"grad_norm": 1.896153450012207,
"learning_rate": 4.776680028833623e-05,
"loss": 0.0187,
"step": 30800
},
{
"epoch": 0.17937740367172195,
"grad_norm": 0.31827715039253235,
"learning_rate": 4.7746931058583035e-05,
"loss": 0.0172,
"step": 30900
},
{
"epoch": 0.1799579130687178,
"grad_norm": 2.0092689990997314,
"learning_rate": 4.772697799955455e-05,
"loss": 0.0156,
"step": 31000
},
{
"epoch": 0.18053842246571367,
"grad_norm": 3.24516224861145,
"learning_rate": 4.7706941184783776e-05,
"loss": 0.0157,
"step": 31100
},
{
"epoch": 0.18111893186270953,
"grad_norm": 4.248687744140625,
"learning_rate": 4.768682068811241e-05,
"loss": 0.0223,
"step": 31200
},
{
"epoch": 0.18169944125970539,
"grad_norm": 1.3310073614120483,
"learning_rate": 4.7666616583690525e-05,
"loss": 0.0181,
"step": 31300
},
{
"epoch": 0.18227995065670125,
"grad_norm": 0.7074719071388245,
"learning_rate": 4.764632894597632e-05,
"loss": 0.0165,
"step": 31400
},
{
"epoch": 0.18286046005369713,
"grad_norm": 1.0923957824707031,
"learning_rate": 4.7625957849735826e-05,
"loss": 0.0209,
"step": 31500
},
{
"epoch": 0.183440969450693,
"grad_norm": 0.4126807749271393,
"learning_rate": 4.760550337004266e-05,
"loss": 0.021,
"step": 31600
},
{
"epoch": 0.18402147884768885,
"grad_norm": 0.7564171552658081,
"learning_rate": 4.758496558227771e-05,
"loss": 0.02,
"step": 31700
},
{
"epoch": 0.1846019882446847,
"grad_norm": 5.621452808380127,
"learning_rate": 4.756434456212892e-05,
"loss": 0.0218,
"step": 31800
},
{
"epoch": 0.18518249764168057,
"grad_norm": 0.5979048013687134,
"learning_rate": 4.7543640385590925e-05,
"loss": 0.018,
"step": 31900
},
{
"epoch": 0.18576300703867643,
"grad_norm": 0.4124324917793274,
"learning_rate": 4.752285312896485e-05,
"loss": 0.0192,
"step": 32000
},
{
"epoch": 0.1863435164356723,
"grad_norm": 19.721843719482422,
"learning_rate": 4.750198286885797e-05,
"loss": 0.0191,
"step": 32100
},
{
"epoch": 0.18692402583266818,
"grad_norm": 1.7610396146774292,
"learning_rate": 4.748102968218347e-05,
"loss": 0.0205,
"step": 32200
},
{
"epoch": 0.18750453522966404,
"grad_norm": 0.6155940890312195,
"learning_rate": 4.745999364616014e-05,
"loss": 0.0233,
"step": 32300
},
{
"epoch": 0.1880850446266599,
"grad_norm": 1.3487342596054077,
"learning_rate": 4.743887483831208e-05,
"loss": 0.0182,
"step": 32400
},
{
"epoch": 0.18866555402365576,
"grad_norm": 1.6848351955413818,
"learning_rate": 4.741767333646846e-05,
"loss": 0.0196,
"step": 32500
},
{
"epoch": 0.18924606342065162,
"grad_norm": 0.3916856348514557,
"learning_rate": 4.739638921876317e-05,
"loss": 0.0157,
"step": 32600
},
{
"epoch": 0.18982657281764748,
"grad_norm": 0.43171945214271545,
"learning_rate": 4.737502256363459e-05,
"loss": 0.015,
"step": 32700
},
{
"epoch": 0.19040708221464334,
"grad_norm": 2.9373903274536133,
"learning_rate": 4.735357344982525e-05,
"loss": 0.0182,
"step": 32800
},
{
"epoch": 0.19098759161163922,
"grad_norm": 1.7581216096878052,
"learning_rate": 4.733204195638159e-05,
"loss": 0.021,
"step": 32900
},
{
"epoch": 0.19156810100863508,
"grad_norm": 0.5465153455734253,
"learning_rate": 4.731042816265364e-05,
"loss": 0.0165,
"step": 33000
},
{
"epoch": 0.19214861040563094,
"grad_norm": 1.2427330017089844,
"learning_rate": 4.72887321482947e-05,
"loss": 0.0172,
"step": 33100
},
{
"epoch": 0.1927291198026268,
"grad_norm": 1.4515526294708252,
"learning_rate": 4.726695399326113e-05,
"loss": 0.0166,
"step": 33200
},
{
"epoch": 0.19330962919962266,
"grad_norm": 0.47813165187835693,
"learning_rate": 4.7245093777811945e-05,
"loss": 0.0165,
"step": 33300
},
{
"epoch": 0.19389013859661852,
"grad_norm": 0.4670131206512451,
"learning_rate": 4.722315158250863e-05,
"loss": 0.0171,
"step": 33400
},
{
"epoch": 0.1944706479936144,
"grad_norm": 1.1390752792358398,
"learning_rate": 4.720112748821475e-05,
"loss": 0.0219,
"step": 33500
},
{
"epoch": 0.19505115739061027,
"grad_norm": 0.6364420652389526,
"learning_rate": 4.7179021576095724e-05,
"loss": 0.0186,
"step": 33600
},
{
"epoch": 0.19563166678760613,
"grad_norm": 0.0812646821141243,
"learning_rate": 4.7156833927618475e-05,
"loss": 0.0184,
"step": 33700
},
{
"epoch": 0.196212176184602,
"grad_norm": 0.5782191753387451,
"learning_rate": 4.713456462455116e-05,
"loss": 0.0212,
"step": 33800
},
{
"epoch": 0.19679268558159785,
"grad_norm": 0.3221706449985504,
"learning_rate": 4.711221374896283e-05,
"loss": 0.0183,
"step": 33900
},
{
"epoch": 0.1973731949785937,
"grad_norm": 0.4310432970523834,
"learning_rate": 4.7089781383223203e-05,
"loss": 0.0194,
"step": 34000
},
{
"epoch": 0.19795370437558957,
"grad_norm": 0.4918677508831024,
"learning_rate": 4.706726761000227e-05,
"loss": 0.0192,
"step": 34100
},
{
"epoch": 0.19853421377258545,
"grad_norm": 0.8517147302627563,
"learning_rate": 4.704467251227006e-05,
"loss": 0.0179,
"step": 34200
},
{
"epoch": 0.19911472316958131,
"grad_norm": 0.8899397850036621,
"learning_rate": 4.702199617329629e-05,
"loss": 0.0216,
"step": 34300
},
{
"epoch": 0.19969523256657717,
"grad_norm": 1.310464859008789,
"learning_rate": 4.6999238676650074e-05,
"loss": 0.0196,
"step": 34400
},
{
"epoch": 0.20027574196357303,
"grad_norm": 2.9320359230041504,
"learning_rate": 4.697640010619965e-05,
"loss": 0.0167,
"step": 34500
},
{
"epoch": 0.2008562513605689,
"grad_norm": 0.2071259319782257,
"learning_rate": 4.6953480546111986e-05,
"loss": 0.019,
"step": 34600
},
{
"epoch": 0.20143676075756475,
"grad_norm": 5.163755893707275,
"learning_rate": 4.6930480080852553e-05,
"loss": 0.0147,
"step": 34700
},
{
"epoch": 0.2020172701545606,
"grad_norm": 0.6986225843429565,
"learning_rate": 4.6907398795184995e-05,
"loss": 0.0248,
"step": 34800
},
{
"epoch": 0.2025977795515565,
"grad_norm": 0.5469736456871033,
"learning_rate": 4.6884236774170766e-05,
"loss": 0.0147,
"step": 34900
},
{
"epoch": 0.20317828894855236,
"grad_norm": 1.1931012868881226,
"learning_rate": 4.686099410316888e-05,
"loss": 0.0183,
"step": 35000
},
{
"epoch": 0.20375879834554822,
"grad_norm": 3.972321033477783,
"learning_rate": 4.6837670867835546e-05,
"loss": 0.0199,
"step": 35100
},
{
"epoch": 0.20433930774254408,
"grad_norm": 0.9743729829788208,
"learning_rate": 4.681426715412392e-05,
"loss": 0.0161,
"step": 35200
},
{
"epoch": 0.20491981713953994,
"grad_norm": 1.3011478185653687,
"learning_rate": 4.67907830482837e-05,
"loss": 0.0177,
"step": 35300
},
{
"epoch": 0.2055003265365358,
"grad_norm": 0.2887895405292511,
"learning_rate": 4.676721863686088e-05,
"loss": 0.0156,
"step": 35400
},
{
"epoch": 0.2060808359335317,
"grad_norm": 0.5841540098190308,
"learning_rate": 4.67435740066974e-05,
"loss": 0.0201,
"step": 35500
},
{
"epoch": 0.20666134533052755,
"grad_norm": 0.38853272795677185,
"learning_rate": 4.671984924493081e-05,
"loss": 0.0185,
"step": 35600
},
{
"epoch": 0.2072418547275234,
"grad_norm": 2.471120834350586,
"learning_rate": 4.6696044438994004e-05,
"loss": 0.0201,
"step": 35700
},
{
"epoch": 0.20782236412451927,
"grad_norm": 1.8866631984710693,
"learning_rate": 4.667215967661483e-05,
"loss": 0.0199,
"step": 35800
},
{
"epoch": 0.20840287352151513,
"grad_norm": 0.6856237053871155,
"learning_rate": 4.664819504581582e-05,
"loss": 0.0161,
"step": 35900
},
{
"epoch": 0.20898338291851098,
"grad_norm": 0.3085954487323761,
"learning_rate": 4.662415063491384e-05,
"loss": 0.0173,
"step": 36000
},
{
"epoch": 0.20956389231550684,
"grad_norm": 0.6165205240249634,
"learning_rate": 4.660002653251977e-05,
"loss": 0.0184,
"step": 36100
},
{
"epoch": 0.21014440171250273,
"grad_norm": 3.60754132270813,
"learning_rate": 4.657582282753816e-05,
"loss": 0.0212,
"step": 36200
},
{
"epoch": 0.2107249111094986,
"grad_norm": 0.378738135099411,
"learning_rate": 4.655153960916695e-05,
"loss": 0.0247,
"step": 36300
},
{
"epoch": 0.21130542050649445,
"grad_norm": 1.4534658193588257,
"learning_rate": 4.652717696689709e-05,
"loss": 0.02,
"step": 36400
},
{
"epoch": 0.2118859299034903,
"grad_norm": 0.41120943427085876,
"learning_rate": 4.6502734990512255e-05,
"loss": 0.0136,
"step": 36500
},
{
"epoch": 0.21246643930048617,
"grad_norm": 11.224574089050293,
"learning_rate": 4.647821377008844e-05,
"loss": 0.0208,
"step": 36600
},
{
"epoch": 0.21304694869748203,
"grad_norm": 2.768289089202881,
"learning_rate": 4.645361339599373e-05,
"loss": 0.0174,
"step": 36700
},
{
"epoch": 0.21362745809447792,
"grad_norm": 1.1909620761871338,
"learning_rate": 4.6428933958887885e-05,
"loss": 0.0194,
"step": 36800
},
{
"epoch": 0.21420796749147378,
"grad_norm": 2.583638906478882,
"learning_rate": 4.6404175549722055e-05,
"loss": 0.0151,
"step": 36900
},
{
"epoch": 0.21478847688846964,
"grad_norm": 1.1585899591445923,
"learning_rate": 4.6379338259738414e-05,
"loss": 0.019,
"step": 37000
},
{
"epoch": 0.2153689862854655,
"grad_norm": 0.24345244467258453,
"learning_rate": 4.6354422180469834e-05,
"loss": 0.0158,
"step": 37100
},
{
"epoch": 0.21594949568246136,
"grad_norm": 0.9591251611709595,
"learning_rate": 4.632942740373955e-05,
"loss": 0.0162,
"step": 37200
},
{
"epoch": 0.21653000507945722,
"grad_norm": 0.12427254766225815,
"learning_rate": 4.630435402166083e-05,
"loss": 0.0291,
"step": 37300
},
{
"epoch": 0.21711051447645308,
"grad_norm": 12.628028869628906,
"learning_rate": 4.6279202126636624e-05,
"loss": 0.0147,
"step": 37400
},
{
"epoch": 0.21769102387344896,
"grad_norm": 0.6221101880073547,
"learning_rate": 4.625397181135922e-05,
"loss": 0.0188,
"step": 37500
},
{
"epoch": 0.21827153327044482,
"grad_norm": 0.11033707112073898,
"learning_rate": 4.6228663168809904e-05,
"loss": 0.0141,
"step": 37600
},
{
"epoch": 0.21885204266744068,
"grad_norm": 0.5746279954910278,
"learning_rate": 4.620327629225863e-05,
"loss": 0.0169,
"step": 37700
},
{
"epoch": 0.21943255206443654,
"grad_norm": 1.458079218864441,
"learning_rate": 4.6177811275263665e-05,
"loss": 0.0195,
"step": 37800
},
{
"epoch": 0.2200130614614324,
"grad_norm": 8.152167320251465,
"learning_rate": 4.615226821167126e-05,
"loss": 0.0155,
"step": 37900
},
{
"epoch": 0.22059357085842826,
"grad_norm": 2.139084577560425,
"learning_rate": 4.612664719561526e-05,
"loss": 0.0179,
"step": 38000
},
{
"epoch": 0.22117408025542412,
"grad_norm": 2.3556296825408936,
"learning_rate": 4.610094832151681e-05,
"loss": 0.0187,
"step": 38100
},
{
"epoch": 0.22175458965242,
"grad_norm": 0.26898398995399475,
"learning_rate": 4.6075171684084e-05,
"loss": 0.0208,
"step": 38200
},
{
"epoch": 0.22233509904941587,
"grad_norm": 0.16663870215415955,
"learning_rate": 4.604931737831146e-05,
"loss": 0.0189,
"step": 38300
},
{
"epoch": 0.22291560844641173,
"grad_norm": 4.2696943283081055,
"learning_rate": 4.60233854994801e-05,
"loss": 0.0168,
"step": 38400
},
{
"epoch": 0.2234961178434076,
"grad_norm": 1.2985143661499023,
"learning_rate": 4.5997376143156654e-05,
"loss": 0.0161,
"step": 38500
},
{
"epoch": 0.22407662724040345,
"grad_norm": 0.7481074333190918,
"learning_rate": 4.597128940519344e-05,
"loss": 0.0132,
"step": 38600
},
{
"epoch": 0.2246571366373993,
"grad_norm": 1.7323493957519531,
"learning_rate": 4.5945125381727924e-05,
"loss": 0.0147,
"step": 38700
},
{
"epoch": 0.2252376460343952,
"grad_norm": 0.9689391851425171,
"learning_rate": 4.591888416918238e-05,
"loss": 0.0175,
"step": 38800
},
{
"epoch": 0.22581815543139105,
"grad_norm": 0.9196175336837769,
"learning_rate": 4.589256586426356e-05,
"loss": 0.0167,
"step": 38900
},
{
"epoch": 0.22639866482838691,
"grad_norm": 1.5890405178070068,
"learning_rate": 4.586617056396234e-05,
"loss": 0.0198,
"step": 39000
},
{
"epoch": 0.22697917422538277,
"grad_norm": 4.1506829261779785,
"learning_rate": 4.583969836555333e-05,
"loss": 0.015,
"step": 39100
},
{
"epoch": 0.22755968362237863,
"grad_norm": 1.0635732412338257,
"learning_rate": 4.581314936659451e-05,
"loss": 0.0186,
"step": 39200
},
{
"epoch": 0.2281401930193745,
"grad_norm": 1.0515848398208618,
"learning_rate": 4.578652366492695e-05,
"loss": 0.0248,
"step": 39300
},
{
"epoch": 0.22872070241637035,
"grad_norm": 0.3499925434589386,
"learning_rate": 4.5759821358674346e-05,
"loss": 0.0176,
"step": 39400
},
{
"epoch": 0.22930121181336624,
"grad_norm": 0.0,
"learning_rate": 4.573304254624271e-05,
"loss": 0.0164,
"step": 39500
},
{
"epoch": 0.2298817212103621,
"grad_norm": 0.6939957141876221,
"learning_rate": 4.570618732632003e-05,
"loss": 0.0191,
"step": 39600
},
{
"epoch": 0.23046223060735796,
"grad_norm": 0.3061050772666931,
"learning_rate": 4.5679255797875856e-05,
"loss": 0.0188,
"step": 39700
},
{
"epoch": 0.23104274000435382,
"grad_norm": 0.3814498484134674,
"learning_rate": 4.565224806016095e-05,
"loss": 0.0164,
"step": 39800
},
{
"epoch": 0.23162324940134968,
"grad_norm": 0.0,
"learning_rate": 4.562516421270695e-05,
"loss": 0.017,
"step": 39900
},
{
"epoch": 0.23220375879834554,
"grad_norm": 1.1685712337493896,
"learning_rate": 4.559800435532596e-05,
"loss": 0.018,
"step": 40000
},
{
"epoch": 0.2327842681953414,
"grad_norm": 0.8218551874160767,
"learning_rate": 4.5570768588110235e-05,
"loss": 0.0162,
"step": 40100
},
{
"epoch": 0.23336477759233729,
"grad_norm": 2.128337860107422,
"learning_rate": 4.5543457011431744e-05,
"loss": 0.0178,
"step": 40200
},
{
"epoch": 0.23394528698933315,
"grad_norm": 36.10731887817383,
"learning_rate": 4.5516069725941854e-05,
"loss": 0.0185,
"step": 40300
},
{
"epoch": 0.234525796386329,
"grad_norm": 1.233340859413147,
"learning_rate": 4.548860683257096e-05,
"loss": 0.0175,
"step": 40400
},
{
"epoch": 0.23510630578332486,
"grad_norm": 0.9268346428871155,
"learning_rate": 4.546106843252804e-05,
"loss": 0.0245,
"step": 40500
},
{
"epoch": 0.23568681518032072,
"grad_norm": 0.29465100169181824,
"learning_rate": 4.54334546273004e-05,
"loss": 0.0211,
"step": 40600
},
{
"epoch": 0.23626732457731658,
"grad_norm": 0.3362191319465637,
"learning_rate": 4.5405765518653204e-05,
"loss": 0.0151,
"step": 40700
},
{
"epoch": 0.23684783397431247,
"grad_norm": 0.8512314558029175,
"learning_rate": 4.537800120862913e-05,
"loss": 0.0162,
"step": 40800
},
{
"epoch": 0.23742834337130833,
"grad_norm": 0.07062964141368866,
"learning_rate": 4.5350161799548e-05,
"loss": 0.0162,
"step": 40900
},
{
"epoch": 0.2380088527683042,
"grad_norm": 0.0,
"learning_rate": 4.5322247394006415e-05,
"loss": 0.0164,
"step": 41000
},
{
"epoch": 0.23858936216530005,
"grad_norm": 6.001936435699463,
"learning_rate": 4.529425809487733e-05,
"loss": 0.018,
"step": 41100
},
{
"epoch": 0.2391698715622959,
"grad_norm": 1.6303260326385498,
"learning_rate": 4.526619400530973e-05,
"loss": 0.0154,
"step": 41200
},
{
"epoch": 0.23975038095929177,
"grad_norm": 0.2997598648071289,
"learning_rate": 4.523805522872822e-05,
"loss": 0.0133,
"step": 41300
},
{
"epoch": 0.24033089035628763,
"grad_norm": 0.1703944355249405,
"learning_rate": 4.5209841868832635e-05,
"loss": 0.0161,
"step": 41400
},
{
"epoch": 0.24091139975328352,
"grad_norm": 2.1550252437591553,
"learning_rate": 4.51815540295977e-05,
"loss": 0.0148,
"step": 41500
},
{
"epoch": 0.24149190915027938,
"grad_norm": 3.253269910812378,
"learning_rate": 4.515319181527259e-05,
"loss": 0.0197,
"step": 41600
},
{
"epoch": 0.24207241854727524,
"grad_norm": 0.663278341293335,
"learning_rate": 4.512475533038059e-05,
"loss": 0.0152,
"step": 41700
},
{
"epoch": 0.2426529279442711,
"grad_norm": 0.05046732723712921,
"learning_rate": 4.5096244679718676e-05,
"loss": 0.0207,
"step": 41800
},
{
"epoch": 0.24323343734126696,
"grad_norm": 2.685068368911743,
"learning_rate": 4.506765996835718e-05,
"loss": 0.0154,
"step": 41900
},
{
"epoch": 0.24381394673826282,
"grad_norm": 0.698753833770752,
"learning_rate": 4.503900130163935e-05,
"loss": 0.0161,
"step": 42000
},
{
"epoch": 0.2443944561352587,
"grad_norm": 0.11762864887714386,
"learning_rate": 4.501026878518097e-05,
"loss": 0.0187,
"step": 42100
},
{
"epoch": 0.24497496553225456,
"grad_norm": 1.077953577041626,
"learning_rate": 4.498146252487002e-05,
"loss": 0.0185,
"step": 42200
},
{
"epoch": 0.24555547492925042,
"grad_norm": 0.18901602923870087,
"learning_rate": 4.49525826268662e-05,
"loss": 0.0153,
"step": 42300
},
{
"epoch": 0.24613598432624628,
"grad_norm": 1.0348716974258423,
"learning_rate": 4.492362919760063e-05,
"loss": 0.0178,
"step": 42400
},
{
"epoch": 0.24671649372324214,
"grad_norm": 0.6340203285217285,
"learning_rate": 4.489460234377538e-05,
"loss": 0.0158,
"step": 42500
},
{
"epoch": 0.247297003120238,
"grad_norm": 1.0056567192077637,
"learning_rate": 4.4865502172363126e-05,
"loss": 0.0189,
"step": 42600
},
{
"epoch": 0.24787751251723386,
"grad_norm": 2.1102306842803955,
"learning_rate": 4.483632879060676e-05,
"loss": 0.0158,
"step": 42700
},
{
"epoch": 0.24845802191422975,
"grad_norm": 0.1478302925825119,
"learning_rate": 4.480708230601895e-05,
"loss": 0.0166,
"step": 42800
},
{
"epoch": 0.2490385313112256,
"grad_norm": 0.5817315578460693,
"learning_rate": 4.4777762826381775e-05,
"loss": 0.0202,
"step": 42900
},
{
"epoch": 0.24961904070822147,
"grad_norm": 2.631985664367676,
"learning_rate": 4.4748370459746334e-05,
"loss": 0.0151,
"step": 43000
},
{
"epoch": 0.2501995501052173,
"grad_norm": 0.3138137459754944,
"learning_rate": 4.471890531443232e-05,
"loss": 0.0188,
"step": 43100
},
{
"epoch": 0.2507800595022132,
"grad_norm": 0.6783995628356934,
"learning_rate": 4.4689367499027654e-05,
"loss": 0.0195,
"step": 43200
},
{
"epoch": 0.25136056889920905,
"grad_norm": 2.4091744422912598,
"learning_rate": 4.4659757122388066e-05,
"loss": 0.0158,
"step": 43300
},
{
"epoch": 0.25194107829620493,
"grad_norm": 1.260305643081665,
"learning_rate": 4.463007429363668e-05,
"loss": 0.0186,
"step": 43400
},
{
"epoch": 0.25252158769320077,
"grad_norm": 1.3665226697921753,
"learning_rate": 4.460031912216363e-05,
"loss": 0.0163,
"step": 43500
},
{
"epoch": 0.25310209709019665,
"grad_norm": 0.0,
"learning_rate": 4.457049171762568e-05,
"loss": 0.0163,
"step": 43600
},
{
"epoch": 0.2536826064871925,
"grad_norm": 0.5342715382575989,
"learning_rate": 4.454059218994577e-05,
"loss": 0.0164,
"step": 43700
},
{
"epoch": 0.2542631158841884,
"grad_norm": 0.4309409558773041,
"learning_rate": 4.4510620649312643e-05,
"loss": 0.0182,
"step": 43800
},
{
"epoch": 0.25484362528118426,
"grad_norm": 0.212936669588089,
"learning_rate": 4.4480577206180436e-05,
"loss": 0.0145,
"step": 43900
},
{
"epoch": 0.2554241346781801,
"grad_norm": 2.9309751987457275,
"learning_rate": 4.4450461971268256e-05,
"loss": 0.0152,
"step": 44000
},
{
"epoch": 0.256004644075176,
"grad_norm": 6.400189399719238,
"learning_rate": 4.4420275055559795e-05,
"loss": 0.0172,
"step": 44100
},
{
"epoch": 0.2565851534721718,
"grad_norm": 0.43217283487319946,
"learning_rate": 4.43900165703029e-05,
"loss": 0.0223,
"step": 44200
},
{
"epoch": 0.2571656628691677,
"grad_norm": 0.4472719132900238,
"learning_rate": 4.4359686627009204e-05,
"loss": 0.0166,
"step": 44300
},
{
"epoch": 0.25774617226616353,
"grad_norm": 0.8468680381774902,
"learning_rate": 4.432928533745364e-05,
"loss": 0.0267,
"step": 44400
},
{
"epoch": 0.2583266816631594,
"grad_norm": 0.5906082987785339,
"learning_rate": 4.4298812813674096e-05,
"loss": 0.019,
"step": 44500
},
{
"epoch": 0.2589071910601553,
"grad_norm": 0.6340333819389343,
"learning_rate": 4.4268269167970977e-05,
"loss": 0.015,
"step": 44600
},
{
"epoch": 0.25948770045715114,
"grad_norm": 3.455953598022461,
"learning_rate": 4.42376545129068e-05,
"loss": 0.0179,
"step": 44700
},
{
"epoch": 0.260068209854147,
"grad_norm": 1.7209389209747314,
"learning_rate": 4.420696896130576e-05,
"loss": 0.0182,
"step": 44800
},
{
"epoch": 0.26064871925114286,
"grad_norm": 0.23180946707725525,
"learning_rate": 4.417621262625334e-05,
"loss": 0.0234,
"step": 44900
},
{
"epoch": 0.26122922864813874,
"grad_norm": 1.2846555709838867,
"learning_rate": 4.414538562109588e-05,
"loss": 0.0185,
"step": 45000
},
{
"epoch": 0.26180973804513463,
"grad_norm": 1.6205638647079468,
"learning_rate": 4.411448805944015e-05,
"loss": 0.0164,
"step": 45100
},
{
"epoch": 0.26239024744213046,
"grad_norm": 1.5716558694839478,
"learning_rate": 4.408352005515295e-05,
"loss": 0.0152,
"step": 45200
},
{
"epoch": 0.26297075683912635,
"grad_norm": 2.2886829376220703,
"learning_rate": 4.4052481722360675e-05,
"loss": 0.0124,
"step": 45300
},
{
"epoch": 0.2635512662361222,
"grad_norm": 0.44392430782318115,
"learning_rate": 4.402137317544891e-05,
"loss": 0.0182,
"step": 45400
},
{
"epoch": 0.26413177563311807,
"grad_norm": 1.4996153116226196,
"learning_rate": 4.399019452906199e-05,
"loss": 0.0181,
"step": 45500
},
{
"epoch": 0.2647122850301139,
"grad_norm": 0.37333735823631287,
"learning_rate": 4.395894589810261e-05,
"loss": 0.0187,
"step": 45600
},
{
"epoch": 0.2652927944271098,
"grad_norm": 1.011172890663147,
"learning_rate": 4.392762739773135e-05,
"loss": 0.0132,
"step": 45700
},
{
"epoch": 0.2658733038241057,
"grad_norm": 0.7283264398574829,
"learning_rate": 4.389623914336631e-05,
"loss": 0.0212,
"step": 45800
},
{
"epoch": 0.2664538132211015,
"grad_norm": 0.0,
"learning_rate": 4.386478125068262e-05,
"loss": 0.0191,
"step": 45900
},
{
"epoch": 0.2670343226180974,
"grad_norm": 0.8834022879600525,
"learning_rate": 4.3833253835612074e-05,
"loss": 0.018,
"step": 46000
},
{
"epoch": 0.26761483201509323,
"grad_norm": 0.6831231713294983,
"learning_rate": 4.380165701434267e-05,
"loss": 0.0145,
"step": 46100
},
{
"epoch": 0.2681953414120891,
"grad_norm": 0.0,
"learning_rate": 4.376999090331818e-05,
"loss": 0.0154,
"step": 46200
},
{
"epoch": 0.26877585080908495,
"grad_norm": 0.10557834059000015,
"learning_rate": 4.3738255619237745e-05,
"loss": 0.0124,
"step": 46300
},
{
"epoch": 0.26935636020608084,
"grad_norm": 1.9961583614349365,
"learning_rate": 4.370645127905542e-05,
"loss": 0.0208,
"step": 46400
},
{
"epoch": 0.2699368696030767,
"grad_norm": 0.8881611824035645,
"learning_rate": 4.367457799997976e-05,
"loss": 0.0132,
"step": 46500
},
{
"epoch": 0.27051737900007256,
"grad_norm": 0.7779310345649719,
"learning_rate": 4.3642635899473364e-05,
"loss": 0.0207,
"step": 46600
},
{
"epoch": 0.27109788839706844,
"grad_norm": 1.000813364982605,
"learning_rate": 4.3610625095252474e-05,
"loss": 0.0217,
"step": 46700
},
{
"epoch": 0.2716783977940643,
"grad_norm": 0.2656024396419525,
"learning_rate": 4.357854570528652e-05,
"loss": 0.0197,
"step": 46800
},
{
"epoch": 0.27225890719106016,
"grad_norm": 0.3503284156322479,
"learning_rate": 4.3546397847797695e-05,
"loss": 0.0155,
"step": 46900
},
{
"epoch": 0.272839416588056,
"grad_norm": 2.816612482070923,
"learning_rate": 4.3514181641260515e-05,
"loss": 0.0196,
"step": 47000
},
{
"epoch": 0.2734199259850519,
"grad_norm": 0.44417452812194824,
"learning_rate": 4.3481897204401376e-05,
"loss": 0.0164,
"step": 47100
},
{
"epoch": 0.27400043538204777,
"grad_norm": 1.1510508060455322,
"learning_rate": 4.3449544656198123e-05,
"loss": 0.0155,
"step": 47200
},
{
"epoch": 0.2745809447790436,
"grad_norm": 0.0,
"learning_rate": 4.3417124115879623e-05,
"loss": 0.0151,
"step": 47300
},
{
"epoch": 0.2751614541760395,
"grad_norm": 0.687237024307251,
"learning_rate": 4.3384635702925315e-05,
"loss": 0.0161,
"step": 47400
},
{
"epoch": 0.2757419635730353,
"grad_norm": 15.054317474365234,
"learning_rate": 4.335207953706475e-05,
"loss": 0.0159,
"step": 47500
},
{
"epoch": 0.2763224729700312,
"grad_norm": 0.6675468683242798,
"learning_rate": 4.3319455738277184e-05,
"loss": 0.0226,
"step": 47600
},
{
"epoch": 0.27690298236702704,
"grad_norm": 0.44973939657211304,
"learning_rate": 4.328676442679112e-05,
"loss": 0.0161,
"step": 47700
},
{
"epoch": 0.2774834917640229,
"grad_norm": 0.5629274249076843,
"learning_rate": 4.3254005723083855e-05,
"loss": 0.0145,
"step": 47800
},
{
"epoch": 0.2780640011610188,
"grad_norm": 1.0498775243759155,
"learning_rate": 4.322117974788107e-05,
"loss": 0.0166,
"step": 47900
},
{
"epoch": 0.27864451055801465,
"grad_norm": 0.404738187789917,
"learning_rate": 4.318828662215633e-05,
"loss": 0.0148,
"step": 48000
},
{
"epoch": 0.27922501995501053,
"grad_norm": 0.3776521384716034,
"learning_rate": 4.3155326467130696e-05,
"loss": 0.0208,
"step": 48100
},
{
"epoch": 0.27980552935200637,
"grad_norm": 0.3664938807487488,
"learning_rate": 4.312229940427224e-05,
"loss": 0.014,
"step": 48200
},
{
"epoch": 0.28038603874900225,
"grad_norm": 0.41216275095939636,
"learning_rate": 4.308920555529561e-05,
"loss": 0.0159,
"step": 48300
},
{
"epoch": 0.28096654814599814,
"grad_norm": 0.7025476694107056,
"learning_rate": 4.305604504216157e-05,
"loss": 0.0144,
"step": 48400
},
{
"epoch": 0.281547057542994,
"grad_norm": 0.5805770754814148,
"learning_rate": 4.3022817987076615e-05,
"loss": 0.0206,
"step": 48500
},
{
"epoch": 0.28212756693998986,
"grad_norm": 0.10411791503429413,
"learning_rate": 4.298952451249238e-05,
"loss": 0.0139,
"step": 48600
},
{
"epoch": 0.2827080763369857,
"grad_norm": 1.3499836921691895,
"learning_rate": 4.295616474110534e-05,
"loss": 0.0168,
"step": 48700
},
{
"epoch": 0.2832885857339816,
"grad_norm": 0.8422473073005676,
"learning_rate": 4.292273879585628e-05,
"loss": 0.0169,
"step": 48800
},
{
"epoch": 0.2838690951309774,
"grad_norm": 1.0992027521133423,
"learning_rate": 4.288924679992985e-05,
"loss": 0.0179,
"step": 48900
},
{
"epoch": 0.2844496045279733,
"grad_norm": 0.9558140635490417,
"learning_rate": 4.2855688876754104e-05,
"loss": 0.0162,
"step": 49000
},
{
"epoch": 0.2850301139249692,
"grad_norm": 2.5504753589630127,
"learning_rate": 4.2822065150000105e-05,
"loss": 0.0125,
"step": 49100
},
{
"epoch": 0.285610623321965,
"grad_norm": 6.963260650634766,
"learning_rate": 4.278837574358134e-05,
"loss": 0.0145,
"step": 49200
},
{
"epoch": 0.2861911327189609,
"grad_norm": 0.19555258750915527,
"learning_rate": 4.275462078165343e-05,
"loss": 0.0144,
"step": 49300
},
{
"epoch": 0.28677164211595674,
"grad_norm": 0.13861818611621857,
"learning_rate": 4.2720800388613545e-05,
"loss": 0.015,
"step": 49400
},
{
"epoch": 0.2873521515129526,
"grad_norm": 0.10115107148885727,
"learning_rate": 4.2686914689099986e-05,
"loss": 0.0208,
"step": 49500
},
{
"epoch": 0.28793266090994846,
"grad_norm": 0.12271959334611893,
"learning_rate": 4.265296380799174e-05,
"loss": 0.0177,
"step": 49600
},
{
"epoch": 0.28851317030694434,
"grad_norm": 0.6255984902381897,
"learning_rate": 4.261894787040801e-05,
"loss": 0.0142,
"step": 49700
},
{
"epoch": 0.28909367970394023,
"grad_norm": 0.24981549382209778,
"learning_rate": 4.258486700170774e-05,
"loss": 0.0129,
"step": 49800
},
{
"epoch": 0.28967418910093606,
"grad_norm": 0.42702168226242065,
"learning_rate": 4.2550721327489165e-05,
"loss": 0.0197,
"step": 49900
},
{
"epoch": 0.29025469849793195,
"grad_norm": 0.2005091905593872,
"learning_rate": 4.2516510973589366e-05,
"loss": 0.0165,
"step": 50000
},
{
"epoch": 0.2908352078949278,
"grad_norm": 0.18545077741146088,
"learning_rate": 4.248223606608378e-05,
"loss": 0.0197,
"step": 50100
},
{
"epoch": 0.29141571729192367,
"grad_norm": 2.60361385345459,
"learning_rate": 4.244789673128572e-05,
"loss": 0.0208,
"step": 50200
},
{
"epoch": 0.2919962266889195,
"grad_norm": 1.1765265464782715,
"learning_rate": 4.241349309574596e-05,
"loss": 0.0161,
"step": 50300
},
{
"epoch": 0.2925767360859154,
"grad_norm": 0.3382522463798523,
"learning_rate": 4.237902528625224e-05,
"loss": 0.0164,
"step": 50400
},
{
"epoch": 0.2931572454829113,
"grad_norm": 0.8997277021408081,
"learning_rate": 4.234449342982879e-05,
"loss": 0.0173,
"step": 50500
},
{
"epoch": 0.2937377548799071,
"grad_norm": 0.3323515057563782,
"learning_rate": 4.230989765373587e-05,
"loss": 0.0156,
"step": 50600
},
{
"epoch": 0.294318264276903,
"grad_norm": 0.0954294502735138,
"learning_rate": 4.2275238085469326e-05,
"loss": 0.0171,
"step": 50700
},
{
"epoch": 0.29489877367389883,
"grad_norm": 0.3944256007671356,
"learning_rate": 4.224051485276006e-05,
"loss": 0.0147,
"step": 50800
},
{
"epoch": 0.2954792830708947,
"grad_norm": 0.0,
"learning_rate": 4.220572808357363e-05,
"loss": 0.0178,
"step": 50900
},
{
"epoch": 0.29605979246789055,
"grad_norm": 1.3922127485275269,
"learning_rate": 4.217087790610973e-05,
"loss": 0.016,
"step": 51000
},
{
"epoch": 0.29664030186488644,
"grad_norm": 0.48834431171417236,
"learning_rate": 4.213596444880173e-05,
"loss": 0.013,
"step": 51100
},
{
"epoch": 0.2972208112618823,
"grad_norm": 1.1236047744750977,
"learning_rate": 4.210098784031621e-05,
"loss": 0.0177,
"step": 51200
},
{
"epoch": 0.29780132065887815,
"grad_norm": 0.22413845360279083,
"learning_rate": 4.206594820955249e-05,
"loss": 0.0195,
"step": 51300
},
{
"epoch": 0.29838183005587404,
"grad_norm": 1.6964247226715088,
"learning_rate": 4.2030845685642136e-05,
"loss": 0.0171,
"step": 51400
},
{
"epoch": 0.2989623394528699,
"grad_norm": 0.4666268825531006,
"learning_rate": 4.199568039794848e-05,
"loss": 0.0181,
"step": 51500
},
{
"epoch": 0.29954284884986576,
"grad_norm": 0.7793068289756775,
"learning_rate": 4.196045247606619e-05,
"loss": 0.0149,
"step": 51600
},
{
"epoch": 0.3001233582468616,
"grad_norm": 0.6577598452568054,
"learning_rate": 4.192516204982073e-05,
"loss": 0.0215,
"step": 51700
},
{
"epoch": 0.3007038676438575,
"grad_norm": 0.31358566880226135,
"learning_rate": 4.188980924926794e-05,
"loss": 0.0208,
"step": 51800
},
{
"epoch": 0.30128437704085337,
"grad_norm": 0.1279175728559494,
"learning_rate": 4.1854394204693495e-05,
"loss": 0.0132,
"step": 51900
},
{
"epoch": 0.3018648864378492,
"grad_norm": 1.288215160369873,
"learning_rate": 4.1818917046612474e-05,
"loss": 0.0196,
"step": 52000
},
{
"epoch": 0.3024453958348451,
"grad_norm": 0.7573376893997192,
"learning_rate": 4.178337790576888e-05,
"loss": 0.018,
"step": 52100
},
{
"epoch": 0.3030259052318409,
"grad_norm": 0.07305464148521423,
"learning_rate": 4.1747776913135115e-05,
"loss": 0.0144,
"step": 52200
},
{
"epoch": 0.3036064146288368,
"grad_norm": 0.41413378715515137,
"learning_rate": 4.1712114199911534e-05,
"loss": 0.0195,
"step": 52300
},
{
"epoch": 0.3041869240258327,
"grad_norm": 0.6894093155860901,
"learning_rate": 4.1676389897525946e-05,
"loss": 0.0147,
"step": 52400
},
{
"epoch": 0.3047674334228285,
"grad_norm": 0.0,
"learning_rate": 4.1640604137633144e-05,
"loss": 0.0191,
"step": 52500
},
{
"epoch": 0.3053479428198244,
"grad_norm": 0.42625299096107483,
"learning_rate": 4.16047570521144e-05,
"loss": 0.0143,
"step": 52600
},
{
"epoch": 0.30592845221682025,
"grad_norm": 0.7680391073226929,
"learning_rate": 4.156884877307701e-05,
"loss": 0.0141,
"step": 52700
},
{
"epoch": 0.30650896161381613,
"grad_norm": 0.7309791445732117,
"learning_rate": 4.1532879432853744e-05,
"loss": 0.0142,
"step": 52800
},
{
"epoch": 0.30708947101081197,
"grad_norm": 0.5670241117477417,
"learning_rate": 4.149684916400246e-05,
"loss": 0.016,
"step": 52900
},
{
"epoch": 0.30766998040780785,
"grad_norm": 1.2159571647644043,
"learning_rate": 4.146075809930549e-05,
"loss": 0.0192,
"step": 53000
},
{
"epoch": 0.30825048980480374,
"grad_norm": 1.0170856714248657,
"learning_rate": 4.142460637176928e-05,
"loss": 0.0139,
"step": 53100
},
{
"epoch": 0.30883099920179957,
"grad_norm": 0.7249845266342163,
"learning_rate": 4.138839411462379e-05,
"loss": 0.0162,
"step": 53200
},
{
"epoch": 0.30941150859879546,
"grad_norm": 0.25050070881843567,
"learning_rate": 4.1352121461322065e-05,
"loss": 0.0164,
"step": 53300
},
{
"epoch": 0.3099920179957913,
"grad_norm": 0.68352210521698,
"learning_rate": 4.131578854553976e-05,
"loss": 0.0175,
"step": 53400
},
{
"epoch": 0.3105725273927872,
"grad_norm": 0.3992615342140198,
"learning_rate": 4.1279395501174544e-05,
"loss": 0.0178,
"step": 53500
},
{
"epoch": 0.311153036789783,
"grad_norm": 0.0,
"learning_rate": 4.1242942462345744e-05,
"loss": 0.0192,
"step": 53600
},
{
"epoch": 0.3117335461867789,
"grad_norm": 1.5646060705184937,
"learning_rate": 4.1206429563393765e-05,
"loss": 0.0185,
"step": 53700
},
{
"epoch": 0.3123140555837748,
"grad_norm": 0.9312039613723755,
"learning_rate": 4.11698569388796e-05,
"loss": 0.0136,
"step": 53800
},
{
"epoch": 0.3128945649807706,
"grad_norm": 1.540569543838501,
"learning_rate": 4.113322472358436e-05,
"loss": 0.0188,
"step": 53900
},
{
"epoch": 0.3134750743777665,
"grad_norm": 0.13842260837554932,
"learning_rate": 4.109653305250877e-05,
"loss": 0.0142,
"step": 54000
},
{
"epoch": 0.31405558377476234,
"grad_norm": 1.753185510635376,
"learning_rate": 4.105978206087265e-05,
"loss": 0.0188,
"step": 54100
},
{
"epoch": 0.3146360931717582,
"grad_norm": 0.4443669319152832,
"learning_rate": 4.102297188411446e-05,
"loss": 0.0165,
"step": 54200
},
{
"epoch": 0.31521660256875406,
"grad_norm": 0.6722429990768433,
"learning_rate": 4.0986102657890744e-05,
"loss": 0.0192,
"step": 54300
},
{
"epoch": 0.31579711196574994,
"grad_norm": 0.7747224569320679,
"learning_rate": 4.09491745180757e-05,
"loss": 0.0169,
"step": 54400
},
{
"epoch": 0.31637762136274583,
"grad_norm": 0.2598312795162201,
"learning_rate": 4.09121876007606e-05,
"loss": 0.0172,
"step": 54500
},
{
"epoch": 0.31695813075974166,
"grad_norm": 0.11571415513753891,
"learning_rate": 4.087514204225336e-05,
"loss": 0.013,
"step": 54600
},
{
"epoch": 0.31753864015673755,
"grad_norm": 0.6480383276939392,
"learning_rate": 4.0838037979077976e-05,
"loss": 0.0182,
"step": 54700
},
{
"epoch": 0.3181191495537334,
"grad_norm": 0.6114629507064819,
"learning_rate": 4.080087554797408e-05,
"loss": 0.0206,
"step": 54800
},
{
"epoch": 0.31869965895072927,
"grad_norm": 0.8482924699783325,
"learning_rate": 4.076365488589641e-05,
"loss": 0.0229,
"step": 54900
},
{
"epoch": 0.3192801683477251,
"grad_norm": 0.37672215700149536,
"learning_rate": 4.072637613001426e-05,
"loss": 0.0188,
"step": 55000
},
{
"epoch": 0.319860677744721,
"grad_norm": 0.7157580256462097,
"learning_rate": 4.0689039417711075e-05,
"loss": 0.0176,
"step": 55100
},
{
"epoch": 0.3204411871417169,
"grad_norm": 1.3182079792022705,
"learning_rate": 4.065164488658383e-05,
"loss": 0.0183,
"step": 55200
},
{
"epoch": 0.3210216965387127,
"grad_norm": 17.849079132080078,
"learning_rate": 4.061419267444263e-05,
"loss": 0.0147,
"step": 55300
},
{
"epoch": 0.3216022059357086,
"grad_norm": 2.8836376667022705,
"learning_rate": 4.057668291931012e-05,
"loss": 0.0164,
"step": 55400
},
{
"epoch": 0.32218271533270443,
"grad_norm": 2.2379231452941895,
"learning_rate": 4.0539115759421016e-05,
"loss": 0.0155,
"step": 55500
},
{
"epoch": 0.3227632247297003,
"grad_norm": 0.6565619707107544,
"learning_rate": 4.050149133322158e-05,
"loss": 0.0158,
"step": 55600
},
{
"epoch": 0.3233437341266962,
"grad_norm": 9.40256404876709,
"learning_rate": 4.046380977936915e-05,
"loss": 0.0161,
"step": 55700
},
{
"epoch": 0.32392424352369203,
"grad_norm": 1.0256812572479248,
"learning_rate": 4.042607123673156e-05,
"loss": 0.0185,
"step": 55800
},
{
"epoch": 0.3245047529206879,
"grad_norm": 0.3539896011352539,
"learning_rate": 4.038827584438668e-05,
"loss": 0.013,
"step": 55900
},
{
"epoch": 0.32508526231768375,
"grad_norm": 0.3358542323112488,
"learning_rate": 4.035042374162189e-05,
"loss": 0.0179,
"step": 56000
},
{
"epoch": 0.32566577171467964,
"grad_norm": 0.6049757599830627,
"learning_rate": 4.0312515067933545e-05,
"loss": 0.0158,
"step": 56100
},
{
"epoch": 0.3262462811116755,
"grad_norm": 0.1417369246482849,
"learning_rate": 4.027454996302652e-05,
"loss": 0.019,
"step": 56200
},
{
"epoch": 0.32682679050867136,
"grad_norm": 1.0133875608444214,
"learning_rate": 4.023652856681363e-05,
"loss": 0.0145,
"step": 56300
},
{
"epoch": 0.32740729990566725,
"grad_norm": 5.445352554321289,
"learning_rate": 4.019845101941512e-05,
"loss": 0.0202,
"step": 56400
},
{
"epoch": 0.3279878093026631,
"grad_norm": 2.076885223388672,
"learning_rate": 4.0160317461158213e-05,
"loss": 0.0138,
"step": 56500
},
{
"epoch": 0.32856831869965897,
"grad_norm": 0.7435348033905029,
"learning_rate": 4.0122128032576524e-05,
"loss": 0.0163,
"step": 56600
},
{
"epoch": 0.3291488280966548,
"grad_norm": 0.662987470626831,
"learning_rate": 4.0083882874409576e-05,
"loss": 0.0179,
"step": 56700
},
{
"epoch": 0.3297293374936507,
"grad_norm": 0.7310676574707031,
"learning_rate": 4.004558212760227e-05,
"loss": 0.0136,
"step": 56800
},
{
"epoch": 0.3303098468906465,
"grad_norm": 1.144674301147461,
"learning_rate": 4.0007225933304344e-05,
"loss": 0.0183,
"step": 56900
},
{
"epoch": 0.3308903562876424,
"grad_norm": 0.7550173997879028,
"learning_rate": 3.9968814432869914e-05,
"loss": 0.0125,
"step": 57000
},
{
"epoch": 0.3314708656846383,
"grad_norm": 0.5192617774009705,
"learning_rate": 3.993034776785691e-05,
"loss": 0.014,
"step": 57100
},
{
"epoch": 0.3320513750816341,
"grad_norm": 0.10176233947277069,
"learning_rate": 3.9891826080026535e-05,
"loss": 0.0148,
"step": 57200
},
{
"epoch": 0.33263188447863,
"grad_norm": 0.0,
"learning_rate": 3.9853249511342786e-05,
"loss": 0.0153,
"step": 57300
},
{
"epoch": 0.33321239387562585,
"grad_norm": 0.5603938698768616,
"learning_rate": 3.981461820397191e-05,
"loss": 0.0153,
"step": 57400
},
{
"epoch": 0.33379290327262173,
"grad_norm": 0.9487095475196838,
"learning_rate": 3.977593230028188e-05,
"loss": 0.0158,
"step": 57500
},
{
"epoch": 0.33437341266961756,
"grad_norm": 4.972527027130127,
"learning_rate": 3.973719194284188e-05,
"loss": 0.016,
"step": 57600
},
{
"epoch": 0.33495392206661345,
"grad_norm": 0.1742544323205948,
"learning_rate": 3.969839727442175e-05,
"loss": 0.017,
"step": 57700
},
{
"epoch": 0.33553443146360934,
"grad_norm": 0.43199607729911804,
"learning_rate": 3.965954843799152e-05,
"loss": 0.0156,
"step": 57800
},
{
"epoch": 0.33611494086060517,
"grad_norm": 2.0231590270996094,
"learning_rate": 3.9620645576720815e-05,
"loss": 0.0173,
"step": 57900
},
{
"epoch": 0.33669545025760106,
"grad_norm": 1.236526608467102,
"learning_rate": 3.9581688833978375e-05,
"loss": 0.0171,
"step": 58000
},
{
"epoch": 0.3372759596545969,
"grad_norm": 1.1368087530136108,
"learning_rate": 3.954267835333148e-05,
"loss": 0.0118,
"step": 58100
},
{
"epoch": 0.3378564690515928,
"grad_norm": 0.8430467844009399,
"learning_rate": 3.9503614278545494e-05,
"loss": 0.0141,
"step": 58200
},
{
"epoch": 0.3384369784485886,
"grad_norm": 0.19449672102928162,
"learning_rate": 3.946449675358327e-05,
"loss": 0.0158,
"step": 58300
},
{
"epoch": 0.3390174878455845,
"grad_norm": 0.10014590620994568,
"learning_rate": 3.9425325922604615e-05,
"loss": 0.0152,
"step": 58400
},
{
"epoch": 0.3395979972425804,
"grad_norm": 0.4984476864337921,
"learning_rate": 3.938610192996584e-05,
"loss": 0.0164,
"step": 58500
},
{
"epoch": 0.3401785066395762,
"grad_norm": 2.3255436420440674,
"learning_rate": 3.934682492021913e-05,
"loss": 0.0181,
"step": 58600
},
{
"epoch": 0.3407590160365721,
"grad_norm": 1.8835875988006592,
"learning_rate": 3.930749503811206e-05,
"loss": 0.012,
"step": 58700
},
{
"epoch": 0.34133952543356794,
"grad_norm": 1.3894046545028687,
"learning_rate": 3.9268112428587074e-05,
"loss": 0.015,
"step": 58800
},
{
"epoch": 0.3419200348305638,
"grad_norm": 0.15835818648338318,
"learning_rate": 3.922867723678091e-05,
"loss": 0.0166,
"step": 58900
},
{
"epoch": 0.3425005442275597,
"grad_norm": 0.3661365807056427,
"learning_rate": 3.918918960802411e-05,
"loss": 0.0162,
"step": 59000
},
{
"epoch": 0.34308105362455554,
"grad_norm": 0.11089111864566803,
"learning_rate": 3.914964968784044e-05,
"loss": 0.0232,
"step": 59100
},
{
"epoch": 0.34366156302155143,
"grad_norm": 2.527754306793213,
"learning_rate": 3.911005762194639e-05,
"loss": 0.0147,
"step": 59200
},
{
"epoch": 0.34424207241854726,
"grad_norm": 0.15072380006313324,
"learning_rate": 3.9070413556250616e-05,
"loss": 0.0189,
"step": 59300
},
{
"epoch": 0.34482258181554315,
"grad_norm": 0.5700109004974365,
"learning_rate": 3.903071763685342e-05,
"loss": 0.0151,
"step": 59400
},
{
"epoch": 0.345403091212539,
"grad_norm": 0.8213745951652527,
"learning_rate": 3.899097001004618e-05,
"loss": 0.0167,
"step": 59500
},
{
"epoch": 0.34598360060953487,
"grad_norm": 1.022154450416565,
"learning_rate": 3.895117082231085e-05,
"loss": 0.0146,
"step": 59600
},
{
"epoch": 0.34656411000653076,
"grad_norm": 0.2379520982503891,
"learning_rate": 3.891132022031939e-05,
"loss": 0.0179,
"step": 59700
},
{
"epoch": 0.3471446194035266,
"grad_norm": 0.835014283657074,
"learning_rate": 3.8871418350933256e-05,
"loss": 0.0145,
"step": 59800
},
{
"epoch": 0.3477251288005225,
"grad_norm": 5.786501884460449,
"learning_rate": 3.8831465361202794e-05,
"loss": 0.0145,
"step": 59900
},
{
"epoch": 0.3483056381975183,
"grad_norm": 0.40479740500450134,
"learning_rate": 3.87914613983668e-05,
"loss": 0.0175,
"step": 60000
},
{
"epoch": 0.3488861475945142,
"grad_norm": 0.2653241753578186,
"learning_rate": 3.875140660985189e-05,
"loss": 0.0156,
"step": 60100
},
{
"epoch": 0.34946665699151,
"grad_norm": 0.2719464600086212,
"learning_rate": 3.8711301143272004e-05,
"loss": 0.0122,
"step": 60200
},
{
"epoch": 0.3500471663885059,
"grad_norm": 0.23439522087574005,
"learning_rate": 3.8671145146427825e-05,
"loss": 0.0169,
"step": 60300
},
{
"epoch": 0.3506276757855018,
"grad_norm": 0.06842320412397385,
"learning_rate": 3.8630938767306256e-05,
"loss": 0.0141,
"step": 60400
},
{
"epoch": 0.35120818518249763,
"grad_norm": 0.0,
"learning_rate": 3.85906821540799e-05,
"loss": 0.0144,
"step": 60500
},
{
"epoch": 0.3517886945794935,
"grad_norm": 0.0,
"learning_rate": 3.855037545510648e-05,
"loss": 0.017,
"step": 60600
},
{
"epoch": 0.35236920397648935,
"grad_norm": 0.27966034412384033,
"learning_rate": 3.851001881892827e-05,
"loss": 0.0197,
"step": 60700
},
{
"epoch": 0.35294971337348524,
"grad_norm": 2.5935139656066895,
"learning_rate": 3.846961239427161e-05,
"loss": 0.0164,
"step": 60800
},
{
"epoch": 0.3535302227704811,
"grad_norm": 0.5523900985717773,
"learning_rate": 3.842915633004632e-05,
"loss": 0.0186,
"step": 60900
},
{
"epoch": 0.35411073216747696,
"grad_norm": 7.492378234863281,
"learning_rate": 3.8388650775345144e-05,
"loss": 0.0182,
"step": 61000
},
{
"epoch": 0.35469124156447285,
"grad_norm": 0.31653234362602234,
"learning_rate": 3.8348095879443226e-05,
"loss": 0.0145,
"step": 61100
},
{
"epoch": 0.3552717509614687,
"grad_norm": 1.4802820682525635,
"learning_rate": 3.830749179179752e-05,
"loss": 0.015,
"step": 61200
},
{
"epoch": 0.35585226035846457,
"grad_norm": 0.4932232201099396,
"learning_rate": 3.8266838662046334e-05,
"loss": 0.0133,
"step": 61300
},
{
"epoch": 0.3564327697554604,
"grad_norm": 3.2751312255859375,
"learning_rate": 3.822613664000862e-05,
"loss": 0.0155,
"step": 61400
},
{
"epoch": 0.3570132791524563,
"grad_norm": 0.8037987351417542,
"learning_rate": 3.818538587568359e-05,
"loss": 0.0196,
"step": 61500
},
{
"epoch": 0.3575937885494521,
"grad_norm": 0.5324920415878296,
"learning_rate": 3.8144586519250044e-05,
"loss": 0.0161,
"step": 61600
},
{
"epoch": 0.358174297946448,
"grad_norm": 0.251559317111969,
"learning_rate": 3.8103738721065856e-05,
"loss": 0.014,
"step": 61700
},
{
"epoch": 0.3587548073434439,
"grad_norm": 2.9223034381866455,
"learning_rate": 3.806284263166745e-05,
"loss": 0.0119,
"step": 61800
},
{
"epoch": 0.3593353167404397,
"grad_norm": 0.4311857521533966,
"learning_rate": 3.8021898401769205e-05,
"loss": 0.0149,
"step": 61900
},
{
"epoch": 0.3599158261374356,
"grad_norm": 0.5429189801216125,
"learning_rate": 3.7980906182262893e-05,
"loss": 0.0211,
"step": 62000
},
{
"epoch": 0.36049633553443144,
"grad_norm": 1.3071308135986328,
"learning_rate": 3.793986612421717e-05,
"loss": 0.0132,
"step": 62100
},
{
"epoch": 0.36107684493142733,
"grad_norm": 0.4390534460544586,
"learning_rate": 3.789877837887698e-05,
"loss": 0.0165,
"step": 62200
},
{
"epoch": 0.36165735432842316,
"grad_norm": 42.51701354980469,
"learning_rate": 3.7857643097663006e-05,
"loss": 0.0151,
"step": 62300
},
{
"epoch": 0.36223786372541905,
"grad_norm": 1.4220099449157715,
"learning_rate": 3.7816460432171135e-05,
"loss": 0.014,
"step": 62400
},
{
"epoch": 0.36281837312241494,
"grad_norm": 0.5632671117782593,
"learning_rate": 3.777523053417184e-05,
"loss": 0.0168,
"step": 62500
},
{
"epoch": 0.36339888251941077,
"grad_norm": 0.7514089345932007,
"learning_rate": 3.7733953555609696e-05,
"loss": 0.0171,
"step": 62600
},
{
"epoch": 0.36397939191640666,
"grad_norm": 0.2681547701358795,
"learning_rate": 3.769262964860276e-05,
"loss": 0.0134,
"step": 62700
},
{
"epoch": 0.3645599013134025,
"grad_norm": 1.961303472518921,
"learning_rate": 3.765125896544206e-05,
"loss": 0.0201,
"step": 62800
},
{
"epoch": 0.3651404107103984,
"grad_norm": 0.6701322197914124,
"learning_rate": 3.7609841658590985e-05,
"loss": 0.0159,
"step": 62900
},
{
"epoch": 0.36572092010739427,
"grad_norm": 0.3040947914123535,
"learning_rate": 3.756837788068475e-05,
"loss": 0.0157,
"step": 63000
},
{
"epoch": 0.3663014295043901,
"grad_norm": 0.28461697697639465,
"learning_rate": 3.7526867784529835e-05,
"loss": 0.0172,
"step": 63100
},
{
"epoch": 0.366881938901386,
"grad_norm": 1.9548020362854004,
"learning_rate": 3.7485311523103427e-05,
"loss": 0.0199,
"step": 63200
},
{
"epoch": 0.3674624482983818,
"grad_norm": 0.6341608762741089,
"learning_rate": 3.744370924955282e-05,
"loss": 0.0206,
"step": 63300
},
{
"epoch": 0.3680429576953777,
"grad_norm": 0.3764314651489258,
"learning_rate": 3.7402061117194915e-05,
"loss": 0.0151,
"step": 63400
},
{
"epoch": 0.36862346709237354,
"grad_norm": 0.3538680076599121,
"learning_rate": 3.7360367279515565e-05,
"loss": 0.0157,
"step": 63500
},
{
"epoch": 0.3692039764893694,
"grad_norm": 0.1386338174343109,
"learning_rate": 3.731862789016911e-05,
"loss": 0.0147,
"step": 63600
},
{
"epoch": 0.3697844858863653,
"grad_norm": 0.5350472331047058,
"learning_rate": 3.7276843102977725e-05,
"loss": 0.0128,
"step": 63700
},
{
"epoch": 0.37036499528336114,
"grad_norm": 1.673578143119812,
"learning_rate": 3.723501307193091e-05,
"loss": 0.0138,
"step": 63800
},
{
"epoch": 0.37094550468035703,
"grad_norm": 0.18305979669094086,
"learning_rate": 3.719313795118491e-05,
"loss": 0.0198,
"step": 63900
},
{
"epoch": 0.37152601407735286,
"grad_norm": 0.8609408736228943,
"learning_rate": 3.7151217895062105e-05,
"loss": 0.0188,
"step": 64000
},
{
"epoch": 0.37210652347434875,
"grad_norm": 0.28579720854759216,
"learning_rate": 3.710925305805051e-05,
"loss": 0.0163,
"step": 64100
},
{
"epoch": 0.3726870328713446,
"grad_norm": 0.5904589295387268,
"learning_rate": 3.706724359480316e-05,
"loss": 0.0156,
"step": 64200
},
{
"epoch": 0.37326754226834047,
"grad_norm": 1.1671172380447388,
"learning_rate": 3.7025189660137535e-05,
"loss": 0.0157,
"step": 64300
},
{
"epoch": 0.37384805166533636,
"grad_norm": 0.12482750415802002,
"learning_rate": 3.698309140903504e-05,
"loss": 0.0143,
"step": 64400
},
{
"epoch": 0.3744285610623322,
"grad_norm": 2.280238151550293,
"learning_rate": 3.694094899664037e-05,
"loss": 0.0142,
"step": 64500
},
{
"epoch": 0.3750090704593281,
"grad_norm": 0.1061575785279274,
"learning_rate": 3.689876257826096e-05,
"loss": 0.0228,
"step": 64600
},
{
"epoch": 0.3755895798563239,
"grad_norm": 0.531701385974884,
"learning_rate": 3.685653230936646e-05,
"loss": 0.0193,
"step": 64700
},
{
"epoch": 0.3761700892533198,
"grad_norm": 0.49610480666160583,
"learning_rate": 3.681425834558808e-05,
"loss": 0.0182,
"step": 64800
},
{
"epoch": 0.3767505986503156,
"grad_norm": 2.2673187255859375,
"learning_rate": 3.67719408427181e-05,
"loss": 0.0153,
"step": 64900
},
{
"epoch": 0.3773311080473115,
"grad_norm": 0.8174604177474976,
"learning_rate": 3.672957995670921e-05,
"loss": 0.0152,
"step": 65000
},
{
"epoch": 0.3779116174443074,
"grad_norm": 2.4207687377929688,
"learning_rate": 3.668717584367401e-05,
"loss": 0.0135,
"step": 65100
},
{
"epoch": 0.37849212684130323,
"grad_norm": 0.7421271204948425,
"learning_rate": 3.664472865988441e-05,
"loss": 0.0171,
"step": 65200
},
{
"epoch": 0.3790726362382991,
"grad_norm": 0.5504394173622131,
"learning_rate": 3.660223856177102e-05,
"loss": 0.0171,
"step": 65300
},
{
"epoch": 0.37965314563529495,
"grad_norm": 0.41542255878448486,
"learning_rate": 3.655970570592262e-05,
"loss": 0.0118,
"step": 65400
},
{
"epoch": 0.38023365503229084,
"grad_norm": 0.0,
"learning_rate": 3.651713024908556e-05,
"loss": 0.012,
"step": 65500
},
{
"epoch": 0.3808141644292867,
"grad_norm": 0.054853569716215134,
"learning_rate": 3.64745123481632e-05,
"loss": 0.0201,
"step": 65600
},
{
"epoch": 0.38139467382628256,
"grad_norm": 1.7980788946151733,
"learning_rate": 3.643185216021531e-05,
"loss": 0.0114,
"step": 65700
},
{
"epoch": 0.38197518322327845,
"grad_norm": 0.3637460470199585,
"learning_rate": 3.6389149842457486e-05,
"loss": 0.0158,
"step": 65800
},
{
"epoch": 0.3825556926202743,
"grad_norm": 1.9792327880859375,
"learning_rate": 3.634640555226062e-05,
"loss": 0.0156,
"step": 65900
},
{
"epoch": 0.38313620201727017,
"grad_norm": 0.38713720440864563,
"learning_rate": 3.630361944715024e-05,
"loss": 0.0162,
"step": 66000
},
{
"epoch": 0.383716711414266,
"grad_norm": 0.6612209677696228,
"learning_rate": 3.626079168480601e-05,
"loss": 0.0147,
"step": 66100
},
{
"epoch": 0.3842972208112619,
"grad_norm": 1.3527421951293945,
"learning_rate": 3.621792242306111e-05,
"loss": 0.0168,
"step": 66200
},
{
"epoch": 0.3848777302082578,
"grad_norm": 3.078646421432495,
"learning_rate": 3.617501181990164e-05,
"loss": 0.015,
"step": 66300
},
{
"epoch": 0.3854582396052536,
"grad_norm": 2.9273505210876465,
"learning_rate": 3.613206003346606e-05,
"loss": 0.0182,
"step": 66400
},
{
"epoch": 0.3860387490022495,
"grad_norm": 0.700567364692688,
"learning_rate": 3.608906722204463e-05,
"loss": 0.0138,
"step": 66500
},
{
"epoch": 0.3866192583992453,
"grad_norm": 0.4075513482093811,
"learning_rate": 3.6046033544078736e-05,
"loss": 0.0151,
"step": 66600
},
{
"epoch": 0.3871997677962412,
"grad_norm": 1.424938678741455,
"learning_rate": 3.6002959158160454e-05,
"loss": 0.0141,
"step": 66700
},
{
"epoch": 0.38778027719323704,
"grad_norm": 1.095062255859375,
"learning_rate": 3.595984422303182e-05,
"loss": 0.0177,
"step": 66800
},
{
"epoch": 0.38836078659023293,
"grad_norm": 1.1501364707946777,
"learning_rate": 3.591668889758432e-05,
"loss": 0.0128,
"step": 66900
},
{
"epoch": 0.3889412959872288,
"grad_norm": 0.35930997133255005,
"learning_rate": 3.587349334085831e-05,
"loss": 0.0163,
"step": 67000
},
{
"epoch": 0.38952180538422465,
"grad_norm": 0.22883236408233643,
"learning_rate": 3.5830257712042374e-05,
"loss": 0.0144,
"step": 67100
},
{
"epoch": 0.39010231478122054,
"grad_norm": 0.0,
"learning_rate": 3.578698217047281e-05,
"loss": 0.0146,
"step": 67200
},
{
"epoch": 0.39068282417821637,
"grad_norm": 0.4836377501487732,
"learning_rate": 3.574366687563298e-05,
"loss": 0.0155,
"step": 67300
},
{
"epoch": 0.39126333357521226,
"grad_norm": 1.9902615547180176,
"learning_rate": 3.570031198715277e-05,
"loss": 0.0189,
"step": 67400
},
{
"epoch": 0.3918438429722081,
"grad_norm": 0.6981222629547119,
"learning_rate": 3.565691766480795e-05,
"loss": 0.0167,
"step": 67500
},
{
"epoch": 0.392424352369204,
"grad_norm": 1.2047152519226074,
"learning_rate": 3.561348406851966e-05,
"loss": 0.0158,
"step": 67600
},
{
"epoch": 0.39300486176619986,
"grad_norm": 1.062116026878357,
"learning_rate": 3.557001135835375e-05,
"loss": 0.0156,
"step": 67700
},
{
"epoch": 0.3935853711631957,
"grad_norm": 3.757115602493286,
"learning_rate": 3.55264996945202e-05,
"loss": 0.0157,
"step": 67800
},
{
"epoch": 0.3941658805601916,
"grad_norm": 2.9534924030303955,
"learning_rate": 3.548294923737258e-05,
"loss": 0.0157,
"step": 67900
},
{
"epoch": 0.3947463899571874,
"grad_norm": 0.46122029423713684,
"learning_rate": 3.5439360147407404e-05,
"loss": 0.016,
"step": 68000
},
{
"epoch": 0.3953268993541833,
"grad_norm": 2.8722681999206543,
"learning_rate": 3.5395732585263566e-05,
"loss": 0.0144,
"step": 68100
},
{
"epoch": 0.39590740875117914,
"grad_norm": 0.988606870174408,
"learning_rate": 3.535206671172175e-05,
"loss": 0.014,
"step": 68200
},
{
"epoch": 0.396487918148175,
"grad_norm": 0.39610621333122253,
"learning_rate": 3.530836268770379e-05,
"loss": 0.0141,
"step": 68300
},
{
"epoch": 0.3970684275451709,
"grad_norm": 3.2667903900146484,
"learning_rate": 3.526462067427218e-05,
"loss": 0.0212,
"step": 68400
},
{
"epoch": 0.39764893694216674,
"grad_norm": 0.9565812945365906,
"learning_rate": 3.522084083262935e-05,
"loss": 0.0145,
"step": 68500
},
{
"epoch": 0.39822944633916263,
"grad_norm": 1.0002511739730835,
"learning_rate": 3.5177023324117206e-05,
"loss": 0.0158,
"step": 68600
},
{
"epoch": 0.39880995573615846,
"grad_norm": 0.8633850812911987,
"learning_rate": 3.51331683102164e-05,
"loss": 0.0192,
"step": 68700
},
{
"epoch": 0.39939046513315435,
"grad_norm": 0.359651654958725,
"learning_rate": 3.508927595254585e-05,
"loss": 0.0198,
"step": 68800
},
{
"epoch": 0.3999709745301502,
"grad_norm": 1.0570274591445923,
"learning_rate": 3.504534641286209e-05,
"loss": 0.0163,
"step": 68900
},
{
"epoch": 0.40055148392714607,
"grad_norm": 0.0,
"learning_rate": 3.500137985305865e-05,
"loss": 0.0141,
"step": 69000
},
{
"epoch": 0.40113199332414196,
"grad_norm": 1.52181077003479,
"learning_rate": 3.495737643516552e-05,
"loss": 0.0145,
"step": 69100
},
{
"epoch": 0.4017125027211378,
"grad_norm": 0.6070308685302734,
"learning_rate": 3.491333632134852e-05,
"loss": 0.0179,
"step": 69200
},
{
"epoch": 0.4022930121181337,
"grad_norm": 0.19646623730659485,
"learning_rate": 3.486925967390871e-05,
"loss": 0.0139,
"step": 69300
},
{
"epoch": 0.4028735215151295,
"grad_norm": 0.0677868127822876,
"learning_rate": 3.482514665528176e-05,
"loss": 0.0186,
"step": 69400
},
{
"epoch": 0.4034540309121254,
"grad_norm": 9.331048965454102,
"learning_rate": 3.4780997428037424e-05,
"loss": 0.0139,
"step": 69500
},
{
"epoch": 0.4040345403091212,
"grad_norm": 1.8702892065048218,
"learning_rate": 3.473681215487884e-05,
"loss": 0.0162,
"step": 69600
},
{
"epoch": 0.4046150497061171,
"grad_norm": 0.36429017782211304,
"learning_rate": 3.4692590998642026e-05,
"loss": 0.0164,
"step": 69700
},
{
"epoch": 0.405195559103113,
"grad_norm": 2.0621962547302246,
"learning_rate": 3.464833412229523e-05,
"loss": 0.0125,
"step": 69800
},
{
"epoch": 0.40577606850010883,
"grad_norm": 0.6523299217224121,
"learning_rate": 3.460404168893834e-05,
"loss": 0.0171,
"step": 69900
},
{
"epoch": 0.4063565778971047,
"grad_norm": 0.20909562706947327,
"learning_rate": 3.455971386180229e-05,
"loss": 0.0179,
"step": 70000
}
],
"logging_steps": 100,
"max_steps": 172263,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1534812258533112e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}