TrustNet / trainer_state.json
Laugh1ng's picture
Upload folder using huggingface_hub
94cc0a4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.9949212136997,
"eval_steps": 500,
"global_step": 4795,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010418023180101576,
"grad_norm": 11.121468544006348,
"learning_rate": 2.0833333333333333e-07,
"loss": 1.0155,
"step": 10
},
{
"epoch": 0.020836046360203152,
"grad_norm": 10.899005889892578,
"learning_rate": 4.1666666666666667e-07,
"loss": 1.0247,
"step": 20
},
{
"epoch": 0.03125406954030473,
"grad_norm": 11.787379264831543,
"learning_rate": 6.25e-07,
"loss": 0.9569,
"step": 30
},
{
"epoch": 0.041672092720406305,
"grad_norm": 10.255136489868164,
"learning_rate": 8.333333333333333e-07,
"loss": 0.9531,
"step": 40
},
{
"epoch": 0.052090115900507876,
"grad_norm": 11.36585807800293,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.913,
"step": 50
},
{
"epoch": 0.06250813908060945,
"grad_norm": 8.4786958694458,
"learning_rate": 1.25e-06,
"loss": 0.9508,
"step": 60
},
{
"epoch": 0.07292616226071102,
"grad_norm": 13.468831062316895,
"learning_rate": 1.4583333333333335e-06,
"loss": 0.9788,
"step": 70
},
{
"epoch": 0.08334418544081261,
"grad_norm": 13.184584617614746,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.9664,
"step": 80
},
{
"epoch": 0.09376220862091418,
"grad_norm": 8.695037841796875,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.9333,
"step": 90
},
{
"epoch": 0.10418023180101575,
"grad_norm": 10.278554916381836,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.8748,
"step": 100
},
{
"epoch": 0.11459825498111734,
"grad_norm": 10.934048652648926,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.8221,
"step": 110
},
{
"epoch": 0.1250162781612189,
"grad_norm": 6.949201583862305,
"learning_rate": 2.5e-06,
"loss": 0.7869,
"step": 120
},
{
"epoch": 0.13543430134132048,
"grad_norm": 9.884735107421875,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.7208,
"step": 130
},
{
"epoch": 0.14585232452142205,
"grad_norm": 8.795683860778809,
"learning_rate": 2.916666666666667e-06,
"loss": 0.7022,
"step": 140
},
{
"epoch": 0.15627034770152365,
"grad_norm": 7.725055694580078,
"learning_rate": 3.125e-06,
"loss": 0.6499,
"step": 150
},
{
"epoch": 0.16668837088162522,
"grad_norm": 5.934634208679199,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5223,
"step": 160
},
{
"epoch": 0.1771063940617268,
"grad_norm": 5.791957378387451,
"learning_rate": 3.5416666666666673e-06,
"loss": 0.426,
"step": 170
},
{
"epoch": 0.18752441724182836,
"grad_norm": 4.230670928955078,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.2739,
"step": 180
},
{
"epoch": 0.19794244042192993,
"grad_norm": 3.5117030143737793,
"learning_rate": 3.958333333333333e-06,
"loss": 0.2038,
"step": 190
},
{
"epoch": 0.2083604636020315,
"grad_norm": 1.7673850059509277,
"learning_rate": 4.166666666666667e-06,
"loss": 0.1047,
"step": 200
},
{
"epoch": 0.2187784867821331,
"grad_norm": 2.710550308227539,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0695,
"step": 210
},
{
"epoch": 0.22919650996223467,
"grad_norm": 1.8703516721725464,
"learning_rate": 4.583333333333333e-06,
"loss": 0.0448,
"step": 220
},
{
"epoch": 0.23961453314233624,
"grad_norm": 2.233571767807007,
"learning_rate": 4.791666666666668e-06,
"loss": 0.0482,
"step": 230
},
{
"epoch": 0.2500325563224378,
"grad_norm": 2.937638282775879,
"learning_rate": 5e-06,
"loss": 0.0464,
"step": 240
},
{
"epoch": 0.2604505795025394,
"grad_norm": 3.5171966552734375,
"learning_rate": 5.208333333333334e-06,
"loss": 0.026,
"step": 250
},
{
"epoch": 0.27086860268264096,
"grad_norm": 3.3302793502807617,
"learning_rate": 5.416666666666667e-06,
"loss": 0.0131,
"step": 260
},
{
"epoch": 0.28128662586274256,
"grad_norm": 4.264321804046631,
"learning_rate": 5.625e-06,
"loss": 0.0267,
"step": 270
},
{
"epoch": 0.2917046490428441,
"grad_norm": 1.3102387189865112,
"learning_rate": 5.833333333333334e-06,
"loss": 0.0192,
"step": 280
},
{
"epoch": 0.3021226722229457,
"grad_norm": 2.629667043685913,
"learning_rate": 6.041666666666667e-06,
"loss": 0.0221,
"step": 290
},
{
"epoch": 0.3125406954030473,
"grad_norm": 5.036803245544434,
"learning_rate": 6.25e-06,
"loss": 0.043,
"step": 300
},
{
"epoch": 0.32295871858314884,
"grad_norm": 1.3269509077072144,
"learning_rate": 6.458333333333334e-06,
"loss": 0.049,
"step": 310
},
{
"epoch": 0.33337674176325044,
"grad_norm": 3.0598790645599365,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0131,
"step": 320
},
{
"epoch": 0.343794764943352,
"grad_norm": 0.758718729019165,
"learning_rate": 6.875e-06,
"loss": 0.0244,
"step": 330
},
{
"epoch": 0.3542127881234536,
"grad_norm": 1.2521103620529175,
"learning_rate": 7.083333333333335e-06,
"loss": 0.0149,
"step": 340
},
{
"epoch": 0.3646308113035551,
"grad_norm": 0.35801389813423157,
"learning_rate": 7.291666666666667e-06,
"loss": 0.0086,
"step": 350
},
{
"epoch": 0.3750488344836567,
"grad_norm": 1.5886716842651367,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0158,
"step": 360
},
{
"epoch": 0.3854668576637583,
"grad_norm": 4.816068649291992,
"learning_rate": 7.708333333333334e-06,
"loss": 0.0331,
"step": 370
},
{
"epoch": 0.39588488084385987,
"grad_norm": 1.0439949035644531,
"learning_rate": 7.916666666666667e-06,
"loss": 0.0109,
"step": 380
},
{
"epoch": 0.40630290402396146,
"grad_norm": 0.6839566826820374,
"learning_rate": 8.125000000000001e-06,
"loss": 0.0116,
"step": 390
},
{
"epoch": 0.416720927204063,
"grad_norm": 2.3680825233459473,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0086,
"step": 400
},
{
"epoch": 0.4271389503841646,
"grad_norm": 2.888700485229492,
"learning_rate": 8.541666666666666e-06,
"loss": 0.0257,
"step": 410
},
{
"epoch": 0.4375569735642662,
"grad_norm": 1.5104542970657349,
"learning_rate": 8.750000000000001e-06,
"loss": 0.0211,
"step": 420
},
{
"epoch": 0.44797499674436775,
"grad_norm": 3.3087830543518066,
"learning_rate": 8.958333333333334e-06,
"loss": 0.0195,
"step": 430
},
{
"epoch": 0.45839301992446935,
"grad_norm": 0.7269104719161987,
"learning_rate": 9.166666666666666e-06,
"loss": 0.0042,
"step": 440
},
{
"epoch": 0.4688110431045709,
"grad_norm": 0.03662832826375961,
"learning_rate": 9.375000000000001e-06,
"loss": 0.0225,
"step": 450
},
{
"epoch": 0.4792290662846725,
"grad_norm": 0.027355490252375603,
"learning_rate": 9.583333333333335e-06,
"loss": 0.0207,
"step": 460
},
{
"epoch": 0.48964708946477403,
"grad_norm": 1.296669602394104,
"learning_rate": 9.791666666666666e-06,
"loss": 0.0184,
"step": 470
},
{
"epoch": 0.5000651126448756,
"grad_norm": 0.7718325853347778,
"learning_rate": 1e-05,
"loss": 0.0082,
"step": 480
},
{
"epoch": 0.5104831358249772,
"grad_norm": 0.6721673607826233,
"learning_rate": 9.999867481584167e-06,
"loss": 0.0201,
"step": 490
},
{
"epoch": 0.5209011590050788,
"grad_norm": 2.800171136856079,
"learning_rate": 9.99946993336112e-06,
"loss": 0.0118,
"step": 500
},
{
"epoch": 0.5313191821851804,
"grad_norm": 0.13001485168933868,
"learning_rate": 9.998807376403843e-06,
"loss": 0.0028,
"step": 510
},
{
"epoch": 0.5417372053652819,
"grad_norm": 0.11868428438901901,
"learning_rate": 9.997879845832736e-06,
"loss": 0.0248,
"step": 520
},
{
"epoch": 0.5521552285453835,
"grad_norm": 0.1077343001961708,
"learning_rate": 9.996687390813751e-06,
"loss": 0.0026,
"step": 530
},
{
"epoch": 0.5625732517254851,
"grad_norm": 0.004652015864849091,
"learning_rate": 9.995230074555788e-06,
"loss": 0.0216,
"step": 540
},
{
"epoch": 0.5729912749055867,
"grad_norm": 2.4979023933410645,
"learning_rate": 9.993507974307346e-06,
"loss": 0.0169,
"step": 550
},
{
"epoch": 0.5834092980856882,
"grad_norm": 2.996638059616089,
"learning_rate": 9.991521181352419e-06,
"loss": 0.0111,
"step": 560
},
{
"epoch": 0.5938273212657899,
"grad_norm": 1.996310830116272,
"learning_rate": 9.989269801005675e-06,
"loss": 0.0115,
"step": 570
},
{
"epoch": 0.6042453444458914,
"grad_norm": 0.1705595701932907,
"learning_rate": 9.986753952606851e-06,
"loss": 0.0072,
"step": 580
},
{
"epoch": 0.6146633676259929,
"grad_norm": 1.5530083179473877,
"learning_rate": 9.983973769514448e-06,
"loss": 0.0056,
"step": 590
},
{
"epoch": 0.6250813908060946,
"grad_norm": 0.45140549540519714,
"learning_rate": 9.98092939909865e-06,
"loss": 0.0153,
"step": 600
},
{
"epoch": 0.6354994139861961,
"grad_norm": 0.16771870851516724,
"learning_rate": 9.977621002733512e-06,
"loss": 0.0186,
"step": 610
},
{
"epoch": 0.6459174371662977,
"grad_norm": 1.3226218223571777,
"learning_rate": 9.974048755788416e-06,
"loss": 0.0042,
"step": 620
},
{
"epoch": 0.6563354603463992,
"grad_norm": 0.0040002502501010895,
"learning_rate": 9.970212847618761e-06,
"loss": 0.0136,
"step": 630
},
{
"epoch": 0.6667534835265009,
"grad_norm": 0.26981475949287415,
"learning_rate": 9.96611348155594e-06,
"loss": 0.0068,
"step": 640
},
{
"epoch": 0.6771715067066024,
"grad_norm": 0.12803907692432404,
"learning_rate": 9.961750874896548e-06,
"loss": 0.0075,
"step": 650
},
{
"epoch": 0.687589529886704,
"grad_norm": 0.15524764358997345,
"learning_rate": 9.957125258890877e-06,
"loss": 0.0154,
"step": 660
},
{
"epoch": 0.6980075530668056,
"grad_norm": 1.0714681148529053,
"learning_rate": 9.952236878730648e-06,
"loss": 0.008,
"step": 670
},
{
"epoch": 0.7084255762469072,
"grad_norm": 5.858039379119873,
"learning_rate": 9.947085993536019e-06,
"loss": 0.0293,
"step": 680
},
{
"epoch": 0.7188435994270087,
"grad_norm": 0.6457405090332031,
"learning_rate": 9.941672876341848e-06,
"loss": 0.0225,
"step": 690
},
{
"epoch": 0.7292616226071102,
"grad_norm": 0.08024393022060394,
"learning_rate": 9.935997814083221e-06,
"loss": 0.003,
"step": 700
},
{
"epoch": 0.7396796457872119,
"grad_norm": 2.8071186542510986,
"learning_rate": 9.930061107580245e-06,
"loss": 0.0134,
"step": 710
},
{
"epoch": 0.7500976689673134,
"grad_norm": 1.7718358039855957,
"learning_rate": 9.923863071522092e-06,
"loss": 0.0228,
"step": 720
},
{
"epoch": 0.760515692147415,
"grad_norm": 0.015169305726885796,
"learning_rate": 9.917404034450333e-06,
"loss": 0.0066,
"step": 730
},
{
"epoch": 0.7709337153275166,
"grad_norm": 0.020366037264466286,
"learning_rate": 9.91068433874151e-06,
"loss": 0.0034,
"step": 740
},
{
"epoch": 0.7813517385076182,
"grad_norm": 0.4552544951438904,
"learning_rate": 9.903704340589e-06,
"loss": 0.0172,
"step": 750
},
{
"epoch": 0.7917697616877197,
"grad_norm": 0.586904764175415,
"learning_rate": 9.896464409984115e-06,
"loss": 0.0158,
"step": 760
},
{
"epoch": 0.8021877848678214,
"grad_norm": 0.003931673243641853,
"learning_rate": 9.888964930696514e-06,
"loss": 0.0118,
"step": 770
},
{
"epoch": 0.8126058080479229,
"grad_norm": 0.14547383785247803,
"learning_rate": 9.881206300253838e-06,
"loss": 0.0091,
"step": 780
},
{
"epoch": 0.8230238312280245,
"grad_norm": 3.7878336906433105,
"learning_rate": 9.87318892992066e-06,
"loss": 0.0195,
"step": 790
},
{
"epoch": 0.833441854408126,
"grad_norm": 0.4627358913421631,
"learning_rate": 9.864913244676661e-06,
"loss": 0.0072,
"step": 800
},
{
"epoch": 0.8438598775882277,
"grad_norm": 4.011754035949707,
"learning_rate": 9.856379683194122e-06,
"loss": 0.0102,
"step": 810
},
{
"epoch": 0.8542779007683292,
"grad_norm": 0.15841282904148102,
"learning_rate": 9.847588697814662e-06,
"loss": 0.0037,
"step": 820
},
{
"epoch": 0.8646959239484308,
"grad_norm": 0.011311404407024384,
"learning_rate": 9.838540754525266e-06,
"loss": 0.0057,
"step": 830
},
{
"epoch": 0.8751139471285324,
"grad_norm": 2.975158929824829,
"learning_rate": 9.829236332933573e-06,
"loss": 0.0094,
"step": 840
},
{
"epoch": 0.885531970308634,
"grad_norm": 0.0023622314911335707,
"learning_rate": 9.819675926242473e-06,
"loss": 0.0051,
"step": 850
},
{
"epoch": 0.8959499934887355,
"grad_norm": 1.6916306018829346,
"learning_rate": 9.809860041223945e-06,
"loss": 0.0179,
"step": 860
},
{
"epoch": 0.906368016668837,
"grad_norm": 0.002486775629222393,
"learning_rate": 9.799789198192197e-06,
"loss": 0.0257,
"step": 870
},
{
"epoch": 0.9167860398489387,
"grad_norm": 0.03367191180586815,
"learning_rate": 9.789463930976098e-06,
"loss": 0.0044,
"step": 880
},
{
"epoch": 0.9272040630290402,
"grad_norm": 3.230694055557251,
"learning_rate": 9.77888478689087e-06,
"loss": 0.0096,
"step": 890
},
{
"epoch": 0.9376220862091418,
"grad_norm": 0.7497555017471313,
"learning_rate": 9.76805232670908e-06,
"loss": 0.0132,
"step": 900
},
{
"epoch": 0.9480401093892434,
"grad_norm": 0.5059885382652283,
"learning_rate": 9.756967124630911e-06,
"loss": 0.0021,
"step": 910
},
{
"epoch": 0.958458132569345,
"grad_norm": 0.031383853405714035,
"learning_rate": 9.745629768253735e-06,
"loss": 0.0027,
"step": 920
},
{
"epoch": 0.9688761557494465,
"grad_norm": 0.2179183065891266,
"learning_rate": 9.734040858540947e-06,
"loss": 0.0081,
"step": 930
},
{
"epoch": 0.9792941789295481,
"grad_norm": 0.02150609716773033,
"learning_rate": 9.722201009790135e-06,
"loss": 0.0013,
"step": 940
},
{
"epoch": 0.9897122021096497,
"grad_norm": 0.7268407344818115,
"learning_rate": 9.710110849600498e-06,
"loss": 0.0154,
"step": 950
},
{
"epoch": 1.0,
"grad_norm": 4.076342582702637,
"learning_rate": 9.697771018839586e-06,
"loss": 0.0182,
"step": 960
},
{
"epoch": 1.0104180231801017,
"grad_norm": 0.0026843964587897062,
"learning_rate": 9.685182171609328e-06,
"loss": 0.0078,
"step": 970
},
{
"epoch": 1.020836046360203,
"grad_norm": 1.928469181060791,
"learning_rate": 9.672344975211361e-06,
"loss": 0.0026,
"step": 980
},
{
"epoch": 1.0312540695403047,
"grad_norm": 0.19345267117023468,
"learning_rate": 9.659260110111659e-06,
"loss": 0.0013,
"step": 990
},
{
"epoch": 1.0416720927204064,
"grad_norm": 0.10904184728860855,
"learning_rate": 9.645928269904459e-06,
"loss": 0.0006,
"step": 1000
},
{
"epoch": 1.0520901159005078,
"grad_norm": 0.0044245533645153046,
"learning_rate": 9.632350161275496e-06,
"loss": 0.0078,
"step": 1010
},
{
"epoch": 1.0625081390806095,
"grad_norm": 0.052690137177705765,
"learning_rate": 9.618526503964552e-06,
"loss": 0.0033,
"step": 1020
},
{
"epoch": 1.072926162260711,
"grad_norm": 8.081074338406324e-05,
"learning_rate": 9.604458030727292e-06,
"loss": 0.0041,
"step": 1030
},
{
"epoch": 1.0833441854408126,
"grad_norm": 0.9879348278045654,
"learning_rate": 9.590145487296431e-06,
"loss": 0.0015,
"step": 1040
},
{
"epoch": 1.0937622086209142,
"grad_norm": 0.1349502056837082,
"learning_rate": 9.575589632342201e-06,
"loss": 0.0003,
"step": 1050
},
{
"epoch": 1.1041802318010157,
"grad_norm": 0.0031191352754831314,
"learning_rate": 9.560791237432141e-06,
"loss": 0.0157,
"step": 1060
},
{
"epoch": 1.1145982549811173,
"grad_norm": 0.003794798394665122,
"learning_rate": 9.545751086990187e-06,
"loss": 0.0074,
"step": 1070
},
{
"epoch": 1.125016278161219,
"grad_norm": 0.04094767943024635,
"learning_rate": 9.530469978255105e-06,
"loss": 0.0001,
"step": 1080
},
{
"epoch": 1.1354343013413204,
"grad_norm": 1.0790066880872473e-05,
"learning_rate": 9.514948721238227e-06,
"loss": 0.0074,
"step": 1090
},
{
"epoch": 1.145852324521422,
"grad_norm": 3.5826034545898438,
"learning_rate": 9.499188138680504e-06,
"loss": 0.0108,
"step": 1100
},
{
"epoch": 1.1562703477015237,
"grad_norm": 0.39376917481422424,
"learning_rate": 9.48318906600891e-06,
"loss": 0.002,
"step": 1110
},
{
"epoch": 1.1666883708816251,
"grad_norm": 0.008306131698191166,
"learning_rate": 9.466952351292158e-06,
"loss": 0.0033,
"step": 1120
},
{
"epoch": 1.1771063940617268,
"grad_norm": 0.08543845266103745,
"learning_rate": 9.450478855195724e-06,
"loss": 0.0002,
"step": 1130
},
{
"epoch": 1.1875244172418284,
"grad_norm": 1.0472006797790527,
"learning_rate": 9.433769450936254e-06,
"loss": 0.0033,
"step": 1140
},
{
"epoch": 1.1979424404219299,
"grad_norm": 0.0014151977375149727,
"learning_rate": 9.416825024235262e-06,
"loss": 0.0005,
"step": 1150
},
{
"epoch": 1.2083604636020315,
"grad_norm": 3.7921109199523926,
"learning_rate": 9.399646473272181e-06,
"loss": 0.0062,
"step": 1160
},
{
"epoch": 1.2187784867821332,
"grad_norm": 0.0031763967126607895,
"learning_rate": 9.382234708636753e-06,
"loss": 0.0018,
"step": 1170
},
{
"epoch": 1.2291965099622346,
"grad_norm": 0.00694266939535737,
"learning_rate": 9.364590653280767e-06,
"loss": 0.0023,
"step": 1180
},
{
"epoch": 1.2396145331423363,
"grad_norm": 0.0004896274767816067,
"learning_rate": 9.346715242469128e-06,
"loss": 0.0065,
"step": 1190
},
{
"epoch": 1.250032556322438,
"grad_norm": 0.008612466044723988,
"learning_rate": 9.328609423730285e-06,
"loss": 0.0004,
"step": 1200
},
{
"epoch": 1.2604505795025394,
"grad_norm": 0.00032048547291196883,
"learning_rate": 9.310274156806006e-06,
"loss": 0.0122,
"step": 1210
},
{
"epoch": 1.270868602682641,
"grad_norm": 0.024016475304961205,
"learning_rate": 9.291710413600498e-06,
"loss": 0.0105,
"step": 1220
},
{
"epoch": 1.2812866258627427,
"grad_norm": 0.059294044971466064,
"learning_rate": 9.272919178128902e-06,
"loss": 0.0005,
"step": 1230
},
{
"epoch": 1.291704649042844,
"grad_norm": 0.0428699254989624,
"learning_rate": 9.253901446465116e-06,
"loss": 0.0032,
"step": 1240
},
{
"epoch": 1.3021226722229458,
"grad_norm": 0.1531277745962143,
"learning_rate": 9.234658226689015e-06,
"loss": 0.0003,
"step": 1250
},
{
"epoch": 1.3125406954030474,
"grad_norm": 2.18913197517395,
"learning_rate": 9.215190538832995e-06,
"loss": 0.0046,
"step": 1260
},
{
"epoch": 1.3229587185831488,
"grad_norm": 0.017851542681455612,
"learning_rate": 9.195499414827917e-06,
"loss": 0.0001,
"step": 1270
},
{
"epoch": 1.3333767417632505,
"grad_norm": 9.223073959350586,
"learning_rate": 9.175585898448408e-06,
"loss": 0.0267,
"step": 1280
},
{
"epoch": 1.343794764943352,
"grad_norm": 0.0561683289706707,
"learning_rate": 9.155451045257523e-06,
"loss": 0.0157,
"step": 1290
},
{
"epoch": 1.3542127881234536,
"grad_norm": 0.02189466543495655,
"learning_rate": 9.135095922550801e-06,
"loss": 0.002,
"step": 1300
},
{
"epoch": 1.364630811303555,
"grad_norm": 3.704211950302124,
"learning_rate": 9.114521609299691e-06,
"loss": 0.008,
"step": 1310
},
{
"epoch": 1.3750488344836567,
"grad_norm": 0.09731532633304596,
"learning_rate": 9.09372919609435e-06,
"loss": 0.0023,
"step": 1320
},
{
"epoch": 1.3854668576637583,
"grad_norm": 0.18077053129673004,
"learning_rate": 9.072719785085842e-06,
"loss": 0.013,
"step": 1330
},
{
"epoch": 1.3958848808438598,
"grad_norm": 0.026519620791077614,
"learning_rate": 9.051494489927714e-06,
"loss": 0.0098,
"step": 1340
},
{
"epoch": 1.4063029040239614,
"grad_norm": 3.807239294052124,
"learning_rate": 9.030054435716961e-06,
"loss": 0.0152,
"step": 1350
},
{
"epoch": 1.416720927204063,
"grad_norm": 1.5597429275512695,
"learning_rate": 9.008400758934392e-06,
"loss": 0.0014,
"step": 1360
},
{
"epoch": 1.4271389503841645,
"grad_norm": 0.00023376860190182924,
"learning_rate": 8.986534607384383e-06,
"loss": 0.0089,
"step": 1370
},
{
"epoch": 1.4375569735642661,
"grad_norm": 0.13520453870296478,
"learning_rate": 8.96445714013404e-06,
"loss": 0.0008,
"step": 1380
},
{
"epoch": 1.4479749967443678,
"grad_norm": 3.3357255458831787,
"learning_rate": 8.942169527451756e-06,
"loss": 0.0177,
"step": 1390
},
{
"epoch": 1.4583930199244692,
"grad_norm": 0.13861112296581268,
"learning_rate": 8.919672950745185e-06,
"loss": 0.0177,
"step": 1400
},
{
"epoch": 1.468811043104571,
"grad_norm": 0.0005054327775724232,
"learning_rate": 8.896968602498605e-06,
"loss": 0.0087,
"step": 1410
},
{
"epoch": 1.4792290662846725,
"grad_norm": 0.188633993268013,
"learning_rate": 8.874057686209727e-06,
"loss": 0.0023,
"step": 1420
},
{
"epoch": 1.489647089464774,
"grad_norm": 4.461554527282715,
"learning_rate": 8.850941416325876e-06,
"loss": 0.01,
"step": 1430
},
{
"epoch": 1.5000651126448756,
"grad_norm": 0.0012838880065828562,
"learning_rate": 8.827621018179644e-06,
"loss": 0.0129,
"step": 1440
},
{
"epoch": 1.5104831358249773,
"grad_norm": 0.40561285614967346,
"learning_rate": 8.804097727923916e-06,
"loss": 0.0049,
"step": 1450
},
{
"epoch": 1.5209011590050787,
"grad_norm": 0.038120679557323456,
"learning_rate": 8.780372792466356e-06,
"loss": 0.0104,
"step": 1460
},
{
"epoch": 1.5313191821851804,
"grad_norm": 2.760854959487915,
"learning_rate": 8.75644746940331e-06,
"loss": 0.0032,
"step": 1470
},
{
"epoch": 1.541737205365282,
"grad_norm": 0.5340994596481323,
"learning_rate": 8.732323026953141e-06,
"loss": 0.008,
"step": 1480
},
{
"epoch": 1.5521552285453835,
"grad_norm": 2.0399329662323,
"learning_rate": 8.708000743889009e-06,
"loss": 0.0052,
"step": 1490
},
{
"epoch": 1.5625732517254851,
"grad_norm": 2.4984633922576904,
"learning_rate": 8.68348190947108e-06,
"loss": 0.0151,
"step": 1500
},
{
"epoch": 1.5729912749055868,
"grad_norm": 0.07911086082458496,
"learning_rate": 8.658767823378198e-06,
"loss": 0.0012,
"step": 1510
},
{
"epoch": 1.5834092980856882,
"grad_norm": 0.12755373120307922,
"learning_rate": 8.63385979563897e-06,
"loss": 0.0039,
"step": 1520
},
{
"epoch": 1.5938273212657899,
"grad_norm": 0.03218844160437584,
"learning_rate": 8.608759146562352e-06,
"loss": 0.0013,
"step": 1530
},
{
"epoch": 1.6042453444458915,
"grad_norm": 0.00013593579933512956,
"learning_rate": 8.583467206667643e-06,
"loss": 0.0007,
"step": 1540
},
{
"epoch": 1.614663367625993,
"grad_norm": 0.00014157673285808414,
"learning_rate": 8.557985316613967e-06,
"loss": 0.0005,
"step": 1550
},
{
"epoch": 1.6250813908060946,
"grad_norm": 0.001298804534599185,
"learning_rate": 8.532314827129207e-06,
"loss": 0.0003,
"step": 1560
},
{
"epoch": 1.6354994139861962,
"grad_norm": 0.00196313438937068,
"learning_rate": 8.5064570989384e-06,
"loss": 0.0042,
"step": 1570
},
{
"epoch": 1.6459174371662977,
"grad_norm": 0.05907834693789482,
"learning_rate": 8.480413502691618e-06,
"loss": 0.0042,
"step": 1580
},
{
"epoch": 1.6563354603463991,
"grad_norm": 2.9666049480438232,
"learning_rate": 8.454185418891305e-06,
"loss": 0.0009,
"step": 1590
},
{
"epoch": 1.666753483526501,
"grad_norm": 0.04065735638141632,
"learning_rate": 8.427774237819113e-06,
"loss": 0.0071,
"step": 1600
},
{
"epoch": 1.6771715067066024,
"grad_norm": 2.4857747554779053,
"learning_rate": 8.401181359462187e-06,
"loss": 0.0078,
"step": 1610
},
{
"epoch": 1.6875895298867039,
"grad_norm": 0.0004068031266797334,
"learning_rate": 8.374408193438977e-06,
"loss": 0.0003,
"step": 1620
},
{
"epoch": 1.6980075530668057,
"grad_norm": 0.000535597384441644,
"learning_rate": 8.347456158924496e-06,
"loss": 0.0054,
"step": 1630
},
{
"epoch": 1.7084255762469072,
"grad_norm": 0.06767778843641281,
"learning_rate": 8.320326684575116e-06,
"loss": 0.0016,
"step": 1640
},
{
"epoch": 1.7188435994270086,
"grad_norm": 0.00011920960969291627,
"learning_rate": 8.29302120845282e-06,
"loss": 0.0008,
"step": 1650
},
{
"epoch": 1.7292616226071102,
"grad_norm": 0.1632106900215149,
"learning_rate": 8.265541177948986e-06,
"loss": 0.0011,
"step": 1660
},
{
"epoch": 1.739679645787212,
"grad_norm": 0.03428950533270836,
"learning_rate": 8.237888049707656e-06,
"loss": 0.0087,
"step": 1670
},
{
"epoch": 1.7500976689673133,
"grad_norm": 0.0007083571399562061,
"learning_rate": 8.210063289548328e-06,
"loss": 0.0025,
"step": 1680
},
{
"epoch": 1.760515692147415,
"grad_norm": 1.6527312254766002e-05,
"learning_rate": 8.182068372388259e-06,
"loss": 0.0044,
"step": 1690
},
{
"epoch": 1.7709337153275166,
"grad_norm": 0.0002037805679719895,
"learning_rate": 8.153904782164275e-06,
"loss": 0.0011,
"step": 1700
},
{
"epoch": 1.781351738507618,
"grad_norm": 0.0011550731724128127,
"learning_rate": 8.125574011754125e-06,
"loss": 0.0028,
"step": 1710
},
{
"epoch": 1.7917697616877197,
"grad_norm": 0.058955416083335876,
"learning_rate": 8.097077562897332e-06,
"loss": 0.0003,
"step": 1720
},
{
"epoch": 1.8021877848678214,
"grad_norm": 0.6716377139091492,
"learning_rate": 8.0684169461156e-06,
"loss": 0.0082,
"step": 1730
},
{
"epoch": 1.8126058080479228,
"grad_norm": 0.0014718943275511265,
"learning_rate": 8.03959368063274e-06,
"loss": 0.0069,
"step": 1740
},
{
"epoch": 1.8230238312280245,
"grad_norm": 0.00023426729603670537,
"learning_rate": 8.01060929429415e-06,
"loss": 0.0107,
"step": 1750
},
{
"epoch": 1.8334418544081261,
"grad_norm": 0.002175110625103116,
"learning_rate": 7.981465323485808e-06,
"loss": 0.0082,
"step": 1760
},
{
"epoch": 1.8438598775882276,
"grad_norm": 0.719397783279419,
"learning_rate": 7.952163313052856e-06,
"loss": 0.0009,
"step": 1770
},
{
"epoch": 1.8542779007683292,
"grad_norm": 0.976002037525177,
"learning_rate": 7.922704816217693e-06,
"loss": 0.0082,
"step": 1780
},
{
"epoch": 1.8646959239484309,
"grad_norm": 3.530712604522705,
"learning_rate": 7.893091394497651e-06,
"loss": 0.0239,
"step": 1790
},
{
"epoch": 1.8751139471285323,
"grad_norm": 1.4467989206314087,
"learning_rate": 7.863324617622227e-06,
"loss": 0.0019,
"step": 1800
},
{
"epoch": 1.885531970308634,
"grad_norm": 0.0016262925928458571,
"learning_rate": 7.833406063449866e-06,
"loss": 0.0004,
"step": 1810
},
{
"epoch": 1.8959499934887356,
"grad_norm": 0.007764583453536034,
"learning_rate": 7.803337317884328e-06,
"loss": 0.0025,
"step": 1820
},
{
"epoch": 1.906368016668837,
"grad_norm": 4.686193278757855e-06,
"learning_rate": 7.773119974790626e-06,
"loss": 0.0008,
"step": 1830
},
{
"epoch": 1.9167860398489387,
"grad_norm": 0.02094154804944992,
"learning_rate": 7.742755635910532e-06,
"loss": 0.0,
"step": 1840
},
{
"epoch": 1.9272040630290403,
"grad_norm": 0.026232223957777023,
"learning_rate": 7.712245910777684e-06,
"loss": 0.0004,
"step": 1850
},
{
"epoch": 1.9376220862091418,
"grad_norm": 0.07305476814508438,
"learning_rate": 7.68159241663226e-06,
"loss": 0.0108,
"step": 1860
},
{
"epoch": 1.9480401093892434,
"grad_norm": 0.11340963840484619,
"learning_rate": 7.650796778335248e-06,
"loss": 0.0006,
"step": 1870
},
{
"epoch": 1.958458132569345,
"grad_norm": 2.027043592534028e-05,
"learning_rate": 7.619860628282333e-06,
"loss": 0.0178,
"step": 1880
},
{
"epoch": 1.9688761557494465,
"grad_norm": 0.14668358862400055,
"learning_rate": 7.5887856063173525e-06,
"loss": 0.0031,
"step": 1890
},
{
"epoch": 1.979294178929548,
"grad_norm": 0.056070610880851746,
"learning_rate": 7.5575733596453805e-06,
"loss": 0.0006,
"step": 1900
},
{
"epoch": 1.9897122021096498,
"grad_norm": 0.01226119790226221,
"learning_rate": 7.526225542745408e-06,
"loss": 0.0086,
"step": 1910
},
{
"epoch": 2.0,
"grad_norm": 0.026876511052250862,
"learning_rate": 7.494743817282651e-06,
"loss": 0.0021,
"step": 1920
},
{
"epoch": 2.0104180231801014,
"grad_norm": 1.010679006576538,
"learning_rate": 7.463129852020465e-06,
"loss": 0.001,
"step": 1930
},
{
"epoch": 2.0208360463602033,
"grad_norm": 0.05851694196462631,
"learning_rate": 7.431385322731885e-06,
"loss": 0.0002,
"step": 1940
},
{
"epoch": 2.0312540695403047,
"grad_norm": 0.0013823857298120856,
"learning_rate": 7.399511912110807e-06,
"loss": 0.0003,
"step": 1950
},
{
"epoch": 2.041672092720406,
"grad_norm": 0.0012095741694793105,
"learning_rate": 7.367511309682782e-06,
"loss": 0.0009,
"step": 1960
},
{
"epoch": 2.052090115900508,
"grad_norm": 5.620197296142578,
"learning_rate": 7.335385211715467e-06,
"loss": 0.0064,
"step": 1970
},
{
"epoch": 2.0625081390806095,
"grad_norm": 0.0062335277907550335,
"learning_rate": 7.303135321128705e-06,
"loss": 0.0002,
"step": 1980
},
{
"epoch": 2.072926162260711,
"grad_norm": 0.001201906125061214,
"learning_rate": 7.270763347404262e-06,
"loss": 0.0017,
"step": 1990
},
{
"epoch": 2.083344185440813,
"grad_norm": 0.016548430547118187,
"learning_rate": 7.238271006495206e-06,
"loss": 0.0015,
"step": 2000
},
{
"epoch": 2.0937622086209142,
"grad_norm": 0.0003294723283033818,
"learning_rate": 7.205660020734955e-06,
"loss": 0.0002,
"step": 2010
},
{
"epoch": 2.1041802318010157,
"grad_norm": 0.02431473508477211,
"learning_rate": 7.172932118745978e-06,
"loss": 0.0001,
"step": 2020
},
{
"epoch": 2.1145982549811175,
"grad_norm": 0.006785488221794367,
"learning_rate": 7.140089035348166e-06,
"loss": 0.0018,
"step": 2030
},
{
"epoch": 2.125016278161219,
"grad_norm": 0.06791669875383377,
"learning_rate": 7.107132511466872e-06,
"loss": 0.0005,
"step": 2040
},
{
"epoch": 2.1354343013413204,
"grad_norm": 0.0033805356360971928,
"learning_rate": 7.074064294040629e-06,
"loss": 0.0026,
"step": 2050
},
{
"epoch": 2.145852324521422,
"grad_norm": 0.0271758995950222,
"learning_rate": 7.040886135928554e-06,
"loss": 0.0004,
"step": 2060
},
{
"epoch": 2.1562703477015237,
"grad_norm": 0.07952375710010529,
"learning_rate": 7.007599795817426e-06,
"loss": 0.0,
"step": 2070
},
{
"epoch": 2.166688370881625,
"grad_norm": 0.12942220270633698,
"learning_rate": 6.9742070381284724e-06,
"loss": 0.0006,
"step": 2080
},
{
"epoch": 2.1771063940617266,
"grad_norm": 5.3233266953611746e-05,
"learning_rate": 6.94070963292383e-06,
"loss": 0.0001,
"step": 2090
},
{
"epoch": 2.1875244172418284,
"grad_norm": 1.8653046254257788e-06,
"learning_rate": 6.907109355812729e-06,
"loss": 0.0014,
"step": 2100
},
{
"epoch": 2.19794244042193,
"grad_norm": 0.2578833997249603,
"learning_rate": 6.873407987857367e-06,
"loss": 0.0005,
"step": 2110
},
{
"epoch": 2.2083604636020313,
"grad_norm": 4.3816239667648915e-06,
"learning_rate": 6.8396073154785e-06,
"loss": 0.0,
"step": 2120
},
{
"epoch": 2.218778486782133,
"grad_norm": 0.4164877235889435,
"learning_rate": 6.805709130360754e-06,
"loss": 0.0017,
"step": 2130
},
{
"epoch": 2.2291965099622346,
"grad_norm": 7.609223303006729e-06,
"learning_rate": 6.771715229357643e-06,
"loss": 0.0002,
"step": 2140
},
{
"epoch": 2.239614533142336,
"grad_norm": 0.17523327469825745,
"learning_rate": 6.73762741439633e-06,
"loss": 0.0018,
"step": 2150
},
{
"epoch": 2.250032556322438,
"grad_norm": 9.692726135253906,
"learning_rate": 6.7034474923821135e-06,
"loss": 0.0034,
"step": 2160
},
{
"epoch": 2.2604505795025394,
"grad_norm": 3.903836841345765e-05,
"learning_rate": 6.669177275102637e-06,
"loss": 0.0004,
"step": 2170
},
{
"epoch": 2.270868602682641,
"grad_norm": 0.26724299788475037,
"learning_rate": 6.634818579131865e-06,
"loss": 0.0007,
"step": 2180
},
{
"epoch": 2.2812866258627427,
"grad_norm": 0.002805814379826188,
"learning_rate": 6.6003732257337805e-06,
"loss": 0.0009,
"step": 2190
},
{
"epoch": 2.291704649042844,
"grad_norm": 0.00600125128403306,
"learning_rate": 6.565843040765849e-06,
"loss": 0.0003,
"step": 2200
},
{
"epoch": 2.3021226722229455,
"grad_norm": 0.07256772369146347,
"learning_rate": 6.531229854582235e-06,
"loss": 0.0002,
"step": 2210
},
{
"epoch": 2.3125406954030474,
"grad_norm": 0.05456492677330971,
"learning_rate": 6.496535501936781e-06,
"loss": 0.0,
"step": 2220
},
{
"epoch": 2.322958718583149,
"grad_norm": 0.002974656643345952,
"learning_rate": 6.461761821885742e-06,
"loss": 0.0008,
"step": 2230
},
{
"epoch": 2.3333767417632503,
"grad_norm": 0.32170239090919495,
"learning_rate": 6.42691065769032e-06,
"loss": 0.0002,
"step": 2240
},
{
"epoch": 2.343794764943352,
"grad_norm": 0.0577409565448761,
"learning_rate": 6.391983856718941e-06,
"loss": 0.0138,
"step": 2250
},
{
"epoch": 2.3542127881234536,
"grad_norm": 0.5012153387069702,
"learning_rate": 6.356983270349339e-06,
"loss": 0.0047,
"step": 2260
},
{
"epoch": 2.364630811303555,
"grad_norm": 0.0003676996275316924,
"learning_rate": 6.321910753870416e-06,
"loss": 0.0003,
"step": 2270
},
{
"epoch": 2.375048834483657,
"grad_norm": 0.14126235246658325,
"learning_rate": 6.286768166383905e-06,
"loss": 0.0001,
"step": 2280
},
{
"epoch": 2.3854668576637583,
"grad_norm": 0.0022616020869463682,
"learning_rate": 6.251557370705809e-06,
"loss": 0.0002,
"step": 2290
},
{
"epoch": 2.3958848808438598,
"grad_norm": 6.679229736328125,
"learning_rate": 6.216280233267681e-06,
"loss": 0.0135,
"step": 2300
},
{
"epoch": 2.4063029040239616,
"grad_norm": 0.006057079881429672,
"learning_rate": 6.180938624017663e-06,
"loss": 0.0067,
"step": 2310
},
{
"epoch": 2.416720927204063,
"grad_norm": 0.058568984270095825,
"learning_rate": 6.145534416321384e-06,
"loss": 0.0002,
"step": 2320
},
{
"epoch": 2.4271389503841645,
"grad_norm": 0.001173496013507247,
"learning_rate": 6.110069486862652e-06,
"loss": 0.0083,
"step": 2330
},
{
"epoch": 2.4375569735642664,
"grad_norm": 7.219270706176758,
"learning_rate": 6.074545715543975e-06,
"loss": 0.0173,
"step": 2340
},
{
"epoch": 2.447974996744368,
"grad_norm": 0.1928091198205948,
"learning_rate": 6.0389649853869116e-06,
"loss": 0.0001,
"step": 2350
},
{
"epoch": 2.4583930199244692,
"grad_norm": 0.2435365468263626,
"learning_rate": 6.00332918243226e-06,
"loss": 0.0003,
"step": 2360
},
{
"epoch": 2.468811043104571,
"grad_norm": 1.0856934785842896,
"learning_rate": 5.967640195640083e-06,
"loss": 0.0016,
"step": 2370
},
{
"epoch": 2.4792290662846725,
"grad_norm": 0.0007231564377434552,
"learning_rate": 5.931899916789576e-06,
"loss": 0.001,
"step": 2380
},
{
"epoch": 2.489647089464774,
"grad_norm": 0.0067249564453959465,
"learning_rate": 5.896110240378794e-06,
"loss": 0.001,
"step": 2390
},
{
"epoch": 2.500065112644876,
"grad_norm": 0.00024194463912863284,
"learning_rate": 5.860273063524227e-06,
"loss": 0.0019,
"step": 2400
},
{
"epoch": 2.5104831358249773,
"grad_norm": 0.011660280637443066,
"learning_rate": 5.824390285860233e-06,
"loss": 0.0,
"step": 2410
},
{
"epoch": 2.5209011590050787,
"grad_norm": 7.278788689291105e-05,
"learning_rate": 5.788463809438356e-06,
"loss": 0.0,
"step": 2420
},
{
"epoch": 2.5313191821851806,
"grad_norm": 0.0003818434779532254,
"learning_rate": 5.752495538626493e-06,
"loss": 0.0042,
"step": 2430
},
{
"epoch": 2.541737205365282,
"grad_norm": 4.750408172607422,
"learning_rate": 5.7164873800079475e-06,
"loss": 0.009,
"step": 2440
},
{
"epoch": 2.5521552285453835,
"grad_norm": 9.329643830824352e-07,
"learning_rate": 5.680441242280378e-06,
"loss": 0.0067,
"step": 2450
},
{
"epoch": 2.5625732517254853,
"grad_norm": 3.5650442441692576e-05,
"learning_rate": 5.6443590361546095e-06,
"loss": 0.0017,
"step": 2460
},
{
"epoch": 2.5729912749055868,
"grad_norm": 0.030909501016139984,
"learning_rate": 5.608242674253362e-06,
"loss": 0.0,
"step": 2470
},
{
"epoch": 2.583409298085688,
"grad_norm": 0.011211927980184555,
"learning_rate": 5.57209407100986e-06,
"loss": 0.0101,
"step": 2480
},
{
"epoch": 2.59382732126579,
"grad_norm": 0.0002943580038845539,
"learning_rate": 5.535915142566361e-06,
"loss": 0.0003,
"step": 2490
},
{
"epoch": 2.6042453444458915,
"grad_norm": 6.406533066183329e-05,
"learning_rate": 5.499707806672575e-06,
"loss": 0.0077,
"step": 2500
},
{
"epoch": 2.614663367625993,
"grad_norm": 0.0030457451939582825,
"learning_rate": 5.463473982584023e-06,
"loss": 0.0019,
"step": 2510
},
{
"epoch": 2.625081390806095,
"grad_norm": 0.00026603075093589723,
"learning_rate": 5.4272155909602875e-06,
"loss": 0.0161,
"step": 2520
},
{
"epoch": 2.6354994139861962,
"grad_norm": 0.03001578524708748,
"learning_rate": 5.3909345537632205e-06,
"loss": 0.0153,
"step": 2530
},
{
"epoch": 2.6459174371662977,
"grad_norm": 0.5489205121994019,
"learning_rate": 5.354632794155049e-06,
"loss": 0.0008,
"step": 2540
},
{
"epoch": 2.656335460346399,
"grad_norm": 0.00020121924171689898,
"learning_rate": 5.318312236396445e-06,
"loss": 0.0025,
"step": 2550
},
{
"epoch": 2.666753483526501,
"grad_norm": 0.16872790455818176,
"learning_rate": 5.281974805744516e-06,
"loss": 0.0009,
"step": 2560
},
{
"epoch": 2.6771715067066024,
"grad_norm": 1.6018518635974033e-06,
"learning_rate": 5.245622428350764e-06,
"loss": 0.0019,
"step": 2570
},
{
"epoch": 2.687589529886704,
"grad_norm": 0.15927599370479584,
"learning_rate": 5.209257031158972e-06,
"loss": 0.0042,
"step": 2580
},
{
"epoch": 2.6980075530668057,
"grad_norm": 0.002104927320033312,
"learning_rate": 5.1728805418030725e-06,
"loss": 0.002,
"step": 2590
},
{
"epoch": 2.708425576246907,
"grad_norm": 0.00043586574611254036,
"learning_rate": 5.13649488850496e-06,
"loss": 0.011,
"step": 2600
},
{
"epoch": 2.7188435994270086,
"grad_norm": 0.5086974501609802,
"learning_rate": 5.100101999972291e-06,
"loss": 0.0004,
"step": 2610
},
{
"epoch": 2.72926162260711,
"grad_norm": 0.04724502190947533,
"learning_rate": 5.063703805296239e-06,
"loss": 0.0001,
"step": 2620
},
{
"epoch": 2.739679645787212,
"grad_norm": 2.4284263417939655e-06,
"learning_rate": 5.027302233849243e-06,
"loss": 0.0028,
"step": 2630
},
{
"epoch": 2.7500976689673133,
"grad_norm": 0.24547390639781952,
"learning_rate": 4.990899215182735e-06,
"loss": 0.0032,
"step": 2640
},
{
"epoch": 2.7605156921474148,
"grad_norm": 0.001153081888332963,
"learning_rate": 4.954496678924861e-06,
"loss": 0.0004,
"step": 2650
},
{
"epoch": 2.7709337153275166,
"grad_norm": 0.005142812617123127,
"learning_rate": 4.9180965546781985e-06,
"loss": 0.0001,
"step": 2660
},
{
"epoch": 2.781351738507618,
"grad_norm": 0.0020258312579244375,
"learning_rate": 4.8817007719174635e-06,
"loss": 0.0014,
"step": 2670
},
{
"epoch": 2.7917697616877195,
"grad_norm": 0.09598887711763382,
"learning_rate": 4.845311259887251e-06,
"loss": 0.0017,
"step": 2680
},
{
"epoch": 2.8021877848678214,
"grad_norm": 5.523069921764545e-05,
"learning_rate": 4.808929947499751e-06,
"loss": 0.0001,
"step": 2690
},
{
"epoch": 2.812605808047923,
"grad_norm": 4.3237105273874477e-05,
"learning_rate": 4.772558763232521e-06,
"loss": 0.0,
"step": 2700
},
{
"epoch": 2.8230238312280242,
"grad_norm": 0.017797382548451424,
"learning_rate": 4.736199635026247e-06,
"loss": 0.0004,
"step": 2710
},
{
"epoch": 2.833441854408126,
"grad_norm": 0.0372743122279644,
"learning_rate": 4.699854490182558e-06,
"loss": 0.0001,
"step": 2720
},
{
"epoch": 2.8438598775882276,
"grad_norm": 0.017943061888217926,
"learning_rate": 4.663525255261861e-06,
"loss": 0.0026,
"step": 2730
},
{
"epoch": 2.854277900768329,
"grad_norm": 0.0005965960444882512,
"learning_rate": 4.627213855981221e-06,
"loss": 0.0002,
"step": 2740
},
{
"epoch": 2.864695923948431,
"grad_norm": 8.808985148789361e-05,
"learning_rate": 4.590922217112279e-06,
"loss": 0.0005,
"step": 2750
},
{
"epoch": 2.8751139471285323,
"grad_norm": 0.010554715059697628,
"learning_rate": 4.554652262379236e-06,
"loss": 0.0031,
"step": 2760
},
{
"epoch": 2.8855319703086337,
"grad_norm": 0.030894100666046143,
"learning_rate": 4.518405914356865e-06,
"loss": 0.0002,
"step": 2770
},
{
"epoch": 2.8959499934887356,
"grad_norm": 0.13782520592212677,
"learning_rate": 4.4821850943686164e-06,
"loss": 0.0003,
"step": 2780
},
{
"epoch": 2.906368016668837,
"grad_norm": 0.0024414719082415104,
"learning_rate": 4.445991722384763e-06,
"loss": 0.0002,
"step": 2790
},
{
"epoch": 2.9167860398489385,
"grad_norm": 0.0004072992014698684,
"learning_rate": 4.4098277169206315e-06,
"loss": 0.0001,
"step": 2800
},
{
"epoch": 2.9272040630290403,
"grad_norm": 0.0007056115427985787,
"learning_rate": 4.373694994934911e-06,
"loss": 0.0194,
"step": 2810
},
{
"epoch": 2.937622086209142,
"grad_norm": 0.005580561701208353,
"learning_rate": 4.337595471728029e-06,
"loss": 0.0004,
"step": 2820
},
{
"epoch": 2.948040109389243,
"grad_norm": 1.4271708726882935,
"learning_rate": 4.30153106084064e-06,
"loss": 0.0005,
"step": 2830
},
{
"epoch": 2.958458132569345,
"grad_norm": 0.6616150736808777,
"learning_rate": 4.2655036739521795e-06,
"loss": 0.0016,
"step": 2840
},
{
"epoch": 2.9688761557494465,
"grad_norm": 0.0038947018329054117,
"learning_rate": 4.229515220779545e-06,
"loss": 0.0001,
"step": 2850
},
{
"epoch": 2.979294178929548,
"grad_norm": 0.049666061997413635,
"learning_rate": 4.193567608975856e-06,
"loss": 0.0002,
"step": 2860
},
{
"epoch": 2.98971220210965,
"grad_norm": 0.001238349243067205,
"learning_rate": 4.1576627440293425e-06,
"loss": 0.0006,
"step": 2870
},
{
"epoch": 3.0,
"grad_norm": 0.0001485509128542617,
"learning_rate": 4.121802529162331e-06,
"loss": 0.0,
"step": 2880
},
{
"epoch": 3.0104180231801014,
"grad_norm": 0.0052583469077944756,
"learning_rate": 4.08598886523037e-06,
"loss": 0.0008,
"step": 2890
},
{
"epoch": 3.0208360463602033,
"grad_norm": 0.0016122941160574555,
"learning_rate": 4.050223650621461e-06,
"loss": 0.0003,
"step": 2900
},
{
"epoch": 3.0312540695403047,
"grad_norm": 3.2575666409684345e-05,
"learning_rate": 4.014508781155441e-06,
"loss": 0.0003,
"step": 2910
},
{
"epoch": 3.041672092720406,
"grad_norm": 0.056297626346349716,
"learning_rate": 3.978846149983473e-06,
"loss": 0.0002,
"step": 2920
},
{
"epoch": 3.052090115900508,
"grad_norm": 2.323356420674827e-05,
"learning_rate": 3.943237647487719e-06,
"loss": 0.0014,
"step": 2930
},
{
"epoch": 3.0625081390806095,
"grad_norm": 0.05314822122454643,
"learning_rate": 3.907685161181109e-06,
"loss": 0.0001,
"step": 2940
},
{
"epoch": 3.072926162260711,
"grad_norm": 1.262199878692627,
"learning_rate": 3.8721905756073135e-06,
"loss": 0.0004,
"step": 2950
},
{
"epoch": 3.083344185440813,
"grad_norm": 0.012801315635442734,
"learning_rate": 3.836755772240829e-06,
"loss": 0.0005,
"step": 2960
},
{
"epoch": 3.0937622086209142,
"grad_norm": 5.073728516435949e-06,
"learning_rate": 3.8013826293872623e-06,
"loss": 0.0,
"step": 2970
},
{
"epoch": 3.1041802318010157,
"grad_norm": 0.0036370500456541777,
"learning_rate": 3.7660730220837512e-06,
"loss": 0.0001,
"step": 2980
},
{
"epoch": 3.1145982549811175,
"grad_norm": 0.006650363560765982,
"learning_rate": 3.7308288219995884e-06,
"loss": 0.0001,
"step": 2990
},
{
"epoch": 3.125016278161219,
"grad_norm": 0.00013867947563994676,
"learning_rate": 3.695651897336996e-06,
"loss": 0.0,
"step": 3000
},
{
"epoch": 3.1354343013413204,
"grad_norm": 4.687119508162141e-05,
"learning_rate": 3.6605441127321074e-06,
"loss": 0.0001,
"step": 3010
},
{
"epoch": 3.145852324521422,
"grad_norm": 0.0005026389262638986,
"learning_rate": 3.6255073291561248e-06,
"loss": 0.0,
"step": 3020
},
{
"epoch": 3.1562703477015237,
"grad_norm": 0.01530701294541359,
"learning_rate": 3.5905434038166653e-06,
"loss": 0.0001,
"step": 3030
},
{
"epoch": 3.166688370881625,
"grad_norm": 0.15575547516345978,
"learning_rate": 3.555654190059333e-06,
"loss": 0.004,
"step": 3040
},
{
"epoch": 3.1771063940617266,
"grad_norm": 0.0063483756966888905,
"learning_rate": 3.520841537269458e-06,
"loss": 0.0025,
"step": 3050
},
{
"epoch": 3.1875244172418284,
"grad_norm": 0.0002552935038693249,
"learning_rate": 3.486107290774083e-06,
"loss": 0.0004,
"step": 3060
},
{
"epoch": 3.19794244042193,
"grad_norm": 1.8458106296748156e-06,
"learning_rate": 3.4514532917441345e-06,
"loss": 0.0004,
"step": 3070
},
{
"epoch": 3.2083604636020313,
"grad_norm": 3.37355140800355e-06,
"learning_rate": 3.4168813770968357e-06,
"loss": 0.0001,
"step": 3080
},
{
"epoch": 3.218778486782133,
"grad_norm": 0.00028720268164761364,
"learning_rate": 3.3823933793983298e-06,
"loss": 0.0003,
"step": 3090
},
{
"epoch": 3.2291965099622346,
"grad_norm": 4.652118150261231e-06,
"learning_rate": 3.347991126766545e-06,
"loss": 0.0,
"step": 3100
},
{
"epoch": 3.239614533142336,
"grad_norm": 0.0003041624731849879,
"learning_rate": 3.3136764427742885e-06,
"loss": 0.0001,
"step": 3110
},
{
"epoch": 3.250032556322438,
"grad_norm": 0.00011942172568524256,
"learning_rate": 3.279451146352588e-06,
"loss": 0.0001,
"step": 3120
},
{
"epoch": 3.2604505795025394,
"grad_norm": 0.044789642095565796,
"learning_rate": 3.2453170516942657e-06,
"loss": 0.0001,
"step": 3130
},
{
"epoch": 3.270868602682641,
"grad_norm": 0.02395668812096119,
"learning_rate": 3.211275968157784e-06,
"loss": 0.0001,
"step": 3140
},
{
"epoch": 3.2812866258627427,
"grad_norm": 0.02200975827872753,
"learning_rate": 3.1773297001713266e-06,
"loss": 0.0002,
"step": 3150
},
{
"epoch": 3.291704649042844,
"grad_norm": 0.0002672448754310608,
"learning_rate": 3.1434800471371586e-06,
"loss": 0.0,
"step": 3160
},
{
"epoch": 3.3021226722229455,
"grad_norm": 1.4595940228900872e-05,
"learning_rate": 3.109728803336234e-06,
"loss": 0.0,
"step": 3170
},
{
"epoch": 3.3125406954030474,
"grad_norm": 0.0023994040675461292,
"learning_rate": 3.076077757833103e-06,
"loss": 0.0001,
"step": 3180
},
{
"epoch": 3.322958718583149,
"grad_norm": 0.009379798546433449,
"learning_rate": 3.042528694381057e-06,
"loss": 0.0,
"step": 3190
},
{
"epoch": 3.3333767417632503,
"grad_norm": 0.008108630776405334,
"learning_rate": 3.0090833913275965e-06,
"loss": 0.0001,
"step": 3200
},
{
"epoch": 3.343794764943352,
"grad_norm": 0.03200116753578186,
"learning_rate": 2.9757436215201497e-06,
"loss": 0.0,
"step": 3210
},
{
"epoch": 3.3542127881234536,
"grad_norm": 7.638196279913245e-07,
"learning_rate": 2.9425111522121085e-06,
"loss": 0.0009,
"step": 3220
},
{
"epoch": 3.364630811303555,
"grad_norm": 3.822772123385221e-05,
"learning_rate": 2.9093877449691523e-06,
"loss": 0.0,
"step": 3230
},
{
"epoch": 3.375048834483657,
"grad_norm": 1.9638941287994385,
"learning_rate": 2.8763751555758544e-06,
"loss": 0.0011,
"step": 3240
},
{
"epoch": 3.3854668576637583,
"grad_norm": 8.83774831095252e-08,
"learning_rate": 2.843475133942642e-06,
"loss": 0.0007,
"step": 3250
},
{
"epoch": 3.3958848808438598,
"grad_norm": 5.0836853915825486e-05,
"learning_rate": 2.810689424013011e-06,
"loss": 0.0,
"step": 3260
},
{
"epoch": 3.4063029040239616,
"grad_norm": 3.2860607461770996e-05,
"learning_rate": 2.7780197636710993e-06,
"loss": 0.0,
"step": 3270
},
{
"epoch": 3.416720927204063,
"grad_norm": 0.1506471037864685,
"learning_rate": 2.7454678846495593e-06,
"loss": 0.0002,
"step": 3280
},
{
"epoch": 3.4271389503841645,
"grad_norm": 0.0003242450475227088,
"learning_rate": 2.7130355124377684e-06,
"loss": 0.0001,
"step": 3290
},
{
"epoch": 3.4375569735642664,
"grad_norm": 4.1059661270992365e-06,
"learning_rate": 2.6807243661903597e-06,
"loss": 0.0,
"step": 3300
},
{
"epoch": 3.447974996744368,
"grad_norm": 0.007434395141899586,
"learning_rate": 2.6485361586361012e-06,
"loss": 0.0001,
"step": 3310
},
{
"epoch": 3.4583930199244692,
"grad_norm": 0.2851642966270447,
"learning_rate": 2.6164725959871005e-06,
"loss": 0.0008,
"step": 3320
},
{
"epoch": 3.468811043104571,
"grad_norm": 0.001458955928683281,
"learning_rate": 2.5845353778483707e-06,
"loss": 0.0,
"step": 3330
},
{
"epoch": 3.4792290662846725,
"grad_norm": 3.6944745716027683e-06,
"learning_rate": 2.552726197127732e-06,
"loss": 0.0001,
"step": 3340
},
{
"epoch": 3.489647089464774,
"grad_norm": 0.00017191942606586963,
"learning_rate": 2.5210467399460794e-06,
"loss": 0.0,
"step": 3350
},
{
"epoch": 3.500065112644876,
"grad_norm": 8.9817083789967e-05,
"learning_rate": 2.489498685548005e-06,
"loss": 0.0002,
"step": 3360
},
{
"epoch": 3.5104831358249773,
"grad_norm": 0.0007733273669146001,
"learning_rate": 2.4580837062127867e-06,
"loss": 0.0,
"step": 3370
},
{
"epoch": 3.5209011590050787,
"grad_norm": 0.05479538068175316,
"learning_rate": 2.4268034671657412e-06,
"loss": 0.0002,
"step": 3380
},
{
"epoch": 3.5313191821851806,
"grad_norm": 0.02038147673010826,
"learning_rate": 2.39565962648996e-06,
"loss": 0.0,
"step": 3390
},
{
"epoch": 3.541737205365282,
"grad_norm": 1.4380566426552832e-06,
"learning_rate": 2.364653835038413e-06,
"loss": 0.0001,
"step": 3400
},
{
"epoch": 3.5521552285453835,
"grad_norm": 0.00023164517187979072,
"learning_rate": 2.3337877363464475e-06,
"loss": 0.0008,
"step": 3410
},
{
"epoch": 3.5625732517254853,
"grad_norm": 0.00041943430551327765,
"learning_rate": 2.3030629665446635e-06,
"loss": 0.0001,
"step": 3420
},
{
"epoch": 3.5729912749055868,
"grad_norm": 5.017073154449463,
"learning_rate": 2.272481154272188e-06,
"loss": 0.0062,
"step": 3430
},
{
"epoch": 3.583409298085688,
"grad_norm": 0.13844065368175507,
"learning_rate": 2.2420439205903523e-06,
"loss": 0.0006,
"step": 3440
},
{
"epoch": 3.59382732126579,
"grad_norm": 5.108922004699707,
"learning_rate": 2.211752878896745e-06,
"loss": 0.0131,
"step": 3450
},
{
"epoch": 3.6042453444458915,
"grad_norm": 0.012104752473533154,
"learning_rate": 2.1816096348397176e-06,
"loss": 0.0,
"step": 3460
},
{
"epoch": 3.614663367625993,
"grad_norm": 0.31581804156303406,
"learning_rate": 2.1516157862332425e-06,
"loss": 0.0002,
"step": 3470
},
{
"epoch": 3.625081390806095,
"grad_norm": 0.0012989668175578117,
"learning_rate": 2.1217729229722485e-06,
"loss": 0.0001,
"step": 3480
},
{
"epoch": 3.6354994139861962,
"grad_norm": 5.394697291194461e-05,
"learning_rate": 2.092082626948313e-06,
"loss": 0.0001,
"step": 3490
},
{
"epoch": 3.6459174371662977,
"grad_norm": 0.0002785645192489028,
"learning_rate": 2.062546471965841e-06,
"loss": 0.0017,
"step": 3500
},
{
"epoch": 3.656335460346399,
"grad_norm": 0.04909211024641991,
"learning_rate": 2.033166023658613e-06,
"loss": 0.0055,
"step": 3510
},
{
"epoch": 3.666753483526501,
"grad_norm": 0.007108056452125311,
"learning_rate": 2.0039428394068224e-06,
"loss": 0.0,
"step": 3520
},
{
"epoch": 3.6771715067066024,
"grad_norm": 0.0002842152607627213,
"learning_rate": 1.9748784682544955e-06,
"loss": 0.0,
"step": 3530
},
{
"epoch": 3.687589529886704,
"grad_norm": 0.037801098078489304,
"learning_rate": 1.9459744508274076e-06,
"loss": 0.0014,
"step": 3540
},
{
"epoch": 3.6980075530668057,
"grad_norm": 0.004841256886720657,
"learning_rate": 1.917232319251392e-06,
"loss": 0.0008,
"step": 3550
},
{
"epoch": 3.708425576246907,
"grad_norm": 0.3873371481895447,
"learning_rate": 1.8886535970711522e-06,
"loss": 0.0062,
"step": 3560
},
{
"epoch": 3.7188435994270086,
"grad_norm": 1.5684008758398704e-07,
"learning_rate": 1.860239799169482e-06,
"loss": 0.0001,
"step": 3570
},
{
"epoch": 3.72926162260711,
"grad_norm": 0.0005140849971212447,
"learning_rate": 1.8319924316869763e-06,
"loss": 0.0003,
"step": 3580
},
{
"epoch": 3.739679645787212,
"grad_norm": 8.463976882921997e-06,
"learning_rate": 1.8039129919421905e-06,
"loss": 0.0036,
"step": 3590
},
{
"epoch": 3.7500976689673133,
"grad_norm": 0.0017335577867925167,
"learning_rate": 1.7760029683522734e-06,
"loss": 0.0,
"step": 3600
},
{
"epoch": 3.7605156921474148,
"grad_norm": 6.008242417010479e-05,
"learning_rate": 1.7482638403540703e-06,
"loss": 0.0001,
"step": 3610
},
{
"epoch": 3.7709337153275166,
"grad_norm": 0.0020610829815268517,
"learning_rate": 1.720697078325701e-06,
"loss": 0.0005,
"step": 3620
},
{
"epoch": 3.781351738507618,
"grad_norm": 0.009846694767475128,
"learning_rate": 1.693304143508618e-06,
"loss": 0.0,
"step": 3630
},
{
"epoch": 3.7917697616877195,
"grad_norm": 0.00021927843044977635,
"learning_rate": 1.666086487930153e-06,
"loss": 0.0002,
"step": 3640
},
{
"epoch": 3.8021877848678214,
"grad_norm": 6.052812750567682e-05,
"learning_rate": 1.6390455543265454e-06,
"loss": 0.0001,
"step": 3650
},
{
"epoch": 3.812605808047923,
"grad_norm": 0.0021989853121340275,
"learning_rate": 1.6121827760664677e-06,
"loss": 0.0001,
"step": 3660
},
{
"epoch": 3.8230238312280242,
"grad_norm": 0.00010219099931418896,
"learning_rate": 1.5854995770750526e-06,
"loss": 0.0001,
"step": 3670
},
{
"epoch": 3.833441854408126,
"grad_norm": 0.00012348932796157897,
"learning_rate": 1.5589973717583968e-06,
"loss": 0.0,
"step": 3680
},
{
"epoch": 3.8438598775882276,
"grad_norm": 0.008015728555619717,
"learning_rate": 1.5326775649286135e-06,
"loss": 0.0001,
"step": 3690
},
{
"epoch": 3.854277900768329,
"grad_norm": 3.6400449516804656e-06,
"learning_rate": 1.506541551729338e-06,
"loss": 0.0001,
"step": 3700
},
{
"epoch": 3.864695923948431,
"grad_norm": 0.05682307854294777,
"learning_rate": 1.4805907175618034e-06,
"loss": 0.0001,
"step": 3710
},
{
"epoch": 3.8751139471285323,
"grad_norm": 2.3464983314624988e-05,
"learning_rate": 1.4548264380113763e-06,
"loss": 0.0,
"step": 3720
},
{
"epoch": 3.8855319703086337,
"grad_norm": 3.6688052205136046e-05,
"learning_rate": 1.4292500787746677e-06,
"loss": 0.0,
"step": 3730
},
{
"epoch": 3.8959499934887356,
"grad_norm": 6.737604053341784e-06,
"learning_rate": 1.4038629955871147e-06,
"loss": 0.0,
"step": 3740
},
{
"epoch": 3.906368016668837,
"grad_norm": 0.3771498501300812,
"learning_rate": 1.3786665341511424e-06,
"loss": 0.0002,
"step": 3750
},
{
"epoch": 3.9167860398489385,
"grad_norm": 0.21408240497112274,
"learning_rate": 1.3536620300648062e-06,
"loss": 0.0001,
"step": 3760
},
{
"epoch": 3.9272040630290403,
"grad_norm": 0.020450890064239502,
"learning_rate": 1.3288508087510198e-06,
"loss": 0.0,
"step": 3770
},
{
"epoch": 3.937622086209142,
"grad_norm": 0.0006295586354099214,
"learning_rate": 1.3042341853872753e-06,
"loss": 0.0001,
"step": 3780
},
{
"epoch": 3.948040109389243,
"grad_norm": 0.018750308081507683,
"learning_rate": 1.2798134648359512e-06,
"loss": 0.0,
"step": 3790
},
{
"epoch": 3.958458132569345,
"grad_norm": 0.00036703175283037126,
"learning_rate": 1.2555899415751211e-06,
"loss": 0.0002,
"step": 3800
},
{
"epoch": 3.9688761557494465,
"grad_norm": 0.005201701074838638,
"learning_rate": 1.2315648996299628e-06,
"loss": 0.0,
"step": 3810
},
{
"epoch": 3.979294178929548,
"grad_norm": 1.7224958355654962e-05,
"learning_rate": 1.2077396125046703e-06,
"loss": 0.0002,
"step": 3820
},
{
"epoch": 3.98971220210965,
"grad_norm": 0.0006623083609156311,
"learning_rate": 1.1841153431149715e-06,
"loss": 0.0,
"step": 3830
},
{
"epoch": 4.0,
"grad_norm": 0.05106138437986374,
"learning_rate": 1.160693343721163e-06,
"loss": 0.0,
"step": 3840
},
{
"epoch": 4.010418023180102,
"grad_norm": 0.13067518174648285,
"learning_rate": 1.137474855861751e-06,
"loss": 0.0001,
"step": 3850
},
{
"epoch": 4.020836046360203,
"grad_norm": 0.024557417258620262,
"learning_rate": 1.1144611102876251e-06,
"loss": 0.0,
"step": 3860
},
{
"epoch": 4.031254069540305,
"grad_norm": 0.00017925056454259902,
"learning_rate": 1.0916533268968293e-06,
"loss": 0.0,
"step": 3870
},
{
"epoch": 4.041672092720407,
"grad_norm": 0.0006559378234669566,
"learning_rate": 1.0690527146698915e-06,
"loss": 0.0001,
"step": 3880
},
{
"epoch": 4.052090115900508,
"grad_norm": 0.0031467515509575605,
"learning_rate": 1.046660471605744e-06,
"loss": 0.0,
"step": 3890
},
{
"epoch": 4.0625081390806095,
"grad_norm": 1.6517557241968461e-06,
"learning_rate": 1.0244777846582172e-06,
"loss": 0.0001,
"step": 3900
},
{
"epoch": 4.072926162260711,
"grad_norm": 3.2786699648568174e-06,
"learning_rate": 1.0025058296731254e-06,
"loss": 0.0004,
"step": 3910
},
{
"epoch": 4.083344185440812,
"grad_norm": 2.993703674292192e-05,
"learning_rate": 9.807457713259354e-07,
"loss": 0.0025,
"step": 3920
},
{
"epoch": 4.093762208620914,
"grad_norm": 0.00212846789509058,
"learning_rate": 9.591987630600313e-07,
"loss": 0.0002,
"step": 3930
},
{
"epoch": 4.104180231801016,
"grad_norm": 0.0008644104236736894,
"learning_rate": 9.37865947025573e-07,
"loss": 0.0,
"step": 3940
},
{
"epoch": 4.114598254981117,
"grad_norm": 4.824515417567454e-05,
"learning_rate": 9.167484540189558e-07,
"loss": 0.0,
"step": 3950
},
{
"epoch": 4.125016278161219,
"grad_norm": 0.010090484283864498,
"learning_rate": 8.958474034228676e-07,
"loss": 0.0002,
"step": 3960
},
{
"epoch": 4.135434301341321,
"grad_norm": 0.0014343432849273086,
"learning_rate": 8.751639031469522e-07,
"loss": 0.0001,
"step": 3970
},
{
"epoch": 4.145852324521422,
"grad_norm": 0.0008025519782677293,
"learning_rate": 8.546990495690893e-07,
"loss": 0.0,
"step": 3980
},
{
"epoch": 4.156270347701524,
"grad_norm": 0.0038316240534186363,
"learning_rate": 8.344539274772657e-07,
"loss": 0.0,
"step": 3990
},
{
"epoch": 4.166688370881626,
"grad_norm": 0.0011604970786720514,
"learning_rate": 8.144296100120886e-07,
"loss": 0.0001,
"step": 4000
},
{
"epoch": 4.177106394061727,
"grad_norm": 0.0030226227827370167,
"learning_rate": 7.946271586098847e-07,
"loss": 0.0001,
"step": 4010
},
{
"epoch": 4.1875244172418284,
"grad_norm": 0.024601083248853683,
"learning_rate": 7.750476229464543e-07,
"loss": 0.0,
"step": 4020
},
{
"epoch": 4.19794244042193,
"grad_norm": 0.00018696175538934767,
"learning_rate": 7.556920408814133e-07,
"loss": 0.0,
"step": 4030
},
{
"epoch": 4.208360463602031,
"grad_norm": 2.6345056539867073e-05,
"learning_rate": 7.365614384031944e-07,
"loss": 0.0002,
"step": 4040
},
{
"epoch": 4.218778486782133,
"grad_norm": 0.00012209890701342374,
"learning_rate": 7.176568295746467e-07,
"loss": 0.0,
"step": 4050
},
{
"epoch": 4.229196509962235,
"grad_norm": 0.0035056450869888067,
"learning_rate": 6.989792164793003e-07,
"loss": 0.0,
"step": 4060
},
{
"epoch": 4.239614533142336,
"grad_norm": 0.00014033827756065875,
"learning_rate": 6.805295891682323e-07,
"loss": 0.0,
"step": 4070
},
{
"epoch": 4.250032556322438,
"grad_norm": 8.16184274299303e-06,
"learning_rate": 6.62308925607597e-07,
"loss": 0.0,
"step": 4080
},
{
"epoch": 4.26045057950254,
"grad_norm": 0.0005164833273738623,
"learning_rate": 6.443181916267826e-07,
"loss": 0.0,
"step": 4090
},
{
"epoch": 4.270868602682641,
"grad_norm": 0.01795101910829544,
"learning_rate": 6.26558340867216e-07,
"loss": 0.0,
"step": 4100
},
{
"epoch": 4.281286625862743,
"grad_norm": 0.01263391226530075,
"learning_rate": 6.090303147318122e-07,
"loss": 0.0,
"step": 4110
},
{
"epoch": 4.291704649042844,
"grad_norm": 0.0007271830691024661,
"learning_rate": 5.917350423350731e-07,
"loss": 0.0,
"step": 4120
},
{
"epoch": 4.3021226722229455,
"grad_norm": 5.166072696738411e-06,
"learning_rate": 5.74673440453839e-07,
"loss": 0.0001,
"step": 4130
},
{
"epoch": 4.312540695403047,
"grad_norm": 0.005551379173994064,
"learning_rate": 5.578464134786916e-07,
"loss": 0.0,
"step": 4140
},
{
"epoch": 4.322958718583148,
"grad_norm": 0.0028216512873768806,
"learning_rate": 5.412548533660133e-07,
"loss": 0.0,
"step": 4150
},
{
"epoch": 4.33337674176325,
"grad_norm": 0.0007700317073613405,
"learning_rate": 5.248996395907091e-07,
"loss": 0.0,
"step": 4160
},
{
"epoch": 4.343794764943352,
"grad_norm": 0.01867193728685379,
"learning_rate": 5.08781639099587e-07,
"loss": 0.0,
"step": 4170
},
{
"epoch": 4.354212788123453,
"grad_norm": 0.0008056411170400679,
"learning_rate": 4.92901706265404e-07,
"loss": 0.0,
"step": 4180
},
{
"epoch": 4.364630811303555,
"grad_norm": 0.008770433254539967,
"learning_rate": 4.772606828415772e-07,
"loss": 0.001,
"step": 4190
},
{
"epoch": 4.375048834483657,
"grad_norm": 0.005740197841078043,
"learning_rate": 4.6185939791756397e-07,
"loss": 0.0,
"step": 4200
},
{
"epoch": 4.385466857663758,
"grad_norm": 0.03286215290427208,
"learning_rate": 4.46698667874918e-07,
"loss": 0.0025,
"step": 4210
},
{
"epoch": 4.39588488084386,
"grad_norm": 0.000286923284875229,
"learning_rate": 4.3177929634400926e-07,
"loss": 0.0,
"step": 4220
},
{
"epoch": 4.406302904023962,
"grad_norm": 0.00014024133270140737,
"learning_rate": 4.1710207416143066e-07,
"loss": 0.0,
"step": 4230
},
{
"epoch": 4.416720927204063,
"grad_norm": 0.004977494943886995,
"learning_rate": 4.026677793280748e-07,
"loss": 0.0001,
"step": 4240
},
{
"epoch": 4.4271389503841645,
"grad_norm": 6.539112655445933e-05,
"learning_rate": 3.884771769678952e-07,
"loss": 0.0002,
"step": 4250
},
{
"epoch": 4.437556973564266,
"grad_norm": 0.009294900111854076,
"learning_rate": 3.745310192873508e-07,
"loss": 0.0,
"step": 4260
},
{
"epoch": 4.447974996744367,
"grad_norm": 5.570728262682678e-06,
"learning_rate": 3.6083004553552905e-07,
"loss": 0.0001,
"step": 4270
},
{
"epoch": 4.458393019924469,
"grad_norm": 0.0003835258539766073,
"learning_rate": 3.4737498196496686e-07,
"loss": 0.0,
"step": 4280
},
{
"epoch": 4.468811043104571,
"grad_norm": 2.6702812405687837e-08,
"learning_rate": 3.3416654179314665e-07,
"loss": 0.0,
"step": 4290
},
{
"epoch": 4.479229066284672,
"grad_norm": 0.00016556118498556316,
"learning_rate": 3.212054251646962e-07,
"loss": 0.0,
"step": 4300
},
{
"epoch": 4.489647089464774,
"grad_norm": 3.6678444303106517e-05,
"learning_rate": 3.0849231911427134e-07,
"loss": 0.0,
"step": 4310
},
{
"epoch": 4.500065112644876,
"grad_norm": 0.00038032321026548743,
"learning_rate": 2.960278975301428e-07,
"loss": 0.0,
"step": 4320
},
{
"epoch": 4.510483135824977,
"grad_norm": 0.00020073131599929184,
"learning_rate": 2.8381282111847097e-07,
"loss": 0.0,
"step": 4330
},
{
"epoch": 4.520901159005079,
"grad_norm": 4.448674735613167e-05,
"learning_rate": 2.718477373682865e-07,
"loss": 0.0001,
"step": 4340
},
{
"epoch": 4.531319182185181,
"grad_norm": 0.17879758775234222,
"learning_rate": 2.601332805171669e-07,
"loss": 0.0001,
"step": 4350
},
{
"epoch": 4.541737205365282,
"grad_norm": 0.0016016721492633224,
"learning_rate": 2.4867007151761766e-07,
"loss": 0.0002,
"step": 4360
},
{
"epoch": 4.5521552285453835,
"grad_norm": 0.011347103863954544,
"learning_rate": 2.3745871800415765e-07,
"loss": 0.0049,
"step": 4370
},
{
"epoch": 4.562573251725485,
"grad_norm": 0.000959846016485244,
"learning_rate": 2.2649981426110967e-07,
"loss": 0.0002,
"step": 4380
},
{
"epoch": 4.572991274905586,
"grad_norm": 0.00030386244179680943,
"learning_rate": 2.1579394119109863e-07,
"loss": 0.0,
"step": 4390
},
{
"epoch": 4.583409298085688,
"grad_norm": 0.000289548363070935,
"learning_rate": 2.053416662842611e-07,
"loss": 0.014,
"step": 4400
},
{
"epoch": 4.59382732126579,
"grad_norm": 0.009739086031913757,
"learning_rate": 1.9514354358816102e-07,
"loss": 0.0005,
"step": 4410
},
{
"epoch": 4.604245344445891,
"grad_norm": 0.0015085875056684017,
"learning_rate": 1.8520011367842538e-07,
"loss": 0.0,
"step": 4420
},
{
"epoch": 4.614663367625993,
"grad_norm": 8.586205694882665e-06,
"learning_rate": 1.7551190363008453e-07,
"loss": 0.0001,
"step": 4430
},
{
"epoch": 4.625081390806095,
"grad_norm": 5.23238668392878e-07,
"learning_rate": 1.6607942698963875e-07,
"loss": 0.0,
"step": 4440
},
{
"epoch": 4.635499413986196,
"grad_norm": 0.006870610639452934,
"learning_rate": 1.5690318374783243e-07,
"loss": 0.0,
"step": 4450
},
{
"epoch": 4.645917437166298,
"grad_norm": 0.005947893485426903,
"learning_rate": 1.4798366031315292e-07,
"loss": 0.0026,
"step": 4460
},
{
"epoch": 4.6563354603464,
"grad_norm": 1.6190052747333539e-06,
"learning_rate": 1.393213294860457e-07,
"loss": 0.0,
"step": 4470
},
{
"epoch": 4.6667534835265005,
"grad_norm": 0.11864887177944183,
"learning_rate": 1.3091665043385383e-07,
"loss": 0.0001,
"step": 4480
},
{
"epoch": 4.677171506706602,
"grad_norm": 0.002785899443551898,
"learning_rate": 1.2277006866648122e-07,
"loss": 0.0003,
"step": 4490
},
{
"epoch": 4.687589529886704,
"grad_norm": 2.433152758385404e-06,
"learning_rate": 1.1488201601276894e-07,
"loss": 0.0,
"step": 4500
},
{
"epoch": 4.698007553066805,
"grad_norm": 0.0006944002816453576,
"learning_rate": 1.0725291059761611e-07,
"loss": 0.0001,
"step": 4510
},
{
"epoch": 4.708425576246907,
"grad_norm": 4.231273487675935e-05,
"learning_rate": 9.98831568198061e-08,
"loss": 0.0,
"step": 4520
},
{
"epoch": 4.718843599427009,
"grad_norm": 1.5037702496556449e-06,
"learning_rate": 9.277314533057913e-08,
"loss": 0.0,
"step": 4530
},
{
"epoch": 4.72926162260711,
"grad_norm": 9.992829745897325e-07,
"learning_rate": 8.592325301291782e-08,
"loss": 0.0,
"step": 4540
},
{
"epoch": 4.739679645787212,
"grad_norm": 0.0003942627226933837,
"learning_rate": 7.933384296157365e-08,
"loss": 0.0,
"step": 4550
},
{
"epoch": 4.750097668967314,
"grad_norm": 0.09215450286865234,
"learning_rate": 7.300526446381906e-08,
"loss": 0.0002,
"step": 4560
},
{
"epoch": 4.760515692147415,
"grad_norm": 2.6581383281154558e-05,
"learning_rate": 6.693785298093336e-08,
"loss": 0.0,
"step": 4570
},
{
"epoch": 4.770933715327517,
"grad_norm": 9.696443470375016e-08,
"learning_rate": 6.113193013041918e-08,
"loss": 0.0,
"step": 4580
},
{
"epoch": 4.7813517385076185,
"grad_norm": 0.0001541363453725353,
"learning_rate": 5.558780366895611e-08,
"loss": 0.0,
"step": 4590
},
{
"epoch": 4.7917697616877195,
"grad_norm": 0.001118164393119514,
"learning_rate": 5.0305767476087066e-08,
"loss": 0.0001,
"step": 4600
},
{
"epoch": 4.802187784867821,
"grad_norm": 5.372874511522241e-06,
"learning_rate": 4.528610153863855e-08,
"loss": 0.0001,
"step": 4610
},
{
"epoch": 4.812605808047923,
"grad_norm": 4.5697615860262886e-05,
"learning_rate": 4.052907193588251e-08,
"loss": 0.0,
"step": 4620
},
{
"epoch": 4.823023831228024,
"grad_norm": 1.0059036867460236e-05,
"learning_rate": 3.60349308254293e-08,
"loss": 0.0001,
"step": 4630
},
{
"epoch": 4.833441854408126,
"grad_norm": 0.00015976910071913153,
"learning_rate": 3.1803916429863355e-08,
"loss": 0.0,
"step": 4640
},
{
"epoch": 4.843859877588228,
"grad_norm": 1.2175183296203613,
"learning_rate": 2.7836253024114412e-08,
"loss": 0.0003,
"step": 4650
},
{
"epoch": 4.854277900768329,
"grad_norm": 0.0008595722611062229,
"learning_rate": 2.4132150923570353e-08,
"loss": 0.0003,
"step": 4660
},
{
"epoch": 4.864695923948431,
"grad_norm": 1.9864066871377872e-06,
"learning_rate": 2.0691806472928344e-08,
"loss": 0.0001,
"step": 4670
},
{
"epoch": 4.875113947128533,
"grad_norm": 1.0369881238148082e-06,
"learning_rate": 1.7515402035787053e-08,
"loss": 0.0,
"step": 4680
},
{
"epoch": 4.885531970308634,
"grad_norm": 0.08182670176029205,
"learning_rate": 1.4603105984979382e-08,
"loss": 0.0001,
"step": 4690
},
{
"epoch": 4.895949993488736,
"grad_norm": 1.3140484043105971e-06,
"learning_rate": 1.1955072693649594e-08,
"loss": 0.0,
"step": 4700
},
{
"epoch": 4.9063680166688375,
"grad_norm": 0.008761771954596043,
"learning_rate": 9.571442527068209e-09,
"loss": 0.0,
"step": 4710
},
{
"epoch": 4.9167860398489385,
"grad_norm": 9.459521243115887e-05,
"learning_rate": 7.452341835192388e-09,
"loss": 0.0,
"step": 4720
},
{
"epoch": 4.92720406302904,
"grad_norm": 0.00031195214251056314,
"learning_rate": 5.597882945969635e-09,
"loss": 0.0001,
"step": 4730
},
{
"epoch": 4.937622086209142,
"grad_norm": 0.00505842175334692,
"learning_rate": 4.008164159380879e-09,
"loss": 0.0001,
"step": 4740
},
{
"epoch": 4.948040109389243,
"grad_norm": 0.028236793354153633,
"learning_rate": 2.6832697422324307e-09,
"loss": 0.0002,
"step": 4750
},
{
"epoch": 4.958458132569345,
"grad_norm": 3.2701405871193856e-05,
"learning_rate": 1.623269923688442e-09,
"loss": 0.0001,
"step": 4760
},
{
"epoch": 4.968876155749447,
"grad_norm": 0.0009896554984152317,
"learning_rate": 8.282208915466605e-10,
"loss": 0.0001,
"step": 4770
},
{
"epoch": 4.979294178929548,
"grad_norm": 4.405805157148279e-06,
"learning_rate": 2.981647892635886e-10,
"loss": 0.0,
"step": 4780
},
{
"epoch": 4.98971220210965,
"grad_norm": 2.589948280729004e-06,
"learning_rate": 3.312971371627338e-11,
"loss": 0.0001,
"step": 4790
},
{
"epoch": 4.9949212136997,
"step": 4795,
"total_flos": 0.0,
"train_loss": 0.03522291941339332,
"train_runtime": 16429.2929,
"train_samples_per_second": 18.696,
"train_steps_per_second": 0.292
}
],
"logging_steps": 10,
"max_steps": 4795,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}