{
"best_global_step": 2600,
"best_metric": 0.4455747,
"best_model_checkpoint": "/root/ms-swift/output_1/v4-20250825-221955/checkpoint-2600",
"epoch": 3.0,
"eval_steps": 50,
"global_step": 3921,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007651109410864575,
"grad_norm": 15.001960754394531,
"learning_rate": 5.076142131979695e-07,
"loss": 1.2726802825927734,
"step": 1,
"token_acc": 0.6764705777168274
},
{
"epoch": 0.0038255547054322878,
"grad_norm": 13.028708457946777,
"learning_rate": 2.5380710659898476e-06,
"loss": 1.495189905166626,
"step": 5,
"token_acc": 0.6392497420310974
},
{
"epoch": 0.0076511094108645756,
"grad_norm": 5.605969429016113,
"learning_rate": 5.076142131979695e-06,
"loss": 1.1087797164916993,
"step": 10,
"token_acc": 0.7032846808433533
},
{
"epoch": 0.011476664116296864,
"grad_norm": 4.179737091064453,
"learning_rate": 7.614213197969544e-06,
"loss": 0.7857523918151855,
"step": 15,
"token_acc": 0.7791855931282043
},
{
"epoch": 0.015302218821729151,
"grad_norm": 4.184815883636475,
"learning_rate": 1.015228426395939e-05,
"loss": 0.6412610054016114,
"step": 20,
"token_acc": 0.8024289011955261
},
{
"epoch": 0.019127773527161437,
"grad_norm": 3.188452959060669,
"learning_rate": 1.2690355329949238e-05,
"loss": 0.6599317073822022,
"step": 25,
"token_acc": 0.7991740703582764
},
{
"epoch": 0.022953328232593728,
"grad_norm": 2.735691785812378,
"learning_rate": 1.5228426395939088e-05,
"loss": 0.6142410278320313,
"step": 30,
"token_acc": 0.8127740025520325
},
{
"epoch": 0.026778882938026015,
"grad_norm": 2.9147984981536865,
"learning_rate": 1.7766497461928935e-05,
"loss": 0.6038710117340088,
"step": 35,
"token_acc": 0.813315212726593
},
{
"epoch": 0.030604437643458302,
"grad_norm": 2.701826572418213,
"learning_rate": 2.030456852791878e-05,
"loss": 0.5683969497680664,
"step": 40,
"token_acc": 0.8215563893318176
},
{
"epoch": 0.03442999234889059,
"grad_norm": 2.8082520961761475,
"learning_rate": 2.284263959390863e-05,
"loss": 0.6069915771484375,
"step": 45,
"token_acc": 0.8085312843322754
},
{
"epoch": 0.03825554705432287,
"grad_norm": 2.6436877250671387,
"learning_rate": 2.5380710659898476e-05,
"loss": 0.5704009056091308,
"step": 50,
"token_acc": 0.8219647407531738
},
{
"epoch": 0.03825554705432287,
"eval_loss": 0.5656692981719971,
"eval_runtime": 6.1089,
"eval_samples_per_second": 17.024,
"eval_steps_per_second": 2.128,
"eval_token_acc": 0.8207153677940369,
"step": 50
},
{
"epoch": 0.042081101759755164,
"grad_norm": 2.689117670059204,
"learning_rate": 2.7918781725888326e-05,
"loss": 0.575815486907959,
"step": 55,
"token_acc": 0.8211867213249207
},
{
"epoch": 0.045906656465187455,
"grad_norm": 2.2790122032165527,
"learning_rate": 3.0456852791878175e-05,
"loss": 0.5862385749816894,
"step": 60,
"token_acc": 0.8205827474594116
},
{
"epoch": 0.04973221117061974,
"grad_norm": 2.6730895042419434,
"learning_rate": 3.299492385786802e-05,
"loss": 0.5797908782958985,
"step": 65,
"token_acc": 0.819099485874176
},
{
"epoch": 0.05355776587605203,
"grad_norm": 2.4526894092559814,
"learning_rate": 3.553299492385787e-05,
"loss": 0.6487821102142334,
"step": 70,
"token_acc": 0.7994943857192993
},
{
"epoch": 0.057383320581484314,
"grad_norm": 2.265002489089966,
"learning_rate": 3.8071065989847716e-05,
"loss": 0.6046820640563965,
"step": 75,
"token_acc": 0.8156428933143616
},
{
"epoch": 0.061208875286916604,
"grad_norm": 2.5733046531677246,
"learning_rate": 4.060913705583756e-05,
"loss": 0.5806538581848144,
"step": 80,
"token_acc": 0.8199408054351807
},
{
"epoch": 0.06503442999234889,
"grad_norm": 2.3223984241485596,
"learning_rate": 4.3147208121827415e-05,
"loss": 0.6687778949737548,
"step": 85,
"token_acc": 0.7976916432380676
},
{
"epoch": 0.06885998469778118,
"grad_norm": 1.9996718168258667,
"learning_rate": 4.568527918781726e-05,
"loss": 0.5714664459228516,
"step": 90,
"token_acc": 0.8250343203544617
},
{
"epoch": 0.07268553940321347,
"grad_norm": 2.2907140254974365,
"learning_rate": 4.822335025380711e-05,
"loss": 0.6378528118133545,
"step": 95,
"token_acc": 0.8057200312614441
},
{
"epoch": 0.07651109410864575,
"grad_norm": 1.9822206497192383,
"learning_rate": 5.076142131979695e-05,
"loss": 0.6435206413269043,
"step": 100,
"token_acc": 0.8065351843833923
},
{
"epoch": 0.07651109410864575,
"eval_loss": 0.6021918654441833,
"eval_runtime": 6.7812,
"eval_samples_per_second": 15.337,
"eval_steps_per_second": 1.917,
"eval_token_acc": 0.814389705657959,
"step": 100
},
{
"epoch": 0.08033664881407804,
"grad_norm": 1.8460628986358643,
"learning_rate": 5.329949238578681e-05,
"loss": 0.6554917335510254,
"step": 105,
"token_acc": 0.8218502998352051
},
{
"epoch": 0.08416220351951033,
"grad_norm": 2.0430757999420166,
"learning_rate": 5.583756345177665e-05,
"loss": 0.7082652091979981,
"step": 110,
"token_acc": 0.7900523543357849
},
{
"epoch": 0.08798775822494262,
"grad_norm": 2.1763596534729004,
"learning_rate": 5.83756345177665e-05,
"loss": 0.6629996299743652,
"step": 115,
"token_acc": 0.7997561097145081
},
{
"epoch": 0.09181331293037491,
"grad_norm": 1.8452140092849731,
"learning_rate": 6.091370558375635e-05,
"loss": 0.6425168991088868,
"step": 120,
"token_acc": 0.8068760633468628
},
{
"epoch": 0.09563886763580719,
"grad_norm": 2.0671913623809814,
"learning_rate": 6.34517766497462e-05,
"loss": 0.6626197814941406,
"step": 125,
"token_acc": 0.8050779700279236
},
{
"epoch": 0.09946442234123948,
"grad_norm": 1.9707857370376587,
"learning_rate": 6.598984771573604e-05,
"loss": 0.6357526779174805,
"step": 130,
"token_acc": 0.8117111921310425
},
{
"epoch": 0.10328997704667177,
"grad_norm": 1.684924840927124,
"learning_rate": 6.852791878172589e-05,
"loss": 0.6633370399475098,
"step": 135,
"token_acc": 0.8078529834747314
},
{
"epoch": 0.10711553175210406,
"grad_norm": 1.8460227251052856,
"learning_rate": 7.106598984771574e-05,
"loss": 0.7214941501617431,
"step": 140,
"token_acc": 0.7888500690460205
},
{
"epoch": 0.11094108645753634,
"grad_norm": 1.8344098329544067,
"learning_rate": 7.360406091370558e-05,
"loss": 0.7153414249420166,
"step": 145,
"token_acc": 0.7917036414146423
},
{
"epoch": 0.11476664116296863,
"grad_norm": 2.0649237632751465,
"learning_rate": 7.614213197969543e-05,
"loss": 0.8018023490905761,
"step": 150,
"token_acc": 0.7870769500732422
},
{
"epoch": 0.11476664116296863,
"eval_loss": 0.6869359612464905,
"eval_runtime": 7.176,
"eval_samples_per_second": 14.493,
"eval_steps_per_second": 1.812,
"eval_token_acc": 0.8004150390625,
"step": 150
},
{
"epoch": 0.11859219586840092,
"grad_norm": 2.0781986713409424,
"learning_rate": 7.868020304568529e-05,
"loss": 0.7426050186157227,
"step": 155,
"token_acc": 0.784966230392456
},
{
"epoch": 0.12241775057383321,
"grad_norm": 3.169353485107422,
"learning_rate": 8.121827411167512e-05,
"loss": 0.6967845916748047,
"step": 160,
"token_acc": 0.799592137336731
},
{
"epoch": 0.1262433052792655,
"grad_norm": 2.8000311851501465,
"learning_rate": 8.375634517766498e-05,
"loss": 0.6940568923950196,
"step": 165,
"token_acc": 0.7990803718566895
},
{
"epoch": 0.13006885998469778,
"grad_norm": 1.7199612855911255,
"learning_rate": 8.629441624365483e-05,
"loss": 0.6588430404663086,
"step": 170,
"token_acc": 0.8097391724586487
},
{
"epoch": 0.13389441469013008,
"grad_norm": 1.6225758790969849,
"learning_rate": 8.883248730964467e-05,
"loss": 0.7546923160552979,
"step": 175,
"token_acc": 0.7823401093482971
},
{
"epoch": 0.13771996939556236,
"grad_norm": 1.738344430923462,
"learning_rate": 9.137055837563452e-05,
"loss": 0.6869890213012695,
"step": 180,
"token_acc": 0.8029044270515442
},
{
"epoch": 0.14154552410099464,
"grad_norm": 1.7446883916854858,
"learning_rate": 9.390862944162437e-05,
"loss": 0.744170093536377,
"step": 185,
"token_acc": 0.7861586213111877
},
{
"epoch": 0.14537107880642694,
"grad_norm": 1.5875240564346313,
"learning_rate": 9.644670050761421e-05,
"loss": 0.6316198348999024,
"step": 190,
"token_acc": 0.8180323839187622
},
{
"epoch": 0.14919663351185922,
"grad_norm": 1.83012855052948,
"learning_rate": 9.898477157360407e-05,
"loss": 1.0572455406188965,
"step": 195,
"token_acc": 0.7630072236061096
},
{
"epoch": 0.1530221882172915,
"grad_norm": 9.883597373962402,
"learning_rate": 9.99998398736932e-05,
"loss": 0.703323221206665,
"step": 200,
"token_acc": 0.8030744194984436
},
{
"epoch": 0.1530221882172915,
"eval_loss": 0.7220072150230408,
"eval_runtime": 7.3149,
"eval_samples_per_second": 14.218,
"eval_steps_per_second": 1.777,
"eval_token_acc": 0.7940492630004883,
"step": 200
},
{
"epoch": 0.1568477429227238,
"grad_norm": 1.4011379480361938,
"learning_rate": 9.999886132775469e-05,
"loss": 0.7197819232940674,
"step": 205,
"token_acc": 0.7953398823738098
},
{
"epoch": 0.16067329762815608,
"grad_norm": 1.5504759550094604,
"learning_rate": 9.999699321232598e-05,
"loss": 0.6872771263122559,
"step": 210,
"token_acc": 0.804167628288269
},
{
"epoch": 0.16449885233358838,
"grad_norm": 2.0014920234680176,
"learning_rate": 9.999423556064422e-05,
"loss": 0.6684097290039063,
"step": 215,
"token_acc": 0.8079100847244263
},
{
"epoch": 0.16832440703902066,
"grad_norm": 1.3064231872558594,
"learning_rate": 9.999058842177297e-05,
"loss": 0.747900390625,
"step": 220,
"token_acc": 0.7928001880645752
},
{
"epoch": 0.17214996174445293,
"grad_norm": 1.6330523490905762,
"learning_rate": 9.998605186060137e-05,
"loss": 0.715455961227417,
"step": 225,
"token_acc": 0.7988653779029846
},
{
"epoch": 0.17597551644988524,
"grad_norm": 1.6291477680206299,
"learning_rate": 9.9980625957843e-05,
"loss": 0.792291784286499,
"step": 230,
"token_acc": 0.7906692624092102
},
{
"epoch": 0.17980107115531752,
"grad_norm": 1.3224996328353882,
"learning_rate": 9.99743108100344e-05,
"loss": 0.6187815189361572,
"step": 235,
"token_acc": 0.8209345936775208
},
{
"epoch": 0.18362662586074982,
"grad_norm": 1.3888137340545654,
"learning_rate": 9.996710652953338e-05,
"loss": 0.7097324371337891,
"step": 240,
"token_acc": 0.8024294376373291
},
{
"epoch": 0.1874521805661821,
"grad_norm": 1.340208649635315,
"learning_rate": 9.995901324451704e-05,
"loss": 0.7415911674499511,
"step": 245,
"token_acc": 0.7968400716781616
},
{
"epoch": 0.19127773527161437,
"grad_norm": 1.1856446266174316,
"learning_rate": 9.995003109897942e-05,
"loss": 0.7001552581787109,
"step": 250,
"token_acc": 0.8009890913963318
},
{
"epoch": 0.19127773527161437,
"eval_loss": 0.6857067942619324,
"eval_runtime": 7.3358,
"eval_samples_per_second": 14.177,
"eval_steps_per_second": 1.772,
"eval_token_acc": 0.803743302822113,
"step": 250
},
{
"epoch": 0.19510328997704668,
"grad_norm": 1.2998038530349731,
"learning_rate": 9.994016025272905e-05,
"loss": 0.6838603019714355,
"step": 255,
"token_acc": 0.8089724779129028
},
{
"epoch": 0.19892884468247896,
"grad_norm": 1.449840784072876,
"learning_rate": 9.992940088138597e-05,
"loss": 0.6695821762084961,
"step": 260,
"token_acc": 0.8115434646606445
},
{
"epoch": 0.20275439938791126,
"grad_norm": 2.188504219055176,
"learning_rate": 9.991775317637873e-05,
"loss": 0.7405529499053956,
"step": 265,
"token_acc": 0.7956330180168152
},
{
"epoch": 0.20657995409334354,
"grad_norm": 1.2301571369171143,
"learning_rate": 9.99052173449409e-05,
"loss": 0.7626109600067139,
"step": 270,
"token_acc": 0.7877880334854126
},
{
"epoch": 0.21040550879877581,
"grad_norm": 1.217523455619812,
"learning_rate": 9.989179361010741e-05,
"loss": 0.7369673728942872,
"step": 275,
"token_acc": 0.7953155040740967
},
{
"epoch": 0.21423106350420812,
"grad_norm": 1.3204615116119385,
"learning_rate": 9.987748221071062e-05,
"loss": 0.6772171497344971,
"step": 280,
"token_acc": 0.8045340180397034
},
{
"epoch": 0.2180566182096404,
"grad_norm": 1.3093225955963135,
"learning_rate": 9.9862283401376e-05,
"loss": 0.904904556274414,
"step": 285,
"token_acc": 0.7854760885238647
},
{
"epoch": 0.22188217291507267,
"grad_norm": 1.4255338907241821,
"learning_rate": 9.984619745251767e-05,
"loss": 0.669553565979004,
"step": 290,
"token_acc": 0.8050349354743958
},
{
"epoch": 0.22570772762050498,
"grad_norm": 1.4884202480316162,
"learning_rate": 9.98292246503335e-05,
"loss": 0.7445178508758545,
"step": 295,
"token_acc": 0.8016032576560974
},
{
"epoch": 0.22953328232593725,
"grad_norm": 1.3081945180892944,
"learning_rate": 9.981136529680013e-05,
"loss": 0.6435537815093995,
"step": 300,
"token_acc": 0.8145782947540283
},
{
"epoch": 0.22953328232593725,
"eval_loss": 0.6707971096038818,
"eval_runtime": 7.6759,
"eval_samples_per_second": 13.549,
"eval_steps_per_second": 1.694,
"eval_token_acc": 0.809266984462738,
"step": 300
},
{
"epoch": 0.23335883703136956,
"grad_norm": 1.2945371866226196,
"learning_rate": 9.979261970966752e-05,
"loss": 0.671229362487793,
"step": 305,
"token_acc": 0.8093103170394897
},
{
"epoch": 0.23718439173680184,
"grad_norm": 1.094642996788025,
"learning_rate": 9.97729882224533e-05,
"loss": 0.638882064819336,
"step": 310,
"token_acc": 0.8210087418556213
},
{
"epoch": 0.2410099464422341,
"grad_norm": 1.2039848566055298,
"learning_rate": 9.975247118443686e-05,
"loss": 0.7105097770690918,
"step": 315,
"token_acc": 0.79979407787323
},
{
"epoch": 0.24483550114766642,
"grad_norm": 9.3181734085083,
"learning_rate": 9.973106896065318e-05,
"loss": 0.7334442615509034,
"step": 320,
"token_acc": 0.8001999855041504
},
{
"epoch": 0.2486610558530987,
"grad_norm": 1.2156879901885986,
"learning_rate": 9.970878193188617e-05,
"loss": 0.6516756534576416,
"step": 325,
"token_acc": 0.8167580366134644
},
{
"epoch": 0.252486610558531,
"grad_norm": 1.382604956626892,
"learning_rate": 9.968561049466214e-05,
"loss": 0.7214525222778321,
"step": 330,
"token_acc": 0.7979754209518433
},
{
"epoch": 0.2563121652639633,
"grad_norm": 1.0208624601364136,
"learning_rate": 9.96615550612425e-05,
"loss": 0.6243480205535888,
"step": 335,
"token_acc": 0.822067379951477
},
{
"epoch": 0.26013771996939555,
"grad_norm": 1.2273170948028564,
"learning_rate": 9.96366160596166e-05,
"loss": 0.7538263320922851,
"step": 340,
"token_acc": 0.7931398749351501
},
{
"epoch": 0.26396327467482783,
"grad_norm": 1.005936622619629,
"learning_rate": 9.961079393349408e-05,
"loss": 0.6441500663757325,
"step": 345,
"token_acc": 0.8183194398880005
},
{
"epoch": 0.26778882938026016,
"grad_norm": 1.2466620206832886,
"learning_rate": 9.958408914229687e-05,
"loss": 0.7031271934509278,
"step": 350,
"token_acc": 0.8006601929664612
},
{
"epoch": 0.26778882938026016,
"eval_loss": 0.6655329465866089,
"eval_runtime": 8.6572,
"eval_samples_per_second": 12.013,
"eval_steps_per_second": 1.502,
"eval_token_acc": 0.810479998588562,
"step": 350
},
{
"epoch": 0.27161438408569244,
"grad_norm": 1.1055852174758911,
"learning_rate": 9.955650216115118e-05,
"loss": 0.7128757953643798,
"step": 355,
"token_acc": 0.8017191886901855
},
{
"epoch": 0.2754399387911247,
"grad_norm": 0.9971266388893127,
"learning_rate": 9.952803348087888e-05,
"loss": 0.6931791305541992,
"step": 360,
"token_acc": 0.8039373159408569
},
{
"epoch": 0.279265493496557,
"grad_norm": 1.3013373613357544,
"learning_rate": 9.949868360798893e-05,
"loss": 0.6467844486236572,
"step": 365,
"token_acc": 0.8141829371452332
},
{
"epoch": 0.28309104820198927,
"grad_norm": 1.1281312704086304,
"learning_rate": 9.946845306466822e-05,
"loss": 0.6698862075805664,
"step": 370,
"token_acc": 0.8099541664123535
},
{
"epoch": 0.2869166029074216,
"grad_norm": 1.0093694925308228,
"learning_rate": 9.943734238877241e-05,
"loss": 0.640196704864502,
"step": 375,
"token_acc": 0.8200778961181641
},
{
"epoch": 0.2907421576128539,
"grad_norm": 1.161116361618042,
"learning_rate": 9.940535213381623e-05,
"loss": 0.7982209682464599,
"step": 380,
"token_acc": 0.803227961063385
},
{
"epoch": 0.29456771231828616,
"grad_norm": 1.17842435836792,
"learning_rate": 9.937248286896376e-05,
"loss": 0.674342155456543,
"step": 385,
"token_acc": 0.8081824779510498
},
{
"epoch": 0.29839326702371843,
"grad_norm": 1.2346426248550415,
"learning_rate": 9.933873517901825e-05,
"loss": 0.6990632057189942,
"step": 390,
"token_acc": 0.8067554235458374
},
{
"epoch": 0.3022188217291507,
"grad_norm": 1.1731232404708862,
"learning_rate": 9.930410966441164e-05,
"loss": 0.7052478790283203,
"step": 395,
"token_acc": 0.8015207052230835
},
{
"epoch": 0.306044376434583,
"grad_norm": 1.1818660497665405,
"learning_rate": 9.926860694119398e-05,
"loss": 0.6852362632751465,
"step": 400,
"token_acc": 0.8096556663513184
},
{
"epoch": 0.306044376434583,
"eval_loss": 0.6521208882331848,
"eval_runtime": 7.4215,
"eval_samples_per_second": 14.013,
"eval_steps_per_second": 1.752,
"eval_token_acc": 0.8145099878311157,
"step": 400
},
{
"epoch": 0.3098699311400153,
"grad_norm": 1.166639804840088,
"learning_rate": 9.923222764102248e-05,
"loss": 0.6215761661529541,
"step": 405,
"token_acc": 0.8188217282295227
},
{
"epoch": 0.3136954858454476,
"grad_norm": 1.0579371452331543,
"learning_rate": 9.919497241115016e-05,
"loss": 0.6619209289550781,
"step": 410,
"token_acc": 0.8130149841308594
},
{
"epoch": 0.3175210405508799,
"grad_norm": 1.025505542755127,
"learning_rate": 9.915684191441446e-05,
"loss": 0.681110954284668,
"step": 415,
"token_acc": 0.8061873316764832
},
{
"epoch": 0.32134659525631215,
"grad_norm": 1.1900734901428223,
"learning_rate": 9.911783682922533e-05,
"loss": 0.6414823532104492,
"step": 420,
"token_acc": 0.8169435262680054
},
{
"epoch": 0.32517214996174443,
"grad_norm": 1.0435925722122192,
"learning_rate": 9.907795784955327e-05,
"loss": 0.650167179107666,
"step": 425,
"token_acc": 0.8135402202606201
},
{
"epoch": 0.32899770466717676,
"grad_norm": 0.9976479411125183,
"learning_rate": 9.90372056849169e-05,
"loss": 0.6622737884521485,
"step": 430,
"token_acc": 0.8130133152008057
},
{
"epoch": 0.33282325937260904,
"grad_norm": 1.025640606880188,
"learning_rate": 9.899558106037039e-05,
"loss": 0.7082881927490234,
"step": 435,
"token_acc": 0.8012630343437195
},
{
"epoch": 0.3366488140780413,
"grad_norm": 1.1692794561386108,
"learning_rate": 9.895308471649052e-05,
"loss": 0.7149417877197266,
"step": 440,
"token_acc": 0.8121411204338074
},
{
"epoch": 0.3404743687834736,
"grad_norm": 1.0781068801879883,
"learning_rate": 9.890971740936352e-05,
"loss": 0.6460227012634278,
"step": 445,
"token_acc": 0.8171982169151306
},
{
"epoch": 0.34429992348890587,
"grad_norm": 1.7874302864074707,
"learning_rate": 9.886547991057162e-05,
"loss": 0.6831697463989258,
"step": 450,
"token_acc": 0.8117350339889526
},
{
"epoch": 0.34429992348890587,
"eval_loss": 0.6621751189231873,
"eval_runtime": 7.4514,
"eval_samples_per_second": 13.957,
"eval_steps_per_second": 1.745,
"eval_token_acc": 0.8132668733596802,
"step": 450
},
{
"epoch": 0.3481254781943382,
"grad_norm": 1.1658034324645996,
"learning_rate": 9.882037300717936e-05,
"loss": 0.6283795356750488,
"step": 455,
"token_acc": 0.8232808709144592
},
{
"epoch": 0.3519510328997705,
"grad_norm": 0.8861122727394104,
"learning_rate": 9.87743975017195e-05,
"loss": 0.5845287322998047,
"step": 460,
"token_acc": 0.8338332176208496
},
{
"epoch": 0.35577658760520275,
"grad_norm": 1.1082383394241333,
"learning_rate": 9.872755421217881e-05,
"loss": 0.7373793125152588,
"step": 465,
"token_acc": 0.7927750945091248
},
{
"epoch": 0.35960214231063503,
"grad_norm": 0.9668710827827454,
"learning_rate": 9.867984397198348e-05,
"loss": 0.6381460189819336,
"step": 470,
"token_acc": 0.8192023038864136
},
{
"epoch": 0.3634276970160673,
"grad_norm": 1.0808384418487549,
"learning_rate": 9.863126762998436e-05,
"loss": 0.7160910606384278,
"step": 475,
"token_acc": 0.8008524179458618
},
{
"epoch": 0.36725325172149964,
"grad_norm": 1.0136635303497314,
"learning_rate": 9.858182605044172e-05,
"loss": 0.6220456123352051,
"step": 480,
"token_acc": 0.8248037099838257
},
{
"epoch": 0.3710788064269319,
"grad_norm": 1.2998031377792358,
"learning_rate": 9.853152011301003e-05,
"loss": 0.6555353164672851,
"step": 485,
"token_acc": 0.8161742687225342
},
{
"epoch": 0.3749043611323642,
"grad_norm": 1.0749304294586182,
"learning_rate": 9.848035071272222e-05,
"loss": 0.6211759567260742,
"step": 490,
"token_acc": 0.821867048740387
},
{
"epoch": 0.37872991583779647,
"grad_norm": 0.9710472226142883,
"learning_rate": 9.842831875997375e-05,
"loss": 0.6431370735168457,
"step": 495,
"token_acc": 0.8220862150192261
},
{
"epoch": 0.38255547054322875,
"grad_norm": 1.0042985677719116,
"learning_rate": 9.837542518050649e-05,
"loss": 0.6818212509155274,
"step": 500,
"token_acc": 0.8100237846374512
},
{
"epoch": 0.38255547054322875,
"eval_loss": 0.6374099254608154,
"eval_runtime": 7.6101,
"eval_samples_per_second": 13.666,
"eval_steps_per_second": 1.708,
"eval_token_acc": 0.8186302185058594,
"step": 500
},
{
"epoch": 0.3863810252486611,
"grad_norm": 1.0197993516921997,
"learning_rate": 9.832167091539214e-05,
"loss": 0.6007397174835205,
"step": 505,
"token_acc": 0.8282684683799744
},
{
"epoch": 0.39020657995409336,
"grad_norm": 1.0835719108581543,
"learning_rate": 9.826705692101555e-05,
"loss": 0.7205737113952637,
"step": 510,
"token_acc": 0.7967984080314636
},
{
"epoch": 0.39403213465952563,
"grad_norm": 0.9672032594680786,
"learning_rate": 9.821158416905773e-05,
"loss": 0.6137794494628906,
"step": 515,
"token_acc": 0.8238478899002075
},
{
"epoch": 0.3978576893649579,
"grad_norm": 1.0274014472961426,
"learning_rate": 9.815525364647853e-05,
"loss": 0.6839157104492187,
"step": 520,
"token_acc": 0.8090466856956482
},
{
"epoch": 0.4016832440703902,
"grad_norm": 0.966098427772522,
"learning_rate": 9.809806635549901e-05,
"loss": 0.5641196250915528,
"step": 525,
"token_acc": 0.8359003663063049
},
{
"epoch": 0.4055087987758225,
"grad_norm": 1.1138949394226074,
"learning_rate": 9.804002331358377e-05,
"loss": 0.615296745300293,
"step": 530,
"token_acc": 0.8272916674613953
},
{
"epoch": 0.4093343534812548,
"grad_norm": 2.4379749298095703,
"learning_rate": 9.798112555342268e-05,
"loss": 0.5940766334533691,
"step": 535,
"token_acc": 0.8358057737350464
},
{
"epoch": 0.4131599081866871,
"grad_norm": 1.1517431735992432,
"learning_rate": 9.792137412291265e-05,
"loss": 0.6338438034057617,
"step": 540,
"token_acc": 0.8158274292945862
},
{
"epoch": 0.41698546289211935,
"grad_norm": 6.055464744567871,
"learning_rate": 9.786077008513883e-05,
"loss": 0.6075318336486817,
"step": 545,
"token_acc": 0.8209756016731262
},
{
"epoch": 0.42081101759755163,
"grad_norm": 0.9165500402450562,
"learning_rate": 9.779931451835589e-05,
"loss": 0.659608793258667,
"step": 550,
"token_acc": 0.815700113773346
},
{
"epoch": 0.42081101759755163,
"eval_loss": 0.6386705636978149,
"eval_runtime": 8.1335,
"eval_samples_per_second": 12.787,
"eval_steps_per_second": 1.598,
"eval_token_acc": 0.8192116618156433,
"step": 550
},
{
"epoch": 0.4246365723029839,
"grad_norm": 3.8534209728240967,
"learning_rate": 9.773700851596864e-05,
"loss": 0.689471435546875,
"step": 555,
"token_acc": 0.8077275156974792
},
{
"epoch": 0.42846212700841624,
"grad_norm": 1.0717378854751587,
"learning_rate": 9.767385318651272e-05,
"loss": 0.6236325740814209,
"step": 560,
"token_acc": 0.826772928237915
},
{
"epoch": 0.4322876817138485,
"grad_norm": 0.9380275011062622,
"learning_rate": 9.760984965363478e-05,
"loss": 0.6055815696716309,
"step": 565,
"token_acc": 0.8277127146720886
},
{
"epoch": 0.4361132364192808,
"grad_norm": 0.9301455020904541,
"learning_rate": 9.75449990560726e-05,
"loss": 0.5975317001342774,
"step": 570,
"token_acc": 0.8306687474250793
},
{
"epoch": 0.43993879112471307,
"grad_norm": 0.9384899735450745,
"learning_rate": 9.747930254763467e-05,
"loss": 0.631765604019165,
"step": 575,
"token_acc": 0.8169443011283875
},
{
"epoch": 0.44376434583014535,
"grad_norm": 0.9002703428268433,
"learning_rate": 9.74127612971798e-05,
"loss": 0.6044256210327148,
"step": 580,
"token_acc": 0.8257142305374146
},
{
"epoch": 0.4475899005355777,
"grad_norm": 0.8999844193458557,
"learning_rate": 9.73453764885963e-05,
"loss": 0.6237145900726319,
"step": 585,
"token_acc": 0.8252273797988892
},
{
"epoch": 0.45141545524100996,
"grad_norm": 0.9064670205116272,
"learning_rate": 9.727714932078088e-05,
"loss": 0.6549233436584473,
"step": 590,
"token_acc": 0.8153916001319885
},
{
"epoch": 0.45524100994644223,
"grad_norm": 1.0747268199920654,
"learning_rate": 9.720808100761729e-05,
"loss": 0.6232728004455567,
"step": 595,
"token_acc": 0.8211687207221985
},
{
"epoch": 0.4590665646518745,
"grad_norm": 1.031503438949585,
"learning_rate": 9.713817277795482e-05,
"loss": 0.6111268043518067,
"step": 600,
"token_acc": 0.8248355984687805
},
{
"epoch": 0.4590665646518745,
"eval_loss": 0.634019136428833,
"eval_runtime": 7.7263,
"eval_samples_per_second": 13.46,
"eval_steps_per_second": 1.683,
"eval_token_acc": 0.8194121718406677,
"step": 600
},
{
"epoch": 0.4628921193573068,
"grad_norm": 18.878767013549805,
"learning_rate": 9.706742587558635e-05,
"loss": 0.7319217681884765,
"step": 605,
"token_acc": 0.8135314583778381
},
{
"epoch": 0.4667176740627391,
"grad_norm": 0.9823316931724548,
"learning_rate": 9.699584155922625e-05,
"loss": 0.658491849899292,
"step": 610,
"token_acc": 0.8164398670196533
},
{
"epoch": 0.4705432287681714,
"grad_norm": 1.1845817565917969,
"learning_rate": 9.692342110248802e-05,
"loss": 0.6585088729858398,
"step": 615,
"token_acc": 0.8140710592269897
},
{
"epoch": 0.4743687834736037,
"grad_norm": 1.0284193754196167,
"learning_rate": 9.685016579386159e-05,
"loss": 0.6060408592224121,
"step": 620,
"token_acc": 0.8255147933959961
},
{
"epoch": 0.47819433817903595,
"grad_norm": 1.0485318899154663,
"learning_rate": 9.677607693669035e-05,
"loss": 0.6855095863342285,
"step": 625,
"token_acc": 0.8098092079162598
},
{
"epoch": 0.4820198928844682,
"grad_norm": 2.119432210922241,
"learning_rate": 9.67011558491481e-05,
"loss": 0.6514041423797607,
"step": 630,
"token_acc": 0.8163265585899353
},
{
"epoch": 0.48584544758990056,
"grad_norm": 0.9313147664070129,
"learning_rate": 9.662540386421546e-05,
"loss": 0.6687870025634766,
"step": 635,
"token_acc": 0.8119432330131531
},
{
"epoch": 0.48967100229533284,
"grad_norm": 0.9492276310920715,
"learning_rate": 9.65488223296562e-05,
"loss": 0.6563722610473632,
"step": 640,
"token_acc": 0.8168354034423828
},
{
"epoch": 0.4934965570007651,
"grad_norm": 1.0297837257385254,
"learning_rate": 9.64714126079933e-05,
"loss": 0.5913913726806641,
"step": 645,
"token_acc": 0.828011691570282
},
{
"epoch": 0.4973221117061974,
"grad_norm": 1.0799224376678467,
"learning_rate": 9.639317607648463e-05,
"loss": 0.6493720054626465,
"step": 650,
"token_acc": 0.8191680312156677
},
{
"epoch": 0.4973221117061974,
"eval_loss": 0.6336340308189392,
"eval_runtime": 8.085,
"eval_samples_per_second": 12.863,
"eval_steps_per_second": 1.608,
"eval_token_acc": 0.8203945755958557,
"step": 650
},
{
"epoch": 0.5011476664116297,
"grad_norm": 0.9438362717628479,
"learning_rate": 9.631411412709856e-05,
"loss": 0.634061050415039,
"step": 655,
"token_acc": 0.8196708559989929
},
{
"epoch": 0.504973221117062,
"grad_norm": 0.9886628985404968,
"learning_rate": 9.623422816648905e-05,
"loss": 0.6314868450164794,
"step": 660,
"token_acc": 0.8192417025566101
},
{
"epoch": 0.5087987758224942,
"grad_norm": 1.053757667541504,
"learning_rate": 9.615351961597075e-05,
"loss": 0.6161402225494385,
"step": 665,
"token_acc": 0.8249170780181885
},
{
"epoch": 0.5126243305279266,
"grad_norm": 0.8857008814811707,
"learning_rate": 9.607198991149365e-05,
"loss": 0.6382771968841553,
"step": 670,
"token_acc": 0.8191618323326111
},
{
"epoch": 0.5164498852333589,
"grad_norm": 0.9176872968673706,
"learning_rate": 9.598964050361749e-05,
"loss": 0.6668461799621582,
"step": 675,
"token_acc": 0.8112070560455322
},
{
"epoch": 0.5202754399387911,
"grad_norm": 0.8668197393417358,
"learning_rate": 9.590647285748613e-05,
"loss": 0.6178393363952637,
"step": 680,
"token_acc": 0.8246564269065857
},
{
"epoch": 0.5241009946442234,
"grad_norm": 0.8694312572479248,
"learning_rate": 9.582248845280121e-05,
"loss": 0.6056000709533691,
"step": 685,
"token_acc": 0.8267983198165894
},
{
"epoch": 0.5279265493496557,
"grad_norm": 1.0597003698349,
"learning_rate": 9.57376887837961e-05,
"loss": 0.6181661128997803,
"step": 690,
"token_acc": 0.8232805728912354
},
{
"epoch": 0.531752104055088,
"grad_norm": 0.8571362495422363,
"learning_rate": 9.565207535920906e-05,
"loss": 0.6172348976135253,
"step": 695,
"token_acc": 0.8221156597137451
},
{
"epoch": 0.5355776587605203,
"grad_norm": 0.9073564410209656,
"learning_rate": 9.556564970225666e-05,
"loss": 0.6466682434082032,
"step": 700,
"token_acc": 0.8197444081306458
},
{
"epoch": 0.5355776587605203,
"eval_loss": 0.6152887344360352,
"eval_runtime": 7.5903,
"eval_samples_per_second": 13.702,
"eval_steps_per_second": 1.713,
"eval_token_acc": 0.8228907585144043,
"step": 700
},
{
"epoch": 0.5394032134659525,
"grad_norm": 0.9663663506507874,
"learning_rate": 9.547841335060641e-05,
"loss": 0.6051031112670898,
"step": 705,
"token_acc": 0.8252653479576111
},
{
"epoch": 0.5432287681713849,
"grad_norm": 0.9873702526092529,
"learning_rate": 9.539036785634961e-05,
"loss": 0.6133259296417236,
"step": 710,
"token_acc": 0.8265376687049866
},
{
"epoch": 0.5470543228768171,
"grad_norm": 0.8775202035903931,
"learning_rate": 9.530151478597366e-05,
"loss": 0.6536783218383789,
"step": 715,
"token_acc": 0.8136675357818604
},
{
"epoch": 0.5508798775822494,
"grad_norm": 0.8767590522766113,
"learning_rate": 9.521185572033416e-05,
"loss": 0.5738767147064209,
"step": 720,
"token_acc": 0.8351121544837952
},
{
"epoch": 0.5547054322876818,
"grad_norm": 0.9340411424636841,
"learning_rate": 9.512139225462682e-05,
"loss": 0.60714693069458,
"step": 725,
"token_acc": 0.8243422508239746
},
{
"epoch": 0.558530986993114,
"grad_norm": 0.924868643283844,
"learning_rate": 9.503012599835907e-05,
"loss": 0.5976818084716797,
"step": 730,
"token_acc": 0.8307338953018188
},
{
"epoch": 0.5623565416985463,
"grad_norm": 1.1880912780761719,
"learning_rate": 9.493805857532148e-05,
"loss": 0.7305125236511231,
"step": 735,
"token_acc": 0.7984393239021301
},
{
"epoch": 0.5661820964039785,
"grad_norm": 0.8552014827728271,
"learning_rate": 9.48451916235587e-05,
"loss": 0.631963062286377,
"step": 740,
"token_acc": 0.8215923309326172
},
{
"epoch": 0.5700076511094109,
"grad_norm": 0.9064537882804871,
"learning_rate": 9.475152679534052e-05,
"loss": 0.5955155849456787,
"step": 745,
"token_acc": 0.8277559876441956
},
{
"epoch": 0.5738332058148432,
"grad_norm": 0.953490138053894,
"learning_rate": 9.465706575713236e-05,
"loss": 0.5581603050231934,
"step": 750,
"token_acc": 0.8392514586448669
},
{
"epoch": 0.5738332058148432,
"eval_loss": 0.6101195812225342,
"eval_runtime": 7.839,
"eval_samples_per_second": 13.267,
"eval_steps_per_second": 1.658,
"eval_token_acc": 0.8254771828651428,
"step": 750
},
{
"epoch": 0.5776587605202754,
"grad_norm": 0.9111331105232239,
"learning_rate": 9.456181018956567e-05,
"loss": 0.5761038780212402,
"step": 755,
"token_acc": 0.8335671424865723
},
{
"epoch": 0.5814843152257078,
"grad_norm": 0.9279806613922119,
"learning_rate": 9.446576178740795e-05,
"loss": 0.6236689567565918,
"step": 760,
"token_acc": 0.8229003548622131
},
{
"epoch": 0.58530986993114,
"grad_norm": 0.8497107028961182,
"learning_rate": 9.436892225953269e-05,
"loss": 0.6130060672760009,
"step": 765,
"token_acc": 0.8241313099861145
},
{
"epoch": 0.5891354246365723,
"grad_norm": 0.933496356010437,
"learning_rate": 9.427129332888891e-05,
"loss": 0.6331747055053711,
"step": 770,
"token_acc": 0.8258751034736633
},
{
"epoch": 0.5929609793420046,
"grad_norm": 0.95807945728302,
"learning_rate": 9.417287673247052e-05,
"loss": 0.5901139259338379,
"step": 775,
"token_acc": 0.8336220383644104
},
{
"epoch": 0.5967865340474369,
"grad_norm": 0.9931139349937439,
"learning_rate": 9.407367422128547e-05,
"loss": 0.6363272666931152,
"step": 780,
"token_acc": 0.8183371424674988
},
{
"epoch": 0.6006120887528692,
"grad_norm": 0.8274650573730469,
"learning_rate": 9.397368756032445e-05,
"loss": 0.5664173603057862,
"step": 785,
"token_acc": 0.8378447890281677
},
{
"epoch": 0.6044376434583014,
"grad_norm": 1.017050862312317,
"learning_rate": 9.387291852852967e-05,
"loss": 0.6467793464660645,
"step": 790,
"token_acc": 0.818406343460083
},
{
"epoch": 0.6082631981637338,
"grad_norm": 0.8612256050109863,
"learning_rate": 9.377136891876306e-05,
"loss": 0.644353199005127,
"step": 795,
"token_acc": 0.8149409294128418
},
{
"epoch": 0.612088752869166,
"grad_norm": 0.9359307289123535,
"learning_rate": 9.366904053777447e-05,
"loss": 0.6541380882263184,
"step": 800,
"token_acc": 0.8136578798294067
},
{
"epoch": 0.612088752869166,
"eval_loss": 0.600931704044342,
"eval_runtime": 7.7335,
"eval_samples_per_second": 13.448,
"eval_steps_per_second": 1.681,
"eval_token_acc": 0.8262491226196289,
"step": 800
},
{
"epoch": 0.6159143075745983,
"grad_norm": 0.8562702536582947,
"learning_rate": 9.356593520616948e-05,
"loss": 0.5768568038940429,
"step": 805,
"token_acc": 0.8369309902191162
},
{
"epoch": 0.6197398622800306,
"grad_norm": 0.8822196125984192,
"learning_rate": 9.3462054758377e-05,
"loss": 0.6508576393127441,
"step": 810,
"token_acc": 0.8174927234649658
},
{
"epoch": 0.6235654169854629,
"grad_norm": 0.8938590288162231,
"learning_rate": 9.335740104261664e-05,
"loss": 0.6667316436767579,
"step": 815,
"token_acc": 0.8100781440734863
},
{
"epoch": 0.6273909716908952,
"grad_norm": 1.007367491722107,
"learning_rate": 9.32519759208659e-05,
"loss": 0.72325439453125,
"step": 820,
"token_acc": 0.8077250123023987
},
{
"epoch": 0.6312165263963274,
"grad_norm": 1.01559579372406,
"learning_rate": 9.314578126882691e-05,
"loss": 0.5955130577087402,
"step": 825,
"token_acc": 0.8294063806533813
},
{
"epoch": 0.6350420811017597,
"grad_norm": 0.9418911933898926,
"learning_rate": 9.303881897589315e-05,
"loss": 0.6099714279174805,
"step": 830,
"token_acc": 0.8279644250869751
},
{
"epoch": 0.6388676358071921,
"grad_norm": 0.9409440755844116,
"learning_rate": 9.29310909451158e-05,
"loss": 0.5885293006896972,
"step": 835,
"token_acc": 0.8318097591400146
},
{
"epoch": 0.6426931905126243,
"grad_norm": 0.9052807688713074,
"learning_rate": 9.28225990931699e-05,
"loss": 0.5844202995300293,
"step": 840,
"token_acc": 0.8323644399642944
},
{
"epoch": 0.6465187452180566,
"grad_norm": 1.170585036277771,
"learning_rate": 9.271334535032026e-05,
"loss": 0.6612658500671387,
"step": 845,
"token_acc": 0.8123800754547119
},
{
"epoch": 0.6503442999234889,
"grad_norm": 0.89767986536026,
"learning_rate": 9.260333166038704e-05,
"loss": 0.6106939315795898,
"step": 850,
"token_acc": 0.8253637552261353
},
{
"epoch": 0.6503442999234889,
"eval_loss": 0.595952033996582,
"eval_runtime": 7.7396,
"eval_samples_per_second": 13.437,
"eval_steps_per_second": 1.68,
"eval_token_acc": 0.8275924324989319,
"step": 850
},
{
"epoch": 0.6541698546289212,
"grad_norm": 0.8901084661483765,
"learning_rate": 9.249255998071126e-05,
"loss": 0.5618688106536865,
"step": 855,
"token_acc": 0.8380252718925476
},
{
"epoch": 0.6579954093343535,
"grad_norm": 0.8414104580879211,
"learning_rate": 9.238103228211997e-05,
"loss": 0.5890965461730957,
"step": 860,
"token_acc": 0.8292516469955444
},
{
"epoch": 0.6618209640397857,
"grad_norm": 0.8542090058326721,
"learning_rate": 9.226875054889108e-05,
"loss": 0.5492356300354004,
"step": 865,
"token_acc": 0.8417258858680725
},
{
"epoch": 0.6656465187452181,
"grad_norm": 0.928252100944519,
"learning_rate": 9.21557167787182e-05,
"loss": 0.6059693813323974,
"step": 870,
"token_acc": 0.827387273311615
},
{
"epoch": 0.6694720734506503,
"grad_norm": 0.8323174118995667,
"learning_rate": 9.204193298267496e-05,
"loss": 0.6152177810668945,
"step": 875,
"token_acc": 0.8236430287361145
},
{
"epoch": 0.6732976281560826,
"grad_norm": 0.8953769207000732,
"learning_rate": 9.192740118517935e-05,
"loss": 0.6013946056365966,
"step": 880,
"token_acc": 0.8297914862632751
},
{
"epoch": 0.677123182861515,
"grad_norm": 0.9411488771438599,
"learning_rate": 9.181212342395764e-05,
"loss": 0.521054458618164,
"step": 885,
"token_acc": 0.8486282229423523
},
{
"epoch": 0.6809487375669472,
"grad_norm": 0.9547863602638245,
"learning_rate": 9.169610175000812e-05,
"loss": 0.5880234718322754,
"step": 890,
"token_acc": 0.8322908878326416
},
{
"epoch": 0.6847742922723795,
"grad_norm": 1.0470699071884155,
"learning_rate": 9.157933822756459e-05,
"loss": 0.6081759452819824,
"step": 895,
"token_acc": 0.8250705003738403
},
{
"epoch": 0.6885998469778117,
"grad_norm": 0.9556779861450195,
"learning_rate": 9.146183493405975e-05,
"loss": 0.6601164817810059,
"step": 900,
"token_acc": 0.8116152286529541
},
{
"epoch": 0.6885998469778117,
"eval_loss": 0.5903816223144531,
"eval_runtime": 7.6904,
"eval_samples_per_second": 13.523,
"eval_steps_per_second": 1.69,
"eval_token_acc": 0.8289057016372681,
"step": 900
},
{
"epoch": 0.6924254016832441,
"grad_norm": 1.1069297790527344,
"learning_rate": 9.13435939600881e-05,
"loss": 0.6385367393493653,
"step": 905,
"token_acc": 0.8162096738815308
},
{
"epoch": 0.6962509563886764,
"grad_norm": 0.9318839311599731,
"learning_rate": 9.12246174093688e-05,
"loss": 0.604517650604248,
"step": 910,
"token_acc": 0.82686847448349
},
{
"epoch": 0.7000765110941086,
"grad_norm": 0.8273342251777649,
"learning_rate": 9.110490739870824e-05,
"loss": 0.6841697216033935,
"step": 915,
"token_acc": 0.8044203519821167
},
{
"epoch": 0.703902065799541,
"grad_norm": 0.8293759822845459,
"learning_rate": 9.098446605796239e-05,
"loss": 0.5717193603515625,
"step": 920,
"token_acc": 0.8351298570632935
},
{
"epoch": 0.7077276205049732,
"grad_norm": 7.753383636474609,
"learning_rate": 9.086329552999891e-05,
"loss": 0.5882965564727783,
"step": 925,
"token_acc": 0.8285040259361267
},
{
"epoch": 0.7115531752104055,
"grad_norm": 0.9893306493759155,
"learning_rate": 9.074139797065897e-05,
"loss": 0.648917293548584,
"step": 930,
"token_acc": 0.8116658329963684
},
{
"epoch": 0.7153787299158378,
"grad_norm": 0.902746856212616,
"learning_rate": 9.061877554871896e-05,
"loss": 0.6094418525695801,
"step": 935,
"token_acc": 0.8259324431419373
},
{
"epoch": 0.7192042846212701,
"grad_norm": 0.9152299165725708,
"learning_rate": 9.049543044585187e-05,
"loss": 0.6678308486938477,
"step": 940,
"token_acc": 0.816949725151062
},
{
"epoch": 0.7230298393267024,
"grad_norm": 1.0613242387771606,
"learning_rate": 9.03713648565885e-05,
"loss": 0.6197181701660156,
"step": 945,
"token_acc": 0.8243659138679504
},
{
"epoch": 0.7268553940321346,
"grad_norm": 0.7965312600135803,
"learning_rate": 9.024658098827838e-05,
"loss": 0.6047243118286133,
"step": 950,
"token_acc": 0.8313871622085571
},
{
"epoch": 0.7268553940321346,
"eval_loss": 0.587164044380188,
"eval_runtime": 7.7893,
"eval_samples_per_second": 13.352,
"eval_steps_per_second": 1.669,
"eval_token_acc": 0.8293668031692505,
"step": 950
},
{
"epoch": 0.730680948737567,
"grad_norm": 0.8924623131752014,
"learning_rate": 9.012108106105048e-05,
"loss": 0.5776640892028808,
"step": 955,
"token_acc": 0.8302121758460999
},
{
"epoch": 0.7345065034429993,
"grad_norm": 1.0438350439071655,
"learning_rate": 8.99948673077738e-05,
"loss": 0.5650456428527832,
"step": 960,
"token_acc": 0.8433432579040527
},
{
"epoch": 0.7383320581484315,
"grad_norm": 0.8841288685798645,
"learning_rate": 8.986794197401754e-05,
"loss": 0.5597739219665527,
"step": 965,
"token_acc": 0.8350304365158081
},
{
"epoch": 0.7421576128538638,
"grad_norm": 0.9303543567657471,
"learning_rate": 8.974030731801127e-05,
"loss": 0.6170159816741944,
"step": 970,
"token_acc": 0.8251381516456604
},
{
"epoch": 0.7459831675592961,
"grad_norm": 1.05469810962677,
"learning_rate": 8.961196561060454e-05,
"loss": 0.61129789352417,
"step": 975,
"token_acc": 0.8258439302444458
},
{
"epoch": 0.7498087222647284,
"grad_norm": 0.8528873920440674,
"learning_rate": 8.948291913522677e-05,
"loss": 0.642275619506836,
"step": 980,
"token_acc": 0.8284429907798767
},
{
"epoch": 0.7536342769701607,
"grad_norm": 0.7755897641181946,
"learning_rate": 8.935317018784637e-05,
"loss": 0.5369032859802246,
"step": 985,
"token_acc": 0.8431283235549927
},
{
"epoch": 0.7574598316755929,
"grad_norm": 0.8636773228645325,
"learning_rate": 8.922272107693e-05,
"loss": 0.5884841442108154,
"step": 990,
"token_acc": 0.830573558807373
},
{
"epoch": 0.7612853863810253,
"grad_norm": 0.8464745283126831,
"learning_rate": 8.90915741234015e-05,
"loss": 0.5174911022186279,
"step": 995,
"token_acc": 0.8450327515602112
},
{
"epoch": 0.7651109410864575,
"grad_norm": 0.8121261596679688,
"learning_rate": 8.895973166060058e-05,
"loss": 0.5794853687286377,
"step": 1000,
"token_acc": 0.8360881209373474
},
{
"epoch": 0.7651109410864575,
"eval_loss": 0.5729076862335205,
"eval_runtime": 7.9584,
"eval_samples_per_second": 13.068,
"eval_steps_per_second": 1.634,
"eval_token_acc": 0.8330559730529785,
"step": 1000
},
{
"epoch": 0.7689364957918898,
"grad_norm": 0.8082830309867859,
"learning_rate": 8.882719603424133e-05,
"loss": 0.6191754341125488,
"step": 1005,
"token_acc": 0.8256863355636597
},
{
"epoch": 0.7727620504973222,
"grad_norm": 0.8163895010948181,
"learning_rate": 8.86939696023704e-05,
"loss": 0.5695658683776855,
"step": 1010,
"token_acc": 0.8331784605979919
},
{
"epoch": 0.7765876052027544,
"grad_norm": 0.8397212624549866,
"learning_rate": 8.856005473532519e-05,
"loss": 0.5332405090332031,
"step": 1015,
"token_acc": 0.8452962636947632
},
{
"epoch": 0.7804131599081867,
"grad_norm": 0.8272839188575745,
"learning_rate": 8.842545381569155e-05,
"loss": 0.5343279838562012,
"step": 1020,
"token_acc": 0.8402997255325317
},
{
"epoch": 0.7842387146136189,
"grad_norm": 0.8609519004821777,
"learning_rate": 8.829016923826144e-05,
"loss": 0.5459603309631348,
"step": 1025,
"token_acc": 0.8402543067932129
},
{
"epoch": 0.7880642693190513,
"grad_norm": 0.8439111113548279,
"learning_rate": 8.815420340999033e-05,
"loss": 0.5824572563171386,
"step": 1030,
"token_acc": 0.8306134343147278
},
{
"epoch": 0.7918898240244836,
"grad_norm": 0.8207530975341797,
"learning_rate": 8.801755874995437e-05,
"loss": 0.5932113647460937,
"step": 1035,
"token_acc": 0.8294033408164978
},
{
"epoch": 0.7957153787299158,
"grad_norm": 0.9178765416145325,
"learning_rate": 8.788023768930732e-05,
"loss": 0.5900128364562989,
"step": 1040,
"token_acc": 0.8334224820137024
},
{
"epoch": 0.7995409334353482,
"grad_norm": 0.7986139059066772,
"learning_rate": 8.774224267123734e-05,
"loss": 0.6000078678131103,
"step": 1045,
"token_acc": 0.8272825479507446
},
{
"epoch": 0.8033664881407804,
"grad_norm": 0.8349852561950684,
"learning_rate": 8.760357615092351e-05,
"loss": 0.5280231475830078,
"step": 1050,
"token_acc": 0.8440104722976685
},
{
"epoch": 0.8033664881407804,
"eval_loss": 0.574630856513977,
"eval_runtime": 7.7226,
"eval_samples_per_second": 13.467,
"eval_steps_per_second": 1.683,
"eval_token_acc": 0.833396852016449,
"step": 1050
},
{
"epoch": 0.8071920428462127,
"grad_norm": 0.7667945027351379,
"learning_rate": 8.746424059549213e-05,
"loss": 0.5487701416015625,
"step": 1055,
"token_acc": 0.8400689959526062
},
{
"epoch": 0.811017597551645,
"grad_norm": 0.9147979617118835,
"learning_rate": 8.732423848397284e-05,
"loss": 0.5697606563568115,
"step": 1060,
"token_acc": 0.8328049182891846
},
{
"epoch": 0.8148431522570773,
"grad_norm": 0.8798291087150574,
"learning_rate": 8.718357230725449e-05,
"loss": 0.5843188285827636,
"step": 1065,
"token_acc": 0.8351316452026367
},
{
"epoch": 0.8186687069625096,
"grad_norm": 0.9299157857894897,
"learning_rate": 8.704224456804087e-05,
"loss": 0.6090686798095704,
"step": 1070,
"token_acc": 0.8255612850189209
},
{
"epoch": 0.8224942616679418,
"grad_norm": 0.8285570740699768,
"learning_rate": 8.690025778080613e-05,
"loss": 0.5678855419158936,
"step": 1075,
"token_acc": 0.834744930267334
},
{
"epoch": 0.8263198163733741,
"grad_norm": 1.0449912548065186,
"learning_rate": 8.67576144717501e-05,
"loss": 0.5510326385498047,
"step": 1080,
"token_acc": 0.8414307832717896
},
{
"epoch": 0.8301453710788065,
"grad_norm": 0.7922863364219666,
"learning_rate": 8.661431717875328e-05,
"loss": 0.5484563827514648,
"step": 1085,
"token_acc": 0.8401945233345032
},
{
"epoch": 0.8339709257842387,
"grad_norm": 1.0209932327270508,
"learning_rate": 8.647036845133172e-05,
"loss": 0.5764856338500977,
"step": 1090,
"token_acc": 0.8333871960639954
},
{
"epoch": 0.837796480489671,
"grad_norm": 0.8326112627983093,
"learning_rate": 8.632577085059168e-05,
"loss": 0.6004890441894531,
"step": 1095,
"token_acc": 0.827037513256073
},
{
"epoch": 0.8416220351951033,
"grad_norm": 0.7816240787506104,
"learning_rate": 8.618052694918399e-05,
"loss": 0.5333565711975098,
"step": 1100,
"token_acc": 0.8430129885673523
},
{
"epoch": 0.8416220351951033,
"eval_loss": 0.5720469951629639,
"eval_runtime": 7.8984,
"eval_samples_per_second": 13.167,
"eval_steps_per_second": 1.646,
"eval_token_acc": 0.8314921259880066,
"step": 1100
},
{
"epoch": 0.8454475899005356,
"grad_norm": 0.9619238376617432,
"learning_rate": 8.603463933125842e-05,
"loss": 0.5509546756744385,
"step": 1105,
"token_acc": 0.8384957313537598
},
{
"epoch": 0.8492731446059678,
"grad_norm": 0.9528924822807312,
"learning_rate": 8.588811059241755e-05,
"loss": 0.6007543563842773,
"step": 1110,
"token_acc": 0.8273714780807495
},
{
"epoch": 0.8530986993114001,
"grad_norm": 0.812016487121582,
"learning_rate": 8.574094333967064e-05,
"loss": 0.5877734661102295,
"step": 1115,
"token_acc": 0.8291584253311157
},
{
"epoch": 0.8569242540168325,
"grad_norm": 1.103339433670044,
"learning_rate": 8.559314019138727e-05,
"loss": 0.6196231842041016,
"step": 1120,
"token_acc": 0.8281660676002502
},
{
"epoch": 0.8607498087222647,
"grad_norm": 0.9961858987808228,
"learning_rate": 8.544470377725078e-05,
"loss": 0.571223258972168,
"step": 1125,
"token_acc": 0.8321356177330017
},
{
"epoch": 0.864575363427697,
"grad_norm": 0.8015458583831787,
"learning_rate": 8.529563673821141e-05,
"loss": 0.538951301574707,
"step": 1130,
"token_acc": 0.8429505228996277
},
{
"epoch": 0.8684009181331293,
"grad_norm": 0.8478720784187317,
"learning_rate": 8.514594172643934e-05,
"loss": 0.5572677612304687,
"step": 1135,
"token_acc": 0.8356977105140686
},
{
"epoch": 0.8722264728385616,
"grad_norm": 0.814361572265625,
"learning_rate": 8.499562140527754e-05,
"loss": 0.5883401870727539,
"step": 1140,
"token_acc": 0.8291968107223511
},
{
"epoch": 0.8760520275439939,
"grad_norm": 0.8049572706222534,
"learning_rate": 8.484467844919437e-05,
"loss": 0.5637226104736328,
"step": 1145,
"token_acc": 0.8390661478042603
},
{
"epoch": 0.8798775822494261,
"grad_norm": 0.749894917011261,
"learning_rate": 8.469311554373594e-05,
"loss": 0.4973104000091553,
"step": 1150,
"token_acc": 0.8528492450714111
},
{
"epoch": 0.8798775822494261,
"eval_loss": 0.564576268196106,
"eval_runtime": 7.7633,
"eval_samples_per_second": 13.396,
"eval_steps_per_second": 1.675,
"eval_token_acc": 0.835151195526123,
"step": 1150
},
{
"epoch": 0.8837031369548585,
"grad_norm": 0.9036749005317688,
"learning_rate": 8.454093538547838e-05,
"loss": 0.5535676956176758,
"step": 1155,
"token_acc": 0.8383986949920654
},
{
"epoch": 0.8875286916602907,
"grad_norm": 0.7430348992347717,
"learning_rate": 8.438814068197988e-05,
"loss": 0.557097339630127,
"step": 1160,
"token_acc": 0.8430325388908386
},
{
"epoch": 0.891354246365723,
"grad_norm": 0.9356522560119629,
"learning_rate": 8.423473415173247e-05,
"loss": 0.5787965774536132,
"step": 1165,
"token_acc": 0.8321569561958313
},
{
"epoch": 0.8951798010711554,
"grad_norm": 0.7668983340263367,
"learning_rate": 8.40807185241137e-05,
"loss": 0.5303655624389648,
"step": 1170,
"token_acc": 0.8440219163894653
},
{
"epoch": 0.8990053557765876,
"grad_norm": 0.7720690965652466,
"learning_rate": 8.392609653933803e-05,
"loss": 0.5396030426025391,
"step": 1175,
"token_acc": 0.8430536985397339
},
{
"epoch": 0.9028309104820199,
"grad_norm": 0.7427228689193726,
"learning_rate": 8.377087094840813e-05,
"loss": 0.5650552749633789,
"step": 1180,
"token_acc": 0.8388790488243103
},
{
"epoch": 0.9066564651874521,
"grad_norm": 0.8698520660400391,
"learning_rate": 8.361504451306585e-05,
"loss": 0.5175793647766114,
"step": 1185,
"token_acc": 0.8486889004707336
},
{
"epoch": 0.9104820198928845,
"grad_norm": 0.838016927242279,
"learning_rate": 8.345862000574321e-05,
"loss": 0.5568198204040528,
"step": 1190,
"token_acc": 0.8362753391265869
},
{
"epoch": 0.9143075745983168,
"grad_norm": 0.7980285286903381,
"learning_rate": 8.330160020951299e-05,
"loss": 0.5795284748077393,
"step": 1195,
"token_acc": 0.8336633443832397
},
{
"epoch": 0.918133129303749,
"grad_norm": 0.7379786968231201,
"learning_rate": 8.314398791803916e-05,
"loss": 0.5594221115112304,
"step": 1200,
"token_acc": 0.8377372026443481
},
{
"epoch": 0.918133129303749,
"eval_loss": 0.5564058423042297,
"eval_runtime": 7.7456,
"eval_samples_per_second": 13.427,
"eval_steps_per_second": 1.678,
"eval_token_acc": 0.8375070095062256,
"step": 1200
},
{
"epoch": 0.9219586840091814,
"grad_norm": 0.8150419592857361,
"learning_rate": 8.298578593552737e-05,
"loss": 0.5221155166625977,
"step": 1205,
"token_acc": 0.8457277417182922
},
{
"epoch": 0.9257842387146136,
"grad_norm": 0.9086570739746094,
"learning_rate": 8.28269970766748e-05,
"loss": 0.574681568145752,
"step": 1210,
"token_acc": 0.8327599763870239
},
{
"epoch": 0.9296097934200459,
"grad_norm": 0.8389135599136353,
"learning_rate": 8.26676241666203e-05,
"loss": 0.5882039070129395,
"step": 1215,
"token_acc": 0.8281732797622681
},
{
"epoch": 0.9334353481254782,
"grad_norm": 1.0141870975494385,
"learning_rate": 8.250767004089399e-05,
"loss": 0.5588771820068359,
"step": 1220,
"token_acc": 0.8358601331710815
},
{
"epoch": 0.9372609028309105,
"grad_norm": 0.8374904990196228,
"learning_rate": 8.23471375453669e-05,
"loss": 0.5152300834655762,
"step": 1225,
"token_acc": 0.8489376902580261
},
{
"epoch": 0.9410864575363428,
"grad_norm": 0.8244453072547913,
"learning_rate": 8.21860295362003e-05,
"loss": 0.500080680847168,
"step": 1230,
"token_acc": 0.8521796464920044
},
{
"epoch": 0.944912012241775,
"grad_norm": 0.9917334318161011,
"learning_rate": 8.20243488797948e-05,
"loss": 0.5609046459197998,
"step": 1235,
"token_acc": 0.8397351503372192
},
{
"epoch": 0.9487375669472073,
"grad_norm": 1.4528796672821045,
"learning_rate": 8.186209845273954e-05,
"loss": 0.6106361389160156,
"step": 1240,
"token_acc": 0.8287570476531982
},
{
"epoch": 0.9525631216526397,
"grad_norm": 0.8477284908294678,
"learning_rate": 8.169928114176084e-05,
"loss": 0.534299659729004,
"step": 1245,
"token_acc": 0.8459932208061218
},
{
"epoch": 0.9563886763580719,
"grad_norm": 0.9785248041152954,
"learning_rate": 8.153589984367091e-05,
"loss": 0.5453691959381104,
"step": 1250,
"token_acc": 0.8423656821250916
},
{
"epoch": 0.9563886763580719,
"eval_loss": 0.5528830885887146,
"eval_runtime": 8.7628,
"eval_samples_per_second": 11.868,
"eval_steps_per_second": 1.484,
"eval_token_acc": 0.8377977609634399,
"step": 1250
},
{
"epoch": 0.9602142310635042,
"grad_norm": 0.7743374705314636,
"learning_rate": 8.137195746531635e-05,
"loss": 0.5649035453796387,
"step": 1255,
"token_acc": 0.83652263879776
},
{
"epoch": 0.9640397857689365,
"grad_norm": 0.9111794829368591,
"learning_rate": 8.120745692352627e-05,
"loss": 0.5429101943969726,
"step": 1260,
"token_acc": 0.8409203886985779
},
{
"epoch": 0.9678653404743688,
"grad_norm": 0.8705430030822754,
"learning_rate": 8.104240114506065e-05,
"loss": 0.5348100185394287,
"step": 1265,
"token_acc": 0.8432644009590149
},
{
"epoch": 0.9716908951798011,
"grad_norm": 0.7576097249984741,
"learning_rate": 8.087679306655804e-05,
"loss": 0.5683703422546387,
"step": 1270,
"token_acc": 0.836378276348114
},
{
"epoch": 0.9755164498852333,
"grad_norm": 1.1635630130767822,
"learning_rate": 8.07106356344834e-05,
"loss": 0.6346898078918457,
"step": 1275,
"token_acc": 0.8285390734672546
},
{
"epoch": 0.9793420045906657,
"grad_norm": 0.827690601348877,
"learning_rate": 8.054393180507572e-05,
"loss": 0.5661238193511963,
"step": 1280,
"token_acc": 0.8387032747268677
},
{
"epoch": 0.9831675592960979,
"grad_norm": 0.888037383556366,
"learning_rate": 8.037668454429534e-05,
"loss": 0.5784870624542237,
"step": 1285,
"token_acc": 0.8306419849395752
},
{
"epoch": 0.9869931140015302,
"grad_norm": 0.7650582790374756,
"learning_rate": 8.020889682777127e-05,
"loss": 0.5594500064849853,
"step": 1290,
"token_acc": 0.8358885645866394
},
{
"epoch": 0.9908186687069626,
"grad_norm": 0.8132854104042053,
"learning_rate": 8.004057164074814e-05,
"loss": 0.5590912818908691,
"step": 1295,
"token_acc": 0.8387227654457092
},
{
"epoch": 0.9946442234123948,
"grad_norm": 0.8819040656089783,
"learning_rate": 7.987171197803315e-05,
"loss": 0.5425111770629882,
"step": 1300,
"token_acc": 0.8366984128952026
},
{
"epoch": 0.9946442234123948,
"eval_loss": 0.5410341024398804,
"eval_runtime": 7.8851,
"eval_samples_per_second": 13.189,
"eval_steps_per_second": 1.649,
"eval_token_acc": 0.8402237296104431,
"step": 1300
},
{
"epoch": 0.9984697781178271,
"grad_norm": 0.7759367227554321,
"learning_rate": 7.970232084394282e-05,
"loss": 0.4794795989990234,
"step": 1305,
"token_acc": 0.8576377034187317
},
{
"epoch": 1.0022953328232593,
"grad_norm": 0.7615346908569336,
"learning_rate": 7.953240125224948e-05,
"loss": 0.4416775703430176,
"step": 1310,
"token_acc": 0.8654638528823853
},
{
"epoch": 1.0061208875286916,
"grad_norm": 0.7918492555618286,
"learning_rate": 7.936195622612767e-05,
"loss": 0.37592229843139646,
"step": 1315,
"token_acc": 0.8773406147956848
},
{
"epoch": 1.009946442234124,
"grad_norm": 0.717467725276947,
"learning_rate": 7.919098879810036e-05,
"loss": 0.4267716407775879,
"step": 1320,
"token_acc": 0.8671300411224365
},
{
"epoch": 1.0137719969395562,
"grad_norm": 0.7892487645149231,
"learning_rate": 7.901950200998493e-05,
"loss": 0.382064151763916,
"step": 1325,
"token_acc": 0.8785242438316345
},
{
"epoch": 1.0175975516449884,
"grad_norm": 0.7296363711357117,
"learning_rate": 7.884749891283922e-05,
"loss": 0.36800203323364256,
"step": 1330,
"token_acc": 0.881615400314331
},
{
"epoch": 1.0214231063504209,
"grad_norm": 1.087638258934021,
"learning_rate": 7.867498256690704e-05,
"loss": 0.37799820899963377,
"step": 1335,
"token_acc": 0.877220094203949
},
{
"epoch": 1.025248661055853,
"grad_norm": 0.7339928150177002,
"learning_rate": 7.850195604156385e-05,
"loss": 0.37110204696655275,
"step": 1340,
"token_acc": 0.884996771812439
},
{
"epoch": 1.0290742157612853,
"grad_norm": 0.8464434742927551,
"learning_rate": 7.832842241526212e-05,
"loss": 0.3805660009384155,
"step": 1345,
"token_acc": 0.879789412021637
},
{
"epoch": 1.0328997704667178,
"grad_norm": 0.689896821975708,
"learning_rate": 7.815438477547655e-05,
"loss": 0.3583992481231689,
"step": 1350,
"token_acc": 0.8869645595550537
},
{
"epoch": 1.0328997704667178,
"eval_loss": 0.5596266984939575,
"eval_runtime": 7.7953,
"eval_samples_per_second": 13.341,
"eval_steps_per_second": 1.668,
"eval_token_acc": 0.8413565754890442,
"step": 1350
},
{
"epoch": 1.03672532517215,
"grad_norm": 0.6902993321418762,
"learning_rate": 7.797984621864916e-05,
"loss": 0.42625932693481444,
"step": 1355,
"token_acc": 0.8614287376403809
},
{
"epoch": 1.0405508798775822,
"grad_norm": 0.7562316060066223,
"learning_rate": 7.780480985013413e-05,
"loss": 0.3689578533172607,
"step": 1360,
"token_acc": 0.8820473551750183
},
{
"epoch": 1.0443764345830144,
"grad_norm": 0.6517492532730103,
"learning_rate": 7.762927878414267e-05,
"loss": 0.32921748161315917,
"step": 1365,
"token_acc": 0.8926072120666504
},
{
"epoch": 1.0482019892884469,
"grad_norm": 0.8397619724273682,
"learning_rate": 7.745325614368755e-05,
"loss": 0.3830822229385376,
"step": 1370,
"token_acc": 0.8756515383720398
},
{
"epoch": 1.052027543993879,
"grad_norm": 0.7649819254875183,
"learning_rate": 7.727674506052743e-05,
"loss": 0.37806334495544436,
"step": 1375,
"token_acc": 0.8804787993431091
},
{
"epoch": 1.0558530986993113,
"grad_norm": 0.7365129590034485,
"learning_rate": 7.709974867511138e-05,
"loss": 0.3349802017211914,
"step": 1380,
"token_acc": 0.8934342861175537
},
{
"epoch": 1.0596786534047438,
"grad_norm": 0.884164571762085,
"learning_rate": 7.692227013652278e-05,
"loss": 0.36524980068206786,
"step": 1385,
"token_acc": 0.8806947469711304
},
{
"epoch": 1.063504208110176,
"grad_norm": 0.6860577464103699,
"learning_rate": 7.674431260242338e-05,
"loss": 0.367877721786499,
"step": 1390,
"token_acc": 0.8842624425888062
},
{
"epoch": 1.0673297628156082,
"grad_norm": 0.7009398937225342,
"learning_rate": 7.656587923899718e-05,
"loss": 0.3564207315444946,
"step": 1395,
"token_acc": 0.8837472200393677
},
{
"epoch": 1.0711553175210407,
"grad_norm": 0.7540706396102905,
"learning_rate": 7.638697322089398e-05,
"loss": 0.3640351057052612,
"step": 1400,
"token_acc": 0.8847005367279053
},
{
"epoch": 1.0711553175210407,
"eval_loss": 0.5507253408432007,
"eval_runtime": 7.7117,
"eval_samples_per_second": 13.486,
"eval_steps_per_second": 1.686,
"eval_token_acc": 0.8438527584075928,
"step": 1400
},
{
"epoch": 1.0749808722264729,
"grad_norm": 0.6863798499107361,
"learning_rate": 7.620759773117299e-05,
"loss": 0.3779132604598999,
"step": 1405,
"token_acc": 0.8826145529747009
},
{
"epoch": 1.078806426931905,
"grad_norm": 0.7733192443847656,
"learning_rate": 7.602775596124611e-05,
"loss": 0.3633275032043457,
"step": 1410,
"token_acc": 0.886398196220398
},
{
"epoch": 1.0826319816373373,
"grad_norm": 0.7949317693710327,
"learning_rate": 7.584745111082127e-05,
"loss": 0.3376323699951172,
"step": 1415,
"token_acc": 0.8887669444084167
},
{
"epoch": 1.0864575363427698,
"grad_norm": 0.6832326650619507,
"learning_rate": 7.566668638784542e-05,
"loss": 0.33144965171813967,
"step": 1420,
"token_acc": 0.8916584849357605
},
{
"epoch": 1.090283091048202,
"grad_norm": 0.8551044464111328,
"learning_rate": 7.548546500844742e-05,
"loss": 0.3287867546081543,
"step": 1425,
"token_acc": 0.8930348753929138
},
{
"epoch": 1.0941086457536342,
"grad_norm": 0.7423316240310669,
"learning_rate": 7.530379019688092e-05,
"loss": 0.3902039289474487,
"step": 1430,
"token_acc": 0.8757656812667847
},
{
"epoch": 1.0979342004590666,
"grad_norm": 0.8404172658920288,
"learning_rate": 7.51216651854669e-05,
"loss": 0.390373969078064,
"step": 1435,
"token_acc": 0.8776587843894958
},
{
"epoch": 1.1017597551644989,
"grad_norm": 0.8963853120803833,
"learning_rate": 7.493909321453625e-05,
"loss": 0.4068464279174805,
"step": 1440,
"token_acc": 0.8700478076934814
},
{
"epoch": 1.105585309869931,
"grad_norm": 0.7311558723449707,
"learning_rate": 7.475607753237202e-05,
"loss": 0.3884909629821777,
"step": 1445,
"token_acc": 0.8745863437652588
},
{
"epoch": 1.1094108645753635,
"grad_norm": 0.7590047121047974,
"learning_rate": 7.457262139515171e-05,
"loss": 0.3895232677459717,
"step": 1450,
"token_acc": 0.8725248575210571
},
{
"epoch": 1.1094108645753635,
"eval_loss": 0.5504098534584045,
"eval_runtime": 7.7559,
"eval_samples_per_second": 13.409,
"eval_steps_per_second": 1.676,
"eval_token_acc": 0.8443038463592529,
"step": 1450
},
{
"epoch": 1.1132364192807958,
"grad_norm": 0.882554292678833,
"learning_rate": 7.438872806688934e-05,
"loss": 0.40759758949279784,
"step": 1455,
"token_acc": 0.8712476491928101
},
{
"epoch": 1.117061973986228,
"grad_norm": 0.6808732748031616,
"learning_rate": 7.420440081937728e-05,
"loss": 0.3652071237564087,
"step": 1460,
"token_acc": 0.8835034370422363
},
{
"epoch": 1.1208875286916602,
"grad_norm": 0.837759256362915,
"learning_rate": 7.401964293212809e-05,
"loss": 0.409121036529541,
"step": 1465,
"token_acc": 0.8712127208709717
},
{
"epoch": 1.1247130833970926,
"grad_norm": 0.6652865409851074,
"learning_rate": 7.383445769231627e-05,
"loss": 0.3703787803649902,
"step": 1470,
"token_acc": 0.8831153512001038
},
{
"epoch": 1.1285386381025249,
"grad_norm": 0.8179388642311096,
"learning_rate": 7.364884839471964e-05,
"loss": 0.39147076606750486,
"step": 1475,
"token_acc": 0.8752105236053467
},
{
"epoch": 1.132364192807957,
"grad_norm": 0.719514012336731,
"learning_rate": 7.346281834166075e-05,
"loss": 0.37967238426208494,
"step": 1480,
"token_acc": 0.8796840906143188
},
{
"epoch": 1.1361897475133895,
"grad_norm": 0.9179552793502808,
"learning_rate": 7.327637084294817e-05,
"loss": 0.3995789051055908,
"step": 1485,
"token_acc": 0.8751766085624695
},
{
"epoch": 1.1400153022188217,
"grad_norm": 0.7656182050704956,
"learning_rate": 7.308950921581756e-05,
"loss": 0.34888639450073244,
"step": 1490,
"token_acc": 0.89056795835495
},
{
"epoch": 1.143840856924254,
"grad_norm": 0.7309355735778809,
"learning_rate": 7.290223678487272e-05,
"loss": 0.39315025806427,
"step": 1495,
"token_acc": 0.876833438873291
},
{
"epoch": 1.1476664116296864,
"grad_norm": 0.7618235349655151,
"learning_rate": 7.27145568820263e-05,
"loss": 0.35439176559448243,
"step": 1500,
"token_acc": 0.8836838006973267
},
{
"epoch": 1.1476664116296864,
"eval_loss": 0.5430108904838562,
"eval_runtime": 7.6873,
"eval_samples_per_second": 13.529,
"eval_steps_per_second": 1.691,
"eval_token_acc": 0.8448953628540039,
"step": 1500
},
{
"epoch": 1.1514919663351186,
"grad_norm": 0.8058356046676636,
"learning_rate": 7.25264728464407e-05,
"loss": 0.3466159820556641,
"step": 1505,
"token_acc": 0.8869272470474243
},
{
"epoch": 1.1553175210405509,
"grad_norm": 0.7806113362312317,
"learning_rate": 7.233798802446847e-05,
"loss": 0.40935721397399905,
"step": 1510,
"token_acc": 0.8709314465522766
},
{
"epoch": 1.159143075745983,
"grad_norm": 0.8264714479446411,
"learning_rate": 7.214910576959297e-05,
"loss": 0.38201849460601806,
"step": 1515,
"token_acc": 0.8780457973480225
},
{
"epoch": 1.1629686304514155,
"grad_norm": 0.6713389158248901,
"learning_rate": 7.195982944236851e-05,
"loss": 0.3252051115036011,
"step": 1520,
"token_acc": 0.892856240272522
},
{
"epoch": 1.1667941851568477,
"grad_norm": 0.7945072650909424,
"learning_rate": 7.177016241036075e-05,
"loss": 0.35387892723083497,
"step": 1525,
"token_acc": 0.8838560581207275
},
{
"epoch": 1.17061973986228,
"grad_norm": 0.8310626745223999,
"learning_rate": 7.15801080480866e-05,
"loss": 0.3746853590011597,
"step": 1530,
"token_acc": 0.8799676299095154
},
{
"epoch": 1.1744452945677124,
"grad_norm": 0.9108403325080872,
"learning_rate": 7.138966973695431e-05,
"loss": 0.36667909622192385,
"step": 1535,
"token_acc": 0.8820632100105286
},
{
"epoch": 1.1782708492731446,
"grad_norm": 0.7420673966407776,
"learning_rate": 7.119885086520329e-05,
"loss": 0.36235547065734863,
"step": 1540,
"token_acc": 0.8849785923957825
},
{
"epoch": 1.1820964039785768,
"grad_norm": 0.6693369150161743,
"learning_rate": 7.100765482784377e-05,
"loss": 0.3710158824920654,
"step": 1545,
"token_acc": 0.8811267614364624
},
{
"epoch": 1.185921958684009,
"grad_norm": 0.7249651551246643,
"learning_rate": 7.081608502659646e-05,
"loss": 0.3993852615356445,
"step": 1550,
"token_acc": 0.8718493580818176
},
{
"epoch": 1.185921958684009,
"eval_loss": 0.5383990406990051,
"eval_runtime": 7.5793,
"eval_samples_per_second": 13.722,
"eval_steps_per_second": 1.715,
"eval_token_acc": 0.8461685180664062,
"step": 1550
},
{
"epoch": 1.1897475133894415,
"grad_norm": 0.9157434105873108,
"learning_rate": 7.062414486983197e-05,
"loss": 0.3987370491027832,
"step": 1555,
"token_acc": 0.8729732036590576
},
{
"epoch": 1.1935730680948737,
"grad_norm": 0.6402376890182495,
"learning_rate": 7.043183777251024e-05,
"loss": 0.2903183698654175,
"step": 1560,
"token_acc": 0.9057296514511108
},
{
"epoch": 1.197398622800306,
"grad_norm": 0.7679566144943237,
"learning_rate": 7.023916715611969e-05,
"loss": 0.4904749870300293,
"step": 1565,
"token_acc": 0.8663699626922607
},
{
"epoch": 1.2012241775057384,
"grad_norm": 0.8699092864990234,
"learning_rate": 7.004613644861647e-05,
"loss": 0.4231747627258301,
"step": 1570,
"token_acc": 0.8677194714546204
},
{
"epoch": 1.2050497322111706,
"grad_norm": 0.6792256832122803,
"learning_rate": 6.985274908436333e-05,
"loss": 0.44817123413085935,
"step": 1575,
"token_acc": 0.8659628629684448
},
{
"epoch": 1.2088752869166028,
"grad_norm": 0.7418417930603027,
"learning_rate": 6.965900850406859e-05,
"loss": 0.33240585327148436,
"step": 1580,
"token_acc": 0.8937970399856567
},
{
"epoch": 1.2127008416220353,
"grad_norm": 0.8835020065307617,
"learning_rate": 6.946491815472496e-05,
"loss": 0.3884410381317139,
"step": 1585,
"token_acc": 0.876690149307251
},
{
"epoch": 1.2165263963274675,
"grad_norm": 0.9086595177650452,
"learning_rate": 6.927048148954812e-05,
"loss": 0.410748291015625,
"step": 1590,
"token_acc": 0.8735622763633728
},
{
"epoch": 1.2203519510328997,
"grad_norm": 0.7838605642318726,
"learning_rate": 6.907570196791538e-05,
"loss": 0.3603389739990234,
"step": 1595,
"token_acc": 0.8829374313354492
},
{
"epoch": 1.2241775057383322,
"grad_norm": 0.7454732060432434,
"learning_rate": 6.888058305530406e-05,
"loss": 0.37654249668121337,
"step": 1600,
"token_acc": 0.8782923817634583
},
{
"epoch": 1.2241775057383322,
"eval_loss": 0.5343810319900513,
"eval_runtime": 7.6236,
"eval_samples_per_second": 13.642,
"eval_steps_per_second": 1.705,
"eval_token_acc": 0.8475719690322876,
"step": 1600
},
{
"epoch": 1.2280030604437644,
"grad_norm": 0.7611352801322937,
"learning_rate": 6.868512822322981e-05,
"loss": 0.38566131591796876,
"step": 1605,
"token_acc": 0.8766804337501526
},
{
"epoch": 1.2318286151491966,
"grad_norm": 0.8874756693840027,
"learning_rate": 6.848934094918498e-05,
"loss": 0.38291475772857664,
"step": 1610,
"token_acc": 0.87657630443573
},
{
"epoch": 1.2356541698546288,
"grad_norm": 0.7193310260772705,
"learning_rate": 6.829322471657658e-05,
"loss": 0.3452467441558838,
"step": 1615,
"token_acc": 0.8881570100784302
},
{
"epoch": 1.2394797245600613,
"grad_norm": 0.661790668964386,
"learning_rate": 6.809678301466443e-05,
"loss": 0.3452208757400513,
"step": 1620,
"token_acc": 0.8885095715522766
},
{
"epoch": 1.2433052792654935,
"grad_norm": 0.8313160538673401,
"learning_rate": 6.790001933849899e-05,
"loss": 0.39090492725372317,
"step": 1625,
"token_acc": 0.8772667646408081
},
{
"epoch": 1.2471308339709257,
"grad_norm": 0.7543197870254517,
"learning_rate": 6.770293718885928e-05,
"loss": 0.37844099998474123,
"step": 1630,
"token_acc": 0.8773866295814514
},
{
"epoch": 1.2509563886763582,
"grad_norm": 0.7187685370445251,
"learning_rate": 6.750554007219047e-05,
"loss": 0.37274966239929197,
"step": 1635,
"token_acc": 0.8813634514808655
},
{
"epoch": 1.2547819433817904,
"grad_norm": 0.7216220498085022,
"learning_rate": 6.730783150054164e-05,
"loss": 0.40465946197509767,
"step": 1640,
"token_acc": 0.8722350597381592
},
{
"epoch": 1.2586074980872226,
"grad_norm": 0.808250367641449,
"learning_rate": 6.71098149915031e-05,
"loss": 0.39015932083129884,
"step": 1645,
"token_acc": 0.8755351901054382
},
{
"epoch": 1.2624330527926548,
"grad_norm": 0.6570851802825928,
"learning_rate": 6.691149406814403e-05,
"loss": 0.33088486194610595,
"step": 1650,
"token_acc": 0.8907855153083801
},
{
"epoch": 1.2624330527926548,
"eval_loss": 0.5374127626419067,
"eval_runtime": 7.8026,
"eval_samples_per_second": 13.329,
"eval_steps_per_second": 1.666,
"eval_token_acc": 0.8472611904144287,
"step": 1650
},
{
"epoch": 1.2662586074980873,
"grad_norm": 0.6985551714897156,
"learning_rate": 6.67128722589496e-05,
"loss": 0.3755918502807617,
"step": 1655,
"token_acc": 0.8816916346549988
},
{
"epoch": 1.2700841622035195,
"grad_norm": 0.7275698781013489,
"learning_rate": 6.651395309775837e-05,
"loss": 0.3765554428100586,
"step": 1660,
"token_acc": 0.8811103701591492
},
{
"epoch": 1.2739097169089517,
"grad_norm": 0.729633092880249,
"learning_rate": 6.631474012369921e-05,
"loss": 0.3696659803390503,
"step": 1665,
"token_acc": 0.8816789984703064
},
{
"epoch": 1.2777352716143842,
"grad_norm": 0.7620216012001038,
"learning_rate": 6.611523688112858e-05,
"loss": 0.35426578521728513,
"step": 1670,
"token_acc": 0.8883428573608398
},
{
"epoch": 1.2815608263198164,
"grad_norm": 0.8159366846084595,
"learning_rate": 6.591544691956723e-05,
"loss": 0.38610110282897947,
"step": 1675,
"token_acc": 0.8776164054870605
},
{
"epoch": 1.2853863810252486,
"grad_norm": 0.8567126989364624,
"learning_rate": 6.571537379363719e-05,
"loss": 0.4222766399383545,
"step": 1680,
"token_acc": 0.8723132610321045
},
{
"epoch": 1.2892119357306808,
"grad_norm": 0.8297275304794312,
"learning_rate": 6.551502106299851e-05,
"loss": 0.37399892807006835,
"step": 1685,
"token_acc": 0.8821731209754944
},
{
"epoch": 1.2930374904361133,
"grad_norm": 0.6843409538269043,
"learning_rate": 6.531439229228591e-05,
"loss": 0.3343992233276367,
"step": 1690,
"token_acc": 0.892397403717041
},
{
"epoch": 1.2968630451415455,
"grad_norm": 0.7213367819786072,
"learning_rate": 6.511349105104534e-05,
"loss": 0.38822097778320314,
"step": 1695,
"token_acc": 0.8769423365592957
},
{
"epoch": 1.300688599846978,
"grad_norm": 0.700702428817749,
"learning_rate": 6.491232091367049e-05,
"loss": 0.35975372791290283,
"step": 1700,
"token_acc": 0.8861437439918518
},
{
"epoch": 1.300688599846978,
"eval_loss": 0.526591956615448,
"eval_runtime": 7.7916,
"eval_samples_per_second": 13.348,
"eval_steps_per_second": 1.668,
"eval_token_acc": 0.8482837677001953,
"step": 1700
},
{
"epoch": 1.3045141545524102,
"grad_norm": 0.7598251104354858,
"learning_rate": 6.471088545933921e-05,
"loss": 0.3564164638519287,
"step": 1705,
"token_acc": 0.8872470855712891
},
{
"epoch": 1.3083397092578424,
"grad_norm": 0.7174568176269531,
"learning_rate": 6.450918827194978e-05,
"loss": 0.3287261962890625,
"step": 1710,
"token_acc": 0.894193708896637
},
{
"epoch": 1.3121652639632746,
"grad_norm": 0.7934249043464661,
"learning_rate": 6.430723294005726e-05,
"loss": 0.3405998468399048,
"step": 1715,
"token_acc": 0.8878347277641296
},
{
"epoch": 1.315990818668707,
"grad_norm": 0.8109247088432312,
"learning_rate": 6.410502305680946e-05,
"loss": 0.3818791389465332,
"step": 1720,
"token_acc": 0.8762706518173218
},
{
"epoch": 1.3198163733741393,
"grad_norm": 0.7905654311180115,
"learning_rate": 6.390256221988318e-05,
"loss": 0.3510235548019409,
"step": 1725,
"token_acc": 0.8884668946266174
},
{
"epoch": 1.3236419280795715,
"grad_norm": 0.7302840352058411,
"learning_rate": 6.369985403142014e-05,
"loss": 0.3860185146331787,
"step": 1730,
"token_acc": 0.8776938915252686
},
{
"epoch": 1.327467482785004,
"grad_norm": 0.7890005111694336,
"learning_rate": 6.349690209796285e-05,
"loss": 0.4002682685852051,
"step": 1735,
"token_acc": 0.8717520236968994
},
{
"epoch": 1.3312930374904361,
"grad_norm": 0.6541386842727661,
"learning_rate": 6.329371003039051e-05,
"loss": 0.3814365863800049,
"step": 1740,
"token_acc": 0.8806993365287781
},
{
"epoch": 1.3351185921958684,
"grad_norm": 0.7147980332374573,
"learning_rate": 6.309028144385472e-05,
"loss": 0.3602738380432129,
"step": 1745,
"token_acc": 0.8850005269050598
},
{
"epoch": 1.3389441469013006,
"grad_norm": 0.6951248049736023,
"learning_rate": 6.288661995771522e-05,
"loss": 0.35432114601135256,
"step": 1750,
"token_acc": 0.8871864080429077
},
{
"epoch": 1.3389441469013006,
"eval_loss": 0.5236285924911499,
"eval_runtime": 7.7868,
"eval_samples_per_second": 13.356,
"eval_steps_per_second": 1.669,
"eval_token_acc": 0.8510806560516357,
"step": 1750
},
{
"epoch": 1.342769701606733,
"grad_norm": 0.7138703465461731,
"learning_rate": 6.268272919547537e-05,
"loss": 0.3437394857406616,
"step": 1755,
"token_acc": 0.8870205283164978
},
{
"epoch": 1.3465952563121653,
"grad_norm": 0.7315565943717957,
"learning_rate": 6.247861278471785e-05,
"loss": 0.3766175270080566,
"step": 1760,
"token_acc": 0.883225679397583
},
{
"epoch": 1.3504208110175975,
"grad_norm": 0.7530694603919983,
"learning_rate": 6.227427435703997e-05,
"loss": 0.3583348035812378,
"step": 1765,
"token_acc": 0.8860324025154114
},
{
"epoch": 1.35424636572303,
"grad_norm": 0.7517703175544739,
"learning_rate": 6.206971754798913e-05,
"loss": 0.3681065559387207,
"step": 1770,
"token_acc": 0.8821339011192322
},
{
"epoch": 1.3580719204284621,
"grad_norm": 1.0113003253936768,
"learning_rate": 6.186494599699819e-05,
"loss": 0.34742186069488523,
"step": 1775,
"token_acc": 0.8917561173439026
},
{
"epoch": 1.3618974751338944,
"grad_norm": 0.9447914361953735,
"learning_rate": 6.165996334732055e-05,
"loss": 0.3852540969848633,
"step": 1780,
"token_acc": 0.8768330216407776
},
{
"epoch": 1.3657230298393266,
"grad_norm": 1.0187249183654785,
"learning_rate": 6.145477324596552e-05,
"loss": 0.41319589614868163,
"step": 1785,
"token_acc": 0.8700772523880005
},
{
"epoch": 1.369548584544759,
"grad_norm": 0.716583251953125,
"learning_rate": 6.124937934363331e-05,
"loss": 0.33546440601348876,
"step": 1790,
"token_acc": 0.8909059166908264
},
{
"epoch": 1.3733741392501913,
"grad_norm": 0.7670001983642578,
"learning_rate": 6.104378529465009e-05,
"loss": 0.35624008178710936,
"step": 1795,
"token_acc": 0.8858749270439148
},
{
"epoch": 1.3771996939556237,
"grad_norm": 0.7541671991348267,
"learning_rate": 6.083799475690309e-05,
"loss": 0.38024513721466063,
"step": 1800,
"token_acc": 0.8788754343986511
},
{
"epoch": 1.3771996939556237,
"eval_loss": 0.5222176909446716,
"eval_runtime": 7.9549,
"eval_samples_per_second": 13.074,
"eval_steps_per_second": 1.634,
"eval_token_acc": 0.8502886891365051,
"step": 1800
},
{
"epoch": 1.381025248661056,
"grad_norm": 0.7164918184280396,
"learning_rate": 6.0632011391775325e-05,
"loss": 0.3274393081665039,
"step": 1805,
"token_acc": 0.8930581212043762
},
{
"epoch": 1.3848508033664881,
"grad_norm": 0.7994803786277771,
"learning_rate": 6.0425838864080594e-05,
"loss": 0.37533011436462405,
"step": 1810,
"token_acc": 0.8814812898635864
},
{
"epoch": 1.3886763580719204,
"grad_norm": 0.610385000705719,
"learning_rate": 6.0219480841998265e-05,
"loss": 0.3626489877700806,
"step": 1815,
"token_acc": 0.8824625611305237
},
{
"epoch": 1.3925019127773526,
"grad_norm": 0.8779500126838684,
"learning_rate": 6.001294099700795e-05,
"loss": 0.3818621873855591,
"step": 1820,
"token_acc": 0.8814284205436707
},
{
"epoch": 1.396327467482785,
"grad_norm": 0.9023825526237488,
"learning_rate": 5.980622300382424e-05,
"loss": 0.34031038284301757,
"step": 1825,
"token_acc": 0.8901993036270142
},
{
"epoch": 1.4001530221882172,
"grad_norm": 0.7254869937896729,
"learning_rate": 5.959933054033125e-05,
"loss": 0.33964922428131106,
"step": 1830,
"token_acc": 0.8894827365875244
},
{
"epoch": 1.4039785768936497,
"grad_norm": 0.7711949944496155,
"learning_rate": 5.9392267287517325e-05,
"loss": 0.37581453323364256,
"step": 1835,
"token_acc": 0.8802526593208313
},
{
"epoch": 1.407804131599082,
"grad_norm": 0.8236564993858337,
"learning_rate": 5.918503692940936e-05,
"loss": 0.3631006717681885,
"step": 1840,
"token_acc": 0.8837177753448486
},
{
"epoch": 1.4116296863045141,
"grad_norm": 0.729147732257843,
"learning_rate": 5.8977643153007436e-05,
"loss": 0.39508538246154784,
"step": 1845,
"token_acc": 0.8759874701499939
},
{
"epoch": 1.4154552410099464,
"grad_norm": 0.7146396636962891,
"learning_rate": 5.8770089648219086e-05,
"loss": 0.38811707496643066,
"step": 1850,
"token_acc": 0.8768134713172913
},
{
"epoch": 1.4154552410099464,
"eval_loss": 0.508669912815094,
"eval_runtime": 7.7202,
"eval_samples_per_second": 13.471,
"eval_steps_per_second": 1.684,
"eval_token_acc": 0.8521132469177246,
"step": 1850
},
{
"epoch": 1.4192807957153788,
"grad_norm": 0.729438066482544,
"learning_rate": 5.8562380107793723e-05,
"loss": 0.39258522987365724,
"step": 1855,
"token_acc": 0.8775860071182251
},
{
"epoch": 1.423106350420811,
"grad_norm": 0.6797559857368469,
"learning_rate": 5.835451822725691e-05,
"loss": 0.3752496957778931,
"step": 1860,
"token_acc": 0.8779392242431641
},
{
"epoch": 1.4269319051262432,
"grad_norm": 0.49813270568847656,
"learning_rate": 5.814650770484461e-05,
"loss": 0.36016933917999266,
"step": 1865,
"token_acc": 0.885236382484436
},
{
"epoch": 1.4307574598316757,
"grad_norm": 0.7051418423652649,
"learning_rate": 5.7938352241437366e-05,
"loss": 0.3023838996887207,
"step": 1870,
"token_acc": 0.9016345143318176
},
{
"epoch": 1.434583014537108,
"grad_norm": 0.7764083743095398,
"learning_rate": 5.773005554049455e-05,
"loss": 0.3270875453948975,
"step": 1875,
"token_acc": 0.8943535685539246
},
{
"epoch": 1.4384085692425401,
"grad_norm": 0.6883430480957031,
"learning_rate": 5.752162130798833e-05,
"loss": 0.3316964864730835,
"step": 1880,
"token_acc": 0.8921953439712524
},
{
"epoch": 1.4422341239479723,
"grad_norm": 0.7114600539207458,
"learning_rate": 5.7313053252337854e-05,
"loss": 0.31533355712890626,
"step": 1885,
"token_acc": 0.8978268504142761
},
{
"epoch": 1.4460596786534048,
"grad_norm": 0.8558183908462524,
"learning_rate": 5.7104355084343196e-05,
"loss": 0.3653078556060791,
"step": 1890,
"token_acc": 0.885123074054718
},
{
"epoch": 1.449885233358837,
"grad_norm": 0.7565247416496277,
"learning_rate": 5.689553051711939e-05,
"loss": 0.3589335441589355,
"step": 1895,
"token_acc": 0.8860511779785156
},
{
"epoch": 1.4537107880642695,
"grad_norm": 0.830723762512207,
"learning_rate": 5.668658326603032e-05,
"loss": 0.32294435501098634,
"step": 1900,
"token_acc": 0.8950970768928528
},
{
"epoch": 1.4537107880642695,
"eval_loss": 0.5095290541648865,
"eval_runtime": 7.9225,
"eval_samples_per_second": 13.127,
"eval_steps_per_second": 1.641,
"eval_token_acc": 0.8521934151649475,
"step": 1900
},
{
"epoch": 1.4575363427697017,
"grad_norm": 0.707747220993042,
"learning_rate": 5.647751704862263e-05,
"loss": 0.3198162794113159,
"step": 1905,
"token_acc": 0.8932924270629883
},
{
"epoch": 1.461361897475134,
"grad_norm": 0.8484877347946167,
"learning_rate": 5.626833558455961e-05,
"loss": 0.34911117553710935,
"step": 1910,
"token_acc": 0.8880250453948975
},
{
"epoch": 1.4651874521805661,
"grad_norm": 0.6321529150009155,
"learning_rate": 5.605904259555496e-05,
"loss": 0.3261146306991577,
"step": 1915,
"token_acc": 0.8926582932472229
},
{
"epoch": 1.4690130068859983,
"grad_norm": 0.888900101184845,
"learning_rate": 5.5849641805306654e-05,
"loss": 0.34900679588317873,
"step": 1920,
"token_acc": 0.8897786736488342
},
{
"epoch": 1.4728385615914308,
"grad_norm": 0.687582790851593,
"learning_rate": 5.564013693943062e-05,
"loss": 0.34392595291137695,
"step": 1925,
"token_acc": 0.8870816826820374
},
{
"epoch": 1.476664116296863,
"grad_norm": 0.7888776659965515,
"learning_rate": 5.5430531725394485e-05,
"loss": 0.40218586921691896,
"step": 1930,
"token_acc": 0.8707258105278015
},
{
"epoch": 1.4804896710022954,
"grad_norm": 0.7543318867683411,
"learning_rate": 5.522082989245122e-05,
"loss": 0.3061817646026611,
"step": 1935,
"token_acc": 0.9006242156028748
},
{
"epoch": 1.4843152257077277,
"grad_norm": 0.8193092942237854,
"learning_rate": 5.501103517157288e-05,
"loss": 0.36248459815979006,
"step": 1940,
"token_acc": 0.8838417530059814
},
{
"epoch": 1.48814078041316,
"grad_norm": 0.7776079177856445,
"learning_rate": 5.480115129538409e-05,
"loss": 0.3319098949432373,
"step": 1945,
"token_acc": 0.8905050754547119
},
{
"epoch": 1.4919663351185921,
"grad_norm": 0.6906784176826477,
"learning_rate": 5.459118199809577e-05,
"loss": 0.30999135971069336,
"step": 1950,
"token_acc": 0.9011686444282532
},
{
"epoch": 1.4919663351185921,
"eval_loss": 0.5100167989730835,
"eval_runtime": 8.6421,
"eval_samples_per_second": 12.034,
"eval_steps_per_second": 1.504,
"eval_token_acc": 0.8534665703773499,
"step": 1950
},
{
"epoch": 1.4957918898240246,
"grad_norm": 0.60188227891922,
"learning_rate": 5.438113101543861e-05,
"loss": 0.3165478467941284,
"step": 1955,
"token_acc": 0.8978914022445679
},
{
"epoch": 1.4996174445294568,
"grad_norm": 0.7757999300956726,
"learning_rate": 5.417100208459662e-05,
"loss": 0.33252928256988523,
"step": 1960,
"token_acc": 0.8919309377670288
},
{
"epoch": 1.5034429992348892,
"grad_norm": 0.8450996279716492,
"learning_rate": 5.396079894414067e-05,
"loss": 0.3332216739654541,
"step": 1965,
"token_acc": 0.8910924196243286
},
{
"epoch": 1.5072685539403214,
"grad_norm": 0.7125052809715271,
"learning_rate": 5.375052533396191e-05,
"loss": 0.32312803268432616,
"step": 1970,
"token_acc": 0.8956630229949951
},
{
"epoch": 1.5110941086457537,
"grad_norm": 0.728113055229187,
"learning_rate": 5.354018499520536e-05,
"loss": 0.3401800155639648,
"step": 1975,
"token_acc": 0.8904479742050171
},
{
"epoch": 1.5149196633511859,
"grad_norm": 0.5629063844680786,
"learning_rate": 5.332978167020314e-05,
"loss": 0.33483114242553713,
"step": 1980,
"token_acc": 0.8900842666625977
},
{
"epoch": 1.518745218056618,
"grad_norm": 0.7541650533676147,
"learning_rate": 5.31193191024081e-05,
"loss": 0.3606285095214844,
"step": 1985,
"token_acc": 0.8818128705024719
},
{
"epoch": 1.5225707727620506,
"grad_norm": 0.7752453684806824,
"learning_rate": 5.2908801036327115e-05,
"loss": 0.3571962356567383,
"step": 1990,
"token_acc": 0.8851061463356018
},
{
"epoch": 1.5263963274674828,
"grad_norm": 0.7320619225502014,
"learning_rate": 5.269823121745443e-05,
"loss": 0.34485607147216796,
"step": 1995,
"token_acc": 0.8938528895378113
},
{
"epoch": 1.5302218821729152,
"grad_norm": 0.7084663510322571,
"learning_rate": 5.248761339220511e-05,
"loss": 0.3630984306335449,
"step": 2000,
"token_acc": 0.8881708979606628
},
{
"epoch": 1.5302218821729152,
"eval_loss": 0.501686155796051,
"eval_runtime": 8.3476,
"eval_samples_per_second": 12.459,
"eval_steps_per_second": 1.557,
"eval_token_acc": 0.8552410006523132,
"step": 2000
},
{
"epoch": 1.5340474368783474,
"grad_norm": 0.7146458029747009,
"learning_rate": 5.227695130784833e-05,
"loss": 0.3331026554107666,
"step": 2005,
"token_acc": 0.8917819261550903
},
{
"epoch": 1.5378729915837797,
"grad_norm": 0.8245148062705994,
"learning_rate": 5.2066248712440656e-05,
"loss": 0.37367663383483884,
"step": 2010,
"token_acc": 0.879398763179779
},
{
"epoch": 1.5416985462892119,
"grad_norm": 0.7592694163322449,
"learning_rate": 5.185550935475953e-05,
"loss": 0.30876760482788085,
"step": 2015,
"token_acc": 0.8983100056648254
},
{
"epoch": 1.545524100994644,
"grad_norm": 0.9255443215370178,
"learning_rate": 5.164473698423636e-05,
"loss": 0.3594630241394043,
"step": 2020,
"token_acc": 0.8848262429237366
},
{
"epoch": 1.5493496557000765,
"grad_norm": 0.7179040908813477,
"learning_rate": 5.143393535088998e-05,
"loss": 0.3523809194564819,
"step": 2025,
"token_acc": 0.8905043601989746
},
{
"epoch": 1.5531752104055088,
"grad_norm": 0.7476411461830139,
"learning_rate": 5.122310820525981e-05,
"loss": 0.3416067361831665,
"step": 2030,
"token_acc": 0.8892166018486023
},
{
"epoch": 1.5570007651109412,
"grad_norm": 0.7161547541618347,
"learning_rate": 5.101225929833921e-05,
"loss": 0.30915536880493166,
"step": 2035,
"token_acc": 0.8991933465003967
},
{
"epoch": 1.5608263198163734,
"grad_norm": 0.935799777507782,
"learning_rate": 5.08013923815087e-05,
"loss": 0.31090846061706545,
"step": 2040,
"token_acc": 0.8967577815055847
},
{
"epoch": 1.5646518745218057,
"grad_norm": 0.7758647799491882,
"learning_rate": 5.059051120646924e-05,
"loss": 0.3375053882598877,
"step": 2045,
"token_acc": 0.8911775350570679
},
{
"epoch": 1.5684774292272379,
"grad_norm": 0.6921541094779968,
"learning_rate": 5.0379619525175437e-05,
"loss": 0.3175233840942383,
"step": 2050,
"token_acc": 0.897928774356842
},
{
"epoch": 1.5684774292272379,
"eval_loss": 0.49708712100982666,
"eval_runtime": 8.4069,
"eval_samples_per_second": 12.371,
"eval_steps_per_second": 1.546,
"eval_token_acc": 0.8562836050987244,
"step": 2050
},
{
"epoch": 1.57230298393267,
"grad_norm": 0.8368853330612183,
"learning_rate": 5.016872108976889e-05,
"loss": 0.3685647964477539,
"step": 2055,
"token_acc": 0.8830959796905518
},
{
"epoch": 1.5761285386381025,
"grad_norm": 0.7727574706077576,
"learning_rate": 4.99578196525113e-05,
"loss": 0.326021146774292,
"step": 2060,
"token_acc": 0.8955893516540527
},
{
"epoch": 1.5799540933435348,
"grad_norm": 0.7962800860404968,
"learning_rate": 4.974691896571781e-05,
"loss": 0.36289157867431643,
"step": 2065,
"token_acc": 0.8842934370040894
},
{
"epoch": 1.5837796480489672,
"grad_norm": 0.7509872317314148,
"learning_rate": 4.9536022781690185e-05,
"loss": 0.31728103160858157,
"step": 2070,
"token_acc": 0.8965554237365723
},
{
"epoch": 1.5876052027543994,
"grad_norm": 0.6993099451065063,
"learning_rate": 4.9325134852650124e-05,
"loss": 0.36268980503082277,
"step": 2075,
"token_acc": 0.8835968375205994
},
{
"epoch": 1.5914307574598316,
"grad_norm": 0.7634088397026062,
"learning_rate": 4.911425893067239e-05,
"loss": 0.368328332901001,
"step": 2080,
"token_acc": 0.8840143084526062
},
{
"epoch": 1.5952563121652639,
"grad_norm": 0.734311580657959,
"learning_rate": 4.8903398767618165e-05,
"loss": 0.3379722833633423,
"step": 2085,
"token_acc": 0.8937978148460388
},
{
"epoch": 1.599081866870696,
"grad_norm": 1.7793625593185425,
"learning_rate": 4.8692558115068254e-05,
"loss": 0.33839640617370603,
"step": 2090,
"token_acc": 0.8909159302711487
},
{
"epoch": 1.6029074215761285,
"grad_norm": 0.6846344470977783,
"learning_rate": 4.8481740724256324e-05,
"loss": 0.36859283447265623,
"step": 2095,
"token_acc": 0.8814284801483154
},
{
"epoch": 1.606732976281561,
"grad_norm": 0.7191367149353027,
"learning_rate": 4.827095034600215e-05,
"loss": 0.32262775897979734,
"step": 2100,
"token_acc": 0.8948466777801514
},
{
"epoch": 1.606732976281561,
"eval_loss": 0.49555426836013794,
"eval_runtime": 8.4995,
"eval_samples_per_second": 12.236,
"eval_steps_per_second": 1.53,
"eval_token_acc": 0.8567647933959961,
"step": 2100
},
{
"epoch": 1.6105585309869932,
"grad_norm": 0.7318239808082581,
"learning_rate": 4.806019073064493e-05,
"loss": 0.28886990547180175,
"step": 2105,
"token_acc": 0.9057518243789673
},
{
"epoch": 1.6143840856924254,
"grad_norm": 0.7161886096000671,
"learning_rate": 4.7849465627976574e-05,
"loss": 0.3786638259887695,
"step": 2110,
"token_acc": 0.877372682094574
},
{
"epoch": 1.6182096403978576,
"grad_norm": 0.7079288959503174,
"learning_rate": 4.763877878717484e-05,
"loss": 0.3339807987213135,
"step": 2115,
"token_acc": 0.892234742641449
},
{
"epoch": 1.6220351951032899,
"grad_norm": 0.7738683819770813,
"learning_rate": 4.742813395673684e-05,
"loss": 0.3155964851379395,
"step": 2120,
"token_acc": 0.8984229564666748
},
{
"epoch": 1.6258607498087223,
"grad_norm": 0.7651445269584656,
"learning_rate": 4.721753488441222e-05,
"loss": 0.34331388473510743,
"step": 2125,
"token_acc": 0.8891043663024902
},
{
"epoch": 1.6296863045141545,
"grad_norm": 0.7328031063079834,
"learning_rate": 4.700698531713648e-05,
"loss": 0.3365816354751587,
"step": 2130,
"token_acc": 0.8924189805984497
},
{
"epoch": 1.633511859219587,
"grad_norm": 0.7824881672859192,
"learning_rate": 4.679648900096436e-05,
"loss": 0.3375370502471924,
"step": 2135,
"token_acc": 0.8933680653572083
},
{
"epoch": 1.6373374139250192,
"grad_norm": 0.7239261269569397,
"learning_rate": 4.658604968100318e-05,
"loss": 0.44536380767822265,
"step": 2140,
"token_acc": 0.8609479665756226
},
{
"epoch": 1.6411629686304514,
"grad_norm": 0.8158916234970093,
"learning_rate": 4.6375671101346135e-05,
"loss": 0.31634106636047366,
"step": 2145,
"token_acc": 0.8972258567810059
},
{
"epoch": 1.6449885233358836,
"grad_norm": 0.6787914633750916,
"learning_rate": 4.616535700500583e-05,
"loss": 0.3428164005279541,
"step": 2150,
"token_acc": 0.8936346769332886
},
{
"epoch": 1.6449885233358836,
"eval_loss": 0.4892226755619049,
"eval_runtime": 8.5201,
"eval_samples_per_second": 12.206,
"eval_steps_per_second": 1.526,
"eval_token_acc": 0.8588098287582397,
"step": 2150
},
{
"epoch": 1.6488140780413159,
"grad_norm": 0.7179057002067566,
"learning_rate": 4.5955111133847516e-05,
"loss": 0.3500206470489502,
"step": 2155,
"token_acc": 0.8879844546318054
},
{
"epoch": 1.6526396327467483,
"grad_norm": 0.9363833665847778,
"learning_rate": 4.574493722852266e-05,
"loss": 0.33152313232421876,
"step": 2160,
"token_acc": 0.8924428820610046
},
{
"epoch": 1.6564651874521805,
"grad_norm": 0.8011144995689392,
"learning_rate": 4.553483902840227e-05,
"loss": 0.33824012279510496,
"step": 2165,
"token_acc": 0.888818621635437
},
{
"epoch": 1.660290742157613,
"grad_norm": 0.754247784614563,
"learning_rate": 4.5324820271510446e-05,
"loss": 0.3261884689331055,
"step": 2170,
"token_acc": 0.8930807709693909
},
{
"epoch": 1.6641162968630452,
"grad_norm": 0.8901833891868591,
"learning_rate": 4.5114884694457906e-05,
"loss": 0.3530290603637695,
"step": 2175,
"token_acc": 0.8864350914955139
},
{
"epoch": 1.6679418515684774,
"grad_norm": 0.7795696258544922,
"learning_rate": 4.490503603237532e-05,
"loss": 0.28058276176452634,
"step": 2180,
"token_acc": 0.9070743322372437
},
{
"epoch": 1.6717674062739096,
"grad_norm": 0.7988150119781494,
"learning_rate": 4.4695278018847105e-05,
"loss": 0.3197885036468506,
"step": 2185,
"token_acc": 0.8948556780815125
},
{
"epoch": 1.6755929609793418,
"grad_norm": 0.7500495910644531,
"learning_rate": 4.448561438584484e-05,
"loss": 0.30902011394500734,
"step": 2190,
"token_acc": 0.8987115621566772
},
{
"epoch": 1.6794185156847743,
"grad_norm": 0.8123504519462585,
"learning_rate": 4.4276048863660874e-05,
"loss": 0.34034423828125,
"step": 2195,
"token_acc": 0.8910139203071594
},
{
"epoch": 1.6832440703902067,
"grad_norm": 1.9124935865402222,
"learning_rate": 4.406658518084201e-05,
"loss": 0.27848803997039795,
"step": 2200,
"token_acc": 0.9100915789604187
},
{
"epoch": 1.6832440703902067,
"eval_loss": 0.48997873067855835,
"eval_runtime": 7.7857,
"eval_samples_per_second": 13.358,
"eval_steps_per_second": 1.67,
"eval_token_acc": 0.8590003252029419,
"step": 2200
},
{
"epoch": 1.687069625095639,
"grad_norm": 0.7550795674324036,
"learning_rate": 4.3857227064123184e-05,
"loss": 0.3289813995361328,
"step": 2205,
"token_acc": 0.8935672044754028
},
{
"epoch": 1.6908951798010712,
"grad_norm": 0.6573622822761536,
"learning_rate": 4.364797823836108e-05,
"loss": 0.3325567483901978,
"step": 2210,
"token_acc": 0.8916365504264832
},
{
"epoch": 1.6947207345065034,
"grad_norm": 0.7994371056556702,
"learning_rate": 4.3438842426467885e-05,
"loss": 0.3089787483215332,
"step": 2215,
"token_acc": 0.8987955451011658
},
{
"epoch": 1.6985462892119356,
"grad_norm": 0.7001591920852661,
"learning_rate": 4.322982334934509e-05,
"loss": 0.3258508682250977,
"step": 2220,
"token_acc": 0.89599609375
},
{
"epoch": 1.702371843917368,
"grad_norm": 0.7623443603515625,
"learning_rate": 4.302092472581729e-05,
"loss": 0.29424998760223386,
"step": 2225,
"token_acc": 0.9034655094146729
},
{
"epoch": 1.7061973986228003,
"grad_norm": 0.8438885807991028,
"learning_rate": 4.281215027256592e-05,
"loss": 0.30596625804901123,
"step": 2230,
"token_acc": 0.8992859125137329
},
{
"epoch": 1.7100229533282327,
"grad_norm": 0.7240939736366272,
"learning_rate": 4.260350370406329e-05,
"loss": 0.30459909439086913,
"step": 2235,
"token_acc": 0.8981994986534119
},
{
"epoch": 1.713848508033665,
"grad_norm": 0.630903422832489,
"learning_rate": 4.239498873250637e-05,
"loss": 0.2987601041793823,
"step": 2240,
"token_acc": 0.9012813568115234
},
{
"epoch": 1.7176740627390972,
"grad_norm": 0.6413953304290771,
"learning_rate": 4.218660906775076e-05,
"loss": 0.27812976837158204,
"step": 2245,
"token_acc": 0.9085516929626465
},
{
"epoch": 1.7214996174445294,
"grad_norm": 0.8842605948448181,
"learning_rate": 4.1978368417244754e-05,
"loss": 0.3460667610168457,
"step": 2250,
"token_acc": 0.8905196785926819
},
{
"epoch": 1.7214996174445294,
"eval_loss": 0.48436981439590454,
"eval_runtime": 6.1279,
"eval_samples_per_second": 16.972,
"eval_steps_per_second": 2.121,
"eval_token_acc": 0.860263466835022,
"step": 2250
},
{
"epoch": 1.7253251721499616,
"grad_norm": 0.6968632340431213,
"learning_rate": 4.17702704859633e-05,
"loss": 0.29213814735412597,
"step": 2255,
"token_acc": 0.9040796160697937
},
{
"epoch": 1.729150726855394,
"grad_norm": 0.7017317414283752,
"learning_rate": 4.1562318976342165e-05,
"loss": 0.3319288730621338,
"step": 2260,
"token_acc": 0.8922781944274902
},
{
"epoch": 1.7329762815608263,
"grad_norm": 0.7793192267417908,
"learning_rate": 4.135451758821191e-05,
"loss": 0.3711602210998535,
"step": 2265,
"token_acc": 0.8815440535545349
},
{
"epoch": 1.7368018362662587,
"grad_norm": 0.870146632194519,
"learning_rate": 4.114687001873228e-05,
"loss": 0.3280991554260254,
"step": 2270,
"token_acc": 0.8962957262992859
},
{
"epoch": 1.740627390971691,
"grad_norm": 0.6839405298233032,
"learning_rate": 4.093937996232625e-05,
"loss": 0.31872236728668213,
"step": 2275,
"token_acc": 0.8943005204200745
},
{
"epoch": 1.7444529456771232,
"grad_norm": 0.7605020999908447,
"learning_rate": 4.073205111061436e-05,
"loss": 0.31961095333099365,
"step": 2280,
"token_acc": 0.8964794278144836
},
{
"epoch": 1.7482785003825554,
"grad_norm": 0.6984594464302063,
"learning_rate": 4.052488715234902e-05,
"loss": 0.31977455615997313,
"step": 2285,
"token_acc": 0.8969309329986572
},
{
"epoch": 1.7521040550879876,
"grad_norm": 0.7754748463630676,
"learning_rate": 4.0317891773348946e-05,
"loss": 0.31035671234130857,
"step": 2290,
"token_acc": 0.8990971446037292
},
{
"epoch": 1.75592960979342,
"grad_norm": 0.8007567524909973,
"learning_rate": 4.0111068656433426e-05,
"loss": 0.34440956115722654,
"step": 2295,
"token_acc": 0.8881877660751343
},
{
"epoch": 1.7597551644988525,
"grad_norm": 0.9330772161483765,
"learning_rate": 3.9904421481357e-05,
"loss": 0.3286851406097412,
"step": 2300,
"token_acc": 0.8939043283462524
},
{
"epoch": 1.7597551644988525,
"eval_loss": 0.4778790771961212,
"eval_runtime": 7.806,
"eval_samples_per_second": 13.323,
"eval_steps_per_second": 1.665,
"eval_token_acc": 0.8623987436294556,
"step": 2300
},
{
"epoch": 1.7635807192042847,
"grad_norm": 0.5906277894973755,
"learning_rate": 3.969795392474383e-05,
"loss": 0.34573922157287595,
"step": 2305,
"token_acc": 0.8903287053108215
},
{
"epoch": 1.767406273909717,
"grad_norm": 0.7397768497467041,
"learning_rate": 3.9491669660022345e-05,
"loss": 0.35153021812438967,
"step": 2310,
"token_acc": 0.8872886896133423
},
{
"epoch": 1.7712318286151492,
"grad_norm": 0.7996999025344849,
"learning_rate": 3.928557235735989e-05,
"loss": 0.31516518592834475,
"step": 2315,
"token_acc": 0.8970757722854614
},
{
"epoch": 1.7750573833205814,
"grad_norm": 0.6419305205345154,
"learning_rate": 3.907966568359742e-05,
"loss": 0.3054972171783447,
"step": 2320,
"token_acc": 0.8993676900863647
},
{
"epoch": 1.7788829380260138,
"grad_norm": 0.6739971041679382,
"learning_rate": 3.887395330218429e-05,
"loss": 0.3448510646820068,
"step": 2325,
"token_acc": 0.8888943195343018
},
{
"epoch": 1.782708492731446,
"grad_norm": 0.7799039483070374,
"learning_rate": 3.866843887311297e-05,
"loss": 0.31788105964660646,
"step": 2330,
"token_acc": 0.8954451680183411
},
{
"epoch": 1.7865340474368785,
"grad_norm": 0.7341748476028442,
"learning_rate": 3.846312605285408e-05,
"loss": 0.34601006507873533,
"step": 2335,
"token_acc": 0.8898206353187561
},
{
"epoch": 1.7903596021423107,
"grad_norm": 0.7024774551391602,
"learning_rate": 3.8258018494291234e-05,
"loss": 0.32241551876068114,
"step": 2340,
"token_acc": 0.89708012342453
},
{
"epoch": 1.794185156847743,
"grad_norm": 0.7515860795974731,
"learning_rate": 3.8053119846656026e-05,
"loss": 0.30928614139556887,
"step": 2345,
"token_acc": 0.8996888995170593
},
{
"epoch": 1.7980107115531752,
"grad_norm": 0.8652954697608948,
"learning_rate": 3.78484337554632e-05,
"loss": 0.30088629722595217,
"step": 2350,
"token_acc": 0.9041286110877991
},
{
"epoch": 1.7980107115531752,
"eval_loss": 0.47428014874458313,
"eval_runtime": 7.8145,
"eval_samples_per_second": 13.309,
"eval_steps_per_second": 1.664,
"eval_token_acc": 0.8631907105445862,
"step": 2350
},
{
"epoch": 1.8018362662586074,
"grad_norm": 0.9508410692214966,
"learning_rate": 3.764396386244577e-05,
"loss": 0.34288840293884276,
"step": 2355,
"token_acc": 0.8890052437782288
},
{
"epoch": 1.8056618209640398,
"grad_norm": 0.775829017162323,
"learning_rate": 3.743971380549008e-05,
"loss": 0.30949153900146487,
"step": 2360,
"token_acc": 0.8984510898590088
},
{
"epoch": 1.809487375669472,
"grad_norm": 0.6938086152076721,
"learning_rate": 3.723568721857133e-05,
"loss": 0.28354833126068113,
"step": 2365,
"token_acc": 0.9054216146469116
},
{
"epoch": 1.8133129303749045,
"grad_norm": 0.6911359429359436,
"learning_rate": 3.703188773168869e-05,
"loss": 0.2959973096847534,
"step": 2370,
"token_acc": 0.9038095474243164
},
{
"epoch": 1.8171384850803367,
"grad_norm": 60.64387130737305,
"learning_rate": 3.682831897080087e-05,
"loss": 0.40934906005859373,
"step": 2375,
"token_acc": 0.8823349475860596
},
{
"epoch": 1.820964039785769,
"grad_norm": 0.7439799308776855,
"learning_rate": 3.6624984557761504e-05,
"loss": 0.2931365489959717,
"step": 2380,
"token_acc": 0.9051112532615662
},
{
"epoch": 1.8247895944912012,
"grad_norm": 0.6623691320419312,
"learning_rate": 3.642188811025481e-05,
"loss": 0.3292604207992554,
"step": 2385,
"token_acc": 0.8928682208061218
},
{
"epoch": 1.8286151491966334,
"grad_norm": 0.6264249086380005,
"learning_rate": 3.621903324173114e-05,
"loss": 0.265956974029541,
"step": 2390,
"token_acc": 0.9118374586105347
},
{
"epoch": 1.8324407039020658,
"grad_norm": 0.8278756737709045,
"learning_rate": 3.6016423561342706e-05,
"loss": 0.29644384384155276,
"step": 2395,
"token_acc": 0.9024685025215149
},
{
"epoch": 1.836266258607498,
"grad_norm": 0.810718297958374,
"learning_rate": 3.581406267387941e-05,
"loss": 0.281774640083313,
"step": 2400,
"token_acc": 0.9071557521820068
},
{
"epoch": 1.836266258607498,
"eval_loss": 0.47047871351242065,
"eval_runtime": 7.868,
"eval_samples_per_second": 13.218,
"eval_steps_per_second": 1.652,
"eval_token_acc": 0.8643736243247986,
"step": 2400
},
{
"epoch": 1.8400918133129305,
"grad_norm": 0.7788925170898438,
"learning_rate": 3.56119541797047e-05,
"loss": 0.3004364013671875,
"step": 2405,
"token_acc": 0.8989213705062866
},
{
"epoch": 1.8439173680183627,
"grad_norm": 0.7350240349769592,
"learning_rate": 3.5410101674691434e-05,
"loss": 0.3446574449539185,
"step": 2410,
"token_acc": 0.8929014801979065
},
{
"epoch": 1.847742922723795,
"grad_norm": 0.7535839080810547,
"learning_rate": 3.520850875015801e-05,
"loss": 0.31823389530181884,
"step": 2415,
"token_acc": 0.896795928478241
},
{
"epoch": 1.8515684774292271,
"grad_norm": 0.8284432291984558,
"learning_rate": 3.5007178992804416e-05,
"loss": 0.30584444999694826,
"step": 2420,
"token_acc": 0.9038248658180237
},
{
"epoch": 1.8553940321346594,
"grad_norm": 0.8060945272445679,
"learning_rate": 3.480611598464844e-05,
"loss": 0.2657127857208252,
"step": 2425,
"token_acc": 0.9115975499153137
},
{
"epoch": 1.8592195868400918,
"grad_norm": 0.6967042684555054,
"learning_rate": 3.4605323302961854e-05,
"loss": 0.30145883560180664,
"step": 2430,
"token_acc": 0.9007070064544678
},
{
"epoch": 1.8630451415455243,
"grad_norm": 0.827389657497406,
"learning_rate": 3.4404804520206915e-05,
"loss": 0.3457145929336548,
"step": 2435,
"token_acc": 0.889440655708313
},
{
"epoch": 1.8668706962509565,
"grad_norm": 0.7290979027748108,
"learning_rate": 3.42045632039727e-05,
"loss": 0.29812381267547605,
"step": 2440,
"token_acc": 0.9029287695884705
},
{
"epoch": 1.8706962509563887,
"grad_norm": 0.8037905693054199,
"learning_rate": 3.400460291691164e-05,
"loss": 0.32248711585998535,
"step": 2445,
"token_acc": 0.8946207165718079
},
{
"epoch": 1.874521805661821,
"grad_norm": 0.7474591732025146,
"learning_rate": 3.380492721667618e-05,
"loss": 0.3022623062133789,
"step": 2450,
"token_acc": 0.9007507562637329
},
{
"epoch": 1.874521805661821,
"eval_loss": 0.46530866622924805,
"eval_runtime": 7.8984,
"eval_samples_per_second": 13.167,
"eval_steps_per_second": 1.646,
"eval_token_acc": 0.8647946715354919,
"step": 2450
},
{
"epoch": 1.8783473603672531,
"grad_norm": 0.71452397108078,
"learning_rate": 3.3605539655855445e-05,
"loss": 0.28342552185058595,
"step": 2455,
"token_acc": 0.9065305590629578
},
{
"epoch": 1.8821729150726856,
"grad_norm": 0.7897852659225464,
"learning_rate": 3.3406443781912014e-05,
"loss": 0.2861522912979126,
"step": 2460,
"token_acc": 0.9051787257194519
},
{
"epoch": 1.8859984697781178,
"grad_norm": 0.7614904642105103,
"learning_rate": 3.3207643137118874e-05,
"loss": 0.2704183578491211,
"step": 2465,
"token_acc": 0.911378026008606
},
{
"epoch": 1.8898240244835502,
"grad_norm": 0.6754797697067261,
"learning_rate": 3.3009141258496344e-05,
"loss": 0.31130855083465575,
"step": 2470,
"token_acc": 0.8980752229690552
},
{
"epoch": 1.8936495791889825,
"grad_norm": 0.7454941272735596,
"learning_rate": 3.2810941677749164e-05,
"loss": 0.34280953407287595,
"step": 2475,
"token_acc": 0.8920162320137024
},
{
"epoch": 1.8974751338944147,
"grad_norm": 0.7202689051628113,
"learning_rate": 3.261304792120361e-05,
"loss": 0.2786979675292969,
"step": 2480,
"token_acc": 0.907993495464325
},
{
"epoch": 1.901300688599847,
"grad_norm": 0.7289252281188965,
"learning_rate": 3.2415463509744855e-05,
"loss": 0.28704142570495605,
"step": 2485,
"token_acc": 0.9051684141159058
},
{
"epoch": 1.9051262433052791,
"grad_norm": 0.7389020919799805,
"learning_rate": 3.2218191958754226e-05,
"loss": 0.3317502498626709,
"step": 2490,
"token_acc": 0.8912999629974365
},
{
"epoch": 1.9089517980107116,
"grad_norm": 0.7187902331352234,
"learning_rate": 3.202123677804672e-05,
"loss": 0.32085230350494387,
"step": 2495,
"token_acc": 0.8973221182823181
},
{
"epoch": 1.9127773527161438,
"grad_norm": 0.780617892742157,
"learning_rate": 3.18246014718085e-05,
"loss": 0.2799449682235718,
"step": 2500,
"token_acc": 0.9089812636375427
},
{
"epoch": 1.9127773527161438,
"eval_loss": 0.4558640122413635,
"eval_runtime": 7.6268,
"eval_samples_per_second": 13.636,
"eval_steps_per_second": 1.705,
"eval_token_acc": 0.8680527806282043,
"step": 2500
},
{
"epoch": 1.9166029074215762,
"grad_norm": 0.7578943967819214,
"learning_rate": 3.162828953853469e-05,
"loss": 0.283012843132019,
"step": 2505,
"token_acc": 0.908361554145813
},
{
"epoch": 1.9204284621270085,
"grad_norm": 0.7080029249191284,
"learning_rate": 3.14323044709669e-05,
"loss": 0.26364171504974365,
"step": 2510,
"token_acc": 0.9134095311164856
},
{
"epoch": 1.9242540168324407,
"grad_norm": 0.7052859663963318,
"learning_rate": 3.12366497560313e-05,
"loss": 0.28186535835266113,
"step": 2515,
"token_acc": 0.9079092741012573
},
{
"epoch": 1.928079571537873,
"grad_norm": 0.722137451171875,
"learning_rate": 3.104132887477647e-05,
"loss": 0.2929178953170776,
"step": 2520,
"token_acc": 0.9022585153579712
},
{
"epoch": 1.9319051262433051,
"grad_norm": 0.6590465903282166,
"learning_rate": 3.084634530231145e-05,
"loss": 0.29388132095336916,
"step": 2525,
"token_acc": 0.9019988179206848
},
{
"epoch": 1.9357306809487376,
"grad_norm": 0.7757251858711243,
"learning_rate": 3.065170250774401e-05,
"loss": 0.3049909591674805,
"step": 2530,
"token_acc": 0.8986476063728333
},
{
"epoch": 1.93955623565417,
"grad_norm": 0.7149041295051575,
"learning_rate": 3.0457403954118856e-05,
"loss": 0.2536777019500732,
"step": 2535,
"token_acc": 0.9141318202018738
},
{
"epoch": 1.9433817903596022,
"grad_norm": 0.6480096578598022,
"learning_rate": 3.026345309835602e-05,
"loss": 0.3146909952163696,
"step": 2540,
"token_acc": 0.8978093266487122
},
{
"epoch": 1.9472073450650345,
"grad_norm": 0.7162771224975586,
"learning_rate": 3.0069853391189352e-05,
"loss": 0.29620161056518557,
"step": 2545,
"token_acc": 0.9032965302467346
},
{
"epoch": 1.9510328997704667,
"grad_norm": 0.6839264631271362,
"learning_rate": 2.9876608277105145e-05,
"loss": 0.3268457889556885,
"step": 2550,
"token_acc": 0.8923251032829285
},
{
"epoch": 1.9510328997704667,
"eval_loss": 0.4463500678539276,
"eval_runtime": 7.6674,
"eval_samples_per_second": 13.564,
"eval_steps_per_second": 1.695,
"eval_token_acc": 0.8691655397415161,
"step": 2550
},
{
"epoch": 1.954858454475899,
"grad_norm": 0.7191382050514221,
"learning_rate": 2.9683721194280877e-05,
"loss": 0.2873558044433594,
"step": 2555,
"token_acc": 0.9027012586593628
},
{
"epoch": 1.9586840091813313,
"grad_norm": 0.7788121700286865,
"learning_rate": 2.9491195574523945e-05,
"loss": 0.29071290493011476,
"step": 2560,
"token_acc": 0.9054592251777649
},
{
"epoch": 1.9625095638867636,
"grad_norm": 0.6830841302871704,
"learning_rate": 2.9299034843210726e-05,
"loss": 0.2975457668304443,
"step": 2565,
"token_acc": 0.9023709297180176
},
{
"epoch": 1.966335118592196,
"grad_norm": 0.8139908909797668,
"learning_rate": 2.9107242419225577e-05,
"loss": 0.2521679401397705,
"step": 2570,
"token_acc": 0.9153000116348267
},
{
"epoch": 1.9701606732976282,
"grad_norm": 0.6574170589447021,
"learning_rate": 2.8915821714899917e-05,
"loss": 0.268428373336792,
"step": 2575,
"token_acc": 0.9112088680267334
},
{
"epoch": 1.9739862280030605,
"grad_norm": 0.7240482568740845,
"learning_rate": 2.8724776135951747e-05,
"loss": 0.2789809226989746,
"step": 2580,
"token_acc": 0.9081910848617554
},
{
"epoch": 1.9778117827084927,
"grad_norm": 0.675998330116272,
"learning_rate": 2.85341090814248e-05,
"loss": 0.300505256652832,
"step": 2585,
"token_acc": 0.9032467007637024
},
{
"epoch": 1.981637337413925,
"grad_norm": 0.7282765507698059,
"learning_rate": 2.8343823943628257e-05,
"loss": 0.2605840444564819,
"step": 2590,
"token_acc": 0.9125819206237793
},
{
"epoch": 1.9854628921193573,
"grad_norm": 0.8446104526519775,
"learning_rate": 2.8153924108076234e-05,
"loss": 0.3036641120910645,
"step": 2595,
"token_acc": 0.9020313024520874
},
{
"epoch": 1.9892884468247896,
"grad_norm": 0.8688914179801941,
"learning_rate": 2.7964412953427667e-05,
"loss": 0.301717472076416,
"step": 2600,
"token_acc": 0.90234375
},
{
"epoch": 1.9892884468247896,
"eval_loss": 0.44557470083236694,
"eval_runtime": 7.7719,
"eval_samples_per_second": 13.382,
"eval_steps_per_second": 1.673,
"eval_token_acc": 0.869877278804779,
"step": 2600
},
{
"epoch": 1.993114001530222,
"grad_norm": 0.6388227343559265,
"learning_rate": 2.7775293851426232e-05,
"loss": 0.28205983638763427,
"step": 2605,
"token_acc": 0.9057275056838989
},
{
"epoch": 1.9969395562356542,
"grad_norm": 0.6498620510101318,
"learning_rate": 2.7586570166840153e-05,
"loss": 0.28784162998199464,
"step": 2610,
"token_acc": 0.9042630791664124
},
{
"epoch": 2.0007651109410864,
"grad_norm": 0.46216583251953125,
"learning_rate": 2.7398245257402567e-05,
"loss": 0.24226248264312744,
"step": 2615,
"token_acc": 0.9181912541389465
},
{
"epoch": 2.0045906656465187,
"grad_norm": 0.4526701867580414,
"learning_rate": 2.721032247375165e-05,
"loss": 0.13410005569458008,
"step": 2620,
"token_acc": 0.9554323554039001
},
{
"epoch": 2.008416220351951,
"grad_norm": 0.5027770400047302,
"learning_rate": 2.7022805159371023e-05,
"loss": 0.14986848831176758,
"step": 2625,
"token_acc": 0.9495237469673157
},
{
"epoch": 2.012241775057383,
"grad_norm": 0.6318019032478333,
"learning_rate": 2.683569665053033e-05,
"loss": 0.13008542060852052,
"step": 2630,
"token_acc": 0.9539133906364441
},
{
"epoch": 2.0160673297628158,
"grad_norm": 0.738571286201477,
"learning_rate": 2.664900027622577e-05,
"loss": 0.15502784252166749,
"step": 2635,
"token_acc": 0.9469853043556213
},
{
"epoch": 2.019892884468248,
"grad_norm": 0.6892253160476685,
"learning_rate": 2.646271935812098e-05,
"loss": 0.13881022930145265,
"step": 2640,
"token_acc": 0.9516469240188599
},
{
"epoch": 2.02371843917368,
"grad_norm": 0.6470181941986084,
"learning_rate": 2.6276857210487858e-05,
"loss": 0.1207735538482666,
"step": 2645,
"token_acc": 0.9576534032821655
},
{
"epoch": 2.0275439938791124,
"grad_norm": 0.6596648097038269,
"learning_rate": 2.6091417140147634e-05,
"loss": 0.11292877197265624,
"step": 2650,
"token_acc": 0.9626390337944031
},
{
"epoch": 2.0275439938791124,
"eval_loss": 0.5168122053146362,
"eval_runtime": 8.1996,
"eval_samples_per_second": 12.684,
"eval_steps_per_second": 1.585,
"eval_token_acc": 0.8695364594459534,
"step": 2650
},
{
"epoch": 2.0313695485845447,
"grad_norm": 0.577893853187561,
"learning_rate": 2.5906402446412027e-05,
"loss": 0.14242198467254638,
"step": 2655,
"token_acc": 0.9518451690673828
},
{
"epoch": 2.035195103289977,
"grad_norm": 0.6954317688941956,
"learning_rate": 2.5721816421024515e-05,
"loss": 0.12017552852630616,
"step": 2660,
"token_acc": 0.9579612016677856
},
{
"epoch": 2.0390206579954095,
"grad_norm": 0.5604422688484192,
"learning_rate": 2.553766234810181e-05,
"loss": 0.12801860570907592,
"step": 2665,
"token_acc": 0.9555345773696899
},
{
"epoch": 2.0428462127008418,
"grad_norm": 0.6638826727867126,
"learning_rate": 2.535394350407548e-05,
"loss": 0.1116684079170227,
"step": 2670,
"token_acc": 0.960515022277832
},
{
"epoch": 2.046671767406274,
"grad_norm": 0.5910780429840088,
"learning_rate": 2.5170663157633477e-05,
"loss": 0.13454906940460204,
"step": 2675,
"token_acc": 0.9548289775848389
},
{
"epoch": 2.050497322111706,
"grad_norm": 0.6535590291023254,
"learning_rate": 2.4987824569662167e-05,
"loss": 0.12083430290222168,
"step": 2680,
"token_acc": 0.9585192799568176
},
{
"epoch": 2.0543228768171384,
"grad_norm": 0.5576914548873901,
"learning_rate": 2.4805430993188228e-05,
"loss": 0.12852833271026612,
"step": 2685,
"token_acc": 0.9565430879592896
},
{
"epoch": 2.0581484315225707,
"grad_norm": 0.57133549451828,
"learning_rate": 2.4623485673320772e-05,
"loss": 0.13395898342132567,
"step": 2690,
"token_acc": 0.9541014432907104
},
{
"epoch": 2.061973986228003,
"grad_norm": 0.824409008026123,
"learning_rate": 2.4441991847193636e-05,
"loss": 0.1304774522781372,
"step": 2695,
"token_acc": 0.9565969109535217
},
{
"epoch": 2.0657995409334355,
"grad_norm": 0.6546271443367004,
"learning_rate": 2.4260952743907756e-05,
"loss": 0.13317997455596925,
"step": 2700,
"token_acc": 0.9548870325088501
},
{
"epoch": 2.0657995409334355,
"eval_loss": 0.5218855142593384,
"eval_runtime": 8.6536,
"eval_samples_per_second": 12.018,
"eval_steps_per_second": 1.502,
"eval_token_acc": 0.8702181577682495,
"step": 2700
},
{
"epoch": 2.0696250956388678,
"grad_norm": 0.49882644414901733,
"learning_rate": 2.4080371584473748e-05,
"loss": 0.10250062942504883,
"step": 2705,
"token_acc": 0.9647969007492065
},
{
"epoch": 2.0734506503443,
"grad_norm": 0.6716576814651489,
"learning_rate": 2.390025158175458e-05,
"loss": 0.12553690671920775,
"step": 2710,
"token_acc": 0.9559978246688843
},
{
"epoch": 2.077276205049732,
"grad_norm": 0.630893349647522,
"learning_rate": 2.3720595940408413e-05,
"loss": 0.1133840560913086,
"step": 2715,
"token_acc": 0.960378885269165
},
{
"epoch": 2.0811017597551644,
"grad_norm": 0.6294081211090088,
"learning_rate": 2.3541407856831598e-05,
"loss": 0.11989744901657104,
"step": 2720,
"token_acc": 0.9581653475761414
},
{
"epoch": 2.0849273144605966,
"grad_norm": 0.6295720934867859,
"learning_rate": 2.3362690519101728e-05,
"loss": 0.10788016319274903,
"step": 2725,
"token_acc": 0.9615026116371155
},
{
"epoch": 2.088752869166029,
"grad_norm": 0.6127709150314331,
"learning_rate": 2.318444710692109e-05,
"loss": 0.18858987092971802,
"step": 2730,
"token_acc": 0.9527615308761597
},
{
"epoch": 2.0925784238714615,
"grad_norm": 0.6840873956680298,
"learning_rate": 2.3006680791559943e-05,
"loss": 0.13058118820190429,
"step": 2735,
"token_acc": 0.9559764862060547
},
{
"epoch": 2.0964039785768938,
"grad_norm": 0.6548556089401245,
"learning_rate": 2.2829394735800075e-05,
"loss": 0.12637789249420167,
"step": 2740,
"token_acc": 0.9560421705245972
},
{
"epoch": 2.100229533282326,
"grad_norm": 0.6251739263534546,
"learning_rate": 2.2652592093878666e-05,
"loss": 0.1079249382019043,
"step": 2745,
"token_acc": 0.9616904854774475
},
{
"epoch": 2.104055087987758,
"grad_norm": 0.5070903301239014,
"learning_rate": 2.2476276011432056e-05,
"loss": 0.10909421443939209,
"step": 2750,
"token_acc": 0.9607372879981995
},
{
"epoch": 2.104055087987758,
"eval_loss": 0.5240176916122437,
"eval_runtime": 7.9542,
"eval_samples_per_second": 13.075,
"eval_steps_per_second": 1.634,
"eval_token_acc": 0.8699575066566467,
"step": 2750
},
{
"epoch": 2.1078806426931904,
"grad_norm": 0.5303053259849548,
"learning_rate": 2.230044962543989e-05,
"loss": 0.10541150569915772,
"step": 2755,
"token_acc": 0.9636523723602295
},
{
"epoch": 2.1117061973986226,
"grad_norm": 0.6467751264572144,
"learning_rate": 2.2125116064169125e-05,
"loss": 0.11249511241912842,
"step": 2760,
"token_acc": 0.9602897763252258
},
{
"epoch": 2.1155317521040553,
"grad_norm": 0.6789493560791016,
"learning_rate": 2.195027844711856e-05,
"loss": 0.13851575851440429,
"step": 2765,
"token_acc": 0.9524257183074951
},
{
"epoch": 2.1193573068094875,
"grad_norm": 0.5706949234008789,
"learning_rate": 2.177593988496323e-05,
"loss": 0.0956031322479248,
"step": 2770,
"token_acc": 0.9663383960723877
},
{
"epoch": 2.1231828615149198,
"grad_norm": 0.5609292984008789,
"learning_rate": 2.1602103479499093e-05,
"loss": 0.11319952011108399,
"step": 2775,
"token_acc": 0.9608060717582703
},
{
"epoch": 2.127008416220352,
"grad_norm": 0.639937162399292,
"learning_rate": 2.1428772323587827e-05,
"loss": 0.13543224334716797,
"step": 2780,
"token_acc": 0.9520896077156067
},
{
"epoch": 2.130833970925784,
"grad_norm": 0.6833350658416748,
"learning_rate": 2.1255949501101847e-05,
"loss": 0.14142370223999023,
"step": 2785,
"token_acc": 0.9528786540031433
},
{
"epoch": 2.1346595256312164,
"grad_norm": 0.5408839583396912,
"learning_rate": 2.1083638086869327e-05,
"loss": 0.12588857412338256,
"step": 2790,
"token_acc": 0.9563543200492859
},
{
"epoch": 2.1384850803366486,
"grad_norm": 0.5438815355300903,
"learning_rate": 2.0911841146619676e-05,
"loss": 0.12137541770935059,
"step": 2795,
"token_acc": 0.958185613155365
},
{
"epoch": 2.1423106350420813,
"grad_norm": 0.6048544645309448,
"learning_rate": 2.074056173692881e-05,
"loss": 0.10157194137573242,
"step": 2800,
"token_acc": 0.9674689769744873
},
{
"epoch": 2.1423106350420813,
"eval_loss": 0.5312597751617432,
"eval_runtime": 9.0822,
"eval_samples_per_second": 11.451,
"eval_steps_per_second": 1.431,
"eval_token_acc": 0.8708697557449341,
"step": 2800
},
{
"epoch": 2.1461361897475135,
"grad_norm": 0.689985990524292,
"learning_rate": 2.05698029051649e-05,
"loss": 0.12691206932067872,
"step": 2805,
"token_acc": 0.9552291035652161
},
{
"epoch": 2.1499617444529457,
"grad_norm": 0.628235936164856,
"learning_rate": 2.0399567689434007e-05,
"loss": 0.12962342500686647,
"step": 2810,
"token_acc": 0.9563965201377869
},
{
"epoch": 2.153787299158378,
"grad_norm": 0.583711564540863,
"learning_rate": 2.0229859118526244e-05,
"loss": 0.11104552745819092,
"step": 2815,
"token_acc": 0.9605592489242554
},
{
"epoch": 2.15761285386381,
"grad_norm": 0.749139666557312,
"learning_rate": 2.0060680211861722e-05,
"loss": 0.11064702272415161,
"step": 2820,
"token_acc": 0.9618842601776123
},
{
"epoch": 2.1614384085692424,
"grad_norm": 0.6225452423095703,
"learning_rate": 1.989203397943682e-05,
"loss": 0.1368303894996643,
"step": 2825,
"token_acc": 0.9523999691009521
},
{
"epoch": 2.1652639632746746,
"grad_norm": 0.7548052072525024,
"learning_rate": 1.9723923421770744e-05,
"loss": 0.12567458152770997,
"step": 2830,
"token_acc": 0.9570740461349487
},
{
"epoch": 2.1690895179801073,
"grad_norm": 0.6393832564353943,
"learning_rate": 1.9556351529852086e-05,
"loss": 0.12716997861862184,
"step": 2835,
"token_acc": 0.9550226926803589
},
{
"epoch": 2.1729150726855395,
"grad_norm": 0.5963457822799683,
"learning_rate": 1.9389321285085572e-05,
"loss": 0.12617888450622558,
"step": 2840,
"token_acc": 0.9543135166168213
},
{
"epoch": 2.1767406273909717,
"grad_norm": 0.7114848494529724,
"learning_rate": 1.9222835659239086e-05,
"loss": 0.12233096361160278,
"step": 2845,
"token_acc": 0.9570853114128113
},
{
"epoch": 2.180566182096404,
"grad_norm": 0.6505621671676636,
"learning_rate": 1.905689761439075e-05,
"loss": 0.13814208507537842,
"step": 2850,
"token_acc": 0.9528710246086121
},
{
"epoch": 2.180566182096404,
"eval_loss": 0.5228633284568787,
"eval_runtime": 7.9764,
"eval_samples_per_second": 13.038,
"eval_steps_per_second": 1.63,
"eval_token_acc": 0.8718922734260559,
"step": 2850
},
{
"epoch": 2.184391736801836,
"grad_norm": 0.6201128959655762,
"learning_rate": 1.8891510102876235e-05,
"loss": 0.12893006801605225,
"step": 2855,
"token_acc": 0.9550007581710815
},
{
"epoch": 2.1882172915072684,
"grad_norm": 0.6673233509063721,
"learning_rate": 1.8726676067236245e-05,
"loss": 0.10436077117919922,
"step": 2860,
"token_acc": 0.9645984172821045
},
{
"epoch": 2.1920428462127006,
"grad_norm": 0.7207808494567871,
"learning_rate": 1.8562398440164135e-05,
"loss": 0.14118155241012573,
"step": 2865,
"token_acc": 0.9522634148597717
},
{
"epoch": 2.1958684009181333,
"grad_norm": 0.7116675972938538,
"learning_rate": 1.8398680144453794e-05,
"loss": 0.11731832027435303,
"step": 2870,
"token_acc": 0.9581528902053833
},
{
"epoch": 2.1996939556235655,
"grad_norm": 0.5616986155509949,
"learning_rate": 1.823552409294752e-05,
"loss": 0.10328438282012939,
"step": 2875,
"token_acc": 0.9635567665100098
},
{
"epoch": 2.2035195103289977,
"grad_norm": 0.7303850650787354,
"learning_rate": 1.8072933188484385e-05,
"loss": 0.12835383415222168,
"step": 2880,
"token_acc": 0.9546709060668945
},
{
"epoch": 2.20734506503443,
"grad_norm": 0.684688925743103,
"learning_rate": 1.7910910323848433e-05,
"loss": 0.12336525917053223,
"step": 2885,
"token_acc": 0.9571567177772522
},
{
"epoch": 2.211170619739862,
"grad_norm": 0.5825948119163513,
"learning_rate": 1.774945838171721e-05,
"loss": 0.12321670055389404,
"step": 2890,
"token_acc": 0.9568530321121216
},
{
"epoch": 2.2149961744452944,
"grad_norm": 0.5380724668502808,
"learning_rate": 1.758858023461059e-05,
"loss": 0.1462591528892517,
"step": 2895,
"token_acc": 0.9520248770713806
},
{
"epoch": 2.218821729150727,
"grad_norm": 0.7477222084999084,
"learning_rate": 1.742827874483958e-05,
"loss": 0.1159374475479126,
"step": 2900,
"token_acc": 0.9597063660621643
},
{
"epoch": 2.218821729150727,
"eval_loss": 0.5222508311271667,
"eval_runtime": 8.103,
"eval_samples_per_second": 12.835,
"eval_steps_per_second": 1.604,
"eval_token_acc": 0.872032642364502,
"step": 2900
},
{
"epoch": 2.2226472838561593,
"grad_norm": 0.578953206539154,
"learning_rate": 1.7268556764455433e-05,
"loss": 0.1094053030014038,
"step": 2905,
"token_acc": 0.9612045884132385
},
{
"epoch": 2.2264728385615915,
"grad_norm": 0.6454194188117981,
"learning_rate": 1.7109417135198875e-05,
"loss": 0.09978902339935303,
"step": 2910,
"token_acc": 0.9648175239562988
},
{
"epoch": 2.2302983932670237,
"grad_norm": 0.6507310271263123,
"learning_rate": 1.6950862688449555e-05,
"loss": 0.12494430541992188,
"step": 2915,
"token_acc": 0.9561623930931091
},
{
"epoch": 2.234123947972456,
"grad_norm": 0.5561665296554565,
"learning_rate": 1.6792896245175695e-05,
"loss": 0.12519459724426268,
"step": 2920,
"token_acc": 0.957149863243103
},
{
"epoch": 2.237949502677888,
"grad_norm": 0.6335827708244324,
"learning_rate": 1.6635520615883854e-05,
"loss": 0.12490168809890748,
"step": 2925,
"token_acc": 0.956473171710968
},
{
"epoch": 2.2417750573833204,
"grad_norm": 0.518527090549469,
"learning_rate": 1.6478738600568978e-05,
"loss": 0.11815754175186158,
"step": 2930,
"token_acc": 0.9581723809242249
},
{
"epoch": 2.245600612088753,
"grad_norm": 0.7105391025543213,
"learning_rate": 1.6322552988664548e-05,
"loss": 0.1265929937362671,
"step": 2935,
"token_acc": 0.9559991359710693
},
{
"epoch": 2.2494261667941853,
"grad_norm": 0.6597128510475159,
"learning_rate": 1.616696655899291e-05,
"loss": 0.10472848415374755,
"step": 2940,
"token_acc": 0.9618938565254211
},
{
"epoch": 2.2532517214996175,
"grad_norm": 0.5978385806083679,
"learning_rate": 1.601198207971596e-05,
"loss": 0.11347222328186035,
"step": 2945,
"token_acc": 0.9598453640937805
},
{
"epoch": 2.2570772762050497,
"grad_norm": 0.5900003910064697,
"learning_rate": 1.585760230828579e-05,
"loss": 0.1062214732170105,
"step": 2950,
"token_acc": 0.9621166586875916
},
{
"epoch": 2.2570772762050497,
"eval_loss": 0.529563307762146,
"eval_runtime": 7.925,
"eval_samples_per_second": 13.123,
"eval_steps_per_second": 1.64,
"eval_token_acc": 0.8730752468109131,
"step": 2950
},
{
"epoch": 2.260902830910482,
"grad_norm": 0.6690232753753662,
"learning_rate": 1.57038299913956e-05,
"loss": 0.12313377857208252,
"step": 2955,
"token_acc": 0.9577500820159912
},
{
"epoch": 2.264728385615914,
"grad_norm": 0.6129235625267029,
"learning_rate": 1.555066786493094e-05,
"loss": 0.11549534797668456,
"step": 2960,
"token_acc": 0.9599046111106873
},
{
"epoch": 2.268553940321347,
"grad_norm": 0.7165189385414124,
"learning_rate": 1.5398118653920986e-05,
"loss": 0.10570051670074462,
"step": 2965,
"token_acc": 0.9616792798042297
},
{
"epoch": 2.272379495026779,
"grad_norm": 0.7057157754898071,
"learning_rate": 1.5246185072490027e-05,
"loss": 0.11799094676971436,
"step": 2970,
"token_acc": 0.9599979519844055
},
{
"epoch": 2.2762050497322113,
"grad_norm": 0.6109249591827393,
"learning_rate": 1.5094869823809166e-05,
"loss": 0.12232885360717774,
"step": 2975,
"token_acc": 0.9563071727752686
},
{
"epoch": 2.2800306044376435,
"grad_norm": 0.6849731206893921,
"learning_rate": 1.4944175600048294e-05,
"loss": 0.12355262041091919,
"step": 2980,
"token_acc": 0.9571903944015503
},
{
"epoch": 2.2838561591430757,
"grad_norm": 0.551438570022583,
"learning_rate": 1.4794105082328158e-05,
"loss": 0.10952677726745605,
"step": 2985,
"token_acc": 0.963117241859436
},
{
"epoch": 2.287681713848508,
"grad_norm": 0.7222511172294617,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.1401592493057251,
"step": 2990,
"token_acc": 0.9511399865150452
},
{
"epoch": 2.29150726855394,
"grad_norm": 0.7186452150344849,
"learning_rate": 1.449584583396124e-05,
"loss": 0.1436525344848633,
"step": 2995,
"token_acc": 0.9500516653060913
},
{
"epoch": 2.295332823259373,
"grad_norm": 0.7001931071281433,
"learning_rate": 1.4347662409881868e-05,
"loss": 0.12311695814132691,
"step": 3000,
"token_acc": 0.9562889337539673
},
{
"epoch": 2.295332823259373,
"eval_loss": 0.5203014612197876,
"eval_runtime": 9.1094,
"eval_samples_per_second": 11.417,
"eval_steps_per_second": 1.427,
"eval_token_acc": 0.8741077780723572,
"step": 3000
},
{
"epoch": 2.299158377964805,
"grad_norm": 0.6098562479019165,
"learning_rate": 1.4200113304883611e-05,
"loss": 0.13382203578948976,
"step": 3005,
"token_acc": 0.9541038274765015
},
{
"epoch": 2.3029839326702373,
"grad_norm": 0.6261680126190186,
"learning_rate": 1.405320114412989e-05,
"loss": 0.0949715256690979,
"step": 3010,
"token_acc": 0.96717369556427
},
{
"epoch": 2.3068094873756695,
"grad_norm": 0.5904762744903564,
"learning_rate": 1.3906928541451775e-05,
"loss": 0.10795230865478515,
"step": 3015,
"token_acc": 0.9621407985687256
},
{
"epoch": 2.3106350420811017,
"grad_norm": 0.6883955001831055,
"learning_rate": 1.3761298099301378e-05,
"loss": 0.12801848649978637,
"step": 3020,
"token_acc": 0.9559524059295654
},
{
"epoch": 2.314460596786534,
"grad_norm": 0.6712023615837097,
"learning_rate": 1.3616312408705689e-05,
"loss": 0.12017567157745361,
"step": 3025,
"token_acc": 0.9589926600456238
},
{
"epoch": 2.318286151491966,
"grad_norm": 0.5586845874786377,
"learning_rate": 1.3471974049220403e-05,
"loss": 0.09736464023590088,
"step": 3030,
"token_acc": 0.9669448733329773
},
{
"epoch": 2.322111706197399,
"grad_norm": 0.7812525033950806,
"learning_rate": 1.3328285588884032e-05,
"loss": 0.11876866817474366,
"step": 3035,
"token_acc": 0.9586123824119568
},
{
"epoch": 2.325937260902831,
"grad_norm": 0.5611070394515991,
"learning_rate": 1.3185249584172172e-05,
"loss": 0.09341703653335572,
"step": 3040,
"token_acc": 0.9679653644561768
},
{
"epoch": 2.3297628156082633,
"grad_norm": 0.7015408873558044,
"learning_rate": 1.304286857995209e-05,
"loss": 0.10733482837677003,
"step": 3045,
"token_acc": 0.9623789191246033
},
{
"epoch": 2.3335883703136955,
"grad_norm": 0.6591479778289795,
"learning_rate": 1.2901145109437474e-05,
"loss": 0.11940803527832031,
"step": 3050,
"token_acc": 0.9576820135116577
},
{
"epoch": 2.3335883703136955,
"eval_loss": 0.5162126421928406,
"eval_runtime": 7.6579,
"eval_samples_per_second": 13.581,
"eval_steps_per_second": 1.698,
"eval_token_acc": 0.8742882609367371,
"step": 3050
},
{
"epoch": 2.3374139250191277,
"grad_norm": 0.5746079087257385,
"learning_rate": 1.27600816941432e-05,
"loss": 0.12224366664886474,
"step": 3055,
"token_acc": 0.95743727684021
},
{
"epoch": 2.34123947972456,
"grad_norm": 0.6104121208190918,
"learning_rate": 1.2619680843840659e-05,
"loss": 0.12069646120071412,
"step": 3060,
"token_acc": 0.9580378532409668
},
{
"epoch": 2.345065034429992,
"grad_norm": 0.6610199213027954,
"learning_rate": 1.2479945056512993e-05,
"loss": 0.10805834531784057,
"step": 3065,
"token_acc": 0.9605792760848999
},
{
"epoch": 2.348890589135425,
"grad_norm": 0.6179318428039551,
"learning_rate": 1.2340876818310682e-05,
"loss": 0.1121566653251648,
"step": 3070,
"token_acc": 0.9616247415542603
},
{
"epoch": 2.352716143840857,
"grad_norm": 0.6470217108726501,
"learning_rate": 1.22024786035073e-05,
"loss": 0.09998181462287903,
"step": 3075,
"token_acc": 0.9644249081611633
},
{
"epoch": 2.3565416985462893,
"grad_norm": 0.6415740847587585,
"learning_rate": 1.206475287445552e-05,
"loss": 0.10013750791549683,
"step": 3080,
"token_acc": 0.9655629396438599
},
{
"epoch": 2.3603672532517215,
"grad_norm": 0.5981183648109436,
"learning_rate": 1.1927702081543279e-05,
"loss": 0.10144208669662476,
"step": 3085,
"token_acc": 0.965247631072998
},
{
"epoch": 2.3641928079571537,
"grad_norm": 0.4865865409374237,
"learning_rate": 1.179132866315018e-05,
"loss": 0.10601496696472168,
"step": 3090,
"token_acc": 0.9624915719032288
},
{
"epoch": 2.368018362662586,
"grad_norm": 0.5336887240409851,
"learning_rate": 1.165563504560413e-05,
"loss": 0.11365892887115478,
"step": 3095,
"token_acc": 0.9594626426696777
},
{
"epoch": 2.371843917368018,
"grad_norm": 0.4895932376384735,
"learning_rate": 1.1520623643138162e-05,
"loss": 0.11079982519149781,
"step": 3100,
"token_acc": 0.9616596102714539
},
{
"epoch": 2.371843917368018,
"eval_loss": 0.5221489667892456,
"eval_runtime": 8.0254,
"eval_samples_per_second": 12.959,
"eval_steps_per_second": 1.62,
"eval_token_acc": 0.8750301003456116,
"step": 3100
},
{
"epoch": 2.375669472073451,
"grad_norm": 0.6662837266921997,
"learning_rate": 1.1386296857847444e-05,
"loss": 0.09341274499893189,
"step": 3105,
"token_acc": 0.9671337008476257
},
{
"epoch": 2.379495026778883,
"grad_norm": 0.5832562446594238,
"learning_rate": 1.12526570796466e-05,
"loss": 0.11719496250152588,
"step": 3110,
"token_acc": 0.9592087864875793
},
{
"epoch": 2.3833205814843152,
"grad_norm": 0.5843919515609741,
"learning_rate": 1.1119706686227211e-05,
"loss": 0.10511226654052734,
"step": 3115,
"token_acc": 0.9644036889076233
},
{
"epoch": 2.3871461361897475,
"grad_norm": 0.49912717938423157,
"learning_rate": 1.0987448043015374e-05,
"loss": 0.09345480799674988,
"step": 3120,
"token_acc": 0.9667991399765015
},
{
"epoch": 2.3909716908951797,
"grad_norm": 0.7507015466690063,
"learning_rate": 1.0855883503129772e-05,
"loss": 0.11863377094268798,
"step": 3125,
"token_acc": 0.9587963819503784
},
{
"epoch": 2.394797245600612,
"grad_norm": 0.7630432844161987,
"learning_rate": 1.0725015407339717e-05,
"loss": 0.1126257300376892,
"step": 3130,
"token_acc": 0.9607234597206116
},
{
"epoch": 2.398622800306044,
"grad_norm": 0.6372060179710388,
"learning_rate": 1.0594846084023547e-05,
"loss": 0.10468795299530029,
"step": 3135,
"token_acc": 0.9627901315689087
},
{
"epoch": 2.402448355011477,
"grad_norm": 0.6120291352272034,
"learning_rate": 1.0465377849127172e-05,
"loss": 0.09292224049568176,
"step": 3140,
"token_acc": 0.9677795171737671
},
{
"epoch": 2.406273909716909,
"grad_norm": 0.5614500045776367,
"learning_rate": 1.0336613006122892e-05,
"loss": 0.09670157432556152,
"step": 3145,
"token_acc": 0.9674481153488159
},
{
"epoch": 2.4100994644223412,
"grad_norm": 0.5987251996994019,
"learning_rate": 1.0208553845968383e-05,
"loss": 0.13896613121032714,
"step": 3150,
"token_acc": 0.9524605870246887
},
{
"epoch": 2.4100994644223412,
"eval_loss": 0.5215019583702087,
"eval_runtime": 7.8548,
"eval_samples_per_second": 13.24,
"eval_steps_per_second": 1.655,
"eval_token_acc": 0.8747493624687195,
"step": 3150
},
{
"epoch": 2.4139250191277735,
"grad_norm": 0.5754761695861816,
"learning_rate": 1.008120264706598e-05,
"loss": 0.10798046588897706,
"step": 3155,
"token_acc": 0.9625075459480286
},
{
"epoch": 2.4177505738332057,
"grad_norm": 0.5995942950248718,
"learning_rate": 9.95456167522209e-06,
"loss": 0.11118266582489014,
"step": 3160,
"token_acc": 0.9624667167663574
},
{
"epoch": 2.4215761285386384,
"grad_norm": 0.6560847759246826,
"learning_rate": 9.82863318360695e-06,
"loss": 0.11946277618408203,
"step": 3165,
"token_acc": 0.9585193991661072
},
{
"epoch": 2.4254016832440706,
"grad_norm": 0.5231161713600159,
"learning_rate": 9.703419412714431e-06,
"loss": 0.1082839012145996,
"step": 3170,
"token_acc": 0.9630952477455139
},
{
"epoch": 2.429227237949503,
"grad_norm": 0.6471136808395386,
"learning_rate": 9.578922590322276e-06,
"loss": 0.10554378032684326,
"step": 3175,
"token_acc": 0.9643285870552063
},
{
"epoch": 2.433052792654935,
"grad_norm": 0.6062421202659607,
"learning_rate": 9.45514493145246e-06,
"loss": 0.11804389953613281,
"step": 3180,
"token_acc": 0.9601839780807495
},
{
"epoch": 2.4368783473603672,
"grad_norm": 0.6130327582359314,
"learning_rate": 9.332088638331682e-06,
"loss": 0.12830252647399903,
"step": 3185,
"token_acc": 0.955107569694519
},
{
"epoch": 2.4407039020657995,
"grad_norm": 0.5650054812431335,
"learning_rate": 9.209755900352285e-06,
"loss": 0.08745735883712769,
"step": 3190,
"token_acc": 0.9680666327476501
},
{
"epoch": 2.4445294567712317,
"grad_norm": 0.6417719125747681,
"learning_rate": 9.088148894033255e-06,
"loss": 0.10346298217773438,
"step": 3195,
"token_acc": 0.9632440209388733
},
{
"epoch": 2.4483550114766643,
"grad_norm": 0.549809992313385,
"learning_rate": 8.967269782981557e-06,
"loss": 0.10478920936584472,
"step": 3200,
"token_acc": 0.964032769203186
},
{
"epoch": 2.4483550114766643,
"eval_loss": 0.524568498134613,
"eval_runtime": 7.9187,
"eval_samples_per_second": 13.133,
"eval_steps_per_second": 1.642,
"eval_token_acc": 0.8750100135803223,
"step": 3200
},
{
"epoch": 2.4521805661820966,
"grad_norm": 0.5881340503692627,
"learning_rate": 8.847120717853513e-06,
"loss": 0.09231488704681397,
"step": 3205,
"token_acc": 0.967642068862915
},
{
"epoch": 2.456006120887529,
"grad_norm": 0.49171632528305054,
"learning_rate": 8.727703836316664e-06,
"loss": 0.08269585371017456,
"step": 3210,
"token_acc": 0.9714418053627014
},
{
"epoch": 2.459831675592961,
"grad_norm": 0.5847451090812683,
"learning_rate": 8.609021263011696e-06,
"loss": 0.09583220481872559,
"step": 3215,
"token_acc": 0.967701256275177
},
{
"epoch": 2.4636572302983932,
"grad_norm": 0.6022827625274658,
"learning_rate": 8.491075109514612e-06,
"loss": 0.0968513011932373,
"step": 3220,
"token_acc": 0.965691328048706
},
{
"epoch": 2.4674827850038255,
"grad_norm": 0.6396250128746033,
"learning_rate": 8.373867474299197e-06,
"loss": 0.09366763830184936,
"step": 3225,
"token_acc": 0.967291533946991
},
{
"epoch": 2.4713083397092577,
"grad_norm": 0.6564737558364868,
"learning_rate": 8.257400442699681e-06,
"loss": 0.09510574340820313,
"step": 3230,
"token_acc": 0.9668706059455872
},
{
"epoch": 2.4751338944146903,
"grad_norm": 0.5506086945533752,
"learning_rate": 8.141676086873572e-06,
"loss": 0.09186252355575561,
"step": 3235,
"token_acc": 0.9672021865844727
},
{
"epoch": 2.4789594491201226,
"grad_norm": 0.5937402844429016,
"learning_rate": 8.026696465764922e-06,
"loss": 0.09575964212417602,
"step": 3240,
"token_acc": 0.9655571579933167
},
{
"epoch": 2.482785003825555,
"grad_norm": 0.5168645977973938,
"learning_rate": 7.912463625067568e-06,
"loss": 0.11513475179672242,
"step": 3245,
"token_acc": 0.9584820866584778
},
{
"epoch": 2.486610558530987,
"grad_norm": 12.089369773864746,
"learning_rate": 7.7989795971888e-06,
"loss": 0.29053955078125,
"step": 3250,
"token_acc": 0.9437501430511475
},
{
"epoch": 2.486610558530987,
"eval_loss": 0.5287056565284729,
"eval_runtime": 7.9028,
"eval_samples_per_second": 13.16,
"eval_steps_per_second": 1.645,
"eval_token_acc": 0.8761628866195679,
"step": 3250
},
{
"epoch": 2.4904361132364192,
"grad_norm": 0.6238409876823425,
"learning_rate": 7.68624640121316e-06,
"loss": 0.1205405831336975,
"step": 3255,
"token_acc": 0.9586801528930664
},
{
"epoch": 2.4942616679418514,
"grad_norm": 0.6099902391433716,
"learning_rate": 7.574266042866546e-06,
"loss": 0.09387488961219788,
"step": 3260,
"token_acc": 0.9670175909996033
},
{
"epoch": 2.4980872226472837,
"grad_norm": 0.6190466284751892,
"learning_rate": 7.463040514480579e-06,
"loss": 0.11645488739013672,
"step": 3265,
"token_acc": 0.9598995447158813
},
{
"epoch": 2.5019127773527163,
"grad_norm": 0.6443151235580444,
"learning_rate": 7.352571794957025e-06,
"loss": 0.08591481447219848,
"step": 3270,
"token_acc": 0.9710960388183594
},
{
"epoch": 2.5057383320581486,
"grad_norm": 0.6558806896209717,
"learning_rate": 7.242861849732696e-06,
"loss": 0.1108025312423706,
"step": 3275,
"token_acc": 0.9633561968803406
},
{
"epoch": 2.5095638867635808,
"grad_norm": 0.6043168306350708,
"learning_rate": 7.133912630744455e-06,
"loss": 0.08010676503181458,
"step": 3280,
"token_acc": 0.9711145162582397
},
{
"epoch": 2.513389441469013,
"grad_norm": 0.671475887298584,
"learning_rate": 7.025726076394462e-06,
"loss": 0.1144939661026001,
"step": 3285,
"token_acc": 0.9594224691390991
},
{
"epoch": 2.517214996174445,
"grad_norm": 0.5959923267364502,
"learning_rate": 6.9183041115157165e-06,
"loss": 0.08532092571258545,
"step": 3290,
"token_acc": 0.9698848724365234
},
{
"epoch": 2.5210405508798774,
"grad_norm": 0.552179217338562,
"learning_rate": 6.8116486473377985e-06,
"loss": 0.09567714929580688,
"step": 3295,
"token_acc": 0.966461718082428
},
{
"epoch": 2.5248661055853097,
"grad_norm": 0.8035470843315125,
"learning_rate": 6.7057615814528514e-06,
"loss": 0.11172772645950317,
"step": 3300,
"token_acc": 0.9609107375144958
},
{
"epoch": 2.5248661055853097,
"eval_loss": 0.5269036889076233,
"eval_runtime": 8.3826,
"eval_samples_per_second": 12.407,
"eval_steps_per_second": 1.551,
"eval_token_acc": 0.8761628866195679,
"step": 3300
},
{
"epoch": 2.5286916602907423,
"grad_norm": 0.5826445817947388,
"learning_rate": 6.600644797781847e-06,
"loss": 0.09061547517776489,
"step": 3305,
"token_acc": 0.9684428572654724
},
{
"epoch": 2.5325172149961745,
"grad_norm": 0.6639491319656372,
"learning_rate": 6.496300166541052e-06,
"loss": 0.1045493245124817,
"step": 3310,
"token_acc": 0.9641888737678528
},
{
"epoch": 2.5363427697016068,
"grad_norm": 0.5682926177978516,
"learning_rate": 6.392729544208758e-06,
"loss": 0.10315026044845581,
"step": 3315,
"token_acc": 0.963904619216919
},
{
"epoch": 2.540168324407039,
"grad_norm": 0.6878834962844849,
"learning_rate": 6.289934773492223e-06,
"loss": 0.10737843513488769,
"step": 3320,
"token_acc": 0.963394284248352
},
{
"epoch": 2.543993879112471,
"grad_norm": 0.5965612530708313,
"learning_rate": 6.1879176832949525e-06,
"loss": 0.11070966720581055,
"step": 3325,
"token_acc": 0.9651868939399719
},
{
"epoch": 2.5478194338179034,
"grad_norm": 0.6844844818115234,
"learning_rate": 6.086680088684105e-06,
"loss": 0.10959099531173706,
"step": 3330,
"token_acc": 0.9614537358283997
},
{
"epoch": 2.5516449885233357,
"grad_norm": 0.5353488922119141,
"learning_rate": 5.986223790858186e-06,
"loss": 0.09058489799499511,
"step": 3335,
"token_acc": 0.9692246317863464
},
{
"epoch": 2.5554705432287683,
"grad_norm": 0.6746286749839783,
"learning_rate": 5.886550577115069e-06,
"loss": 0.1055182695388794,
"step": 3340,
"token_acc": 0.9636992812156677
},
{
"epoch": 2.5592960979342005,
"grad_norm": 0.5335373282432556,
"learning_rate": 5.787662220820134e-06,
"loss": 0.1255274772644043,
"step": 3345,
"token_acc": 0.9566043615341187
},
{
"epoch": 2.5631216526396328,
"grad_norm": 0.6528668403625488,
"learning_rate": 5.689560481374734e-06,
"loss": 0.10252002477645875,
"step": 3350,
"token_acc": 0.9639867544174194
},
{
"epoch": 2.5631216526396328,
"eval_loss": 0.5217230319976807,
"eval_runtime": 8.1191,
"eval_samples_per_second": 12.809,
"eval_steps_per_second": 1.601,
"eval_token_acc": 0.8769047260284424,
"step": 3350
},
{
"epoch": 2.566947207345065,
"grad_norm": 0.49694639444351196,
"learning_rate": 5.592247104184917e-06,
"loss": 0.08688923120498657,
"step": 3355,
"token_acc": 0.9706814289093018
},
{
"epoch": 2.570772762050497,
"grad_norm": 0.5503761172294617,
"learning_rate": 5.495723820630333e-06,
"loss": 0.12382068634033203,
"step": 3360,
"token_acc": 0.9561320543289185
},
{
"epoch": 2.57459831675593,
"grad_norm": 0.6813068985939026,
"learning_rate": 5.399992348033461e-06,
"loss": 0.12225714921951295,
"step": 3365,
"token_acc": 0.9570099711418152
},
{
"epoch": 2.5784238714613616,
"grad_norm": 0.5871702432632446,
"learning_rate": 5.305054389629022e-06,
"loss": 0.07900494337081909,
"step": 3370,
"token_acc": 0.9720001220703125
},
{
"epoch": 2.5822494261667943,
"grad_norm": 0.7074242830276489,
"learning_rate": 5.210911634533721e-06,
"loss": 0.11348228454589844,
"step": 3375,
"token_acc": 0.9611703157424927
},
{
"epoch": 2.5860749808722265,
"grad_norm": 0.6286773085594177,
"learning_rate": 5.117565757716158e-06,
"loss": 0.11759569644927978,
"step": 3380,
"token_acc": 0.9579370617866516
},
{
"epoch": 2.5899005355776588,
"grad_norm": 0.6363070607185364,
"learning_rate": 5.025018419967009e-06,
"loss": 0.11911303997039795,
"step": 3385,
"token_acc": 0.9589115977287292
},
{
"epoch": 2.593726090283091,
"grad_norm": 0.6866349577903748,
"learning_rate": 4.933271267869566e-06,
"loss": 0.11872742176055909,
"step": 3390,
"token_acc": 0.9597334265708923
},
{
"epoch": 2.597551644988523,
"grad_norm": 0.5686379075050354,
"learning_rate": 4.842325933770342e-06,
"loss": 0.10091429948806763,
"step": 3395,
"token_acc": 0.9646428227424622
},
{
"epoch": 2.601377199693956,
"grad_norm": 0.5744697451591492,
"learning_rate": 4.752184035750068e-06,
"loss": 0.1112870454788208,
"step": 3400,
"token_acc": 0.9629582166671753
},
{
"epoch": 2.601377199693956,
"eval_loss": 0.5221067667007446,
"eval_runtime": 7.949,
"eval_samples_per_second": 13.083,
"eval_steps_per_second": 1.635,
"eval_token_acc": 0.8777067065238953,
"step": 3400
},
{
"epoch": 2.6052027543993876,
"grad_norm": 0.5436497926712036,
"learning_rate": 4.662847177594909e-06,
"loss": 0.09204695224761963,
"step": 3405,
"token_acc": 0.9677549004554749
},
{
"epoch": 2.6090283091048203,
"grad_norm": 0.5940696001052856,
"learning_rate": 4.5743169487679316e-06,
"loss": 0.09365889430046082,
"step": 3410,
"token_acc": 0.9672086834907532
},
{
"epoch": 2.6128538638102525,
"grad_norm": 0.5806345343589783,
"learning_rate": 4.486594924380838e-06,
"loss": 0.07467930316925049,
"step": 3415,
"token_acc": 0.9740605354309082
},
{
"epoch": 2.6166794185156848,
"grad_norm": 0.6086448431015015,
"learning_rate": 4.3996826651658775e-06,
"loss": 0.09224212169647217,
"step": 3420,
"token_acc": 0.9681790471076965
},
{
"epoch": 2.620504973221117,
"grad_norm": 0.4966646432876587,
"learning_rate": 4.313581717448156e-06,
"loss": 0.08799538612365723,
"step": 3425,
"token_acc": 0.9687092304229736
},
{
"epoch": 2.624330527926549,
"grad_norm": 0.7006512880325317,
"learning_rate": 4.228293613118089e-06,
"loss": 0.10830029249191284,
"step": 3430,
"token_acc": 0.962169885635376
},
{
"epoch": 2.628156082631982,
"grad_norm": 0.7951710820198059,
"learning_rate": 4.143819869604132e-06,
"loss": 0.09951411485671997,
"step": 3435,
"token_acc": 0.9649299383163452
},
{
"epoch": 2.631981637337414,
"grad_norm": 0.6713584661483765,
"learning_rate": 4.060161989845818e-06,
"loss": 0.09943540692329407,
"step": 3440,
"token_acc": 0.9660786390304565
},
{
"epoch": 2.6358071920428463,
"grad_norm": 0.8555734753608704,
"learning_rate": 3.977321462266998e-06,
"loss": 0.12329368591308594,
"step": 3445,
"token_acc": 0.9588665962219238
},
{
"epoch": 2.6396327467482785,
"grad_norm": 0.7402066588401794,
"learning_rate": 3.8952997607493325e-06,
"loss": 0.1296180248260498,
"step": 3450,
"token_acc": 0.9544374942779541
},
{
"epoch": 2.6396327467482785,
"eval_loss": 0.5221165418624878,
"eval_runtime": 7.8424,
"eval_samples_per_second": 13.261,
"eval_steps_per_second": 1.658,
"eval_token_acc": 0.8774861693382263,
"step": 3450
},
{
"epoch": 2.6434583014537107,
"grad_norm": 0.5311779975891113,
"learning_rate": 3.814098344606143e-06,
"loss": 0.08472838401794433,
"step": 3455,
"token_acc": 0.9710620045661926
},
{
"epoch": 2.647283856159143,
"grad_norm": 0.572284460067749,
"learning_rate": 3.7337186585563732e-06,
"loss": 0.08200944662094116,
"step": 3460,
"token_acc": 0.9718431234359741
},
{
"epoch": 2.651109410864575,
"grad_norm": 0.4984256327152252,
"learning_rate": 3.654162132698918e-06,
"loss": 0.10278162956237794,
"step": 3465,
"token_acc": 0.965274453163147
},
{
"epoch": 2.654934965570008,
"grad_norm": 0.5390318036079407,
"learning_rate": 3.5754301824871605e-06,
"loss": 0.10632505416870117,
"step": 3470,
"token_acc": 0.9644556641578674
},
{
"epoch": 2.65876052027544,
"grad_norm": 0.5882481336593628,
"learning_rate": 3.497524208703834e-06,
"loss": 0.10900474786758423,
"step": 3475,
"token_acc": 0.9621248841285706
},
{
"epoch": 2.6625860749808723,
"grad_norm": 0.6717934608459473,
"learning_rate": 3.420445597436056e-06,
"loss": 0.0886709749698639,
"step": 3480,
"token_acc": 0.9691559672355652
},
{
"epoch": 2.6664116296863045,
"grad_norm": 0.5694244503974915,
"learning_rate": 3.344195720050658e-06,
"loss": 0.09270554780960083,
"step": 3485,
"token_acc": 0.9656193852424622
},
{
"epoch": 2.6702371843917367,
"grad_norm": 0.7296086549758911,
"learning_rate": 3.2687759331698375e-06,
"loss": 0.10218125581741333,
"step": 3490,
"token_acc": 0.9648373126983643
},
{
"epoch": 2.674062739097169,
"grad_norm": 0.4986768662929535,
"learning_rate": 3.194187578646979e-06,
"loss": 0.09201115369796753,
"step": 3495,
"token_acc": 0.9665822982788086
},
{
"epoch": 2.677888293802601,
"grad_norm": 0.6790587306022644,
"learning_rate": 3.120431983542793e-06,
"loss": 0.10237842798233032,
"step": 3500,
"token_acc": 0.9661151170730591
},
{
"epoch": 2.677888293802601,
"eval_loss": 0.5228468179702759,
"eval_runtime": 7.9645,
"eval_samples_per_second": 13.058,
"eval_steps_per_second": 1.632,
"eval_token_acc": 0.8785387873649597,
"step": 3500
},
{
"epoch": 2.681713848508034,
"grad_norm": 0.6572025418281555,
"learning_rate": 3.047510460101705e-06,
"loss": 0.13050510883331298,
"step": 3505,
"token_acc": 0.9555116295814514
},
{
"epoch": 2.685539403213466,
"grad_norm": 0.8115324378013611,
"learning_rate": 2.9754243057285134e-06,
"loss": 0.1264261245727539,
"step": 3510,
"token_acc": 0.956243634223938
},
{
"epoch": 2.6893649579188983,
"grad_norm": 0.5161707401275635,
"learning_rate": 2.9041748029652927e-06,
"loss": 0.08881696462631225,
"step": 3515,
"token_acc": 0.9682623147964478
},
{
"epoch": 2.6931905126243305,
"grad_norm": 0.5522788763046265,
"learning_rate": 2.8337632194685993e-06,
"loss": 0.08286306858062745,
"step": 3520,
"token_acc": 0.9708802700042725
},
{
"epoch": 2.6970160673297627,
"grad_norm": 0.5946321487426758,
"learning_rate": 2.7641908079868827e-06,
"loss": 0.10248844623565674,
"step": 3525,
"token_acc": 0.9636382460594177
},
{
"epoch": 2.700841622035195,
"grad_norm": 0.6317991018295288,
"learning_rate": 2.69545880633823e-06,
"loss": 0.10524777173995972,
"step": 3530,
"token_acc": 0.9621507525444031
},
{
"epoch": 2.704667176740627,
"grad_norm": 0.41846737265586853,
"learning_rate": 2.627568437388306e-06,
"loss": 0.08343310356140136,
"step": 3535,
"token_acc": 0.970815122127533
},
{
"epoch": 2.70849273144606,
"grad_norm": 0.592873752117157,
"learning_rate": 2.560520909028663e-06,
"loss": 0.08635797500610351,
"step": 3540,
"token_acc": 0.9700879454612732
},
{
"epoch": 2.712318286151492,
"grad_norm": 0.5590534210205078,
"learning_rate": 2.4943174141551674e-06,
"loss": 0.10181926488876343,
"step": 3545,
"token_acc": 0.9652162194252014
},
{
"epoch": 2.7161438408569243,
"grad_norm": 0.5901391506195068,
"learning_rate": 2.428959130646824e-06,
"loss": 0.09749918580055236,
"step": 3550,
"token_acc": 0.9646121263504028
},
{
"epoch": 2.7161438408569243,
"eval_loss": 0.5235512256622314,
"eval_runtime": 7.8855,
"eval_samples_per_second": 13.189,
"eval_steps_per_second": 1.649,
"eval_token_acc": 0.8786590695381165,
"step": 3550
},
{
"epoch": 2.7199693955623565,
"grad_norm": 0.5816419720649719,
"learning_rate": 2.364447221344812e-06,
"loss": 0.12211033105850219,
"step": 3555,
"token_acc": 0.9581829905509949
},
{
"epoch": 2.7237949502677887,
"grad_norm": 0.6168470978736877,
"learning_rate": 2.3007828340318114e-06,
"loss": 0.09811439514160156,
"step": 3560,
"token_acc": 0.9663928151130676
},
{
"epoch": 2.7276205049732214,
"grad_norm": 0.599656343460083,
"learning_rate": 2.237967101411531e-06,
"loss": 0.12740142345428468,
"step": 3565,
"token_acc": 0.9561182260513306
},
{
"epoch": 2.731446059678653,
"grad_norm": 0.6238080263137817,
"learning_rate": 2.1760011410886126e-06,
"loss": 0.09838619828224182,
"step": 3570,
"token_acc": 0.9653590321540833
},
{
"epoch": 2.735271614384086,
"grad_norm": 0.5564831495285034,
"learning_rate": 2.1148860555487204e-06,
"loss": 0.09222927689552307,
"step": 3575,
"token_acc": 0.9685646891593933
},
{
"epoch": 2.739097169089518,
"grad_norm": 0.6360819935798645,
"learning_rate": 2.0546229321389278e-06,
"loss": 0.09308220148086548,
"step": 3580,
"token_acc": 0.9680613279342651
},
{
"epoch": 2.7429227237949503,
"grad_norm": 0.5651523470878601,
"learning_rate": 1.995212843048372e-06,
"loss": 0.09616876244544983,
"step": 3585,
"token_acc": 0.9660496115684509
},
{
"epoch": 2.7467482785003825,
"grad_norm": 0.6321117877960205,
"learning_rate": 1.93665684528917e-06,
"loss": 0.09454690217971802,
"step": 3590,
"token_acc": 0.9675334692001343
},
{
"epoch": 2.7505738332058147,
"grad_norm": 0.5536521077156067,
"learning_rate": 1.878955980677638e-06,
"loss": 0.07992898225784302,
"step": 3595,
"token_acc": 0.9721735119819641
},
{
"epoch": 2.7543993879112474,
"grad_norm": 0.688173770904541,
"learning_rate": 1.82211127581573e-06,
"loss": 0.09609293937683105,
"step": 3600,
"token_acc": 0.9671096205711365
},
{
"epoch": 2.7543993879112474,
"eval_loss": 0.5215653777122498,
"eval_runtime": 8.0108,
"eval_samples_per_second": 12.982,
"eval_steps_per_second": 1.623,
"eval_token_acc": 0.8788695931434631,
"step": 3600
},
{
"epoch": 2.758224942616679,
"grad_norm": 0.6505938768386841,
"learning_rate": 1.7661237420727784e-06,
"loss": 0.1013750433921814,
"step": 3605,
"token_acc": 0.9644123315811157
},
{
"epoch": 2.762050497322112,
"grad_norm": 0.5934735536575317,
"learning_rate": 1.710994375567504e-06,
"loss": 0.0851688802242279,
"step": 3610,
"token_acc": 0.9705018997192383
},
{
"epoch": 2.765876052027544,
"grad_norm": 0.6007834076881409,
"learning_rate": 1.6567241571502912e-06,
"loss": 0.07638438940048217,
"step": 3615,
"token_acc": 0.9712318778038025
},
{
"epoch": 2.7697016067329763,
"grad_norm": 0.5481213927268982,
"learning_rate": 1.6033140523857404e-06,
"loss": 0.09145662784576417,
"step": 3620,
"token_acc": 0.9675630927085876
},
{
"epoch": 2.7735271614384085,
"grad_norm": 0.6200750470161438,
"learning_rate": 1.5507650115354877e-06,
"loss": 0.10738480091094971,
"step": 3625,
"token_acc": 0.9640287756919861
},
{
"epoch": 2.7773527161438407,
"grad_norm": 0.6538658142089844,
"learning_rate": 1.499077969541307e-06,
"loss": 0.10229132175445557,
"step": 3630,
"token_acc": 0.9641778469085693
},
{
"epoch": 2.7811782708492734,
"grad_norm": 1.8193166255950928,
"learning_rate": 1.4482538460084293e-06,
"loss": 0.13732895851135254,
"step": 3635,
"token_acc": 0.958136796951294
},
{
"epoch": 2.785003825554705,
"grad_norm": 0.5257523655891418,
"learning_rate": 1.3982935451892498e-06,
"loss": 0.08640526533126831,
"step": 3640,
"token_acc": 0.971260130405426
},
{
"epoch": 2.788829380260138,
"grad_norm": 0.568705141544342,
"learning_rate": 1.3491979559672075e-06,
"loss": 0.08791974782943726,
"step": 3645,
"token_acc": 0.9699133038520813
},
{
"epoch": 2.79265493496557,
"grad_norm": 0.5045759081840515,
"learning_rate": 1.3009679518409479e-06,
"loss": 0.07553626298904419,
"step": 3650,
"token_acc": 0.9740194082260132
},
{
"epoch": 2.79265493496557,
"eval_loss": 0.5219829678535461,
"eval_runtime": 8.0288,
"eval_samples_per_second": 12.953,
"eval_steps_per_second": 1.619,
"eval_token_acc": 0.8788595795631409,
"step": 3650
},
{
"epoch": 2.7964804896710023,
"grad_norm": 0.610518217086792,
"learning_rate": 1.2536043909088191e-06,
"loss": 0.10455150604248047,
"step": 3655,
"token_acc": 0.9636396765708923
},
{
"epoch": 2.8003060443764345,
"grad_norm": 0.5319099426269531,
"learning_rate": 1.2071081158535947e-06,
"loss": 0.08882582187652588,
"step": 3660,
"token_acc": 0.968651294708252
},
{
"epoch": 2.8041315990818667,
"grad_norm": 0.6065900325775146,
"learning_rate": 1.1614799539274634e-06,
"loss": 0.08307374119758607,
"step": 3665,
"token_acc": 0.9706868529319763
},
{
"epoch": 2.8079571537872994,
"grad_norm": 0.6401634812355042,
"learning_rate": 1.1167207169373195e-06,
"loss": 0.09725141525268555,
"step": 3670,
"token_acc": 0.9657084941864014
},
{
"epoch": 2.8117827084927316,
"grad_norm": 0.524497389793396,
"learning_rate": 1.0728312012303454e-06,
"loss": 0.11780104637145997,
"step": 3675,
"token_acc": 0.960728108882904
},
{
"epoch": 2.815608263198164,
"grad_norm": 0.7346832156181335,
"learning_rate": 1.0298121876797962e-06,
"loss": 0.11407887935638428,
"step": 3680,
"token_acc": 0.9612630605697632
},
{
"epoch": 2.819433817903596,
"grad_norm": 0.6890755295753479,
"learning_rate": 9.876644416711488e-07,
"loss": 0.11829521656036376,
"step": 3685,
"token_acc": 0.9585215449333191
},
{
"epoch": 2.8232593726090283,
"grad_norm": 0.5342867970466614,
"learning_rate": 9.46388713088453e-07,
"loss": 0.09410252571105956,
"step": 3690,
"token_acc": 0.9661674499511719
},
{
"epoch": 2.8270849273144605,
"grad_norm": 0.4889836311340332,
"learning_rate": 9.059857363010183e-07,
"loss": 0.09603096842765808,
"step": 3695,
"token_acc": 0.965887188911438
},
{
"epoch": 2.8309104820198927,
"grad_norm": 0.5685746073722839,
"learning_rate": 8.664562301503143e-07,
"loss": 0.08459590077400207,
"step": 3700,
"token_acc": 0.9699506163597107
},
{
"epoch": 2.8309104820198927,
"eval_loss": 0.5205320119857788,
"eval_runtime": 7.8427,
"eval_samples_per_second": 13.261,
"eval_steps_per_second": 1.658,
"eval_token_acc": 0.8790299892425537,
"step": 3700
},
{
"epoch": 2.8347360367253254,
"grad_norm": 0.5299521684646606,
"learning_rate": 8.278008979372087e-07,
"loss": 0.09127166271209716,
"step": 3705,
"token_acc": 0.9684864282608032
},
{
"epoch": 2.8385615914307576,
"grad_norm": 0.4766036868095398,
"learning_rate": 7.900204274094602e-07,
"loss": 0.09881120324134826,
"step": 3710,
"token_acc": 0.9655190706253052
},
{
"epoch": 2.84238714613619,
"grad_norm": 8.799799919128418,
"learning_rate": 7.531154907494397e-07,
"loss": 0.13544522523880004,
"step": 3715,
"token_acc": 0.9555306434631348
},
{
"epoch": 2.846212700841622,
"grad_norm": 0.563325822353363,
"learning_rate": 7.170867445622287e-07,
"loss": 0.10241570472717285,
"step": 3720,
"token_acc": 0.9647788405418396
},
{
"epoch": 2.8500382555470543,
"grad_norm": 0.6075456142425537,
"learning_rate": 6.819348298638839e-07,
"loss": 0.12761690616607665,
"step": 3725,
"token_acc": 0.9584816098213196
},
{
"epoch": 2.8538638102524865,
"grad_norm": 0.6337462663650513,
"learning_rate": 6.476603720700636e-07,
"loss": 0.09158645272254944,
"step": 3730,
"token_acc": 0.9687730669975281
},
{
"epoch": 2.8576893649579187,
"grad_norm": 0.5899404287338257,
"learning_rate": 6.142639809849027e-07,
"loss": 0.09597094655036927,
"step": 3735,
"token_acc": 0.9665765762329102
},
{
"epoch": 2.8615149196633514,
"grad_norm": 0.5653353929519653,
"learning_rate": 5.817462507901383e-07,
"loss": 0.10877490043640137,
"step": 3740,
"token_acc": 0.9619103074073792
},
{
"epoch": 2.8653404743687836,
"grad_norm": 0.49452540278434753,
"learning_rate": 5.501077600345572e-07,
"loss": 0.08857889175415039,
"step": 3745,
"token_acc": 0.9700949192047119
},
{
"epoch": 2.869166029074216,
"grad_norm": 0.731597900390625,
"learning_rate": 5.193490716237037e-07,
"loss": 0.12281218767166138,
"step": 3750,
"token_acc": 0.9560735821723938
},
{
"epoch": 2.869166029074216,
"eval_loss": 0.5206364989280701,
"eval_runtime": 9.2942,
"eval_samples_per_second": 11.19,
"eval_steps_per_second": 1.399,
"eval_token_acc": 0.879270613193512,
"step": 3750
},
{
"epoch": 2.872991583779648,
"grad_norm": 0.6116617321968079,
"learning_rate": 4.894707328098602e-07,
"loss": 0.11083317995071411,
"step": 3755,
"token_acc": 0.9610885977745056
},
{
"epoch": 2.8768171384850802,
"grad_norm": 0.5174733400344849,
"learning_rate": 4.6047327518230485e-07,
"loss": 0.08961974382400513,
"step": 3760,
"token_acc": 0.9690099954605103
},
{
"epoch": 2.8806426931905125,
"grad_norm": 0.5262379050254822,
"learning_rate": 4.3235721465784697e-07,
"loss": 0.09585506916046142,
"step": 3765,
"token_acc": 0.9667736887931824
},
{
"epoch": 2.8844682478959447,
"grad_norm": 0.5788334012031555,
"learning_rate": 4.0512305147167863e-07,
"loss": 0.08268014192581177,
"step": 3770,
"token_acc": 0.9712512493133545
},
{
"epoch": 2.8882938026013774,
"grad_norm": 0.687783420085907,
"learning_rate": 3.787712701684598e-07,
"loss": 0.08984529376029968,
"step": 3775,
"token_acc": 0.9686997532844543
},
{
"epoch": 2.8921193573068096,
"grad_norm": 0.6016952395439148,
"learning_rate": 3.5330233959365853e-07,
"loss": 0.09222807884216308,
"step": 3780,
"token_acc": 0.9685728549957275
},
{
"epoch": 2.895944912012242,
"grad_norm": 0.5089208483695984,
"learning_rate": 3.2871671288528525e-07,
"loss": 0.09786663055419922,
"step": 3785,
"token_acc": 0.9665623903274536
},
{
"epoch": 2.899770466717674,
"grad_norm": 1.769921898841858,
"learning_rate": 3.050148274657549e-07,
"loss": 0.12438170909881592,
"step": 3790,
"token_acc": 0.9624179601669312
},
{
"epoch": 2.9035960214231062,
"grad_norm": 0.5424771904945374,
"learning_rate": 2.821971050341654e-07,
"loss": 0.0890495777130127,
"step": 3795,
"token_acc": 0.9703425765037537
},
{
"epoch": 2.907421576128539,
"grad_norm": 0.5487825274467468,
"learning_rate": 2.6026395155874795e-07,
"loss": 0.10370445251464844,
"step": 3800,
"token_acc": 0.9638125896453857
},
{
"epoch": 2.907421576128539,
"eval_loss": 0.5206490159034729,
"eval_runtime": 8.3112,
"eval_samples_per_second": 12.513,
"eval_steps_per_second": 1.564,
"eval_token_acc": 0.8794209361076355,
"step": 3800
},
{
"epoch": 2.9112471308339707,
"grad_norm": 0.5681285262107849,
"learning_rate": 2.3921575726967846e-07,
"loss": 0.09305150508880615,
"step": 3805,
"token_acc": 0.9688363075256348
},
{
"epoch": 2.9150726855394034,
"grad_norm": 0.4438033103942871,
"learning_rate": 2.1905289665211104e-07,
"loss": 0.08973047733306885,
"step": 3810,
"token_acc": 0.9688341021537781
},
{
"epoch": 2.9188982402448356,
"grad_norm": 0.5287227630615234,
"learning_rate": 1.9977572843953296e-07,
"loss": 0.07862873077392578,
"step": 3815,
"token_acc": 0.9715408086776733
},
{
"epoch": 2.922723794950268,
"grad_norm": 0.5739708542823792,
"learning_rate": 1.8138459560735899e-07,
"loss": 0.08315033316612244,
"step": 3820,
"token_acc": 0.9718932509422302
},
{
"epoch": 2.9265493496557,
"grad_norm": 0.6123870611190796,
"learning_rate": 1.638798253668694e-07,
"loss": 0.125601065158844,
"step": 3825,
"token_acc": 0.9556345343589783
},
{
"epoch": 2.9303749043611322,
"grad_norm": 0.6285126209259033,
"learning_rate": 1.4726172915933146e-07,
"loss": 0.09772306680679321,
"step": 3830,
"token_acc": 0.9654306769371033
},
{
"epoch": 2.934200459066565,
"grad_norm": 0.4770904779434204,
"learning_rate": 1.315306026505092e-07,
"loss": 0.0937896728515625,
"step": 3835,
"token_acc": 0.9662994146347046
},
{
"epoch": 2.9380260137719967,
"grad_norm": 0.4980320632457733,
"learning_rate": 1.1668672572539008e-07,
"loss": 0.08644679784774781,
"step": 3840,
"token_acc": 0.969020664691925
},
{
"epoch": 2.9418515684774293,
"grad_norm": 0.5362405180931091,
"learning_rate": 1.0273036248318324e-07,
"loss": 0.08760695457458496,
"step": 3845,
"token_acc": 0.9707760214805603
},
{
"epoch": 2.9456771231828616,
"grad_norm": 0.4886132776737213,
"learning_rate": 8.966176123264003e-08,
"loss": 0.06749528646469116,
"step": 3850,
"token_acc": 0.9768878221511841
},
{
"epoch": 2.9456771231828616,
"eval_loss": 0.5208922028541565,
"eval_runtime": 8.161,
"eval_samples_per_second": 12.743,
"eval_steps_per_second": 1.593,
"eval_token_acc": 0.8793407678604126,
"step": 3850
},
{
"epoch": 2.949502677888294,
"grad_norm": 0.5290758013725281,
"learning_rate": 7.748115448763526e-08,
"loss": 0.07928290963172913,
"step": 3855,
"token_acc": 0.971563458442688
},
{
"epoch": 2.953328232593726,
"grad_norm": 0.6795271039009094,
"learning_rate": 6.618875896303167e-08,
"loss": 0.10474317073822022,
"step": 3860,
"token_acc": 0.9640142321586609
},
{
"epoch": 2.9571537872991582,
"grad_norm": 0.6599166989326477,
"learning_rate": 5.578477557081074e-08,
"loss": 0.10668476819992065,
"step": 3865,
"token_acc": 0.9629032015800476
},
{
"epoch": 2.960979342004591,
"grad_norm": 0.6517552733421326,
"learning_rate": 4.6269389416514486e-08,
"loss": 0.08918753862380982,
"step": 3870,
"token_acc": 0.9688775539398193
},
{
"epoch": 2.964804896710023,
"grad_norm": 0.6627753376960754,
"learning_rate": 3.764276979593695e-08,
"loss": 0.08152820467948914,
"step": 3875,
"token_acc": 0.9715802669525146
},
{
"epoch": 2.9686304514154553,
"grad_norm": 0.5488728284835815,
"learning_rate": 2.990507019213218e-08,
"loss": 0.08794408440589904,
"step": 3880,
"token_acc": 0.9700236916542053
},
{
"epoch": 2.9724560061208876,
"grad_norm": 0.5994005799293518,
"learning_rate": 2.305642827266641e-08,
"loss": 0.10513956546783447,
"step": 3885,
"token_acc": 0.9652788639068604
},
{
"epoch": 2.97628156082632,
"grad_norm": 0.5402779579162598,
"learning_rate": 1.7096965887164475e-08,
"loss": 0.10320125818252564,
"step": 3890,
"token_acc": 0.964747428894043
},
{
"epoch": 2.980107115531752,
"grad_norm": 0.5638807415962219,
"learning_rate": 1.2026789065167077e-08,
"loss": 0.09008901119232178,
"step": 3895,
"token_acc": 0.9677461385726929
},
{
"epoch": 2.9839326702371842,
"grad_norm": 0.6424400806427002,
"learning_rate": 7.845988014215655e-09,
"loss": 0.09886548519134522,
"step": 3900,
"token_acc": 0.9671627879142761
},
{
"epoch": 2.9839326702371842,
"eval_loss": 0.5208696126937866,
"eval_runtime": 8.2236,
"eval_samples_per_second": 12.647,
"eval_steps_per_second": 1.581,
"eval_token_acc": 0.8792405128479004,
"step": 3900
},
{
"epoch": 2.987758224942617,
"grad_norm": 0.6108574867248535,
"learning_rate": 4.554637118270311e-09,
"loss": 0.10293105840682984,
"step": 3905,
"token_acc": 0.9645171165466309
},
{
"epoch": 2.991583779648049,
"grad_norm": 0.5026504993438721,
"learning_rate": 2.1527949363664425e-09,
"loss": 0.1074068307876587,
"step": 3910,
"token_acc": 0.9619331359863281
},
{
"epoch": 2.9954093343534813,
"grad_norm": 0.6875292658805847,
"learning_rate": 6.405042015877882e-10,
"loss": 0.11073212623596192,
"step": 3915,
"token_acc": 0.9605428576469421
},
{
"epoch": 2.9992348890589136,
"grad_norm": 0.6482424139976501,
"learning_rate": 1.7791820305923523e-11,
"loss": 0.11924041509628296,
"step": 3920,
"token_acc": 0.9589547514915466
},
{
"epoch": 3.0,
"eval_loss": 0.5209956765174866,
"eval_runtime": 8.1308,
"eval_samples_per_second": 12.791,
"eval_steps_per_second": 1.599,
"eval_token_acc": 0.8794109225273132,
"step": 3921
}
],
"logging_steps": 5,
"max_steps": 3921,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.411019928798757e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}