{ "best_global_step": 2600, "best_metric": 0.4455747, "best_model_checkpoint": "/root/ms-swift/output_1/v4-20250825-221955/checkpoint-2600", "epoch": 3.0, "eval_steps": 50, "global_step": 3921, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007651109410864575, "grad_norm": 15.001960754394531, "learning_rate": 5.076142131979695e-07, "loss": 1.2726802825927734, "step": 1, "token_acc": 0.6764705777168274 }, { "epoch": 0.0038255547054322878, "grad_norm": 13.028708457946777, "learning_rate": 2.5380710659898476e-06, "loss": 1.495189905166626, "step": 5, "token_acc": 0.6392497420310974 }, { "epoch": 0.0076511094108645756, "grad_norm": 5.605969429016113, "learning_rate": 5.076142131979695e-06, "loss": 1.1087797164916993, "step": 10, "token_acc": 0.7032846808433533 }, { "epoch": 0.011476664116296864, "grad_norm": 4.179737091064453, "learning_rate": 7.614213197969544e-06, "loss": 0.7857523918151855, "step": 15, "token_acc": 0.7791855931282043 }, { "epoch": 0.015302218821729151, "grad_norm": 4.184815883636475, "learning_rate": 1.015228426395939e-05, "loss": 0.6412610054016114, "step": 20, "token_acc": 0.8024289011955261 }, { "epoch": 0.019127773527161437, "grad_norm": 3.188452959060669, "learning_rate": 1.2690355329949238e-05, "loss": 0.6599317073822022, "step": 25, "token_acc": 0.7991740703582764 }, { "epoch": 0.022953328232593728, "grad_norm": 2.735691785812378, "learning_rate": 1.5228426395939088e-05, "loss": 0.6142410278320313, "step": 30, "token_acc": 0.8127740025520325 }, { "epoch": 0.026778882938026015, "grad_norm": 2.9147984981536865, "learning_rate": 1.7766497461928935e-05, "loss": 0.6038710117340088, "step": 35, "token_acc": 0.813315212726593 }, { "epoch": 0.030604437643458302, "grad_norm": 2.701826572418213, "learning_rate": 2.030456852791878e-05, "loss": 0.5683969497680664, "step": 40, "token_acc": 0.8215563893318176 }, { "epoch": 0.03442999234889059, "grad_norm": 2.8082520961761475, "learning_rate": 2.284263959390863e-05, "loss": 0.6069915771484375, "step": 45, "token_acc": 0.8085312843322754 }, { "epoch": 0.03825554705432287, "grad_norm": 2.6436877250671387, "learning_rate": 2.5380710659898476e-05, "loss": 0.5704009056091308, "step": 50, "token_acc": 0.8219647407531738 }, { "epoch": 0.03825554705432287, "eval_loss": 0.5656692981719971, "eval_runtime": 6.1089, "eval_samples_per_second": 17.024, "eval_steps_per_second": 2.128, "eval_token_acc": 0.8207153677940369, "step": 50 }, { "epoch": 0.042081101759755164, "grad_norm": 2.689117670059204, "learning_rate": 2.7918781725888326e-05, "loss": 0.575815486907959, "step": 55, "token_acc": 0.8211867213249207 }, { "epoch": 0.045906656465187455, "grad_norm": 2.2790122032165527, "learning_rate": 3.0456852791878175e-05, "loss": 0.5862385749816894, "step": 60, "token_acc": 0.8205827474594116 }, { "epoch": 0.04973221117061974, "grad_norm": 2.6730895042419434, "learning_rate": 3.299492385786802e-05, "loss": 0.5797908782958985, "step": 65, "token_acc": 0.819099485874176 }, { "epoch": 0.05355776587605203, "grad_norm": 2.4526894092559814, "learning_rate": 3.553299492385787e-05, "loss": 0.6487821102142334, "step": 70, "token_acc": 0.7994943857192993 }, { "epoch": 0.057383320581484314, "grad_norm": 2.265002489089966, "learning_rate": 3.8071065989847716e-05, "loss": 0.6046820640563965, "step": 75, "token_acc": 0.8156428933143616 }, { "epoch": 0.061208875286916604, "grad_norm": 2.5733046531677246, "learning_rate": 4.060913705583756e-05, "loss": 0.5806538581848144, "step": 80, "token_acc": 0.8199408054351807 }, { "epoch": 0.06503442999234889, "grad_norm": 2.3223984241485596, "learning_rate": 4.3147208121827415e-05, "loss": 0.6687778949737548, "step": 85, "token_acc": 0.7976916432380676 }, { "epoch": 0.06885998469778118, "grad_norm": 1.9996718168258667, "learning_rate": 4.568527918781726e-05, "loss": 0.5714664459228516, "step": 90, "token_acc": 0.8250343203544617 }, { "epoch": 0.07268553940321347, "grad_norm": 2.2907140254974365, "learning_rate": 4.822335025380711e-05, "loss": 0.6378528118133545, "step": 95, "token_acc": 0.8057200312614441 }, { "epoch": 0.07651109410864575, "grad_norm": 1.9822206497192383, "learning_rate": 5.076142131979695e-05, "loss": 0.6435206413269043, "step": 100, "token_acc": 0.8065351843833923 }, { "epoch": 0.07651109410864575, "eval_loss": 0.6021918654441833, "eval_runtime": 6.7812, "eval_samples_per_second": 15.337, "eval_steps_per_second": 1.917, "eval_token_acc": 0.814389705657959, "step": 100 }, { "epoch": 0.08033664881407804, "grad_norm": 1.8460628986358643, "learning_rate": 5.329949238578681e-05, "loss": 0.6554917335510254, "step": 105, "token_acc": 0.8218502998352051 }, { "epoch": 0.08416220351951033, "grad_norm": 2.0430757999420166, "learning_rate": 5.583756345177665e-05, "loss": 0.7082652091979981, "step": 110, "token_acc": 0.7900523543357849 }, { "epoch": 0.08798775822494262, "grad_norm": 2.1763596534729004, "learning_rate": 5.83756345177665e-05, "loss": 0.6629996299743652, "step": 115, "token_acc": 0.7997561097145081 }, { "epoch": 0.09181331293037491, "grad_norm": 1.8452140092849731, "learning_rate": 6.091370558375635e-05, "loss": 0.6425168991088868, "step": 120, "token_acc": 0.8068760633468628 }, { "epoch": 0.09563886763580719, "grad_norm": 2.0671913623809814, "learning_rate": 6.34517766497462e-05, "loss": 0.6626197814941406, "step": 125, "token_acc": 0.8050779700279236 }, { "epoch": 0.09946442234123948, "grad_norm": 1.9707857370376587, "learning_rate": 6.598984771573604e-05, "loss": 0.6357526779174805, "step": 130, "token_acc": 0.8117111921310425 }, { "epoch": 0.10328997704667177, "grad_norm": 1.684924840927124, "learning_rate": 6.852791878172589e-05, "loss": 0.6633370399475098, "step": 135, "token_acc": 0.8078529834747314 }, { "epoch": 0.10711553175210406, "grad_norm": 1.8460227251052856, "learning_rate": 7.106598984771574e-05, "loss": 0.7214941501617431, "step": 140, "token_acc": 0.7888500690460205 }, { "epoch": 0.11094108645753634, "grad_norm": 1.8344098329544067, "learning_rate": 7.360406091370558e-05, "loss": 0.7153414249420166, "step": 145, "token_acc": 0.7917036414146423 }, { "epoch": 0.11476664116296863, "grad_norm": 2.0649237632751465, "learning_rate": 7.614213197969543e-05, "loss": 0.8018023490905761, "step": 150, "token_acc": 0.7870769500732422 }, { "epoch": 0.11476664116296863, "eval_loss": 0.6869359612464905, "eval_runtime": 7.176, "eval_samples_per_second": 14.493, "eval_steps_per_second": 1.812, "eval_token_acc": 0.8004150390625, "step": 150 }, { "epoch": 0.11859219586840092, "grad_norm": 2.0781986713409424, "learning_rate": 7.868020304568529e-05, "loss": 0.7426050186157227, "step": 155, "token_acc": 0.784966230392456 }, { "epoch": 0.12241775057383321, "grad_norm": 3.169353485107422, "learning_rate": 8.121827411167512e-05, "loss": 0.6967845916748047, "step": 160, "token_acc": 0.799592137336731 }, { "epoch": 0.1262433052792655, "grad_norm": 2.8000311851501465, "learning_rate": 8.375634517766498e-05, "loss": 0.6940568923950196, "step": 165, "token_acc": 0.7990803718566895 }, { "epoch": 0.13006885998469778, "grad_norm": 1.7199612855911255, "learning_rate": 8.629441624365483e-05, "loss": 0.6588430404663086, "step": 170, "token_acc": 0.8097391724586487 }, { "epoch": 0.13389441469013008, "grad_norm": 1.6225758790969849, "learning_rate": 8.883248730964467e-05, "loss": 0.7546923160552979, "step": 175, "token_acc": 0.7823401093482971 }, { "epoch": 0.13771996939556236, "grad_norm": 1.738344430923462, "learning_rate": 9.137055837563452e-05, "loss": 0.6869890213012695, "step": 180, "token_acc": 0.8029044270515442 }, { "epoch": 0.14154552410099464, "grad_norm": 1.7446883916854858, "learning_rate": 9.390862944162437e-05, "loss": 0.744170093536377, "step": 185, "token_acc": 0.7861586213111877 }, { "epoch": 0.14537107880642694, "grad_norm": 1.5875240564346313, "learning_rate": 9.644670050761421e-05, "loss": 0.6316198348999024, "step": 190, "token_acc": 0.8180323839187622 }, { "epoch": 0.14919663351185922, "grad_norm": 1.83012855052948, "learning_rate": 9.898477157360407e-05, "loss": 1.0572455406188965, "step": 195, "token_acc": 0.7630072236061096 }, { "epoch": 0.1530221882172915, "grad_norm": 9.883597373962402, "learning_rate": 9.99998398736932e-05, "loss": 0.703323221206665, "step": 200, "token_acc": 0.8030744194984436 }, { "epoch": 0.1530221882172915, "eval_loss": 0.7220072150230408, "eval_runtime": 7.3149, "eval_samples_per_second": 14.218, "eval_steps_per_second": 1.777, "eval_token_acc": 0.7940492630004883, "step": 200 }, { "epoch": 0.1568477429227238, "grad_norm": 1.4011379480361938, "learning_rate": 9.999886132775469e-05, "loss": 0.7197819232940674, "step": 205, "token_acc": 0.7953398823738098 }, { "epoch": 0.16067329762815608, "grad_norm": 1.5504759550094604, "learning_rate": 9.999699321232598e-05, "loss": 0.6872771263122559, "step": 210, "token_acc": 0.804167628288269 }, { "epoch": 0.16449885233358838, "grad_norm": 2.0014920234680176, "learning_rate": 9.999423556064422e-05, "loss": 0.6684097290039063, "step": 215, "token_acc": 0.8079100847244263 }, { "epoch": 0.16832440703902066, "grad_norm": 1.3064231872558594, "learning_rate": 9.999058842177297e-05, "loss": 0.747900390625, "step": 220, "token_acc": 0.7928001880645752 }, { "epoch": 0.17214996174445293, "grad_norm": 1.6330523490905762, "learning_rate": 9.998605186060137e-05, "loss": 0.715455961227417, "step": 225, "token_acc": 0.7988653779029846 }, { "epoch": 0.17597551644988524, "grad_norm": 1.6291477680206299, "learning_rate": 9.9980625957843e-05, "loss": 0.792291784286499, "step": 230, "token_acc": 0.7906692624092102 }, { "epoch": 0.17980107115531752, "grad_norm": 1.3224996328353882, "learning_rate": 9.99743108100344e-05, "loss": 0.6187815189361572, "step": 235, "token_acc": 0.8209345936775208 }, { "epoch": 0.18362662586074982, "grad_norm": 1.3888137340545654, "learning_rate": 9.996710652953338e-05, "loss": 0.7097324371337891, "step": 240, "token_acc": 0.8024294376373291 }, { "epoch": 0.1874521805661821, "grad_norm": 1.340208649635315, "learning_rate": 9.995901324451704e-05, "loss": 0.7415911674499511, "step": 245, "token_acc": 0.7968400716781616 }, { "epoch": 0.19127773527161437, "grad_norm": 1.1856446266174316, "learning_rate": 9.995003109897942e-05, "loss": 0.7001552581787109, "step": 250, "token_acc": 0.8009890913963318 }, { "epoch": 0.19127773527161437, "eval_loss": 0.6857067942619324, "eval_runtime": 7.3358, "eval_samples_per_second": 14.177, "eval_steps_per_second": 1.772, "eval_token_acc": 0.803743302822113, "step": 250 }, { "epoch": 0.19510328997704668, "grad_norm": 1.2998038530349731, "learning_rate": 9.994016025272905e-05, "loss": 0.6838603019714355, "step": 255, "token_acc": 0.8089724779129028 }, { "epoch": 0.19892884468247896, "grad_norm": 1.449840784072876, "learning_rate": 9.992940088138597e-05, "loss": 0.6695821762084961, "step": 260, "token_acc": 0.8115434646606445 }, { "epoch": 0.20275439938791126, "grad_norm": 2.188504219055176, "learning_rate": 9.991775317637873e-05, "loss": 0.7405529499053956, "step": 265, "token_acc": 0.7956330180168152 }, { "epoch": 0.20657995409334354, "grad_norm": 1.2301571369171143, "learning_rate": 9.99052173449409e-05, "loss": 0.7626109600067139, "step": 270, "token_acc": 0.7877880334854126 }, { "epoch": 0.21040550879877581, "grad_norm": 1.217523455619812, "learning_rate": 9.989179361010741e-05, "loss": 0.7369673728942872, "step": 275, "token_acc": 0.7953155040740967 }, { "epoch": 0.21423106350420812, "grad_norm": 1.3204615116119385, "learning_rate": 9.987748221071062e-05, "loss": 0.6772171497344971, "step": 280, "token_acc": 0.8045340180397034 }, { "epoch": 0.2180566182096404, "grad_norm": 1.3093225955963135, "learning_rate": 9.9862283401376e-05, "loss": 0.904904556274414, "step": 285, "token_acc": 0.7854760885238647 }, { "epoch": 0.22188217291507267, "grad_norm": 1.4255338907241821, "learning_rate": 9.984619745251767e-05, "loss": 0.669553565979004, "step": 290, "token_acc": 0.8050349354743958 }, { "epoch": 0.22570772762050498, "grad_norm": 1.4884202480316162, "learning_rate": 9.98292246503335e-05, "loss": 0.7445178508758545, "step": 295, "token_acc": 0.8016032576560974 }, { "epoch": 0.22953328232593725, "grad_norm": 1.3081945180892944, "learning_rate": 9.981136529680013e-05, "loss": 0.6435537815093995, "step": 300, "token_acc": 0.8145782947540283 }, { "epoch": 0.22953328232593725, "eval_loss": 0.6707971096038818, "eval_runtime": 7.6759, "eval_samples_per_second": 13.549, "eval_steps_per_second": 1.694, "eval_token_acc": 0.809266984462738, "step": 300 }, { "epoch": 0.23335883703136956, "grad_norm": 1.2945371866226196, "learning_rate": 9.979261970966752e-05, "loss": 0.671229362487793, "step": 305, "token_acc": 0.8093103170394897 }, { "epoch": 0.23718439173680184, "grad_norm": 1.094642996788025, "learning_rate": 9.97729882224533e-05, "loss": 0.638882064819336, "step": 310, "token_acc": 0.8210087418556213 }, { "epoch": 0.2410099464422341, "grad_norm": 1.2039848566055298, "learning_rate": 9.975247118443686e-05, "loss": 0.7105097770690918, "step": 315, "token_acc": 0.79979407787323 }, { "epoch": 0.24483550114766642, "grad_norm": 9.3181734085083, "learning_rate": 9.973106896065318e-05, "loss": 0.7334442615509034, "step": 320, "token_acc": 0.8001999855041504 }, { "epoch": 0.2486610558530987, "grad_norm": 1.2156879901885986, "learning_rate": 9.970878193188617e-05, "loss": 0.6516756534576416, "step": 325, "token_acc": 0.8167580366134644 }, { "epoch": 0.252486610558531, "grad_norm": 1.382604956626892, "learning_rate": 9.968561049466214e-05, "loss": 0.7214525222778321, "step": 330, "token_acc": 0.7979754209518433 }, { "epoch": 0.2563121652639633, "grad_norm": 1.0208624601364136, "learning_rate": 9.96615550612425e-05, "loss": 0.6243480205535888, "step": 335, "token_acc": 0.822067379951477 }, { "epoch": 0.26013771996939555, "grad_norm": 1.2273170948028564, "learning_rate": 9.96366160596166e-05, "loss": 0.7538263320922851, "step": 340, "token_acc": 0.7931398749351501 }, { "epoch": 0.26396327467482783, "grad_norm": 1.005936622619629, "learning_rate": 9.961079393349408e-05, "loss": 0.6441500663757325, "step": 345, "token_acc": 0.8183194398880005 }, { "epoch": 0.26778882938026016, "grad_norm": 1.2466620206832886, "learning_rate": 9.958408914229687e-05, "loss": 0.7031271934509278, "step": 350, "token_acc": 0.8006601929664612 }, { "epoch": 0.26778882938026016, "eval_loss": 0.6655329465866089, "eval_runtime": 8.6572, "eval_samples_per_second": 12.013, "eval_steps_per_second": 1.502, "eval_token_acc": 0.810479998588562, "step": 350 }, { "epoch": 0.27161438408569244, "grad_norm": 1.1055852174758911, "learning_rate": 9.955650216115118e-05, "loss": 0.7128757953643798, "step": 355, "token_acc": 0.8017191886901855 }, { "epoch": 0.2754399387911247, "grad_norm": 0.9971266388893127, "learning_rate": 9.952803348087888e-05, "loss": 0.6931791305541992, "step": 360, "token_acc": 0.8039373159408569 }, { "epoch": 0.279265493496557, "grad_norm": 1.3013373613357544, "learning_rate": 9.949868360798893e-05, "loss": 0.6467844486236572, "step": 365, "token_acc": 0.8141829371452332 }, { "epoch": 0.28309104820198927, "grad_norm": 1.1281312704086304, "learning_rate": 9.946845306466822e-05, "loss": 0.6698862075805664, "step": 370, "token_acc": 0.8099541664123535 }, { "epoch": 0.2869166029074216, "grad_norm": 1.0093694925308228, "learning_rate": 9.943734238877241e-05, "loss": 0.640196704864502, "step": 375, "token_acc": 0.8200778961181641 }, { "epoch": 0.2907421576128539, "grad_norm": 1.161116361618042, "learning_rate": 9.940535213381623e-05, "loss": 0.7982209682464599, "step": 380, "token_acc": 0.803227961063385 }, { "epoch": 0.29456771231828616, "grad_norm": 1.17842435836792, "learning_rate": 9.937248286896376e-05, "loss": 0.674342155456543, "step": 385, "token_acc": 0.8081824779510498 }, { "epoch": 0.29839326702371843, "grad_norm": 1.2346426248550415, "learning_rate": 9.933873517901825e-05, "loss": 0.6990632057189942, "step": 390, "token_acc": 0.8067554235458374 }, { "epoch": 0.3022188217291507, "grad_norm": 1.1731232404708862, "learning_rate": 9.930410966441164e-05, "loss": 0.7052478790283203, "step": 395, "token_acc": 0.8015207052230835 }, { "epoch": 0.306044376434583, "grad_norm": 1.1818660497665405, "learning_rate": 9.926860694119398e-05, "loss": 0.6852362632751465, "step": 400, "token_acc": 0.8096556663513184 }, { "epoch": 0.306044376434583, "eval_loss": 0.6521208882331848, "eval_runtime": 7.4215, "eval_samples_per_second": 14.013, "eval_steps_per_second": 1.752, "eval_token_acc": 0.8145099878311157, "step": 400 }, { "epoch": 0.3098699311400153, "grad_norm": 1.166639804840088, "learning_rate": 9.923222764102248e-05, "loss": 0.6215761661529541, "step": 405, "token_acc": 0.8188217282295227 }, { "epoch": 0.3136954858454476, "grad_norm": 1.0579371452331543, "learning_rate": 9.919497241115016e-05, "loss": 0.6619209289550781, "step": 410, "token_acc": 0.8130149841308594 }, { "epoch": 0.3175210405508799, "grad_norm": 1.025505542755127, "learning_rate": 9.915684191441446e-05, "loss": 0.681110954284668, "step": 415, "token_acc": 0.8061873316764832 }, { "epoch": 0.32134659525631215, "grad_norm": 1.1900734901428223, "learning_rate": 9.911783682922533e-05, "loss": 0.6414823532104492, "step": 420, "token_acc": 0.8169435262680054 }, { "epoch": 0.32517214996174443, "grad_norm": 1.0435925722122192, "learning_rate": 9.907795784955327e-05, "loss": 0.650167179107666, "step": 425, "token_acc": 0.8135402202606201 }, { "epoch": 0.32899770466717676, "grad_norm": 0.9976479411125183, "learning_rate": 9.90372056849169e-05, "loss": 0.6622737884521485, "step": 430, "token_acc": 0.8130133152008057 }, { "epoch": 0.33282325937260904, "grad_norm": 1.025640606880188, "learning_rate": 9.899558106037039e-05, "loss": 0.7082881927490234, "step": 435, "token_acc": 0.8012630343437195 }, { "epoch": 0.3366488140780413, "grad_norm": 1.1692794561386108, "learning_rate": 9.895308471649052e-05, "loss": 0.7149417877197266, "step": 440, "token_acc": 0.8121411204338074 }, { "epoch": 0.3404743687834736, "grad_norm": 1.0781068801879883, "learning_rate": 9.890971740936352e-05, "loss": 0.6460227012634278, "step": 445, "token_acc": 0.8171982169151306 }, { "epoch": 0.34429992348890587, "grad_norm": 1.7874302864074707, "learning_rate": 9.886547991057162e-05, "loss": 0.6831697463989258, "step": 450, "token_acc": 0.8117350339889526 }, { "epoch": 0.34429992348890587, "eval_loss": 0.6621751189231873, "eval_runtime": 7.4514, "eval_samples_per_second": 13.957, "eval_steps_per_second": 1.745, "eval_token_acc": 0.8132668733596802, "step": 450 }, { "epoch": 0.3481254781943382, "grad_norm": 1.1658034324645996, "learning_rate": 9.882037300717936e-05, "loss": 0.6283795356750488, "step": 455, "token_acc": 0.8232808709144592 }, { "epoch": 0.3519510328997705, "grad_norm": 0.8861122727394104, "learning_rate": 9.87743975017195e-05, "loss": 0.5845287322998047, "step": 460, "token_acc": 0.8338332176208496 }, { "epoch": 0.35577658760520275, "grad_norm": 1.1082383394241333, "learning_rate": 9.872755421217881e-05, "loss": 0.7373793125152588, "step": 465, "token_acc": 0.7927750945091248 }, { "epoch": 0.35960214231063503, "grad_norm": 0.9668710827827454, "learning_rate": 9.867984397198348e-05, "loss": 0.6381460189819336, "step": 470, "token_acc": 0.8192023038864136 }, { "epoch": 0.3634276970160673, "grad_norm": 1.0808384418487549, "learning_rate": 9.863126762998436e-05, "loss": 0.7160910606384278, "step": 475, "token_acc": 0.8008524179458618 }, { "epoch": 0.36725325172149964, "grad_norm": 1.0136635303497314, "learning_rate": 9.858182605044172e-05, "loss": 0.6220456123352051, "step": 480, "token_acc": 0.8248037099838257 }, { "epoch": 0.3710788064269319, "grad_norm": 1.2998031377792358, "learning_rate": 9.853152011301003e-05, "loss": 0.6555353164672851, "step": 485, "token_acc": 0.8161742687225342 }, { "epoch": 0.3749043611323642, "grad_norm": 1.0749304294586182, "learning_rate": 9.848035071272222e-05, "loss": 0.6211759567260742, "step": 490, "token_acc": 0.821867048740387 }, { "epoch": 0.37872991583779647, "grad_norm": 0.9710472226142883, "learning_rate": 9.842831875997375e-05, "loss": 0.6431370735168457, "step": 495, "token_acc": 0.8220862150192261 }, { "epoch": 0.38255547054322875, "grad_norm": 1.0042985677719116, "learning_rate": 9.837542518050649e-05, "loss": 0.6818212509155274, "step": 500, "token_acc": 0.8100237846374512 }, { "epoch": 0.38255547054322875, "eval_loss": 0.6374099254608154, "eval_runtime": 7.6101, "eval_samples_per_second": 13.666, "eval_steps_per_second": 1.708, "eval_token_acc": 0.8186302185058594, "step": 500 }, { "epoch": 0.3863810252486611, "grad_norm": 1.0197993516921997, "learning_rate": 9.832167091539214e-05, "loss": 0.6007397174835205, "step": 505, "token_acc": 0.8282684683799744 }, { "epoch": 0.39020657995409336, "grad_norm": 1.0835719108581543, "learning_rate": 9.826705692101555e-05, "loss": 0.7205737113952637, "step": 510, "token_acc": 0.7967984080314636 }, { "epoch": 0.39403213465952563, "grad_norm": 0.9672032594680786, "learning_rate": 9.821158416905773e-05, "loss": 0.6137794494628906, "step": 515, "token_acc": 0.8238478899002075 }, { "epoch": 0.3978576893649579, "grad_norm": 1.0274014472961426, "learning_rate": 9.815525364647853e-05, "loss": 0.6839157104492187, "step": 520, "token_acc": 0.8090466856956482 }, { "epoch": 0.4016832440703902, "grad_norm": 0.966098427772522, "learning_rate": 9.809806635549901e-05, "loss": 0.5641196250915528, "step": 525, "token_acc": 0.8359003663063049 }, { "epoch": 0.4055087987758225, "grad_norm": 1.1138949394226074, "learning_rate": 9.804002331358377e-05, "loss": 0.615296745300293, "step": 530, "token_acc": 0.8272916674613953 }, { "epoch": 0.4093343534812548, "grad_norm": 2.4379749298095703, "learning_rate": 9.798112555342268e-05, "loss": 0.5940766334533691, "step": 535, "token_acc": 0.8358057737350464 }, { "epoch": 0.4131599081866871, "grad_norm": 1.1517431735992432, "learning_rate": 9.792137412291265e-05, "loss": 0.6338438034057617, "step": 540, "token_acc": 0.8158274292945862 }, { "epoch": 0.41698546289211935, "grad_norm": 6.055464744567871, "learning_rate": 9.786077008513883e-05, "loss": 0.6075318336486817, "step": 545, "token_acc": 0.8209756016731262 }, { "epoch": 0.42081101759755163, "grad_norm": 0.9165500402450562, "learning_rate": 9.779931451835589e-05, "loss": 0.659608793258667, "step": 550, "token_acc": 0.815700113773346 }, { "epoch": 0.42081101759755163, "eval_loss": 0.6386705636978149, "eval_runtime": 8.1335, "eval_samples_per_second": 12.787, "eval_steps_per_second": 1.598, "eval_token_acc": 0.8192116618156433, "step": 550 }, { "epoch": 0.4246365723029839, "grad_norm": 3.8534209728240967, "learning_rate": 9.773700851596864e-05, "loss": 0.689471435546875, "step": 555, "token_acc": 0.8077275156974792 }, { "epoch": 0.42846212700841624, "grad_norm": 1.0717378854751587, "learning_rate": 9.767385318651272e-05, "loss": 0.6236325740814209, "step": 560, "token_acc": 0.826772928237915 }, { "epoch": 0.4322876817138485, "grad_norm": 0.9380275011062622, "learning_rate": 9.760984965363478e-05, "loss": 0.6055815696716309, "step": 565, "token_acc": 0.8277127146720886 }, { "epoch": 0.4361132364192808, "grad_norm": 0.9301455020904541, "learning_rate": 9.75449990560726e-05, "loss": 0.5975317001342774, "step": 570, "token_acc": 0.8306687474250793 }, { "epoch": 0.43993879112471307, "grad_norm": 0.9384899735450745, "learning_rate": 9.747930254763467e-05, "loss": 0.631765604019165, "step": 575, "token_acc": 0.8169443011283875 }, { "epoch": 0.44376434583014535, "grad_norm": 0.9002703428268433, "learning_rate": 9.74127612971798e-05, "loss": 0.6044256210327148, "step": 580, "token_acc": 0.8257142305374146 }, { "epoch": 0.4475899005355777, "grad_norm": 0.8999844193458557, "learning_rate": 9.73453764885963e-05, "loss": 0.6237145900726319, "step": 585, "token_acc": 0.8252273797988892 }, { "epoch": 0.45141545524100996, "grad_norm": 0.9064670205116272, "learning_rate": 9.727714932078088e-05, "loss": 0.6549233436584473, "step": 590, "token_acc": 0.8153916001319885 }, { "epoch": 0.45524100994644223, "grad_norm": 1.0747268199920654, "learning_rate": 9.720808100761729e-05, "loss": 0.6232728004455567, "step": 595, "token_acc": 0.8211687207221985 }, { "epoch": 0.4590665646518745, "grad_norm": 1.031503438949585, "learning_rate": 9.713817277795482e-05, "loss": 0.6111268043518067, "step": 600, "token_acc": 0.8248355984687805 }, { "epoch": 0.4590665646518745, "eval_loss": 0.634019136428833, "eval_runtime": 7.7263, "eval_samples_per_second": 13.46, "eval_steps_per_second": 1.683, "eval_token_acc": 0.8194121718406677, "step": 600 }, { "epoch": 0.4628921193573068, "grad_norm": 18.878767013549805, "learning_rate": 9.706742587558635e-05, "loss": 0.7319217681884765, "step": 605, "token_acc": 0.8135314583778381 }, { "epoch": 0.4667176740627391, "grad_norm": 0.9823316931724548, "learning_rate": 9.699584155922625e-05, "loss": 0.658491849899292, "step": 610, "token_acc": 0.8164398670196533 }, { "epoch": 0.4705432287681714, "grad_norm": 1.1845817565917969, "learning_rate": 9.692342110248802e-05, "loss": 0.6585088729858398, "step": 615, "token_acc": 0.8140710592269897 }, { "epoch": 0.4743687834736037, "grad_norm": 1.0284193754196167, "learning_rate": 9.685016579386159e-05, "loss": 0.6060408592224121, "step": 620, "token_acc": 0.8255147933959961 }, { "epoch": 0.47819433817903595, "grad_norm": 1.0485318899154663, "learning_rate": 9.677607693669035e-05, "loss": 0.6855095863342285, "step": 625, "token_acc": 0.8098092079162598 }, { "epoch": 0.4820198928844682, "grad_norm": 2.119432210922241, "learning_rate": 9.67011558491481e-05, "loss": 0.6514041423797607, "step": 630, "token_acc": 0.8163265585899353 }, { "epoch": 0.48584544758990056, "grad_norm": 0.9313147664070129, "learning_rate": 9.662540386421546e-05, "loss": 0.6687870025634766, "step": 635, "token_acc": 0.8119432330131531 }, { "epoch": 0.48967100229533284, "grad_norm": 0.9492276310920715, "learning_rate": 9.65488223296562e-05, "loss": 0.6563722610473632, "step": 640, "token_acc": 0.8168354034423828 }, { "epoch": 0.4934965570007651, "grad_norm": 1.0297837257385254, "learning_rate": 9.64714126079933e-05, "loss": 0.5913913726806641, "step": 645, "token_acc": 0.828011691570282 }, { "epoch": 0.4973221117061974, "grad_norm": 1.0799224376678467, "learning_rate": 9.639317607648463e-05, "loss": 0.6493720054626465, "step": 650, "token_acc": 0.8191680312156677 }, { "epoch": 0.4973221117061974, "eval_loss": 0.6336340308189392, "eval_runtime": 8.085, "eval_samples_per_second": 12.863, "eval_steps_per_second": 1.608, "eval_token_acc": 0.8203945755958557, "step": 650 }, { "epoch": 0.5011476664116297, "grad_norm": 0.9438362717628479, "learning_rate": 9.631411412709856e-05, "loss": 0.634061050415039, "step": 655, "token_acc": 0.8196708559989929 }, { "epoch": 0.504973221117062, "grad_norm": 0.9886628985404968, "learning_rate": 9.623422816648905e-05, "loss": 0.6314868450164794, "step": 660, "token_acc": 0.8192417025566101 }, { "epoch": 0.5087987758224942, "grad_norm": 1.053757667541504, "learning_rate": 9.615351961597075e-05, "loss": 0.6161402225494385, "step": 665, "token_acc": 0.8249170780181885 }, { "epoch": 0.5126243305279266, "grad_norm": 0.8857008814811707, "learning_rate": 9.607198991149365e-05, "loss": 0.6382771968841553, "step": 670, "token_acc": 0.8191618323326111 }, { "epoch": 0.5164498852333589, "grad_norm": 0.9176872968673706, "learning_rate": 9.598964050361749e-05, "loss": 0.6668461799621582, "step": 675, "token_acc": 0.8112070560455322 }, { "epoch": 0.5202754399387911, "grad_norm": 0.8668197393417358, "learning_rate": 9.590647285748613e-05, "loss": 0.6178393363952637, "step": 680, "token_acc": 0.8246564269065857 }, { "epoch": 0.5241009946442234, "grad_norm": 0.8694312572479248, "learning_rate": 9.582248845280121e-05, "loss": 0.6056000709533691, "step": 685, "token_acc": 0.8267983198165894 }, { "epoch": 0.5279265493496557, "grad_norm": 1.0597003698349, "learning_rate": 9.57376887837961e-05, "loss": 0.6181661128997803, "step": 690, "token_acc": 0.8232805728912354 }, { "epoch": 0.531752104055088, "grad_norm": 0.8571362495422363, "learning_rate": 9.565207535920906e-05, "loss": 0.6172348976135253, "step": 695, "token_acc": 0.8221156597137451 }, { "epoch": 0.5355776587605203, "grad_norm": 0.9073564410209656, "learning_rate": 9.556564970225666e-05, "loss": 0.6466682434082032, "step": 700, "token_acc": 0.8197444081306458 }, { "epoch": 0.5355776587605203, "eval_loss": 0.6152887344360352, "eval_runtime": 7.5903, "eval_samples_per_second": 13.702, "eval_steps_per_second": 1.713, "eval_token_acc": 0.8228907585144043, "step": 700 }, { "epoch": 0.5394032134659525, "grad_norm": 0.9663663506507874, "learning_rate": 9.547841335060641e-05, "loss": 0.6051031112670898, "step": 705, "token_acc": 0.8252653479576111 }, { "epoch": 0.5432287681713849, "grad_norm": 0.9873702526092529, "learning_rate": 9.539036785634961e-05, "loss": 0.6133259296417236, "step": 710, "token_acc": 0.8265376687049866 }, { "epoch": 0.5470543228768171, "grad_norm": 0.8775202035903931, "learning_rate": 9.530151478597366e-05, "loss": 0.6536783218383789, "step": 715, "token_acc": 0.8136675357818604 }, { "epoch": 0.5508798775822494, "grad_norm": 0.8767590522766113, "learning_rate": 9.521185572033416e-05, "loss": 0.5738767147064209, "step": 720, "token_acc": 0.8351121544837952 }, { "epoch": 0.5547054322876818, "grad_norm": 0.9340411424636841, "learning_rate": 9.512139225462682e-05, "loss": 0.60714693069458, "step": 725, "token_acc": 0.8243422508239746 }, { "epoch": 0.558530986993114, "grad_norm": 0.924868643283844, "learning_rate": 9.503012599835907e-05, "loss": 0.5976818084716797, "step": 730, "token_acc": 0.8307338953018188 }, { "epoch": 0.5623565416985463, "grad_norm": 1.1880912780761719, "learning_rate": 9.493805857532148e-05, "loss": 0.7305125236511231, "step": 735, "token_acc": 0.7984393239021301 }, { "epoch": 0.5661820964039785, "grad_norm": 0.8552014827728271, "learning_rate": 9.48451916235587e-05, "loss": 0.631963062286377, "step": 740, "token_acc": 0.8215923309326172 }, { "epoch": 0.5700076511094109, "grad_norm": 0.9064537882804871, "learning_rate": 9.475152679534052e-05, "loss": 0.5955155849456787, "step": 745, "token_acc": 0.8277559876441956 }, { "epoch": 0.5738332058148432, "grad_norm": 0.953490138053894, "learning_rate": 9.465706575713236e-05, "loss": 0.5581603050231934, "step": 750, "token_acc": 0.8392514586448669 }, { "epoch": 0.5738332058148432, "eval_loss": 0.6101195812225342, "eval_runtime": 7.839, "eval_samples_per_second": 13.267, "eval_steps_per_second": 1.658, "eval_token_acc": 0.8254771828651428, "step": 750 }, { "epoch": 0.5776587605202754, "grad_norm": 0.9111331105232239, "learning_rate": 9.456181018956567e-05, "loss": 0.5761038780212402, "step": 755, "token_acc": 0.8335671424865723 }, { "epoch": 0.5814843152257078, "grad_norm": 0.9279806613922119, "learning_rate": 9.446576178740795e-05, "loss": 0.6236689567565918, "step": 760, "token_acc": 0.8229003548622131 }, { "epoch": 0.58530986993114, "grad_norm": 0.8497107028961182, "learning_rate": 9.436892225953269e-05, "loss": 0.6130060672760009, "step": 765, "token_acc": 0.8241313099861145 }, { "epoch": 0.5891354246365723, "grad_norm": 0.933496356010437, "learning_rate": 9.427129332888891e-05, "loss": 0.6331747055053711, "step": 770, "token_acc": 0.8258751034736633 }, { "epoch": 0.5929609793420046, "grad_norm": 0.95807945728302, "learning_rate": 9.417287673247052e-05, "loss": 0.5901139259338379, "step": 775, "token_acc": 0.8336220383644104 }, { "epoch": 0.5967865340474369, "grad_norm": 0.9931139349937439, "learning_rate": 9.407367422128547e-05, "loss": 0.6363272666931152, "step": 780, "token_acc": 0.8183371424674988 }, { "epoch": 0.6006120887528692, "grad_norm": 0.8274650573730469, "learning_rate": 9.397368756032445e-05, "loss": 0.5664173603057862, "step": 785, "token_acc": 0.8378447890281677 }, { "epoch": 0.6044376434583014, "grad_norm": 1.017050862312317, "learning_rate": 9.387291852852967e-05, "loss": 0.6467793464660645, "step": 790, "token_acc": 0.818406343460083 }, { "epoch": 0.6082631981637338, "grad_norm": 0.8612256050109863, "learning_rate": 9.377136891876306e-05, "loss": 0.644353199005127, "step": 795, "token_acc": 0.8149409294128418 }, { "epoch": 0.612088752869166, "grad_norm": 0.9359307289123535, "learning_rate": 9.366904053777447e-05, "loss": 0.6541380882263184, "step": 800, "token_acc": 0.8136578798294067 }, { "epoch": 0.612088752869166, "eval_loss": 0.600931704044342, "eval_runtime": 7.7335, "eval_samples_per_second": 13.448, "eval_steps_per_second": 1.681, "eval_token_acc": 0.8262491226196289, "step": 800 }, { "epoch": 0.6159143075745983, "grad_norm": 0.8562702536582947, "learning_rate": 9.356593520616948e-05, "loss": 0.5768568038940429, "step": 805, "token_acc": 0.8369309902191162 }, { "epoch": 0.6197398622800306, "grad_norm": 0.8822196125984192, "learning_rate": 9.3462054758377e-05, "loss": 0.6508576393127441, "step": 810, "token_acc": 0.8174927234649658 }, { "epoch": 0.6235654169854629, "grad_norm": 0.8938590288162231, "learning_rate": 9.335740104261664e-05, "loss": 0.6667316436767579, "step": 815, "token_acc": 0.8100781440734863 }, { "epoch": 0.6273909716908952, "grad_norm": 1.007367491722107, "learning_rate": 9.32519759208659e-05, "loss": 0.72325439453125, "step": 820, "token_acc": 0.8077250123023987 }, { "epoch": 0.6312165263963274, "grad_norm": 1.01559579372406, "learning_rate": 9.314578126882691e-05, "loss": 0.5955130577087402, "step": 825, "token_acc": 0.8294063806533813 }, { "epoch": 0.6350420811017597, "grad_norm": 0.9418911933898926, "learning_rate": 9.303881897589315e-05, "loss": 0.6099714279174805, "step": 830, "token_acc": 0.8279644250869751 }, { "epoch": 0.6388676358071921, "grad_norm": 0.9409440755844116, "learning_rate": 9.29310909451158e-05, "loss": 0.5885293006896972, "step": 835, "token_acc": 0.8318097591400146 }, { "epoch": 0.6426931905126243, "grad_norm": 0.9052807688713074, "learning_rate": 9.28225990931699e-05, "loss": 0.5844202995300293, "step": 840, "token_acc": 0.8323644399642944 }, { "epoch": 0.6465187452180566, "grad_norm": 1.170585036277771, "learning_rate": 9.271334535032026e-05, "loss": 0.6612658500671387, "step": 845, "token_acc": 0.8123800754547119 }, { "epoch": 0.6503442999234889, "grad_norm": 0.89767986536026, "learning_rate": 9.260333166038704e-05, "loss": 0.6106939315795898, "step": 850, "token_acc": 0.8253637552261353 }, { "epoch": 0.6503442999234889, "eval_loss": 0.595952033996582, "eval_runtime": 7.7396, "eval_samples_per_second": 13.437, "eval_steps_per_second": 1.68, "eval_token_acc": 0.8275924324989319, "step": 850 }, { "epoch": 0.6541698546289212, "grad_norm": 0.8901084661483765, "learning_rate": 9.249255998071126e-05, "loss": 0.5618688106536865, "step": 855, "token_acc": 0.8380252718925476 }, { "epoch": 0.6579954093343535, "grad_norm": 0.8414104580879211, "learning_rate": 9.238103228211997e-05, "loss": 0.5890965461730957, "step": 860, "token_acc": 0.8292516469955444 }, { "epoch": 0.6618209640397857, "grad_norm": 0.8542090058326721, "learning_rate": 9.226875054889108e-05, "loss": 0.5492356300354004, "step": 865, "token_acc": 0.8417258858680725 }, { "epoch": 0.6656465187452181, "grad_norm": 0.928252100944519, "learning_rate": 9.21557167787182e-05, "loss": 0.6059693813323974, "step": 870, "token_acc": 0.827387273311615 }, { "epoch": 0.6694720734506503, "grad_norm": 0.8323174118995667, "learning_rate": 9.204193298267496e-05, "loss": 0.6152177810668945, "step": 875, "token_acc": 0.8236430287361145 }, { "epoch": 0.6732976281560826, "grad_norm": 0.8953769207000732, "learning_rate": 9.192740118517935e-05, "loss": 0.6013946056365966, "step": 880, "token_acc": 0.8297914862632751 }, { "epoch": 0.677123182861515, "grad_norm": 0.9411488771438599, "learning_rate": 9.181212342395764e-05, "loss": 0.521054458618164, "step": 885, "token_acc": 0.8486282229423523 }, { "epoch": 0.6809487375669472, "grad_norm": 0.9547863602638245, "learning_rate": 9.169610175000812e-05, "loss": 0.5880234718322754, "step": 890, "token_acc": 0.8322908878326416 }, { "epoch": 0.6847742922723795, "grad_norm": 1.0470699071884155, "learning_rate": 9.157933822756459e-05, "loss": 0.6081759452819824, "step": 895, "token_acc": 0.8250705003738403 }, { "epoch": 0.6885998469778117, "grad_norm": 0.9556779861450195, "learning_rate": 9.146183493405975e-05, "loss": 0.6601164817810059, "step": 900, "token_acc": 0.8116152286529541 }, { "epoch": 0.6885998469778117, "eval_loss": 0.5903816223144531, "eval_runtime": 7.6904, "eval_samples_per_second": 13.523, "eval_steps_per_second": 1.69, "eval_token_acc": 0.8289057016372681, "step": 900 }, { "epoch": 0.6924254016832441, "grad_norm": 1.1069297790527344, "learning_rate": 9.13435939600881e-05, "loss": 0.6385367393493653, "step": 905, "token_acc": 0.8162096738815308 }, { "epoch": 0.6962509563886764, "grad_norm": 0.9318839311599731, "learning_rate": 9.12246174093688e-05, "loss": 0.604517650604248, "step": 910, "token_acc": 0.82686847448349 }, { "epoch": 0.7000765110941086, "grad_norm": 0.8273342251777649, "learning_rate": 9.110490739870824e-05, "loss": 0.6841697216033935, "step": 915, "token_acc": 0.8044203519821167 }, { "epoch": 0.703902065799541, "grad_norm": 0.8293759822845459, "learning_rate": 9.098446605796239e-05, "loss": 0.5717193603515625, "step": 920, "token_acc": 0.8351298570632935 }, { "epoch": 0.7077276205049732, "grad_norm": 7.753383636474609, "learning_rate": 9.086329552999891e-05, "loss": 0.5882965564727783, "step": 925, "token_acc": 0.8285040259361267 }, { "epoch": 0.7115531752104055, "grad_norm": 0.9893306493759155, "learning_rate": 9.074139797065897e-05, "loss": 0.648917293548584, "step": 930, "token_acc": 0.8116658329963684 }, { "epoch": 0.7153787299158378, "grad_norm": 0.902746856212616, "learning_rate": 9.061877554871896e-05, "loss": 0.6094418525695801, "step": 935, "token_acc": 0.8259324431419373 }, { "epoch": 0.7192042846212701, "grad_norm": 0.9152299165725708, "learning_rate": 9.049543044585187e-05, "loss": 0.6678308486938477, "step": 940, "token_acc": 0.816949725151062 }, { "epoch": 0.7230298393267024, "grad_norm": 1.0613242387771606, "learning_rate": 9.03713648565885e-05, "loss": 0.6197181701660156, "step": 945, "token_acc": 0.8243659138679504 }, { "epoch": 0.7268553940321346, "grad_norm": 0.7965312600135803, "learning_rate": 9.024658098827838e-05, "loss": 0.6047243118286133, "step": 950, "token_acc": 0.8313871622085571 }, { "epoch": 0.7268553940321346, "eval_loss": 0.587164044380188, "eval_runtime": 7.7893, "eval_samples_per_second": 13.352, "eval_steps_per_second": 1.669, "eval_token_acc": 0.8293668031692505, "step": 950 }, { "epoch": 0.730680948737567, "grad_norm": 0.8924623131752014, "learning_rate": 9.012108106105048e-05, "loss": 0.5776640892028808, "step": 955, "token_acc": 0.8302121758460999 }, { "epoch": 0.7345065034429993, "grad_norm": 1.0438350439071655, "learning_rate": 8.99948673077738e-05, "loss": 0.5650456428527832, "step": 960, "token_acc": 0.8433432579040527 }, { "epoch": 0.7383320581484315, "grad_norm": 0.8841288685798645, "learning_rate": 8.986794197401754e-05, "loss": 0.5597739219665527, "step": 965, "token_acc": 0.8350304365158081 }, { "epoch": 0.7421576128538638, "grad_norm": 0.9303543567657471, "learning_rate": 8.974030731801127e-05, "loss": 0.6170159816741944, "step": 970, "token_acc": 0.8251381516456604 }, { "epoch": 0.7459831675592961, "grad_norm": 1.05469810962677, "learning_rate": 8.961196561060454e-05, "loss": 0.61129789352417, "step": 975, "token_acc": 0.8258439302444458 }, { "epoch": 0.7498087222647284, "grad_norm": 0.8528873920440674, "learning_rate": 8.948291913522677e-05, "loss": 0.642275619506836, "step": 980, "token_acc": 0.8284429907798767 }, { "epoch": 0.7536342769701607, "grad_norm": 0.7755897641181946, "learning_rate": 8.935317018784637e-05, "loss": 0.5369032859802246, "step": 985, "token_acc": 0.8431283235549927 }, { "epoch": 0.7574598316755929, "grad_norm": 0.8636773228645325, "learning_rate": 8.922272107693e-05, "loss": 0.5884841442108154, "step": 990, "token_acc": 0.830573558807373 }, { "epoch": 0.7612853863810253, "grad_norm": 0.8464745283126831, "learning_rate": 8.90915741234015e-05, "loss": 0.5174911022186279, "step": 995, "token_acc": 0.8450327515602112 }, { "epoch": 0.7651109410864575, "grad_norm": 0.8121261596679688, "learning_rate": 8.895973166060058e-05, "loss": 0.5794853687286377, "step": 1000, "token_acc": 0.8360881209373474 }, { "epoch": 0.7651109410864575, "eval_loss": 0.5729076862335205, "eval_runtime": 7.9584, "eval_samples_per_second": 13.068, "eval_steps_per_second": 1.634, "eval_token_acc": 0.8330559730529785, "step": 1000 }, { "epoch": 0.7689364957918898, "grad_norm": 0.8082830309867859, "learning_rate": 8.882719603424133e-05, "loss": 0.6191754341125488, "step": 1005, "token_acc": 0.8256863355636597 }, { "epoch": 0.7727620504973222, "grad_norm": 0.8163895010948181, "learning_rate": 8.86939696023704e-05, "loss": 0.5695658683776855, "step": 1010, "token_acc": 0.8331784605979919 }, { "epoch": 0.7765876052027544, "grad_norm": 0.8397212624549866, "learning_rate": 8.856005473532519e-05, "loss": 0.5332405090332031, "step": 1015, "token_acc": 0.8452962636947632 }, { "epoch": 0.7804131599081867, "grad_norm": 0.8272839188575745, "learning_rate": 8.842545381569155e-05, "loss": 0.5343279838562012, "step": 1020, "token_acc": 0.8402997255325317 }, { "epoch": 0.7842387146136189, "grad_norm": 0.8609519004821777, "learning_rate": 8.829016923826144e-05, "loss": 0.5459603309631348, "step": 1025, "token_acc": 0.8402543067932129 }, { "epoch": 0.7880642693190513, "grad_norm": 0.8439111113548279, "learning_rate": 8.815420340999033e-05, "loss": 0.5824572563171386, "step": 1030, "token_acc": 0.8306134343147278 }, { "epoch": 0.7918898240244836, "grad_norm": 0.8207530975341797, "learning_rate": 8.801755874995437e-05, "loss": 0.5932113647460937, "step": 1035, "token_acc": 0.8294033408164978 }, { "epoch": 0.7957153787299158, "grad_norm": 0.9178765416145325, "learning_rate": 8.788023768930732e-05, "loss": 0.5900128364562989, "step": 1040, "token_acc": 0.8334224820137024 }, { "epoch": 0.7995409334353482, "grad_norm": 0.7986139059066772, "learning_rate": 8.774224267123734e-05, "loss": 0.6000078678131103, "step": 1045, "token_acc": 0.8272825479507446 }, { "epoch": 0.8033664881407804, "grad_norm": 0.8349852561950684, "learning_rate": 8.760357615092351e-05, "loss": 0.5280231475830078, "step": 1050, "token_acc": 0.8440104722976685 }, { "epoch": 0.8033664881407804, "eval_loss": 0.574630856513977, "eval_runtime": 7.7226, "eval_samples_per_second": 13.467, "eval_steps_per_second": 1.683, "eval_token_acc": 0.833396852016449, "step": 1050 }, { "epoch": 0.8071920428462127, "grad_norm": 0.7667945027351379, "learning_rate": 8.746424059549213e-05, "loss": 0.5487701416015625, "step": 1055, "token_acc": 0.8400689959526062 }, { "epoch": 0.811017597551645, "grad_norm": 0.9147979617118835, "learning_rate": 8.732423848397284e-05, "loss": 0.5697606563568115, "step": 1060, "token_acc": 0.8328049182891846 }, { "epoch": 0.8148431522570773, "grad_norm": 0.8798291087150574, "learning_rate": 8.718357230725449e-05, "loss": 0.5843188285827636, "step": 1065, "token_acc": 0.8351316452026367 }, { "epoch": 0.8186687069625096, "grad_norm": 0.9299157857894897, "learning_rate": 8.704224456804087e-05, "loss": 0.6090686798095704, "step": 1070, "token_acc": 0.8255612850189209 }, { "epoch": 0.8224942616679418, "grad_norm": 0.8285570740699768, "learning_rate": 8.690025778080613e-05, "loss": 0.5678855419158936, "step": 1075, "token_acc": 0.834744930267334 }, { "epoch": 0.8263198163733741, "grad_norm": 1.0449912548065186, "learning_rate": 8.67576144717501e-05, "loss": 0.5510326385498047, "step": 1080, "token_acc": 0.8414307832717896 }, { "epoch": 0.8301453710788065, "grad_norm": 0.7922863364219666, "learning_rate": 8.661431717875328e-05, "loss": 0.5484563827514648, "step": 1085, "token_acc": 0.8401945233345032 }, { "epoch": 0.8339709257842387, "grad_norm": 1.0209932327270508, "learning_rate": 8.647036845133172e-05, "loss": 0.5764856338500977, "step": 1090, "token_acc": 0.8333871960639954 }, { "epoch": 0.837796480489671, "grad_norm": 0.8326112627983093, "learning_rate": 8.632577085059168e-05, "loss": 0.6004890441894531, "step": 1095, "token_acc": 0.827037513256073 }, { "epoch": 0.8416220351951033, "grad_norm": 0.7816240787506104, "learning_rate": 8.618052694918399e-05, "loss": 0.5333565711975098, "step": 1100, "token_acc": 0.8430129885673523 }, { "epoch": 0.8416220351951033, "eval_loss": 0.5720469951629639, "eval_runtime": 7.8984, "eval_samples_per_second": 13.167, "eval_steps_per_second": 1.646, "eval_token_acc": 0.8314921259880066, "step": 1100 }, { "epoch": 0.8454475899005356, "grad_norm": 0.9619238376617432, "learning_rate": 8.603463933125842e-05, "loss": 0.5509546756744385, "step": 1105, "token_acc": 0.8384957313537598 }, { "epoch": 0.8492731446059678, "grad_norm": 0.9528924822807312, "learning_rate": 8.588811059241755e-05, "loss": 0.6007543563842773, "step": 1110, "token_acc": 0.8273714780807495 }, { "epoch": 0.8530986993114001, "grad_norm": 0.812016487121582, "learning_rate": 8.574094333967064e-05, "loss": 0.5877734661102295, "step": 1115, "token_acc": 0.8291584253311157 }, { "epoch": 0.8569242540168325, "grad_norm": 1.103339433670044, "learning_rate": 8.559314019138727e-05, "loss": 0.6196231842041016, "step": 1120, "token_acc": 0.8281660676002502 }, { "epoch": 0.8607498087222647, "grad_norm": 0.9961858987808228, "learning_rate": 8.544470377725078e-05, "loss": 0.571223258972168, "step": 1125, "token_acc": 0.8321356177330017 }, { "epoch": 0.864575363427697, "grad_norm": 0.8015458583831787, "learning_rate": 8.529563673821141e-05, "loss": 0.538951301574707, "step": 1130, "token_acc": 0.8429505228996277 }, { "epoch": 0.8684009181331293, "grad_norm": 0.8478720784187317, "learning_rate": 8.514594172643934e-05, "loss": 0.5572677612304687, "step": 1135, "token_acc": 0.8356977105140686 }, { "epoch": 0.8722264728385616, "grad_norm": 0.814361572265625, "learning_rate": 8.499562140527754e-05, "loss": 0.5883401870727539, "step": 1140, "token_acc": 0.8291968107223511 }, { "epoch": 0.8760520275439939, "grad_norm": 0.8049572706222534, "learning_rate": 8.484467844919437e-05, "loss": 0.5637226104736328, "step": 1145, "token_acc": 0.8390661478042603 }, { "epoch": 0.8798775822494261, "grad_norm": 0.749894917011261, "learning_rate": 8.469311554373594e-05, "loss": 0.4973104000091553, "step": 1150, "token_acc": 0.8528492450714111 }, { "epoch": 0.8798775822494261, "eval_loss": 0.564576268196106, "eval_runtime": 7.7633, "eval_samples_per_second": 13.396, "eval_steps_per_second": 1.675, "eval_token_acc": 0.835151195526123, "step": 1150 }, { "epoch": 0.8837031369548585, "grad_norm": 0.9036749005317688, "learning_rate": 8.454093538547838e-05, "loss": 0.5535676956176758, "step": 1155, "token_acc": 0.8383986949920654 }, { "epoch": 0.8875286916602907, "grad_norm": 0.7430348992347717, "learning_rate": 8.438814068197988e-05, "loss": 0.557097339630127, "step": 1160, "token_acc": 0.8430325388908386 }, { "epoch": 0.891354246365723, "grad_norm": 0.9356522560119629, "learning_rate": 8.423473415173247e-05, "loss": 0.5787965774536132, "step": 1165, "token_acc": 0.8321569561958313 }, { "epoch": 0.8951798010711554, "grad_norm": 0.7668983340263367, "learning_rate": 8.40807185241137e-05, "loss": 0.5303655624389648, "step": 1170, "token_acc": 0.8440219163894653 }, { "epoch": 0.8990053557765876, "grad_norm": 0.7720690965652466, "learning_rate": 8.392609653933803e-05, "loss": 0.5396030426025391, "step": 1175, "token_acc": 0.8430536985397339 }, { "epoch": 0.9028309104820199, "grad_norm": 0.7427228689193726, "learning_rate": 8.377087094840813e-05, "loss": 0.5650552749633789, "step": 1180, "token_acc": 0.8388790488243103 }, { "epoch": 0.9066564651874521, "grad_norm": 0.8698520660400391, "learning_rate": 8.361504451306585e-05, "loss": 0.5175793647766114, "step": 1185, "token_acc": 0.8486889004707336 }, { "epoch": 0.9104820198928845, "grad_norm": 0.838016927242279, "learning_rate": 8.345862000574321e-05, "loss": 0.5568198204040528, "step": 1190, "token_acc": 0.8362753391265869 }, { "epoch": 0.9143075745983168, "grad_norm": 0.7980285286903381, "learning_rate": 8.330160020951299e-05, "loss": 0.5795284748077393, "step": 1195, "token_acc": 0.8336633443832397 }, { "epoch": 0.918133129303749, "grad_norm": 0.7379786968231201, "learning_rate": 8.314398791803916e-05, "loss": 0.5594221115112304, "step": 1200, "token_acc": 0.8377372026443481 }, { "epoch": 0.918133129303749, "eval_loss": 0.5564058423042297, "eval_runtime": 7.7456, "eval_samples_per_second": 13.427, "eval_steps_per_second": 1.678, "eval_token_acc": 0.8375070095062256, "step": 1200 }, { "epoch": 0.9219586840091814, "grad_norm": 0.8150419592857361, "learning_rate": 8.298578593552737e-05, "loss": 0.5221155166625977, "step": 1205, "token_acc": 0.8457277417182922 }, { "epoch": 0.9257842387146136, "grad_norm": 0.9086570739746094, "learning_rate": 8.28269970766748e-05, "loss": 0.574681568145752, "step": 1210, "token_acc": 0.8327599763870239 }, { "epoch": 0.9296097934200459, "grad_norm": 0.8389135599136353, "learning_rate": 8.26676241666203e-05, "loss": 0.5882039070129395, "step": 1215, "token_acc": 0.8281732797622681 }, { "epoch": 0.9334353481254782, "grad_norm": 1.0141870975494385, "learning_rate": 8.250767004089399e-05, "loss": 0.5588771820068359, "step": 1220, "token_acc": 0.8358601331710815 }, { "epoch": 0.9372609028309105, "grad_norm": 0.8374904990196228, "learning_rate": 8.23471375453669e-05, "loss": 0.5152300834655762, "step": 1225, "token_acc": 0.8489376902580261 }, { "epoch": 0.9410864575363428, "grad_norm": 0.8244453072547913, "learning_rate": 8.21860295362003e-05, "loss": 0.500080680847168, "step": 1230, "token_acc": 0.8521796464920044 }, { "epoch": 0.944912012241775, "grad_norm": 0.9917334318161011, "learning_rate": 8.20243488797948e-05, "loss": 0.5609046459197998, "step": 1235, "token_acc": 0.8397351503372192 }, { "epoch": 0.9487375669472073, "grad_norm": 1.4528796672821045, "learning_rate": 8.186209845273954e-05, "loss": 0.6106361389160156, "step": 1240, "token_acc": 0.8287570476531982 }, { "epoch": 0.9525631216526397, "grad_norm": 0.8477284908294678, "learning_rate": 8.169928114176084e-05, "loss": 0.534299659729004, "step": 1245, "token_acc": 0.8459932208061218 }, { "epoch": 0.9563886763580719, "grad_norm": 0.9785248041152954, "learning_rate": 8.153589984367091e-05, "loss": 0.5453691959381104, "step": 1250, "token_acc": 0.8423656821250916 }, { "epoch": 0.9563886763580719, "eval_loss": 0.5528830885887146, "eval_runtime": 8.7628, "eval_samples_per_second": 11.868, "eval_steps_per_second": 1.484, "eval_token_acc": 0.8377977609634399, "step": 1250 }, { "epoch": 0.9602142310635042, "grad_norm": 0.7743374705314636, "learning_rate": 8.137195746531635e-05, "loss": 0.5649035453796387, "step": 1255, "token_acc": 0.83652263879776 }, { "epoch": 0.9640397857689365, "grad_norm": 0.9111794829368591, "learning_rate": 8.120745692352627e-05, "loss": 0.5429101943969726, "step": 1260, "token_acc": 0.8409203886985779 }, { "epoch": 0.9678653404743688, "grad_norm": 0.8705430030822754, "learning_rate": 8.104240114506065e-05, "loss": 0.5348100185394287, "step": 1265, "token_acc": 0.8432644009590149 }, { "epoch": 0.9716908951798011, "grad_norm": 0.7576097249984741, "learning_rate": 8.087679306655804e-05, "loss": 0.5683703422546387, "step": 1270, "token_acc": 0.836378276348114 }, { "epoch": 0.9755164498852333, "grad_norm": 1.1635630130767822, "learning_rate": 8.07106356344834e-05, "loss": 0.6346898078918457, "step": 1275, "token_acc": 0.8285390734672546 }, { "epoch": 0.9793420045906657, "grad_norm": 0.827690601348877, "learning_rate": 8.054393180507572e-05, "loss": 0.5661238193511963, "step": 1280, "token_acc": 0.8387032747268677 }, { "epoch": 0.9831675592960979, "grad_norm": 0.888037383556366, "learning_rate": 8.037668454429534e-05, "loss": 0.5784870624542237, "step": 1285, "token_acc": 0.8306419849395752 }, { "epoch": 0.9869931140015302, "grad_norm": 0.7650582790374756, "learning_rate": 8.020889682777127e-05, "loss": 0.5594500064849853, "step": 1290, "token_acc": 0.8358885645866394 }, { "epoch": 0.9908186687069626, "grad_norm": 0.8132854104042053, "learning_rate": 8.004057164074814e-05, "loss": 0.5590912818908691, "step": 1295, "token_acc": 0.8387227654457092 }, { "epoch": 0.9946442234123948, "grad_norm": 0.8819040656089783, "learning_rate": 7.987171197803315e-05, "loss": 0.5425111770629882, "step": 1300, "token_acc": 0.8366984128952026 }, { "epoch": 0.9946442234123948, "eval_loss": 0.5410341024398804, "eval_runtime": 7.8851, "eval_samples_per_second": 13.189, "eval_steps_per_second": 1.649, "eval_token_acc": 0.8402237296104431, "step": 1300 }, { "epoch": 0.9984697781178271, "grad_norm": 0.7759367227554321, "learning_rate": 7.970232084394282e-05, "loss": 0.4794795989990234, "step": 1305, "token_acc": 0.8576377034187317 }, { "epoch": 1.0022953328232593, "grad_norm": 0.7615346908569336, "learning_rate": 7.953240125224948e-05, "loss": 0.4416775703430176, "step": 1310, "token_acc": 0.8654638528823853 }, { "epoch": 1.0061208875286916, "grad_norm": 0.7918492555618286, "learning_rate": 7.936195622612767e-05, "loss": 0.37592229843139646, "step": 1315, "token_acc": 0.8773406147956848 }, { "epoch": 1.009946442234124, "grad_norm": 0.717467725276947, "learning_rate": 7.919098879810036e-05, "loss": 0.4267716407775879, "step": 1320, "token_acc": 0.8671300411224365 }, { "epoch": 1.0137719969395562, "grad_norm": 0.7892487645149231, "learning_rate": 7.901950200998493e-05, "loss": 0.382064151763916, "step": 1325, "token_acc": 0.8785242438316345 }, { "epoch": 1.0175975516449884, "grad_norm": 0.7296363711357117, "learning_rate": 7.884749891283922e-05, "loss": 0.36800203323364256, "step": 1330, "token_acc": 0.881615400314331 }, { "epoch": 1.0214231063504209, "grad_norm": 1.087638258934021, "learning_rate": 7.867498256690704e-05, "loss": 0.37799820899963377, "step": 1335, "token_acc": 0.877220094203949 }, { "epoch": 1.025248661055853, "grad_norm": 0.7339928150177002, "learning_rate": 7.850195604156385e-05, "loss": 0.37110204696655275, "step": 1340, "token_acc": 0.884996771812439 }, { "epoch": 1.0290742157612853, "grad_norm": 0.8464434742927551, "learning_rate": 7.832842241526212e-05, "loss": 0.3805660009384155, "step": 1345, "token_acc": 0.879789412021637 }, { "epoch": 1.0328997704667178, "grad_norm": 0.689896821975708, "learning_rate": 7.815438477547655e-05, "loss": 0.3583992481231689, "step": 1350, "token_acc": 0.8869645595550537 }, { "epoch": 1.0328997704667178, "eval_loss": 0.5596266984939575, "eval_runtime": 7.7953, "eval_samples_per_second": 13.341, "eval_steps_per_second": 1.668, "eval_token_acc": 0.8413565754890442, "step": 1350 }, { "epoch": 1.03672532517215, "grad_norm": 0.6902993321418762, "learning_rate": 7.797984621864916e-05, "loss": 0.42625932693481444, "step": 1355, "token_acc": 0.8614287376403809 }, { "epoch": 1.0405508798775822, "grad_norm": 0.7562316060066223, "learning_rate": 7.780480985013413e-05, "loss": 0.3689578533172607, "step": 1360, "token_acc": 0.8820473551750183 }, { "epoch": 1.0443764345830144, "grad_norm": 0.6517492532730103, "learning_rate": 7.762927878414267e-05, "loss": 0.32921748161315917, "step": 1365, "token_acc": 0.8926072120666504 }, { "epoch": 1.0482019892884469, "grad_norm": 0.8397619724273682, "learning_rate": 7.745325614368755e-05, "loss": 0.3830822229385376, "step": 1370, "token_acc": 0.8756515383720398 }, { "epoch": 1.052027543993879, "grad_norm": 0.7649819254875183, "learning_rate": 7.727674506052743e-05, "loss": 0.37806334495544436, "step": 1375, "token_acc": 0.8804787993431091 }, { "epoch": 1.0558530986993113, "grad_norm": 0.7365129590034485, "learning_rate": 7.709974867511138e-05, "loss": 0.3349802017211914, "step": 1380, "token_acc": 0.8934342861175537 }, { "epoch": 1.0596786534047438, "grad_norm": 0.884164571762085, "learning_rate": 7.692227013652278e-05, "loss": 0.36524980068206786, "step": 1385, "token_acc": 0.8806947469711304 }, { "epoch": 1.063504208110176, "grad_norm": 0.6860577464103699, "learning_rate": 7.674431260242338e-05, "loss": 0.367877721786499, "step": 1390, "token_acc": 0.8842624425888062 }, { "epoch": 1.0673297628156082, "grad_norm": 0.7009398937225342, "learning_rate": 7.656587923899718e-05, "loss": 0.3564207315444946, "step": 1395, "token_acc": 0.8837472200393677 }, { "epoch": 1.0711553175210407, "grad_norm": 0.7540706396102905, "learning_rate": 7.638697322089398e-05, "loss": 0.3640351057052612, "step": 1400, "token_acc": 0.8847005367279053 }, { "epoch": 1.0711553175210407, "eval_loss": 0.5507253408432007, "eval_runtime": 7.7117, "eval_samples_per_second": 13.486, "eval_steps_per_second": 1.686, "eval_token_acc": 0.8438527584075928, "step": 1400 }, { "epoch": 1.0749808722264729, "grad_norm": 0.6863798499107361, "learning_rate": 7.620759773117299e-05, "loss": 0.3779132604598999, "step": 1405, "token_acc": 0.8826145529747009 }, { "epoch": 1.078806426931905, "grad_norm": 0.7733192443847656, "learning_rate": 7.602775596124611e-05, "loss": 0.3633275032043457, "step": 1410, "token_acc": 0.886398196220398 }, { "epoch": 1.0826319816373373, "grad_norm": 0.7949317693710327, "learning_rate": 7.584745111082127e-05, "loss": 0.3376323699951172, "step": 1415, "token_acc": 0.8887669444084167 }, { "epoch": 1.0864575363427698, "grad_norm": 0.6832326650619507, "learning_rate": 7.566668638784542e-05, "loss": 0.33144965171813967, "step": 1420, "token_acc": 0.8916584849357605 }, { "epoch": 1.090283091048202, "grad_norm": 0.8551044464111328, "learning_rate": 7.548546500844742e-05, "loss": 0.3287867546081543, "step": 1425, "token_acc": 0.8930348753929138 }, { "epoch": 1.0941086457536342, "grad_norm": 0.7423316240310669, "learning_rate": 7.530379019688092e-05, "loss": 0.3902039289474487, "step": 1430, "token_acc": 0.8757656812667847 }, { "epoch": 1.0979342004590666, "grad_norm": 0.8404172658920288, "learning_rate": 7.51216651854669e-05, "loss": 0.390373969078064, "step": 1435, "token_acc": 0.8776587843894958 }, { "epoch": 1.1017597551644989, "grad_norm": 0.8963853120803833, "learning_rate": 7.493909321453625e-05, "loss": 0.4068464279174805, "step": 1440, "token_acc": 0.8700478076934814 }, { "epoch": 1.105585309869931, "grad_norm": 0.7311558723449707, "learning_rate": 7.475607753237202e-05, "loss": 0.3884909629821777, "step": 1445, "token_acc": 0.8745863437652588 }, { "epoch": 1.1094108645753635, "grad_norm": 0.7590047121047974, "learning_rate": 7.457262139515171e-05, "loss": 0.3895232677459717, "step": 1450, "token_acc": 0.8725248575210571 }, { "epoch": 1.1094108645753635, "eval_loss": 0.5504098534584045, "eval_runtime": 7.7559, "eval_samples_per_second": 13.409, "eval_steps_per_second": 1.676, "eval_token_acc": 0.8443038463592529, "step": 1450 }, { "epoch": 1.1132364192807958, "grad_norm": 0.882554292678833, "learning_rate": 7.438872806688934e-05, "loss": 0.40759758949279784, "step": 1455, "token_acc": 0.8712476491928101 }, { "epoch": 1.117061973986228, "grad_norm": 0.6808732748031616, "learning_rate": 7.420440081937728e-05, "loss": 0.3652071237564087, "step": 1460, "token_acc": 0.8835034370422363 }, { "epoch": 1.1208875286916602, "grad_norm": 0.837759256362915, "learning_rate": 7.401964293212809e-05, "loss": 0.409121036529541, "step": 1465, "token_acc": 0.8712127208709717 }, { "epoch": 1.1247130833970926, "grad_norm": 0.6652865409851074, "learning_rate": 7.383445769231627e-05, "loss": 0.3703787803649902, "step": 1470, "token_acc": 0.8831153512001038 }, { "epoch": 1.1285386381025249, "grad_norm": 0.8179388642311096, "learning_rate": 7.364884839471964e-05, "loss": 0.39147076606750486, "step": 1475, "token_acc": 0.8752105236053467 }, { "epoch": 1.132364192807957, "grad_norm": 0.719514012336731, "learning_rate": 7.346281834166075e-05, "loss": 0.37967238426208494, "step": 1480, "token_acc": 0.8796840906143188 }, { "epoch": 1.1361897475133895, "grad_norm": 0.9179552793502808, "learning_rate": 7.327637084294817e-05, "loss": 0.3995789051055908, "step": 1485, "token_acc": 0.8751766085624695 }, { "epoch": 1.1400153022188217, "grad_norm": 0.7656182050704956, "learning_rate": 7.308950921581756e-05, "loss": 0.34888639450073244, "step": 1490, "token_acc": 0.89056795835495 }, { "epoch": 1.143840856924254, "grad_norm": 0.7309355735778809, "learning_rate": 7.290223678487272e-05, "loss": 0.39315025806427, "step": 1495, "token_acc": 0.876833438873291 }, { "epoch": 1.1476664116296864, "grad_norm": 0.7618235349655151, "learning_rate": 7.27145568820263e-05, "loss": 0.35439176559448243, "step": 1500, "token_acc": 0.8836838006973267 }, { "epoch": 1.1476664116296864, "eval_loss": 0.5430108904838562, "eval_runtime": 7.6873, "eval_samples_per_second": 13.529, "eval_steps_per_second": 1.691, "eval_token_acc": 0.8448953628540039, "step": 1500 }, { "epoch": 1.1514919663351186, "grad_norm": 0.8058356046676636, "learning_rate": 7.25264728464407e-05, "loss": 0.3466159820556641, "step": 1505, "token_acc": 0.8869272470474243 }, { "epoch": 1.1553175210405509, "grad_norm": 0.7806113362312317, "learning_rate": 7.233798802446847e-05, "loss": 0.40935721397399905, "step": 1510, "token_acc": 0.8709314465522766 }, { "epoch": 1.159143075745983, "grad_norm": 0.8264714479446411, "learning_rate": 7.214910576959297e-05, "loss": 0.38201849460601806, "step": 1515, "token_acc": 0.8780457973480225 }, { "epoch": 1.1629686304514155, "grad_norm": 0.6713389158248901, "learning_rate": 7.195982944236851e-05, "loss": 0.3252051115036011, "step": 1520, "token_acc": 0.892856240272522 }, { "epoch": 1.1667941851568477, "grad_norm": 0.7945072650909424, "learning_rate": 7.177016241036075e-05, "loss": 0.35387892723083497, "step": 1525, "token_acc": 0.8838560581207275 }, { "epoch": 1.17061973986228, "grad_norm": 0.8310626745223999, "learning_rate": 7.15801080480866e-05, "loss": 0.3746853590011597, "step": 1530, "token_acc": 0.8799676299095154 }, { "epoch": 1.1744452945677124, "grad_norm": 0.9108403325080872, "learning_rate": 7.138966973695431e-05, "loss": 0.36667909622192385, "step": 1535, "token_acc": 0.8820632100105286 }, { "epoch": 1.1782708492731446, "grad_norm": 0.7420673966407776, "learning_rate": 7.119885086520329e-05, "loss": 0.36235547065734863, "step": 1540, "token_acc": 0.8849785923957825 }, { "epoch": 1.1820964039785768, "grad_norm": 0.6693369150161743, "learning_rate": 7.100765482784377e-05, "loss": 0.3710158824920654, "step": 1545, "token_acc": 0.8811267614364624 }, { "epoch": 1.185921958684009, "grad_norm": 0.7249651551246643, "learning_rate": 7.081608502659646e-05, "loss": 0.3993852615356445, "step": 1550, "token_acc": 0.8718493580818176 }, { "epoch": 1.185921958684009, "eval_loss": 0.5383990406990051, "eval_runtime": 7.5793, "eval_samples_per_second": 13.722, "eval_steps_per_second": 1.715, "eval_token_acc": 0.8461685180664062, "step": 1550 }, { "epoch": 1.1897475133894415, "grad_norm": 0.9157434105873108, "learning_rate": 7.062414486983197e-05, "loss": 0.3987370491027832, "step": 1555, "token_acc": 0.8729732036590576 }, { "epoch": 1.1935730680948737, "grad_norm": 0.6402376890182495, "learning_rate": 7.043183777251024e-05, "loss": 0.2903183698654175, "step": 1560, "token_acc": 0.9057296514511108 }, { "epoch": 1.197398622800306, "grad_norm": 0.7679566144943237, "learning_rate": 7.023916715611969e-05, "loss": 0.4904749870300293, "step": 1565, "token_acc": 0.8663699626922607 }, { "epoch": 1.2012241775057384, "grad_norm": 0.8699092864990234, "learning_rate": 7.004613644861647e-05, "loss": 0.4231747627258301, "step": 1570, "token_acc": 0.8677194714546204 }, { "epoch": 1.2050497322111706, "grad_norm": 0.6792256832122803, "learning_rate": 6.985274908436333e-05, "loss": 0.44817123413085935, "step": 1575, "token_acc": 0.8659628629684448 }, { "epoch": 1.2088752869166028, "grad_norm": 0.7418417930603027, "learning_rate": 6.965900850406859e-05, "loss": 0.33240585327148436, "step": 1580, "token_acc": 0.8937970399856567 }, { "epoch": 1.2127008416220353, "grad_norm": 0.8835020065307617, "learning_rate": 6.946491815472496e-05, "loss": 0.3884410381317139, "step": 1585, "token_acc": 0.876690149307251 }, { "epoch": 1.2165263963274675, "grad_norm": 0.9086595177650452, "learning_rate": 6.927048148954812e-05, "loss": 0.410748291015625, "step": 1590, "token_acc": 0.8735622763633728 }, { "epoch": 1.2203519510328997, "grad_norm": 0.7838605642318726, "learning_rate": 6.907570196791538e-05, "loss": 0.3603389739990234, "step": 1595, "token_acc": 0.8829374313354492 }, { "epoch": 1.2241775057383322, "grad_norm": 0.7454732060432434, "learning_rate": 6.888058305530406e-05, "loss": 0.37654249668121337, "step": 1600, "token_acc": 0.8782923817634583 }, { "epoch": 1.2241775057383322, "eval_loss": 0.5343810319900513, "eval_runtime": 7.6236, "eval_samples_per_second": 13.642, "eval_steps_per_second": 1.705, "eval_token_acc": 0.8475719690322876, "step": 1600 }, { "epoch": 1.2280030604437644, "grad_norm": 0.7611352801322937, "learning_rate": 6.868512822322981e-05, "loss": 0.38566131591796876, "step": 1605, "token_acc": 0.8766804337501526 }, { "epoch": 1.2318286151491966, "grad_norm": 0.8874756693840027, "learning_rate": 6.848934094918498e-05, "loss": 0.38291475772857664, "step": 1610, "token_acc": 0.87657630443573 }, { "epoch": 1.2356541698546288, "grad_norm": 0.7193310260772705, "learning_rate": 6.829322471657658e-05, "loss": 0.3452467441558838, "step": 1615, "token_acc": 0.8881570100784302 }, { "epoch": 1.2394797245600613, "grad_norm": 0.661790668964386, "learning_rate": 6.809678301466443e-05, "loss": 0.3452208757400513, "step": 1620, "token_acc": 0.8885095715522766 }, { "epoch": 1.2433052792654935, "grad_norm": 0.8313160538673401, "learning_rate": 6.790001933849899e-05, "loss": 0.39090492725372317, "step": 1625, "token_acc": 0.8772667646408081 }, { "epoch": 1.2471308339709257, "grad_norm": 0.7543197870254517, "learning_rate": 6.770293718885928e-05, "loss": 0.37844099998474123, "step": 1630, "token_acc": 0.8773866295814514 }, { "epoch": 1.2509563886763582, "grad_norm": 0.7187685370445251, "learning_rate": 6.750554007219047e-05, "loss": 0.37274966239929197, "step": 1635, "token_acc": 0.8813634514808655 }, { "epoch": 1.2547819433817904, "grad_norm": 0.7216220498085022, "learning_rate": 6.730783150054164e-05, "loss": 0.40465946197509767, "step": 1640, "token_acc": 0.8722350597381592 }, { "epoch": 1.2586074980872226, "grad_norm": 0.808250367641449, "learning_rate": 6.71098149915031e-05, "loss": 0.39015932083129884, "step": 1645, "token_acc": 0.8755351901054382 }, { "epoch": 1.2624330527926548, "grad_norm": 0.6570851802825928, "learning_rate": 6.691149406814403e-05, "loss": 0.33088486194610595, "step": 1650, "token_acc": 0.8907855153083801 }, { "epoch": 1.2624330527926548, "eval_loss": 0.5374127626419067, "eval_runtime": 7.8026, "eval_samples_per_second": 13.329, "eval_steps_per_second": 1.666, "eval_token_acc": 0.8472611904144287, "step": 1650 }, { "epoch": 1.2662586074980873, "grad_norm": 0.6985551714897156, "learning_rate": 6.67128722589496e-05, "loss": 0.3755918502807617, "step": 1655, "token_acc": 0.8816916346549988 }, { "epoch": 1.2700841622035195, "grad_norm": 0.7275698781013489, "learning_rate": 6.651395309775837e-05, "loss": 0.3765554428100586, "step": 1660, "token_acc": 0.8811103701591492 }, { "epoch": 1.2739097169089517, "grad_norm": 0.729633092880249, "learning_rate": 6.631474012369921e-05, "loss": 0.3696659803390503, "step": 1665, "token_acc": 0.8816789984703064 }, { "epoch": 1.2777352716143842, "grad_norm": 0.7620216012001038, "learning_rate": 6.611523688112858e-05, "loss": 0.35426578521728513, "step": 1670, "token_acc": 0.8883428573608398 }, { "epoch": 1.2815608263198164, "grad_norm": 0.8159366846084595, "learning_rate": 6.591544691956723e-05, "loss": 0.38610110282897947, "step": 1675, "token_acc": 0.8776164054870605 }, { "epoch": 1.2853863810252486, "grad_norm": 0.8567126989364624, "learning_rate": 6.571537379363719e-05, "loss": 0.4222766399383545, "step": 1680, "token_acc": 0.8723132610321045 }, { "epoch": 1.2892119357306808, "grad_norm": 0.8297275304794312, "learning_rate": 6.551502106299851e-05, "loss": 0.37399892807006835, "step": 1685, "token_acc": 0.8821731209754944 }, { "epoch": 1.2930374904361133, "grad_norm": 0.6843409538269043, "learning_rate": 6.531439229228591e-05, "loss": 0.3343992233276367, "step": 1690, "token_acc": 0.892397403717041 }, { "epoch": 1.2968630451415455, "grad_norm": 0.7213367819786072, "learning_rate": 6.511349105104534e-05, "loss": 0.38822097778320314, "step": 1695, "token_acc": 0.8769423365592957 }, { "epoch": 1.300688599846978, "grad_norm": 0.700702428817749, "learning_rate": 6.491232091367049e-05, "loss": 0.35975372791290283, "step": 1700, "token_acc": 0.8861437439918518 }, { "epoch": 1.300688599846978, "eval_loss": 0.526591956615448, "eval_runtime": 7.7916, "eval_samples_per_second": 13.348, "eval_steps_per_second": 1.668, "eval_token_acc": 0.8482837677001953, "step": 1700 }, { "epoch": 1.3045141545524102, "grad_norm": 0.7598251104354858, "learning_rate": 6.471088545933921e-05, "loss": 0.3564164638519287, "step": 1705, "token_acc": 0.8872470855712891 }, { "epoch": 1.3083397092578424, "grad_norm": 0.7174568176269531, "learning_rate": 6.450918827194978e-05, "loss": 0.3287261962890625, "step": 1710, "token_acc": 0.894193708896637 }, { "epoch": 1.3121652639632746, "grad_norm": 0.7934249043464661, "learning_rate": 6.430723294005726e-05, "loss": 0.3405998468399048, "step": 1715, "token_acc": 0.8878347277641296 }, { "epoch": 1.315990818668707, "grad_norm": 0.8109247088432312, "learning_rate": 6.410502305680946e-05, "loss": 0.3818791389465332, "step": 1720, "token_acc": 0.8762706518173218 }, { "epoch": 1.3198163733741393, "grad_norm": 0.7905654311180115, "learning_rate": 6.390256221988318e-05, "loss": 0.3510235548019409, "step": 1725, "token_acc": 0.8884668946266174 }, { "epoch": 1.3236419280795715, "grad_norm": 0.7302840352058411, "learning_rate": 6.369985403142014e-05, "loss": 0.3860185146331787, "step": 1730, "token_acc": 0.8776938915252686 }, { "epoch": 1.327467482785004, "grad_norm": 0.7890005111694336, "learning_rate": 6.349690209796285e-05, "loss": 0.4002682685852051, "step": 1735, "token_acc": 0.8717520236968994 }, { "epoch": 1.3312930374904361, "grad_norm": 0.6541386842727661, "learning_rate": 6.329371003039051e-05, "loss": 0.3814365863800049, "step": 1740, "token_acc": 0.8806993365287781 }, { "epoch": 1.3351185921958684, "grad_norm": 0.7147980332374573, "learning_rate": 6.309028144385472e-05, "loss": 0.3602738380432129, "step": 1745, "token_acc": 0.8850005269050598 }, { "epoch": 1.3389441469013006, "grad_norm": 0.6951248049736023, "learning_rate": 6.288661995771522e-05, "loss": 0.35432114601135256, "step": 1750, "token_acc": 0.8871864080429077 }, { "epoch": 1.3389441469013006, "eval_loss": 0.5236285924911499, "eval_runtime": 7.7868, "eval_samples_per_second": 13.356, "eval_steps_per_second": 1.669, "eval_token_acc": 0.8510806560516357, "step": 1750 }, { "epoch": 1.342769701606733, "grad_norm": 0.7138703465461731, "learning_rate": 6.268272919547537e-05, "loss": 0.3437394857406616, "step": 1755, "token_acc": 0.8870205283164978 }, { "epoch": 1.3465952563121653, "grad_norm": 0.7315565943717957, "learning_rate": 6.247861278471785e-05, "loss": 0.3766175270080566, "step": 1760, "token_acc": 0.883225679397583 }, { "epoch": 1.3504208110175975, "grad_norm": 0.7530694603919983, "learning_rate": 6.227427435703997e-05, "loss": 0.3583348035812378, "step": 1765, "token_acc": 0.8860324025154114 }, { "epoch": 1.35424636572303, "grad_norm": 0.7517703175544739, "learning_rate": 6.206971754798913e-05, "loss": 0.3681065559387207, "step": 1770, "token_acc": 0.8821339011192322 }, { "epoch": 1.3580719204284621, "grad_norm": 1.0113003253936768, "learning_rate": 6.186494599699819e-05, "loss": 0.34742186069488523, "step": 1775, "token_acc": 0.8917561173439026 }, { "epoch": 1.3618974751338944, "grad_norm": 0.9447914361953735, "learning_rate": 6.165996334732055e-05, "loss": 0.3852540969848633, "step": 1780, "token_acc": 0.8768330216407776 }, { "epoch": 1.3657230298393266, "grad_norm": 1.0187249183654785, "learning_rate": 6.145477324596552e-05, "loss": 0.41319589614868163, "step": 1785, "token_acc": 0.8700772523880005 }, { "epoch": 1.369548584544759, "grad_norm": 0.716583251953125, "learning_rate": 6.124937934363331e-05, "loss": 0.33546440601348876, "step": 1790, "token_acc": 0.8909059166908264 }, { "epoch": 1.3733741392501913, "grad_norm": 0.7670001983642578, "learning_rate": 6.104378529465009e-05, "loss": 0.35624008178710936, "step": 1795, "token_acc": 0.8858749270439148 }, { "epoch": 1.3771996939556237, "grad_norm": 0.7541671991348267, "learning_rate": 6.083799475690309e-05, "loss": 0.38024513721466063, "step": 1800, "token_acc": 0.8788754343986511 }, { "epoch": 1.3771996939556237, "eval_loss": 0.5222176909446716, "eval_runtime": 7.9549, "eval_samples_per_second": 13.074, "eval_steps_per_second": 1.634, "eval_token_acc": 0.8502886891365051, "step": 1800 }, { "epoch": 1.381025248661056, "grad_norm": 0.7164918184280396, "learning_rate": 6.0632011391775325e-05, "loss": 0.3274393081665039, "step": 1805, "token_acc": 0.8930581212043762 }, { "epoch": 1.3848508033664881, "grad_norm": 0.7994803786277771, "learning_rate": 6.0425838864080594e-05, "loss": 0.37533011436462405, "step": 1810, "token_acc": 0.8814812898635864 }, { "epoch": 1.3886763580719204, "grad_norm": 0.610385000705719, "learning_rate": 6.0219480841998265e-05, "loss": 0.3626489877700806, "step": 1815, "token_acc": 0.8824625611305237 }, { "epoch": 1.3925019127773526, "grad_norm": 0.8779500126838684, "learning_rate": 6.001294099700795e-05, "loss": 0.3818621873855591, "step": 1820, "token_acc": 0.8814284205436707 }, { "epoch": 1.396327467482785, "grad_norm": 0.9023825526237488, "learning_rate": 5.980622300382424e-05, "loss": 0.34031038284301757, "step": 1825, "token_acc": 0.8901993036270142 }, { "epoch": 1.4001530221882172, "grad_norm": 0.7254869937896729, "learning_rate": 5.959933054033125e-05, "loss": 0.33964922428131106, "step": 1830, "token_acc": 0.8894827365875244 }, { "epoch": 1.4039785768936497, "grad_norm": 0.7711949944496155, "learning_rate": 5.9392267287517325e-05, "loss": 0.37581453323364256, "step": 1835, "token_acc": 0.8802526593208313 }, { "epoch": 1.407804131599082, "grad_norm": 0.8236564993858337, "learning_rate": 5.918503692940936e-05, "loss": 0.3631006717681885, "step": 1840, "token_acc": 0.8837177753448486 }, { "epoch": 1.4116296863045141, "grad_norm": 0.729147732257843, "learning_rate": 5.8977643153007436e-05, "loss": 0.39508538246154784, "step": 1845, "token_acc": 0.8759874701499939 }, { "epoch": 1.4154552410099464, "grad_norm": 0.7146396636962891, "learning_rate": 5.8770089648219086e-05, "loss": 0.38811707496643066, "step": 1850, "token_acc": 0.8768134713172913 }, { "epoch": 1.4154552410099464, "eval_loss": 0.508669912815094, "eval_runtime": 7.7202, "eval_samples_per_second": 13.471, "eval_steps_per_second": 1.684, "eval_token_acc": 0.8521132469177246, "step": 1850 }, { "epoch": 1.4192807957153788, "grad_norm": 0.729438066482544, "learning_rate": 5.8562380107793723e-05, "loss": 0.39258522987365724, "step": 1855, "token_acc": 0.8775860071182251 }, { "epoch": 1.423106350420811, "grad_norm": 0.6797559857368469, "learning_rate": 5.835451822725691e-05, "loss": 0.3752496957778931, "step": 1860, "token_acc": 0.8779392242431641 }, { "epoch": 1.4269319051262432, "grad_norm": 0.49813270568847656, "learning_rate": 5.814650770484461e-05, "loss": 0.36016933917999266, "step": 1865, "token_acc": 0.885236382484436 }, { "epoch": 1.4307574598316757, "grad_norm": 0.7051418423652649, "learning_rate": 5.7938352241437366e-05, "loss": 0.3023838996887207, "step": 1870, "token_acc": 0.9016345143318176 }, { "epoch": 1.434583014537108, "grad_norm": 0.7764083743095398, "learning_rate": 5.773005554049455e-05, "loss": 0.3270875453948975, "step": 1875, "token_acc": 0.8943535685539246 }, { "epoch": 1.4384085692425401, "grad_norm": 0.6883430480957031, "learning_rate": 5.752162130798833e-05, "loss": 0.3316964864730835, "step": 1880, "token_acc": 0.8921953439712524 }, { "epoch": 1.4422341239479723, "grad_norm": 0.7114600539207458, "learning_rate": 5.7313053252337854e-05, "loss": 0.31533355712890626, "step": 1885, "token_acc": 0.8978268504142761 }, { "epoch": 1.4460596786534048, "grad_norm": 0.8558183908462524, "learning_rate": 5.7104355084343196e-05, "loss": 0.3653078556060791, "step": 1890, "token_acc": 0.885123074054718 }, { "epoch": 1.449885233358837, "grad_norm": 0.7565247416496277, "learning_rate": 5.689553051711939e-05, "loss": 0.3589335441589355, "step": 1895, "token_acc": 0.8860511779785156 }, { "epoch": 1.4537107880642695, "grad_norm": 0.830723762512207, "learning_rate": 5.668658326603032e-05, "loss": 0.32294435501098634, "step": 1900, "token_acc": 0.8950970768928528 }, { "epoch": 1.4537107880642695, "eval_loss": 0.5095290541648865, "eval_runtime": 7.9225, "eval_samples_per_second": 13.127, "eval_steps_per_second": 1.641, "eval_token_acc": 0.8521934151649475, "step": 1900 }, { "epoch": 1.4575363427697017, "grad_norm": 0.707747220993042, "learning_rate": 5.647751704862263e-05, "loss": 0.3198162794113159, "step": 1905, "token_acc": 0.8932924270629883 }, { "epoch": 1.461361897475134, "grad_norm": 0.8484877347946167, "learning_rate": 5.626833558455961e-05, "loss": 0.34911117553710935, "step": 1910, "token_acc": 0.8880250453948975 }, { "epoch": 1.4651874521805661, "grad_norm": 0.6321529150009155, "learning_rate": 5.605904259555496e-05, "loss": 0.3261146306991577, "step": 1915, "token_acc": 0.8926582932472229 }, { "epoch": 1.4690130068859983, "grad_norm": 0.888900101184845, "learning_rate": 5.5849641805306654e-05, "loss": 0.34900679588317873, "step": 1920, "token_acc": 0.8897786736488342 }, { "epoch": 1.4728385615914308, "grad_norm": 0.687582790851593, "learning_rate": 5.564013693943062e-05, "loss": 0.34392595291137695, "step": 1925, "token_acc": 0.8870816826820374 }, { "epoch": 1.476664116296863, "grad_norm": 0.7888776659965515, "learning_rate": 5.5430531725394485e-05, "loss": 0.40218586921691896, "step": 1930, "token_acc": 0.8707258105278015 }, { "epoch": 1.4804896710022954, "grad_norm": 0.7543318867683411, "learning_rate": 5.522082989245122e-05, "loss": 0.3061817646026611, "step": 1935, "token_acc": 0.9006242156028748 }, { "epoch": 1.4843152257077277, "grad_norm": 0.8193092942237854, "learning_rate": 5.501103517157288e-05, "loss": 0.36248459815979006, "step": 1940, "token_acc": 0.8838417530059814 }, { "epoch": 1.48814078041316, "grad_norm": 0.7776079177856445, "learning_rate": 5.480115129538409e-05, "loss": 0.3319098949432373, "step": 1945, "token_acc": 0.8905050754547119 }, { "epoch": 1.4919663351185921, "grad_norm": 0.6906784176826477, "learning_rate": 5.459118199809577e-05, "loss": 0.30999135971069336, "step": 1950, "token_acc": 0.9011686444282532 }, { "epoch": 1.4919663351185921, "eval_loss": 0.5100167989730835, "eval_runtime": 8.6421, "eval_samples_per_second": 12.034, "eval_steps_per_second": 1.504, "eval_token_acc": 0.8534665703773499, "step": 1950 }, { "epoch": 1.4957918898240246, "grad_norm": 0.60188227891922, "learning_rate": 5.438113101543861e-05, "loss": 0.3165478467941284, "step": 1955, "token_acc": 0.8978914022445679 }, { "epoch": 1.4996174445294568, "grad_norm": 0.7757999300956726, "learning_rate": 5.417100208459662e-05, "loss": 0.33252928256988523, "step": 1960, "token_acc": 0.8919309377670288 }, { "epoch": 1.5034429992348892, "grad_norm": 0.8450996279716492, "learning_rate": 5.396079894414067e-05, "loss": 0.3332216739654541, "step": 1965, "token_acc": 0.8910924196243286 }, { "epoch": 1.5072685539403214, "grad_norm": 0.7125052809715271, "learning_rate": 5.375052533396191e-05, "loss": 0.32312803268432616, "step": 1970, "token_acc": 0.8956630229949951 }, { "epoch": 1.5110941086457537, "grad_norm": 0.728113055229187, "learning_rate": 5.354018499520536e-05, "loss": 0.3401800155639648, "step": 1975, "token_acc": 0.8904479742050171 }, { "epoch": 1.5149196633511859, "grad_norm": 0.5629063844680786, "learning_rate": 5.332978167020314e-05, "loss": 0.33483114242553713, "step": 1980, "token_acc": 0.8900842666625977 }, { "epoch": 1.518745218056618, "grad_norm": 0.7541650533676147, "learning_rate": 5.31193191024081e-05, "loss": 0.3606285095214844, "step": 1985, "token_acc": 0.8818128705024719 }, { "epoch": 1.5225707727620506, "grad_norm": 0.7752453684806824, "learning_rate": 5.2908801036327115e-05, "loss": 0.3571962356567383, "step": 1990, "token_acc": 0.8851061463356018 }, { "epoch": 1.5263963274674828, "grad_norm": 0.7320619225502014, "learning_rate": 5.269823121745443e-05, "loss": 0.34485607147216796, "step": 1995, "token_acc": 0.8938528895378113 }, { "epoch": 1.5302218821729152, "grad_norm": 0.7084663510322571, "learning_rate": 5.248761339220511e-05, "loss": 0.3630984306335449, "step": 2000, "token_acc": 0.8881708979606628 }, { "epoch": 1.5302218821729152, "eval_loss": 0.501686155796051, "eval_runtime": 8.3476, "eval_samples_per_second": 12.459, "eval_steps_per_second": 1.557, "eval_token_acc": 0.8552410006523132, "step": 2000 }, { "epoch": 1.5340474368783474, "grad_norm": 0.7146458029747009, "learning_rate": 5.227695130784833e-05, "loss": 0.3331026554107666, "step": 2005, "token_acc": 0.8917819261550903 }, { "epoch": 1.5378729915837797, "grad_norm": 0.8245148062705994, "learning_rate": 5.2066248712440656e-05, "loss": 0.37367663383483884, "step": 2010, "token_acc": 0.879398763179779 }, { "epoch": 1.5416985462892119, "grad_norm": 0.7592694163322449, "learning_rate": 5.185550935475953e-05, "loss": 0.30876760482788085, "step": 2015, "token_acc": 0.8983100056648254 }, { "epoch": 1.545524100994644, "grad_norm": 0.9255443215370178, "learning_rate": 5.164473698423636e-05, "loss": 0.3594630241394043, "step": 2020, "token_acc": 0.8848262429237366 }, { "epoch": 1.5493496557000765, "grad_norm": 0.7179040908813477, "learning_rate": 5.143393535088998e-05, "loss": 0.3523809194564819, "step": 2025, "token_acc": 0.8905043601989746 }, { "epoch": 1.5531752104055088, "grad_norm": 0.7476411461830139, "learning_rate": 5.122310820525981e-05, "loss": 0.3416067361831665, "step": 2030, "token_acc": 0.8892166018486023 }, { "epoch": 1.5570007651109412, "grad_norm": 0.7161547541618347, "learning_rate": 5.101225929833921e-05, "loss": 0.30915536880493166, "step": 2035, "token_acc": 0.8991933465003967 }, { "epoch": 1.5608263198163734, "grad_norm": 0.935799777507782, "learning_rate": 5.08013923815087e-05, "loss": 0.31090846061706545, "step": 2040, "token_acc": 0.8967577815055847 }, { "epoch": 1.5646518745218057, "grad_norm": 0.7758647799491882, "learning_rate": 5.059051120646924e-05, "loss": 0.3375053882598877, "step": 2045, "token_acc": 0.8911775350570679 }, { "epoch": 1.5684774292272379, "grad_norm": 0.6921541094779968, "learning_rate": 5.0379619525175437e-05, "loss": 0.3175233840942383, "step": 2050, "token_acc": 0.897928774356842 }, { "epoch": 1.5684774292272379, "eval_loss": 0.49708712100982666, "eval_runtime": 8.4069, "eval_samples_per_second": 12.371, "eval_steps_per_second": 1.546, "eval_token_acc": 0.8562836050987244, "step": 2050 }, { "epoch": 1.57230298393267, "grad_norm": 0.8368853330612183, "learning_rate": 5.016872108976889e-05, "loss": 0.3685647964477539, "step": 2055, "token_acc": 0.8830959796905518 }, { "epoch": 1.5761285386381025, "grad_norm": 0.7727574706077576, "learning_rate": 4.99578196525113e-05, "loss": 0.326021146774292, "step": 2060, "token_acc": 0.8955893516540527 }, { "epoch": 1.5799540933435348, "grad_norm": 0.7962800860404968, "learning_rate": 4.974691896571781e-05, "loss": 0.36289157867431643, "step": 2065, "token_acc": 0.8842934370040894 }, { "epoch": 1.5837796480489672, "grad_norm": 0.7509872317314148, "learning_rate": 4.9536022781690185e-05, "loss": 0.31728103160858157, "step": 2070, "token_acc": 0.8965554237365723 }, { "epoch": 1.5876052027543994, "grad_norm": 0.6993099451065063, "learning_rate": 4.9325134852650124e-05, "loss": 0.36268980503082277, "step": 2075, "token_acc": 0.8835968375205994 }, { "epoch": 1.5914307574598316, "grad_norm": 0.7634088397026062, "learning_rate": 4.911425893067239e-05, "loss": 0.368328332901001, "step": 2080, "token_acc": 0.8840143084526062 }, { "epoch": 1.5952563121652639, "grad_norm": 0.734311580657959, "learning_rate": 4.8903398767618165e-05, "loss": 0.3379722833633423, "step": 2085, "token_acc": 0.8937978148460388 }, { "epoch": 1.599081866870696, "grad_norm": 1.7793625593185425, "learning_rate": 4.8692558115068254e-05, "loss": 0.33839640617370603, "step": 2090, "token_acc": 0.8909159302711487 }, { "epoch": 1.6029074215761285, "grad_norm": 0.6846344470977783, "learning_rate": 4.8481740724256324e-05, "loss": 0.36859283447265623, "step": 2095, "token_acc": 0.8814284801483154 }, { "epoch": 1.606732976281561, "grad_norm": 0.7191367149353027, "learning_rate": 4.827095034600215e-05, "loss": 0.32262775897979734, "step": 2100, "token_acc": 0.8948466777801514 }, { "epoch": 1.606732976281561, "eval_loss": 0.49555426836013794, "eval_runtime": 8.4995, "eval_samples_per_second": 12.236, "eval_steps_per_second": 1.53, "eval_token_acc": 0.8567647933959961, "step": 2100 }, { "epoch": 1.6105585309869932, "grad_norm": 0.7318239808082581, "learning_rate": 4.806019073064493e-05, "loss": 0.28886990547180175, "step": 2105, "token_acc": 0.9057518243789673 }, { "epoch": 1.6143840856924254, "grad_norm": 0.7161886096000671, "learning_rate": 4.7849465627976574e-05, "loss": 0.3786638259887695, "step": 2110, "token_acc": 0.877372682094574 }, { "epoch": 1.6182096403978576, "grad_norm": 0.7079288959503174, "learning_rate": 4.763877878717484e-05, "loss": 0.3339807987213135, "step": 2115, "token_acc": 0.892234742641449 }, { "epoch": 1.6220351951032899, "grad_norm": 0.7738683819770813, "learning_rate": 4.742813395673684e-05, "loss": 0.3155964851379395, "step": 2120, "token_acc": 0.8984229564666748 }, { "epoch": 1.6258607498087223, "grad_norm": 0.7651445269584656, "learning_rate": 4.721753488441222e-05, "loss": 0.34331388473510743, "step": 2125, "token_acc": 0.8891043663024902 }, { "epoch": 1.6296863045141545, "grad_norm": 0.7328031063079834, "learning_rate": 4.700698531713648e-05, "loss": 0.3365816354751587, "step": 2130, "token_acc": 0.8924189805984497 }, { "epoch": 1.633511859219587, "grad_norm": 0.7824881672859192, "learning_rate": 4.679648900096436e-05, "loss": 0.3375370502471924, "step": 2135, "token_acc": 0.8933680653572083 }, { "epoch": 1.6373374139250192, "grad_norm": 0.7239261269569397, "learning_rate": 4.658604968100318e-05, "loss": 0.44536380767822265, "step": 2140, "token_acc": 0.8609479665756226 }, { "epoch": 1.6411629686304514, "grad_norm": 0.8158916234970093, "learning_rate": 4.6375671101346135e-05, "loss": 0.31634106636047366, "step": 2145, "token_acc": 0.8972258567810059 }, { "epoch": 1.6449885233358836, "grad_norm": 0.6787914633750916, "learning_rate": 4.616535700500583e-05, "loss": 0.3428164005279541, "step": 2150, "token_acc": 0.8936346769332886 }, { "epoch": 1.6449885233358836, "eval_loss": 0.4892226755619049, "eval_runtime": 8.5201, "eval_samples_per_second": 12.206, "eval_steps_per_second": 1.526, "eval_token_acc": 0.8588098287582397, "step": 2150 }, { "epoch": 1.6488140780413159, "grad_norm": 0.7179057002067566, "learning_rate": 4.5955111133847516e-05, "loss": 0.3500206470489502, "step": 2155, "token_acc": 0.8879844546318054 }, { "epoch": 1.6526396327467483, "grad_norm": 0.9363833665847778, "learning_rate": 4.574493722852266e-05, "loss": 0.33152313232421876, "step": 2160, "token_acc": 0.8924428820610046 }, { "epoch": 1.6564651874521805, "grad_norm": 0.8011144995689392, "learning_rate": 4.553483902840227e-05, "loss": 0.33824012279510496, "step": 2165, "token_acc": 0.888818621635437 }, { "epoch": 1.660290742157613, "grad_norm": 0.754247784614563, "learning_rate": 4.5324820271510446e-05, "loss": 0.3261884689331055, "step": 2170, "token_acc": 0.8930807709693909 }, { "epoch": 1.6641162968630452, "grad_norm": 0.8901833891868591, "learning_rate": 4.5114884694457906e-05, "loss": 0.3530290603637695, "step": 2175, "token_acc": 0.8864350914955139 }, { "epoch": 1.6679418515684774, "grad_norm": 0.7795696258544922, "learning_rate": 4.490503603237532e-05, "loss": 0.28058276176452634, "step": 2180, "token_acc": 0.9070743322372437 }, { "epoch": 1.6717674062739096, "grad_norm": 0.7988150119781494, "learning_rate": 4.4695278018847105e-05, "loss": 0.3197885036468506, "step": 2185, "token_acc": 0.8948556780815125 }, { "epoch": 1.6755929609793418, "grad_norm": 0.7500495910644531, "learning_rate": 4.448561438584484e-05, "loss": 0.30902011394500734, "step": 2190, "token_acc": 0.8987115621566772 }, { "epoch": 1.6794185156847743, "grad_norm": 0.8123504519462585, "learning_rate": 4.4276048863660874e-05, "loss": 0.34034423828125, "step": 2195, "token_acc": 0.8910139203071594 }, { "epoch": 1.6832440703902067, "grad_norm": 1.9124935865402222, "learning_rate": 4.406658518084201e-05, "loss": 0.27848803997039795, "step": 2200, "token_acc": 0.9100915789604187 }, { "epoch": 1.6832440703902067, "eval_loss": 0.48997873067855835, "eval_runtime": 7.7857, "eval_samples_per_second": 13.358, "eval_steps_per_second": 1.67, "eval_token_acc": 0.8590003252029419, "step": 2200 }, { "epoch": 1.687069625095639, "grad_norm": 0.7550795674324036, "learning_rate": 4.3857227064123184e-05, "loss": 0.3289813995361328, "step": 2205, "token_acc": 0.8935672044754028 }, { "epoch": 1.6908951798010712, "grad_norm": 0.6573622822761536, "learning_rate": 4.364797823836108e-05, "loss": 0.3325567483901978, "step": 2210, "token_acc": 0.8916365504264832 }, { "epoch": 1.6947207345065034, "grad_norm": 0.7994371056556702, "learning_rate": 4.3438842426467885e-05, "loss": 0.3089787483215332, "step": 2215, "token_acc": 0.8987955451011658 }, { "epoch": 1.6985462892119356, "grad_norm": 0.7001591920852661, "learning_rate": 4.322982334934509e-05, "loss": 0.3258508682250977, "step": 2220, "token_acc": 0.89599609375 }, { "epoch": 1.702371843917368, "grad_norm": 0.7623443603515625, "learning_rate": 4.302092472581729e-05, "loss": 0.29424998760223386, "step": 2225, "token_acc": 0.9034655094146729 }, { "epoch": 1.7061973986228003, "grad_norm": 0.8438885807991028, "learning_rate": 4.281215027256592e-05, "loss": 0.30596625804901123, "step": 2230, "token_acc": 0.8992859125137329 }, { "epoch": 1.7100229533282327, "grad_norm": 0.7240939736366272, "learning_rate": 4.260350370406329e-05, "loss": 0.30459909439086913, "step": 2235, "token_acc": 0.8981994986534119 }, { "epoch": 1.713848508033665, "grad_norm": 0.630903422832489, "learning_rate": 4.239498873250637e-05, "loss": 0.2987601041793823, "step": 2240, "token_acc": 0.9012813568115234 }, { "epoch": 1.7176740627390972, "grad_norm": 0.6413953304290771, "learning_rate": 4.218660906775076e-05, "loss": 0.27812976837158204, "step": 2245, "token_acc": 0.9085516929626465 }, { "epoch": 1.7214996174445294, "grad_norm": 0.8842605948448181, "learning_rate": 4.1978368417244754e-05, "loss": 0.3460667610168457, "step": 2250, "token_acc": 0.8905196785926819 }, { "epoch": 1.7214996174445294, "eval_loss": 0.48436981439590454, "eval_runtime": 6.1279, "eval_samples_per_second": 16.972, "eval_steps_per_second": 2.121, "eval_token_acc": 0.860263466835022, "step": 2250 }, { "epoch": 1.7253251721499616, "grad_norm": 0.6968632340431213, "learning_rate": 4.17702704859633e-05, "loss": 0.29213814735412597, "step": 2255, "token_acc": 0.9040796160697937 }, { "epoch": 1.729150726855394, "grad_norm": 0.7017317414283752, "learning_rate": 4.1562318976342165e-05, "loss": 0.3319288730621338, "step": 2260, "token_acc": 0.8922781944274902 }, { "epoch": 1.7329762815608263, "grad_norm": 0.7793192267417908, "learning_rate": 4.135451758821191e-05, "loss": 0.3711602210998535, "step": 2265, "token_acc": 0.8815440535545349 }, { "epoch": 1.7368018362662587, "grad_norm": 0.870146632194519, "learning_rate": 4.114687001873228e-05, "loss": 0.3280991554260254, "step": 2270, "token_acc": 0.8962957262992859 }, { "epoch": 1.740627390971691, "grad_norm": 0.6839405298233032, "learning_rate": 4.093937996232625e-05, "loss": 0.31872236728668213, "step": 2275, "token_acc": 0.8943005204200745 }, { "epoch": 1.7444529456771232, "grad_norm": 0.7605020999908447, "learning_rate": 4.073205111061436e-05, "loss": 0.31961095333099365, "step": 2280, "token_acc": 0.8964794278144836 }, { "epoch": 1.7482785003825554, "grad_norm": 0.6984594464302063, "learning_rate": 4.052488715234902e-05, "loss": 0.31977455615997313, "step": 2285, "token_acc": 0.8969309329986572 }, { "epoch": 1.7521040550879876, "grad_norm": 0.7754748463630676, "learning_rate": 4.0317891773348946e-05, "loss": 0.31035671234130857, "step": 2290, "token_acc": 0.8990971446037292 }, { "epoch": 1.75592960979342, "grad_norm": 0.8007567524909973, "learning_rate": 4.0111068656433426e-05, "loss": 0.34440956115722654, "step": 2295, "token_acc": 0.8881877660751343 }, { "epoch": 1.7597551644988525, "grad_norm": 0.9330772161483765, "learning_rate": 3.9904421481357e-05, "loss": 0.3286851406097412, "step": 2300, "token_acc": 0.8939043283462524 }, { "epoch": 1.7597551644988525, "eval_loss": 0.4778790771961212, "eval_runtime": 7.806, "eval_samples_per_second": 13.323, "eval_steps_per_second": 1.665, "eval_token_acc": 0.8623987436294556, "step": 2300 }, { "epoch": 1.7635807192042847, "grad_norm": 0.5906277894973755, "learning_rate": 3.969795392474383e-05, "loss": 0.34573922157287595, "step": 2305, "token_acc": 0.8903287053108215 }, { "epoch": 1.767406273909717, "grad_norm": 0.7397768497467041, "learning_rate": 3.9491669660022345e-05, "loss": 0.35153021812438967, "step": 2310, "token_acc": 0.8872886896133423 }, { "epoch": 1.7712318286151492, "grad_norm": 0.7996999025344849, "learning_rate": 3.928557235735989e-05, "loss": 0.31516518592834475, "step": 2315, "token_acc": 0.8970757722854614 }, { "epoch": 1.7750573833205814, "grad_norm": 0.6419305205345154, "learning_rate": 3.907966568359742e-05, "loss": 0.3054972171783447, "step": 2320, "token_acc": 0.8993676900863647 }, { "epoch": 1.7788829380260138, "grad_norm": 0.6739971041679382, "learning_rate": 3.887395330218429e-05, "loss": 0.3448510646820068, "step": 2325, "token_acc": 0.8888943195343018 }, { "epoch": 1.782708492731446, "grad_norm": 0.7799039483070374, "learning_rate": 3.866843887311297e-05, "loss": 0.31788105964660646, "step": 2330, "token_acc": 0.8954451680183411 }, { "epoch": 1.7865340474368785, "grad_norm": 0.7341748476028442, "learning_rate": 3.846312605285408e-05, "loss": 0.34601006507873533, "step": 2335, "token_acc": 0.8898206353187561 }, { "epoch": 1.7903596021423107, "grad_norm": 0.7024774551391602, "learning_rate": 3.8258018494291234e-05, "loss": 0.32241551876068114, "step": 2340, "token_acc": 0.89708012342453 }, { "epoch": 1.794185156847743, "grad_norm": 0.7515860795974731, "learning_rate": 3.8053119846656026e-05, "loss": 0.30928614139556887, "step": 2345, "token_acc": 0.8996888995170593 }, { "epoch": 1.7980107115531752, "grad_norm": 0.8652954697608948, "learning_rate": 3.78484337554632e-05, "loss": 0.30088629722595217, "step": 2350, "token_acc": 0.9041286110877991 }, { "epoch": 1.7980107115531752, "eval_loss": 0.47428014874458313, "eval_runtime": 7.8145, "eval_samples_per_second": 13.309, "eval_steps_per_second": 1.664, "eval_token_acc": 0.8631907105445862, "step": 2350 }, { "epoch": 1.8018362662586074, "grad_norm": 0.9508410692214966, "learning_rate": 3.764396386244577e-05, "loss": 0.34288840293884276, "step": 2355, "token_acc": 0.8890052437782288 }, { "epoch": 1.8056618209640398, "grad_norm": 0.775829017162323, "learning_rate": 3.743971380549008e-05, "loss": 0.30949153900146487, "step": 2360, "token_acc": 0.8984510898590088 }, { "epoch": 1.809487375669472, "grad_norm": 0.6938086152076721, "learning_rate": 3.723568721857133e-05, "loss": 0.28354833126068113, "step": 2365, "token_acc": 0.9054216146469116 }, { "epoch": 1.8133129303749045, "grad_norm": 0.6911359429359436, "learning_rate": 3.703188773168869e-05, "loss": 0.2959973096847534, "step": 2370, "token_acc": 0.9038095474243164 }, { "epoch": 1.8171384850803367, "grad_norm": 60.64387130737305, "learning_rate": 3.682831897080087e-05, "loss": 0.40934906005859373, "step": 2375, "token_acc": 0.8823349475860596 }, { "epoch": 1.820964039785769, "grad_norm": 0.7439799308776855, "learning_rate": 3.6624984557761504e-05, "loss": 0.2931365489959717, "step": 2380, "token_acc": 0.9051112532615662 }, { "epoch": 1.8247895944912012, "grad_norm": 0.6623691320419312, "learning_rate": 3.642188811025481e-05, "loss": 0.3292604207992554, "step": 2385, "token_acc": 0.8928682208061218 }, { "epoch": 1.8286151491966334, "grad_norm": 0.6264249086380005, "learning_rate": 3.621903324173114e-05, "loss": 0.265956974029541, "step": 2390, "token_acc": 0.9118374586105347 }, { "epoch": 1.8324407039020658, "grad_norm": 0.8278756737709045, "learning_rate": 3.6016423561342706e-05, "loss": 0.29644384384155276, "step": 2395, "token_acc": 0.9024685025215149 }, { "epoch": 1.836266258607498, "grad_norm": 0.810718297958374, "learning_rate": 3.581406267387941e-05, "loss": 0.281774640083313, "step": 2400, "token_acc": 0.9071557521820068 }, { "epoch": 1.836266258607498, "eval_loss": 0.47047871351242065, "eval_runtime": 7.868, "eval_samples_per_second": 13.218, "eval_steps_per_second": 1.652, "eval_token_acc": 0.8643736243247986, "step": 2400 }, { "epoch": 1.8400918133129305, "grad_norm": 0.7788925170898438, "learning_rate": 3.56119541797047e-05, "loss": 0.3004364013671875, "step": 2405, "token_acc": 0.8989213705062866 }, { "epoch": 1.8439173680183627, "grad_norm": 0.7350240349769592, "learning_rate": 3.5410101674691434e-05, "loss": 0.3446574449539185, "step": 2410, "token_acc": 0.8929014801979065 }, { "epoch": 1.847742922723795, "grad_norm": 0.7535839080810547, "learning_rate": 3.520850875015801e-05, "loss": 0.31823389530181884, "step": 2415, "token_acc": 0.896795928478241 }, { "epoch": 1.8515684774292271, "grad_norm": 0.8284432291984558, "learning_rate": 3.5007178992804416e-05, "loss": 0.30584444999694826, "step": 2420, "token_acc": 0.9038248658180237 }, { "epoch": 1.8553940321346594, "grad_norm": 0.8060945272445679, "learning_rate": 3.480611598464844e-05, "loss": 0.2657127857208252, "step": 2425, "token_acc": 0.9115975499153137 }, { "epoch": 1.8592195868400918, "grad_norm": 0.6967042684555054, "learning_rate": 3.4605323302961854e-05, "loss": 0.30145883560180664, "step": 2430, "token_acc": 0.9007070064544678 }, { "epoch": 1.8630451415455243, "grad_norm": 0.827389657497406, "learning_rate": 3.4404804520206915e-05, "loss": 0.3457145929336548, "step": 2435, "token_acc": 0.889440655708313 }, { "epoch": 1.8668706962509565, "grad_norm": 0.7290979027748108, "learning_rate": 3.42045632039727e-05, "loss": 0.29812381267547605, "step": 2440, "token_acc": 0.9029287695884705 }, { "epoch": 1.8706962509563887, "grad_norm": 0.8037905693054199, "learning_rate": 3.400460291691164e-05, "loss": 0.32248711585998535, "step": 2445, "token_acc": 0.8946207165718079 }, { "epoch": 1.874521805661821, "grad_norm": 0.7474591732025146, "learning_rate": 3.380492721667618e-05, "loss": 0.3022623062133789, "step": 2450, "token_acc": 0.9007507562637329 }, { "epoch": 1.874521805661821, "eval_loss": 0.46530866622924805, "eval_runtime": 7.8984, "eval_samples_per_second": 13.167, "eval_steps_per_second": 1.646, "eval_token_acc": 0.8647946715354919, "step": 2450 }, { "epoch": 1.8783473603672531, "grad_norm": 0.71452397108078, "learning_rate": 3.3605539655855445e-05, "loss": 0.28342552185058595, "step": 2455, "token_acc": 0.9065305590629578 }, { "epoch": 1.8821729150726856, "grad_norm": 0.7897852659225464, "learning_rate": 3.3406443781912014e-05, "loss": 0.2861522912979126, "step": 2460, "token_acc": 0.9051787257194519 }, { "epoch": 1.8859984697781178, "grad_norm": 0.7614904642105103, "learning_rate": 3.3207643137118874e-05, "loss": 0.2704183578491211, "step": 2465, "token_acc": 0.911378026008606 }, { "epoch": 1.8898240244835502, "grad_norm": 0.6754797697067261, "learning_rate": 3.3009141258496344e-05, "loss": 0.31130855083465575, "step": 2470, "token_acc": 0.8980752229690552 }, { "epoch": 1.8936495791889825, "grad_norm": 0.7454941272735596, "learning_rate": 3.2810941677749164e-05, "loss": 0.34280953407287595, "step": 2475, "token_acc": 0.8920162320137024 }, { "epoch": 1.8974751338944147, "grad_norm": 0.7202689051628113, "learning_rate": 3.261304792120361e-05, "loss": 0.2786979675292969, "step": 2480, "token_acc": 0.907993495464325 }, { "epoch": 1.901300688599847, "grad_norm": 0.7289252281188965, "learning_rate": 3.2415463509744855e-05, "loss": 0.28704142570495605, "step": 2485, "token_acc": 0.9051684141159058 }, { "epoch": 1.9051262433052791, "grad_norm": 0.7389020919799805, "learning_rate": 3.2218191958754226e-05, "loss": 0.3317502498626709, "step": 2490, "token_acc": 0.8912999629974365 }, { "epoch": 1.9089517980107116, "grad_norm": 0.7187902331352234, "learning_rate": 3.202123677804672e-05, "loss": 0.32085230350494387, "step": 2495, "token_acc": 0.8973221182823181 }, { "epoch": 1.9127773527161438, "grad_norm": 0.780617892742157, "learning_rate": 3.18246014718085e-05, "loss": 0.2799449682235718, "step": 2500, "token_acc": 0.9089812636375427 }, { "epoch": 1.9127773527161438, "eval_loss": 0.4558640122413635, "eval_runtime": 7.6268, "eval_samples_per_second": 13.636, "eval_steps_per_second": 1.705, "eval_token_acc": 0.8680527806282043, "step": 2500 }, { "epoch": 1.9166029074215762, "grad_norm": 0.7578943967819214, "learning_rate": 3.162828953853469e-05, "loss": 0.283012843132019, "step": 2505, "token_acc": 0.908361554145813 }, { "epoch": 1.9204284621270085, "grad_norm": 0.7080029249191284, "learning_rate": 3.14323044709669e-05, "loss": 0.26364171504974365, "step": 2510, "token_acc": 0.9134095311164856 }, { "epoch": 1.9242540168324407, "grad_norm": 0.7052859663963318, "learning_rate": 3.12366497560313e-05, "loss": 0.28186535835266113, "step": 2515, "token_acc": 0.9079092741012573 }, { "epoch": 1.928079571537873, "grad_norm": 0.722137451171875, "learning_rate": 3.104132887477647e-05, "loss": 0.2929178953170776, "step": 2520, "token_acc": 0.9022585153579712 }, { "epoch": 1.9319051262433051, "grad_norm": 0.6590465903282166, "learning_rate": 3.084634530231145e-05, "loss": 0.29388132095336916, "step": 2525, "token_acc": 0.9019988179206848 }, { "epoch": 1.9357306809487376, "grad_norm": 0.7757251858711243, "learning_rate": 3.065170250774401e-05, "loss": 0.3049909591674805, "step": 2530, "token_acc": 0.8986476063728333 }, { "epoch": 1.93955623565417, "grad_norm": 0.7149041295051575, "learning_rate": 3.0457403954118856e-05, "loss": 0.2536777019500732, "step": 2535, "token_acc": 0.9141318202018738 }, { "epoch": 1.9433817903596022, "grad_norm": 0.6480096578598022, "learning_rate": 3.026345309835602e-05, "loss": 0.3146909952163696, "step": 2540, "token_acc": 0.8978093266487122 }, { "epoch": 1.9472073450650345, "grad_norm": 0.7162771224975586, "learning_rate": 3.0069853391189352e-05, "loss": 0.29620161056518557, "step": 2545, "token_acc": 0.9032965302467346 }, { "epoch": 1.9510328997704667, "grad_norm": 0.6839264631271362, "learning_rate": 2.9876608277105145e-05, "loss": 0.3268457889556885, "step": 2550, "token_acc": 0.8923251032829285 }, { "epoch": 1.9510328997704667, "eval_loss": 0.4463500678539276, "eval_runtime": 7.6674, "eval_samples_per_second": 13.564, "eval_steps_per_second": 1.695, "eval_token_acc": 0.8691655397415161, "step": 2550 }, { "epoch": 1.954858454475899, "grad_norm": 0.7191382050514221, "learning_rate": 2.9683721194280877e-05, "loss": 0.2873558044433594, "step": 2555, "token_acc": 0.9027012586593628 }, { "epoch": 1.9586840091813313, "grad_norm": 0.7788121700286865, "learning_rate": 2.9491195574523945e-05, "loss": 0.29071290493011476, "step": 2560, "token_acc": 0.9054592251777649 }, { "epoch": 1.9625095638867636, "grad_norm": 0.6830841302871704, "learning_rate": 2.9299034843210726e-05, "loss": 0.2975457668304443, "step": 2565, "token_acc": 0.9023709297180176 }, { "epoch": 1.966335118592196, "grad_norm": 0.8139908909797668, "learning_rate": 2.9107242419225577e-05, "loss": 0.2521679401397705, "step": 2570, "token_acc": 0.9153000116348267 }, { "epoch": 1.9701606732976282, "grad_norm": 0.6574170589447021, "learning_rate": 2.8915821714899917e-05, "loss": 0.268428373336792, "step": 2575, "token_acc": 0.9112088680267334 }, { "epoch": 1.9739862280030605, "grad_norm": 0.7240482568740845, "learning_rate": 2.8724776135951747e-05, "loss": 0.2789809226989746, "step": 2580, "token_acc": 0.9081910848617554 }, { "epoch": 1.9778117827084927, "grad_norm": 0.675998330116272, "learning_rate": 2.85341090814248e-05, "loss": 0.300505256652832, "step": 2585, "token_acc": 0.9032467007637024 }, { "epoch": 1.981637337413925, "grad_norm": 0.7282765507698059, "learning_rate": 2.8343823943628257e-05, "loss": 0.2605840444564819, "step": 2590, "token_acc": 0.9125819206237793 }, { "epoch": 1.9854628921193573, "grad_norm": 0.8446104526519775, "learning_rate": 2.8153924108076234e-05, "loss": 0.3036641120910645, "step": 2595, "token_acc": 0.9020313024520874 }, { "epoch": 1.9892884468247896, "grad_norm": 0.8688914179801941, "learning_rate": 2.7964412953427667e-05, "loss": 0.301717472076416, "step": 2600, "token_acc": 0.90234375 }, { "epoch": 1.9892884468247896, "eval_loss": 0.44557470083236694, "eval_runtime": 7.7719, "eval_samples_per_second": 13.382, "eval_steps_per_second": 1.673, "eval_token_acc": 0.869877278804779, "step": 2600 }, { "epoch": 1.993114001530222, "grad_norm": 0.6388227343559265, "learning_rate": 2.7775293851426232e-05, "loss": 0.28205983638763427, "step": 2605, "token_acc": 0.9057275056838989 }, { "epoch": 1.9969395562356542, "grad_norm": 0.6498620510101318, "learning_rate": 2.7586570166840153e-05, "loss": 0.28784162998199464, "step": 2610, "token_acc": 0.9042630791664124 }, { "epoch": 2.0007651109410864, "grad_norm": 0.46216583251953125, "learning_rate": 2.7398245257402567e-05, "loss": 0.24226248264312744, "step": 2615, "token_acc": 0.9181912541389465 }, { "epoch": 2.0045906656465187, "grad_norm": 0.4526701867580414, "learning_rate": 2.721032247375165e-05, "loss": 0.13410005569458008, "step": 2620, "token_acc": 0.9554323554039001 }, { "epoch": 2.008416220351951, "grad_norm": 0.5027770400047302, "learning_rate": 2.7022805159371023e-05, "loss": 0.14986848831176758, "step": 2625, "token_acc": 0.9495237469673157 }, { "epoch": 2.012241775057383, "grad_norm": 0.6318019032478333, "learning_rate": 2.683569665053033e-05, "loss": 0.13008542060852052, "step": 2630, "token_acc": 0.9539133906364441 }, { "epoch": 2.0160673297628158, "grad_norm": 0.738571286201477, "learning_rate": 2.664900027622577e-05, "loss": 0.15502784252166749, "step": 2635, "token_acc": 0.9469853043556213 }, { "epoch": 2.019892884468248, "grad_norm": 0.6892253160476685, "learning_rate": 2.646271935812098e-05, "loss": 0.13881022930145265, "step": 2640, "token_acc": 0.9516469240188599 }, { "epoch": 2.02371843917368, "grad_norm": 0.6470181941986084, "learning_rate": 2.6276857210487858e-05, "loss": 0.1207735538482666, "step": 2645, "token_acc": 0.9576534032821655 }, { "epoch": 2.0275439938791124, "grad_norm": 0.6596648097038269, "learning_rate": 2.6091417140147634e-05, "loss": 0.11292877197265624, "step": 2650, "token_acc": 0.9626390337944031 }, { "epoch": 2.0275439938791124, "eval_loss": 0.5168122053146362, "eval_runtime": 8.1996, "eval_samples_per_second": 12.684, "eval_steps_per_second": 1.585, "eval_token_acc": 0.8695364594459534, "step": 2650 }, { "epoch": 2.0313695485845447, "grad_norm": 0.577893853187561, "learning_rate": 2.5906402446412027e-05, "loss": 0.14242198467254638, "step": 2655, "token_acc": 0.9518451690673828 }, { "epoch": 2.035195103289977, "grad_norm": 0.6954317688941956, "learning_rate": 2.5721816421024515e-05, "loss": 0.12017552852630616, "step": 2660, "token_acc": 0.9579612016677856 }, { "epoch": 2.0390206579954095, "grad_norm": 0.5604422688484192, "learning_rate": 2.553766234810181e-05, "loss": 0.12801860570907592, "step": 2665, "token_acc": 0.9555345773696899 }, { "epoch": 2.0428462127008418, "grad_norm": 0.6638826727867126, "learning_rate": 2.535394350407548e-05, "loss": 0.1116684079170227, "step": 2670, "token_acc": 0.960515022277832 }, { "epoch": 2.046671767406274, "grad_norm": 0.5910780429840088, "learning_rate": 2.5170663157633477e-05, "loss": 0.13454906940460204, "step": 2675, "token_acc": 0.9548289775848389 }, { "epoch": 2.050497322111706, "grad_norm": 0.6535590291023254, "learning_rate": 2.4987824569662167e-05, "loss": 0.12083430290222168, "step": 2680, "token_acc": 0.9585192799568176 }, { "epoch": 2.0543228768171384, "grad_norm": 0.5576914548873901, "learning_rate": 2.4805430993188228e-05, "loss": 0.12852833271026612, "step": 2685, "token_acc": 0.9565430879592896 }, { "epoch": 2.0581484315225707, "grad_norm": 0.57133549451828, "learning_rate": 2.4623485673320772e-05, "loss": 0.13395898342132567, "step": 2690, "token_acc": 0.9541014432907104 }, { "epoch": 2.061973986228003, "grad_norm": 0.824409008026123, "learning_rate": 2.4441991847193636e-05, "loss": 0.1304774522781372, "step": 2695, "token_acc": 0.9565969109535217 }, { "epoch": 2.0657995409334355, "grad_norm": 0.6546271443367004, "learning_rate": 2.4260952743907756e-05, "loss": 0.13317997455596925, "step": 2700, "token_acc": 0.9548870325088501 }, { "epoch": 2.0657995409334355, "eval_loss": 0.5218855142593384, "eval_runtime": 8.6536, "eval_samples_per_second": 12.018, "eval_steps_per_second": 1.502, "eval_token_acc": 0.8702181577682495, "step": 2700 }, { "epoch": 2.0696250956388678, "grad_norm": 0.49882644414901733, "learning_rate": 2.4080371584473748e-05, "loss": 0.10250062942504883, "step": 2705, "token_acc": 0.9647969007492065 }, { "epoch": 2.0734506503443, "grad_norm": 0.6716576814651489, "learning_rate": 2.390025158175458e-05, "loss": 0.12553690671920775, "step": 2710, "token_acc": 0.9559978246688843 }, { "epoch": 2.077276205049732, "grad_norm": 0.630893349647522, "learning_rate": 2.3720595940408413e-05, "loss": 0.1133840560913086, "step": 2715, "token_acc": 0.960378885269165 }, { "epoch": 2.0811017597551644, "grad_norm": 0.6294081211090088, "learning_rate": 2.3541407856831598e-05, "loss": 0.11989744901657104, "step": 2720, "token_acc": 0.9581653475761414 }, { "epoch": 2.0849273144605966, "grad_norm": 0.6295720934867859, "learning_rate": 2.3362690519101728e-05, "loss": 0.10788016319274903, "step": 2725, "token_acc": 0.9615026116371155 }, { "epoch": 2.088752869166029, "grad_norm": 0.6127709150314331, "learning_rate": 2.318444710692109e-05, "loss": 0.18858987092971802, "step": 2730, "token_acc": 0.9527615308761597 }, { "epoch": 2.0925784238714615, "grad_norm": 0.6840873956680298, "learning_rate": 2.3006680791559943e-05, "loss": 0.13058118820190429, "step": 2735, "token_acc": 0.9559764862060547 }, { "epoch": 2.0964039785768938, "grad_norm": 0.6548556089401245, "learning_rate": 2.2829394735800075e-05, "loss": 0.12637789249420167, "step": 2740, "token_acc": 0.9560421705245972 }, { "epoch": 2.100229533282326, "grad_norm": 0.6251739263534546, "learning_rate": 2.2652592093878666e-05, "loss": 0.1079249382019043, "step": 2745, "token_acc": 0.9616904854774475 }, { "epoch": 2.104055087987758, "grad_norm": 0.5070903301239014, "learning_rate": 2.2476276011432056e-05, "loss": 0.10909421443939209, "step": 2750, "token_acc": 0.9607372879981995 }, { "epoch": 2.104055087987758, "eval_loss": 0.5240176916122437, "eval_runtime": 7.9542, "eval_samples_per_second": 13.075, "eval_steps_per_second": 1.634, "eval_token_acc": 0.8699575066566467, "step": 2750 }, { "epoch": 2.1078806426931904, "grad_norm": 0.5303053259849548, "learning_rate": 2.230044962543989e-05, "loss": 0.10541150569915772, "step": 2755, "token_acc": 0.9636523723602295 }, { "epoch": 2.1117061973986226, "grad_norm": 0.6467751264572144, "learning_rate": 2.2125116064169125e-05, "loss": 0.11249511241912842, "step": 2760, "token_acc": 0.9602897763252258 }, { "epoch": 2.1155317521040553, "grad_norm": 0.6789493560791016, "learning_rate": 2.195027844711856e-05, "loss": 0.13851575851440429, "step": 2765, "token_acc": 0.9524257183074951 }, { "epoch": 2.1193573068094875, "grad_norm": 0.5706949234008789, "learning_rate": 2.177593988496323e-05, "loss": 0.0956031322479248, "step": 2770, "token_acc": 0.9663383960723877 }, { "epoch": 2.1231828615149198, "grad_norm": 0.5609292984008789, "learning_rate": 2.1602103479499093e-05, "loss": 0.11319952011108399, "step": 2775, "token_acc": 0.9608060717582703 }, { "epoch": 2.127008416220352, "grad_norm": 0.639937162399292, "learning_rate": 2.1428772323587827e-05, "loss": 0.13543224334716797, "step": 2780, "token_acc": 0.9520896077156067 }, { "epoch": 2.130833970925784, "grad_norm": 0.6833350658416748, "learning_rate": 2.1255949501101847e-05, "loss": 0.14142370223999023, "step": 2785, "token_acc": 0.9528786540031433 }, { "epoch": 2.1346595256312164, "grad_norm": 0.5408839583396912, "learning_rate": 2.1083638086869327e-05, "loss": 0.12588857412338256, "step": 2790, "token_acc": 0.9563543200492859 }, { "epoch": 2.1384850803366486, "grad_norm": 0.5438815355300903, "learning_rate": 2.0911841146619676e-05, "loss": 0.12137541770935059, "step": 2795, "token_acc": 0.958185613155365 }, { "epoch": 2.1423106350420813, "grad_norm": 0.6048544645309448, "learning_rate": 2.074056173692881e-05, "loss": 0.10157194137573242, "step": 2800, "token_acc": 0.9674689769744873 }, { "epoch": 2.1423106350420813, "eval_loss": 0.5312597751617432, "eval_runtime": 9.0822, "eval_samples_per_second": 11.451, "eval_steps_per_second": 1.431, "eval_token_acc": 0.8708697557449341, "step": 2800 }, { "epoch": 2.1461361897475135, "grad_norm": 0.689985990524292, "learning_rate": 2.05698029051649e-05, "loss": 0.12691206932067872, "step": 2805, "token_acc": 0.9552291035652161 }, { "epoch": 2.1499617444529457, "grad_norm": 0.628235936164856, "learning_rate": 2.0399567689434007e-05, "loss": 0.12962342500686647, "step": 2810, "token_acc": 0.9563965201377869 }, { "epoch": 2.153787299158378, "grad_norm": 0.583711564540863, "learning_rate": 2.0229859118526244e-05, "loss": 0.11104552745819092, "step": 2815, "token_acc": 0.9605592489242554 }, { "epoch": 2.15761285386381, "grad_norm": 0.749139666557312, "learning_rate": 2.0060680211861722e-05, "loss": 0.11064702272415161, "step": 2820, "token_acc": 0.9618842601776123 }, { "epoch": 2.1614384085692424, "grad_norm": 0.6225452423095703, "learning_rate": 1.989203397943682e-05, "loss": 0.1368303894996643, "step": 2825, "token_acc": 0.9523999691009521 }, { "epoch": 2.1652639632746746, "grad_norm": 0.7548052072525024, "learning_rate": 1.9723923421770744e-05, "loss": 0.12567458152770997, "step": 2830, "token_acc": 0.9570740461349487 }, { "epoch": 2.1690895179801073, "grad_norm": 0.6393832564353943, "learning_rate": 1.9556351529852086e-05, "loss": 0.12716997861862184, "step": 2835, "token_acc": 0.9550226926803589 }, { "epoch": 2.1729150726855395, "grad_norm": 0.5963457822799683, "learning_rate": 1.9389321285085572e-05, "loss": 0.12617888450622558, "step": 2840, "token_acc": 0.9543135166168213 }, { "epoch": 2.1767406273909717, "grad_norm": 0.7114848494529724, "learning_rate": 1.9222835659239086e-05, "loss": 0.12233096361160278, "step": 2845, "token_acc": 0.9570853114128113 }, { "epoch": 2.180566182096404, "grad_norm": 0.6505621671676636, "learning_rate": 1.905689761439075e-05, "loss": 0.13814208507537842, "step": 2850, "token_acc": 0.9528710246086121 }, { "epoch": 2.180566182096404, "eval_loss": 0.5228633284568787, "eval_runtime": 7.9764, "eval_samples_per_second": 13.038, "eval_steps_per_second": 1.63, "eval_token_acc": 0.8718922734260559, "step": 2850 }, { "epoch": 2.184391736801836, "grad_norm": 0.6201128959655762, "learning_rate": 1.8891510102876235e-05, "loss": 0.12893006801605225, "step": 2855, "token_acc": 0.9550007581710815 }, { "epoch": 2.1882172915072684, "grad_norm": 0.6673233509063721, "learning_rate": 1.8726676067236245e-05, "loss": 0.10436077117919922, "step": 2860, "token_acc": 0.9645984172821045 }, { "epoch": 2.1920428462127006, "grad_norm": 0.7207808494567871, "learning_rate": 1.8562398440164135e-05, "loss": 0.14118155241012573, "step": 2865, "token_acc": 0.9522634148597717 }, { "epoch": 2.1958684009181333, "grad_norm": 0.7116675972938538, "learning_rate": 1.8398680144453794e-05, "loss": 0.11731832027435303, "step": 2870, "token_acc": 0.9581528902053833 }, { "epoch": 2.1996939556235655, "grad_norm": 0.5616986155509949, "learning_rate": 1.823552409294752e-05, "loss": 0.10328438282012939, "step": 2875, "token_acc": 0.9635567665100098 }, { "epoch": 2.2035195103289977, "grad_norm": 0.7303850650787354, "learning_rate": 1.8072933188484385e-05, "loss": 0.12835383415222168, "step": 2880, "token_acc": 0.9546709060668945 }, { "epoch": 2.20734506503443, "grad_norm": 0.684688925743103, "learning_rate": 1.7910910323848433e-05, "loss": 0.12336525917053223, "step": 2885, "token_acc": 0.9571567177772522 }, { "epoch": 2.211170619739862, "grad_norm": 0.5825948119163513, "learning_rate": 1.774945838171721e-05, "loss": 0.12321670055389404, "step": 2890, "token_acc": 0.9568530321121216 }, { "epoch": 2.2149961744452944, "grad_norm": 0.5380724668502808, "learning_rate": 1.758858023461059e-05, "loss": 0.1462591528892517, "step": 2895, "token_acc": 0.9520248770713806 }, { "epoch": 2.218821729150727, "grad_norm": 0.7477222084999084, "learning_rate": 1.742827874483958e-05, "loss": 0.1159374475479126, "step": 2900, "token_acc": 0.9597063660621643 }, { "epoch": 2.218821729150727, "eval_loss": 0.5222508311271667, "eval_runtime": 8.103, "eval_samples_per_second": 12.835, "eval_steps_per_second": 1.604, "eval_token_acc": 0.872032642364502, "step": 2900 }, { "epoch": 2.2226472838561593, "grad_norm": 0.578953206539154, "learning_rate": 1.7268556764455433e-05, "loss": 0.1094053030014038, "step": 2905, "token_acc": 0.9612045884132385 }, { "epoch": 2.2264728385615915, "grad_norm": 0.6454194188117981, "learning_rate": 1.7109417135198875e-05, "loss": 0.09978902339935303, "step": 2910, "token_acc": 0.9648175239562988 }, { "epoch": 2.2302983932670237, "grad_norm": 0.6507310271263123, "learning_rate": 1.6950862688449555e-05, "loss": 0.12494430541992188, "step": 2915, "token_acc": 0.9561623930931091 }, { "epoch": 2.234123947972456, "grad_norm": 0.5561665296554565, "learning_rate": 1.6792896245175695e-05, "loss": 0.12519459724426268, "step": 2920, "token_acc": 0.957149863243103 }, { "epoch": 2.237949502677888, "grad_norm": 0.6335827708244324, "learning_rate": 1.6635520615883854e-05, "loss": 0.12490168809890748, "step": 2925, "token_acc": 0.956473171710968 }, { "epoch": 2.2417750573833204, "grad_norm": 0.518527090549469, "learning_rate": 1.6478738600568978e-05, "loss": 0.11815754175186158, "step": 2930, "token_acc": 0.9581723809242249 }, { "epoch": 2.245600612088753, "grad_norm": 0.7105391025543213, "learning_rate": 1.6322552988664548e-05, "loss": 0.1265929937362671, "step": 2935, "token_acc": 0.9559991359710693 }, { "epoch": 2.2494261667941853, "grad_norm": 0.6597128510475159, "learning_rate": 1.616696655899291e-05, "loss": 0.10472848415374755, "step": 2940, "token_acc": 0.9618938565254211 }, { "epoch": 2.2532517214996175, "grad_norm": 0.5978385806083679, "learning_rate": 1.601198207971596e-05, "loss": 0.11347222328186035, "step": 2945, "token_acc": 0.9598453640937805 }, { "epoch": 2.2570772762050497, "grad_norm": 0.5900003910064697, "learning_rate": 1.585760230828579e-05, "loss": 0.1062214732170105, "step": 2950, "token_acc": 0.9621166586875916 }, { "epoch": 2.2570772762050497, "eval_loss": 0.529563307762146, "eval_runtime": 7.925, "eval_samples_per_second": 13.123, "eval_steps_per_second": 1.64, "eval_token_acc": 0.8730752468109131, "step": 2950 }, { "epoch": 2.260902830910482, "grad_norm": 0.6690232753753662, "learning_rate": 1.57038299913956e-05, "loss": 0.12313377857208252, "step": 2955, "token_acc": 0.9577500820159912 }, { "epoch": 2.264728385615914, "grad_norm": 0.6129235625267029, "learning_rate": 1.555066786493094e-05, "loss": 0.11549534797668456, "step": 2960, "token_acc": 0.9599046111106873 }, { "epoch": 2.268553940321347, "grad_norm": 0.7165189385414124, "learning_rate": 1.5398118653920986e-05, "loss": 0.10570051670074462, "step": 2965, "token_acc": 0.9616792798042297 }, { "epoch": 2.272379495026779, "grad_norm": 0.7057157754898071, "learning_rate": 1.5246185072490027e-05, "loss": 0.11799094676971436, "step": 2970, "token_acc": 0.9599979519844055 }, { "epoch": 2.2762050497322113, "grad_norm": 0.6109249591827393, "learning_rate": 1.5094869823809166e-05, "loss": 0.12232885360717774, "step": 2975, "token_acc": 0.9563071727752686 }, { "epoch": 2.2800306044376435, "grad_norm": 0.6849731206893921, "learning_rate": 1.4944175600048294e-05, "loss": 0.12355262041091919, "step": 2980, "token_acc": 0.9571903944015503 }, { "epoch": 2.2838561591430757, "grad_norm": 0.551438570022583, "learning_rate": 1.4794105082328158e-05, "loss": 0.10952677726745605, "step": 2985, "token_acc": 0.963117241859436 }, { "epoch": 2.287681713848508, "grad_norm": 0.7222511172294617, "learning_rate": 1.4644660940672627e-05, "loss": 0.1401592493057251, "step": 2990, "token_acc": 0.9511399865150452 }, { "epoch": 2.29150726855394, "grad_norm": 0.7186452150344849, "learning_rate": 1.449584583396124e-05, "loss": 0.1436525344848633, "step": 2995, "token_acc": 0.9500516653060913 }, { "epoch": 2.295332823259373, "grad_norm": 0.7001931071281433, "learning_rate": 1.4347662409881868e-05, "loss": 0.12311695814132691, "step": 3000, "token_acc": 0.9562889337539673 }, { "epoch": 2.295332823259373, "eval_loss": 0.5203014612197876, "eval_runtime": 9.1094, "eval_samples_per_second": 11.417, "eval_steps_per_second": 1.427, "eval_token_acc": 0.8741077780723572, "step": 3000 }, { "epoch": 2.299158377964805, "grad_norm": 0.6098562479019165, "learning_rate": 1.4200113304883611e-05, "loss": 0.13382203578948976, "step": 3005, "token_acc": 0.9541038274765015 }, { "epoch": 2.3029839326702373, "grad_norm": 0.6261680126190186, "learning_rate": 1.405320114412989e-05, "loss": 0.0949715256690979, "step": 3010, "token_acc": 0.96717369556427 }, { "epoch": 2.3068094873756695, "grad_norm": 0.5904762744903564, "learning_rate": 1.3906928541451775e-05, "loss": 0.10795230865478515, "step": 3015, "token_acc": 0.9621407985687256 }, { "epoch": 2.3106350420811017, "grad_norm": 0.6883955001831055, "learning_rate": 1.3761298099301378e-05, "loss": 0.12801848649978637, "step": 3020, "token_acc": 0.9559524059295654 }, { "epoch": 2.314460596786534, "grad_norm": 0.6712023615837097, "learning_rate": 1.3616312408705689e-05, "loss": 0.12017567157745361, "step": 3025, "token_acc": 0.9589926600456238 }, { "epoch": 2.318286151491966, "grad_norm": 0.5586845874786377, "learning_rate": 1.3471974049220403e-05, "loss": 0.09736464023590088, "step": 3030, "token_acc": 0.9669448733329773 }, { "epoch": 2.322111706197399, "grad_norm": 0.7812525033950806, "learning_rate": 1.3328285588884032e-05, "loss": 0.11876866817474366, "step": 3035, "token_acc": 0.9586123824119568 }, { "epoch": 2.325937260902831, "grad_norm": 0.5611070394515991, "learning_rate": 1.3185249584172172e-05, "loss": 0.09341703653335572, "step": 3040, "token_acc": 0.9679653644561768 }, { "epoch": 2.3297628156082633, "grad_norm": 0.7015408873558044, "learning_rate": 1.304286857995209e-05, "loss": 0.10733482837677003, "step": 3045, "token_acc": 0.9623789191246033 }, { "epoch": 2.3335883703136955, "grad_norm": 0.6591479778289795, "learning_rate": 1.2901145109437474e-05, "loss": 0.11940803527832031, "step": 3050, "token_acc": 0.9576820135116577 }, { "epoch": 2.3335883703136955, "eval_loss": 0.5162126421928406, "eval_runtime": 7.6579, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.698, "eval_token_acc": 0.8742882609367371, "step": 3050 }, { "epoch": 2.3374139250191277, "grad_norm": 0.5746079087257385, "learning_rate": 1.27600816941432e-05, "loss": 0.12224366664886474, "step": 3055, "token_acc": 0.95743727684021 }, { "epoch": 2.34123947972456, "grad_norm": 0.6104121208190918, "learning_rate": 1.2619680843840659e-05, "loss": 0.12069646120071412, "step": 3060, "token_acc": 0.9580378532409668 }, { "epoch": 2.345065034429992, "grad_norm": 0.6610199213027954, "learning_rate": 1.2479945056512993e-05, "loss": 0.10805834531784057, "step": 3065, "token_acc": 0.9605792760848999 }, { "epoch": 2.348890589135425, "grad_norm": 0.6179318428039551, "learning_rate": 1.2340876818310682e-05, "loss": 0.1121566653251648, "step": 3070, "token_acc": 0.9616247415542603 }, { "epoch": 2.352716143840857, "grad_norm": 0.6470217108726501, "learning_rate": 1.22024786035073e-05, "loss": 0.09998181462287903, "step": 3075, "token_acc": 0.9644249081611633 }, { "epoch": 2.3565416985462893, "grad_norm": 0.6415740847587585, "learning_rate": 1.206475287445552e-05, "loss": 0.10013750791549683, "step": 3080, "token_acc": 0.9655629396438599 }, { "epoch": 2.3603672532517215, "grad_norm": 0.5981183648109436, "learning_rate": 1.1927702081543279e-05, "loss": 0.10144208669662476, "step": 3085, "token_acc": 0.965247631072998 }, { "epoch": 2.3641928079571537, "grad_norm": 0.4865865409374237, "learning_rate": 1.179132866315018e-05, "loss": 0.10601496696472168, "step": 3090, "token_acc": 0.9624915719032288 }, { "epoch": 2.368018362662586, "grad_norm": 0.5336887240409851, "learning_rate": 1.165563504560413e-05, "loss": 0.11365892887115478, "step": 3095, "token_acc": 0.9594626426696777 }, { "epoch": 2.371843917368018, "grad_norm": 0.4895932376384735, "learning_rate": 1.1520623643138162e-05, "loss": 0.11079982519149781, "step": 3100, "token_acc": 0.9616596102714539 }, { "epoch": 2.371843917368018, "eval_loss": 0.5221489667892456, "eval_runtime": 8.0254, "eval_samples_per_second": 12.959, "eval_steps_per_second": 1.62, "eval_token_acc": 0.8750301003456116, "step": 3100 }, { "epoch": 2.375669472073451, "grad_norm": 0.6662837266921997, "learning_rate": 1.1386296857847444e-05, "loss": 0.09341274499893189, "step": 3105, "token_acc": 0.9671337008476257 }, { "epoch": 2.379495026778883, "grad_norm": 0.5832562446594238, "learning_rate": 1.12526570796466e-05, "loss": 0.11719496250152588, "step": 3110, "token_acc": 0.9592087864875793 }, { "epoch": 2.3833205814843152, "grad_norm": 0.5843919515609741, "learning_rate": 1.1119706686227211e-05, "loss": 0.10511226654052734, "step": 3115, "token_acc": 0.9644036889076233 }, { "epoch": 2.3871461361897475, "grad_norm": 0.49912717938423157, "learning_rate": 1.0987448043015374e-05, "loss": 0.09345480799674988, "step": 3120, "token_acc": 0.9667991399765015 }, { "epoch": 2.3909716908951797, "grad_norm": 0.7507015466690063, "learning_rate": 1.0855883503129772e-05, "loss": 0.11863377094268798, "step": 3125, "token_acc": 0.9587963819503784 }, { "epoch": 2.394797245600612, "grad_norm": 0.7630432844161987, "learning_rate": 1.0725015407339717e-05, "loss": 0.1126257300376892, "step": 3130, "token_acc": 0.9607234597206116 }, { "epoch": 2.398622800306044, "grad_norm": 0.6372060179710388, "learning_rate": 1.0594846084023547e-05, "loss": 0.10468795299530029, "step": 3135, "token_acc": 0.9627901315689087 }, { "epoch": 2.402448355011477, "grad_norm": 0.6120291352272034, "learning_rate": 1.0465377849127172e-05, "loss": 0.09292224049568176, "step": 3140, "token_acc": 0.9677795171737671 }, { "epoch": 2.406273909716909, "grad_norm": 0.5614500045776367, "learning_rate": 1.0336613006122892e-05, "loss": 0.09670157432556152, "step": 3145, "token_acc": 0.9674481153488159 }, { "epoch": 2.4100994644223412, "grad_norm": 0.5987251996994019, "learning_rate": 1.0208553845968383e-05, "loss": 0.13896613121032714, "step": 3150, "token_acc": 0.9524605870246887 }, { "epoch": 2.4100994644223412, "eval_loss": 0.5215019583702087, "eval_runtime": 7.8548, "eval_samples_per_second": 13.24, "eval_steps_per_second": 1.655, "eval_token_acc": 0.8747493624687195, "step": 3150 }, { "epoch": 2.4139250191277735, "grad_norm": 0.5754761695861816, "learning_rate": 1.008120264706598e-05, "loss": 0.10798046588897706, "step": 3155, "token_acc": 0.9625075459480286 }, { "epoch": 2.4177505738332057, "grad_norm": 0.5995942950248718, "learning_rate": 9.95456167522209e-06, "loss": 0.11118266582489014, "step": 3160, "token_acc": 0.9624667167663574 }, { "epoch": 2.4215761285386384, "grad_norm": 0.6560847759246826, "learning_rate": 9.82863318360695e-06, "loss": 0.11946277618408203, "step": 3165, "token_acc": 0.9585193991661072 }, { "epoch": 2.4254016832440706, "grad_norm": 0.5231161713600159, "learning_rate": 9.703419412714431e-06, "loss": 0.1082839012145996, "step": 3170, "token_acc": 0.9630952477455139 }, { "epoch": 2.429227237949503, "grad_norm": 0.6471136808395386, "learning_rate": 9.578922590322276e-06, "loss": 0.10554378032684326, "step": 3175, "token_acc": 0.9643285870552063 }, { "epoch": 2.433052792654935, "grad_norm": 0.6062421202659607, "learning_rate": 9.45514493145246e-06, "loss": 0.11804389953613281, "step": 3180, "token_acc": 0.9601839780807495 }, { "epoch": 2.4368783473603672, "grad_norm": 0.6130327582359314, "learning_rate": 9.332088638331682e-06, "loss": 0.12830252647399903, "step": 3185, "token_acc": 0.955107569694519 }, { "epoch": 2.4407039020657995, "grad_norm": 0.5650054812431335, "learning_rate": 9.209755900352285e-06, "loss": 0.08745735883712769, "step": 3190, "token_acc": 0.9680666327476501 }, { "epoch": 2.4445294567712317, "grad_norm": 0.6417719125747681, "learning_rate": 9.088148894033255e-06, "loss": 0.10346298217773438, "step": 3195, "token_acc": 0.9632440209388733 }, { "epoch": 2.4483550114766643, "grad_norm": 0.549809992313385, "learning_rate": 8.967269782981557e-06, "loss": 0.10478920936584472, "step": 3200, "token_acc": 0.964032769203186 }, { "epoch": 2.4483550114766643, "eval_loss": 0.524568498134613, "eval_runtime": 7.9187, "eval_samples_per_second": 13.133, "eval_steps_per_second": 1.642, "eval_token_acc": 0.8750100135803223, "step": 3200 }, { "epoch": 2.4521805661820966, "grad_norm": 0.5881340503692627, "learning_rate": 8.847120717853513e-06, "loss": 0.09231488704681397, "step": 3205, "token_acc": 0.967642068862915 }, { "epoch": 2.456006120887529, "grad_norm": 0.49171632528305054, "learning_rate": 8.727703836316664e-06, "loss": 0.08269585371017456, "step": 3210, "token_acc": 0.9714418053627014 }, { "epoch": 2.459831675592961, "grad_norm": 0.5847451090812683, "learning_rate": 8.609021263011696e-06, "loss": 0.09583220481872559, "step": 3215, "token_acc": 0.967701256275177 }, { "epoch": 2.4636572302983932, "grad_norm": 0.6022827625274658, "learning_rate": 8.491075109514612e-06, "loss": 0.0968513011932373, "step": 3220, "token_acc": 0.965691328048706 }, { "epoch": 2.4674827850038255, "grad_norm": 0.6396250128746033, "learning_rate": 8.373867474299197e-06, "loss": 0.09366763830184936, "step": 3225, "token_acc": 0.967291533946991 }, { "epoch": 2.4713083397092577, "grad_norm": 0.6564737558364868, "learning_rate": 8.257400442699681e-06, "loss": 0.09510574340820313, "step": 3230, "token_acc": 0.9668706059455872 }, { "epoch": 2.4751338944146903, "grad_norm": 0.5506086945533752, "learning_rate": 8.141676086873572e-06, "loss": 0.09186252355575561, "step": 3235, "token_acc": 0.9672021865844727 }, { "epoch": 2.4789594491201226, "grad_norm": 0.5937402844429016, "learning_rate": 8.026696465764922e-06, "loss": 0.09575964212417602, "step": 3240, "token_acc": 0.9655571579933167 }, { "epoch": 2.482785003825555, "grad_norm": 0.5168645977973938, "learning_rate": 7.912463625067568e-06, "loss": 0.11513475179672242, "step": 3245, "token_acc": 0.9584820866584778 }, { "epoch": 2.486610558530987, "grad_norm": 12.089369773864746, "learning_rate": 7.7989795971888e-06, "loss": 0.29053955078125, "step": 3250, "token_acc": 0.9437501430511475 }, { "epoch": 2.486610558530987, "eval_loss": 0.5287056565284729, "eval_runtime": 7.9028, "eval_samples_per_second": 13.16, "eval_steps_per_second": 1.645, "eval_token_acc": 0.8761628866195679, "step": 3250 }, { "epoch": 2.4904361132364192, "grad_norm": 0.6238409876823425, "learning_rate": 7.68624640121316e-06, "loss": 0.1205405831336975, "step": 3255, "token_acc": 0.9586801528930664 }, { "epoch": 2.4942616679418514, "grad_norm": 0.6099902391433716, "learning_rate": 7.574266042866546e-06, "loss": 0.09387488961219788, "step": 3260, "token_acc": 0.9670175909996033 }, { "epoch": 2.4980872226472837, "grad_norm": 0.6190466284751892, "learning_rate": 7.463040514480579e-06, "loss": 0.11645488739013672, "step": 3265, "token_acc": 0.9598995447158813 }, { "epoch": 2.5019127773527163, "grad_norm": 0.6443151235580444, "learning_rate": 7.352571794957025e-06, "loss": 0.08591481447219848, "step": 3270, "token_acc": 0.9710960388183594 }, { "epoch": 2.5057383320581486, "grad_norm": 0.6558806896209717, "learning_rate": 7.242861849732696e-06, "loss": 0.1108025312423706, "step": 3275, "token_acc": 0.9633561968803406 }, { "epoch": 2.5095638867635808, "grad_norm": 0.6043168306350708, "learning_rate": 7.133912630744455e-06, "loss": 0.08010676503181458, "step": 3280, "token_acc": 0.9711145162582397 }, { "epoch": 2.513389441469013, "grad_norm": 0.671475887298584, "learning_rate": 7.025726076394462e-06, "loss": 0.1144939661026001, "step": 3285, "token_acc": 0.9594224691390991 }, { "epoch": 2.517214996174445, "grad_norm": 0.5959923267364502, "learning_rate": 6.9183041115157165e-06, "loss": 0.08532092571258545, "step": 3290, "token_acc": 0.9698848724365234 }, { "epoch": 2.5210405508798774, "grad_norm": 0.552179217338562, "learning_rate": 6.8116486473377985e-06, "loss": 0.09567714929580688, "step": 3295, "token_acc": 0.966461718082428 }, { "epoch": 2.5248661055853097, "grad_norm": 0.8035470843315125, "learning_rate": 6.7057615814528514e-06, "loss": 0.11172772645950317, "step": 3300, "token_acc": 0.9609107375144958 }, { "epoch": 2.5248661055853097, "eval_loss": 0.5269036889076233, "eval_runtime": 8.3826, "eval_samples_per_second": 12.407, "eval_steps_per_second": 1.551, "eval_token_acc": 0.8761628866195679, "step": 3300 }, { "epoch": 2.5286916602907423, "grad_norm": 0.5826445817947388, "learning_rate": 6.600644797781847e-06, "loss": 0.09061547517776489, "step": 3305, "token_acc": 0.9684428572654724 }, { "epoch": 2.5325172149961745, "grad_norm": 0.6639491319656372, "learning_rate": 6.496300166541052e-06, "loss": 0.1045493245124817, "step": 3310, "token_acc": 0.9641888737678528 }, { "epoch": 2.5363427697016068, "grad_norm": 0.5682926177978516, "learning_rate": 6.392729544208758e-06, "loss": 0.10315026044845581, "step": 3315, "token_acc": 0.963904619216919 }, { "epoch": 2.540168324407039, "grad_norm": 0.6878834962844849, "learning_rate": 6.289934773492223e-06, "loss": 0.10737843513488769, "step": 3320, "token_acc": 0.963394284248352 }, { "epoch": 2.543993879112471, "grad_norm": 0.5965612530708313, "learning_rate": 6.1879176832949525e-06, "loss": 0.11070966720581055, "step": 3325, "token_acc": 0.9651868939399719 }, { "epoch": 2.5478194338179034, "grad_norm": 0.6844844818115234, "learning_rate": 6.086680088684105e-06, "loss": 0.10959099531173706, "step": 3330, "token_acc": 0.9614537358283997 }, { "epoch": 2.5516449885233357, "grad_norm": 0.5353488922119141, "learning_rate": 5.986223790858186e-06, "loss": 0.09058489799499511, "step": 3335, "token_acc": 0.9692246317863464 }, { "epoch": 2.5554705432287683, "grad_norm": 0.6746286749839783, "learning_rate": 5.886550577115069e-06, "loss": 0.1055182695388794, "step": 3340, "token_acc": 0.9636992812156677 }, { "epoch": 2.5592960979342005, "grad_norm": 0.5335373282432556, "learning_rate": 5.787662220820134e-06, "loss": 0.1255274772644043, "step": 3345, "token_acc": 0.9566043615341187 }, { "epoch": 2.5631216526396328, "grad_norm": 0.6528668403625488, "learning_rate": 5.689560481374734e-06, "loss": 0.10252002477645875, "step": 3350, "token_acc": 0.9639867544174194 }, { "epoch": 2.5631216526396328, "eval_loss": 0.5217230319976807, "eval_runtime": 8.1191, "eval_samples_per_second": 12.809, "eval_steps_per_second": 1.601, "eval_token_acc": 0.8769047260284424, "step": 3350 }, { "epoch": 2.566947207345065, "grad_norm": 0.49694639444351196, "learning_rate": 5.592247104184917e-06, "loss": 0.08688923120498657, "step": 3355, "token_acc": 0.9706814289093018 }, { "epoch": 2.570772762050497, "grad_norm": 0.5503761172294617, "learning_rate": 5.495723820630333e-06, "loss": 0.12382068634033203, "step": 3360, "token_acc": 0.9561320543289185 }, { "epoch": 2.57459831675593, "grad_norm": 0.6813068985939026, "learning_rate": 5.399992348033461e-06, "loss": 0.12225714921951295, "step": 3365, "token_acc": 0.9570099711418152 }, { "epoch": 2.5784238714613616, "grad_norm": 0.5871702432632446, "learning_rate": 5.305054389629022e-06, "loss": 0.07900494337081909, "step": 3370, "token_acc": 0.9720001220703125 }, { "epoch": 2.5822494261667943, "grad_norm": 0.7074242830276489, "learning_rate": 5.210911634533721e-06, "loss": 0.11348228454589844, "step": 3375, "token_acc": 0.9611703157424927 }, { "epoch": 2.5860749808722265, "grad_norm": 0.6286773085594177, "learning_rate": 5.117565757716158e-06, "loss": 0.11759569644927978, "step": 3380, "token_acc": 0.9579370617866516 }, { "epoch": 2.5899005355776588, "grad_norm": 0.6363070607185364, "learning_rate": 5.025018419967009e-06, "loss": 0.11911303997039795, "step": 3385, "token_acc": 0.9589115977287292 }, { "epoch": 2.593726090283091, "grad_norm": 0.6866349577903748, "learning_rate": 4.933271267869566e-06, "loss": 0.11872742176055909, "step": 3390, "token_acc": 0.9597334265708923 }, { "epoch": 2.597551644988523, "grad_norm": 0.5686379075050354, "learning_rate": 4.842325933770342e-06, "loss": 0.10091429948806763, "step": 3395, "token_acc": 0.9646428227424622 }, { "epoch": 2.601377199693956, "grad_norm": 0.5744697451591492, "learning_rate": 4.752184035750068e-06, "loss": 0.1112870454788208, "step": 3400, "token_acc": 0.9629582166671753 }, { "epoch": 2.601377199693956, "eval_loss": 0.5221067667007446, "eval_runtime": 7.949, "eval_samples_per_second": 13.083, "eval_steps_per_second": 1.635, "eval_token_acc": 0.8777067065238953, "step": 3400 }, { "epoch": 2.6052027543993876, "grad_norm": 0.5436497926712036, "learning_rate": 4.662847177594909e-06, "loss": 0.09204695224761963, "step": 3405, "token_acc": 0.9677549004554749 }, { "epoch": 2.6090283091048203, "grad_norm": 0.5940696001052856, "learning_rate": 4.5743169487679316e-06, "loss": 0.09365889430046082, "step": 3410, "token_acc": 0.9672086834907532 }, { "epoch": 2.6128538638102525, "grad_norm": 0.5806345343589783, "learning_rate": 4.486594924380838e-06, "loss": 0.07467930316925049, "step": 3415, "token_acc": 0.9740605354309082 }, { "epoch": 2.6166794185156848, "grad_norm": 0.6086448431015015, "learning_rate": 4.3996826651658775e-06, "loss": 0.09224212169647217, "step": 3420, "token_acc": 0.9681790471076965 }, { "epoch": 2.620504973221117, "grad_norm": 0.4966646432876587, "learning_rate": 4.313581717448156e-06, "loss": 0.08799538612365723, "step": 3425, "token_acc": 0.9687092304229736 }, { "epoch": 2.624330527926549, "grad_norm": 0.7006512880325317, "learning_rate": 4.228293613118089e-06, "loss": 0.10830029249191284, "step": 3430, "token_acc": 0.962169885635376 }, { "epoch": 2.628156082631982, "grad_norm": 0.7951710820198059, "learning_rate": 4.143819869604132e-06, "loss": 0.09951411485671997, "step": 3435, "token_acc": 0.9649299383163452 }, { "epoch": 2.631981637337414, "grad_norm": 0.6713584661483765, "learning_rate": 4.060161989845818e-06, "loss": 0.09943540692329407, "step": 3440, "token_acc": 0.9660786390304565 }, { "epoch": 2.6358071920428463, "grad_norm": 0.8555734753608704, "learning_rate": 3.977321462266998e-06, "loss": 0.12329368591308594, "step": 3445, "token_acc": 0.9588665962219238 }, { "epoch": 2.6396327467482785, "grad_norm": 0.7402066588401794, "learning_rate": 3.8952997607493325e-06, "loss": 0.1296180248260498, "step": 3450, "token_acc": 0.9544374942779541 }, { "epoch": 2.6396327467482785, "eval_loss": 0.5221165418624878, "eval_runtime": 7.8424, "eval_samples_per_second": 13.261, "eval_steps_per_second": 1.658, "eval_token_acc": 0.8774861693382263, "step": 3450 }, { "epoch": 2.6434583014537107, "grad_norm": 0.5311779975891113, "learning_rate": 3.814098344606143e-06, "loss": 0.08472838401794433, "step": 3455, "token_acc": 0.9710620045661926 }, { "epoch": 2.647283856159143, "grad_norm": 0.572284460067749, "learning_rate": 3.7337186585563732e-06, "loss": 0.08200944662094116, "step": 3460, "token_acc": 0.9718431234359741 }, { "epoch": 2.651109410864575, "grad_norm": 0.4984256327152252, "learning_rate": 3.654162132698918e-06, "loss": 0.10278162956237794, "step": 3465, "token_acc": 0.965274453163147 }, { "epoch": 2.654934965570008, "grad_norm": 0.5390318036079407, "learning_rate": 3.5754301824871605e-06, "loss": 0.10632505416870117, "step": 3470, "token_acc": 0.9644556641578674 }, { "epoch": 2.65876052027544, "grad_norm": 0.5882481336593628, "learning_rate": 3.497524208703834e-06, "loss": 0.10900474786758423, "step": 3475, "token_acc": 0.9621248841285706 }, { "epoch": 2.6625860749808723, "grad_norm": 0.6717934608459473, "learning_rate": 3.420445597436056e-06, "loss": 0.0886709749698639, "step": 3480, "token_acc": 0.9691559672355652 }, { "epoch": 2.6664116296863045, "grad_norm": 0.5694244503974915, "learning_rate": 3.344195720050658e-06, "loss": 0.09270554780960083, "step": 3485, "token_acc": 0.9656193852424622 }, { "epoch": 2.6702371843917367, "grad_norm": 0.7296086549758911, "learning_rate": 3.2687759331698375e-06, "loss": 0.10218125581741333, "step": 3490, "token_acc": 0.9648373126983643 }, { "epoch": 2.674062739097169, "grad_norm": 0.4986768662929535, "learning_rate": 3.194187578646979e-06, "loss": 0.09201115369796753, "step": 3495, "token_acc": 0.9665822982788086 }, { "epoch": 2.677888293802601, "grad_norm": 0.6790587306022644, "learning_rate": 3.120431983542793e-06, "loss": 0.10237842798233032, "step": 3500, "token_acc": 0.9661151170730591 }, { "epoch": 2.677888293802601, "eval_loss": 0.5228468179702759, "eval_runtime": 7.9645, "eval_samples_per_second": 13.058, "eval_steps_per_second": 1.632, "eval_token_acc": 0.8785387873649597, "step": 3500 }, { "epoch": 2.681713848508034, "grad_norm": 0.6572025418281555, "learning_rate": 3.047510460101705e-06, "loss": 0.13050510883331298, "step": 3505, "token_acc": 0.9555116295814514 }, { "epoch": 2.685539403213466, "grad_norm": 0.8115324378013611, "learning_rate": 2.9754243057285134e-06, "loss": 0.1264261245727539, "step": 3510, "token_acc": 0.956243634223938 }, { "epoch": 2.6893649579188983, "grad_norm": 0.5161707401275635, "learning_rate": 2.9041748029652927e-06, "loss": 0.08881696462631225, "step": 3515, "token_acc": 0.9682623147964478 }, { "epoch": 2.6931905126243305, "grad_norm": 0.5522788763046265, "learning_rate": 2.8337632194685993e-06, "loss": 0.08286306858062745, "step": 3520, "token_acc": 0.9708802700042725 }, { "epoch": 2.6970160673297627, "grad_norm": 0.5946321487426758, "learning_rate": 2.7641908079868827e-06, "loss": 0.10248844623565674, "step": 3525, "token_acc": 0.9636382460594177 }, { "epoch": 2.700841622035195, "grad_norm": 0.6317991018295288, "learning_rate": 2.69545880633823e-06, "loss": 0.10524777173995972, "step": 3530, "token_acc": 0.9621507525444031 }, { "epoch": 2.704667176740627, "grad_norm": 0.41846737265586853, "learning_rate": 2.627568437388306e-06, "loss": 0.08343310356140136, "step": 3535, "token_acc": 0.970815122127533 }, { "epoch": 2.70849273144606, "grad_norm": 0.592873752117157, "learning_rate": 2.560520909028663e-06, "loss": 0.08635797500610351, "step": 3540, "token_acc": 0.9700879454612732 }, { "epoch": 2.712318286151492, "grad_norm": 0.5590534210205078, "learning_rate": 2.4943174141551674e-06, "loss": 0.10181926488876343, "step": 3545, "token_acc": 0.9652162194252014 }, { "epoch": 2.7161438408569243, "grad_norm": 0.5901391506195068, "learning_rate": 2.428959130646824e-06, "loss": 0.09749918580055236, "step": 3550, "token_acc": 0.9646121263504028 }, { "epoch": 2.7161438408569243, "eval_loss": 0.5235512256622314, "eval_runtime": 7.8855, "eval_samples_per_second": 13.189, "eval_steps_per_second": 1.649, "eval_token_acc": 0.8786590695381165, "step": 3550 }, { "epoch": 2.7199693955623565, "grad_norm": 0.5816419720649719, "learning_rate": 2.364447221344812e-06, "loss": 0.12211033105850219, "step": 3555, "token_acc": 0.9581829905509949 }, { "epoch": 2.7237949502677887, "grad_norm": 0.6168470978736877, "learning_rate": 2.3007828340318114e-06, "loss": 0.09811439514160156, "step": 3560, "token_acc": 0.9663928151130676 }, { "epoch": 2.7276205049732214, "grad_norm": 0.599656343460083, "learning_rate": 2.237967101411531e-06, "loss": 0.12740142345428468, "step": 3565, "token_acc": 0.9561182260513306 }, { "epoch": 2.731446059678653, "grad_norm": 0.6238080263137817, "learning_rate": 2.1760011410886126e-06, "loss": 0.09838619828224182, "step": 3570, "token_acc": 0.9653590321540833 }, { "epoch": 2.735271614384086, "grad_norm": 0.5564831495285034, "learning_rate": 2.1148860555487204e-06, "loss": 0.09222927689552307, "step": 3575, "token_acc": 0.9685646891593933 }, { "epoch": 2.739097169089518, "grad_norm": 0.6360819935798645, "learning_rate": 2.0546229321389278e-06, "loss": 0.09308220148086548, "step": 3580, "token_acc": 0.9680613279342651 }, { "epoch": 2.7429227237949503, "grad_norm": 0.5651523470878601, "learning_rate": 1.995212843048372e-06, "loss": 0.09616876244544983, "step": 3585, "token_acc": 0.9660496115684509 }, { "epoch": 2.7467482785003825, "grad_norm": 0.6321117877960205, "learning_rate": 1.93665684528917e-06, "loss": 0.09454690217971802, "step": 3590, "token_acc": 0.9675334692001343 }, { "epoch": 2.7505738332058147, "grad_norm": 0.5536521077156067, "learning_rate": 1.878955980677638e-06, "loss": 0.07992898225784302, "step": 3595, "token_acc": 0.9721735119819641 }, { "epoch": 2.7543993879112474, "grad_norm": 0.688173770904541, "learning_rate": 1.82211127581573e-06, "loss": 0.09609293937683105, "step": 3600, "token_acc": 0.9671096205711365 }, { "epoch": 2.7543993879112474, "eval_loss": 0.5215653777122498, "eval_runtime": 8.0108, "eval_samples_per_second": 12.982, "eval_steps_per_second": 1.623, "eval_token_acc": 0.8788695931434631, "step": 3600 }, { "epoch": 2.758224942616679, "grad_norm": 0.6505938768386841, "learning_rate": 1.7661237420727784e-06, "loss": 0.1013750433921814, "step": 3605, "token_acc": 0.9644123315811157 }, { "epoch": 2.762050497322112, "grad_norm": 0.5934735536575317, "learning_rate": 1.710994375567504e-06, "loss": 0.0851688802242279, "step": 3610, "token_acc": 0.9705018997192383 }, { "epoch": 2.765876052027544, "grad_norm": 0.6007834076881409, "learning_rate": 1.6567241571502912e-06, "loss": 0.07638438940048217, "step": 3615, "token_acc": 0.9712318778038025 }, { "epoch": 2.7697016067329763, "grad_norm": 0.5481213927268982, "learning_rate": 1.6033140523857404e-06, "loss": 0.09145662784576417, "step": 3620, "token_acc": 0.9675630927085876 }, { "epoch": 2.7735271614384085, "grad_norm": 0.6200750470161438, "learning_rate": 1.5507650115354877e-06, "loss": 0.10738480091094971, "step": 3625, "token_acc": 0.9640287756919861 }, { "epoch": 2.7773527161438407, "grad_norm": 0.6538658142089844, "learning_rate": 1.499077969541307e-06, "loss": 0.10229132175445557, "step": 3630, "token_acc": 0.9641778469085693 }, { "epoch": 2.7811782708492734, "grad_norm": 1.8193166255950928, "learning_rate": 1.4482538460084293e-06, "loss": 0.13732895851135254, "step": 3635, "token_acc": 0.958136796951294 }, { "epoch": 2.785003825554705, "grad_norm": 0.5257523655891418, "learning_rate": 1.3982935451892498e-06, "loss": 0.08640526533126831, "step": 3640, "token_acc": 0.971260130405426 }, { "epoch": 2.788829380260138, "grad_norm": 0.568705141544342, "learning_rate": 1.3491979559672075e-06, "loss": 0.08791974782943726, "step": 3645, "token_acc": 0.9699133038520813 }, { "epoch": 2.79265493496557, "grad_norm": 0.5045759081840515, "learning_rate": 1.3009679518409479e-06, "loss": 0.07553626298904419, "step": 3650, "token_acc": 0.9740194082260132 }, { "epoch": 2.79265493496557, "eval_loss": 0.5219829678535461, "eval_runtime": 8.0288, "eval_samples_per_second": 12.953, "eval_steps_per_second": 1.619, "eval_token_acc": 0.8788595795631409, "step": 3650 }, { "epoch": 2.7964804896710023, "grad_norm": 0.610518217086792, "learning_rate": 1.2536043909088191e-06, "loss": 0.10455150604248047, "step": 3655, "token_acc": 0.9636396765708923 }, { "epoch": 2.8003060443764345, "grad_norm": 0.5319099426269531, "learning_rate": 1.2071081158535947e-06, "loss": 0.08882582187652588, "step": 3660, "token_acc": 0.968651294708252 }, { "epoch": 2.8041315990818667, "grad_norm": 0.6065900325775146, "learning_rate": 1.1614799539274634e-06, "loss": 0.08307374119758607, "step": 3665, "token_acc": 0.9706868529319763 }, { "epoch": 2.8079571537872994, "grad_norm": 0.6401634812355042, "learning_rate": 1.1167207169373195e-06, "loss": 0.09725141525268555, "step": 3670, "token_acc": 0.9657084941864014 }, { "epoch": 2.8117827084927316, "grad_norm": 0.524497389793396, "learning_rate": 1.0728312012303454e-06, "loss": 0.11780104637145997, "step": 3675, "token_acc": 0.960728108882904 }, { "epoch": 2.815608263198164, "grad_norm": 0.7346832156181335, "learning_rate": 1.0298121876797962e-06, "loss": 0.11407887935638428, "step": 3680, "token_acc": 0.9612630605697632 }, { "epoch": 2.819433817903596, "grad_norm": 0.6890755295753479, "learning_rate": 9.876644416711488e-07, "loss": 0.11829521656036376, "step": 3685, "token_acc": 0.9585215449333191 }, { "epoch": 2.8232593726090283, "grad_norm": 0.5342867970466614, "learning_rate": 9.46388713088453e-07, "loss": 0.09410252571105956, "step": 3690, "token_acc": 0.9661674499511719 }, { "epoch": 2.8270849273144605, "grad_norm": 0.4889836311340332, "learning_rate": 9.059857363010183e-07, "loss": 0.09603096842765808, "step": 3695, "token_acc": 0.965887188911438 }, { "epoch": 2.8309104820198927, "grad_norm": 0.5685746073722839, "learning_rate": 8.664562301503143e-07, "loss": 0.08459590077400207, "step": 3700, "token_acc": 0.9699506163597107 }, { "epoch": 2.8309104820198927, "eval_loss": 0.5205320119857788, "eval_runtime": 7.8427, "eval_samples_per_second": 13.261, "eval_steps_per_second": 1.658, "eval_token_acc": 0.8790299892425537, "step": 3700 }, { "epoch": 2.8347360367253254, "grad_norm": 0.5299521684646606, "learning_rate": 8.278008979372087e-07, "loss": 0.09127166271209716, "step": 3705, "token_acc": 0.9684864282608032 }, { "epoch": 2.8385615914307576, "grad_norm": 0.4766036868095398, "learning_rate": 7.900204274094602e-07, "loss": 0.09881120324134826, "step": 3710, "token_acc": 0.9655190706253052 }, { "epoch": 2.84238714613619, "grad_norm": 8.799799919128418, "learning_rate": 7.531154907494397e-07, "loss": 0.13544522523880004, "step": 3715, "token_acc": 0.9555306434631348 }, { "epoch": 2.846212700841622, "grad_norm": 0.563325822353363, "learning_rate": 7.170867445622287e-07, "loss": 0.10241570472717285, "step": 3720, "token_acc": 0.9647788405418396 }, { "epoch": 2.8500382555470543, "grad_norm": 0.6075456142425537, "learning_rate": 6.819348298638839e-07, "loss": 0.12761690616607665, "step": 3725, "token_acc": 0.9584816098213196 }, { "epoch": 2.8538638102524865, "grad_norm": 0.6337462663650513, "learning_rate": 6.476603720700636e-07, "loss": 0.09158645272254944, "step": 3730, "token_acc": 0.9687730669975281 }, { "epoch": 2.8576893649579187, "grad_norm": 0.5899404287338257, "learning_rate": 6.142639809849027e-07, "loss": 0.09597094655036927, "step": 3735, "token_acc": 0.9665765762329102 }, { "epoch": 2.8615149196633514, "grad_norm": 0.5653353929519653, "learning_rate": 5.817462507901383e-07, "loss": 0.10877490043640137, "step": 3740, "token_acc": 0.9619103074073792 }, { "epoch": 2.8653404743687836, "grad_norm": 0.49452540278434753, "learning_rate": 5.501077600345572e-07, "loss": 0.08857889175415039, "step": 3745, "token_acc": 0.9700949192047119 }, { "epoch": 2.869166029074216, "grad_norm": 0.731597900390625, "learning_rate": 5.193490716237037e-07, "loss": 0.12281218767166138, "step": 3750, "token_acc": 0.9560735821723938 }, { "epoch": 2.869166029074216, "eval_loss": 0.5206364989280701, "eval_runtime": 9.2942, "eval_samples_per_second": 11.19, "eval_steps_per_second": 1.399, "eval_token_acc": 0.879270613193512, "step": 3750 }, { "epoch": 2.872991583779648, "grad_norm": 0.6116617321968079, "learning_rate": 4.894707328098602e-07, "loss": 0.11083317995071411, "step": 3755, "token_acc": 0.9610885977745056 }, { "epoch": 2.8768171384850802, "grad_norm": 0.5174733400344849, "learning_rate": 4.6047327518230485e-07, "loss": 0.08961974382400513, "step": 3760, "token_acc": 0.9690099954605103 }, { "epoch": 2.8806426931905125, "grad_norm": 0.5262379050254822, "learning_rate": 4.3235721465784697e-07, "loss": 0.09585506916046142, "step": 3765, "token_acc": 0.9667736887931824 }, { "epoch": 2.8844682478959447, "grad_norm": 0.5788334012031555, "learning_rate": 4.0512305147167863e-07, "loss": 0.08268014192581177, "step": 3770, "token_acc": 0.9712512493133545 }, { "epoch": 2.8882938026013774, "grad_norm": 0.687783420085907, "learning_rate": 3.787712701684598e-07, "loss": 0.08984529376029968, "step": 3775, "token_acc": 0.9686997532844543 }, { "epoch": 2.8921193573068096, "grad_norm": 0.6016952395439148, "learning_rate": 3.5330233959365853e-07, "loss": 0.09222807884216308, "step": 3780, "token_acc": 0.9685728549957275 }, { "epoch": 2.895944912012242, "grad_norm": 0.5089208483695984, "learning_rate": 3.2871671288528525e-07, "loss": 0.09786663055419922, "step": 3785, "token_acc": 0.9665623903274536 }, { "epoch": 2.899770466717674, "grad_norm": 1.769921898841858, "learning_rate": 3.050148274657549e-07, "loss": 0.12438170909881592, "step": 3790, "token_acc": 0.9624179601669312 }, { "epoch": 2.9035960214231062, "grad_norm": 0.5424771904945374, "learning_rate": 2.821971050341654e-07, "loss": 0.0890495777130127, "step": 3795, "token_acc": 0.9703425765037537 }, { "epoch": 2.907421576128539, "grad_norm": 0.5487825274467468, "learning_rate": 2.6026395155874795e-07, "loss": 0.10370445251464844, "step": 3800, "token_acc": 0.9638125896453857 }, { "epoch": 2.907421576128539, "eval_loss": 0.5206490159034729, "eval_runtime": 8.3112, "eval_samples_per_second": 12.513, "eval_steps_per_second": 1.564, "eval_token_acc": 0.8794209361076355, "step": 3800 }, { "epoch": 2.9112471308339707, "grad_norm": 0.5681285262107849, "learning_rate": 2.3921575726967846e-07, "loss": 0.09305150508880615, "step": 3805, "token_acc": 0.9688363075256348 }, { "epoch": 2.9150726855394034, "grad_norm": 0.4438033103942871, "learning_rate": 2.1905289665211104e-07, "loss": 0.08973047733306885, "step": 3810, "token_acc": 0.9688341021537781 }, { "epoch": 2.9188982402448356, "grad_norm": 0.5287227630615234, "learning_rate": 1.9977572843953296e-07, "loss": 0.07862873077392578, "step": 3815, "token_acc": 0.9715408086776733 }, { "epoch": 2.922723794950268, "grad_norm": 0.5739708542823792, "learning_rate": 1.8138459560735899e-07, "loss": 0.08315033316612244, "step": 3820, "token_acc": 0.9718932509422302 }, { "epoch": 2.9265493496557, "grad_norm": 0.6123870611190796, "learning_rate": 1.638798253668694e-07, "loss": 0.125601065158844, "step": 3825, "token_acc": 0.9556345343589783 }, { "epoch": 2.9303749043611322, "grad_norm": 0.6285126209259033, "learning_rate": 1.4726172915933146e-07, "loss": 0.09772306680679321, "step": 3830, "token_acc": 0.9654306769371033 }, { "epoch": 2.934200459066565, "grad_norm": 0.4770904779434204, "learning_rate": 1.315306026505092e-07, "loss": 0.0937896728515625, "step": 3835, "token_acc": 0.9662994146347046 }, { "epoch": 2.9380260137719967, "grad_norm": 0.4980320632457733, "learning_rate": 1.1668672572539008e-07, "loss": 0.08644679784774781, "step": 3840, "token_acc": 0.969020664691925 }, { "epoch": 2.9418515684774293, "grad_norm": 0.5362405180931091, "learning_rate": 1.0273036248318324e-07, "loss": 0.08760695457458496, "step": 3845, "token_acc": 0.9707760214805603 }, { "epoch": 2.9456771231828616, "grad_norm": 0.4886132776737213, "learning_rate": 8.966176123264003e-08, "loss": 0.06749528646469116, "step": 3850, "token_acc": 0.9768878221511841 }, { "epoch": 2.9456771231828616, "eval_loss": 0.5208922028541565, "eval_runtime": 8.161, "eval_samples_per_second": 12.743, "eval_steps_per_second": 1.593, "eval_token_acc": 0.8793407678604126, "step": 3850 }, { "epoch": 2.949502677888294, "grad_norm": 0.5290758013725281, "learning_rate": 7.748115448763526e-08, "loss": 0.07928290963172913, "step": 3855, "token_acc": 0.971563458442688 }, { "epoch": 2.953328232593726, "grad_norm": 0.6795271039009094, "learning_rate": 6.618875896303167e-08, "loss": 0.10474317073822022, "step": 3860, "token_acc": 0.9640142321586609 }, { "epoch": 2.9571537872991582, "grad_norm": 0.6599166989326477, "learning_rate": 5.578477557081074e-08, "loss": 0.10668476819992065, "step": 3865, "token_acc": 0.9629032015800476 }, { "epoch": 2.960979342004591, "grad_norm": 0.6517552733421326, "learning_rate": 4.6269389416514486e-08, "loss": 0.08918753862380982, "step": 3870, "token_acc": 0.9688775539398193 }, { "epoch": 2.964804896710023, "grad_norm": 0.6627753376960754, "learning_rate": 3.764276979593695e-08, "loss": 0.08152820467948914, "step": 3875, "token_acc": 0.9715802669525146 }, { "epoch": 2.9686304514154553, "grad_norm": 0.5488728284835815, "learning_rate": 2.990507019213218e-08, "loss": 0.08794408440589904, "step": 3880, "token_acc": 0.9700236916542053 }, { "epoch": 2.9724560061208876, "grad_norm": 0.5994005799293518, "learning_rate": 2.305642827266641e-08, "loss": 0.10513956546783447, "step": 3885, "token_acc": 0.9652788639068604 }, { "epoch": 2.97628156082632, "grad_norm": 0.5402779579162598, "learning_rate": 1.7096965887164475e-08, "loss": 0.10320125818252564, "step": 3890, "token_acc": 0.964747428894043 }, { "epoch": 2.980107115531752, "grad_norm": 0.5638807415962219, "learning_rate": 1.2026789065167077e-08, "loss": 0.09008901119232178, "step": 3895, "token_acc": 0.9677461385726929 }, { "epoch": 2.9839326702371842, "grad_norm": 0.6424400806427002, "learning_rate": 7.845988014215655e-09, "loss": 0.09886548519134522, "step": 3900, "token_acc": 0.9671627879142761 }, { "epoch": 2.9839326702371842, "eval_loss": 0.5208696126937866, "eval_runtime": 8.2236, "eval_samples_per_second": 12.647, "eval_steps_per_second": 1.581, "eval_token_acc": 0.8792405128479004, "step": 3900 }, { "epoch": 2.987758224942617, "grad_norm": 0.6108574867248535, "learning_rate": 4.554637118270311e-09, "loss": 0.10293105840682984, "step": 3905, "token_acc": 0.9645171165466309 }, { "epoch": 2.991583779648049, "grad_norm": 0.5026504993438721, "learning_rate": 2.1527949363664425e-09, "loss": 0.1074068307876587, "step": 3910, "token_acc": 0.9619331359863281 }, { "epoch": 2.9954093343534813, "grad_norm": 0.6875292658805847, "learning_rate": 6.405042015877882e-10, "loss": 0.11073212623596192, "step": 3915, "token_acc": 0.9605428576469421 }, { "epoch": 2.9992348890589136, "grad_norm": 0.6482424139976501, "learning_rate": 1.7791820305923523e-11, "loss": 0.11924041509628296, "step": 3920, "token_acc": 0.9589547514915466 }, { "epoch": 3.0, "eval_loss": 0.5209956765174866, "eval_runtime": 8.1308, "eval_samples_per_second": 12.791, "eval_steps_per_second": 1.599, "eval_token_acc": 0.8794109225273132, "step": 3921 } ], "logging_steps": 5, "max_steps": 3921, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.411019928798757e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }