{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.4, "eval_steps": 500, "global_step": 74000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 1.3247226476669312, "learning_rate": 9.99899989999e-05, "loss": 7.6291, "mean_token_accuracy": 0.5622441053390503, "step": 20 }, { "epoch": 0.004, "grad_norm": 6.80141544342041, "learning_rate": 9.996999699969998e-05, "loss": 4.8702, "mean_token_accuracy": 0.5753256440162658, "step": 40 }, { "epoch": 0.006, "grad_norm": 7.3839287757873535, "learning_rate": 9.994999499949995e-05, "loss": 2.953, "mean_token_accuracy": 0.6048817723989487, "step": 60 }, { "epoch": 0.008, "grad_norm": 6.373369216918945, "learning_rate": 9.992999299929993e-05, "loss": 2.4012, "mean_token_accuracy": 0.6089199587702752, "step": 80 }, { "epoch": 0.01, "grad_norm": 3.7571306228637695, "learning_rate": 9.990999099909991e-05, "loss": 2.3367, "mean_token_accuracy": 0.6371296495199203, "step": 100 }, { "epoch": 0.012, "grad_norm": 7.018854141235352, "learning_rate": 9.98899889988999e-05, "loss": 2.289, "mean_token_accuracy": 0.6311097115278244, "step": 120 }, { "epoch": 0.014, "grad_norm": 2.465070962905884, "learning_rate": 9.986998699869987e-05, "loss": 2.6102, "mean_token_accuracy": 0.6210418313741684, "step": 140 }, { "epoch": 0.016, "grad_norm": 15.181129455566406, "learning_rate": 9.984998499849985e-05, "loss": 2.171, "mean_token_accuracy": 0.6348705053329468, "step": 160 }, { "epoch": 0.018, "grad_norm": 5.408708095550537, "learning_rate": 9.982998299829983e-05, "loss": 2.1018, "mean_token_accuracy": 0.6446641147136688, "step": 180 }, { "epoch": 0.02, "grad_norm": 4.022862911224365, "learning_rate": 9.980998099809982e-05, "loss": 2.063, "mean_token_accuracy": 0.6453554481267929, "step": 200 }, { "epoch": 0.022, "grad_norm": 5.344348430633545, "learning_rate": 9.978997899789979e-05, "loss": 2.1544, "mean_token_accuracy": 0.6231616824865341, "step": 220 }, { "epoch": 0.024, "grad_norm": 4.067018985748291, "learning_rate": 9.976997699769978e-05, "loss": 1.9694, "mean_token_accuracy": 0.6363441944122314, "step": 240 }, { "epoch": 0.026, "grad_norm": 8.169196128845215, "learning_rate": 9.974997499749975e-05, "loss": 2.035, "mean_token_accuracy": 0.6404319405555725, "step": 260 }, { "epoch": 0.028, "grad_norm": 4.658342361450195, "learning_rate": 9.972997299729974e-05, "loss": 1.9551, "mean_token_accuracy": 0.6172670423984528, "step": 280 }, { "epoch": 0.03, "grad_norm": 2.634455680847168, "learning_rate": 9.97099709970997e-05, "loss": 1.7537, "mean_token_accuracy": 0.6480552941560745, "step": 300 }, { "epoch": 0.032, "grad_norm": 3.7653615474700928, "learning_rate": 9.96899689968997e-05, "loss": 2.1466, "mean_token_accuracy": 0.6246859103441238, "step": 320 }, { "epoch": 0.034, "grad_norm": 3.5279033184051514, "learning_rate": 9.966996699669967e-05, "loss": 1.9004, "mean_token_accuracy": 0.6284026026725769, "step": 340 }, { "epoch": 0.036, "grad_norm": 4.281381130218506, "learning_rate": 9.964996499649965e-05, "loss": 1.9505, "mean_token_accuracy": 0.5965759262442589, "step": 360 }, { "epoch": 0.038, "grad_norm": 4.235584735870361, "learning_rate": 9.962996299629964e-05, "loss": 1.8555, "mean_token_accuracy": 0.6280817478895188, "step": 380 }, { "epoch": 0.04, "grad_norm": 2.792436361312866, "learning_rate": 9.960996099609962e-05, "loss": 2.0169, "mean_token_accuracy": 0.627738681435585, "step": 400 }, { "epoch": 0.042, "grad_norm": 3.105224609375, "learning_rate": 9.958995899589959e-05, "loss": 1.9972, "mean_token_accuracy": 0.6322987288236618, "step": 420 }, { "epoch": 0.044, "grad_norm": 2.7474701404571533, "learning_rate": 9.956995699569957e-05, "loss": 1.9674, "mean_token_accuracy": 0.6358790665864944, "step": 440 }, { "epoch": 0.046, "grad_norm": 3.885836601257324, "learning_rate": 9.954995499549956e-05, "loss": 1.9392, "mean_token_accuracy": 0.5877051010727883, "step": 460 }, { "epoch": 0.048, "grad_norm": 47.457637786865234, "learning_rate": 9.952995299529954e-05, "loss": 1.9568, "mean_token_accuracy": 0.5834626913070678, "step": 480 }, { "epoch": 0.05, "grad_norm": 10.007217407226562, "learning_rate": 9.950995099509951e-05, "loss": 2.5362, "mean_token_accuracy": 0.2511857017641887, "step": 500 }, { "epoch": 0.052, "grad_norm": 3.7258198261260986, "learning_rate": 9.948994899489949e-05, "loss": 2.467, "mean_token_accuracy": 0.09096106961369514, "step": 520 }, { "epoch": 0.054, "grad_norm": 4.4112725257873535, "learning_rate": 9.946994699469947e-05, "loss": 2.3343, "mean_token_accuracy": 0.11090310662984848, "step": 540 }, { "epoch": 0.056, "grad_norm": 2.269949197769165, "learning_rate": 9.944994499449946e-05, "loss": 2.4446, "mean_token_accuracy": 0.12855769135057926, "step": 560 }, { "epoch": 0.058, "grad_norm": 4.933803081512451, "learning_rate": 9.942994299429943e-05, "loss": 2.1624, "mean_token_accuracy": 0.2825122267007828, "step": 580 }, { "epoch": 0.06, "grad_norm": 2.6434617042541504, "learning_rate": 9.940994099409941e-05, "loss": 1.9063, "mean_token_accuracy": 0.5449647575616836, "step": 600 }, { "epoch": 0.062, "grad_norm": 3.638512372970581, "learning_rate": 9.938993899389939e-05, "loss": 1.8515, "mean_token_accuracy": 0.5953349024057388, "step": 620 }, { "epoch": 0.064, "grad_norm": 3.0906410217285156, "learning_rate": 9.936993699369938e-05, "loss": 1.9479, "mean_token_accuracy": 0.60838782787323, "step": 640 }, { "epoch": 0.066, "grad_norm": 2.830639362335205, "learning_rate": 9.934993499349935e-05, "loss": 1.9313, "mean_token_accuracy": 0.5938417404890061, "step": 660 }, { "epoch": 0.068, "grad_norm": 3.6546099185943604, "learning_rate": 9.932993299329933e-05, "loss": 1.6726, "mean_token_accuracy": 0.6079109936952591, "step": 680 }, { "epoch": 0.07, "grad_norm": 2.6105642318725586, "learning_rate": 9.930993099309931e-05, "loss": 1.7657, "mean_token_accuracy": 0.6147334307432175, "step": 700 }, { "epoch": 0.072, "grad_norm": 2.0112414360046387, "learning_rate": 9.92899289928993e-05, "loss": 2.0182, "mean_token_accuracy": 0.6062263309955597, "step": 720 }, { "epoch": 0.074, "grad_norm": 2.451645851135254, "learning_rate": 9.926992699269926e-05, "loss": 1.6976, "mean_token_accuracy": 0.6402467161417007, "step": 740 }, { "epoch": 0.076, "grad_norm": 2.6577961444854736, "learning_rate": 9.924992499249926e-05, "loss": 1.7386, "mean_token_accuracy": 0.6098492950201034, "step": 760 }, { "epoch": 0.078, "grad_norm": 1.909602403640747, "learning_rate": 9.922992299229924e-05, "loss": 1.7549, "mean_token_accuracy": 0.6311674267053604, "step": 780 }, { "epoch": 0.08, "grad_norm": 1.5352959632873535, "learning_rate": 9.920992099209921e-05, "loss": 1.6826, "mean_token_accuracy": 0.6240277171134949, "step": 800 }, { "epoch": 0.082, "grad_norm": 2.6040313243865967, "learning_rate": 9.91899189918992e-05, "loss": 1.7481, "mean_token_accuracy": 0.5903356015682221, "step": 820 }, { "epoch": 0.084, "grad_norm": 3.2531189918518066, "learning_rate": 9.916991699169918e-05, "loss": 1.7941, "mean_token_accuracy": 0.6164201036095619, "step": 840 }, { "epoch": 0.086, "grad_norm": 2.6445322036743164, "learning_rate": 9.914991499149916e-05, "loss": 1.7489, "mean_token_accuracy": 0.6177029103040695, "step": 860 }, { "epoch": 0.088, "grad_norm": 2.760507345199585, "learning_rate": 9.912991299129913e-05, "loss": 1.6907, "mean_token_accuracy": 0.6256379395723343, "step": 880 }, { "epoch": 0.09, "grad_norm": 2.801820755004883, "learning_rate": 9.910991099109911e-05, "loss": 1.8128, "mean_token_accuracy": 0.6368381589651108, "step": 900 }, { "epoch": 0.092, "grad_norm": 1.8368525505065918, "learning_rate": 9.90899089908991e-05, "loss": 1.7363, "mean_token_accuracy": 0.609622910618782, "step": 920 }, { "epoch": 0.094, "grad_norm": 1.7642139196395874, "learning_rate": 9.906990699069908e-05, "loss": 1.9077, "mean_token_accuracy": 0.6064844191074371, "step": 940 }, { "epoch": 0.096, "grad_norm": 1.9125126600265503, "learning_rate": 9.904990499049905e-05, "loss": 1.8401, "mean_token_accuracy": 0.6358690887689591, "step": 960 }, { "epoch": 0.098, "grad_norm": 2.282369613647461, "learning_rate": 9.902990299029903e-05, "loss": 1.8463, "mean_token_accuracy": 0.6356282562017441, "step": 980 }, { "epoch": 0.1, "grad_norm": 2.997248649597168, "learning_rate": 9.900990099009902e-05, "loss": 1.6731, "mean_token_accuracy": 0.6338737607002258, "step": 1000 }, { "epoch": 0.102, "grad_norm": 1.5888018608093262, "learning_rate": 9.8989898989899e-05, "loss": 1.7853, "mean_token_accuracy": 0.6258419632911683, "step": 1020 }, { "epoch": 0.104, "grad_norm": 3.112539291381836, "learning_rate": 9.896989698969897e-05, "loss": 1.7123, "mean_token_accuracy": 0.626197999715805, "step": 1040 }, { "epoch": 0.106, "grad_norm": 2.167492151260376, "learning_rate": 9.894989498949897e-05, "loss": 1.7547, "mean_token_accuracy": 0.6386072188615799, "step": 1060 }, { "epoch": 0.108, "grad_norm": 4.3419108390808105, "learning_rate": 9.892989298929894e-05, "loss": 1.7642, "mean_token_accuracy": 0.6236382931470871, "step": 1080 }, { "epoch": 0.11, "grad_norm": 1.7134941816329956, "learning_rate": 9.890989098909892e-05, "loss": 1.564, "mean_token_accuracy": 0.6248876959085464, "step": 1100 }, { "epoch": 0.112, "grad_norm": 2.7101199626922607, "learning_rate": 9.888988898889889e-05, "loss": 1.6065, "mean_token_accuracy": 0.6259982317686081, "step": 1120 }, { "epoch": 0.114, "grad_norm": 2.0855743885040283, "learning_rate": 9.886988698869888e-05, "loss": 1.7116, "mean_token_accuracy": 0.619007807970047, "step": 1140 }, { "epoch": 0.116, "grad_norm": 2.656412124633789, "learning_rate": 9.884988498849885e-05, "loss": 1.6564, "mean_token_accuracy": 0.6278261929750443, "step": 1160 }, { "epoch": 0.118, "grad_norm": 1.3344535827636719, "learning_rate": 9.882988298829884e-05, "loss": 1.6282, "mean_token_accuracy": 0.6419837027788162, "step": 1180 }, { "epoch": 0.12, "grad_norm": 2.2939035892486572, "learning_rate": 9.88098809880988e-05, "loss": 1.7053, "mean_token_accuracy": 0.6344074100255966, "step": 1200 }, { "epoch": 0.122, "grad_norm": 1.3304641246795654, "learning_rate": 9.87898789878988e-05, "loss": 1.6659, "mean_token_accuracy": 0.6374082803726197, "step": 1220 }, { "epoch": 0.124, "grad_norm": 2.9824750423431396, "learning_rate": 9.876987698769877e-05, "loss": 1.6014, "mean_token_accuracy": 0.6201750040054321, "step": 1240 }, { "epoch": 0.126, "grad_norm": 8.271860122680664, "learning_rate": 9.874987498749876e-05, "loss": 1.6129, "mean_token_accuracy": 0.6188908606767655, "step": 1260 }, { "epoch": 0.128, "grad_norm": 1.8335728645324707, "learning_rate": 9.872987298729874e-05, "loss": 1.5983, "mean_token_accuracy": 0.6181432873010635, "step": 1280 }, { "epoch": 0.13, "grad_norm": 2.2508862018585205, "learning_rate": 9.870987098709872e-05, "loss": 1.8071, "mean_token_accuracy": 0.6118679791688919, "step": 1300 }, { "epoch": 0.132, "grad_norm": 2.8468539714813232, "learning_rate": 9.868986898689869e-05, "loss": 1.6655, "mean_token_accuracy": 0.6004094183444977, "step": 1320 }, { "epoch": 0.134, "grad_norm": 2.693953514099121, "learning_rate": 9.866986698669867e-05, "loss": 1.6205, "mean_token_accuracy": 0.6203346371650695, "step": 1340 }, { "epoch": 0.136, "grad_norm": 2.040891408920288, "learning_rate": 9.864986498649866e-05, "loss": 1.7603, "mean_token_accuracy": 0.6026186913251876, "step": 1360 }, { "epoch": 0.138, "grad_norm": 2.0439929962158203, "learning_rate": 9.862986298629864e-05, "loss": 1.554, "mean_token_accuracy": 0.6221558958292007, "step": 1380 }, { "epoch": 0.14, "grad_norm": 1.7837157249450684, "learning_rate": 9.860986098609861e-05, "loss": 1.6034, "mean_token_accuracy": 0.6144555568695068, "step": 1400 }, { "epoch": 0.142, "grad_norm": 1.426424503326416, "learning_rate": 9.858985898589859e-05, "loss": 1.62, "mean_token_accuracy": 0.6326490581035614, "step": 1420 }, { "epoch": 0.144, "grad_norm": 2.521559476852417, "learning_rate": 9.856985698569858e-05, "loss": 1.5613, "mean_token_accuracy": 0.6357426166534423, "step": 1440 }, { "epoch": 0.146, "grad_norm": 2.962573289871216, "learning_rate": 9.854985498549856e-05, "loss": 1.6002, "mean_token_accuracy": 0.6064192622900009, "step": 1460 }, { "epoch": 0.148, "grad_norm": 1.4009008407592773, "learning_rate": 9.852985298529853e-05, "loss": 1.6675, "mean_token_accuracy": 0.6386959582567215, "step": 1480 }, { "epoch": 0.15, "grad_norm": 1.6275063753128052, "learning_rate": 9.850985098509851e-05, "loss": 1.49, "mean_token_accuracy": 0.6391436487436295, "step": 1500 }, { "epoch": 0.152, "grad_norm": 2.574467658996582, "learning_rate": 9.84898489848985e-05, "loss": 1.5473, "mean_token_accuracy": 0.6366174548864365, "step": 1520 }, { "epoch": 0.154, "grad_norm": 2.0260791778564453, "learning_rate": 9.846984698469848e-05, "loss": 1.5141, "mean_token_accuracy": 0.6452589422464371, "step": 1540 }, { "epoch": 0.156, "grad_norm": 1.5415323972702026, "learning_rate": 9.844984498449845e-05, "loss": 1.6989, "mean_token_accuracy": 0.6273437440395355, "step": 1560 }, { "epoch": 0.158, "grad_norm": 1.7570462226867676, "learning_rate": 9.842984298429844e-05, "loss": 1.6772, "mean_token_accuracy": 0.6357504665851593, "step": 1580 }, { "epoch": 0.16, "grad_norm": 2.5721936225891113, "learning_rate": 9.840984098409841e-05, "loss": 1.5833, "mean_token_accuracy": 0.642360670864582, "step": 1600 }, { "epoch": 0.162, "grad_norm": 2.7183115482330322, "learning_rate": 9.83898389838984e-05, "loss": 1.5145, "mean_token_accuracy": 0.6486396968364716, "step": 1620 }, { "epoch": 0.164, "grad_norm": 2.3220036029815674, "learning_rate": 9.836983698369837e-05, "loss": 1.3721, "mean_token_accuracy": 0.6424904316663742, "step": 1640 }, { "epoch": 0.166, "grad_norm": 1.4994099140167236, "learning_rate": 9.834983498349836e-05, "loss": 1.6533, "mean_token_accuracy": 0.6274626016616821, "step": 1660 }, { "epoch": 0.168, "grad_norm": 1.4341535568237305, "learning_rate": 9.832983298329833e-05, "loss": 1.7272, "mean_token_accuracy": 0.6255038797855377, "step": 1680 }, { "epoch": 0.17, "grad_norm": 1.7813321352005005, "learning_rate": 9.830983098309831e-05, "loss": 1.4881, "mean_token_accuracy": 0.6472748398780823, "step": 1700 }, { "epoch": 0.172, "grad_norm": 4.774188041687012, "learning_rate": 9.828982898289828e-05, "loss": 1.4466, "mean_token_accuracy": 0.6420197546482086, "step": 1720 }, { "epoch": 0.174, "grad_norm": 2.04072642326355, "learning_rate": 9.826982698269828e-05, "loss": 1.5808, "mean_token_accuracy": 0.6337102949619293, "step": 1740 }, { "epoch": 0.176, "grad_norm": 1.6528573036193848, "learning_rate": 9.824982498249825e-05, "loss": 1.6013, "mean_token_accuracy": 0.6375040888786316, "step": 1760 }, { "epoch": 0.178, "grad_norm": 3.3909525871276855, "learning_rate": 9.822982298229823e-05, "loss": 1.5001, "mean_token_accuracy": 0.6515787869691849, "step": 1780 }, { "epoch": 0.18, "grad_norm": 1.4815295934677124, "learning_rate": 9.820982098209822e-05, "loss": 1.4685, "mean_token_accuracy": 0.6363899886608124, "step": 1800 }, { "epoch": 0.182, "grad_norm": 2.092970609664917, "learning_rate": 9.81898189818982e-05, "loss": 1.4442, "mean_token_accuracy": 0.6493135154247284, "step": 1820 }, { "epoch": 0.184, "grad_norm": 1.2305734157562256, "learning_rate": 9.816981698169817e-05, "loss": 1.3145, "mean_token_accuracy": 0.6232963889837265, "step": 1840 }, { "epoch": 0.186, "grad_norm": 1.524640679359436, "learning_rate": 9.814981498149815e-05, "loss": 1.43, "mean_token_accuracy": 0.6437770068645478, "step": 1860 }, { "epoch": 0.188, "grad_norm": 2.6700494289398193, "learning_rate": 9.812981298129814e-05, "loss": 1.4868, "mean_token_accuracy": 0.6298510998487472, "step": 1880 }, { "epoch": 0.19, "grad_norm": 1.6419850587844849, "learning_rate": 9.810981098109812e-05, "loss": 1.4594, "mean_token_accuracy": 0.6217441618442535, "step": 1900 }, { "epoch": 0.192, "grad_norm": 1.4646073579788208, "learning_rate": 9.808980898089809e-05, "loss": 1.4861, "mean_token_accuracy": 0.6351762771606445, "step": 1920 }, { "epoch": 0.194, "grad_norm": 1.8556599617004395, "learning_rate": 9.806980698069807e-05, "loss": 1.3789, "mean_token_accuracy": 0.6482267916202545, "step": 1940 }, { "epoch": 0.196, "grad_norm": 2.0808053016662598, "learning_rate": 9.804980498049805e-05, "loss": 1.472, "mean_token_accuracy": 0.6403911203145981, "step": 1960 }, { "epoch": 0.198, "grad_norm": 2.295166015625, "learning_rate": 9.802980298029804e-05, "loss": 1.3124, "mean_token_accuracy": 0.6423475295305252, "step": 1980 }, { "epoch": 0.2, "grad_norm": 2.174572229385376, "learning_rate": 9.8009800980098e-05, "loss": 1.6964, "mean_token_accuracy": 0.6434104561805725, "step": 2000 }, { "epoch": 0.202, "grad_norm": 2.7021141052246094, "learning_rate": 9.798979897989799e-05, "loss": 1.5288, "mean_token_accuracy": 0.6327837437391282, "step": 2020 }, { "epoch": 0.204, "grad_norm": 4.059101581573486, "learning_rate": 9.796979697969797e-05, "loss": 1.5377, "mean_token_accuracy": 0.6264410972595215, "step": 2040 }, { "epoch": 0.206, "grad_norm": 1.7678347826004028, "learning_rate": 9.794979497949796e-05, "loss": 1.4339, "mean_token_accuracy": 0.6407283037900925, "step": 2060 }, { "epoch": 0.208, "grad_norm": 2.008620500564575, "learning_rate": 9.792979297929792e-05, "loss": 1.4281, "mean_token_accuracy": 0.6356410712003708, "step": 2080 }, { "epoch": 0.21, "grad_norm": 1.4534415006637573, "learning_rate": 9.790979097909792e-05, "loss": 1.3607, "mean_token_accuracy": 0.6617552816867829, "step": 2100 }, { "epoch": 0.212, "grad_norm": 1.5948216915130615, "learning_rate": 9.788978897889789e-05, "loss": 1.5017, "mean_token_accuracy": 0.6342270106077195, "step": 2120 }, { "epoch": 0.214, "grad_norm": 1.5272901058197021, "learning_rate": 9.786978697869787e-05, "loss": 1.4423, "mean_token_accuracy": 0.6455341875553131, "step": 2140 }, { "epoch": 0.216, "grad_norm": 2.229461193084717, "learning_rate": 9.784978497849784e-05, "loss": 1.3986, "mean_token_accuracy": 0.6433578312397004, "step": 2160 }, { "epoch": 0.218, "grad_norm": 1.7767797708511353, "learning_rate": 9.782978297829784e-05, "loss": 1.3152, "mean_token_accuracy": 0.6379425078630447, "step": 2180 }, { "epoch": 0.22, "grad_norm": 1.3392592668533325, "learning_rate": 9.780978097809781e-05, "loss": 1.3269, "mean_token_accuracy": 0.6375521898269654, "step": 2200 }, { "epoch": 0.222, "grad_norm": 2.7356717586517334, "learning_rate": 9.778977897789779e-05, "loss": 1.3911, "mean_token_accuracy": 0.6473947912454605, "step": 2220 }, { "epoch": 0.224, "grad_norm": 1.4934086799621582, "learning_rate": 9.776977697769778e-05, "loss": 1.2814, "mean_token_accuracy": 0.6394591838121414, "step": 2240 }, { "epoch": 0.226, "grad_norm": 1.8894997835159302, "learning_rate": 9.774977497749776e-05, "loss": 1.3624, "mean_token_accuracy": 0.6444092065095901, "step": 2260 }, { "epoch": 0.228, "grad_norm": 1.6504650115966797, "learning_rate": 9.772977297729774e-05, "loss": 1.4319, "mean_token_accuracy": 0.6287138164043427, "step": 2280 }, { "epoch": 0.23, "grad_norm": 2.2318992614746094, "learning_rate": 9.770977097709771e-05, "loss": 1.5167, "mean_token_accuracy": 0.6467519074678421, "step": 2300 }, { "epoch": 0.232, "grad_norm": 2.0008583068847656, "learning_rate": 9.76897689768977e-05, "loss": 1.4815, "mean_token_accuracy": 0.6552913933992386, "step": 2320 }, { "epoch": 0.234, "grad_norm": 1.7956541776657104, "learning_rate": 9.766976697669768e-05, "loss": 1.3629, "mean_token_accuracy": 0.6539251029491424, "step": 2340 }, { "epoch": 0.236, "grad_norm": 2.809174060821533, "learning_rate": 9.764976497649766e-05, "loss": 1.5809, "mean_token_accuracy": 0.6618976324796677, "step": 2360 }, { "epoch": 0.238, "grad_norm": 1.483973503112793, "learning_rate": 9.762976297629763e-05, "loss": 1.323, "mean_token_accuracy": 0.6608596116304397, "step": 2380 }, { "epoch": 0.24, "grad_norm": 2.214658260345459, "learning_rate": 9.760976097609761e-05, "loss": 1.4584, "mean_token_accuracy": 0.6634894877672195, "step": 2400 }, { "epoch": 0.242, "grad_norm": 1.319594144821167, "learning_rate": 9.75897589758976e-05, "loss": 1.3323, "mean_token_accuracy": 0.6517296671867371, "step": 2420 }, { "epoch": 0.244, "grad_norm": 1.867640495300293, "learning_rate": 9.756975697569758e-05, "loss": 1.4025, "mean_token_accuracy": 0.6451198160648346, "step": 2440 }, { "epoch": 0.246, "grad_norm": 2.712649345397949, "learning_rate": 9.754975497549755e-05, "loss": 1.398, "mean_token_accuracy": 0.6432423949241638, "step": 2460 }, { "epoch": 0.248, "grad_norm": 2.0262832641601562, "learning_rate": 9.752975297529755e-05, "loss": 1.3975, "mean_token_accuracy": 0.6591994941234589, "step": 2480 }, { "epoch": 0.25, "grad_norm": 2.175233840942383, "learning_rate": 9.750975097509751e-05, "loss": 1.3575, "mean_token_accuracy": 0.6556499511003494, "step": 2500 }, { "epoch": 0.252, "grad_norm": 1.7130564451217651, "learning_rate": 9.74897489748975e-05, "loss": 1.4095, "mean_token_accuracy": 0.6536113023757935, "step": 2520 }, { "epoch": 0.254, "grad_norm": 1.5217152833938599, "learning_rate": 9.746974697469747e-05, "loss": 1.4136, "mean_token_accuracy": 0.6425490826368332, "step": 2540 }, { "epoch": 0.256, "grad_norm": 3.0278613567352295, "learning_rate": 9.744974497449746e-05, "loss": 1.4796, "mean_token_accuracy": 0.6420489102602005, "step": 2560 }, { "epoch": 0.258, "grad_norm": 2.890223741531372, "learning_rate": 9.742974297429743e-05, "loss": 1.429, "mean_token_accuracy": 0.6419516623020172, "step": 2580 }, { "epoch": 0.26, "grad_norm": 2.287043809890747, "learning_rate": 9.740974097409742e-05, "loss": 1.345, "mean_token_accuracy": 0.6539288640022278, "step": 2600 }, { "epoch": 0.262, "grad_norm": 4.415812015533447, "learning_rate": 9.73897389738974e-05, "loss": 1.2837, "mean_token_accuracy": 0.6608103096485138, "step": 2620 }, { "epoch": 0.264, "grad_norm": 2.272096633911133, "learning_rate": 9.736973697369738e-05, "loss": 1.3086, "mean_token_accuracy": 0.6431496769189835, "step": 2640 }, { "epoch": 0.266, "grad_norm": 1.9342799186706543, "learning_rate": 9.734973497349735e-05, "loss": 1.2684, "mean_token_accuracy": 0.6353938311338425, "step": 2660 }, { "epoch": 0.268, "grad_norm": 1.1796984672546387, "learning_rate": 9.732973297329733e-05, "loss": 1.3884, "mean_token_accuracy": 0.6604422628879547, "step": 2680 }, { "epoch": 0.27, "grad_norm": 2.39225435256958, "learning_rate": 9.730973097309732e-05, "loss": 1.304, "mean_token_accuracy": 0.6547920078039169, "step": 2700 }, { "epoch": 0.272, "grad_norm": 1.380109429359436, "learning_rate": 9.72897289728973e-05, "loss": 1.2739, "mean_token_accuracy": 0.6579254180192947, "step": 2720 }, { "epoch": 0.274, "grad_norm": 2.9508607387542725, "learning_rate": 9.726972697269727e-05, "loss": 1.3697, "mean_token_accuracy": 0.6530997514724731, "step": 2740 }, { "epoch": 0.276, "grad_norm": 4.8157854080200195, "learning_rate": 9.724972497249725e-05, "loss": 1.5012, "mean_token_accuracy": 0.6609708964824677, "step": 2760 }, { "epoch": 0.278, "grad_norm": 1.3421454429626465, "learning_rate": 9.722972297229724e-05, "loss": 1.241, "mean_token_accuracy": 0.6395006120204926, "step": 2780 }, { "epoch": 0.28, "grad_norm": 2.235891580581665, "learning_rate": 9.720972097209722e-05, "loss": 1.4808, "mean_token_accuracy": 0.6659048020839691, "step": 2800 }, { "epoch": 0.282, "grad_norm": 2.056382656097412, "learning_rate": 9.718971897189719e-05, "loss": 1.2786, "mean_token_accuracy": 0.6696320950984955, "step": 2820 }, { "epoch": 0.284, "grad_norm": 2.1024115085601807, "learning_rate": 9.716971697169717e-05, "loss": 1.2878, "mean_token_accuracy": 0.6423914939165115, "step": 2840 }, { "epoch": 0.286, "grad_norm": 2.095243453979492, "learning_rate": 9.714971497149716e-05, "loss": 1.3973, "mean_token_accuracy": 0.6418523252010345, "step": 2860 }, { "epoch": 0.288, "grad_norm": 2.08601450920105, "learning_rate": 9.712971297129714e-05, "loss": 1.2794, "mean_token_accuracy": 0.6517385005950928, "step": 2880 }, { "epoch": 0.29, "grad_norm": 2.2491276264190674, "learning_rate": 9.710971097109711e-05, "loss": 1.28, "mean_token_accuracy": 0.6667718857526779, "step": 2900 }, { "epoch": 0.292, "grad_norm": 2.3658862113952637, "learning_rate": 9.708970897089709e-05, "loss": 1.3789, "mean_token_accuracy": 0.6468125522136688, "step": 2920 }, { "epoch": 0.294, "grad_norm": 2.303043842315674, "learning_rate": 9.706970697069707e-05, "loss": 1.4624, "mean_token_accuracy": 0.6432337909936905, "step": 2940 }, { "epoch": 0.296, "grad_norm": 1.7150546312332153, "learning_rate": 9.704970497049706e-05, "loss": 1.1749, "mean_token_accuracy": 0.6438453078269959, "step": 2960 }, { "epoch": 0.298, "grad_norm": 1.7162537574768066, "learning_rate": 9.702970297029703e-05, "loss": 1.2364, "mean_token_accuracy": 0.6411181867122651, "step": 2980 }, { "epoch": 0.3, "grad_norm": 2.6857388019561768, "learning_rate": 9.700970097009702e-05, "loss": 1.3328, "mean_token_accuracy": 0.6537453800439834, "step": 3000 }, { "epoch": 0.302, "grad_norm": 2.3116490840911865, "learning_rate": 9.698969896989699e-05, "loss": 1.3406, "mean_token_accuracy": 0.6329759299755097, "step": 3020 }, { "epoch": 0.304, "grad_norm": 2.577031135559082, "learning_rate": 9.696969696969698e-05, "loss": 1.3332, "mean_token_accuracy": 0.6553189933300019, "step": 3040 }, { "epoch": 0.306, "grad_norm": 1.3062621355056763, "learning_rate": 9.694969496949695e-05, "loss": 1.4703, "mean_token_accuracy": 0.6616103649139404, "step": 3060 }, { "epoch": 0.308, "grad_norm": 2.982353925704956, "learning_rate": 9.692969296929694e-05, "loss": 1.2529, "mean_token_accuracy": 0.6591937869787217, "step": 3080 }, { "epoch": 0.31, "grad_norm": 3.514193296432495, "learning_rate": 9.690969096909691e-05, "loss": 1.1283, "mean_token_accuracy": 0.6463283836841583, "step": 3100 }, { "epoch": 0.312, "grad_norm": 1.8741295337677002, "learning_rate": 9.68896889688969e-05, "loss": 1.2725, "mean_token_accuracy": 0.6572372138500213, "step": 3120 }, { "epoch": 0.314, "grad_norm": 2.0382134914398193, "learning_rate": 9.686968696869688e-05, "loss": 1.2307, "mean_token_accuracy": 0.6763725876808167, "step": 3140 }, { "epoch": 0.316, "grad_norm": 1.8228074312210083, "learning_rate": 9.684968496849686e-05, "loss": 1.3032, "mean_token_accuracy": 0.6564922541379928, "step": 3160 }, { "epoch": 0.318, "grad_norm": 2.889320135116577, "learning_rate": 9.682968296829683e-05, "loss": 1.2314, "mean_token_accuracy": 0.6519796282052994, "step": 3180 }, { "epoch": 0.32, "grad_norm": 1.6455059051513672, "learning_rate": 9.680968096809681e-05, "loss": 1.1395, "mean_token_accuracy": 0.6677790403366088, "step": 3200 }, { "epoch": 0.322, "grad_norm": 1.8470892906188965, "learning_rate": 9.67896789678968e-05, "loss": 1.2319, "mean_token_accuracy": 0.6511774808168411, "step": 3220 }, { "epoch": 0.324, "grad_norm": 1.6863080263137817, "learning_rate": 9.676967696769678e-05, "loss": 1.2578, "mean_token_accuracy": 0.6326248973608017, "step": 3240 }, { "epoch": 0.326, "grad_norm": 2.6951427459716797, "learning_rate": 9.674967496749675e-05, "loss": 1.1408, "mean_token_accuracy": 0.6581951975822449, "step": 3260 }, { "epoch": 0.328, "grad_norm": 1.4557970762252808, "learning_rate": 9.672967296729673e-05, "loss": 1.136, "mean_token_accuracy": 0.6343502670526504, "step": 3280 }, { "epoch": 0.33, "grad_norm": 1.17385995388031, "learning_rate": 9.670967096709671e-05, "loss": 1.1787, "mean_token_accuracy": 0.6255623400211334, "step": 3300 }, { "epoch": 0.332, "grad_norm": 1.1940430402755737, "learning_rate": 9.66896689668967e-05, "loss": 1.0071, "mean_token_accuracy": 0.6629451781511306, "step": 3320 }, { "epoch": 0.334, "grad_norm": 1.5524379014968872, "learning_rate": 9.666966696669667e-05, "loss": 1.2519, "mean_token_accuracy": 0.6409065693616867, "step": 3340 }, { "epoch": 0.336, "grad_norm": 1.8319742679595947, "learning_rate": 9.664966496649665e-05, "loss": 1.0745, "mean_token_accuracy": 0.6657441169023514, "step": 3360 }, { "epoch": 0.338, "grad_norm": 1.7755353450775146, "learning_rate": 9.662966296629663e-05, "loss": 1.0494, "mean_token_accuracy": 0.6429086536169052, "step": 3380 }, { "epoch": 0.34, "grad_norm": 1.9397002458572388, "learning_rate": 9.660966096609662e-05, "loss": 1.0876, "mean_token_accuracy": 0.6472330510616302, "step": 3400 }, { "epoch": 0.342, "grad_norm": 2.8032376766204834, "learning_rate": 9.658965896589659e-05, "loss": 1.5052, "mean_token_accuracy": 0.6404959291219712, "step": 3420 }, { "epoch": 0.344, "grad_norm": 1.3576545715332031, "learning_rate": 9.656965696569657e-05, "loss": 1.1472, "mean_token_accuracy": 0.6624483704566956, "step": 3440 }, { "epoch": 0.346, "grad_norm": 2.4566333293914795, "learning_rate": 9.654965496549655e-05, "loss": 1.3489, "mean_token_accuracy": 0.64918352663517, "step": 3460 }, { "epoch": 0.348, "grad_norm": 2.1999475955963135, "learning_rate": 9.652965296529653e-05, "loss": 1.3782, "mean_token_accuracy": 0.6410320311784744, "step": 3480 }, { "epoch": 0.35, "grad_norm": 1.048429250717163, "learning_rate": 9.65096509650965e-05, "loss": 1.0432, "mean_token_accuracy": 0.6685724854469299, "step": 3500 }, { "epoch": 0.352, "grad_norm": 2.4794933795928955, "learning_rate": 9.64896489648965e-05, "loss": 1.09, "mean_token_accuracy": 0.6497958481311799, "step": 3520 }, { "epoch": 0.354, "grad_norm": 2.129655122756958, "learning_rate": 9.646964696469647e-05, "loss": 1.2232, "mean_token_accuracy": 0.6643973529338837, "step": 3540 }, { "epoch": 0.356, "grad_norm": 2.1845932006835938, "learning_rate": 9.644964496449645e-05, "loss": 1.2147, "mean_token_accuracy": 0.65315942466259, "step": 3560 }, { "epoch": 0.358, "grad_norm": 2.627959966659546, "learning_rate": 9.642964296429642e-05, "loss": 1.4235, "mean_token_accuracy": 0.6390389144420624, "step": 3580 }, { "epoch": 0.36, "grad_norm": 1.1263914108276367, "learning_rate": 9.640964096409642e-05, "loss": 1.0513, "mean_token_accuracy": 0.6551793694496155, "step": 3600 }, { "epoch": 0.362, "grad_norm": 1.263154149055481, "learning_rate": 9.638963896389639e-05, "loss": 1.229, "mean_token_accuracy": 0.670009633898735, "step": 3620 }, { "epoch": 0.364, "grad_norm": 3.153191089630127, "learning_rate": 9.636963696369637e-05, "loss": 1.3589, "mean_token_accuracy": 0.6218452751636505, "step": 3640 }, { "epoch": 0.366, "grad_norm": 1.2771191596984863, "learning_rate": 9.634963496349636e-05, "loss": 1.1795, "mean_token_accuracy": 0.657193198800087, "step": 3660 }, { "epoch": 0.368, "grad_norm": 2.001551628112793, "learning_rate": 9.632963296329634e-05, "loss": 1.0669, "mean_token_accuracy": 0.6337447315454483, "step": 3680 }, { "epoch": 0.37, "grad_norm": 6.348815441131592, "learning_rate": 9.630963096309631e-05, "loss": 1.0976, "mean_token_accuracy": 0.6515982627868653, "step": 3700 }, { "epoch": 0.372, "grad_norm": 1.0634410381317139, "learning_rate": 9.628962896289629e-05, "loss": 1.0479, "mean_token_accuracy": 0.6685764133930207, "step": 3720 }, { "epoch": 0.374, "grad_norm": 2.583277940750122, "learning_rate": 9.626962696269627e-05, "loss": 1.2449, "mean_token_accuracy": 0.6708226650953293, "step": 3740 }, { "epoch": 0.376, "grad_norm": 1.207780361175537, "learning_rate": 9.624962496249626e-05, "loss": 1.1719, "mean_token_accuracy": 0.6464640140533447, "step": 3760 }, { "epoch": 0.378, "grad_norm": 1.2217520475387573, "learning_rate": 9.622962296229624e-05, "loss": 1.0517, "mean_token_accuracy": 0.6403751313686371, "step": 3780 }, { "epoch": 0.38, "grad_norm": 2.867626667022705, "learning_rate": 9.620962096209621e-05, "loss": 1.1353, "mean_token_accuracy": 0.6520886242389679, "step": 3800 }, { "epoch": 0.382, "grad_norm": 1.437845230102539, "learning_rate": 9.61896189618962e-05, "loss": 1.2342, "mean_token_accuracy": 0.6467665612697602, "step": 3820 }, { "epoch": 0.384, "grad_norm": 1.1339672803878784, "learning_rate": 9.616961696169618e-05, "loss": 0.8892, "mean_token_accuracy": 0.6610689729452133, "step": 3840 }, { "epoch": 0.386, "grad_norm": 2.568793773651123, "learning_rate": 9.614961496149616e-05, "loss": 1.3745, "mean_token_accuracy": 0.6651821255683898, "step": 3860 }, { "epoch": 0.388, "grad_norm": 2.217998743057251, "learning_rate": 9.612961296129613e-05, "loss": 1.1815, "mean_token_accuracy": 0.6844048053026199, "step": 3880 }, { "epoch": 0.39, "grad_norm": 6.411093711853027, "learning_rate": 9.610961096109612e-05, "loss": 1.1878, "mean_token_accuracy": 0.657959035038948, "step": 3900 }, { "epoch": 0.392, "grad_norm": 1.016244888305664, "learning_rate": 9.60896089608961e-05, "loss": 1.1865, "mean_token_accuracy": 0.6638659745454788, "step": 3920 }, { "epoch": 0.394, "grad_norm": 1.209750771522522, "learning_rate": 9.606960696069608e-05, "loss": 1.0107, "mean_token_accuracy": 0.6810258448123931, "step": 3940 }, { "epoch": 0.396, "grad_norm": 3.038722038269043, "learning_rate": 9.604960496049606e-05, "loss": 1.3503, "mean_token_accuracy": 0.6612859785556793, "step": 3960 }, { "epoch": 0.398, "grad_norm": 1.8357658386230469, "learning_rate": 9.602960296029604e-05, "loss": 1.0435, "mean_token_accuracy": 0.654178723692894, "step": 3980 }, { "epoch": 0.4, "grad_norm": 1.620197057723999, "learning_rate": 9.600960096009601e-05, "loss": 1.2652, "mean_token_accuracy": 0.6423360347747803, "step": 4000 }, { "epoch": 0.402, "grad_norm": 1.0550901889801025, "learning_rate": 9.5989598959896e-05, "loss": 1.094, "mean_token_accuracy": 0.6574887096881866, "step": 4020 }, { "epoch": 0.404, "grad_norm": 2.6640586853027344, "learning_rate": 9.596959695969598e-05, "loss": 0.8871, "mean_token_accuracy": 0.6449653297662735, "step": 4040 }, { "epoch": 0.406, "grad_norm": 1.1081268787384033, "learning_rate": 9.594959495949596e-05, "loss": 1.1208, "mean_token_accuracy": 0.6484499961137772, "step": 4060 }, { "epoch": 0.408, "grad_norm": 2.595853567123413, "learning_rate": 9.592959295929593e-05, "loss": 0.9822, "mean_token_accuracy": 0.6633910328149796, "step": 4080 }, { "epoch": 0.41, "grad_norm": 1.3190412521362305, "learning_rate": 9.590959095909591e-05, "loss": 1.0079, "mean_token_accuracy": 0.6654098421335221, "step": 4100 }, { "epoch": 0.412, "grad_norm": 3.2669615745544434, "learning_rate": 9.58895889588959e-05, "loss": 1.0283, "mean_token_accuracy": 0.6525435090065003, "step": 4120 }, { "epoch": 0.414, "grad_norm": 1.9627903699874878, "learning_rate": 9.586958695869588e-05, "loss": 1.0769, "mean_token_accuracy": 0.6471657693386078, "step": 4140 }, { "epoch": 0.416, "grad_norm": 5.344222545623779, "learning_rate": 9.584958495849585e-05, "loss": 1.2119, "mean_token_accuracy": 0.6489155054092407, "step": 4160 }, { "epoch": 0.418, "grad_norm": 1.0202699899673462, "learning_rate": 9.582958295829583e-05, "loss": 0.943, "mean_token_accuracy": 0.6678596526384354, "step": 4180 }, { "epoch": 0.42, "grad_norm": 1.3315589427947998, "learning_rate": 9.580958095809582e-05, "loss": 1.1973, "mean_token_accuracy": 0.6468497961759567, "step": 4200 }, { "epoch": 0.422, "grad_norm": 5.302000522613525, "learning_rate": 9.57895789578958e-05, "loss": 1.1701, "mean_token_accuracy": 0.6366271704435349, "step": 4220 }, { "epoch": 0.424, "grad_norm": 1.3129881620407104, "learning_rate": 9.576957695769577e-05, "loss": 0.9348, "mean_token_accuracy": 0.6630638837814331, "step": 4240 }, { "epoch": 0.426, "grad_norm": 1.3279340267181396, "learning_rate": 9.574957495749575e-05, "loss": 1.2596, "mean_token_accuracy": 0.6476463139057159, "step": 4260 }, { "epoch": 0.428, "grad_norm": 1.6791844367980957, "learning_rate": 9.572957295729573e-05, "loss": 1.2416, "mean_token_accuracy": 0.6465205550193787, "step": 4280 }, { "epoch": 0.43, "grad_norm": 1.8744187355041504, "learning_rate": 9.570957095709572e-05, "loss": 0.878, "mean_token_accuracy": 0.6565584301948547, "step": 4300 }, { "epoch": 0.432, "grad_norm": 4.574236869812012, "learning_rate": 9.568956895689569e-05, "loss": 1.0103, "mean_token_accuracy": 0.668510228395462, "step": 4320 }, { "epoch": 0.434, "grad_norm": 2.688021183013916, "learning_rate": 9.566956695669568e-05, "loss": 1.1165, "mean_token_accuracy": 0.6501451551914215, "step": 4340 }, { "epoch": 0.436, "grad_norm": 2.132188320159912, "learning_rate": 9.564956495649565e-05, "loss": 0.8865, "mean_token_accuracy": 0.6442415475845337, "step": 4360 }, { "epoch": 0.438, "grad_norm": 3.3185269832611084, "learning_rate": 9.562956295629564e-05, "loss": 1.1008, "mean_token_accuracy": 0.6659028023481369, "step": 4380 }, { "epoch": 0.44, "grad_norm": 1.163778305053711, "learning_rate": 9.56095609560956e-05, "loss": 0.991, "mean_token_accuracy": 0.6257940143346786, "step": 4400 }, { "epoch": 0.442, "grad_norm": 2.194847583770752, "learning_rate": 9.55895589558956e-05, "loss": 0.9832, "mean_token_accuracy": 0.6569331377744675, "step": 4420 }, { "epoch": 0.444, "grad_norm": 2.0252432823181152, "learning_rate": 9.556955695569557e-05, "loss": 1.1652, "mean_token_accuracy": 0.6418680101633072, "step": 4440 }, { "epoch": 0.446, "grad_norm": 1.1339881420135498, "learning_rate": 9.554955495549556e-05, "loss": 1.3405, "mean_token_accuracy": 0.6501057773828507, "step": 4460 }, { "epoch": 0.448, "grad_norm": 4.0223541259765625, "learning_rate": 9.552955295529554e-05, "loss": 1.2129, "mean_token_accuracy": 0.6398109674453736, "step": 4480 }, { "epoch": 0.45, "grad_norm": 1.2862168550491333, "learning_rate": 9.550955095509552e-05, "loss": 0.8685, "mean_token_accuracy": 0.6619790554046631, "step": 4500 }, { "epoch": 0.452, "grad_norm": 1.2701129913330078, "learning_rate": 9.548954895489549e-05, "loss": 1.0055, "mean_token_accuracy": 0.6598946243524552, "step": 4520 }, { "epoch": 0.454, "grad_norm": 4.032655239105225, "learning_rate": 9.546954695469547e-05, "loss": 1.2101, "mean_token_accuracy": 0.6567770004272461, "step": 4540 }, { "epoch": 0.456, "grad_norm": 2.289066791534424, "learning_rate": 9.544954495449546e-05, "loss": 0.8687, "mean_token_accuracy": 0.6443863332271575, "step": 4560 }, { "epoch": 0.458, "grad_norm": 5.82201623916626, "learning_rate": 9.542954295429544e-05, "loss": 0.7242, "mean_token_accuracy": 0.6623267948627471, "step": 4580 }, { "epoch": 0.46, "grad_norm": 4.947569847106934, "learning_rate": 9.540954095409541e-05, "loss": 1.0478, "mean_token_accuracy": 0.6467921644449234, "step": 4600 }, { "epoch": 0.462, "grad_norm": 1.5769656896591187, "learning_rate": 9.538953895389539e-05, "loss": 0.841, "mean_token_accuracy": 0.6595022737979889, "step": 4620 }, { "epoch": 0.464, "grad_norm": 1.6579396724700928, "learning_rate": 9.536953695369538e-05, "loss": 1.0968, "mean_token_accuracy": 0.644562178850174, "step": 4640 }, { "epoch": 0.466, "grad_norm": 2.0896332263946533, "learning_rate": 9.534953495349536e-05, "loss": 1.0299, "mean_token_accuracy": 0.6695588439702987, "step": 4660 }, { "epoch": 0.468, "grad_norm": 2.5427067279815674, "learning_rate": 9.532953295329533e-05, "loss": 1.3019, "mean_token_accuracy": 0.6311022967100144, "step": 4680 }, { "epoch": 0.47, "grad_norm": 3.0125882625579834, "learning_rate": 9.530953095309531e-05, "loss": 1.0051, "mean_token_accuracy": 0.6603398144245147, "step": 4700 }, { "epoch": 0.472, "grad_norm": 4.391168117523193, "learning_rate": 9.52895289528953e-05, "loss": 0.9258, "mean_token_accuracy": 0.6543330311775207, "step": 4720 }, { "epoch": 0.474, "grad_norm": 0.9369609951972961, "learning_rate": 9.526952695269528e-05, "loss": 0.8133, "mean_token_accuracy": 0.6838192522525788, "step": 4740 }, { "epoch": 0.476, "grad_norm": 3.0758707523345947, "learning_rate": 9.524952495249525e-05, "loss": 1.0331, "mean_token_accuracy": 0.671579796075821, "step": 4760 }, { "epoch": 0.478, "grad_norm": 3.380450963973999, "learning_rate": 9.522952295229523e-05, "loss": 0.9895, "mean_token_accuracy": 0.6591333329677582, "step": 4780 }, { "epoch": 0.48, "grad_norm": 0.9552709460258484, "learning_rate": 9.520952095209521e-05, "loss": 1.0274, "mean_token_accuracy": 0.6494253754615784, "step": 4800 }, { "epoch": 0.482, "grad_norm": 1.4330755472183228, "learning_rate": 9.51895189518952e-05, "loss": 0.9653, "mean_token_accuracy": 0.6513494104146957, "step": 4820 }, { "epoch": 0.484, "grad_norm": 2.275315999984741, "learning_rate": 9.516951695169517e-05, "loss": 1.1337, "mean_token_accuracy": 0.6522755980491638, "step": 4840 }, { "epoch": 0.486, "grad_norm": 2.9689841270446777, "learning_rate": 9.514951495149516e-05, "loss": 1.1215, "mean_token_accuracy": 0.6570962429046631, "step": 4860 }, { "epoch": 0.488, "grad_norm": 1.3327592611312866, "learning_rate": 9.512951295129513e-05, "loss": 1.0051, "mean_token_accuracy": 0.6772982835769653, "step": 4880 }, { "epoch": 0.49, "grad_norm": 1.6002180576324463, "learning_rate": 9.510951095109511e-05, "loss": 0.9946, "mean_token_accuracy": 0.6635778456926346, "step": 4900 }, { "epoch": 0.492, "grad_norm": 2.023193120956421, "learning_rate": 9.508950895089508e-05, "loss": 1.0767, "mean_token_accuracy": 0.6540512472391129, "step": 4920 }, { "epoch": 0.494, "grad_norm": 1.8311463594436646, "learning_rate": 9.506950695069508e-05, "loss": 0.8728, "mean_token_accuracy": 0.6485318422317505, "step": 4940 }, { "epoch": 0.496, "grad_norm": 2.5032310485839844, "learning_rate": 9.504950495049505e-05, "loss": 1.1689, "mean_token_accuracy": 0.6597102284431458, "step": 4960 }, { "epoch": 0.498, "grad_norm": 3.5815117359161377, "learning_rate": 9.502950295029503e-05, "loss": 1.1621, "mean_token_accuracy": 0.658498901128769, "step": 4980 }, { "epoch": 0.5, "grad_norm": 0.8097955584526062, "learning_rate": 9.500950095009502e-05, "loss": 0.7808, "mean_token_accuracy": 0.672059965133667, "step": 5000 }, { "epoch": 0.502, "grad_norm": 2.431734085083008, "learning_rate": 9.4989498949895e-05, "loss": 0.8432, "mean_token_accuracy": 0.6555502772331238, "step": 5020 }, { "epoch": 0.504, "grad_norm": 2.9842209815979004, "learning_rate": 9.496949694969497e-05, "loss": 0.8895, "mean_token_accuracy": 0.6638733327388764, "step": 5040 }, { "epoch": 0.506, "grad_norm": 4.109753608703613, "learning_rate": 9.494949494949495e-05, "loss": 0.9964, "mean_token_accuracy": 0.6527444988489151, "step": 5060 }, { "epoch": 0.508, "grad_norm": 1.9016938209533691, "learning_rate": 9.492949294929493e-05, "loss": 1.1063, "mean_token_accuracy": 0.6574041455984115, "step": 5080 }, { "epoch": 0.51, "grad_norm": 1.083150029182434, "learning_rate": 9.490949094909492e-05, "loss": 1.019, "mean_token_accuracy": 0.6484863460063934, "step": 5100 }, { "epoch": 0.512, "grad_norm": 3.242363214492798, "learning_rate": 9.488948894889489e-05, "loss": 1.1412, "mean_token_accuracy": 0.6512755781412125, "step": 5120 }, { "epoch": 0.514, "grad_norm": 1.918184757232666, "learning_rate": 9.486948694869487e-05, "loss": 0.8676, "mean_token_accuracy": 0.6674722701311111, "step": 5140 }, { "epoch": 0.516, "grad_norm": 1.2218667268753052, "learning_rate": 9.484948494849485e-05, "loss": 0.9029, "mean_token_accuracy": 0.6649456739425659, "step": 5160 }, { "epoch": 0.518, "grad_norm": 1.3374180793762207, "learning_rate": 9.482948294829484e-05, "loss": 0.9411, "mean_token_accuracy": 0.661174276471138, "step": 5180 }, { "epoch": 0.52, "grad_norm": 8.082998275756836, "learning_rate": 9.48094809480948e-05, "loss": 0.9383, "mean_token_accuracy": 0.646235466003418, "step": 5200 }, { "epoch": 0.522, "grad_norm": 5.361447334289551, "learning_rate": 9.478947894789479e-05, "loss": 0.9557, "mean_token_accuracy": 0.6540566712617875, "step": 5220 }, { "epoch": 0.524, "grad_norm": 3.554076910018921, "learning_rate": 9.476947694769477e-05, "loss": 0.7761, "mean_token_accuracy": 0.6654619127511978, "step": 5240 }, { "epoch": 0.526, "grad_norm": 1.233636498451233, "learning_rate": 9.474947494749475e-05, "loss": 0.826, "mean_token_accuracy": 0.6568653732538223, "step": 5260 }, { "epoch": 0.528, "grad_norm": 0.9833571314811707, "learning_rate": 9.472947294729474e-05, "loss": 0.763, "mean_token_accuracy": 0.6572056919336319, "step": 5280 }, { "epoch": 0.53, "grad_norm": 1.15773606300354, "learning_rate": 9.470947094709471e-05, "loss": 0.8704, "mean_token_accuracy": 0.6706767737865448, "step": 5300 }, { "epoch": 0.532, "grad_norm": 5.081540107727051, "learning_rate": 9.46894689468947e-05, "loss": 1.1354, "mean_token_accuracy": 0.6387688547372818, "step": 5320 }, { "epoch": 0.534, "grad_norm": 1.3922549486160278, "learning_rate": 9.466946694669467e-05, "loss": 1.074, "mean_token_accuracy": 0.638795119524002, "step": 5340 }, { "epoch": 0.536, "grad_norm": 1.167132019996643, "learning_rate": 9.464946494649466e-05, "loss": 0.8919, "mean_token_accuracy": 0.6538525938987731, "step": 5360 }, { "epoch": 0.538, "grad_norm": 1.1925541162490845, "learning_rate": 9.462946294629464e-05, "loss": 0.9033, "mean_token_accuracy": 0.6420210629701615, "step": 5380 }, { "epoch": 0.54, "grad_norm": 0.9960333704948425, "learning_rate": 9.460946094609462e-05, "loss": 0.8649, "mean_token_accuracy": 0.6540909230709075, "step": 5400 }, { "epoch": 0.542, "grad_norm": 2.120863199234009, "learning_rate": 9.458945894589459e-05, "loss": 0.7763, "mean_token_accuracy": 0.6749876409769058, "step": 5420 }, { "epoch": 0.544, "grad_norm": 1.1143039464950562, "learning_rate": 9.456945694569458e-05, "loss": 0.7911, "mean_token_accuracy": 0.6628814667463303, "step": 5440 }, { "epoch": 0.546, "grad_norm": 2.0088248252868652, "learning_rate": 9.454945494549456e-05, "loss": 0.9449, "mean_token_accuracy": 0.6476354271173477, "step": 5460 }, { "epoch": 0.548, "grad_norm": 1.2605093717575073, "learning_rate": 9.452945294529454e-05, "loss": 0.9197, "mean_token_accuracy": 0.6475590825080871, "step": 5480 }, { "epoch": 0.55, "grad_norm": 1.1798518896102905, "learning_rate": 9.450945094509451e-05, "loss": 0.8597, "mean_token_accuracy": 0.6526351571083069, "step": 5500 }, { "epoch": 0.552, "grad_norm": 1.3558850288391113, "learning_rate": 9.44894489448945e-05, "loss": 0.8785, "mean_token_accuracy": 0.6435926109552383, "step": 5520 }, { "epoch": 0.554, "grad_norm": 5.072884559631348, "learning_rate": 9.446944694469448e-05, "loss": 0.9799, "mean_token_accuracy": 0.6677773952484131, "step": 5540 }, { "epoch": 0.556, "grad_norm": 1.7544652223587036, "learning_rate": 9.444944494449446e-05, "loss": 0.9258, "mean_token_accuracy": 0.6643315106630325, "step": 5560 }, { "epoch": 0.558, "grad_norm": 1.0743858814239502, "learning_rate": 9.442944294429443e-05, "loss": 0.9153, "mean_token_accuracy": 0.657461878657341, "step": 5580 }, { "epoch": 0.56, "grad_norm": 6.277653217315674, "learning_rate": 9.440944094409441e-05, "loss": 0.9238, "mean_token_accuracy": 0.6618271172046661, "step": 5600 }, { "epoch": 0.562, "grad_norm": 0.8871872425079346, "learning_rate": 9.43894389438944e-05, "loss": 0.8334, "mean_token_accuracy": 0.6601798981428146, "step": 5620 }, { "epoch": 0.564, "grad_norm": 0.8818346261978149, "learning_rate": 9.436943694369438e-05, "loss": 0.9852, "mean_token_accuracy": 0.657116150856018, "step": 5640 }, { "epoch": 0.566, "grad_norm": 1.276206612586975, "learning_rate": 9.434943494349435e-05, "loss": 0.9009, "mean_token_accuracy": 0.6573558956384659, "step": 5660 }, { "epoch": 0.568, "grad_norm": 0.9371493458747864, "learning_rate": 9.432943294329434e-05, "loss": 0.8403, "mean_token_accuracy": 0.6615984916687012, "step": 5680 }, { "epoch": 0.57, "grad_norm": 1.6636489629745483, "learning_rate": 9.430943094309431e-05, "loss": 0.7047, "mean_token_accuracy": 0.6601086169481277, "step": 5700 }, { "epoch": 0.572, "grad_norm": 1.232314944267273, "learning_rate": 9.42894289428943e-05, "loss": 0.7024, "mean_token_accuracy": 0.6748123377561569, "step": 5720 }, { "epoch": 0.574, "grad_norm": 0.8655527830123901, "learning_rate": 9.426942694269427e-05, "loss": 0.9314, "mean_token_accuracy": 0.6738088697195053, "step": 5740 }, { "epoch": 0.576, "grad_norm": 0.8767774105072021, "learning_rate": 9.424942494249426e-05, "loss": 0.7981, "mean_token_accuracy": 0.6507501751184464, "step": 5760 }, { "epoch": 0.578, "grad_norm": 3.5299220085144043, "learning_rate": 9.422942294229423e-05, "loss": 0.7841, "mean_token_accuracy": 0.6744657427072525, "step": 5780 }, { "epoch": 0.58, "grad_norm": 1.1918418407440186, "learning_rate": 9.420942094209422e-05, "loss": 0.9905, "mean_token_accuracy": 0.6535312920808792, "step": 5800 }, { "epoch": 0.582, "grad_norm": 1.2911040782928467, "learning_rate": 9.418941894189419e-05, "loss": 0.9884, "mean_token_accuracy": 0.6431009143590927, "step": 5820 }, { "epoch": 0.584, "grad_norm": 1.5286130905151367, "learning_rate": 9.416941694169418e-05, "loss": 0.964, "mean_token_accuracy": 0.6518302500247956, "step": 5840 }, { "epoch": 0.586, "grad_norm": 2.3529794216156006, "learning_rate": 9.414941494149415e-05, "loss": 0.8846, "mean_token_accuracy": 0.6534213930368423, "step": 5860 }, { "epoch": 0.588, "grad_norm": 0.9725406765937805, "learning_rate": 9.412941294129413e-05, "loss": 1.2344, "mean_token_accuracy": 0.6469882071018219, "step": 5880 }, { "epoch": 0.59, "grad_norm": 0.8946827054023743, "learning_rate": 9.410941094109412e-05, "loss": 0.6354, "mean_token_accuracy": 0.67442606985569, "step": 5900 }, { "epoch": 0.592, "grad_norm": 1.922399878501892, "learning_rate": 9.40894089408941e-05, "loss": 0.9528, "mean_token_accuracy": 0.6642639100551605, "step": 5920 }, { "epoch": 0.594, "grad_norm": 1.595730185508728, "learning_rate": 9.406940694069407e-05, "loss": 0.7298, "mean_token_accuracy": 0.667911034822464, "step": 5940 }, { "epoch": 0.596, "grad_norm": 1.4079737663269043, "learning_rate": 9.404940494049405e-05, "loss": 0.7662, "mean_token_accuracy": 0.6663949012756347, "step": 5960 }, { "epoch": 0.598, "grad_norm": 1.4684034585952759, "learning_rate": 9.402940294029404e-05, "loss": 0.752, "mean_token_accuracy": 0.6549987912178039, "step": 5980 }, { "epoch": 0.6, "grad_norm": 1.1439335346221924, "learning_rate": 9.400940094009402e-05, "loss": 0.8146, "mean_token_accuracy": 0.6600568681955338, "step": 6000 }, { "epoch": 0.602, "grad_norm": 2.828627586364746, "learning_rate": 9.398939893989399e-05, "loss": 1.0792, "mean_token_accuracy": 0.6477522104978561, "step": 6020 }, { "epoch": 0.604, "grad_norm": 1.3867160081863403, "learning_rate": 9.396939693969397e-05, "loss": 0.6937, "mean_token_accuracy": 0.6723254203796387, "step": 6040 }, { "epoch": 0.606, "grad_norm": 2.180436611175537, "learning_rate": 9.394939493949395e-05, "loss": 1.0785, "mean_token_accuracy": 0.6532107204198837, "step": 6060 }, { "epoch": 0.608, "grad_norm": 1.3327655792236328, "learning_rate": 9.392939293929394e-05, "loss": 0.9264, "mean_token_accuracy": 0.6491348206996918, "step": 6080 }, { "epoch": 0.61, "grad_norm": 1.7624468803405762, "learning_rate": 9.390939093909391e-05, "loss": 1.0922, "mean_token_accuracy": 0.6456236064434051, "step": 6100 }, { "epoch": 0.612, "grad_norm": 1.4257749319076538, "learning_rate": 9.388938893889389e-05, "loss": 1.1699, "mean_token_accuracy": 0.65648755133152, "step": 6120 }, { "epoch": 0.614, "grad_norm": 1.155750036239624, "learning_rate": 9.386938693869387e-05, "loss": 0.701, "mean_token_accuracy": 0.6641527116298676, "step": 6140 }, { "epoch": 0.616, "grad_norm": 1.501948356628418, "learning_rate": 9.384938493849386e-05, "loss": 0.8943, "mean_token_accuracy": 0.6687067300081253, "step": 6160 }, { "epoch": 0.618, "grad_norm": 1.1031874418258667, "learning_rate": 9.382938293829383e-05, "loss": 0.6294, "mean_token_accuracy": 0.6581684827804566, "step": 6180 }, { "epoch": 0.62, "grad_norm": 3.566166877746582, "learning_rate": 9.380938093809382e-05, "loss": 0.8519, "mean_token_accuracy": 0.6650036454200745, "step": 6200 }, { "epoch": 0.622, "grad_norm": 1.4322166442871094, "learning_rate": 9.378937893789379e-05, "loss": 0.89, "mean_token_accuracy": 0.6732980966567993, "step": 6220 }, { "epoch": 0.624, "grad_norm": 2.0347490310668945, "learning_rate": 9.376937693769378e-05, "loss": 0.7814, "mean_token_accuracy": 0.6648334860801697, "step": 6240 }, { "epoch": 0.626, "grad_norm": 0.7313852310180664, "learning_rate": 9.374937493749374e-05, "loss": 0.9007, "mean_token_accuracy": 0.6549704939126968, "step": 6260 }, { "epoch": 0.628, "grad_norm": 1.237227439880371, "learning_rate": 9.372937293729374e-05, "loss": 0.8863, "mean_token_accuracy": 0.6628742888569832, "step": 6280 }, { "epoch": 0.63, "grad_norm": 0.7699325084686279, "learning_rate": 9.370937093709371e-05, "loss": 0.5021, "mean_token_accuracy": 0.6726272732019425, "step": 6300 }, { "epoch": 0.632, "grad_norm": 2.272789478302002, "learning_rate": 9.36893689368937e-05, "loss": 0.9212, "mean_token_accuracy": 0.6645343393087387, "step": 6320 }, { "epoch": 0.634, "grad_norm": 3.041881561279297, "learning_rate": 9.366936693669366e-05, "loss": 1.0216, "mean_token_accuracy": 0.6651605993509293, "step": 6340 }, { "epoch": 0.636, "grad_norm": 1.2670320272445679, "learning_rate": 9.364936493649366e-05, "loss": 0.9381, "mean_token_accuracy": 0.6534036457538605, "step": 6360 }, { "epoch": 0.638, "grad_norm": 1.585636019706726, "learning_rate": 9.362936293629363e-05, "loss": 0.9924, "mean_token_accuracy": 0.6605123072862625, "step": 6380 }, { "epoch": 0.64, "grad_norm": 1.2743092775344849, "learning_rate": 9.360936093609361e-05, "loss": 0.7672, "mean_token_accuracy": 0.6655510812997818, "step": 6400 }, { "epoch": 0.642, "grad_norm": 1.2287760972976685, "learning_rate": 9.35893589358936e-05, "loss": 0.8503, "mean_token_accuracy": 0.6664782613515854, "step": 6420 }, { "epoch": 0.644, "grad_norm": 0.8711447715759277, "learning_rate": 9.356935693569358e-05, "loss": 0.8508, "mean_token_accuracy": 0.6743896961212158, "step": 6440 }, { "epoch": 0.646, "grad_norm": 0.8022291660308838, "learning_rate": 9.354935493549355e-05, "loss": 0.7804, "mean_token_accuracy": 0.6643415749073028, "step": 6460 }, { "epoch": 0.648, "grad_norm": 1.4064675569534302, "learning_rate": 9.352935293529353e-05, "loss": 0.811, "mean_token_accuracy": 0.6714879125356674, "step": 6480 }, { "epoch": 0.65, "grad_norm": 2.584028959274292, "learning_rate": 9.350935093509351e-05, "loss": 0.8229, "mean_token_accuracy": 0.6572656691074371, "step": 6500 }, { "epoch": 0.652, "grad_norm": 0.8311348557472229, "learning_rate": 9.34893489348935e-05, "loss": 0.8639, "mean_token_accuracy": 0.6661805361509323, "step": 6520 }, { "epoch": 0.654, "grad_norm": 1.5471758842468262, "learning_rate": 9.346934693469347e-05, "loss": 0.8315, "mean_token_accuracy": 0.6472446978092193, "step": 6540 }, { "epoch": 0.656, "grad_norm": 0.8368139863014221, "learning_rate": 9.344934493449345e-05, "loss": 0.6331, "mean_token_accuracy": 0.66368827521801, "step": 6560 }, { "epoch": 0.658, "grad_norm": 1.588139295578003, "learning_rate": 9.342934293429343e-05, "loss": 0.7548, "mean_token_accuracy": 0.6530240833759308, "step": 6580 }, { "epoch": 0.66, "grad_norm": 1.215022325515747, "learning_rate": 9.340934093409342e-05, "loss": 0.9035, "mean_token_accuracy": 0.6476800888776779, "step": 6600 }, { "epoch": 0.662, "grad_norm": 1.7638986110687256, "learning_rate": 9.338933893389339e-05, "loss": 0.8617, "mean_token_accuracy": 0.6705313444137573, "step": 6620 }, { "epoch": 0.664, "grad_norm": 4.746072769165039, "learning_rate": 9.336933693369337e-05, "loss": 1.025, "mean_token_accuracy": 0.6610666871070862, "step": 6640 }, { "epoch": 0.666, "grad_norm": 1.2673625946044922, "learning_rate": 9.334933493349335e-05, "loss": 0.806, "mean_token_accuracy": 0.6585023671388626, "step": 6660 }, { "epoch": 0.668, "grad_norm": 0.9524112939834595, "learning_rate": 9.332933293329333e-05, "loss": 0.8975, "mean_token_accuracy": 0.65595583319664, "step": 6680 }, { "epoch": 0.67, "grad_norm": 1.6795239448547363, "learning_rate": 9.33093309330933e-05, "loss": 0.8467, "mean_token_accuracy": 0.6401339173316956, "step": 6700 }, { "epoch": 0.672, "grad_norm": 0.6834287643432617, "learning_rate": 9.32893289328933e-05, "loss": 0.9316, "mean_token_accuracy": 0.6560386657714844, "step": 6720 }, { "epoch": 0.674, "grad_norm": 1.2551835775375366, "learning_rate": 9.326932693269327e-05, "loss": 0.5878, "mean_token_accuracy": 0.6829735934734344, "step": 6740 }, { "epoch": 0.676, "grad_norm": 0.8644407391548157, "learning_rate": 9.324932493249325e-05, "loss": 0.6297, "mean_token_accuracy": 0.67588811814785, "step": 6760 }, { "epoch": 0.678, "grad_norm": 1.4109035730361938, "learning_rate": 9.322932293229324e-05, "loss": 0.9735, "mean_token_accuracy": 0.6482152789831161, "step": 6780 }, { "epoch": 0.68, "grad_norm": 1.1359890699386597, "learning_rate": 9.320932093209322e-05, "loss": 0.9865, "mean_token_accuracy": 0.6641979455947876, "step": 6800 }, { "epoch": 0.682, "grad_norm": 4.661962985992432, "learning_rate": 9.31893189318932e-05, "loss": 1.1488, "mean_token_accuracy": 0.6642133563756942, "step": 6820 }, { "epoch": 0.684, "grad_norm": 2.1430106163024902, "learning_rate": 9.316931693169317e-05, "loss": 0.9811, "mean_token_accuracy": 0.6657360851764679, "step": 6840 }, { "epoch": 0.686, "grad_norm": 6.126121997833252, "learning_rate": 9.314931493149315e-05, "loss": 0.906, "mean_token_accuracy": 0.6616795837879181, "step": 6860 }, { "epoch": 0.688, "grad_norm": 1.5471621751785278, "learning_rate": 9.312931293129314e-05, "loss": 1.0059, "mean_token_accuracy": 0.6685090452432633, "step": 6880 }, { "epoch": 0.69, "grad_norm": 3.3706657886505127, "learning_rate": 9.310931093109312e-05, "loss": 0.7684, "mean_token_accuracy": 0.6695968717336654, "step": 6900 }, { "epoch": 0.692, "grad_norm": 1.1430113315582275, "learning_rate": 9.308930893089309e-05, "loss": 0.9701, "mean_token_accuracy": 0.6494587540626526, "step": 6920 }, { "epoch": 0.694, "grad_norm": 1.061987042427063, "learning_rate": 9.306930693069307e-05, "loss": 0.7889, "mean_token_accuracy": 0.6679951161146164, "step": 6940 }, { "epoch": 0.696, "grad_norm": 3.040602922439575, "learning_rate": 9.304930493049306e-05, "loss": 0.7177, "mean_token_accuracy": 0.6514664262533187, "step": 6960 }, { "epoch": 0.698, "grad_norm": 0.970022439956665, "learning_rate": 9.302930293029304e-05, "loss": 0.8946, "mean_token_accuracy": 0.6661164492368699, "step": 6980 }, { "epoch": 0.7, "grad_norm": 2.129945755004883, "learning_rate": 9.300930093009301e-05, "loss": 1.0056, "mean_token_accuracy": 0.6580301463603974, "step": 7000 }, { "epoch": 0.702, "grad_norm": 1.5264748334884644, "learning_rate": 9.298929892989299e-05, "loss": 0.7452, "mean_token_accuracy": 0.6558452665805816, "step": 7020 }, { "epoch": 0.704, "grad_norm": 6.915838718414307, "learning_rate": 9.296929692969298e-05, "loss": 0.6564, "mean_token_accuracy": 0.6736262679100037, "step": 7040 }, { "epoch": 0.706, "grad_norm": 0.7658067345619202, "learning_rate": 9.294929492949296e-05, "loss": 0.7235, "mean_token_accuracy": 0.6659425765275955, "step": 7060 }, { "epoch": 0.708, "grad_norm": 1.6119685173034668, "learning_rate": 9.292929292929293e-05, "loss": 1.0702, "mean_token_accuracy": 0.65518858730793, "step": 7080 }, { "epoch": 0.71, "grad_norm": 1.7088737487792969, "learning_rate": 9.290929092909292e-05, "loss": 0.9129, "mean_token_accuracy": 0.6632930934429169, "step": 7100 }, { "epoch": 0.712, "grad_norm": 3.9656903743743896, "learning_rate": 9.28892889288929e-05, "loss": 0.6839, "mean_token_accuracy": 0.6674529552459717, "step": 7120 }, { "epoch": 0.714, "grad_norm": 1.1310580968856812, "learning_rate": 9.286928692869288e-05, "loss": 0.9638, "mean_token_accuracy": 0.6721255362033844, "step": 7140 }, { "epoch": 0.716, "grad_norm": 1.2415573596954346, "learning_rate": 9.284928492849285e-05, "loss": 0.5547, "mean_token_accuracy": 0.6775854825973511, "step": 7160 }, { "epoch": 0.718, "grad_norm": 2.5680930614471436, "learning_rate": 9.282928292829284e-05, "loss": 0.6802, "mean_token_accuracy": 0.669127032160759, "step": 7180 }, { "epoch": 0.72, "grad_norm": 0.8748185038566589, "learning_rate": 9.280928092809281e-05, "loss": 0.4799, "mean_token_accuracy": 0.6814782738685607, "step": 7200 }, { "epoch": 0.722, "grad_norm": 2.1335341930389404, "learning_rate": 9.27892789278928e-05, "loss": 0.8468, "mean_token_accuracy": 0.6709761023521423, "step": 7220 }, { "epoch": 0.724, "grad_norm": 1.2749598026275635, "learning_rate": 9.276927692769278e-05, "loss": 0.6831, "mean_token_accuracy": 0.6839420348405838, "step": 7240 }, { "epoch": 0.726, "grad_norm": 3.3807621002197266, "learning_rate": 9.274927492749276e-05, "loss": 0.6884, "mean_token_accuracy": 0.6682859241962433, "step": 7260 }, { "epoch": 0.728, "grad_norm": 3.30072283744812, "learning_rate": 9.272927292729273e-05, "loss": 0.8395, "mean_token_accuracy": 0.6587472885847092, "step": 7280 }, { "epoch": 0.73, "grad_norm": 0.9489920139312744, "learning_rate": 9.270927092709271e-05, "loss": 0.8663, "mean_token_accuracy": 0.6762009412050247, "step": 7300 }, { "epoch": 0.732, "grad_norm": 1.4304338693618774, "learning_rate": 9.26892689268927e-05, "loss": 0.8279, "mean_token_accuracy": 0.6560547649860382, "step": 7320 }, { "epoch": 0.734, "grad_norm": 1.3956124782562256, "learning_rate": 9.266926692669268e-05, "loss": 0.8956, "mean_token_accuracy": 0.6494292914867401, "step": 7340 }, { "epoch": 0.736, "grad_norm": 0.8983442187309265, "learning_rate": 9.264926492649265e-05, "loss": 0.6295, "mean_token_accuracy": 0.6803551197052002, "step": 7360 }, { "epoch": 0.738, "grad_norm": 0.8164054751396179, "learning_rate": 9.262926292629263e-05, "loss": 0.7459, "mean_token_accuracy": 0.6763005167245865, "step": 7380 }, { "epoch": 0.74, "grad_norm": 0.9074740409851074, "learning_rate": 9.260926092609262e-05, "loss": 0.7966, "mean_token_accuracy": 0.6745173066854477, "step": 7400 }, { "epoch": 0.742, "grad_norm": 1.2072185277938843, "learning_rate": 9.25892589258926e-05, "loss": 0.7398, "mean_token_accuracy": 0.6624195247888565, "step": 7420 }, { "epoch": 0.744, "grad_norm": 1.4692628383636475, "learning_rate": 9.256925692569257e-05, "loss": 0.8298, "mean_token_accuracy": 0.6741400182247161, "step": 7440 }, { "epoch": 0.746, "grad_norm": 0.885430097579956, "learning_rate": 9.254925492549255e-05, "loss": 0.7928, "mean_token_accuracy": 0.665663731098175, "step": 7460 }, { "epoch": 0.748, "grad_norm": 0.9728226065635681, "learning_rate": 9.252925292529253e-05, "loss": 0.6563, "mean_token_accuracy": 0.6622903138399124, "step": 7480 }, { "epoch": 0.75, "grad_norm": 1.1150236129760742, "learning_rate": 9.250925092509252e-05, "loss": 0.9439, "mean_token_accuracy": 0.6568855434656143, "step": 7500 }, { "epoch": 0.752, "grad_norm": 5.817782878875732, "learning_rate": 9.248924892489249e-05, "loss": 1.0707, "mean_token_accuracy": 0.6556974232196808, "step": 7520 }, { "epoch": 0.754, "grad_norm": 1.7092589139938354, "learning_rate": 9.246924692469247e-05, "loss": 0.7127, "mean_token_accuracy": 0.6602062165737153, "step": 7540 }, { "epoch": 0.756, "grad_norm": 1.6525636911392212, "learning_rate": 9.244924492449245e-05, "loss": 0.8109, "mean_token_accuracy": 0.6605839401483535, "step": 7560 }, { "epoch": 0.758, "grad_norm": 1.066807746887207, "learning_rate": 9.242924292429244e-05, "loss": 0.7653, "mean_token_accuracy": 0.667397677898407, "step": 7580 }, { "epoch": 0.76, "grad_norm": 1.0027117729187012, "learning_rate": 9.24092409240924e-05, "loss": 0.5736, "mean_token_accuracy": 0.665279284119606, "step": 7600 }, { "epoch": 0.762, "grad_norm": 0.8232443332672119, "learning_rate": 9.23892389238924e-05, "loss": 0.6676, "mean_token_accuracy": 0.6655489265918731, "step": 7620 }, { "epoch": 0.764, "grad_norm": 1.488205909729004, "learning_rate": 9.236923692369237e-05, "loss": 0.9088, "mean_token_accuracy": 0.6541147112846375, "step": 7640 }, { "epoch": 0.766, "grad_norm": 0.7287368774414062, "learning_rate": 9.234923492349235e-05, "loss": 0.8414, "mean_token_accuracy": 0.6536255478858948, "step": 7660 }, { "epoch": 0.768, "grad_norm": 1.1022675037384033, "learning_rate": 9.232923292329232e-05, "loss": 0.5858, "mean_token_accuracy": 0.6658602595329285, "step": 7680 }, { "epoch": 0.77, "grad_norm": 0.9213204979896545, "learning_rate": 9.230923092309232e-05, "loss": 0.8918, "mean_token_accuracy": 0.6735916793346405, "step": 7700 }, { "epoch": 0.772, "grad_norm": 0.8819130063056946, "learning_rate": 9.228922892289229e-05, "loss": 0.7928, "mean_token_accuracy": 0.6767945021390915, "step": 7720 }, { "epoch": 0.774, "grad_norm": 0.9245057702064514, "learning_rate": 9.226922692269227e-05, "loss": 0.7547, "mean_token_accuracy": 0.6780115604400635, "step": 7740 }, { "epoch": 0.776, "grad_norm": 0.732587456703186, "learning_rate": 9.224922492249226e-05, "loss": 0.5661, "mean_token_accuracy": 0.6930559545755386, "step": 7760 }, { "epoch": 0.778, "grad_norm": 1.3257665634155273, "learning_rate": 9.222922292229224e-05, "loss": 0.7074, "mean_token_accuracy": 0.6763699144124985, "step": 7780 }, { "epoch": 0.78, "grad_norm": 1.4781749248504639, "learning_rate": 9.220922092209221e-05, "loss": 0.8203, "mean_token_accuracy": 0.6689307034015656, "step": 7800 }, { "epoch": 0.782, "grad_norm": 0.8947974443435669, "learning_rate": 9.218921892189219e-05, "loss": 0.588, "mean_token_accuracy": 0.6792481184005738, "step": 7820 }, { "epoch": 0.784, "grad_norm": 1.7630465030670166, "learning_rate": 9.216921692169217e-05, "loss": 0.8203, "mean_token_accuracy": 0.6572496622800827, "step": 7840 }, { "epoch": 0.786, "grad_norm": 2.6763317584991455, "learning_rate": 9.214921492149216e-05, "loss": 0.8906, "mean_token_accuracy": 0.6589333802461624, "step": 7860 }, { "epoch": 0.788, "grad_norm": 1.760607123374939, "learning_rate": 9.212921292129213e-05, "loss": 0.8566, "mean_token_accuracy": 0.6761729210615158, "step": 7880 }, { "epoch": 0.79, "grad_norm": 1.221703290939331, "learning_rate": 9.210921092109211e-05, "loss": 0.647, "mean_token_accuracy": 0.6647182643413544, "step": 7900 }, { "epoch": 0.792, "grad_norm": 1.1340383291244507, "learning_rate": 9.20892089208921e-05, "loss": 0.6131, "mean_token_accuracy": 0.6549494355916977, "step": 7920 }, { "epoch": 0.794, "grad_norm": 3.1617071628570557, "learning_rate": 9.206920692069208e-05, "loss": 0.8055, "mean_token_accuracy": 0.6606454938650131, "step": 7940 }, { "epoch": 0.796, "grad_norm": 2.6513047218322754, "learning_rate": 9.204920492049205e-05, "loss": 0.9151, "mean_token_accuracy": 0.660069489479065, "step": 7960 }, { "epoch": 0.798, "grad_norm": 2.5707545280456543, "learning_rate": 9.202920292029203e-05, "loss": 1.0266, "mean_token_accuracy": 0.6552793979644775, "step": 7980 }, { "epoch": 0.8, "grad_norm": 2.0892038345336914, "learning_rate": 9.200920092009201e-05, "loss": 0.9219, "mean_token_accuracy": 0.6702646732330322, "step": 8000 }, { "epoch": 0.802, "grad_norm": 1.605025053024292, "learning_rate": 9.1989198919892e-05, "loss": 1.038, "mean_token_accuracy": 0.66781245470047, "step": 8020 }, { "epoch": 0.804, "grad_norm": 0.7712906002998352, "learning_rate": 9.196919691969196e-05, "loss": 0.6891, "mean_token_accuracy": 0.6634811729192733, "step": 8040 }, { "epoch": 0.806, "grad_norm": 0.940776526927948, "learning_rate": 9.194919491949195e-05, "loss": 0.6904, "mean_token_accuracy": 0.6635693162679672, "step": 8060 }, { "epoch": 0.808, "grad_norm": 2.291799306869507, "learning_rate": 9.192919291929193e-05, "loss": 0.997, "mean_token_accuracy": 0.6692399173974991, "step": 8080 }, { "epoch": 0.81, "grad_norm": 0.7738393545150757, "learning_rate": 9.190919091909191e-05, "loss": 0.8912, "mean_token_accuracy": 0.6822640985250473, "step": 8100 }, { "epoch": 0.812, "grad_norm": 2.945173978805542, "learning_rate": 9.188918891889188e-05, "loss": 0.769, "mean_token_accuracy": 0.6640377223491669, "step": 8120 }, { "epoch": 0.814, "grad_norm": 1.116876244544983, "learning_rate": 9.186918691869188e-05, "loss": 0.7061, "mean_token_accuracy": 0.6750342875719071, "step": 8140 }, { "epoch": 0.816, "grad_norm": 0.850378155708313, "learning_rate": 9.184918491849185e-05, "loss": 0.9927, "mean_token_accuracy": 0.6486116111278534, "step": 8160 }, { "epoch": 0.818, "grad_norm": 2.1469454765319824, "learning_rate": 9.182918291829183e-05, "loss": 0.7417, "mean_token_accuracy": 0.6732907235622406, "step": 8180 }, { "epoch": 0.82, "grad_norm": 4.718832492828369, "learning_rate": 9.18091809180918e-05, "loss": 0.6993, "mean_token_accuracy": 0.6776007145643235, "step": 8200 }, { "epoch": 0.822, "grad_norm": 0.7138538956642151, "learning_rate": 9.17891789178918e-05, "loss": 0.8367, "mean_token_accuracy": 0.661688506603241, "step": 8220 }, { "epoch": 0.824, "grad_norm": 2.0133118629455566, "learning_rate": 9.176917691769177e-05, "loss": 0.7703, "mean_token_accuracy": 0.6636381208896637, "step": 8240 }, { "epoch": 0.826, "grad_norm": 1.3703150749206543, "learning_rate": 9.174917491749175e-05, "loss": 0.6003, "mean_token_accuracy": 0.6739101856946945, "step": 8260 }, { "epoch": 0.828, "grad_norm": 1.7647544145584106, "learning_rate": 9.172917291729173e-05, "loss": 0.7779, "mean_token_accuracy": 0.655205351114273, "step": 8280 }, { "epoch": 0.83, "grad_norm": 2.0467991828918457, "learning_rate": 9.170917091709172e-05, "loss": 0.6173, "mean_token_accuracy": 0.6673642754554748, "step": 8300 }, { "epoch": 0.832, "grad_norm": 4.676120758056641, "learning_rate": 9.16891689168917e-05, "loss": 1.0113, "mean_token_accuracy": 0.660214039683342, "step": 8320 }, { "epoch": 0.834, "grad_norm": 2.1926817893981934, "learning_rate": 9.166916691669167e-05, "loss": 0.8351, "mean_token_accuracy": 0.675602599978447, "step": 8340 }, { "epoch": 0.836, "grad_norm": 0.5701399445533752, "learning_rate": 9.164916491649165e-05, "loss": 0.5132, "mean_token_accuracy": 0.6798494398593903, "step": 8360 }, { "epoch": 0.838, "grad_norm": 0.6894659399986267, "learning_rate": 9.162916291629164e-05, "loss": 0.5845, "mean_token_accuracy": 0.6675403922796249, "step": 8380 }, { "epoch": 0.84, "grad_norm": 8.077582359313965, "learning_rate": 9.160916091609162e-05, "loss": 0.591, "mean_token_accuracy": 0.6506429105997086, "step": 8400 }, { "epoch": 0.842, "grad_norm": 0.6910912394523621, "learning_rate": 9.158915891589159e-05, "loss": 0.9514, "mean_token_accuracy": 0.6544751942157745, "step": 8420 }, { "epoch": 0.844, "grad_norm": 0.8903449773788452, "learning_rate": 9.156915691569159e-05, "loss": 0.4855, "mean_token_accuracy": 0.6915587931871414, "step": 8440 }, { "epoch": 0.846, "grad_norm": 0.9856466054916382, "learning_rate": 9.154915491549155e-05, "loss": 0.6993, "mean_token_accuracy": 0.6700444906949997, "step": 8460 }, { "epoch": 0.848, "grad_norm": 1.0484875440597534, "learning_rate": 9.152915291529154e-05, "loss": 0.6098, "mean_token_accuracy": 0.6663589477539062, "step": 8480 }, { "epoch": 0.85, "grad_norm": 1.1943718194961548, "learning_rate": 9.150915091509151e-05, "loss": 0.835, "mean_token_accuracy": 0.6663583904504776, "step": 8500 }, { "epoch": 0.852, "grad_norm": 1.5613973140716553, "learning_rate": 9.14891489148915e-05, "loss": 0.9077, "mean_token_accuracy": 0.651554799079895, "step": 8520 }, { "epoch": 0.854, "grad_norm": 0.6001622676849365, "learning_rate": 9.146914691469147e-05, "loss": 0.6189, "mean_token_accuracy": 0.6734272599220276, "step": 8540 }, { "epoch": 0.856, "grad_norm": 3.427902936935425, "learning_rate": 9.144914491449146e-05, "loss": 0.9181, "mean_token_accuracy": 0.6623103618621826, "step": 8560 }, { "epoch": 0.858, "grad_norm": 1.1803587675094604, "learning_rate": 9.142914291429143e-05, "loss": 0.5492, "mean_token_accuracy": 0.6525664120912552, "step": 8580 }, { "epoch": 0.86, "grad_norm": 2.308009386062622, "learning_rate": 9.140914091409142e-05, "loss": 1.2088, "mean_token_accuracy": 0.6709436118602753, "step": 8600 }, { "epoch": 0.862, "grad_norm": 0.7417079210281372, "learning_rate": 9.138913891389139e-05, "loss": 0.6767, "mean_token_accuracy": 0.6862359136343003, "step": 8620 }, { "epoch": 0.864, "grad_norm": 1.8243463039398193, "learning_rate": 9.136913691369137e-05, "loss": 0.7204, "mean_token_accuracy": 0.6628228187561035, "step": 8640 }, { "epoch": 0.866, "grad_norm": 0.7223024368286133, "learning_rate": 9.134913491349136e-05, "loss": 0.6721, "mean_token_accuracy": 0.6719308316707611, "step": 8660 }, { "epoch": 0.868, "grad_norm": 1.1341661214828491, "learning_rate": 9.132913291329134e-05, "loss": 0.8107, "mean_token_accuracy": 0.6596487283706665, "step": 8680 }, { "epoch": 0.87, "grad_norm": 1.3605455160140991, "learning_rate": 9.130913091309131e-05, "loss": 0.9299, "mean_token_accuracy": 0.6701935261487961, "step": 8700 }, { "epoch": 0.872, "grad_norm": 1.0869039297103882, "learning_rate": 9.12891289128913e-05, "loss": 0.8241, "mean_token_accuracy": 0.6665963679552078, "step": 8720 }, { "epoch": 0.874, "grad_norm": 3.134340286254883, "learning_rate": 9.126912691269128e-05, "loss": 0.7323, "mean_token_accuracy": 0.6712658941745758, "step": 8740 }, { "epoch": 0.876, "grad_norm": 1.0900349617004395, "learning_rate": 9.124912491249126e-05, "loss": 0.7699, "mean_token_accuracy": 0.6753752380609512, "step": 8760 }, { "epoch": 0.878, "grad_norm": 0.8416166305541992, "learning_rate": 9.122912291229123e-05, "loss": 0.6267, "mean_token_accuracy": 0.6728849202394486, "step": 8780 }, { "epoch": 0.88, "grad_norm": 1.0594664812088013, "learning_rate": 9.120912091209121e-05, "loss": 0.6756, "mean_token_accuracy": 0.6608364164829255, "step": 8800 }, { "epoch": 0.882, "grad_norm": 1.0708415508270264, "learning_rate": 9.11891189118912e-05, "loss": 0.7248, "mean_token_accuracy": 0.6671474367380142, "step": 8820 }, { "epoch": 0.884, "grad_norm": 1.1216610670089722, "learning_rate": 9.116911691169118e-05, "loss": 0.7053, "mean_token_accuracy": 0.6642766177654267, "step": 8840 }, { "epoch": 0.886, "grad_norm": 0.7421129941940308, "learning_rate": 9.114911491149115e-05, "loss": 0.6596, "mean_token_accuracy": 0.679628336429596, "step": 8860 }, { "epoch": 0.888, "grad_norm": 1.1607214212417603, "learning_rate": 9.112911291129113e-05, "loss": 0.5636, "mean_token_accuracy": 0.6801018267869949, "step": 8880 }, { "epoch": 0.89, "grad_norm": 1.316946268081665, "learning_rate": 9.110911091109111e-05, "loss": 0.7251, "mean_token_accuracy": 0.6618233293294906, "step": 8900 }, { "epoch": 0.892, "grad_norm": 1.3579041957855225, "learning_rate": 9.10891089108911e-05, "loss": 0.9529, "mean_token_accuracy": 0.652110344171524, "step": 8920 }, { "epoch": 0.894, "grad_norm": 0.9045913815498352, "learning_rate": 9.106910691069107e-05, "loss": 0.8093, "mean_token_accuracy": 0.672506007552147, "step": 8940 }, { "epoch": 0.896, "grad_norm": 0.8080394864082336, "learning_rate": 9.104910491049106e-05, "loss": 0.8132, "mean_token_accuracy": 0.6835117161273956, "step": 8960 }, { "epoch": 0.898, "grad_norm": 0.7649246454238892, "learning_rate": 9.102910291029103e-05, "loss": 0.6577, "mean_token_accuracy": 0.6719095349311829, "step": 8980 }, { "epoch": 0.9, "grad_norm": 1.799012541770935, "learning_rate": 9.100910091009102e-05, "loss": 0.8267, "mean_token_accuracy": 0.6761313557624817, "step": 9000 }, { "epoch": 0.902, "grad_norm": 1.2893325090408325, "learning_rate": 9.098909890989098e-05, "loss": 0.9635, "mean_token_accuracy": 0.6665897041559219, "step": 9020 }, { "epoch": 0.904, "grad_norm": 1.1883139610290527, "learning_rate": 9.096909690969098e-05, "loss": 0.5034, "mean_token_accuracy": 0.6716513067483902, "step": 9040 }, { "epoch": 0.906, "grad_norm": 0.7591665983200073, "learning_rate": 9.094909490949095e-05, "loss": 0.7992, "mean_token_accuracy": 0.6739805459976196, "step": 9060 }, { "epoch": 0.908, "grad_norm": 0.5677579045295715, "learning_rate": 9.092909290929093e-05, "loss": 0.6017, "mean_token_accuracy": 0.6761542141437531, "step": 9080 }, { "epoch": 0.91, "grad_norm": 1.2295539379119873, "learning_rate": 9.090909090909092e-05, "loss": 0.7596, "mean_token_accuracy": 0.6617599308490754, "step": 9100 }, { "epoch": 0.912, "grad_norm": 0.982981264591217, "learning_rate": 9.08890889088909e-05, "loss": 0.9196, "mean_token_accuracy": 0.6579632997512818, "step": 9120 }, { "epoch": 0.914, "grad_norm": 1.5708266496658325, "learning_rate": 9.086908690869087e-05, "loss": 0.9838, "mean_token_accuracy": 0.6601591825485229, "step": 9140 }, { "epoch": 0.916, "grad_norm": 5.570658206939697, "learning_rate": 9.084908490849085e-05, "loss": 0.8426, "mean_token_accuracy": 0.6737748503684997, "step": 9160 }, { "epoch": 0.918, "grad_norm": 4.791493892669678, "learning_rate": 9.082908290829084e-05, "loss": 0.4816, "mean_token_accuracy": 0.6879294782876968, "step": 9180 }, { "epoch": 0.92, "grad_norm": 0.8159438371658325, "learning_rate": 9.080908090809082e-05, "loss": 0.6517, "mean_token_accuracy": 0.6641249239444733, "step": 9200 }, { "epoch": 0.922, "grad_norm": 0.8148259520530701, "learning_rate": 9.078907890789079e-05, "loss": 0.7427, "mean_token_accuracy": 0.6736963361501693, "step": 9220 }, { "epoch": 0.924, "grad_norm": 0.9576878547668457, "learning_rate": 9.076907690769077e-05, "loss": 0.62, "mean_token_accuracy": 0.6643208086490631, "step": 9240 }, { "epoch": 0.926, "grad_norm": 1.0139881372451782, "learning_rate": 9.074907490749075e-05, "loss": 0.7525, "mean_token_accuracy": 0.668274262547493, "step": 9260 }, { "epoch": 0.928, "grad_norm": 1.1630759239196777, "learning_rate": 9.072907290729074e-05, "loss": 0.5872, "mean_token_accuracy": 0.6719723254442215, "step": 9280 }, { "epoch": 0.93, "grad_norm": 0.6954880952835083, "learning_rate": 9.070907090709071e-05, "loss": 0.8299, "mean_token_accuracy": 0.6708706825971603, "step": 9300 }, { "epoch": 0.932, "grad_norm": 1.178210973739624, "learning_rate": 9.068906890689069e-05, "loss": 0.4757, "mean_token_accuracy": 0.6800819098949432, "step": 9320 }, { "epoch": 0.934, "grad_norm": 1.8534265756607056, "learning_rate": 9.066906690669067e-05, "loss": 0.6682, "mean_token_accuracy": 0.6732523530721665, "step": 9340 }, { "epoch": 0.936, "grad_norm": 3.7028133869171143, "learning_rate": 9.064906490649066e-05, "loss": 0.7136, "mean_token_accuracy": 0.6613909780979157, "step": 9360 }, { "epoch": 0.938, "grad_norm": 2.145373821258545, "learning_rate": 9.062906290629063e-05, "loss": 0.918, "mean_token_accuracy": 0.6611023873090744, "step": 9380 }, { "epoch": 0.94, "grad_norm": 1.0123775005340576, "learning_rate": 9.060906090609061e-05, "loss": 0.7474, "mean_token_accuracy": 0.6668477684259415, "step": 9400 }, { "epoch": 0.942, "grad_norm": 0.8934378027915955, "learning_rate": 9.058905890589059e-05, "loss": 0.5897, "mean_token_accuracy": 0.674164867401123, "step": 9420 }, { "epoch": 0.944, "grad_norm": 4.622416019439697, "learning_rate": 9.056905690569057e-05, "loss": 0.8111, "mean_token_accuracy": 0.6689103245735168, "step": 9440 }, { "epoch": 0.946, "grad_norm": 0.7833349108695984, "learning_rate": 9.054905490549054e-05, "loss": 0.6482, "mean_token_accuracy": 0.6703067511320114, "step": 9460 }, { "epoch": 0.948, "grad_norm": 1.113885521888733, "learning_rate": 9.052905290529054e-05, "loss": 0.6415, "mean_token_accuracy": 0.6784826725721359, "step": 9480 }, { "epoch": 0.95, "grad_norm": 0.5889685750007629, "learning_rate": 9.050905090509051e-05, "loss": 0.6157, "mean_token_accuracy": 0.6785697728395462, "step": 9500 }, { "epoch": 0.952, "grad_norm": 1.0165150165557861, "learning_rate": 9.04890489048905e-05, "loss": 0.5751, "mean_token_accuracy": 0.6785718858242035, "step": 9520 }, { "epoch": 0.954, "grad_norm": 0.6460950374603271, "learning_rate": 9.046904690469046e-05, "loss": 0.6807, "mean_token_accuracy": 0.6873869001865387, "step": 9540 }, { "epoch": 0.956, "grad_norm": 0.8695957660675049, "learning_rate": 9.044904490449046e-05, "loss": 0.6943, "mean_token_accuracy": 0.6658698320388794, "step": 9560 }, { "epoch": 0.958, "grad_norm": 0.7577893137931824, "learning_rate": 9.042904290429043e-05, "loss": 0.7178, "mean_token_accuracy": 0.6768856287002564, "step": 9580 }, { "epoch": 0.96, "grad_norm": 2.5963211059570312, "learning_rate": 9.040904090409041e-05, "loss": 0.7955, "mean_token_accuracy": 0.6659332007169724, "step": 9600 }, { "epoch": 0.962, "grad_norm": 2.6931076049804688, "learning_rate": 9.03890389038904e-05, "loss": 0.5918, "mean_token_accuracy": 0.6630280256271363, "step": 9620 }, { "epoch": 0.964, "grad_norm": 0.7399647831916809, "learning_rate": 9.036903690369038e-05, "loss": 0.9049, "mean_token_accuracy": 0.6655225396156311, "step": 9640 }, { "epoch": 0.966, "grad_norm": 2.3366994857788086, "learning_rate": 9.034903490349035e-05, "loss": 0.9797, "mean_token_accuracy": 0.6746182650327682, "step": 9660 }, { "epoch": 0.968, "grad_norm": 0.8411794900894165, "learning_rate": 9.032903290329033e-05, "loss": 0.8041, "mean_token_accuracy": 0.6651323944330215, "step": 9680 }, { "epoch": 0.97, "grad_norm": 0.7890340089797974, "learning_rate": 9.030903090309031e-05, "loss": 0.507, "mean_token_accuracy": 0.6927546411752701, "step": 9700 }, { "epoch": 0.972, "grad_norm": 1.6731897592544556, "learning_rate": 9.02890289028903e-05, "loss": 0.9317, "mean_token_accuracy": 0.6553625196218491, "step": 9720 }, { "epoch": 0.974, "grad_norm": 1.8394495248794556, "learning_rate": 9.026902690269027e-05, "loss": 0.7464, "mean_token_accuracy": 0.6805295020341873, "step": 9740 }, { "epoch": 0.976, "grad_norm": 0.6131716370582581, "learning_rate": 9.024902490249025e-05, "loss": 0.7972, "mean_token_accuracy": 0.6620415389537812, "step": 9760 }, { "epoch": 0.978, "grad_norm": 1.361794114112854, "learning_rate": 9.022902290229023e-05, "loss": 0.6777, "mean_token_accuracy": 0.6763813495635986, "step": 9780 }, { "epoch": 0.98, "grad_norm": 0.7802141308784485, "learning_rate": 9.020902090209022e-05, "loss": 0.5933, "mean_token_accuracy": 0.6620063573122025, "step": 9800 }, { "epoch": 0.982, "grad_norm": 1.5056581497192383, "learning_rate": 9.01890189018902e-05, "loss": 0.8552, "mean_token_accuracy": 0.6822407931089401, "step": 9820 }, { "epoch": 0.984, "grad_norm": 4.647301197052002, "learning_rate": 9.016901690169017e-05, "loss": 0.7689, "mean_token_accuracy": 0.6661632120609283, "step": 9840 }, { "epoch": 0.986, "grad_norm": 0.7921538949012756, "learning_rate": 9.014901490149016e-05, "loss": 0.7517, "mean_token_accuracy": 0.6734686851501465, "step": 9860 }, { "epoch": 0.988, "grad_norm": 1.0864884853363037, "learning_rate": 9.012901290129013e-05, "loss": 0.8893, "mean_token_accuracy": 0.6630823135375976, "step": 9880 }, { "epoch": 0.99, "grad_norm": 1.258154273033142, "learning_rate": 9.010901090109012e-05, "loss": 0.7398, "mean_token_accuracy": 0.6609723746776581, "step": 9900 }, { "epoch": 0.992, "grad_norm": 1.0020272731781006, "learning_rate": 9.008900890089009e-05, "loss": 0.6399, "mean_token_accuracy": 0.6793039113283157, "step": 9920 }, { "epoch": 0.994, "grad_norm": 2.108456611633301, "learning_rate": 9.006900690069008e-05, "loss": 0.487, "mean_token_accuracy": 0.6835231721401215, "step": 9940 }, { "epoch": 0.996, "grad_norm": 0.9598050117492676, "learning_rate": 9.004900490049005e-05, "loss": 1.0024, "mean_token_accuracy": 0.6631507307291031, "step": 9960 }, { "epoch": 0.998, "grad_norm": 0.9263708591461182, "learning_rate": 9.002900290029004e-05, "loss": 0.7151, "mean_token_accuracy": 0.67125004529953, "step": 9980 }, { "epoch": 1.0, "grad_norm": 0.7976640462875366, "learning_rate": 9.000900090009002e-05, "loss": 0.8368, "mean_token_accuracy": 0.6802001237869263, "step": 10000 }, { "epoch": 1.002, "grad_norm": 1.3322148323059082, "learning_rate": 8.998899889989e-05, "loss": 0.8368, "mean_token_accuracy": 0.7046248853206635, "step": 10020 }, { "epoch": 1.004, "grad_norm": 2.247560977935791, "learning_rate": 8.996899689968997e-05, "loss": 0.7125, "mean_token_accuracy": 0.6907266914844513, "step": 10040 }, { "epoch": 1.006, "grad_norm": 0.9087889790534973, "learning_rate": 8.994899489948995e-05, "loss": 0.5326, "mean_token_accuracy": 0.7226085543632508, "step": 10060 }, { "epoch": 1.008, "grad_norm": 1.6002432107925415, "learning_rate": 8.992899289928994e-05, "loss": 0.7031, "mean_token_accuracy": 0.7187296152114868, "step": 10080 }, { "epoch": 1.01, "grad_norm": 0.9077501893043518, "learning_rate": 8.990899089908992e-05, "loss": 0.483, "mean_token_accuracy": 0.7092296659946442, "step": 10100 }, { "epoch": 1.012, "grad_norm": 1.874840259552002, "learning_rate": 8.988898889888989e-05, "loss": 0.6129, "mean_token_accuracy": 0.726060664653778, "step": 10120 }, { "epoch": 1.014, "grad_norm": 1.110640048980713, "learning_rate": 8.986898689868987e-05, "loss": 0.7627, "mean_token_accuracy": 0.6922136753797531, "step": 10140 }, { "epoch": 1.016, "grad_norm": 0.7734236121177673, "learning_rate": 8.984898489848986e-05, "loss": 0.569, "mean_token_accuracy": 0.7051617801189423, "step": 10160 }, { "epoch": 1.018, "grad_norm": 0.5256991982460022, "learning_rate": 8.982898289828984e-05, "loss": 0.4272, "mean_token_accuracy": 0.7220031261444092, "step": 10180 }, { "epoch": 1.02, "grad_norm": 0.7006994485855103, "learning_rate": 8.980898089808981e-05, "loss": 0.4329, "mean_token_accuracy": 0.7355425775051116, "step": 10200 }, { "epoch": 1.022, "grad_norm": 0.7696146965026855, "learning_rate": 8.978897889788979e-05, "loss": 0.7299, "mean_token_accuracy": 0.6942385613918305, "step": 10220 }, { "epoch": 1.024, "grad_norm": 0.8435760140419006, "learning_rate": 8.976897689768977e-05, "loss": 0.5151, "mean_token_accuracy": 0.710737407207489, "step": 10240 }, { "epoch": 1.026, "grad_norm": 0.710013210773468, "learning_rate": 8.974897489748976e-05, "loss": 0.8069, "mean_token_accuracy": 0.6971306353807449, "step": 10260 }, { "epoch": 1.028, "grad_norm": 0.5898831486701965, "learning_rate": 8.972897289728973e-05, "loss": 0.7987, "mean_token_accuracy": 0.7036171048879624, "step": 10280 }, { "epoch": 1.03, "grad_norm": 0.6048874258995056, "learning_rate": 8.970897089708971e-05, "loss": 0.5643, "mean_token_accuracy": 0.7090870559215545, "step": 10300 }, { "epoch": 1.032, "grad_norm": 2.7120590209960938, "learning_rate": 8.96889688968897e-05, "loss": 0.6123, "mean_token_accuracy": 0.6988580405712128, "step": 10320 }, { "epoch": 1.034, "grad_norm": 0.8622129559516907, "learning_rate": 8.966896689668968e-05, "loss": 0.633, "mean_token_accuracy": 0.714711046218872, "step": 10340 }, { "epoch": 1.036, "grad_norm": 1.5965604782104492, "learning_rate": 8.964896489648965e-05, "loss": 0.6437, "mean_token_accuracy": 0.7055393636226654, "step": 10360 }, { "epoch": 1.038, "grad_norm": 0.6459986567497253, "learning_rate": 8.962896289628964e-05, "loss": 0.6288, "mean_token_accuracy": 0.7101465821266174, "step": 10380 }, { "epoch": 1.04, "grad_norm": 5.907314777374268, "learning_rate": 8.960896089608961e-05, "loss": 0.5078, "mean_token_accuracy": 0.6954573869705201, "step": 10400 }, { "epoch": 1.042, "grad_norm": 1.1032240390777588, "learning_rate": 8.95889588958896e-05, "loss": 0.4664, "mean_token_accuracy": 0.7343897342681884, "step": 10420 }, { "epoch": 1.044, "grad_norm": 0.9582010507583618, "learning_rate": 8.956895689568956e-05, "loss": 0.7058, "mean_token_accuracy": 0.684578201174736, "step": 10440 }, { "epoch": 1.046, "grad_norm": 1.227325677871704, "learning_rate": 8.954895489548956e-05, "loss": 0.6599, "mean_token_accuracy": 0.7057207196950912, "step": 10460 }, { "epoch": 1.048, "grad_norm": 1.238121747970581, "learning_rate": 8.952895289528953e-05, "loss": 0.5632, "mean_token_accuracy": 0.6974528133869171, "step": 10480 }, { "epoch": 1.05, "grad_norm": 0.9272149205207825, "learning_rate": 8.950895089508951e-05, "loss": 0.5405, "mean_token_accuracy": 0.7146629333496094, "step": 10500 }, { "epoch": 1.052, "grad_norm": 1.0964316129684448, "learning_rate": 8.94889488948895e-05, "loss": 0.4697, "mean_token_accuracy": 0.7134774655103684, "step": 10520 }, { "epoch": 1.054, "grad_norm": 0.8879337310791016, "learning_rate": 8.946894689468948e-05, "loss": 0.5552, "mean_token_accuracy": 0.7152970224618912, "step": 10540 }, { "epoch": 1.056, "grad_norm": 0.8506771922111511, "learning_rate": 8.944894489448945e-05, "loss": 0.4064, "mean_token_accuracy": 0.7229478120803833, "step": 10560 }, { "epoch": 1.058, "grad_norm": 0.6848797798156738, "learning_rate": 8.942894289428943e-05, "loss": 0.5593, "mean_token_accuracy": 0.7099254339933395, "step": 10580 }, { "epoch": 1.06, "grad_norm": 0.5085077881813049, "learning_rate": 8.940894089408942e-05, "loss": 0.3992, "mean_token_accuracy": 0.7031489342451096, "step": 10600 }, { "epoch": 1.062, "grad_norm": 1.557930588722229, "learning_rate": 8.93889388938894e-05, "loss": 1.0084, "mean_token_accuracy": 0.7008959293365479, "step": 10620 }, { "epoch": 1.064, "grad_norm": 0.8533952832221985, "learning_rate": 8.936893689368937e-05, "loss": 0.554, "mean_token_accuracy": 0.705353382229805, "step": 10640 }, { "epoch": 1.066, "grad_norm": 0.6192994713783264, "learning_rate": 8.934893489348935e-05, "loss": 0.662, "mean_token_accuracy": 0.7003495156764984, "step": 10660 }, { "epoch": 1.068, "grad_norm": 4.473926067352295, "learning_rate": 8.932893289328933e-05, "loss": 0.5657, "mean_token_accuracy": 0.7111297339200974, "step": 10680 }, { "epoch": 1.07, "grad_norm": 0.6248135566711426, "learning_rate": 8.930893089308932e-05, "loss": 0.5925, "mean_token_accuracy": 0.6915192879736424, "step": 10700 }, { "epoch": 1.072, "grad_norm": 3.305978298187256, "learning_rate": 8.928892889288929e-05, "loss": 0.8151, "mean_token_accuracy": 0.7074527174234391, "step": 10720 }, { "epoch": 1.074, "grad_norm": 1.3093210458755493, "learning_rate": 8.926892689268927e-05, "loss": 0.5316, "mean_token_accuracy": 0.7133027195930481, "step": 10740 }, { "epoch": 1.076, "grad_norm": 0.8629125356674194, "learning_rate": 8.924892489248925e-05, "loss": 0.9096, "mean_token_accuracy": 0.674931338429451, "step": 10760 }, { "epoch": 1.078, "grad_norm": 2.3041653633117676, "learning_rate": 8.922892289228924e-05, "loss": 0.6834, "mean_token_accuracy": 0.6827345758676528, "step": 10780 }, { "epoch": 1.08, "grad_norm": 4.791919708251953, "learning_rate": 8.92089208920892e-05, "loss": 0.7028, "mean_token_accuracy": 0.7056783229112625, "step": 10800 }, { "epoch": 1.082, "grad_norm": 0.5928692817687988, "learning_rate": 8.91889188918892e-05, "loss": 0.6549, "mean_token_accuracy": 0.699923786520958, "step": 10820 }, { "epoch": 1.084, "grad_norm": 3.4034910202026367, "learning_rate": 8.916891689168917e-05, "loss": 0.7526, "mean_token_accuracy": 0.6692015498876571, "step": 10840 }, { "epoch": 1.086, "grad_norm": 1.2188136577606201, "learning_rate": 8.914891489148915e-05, "loss": 0.6988, "mean_token_accuracy": 0.695663258433342, "step": 10860 }, { "epoch": 1.088, "grad_norm": 0.7307220697402954, "learning_rate": 8.912891289128912e-05, "loss": 0.7633, "mean_token_accuracy": 0.6966923594474792, "step": 10880 }, { "epoch": 1.09, "grad_norm": 0.8137832880020142, "learning_rate": 8.910891089108912e-05, "loss": 0.4852, "mean_token_accuracy": 0.6961425125598908, "step": 10900 }, { "epoch": 1.092, "grad_norm": 0.6045920252799988, "learning_rate": 8.908890889088909e-05, "loss": 0.7118, "mean_token_accuracy": 0.6938358008861542, "step": 10920 }, { "epoch": 1.094, "grad_norm": 0.5752485990524292, "learning_rate": 8.906890689068907e-05, "loss": 0.6355, "mean_token_accuracy": 0.6905763477087021, "step": 10940 }, { "epoch": 1.096, "grad_norm": 0.9764902591705322, "learning_rate": 8.904890489048904e-05, "loss": 0.4527, "mean_token_accuracy": 0.703451868891716, "step": 10960 }, { "epoch": 1.098, "grad_norm": 0.6686572432518005, "learning_rate": 8.902890289028904e-05, "loss": 0.821, "mean_token_accuracy": 0.6878774255514145, "step": 10980 }, { "epoch": 1.1, "grad_norm": 0.7086687684059143, "learning_rate": 8.900890089008901e-05, "loss": 0.4551, "mean_token_accuracy": 0.7048228293657303, "step": 11000 }, { "epoch": 1.102, "grad_norm": 1.3888927698135376, "learning_rate": 8.898889888988899e-05, "loss": 0.6724, "mean_token_accuracy": 0.7096023380756378, "step": 11020 }, { "epoch": 1.104, "grad_norm": 0.981815755367279, "learning_rate": 8.896889688968897e-05, "loss": 0.731, "mean_token_accuracy": 0.7101524531841278, "step": 11040 }, { "epoch": 1.106, "grad_norm": 5.641482353210449, "learning_rate": 8.894889488948896e-05, "loss": 0.7683, "mean_token_accuracy": 0.718191945552826, "step": 11060 }, { "epoch": 1.108, "grad_norm": 0.8044900298118591, "learning_rate": 8.892889288928893e-05, "loss": 0.4496, "mean_token_accuracy": 0.7023221015930176, "step": 11080 }, { "epoch": 1.11, "grad_norm": 0.573824405670166, "learning_rate": 8.890889088908891e-05, "loss": 0.653, "mean_token_accuracy": 0.7127705156803131, "step": 11100 }, { "epoch": 1.112, "grad_norm": 0.6668990850448608, "learning_rate": 8.888888888888889e-05, "loss": 0.5585, "mean_token_accuracy": 0.6922543168067932, "step": 11120 }, { "epoch": 1.114, "grad_norm": 0.5718148946762085, "learning_rate": 8.886888688868888e-05, "loss": 0.73, "mean_token_accuracy": 0.7048147857189179, "step": 11140 }, { "epoch": 1.116, "grad_norm": 1.791288137435913, "learning_rate": 8.884888488848885e-05, "loss": 0.7951, "mean_token_accuracy": 0.6866030305624008, "step": 11160 }, { "epoch": 1.1179999999999999, "grad_norm": 2.9900033473968506, "learning_rate": 8.882888288828883e-05, "loss": 1.0699, "mean_token_accuracy": 0.6827403753995895, "step": 11180 }, { "epoch": 1.12, "grad_norm": 0.6993131637573242, "learning_rate": 8.880888088808881e-05, "loss": 0.5362, "mean_token_accuracy": 0.7054001063108444, "step": 11200 }, { "epoch": 1.1219999999999999, "grad_norm": 1.0039595365524292, "learning_rate": 8.87888788878888e-05, "loss": 0.4702, "mean_token_accuracy": 0.7106244891881943, "step": 11220 }, { "epoch": 1.124, "grad_norm": 0.5595393180847168, "learning_rate": 8.876887688768876e-05, "loss": 0.6423, "mean_token_accuracy": 0.6961508899927139, "step": 11240 }, { "epoch": 1.126, "grad_norm": 0.9592801332473755, "learning_rate": 8.874887488748875e-05, "loss": 0.5401, "mean_token_accuracy": 0.7079581290483474, "step": 11260 }, { "epoch": 1.1280000000000001, "grad_norm": 0.5323911905288696, "learning_rate": 8.872887288728873e-05, "loss": 0.6561, "mean_token_accuracy": 0.7149004429578781, "step": 11280 }, { "epoch": 1.13, "grad_norm": 1.249634861946106, "learning_rate": 8.870887088708871e-05, "loss": 1.0321, "mean_token_accuracy": 0.6963305979967117, "step": 11300 }, { "epoch": 1.1320000000000001, "grad_norm": 0.8873103260993958, "learning_rate": 8.86888688868887e-05, "loss": 0.8472, "mean_token_accuracy": 0.6930580556392669, "step": 11320 }, { "epoch": 1.134, "grad_norm": 0.9327400922775269, "learning_rate": 8.866886688668868e-05, "loss": 0.5405, "mean_token_accuracy": 0.7072806835174561, "step": 11340 }, { "epoch": 1.1360000000000001, "grad_norm": 0.9449324607849121, "learning_rate": 8.864886488648866e-05, "loss": 0.7698, "mean_token_accuracy": 0.7045326590538025, "step": 11360 }, { "epoch": 1.138, "grad_norm": 1.4908852577209473, "learning_rate": 8.862886288628863e-05, "loss": 0.5845, "mean_token_accuracy": 0.7135529935359954, "step": 11380 }, { "epoch": 1.1400000000000001, "grad_norm": 1.413973093032837, "learning_rate": 8.860886088608862e-05, "loss": 0.7062, "mean_token_accuracy": 0.6986879199743271, "step": 11400 }, { "epoch": 1.142, "grad_norm": 5.860307216644287, "learning_rate": 8.85888588858886e-05, "loss": 0.5909, "mean_token_accuracy": 0.7132135272026062, "step": 11420 }, { "epoch": 1.144, "grad_norm": 0.6576524972915649, "learning_rate": 8.856885688568858e-05, "loss": 0.7331, "mean_token_accuracy": 0.7026103943586349, "step": 11440 }, { "epoch": 1.146, "grad_norm": 0.9809183478355408, "learning_rate": 8.854885488548855e-05, "loss": 0.509, "mean_token_accuracy": 0.7143113285303115, "step": 11460 }, { "epoch": 1.148, "grad_norm": 0.6206372380256653, "learning_rate": 8.852885288528853e-05, "loss": 0.6768, "mean_token_accuracy": 0.7083843767642974, "step": 11480 }, { "epoch": 1.15, "grad_norm": 1.3664430379867554, "learning_rate": 8.850885088508852e-05, "loss": 0.4366, "mean_token_accuracy": 0.7188077926635742, "step": 11500 }, { "epoch": 1.152, "grad_norm": 1.2126384973526, "learning_rate": 8.84888488848885e-05, "loss": 0.6053, "mean_token_accuracy": 0.704610425233841, "step": 11520 }, { "epoch": 1.154, "grad_norm": 4.867877006530762, "learning_rate": 8.846884688468847e-05, "loss": 0.7246, "mean_token_accuracy": 0.7034604400396347, "step": 11540 }, { "epoch": 1.156, "grad_norm": 0.6809661388397217, "learning_rate": 8.844884488448845e-05, "loss": 0.5778, "mean_token_accuracy": 0.7189712762832642, "step": 11560 }, { "epoch": 1.158, "grad_norm": 4.597437858581543, "learning_rate": 8.842884288428844e-05, "loss": 0.6814, "mean_token_accuracy": 0.6890878647565841, "step": 11580 }, { "epoch": 1.16, "grad_norm": 0.6874442100524902, "learning_rate": 8.840884088408842e-05, "loss": 0.4211, "mean_token_accuracy": 0.7122998297214508, "step": 11600 }, { "epoch": 1.162, "grad_norm": 0.7081030011177063, "learning_rate": 8.838883888388839e-05, "loss": 0.5162, "mean_token_accuracy": 0.6851412564516067, "step": 11620 }, { "epoch": 1.164, "grad_norm": 1.3355962038040161, "learning_rate": 8.836883688368837e-05, "loss": 0.5315, "mean_token_accuracy": 0.7090936034917832, "step": 11640 }, { "epoch": 1.166, "grad_norm": 0.5885812640190125, "learning_rate": 8.834883488348835e-05, "loss": 0.4253, "mean_token_accuracy": 0.7037347197532654, "step": 11660 }, { "epoch": 1.168, "grad_norm": 0.708652138710022, "learning_rate": 8.832883288328834e-05, "loss": 0.5355, "mean_token_accuracy": 0.6882912546396256, "step": 11680 }, { "epoch": 1.17, "grad_norm": 0.6419230103492737, "learning_rate": 8.83088308830883e-05, "loss": 0.7513, "mean_token_accuracy": 0.7004115104675293, "step": 11700 }, { "epoch": 1.172, "grad_norm": 0.8149036765098572, "learning_rate": 8.82888288828883e-05, "loss": 0.6535, "mean_token_accuracy": 0.7016696333885193, "step": 11720 }, { "epoch": 1.174, "grad_norm": 1.0504193305969238, "learning_rate": 8.826882688268827e-05, "loss": 0.5118, "mean_token_accuracy": 0.70557861328125, "step": 11740 }, { "epoch": 1.176, "grad_norm": 1.059934377670288, "learning_rate": 8.824882488248826e-05, "loss": 0.6888, "mean_token_accuracy": 0.7068374931812287, "step": 11760 }, { "epoch": 1.178, "grad_norm": 0.8791245222091675, "learning_rate": 8.822882288228823e-05, "loss": 0.4242, "mean_token_accuracy": 0.7109264343976974, "step": 11780 }, { "epoch": 1.18, "grad_norm": 1.0898411273956299, "learning_rate": 8.820882088208822e-05, "loss": 0.7818, "mean_token_accuracy": 0.6958979368209839, "step": 11800 }, { "epoch": 1.182, "grad_norm": 1.9335490465164185, "learning_rate": 8.818881888188819e-05, "loss": 0.4629, "mean_token_accuracy": 0.7073468387126922, "step": 11820 }, { "epoch": 1.184, "grad_norm": 0.614459753036499, "learning_rate": 8.816881688168817e-05, "loss": 0.6732, "mean_token_accuracy": 0.7048043310642242, "step": 11840 }, { "epoch": 1.186, "grad_norm": 6.208465099334717, "learning_rate": 8.814881488148816e-05, "loss": 0.5766, "mean_token_accuracy": 0.7112806409597396, "step": 11860 }, { "epoch": 1.188, "grad_norm": 0.684927225112915, "learning_rate": 8.812881288128814e-05, "loss": 0.4581, "mean_token_accuracy": 0.7192516684532165, "step": 11880 }, { "epoch": 1.19, "grad_norm": 0.9237399697303772, "learning_rate": 8.810881088108811e-05, "loss": 1.004, "mean_token_accuracy": 0.6853518009185791, "step": 11900 }, { "epoch": 1.192, "grad_norm": 0.7274589538574219, "learning_rate": 8.808880888088809e-05, "loss": 0.5598, "mean_token_accuracy": 0.6875534206628799, "step": 11920 }, { "epoch": 1.194, "grad_norm": 0.9799804091453552, "learning_rate": 8.806880688068808e-05, "loss": 0.6608, "mean_token_accuracy": 0.6978721678256988, "step": 11940 }, { "epoch": 1.196, "grad_norm": 0.8149034380912781, "learning_rate": 8.804880488048806e-05, "loss": 0.6816, "mean_token_accuracy": 0.7078562051057815, "step": 11960 }, { "epoch": 1.198, "grad_norm": 1.2367113828659058, "learning_rate": 8.802880288028803e-05, "loss": 0.5683, "mean_token_accuracy": 0.7041148543357849, "step": 11980 }, { "epoch": 1.2, "grad_norm": 0.5226647257804871, "learning_rate": 8.800880088008801e-05, "loss": 0.4583, "mean_token_accuracy": 0.7120580077171326, "step": 12000 }, { "epoch": 1.202, "grad_norm": 1.2396918535232544, "learning_rate": 8.7988798879888e-05, "loss": 0.5479, "mean_token_accuracy": 0.7189677357673645, "step": 12020 }, { "epoch": 1.204, "grad_norm": 0.9343885779380798, "learning_rate": 8.796879687968798e-05, "loss": 1.2504, "mean_token_accuracy": 0.6920939654111862, "step": 12040 }, { "epoch": 1.206, "grad_norm": 0.6013520359992981, "learning_rate": 8.794879487948795e-05, "loss": 0.9756, "mean_token_accuracy": 0.6999391496181488, "step": 12060 }, { "epoch": 1.208, "grad_norm": 0.7721669673919678, "learning_rate": 8.792879287928793e-05, "loss": 0.4473, "mean_token_accuracy": 0.7083842307329178, "step": 12080 }, { "epoch": 1.21, "grad_norm": 1.1854826211929321, "learning_rate": 8.790879087908791e-05, "loss": 0.6853, "mean_token_accuracy": 0.7097859531641006, "step": 12100 }, { "epoch": 1.212, "grad_norm": 0.5042135715484619, "learning_rate": 8.78887888788879e-05, "loss": 0.5083, "mean_token_accuracy": 0.7030031710863114, "step": 12120 }, { "epoch": 1.214, "grad_norm": 1.2553173303604126, "learning_rate": 8.786878687868787e-05, "loss": 0.4791, "mean_token_accuracy": 0.7071308374404908, "step": 12140 }, { "epoch": 1.216, "grad_norm": 3.1421515941619873, "learning_rate": 8.784878487848785e-05, "loss": 0.8118, "mean_token_accuracy": 0.6787464886903762, "step": 12160 }, { "epoch": 1.218, "grad_norm": 0.844445526599884, "learning_rate": 8.782878287828783e-05, "loss": 0.666, "mean_token_accuracy": 0.7061302870512008, "step": 12180 }, { "epoch": 1.22, "grad_norm": 0.6649876236915588, "learning_rate": 8.780878087808782e-05, "loss": 0.7078, "mean_token_accuracy": 0.6964013606309891, "step": 12200 }, { "epoch": 1.222, "grad_norm": 1.183819055557251, "learning_rate": 8.778877887788778e-05, "loss": 0.8081, "mean_token_accuracy": 0.6927585661411285, "step": 12220 }, { "epoch": 1.224, "grad_norm": 0.726966381072998, "learning_rate": 8.776877687768778e-05, "loss": 0.6064, "mean_token_accuracy": 0.699896639585495, "step": 12240 }, { "epoch": 1.226, "grad_norm": 0.7219884991645813, "learning_rate": 8.774877487748775e-05, "loss": 0.6081, "mean_token_accuracy": 0.7093743711709977, "step": 12260 }, { "epoch": 1.228, "grad_norm": 1.0614509582519531, "learning_rate": 8.772877287728773e-05, "loss": 0.6963, "mean_token_accuracy": 0.6952145785093308, "step": 12280 }, { "epoch": 1.23, "grad_norm": 0.5685293674468994, "learning_rate": 8.77087708770877e-05, "loss": 0.691, "mean_token_accuracy": 0.7002934277057647, "step": 12300 }, { "epoch": 1.232, "grad_norm": 1.7203712463378906, "learning_rate": 8.76887688768877e-05, "loss": 0.8917, "mean_token_accuracy": 0.6883782356977463, "step": 12320 }, { "epoch": 1.234, "grad_norm": 0.9494832754135132, "learning_rate": 8.766876687668767e-05, "loss": 0.4724, "mean_token_accuracy": 0.7007526069879532, "step": 12340 }, { "epoch": 1.236, "grad_norm": 0.900100588798523, "learning_rate": 8.764876487648765e-05, "loss": 0.4574, "mean_token_accuracy": 0.7076001554727555, "step": 12360 }, { "epoch": 1.238, "grad_norm": 2.1943068504333496, "learning_rate": 8.762876287628764e-05, "loss": 0.6441, "mean_token_accuracy": 0.6907369405031204, "step": 12380 }, { "epoch": 1.24, "grad_norm": 1.2599490880966187, "learning_rate": 8.760876087608762e-05, "loss": 0.3843, "mean_token_accuracy": 0.7196915239095688, "step": 12400 }, { "epoch": 1.242, "grad_norm": 1.5404052734375, "learning_rate": 8.758875887588759e-05, "loss": 0.7162, "mean_token_accuracy": 0.6848631232976914, "step": 12420 }, { "epoch": 1.244, "grad_norm": 0.6012865900993347, "learning_rate": 8.756875687568757e-05, "loss": 0.6076, "mean_token_accuracy": 0.7117492824792861, "step": 12440 }, { "epoch": 1.246, "grad_norm": 1.2850457429885864, "learning_rate": 8.754875487548755e-05, "loss": 0.5407, "mean_token_accuracy": 0.699531438946724, "step": 12460 }, { "epoch": 1.248, "grad_norm": 3.7141244411468506, "learning_rate": 8.752875287528754e-05, "loss": 0.773, "mean_token_accuracy": 0.6975049167871475, "step": 12480 }, { "epoch": 1.25, "grad_norm": 2.6487419605255127, "learning_rate": 8.75087508750875e-05, "loss": 0.3765, "mean_token_accuracy": 0.7015343546867371, "step": 12500 }, { "epoch": 1.252, "grad_norm": 1.0641571283340454, "learning_rate": 8.748874887488749e-05, "loss": 0.5669, "mean_token_accuracy": 0.7066483855247497, "step": 12520 }, { "epoch": 1.254, "grad_norm": 0.7483652830123901, "learning_rate": 8.746874687468747e-05, "loss": 0.7491, "mean_token_accuracy": 0.6804600685834885, "step": 12540 }, { "epoch": 1.256, "grad_norm": 1.0490339994430542, "learning_rate": 8.744874487448746e-05, "loss": 0.7518, "mean_token_accuracy": 0.6932063221931457, "step": 12560 }, { "epoch": 1.258, "grad_norm": 1.1001087427139282, "learning_rate": 8.742874287428743e-05, "loss": 0.3942, "mean_token_accuracy": 0.7162411570549011, "step": 12580 }, { "epoch": 1.26, "grad_norm": 0.8121678829193115, "learning_rate": 8.740874087408741e-05, "loss": 0.6285, "mean_token_accuracy": 0.7118010997772217, "step": 12600 }, { "epoch": 1.262, "grad_norm": 0.7639322280883789, "learning_rate": 8.738873887388739e-05, "loss": 0.4221, "mean_token_accuracy": 0.6982269525527954, "step": 12620 }, { "epoch": 1.264, "grad_norm": 0.7046753168106079, "learning_rate": 8.736873687368737e-05, "loss": 0.5736, "mean_token_accuracy": 0.7153317004442215, "step": 12640 }, { "epoch": 1.266, "grad_norm": 1.1677870750427246, "learning_rate": 8.734873487348734e-05, "loss": 0.6789, "mean_token_accuracy": 0.7090670526027679, "step": 12660 }, { "epoch": 1.268, "grad_norm": 0.6979532837867737, "learning_rate": 8.732873287328733e-05, "loss": 0.5288, "mean_token_accuracy": 0.6998330652713776, "step": 12680 }, { "epoch": 1.27, "grad_norm": 4.145288467407227, "learning_rate": 8.730873087308731e-05, "loss": 0.4921, "mean_token_accuracy": 0.720389860868454, "step": 12700 }, { "epoch": 1.272, "grad_norm": 0.9591246843338013, "learning_rate": 8.728872887288729e-05, "loss": 0.6191, "mean_token_accuracy": 0.7051250517368317, "step": 12720 }, { "epoch": 1.274, "grad_norm": 0.6271379590034485, "learning_rate": 8.726872687268726e-05, "loss": 0.6504, "mean_token_accuracy": 0.712639057636261, "step": 12740 }, { "epoch": 1.276, "grad_norm": 1.1915303468704224, "learning_rate": 8.724872487248726e-05, "loss": 0.7392, "mean_token_accuracy": 0.705252081155777, "step": 12760 }, { "epoch": 1.278, "grad_norm": 3.593942642211914, "learning_rate": 8.722872287228723e-05, "loss": 0.5594, "mean_token_accuracy": 0.7101649701595306, "step": 12780 }, { "epoch": 1.28, "grad_norm": 0.8793991804122925, "learning_rate": 8.720872087208721e-05, "loss": 0.7144, "mean_token_accuracy": 0.6911307454109192, "step": 12800 }, { "epoch": 1.282, "grad_norm": 1.950652003288269, "learning_rate": 8.71887188718872e-05, "loss": 0.4895, "mean_token_accuracy": 0.7064989984035492, "step": 12820 }, { "epoch": 1.284, "grad_norm": 0.6992264986038208, "learning_rate": 8.716871687168718e-05, "loss": 0.812, "mean_token_accuracy": 0.5825750827789307, "step": 12840 }, { "epoch": 1.286, "grad_norm": 0.750967800617218, "learning_rate": 8.714871487148716e-05, "loss": 0.501, "mean_token_accuracy": 0.6839283466339111, "step": 12860 }, { "epoch": 1.288, "grad_norm": 1.2811205387115479, "learning_rate": 8.712871287128713e-05, "loss": 0.6673, "mean_token_accuracy": 0.7083881169557571, "step": 12880 }, { "epoch": 1.29, "grad_norm": 8.777938842773438, "learning_rate": 8.710871087108711e-05, "loss": 0.703, "mean_token_accuracy": 0.6877928972244263, "step": 12900 }, { "epoch": 1.292, "grad_norm": 0.5664731860160828, "learning_rate": 8.70887088708871e-05, "loss": 0.6937, "mean_token_accuracy": 0.7021973848342895, "step": 12920 }, { "epoch": 1.294, "grad_norm": 0.80867999792099, "learning_rate": 8.706870687068708e-05, "loss": 0.6363, "mean_token_accuracy": 0.6985648363828659, "step": 12940 }, { "epoch": 1.296, "grad_norm": 0.9420487880706787, "learning_rate": 8.704870487048705e-05, "loss": 0.4807, "mean_token_accuracy": 0.703539502620697, "step": 12960 }, { "epoch": 1.298, "grad_norm": 0.821361243724823, "learning_rate": 8.702870287028703e-05, "loss": 0.7282, "mean_token_accuracy": 0.7020607978105545, "step": 12980 }, { "epoch": 1.3, "grad_norm": 1.1039105653762817, "learning_rate": 8.700870087008701e-05, "loss": 0.6836, "mean_token_accuracy": 0.687190368771553, "step": 13000 }, { "epoch": 1.302, "grad_norm": 3.4772140979766846, "learning_rate": 8.6988698869887e-05, "loss": 0.8474, "mean_token_accuracy": 0.6938465178012848, "step": 13020 }, { "epoch": 1.304, "grad_norm": 0.611554741859436, "learning_rate": 8.696869686968697e-05, "loss": 0.7121, "mean_token_accuracy": 0.7057340174913407, "step": 13040 }, { "epoch": 1.306, "grad_norm": 1.2342891693115234, "learning_rate": 8.694869486948696e-05, "loss": 0.6367, "mean_token_accuracy": 0.6958259761333465, "step": 13060 }, { "epoch": 1.308, "grad_norm": 0.7140197157859802, "learning_rate": 8.692869286928693e-05, "loss": 0.677, "mean_token_accuracy": 0.6673732452094555, "step": 13080 }, { "epoch": 1.31, "grad_norm": 0.9007594585418701, "learning_rate": 8.690869086908692e-05, "loss": 0.625, "mean_token_accuracy": 0.7055485099554062, "step": 13100 }, { "epoch": 1.312, "grad_norm": 0.7013643383979797, "learning_rate": 8.688868886888689e-05, "loss": 0.5495, "mean_token_accuracy": 0.7003115713596344, "step": 13120 }, { "epoch": 1.314, "grad_norm": 6.98708438873291, "learning_rate": 8.686868686868688e-05, "loss": 0.9422, "mean_token_accuracy": 0.6884891003370285, "step": 13140 }, { "epoch": 1.316, "grad_norm": 3.156315803527832, "learning_rate": 8.684868486848685e-05, "loss": 0.4689, "mean_token_accuracy": 0.7017326653003693, "step": 13160 }, { "epoch": 1.318, "grad_norm": 0.8355620503425598, "learning_rate": 8.682868286828684e-05, "loss": 0.5174, "mean_token_accuracy": 0.7003647208213806, "step": 13180 }, { "epoch": 1.32, "grad_norm": 0.8273841142654419, "learning_rate": 8.68086808680868e-05, "loss": 0.5871, "mean_token_accuracy": 0.7172893673181534, "step": 13200 }, { "epoch": 1.322, "grad_norm": 0.805784285068512, "learning_rate": 8.67886788678868e-05, "loss": 0.5196, "mean_token_accuracy": 0.6821262672543525, "step": 13220 }, { "epoch": 1.324, "grad_norm": 0.8203831911087036, "learning_rate": 8.676867686768677e-05, "loss": 0.6271, "mean_token_accuracy": 0.6877326995134354, "step": 13240 }, { "epoch": 1.326, "grad_norm": 2.8516597747802734, "learning_rate": 8.674867486748675e-05, "loss": 0.6876, "mean_token_accuracy": 0.6922837615013122, "step": 13260 }, { "epoch": 1.328, "grad_norm": 18.538776397705078, "learning_rate": 8.672867286728674e-05, "loss": 0.6048, "mean_token_accuracy": 0.7004165768623352, "step": 13280 }, { "epoch": 1.33, "grad_norm": 1.2990244626998901, "learning_rate": 8.670867086708672e-05, "loss": 0.6902, "mean_token_accuracy": 0.6951852560043335, "step": 13300 }, { "epoch": 1.332, "grad_norm": 0.9301687479019165, "learning_rate": 8.668866886688669e-05, "loss": 0.4698, "mean_token_accuracy": 0.688472005724907, "step": 13320 }, { "epoch": 1.334, "grad_norm": 3.226372241973877, "learning_rate": 8.666866686668667e-05, "loss": 0.4672, "mean_token_accuracy": 0.688987722992897, "step": 13340 }, { "epoch": 1.336, "grad_norm": 1.2940361499786377, "learning_rate": 8.664866486648666e-05, "loss": 0.6108, "mean_token_accuracy": 0.701537698507309, "step": 13360 }, { "epoch": 1.338, "grad_norm": 0.7894430160522461, "learning_rate": 8.662866286628664e-05, "loss": 0.5154, "mean_token_accuracy": 0.7061700731515884, "step": 13380 }, { "epoch": 1.34, "grad_norm": 2.5326995849609375, "learning_rate": 8.660866086608661e-05, "loss": 0.675, "mean_token_accuracy": 0.6968096196651459, "step": 13400 }, { "epoch": 1.342, "grad_norm": 2.3820226192474365, "learning_rate": 8.658865886588659e-05, "loss": 0.6255, "mean_token_accuracy": 0.7076544374227524, "step": 13420 }, { "epoch": 1.3439999999999999, "grad_norm": 3.5793538093566895, "learning_rate": 8.656865686568657e-05, "loss": 0.6803, "mean_token_accuracy": 0.6975560009479522, "step": 13440 }, { "epoch": 1.346, "grad_norm": 0.8396913409233093, "learning_rate": 8.654865486548656e-05, "loss": 0.4967, "mean_token_accuracy": 0.7010517656803131, "step": 13460 }, { "epoch": 1.3479999999999999, "grad_norm": 1.2609410285949707, "learning_rate": 8.652865286528653e-05, "loss": 0.6056, "mean_token_accuracy": 0.6831089377403259, "step": 13480 }, { "epoch": 1.35, "grad_norm": 0.8697395920753479, "learning_rate": 8.650865086508651e-05, "loss": 0.5866, "mean_token_accuracy": 0.7142840534448623, "step": 13500 }, { "epoch": 1.3519999999999999, "grad_norm": 0.7067045569419861, "learning_rate": 8.648864886488649e-05, "loss": 0.7686, "mean_token_accuracy": 0.7046869546175003, "step": 13520 }, { "epoch": 1.354, "grad_norm": 1.0591555833816528, "learning_rate": 8.646864686468648e-05, "loss": 0.6694, "mean_token_accuracy": 0.6942510515451431, "step": 13540 }, { "epoch": 1.3559999999999999, "grad_norm": 0.5960679650306702, "learning_rate": 8.644864486448645e-05, "loss": 0.6567, "mean_token_accuracy": 0.713141018152237, "step": 13560 }, { "epoch": 1.358, "grad_norm": 0.5309821963310242, "learning_rate": 8.642864286428644e-05, "loss": 0.5294, "mean_token_accuracy": 0.719961604475975, "step": 13580 }, { "epoch": 1.3599999999999999, "grad_norm": 2.8643195629119873, "learning_rate": 8.640864086408641e-05, "loss": 1.2489, "mean_token_accuracy": 0.6758122563362121, "step": 13600 }, { "epoch": 1.362, "grad_norm": 2.835094451904297, "learning_rate": 8.63886388638864e-05, "loss": 0.8563, "mean_token_accuracy": 0.6863232523202896, "step": 13620 }, { "epoch": 1.3639999999999999, "grad_norm": 2.8329474925994873, "learning_rate": 8.636863686368636e-05, "loss": 0.6315, "mean_token_accuracy": 0.7080232590436936, "step": 13640 }, { "epoch": 1.366, "grad_norm": 0.7419478297233582, "learning_rate": 8.634863486348636e-05, "loss": 0.568, "mean_token_accuracy": 0.7040223836898803, "step": 13660 }, { "epoch": 1.3679999999999999, "grad_norm": 0.6741124391555786, "learning_rate": 8.632863286328633e-05, "loss": 0.6434, "mean_token_accuracy": 0.7064003676176072, "step": 13680 }, { "epoch": 1.37, "grad_norm": 0.5859820246696472, "learning_rate": 8.630863086308631e-05, "loss": 0.5184, "mean_token_accuracy": 0.7098209381103515, "step": 13700 }, { "epoch": 1.3719999999999999, "grad_norm": 0.829322338104248, "learning_rate": 8.628862886288628e-05, "loss": 0.4259, "mean_token_accuracy": 0.7042018204927445, "step": 13720 }, { "epoch": 1.374, "grad_norm": 1.0438581705093384, "learning_rate": 8.626862686268628e-05, "loss": 0.6384, "mean_token_accuracy": 0.7020093828439713, "step": 13740 }, { "epoch": 1.376, "grad_norm": 0.6491368412971497, "learning_rate": 8.624862486248625e-05, "loss": 0.6954, "mean_token_accuracy": 0.6969826400279999, "step": 13760 }, { "epoch": 1.3780000000000001, "grad_norm": 1.0123258829116821, "learning_rate": 8.622862286228623e-05, "loss": 0.8428, "mean_token_accuracy": 0.7125895291566848, "step": 13780 }, { "epoch": 1.38, "grad_norm": 0.7230886220932007, "learning_rate": 8.620862086208621e-05, "loss": 0.6363, "mean_token_accuracy": 0.6983377456665039, "step": 13800 }, { "epoch": 1.3820000000000001, "grad_norm": 0.5741832852363586, "learning_rate": 8.61886188618862e-05, "loss": 0.531, "mean_token_accuracy": 0.710117781162262, "step": 13820 }, { "epoch": 1.384, "grad_norm": 16.643821716308594, "learning_rate": 8.616861686168617e-05, "loss": 1.0458, "mean_token_accuracy": 0.6886645197868347, "step": 13840 }, { "epoch": 1.3860000000000001, "grad_norm": 2.682036876678467, "learning_rate": 8.614861486148615e-05, "loss": 0.5729, "mean_token_accuracy": 0.7017452597618103, "step": 13860 }, { "epoch": 1.388, "grad_norm": 0.5528013110160828, "learning_rate": 8.612861286128613e-05, "loss": 0.5957, "mean_token_accuracy": 0.7002559244632721, "step": 13880 }, { "epoch": 1.3900000000000001, "grad_norm": 0.9383981227874756, "learning_rate": 8.610861086108612e-05, "loss": 0.607, "mean_token_accuracy": 0.6971845537424087, "step": 13900 }, { "epoch": 1.392, "grad_norm": 0.7808845043182373, "learning_rate": 8.608860886088609e-05, "loss": 0.4484, "mean_token_accuracy": 0.7167744338512421, "step": 13920 }, { "epoch": 1.3940000000000001, "grad_norm": 2.828643321990967, "learning_rate": 8.606860686068607e-05, "loss": 0.6311, "mean_token_accuracy": 0.6985089510679245, "step": 13940 }, { "epoch": 1.396, "grad_norm": 0.9414486289024353, "learning_rate": 8.604860486048605e-05, "loss": 0.6856, "mean_token_accuracy": 0.7109049737453461, "step": 13960 }, { "epoch": 1.3980000000000001, "grad_norm": 0.6245232224464417, "learning_rate": 8.602860286028604e-05, "loss": 0.6009, "mean_token_accuracy": 0.7079724550247193, "step": 13980 }, { "epoch": 1.4, "grad_norm": 0.6243788003921509, "learning_rate": 8.6008600860086e-05, "loss": 0.5438, "mean_token_accuracy": 0.7110241949558258, "step": 14000 }, { "epoch": 1.4020000000000001, "grad_norm": 1.6417462825775146, "learning_rate": 8.598859885988599e-05, "loss": 0.7193, "mean_token_accuracy": 0.6981867700815201, "step": 14020 }, { "epoch": 1.404, "grad_norm": 1.6669013500213623, "learning_rate": 8.596859685968597e-05, "loss": 0.6135, "mean_token_accuracy": 0.701386621594429, "step": 14040 }, { "epoch": 1.4060000000000001, "grad_norm": 0.6281242370605469, "learning_rate": 8.594859485948595e-05, "loss": 0.5324, "mean_token_accuracy": 0.695099851489067, "step": 14060 }, { "epoch": 1.408, "grad_norm": 0.5145024061203003, "learning_rate": 8.592859285928592e-05, "loss": 0.4124, "mean_token_accuracy": 0.7036818742752076, "step": 14080 }, { "epoch": 1.41, "grad_norm": 0.566077709197998, "learning_rate": 8.590859085908592e-05, "loss": 0.3893, "mean_token_accuracy": 0.7163118541240692, "step": 14100 }, { "epoch": 1.412, "grad_norm": 0.4677271544933319, "learning_rate": 8.588858885888589e-05, "loss": 0.5844, "mean_token_accuracy": 0.7034952491521835, "step": 14120 }, { "epoch": 1.414, "grad_norm": 7.9937543869018555, "learning_rate": 8.586858685868587e-05, "loss": 0.5366, "mean_token_accuracy": 0.7049313306808471, "step": 14140 }, { "epoch": 1.416, "grad_norm": 0.8665594458580017, "learning_rate": 8.584858485848584e-05, "loss": 0.4704, "mean_token_accuracy": 0.7066628187894821, "step": 14160 }, { "epoch": 1.418, "grad_norm": 0.6434713006019592, "learning_rate": 8.582858285828584e-05, "loss": 0.4704, "mean_token_accuracy": 0.6922138839960098, "step": 14180 }, { "epoch": 1.42, "grad_norm": 0.707238495349884, "learning_rate": 8.580858085808581e-05, "loss": 0.5821, "mean_token_accuracy": 0.7029243588447571, "step": 14200 }, { "epoch": 1.422, "grad_norm": 2.2533392906188965, "learning_rate": 8.578857885788579e-05, "loss": 0.5301, "mean_token_accuracy": 0.7046777904033661, "step": 14220 }, { "epoch": 1.424, "grad_norm": 0.5871409177780151, "learning_rate": 8.576857685768577e-05, "loss": 0.4604, "mean_token_accuracy": 0.7184743911027909, "step": 14240 }, { "epoch": 1.426, "grad_norm": 0.808042585849762, "learning_rate": 8.574857485748576e-05, "loss": 0.4184, "mean_token_accuracy": 0.7175511449575425, "step": 14260 }, { "epoch": 1.428, "grad_norm": 12.474239349365234, "learning_rate": 8.572857285728573e-05, "loss": 0.6526, "mean_token_accuracy": 0.6864762514829635, "step": 14280 }, { "epoch": 1.43, "grad_norm": 0.7518370747566223, "learning_rate": 8.570857085708571e-05, "loss": 0.6168, "mean_token_accuracy": 0.6988178193569183, "step": 14300 }, { "epoch": 1.432, "grad_norm": 0.7769436836242676, "learning_rate": 8.568856885688569e-05, "loss": 0.4532, "mean_token_accuracy": 0.6919051080942153, "step": 14320 }, { "epoch": 1.434, "grad_norm": 0.44431427121162415, "learning_rate": 8.566856685668568e-05, "loss": 0.3703, "mean_token_accuracy": 0.7077921807765961, "step": 14340 }, { "epoch": 1.436, "grad_norm": 0.7064526677131653, "learning_rate": 8.564856485648566e-05, "loss": 0.4965, "mean_token_accuracy": 0.6959783524274826, "step": 14360 }, { "epoch": 1.438, "grad_norm": 7.761783123016357, "learning_rate": 8.562856285628563e-05, "loss": 0.4413, "mean_token_accuracy": 0.7093058615922928, "step": 14380 }, { "epoch": 1.44, "grad_norm": 0.9067802429199219, "learning_rate": 8.560856085608561e-05, "loss": 0.6498, "mean_token_accuracy": 0.6811404824256897, "step": 14400 }, { "epoch": 1.442, "grad_norm": 0.5073018074035645, "learning_rate": 8.55885588558856e-05, "loss": 0.6141, "mean_token_accuracy": 0.6914813995361329, "step": 14420 }, { "epoch": 1.444, "grad_norm": 0.5342203974723816, "learning_rate": 8.556855685568558e-05, "loss": 0.436, "mean_token_accuracy": 0.7168768286705017, "step": 14440 }, { "epoch": 1.446, "grad_norm": 0.6690633893013, "learning_rate": 8.554855485548555e-05, "loss": 0.5069, "mean_token_accuracy": 0.7083010166883469, "step": 14460 }, { "epoch": 1.448, "grad_norm": 1.5945907831192017, "learning_rate": 8.552855285528554e-05, "loss": 0.5058, "mean_token_accuracy": 0.711129829287529, "step": 14480 }, { "epoch": 1.45, "grad_norm": 0.8461160659790039, "learning_rate": 8.550855085508551e-05, "loss": 0.4765, "mean_token_accuracy": 0.7163731545209885, "step": 14500 }, { "epoch": 1.452, "grad_norm": 0.9539020657539368, "learning_rate": 8.54885488548855e-05, "loss": 0.6048, "mean_token_accuracy": 0.700741884112358, "step": 14520 }, { "epoch": 1.454, "grad_norm": 0.933876097202301, "learning_rate": 8.546854685468547e-05, "loss": 0.5458, "mean_token_accuracy": 0.71622334420681, "step": 14540 }, { "epoch": 1.456, "grad_norm": 0.6466652154922485, "learning_rate": 8.544854485448546e-05, "loss": 0.68, "mean_token_accuracy": 0.7119543880224228, "step": 14560 }, { "epoch": 1.458, "grad_norm": 5.3717169761657715, "learning_rate": 8.542854285428543e-05, "loss": 0.5535, "mean_token_accuracy": 0.7033396095037461, "step": 14580 }, { "epoch": 1.46, "grad_norm": 2.3599934577941895, "learning_rate": 8.540854085408541e-05, "loss": 0.4084, "mean_token_accuracy": 0.7116325676441193, "step": 14600 }, { "epoch": 1.462, "grad_norm": 4.64826774597168, "learning_rate": 8.53885388538854e-05, "loss": 0.608, "mean_token_accuracy": 0.7172676414251328, "step": 14620 }, { "epoch": 1.464, "grad_norm": 4.331453323364258, "learning_rate": 8.536853685368538e-05, "loss": 0.7754, "mean_token_accuracy": 0.6879455417394638, "step": 14640 }, { "epoch": 1.466, "grad_norm": 11.388643264770508, "learning_rate": 8.534853485348535e-05, "loss": 0.4413, "mean_token_accuracy": 0.7145833253860474, "step": 14660 }, { "epoch": 1.468, "grad_norm": 0.5166917443275452, "learning_rate": 8.532853285328533e-05, "loss": 0.4096, "mean_token_accuracy": 0.7142437666654586, "step": 14680 }, { "epoch": 1.47, "grad_norm": 0.6875913739204407, "learning_rate": 8.530853085308532e-05, "loss": 0.4308, "mean_token_accuracy": 0.7131168335676193, "step": 14700 }, { "epoch": 1.472, "grad_norm": 0.6468980312347412, "learning_rate": 8.52885288528853e-05, "loss": 0.541, "mean_token_accuracy": 0.6934563159942627, "step": 14720 }, { "epoch": 1.474, "grad_norm": 0.8676846623420715, "learning_rate": 8.526852685268527e-05, "loss": 0.6375, "mean_token_accuracy": 0.7138593465089798, "step": 14740 }, { "epoch": 1.476, "grad_norm": 2.6902201175689697, "learning_rate": 8.524852485248525e-05, "loss": 0.5803, "mean_token_accuracy": 0.7044082880020142, "step": 14760 }, { "epoch": 1.478, "grad_norm": 3.0495975017547607, "learning_rate": 8.522852285228524e-05, "loss": 0.5353, "mean_token_accuracy": 0.707113167643547, "step": 14780 }, { "epoch": 1.48, "grad_norm": 0.887805163860321, "learning_rate": 8.520852085208522e-05, "loss": 0.7541, "mean_token_accuracy": 0.7075016915798187, "step": 14800 }, { "epoch": 1.482, "grad_norm": 1.6588265895843506, "learning_rate": 8.518851885188519e-05, "loss": 0.5714, "mean_token_accuracy": 0.7103629559278488, "step": 14820 }, { "epoch": 1.484, "grad_norm": 2.714083671569824, "learning_rate": 8.516851685168517e-05, "loss": 0.7888, "mean_token_accuracy": 0.7098389059305191, "step": 14840 }, { "epoch": 1.486, "grad_norm": 0.561485767364502, "learning_rate": 8.514851485148515e-05, "loss": 0.7132, "mean_token_accuracy": 0.7029921054840088, "step": 14860 }, { "epoch": 1.488, "grad_norm": 0.8472420573234558, "learning_rate": 8.512851285128514e-05, "loss": 0.5641, "mean_token_accuracy": 0.7104632198810578, "step": 14880 }, { "epoch": 1.49, "grad_norm": 0.7564120292663574, "learning_rate": 8.51085108510851e-05, "loss": 0.4854, "mean_token_accuracy": 0.7212682038545608, "step": 14900 }, { "epoch": 1.492, "grad_norm": 0.6986401081085205, "learning_rate": 8.508850885088509e-05, "loss": 0.4543, "mean_token_accuracy": 0.7146214962005615, "step": 14920 }, { "epoch": 1.494, "grad_norm": 0.5606655478477478, "learning_rate": 8.506850685068507e-05, "loss": 0.4313, "mean_token_accuracy": 0.7145511835813523, "step": 14940 }, { "epoch": 1.496, "grad_norm": 0.6983371376991272, "learning_rate": 8.504850485048506e-05, "loss": 0.5127, "mean_token_accuracy": 0.7111760079860687, "step": 14960 }, { "epoch": 1.498, "grad_norm": 0.589189887046814, "learning_rate": 8.502850285028502e-05, "loss": 0.5962, "mean_token_accuracy": 0.705603688955307, "step": 14980 }, { "epoch": 1.5, "grad_norm": 1.1171836853027344, "learning_rate": 8.500850085008502e-05, "loss": 0.6029, "mean_token_accuracy": 0.7060607999563218, "step": 15000 }, { "epoch": 1.502, "grad_norm": 0.9127572774887085, "learning_rate": 8.498849884988499e-05, "loss": 0.5344, "mean_token_accuracy": 0.7021369814872742, "step": 15020 }, { "epoch": 1.504, "grad_norm": 0.7169922590255737, "learning_rate": 8.496849684968497e-05, "loss": 0.6335, "mean_token_accuracy": 0.6924422711133957, "step": 15040 }, { "epoch": 1.506, "grad_norm": 0.9882239699363708, "learning_rate": 8.494849484948494e-05, "loss": 0.4959, "mean_token_accuracy": 0.7140223473310471, "step": 15060 }, { "epoch": 1.508, "grad_norm": 3.5078794956207275, "learning_rate": 8.492849284928494e-05, "loss": 0.3988, "mean_token_accuracy": 0.7081432223320008, "step": 15080 }, { "epoch": 1.51, "grad_norm": 0.6786485314369202, "learning_rate": 8.490849084908491e-05, "loss": 0.4629, "mean_token_accuracy": 0.7150883585214615, "step": 15100 }, { "epoch": 1.512, "grad_norm": 1.7254565954208374, "learning_rate": 8.488848884888489e-05, "loss": 0.522, "mean_token_accuracy": 0.7032965123653412, "step": 15120 }, { "epoch": 1.514, "grad_norm": 0.9750689268112183, "learning_rate": 8.486848684868488e-05, "loss": 0.4693, "mean_token_accuracy": 0.7056882500648498, "step": 15140 }, { "epoch": 1.516, "grad_norm": 2.2170491218566895, "learning_rate": 8.484848484848486e-05, "loss": 0.5578, "mean_token_accuracy": 0.7114224672317505, "step": 15160 }, { "epoch": 1.518, "grad_norm": 5.602394104003906, "learning_rate": 8.482848284828483e-05, "loss": 0.7511, "mean_token_accuracy": 0.704080018401146, "step": 15180 }, { "epoch": 1.52, "grad_norm": 0.6181830763816833, "learning_rate": 8.480848084808481e-05, "loss": 0.6149, "mean_token_accuracy": 0.7048383027315139, "step": 15200 }, { "epoch": 1.522, "grad_norm": 0.8344520330429077, "learning_rate": 8.47884788478848e-05, "loss": 0.661, "mean_token_accuracy": 0.7006375879049301, "step": 15220 }, { "epoch": 1.524, "grad_norm": 0.6497386693954468, "learning_rate": 8.476847684768478e-05, "loss": 0.6436, "mean_token_accuracy": 0.6810556948184967, "step": 15240 }, { "epoch": 1.526, "grad_norm": 0.9798128604888916, "learning_rate": 8.474847484748475e-05, "loss": 0.3979, "mean_token_accuracy": 0.7243845611810684, "step": 15260 }, { "epoch": 1.528, "grad_norm": 0.706176221370697, "learning_rate": 8.472847284728473e-05, "loss": 0.4018, "mean_token_accuracy": 0.709145924448967, "step": 15280 }, { "epoch": 1.53, "grad_norm": 0.5567392706871033, "learning_rate": 8.470847084708471e-05, "loss": 0.5, "mean_token_accuracy": 0.7026640176773071, "step": 15300 }, { "epoch": 1.532, "grad_norm": 1.1795716285705566, "learning_rate": 8.46884688468847e-05, "loss": 0.4405, "mean_token_accuracy": 0.723418727517128, "step": 15320 }, { "epoch": 1.534, "grad_norm": 0.6795164942741394, "learning_rate": 8.466846684668467e-05, "loss": 0.437, "mean_token_accuracy": 0.7119427710771561, "step": 15340 }, { "epoch": 1.536, "grad_norm": 1.831821084022522, "learning_rate": 8.464846484648465e-05, "loss": 0.5376, "mean_token_accuracy": 0.7226175338029861, "step": 15360 }, { "epoch": 1.538, "grad_norm": 0.7932433485984802, "learning_rate": 8.462846284628463e-05, "loss": 0.5378, "mean_token_accuracy": 0.7026787281036377, "step": 15380 }, { "epoch": 1.54, "grad_norm": 0.9271606206893921, "learning_rate": 8.460846084608461e-05, "loss": 0.494, "mean_token_accuracy": 0.7030612498521804, "step": 15400 }, { "epoch": 1.542, "grad_norm": 6.5954461097717285, "learning_rate": 8.458845884588458e-05, "loss": 0.4798, "mean_token_accuracy": 0.6989629536867141, "step": 15420 }, { "epoch": 1.544, "grad_norm": 0.6945750117301941, "learning_rate": 8.456845684568457e-05, "loss": 0.4349, "mean_token_accuracy": 0.7037984400987625, "step": 15440 }, { "epoch": 1.546, "grad_norm": 0.9790162444114685, "learning_rate": 8.454845484548455e-05, "loss": 0.8112, "mean_token_accuracy": 0.7100191920995712, "step": 15460 }, { "epoch": 1.548, "grad_norm": 3.7689661979675293, "learning_rate": 8.452845284528453e-05, "loss": 0.4534, "mean_token_accuracy": 0.6960612684488297, "step": 15480 }, { "epoch": 1.55, "grad_norm": 0.6001984477043152, "learning_rate": 8.45084508450845e-05, "loss": 0.5597, "mean_token_accuracy": 0.6905664920806884, "step": 15500 }, { "epoch": 1.552, "grad_norm": 1.3765658140182495, "learning_rate": 8.44884488448845e-05, "loss": 0.4872, "mean_token_accuracy": 0.7084162622690201, "step": 15520 }, { "epoch": 1.554, "grad_norm": 0.7353301644325256, "learning_rate": 8.446844684468447e-05, "loss": 0.4454, "mean_token_accuracy": 0.713133817911148, "step": 15540 }, { "epoch": 1.556, "grad_norm": 9.19461727142334, "learning_rate": 8.444844484448445e-05, "loss": 0.6758, "mean_token_accuracy": 0.6908205300569534, "step": 15560 }, { "epoch": 1.558, "grad_norm": 0.5582415461540222, "learning_rate": 8.442844284428442e-05, "loss": 0.5427, "mean_token_accuracy": 0.7036492437124252, "step": 15580 }, { "epoch": 1.56, "grad_norm": 1.356122612953186, "learning_rate": 8.440844084408442e-05, "loss": 0.617, "mean_token_accuracy": 0.7102467715740204, "step": 15600 }, { "epoch": 1.562, "grad_norm": 0.5494337677955627, "learning_rate": 8.438843884388439e-05, "loss": 0.4455, "mean_token_accuracy": 0.7147228598594666, "step": 15620 }, { "epoch": 1.564, "grad_norm": 0.8006953597068787, "learning_rate": 8.436843684368437e-05, "loss": 0.5697, "mean_token_accuracy": 0.6998766243457795, "step": 15640 }, { "epoch": 1.5659999999999998, "grad_norm": 0.8416642546653748, "learning_rate": 8.434843484348435e-05, "loss": 0.5509, "mean_token_accuracy": 0.694899320602417, "step": 15660 }, { "epoch": 1.568, "grad_norm": 0.7322935461997986, "learning_rate": 8.432843284328434e-05, "loss": 0.4707, "mean_token_accuracy": 0.6992872655391693, "step": 15680 }, { "epoch": 1.5699999999999998, "grad_norm": 2.876232385635376, "learning_rate": 8.43084308430843e-05, "loss": 0.4803, "mean_token_accuracy": 0.6975739568471908, "step": 15700 }, { "epoch": 1.572, "grad_norm": 0.5577207803726196, "learning_rate": 8.428842884288429e-05, "loss": 0.369, "mean_token_accuracy": 0.7184360831975937, "step": 15720 }, { "epoch": 1.5739999999999998, "grad_norm": 0.5541228652000427, "learning_rate": 8.426842684268427e-05, "loss": 0.4149, "mean_token_accuracy": 0.7117748320102691, "step": 15740 }, { "epoch": 1.576, "grad_norm": 0.7384124994277954, "learning_rate": 8.424842484248426e-05, "loss": 0.6722, "mean_token_accuracy": 0.712174391746521, "step": 15760 }, { "epoch": 1.5779999999999998, "grad_norm": 0.521217405796051, "learning_rate": 8.422842284228422e-05, "loss": 0.565, "mean_token_accuracy": 0.6756115704774857, "step": 15780 }, { "epoch": 1.58, "grad_norm": 0.9986819624900818, "learning_rate": 8.420842084208421e-05, "loss": 0.6014, "mean_token_accuracy": 0.6978912860155105, "step": 15800 }, { "epoch": 1.5819999999999999, "grad_norm": 0.9440637230873108, "learning_rate": 8.41884188418842e-05, "loss": 0.5231, "mean_token_accuracy": 0.7081424117088317, "step": 15820 }, { "epoch": 1.584, "grad_norm": 0.5819364786148071, "learning_rate": 8.416841684168417e-05, "loss": 0.6033, "mean_token_accuracy": 0.7084242433309555, "step": 15840 }, { "epoch": 1.5859999999999999, "grad_norm": 0.4693375825881958, "learning_rate": 8.414841484148416e-05, "loss": 0.4549, "mean_token_accuracy": 0.705313292145729, "step": 15860 }, { "epoch": 1.588, "grad_norm": 1.4599193334579468, "learning_rate": 8.412841284128413e-05, "loss": 0.5668, "mean_token_accuracy": 0.6897589862346649, "step": 15880 }, { "epoch": 1.5899999999999999, "grad_norm": 0.8695377707481384, "learning_rate": 8.410841084108412e-05, "loss": 0.45, "mean_token_accuracy": 0.7034918755292893, "step": 15900 }, { "epoch": 1.592, "grad_norm": 0.5472011566162109, "learning_rate": 8.408840884088409e-05, "loss": 0.4182, "mean_token_accuracy": 0.6963304698467254, "step": 15920 }, { "epoch": 1.5939999999999999, "grad_norm": 1.2698365449905396, "learning_rate": 8.406840684068408e-05, "loss": 0.444, "mean_token_accuracy": 0.7156891852617264, "step": 15940 }, { "epoch": 1.596, "grad_norm": 1.9094666242599487, "learning_rate": 8.404840484048406e-05, "loss": 0.6697, "mean_token_accuracy": 0.6893982350826263, "step": 15960 }, { "epoch": 1.5979999999999999, "grad_norm": 0.5362067222595215, "learning_rate": 8.402840284028404e-05, "loss": 0.3895, "mean_token_accuracy": 0.7163457095623016, "step": 15980 }, { "epoch": 1.6, "grad_norm": 5.005615234375, "learning_rate": 8.400840084008401e-05, "loss": 0.6518, "mean_token_accuracy": 0.7093731969594955, "step": 16000 }, { "epoch": 1.6019999999999999, "grad_norm": 1.0188957452774048, "learning_rate": 8.3988398839884e-05, "loss": 0.4023, "mean_token_accuracy": 0.7118296325206757, "step": 16020 }, { "epoch": 1.604, "grad_norm": 0.5064823627471924, "learning_rate": 8.396839683968398e-05, "loss": 0.3832, "mean_token_accuracy": 0.7247596591711044, "step": 16040 }, { "epoch": 1.6059999999999999, "grad_norm": 1.530633807182312, "learning_rate": 8.394839483948396e-05, "loss": 0.4812, "mean_token_accuracy": 0.7054111361503601, "step": 16060 }, { "epoch": 1.608, "grad_norm": 0.965334951877594, "learning_rate": 8.392839283928393e-05, "loss": 0.4864, "mean_token_accuracy": 0.7167988061904907, "step": 16080 }, { "epoch": 1.6099999999999999, "grad_norm": 0.6030615568161011, "learning_rate": 8.390839083908391e-05, "loss": 0.4986, "mean_token_accuracy": 0.7096189051866532, "step": 16100 }, { "epoch": 1.612, "grad_norm": 1.742362141609192, "learning_rate": 8.38883888388839e-05, "loss": 0.5456, "mean_token_accuracy": 0.6943044155836106, "step": 16120 }, { "epoch": 1.6139999999999999, "grad_norm": 0.9908391237258911, "learning_rate": 8.386838683868388e-05, "loss": 0.4069, "mean_token_accuracy": 0.7062683522701263, "step": 16140 }, { "epoch": 1.616, "grad_norm": 0.5606012940406799, "learning_rate": 8.384838483848385e-05, "loss": 0.6317, "mean_token_accuracy": 0.6967306762933732, "step": 16160 }, { "epoch": 1.6179999999999999, "grad_norm": 0.6369947791099548, "learning_rate": 8.382838283828383e-05, "loss": 0.5295, "mean_token_accuracy": 0.7008612006902695, "step": 16180 }, { "epoch": 1.62, "grad_norm": 0.884046733379364, "learning_rate": 8.380838083808381e-05, "loss": 0.6597, "mean_token_accuracy": 0.7029900193214417, "step": 16200 }, { "epoch": 1.6219999999999999, "grad_norm": 0.5116100907325745, "learning_rate": 8.37883788378838e-05, "loss": 0.4753, "mean_token_accuracy": 0.7174296706914902, "step": 16220 }, { "epoch": 1.624, "grad_norm": 0.6703429818153381, "learning_rate": 8.376837683768377e-05, "loss": 0.4867, "mean_token_accuracy": 0.7154217839241028, "step": 16240 }, { "epoch": 1.626, "grad_norm": 0.5594198107719421, "learning_rate": 8.374837483748375e-05, "loss": 0.6411, "mean_token_accuracy": 0.6975947886705398, "step": 16260 }, { "epoch": 1.6280000000000001, "grad_norm": 0.9781226515769958, "learning_rate": 8.372837283728373e-05, "loss": 0.7664, "mean_token_accuracy": 0.6804246068000793, "step": 16280 }, { "epoch": 1.63, "grad_norm": 0.5358578562736511, "learning_rate": 8.370837083708372e-05, "loss": 0.457, "mean_token_accuracy": 0.7128564983606338, "step": 16300 }, { "epoch": 1.6320000000000001, "grad_norm": 0.6279613375663757, "learning_rate": 8.368836883688369e-05, "loss": 0.5076, "mean_token_accuracy": 0.7155460804700852, "step": 16320 }, { "epoch": 1.634, "grad_norm": 0.8114880919456482, "learning_rate": 8.366836683668368e-05, "loss": 0.5018, "mean_token_accuracy": 0.7075564444065094, "step": 16340 }, { "epoch": 1.6360000000000001, "grad_norm": 1.0845683813095093, "learning_rate": 8.364836483648365e-05, "loss": 0.4548, "mean_token_accuracy": 0.721671462059021, "step": 16360 }, { "epoch": 1.638, "grad_norm": 1.5687729120254517, "learning_rate": 8.362836283628363e-05, "loss": 0.5208, "mean_token_accuracy": 0.7081500291824341, "step": 16380 }, { "epoch": 1.6400000000000001, "grad_norm": 1.8940166234970093, "learning_rate": 8.36083608360836e-05, "loss": 0.3827, "mean_token_accuracy": 0.7078454792499542, "step": 16400 }, { "epoch": 1.642, "grad_norm": 0.9786929488182068, "learning_rate": 8.35883588358836e-05, "loss": 0.6561, "mean_token_accuracy": 0.7003695607185364, "step": 16420 }, { "epoch": 1.6440000000000001, "grad_norm": 0.9835501313209534, "learning_rate": 8.356835683568357e-05, "loss": 0.4998, "mean_token_accuracy": 0.702194693684578, "step": 16440 }, { "epoch": 1.646, "grad_norm": 0.6722352504730225, "learning_rate": 8.354835483548355e-05, "loss": 0.5312, "mean_token_accuracy": 0.6964615494012832, "step": 16460 }, { "epoch": 1.6480000000000001, "grad_norm": 1.123732328414917, "learning_rate": 8.352835283528354e-05, "loss": 0.5545, "mean_token_accuracy": 0.6915721654891968, "step": 16480 }, { "epoch": 1.65, "grad_norm": 0.6975769400596619, "learning_rate": 8.350835083508352e-05, "loss": 0.4995, "mean_token_accuracy": 0.7060832858085633, "step": 16500 }, { "epoch": 1.6520000000000001, "grad_norm": 1.845727562904358, "learning_rate": 8.348834883488349e-05, "loss": 0.4579, "mean_token_accuracy": 0.6996943801641464, "step": 16520 }, { "epoch": 1.654, "grad_norm": 0.805519163608551, "learning_rate": 8.346834683468347e-05, "loss": 0.4768, "mean_token_accuracy": 0.7010816425085068, "step": 16540 }, { "epoch": 1.6560000000000001, "grad_norm": 0.8901602029800415, "learning_rate": 8.344834483448346e-05, "loss": 0.6965, "mean_token_accuracy": 0.6865421891212463, "step": 16560 }, { "epoch": 1.658, "grad_norm": 0.5213484168052673, "learning_rate": 8.342834283428344e-05, "loss": 0.4333, "mean_token_accuracy": 0.7143392354249954, "step": 16580 }, { "epoch": 1.6600000000000001, "grad_norm": 0.4718249440193176, "learning_rate": 8.340834083408341e-05, "loss": 0.3866, "mean_token_accuracy": 0.7043508917093277, "step": 16600 }, { "epoch": 1.662, "grad_norm": 0.513038158416748, "learning_rate": 8.338833883388339e-05, "loss": 0.5884, "mean_token_accuracy": 0.6932128131389618, "step": 16620 }, { "epoch": 1.6640000000000001, "grad_norm": 4.017649173736572, "learning_rate": 8.336833683368337e-05, "loss": 0.4872, "mean_token_accuracy": 0.6992698311805725, "step": 16640 }, { "epoch": 1.666, "grad_norm": 0.8368757367134094, "learning_rate": 8.334833483348336e-05, "loss": 0.5214, "mean_token_accuracy": 0.7060582041740417, "step": 16660 }, { "epoch": 1.6680000000000001, "grad_norm": 0.6308738589286804, "learning_rate": 8.332833283328333e-05, "loss": 0.5745, "mean_token_accuracy": 0.6989130496978759, "step": 16680 }, { "epoch": 1.67, "grad_norm": 1.2249956130981445, "learning_rate": 8.330833083308331e-05, "loss": 0.5957, "mean_token_accuracy": 0.7096390038728714, "step": 16700 }, { "epoch": 1.6720000000000002, "grad_norm": 0.44054338335990906, "learning_rate": 8.328832883288329e-05, "loss": 0.4205, "mean_token_accuracy": 0.707685387134552, "step": 16720 }, { "epoch": 1.674, "grad_norm": 1.2617522478103638, "learning_rate": 8.326832683268328e-05, "loss": 0.5345, "mean_token_accuracy": 0.7062843710184097, "step": 16740 }, { "epoch": 1.6760000000000002, "grad_norm": 0.9944084882736206, "learning_rate": 8.324832483248324e-05, "loss": 0.416, "mean_token_accuracy": 0.7194874107837677, "step": 16760 }, { "epoch": 1.678, "grad_norm": 0.5824191570281982, "learning_rate": 8.322832283228323e-05, "loss": 0.4506, "mean_token_accuracy": 0.7109140813350677, "step": 16780 }, { "epoch": 1.6800000000000002, "grad_norm": 0.5180118083953857, "learning_rate": 8.320832083208321e-05, "loss": 0.5876, "mean_token_accuracy": 0.6909366726875306, "step": 16800 }, { "epoch": 1.682, "grad_norm": 0.5663391351699829, "learning_rate": 8.31883188318832e-05, "loss": 0.4343, "mean_token_accuracy": 0.717526626586914, "step": 16820 }, { "epoch": 1.6840000000000002, "grad_norm": 0.5357835292816162, "learning_rate": 8.316831683168316e-05, "loss": 0.413, "mean_token_accuracy": 0.7205507546663285, "step": 16840 }, { "epoch": 1.686, "grad_norm": 2.302443265914917, "learning_rate": 8.314831483148316e-05, "loss": 0.5291, "mean_token_accuracy": 0.6981423497200012, "step": 16860 }, { "epoch": 1.688, "grad_norm": 0.5970796346664429, "learning_rate": 8.312831283128313e-05, "loss": 0.5587, "mean_token_accuracy": 0.7188669890165329, "step": 16880 }, { "epoch": 1.69, "grad_norm": 0.8828132152557373, "learning_rate": 8.310831083108311e-05, "loss": 0.4853, "mean_token_accuracy": 0.7043181538581849, "step": 16900 }, { "epoch": 1.692, "grad_norm": 0.6447278261184692, "learning_rate": 8.308830883088308e-05, "loss": 0.5305, "mean_token_accuracy": 0.7124626040458679, "step": 16920 }, { "epoch": 1.694, "grad_norm": 0.5986257195472717, "learning_rate": 8.306830683068308e-05, "loss": 0.3725, "mean_token_accuracy": 0.7217547535896301, "step": 16940 }, { "epoch": 1.696, "grad_norm": 0.6789641380310059, "learning_rate": 8.304830483048305e-05, "loss": 0.5422, "mean_token_accuracy": 0.6983249425888062, "step": 16960 }, { "epoch": 1.698, "grad_norm": 1.1777503490447998, "learning_rate": 8.302830283028303e-05, "loss": 0.5522, "mean_token_accuracy": 0.7009921491146087, "step": 16980 }, { "epoch": 1.7, "grad_norm": 0.9609647989273071, "learning_rate": 8.300830083008301e-05, "loss": 0.5634, "mean_token_accuracy": 0.7151965409517288, "step": 17000 }, { "epoch": 1.702, "grad_norm": 0.909100353717804, "learning_rate": 8.2988298829883e-05, "loss": 0.5786, "mean_token_accuracy": 0.69469995200634, "step": 17020 }, { "epoch": 1.704, "grad_norm": 3.019503116607666, "learning_rate": 8.296829682968297e-05, "loss": 0.4815, "mean_token_accuracy": 0.7093754768371582, "step": 17040 }, { "epoch": 1.706, "grad_norm": 0.573549747467041, "learning_rate": 8.294829482948295e-05, "loss": 0.3501, "mean_token_accuracy": 0.7220644861459732, "step": 17060 }, { "epoch": 1.708, "grad_norm": 1.3207563161849976, "learning_rate": 8.292829282928293e-05, "loss": 0.5307, "mean_token_accuracy": 0.7112865418195724, "step": 17080 }, { "epoch": 1.71, "grad_norm": 0.7461437582969666, "learning_rate": 8.290829082908292e-05, "loss": 0.5172, "mean_token_accuracy": 0.6988568276166915, "step": 17100 }, { "epoch": 1.712, "grad_norm": 0.8286963105201721, "learning_rate": 8.288828882888289e-05, "loss": 0.5404, "mean_token_accuracy": 0.7091914713382721, "step": 17120 }, { "epoch": 1.714, "grad_norm": 0.5060201287269592, "learning_rate": 8.286828682868287e-05, "loss": 0.4145, "mean_token_accuracy": 0.7013580232858658, "step": 17140 }, { "epoch": 1.716, "grad_norm": 0.9401388764381409, "learning_rate": 8.284828482848285e-05, "loss": 0.4499, "mean_token_accuracy": 0.7125023692846298, "step": 17160 }, { "epoch": 1.718, "grad_norm": 1.444548487663269, "learning_rate": 8.282828282828283e-05, "loss": 0.5624, "mean_token_accuracy": 0.7012740701436997, "step": 17180 }, { "epoch": 1.72, "grad_norm": 0.4763924777507782, "learning_rate": 8.28082808280828e-05, "loss": 0.4467, "mean_token_accuracy": 0.6919543147087097, "step": 17200 }, { "epoch": 1.722, "grad_norm": 0.5694157481193542, "learning_rate": 8.278827882788279e-05, "loss": 0.4394, "mean_token_accuracy": 0.7115898847579956, "step": 17220 }, { "epoch": 1.724, "grad_norm": 2.1611766815185547, "learning_rate": 8.276827682768277e-05, "loss": 0.6509, "mean_token_accuracy": 0.6919595807790756, "step": 17240 }, { "epoch": 1.726, "grad_norm": 1.164734125137329, "learning_rate": 8.274827482748275e-05, "loss": 0.4764, "mean_token_accuracy": 0.7136271178722382, "step": 17260 }, { "epoch": 1.728, "grad_norm": 0.438438355922699, "learning_rate": 8.272827282728272e-05, "loss": 0.3953, "mean_token_accuracy": 0.7154319524765015, "step": 17280 }, { "epoch": 1.73, "grad_norm": 2.436094284057617, "learning_rate": 8.27082708270827e-05, "loss": 0.6088, "mean_token_accuracy": 0.7172380805015564, "step": 17300 }, { "epoch": 1.732, "grad_norm": 4.111093521118164, "learning_rate": 8.26882688268827e-05, "loss": 0.5488, "mean_token_accuracy": 0.703995081782341, "step": 17320 }, { "epoch": 1.734, "grad_norm": 1.5234607458114624, "learning_rate": 8.266826682668267e-05, "loss": 0.5613, "mean_token_accuracy": 0.7096753925085068, "step": 17340 }, { "epoch": 1.736, "grad_norm": 0.505492627620697, "learning_rate": 8.264826482648266e-05, "loss": 0.4223, "mean_token_accuracy": 0.7069117248058319, "step": 17360 }, { "epoch": 1.738, "grad_norm": 0.8709173798561096, "learning_rate": 8.262826282628264e-05, "loss": 0.4792, "mean_token_accuracy": 0.7115259826183319, "step": 17380 }, { "epoch": 1.74, "grad_norm": 0.6046682596206665, "learning_rate": 8.260826082608262e-05, "loss": 0.5385, "mean_token_accuracy": 0.6998805850744247, "step": 17400 }, { "epoch": 1.742, "grad_norm": 0.9802923798561096, "learning_rate": 8.258825882588259e-05, "loss": 0.6053, "mean_token_accuracy": 0.7081262618303299, "step": 17420 }, { "epoch": 1.744, "grad_norm": 0.45070675015449524, "learning_rate": 8.256825682568257e-05, "loss": 0.5581, "mean_token_accuracy": 0.7120647996664047, "step": 17440 }, { "epoch": 1.746, "grad_norm": 0.6543669104576111, "learning_rate": 8.254825482548256e-05, "loss": 0.4766, "mean_token_accuracy": 0.704866224527359, "step": 17460 }, { "epoch": 1.748, "grad_norm": 0.44927576184272766, "learning_rate": 8.252825282528254e-05, "loss": 0.4299, "mean_token_accuracy": 0.7130716353654861, "step": 17480 }, { "epoch": 1.75, "grad_norm": 0.4794252812862396, "learning_rate": 8.250825082508251e-05, "loss": 0.4182, "mean_token_accuracy": 0.7130175530910492, "step": 17500 }, { "epoch": 1.752, "grad_norm": 0.5823835134506226, "learning_rate": 8.248824882488249e-05, "loss": 0.4077, "mean_token_accuracy": 0.7188629925251007, "step": 17520 }, { "epoch": 1.754, "grad_norm": 0.9742077589035034, "learning_rate": 8.246824682468248e-05, "loss": 0.4709, "mean_token_accuracy": 0.712686362862587, "step": 17540 }, { "epoch": 1.756, "grad_norm": 1.8265490531921387, "learning_rate": 8.244824482448246e-05, "loss": 0.4291, "mean_token_accuracy": 0.7147208511829376, "step": 17560 }, { "epoch": 1.758, "grad_norm": 3.6113226413726807, "learning_rate": 8.242824282428243e-05, "loss": 0.5153, "mean_token_accuracy": 0.7101753324270248, "step": 17580 }, { "epoch": 1.76, "grad_norm": 0.8866433501243591, "learning_rate": 8.240824082408241e-05, "loss": 0.4695, "mean_token_accuracy": 0.7062278658151626, "step": 17600 }, { "epoch": 1.762, "grad_norm": 0.9069706201553345, "learning_rate": 8.23882388238824e-05, "loss": 0.3865, "mean_token_accuracy": 0.719810101389885, "step": 17620 }, { "epoch": 1.764, "grad_norm": 1.091770887374878, "learning_rate": 8.236823682368238e-05, "loss": 0.4089, "mean_token_accuracy": 0.7083958059549331, "step": 17640 }, { "epoch": 1.766, "grad_norm": 1.1817817687988281, "learning_rate": 8.234823482348235e-05, "loss": 0.6256, "mean_token_accuracy": 0.7055875360965729, "step": 17660 }, { "epoch": 1.768, "grad_norm": 1.0777617692947388, "learning_rate": 8.232823282328233e-05, "loss": 0.4218, "mean_token_accuracy": 0.713532468676567, "step": 17680 }, { "epoch": 1.77, "grad_norm": 0.6313412189483643, "learning_rate": 8.230823082308231e-05, "loss": 0.5212, "mean_token_accuracy": 0.7200062900781632, "step": 17700 }, { "epoch": 1.772, "grad_norm": 1.4250984191894531, "learning_rate": 8.22882288228823e-05, "loss": 0.3961, "mean_token_accuracy": 0.7150634765625, "step": 17720 }, { "epoch": 1.774, "grad_norm": 0.652806282043457, "learning_rate": 8.226822682268227e-05, "loss": 0.6279, "mean_token_accuracy": 0.6962476223707199, "step": 17740 }, { "epoch": 1.776, "grad_norm": 0.4778764247894287, "learning_rate": 8.224822482248226e-05, "loss": 0.5686, "mean_token_accuracy": 0.7019259959459305, "step": 17760 }, { "epoch": 1.778, "grad_norm": 0.5337588787078857, "learning_rate": 8.222822282228223e-05, "loss": 0.5231, "mean_token_accuracy": 0.7057235836982727, "step": 17780 }, { "epoch": 1.78, "grad_norm": 0.4265744984149933, "learning_rate": 8.220822082208221e-05, "loss": 0.4019, "mean_token_accuracy": 0.7106193453073502, "step": 17800 }, { "epoch": 1.782, "grad_norm": 1.1875509023666382, "learning_rate": 8.218821882188218e-05, "loss": 0.64, "mean_token_accuracy": 0.6955713629722595, "step": 17820 }, { "epoch": 1.784, "grad_norm": 0.6375987529754639, "learning_rate": 8.216821682168218e-05, "loss": 0.4689, "mean_token_accuracy": 0.7128394067287445, "step": 17840 }, { "epoch": 1.786, "grad_norm": 0.7126708030700684, "learning_rate": 8.214821482148215e-05, "loss": 0.4165, "mean_token_accuracy": 0.7273801863193512, "step": 17860 }, { "epoch": 1.788, "grad_norm": 1.0436592102050781, "learning_rate": 8.212821282128213e-05, "loss": 0.4821, "mean_token_accuracy": 0.712918359041214, "step": 17880 }, { "epoch": 1.79, "grad_norm": 0.4635600447654724, "learning_rate": 8.210821082108212e-05, "loss": 0.3971, "mean_token_accuracy": 0.7225432515144348, "step": 17900 }, { "epoch": 1.792, "grad_norm": 0.8067865967750549, "learning_rate": 8.20882088208821e-05, "loss": 0.4711, "mean_token_accuracy": 0.6987961530685425, "step": 17920 }, { "epoch": 1.794, "grad_norm": 0.5492339730262756, "learning_rate": 8.206820682068207e-05, "loss": 0.5366, "mean_token_accuracy": 0.6892670661211013, "step": 17940 }, { "epoch": 1.796, "grad_norm": 0.6094165444374084, "learning_rate": 8.204820482048205e-05, "loss": 0.4965, "mean_token_accuracy": 0.6953775137662888, "step": 17960 }, { "epoch": 1.798, "grad_norm": 1.5831767320632935, "learning_rate": 8.202820282028203e-05, "loss": 0.4851, "mean_token_accuracy": 0.702346608042717, "step": 17980 }, { "epoch": 1.8, "grad_norm": 1.2446714639663696, "learning_rate": 8.200820082008202e-05, "loss": 0.4604, "mean_token_accuracy": 0.7159430593252182, "step": 18000 }, { "epoch": 1.802, "grad_norm": 1.3903992176055908, "learning_rate": 8.198819881988199e-05, "loss": 0.5257, "mean_token_accuracy": 0.6988525778055191, "step": 18020 }, { "epoch": 1.804, "grad_norm": 0.556807279586792, "learning_rate": 8.196819681968197e-05, "loss": 0.4509, "mean_token_accuracy": 0.7045964002609253, "step": 18040 }, { "epoch": 1.806, "grad_norm": 0.5379879474639893, "learning_rate": 8.194819481948195e-05, "loss": 0.4587, "mean_token_accuracy": 0.7105973958969116, "step": 18060 }, { "epoch": 1.808, "grad_norm": 0.8217251896858215, "learning_rate": 8.192819281928194e-05, "loss": 0.3406, "mean_token_accuracy": 0.716037729382515, "step": 18080 }, { "epoch": 1.81, "grad_norm": 0.4356684684753418, "learning_rate": 8.19081908190819e-05, "loss": 0.4351, "mean_token_accuracy": 0.7162895947694778, "step": 18100 }, { "epoch": 1.812, "grad_norm": 0.6881900429725647, "learning_rate": 8.188818881888189e-05, "loss": 0.4467, "mean_token_accuracy": 0.713318407535553, "step": 18120 }, { "epoch": 1.814, "grad_norm": 0.6315464377403259, "learning_rate": 8.186818681868187e-05, "loss": 0.5686, "mean_token_accuracy": 0.7127558141946793, "step": 18140 }, { "epoch": 1.8159999999999998, "grad_norm": 10.226913452148438, "learning_rate": 8.184818481848185e-05, "loss": 0.6157, "mean_token_accuracy": 0.7161587655544281, "step": 18160 }, { "epoch": 1.818, "grad_norm": 0.4417383670806885, "learning_rate": 8.182818281828182e-05, "loss": 0.554, "mean_token_accuracy": 0.6887322664260864, "step": 18180 }, { "epoch": 1.8199999999999998, "grad_norm": 1.3103207349777222, "learning_rate": 8.180818081808182e-05, "loss": 0.5711, "mean_token_accuracy": 0.7008888185024261, "step": 18200 }, { "epoch": 1.822, "grad_norm": 0.4767926335334778, "learning_rate": 8.178817881788179e-05, "loss": 0.3987, "mean_token_accuracy": 0.7093613356351852, "step": 18220 }, { "epoch": 1.8239999999999998, "grad_norm": 1.2101701498031616, "learning_rate": 8.176817681768177e-05, "loss": 0.4701, "mean_token_accuracy": 0.7171274036169052, "step": 18240 }, { "epoch": 1.826, "grad_norm": 0.962717592716217, "learning_rate": 8.174817481748174e-05, "loss": 0.5082, "mean_token_accuracy": 0.7114862769842147, "step": 18260 }, { "epoch": 1.8279999999999998, "grad_norm": 1.073393702507019, "learning_rate": 8.172817281728174e-05, "loss": 0.459, "mean_token_accuracy": 0.7041614949703217, "step": 18280 }, { "epoch": 1.83, "grad_norm": 0.6911130547523499, "learning_rate": 8.170817081708171e-05, "loss": 0.5571, "mean_token_accuracy": 0.7036919146776199, "step": 18300 }, { "epoch": 1.8319999999999999, "grad_norm": 4.648578643798828, "learning_rate": 8.168816881688169e-05, "loss": 0.5833, "mean_token_accuracy": 0.718160080909729, "step": 18320 }, { "epoch": 1.834, "grad_norm": 1.0674675703048706, "learning_rate": 8.166816681668166e-05, "loss": 0.4382, "mean_token_accuracy": 0.7202678620815277, "step": 18340 }, { "epoch": 1.8359999999999999, "grad_norm": 1.3647710084915161, "learning_rate": 8.164816481648166e-05, "loss": 0.463, "mean_token_accuracy": 0.7063499957323074, "step": 18360 }, { "epoch": 1.838, "grad_norm": 0.5088642835617065, "learning_rate": 8.162816281628163e-05, "loss": 0.4328, "mean_token_accuracy": 0.7195646703243256, "step": 18380 }, { "epoch": 1.8399999999999999, "grad_norm": 1.4341464042663574, "learning_rate": 8.160816081608161e-05, "loss": 0.4046, "mean_token_accuracy": 0.7146503537893295, "step": 18400 }, { "epoch": 1.842, "grad_norm": 0.7176058292388916, "learning_rate": 8.15881588158816e-05, "loss": 0.5217, "mean_token_accuracy": 0.7195612251758575, "step": 18420 }, { "epoch": 1.8439999999999999, "grad_norm": 2.8507237434387207, "learning_rate": 8.156815681568158e-05, "loss": 0.4655, "mean_token_accuracy": 0.7029233753681183, "step": 18440 }, { "epoch": 1.846, "grad_norm": 0.6011839509010315, "learning_rate": 8.154815481548155e-05, "loss": 0.4906, "mean_token_accuracy": 0.7150457412004471, "step": 18460 }, { "epoch": 1.8479999999999999, "grad_norm": 0.9157691597938538, "learning_rate": 8.152815281528153e-05, "loss": 0.5732, "mean_token_accuracy": 0.696447005867958, "step": 18480 }, { "epoch": 1.85, "grad_norm": 0.5561360120773315, "learning_rate": 8.150815081508151e-05, "loss": 0.3362, "mean_token_accuracy": 0.7206290751695633, "step": 18500 }, { "epoch": 1.8519999999999999, "grad_norm": 0.5702043175697327, "learning_rate": 8.14881488148815e-05, "loss": 0.4194, "mean_token_accuracy": 0.7110967725515366, "step": 18520 }, { "epoch": 1.854, "grad_norm": 0.5137916803359985, "learning_rate": 8.146814681468147e-05, "loss": 0.4459, "mean_token_accuracy": 0.7169893085956573, "step": 18540 }, { "epoch": 1.8559999999999999, "grad_norm": 0.6560607552528381, "learning_rate": 8.144814481448145e-05, "loss": 0.3413, "mean_token_accuracy": 0.7074418097734452, "step": 18560 }, { "epoch": 1.858, "grad_norm": 0.5478920340538025, "learning_rate": 8.142814281428143e-05, "loss": 0.5742, "mean_token_accuracy": 0.7194196492433548, "step": 18580 }, { "epoch": 1.8599999999999999, "grad_norm": 0.6566569209098816, "learning_rate": 8.140814081408141e-05, "loss": 0.3624, "mean_token_accuracy": 0.723362398147583, "step": 18600 }, { "epoch": 1.862, "grad_norm": 0.7332559823989868, "learning_rate": 8.138813881388138e-05, "loss": 0.4983, "mean_token_accuracy": 0.7133599370718002, "step": 18620 }, { "epoch": 1.8639999999999999, "grad_norm": 0.8265591859817505, "learning_rate": 8.136813681368137e-05, "loss": 0.484, "mean_token_accuracy": 0.6979497998952866, "step": 18640 }, { "epoch": 1.866, "grad_norm": 0.8892085552215576, "learning_rate": 8.134813481348135e-05, "loss": 0.3786, "mean_token_accuracy": 0.7304515540599823, "step": 18660 }, { "epoch": 1.8679999999999999, "grad_norm": 0.8341506719589233, "learning_rate": 8.132813281328133e-05, "loss": 0.384, "mean_token_accuracy": 0.7266325175762176, "step": 18680 }, { "epoch": 1.87, "grad_norm": 1.2232264280319214, "learning_rate": 8.13081308130813e-05, "loss": 0.4664, "mean_token_accuracy": 0.6937896817922592, "step": 18700 }, { "epoch": 1.8719999999999999, "grad_norm": 0.5876225829124451, "learning_rate": 8.12881288128813e-05, "loss": 0.4073, "mean_token_accuracy": 0.7258016914129257, "step": 18720 }, { "epoch": 1.874, "grad_norm": 1.5141240358352661, "learning_rate": 8.126812681268127e-05, "loss": 0.5597, "mean_token_accuracy": 0.7143633157014847, "step": 18740 }, { "epoch": 1.876, "grad_norm": 2.7660348415374756, "learning_rate": 8.124812481248125e-05, "loss": 0.5027, "mean_token_accuracy": 0.7025002479553223, "step": 18760 }, { "epoch": 1.8780000000000001, "grad_norm": 0.8338335752487183, "learning_rate": 8.122812281228122e-05, "loss": 0.387, "mean_token_accuracy": 0.7230230093002319, "step": 18780 }, { "epoch": 1.88, "grad_norm": 0.6959752440452576, "learning_rate": 8.120812081208122e-05, "loss": 0.3871, "mean_token_accuracy": 0.723121702671051, "step": 18800 }, { "epoch": 1.8820000000000001, "grad_norm": 0.5503751635551453, "learning_rate": 8.11881188118812e-05, "loss": 0.5501, "mean_token_accuracy": 0.7162346243858337, "step": 18820 }, { "epoch": 1.884, "grad_norm": 1.835876703262329, "learning_rate": 8.116811681168117e-05, "loss": 0.5626, "mean_token_accuracy": 0.697523421049118, "step": 18840 }, { "epoch": 1.8860000000000001, "grad_norm": 1.3577426671981812, "learning_rate": 8.114811481148115e-05, "loss": 0.5384, "mean_token_accuracy": 0.7039977133274078, "step": 18860 }, { "epoch": 1.888, "grad_norm": 0.5714197158813477, "learning_rate": 8.112811281128114e-05, "loss": 0.5442, "mean_token_accuracy": 0.6866020649671555, "step": 18880 }, { "epoch": 1.8900000000000001, "grad_norm": 1.3371057510375977, "learning_rate": 8.110811081108112e-05, "loss": 0.6448, "mean_token_accuracy": 0.6891319990158081, "step": 18900 }, { "epoch": 1.892, "grad_norm": 4.315441608428955, "learning_rate": 8.108810881088109e-05, "loss": 0.3705, "mean_token_accuracy": 0.7173156470060349, "step": 18920 }, { "epoch": 1.8940000000000001, "grad_norm": 0.5137152671813965, "learning_rate": 8.106810681068107e-05, "loss": 0.442, "mean_token_accuracy": 0.6999231070280075, "step": 18940 }, { "epoch": 1.896, "grad_norm": 1.7943978309631348, "learning_rate": 8.104810481048105e-05, "loss": 0.7931, "mean_token_accuracy": 0.7118431806564331, "step": 18960 }, { "epoch": 1.8980000000000001, "grad_norm": 0.6019849181175232, "learning_rate": 8.102810281028104e-05, "loss": 0.534, "mean_token_accuracy": 0.7017726898193359, "step": 18980 }, { "epoch": 1.9, "grad_norm": 0.5552505254745483, "learning_rate": 8.100810081008101e-05, "loss": 0.5362, "mean_token_accuracy": 0.7059240460395813, "step": 19000 }, { "epoch": 1.9020000000000001, "grad_norm": 2.352442741394043, "learning_rate": 8.098809880988099e-05, "loss": 0.7242, "mean_token_accuracy": 0.6965461403131485, "step": 19020 }, { "epoch": 1.904, "grad_norm": 1.0850178003311157, "learning_rate": 8.096809680968097e-05, "loss": 0.628, "mean_token_accuracy": 0.6945373594760895, "step": 19040 }, { "epoch": 1.9060000000000001, "grad_norm": 0.5679989457130432, "learning_rate": 8.094809480948096e-05, "loss": 0.349, "mean_token_accuracy": 0.7093919485807418, "step": 19060 }, { "epoch": 1.908, "grad_norm": 0.5413291454315186, "learning_rate": 8.092809280928093e-05, "loss": 0.3569, "mean_token_accuracy": 0.707967221736908, "step": 19080 }, { "epoch": 1.9100000000000001, "grad_norm": 0.7049057483673096, "learning_rate": 8.090809080908092e-05, "loss": 0.4617, "mean_token_accuracy": 0.7125222146511078, "step": 19100 }, { "epoch": 1.912, "grad_norm": 0.5649737119674683, "learning_rate": 8.088808880888089e-05, "loss": 0.5058, "mean_token_accuracy": 0.719788721203804, "step": 19120 }, { "epoch": 1.9140000000000001, "grad_norm": 0.9667566418647766, "learning_rate": 8.086808680868088e-05, "loss": 0.4413, "mean_token_accuracy": 0.6981199204921722, "step": 19140 }, { "epoch": 1.916, "grad_norm": 1.504543423652649, "learning_rate": 8.084808480848084e-05, "loss": 0.3915, "mean_token_accuracy": 0.7115859240293503, "step": 19160 }, { "epoch": 1.9180000000000001, "grad_norm": 1.0049608945846558, "learning_rate": 8.082808280828084e-05, "loss": 0.4347, "mean_token_accuracy": 0.7035864740610123, "step": 19180 }, { "epoch": 1.92, "grad_norm": 0.4657405912876129, "learning_rate": 8.080808080808081e-05, "loss": 0.3133, "mean_token_accuracy": 0.7168406695127487, "step": 19200 }, { "epoch": 1.9220000000000002, "grad_norm": 0.6178229451179504, "learning_rate": 8.07880788078808e-05, "loss": 0.4349, "mean_token_accuracy": 0.7062496393918991, "step": 19220 }, { "epoch": 1.924, "grad_norm": 0.9314051270484924, "learning_rate": 8.076807680768078e-05, "loss": 0.4301, "mean_token_accuracy": 0.7210506051778793, "step": 19240 }, { "epoch": 1.9260000000000002, "grad_norm": 0.6566388010978699, "learning_rate": 8.074807480748076e-05, "loss": 0.5343, "mean_token_accuracy": 0.7062906354665757, "step": 19260 }, { "epoch": 1.928, "grad_norm": 0.4743705093860626, "learning_rate": 8.072807280728073e-05, "loss": 0.5259, "mean_token_accuracy": 0.706720945239067, "step": 19280 }, { "epoch": 1.9300000000000002, "grad_norm": 1.0531501770019531, "learning_rate": 8.070807080708071e-05, "loss": 0.4618, "mean_token_accuracy": 0.7206770956516266, "step": 19300 }, { "epoch": 1.932, "grad_norm": 0.7239099740982056, "learning_rate": 8.06880688068807e-05, "loss": 0.4398, "mean_token_accuracy": 0.707528606057167, "step": 19320 }, { "epoch": 1.9340000000000002, "grad_norm": 1.8094955682754517, "learning_rate": 8.066806680668068e-05, "loss": 0.5781, "mean_token_accuracy": 0.6765953809022903, "step": 19340 }, { "epoch": 1.936, "grad_norm": 0.4743248224258423, "learning_rate": 8.064806480648065e-05, "loss": 0.5857, "mean_token_accuracy": 0.6976533591747284, "step": 19360 }, { "epoch": 1.938, "grad_norm": 0.5393968820571899, "learning_rate": 8.062806280628063e-05, "loss": 0.4504, "mean_token_accuracy": 0.7059657245874404, "step": 19380 }, { "epoch": 1.94, "grad_norm": 1.316855549812317, "learning_rate": 8.060806080608061e-05, "loss": 0.5168, "mean_token_accuracy": 0.6965774178504944, "step": 19400 }, { "epoch": 1.942, "grad_norm": 0.44342464208602905, "learning_rate": 8.05880588058806e-05, "loss": 0.4952, "mean_token_accuracy": 0.7100599974393844, "step": 19420 }, { "epoch": 1.944, "grad_norm": 0.5438318252563477, "learning_rate": 8.056805680568057e-05, "loss": 0.4838, "mean_token_accuracy": 0.7224825352430344, "step": 19440 }, { "epoch": 1.946, "grad_norm": 0.8415161967277527, "learning_rate": 8.054805480548055e-05, "loss": 0.3745, "mean_token_accuracy": 0.712497329711914, "step": 19460 }, { "epoch": 1.948, "grad_norm": 0.625388503074646, "learning_rate": 8.052805280528053e-05, "loss": 0.467, "mean_token_accuracy": 0.7189813286066056, "step": 19480 }, { "epoch": 1.95, "grad_norm": 0.47866860032081604, "learning_rate": 8.050805080508052e-05, "loss": 0.4915, "mean_token_accuracy": 0.7142548352479935, "step": 19500 }, { "epoch": 1.952, "grad_norm": 0.663293719291687, "learning_rate": 8.048804880488049e-05, "loss": 0.3858, "mean_token_accuracy": 0.72041457593441, "step": 19520 }, { "epoch": 1.954, "grad_norm": 0.5153266787528992, "learning_rate": 8.046804680468047e-05, "loss": 0.3825, "mean_token_accuracy": 0.7156494945287705, "step": 19540 }, { "epoch": 1.956, "grad_norm": 0.57334965467453, "learning_rate": 8.044804480448045e-05, "loss": 0.4072, "mean_token_accuracy": 0.7150208055973053, "step": 19560 }, { "epoch": 1.958, "grad_norm": 0.779120147228241, "learning_rate": 8.042804280428043e-05, "loss": 0.4973, "mean_token_accuracy": 0.7182896137237549, "step": 19580 }, { "epoch": 1.96, "grad_norm": 0.5259256362915039, "learning_rate": 8.04080408040804e-05, "loss": 0.5133, "mean_token_accuracy": 0.6913227677345276, "step": 19600 }, { "epoch": 1.962, "grad_norm": 0.4328373372554779, "learning_rate": 8.03880388038804e-05, "loss": 0.3578, "mean_token_accuracy": 0.7067274808883667, "step": 19620 }, { "epoch": 1.964, "grad_norm": 0.5565055012702942, "learning_rate": 8.036803680368037e-05, "loss": 0.563, "mean_token_accuracy": 0.7009298771619796, "step": 19640 }, { "epoch": 1.966, "grad_norm": 0.8377035856246948, "learning_rate": 8.034803480348035e-05, "loss": 0.4186, "mean_token_accuracy": 0.7179678350687027, "step": 19660 }, { "epoch": 1.968, "grad_norm": 0.5818961262702942, "learning_rate": 8.032803280328032e-05, "loss": 0.4847, "mean_token_accuracy": 0.6910266488790512, "step": 19680 }, { "epoch": 1.97, "grad_norm": 0.7693539261817932, "learning_rate": 8.030803080308032e-05, "loss": 0.4909, "mean_token_accuracy": 0.6966992884874343, "step": 19700 }, { "epoch": 1.972, "grad_norm": 1.0090186595916748, "learning_rate": 8.028802880288029e-05, "loss": 0.4707, "mean_token_accuracy": 0.7184349775314331, "step": 19720 }, { "epoch": 1.974, "grad_norm": 0.7978401780128479, "learning_rate": 8.026802680268027e-05, "loss": 0.3516, "mean_token_accuracy": 0.7147697478532791, "step": 19740 }, { "epoch": 1.976, "grad_norm": 0.4835987389087677, "learning_rate": 8.024802480248025e-05, "loss": 0.4419, "mean_token_accuracy": 0.7095379948616027, "step": 19760 }, { "epoch": 1.978, "grad_norm": 0.5523759722709656, "learning_rate": 8.022802280228024e-05, "loss": 0.4855, "mean_token_accuracy": 0.6982594639062881, "step": 19780 }, { "epoch": 1.98, "grad_norm": 0.5060316920280457, "learning_rate": 8.020802080208021e-05, "loss": 0.4515, "mean_token_accuracy": 0.7026746779680252, "step": 19800 }, { "epoch": 1.982, "grad_norm": 1.4233671426773071, "learning_rate": 8.018801880188019e-05, "loss": 0.5403, "mean_token_accuracy": 0.7040970385074615, "step": 19820 }, { "epoch": 1.984, "grad_norm": 0.41426050662994385, "learning_rate": 8.016801680168017e-05, "loss": 0.4258, "mean_token_accuracy": 0.712032425403595, "step": 19840 }, { "epoch": 1.986, "grad_norm": 0.5407652258872986, "learning_rate": 8.014801480148016e-05, "loss": 0.3683, "mean_token_accuracy": 0.7100863367319107, "step": 19860 }, { "epoch": 1.988, "grad_norm": 0.6378005743026733, "learning_rate": 8.012801280128013e-05, "loss": 0.3849, "mean_token_accuracy": 0.7090198218822479, "step": 19880 }, { "epoch": 1.99, "grad_norm": 0.5062994360923767, "learning_rate": 8.010801080108011e-05, "loss": 0.5439, "mean_token_accuracy": 0.7005260944366455, "step": 19900 }, { "epoch": 1.992, "grad_norm": 0.4086558520793915, "learning_rate": 8.008800880088009e-05, "loss": 0.4133, "mean_token_accuracy": 0.7275536268949508, "step": 19920 }, { "epoch": 1.994, "grad_norm": 0.4935610890388489, "learning_rate": 8.006800680068008e-05, "loss": 0.4875, "mean_token_accuracy": 0.7157592147588729, "step": 19940 }, { "epoch": 1.996, "grad_norm": 3.4600155353546143, "learning_rate": 8.004800480048004e-05, "loss": 0.5447, "mean_token_accuracy": 0.7145169854164124, "step": 19960 }, { "epoch": 1.998, "grad_norm": 0.4310895502567291, "learning_rate": 8.002800280028003e-05, "loss": 0.4887, "mean_token_accuracy": 0.7130834221839905, "step": 19980 }, { "epoch": 2.0, "grad_norm": 0.49482983350753784, "learning_rate": 8.000800080008001e-05, "loss": 0.4375, "mean_token_accuracy": 0.7117225736379623, "step": 20000 }, { "epoch": 2.002, "grad_norm": 1.7127337455749512, "learning_rate": 7.998799879988e-05, "loss": 0.3607, "mean_token_accuracy": 0.7748285859823227, "step": 20020 }, { "epoch": 2.004, "grad_norm": 1.2073472738265991, "learning_rate": 7.996799679967996e-05, "loss": 0.514, "mean_token_accuracy": 0.7984467357397079, "step": 20040 }, { "epoch": 2.006, "grad_norm": 0.7721752524375916, "learning_rate": 7.994799479947995e-05, "loss": 0.4479, "mean_token_accuracy": 0.7716844290494919, "step": 20060 }, { "epoch": 2.008, "grad_norm": 0.5650334358215332, "learning_rate": 7.992799279927993e-05, "loss": 0.2804, "mean_token_accuracy": 0.7773394137620926, "step": 20080 }, { "epoch": 2.01, "grad_norm": 2.216174602508545, "learning_rate": 7.990799079907991e-05, "loss": 0.4946, "mean_token_accuracy": 0.7627478301525116, "step": 20100 }, { "epoch": 2.012, "grad_norm": 1.0602139234542847, "learning_rate": 7.988798879887988e-05, "loss": 0.4581, "mean_token_accuracy": 0.7750811547040939, "step": 20120 }, { "epoch": 2.014, "grad_norm": 0.9871034026145935, "learning_rate": 7.986798679867988e-05, "loss": 0.3159, "mean_token_accuracy": 0.8013813346624374, "step": 20140 }, { "epoch": 2.016, "grad_norm": 2.581484794616699, "learning_rate": 7.984798479847985e-05, "loss": 0.459, "mean_token_accuracy": 0.7726360082626342, "step": 20160 }, { "epoch": 2.018, "grad_norm": 0.5203135013580322, "learning_rate": 7.982798279827983e-05, "loss": 0.3296, "mean_token_accuracy": 0.7890249371528626, "step": 20180 }, { "epoch": 2.02, "grad_norm": 0.5017856955528259, "learning_rate": 7.98079807980798e-05, "loss": 0.3735, "mean_token_accuracy": 0.7784524410963058, "step": 20200 }, { "epoch": 2.022, "grad_norm": 1.9048385620117188, "learning_rate": 7.97879787978798e-05, "loss": 0.4212, "mean_token_accuracy": 0.7942043602466583, "step": 20220 }, { "epoch": 2.024, "grad_norm": 0.6760960817337036, "learning_rate": 7.976797679767977e-05, "loss": 0.4382, "mean_token_accuracy": 0.7797897279262542, "step": 20240 }, { "epoch": 2.026, "grad_norm": 0.6205967664718628, "learning_rate": 7.974797479747975e-05, "loss": 0.5963, "mean_token_accuracy": 0.7796809136867523, "step": 20260 }, { "epoch": 2.028, "grad_norm": 1.3684543371200562, "learning_rate": 7.972797279727973e-05, "loss": 0.4948, "mean_token_accuracy": 0.7835149079561233, "step": 20280 }, { "epoch": 2.03, "grad_norm": 0.4489606022834778, "learning_rate": 7.970797079707972e-05, "loss": 0.3715, "mean_token_accuracy": 0.7909787178039551, "step": 20300 }, { "epoch": 2.032, "grad_norm": 0.8410580158233643, "learning_rate": 7.96879687968797e-05, "loss": 0.5063, "mean_token_accuracy": 0.7653146326541901, "step": 20320 }, { "epoch": 2.034, "grad_norm": 0.6681613326072693, "learning_rate": 7.966796679667967e-05, "loss": 0.4514, "mean_token_accuracy": 0.7669599950313568, "step": 20340 }, { "epoch": 2.036, "grad_norm": 0.5101450085639954, "learning_rate": 7.964796479647965e-05, "loss": 0.5255, "mean_token_accuracy": 0.7684529066085816, "step": 20360 }, { "epoch": 2.038, "grad_norm": 0.6591892838478088, "learning_rate": 7.962796279627963e-05, "loss": 0.4268, "mean_token_accuracy": 0.7793665826320648, "step": 20380 }, { "epoch": 2.04, "grad_norm": 1.5891978740692139, "learning_rate": 7.960796079607962e-05, "loss": 0.4989, "mean_token_accuracy": 0.777790105342865, "step": 20400 }, { "epoch": 2.042, "grad_norm": 1.6966440677642822, "learning_rate": 7.958795879587959e-05, "loss": 0.3891, "mean_token_accuracy": 0.7787217944860458, "step": 20420 }, { "epoch": 2.044, "grad_norm": 1.0864137411117554, "learning_rate": 7.956795679567958e-05, "loss": 0.4626, "mean_token_accuracy": 0.769822895526886, "step": 20440 }, { "epoch": 2.046, "grad_norm": 0.6229099035263062, "learning_rate": 7.954795479547955e-05, "loss": 0.6155, "mean_token_accuracy": 0.7693908423185348, "step": 20460 }, { "epoch": 2.048, "grad_norm": 0.6498054265975952, "learning_rate": 7.952795279527954e-05, "loss": 0.3721, "mean_token_accuracy": 0.7935333281755448, "step": 20480 }, { "epoch": 2.05, "grad_norm": 0.6006797552108765, "learning_rate": 7.95079507950795e-05, "loss": 0.4645, "mean_token_accuracy": 0.7536375343799591, "step": 20500 }, { "epoch": 2.052, "grad_norm": 0.4812840521335602, "learning_rate": 7.94879487948795e-05, "loss": 0.3912, "mean_token_accuracy": 0.7752534061670303, "step": 20520 }, { "epoch": 2.054, "grad_norm": 1.0204683542251587, "learning_rate": 7.946794679467947e-05, "loss": 0.4184, "mean_token_accuracy": 0.7798435419797898, "step": 20540 }, { "epoch": 2.056, "grad_norm": 0.5433595180511475, "learning_rate": 7.944794479447945e-05, "loss": 0.2912, "mean_token_accuracy": 0.7815324664115906, "step": 20560 }, { "epoch": 2.058, "grad_norm": 0.5223949551582336, "learning_rate": 7.942794279427942e-05, "loss": 0.2585, "mean_token_accuracy": 0.7886959075927734, "step": 20580 }, { "epoch": 2.06, "grad_norm": 0.7901504039764404, "learning_rate": 7.940794079407942e-05, "loss": 0.4847, "mean_token_accuracy": 0.7652919828891754, "step": 20600 }, { "epoch": 2.062, "grad_norm": 0.5857166647911072, "learning_rate": 7.938793879387939e-05, "loss": 0.5935, "mean_token_accuracy": 0.7600272536277771, "step": 20620 }, { "epoch": 2.064, "grad_norm": 0.5536137819290161, "learning_rate": 7.936793679367937e-05, "loss": 0.6198, "mean_token_accuracy": 0.759911498427391, "step": 20640 }, { "epoch": 2.066, "grad_norm": 1.497252345085144, "learning_rate": 7.934793479347936e-05, "loss": 0.3816, "mean_token_accuracy": 0.7692884147167206, "step": 20660 }, { "epoch": 2.068, "grad_norm": 0.5304000377655029, "learning_rate": 7.932793279327934e-05, "loss": 0.3803, "mean_token_accuracy": 0.786907634139061, "step": 20680 }, { "epoch": 2.07, "grad_norm": 1.4048898220062256, "learning_rate": 7.930793079307931e-05, "loss": 0.4099, "mean_token_accuracy": 0.7814711302518844, "step": 20700 }, { "epoch": 2.072, "grad_norm": 0.6567322611808777, "learning_rate": 7.928792879287929e-05, "loss": 0.3173, "mean_token_accuracy": 0.7731285780668259, "step": 20720 }, { "epoch": 2.074, "grad_norm": 0.4842546880245209, "learning_rate": 7.926792679267927e-05, "loss": 0.3403, "mean_token_accuracy": 0.7780163943767547, "step": 20740 }, { "epoch": 2.076, "grad_norm": 0.8619312644004822, "learning_rate": 7.924792479247926e-05, "loss": 0.3667, "mean_token_accuracy": 0.7785163342952728, "step": 20760 }, { "epoch": 2.078, "grad_norm": 0.5253666639328003, "learning_rate": 7.922792279227923e-05, "loss": 0.2311, "mean_token_accuracy": 0.7764224052429199, "step": 20780 }, { "epoch": 2.08, "grad_norm": 0.5725424885749817, "learning_rate": 7.920792079207921e-05, "loss": 0.3155, "mean_token_accuracy": 0.7834271222352982, "step": 20800 }, { "epoch": 2.082, "grad_norm": 0.523566484451294, "learning_rate": 7.91879187918792e-05, "loss": 0.3115, "mean_token_accuracy": 0.7886469960212708, "step": 20820 }, { "epoch": 2.084, "grad_norm": 0.8203099966049194, "learning_rate": 7.916791679167918e-05, "loss": 0.6505, "mean_token_accuracy": 0.7445968508720398, "step": 20840 }, { "epoch": 2.086, "grad_norm": 1.3321168422698975, "learning_rate": 7.914791479147915e-05, "loss": 0.407, "mean_token_accuracy": 0.7953745931386947, "step": 20860 }, { "epoch": 2.088, "grad_norm": 0.5506932735443115, "learning_rate": 7.912791279127913e-05, "loss": 0.4936, "mean_token_accuracy": 0.7661927700042724, "step": 20880 }, { "epoch": 2.09, "grad_norm": 0.786858320236206, "learning_rate": 7.910791079107911e-05, "loss": 0.3917, "mean_token_accuracy": 0.7813081026077271, "step": 20900 }, { "epoch": 2.092, "grad_norm": 1.4122593402862549, "learning_rate": 7.90879087908791e-05, "loss": 0.42, "mean_token_accuracy": 0.765156215429306, "step": 20920 }, { "epoch": 2.094, "grad_norm": 0.640546977519989, "learning_rate": 7.906790679067906e-05, "loss": 0.5276, "mean_token_accuracy": 0.7511861741542816, "step": 20940 }, { "epoch": 2.096, "grad_norm": 0.5078151822090149, "learning_rate": 7.904790479047906e-05, "loss": 0.4597, "mean_token_accuracy": 0.7609752088785171, "step": 20960 }, { "epoch": 2.098, "grad_norm": 0.6038398146629333, "learning_rate": 7.902790279027903e-05, "loss": 0.3755, "mean_token_accuracy": 0.776852685213089, "step": 20980 }, { "epoch": 2.1, "grad_norm": 0.45430007576942444, "learning_rate": 7.900790079007901e-05, "loss": 0.375, "mean_token_accuracy": 0.7710929304361344, "step": 21000 }, { "epoch": 2.102, "grad_norm": 1.3684762716293335, "learning_rate": 7.898789878987898e-05, "loss": 0.3495, "mean_token_accuracy": 0.7708664834499359, "step": 21020 }, { "epoch": 2.104, "grad_norm": 0.5502551794052124, "learning_rate": 7.896789678967898e-05, "loss": 0.449, "mean_token_accuracy": 0.780603215098381, "step": 21040 }, { "epoch": 2.106, "grad_norm": 0.5812153220176697, "learning_rate": 7.894789478947895e-05, "loss": 0.4252, "mean_token_accuracy": 0.7748601227998734, "step": 21060 }, { "epoch": 2.108, "grad_norm": 0.6680713891983032, "learning_rate": 7.892789278927893e-05, "loss": 0.4124, "mean_token_accuracy": 0.7860050201416016, "step": 21080 }, { "epoch": 2.11, "grad_norm": 1.464181661605835, "learning_rate": 7.89078907890789e-05, "loss": 0.3452, "mean_token_accuracy": 0.781421884894371, "step": 21100 }, { "epoch": 2.112, "grad_norm": 1.1662946939468384, "learning_rate": 7.88878887888789e-05, "loss": 0.4597, "mean_token_accuracy": 0.7661173075437546, "step": 21120 }, { "epoch": 2.114, "grad_norm": 0.632640540599823, "learning_rate": 7.886788678867887e-05, "loss": 0.4072, "mean_token_accuracy": 0.7671910583972931, "step": 21140 }, { "epoch": 2.116, "grad_norm": 0.6185449957847595, "learning_rate": 7.884788478847885e-05, "loss": 0.3841, "mean_token_accuracy": 0.7873892664909363, "step": 21160 }, { "epoch": 2.118, "grad_norm": 1.3396950960159302, "learning_rate": 7.882788278827883e-05, "loss": 0.4489, "mean_token_accuracy": 0.7649066716432571, "step": 21180 }, { "epoch": 2.12, "grad_norm": 0.5437762141227722, "learning_rate": 7.880788078807882e-05, "loss": 0.401, "mean_token_accuracy": 0.775800633430481, "step": 21200 }, { "epoch": 2.122, "grad_norm": 2.9318716526031494, "learning_rate": 7.878787878787879e-05, "loss": 0.4965, "mean_token_accuracy": 0.7580222338438034, "step": 21220 }, { "epoch": 2.124, "grad_norm": 0.7541435360908508, "learning_rate": 7.876787678767877e-05, "loss": 0.3822, "mean_token_accuracy": 0.7659507423639298, "step": 21240 }, { "epoch": 2.126, "grad_norm": 0.5074002742767334, "learning_rate": 7.874787478747875e-05, "loss": 0.3494, "mean_token_accuracy": 0.7900628954172134, "step": 21260 }, { "epoch": 2.128, "grad_norm": 0.5080586075782776, "learning_rate": 7.872787278727874e-05, "loss": 0.3926, "mean_token_accuracy": 0.7716055184602737, "step": 21280 }, { "epoch": 2.13, "grad_norm": 2.4915883541107178, "learning_rate": 7.87078707870787e-05, "loss": 0.5383, "mean_token_accuracy": 0.7540159583091736, "step": 21300 }, { "epoch": 2.132, "grad_norm": 1.5605757236480713, "learning_rate": 7.868786878687869e-05, "loss": 0.3309, "mean_token_accuracy": 0.7811779588460922, "step": 21320 }, { "epoch": 2.134, "grad_norm": 0.5168538689613342, "learning_rate": 7.866786678667867e-05, "loss": 0.3169, "mean_token_accuracy": 0.7669662177562714, "step": 21340 }, { "epoch": 2.136, "grad_norm": 0.5723507404327393, "learning_rate": 7.864786478647865e-05, "loss": 0.39, "mean_token_accuracy": 0.765573137998581, "step": 21360 }, { "epoch": 2.138, "grad_norm": 0.8903061151504517, "learning_rate": 7.862786278627862e-05, "loss": 0.3848, "mean_token_accuracy": 0.7800672978162766, "step": 21380 }, { "epoch": 2.14, "grad_norm": 0.8117502331733704, "learning_rate": 7.860786078607861e-05, "loss": 0.3505, "mean_token_accuracy": 0.7808476835489273, "step": 21400 }, { "epoch": 2.142, "grad_norm": 0.5283521413803101, "learning_rate": 7.858785878587859e-05, "loss": 0.457, "mean_token_accuracy": 0.7639314144849777, "step": 21420 }, { "epoch": 2.144, "grad_norm": 0.5168699026107788, "learning_rate": 7.856785678567857e-05, "loss": 0.362, "mean_token_accuracy": 0.7822362810373307, "step": 21440 }, { "epoch": 2.146, "grad_norm": 0.6064140796661377, "learning_rate": 7.854785478547854e-05, "loss": 0.5071, "mean_token_accuracy": 0.7540976494550705, "step": 21460 }, { "epoch": 2.148, "grad_norm": 1.5769844055175781, "learning_rate": 7.852785278527854e-05, "loss": 0.3982, "mean_token_accuracy": 0.7477894872426987, "step": 21480 }, { "epoch": 2.15, "grad_norm": 0.46762216091156006, "learning_rate": 7.850785078507851e-05, "loss": 0.3742, "mean_token_accuracy": 0.7775210112333297, "step": 21500 }, { "epoch": 2.152, "grad_norm": 0.5319267511367798, "learning_rate": 7.848784878487849e-05, "loss": 0.5431, "mean_token_accuracy": 0.7633801937103272, "step": 21520 }, { "epoch": 2.154, "grad_norm": 0.5735356211662292, "learning_rate": 7.846784678467846e-05, "loss": 0.6617, "mean_token_accuracy": 0.7506336003541947, "step": 21540 }, { "epoch": 2.156, "grad_norm": 0.5044766068458557, "learning_rate": 7.844784478447846e-05, "loss": 0.3425, "mean_token_accuracy": 0.7901438921689987, "step": 21560 }, { "epoch": 2.158, "grad_norm": 1.9377018213272095, "learning_rate": 7.842784278427843e-05, "loss": 0.4529, "mean_token_accuracy": 0.7780735045671463, "step": 21580 }, { "epoch": 2.16, "grad_norm": 0.7694812417030334, "learning_rate": 7.840784078407841e-05, "loss": 0.3843, "mean_token_accuracy": 0.7604918301105499, "step": 21600 }, { "epoch": 2.162, "grad_norm": 0.5490021109580994, "learning_rate": 7.83878387838784e-05, "loss": 0.384, "mean_token_accuracy": 0.7735811084508896, "step": 21620 }, { "epoch": 2.164, "grad_norm": 0.6708257794380188, "learning_rate": 7.836783678367838e-05, "loss": 0.4448, "mean_token_accuracy": 0.7569201588630676, "step": 21640 }, { "epoch": 2.166, "grad_norm": 0.5557554960250854, "learning_rate": 7.834783478347835e-05, "loss": 0.5439, "mean_token_accuracy": 0.7582821995019913, "step": 21660 }, { "epoch": 2.168, "grad_norm": 0.5262100696563721, "learning_rate": 7.832783278327833e-05, "loss": 0.2589, "mean_token_accuracy": 0.7871497839689254, "step": 21680 }, { "epoch": 2.17, "grad_norm": 0.6100625991821289, "learning_rate": 7.830783078307831e-05, "loss": 0.4343, "mean_token_accuracy": 0.7761149734258652, "step": 21700 }, { "epoch": 2.172, "grad_norm": 0.5215007066726685, "learning_rate": 7.82878287828783e-05, "loss": 0.3832, "mean_token_accuracy": 0.7818897247314454, "step": 21720 }, { "epoch": 2.174, "grad_norm": 0.788175642490387, "learning_rate": 7.826782678267826e-05, "loss": 0.4859, "mean_token_accuracy": 0.774088191986084, "step": 21740 }, { "epoch": 2.176, "grad_norm": 0.6623234748840332, "learning_rate": 7.824782478247825e-05, "loss": 0.3588, "mean_token_accuracy": 0.776423841714859, "step": 21760 }, { "epoch": 2.178, "grad_norm": 3.562682867050171, "learning_rate": 7.822782278227823e-05, "loss": 0.385, "mean_token_accuracy": 0.7695600748062134, "step": 21780 }, { "epoch": 2.18, "grad_norm": 0.8040341734886169, "learning_rate": 7.820782078207821e-05, "loss": 0.4227, "mean_token_accuracy": 0.7614876389503479, "step": 21800 }, { "epoch": 2.182, "grad_norm": 0.5363765358924866, "learning_rate": 7.81878187818782e-05, "loss": 0.3005, "mean_token_accuracy": 0.7785279542207718, "step": 21820 }, { "epoch": 2.184, "grad_norm": 0.5078396201133728, "learning_rate": 7.816781678167817e-05, "loss": 0.5828, "mean_token_accuracy": 0.7728529512882233, "step": 21840 }, { "epoch": 2.186, "grad_norm": 1.506076693534851, "learning_rate": 7.814781478147816e-05, "loss": 0.4048, "mean_token_accuracy": 0.7773618817329406, "step": 21860 }, { "epoch": 2.188, "grad_norm": 0.6758870482444763, "learning_rate": 7.812781278127813e-05, "loss": 0.3735, "mean_token_accuracy": 0.7635821223258972, "step": 21880 }, { "epoch": 2.19, "grad_norm": 0.9340448975563049, "learning_rate": 7.810781078107812e-05, "loss": 0.3177, "mean_token_accuracy": 0.7719242721796036, "step": 21900 }, { "epoch": 2.192, "grad_norm": 0.5044220089912415, "learning_rate": 7.808780878087808e-05, "loss": 0.4502, "mean_token_accuracy": 0.7686127305030823, "step": 21920 }, { "epoch": 2.194, "grad_norm": 0.8492603302001953, "learning_rate": 7.806780678067808e-05, "loss": 0.3798, "mean_token_accuracy": 0.7854416042566299, "step": 21940 }, { "epoch": 2.196, "grad_norm": 1.0079522132873535, "learning_rate": 7.804780478047805e-05, "loss": 0.3538, "mean_token_accuracy": 0.7872963964939117, "step": 21960 }, { "epoch": 2.198, "grad_norm": 4.38313627243042, "learning_rate": 7.802780278027803e-05, "loss": 0.5251, "mean_token_accuracy": 0.7650081068277359, "step": 21980 }, { "epoch": 2.2, "grad_norm": 0.49484026432037354, "learning_rate": 7.800780078007802e-05, "loss": 0.3467, "mean_token_accuracy": 0.777098998427391, "step": 22000 }, { "epoch": 2.202, "grad_norm": 0.8215750455856323, "learning_rate": 7.7987798779878e-05, "loss": 0.4863, "mean_token_accuracy": 0.7560094028711319, "step": 22020 }, { "epoch": 2.204, "grad_norm": 0.4980577230453491, "learning_rate": 7.796779677967797e-05, "loss": 0.378, "mean_token_accuracy": 0.7757959872484207, "step": 22040 }, { "epoch": 2.206, "grad_norm": 0.5537676215171814, "learning_rate": 7.794779477947795e-05, "loss": 0.3478, "mean_token_accuracy": 0.771566066145897, "step": 22060 }, { "epoch": 2.208, "grad_norm": 0.510953962802887, "learning_rate": 7.792779277927794e-05, "loss": 0.3204, "mean_token_accuracy": 0.7634815543889999, "step": 22080 }, { "epoch": 2.21, "grad_norm": 0.5319976210594177, "learning_rate": 7.790779077907792e-05, "loss": 0.4075, "mean_token_accuracy": 0.7666189491748809, "step": 22100 }, { "epoch": 2.212, "grad_norm": 1.0083792209625244, "learning_rate": 7.788778877887789e-05, "loss": 0.4262, "mean_token_accuracy": 0.7651657432317733, "step": 22120 }, { "epoch": 2.214, "grad_norm": 1.4775347709655762, "learning_rate": 7.786778677867787e-05, "loss": 0.4018, "mean_token_accuracy": 0.775810244679451, "step": 22140 }, { "epoch": 2.216, "grad_norm": 0.8043671250343323, "learning_rate": 7.784778477847785e-05, "loss": 0.423, "mean_token_accuracy": 0.7668640911579132, "step": 22160 }, { "epoch": 2.218, "grad_norm": 1.5242195129394531, "learning_rate": 7.782778277827784e-05, "loss": 0.4732, "mean_token_accuracy": 0.7557585328817368, "step": 22180 }, { "epoch": 2.22, "grad_norm": 0.6835929751396179, "learning_rate": 7.780778077807781e-05, "loss": 0.4387, "mean_token_accuracy": 0.7570622056722641, "step": 22200 }, { "epoch": 2.222, "grad_norm": 1.1332660913467407, "learning_rate": 7.778777877787779e-05, "loss": 0.4113, "mean_token_accuracy": 0.7799622237682342, "step": 22220 }, { "epoch": 2.224, "grad_norm": 0.59469074010849, "learning_rate": 7.776777677767777e-05, "loss": 0.3853, "mean_token_accuracy": 0.7504439800977707, "step": 22240 }, { "epoch": 2.226, "grad_norm": 1.124879240989685, "learning_rate": 7.774777477747776e-05, "loss": 0.4875, "mean_token_accuracy": 0.7736962288618088, "step": 22260 }, { "epoch": 2.228, "grad_norm": 0.5869929194450378, "learning_rate": 7.772777277727773e-05, "loss": 0.5236, "mean_token_accuracy": 0.7690495491027832, "step": 22280 }, { "epoch": 2.23, "grad_norm": 1.3079770803451538, "learning_rate": 7.770777077707771e-05, "loss": 0.4823, "mean_token_accuracy": 0.7686618953943253, "step": 22300 }, { "epoch": 2.232, "grad_norm": 0.8872121572494507, "learning_rate": 7.768776877687769e-05, "loss": 0.3588, "mean_token_accuracy": 0.7513070285320282, "step": 22320 }, { "epoch": 2.234, "grad_norm": 0.5672991871833801, "learning_rate": 7.766776677667767e-05, "loss": 0.3603, "mean_token_accuracy": 0.769586706161499, "step": 22340 }, { "epoch": 2.2359999999999998, "grad_norm": 0.5220412015914917, "learning_rate": 7.764776477647764e-05, "loss": 0.3866, "mean_token_accuracy": 0.7713562160730362, "step": 22360 }, { "epoch": 2.238, "grad_norm": 0.7763454914093018, "learning_rate": 7.762776277627764e-05, "loss": 0.4987, "mean_token_accuracy": 0.7567514985799789, "step": 22380 }, { "epoch": 2.24, "grad_norm": 1.0679988861083984, "learning_rate": 7.760776077607761e-05, "loss": 0.5152, "mean_token_accuracy": 0.7603351235389709, "step": 22400 }, { "epoch": 2.242, "grad_norm": 0.9136451482772827, "learning_rate": 7.75877587758776e-05, "loss": 0.4539, "mean_token_accuracy": 0.7468688458204269, "step": 22420 }, { "epoch": 2.2439999999999998, "grad_norm": 0.5695063471794128, "learning_rate": 7.756775677567756e-05, "loss": 0.4424, "mean_token_accuracy": 0.7612689167261124, "step": 22440 }, { "epoch": 2.246, "grad_norm": 0.7886316776275635, "learning_rate": 7.754775477547756e-05, "loss": 0.3628, "mean_token_accuracy": 0.7737084180116653, "step": 22460 }, { "epoch": 2.248, "grad_norm": 0.6740391850471497, "learning_rate": 7.752775277527753e-05, "loss": 0.3923, "mean_token_accuracy": 0.7509552389383316, "step": 22480 }, { "epoch": 2.25, "grad_norm": 0.490797221660614, "learning_rate": 7.750775077507751e-05, "loss": 0.432, "mean_token_accuracy": 0.7623810440301895, "step": 22500 }, { "epoch": 2.252, "grad_norm": 0.6149345636367798, "learning_rate": 7.74877487748775e-05, "loss": 0.3683, "mean_token_accuracy": 0.7723636955022812, "step": 22520 }, { "epoch": 2.254, "grad_norm": 1.8003937005996704, "learning_rate": 7.746774677467748e-05, "loss": 0.4256, "mean_token_accuracy": 0.7628582119941711, "step": 22540 }, { "epoch": 2.2560000000000002, "grad_norm": 0.4745742380619049, "learning_rate": 7.744774477447745e-05, "loss": 0.3768, "mean_token_accuracy": 0.7698173224925995, "step": 22560 }, { "epoch": 2.258, "grad_norm": 0.5605483651161194, "learning_rate": 7.742774277427743e-05, "loss": 0.3384, "mean_token_accuracy": 0.7798591583967209, "step": 22580 }, { "epoch": 2.26, "grad_norm": 1.1290463209152222, "learning_rate": 7.740774077407741e-05, "loss": 0.4952, "mean_token_accuracy": 0.7578169465065002, "step": 22600 }, { "epoch": 2.262, "grad_norm": 0.49361544847488403, "learning_rate": 7.73877387738774e-05, "loss": 0.3148, "mean_token_accuracy": 0.7763891398906708, "step": 22620 }, { "epoch": 2.2640000000000002, "grad_norm": 0.5748021006584167, "learning_rate": 7.736773677367737e-05, "loss": 0.5084, "mean_token_accuracy": 0.7710334181785583, "step": 22640 }, { "epoch": 2.266, "grad_norm": 2.3144545555114746, "learning_rate": 7.734773477347735e-05, "loss": 0.4416, "mean_token_accuracy": 0.7693270295858383, "step": 22660 }, { "epoch": 2.268, "grad_norm": 0.8633536696434021, "learning_rate": 7.732773277327733e-05, "loss": 0.3628, "mean_token_accuracy": 0.7655173152685165, "step": 22680 }, { "epoch": 2.27, "grad_norm": 0.703799307346344, "learning_rate": 7.730773077307732e-05, "loss": 0.5023, "mean_token_accuracy": 0.7703765064477921, "step": 22700 }, { "epoch": 2.2720000000000002, "grad_norm": 0.5855663418769836, "learning_rate": 7.728772877287728e-05, "loss": 0.4547, "mean_token_accuracy": 0.7775100082159042, "step": 22720 }, { "epoch": 2.274, "grad_norm": 1.2994557619094849, "learning_rate": 7.726772677267727e-05, "loss": 0.3689, "mean_token_accuracy": 0.7621994644403458, "step": 22740 }, { "epoch": 2.276, "grad_norm": 0.7375706434249878, "learning_rate": 7.724772477247725e-05, "loss": 0.476, "mean_token_accuracy": 0.775600591301918, "step": 22760 }, { "epoch": 2.278, "grad_norm": 0.8407130241394043, "learning_rate": 7.722772277227723e-05, "loss": 0.3807, "mean_token_accuracy": 0.7860877007246018, "step": 22780 }, { "epoch": 2.2800000000000002, "grad_norm": 2.0315163135528564, "learning_rate": 7.72077207720772e-05, "loss": 0.4893, "mean_token_accuracy": 0.7709286987781525, "step": 22800 }, { "epoch": 2.282, "grad_norm": 0.512087345123291, "learning_rate": 7.718771877187719e-05, "loss": 0.3498, "mean_token_accuracy": 0.7771093130111695, "step": 22820 }, { "epoch": 2.284, "grad_norm": 0.845865786075592, "learning_rate": 7.716771677167717e-05, "loss": 0.2855, "mean_token_accuracy": 0.7569301158189774, "step": 22840 }, { "epoch": 2.286, "grad_norm": 0.5056632161140442, "learning_rate": 7.714771477147715e-05, "loss": 0.422, "mean_token_accuracy": 0.7582791298627853, "step": 22860 }, { "epoch": 2.288, "grad_norm": 0.4973207712173462, "learning_rate": 7.712771277127712e-05, "loss": 0.3843, "mean_token_accuracy": 0.7777834117412568, "step": 22880 }, { "epoch": 2.29, "grad_norm": 0.4650529623031616, "learning_rate": 7.710771077107712e-05, "loss": 0.2611, "mean_token_accuracy": 0.7922221422195435, "step": 22900 }, { "epoch": 2.292, "grad_norm": 0.5173490047454834, "learning_rate": 7.708770877087709e-05, "loss": 0.3589, "mean_token_accuracy": 0.7753509372472763, "step": 22920 }, { "epoch": 2.294, "grad_norm": 0.8033084273338318, "learning_rate": 7.706770677067707e-05, "loss": 0.3198, "mean_token_accuracy": 0.770737886428833, "step": 22940 }, { "epoch": 2.296, "grad_norm": 1.2661125659942627, "learning_rate": 7.704770477047704e-05, "loss": 0.4386, "mean_token_accuracy": 0.7566825151443481, "step": 22960 }, { "epoch": 2.298, "grad_norm": 0.6954066753387451, "learning_rate": 7.702770277027704e-05, "loss": 0.3797, "mean_token_accuracy": 0.7667675644159317, "step": 22980 }, { "epoch": 2.3, "grad_norm": 0.6038840413093567, "learning_rate": 7.700770077007701e-05, "loss": 0.4194, "mean_token_accuracy": 0.7596136897802352, "step": 23000 }, { "epoch": 2.302, "grad_norm": 0.3983232378959656, "learning_rate": 7.698769876987699e-05, "loss": 0.3535, "mean_token_accuracy": 0.7669613540172577, "step": 23020 }, { "epoch": 2.304, "grad_norm": 0.5551156997680664, "learning_rate": 7.696769676967697e-05, "loss": 0.3746, "mean_token_accuracy": 0.7744466125965118, "step": 23040 }, { "epoch": 2.306, "grad_norm": 0.6440017819404602, "learning_rate": 7.694769476947696e-05, "loss": 0.3842, "mean_token_accuracy": 0.7575966894626618, "step": 23060 }, { "epoch": 2.308, "grad_norm": 0.5775814652442932, "learning_rate": 7.692769276927693e-05, "loss": 0.3729, "mean_token_accuracy": 0.780083978176117, "step": 23080 }, { "epoch": 2.31, "grad_norm": 0.5683767795562744, "learning_rate": 7.690769076907691e-05, "loss": 0.3635, "mean_token_accuracy": 0.7795859843492507, "step": 23100 }, { "epoch": 2.312, "grad_norm": 0.509635865688324, "learning_rate": 7.688768876887689e-05, "loss": 0.4568, "mean_token_accuracy": 0.7717027813196182, "step": 23120 }, { "epoch": 2.314, "grad_norm": 0.5953083634376526, "learning_rate": 7.686768676867687e-05, "loss": 0.4574, "mean_token_accuracy": 0.7618860572576522, "step": 23140 }, { "epoch": 2.316, "grad_norm": 0.7802192568778992, "learning_rate": 7.684768476847684e-05, "loss": 0.3627, "mean_token_accuracy": 0.7642887115478516, "step": 23160 }, { "epoch": 2.318, "grad_norm": 2.5221593379974365, "learning_rate": 7.682768276827683e-05, "loss": 0.4068, "mean_token_accuracy": 0.7724740356206894, "step": 23180 }, { "epoch": 2.32, "grad_norm": 0.7932778596878052, "learning_rate": 7.680768076807681e-05, "loss": 0.5131, "mean_token_accuracy": 0.7700458019971848, "step": 23200 }, { "epoch": 2.322, "grad_norm": 0.6598499417304993, "learning_rate": 7.67876787678768e-05, "loss": 0.3562, "mean_token_accuracy": 0.7676775485277176, "step": 23220 }, { "epoch": 2.324, "grad_norm": 1.914103388786316, "learning_rate": 7.676767676767676e-05, "loss": 0.3749, "mean_token_accuracy": 0.7547209173440933, "step": 23240 }, { "epoch": 2.326, "grad_norm": 0.6188720464706421, "learning_rate": 7.674767476747675e-05, "loss": 0.2546, "mean_token_accuracy": 0.7757228642702103, "step": 23260 }, { "epoch": 2.328, "grad_norm": 0.5623543858528137, "learning_rate": 7.672767276727673e-05, "loss": 0.332, "mean_token_accuracy": 0.7765046328306198, "step": 23280 }, { "epoch": 2.33, "grad_norm": 0.897958517074585, "learning_rate": 7.670767076707671e-05, "loss": 0.3717, "mean_token_accuracy": 0.7729084879159928, "step": 23300 }, { "epoch": 2.332, "grad_norm": 1.8490970134735107, "learning_rate": 7.66876687668767e-05, "loss": 0.4875, "mean_token_accuracy": 0.7736924886703491, "step": 23320 }, { "epoch": 2.334, "grad_norm": 0.6084020733833313, "learning_rate": 7.666766676667668e-05, "loss": 0.4738, "mean_token_accuracy": 0.7458075374364853, "step": 23340 }, { "epoch": 2.336, "grad_norm": 0.42907199263572693, "learning_rate": 7.664766476647666e-05, "loss": 0.3029, "mean_token_accuracy": 0.7728521287441253, "step": 23360 }, { "epoch": 2.338, "grad_norm": 0.6761646866798401, "learning_rate": 7.662766276627663e-05, "loss": 0.3922, "mean_token_accuracy": 0.7730079114437103, "step": 23380 }, { "epoch": 2.34, "grad_norm": 0.4173096716403961, "learning_rate": 7.660766076607661e-05, "loss": 0.3419, "mean_token_accuracy": 0.7771919220685959, "step": 23400 }, { "epoch": 2.342, "grad_norm": 1.4635635614395142, "learning_rate": 7.65876587658766e-05, "loss": 0.5754, "mean_token_accuracy": 0.7608881771564484, "step": 23420 }, { "epoch": 2.344, "grad_norm": 1.2001796960830688, "learning_rate": 7.656765676567658e-05, "loss": 0.4446, "mean_token_accuracy": 0.7745162934064865, "step": 23440 }, { "epoch": 2.346, "grad_norm": 0.5743774175643921, "learning_rate": 7.654765476547655e-05, "loss": 0.3829, "mean_token_accuracy": 0.7449348717927933, "step": 23460 }, { "epoch": 2.348, "grad_norm": 0.6108196973800659, "learning_rate": 7.652765276527653e-05, "loss": 0.5534, "mean_token_accuracy": 0.7588549643754959, "step": 23480 }, { "epoch": 2.35, "grad_norm": 0.898906946182251, "learning_rate": 7.650765076507652e-05, "loss": 0.4432, "mean_token_accuracy": 0.7646829158067703, "step": 23500 }, { "epoch": 2.352, "grad_norm": 0.6535515189170837, "learning_rate": 7.64876487648765e-05, "loss": 0.4645, "mean_token_accuracy": 0.7563628047704697, "step": 23520 }, { "epoch": 2.354, "grad_norm": 0.6854336857795715, "learning_rate": 7.646764676467647e-05, "loss": 0.4702, "mean_token_accuracy": 0.7778071522712707, "step": 23540 }, { "epoch": 2.356, "grad_norm": 0.5824872851371765, "learning_rate": 7.644764476447645e-05, "loss": 0.5846, "mean_token_accuracy": 0.7580104023218155, "step": 23560 }, { "epoch": 2.358, "grad_norm": 1.7997703552246094, "learning_rate": 7.642764276427643e-05, "loss": 0.4808, "mean_token_accuracy": 0.7650878459215165, "step": 23580 }, { "epoch": 2.36, "grad_norm": 1.5422656536102295, "learning_rate": 7.640764076407642e-05, "loss": 0.4508, "mean_token_accuracy": 0.7707587957382203, "step": 23600 }, { "epoch": 2.362, "grad_norm": 0.5635190010070801, "learning_rate": 7.638763876387639e-05, "loss": 0.2947, "mean_token_accuracy": 0.7763437628746033, "step": 23620 }, { "epoch": 2.364, "grad_norm": 0.6673288941383362, "learning_rate": 7.636763676367637e-05, "loss": 0.4564, "mean_token_accuracy": 0.7578366965055465, "step": 23640 }, { "epoch": 2.366, "grad_norm": 0.7083185315132141, "learning_rate": 7.634763476347635e-05, "loss": 0.3792, "mean_token_accuracy": 0.7621849268674851, "step": 23660 }, { "epoch": 2.368, "grad_norm": 0.6734260320663452, "learning_rate": 7.632763276327634e-05, "loss": 0.4607, "mean_token_accuracy": 0.7646942228078842, "step": 23680 }, { "epoch": 2.37, "grad_norm": 0.48572617769241333, "learning_rate": 7.63076307630763e-05, "loss": 0.3975, "mean_token_accuracy": 0.7767685562372207, "step": 23700 }, { "epoch": 2.372, "grad_norm": 0.9049386382102966, "learning_rate": 7.62876287628763e-05, "loss": 0.363, "mean_token_accuracy": 0.7674583852291107, "step": 23720 }, { "epoch": 2.374, "grad_norm": 1.1591734886169434, "learning_rate": 7.626762676267627e-05, "loss": 0.3899, "mean_token_accuracy": 0.7757192820310592, "step": 23740 }, { "epoch": 2.376, "grad_norm": 1.609484076499939, "learning_rate": 7.624762476247625e-05, "loss": 0.3793, "mean_token_accuracy": 0.7730779081583024, "step": 23760 }, { "epoch": 2.378, "grad_norm": 0.7479820251464844, "learning_rate": 7.622762276227622e-05, "loss": 0.3689, "mean_token_accuracy": 0.7836084514856339, "step": 23780 }, { "epoch": 2.38, "grad_norm": 0.70456862449646, "learning_rate": 7.620762076207622e-05, "loss": 0.4454, "mean_token_accuracy": 0.7719931036233902, "step": 23800 }, { "epoch": 2.382, "grad_norm": 1.0259613990783691, "learning_rate": 7.618761876187619e-05, "loss": 0.3852, "mean_token_accuracy": 0.7724763840436936, "step": 23820 }, { "epoch": 2.384, "grad_norm": 1.1266697645187378, "learning_rate": 7.616761676167617e-05, "loss": 0.4201, "mean_token_accuracy": 0.7666309386491775, "step": 23840 }, { "epoch": 2.386, "grad_norm": 0.6358482837677002, "learning_rate": 7.614761476147616e-05, "loss": 0.4233, "mean_token_accuracy": 0.775671300292015, "step": 23860 }, { "epoch": 2.388, "grad_norm": 0.6102115511894226, "learning_rate": 7.612761276127614e-05, "loss": 0.425, "mean_token_accuracy": 0.7738061904907226, "step": 23880 }, { "epoch": 2.39, "grad_norm": 1.3303892612457275, "learning_rate": 7.610761076107611e-05, "loss": 0.4281, "mean_token_accuracy": 0.7549847185611724, "step": 23900 }, { "epoch": 2.392, "grad_norm": 0.513321042060852, "learning_rate": 7.608760876087609e-05, "loss": 0.4765, "mean_token_accuracy": 0.7706928968429565, "step": 23920 }, { "epoch": 2.394, "grad_norm": 0.520776629447937, "learning_rate": 7.606760676067607e-05, "loss": 0.4887, "mean_token_accuracy": 0.7684839367866516, "step": 23940 }, { "epoch": 2.396, "grad_norm": 0.6654241681098938, "learning_rate": 7.604760476047606e-05, "loss": 0.2894, "mean_token_accuracy": 0.7762702226638794, "step": 23960 }, { "epoch": 2.398, "grad_norm": 0.6476843953132629, "learning_rate": 7.602760276027603e-05, "loss": 0.3452, "mean_token_accuracy": 0.7781904131174088, "step": 23980 }, { "epoch": 2.4, "grad_norm": 0.6018632650375366, "learning_rate": 7.600760076007601e-05, "loss": 0.3755, "mean_token_accuracy": 0.7614085584878921, "step": 24000 }, { "epoch": 2.402, "grad_norm": 0.46126607060432434, "learning_rate": 7.598759875987599e-05, "loss": 0.4628, "mean_token_accuracy": 0.7637097507715225, "step": 24020 }, { "epoch": 2.404, "grad_norm": 0.7222064733505249, "learning_rate": 7.596759675967598e-05, "loss": 0.3877, "mean_token_accuracy": 0.7722072660923004, "step": 24040 }, { "epoch": 2.406, "grad_norm": 1.1507998704910278, "learning_rate": 7.594759475947595e-05, "loss": 0.3502, "mean_token_accuracy": 0.769579142332077, "step": 24060 }, { "epoch": 2.408, "grad_norm": 0.5809856057167053, "learning_rate": 7.592759275927593e-05, "loss": 0.4703, "mean_token_accuracy": 0.7642252236604691, "step": 24080 }, { "epoch": 2.41, "grad_norm": 0.6736276149749756, "learning_rate": 7.590759075907591e-05, "loss": 0.4058, "mean_token_accuracy": 0.7483141392469406, "step": 24100 }, { "epoch": 2.412, "grad_norm": 0.5366234183311462, "learning_rate": 7.58875887588759e-05, "loss": 0.4313, "mean_token_accuracy": 0.7676592200994492, "step": 24120 }, { "epoch": 2.414, "grad_norm": 0.5376785397529602, "learning_rate": 7.586758675867586e-05, "loss": 0.2893, "mean_token_accuracy": 0.7584418743848801, "step": 24140 }, { "epoch": 2.416, "grad_norm": 0.9400070905685425, "learning_rate": 7.584758475847585e-05, "loss": 0.5795, "mean_token_accuracy": 0.7563718825578689, "step": 24160 }, { "epoch": 2.418, "grad_norm": 1.0221420526504517, "learning_rate": 7.582758275827583e-05, "loss": 0.4893, "mean_token_accuracy": 0.7586238563060761, "step": 24180 }, { "epoch": 2.42, "grad_norm": 0.5170950889587402, "learning_rate": 7.580758075807581e-05, "loss": 0.3101, "mean_token_accuracy": 0.7725032687187194, "step": 24200 }, { "epoch": 2.422, "grad_norm": 0.9415600299835205, "learning_rate": 7.578757875787578e-05, "loss": 0.4707, "mean_token_accuracy": 0.7505202382802963, "step": 24220 }, { "epoch": 2.424, "grad_norm": 0.5214412212371826, "learning_rate": 7.576757675767578e-05, "loss": 0.3997, "mean_token_accuracy": 0.7848150968551636, "step": 24240 }, { "epoch": 2.426, "grad_norm": 0.5146944522857666, "learning_rate": 7.574757475747575e-05, "loss": 0.3014, "mean_token_accuracy": 0.7693692743778229, "step": 24260 }, { "epoch": 2.428, "grad_norm": 0.7192540764808655, "learning_rate": 7.572757275727573e-05, "loss": 0.3341, "mean_token_accuracy": 0.7799159497022629, "step": 24280 }, { "epoch": 2.43, "grad_norm": 1.0202417373657227, "learning_rate": 7.57075707570757e-05, "loss": 0.3321, "mean_token_accuracy": 0.7783661514520646, "step": 24300 }, { "epoch": 2.432, "grad_norm": 0.5408229231834412, "learning_rate": 7.56875687568757e-05, "loss": 0.3316, "mean_token_accuracy": 0.7834094583988189, "step": 24320 }, { "epoch": 2.434, "grad_norm": 0.4366854727268219, "learning_rate": 7.566756675667567e-05, "loss": 0.3592, "mean_token_accuracy": 0.7691802084445953, "step": 24340 }, { "epoch": 2.436, "grad_norm": 0.5033348798751831, "learning_rate": 7.564756475647565e-05, "loss": 0.3752, "mean_token_accuracy": 0.7784680545330047, "step": 24360 }, { "epoch": 2.438, "grad_norm": 0.5097927451133728, "learning_rate": 7.562756275627563e-05, "loss": 0.3522, "mean_token_accuracy": 0.7622760862112046, "step": 24380 }, { "epoch": 2.44, "grad_norm": 0.6013988852500916, "learning_rate": 7.560756075607562e-05, "loss": 0.3499, "mean_token_accuracy": 0.7926249384880066, "step": 24400 }, { "epoch": 2.442, "grad_norm": 0.7303833961486816, "learning_rate": 7.558755875587559e-05, "loss": 0.4262, "mean_token_accuracy": 0.7765411227941513, "step": 24420 }, { "epoch": 2.444, "grad_norm": 0.4800407886505127, "learning_rate": 7.556755675567557e-05, "loss": 0.4053, "mean_token_accuracy": 0.7596575886011123, "step": 24440 }, { "epoch": 2.446, "grad_norm": 0.530717670917511, "learning_rate": 7.554755475547555e-05, "loss": 0.2978, "mean_token_accuracy": 0.7809791982173919, "step": 24460 }, { "epoch": 2.448, "grad_norm": 1.0906200408935547, "learning_rate": 7.552755275527554e-05, "loss": 0.4881, "mean_token_accuracy": 0.7432172119617462, "step": 24480 }, { "epoch": 2.45, "grad_norm": 0.5246856212615967, "learning_rate": 7.55075507550755e-05, "loss": 0.3921, "mean_token_accuracy": 0.7595292001962661, "step": 24500 }, { "epoch": 2.452, "grad_norm": 0.5531862378120422, "learning_rate": 7.548754875487549e-05, "loss": 0.4425, "mean_token_accuracy": 0.7648659855127334, "step": 24520 }, { "epoch": 2.454, "grad_norm": 0.6864519715309143, "learning_rate": 7.546754675467547e-05, "loss": 0.3352, "mean_token_accuracy": 0.7720136374235154, "step": 24540 }, { "epoch": 2.456, "grad_norm": 0.6881136298179626, "learning_rate": 7.544754475447545e-05, "loss": 0.4468, "mean_token_accuracy": 0.7684414744377136, "step": 24560 }, { "epoch": 2.458, "grad_norm": 0.8115366697311401, "learning_rate": 7.542754275427542e-05, "loss": 0.4449, "mean_token_accuracy": 0.760135293006897, "step": 24580 }, { "epoch": 2.46, "grad_norm": 0.4169372320175171, "learning_rate": 7.54075407540754e-05, "loss": 0.4226, "mean_token_accuracy": 0.7828252017498016, "step": 24600 }, { "epoch": 2.462, "grad_norm": 2.2294623851776123, "learning_rate": 7.538753875387539e-05, "loss": 0.3453, "mean_token_accuracy": 0.7713555991649628, "step": 24620 }, { "epoch": 2.464, "grad_norm": 0.9809215664863586, "learning_rate": 7.536753675367537e-05, "loss": 0.5487, "mean_token_accuracy": 0.7365349590778351, "step": 24640 }, { "epoch": 2.466, "grad_norm": 0.7193884253501892, "learning_rate": 7.534753475347534e-05, "loss": 0.5276, "mean_token_accuracy": 0.7652481853961944, "step": 24660 }, { "epoch": 2.468, "grad_norm": 3.02557635307312, "learning_rate": 7.532753275327533e-05, "loss": 0.3638, "mean_token_accuracy": 0.7620997816324234, "step": 24680 }, { "epoch": 2.4699999999999998, "grad_norm": 0.9154054522514343, "learning_rate": 7.530753075307531e-05, "loss": 0.3364, "mean_token_accuracy": 0.7746912926435471, "step": 24700 }, { "epoch": 2.472, "grad_norm": 0.5519665479660034, "learning_rate": 7.528752875287529e-05, "loss": 0.2994, "mean_token_accuracy": 0.7725926518440247, "step": 24720 }, { "epoch": 2.474, "grad_norm": 0.5812709927558899, "learning_rate": 7.526752675267526e-05, "loss": 0.4625, "mean_token_accuracy": 0.7602076083421707, "step": 24740 }, { "epoch": 2.476, "grad_norm": 0.6820473074913025, "learning_rate": 7.524752475247526e-05, "loss": 0.3672, "mean_token_accuracy": 0.7771924197673797, "step": 24760 }, { "epoch": 2.4779999999999998, "grad_norm": 0.5386390089988708, "learning_rate": 7.522752275227523e-05, "loss": 0.329, "mean_token_accuracy": 0.7798908114433288, "step": 24780 }, { "epoch": 2.48, "grad_norm": 0.6058246493339539, "learning_rate": 7.520752075207521e-05, "loss": 0.4089, "mean_token_accuracy": 0.764163413643837, "step": 24800 }, { "epoch": 2.482, "grad_norm": 0.6935483813285828, "learning_rate": 7.518751875187519e-05, "loss": 0.3158, "mean_token_accuracy": 0.7751814395189285, "step": 24820 }, { "epoch": 2.484, "grad_norm": 0.49146005511283875, "learning_rate": 7.516751675167518e-05, "loss": 0.3125, "mean_token_accuracy": 0.7775338530540467, "step": 24840 }, { "epoch": 2.4859999999999998, "grad_norm": 0.4693954586982727, "learning_rate": 7.514751475147516e-05, "loss": 0.3706, "mean_token_accuracy": 0.7726909905672074, "step": 24860 }, { "epoch": 2.488, "grad_norm": 0.47553274035453796, "learning_rate": 7.512751275127513e-05, "loss": 0.3742, "mean_token_accuracy": 0.7776723593473435, "step": 24880 }, { "epoch": 2.49, "grad_norm": 0.4401707351207733, "learning_rate": 7.510751075107511e-05, "loss": 0.3969, "mean_token_accuracy": 0.7541901379823684, "step": 24900 }, { "epoch": 2.492, "grad_norm": 0.5397002100944519, "learning_rate": 7.50875087508751e-05, "loss": 0.4237, "mean_token_accuracy": 0.7732920110225677, "step": 24920 }, { "epoch": 2.4939999999999998, "grad_norm": 0.5383827090263367, "learning_rate": 7.506750675067508e-05, "loss": 0.3618, "mean_token_accuracy": 0.7733333766460418, "step": 24940 }, { "epoch": 2.496, "grad_norm": 0.6489604115486145, "learning_rate": 7.504750475047505e-05, "loss": 0.3357, "mean_token_accuracy": 0.7690646857023239, "step": 24960 }, { "epoch": 2.498, "grad_norm": 1.1094436645507812, "learning_rate": 7.502750275027503e-05, "loss": 0.4036, "mean_token_accuracy": 0.7767532944679261, "step": 24980 }, { "epoch": 2.5, "grad_norm": 0.6057858467102051, "learning_rate": 7.500750075007501e-05, "loss": 0.3555, "mean_token_accuracy": 0.7563881278038025, "step": 25000 }, { "epoch": 2.502, "grad_norm": 0.5300800204277039, "learning_rate": 7.4987498749875e-05, "loss": 0.4177, "mean_token_accuracy": 0.783178648352623, "step": 25020 }, { "epoch": 2.504, "grad_norm": 0.8406039476394653, "learning_rate": 7.496749674967497e-05, "loss": 0.387, "mean_token_accuracy": 0.7647445738315582, "step": 25040 }, { "epoch": 2.5060000000000002, "grad_norm": 2.612395763397217, "learning_rate": 7.494749474947496e-05, "loss": 0.4305, "mean_token_accuracy": 0.77183196246624, "step": 25060 }, { "epoch": 2.508, "grad_norm": 1.2056868076324463, "learning_rate": 7.492749274927493e-05, "loss": 0.533, "mean_token_accuracy": 0.750890052318573, "step": 25080 }, { "epoch": 2.51, "grad_norm": 2.8100829124450684, "learning_rate": 7.490749074907492e-05, "loss": 0.4342, "mean_token_accuracy": 0.7760986238718033, "step": 25100 }, { "epoch": 2.512, "grad_norm": 0.5827141404151917, "learning_rate": 7.488748874887488e-05, "loss": 0.4337, "mean_token_accuracy": 0.779557591676712, "step": 25120 }, { "epoch": 2.5140000000000002, "grad_norm": 0.5698882937431335, "learning_rate": 7.486748674867488e-05, "loss": 0.2682, "mean_token_accuracy": 0.7790238708257675, "step": 25140 }, { "epoch": 2.516, "grad_norm": 1.8857797384262085, "learning_rate": 7.484748474847485e-05, "loss": 0.3278, "mean_token_accuracy": 0.7795716822147369, "step": 25160 }, { "epoch": 2.518, "grad_norm": 0.4478018879890442, "learning_rate": 7.482748274827483e-05, "loss": 0.3638, "mean_token_accuracy": 0.7770979702472687, "step": 25180 }, { "epoch": 2.52, "grad_norm": 0.6441559791564941, "learning_rate": 7.48074807480748e-05, "loss": 0.4352, "mean_token_accuracy": 0.7694665879011154, "step": 25200 }, { "epoch": 2.5220000000000002, "grad_norm": 0.45727354288101196, "learning_rate": 7.47874787478748e-05, "loss": 0.4718, "mean_token_accuracy": 0.7822771698236466, "step": 25220 }, { "epoch": 2.524, "grad_norm": 0.7135100960731506, "learning_rate": 7.476747674767477e-05, "loss": 0.3557, "mean_token_accuracy": 0.7781578004360199, "step": 25240 }, { "epoch": 2.526, "grad_norm": 0.6635820865631104, "learning_rate": 7.474747474747475e-05, "loss": 0.3637, "mean_token_accuracy": 0.757407546043396, "step": 25260 }, { "epoch": 2.528, "grad_norm": 1.6025158166885376, "learning_rate": 7.472747274727474e-05, "loss": 0.4695, "mean_token_accuracy": 0.7698992311954498, "step": 25280 }, { "epoch": 2.5300000000000002, "grad_norm": 0.5484709739685059, "learning_rate": 7.470747074707472e-05, "loss": 0.3547, "mean_token_accuracy": 0.7802099257707595, "step": 25300 }, { "epoch": 2.532, "grad_norm": 1.3524175882339478, "learning_rate": 7.468746874687469e-05, "loss": 0.3532, "mean_token_accuracy": 0.7895096927881241, "step": 25320 }, { "epoch": 2.534, "grad_norm": 0.5647957921028137, "learning_rate": 7.466746674667467e-05, "loss": 0.3076, "mean_token_accuracy": 0.7761312454938889, "step": 25340 }, { "epoch": 2.536, "grad_norm": 2.013195753097534, "learning_rate": 7.464746474647465e-05, "loss": 0.4407, "mean_token_accuracy": 0.7752522319555283, "step": 25360 }, { "epoch": 2.5380000000000003, "grad_norm": 1.3815882205963135, "learning_rate": 7.462746274627464e-05, "loss": 0.467, "mean_token_accuracy": 0.7652805715799331, "step": 25380 }, { "epoch": 2.54, "grad_norm": 5.83587121963501, "learning_rate": 7.46074607460746e-05, "loss": 0.3381, "mean_token_accuracy": 0.7706245630979538, "step": 25400 }, { "epoch": 2.542, "grad_norm": 0.8957182765007019, "learning_rate": 7.458745874587459e-05, "loss": 0.4009, "mean_token_accuracy": 0.7737463712692261, "step": 25420 }, { "epoch": 2.544, "grad_norm": 0.6525290012359619, "learning_rate": 7.456745674567457e-05, "loss": 0.4033, "mean_token_accuracy": 0.7726758718490601, "step": 25440 }, { "epoch": 2.5460000000000003, "grad_norm": 0.5140429735183716, "learning_rate": 7.454745474547456e-05, "loss": 0.3587, "mean_token_accuracy": 0.783271187543869, "step": 25460 }, { "epoch": 2.548, "grad_norm": 1.6453909873962402, "learning_rate": 7.452745274527453e-05, "loss": 0.6637, "mean_token_accuracy": 0.7690565615892411, "step": 25480 }, { "epoch": 2.55, "grad_norm": 0.6619111895561218, "learning_rate": 7.450745074507451e-05, "loss": 0.3639, "mean_token_accuracy": 0.780635404586792, "step": 25500 }, { "epoch": 2.552, "grad_norm": 0.526692807674408, "learning_rate": 7.448744874487449e-05, "loss": 0.2527, "mean_token_accuracy": 0.7741579860448837, "step": 25520 }, { "epoch": 2.5540000000000003, "grad_norm": 0.5004529356956482, "learning_rate": 7.446744674467447e-05, "loss": 0.4678, "mean_token_accuracy": 0.758646410703659, "step": 25540 }, { "epoch": 2.556, "grad_norm": 0.6294264197349548, "learning_rate": 7.444744474447444e-05, "loss": 0.4563, "mean_token_accuracy": 0.7595522582530976, "step": 25560 }, { "epoch": 2.558, "grad_norm": 0.7612650990486145, "learning_rate": 7.442744274427444e-05, "loss": 0.4014, "mean_token_accuracy": 0.7614146888256073, "step": 25580 }, { "epoch": 2.56, "grad_norm": 1.0266075134277344, "learning_rate": 7.440744074407441e-05, "loss": 0.3593, "mean_token_accuracy": 0.7575856864452362, "step": 25600 }, { "epoch": 2.5620000000000003, "grad_norm": 0.5226241946220398, "learning_rate": 7.438743874387439e-05, "loss": 0.336, "mean_token_accuracy": 0.7805801719427109, "step": 25620 }, { "epoch": 2.564, "grad_norm": 0.5892675518989563, "learning_rate": 7.436743674367436e-05, "loss": 0.3447, "mean_token_accuracy": 0.7685180991888046, "step": 25640 }, { "epoch": 2.566, "grad_norm": 0.647567093372345, "learning_rate": 7.434743474347436e-05, "loss": 0.365, "mean_token_accuracy": 0.7663004755973816, "step": 25660 }, { "epoch": 2.568, "grad_norm": 1.1439605951309204, "learning_rate": 7.432743274327433e-05, "loss": 0.5047, "mean_token_accuracy": 0.7562308818101883, "step": 25680 }, { "epoch": 2.57, "grad_norm": 0.6088922023773193, "learning_rate": 7.430743074307431e-05, "loss": 0.3311, "mean_token_accuracy": 0.7770054787397385, "step": 25700 }, { "epoch": 2.572, "grad_norm": 0.8841238617897034, "learning_rate": 7.428742874287428e-05, "loss": 0.3622, "mean_token_accuracy": 0.7806790739297866, "step": 25720 }, { "epoch": 2.574, "grad_norm": 0.5690216422080994, "learning_rate": 7.426742674267428e-05, "loss": 0.3808, "mean_token_accuracy": 0.7687692135572434, "step": 25740 }, { "epoch": 2.576, "grad_norm": 0.6540228128433228, "learning_rate": 7.424742474247425e-05, "loss": 0.3906, "mean_token_accuracy": 0.7560274779796601, "step": 25760 }, { "epoch": 2.578, "grad_norm": 0.6677089929580688, "learning_rate": 7.422742274227423e-05, "loss": 0.3904, "mean_token_accuracy": 0.771072369813919, "step": 25780 }, { "epoch": 2.58, "grad_norm": 0.6336685419082642, "learning_rate": 7.420742074207421e-05, "loss": 0.4637, "mean_token_accuracy": 0.7489790081977844, "step": 25800 }, { "epoch": 2.582, "grad_norm": 1.2293556928634644, "learning_rate": 7.41874187418742e-05, "loss": 0.3621, "mean_token_accuracy": 0.7709732681512833, "step": 25820 }, { "epoch": 2.584, "grad_norm": 1.060499668121338, "learning_rate": 7.416741674167417e-05, "loss": 0.3596, "mean_token_accuracy": 0.7742567628622055, "step": 25840 }, { "epoch": 2.586, "grad_norm": 0.525017499923706, "learning_rate": 7.414741474147415e-05, "loss": 0.3371, "mean_token_accuracy": 0.7758841335773468, "step": 25860 }, { "epoch": 2.588, "grad_norm": 0.5732645988464355, "learning_rate": 7.412741274127413e-05, "loss": 0.5838, "mean_token_accuracy": 0.7690180569887162, "step": 25880 }, { "epoch": 2.59, "grad_norm": 0.6200737953186035, "learning_rate": 7.410741074107411e-05, "loss": 0.3948, "mean_token_accuracy": 0.7709156930446625, "step": 25900 }, { "epoch": 2.592, "grad_norm": 0.5558332204818726, "learning_rate": 7.408740874087408e-05, "loss": 0.4338, "mean_token_accuracy": 0.7487597614526749, "step": 25920 }, { "epoch": 2.594, "grad_norm": 0.5390216708183289, "learning_rate": 7.406740674067407e-05, "loss": 0.4442, "mean_token_accuracy": 0.7483654320240021, "step": 25940 }, { "epoch": 2.596, "grad_norm": 0.5428351759910583, "learning_rate": 7.404740474047405e-05, "loss": 0.3898, "mean_token_accuracy": 0.7771248996257782, "step": 25960 }, { "epoch": 2.598, "grad_norm": 0.5442767143249512, "learning_rate": 7.402740274027403e-05, "loss": 0.4526, "mean_token_accuracy": 0.7621782273054123, "step": 25980 }, { "epoch": 2.6, "grad_norm": 0.6642824411392212, "learning_rate": 7.4007400740074e-05, "loss": 0.2943, "mean_token_accuracy": 0.7816902130842209, "step": 26000 }, { "epoch": 2.602, "grad_norm": 1.543161392211914, "learning_rate": 7.398739873987399e-05, "loss": 0.3415, "mean_token_accuracy": 0.7717452257871628, "step": 26020 }, { "epoch": 2.604, "grad_norm": 0.5920403003692627, "learning_rate": 7.396739673967397e-05, "loss": 0.378, "mean_token_accuracy": 0.7663905560970307, "step": 26040 }, { "epoch": 2.606, "grad_norm": 1.5106810331344604, "learning_rate": 7.394739473947395e-05, "loss": 0.5294, "mean_token_accuracy": 0.7732407808303833, "step": 26060 }, { "epoch": 2.608, "grad_norm": 0.7187085747718811, "learning_rate": 7.392739273927392e-05, "loss": 0.3161, "mean_token_accuracy": 0.7919797301292419, "step": 26080 }, { "epoch": 2.61, "grad_norm": 0.48519036173820496, "learning_rate": 7.390739073907392e-05, "loss": 0.5913, "mean_token_accuracy": 0.7655506879091263, "step": 26100 }, { "epoch": 2.612, "grad_norm": 0.5624449849128723, "learning_rate": 7.388738873887389e-05, "loss": 0.4675, "mean_token_accuracy": 0.7650695115327835, "step": 26120 }, { "epoch": 2.614, "grad_norm": 0.6431352496147156, "learning_rate": 7.386738673867387e-05, "loss": 0.3451, "mean_token_accuracy": 0.771933114528656, "step": 26140 }, { "epoch": 2.616, "grad_norm": 0.6370450258255005, "learning_rate": 7.384738473847384e-05, "loss": 0.3818, "mean_token_accuracy": 0.7686731606721878, "step": 26160 }, { "epoch": 2.618, "grad_norm": 0.8472216725349426, "learning_rate": 7.382738273827384e-05, "loss": 0.3966, "mean_token_accuracy": 0.7632576584815979, "step": 26180 }, { "epoch": 2.62, "grad_norm": 0.4943225085735321, "learning_rate": 7.38073807380738e-05, "loss": 0.3747, "mean_token_accuracy": 0.7768097877502441, "step": 26200 }, { "epoch": 2.622, "grad_norm": 0.8859454989433289, "learning_rate": 7.378737873787379e-05, "loss": 0.3753, "mean_token_accuracy": 0.7739031434059143, "step": 26220 }, { "epoch": 2.624, "grad_norm": 0.40213918685913086, "learning_rate": 7.376737673767376e-05, "loss": 0.3615, "mean_token_accuracy": 0.7721775084733963, "step": 26240 }, { "epoch": 2.626, "grad_norm": 0.6703710556030273, "learning_rate": 7.374737473747376e-05, "loss": 0.3901, "mean_token_accuracy": 0.7678544819355011, "step": 26260 }, { "epoch": 2.628, "grad_norm": 3.4614651203155518, "learning_rate": 7.372737273727373e-05, "loss": 0.3858, "mean_token_accuracy": 0.7625130206346512, "step": 26280 }, { "epoch": 2.63, "grad_norm": 0.8459333777427673, "learning_rate": 7.370737073707371e-05, "loss": 0.4956, "mean_token_accuracy": 0.7662484109401703, "step": 26300 }, { "epoch": 2.632, "grad_norm": 1.3638286590576172, "learning_rate": 7.368736873687369e-05, "loss": 0.4287, "mean_token_accuracy": 0.7693981647491455, "step": 26320 }, { "epoch": 2.634, "grad_norm": 1.3112456798553467, "learning_rate": 7.366736673667367e-05, "loss": 0.4804, "mean_token_accuracy": 0.7534136056900025, "step": 26340 }, { "epoch": 2.636, "grad_norm": 1.219071865081787, "learning_rate": 7.364736473647366e-05, "loss": 0.4266, "mean_token_accuracy": 0.7813971251249313, "step": 26360 }, { "epoch": 2.638, "grad_norm": 0.47472989559173584, "learning_rate": 7.362736273627363e-05, "loss": 0.354, "mean_token_accuracy": 0.7721374541521072, "step": 26380 }, { "epoch": 2.64, "grad_norm": 1.4683942794799805, "learning_rate": 7.360736073607361e-05, "loss": 0.2803, "mean_token_accuracy": 0.7726209580898284, "step": 26400 }, { "epoch": 2.642, "grad_norm": 0.7663973569869995, "learning_rate": 7.358735873587359e-05, "loss": 0.458, "mean_token_accuracy": 0.7644380420446396, "step": 26420 }, { "epoch": 2.644, "grad_norm": 2.150810480117798, "learning_rate": 7.356735673567358e-05, "loss": 0.3195, "mean_token_accuracy": 0.7552993834018707, "step": 26440 }, { "epoch": 2.646, "grad_norm": 0.7259455919265747, "learning_rate": 7.354735473547355e-05, "loss": 0.325, "mean_token_accuracy": 0.7823679059743881, "step": 26460 }, { "epoch": 2.648, "grad_norm": 0.4290153384208679, "learning_rate": 7.352735273527354e-05, "loss": 0.3717, "mean_token_accuracy": 0.7793803513050079, "step": 26480 }, { "epoch": 2.65, "grad_norm": 0.5267384648323059, "learning_rate": 7.350735073507351e-05, "loss": 0.2607, "mean_token_accuracy": 0.7745116829872132, "step": 26500 }, { "epoch": 2.652, "grad_norm": 0.475987046957016, "learning_rate": 7.34873487348735e-05, "loss": 0.3254, "mean_token_accuracy": 0.7815397769212723, "step": 26520 }, { "epoch": 2.654, "grad_norm": 0.48368293046951294, "learning_rate": 7.346734673467346e-05, "loss": 0.3422, "mean_token_accuracy": 0.7646366566419601, "step": 26540 }, { "epoch": 2.656, "grad_norm": 0.5792871117591858, "learning_rate": 7.344734473447346e-05, "loss": 0.3183, "mean_token_accuracy": 0.7626155346632004, "step": 26560 }, { "epoch": 2.658, "grad_norm": 1.0347193479537964, "learning_rate": 7.342734273427343e-05, "loss": 0.3609, "mean_token_accuracy": 0.7649537444114685, "step": 26580 }, { "epoch": 2.66, "grad_norm": 0.6120941638946533, "learning_rate": 7.340734073407341e-05, "loss": 0.2685, "mean_token_accuracy": 0.7677059471607208, "step": 26600 }, { "epoch": 2.662, "grad_norm": 0.4521442651748657, "learning_rate": 7.33873387338734e-05, "loss": 0.3305, "mean_token_accuracy": 0.7618342250585556, "step": 26620 }, { "epoch": 2.664, "grad_norm": 0.7313169240951538, "learning_rate": 7.336733673367338e-05, "loss": 0.4516, "mean_token_accuracy": 0.749949437379837, "step": 26640 }, { "epoch": 2.666, "grad_norm": 0.4464898109436035, "learning_rate": 7.334733473347335e-05, "loss": 0.2591, "mean_token_accuracy": 0.7780465215444565, "step": 26660 }, { "epoch": 2.668, "grad_norm": 0.7257695198059082, "learning_rate": 7.332733273327333e-05, "loss": 0.2896, "mean_token_accuracy": 0.7835281670093537, "step": 26680 }, { "epoch": 2.67, "grad_norm": 0.48731157183647156, "learning_rate": 7.330733073307331e-05, "loss": 0.4257, "mean_token_accuracy": 0.7797983884811401, "step": 26700 }, { "epoch": 2.672, "grad_norm": 0.4860582649707794, "learning_rate": 7.32873287328733e-05, "loss": 0.4173, "mean_token_accuracy": 0.7669709742069244, "step": 26720 }, { "epoch": 2.674, "grad_norm": 1.4737993478775024, "learning_rate": 7.326732673267327e-05, "loss": 0.444, "mean_token_accuracy": 0.7681554019451141, "step": 26740 }, { "epoch": 2.676, "grad_norm": 0.5065277814865112, "learning_rate": 7.324732473247325e-05, "loss": 0.3953, "mean_token_accuracy": 0.7700491368770599, "step": 26760 }, { "epoch": 2.678, "grad_norm": 0.7107218503952026, "learning_rate": 7.322732273227323e-05, "loss": 0.4232, "mean_token_accuracy": 0.7795166045427322, "step": 26780 }, { "epoch": 2.68, "grad_norm": 0.6404061317443848, "learning_rate": 7.320732073207322e-05, "loss": 0.4873, "mean_token_accuracy": 0.7603070676326752, "step": 26800 }, { "epoch": 2.682, "grad_norm": 0.46912986040115356, "learning_rate": 7.318731873187319e-05, "loss": 0.3791, "mean_token_accuracy": 0.7701680690050126, "step": 26820 }, { "epoch": 2.684, "grad_norm": 1.358063817024231, "learning_rate": 7.316731673167317e-05, "loss": 0.4789, "mean_token_accuracy": 0.7625478833913804, "step": 26840 }, { "epoch": 2.686, "grad_norm": 0.8823156952857971, "learning_rate": 7.314731473147315e-05, "loss": 0.351, "mean_token_accuracy": 0.7644151747226715, "step": 26860 }, { "epoch": 2.6879999999999997, "grad_norm": 0.5954269766807556, "learning_rate": 7.312731273127314e-05, "loss": 0.2884, "mean_token_accuracy": 0.7765515118837356, "step": 26880 }, { "epoch": 2.69, "grad_norm": 1.0419721603393555, "learning_rate": 7.31073107310731e-05, "loss": 0.4089, "mean_token_accuracy": 0.7772235095500946, "step": 26900 }, { "epoch": 2.692, "grad_norm": 0.6775176525115967, "learning_rate": 7.308730873087309e-05, "loss": 0.389, "mean_token_accuracy": 0.7501326829195023, "step": 26920 }, { "epoch": 2.694, "grad_norm": 0.5140560269355774, "learning_rate": 7.306730673067307e-05, "loss": 0.3611, "mean_token_accuracy": 0.7704172283411026, "step": 26940 }, { "epoch": 2.6959999999999997, "grad_norm": 0.5168219804763794, "learning_rate": 7.304730473047305e-05, "loss": 0.3947, "mean_token_accuracy": 0.7616788744926453, "step": 26960 }, { "epoch": 2.698, "grad_norm": 0.4534956216812134, "learning_rate": 7.302730273027302e-05, "loss": 0.3535, "mean_token_accuracy": 0.7744235098361969, "step": 26980 }, { "epoch": 2.7, "grad_norm": 0.6191295981407166, "learning_rate": 7.300730073007302e-05, "loss": 0.3258, "mean_token_accuracy": 0.768257737159729, "step": 27000 }, { "epoch": 2.702, "grad_norm": 2.6698319911956787, "learning_rate": 7.298729872987299e-05, "loss": 0.4991, "mean_token_accuracy": 0.7722083270549774, "step": 27020 }, { "epoch": 2.7039999999999997, "grad_norm": 0.5549486875534058, "learning_rate": 7.296729672967297e-05, "loss": 0.283, "mean_token_accuracy": 0.7857799857854844, "step": 27040 }, { "epoch": 2.706, "grad_norm": 0.8250360488891602, "learning_rate": 7.294729472947294e-05, "loss": 0.3988, "mean_token_accuracy": 0.762323135137558, "step": 27060 }, { "epoch": 2.708, "grad_norm": 1.1740707159042358, "learning_rate": 7.292729272927294e-05, "loss": 0.3359, "mean_token_accuracy": 0.791151362657547, "step": 27080 }, { "epoch": 2.71, "grad_norm": 0.6380830407142639, "learning_rate": 7.290729072907291e-05, "loss": 0.2793, "mean_token_accuracy": 0.7641856402158738, "step": 27100 }, { "epoch": 2.7119999999999997, "grad_norm": 0.46899616718292236, "learning_rate": 7.288728872887289e-05, "loss": 0.4544, "mean_token_accuracy": 0.7660612910985947, "step": 27120 }, { "epoch": 2.714, "grad_norm": 1.56208074092865, "learning_rate": 7.286728672867287e-05, "loss": 0.3706, "mean_token_accuracy": 0.766968160867691, "step": 27140 }, { "epoch": 2.716, "grad_norm": 0.4850318431854248, "learning_rate": 7.284728472847286e-05, "loss": 0.3678, "mean_token_accuracy": 0.7660909950733185, "step": 27160 }, { "epoch": 2.718, "grad_norm": 1.3090757131576538, "learning_rate": 7.282728272827283e-05, "loss": 0.2718, "mean_token_accuracy": 0.7895204186439514, "step": 27180 }, { "epoch": 2.7199999999999998, "grad_norm": 1.0180199146270752, "learning_rate": 7.280728072807281e-05, "loss": 0.6162, "mean_token_accuracy": 0.780162262916565, "step": 27200 }, { "epoch": 2.722, "grad_norm": 0.560012698173523, "learning_rate": 7.278727872787279e-05, "loss": 0.395, "mean_token_accuracy": 0.7876878052949905, "step": 27220 }, { "epoch": 2.724, "grad_norm": 1.2974014282226562, "learning_rate": 7.276727672767278e-05, "loss": 0.3956, "mean_token_accuracy": 0.7639310985803605, "step": 27240 }, { "epoch": 2.726, "grad_norm": 0.6483154892921448, "learning_rate": 7.274727472747275e-05, "loss": 0.4557, "mean_token_accuracy": 0.7672583818435669, "step": 27260 }, { "epoch": 2.7279999999999998, "grad_norm": 0.5086987614631653, "learning_rate": 7.272727272727273e-05, "loss": 0.3663, "mean_token_accuracy": 0.7773028880357742, "step": 27280 }, { "epoch": 2.73, "grad_norm": 0.9283174872398376, "learning_rate": 7.270727072707271e-05, "loss": 0.4399, "mean_token_accuracy": 0.774404090642929, "step": 27300 }, { "epoch": 2.732, "grad_norm": 1.2660390138626099, "learning_rate": 7.26872687268727e-05, "loss": 0.5054, "mean_token_accuracy": 0.759745791554451, "step": 27320 }, { "epoch": 2.734, "grad_norm": 0.5215126872062683, "learning_rate": 7.266726672667266e-05, "loss": 0.4834, "mean_token_accuracy": 0.7740251243114471, "step": 27340 }, { "epoch": 2.7359999999999998, "grad_norm": 0.5511738657951355, "learning_rate": 7.264726472647265e-05, "loss": 0.2723, "mean_token_accuracy": 0.7840337365865707, "step": 27360 }, { "epoch": 2.738, "grad_norm": 0.4827548563480377, "learning_rate": 7.262726272627263e-05, "loss": 0.2744, "mean_token_accuracy": 0.7842442274093628, "step": 27380 }, { "epoch": 2.74, "grad_norm": 0.5542432069778442, "learning_rate": 7.260726072607261e-05, "loss": 0.4504, "mean_token_accuracy": 0.7686699837446213, "step": 27400 }, { "epoch": 2.742, "grad_norm": 0.39758604764938354, "learning_rate": 7.258725872587258e-05, "loss": 0.3543, "mean_token_accuracy": 0.7873461782932282, "step": 27420 }, { "epoch": 2.7439999999999998, "grad_norm": 0.6692826747894287, "learning_rate": 7.256725672567257e-05, "loss": 0.3464, "mean_token_accuracy": 0.7726529002189636, "step": 27440 }, { "epoch": 2.746, "grad_norm": 0.5761163830757141, "learning_rate": 7.254725472547255e-05, "loss": 0.2799, "mean_token_accuracy": 0.7659371078014374, "step": 27460 }, { "epoch": 2.748, "grad_norm": 0.5442177653312683, "learning_rate": 7.252725272527253e-05, "loss": 0.3028, "mean_token_accuracy": 0.76364786028862, "step": 27480 }, { "epoch": 2.75, "grad_norm": 0.6498110890388489, "learning_rate": 7.25072507250725e-05, "loss": 0.4235, "mean_token_accuracy": 0.7845389604568481, "step": 27500 }, { "epoch": 2.752, "grad_norm": 0.9222553968429565, "learning_rate": 7.24872487248725e-05, "loss": 0.479, "mean_token_accuracy": 0.7736484676599502, "step": 27520 }, { "epoch": 2.754, "grad_norm": 1.0956486463546753, "learning_rate": 7.246724672467247e-05, "loss": 0.2925, "mean_token_accuracy": 0.7769776284694672, "step": 27540 }, { "epoch": 2.7560000000000002, "grad_norm": 1.1472057104110718, "learning_rate": 7.244724472447245e-05, "loss": 0.4873, "mean_token_accuracy": 0.7770749032497406, "step": 27560 }, { "epoch": 2.758, "grad_norm": 0.7564826607704163, "learning_rate": 7.242724272427242e-05, "loss": 0.4027, "mean_token_accuracy": 0.7681439131498337, "step": 27580 }, { "epoch": 2.76, "grad_norm": 1.3243197202682495, "learning_rate": 7.240724072407242e-05, "loss": 0.4241, "mean_token_accuracy": 0.7601159393787384, "step": 27600 }, { "epoch": 2.762, "grad_norm": 2.8252339363098145, "learning_rate": 7.238723872387239e-05, "loss": 0.4364, "mean_token_accuracy": 0.7626213759183884, "step": 27620 }, { "epoch": 2.7640000000000002, "grad_norm": 0.45203861594200134, "learning_rate": 7.236723672367237e-05, "loss": 0.304, "mean_token_accuracy": 0.7701959997415543, "step": 27640 }, { "epoch": 2.766, "grad_norm": 0.8971319198608398, "learning_rate": 7.234723472347235e-05, "loss": 0.3706, "mean_token_accuracy": 0.7761748850345611, "step": 27660 }, { "epoch": 2.768, "grad_norm": 0.5350444316864014, "learning_rate": 7.232723272327234e-05, "loss": 0.4354, "mean_token_accuracy": 0.7631947338581085, "step": 27680 }, { "epoch": 2.77, "grad_norm": 0.694657027721405, "learning_rate": 7.23072307230723e-05, "loss": 0.3038, "mean_token_accuracy": 0.7762582242488861, "step": 27700 }, { "epoch": 2.7720000000000002, "grad_norm": 0.43975430727005005, "learning_rate": 7.228722872287229e-05, "loss": 0.3589, "mean_token_accuracy": 0.7723548471927643, "step": 27720 }, { "epoch": 2.774, "grad_norm": 1.2447121143341064, "learning_rate": 7.226722672267227e-05, "loss": 0.4236, "mean_token_accuracy": 0.7593380987644196, "step": 27740 }, { "epoch": 2.776, "grad_norm": 0.42081525921821594, "learning_rate": 7.224722472247225e-05, "loss": 0.3771, "mean_token_accuracy": 0.7731598556041718, "step": 27760 }, { "epoch": 2.778, "grad_norm": 0.4513353407382965, "learning_rate": 7.222722272227222e-05, "loss": 0.2954, "mean_token_accuracy": 0.7759094476699829, "step": 27780 }, { "epoch": 2.7800000000000002, "grad_norm": 1.3491549491882324, "learning_rate": 7.22072207220722e-05, "loss": 0.4364, "mean_token_accuracy": 0.7733048111200332, "step": 27800 }, { "epoch": 2.782, "grad_norm": 0.5595263838768005, "learning_rate": 7.218721872187219e-05, "loss": 0.4291, "mean_token_accuracy": 0.774910768866539, "step": 27820 }, { "epoch": 2.784, "grad_norm": 0.6232470870018005, "learning_rate": 7.216721672167217e-05, "loss": 0.3549, "mean_token_accuracy": 0.7720575779676437, "step": 27840 }, { "epoch": 2.786, "grad_norm": 0.8599614500999451, "learning_rate": 7.214721472147216e-05, "loss": 0.4123, "mean_token_accuracy": 0.7681786835193634, "step": 27860 }, { "epoch": 2.7880000000000003, "grad_norm": 0.5995838046073914, "learning_rate": 7.212721272127212e-05, "loss": 0.2893, "mean_token_accuracy": 0.7671032309532165, "step": 27880 }, { "epoch": 2.79, "grad_norm": 0.5014781951904297, "learning_rate": 7.210721072107212e-05, "loss": 0.386, "mean_token_accuracy": 0.7631019920110702, "step": 27900 }, { "epoch": 2.792, "grad_norm": 0.7817425727844238, "learning_rate": 7.208720872087209e-05, "loss": 0.3149, "mean_token_accuracy": 0.787963005900383, "step": 27920 }, { "epoch": 2.794, "grad_norm": 1.5762746334075928, "learning_rate": 7.206720672067207e-05, "loss": 0.3508, "mean_token_accuracy": 0.7949762284755707, "step": 27940 }, { "epoch": 2.7960000000000003, "grad_norm": 0.462683767080307, "learning_rate": 7.204720472047204e-05, "loss": 0.402, "mean_token_accuracy": 0.7771514803171158, "step": 27960 }, { "epoch": 2.798, "grad_norm": 0.6354729533195496, "learning_rate": 7.202720272027204e-05, "loss": 0.4286, "mean_token_accuracy": 0.7566902726888657, "step": 27980 }, { "epoch": 2.8, "grad_norm": 0.6342338919639587, "learning_rate": 7.200720072007201e-05, "loss": 0.3632, "mean_token_accuracy": 0.7660365790128708, "step": 28000 }, { "epoch": 2.802, "grad_norm": 1.5562235116958618, "learning_rate": 7.198719871987199e-05, "loss": 0.4064, "mean_token_accuracy": 0.7778418600559235, "step": 28020 }, { "epoch": 2.8040000000000003, "grad_norm": 1.0036340951919556, "learning_rate": 7.196719671967198e-05, "loss": 0.4718, "mean_token_accuracy": 0.7611131757497788, "step": 28040 }, { "epoch": 2.806, "grad_norm": 0.502666175365448, "learning_rate": 7.194719471947196e-05, "loss": 0.3775, "mean_token_accuracy": 0.782520255446434, "step": 28060 }, { "epoch": 2.808, "grad_norm": 0.6750527620315552, "learning_rate": 7.192719271927193e-05, "loss": 0.2844, "mean_token_accuracy": 0.7718601375818253, "step": 28080 }, { "epoch": 2.81, "grad_norm": 0.5022746920585632, "learning_rate": 7.190719071907191e-05, "loss": 0.3401, "mean_token_accuracy": 0.7724752128124237, "step": 28100 }, { "epoch": 2.8120000000000003, "grad_norm": 0.42003539204597473, "learning_rate": 7.18871887188719e-05, "loss": 0.345, "mean_token_accuracy": 0.7880270779132843, "step": 28120 }, { "epoch": 2.814, "grad_norm": 0.9371370077133179, "learning_rate": 7.186718671867188e-05, "loss": 0.3041, "mean_token_accuracy": 0.7714037269353866, "step": 28140 }, { "epoch": 2.816, "grad_norm": 1.613385558128357, "learning_rate": 7.184718471847185e-05, "loss": 0.4236, "mean_token_accuracy": 0.7594305455684662, "step": 28160 }, { "epoch": 2.818, "grad_norm": 0.5815364122390747, "learning_rate": 7.182718271827183e-05, "loss": 0.3001, "mean_token_accuracy": 0.7842517197132111, "step": 28180 }, { "epoch": 2.82, "grad_norm": 1.7138980627059937, "learning_rate": 7.180718071807181e-05, "loss": 0.2973, "mean_token_accuracy": 0.767485749721527, "step": 28200 }, { "epoch": 2.822, "grad_norm": 0.41392603516578674, "learning_rate": 7.17871787178718e-05, "loss": 0.4238, "mean_token_accuracy": 0.7810722529888153, "step": 28220 }, { "epoch": 2.824, "grad_norm": 1.3356351852416992, "learning_rate": 7.176717671767177e-05, "loss": 0.3685, "mean_token_accuracy": 0.781608647108078, "step": 28240 }, { "epoch": 2.826, "grad_norm": 0.5949317812919617, "learning_rate": 7.174717471747175e-05, "loss": 0.3747, "mean_token_accuracy": 0.7789996057748795, "step": 28260 }, { "epoch": 2.828, "grad_norm": 1.7084662914276123, "learning_rate": 7.172717271727173e-05, "loss": 0.4797, "mean_token_accuracy": 0.7749371528625488, "step": 28280 }, { "epoch": 2.83, "grad_norm": 0.4826675355434418, "learning_rate": 7.170717071707171e-05, "loss": 0.287, "mean_token_accuracy": 0.7955493777990341, "step": 28300 }, { "epoch": 2.832, "grad_norm": 1.9523309469223022, "learning_rate": 7.168716871687168e-05, "loss": 0.3496, "mean_token_accuracy": 0.7700273483991623, "step": 28320 }, { "epoch": 2.834, "grad_norm": 0.4979017674922943, "learning_rate": 7.166716671667168e-05, "loss": 0.3516, "mean_token_accuracy": 0.7658978879451752, "step": 28340 }, { "epoch": 2.836, "grad_norm": 2.6267507076263428, "learning_rate": 7.164716471647165e-05, "loss": 0.4762, "mean_token_accuracy": 0.768436947464943, "step": 28360 }, { "epoch": 2.838, "grad_norm": 0.794539213180542, "learning_rate": 7.162716271627163e-05, "loss": 0.3491, "mean_token_accuracy": 0.7672697603702545, "step": 28380 }, { "epoch": 2.84, "grad_norm": 0.6585694551467896, "learning_rate": 7.16071607160716e-05, "loss": 0.3648, "mean_token_accuracy": 0.7717416197061538, "step": 28400 }, { "epoch": 2.842, "grad_norm": 0.5819289088249207, "learning_rate": 7.15871587158716e-05, "loss": 0.2814, "mean_token_accuracy": 0.785032108426094, "step": 28420 }, { "epoch": 2.844, "grad_norm": 0.7321308851242065, "learning_rate": 7.156715671567157e-05, "loss": 0.437, "mean_token_accuracy": 0.7718233793973923, "step": 28440 }, { "epoch": 2.846, "grad_norm": 0.687842607498169, "learning_rate": 7.154715471547155e-05, "loss": 0.383, "mean_token_accuracy": 0.7721839040517807, "step": 28460 }, { "epoch": 2.848, "grad_norm": 0.467720627784729, "learning_rate": 7.152715271527153e-05, "loss": 0.385, "mean_token_accuracy": 0.7696374982595444, "step": 28480 }, { "epoch": 2.85, "grad_norm": 0.5354492664337158, "learning_rate": 7.150715071507152e-05, "loss": 0.3679, "mean_token_accuracy": 0.7733211994171143, "step": 28500 }, { "epoch": 2.852, "grad_norm": 0.9242133498191833, "learning_rate": 7.148714871487149e-05, "loss": 0.4576, "mean_token_accuracy": 0.7596699327230454, "step": 28520 }, { "epoch": 2.854, "grad_norm": 0.592906653881073, "learning_rate": 7.146714671467147e-05, "loss": 0.3153, "mean_token_accuracy": 0.7795357018709183, "step": 28540 }, { "epoch": 2.856, "grad_norm": 0.5033730864524841, "learning_rate": 7.144714471447145e-05, "loss": 0.4533, "mean_token_accuracy": 0.7537955552339554, "step": 28560 }, { "epoch": 2.858, "grad_norm": 0.6340453028678894, "learning_rate": 7.142714271427144e-05, "loss": 0.3315, "mean_token_accuracy": 0.77965627014637, "step": 28580 }, { "epoch": 2.86, "grad_norm": 0.5199571251869202, "learning_rate": 7.14071407140714e-05, "loss": 0.3822, "mean_token_accuracy": 0.7712762296199799, "step": 28600 }, { "epoch": 2.862, "grad_norm": 1.1548068523406982, "learning_rate": 7.138713871387139e-05, "loss": 0.3697, "mean_token_accuracy": 0.7836301147937774, "step": 28620 }, { "epoch": 2.864, "grad_norm": 0.6470006704330444, "learning_rate": 7.136713671367137e-05, "loss": 0.4231, "mean_token_accuracy": 0.7825014770030976, "step": 28640 }, { "epoch": 2.866, "grad_norm": 0.7511778473854065, "learning_rate": 7.134713471347136e-05, "loss": 0.3952, "mean_token_accuracy": 0.7770122349262237, "step": 28660 }, { "epoch": 2.868, "grad_norm": 1.7539533376693726, "learning_rate": 7.132713271327132e-05, "loss": 0.4769, "mean_token_accuracy": 0.7669455885887146, "step": 28680 }, { "epoch": 2.87, "grad_norm": 0.6294025182723999, "learning_rate": 7.130713071307131e-05, "loss": 0.312, "mean_token_accuracy": 0.769847109913826, "step": 28700 }, { "epoch": 2.872, "grad_norm": 0.9348831176757812, "learning_rate": 7.128712871287129e-05, "loss": 0.4156, "mean_token_accuracy": 0.7744261264801026, "step": 28720 }, { "epoch": 2.874, "grad_norm": 0.557972252368927, "learning_rate": 7.126712671267127e-05, "loss": 0.3747, "mean_token_accuracy": 0.7840445935726166, "step": 28740 }, { "epoch": 2.876, "grad_norm": 0.5004600882530212, "learning_rate": 7.124712471247124e-05, "loss": 0.2381, "mean_token_accuracy": 0.785550233721733, "step": 28760 }, { "epoch": 2.878, "grad_norm": 0.4821445941925049, "learning_rate": 7.122712271227123e-05, "loss": 0.409, "mean_token_accuracy": 0.7781937032938003, "step": 28780 }, { "epoch": 2.88, "grad_norm": 0.43911656737327576, "learning_rate": 7.120712071207121e-05, "loss": 0.4779, "mean_token_accuracy": 0.7711127460002899, "step": 28800 }, { "epoch": 2.882, "grad_norm": 0.5811845660209656, "learning_rate": 7.118711871187119e-05, "loss": 0.2996, "mean_token_accuracy": 0.7792768210172654, "step": 28820 }, { "epoch": 2.884, "grad_norm": 0.4290788471698761, "learning_rate": 7.116711671167116e-05, "loss": 0.2654, "mean_token_accuracy": 0.7671212673187255, "step": 28840 }, { "epoch": 2.886, "grad_norm": 0.6822429299354553, "learning_rate": 7.114711471147116e-05, "loss": 0.3394, "mean_token_accuracy": 0.7853504598140717, "step": 28860 }, { "epoch": 2.888, "grad_norm": 0.9372721910476685, "learning_rate": 7.112711271127113e-05, "loss": 0.2918, "mean_token_accuracy": 0.7847874283790588, "step": 28880 }, { "epoch": 2.89, "grad_norm": 0.5561668276786804, "learning_rate": 7.110711071107111e-05, "loss": 0.3436, "mean_token_accuracy": 0.7817149579524993, "step": 28900 }, { "epoch": 2.892, "grad_norm": 0.5779550075531006, "learning_rate": 7.108710871087108e-05, "loss": 0.3408, "mean_token_accuracy": 0.7787374705076218, "step": 28920 }, { "epoch": 2.894, "grad_norm": 0.4636074900627136, "learning_rate": 7.106710671067108e-05, "loss": 0.3915, "mean_token_accuracy": 0.7747856616973877, "step": 28940 }, { "epoch": 2.896, "grad_norm": 0.5386895537376404, "learning_rate": 7.104710471047105e-05, "loss": 0.5283, "mean_token_accuracy": 0.7541032522916794, "step": 28960 }, { "epoch": 2.898, "grad_norm": 0.6066156625747681, "learning_rate": 7.102710271027103e-05, "loss": 0.258, "mean_token_accuracy": 0.7846490710973739, "step": 28980 }, { "epoch": 2.9, "grad_norm": 0.6210530400276184, "learning_rate": 7.100710071007101e-05, "loss": 0.3482, "mean_token_accuracy": 0.7796706467866897, "step": 29000 }, { "epoch": 2.902, "grad_norm": 1.3516981601715088, "learning_rate": 7.0987098709871e-05, "loss": 0.3957, "mean_token_accuracy": 0.7683951467275619, "step": 29020 }, { "epoch": 2.904, "grad_norm": 0.45656633377075195, "learning_rate": 7.096709670967097e-05, "loss": 0.4647, "mean_token_accuracy": 0.746124392747879, "step": 29040 }, { "epoch": 2.906, "grad_norm": 1.8548595905303955, "learning_rate": 7.094709470947095e-05, "loss": 0.459, "mean_token_accuracy": 0.7573868215084076, "step": 29060 }, { "epoch": 2.908, "grad_norm": 0.5436551570892334, "learning_rate": 7.092709270927093e-05, "loss": 0.4203, "mean_token_accuracy": 0.7556138038635254, "step": 29080 }, { "epoch": 2.91, "grad_norm": 2.6477746963500977, "learning_rate": 7.090709070907091e-05, "loss": 0.4763, "mean_token_accuracy": 0.7623636335134506, "step": 29100 }, { "epoch": 2.912, "grad_norm": 0.4993080198764801, "learning_rate": 7.088708870887088e-05, "loss": 0.4514, "mean_token_accuracy": 0.7795067846775054, "step": 29120 }, { "epoch": 2.914, "grad_norm": 0.4687429368495941, "learning_rate": 7.086708670867087e-05, "loss": 0.3492, "mean_token_accuracy": 0.7548551291227341, "step": 29140 }, { "epoch": 2.916, "grad_norm": 0.5505796074867249, "learning_rate": 7.084708470847085e-05, "loss": 0.2811, "mean_token_accuracy": 0.7841164588928222, "step": 29160 }, { "epoch": 2.918, "grad_norm": 0.6359449625015259, "learning_rate": 7.082708270827083e-05, "loss": 0.2608, "mean_token_accuracy": 0.7749526292085648, "step": 29180 }, { "epoch": 2.92, "grad_norm": 1.4524582624435425, "learning_rate": 7.08070807080708e-05, "loss": 0.4608, "mean_token_accuracy": 0.7488138735294342, "step": 29200 }, { "epoch": 2.922, "grad_norm": 0.45987236499786377, "learning_rate": 7.078707870787079e-05, "loss": 0.3522, "mean_token_accuracy": 0.783650141954422, "step": 29220 }, { "epoch": 2.924, "grad_norm": 0.704616367816925, "learning_rate": 7.076707670767077e-05, "loss": 0.5268, "mean_token_accuracy": 0.7646976262331009, "step": 29240 }, { "epoch": 2.926, "grad_norm": 0.9832669496536255, "learning_rate": 7.074707470747075e-05, "loss": 0.3144, "mean_token_accuracy": 0.7882678776979446, "step": 29260 }, { "epoch": 2.928, "grad_norm": 0.5674408674240112, "learning_rate": 7.072707270727072e-05, "loss": 0.3663, "mean_token_accuracy": 0.7732463270425797, "step": 29280 }, { "epoch": 2.93, "grad_norm": 0.5252720713615417, "learning_rate": 7.07070707070707e-05, "loss": 0.369, "mean_token_accuracy": 0.7711725175380707, "step": 29300 }, { "epoch": 2.932, "grad_norm": 0.484718382358551, "learning_rate": 7.068706870687069e-05, "loss": 0.3073, "mean_token_accuracy": 0.7823656350374222, "step": 29320 }, { "epoch": 2.934, "grad_norm": 0.6399009823799133, "learning_rate": 7.066706670667067e-05, "loss": 0.3456, "mean_token_accuracy": 0.77097889482975, "step": 29340 }, { "epoch": 2.936, "grad_norm": 1.2482736110687256, "learning_rate": 7.064706470647065e-05, "loss": 0.4507, "mean_token_accuracy": 0.7626610308885574, "step": 29360 }, { "epoch": 2.9379999999999997, "grad_norm": 0.42354458570480347, "learning_rate": 7.062706270627064e-05, "loss": 0.3436, "mean_token_accuracy": 0.7688536554574966, "step": 29380 }, { "epoch": 2.94, "grad_norm": 0.49689558148384094, "learning_rate": 7.060706070607062e-05, "loss": 0.341, "mean_token_accuracy": 0.7759579241275787, "step": 29400 }, { "epoch": 2.942, "grad_norm": 0.5098229646682739, "learning_rate": 7.058705870587059e-05, "loss": 0.3432, "mean_token_accuracy": 0.7684891521930695, "step": 29420 }, { "epoch": 2.944, "grad_norm": 2.4646825790405273, "learning_rate": 7.056705670567057e-05, "loss": 0.302, "mean_token_accuracy": 0.7901662707328796, "step": 29440 }, { "epoch": 2.9459999999999997, "grad_norm": 0.652504026889801, "learning_rate": 7.054705470547056e-05, "loss": 0.3554, "mean_token_accuracy": 0.7735525876283645, "step": 29460 }, { "epoch": 2.948, "grad_norm": 0.7552750706672668, "learning_rate": 7.052705270527054e-05, "loss": 0.3719, "mean_token_accuracy": 0.7647523492574692, "step": 29480 }, { "epoch": 2.95, "grad_norm": 0.4699416160583496, "learning_rate": 7.050705070507051e-05, "loss": 0.3537, "mean_token_accuracy": 0.7859277874231339, "step": 29500 }, { "epoch": 2.952, "grad_norm": 0.537209689617157, "learning_rate": 7.048704870487049e-05, "loss": 0.4573, "mean_token_accuracy": 0.7747603535652161, "step": 29520 }, { "epoch": 2.9539999999999997, "grad_norm": 0.882503867149353, "learning_rate": 7.046704670467047e-05, "loss": 0.3819, "mean_token_accuracy": 0.7691424667835236, "step": 29540 }, { "epoch": 2.956, "grad_norm": 0.6148021817207336, "learning_rate": 7.044704470447046e-05, "loss": 0.2779, "mean_token_accuracy": 0.7594513714313507, "step": 29560 }, { "epoch": 2.958, "grad_norm": 0.6376790404319763, "learning_rate": 7.042704270427043e-05, "loss": 0.3431, "mean_token_accuracy": 0.7731406152248382, "step": 29580 }, { "epoch": 2.96, "grad_norm": 0.5759845972061157, "learning_rate": 7.040704070407041e-05, "loss": 0.2554, "mean_token_accuracy": 0.7999852210283279, "step": 29600 }, { "epoch": 2.9619999999999997, "grad_norm": 0.6887943148612976, "learning_rate": 7.038703870387039e-05, "loss": 0.3686, "mean_token_accuracy": 0.7654375731945038, "step": 29620 }, { "epoch": 2.964, "grad_norm": 0.6170690655708313, "learning_rate": 7.036703670367038e-05, "loss": 0.3868, "mean_token_accuracy": 0.7784854412078858, "step": 29640 }, { "epoch": 2.966, "grad_norm": 0.6125934720039368, "learning_rate": 7.034703470347034e-05, "loss": 0.3403, "mean_token_accuracy": 0.7878391832113266, "step": 29660 }, { "epoch": 2.968, "grad_norm": 0.7866735458374023, "learning_rate": 7.032703270327033e-05, "loss": 0.566, "mean_token_accuracy": 0.7520978480577469, "step": 29680 }, { "epoch": 2.9699999999999998, "grad_norm": 0.8195043802261353, "learning_rate": 7.030703070307031e-05, "loss": 0.3987, "mean_token_accuracy": 0.7684801012277603, "step": 29700 }, { "epoch": 2.972, "grad_norm": 0.7373509407043457, "learning_rate": 7.02870287028703e-05, "loss": 0.3848, "mean_token_accuracy": 0.7812507718801498, "step": 29720 }, { "epoch": 2.974, "grad_norm": 1.1405839920043945, "learning_rate": 7.026702670267026e-05, "loss": 0.3524, "mean_token_accuracy": 0.7791335344314575, "step": 29740 }, { "epoch": 2.976, "grad_norm": 0.727328896522522, "learning_rate": 7.024702470247026e-05, "loss": 0.3239, "mean_token_accuracy": 0.7713832348585129, "step": 29760 }, { "epoch": 2.9779999999999998, "grad_norm": 0.8749078512191772, "learning_rate": 7.022702270227023e-05, "loss": 0.3384, "mean_token_accuracy": 0.7767517447471619, "step": 29780 }, { "epoch": 2.98, "grad_norm": 0.5280575156211853, "learning_rate": 7.020702070207021e-05, "loss": 0.359, "mean_token_accuracy": 0.78366779088974, "step": 29800 }, { "epoch": 2.982, "grad_norm": 0.5950624346733093, "learning_rate": 7.018701870187018e-05, "loss": 0.3674, "mean_token_accuracy": 0.7600268334150314, "step": 29820 }, { "epoch": 2.984, "grad_norm": 0.6062999367713928, "learning_rate": 7.016701670167018e-05, "loss": 0.347, "mean_token_accuracy": 0.7832524925470352, "step": 29840 }, { "epoch": 2.9859999999999998, "grad_norm": 0.7782748341560364, "learning_rate": 7.014701470147015e-05, "loss": 0.4285, "mean_token_accuracy": 0.7678215235471726, "step": 29860 }, { "epoch": 2.988, "grad_norm": 0.5775238275527954, "learning_rate": 7.012701270127013e-05, "loss": 0.3229, "mean_token_accuracy": 0.7661768585443497, "step": 29880 }, { "epoch": 2.99, "grad_norm": 5.812900066375732, "learning_rate": 7.010701070107011e-05, "loss": 0.3487, "mean_token_accuracy": 0.7876387029886246, "step": 29900 }, { "epoch": 2.992, "grad_norm": 0.5200148820877075, "learning_rate": 7.00870087008701e-05, "loss": 0.4055, "mean_token_accuracy": 0.7761238396167756, "step": 29920 }, { "epoch": 2.9939999999999998, "grad_norm": 0.551474392414093, "learning_rate": 7.006700670067007e-05, "loss": 0.384, "mean_token_accuracy": 0.7674370169639587, "step": 29940 }, { "epoch": 2.996, "grad_norm": 0.53330397605896, "learning_rate": 7.004700470047005e-05, "loss": 0.2773, "mean_token_accuracy": 0.7833516508340835, "step": 29960 }, { "epoch": 2.998, "grad_norm": 0.4471687376499176, "learning_rate": 7.002700270027003e-05, "loss": 0.3384, "mean_token_accuracy": 0.7746705383062362, "step": 29980 }, { "epoch": 3.0, "grad_norm": 0.5775869488716125, "learning_rate": 7.000700070007002e-05, "loss": 0.378, "mean_token_accuracy": 0.7711824506521225, "step": 30000 }, { "epoch": 3.002, "grad_norm": 0.4446943402290344, "learning_rate": 6.998699869986999e-05, "loss": 0.2419, "mean_token_accuracy": 0.8940179228782654, "step": 30020 }, { "epoch": 3.004, "grad_norm": 0.589713990688324, "learning_rate": 6.996699669966997e-05, "loss": 0.3808, "mean_token_accuracy": 0.8738609790802002, "step": 30040 }, { "epoch": 3.006, "grad_norm": 0.5511112213134766, "learning_rate": 6.994699469946995e-05, "loss": 0.2488, "mean_token_accuracy": 0.8833417445421219, "step": 30060 }, { "epoch": 3.008, "grad_norm": 0.5566015243530273, "learning_rate": 6.992699269926993e-05, "loss": 0.3057, "mean_token_accuracy": 0.8891569316387177, "step": 30080 }, { "epoch": 3.01, "grad_norm": 0.5168930888175964, "learning_rate": 6.99069906990699e-05, "loss": 0.3989, "mean_token_accuracy": 0.881235858798027, "step": 30100 }, { "epoch": 3.012, "grad_norm": 1.6312882900238037, "learning_rate": 6.988698869886989e-05, "loss": 0.3986, "mean_token_accuracy": 0.8753060877323151, "step": 30120 }, { "epoch": 3.014, "grad_norm": 0.6844755411148071, "learning_rate": 6.986698669866987e-05, "loss": 0.2699, "mean_token_accuracy": 0.8707124412059783, "step": 30140 }, { "epoch": 3.016, "grad_norm": 0.6394733786582947, "learning_rate": 6.984698469846985e-05, "loss": 0.273, "mean_token_accuracy": 0.8865856885910034, "step": 30160 }, { "epoch": 3.018, "grad_norm": 0.4774573743343353, "learning_rate": 6.982698269826982e-05, "loss": 0.4003, "mean_token_accuracy": 0.8549665629863739, "step": 30180 }, { "epoch": 3.02, "grad_norm": 1.0644994974136353, "learning_rate": 6.980698069806982e-05, "loss": 0.4169, "mean_token_accuracy": 0.8710037410259247, "step": 30200 }, { "epoch": 3.022, "grad_norm": 0.5435361862182617, "learning_rate": 6.978697869786979e-05, "loss": 0.2722, "mean_token_accuracy": 0.8864139586687088, "step": 30220 }, { "epoch": 3.024, "grad_norm": 0.5678538084030151, "learning_rate": 6.976697669766977e-05, "loss": 0.3195, "mean_token_accuracy": 0.8737633764743805, "step": 30240 }, { "epoch": 3.026, "grad_norm": 1.1056652069091797, "learning_rate": 6.974697469746974e-05, "loss": 0.4225, "mean_token_accuracy": 0.8581416666507721, "step": 30260 }, { "epoch": 3.028, "grad_norm": 0.6666806936264038, "learning_rate": 6.972697269726974e-05, "loss": 0.1886, "mean_token_accuracy": 0.8845161348581314, "step": 30280 }, { "epoch": 3.03, "grad_norm": 1.4905072450637817, "learning_rate": 6.970697069706971e-05, "loss": 0.3257, "mean_token_accuracy": 0.8778342545032501, "step": 30300 }, { "epoch": 3.032, "grad_norm": 0.5367848873138428, "learning_rate": 6.968696869686969e-05, "loss": 0.4647, "mean_token_accuracy": 0.8671422243118286, "step": 30320 }, { "epoch": 3.034, "grad_norm": 0.5310669541358948, "learning_rate": 6.966696669666966e-05, "loss": 0.3434, "mean_token_accuracy": 0.8816934525966644, "step": 30340 }, { "epoch": 3.036, "grad_norm": 0.7137521505355835, "learning_rate": 6.964696469646966e-05, "loss": 0.3825, "mean_token_accuracy": 0.8620676070451736, "step": 30360 }, { "epoch": 3.038, "grad_norm": 0.6583218574523926, "learning_rate": 6.962696269626963e-05, "loss": 0.4113, "mean_token_accuracy": 0.8753404080867767, "step": 30380 }, { "epoch": 3.04, "grad_norm": 0.6597818732261658, "learning_rate": 6.960696069606961e-05, "loss": 0.2795, "mean_token_accuracy": 0.8947083920240402, "step": 30400 }, { "epoch": 3.042, "grad_norm": 0.903576672077179, "learning_rate": 6.958695869586959e-05, "loss": 0.2205, "mean_token_accuracy": 0.8913400024175644, "step": 30420 }, { "epoch": 3.044, "grad_norm": 1.3384283781051636, "learning_rate": 6.956695669566958e-05, "loss": 0.3468, "mean_token_accuracy": 0.8796342521905899, "step": 30440 }, { "epoch": 3.046, "grad_norm": 0.5625686645507812, "learning_rate": 6.954695469546954e-05, "loss": 0.3629, "mean_token_accuracy": 0.8685958951711654, "step": 30460 }, { "epoch": 3.048, "grad_norm": 1.1122145652770996, "learning_rate": 6.952695269526953e-05, "loss": 0.3564, "mean_token_accuracy": 0.8692611068487167, "step": 30480 }, { "epoch": 3.05, "grad_norm": 0.43822211027145386, "learning_rate": 6.950695069506951e-05, "loss": 0.3822, "mean_token_accuracy": 0.8811999559402466, "step": 30500 }, { "epoch": 3.052, "grad_norm": 0.5918541550636292, "learning_rate": 6.94869486948695e-05, "loss": 0.223, "mean_token_accuracy": 0.8870831936597824, "step": 30520 }, { "epoch": 3.054, "grad_norm": 0.7326030731201172, "learning_rate": 6.946694669466946e-05, "loss": 0.3167, "mean_token_accuracy": 0.8983963459730149, "step": 30540 }, { "epoch": 3.056, "grad_norm": 0.9779550433158875, "learning_rate": 6.944694469446945e-05, "loss": 0.3091, "mean_token_accuracy": 0.8789362907409668, "step": 30560 }, { "epoch": 3.058, "grad_norm": 0.6323958039283752, "learning_rate": 6.942694269426943e-05, "loss": 0.2701, "mean_token_accuracy": 0.8802805006504059, "step": 30580 }, { "epoch": 3.06, "grad_norm": 1.0300406217575073, "learning_rate": 6.940694069406941e-05, "loss": 0.2943, "mean_token_accuracy": 0.8776756972074509, "step": 30600 }, { "epoch": 3.062, "grad_norm": 0.6279745101928711, "learning_rate": 6.938693869386938e-05, "loss": 0.3432, "mean_token_accuracy": 0.8679916203022003, "step": 30620 }, { "epoch": 3.064, "grad_norm": 0.5251446962356567, "learning_rate": 6.936693669366937e-05, "loss": 0.1993, "mean_token_accuracy": 0.8695676863193512, "step": 30640 }, { "epoch": 3.066, "grad_norm": 0.7378559112548828, "learning_rate": 6.934693469346935e-05, "loss": 0.3046, "mean_token_accuracy": 0.8892906874418258, "step": 30660 }, { "epoch": 3.068, "grad_norm": 0.44998326897621155, "learning_rate": 6.932693269326933e-05, "loss": 0.3066, "mean_token_accuracy": 0.8700879842042923, "step": 30680 }, { "epoch": 3.07, "grad_norm": 1.4374336004257202, "learning_rate": 6.93069306930693e-05, "loss": 0.2892, "mean_token_accuracy": 0.8796122580766678, "step": 30700 }, { "epoch": 3.072, "grad_norm": 0.5517739653587341, "learning_rate": 6.92869286928693e-05, "loss": 0.4494, "mean_token_accuracy": 0.8722359389066696, "step": 30720 }, { "epoch": 3.074, "grad_norm": 0.8692848682403564, "learning_rate": 6.926692669266927e-05, "loss": 0.4649, "mean_token_accuracy": 0.8660543709993362, "step": 30740 }, { "epoch": 3.076, "grad_norm": 0.6221420764923096, "learning_rate": 6.924692469246925e-05, "loss": 0.3296, "mean_token_accuracy": 0.8683126717805862, "step": 30760 }, { "epoch": 3.078, "grad_norm": 1.4160009622573853, "learning_rate": 6.922692269226922e-05, "loss": 0.3392, "mean_token_accuracy": 0.8777949571609497, "step": 30780 }, { "epoch": 3.08, "grad_norm": 2.138057231903076, "learning_rate": 6.920692069206922e-05, "loss": 0.3653, "mean_token_accuracy": 0.8816106200218201, "step": 30800 }, { "epoch": 3.082, "grad_norm": 0.8248131275177002, "learning_rate": 6.918691869186919e-05, "loss": 0.2691, "mean_token_accuracy": 0.8846494913101196, "step": 30820 }, { "epoch": 3.084, "grad_norm": 0.6675848364830017, "learning_rate": 6.916691669166917e-05, "loss": 0.2724, "mean_token_accuracy": 0.8729799687862396, "step": 30840 }, { "epoch": 3.086, "grad_norm": 1.41282320022583, "learning_rate": 6.914691469146915e-05, "loss": 0.3073, "mean_token_accuracy": 0.8794100463390351, "step": 30860 }, { "epoch": 3.088, "grad_norm": 1.0014463663101196, "learning_rate": 6.912691269126913e-05, "loss": 0.2718, "mean_token_accuracy": 0.8761445343494415, "step": 30880 }, { "epoch": 3.09, "grad_norm": 0.6354532241821289, "learning_rate": 6.910691069106912e-05, "loss": 0.3357, "mean_token_accuracy": 0.8811373919248581, "step": 30900 }, { "epoch": 3.092, "grad_norm": 0.5817384123802185, "learning_rate": 6.908690869086909e-05, "loss": 0.3427, "mean_token_accuracy": 0.8551957756280899, "step": 30920 }, { "epoch": 3.094, "grad_norm": 0.815639853477478, "learning_rate": 6.906690669066907e-05, "loss": 0.4141, "mean_token_accuracy": 0.8680730432271957, "step": 30940 }, { "epoch": 3.096, "grad_norm": 1.135587215423584, "learning_rate": 6.904690469046905e-05, "loss": 0.2893, "mean_token_accuracy": 0.863542228937149, "step": 30960 }, { "epoch": 3.098, "grad_norm": 0.6221270561218262, "learning_rate": 6.902690269026904e-05, "loss": 0.3319, "mean_token_accuracy": 0.8898957341909408, "step": 30980 }, { "epoch": 3.1, "grad_norm": 0.4705598056316376, "learning_rate": 6.9006900690069e-05, "loss": 0.3316, "mean_token_accuracy": 0.8745740950107574, "step": 31000 }, { "epoch": 3.102, "grad_norm": 1.708646297454834, "learning_rate": 6.898689868986899e-05, "loss": 0.4898, "mean_token_accuracy": 0.8595007091760636, "step": 31020 }, { "epoch": 3.104, "grad_norm": 0.5904032588005066, "learning_rate": 6.896689668966897e-05, "loss": 0.3448, "mean_token_accuracy": 0.8808046549558639, "step": 31040 }, { "epoch": 3.106, "grad_norm": 0.6003983616828918, "learning_rate": 6.894689468946895e-05, "loss": 0.2369, "mean_token_accuracy": 0.8761374026536941, "step": 31060 }, { "epoch": 3.108, "grad_norm": 0.601841926574707, "learning_rate": 6.892689268926892e-05, "loss": 0.3204, "mean_token_accuracy": 0.8575987845659256, "step": 31080 }, { "epoch": 3.11, "grad_norm": 0.557679295539856, "learning_rate": 6.890689068906892e-05, "loss": 0.3159, "mean_token_accuracy": 0.8722812563180924, "step": 31100 }, { "epoch": 3.112, "grad_norm": 0.6948885917663574, "learning_rate": 6.888688868886889e-05, "loss": 0.4033, "mean_token_accuracy": 0.86625657081604, "step": 31120 }, { "epoch": 3.114, "grad_norm": 0.5296452641487122, "learning_rate": 6.886688668866887e-05, "loss": 0.3705, "mean_token_accuracy": 0.8672240197658538, "step": 31140 }, { "epoch": 3.116, "grad_norm": 0.5692376494407654, "learning_rate": 6.884688468846884e-05, "loss": 0.3056, "mean_token_accuracy": 0.8683233261108398, "step": 31160 }, { "epoch": 3.118, "grad_norm": 0.7813221216201782, "learning_rate": 6.882688268826884e-05, "loss": 0.3286, "mean_token_accuracy": 0.8799499064683914, "step": 31180 }, { "epoch": 3.12, "grad_norm": 0.5804118514060974, "learning_rate": 6.880688068806881e-05, "loss": 0.3316, "mean_token_accuracy": 0.8689316153526306, "step": 31200 }, { "epoch": 3.122, "grad_norm": 2.266254425048828, "learning_rate": 6.878687868786879e-05, "loss": 0.2777, "mean_token_accuracy": 0.8749155849218369, "step": 31220 }, { "epoch": 3.124, "grad_norm": 0.5913439989089966, "learning_rate": 6.876687668766878e-05, "loss": 0.3759, "mean_token_accuracy": 0.8673680126667023, "step": 31240 }, { "epoch": 3.126, "grad_norm": 1.045470952987671, "learning_rate": 6.874687468746876e-05, "loss": 0.3065, "mean_token_accuracy": 0.8784758239984513, "step": 31260 }, { "epoch": 3.128, "grad_norm": 0.5276808738708496, "learning_rate": 6.872687268726873e-05, "loss": 0.293, "mean_token_accuracy": 0.8837313860654831, "step": 31280 }, { "epoch": 3.13, "grad_norm": 0.5212607383728027, "learning_rate": 6.870687068706871e-05, "loss": 0.3172, "mean_token_accuracy": 0.8834840536117554, "step": 31300 }, { "epoch": 3.132, "grad_norm": 0.6665724515914917, "learning_rate": 6.86868686868687e-05, "loss": 0.5095, "mean_token_accuracy": 0.882305771112442, "step": 31320 }, { "epoch": 3.134, "grad_norm": 0.8061319589614868, "learning_rate": 6.866686668666868e-05, "loss": 0.233, "mean_token_accuracy": 0.8831484973430633, "step": 31340 }, { "epoch": 3.136, "grad_norm": 0.4372413456439972, "learning_rate": 6.864686468646865e-05, "loss": 0.3147, "mean_token_accuracy": 0.8648031115531921, "step": 31360 }, { "epoch": 3.138, "grad_norm": 0.615067183971405, "learning_rate": 6.862686268626863e-05, "loss": 0.2843, "mean_token_accuracy": 0.8740931957960129, "step": 31380 }, { "epoch": 3.14, "grad_norm": 0.6272055506706238, "learning_rate": 6.860686068606861e-05, "loss": 0.3169, "mean_token_accuracy": 0.8732999175786972, "step": 31400 }, { "epoch": 3.142, "grad_norm": 0.6079870462417603, "learning_rate": 6.85868586858686e-05, "loss": 0.3497, "mean_token_accuracy": 0.8688756406307221, "step": 31420 }, { "epoch": 3.144, "grad_norm": 0.6516059041023254, "learning_rate": 6.856685668566857e-05, "loss": 0.3852, "mean_token_accuracy": 0.8536276936531066, "step": 31440 }, { "epoch": 3.146, "grad_norm": 2.3802788257598877, "learning_rate": 6.854685468546855e-05, "loss": 0.3, "mean_token_accuracy": 0.8663832038640976, "step": 31460 }, { "epoch": 3.148, "grad_norm": 0.6777468919754028, "learning_rate": 6.852685268526853e-05, "loss": 0.2458, "mean_token_accuracy": 0.8717755764722824, "step": 31480 }, { "epoch": 3.15, "grad_norm": 1.5665737390518188, "learning_rate": 6.850685068506851e-05, "loss": 0.2836, "mean_token_accuracy": 0.8825663805007935, "step": 31500 }, { "epoch": 3.152, "grad_norm": 0.5585176348686218, "learning_rate": 6.848684868486848e-05, "loss": 0.2067, "mean_token_accuracy": 0.8705617815256119, "step": 31520 }, { "epoch": 3.154, "grad_norm": 0.6781510710716248, "learning_rate": 6.846684668466847e-05, "loss": 0.3636, "mean_token_accuracy": 0.8847904294729233, "step": 31540 }, { "epoch": 3.156, "grad_norm": 0.7589893937110901, "learning_rate": 6.844684468446845e-05, "loss": 0.2647, "mean_token_accuracy": 0.8763284355401992, "step": 31560 }, { "epoch": 3.158, "grad_norm": 1.7737735509872437, "learning_rate": 6.842684268426843e-05, "loss": 0.2994, "mean_token_accuracy": 0.8762444645166397, "step": 31580 }, { "epoch": 3.16, "grad_norm": 0.5563002824783325, "learning_rate": 6.84068406840684e-05, "loss": 0.3177, "mean_token_accuracy": 0.8711637914180755, "step": 31600 }, { "epoch": 3.162, "grad_norm": 0.6063830256462097, "learning_rate": 6.83868386838684e-05, "loss": 0.3124, "mean_token_accuracy": 0.8696059167385102, "step": 31620 }, { "epoch": 3.164, "grad_norm": 0.6271325349807739, "learning_rate": 6.836683668366837e-05, "loss": 0.2907, "mean_token_accuracy": 0.875458762049675, "step": 31640 }, { "epoch": 3.166, "grad_norm": 0.5615509152412415, "learning_rate": 6.834683468346835e-05, "loss": 0.251, "mean_token_accuracy": 0.8840543121099472, "step": 31660 }, { "epoch": 3.168, "grad_norm": 1.3212860822677612, "learning_rate": 6.832683268326832e-05, "loss": 0.3658, "mean_token_accuracy": 0.8672939240932465, "step": 31680 }, { "epoch": 3.17, "grad_norm": 0.6966462135314941, "learning_rate": 6.830683068306832e-05, "loss": 0.4266, "mean_token_accuracy": 0.8688521385192871, "step": 31700 }, { "epoch": 3.172, "grad_norm": 0.62582927942276, "learning_rate": 6.828682868286829e-05, "loss": 0.2026, "mean_token_accuracy": 0.8704431474208831, "step": 31720 }, { "epoch": 3.174, "grad_norm": 0.5975244045257568, "learning_rate": 6.826682668266827e-05, "loss": 0.2384, "mean_token_accuracy": 0.8772104680538177, "step": 31740 }, { "epoch": 3.176, "grad_norm": 0.5999241471290588, "learning_rate": 6.824682468246825e-05, "loss": 0.2334, "mean_token_accuracy": 0.8863313317298889, "step": 31760 }, { "epoch": 3.178, "grad_norm": 0.5892619490623474, "learning_rate": 6.822682268226824e-05, "loss": 0.2677, "mean_token_accuracy": 0.866864088177681, "step": 31780 }, { "epoch": 3.18, "grad_norm": 0.6507098078727722, "learning_rate": 6.82068206820682e-05, "loss": 0.3714, "mean_token_accuracy": 0.8630100786685944, "step": 31800 }, { "epoch": 3.182, "grad_norm": 0.4704149067401886, "learning_rate": 6.818681868186819e-05, "loss": 0.2627, "mean_token_accuracy": 0.8675028353929519, "step": 31820 }, { "epoch": 3.184, "grad_norm": 0.47879838943481445, "learning_rate": 6.816681668166817e-05, "loss": 0.2332, "mean_token_accuracy": 0.8842342138290405, "step": 31840 }, { "epoch": 3.186, "grad_norm": 0.6302092671394348, "learning_rate": 6.814681468146815e-05, "loss": 0.2445, "mean_token_accuracy": 0.8777852922677993, "step": 31860 }, { "epoch": 3.188, "grad_norm": 1.5597740411758423, "learning_rate": 6.812681268126812e-05, "loss": 0.326, "mean_token_accuracy": 0.8615997105836868, "step": 31880 }, { "epoch": 3.19, "grad_norm": 0.5620926022529602, "learning_rate": 6.810681068106811e-05, "loss": 0.3654, "mean_token_accuracy": 0.8747089684009552, "step": 31900 }, { "epoch": 3.192, "grad_norm": 0.6851575970649719, "learning_rate": 6.808680868086809e-05, "loss": 0.2631, "mean_token_accuracy": 0.8724864691495895, "step": 31920 }, { "epoch": 3.194, "grad_norm": 0.6946195960044861, "learning_rate": 6.806680668066807e-05, "loss": 0.3168, "mean_token_accuracy": 0.8730955719947815, "step": 31940 }, { "epoch": 3.196, "grad_norm": 2.9120144844055176, "learning_rate": 6.804680468046804e-05, "loss": 0.3712, "mean_token_accuracy": 0.8792525619268418, "step": 31960 }, { "epoch": 3.198, "grad_norm": 0.6314313411712646, "learning_rate": 6.802680268026803e-05, "loss": 0.368, "mean_token_accuracy": 0.8658206194639206, "step": 31980 }, { "epoch": 3.2, "grad_norm": 0.513914167881012, "learning_rate": 6.800680068006801e-05, "loss": 0.3739, "mean_token_accuracy": 0.8794240772724151, "step": 32000 }, { "epoch": 3.202, "grad_norm": 0.8265764117240906, "learning_rate": 6.798679867986799e-05, "loss": 0.3145, "mean_token_accuracy": 0.8657783746719361, "step": 32020 }, { "epoch": 3.204, "grad_norm": 0.5957169532775879, "learning_rate": 6.796679667966796e-05, "loss": 0.3282, "mean_token_accuracy": 0.8700960218906403, "step": 32040 }, { "epoch": 3.206, "grad_norm": 1.01340651512146, "learning_rate": 6.794679467946794e-05, "loss": 0.4153, "mean_token_accuracy": 0.8772399663925171, "step": 32060 }, { "epoch": 3.208, "grad_norm": 0.5049470067024231, "learning_rate": 6.792679267926793e-05, "loss": 0.255, "mean_token_accuracy": 0.8855417788028717, "step": 32080 }, { "epoch": 3.21, "grad_norm": 0.5378025770187378, "learning_rate": 6.790679067906791e-05, "loss": 0.316, "mean_token_accuracy": 0.8706786960363389, "step": 32100 }, { "epoch": 3.212, "grad_norm": 0.6185476779937744, "learning_rate": 6.788678867886788e-05, "loss": 0.3767, "mean_token_accuracy": 0.8655941814184189, "step": 32120 }, { "epoch": 3.214, "grad_norm": 0.6312613487243652, "learning_rate": 6.786678667866788e-05, "loss": 0.3454, "mean_token_accuracy": 0.8595848381519318, "step": 32140 }, { "epoch": 3.216, "grad_norm": 1.0758951902389526, "learning_rate": 6.784678467846785e-05, "loss": 0.4596, "mean_token_accuracy": 0.8662818461656571, "step": 32160 }, { "epoch": 3.218, "grad_norm": 0.7507582902908325, "learning_rate": 6.782678267826783e-05, "loss": 0.2959, "mean_token_accuracy": 0.8643357992172241, "step": 32180 }, { "epoch": 3.22, "grad_norm": 0.5485620498657227, "learning_rate": 6.78067806780678e-05, "loss": 0.2423, "mean_token_accuracy": 0.8807024389505387, "step": 32200 }, { "epoch": 3.222, "grad_norm": 0.9272204637527466, "learning_rate": 6.77867786778678e-05, "loss": 0.1766, "mean_token_accuracy": 0.8734985917806626, "step": 32220 }, { "epoch": 3.224, "grad_norm": 0.5423893332481384, "learning_rate": 6.776677667766776e-05, "loss": 0.4174, "mean_token_accuracy": 0.866055715084076, "step": 32240 }, { "epoch": 3.226, "grad_norm": 0.5717921853065491, "learning_rate": 6.774677467746775e-05, "loss": 0.4007, "mean_token_accuracy": 0.8693304091691971, "step": 32260 }, { "epoch": 3.228, "grad_norm": 0.7650997042655945, "learning_rate": 6.772677267726773e-05, "loss": 0.2154, "mean_token_accuracy": 0.877428287267685, "step": 32280 }, { "epoch": 3.23, "grad_norm": 0.5835682153701782, "learning_rate": 6.770677067706771e-05, "loss": 0.3267, "mean_token_accuracy": 0.8884496122598649, "step": 32300 }, { "epoch": 3.232, "grad_norm": 1.7561603784561157, "learning_rate": 6.768676867686768e-05, "loss": 0.477, "mean_token_accuracy": 0.8639912396669388, "step": 32320 }, { "epoch": 3.234, "grad_norm": 1.5861812829971313, "learning_rate": 6.766676667666767e-05, "loss": 0.2869, "mean_token_accuracy": 0.8817894876003265, "step": 32340 }, { "epoch": 3.2359999999999998, "grad_norm": 0.5617793798446655, "learning_rate": 6.764676467646765e-05, "loss": 0.477, "mean_token_accuracy": 0.8388183504343033, "step": 32360 }, { "epoch": 3.238, "grad_norm": 0.6341413259506226, "learning_rate": 6.762676267626763e-05, "loss": 0.3357, "mean_token_accuracy": 0.8613066047430038, "step": 32380 }, { "epoch": 3.24, "grad_norm": 0.6191155910491943, "learning_rate": 6.760676067606762e-05, "loss": 0.3664, "mean_token_accuracy": 0.8644601672887802, "step": 32400 }, { "epoch": 3.242, "grad_norm": 0.7628487348556519, "learning_rate": 6.758675867586759e-05, "loss": 0.2286, "mean_token_accuracy": 0.8806452333927155, "step": 32420 }, { "epoch": 3.2439999999999998, "grad_norm": 0.7922534346580505, "learning_rate": 6.756675667566758e-05, "loss": 0.3921, "mean_token_accuracy": 0.8390231460332871, "step": 32440 }, { "epoch": 3.246, "grad_norm": 0.7441483736038208, "learning_rate": 6.754675467546755e-05, "loss": 0.2705, "mean_token_accuracy": 0.871590456366539, "step": 32460 }, { "epoch": 3.248, "grad_norm": 0.9581438899040222, "learning_rate": 6.752675267526753e-05, "loss": 0.3273, "mean_token_accuracy": 0.8654574692249298, "step": 32480 }, { "epoch": 3.25, "grad_norm": 0.7397668361663818, "learning_rate": 6.75067506750675e-05, "loss": 0.2777, "mean_token_accuracy": 0.8656565576791764, "step": 32500 }, { "epoch": 3.252, "grad_norm": 0.8981633186340332, "learning_rate": 6.74867486748675e-05, "loss": 0.4197, "mean_token_accuracy": 0.8602405339479446, "step": 32520 }, { "epoch": 3.254, "grad_norm": 1.2154542207717896, "learning_rate": 6.746674667466747e-05, "loss": 0.2562, "mean_token_accuracy": 0.8733404994010925, "step": 32540 }, { "epoch": 3.2560000000000002, "grad_norm": 0.7211659550666809, "learning_rate": 6.744674467446745e-05, "loss": 0.3313, "mean_token_accuracy": 0.8667224228382111, "step": 32560 }, { "epoch": 3.258, "grad_norm": 0.5700468420982361, "learning_rate": 6.742674267426742e-05, "loss": 0.4326, "mean_token_accuracy": 0.8638245761394501, "step": 32580 }, { "epoch": 3.26, "grad_norm": 0.8099965453147888, "learning_rate": 6.740674067406742e-05, "loss": 0.3539, "mean_token_accuracy": 0.8705437749624252, "step": 32600 }, { "epoch": 3.262, "grad_norm": 0.5604251027107239, "learning_rate": 6.738673867386739e-05, "loss": 0.3028, "mean_token_accuracy": 0.8646650075912475, "step": 32620 }, { "epoch": 3.2640000000000002, "grad_norm": 0.5848718285560608, "learning_rate": 6.736673667366737e-05, "loss": 0.3358, "mean_token_accuracy": 0.8644698172807693, "step": 32640 }, { "epoch": 3.266, "grad_norm": 1.1692882776260376, "learning_rate": 6.734673467346735e-05, "loss": 0.2581, "mean_token_accuracy": 0.8798609167337418, "step": 32660 }, { "epoch": 3.268, "grad_norm": 0.6225622892379761, "learning_rate": 6.732673267326734e-05, "loss": 0.2588, "mean_token_accuracy": 0.8673434495925904, "step": 32680 }, { "epoch": 3.27, "grad_norm": 0.6398547291755676, "learning_rate": 6.730673067306731e-05, "loss": 0.3815, "mean_token_accuracy": 0.8787832289934159, "step": 32700 }, { "epoch": 3.2720000000000002, "grad_norm": 0.49418774247169495, "learning_rate": 6.728672867286729e-05, "loss": 0.3715, "mean_token_accuracy": 0.8752611190080642, "step": 32720 }, { "epoch": 3.274, "grad_norm": 0.5467320680618286, "learning_rate": 6.726672667266727e-05, "loss": 0.2192, "mean_token_accuracy": 0.8626510888338089, "step": 32740 }, { "epoch": 3.276, "grad_norm": 0.9603420495986938, "learning_rate": 6.724672467246726e-05, "loss": 0.3221, "mean_token_accuracy": 0.8675257980823516, "step": 32760 }, { "epoch": 3.278, "grad_norm": 0.5007884502410889, "learning_rate": 6.722672267226723e-05, "loss": 0.2665, "mean_token_accuracy": 0.8636785537004471, "step": 32780 }, { "epoch": 3.2800000000000002, "grad_norm": 0.669080913066864, "learning_rate": 6.720672067206721e-05, "loss": 0.3034, "mean_token_accuracy": 0.8641418963670731, "step": 32800 }, { "epoch": 3.282, "grad_norm": 0.6126741170883179, "learning_rate": 6.718671867186719e-05, "loss": 0.282, "mean_token_accuracy": 0.8797101080417633, "step": 32820 }, { "epoch": 3.284, "grad_norm": 0.42816680669784546, "learning_rate": 6.716671667166718e-05, "loss": 0.3895, "mean_token_accuracy": 0.8678907960653305, "step": 32840 }, { "epoch": 3.286, "grad_norm": 0.5792250037193298, "learning_rate": 6.714671467146714e-05, "loss": 0.3009, "mean_token_accuracy": 0.8646954476833344, "step": 32860 }, { "epoch": 3.288, "grad_norm": 0.5283281207084656, "learning_rate": 6.712671267126713e-05, "loss": 0.4122, "mean_token_accuracy": 0.8570707023143769, "step": 32880 }, { "epoch": 3.29, "grad_norm": 0.6319969296455383, "learning_rate": 6.710671067106711e-05, "loss": 0.3461, "mean_token_accuracy": 0.8817934840917587, "step": 32900 }, { "epoch": 3.292, "grad_norm": 0.7757834792137146, "learning_rate": 6.70867086708671e-05, "loss": 0.2297, "mean_token_accuracy": 0.8696182519197464, "step": 32920 }, { "epoch": 3.294, "grad_norm": 0.6360335946083069, "learning_rate": 6.706670667066706e-05, "loss": 0.2551, "mean_token_accuracy": 0.8640986382961273, "step": 32940 }, { "epoch": 3.296, "grad_norm": 0.4964936077594757, "learning_rate": 6.704670467046706e-05, "loss": 0.2698, "mean_token_accuracy": 0.8697379380464554, "step": 32960 }, { "epoch": 3.298, "grad_norm": 0.5574098229408264, "learning_rate": 6.702670267026703e-05, "loss": 0.3492, "mean_token_accuracy": 0.8588025093078613, "step": 32980 }, { "epoch": 3.3, "grad_norm": 0.5993109345436096, "learning_rate": 6.700670067006701e-05, "loss": 0.3678, "mean_token_accuracy": 0.8665024548768997, "step": 33000 }, { "epoch": 3.302, "grad_norm": 0.5706835985183716, "learning_rate": 6.698669866986698e-05, "loss": 0.2872, "mean_token_accuracy": 0.870916947722435, "step": 33020 }, { "epoch": 3.304, "grad_norm": 0.5419235825538635, "learning_rate": 6.696669666966698e-05, "loss": 0.4096, "mean_token_accuracy": 0.8627248674631118, "step": 33040 }, { "epoch": 3.306, "grad_norm": 0.6583293080329895, "learning_rate": 6.694669466946695e-05, "loss": 0.4003, "mean_token_accuracy": 0.8714838355779648, "step": 33060 }, { "epoch": 3.308, "grad_norm": 0.8243187069892883, "learning_rate": 6.692669266926693e-05, "loss": 0.2313, "mean_token_accuracy": 0.8758340239524841, "step": 33080 }, { "epoch": 3.31, "grad_norm": 0.5023772716522217, "learning_rate": 6.69066906690669e-05, "loss": 0.1989, "mean_token_accuracy": 0.8720045685768127, "step": 33100 }, { "epoch": 3.312, "grad_norm": 0.5430787801742554, "learning_rate": 6.68866886688669e-05, "loss": 0.3222, "mean_token_accuracy": 0.8628418862819671, "step": 33120 }, { "epoch": 3.314, "grad_norm": 0.6290366053581238, "learning_rate": 6.686668666866687e-05, "loss": 0.3361, "mean_token_accuracy": 0.8602136522531509, "step": 33140 }, { "epoch": 3.316, "grad_norm": 0.5448154211044312, "learning_rate": 6.684668466846685e-05, "loss": 0.2514, "mean_token_accuracy": 0.8672451734542846, "step": 33160 }, { "epoch": 3.318, "grad_norm": 0.6964037418365479, "learning_rate": 6.682668266826683e-05, "loss": 0.408, "mean_token_accuracy": 0.8566480249166488, "step": 33180 }, { "epoch": 3.32, "grad_norm": 1.8045257329940796, "learning_rate": 6.680668066806682e-05, "loss": 0.4361, "mean_token_accuracy": 0.8668989092111588, "step": 33200 }, { "epoch": 3.322, "grad_norm": 0.565591037273407, "learning_rate": 6.678667866786679e-05, "loss": 0.3427, "mean_token_accuracy": 0.8615114003419876, "step": 33220 }, { "epoch": 3.324, "grad_norm": 0.5605135560035706, "learning_rate": 6.676667666766677e-05, "loss": 0.5485, "mean_token_accuracy": 0.8540320813655853, "step": 33240 }, { "epoch": 3.326, "grad_norm": 1.7081164121627808, "learning_rate": 6.674667466746675e-05, "loss": 0.2476, "mean_token_accuracy": 0.8772746235132217, "step": 33260 }, { "epoch": 3.328, "grad_norm": 1.4041954278945923, "learning_rate": 6.672667266726673e-05, "loss": 0.4448, "mean_token_accuracy": 0.8593144834041595, "step": 33280 }, { "epoch": 3.33, "grad_norm": 0.47435057163238525, "learning_rate": 6.67066706670667e-05, "loss": 0.3364, "mean_token_accuracy": 0.8640255928039551, "step": 33300 }, { "epoch": 3.332, "grad_norm": 0.8015786409378052, "learning_rate": 6.668666866686669e-05, "loss": 0.3047, "mean_token_accuracy": 0.876236054301262, "step": 33320 }, { "epoch": 3.334, "grad_norm": 1.4905786514282227, "learning_rate": 6.666666666666667e-05, "loss": 0.3444, "mean_token_accuracy": 0.8750124573707581, "step": 33340 }, { "epoch": 3.336, "grad_norm": 1.1409449577331543, "learning_rate": 6.664666466646665e-05, "loss": 0.5261, "mean_token_accuracy": 0.846904131770134, "step": 33360 }, { "epoch": 3.338, "grad_norm": 0.6519871950149536, "learning_rate": 6.662666266626662e-05, "loss": 0.2887, "mean_token_accuracy": 0.8722639560699463, "step": 33380 }, { "epoch": 3.34, "grad_norm": 0.5524666905403137, "learning_rate": 6.66066606660666e-05, "loss": 0.3671, "mean_token_accuracy": 0.8764760762453079, "step": 33400 }, { "epoch": 3.342, "grad_norm": 0.8233127593994141, "learning_rate": 6.658665866586659e-05, "loss": 0.4041, "mean_token_accuracy": 0.8665405124425888, "step": 33420 }, { "epoch": 3.344, "grad_norm": 1.225811243057251, "learning_rate": 6.656665666566657e-05, "loss": 0.2879, "mean_token_accuracy": 0.8734911113977433, "step": 33440 }, { "epoch": 3.346, "grad_norm": 0.6440261602401733, "learning_rate": 6.654665466546654e-05, "loss": 0.3065, "mean_token_accuracy": 0.8631411463022232, "step": 33460 }, { "epoch": 3.348, "grad_norm": 0.7424303293228149, "learning_rate": 6.652665266526654e-05, "loss": 0.2929, "mean_token_accuracy": 0.8697924137115478, "step": 33480 }, { "epoch": 3.35, "grad_norm": 0.5054978132247925, "learning_rate": 6.650665066506651e-05, "loss": 0.5233, "mean_token_accuracy": 0.864971062541008, "step": 33500 }, { "epoch": 3.352, "grad_norm": 0.4603955149650574, "learning_rate": 6.648664866486649e-05, "loss": 0.3833, "mean_token_accuracy": 0.8620764881372451, "step": 33520 }, { "epoch": 3.354, "grad_norm": 0.5276235938072205, "learning_rate": 6.646664666466646e-05, "loss": 0.2718, "mean_token_accuracy": 0.8627213269472123, "step": 33540 }, { "epoch": 3.356, "grad_norm": 0.5077042579650879, "learning_rate": 6.644664466446646e-05, "loss": 0.2912, "mean_token_accuracy": 0.8669674813747406, "step": 33560 }, { "epoch": 3.358, "grad_norm": 0.5491507649421692, "learning_rate": 6.642664266426643e-05, "loss": 0.3075, "mean_token_accuracy": 0.8708546459674835, "step": 33580 }, { "epoch": 3.36, "grad_norm": 1.0462005138397217, "learning_rate": 6.640664066406641e-05, "loss": 0.3884, "mean_token_accuracy": 0.8761539131402969, "step": 33600 }, { "epoch": 3.362, "grad_norm": 0.5731706619262695, "learning_rate": 6.638663866386639e-05, "loss": 0.2766, "mean_token_accuracy": 0.8818988114595413, "step": 33620 }, { "epoch": 3.364, "grad_norm": 0.6662700772285461, "learning_rate": 6.636663666366637e-05, "loss": 0.3026, "mean_token_accuracy": 0.8778397500514984, "step": 33640 }, { "epoch": 3.366, "grad_norm": 1.1146599054336548, "learning_rate": 6.634663466346634e-05, "loss": 0.4296, "mean_token_accuracy": 0.8644279420375824, "step": 33660 }, { "epoch": 3.368, "grad_norm": 0.8065193295478821, "learning_rate": 6.632663266326633e-05, "loss": 0.254, "mean_token_accuracy": 0.8657648772001266, "step": 33680 }, { "epoch": 3.37, "grad_norm": 0.7463464736938477, "learning_rate": 6.630663066306631e-05, "loss": 0.3192, "mean_token_accuracy": 0.8750720083713531, "step": 33700 }, { "epoch": 3.372, "grad_norm": 0.6883248686790466, "learning_rate": 6.62866286628663e-05, "loss": 0.2855, "mean_token_accuracy": 0.8582820862531662, "step": 33720 }, { "epoch": 3.374, "grad_norm": 0.5789526104927063, "learning_rate": 6.626662666266626e-05, "loss": 0.3549, "mean_token_accuracy": 0.8487199485301972, "step": 33740 }, { "epoch": 3.376, "grad_norm": 0.8296191692352295, "learning_rate": 6.624662466246625e-05, "loss": 0.3182, "mean_token_accuracy": 0.8691107958555222, "step": 33760 }, { "epoch": 3.378, "grad_norm": 0.9863596558570862, "learning_rate": 6.622662266226623e-05, "loss": 0.4328, "mean_token_accuracy": 0.8768334031105042, "step": 33780 }, { "epoch": 3.38, "grad_norm": 6.9657111167907715, "learning_rate": 6.620662066206621e-05, "loss": 1.0221, "mean_token_accuracy": 0.8521427631378173, "step": 33800 }, { "epoch": 3.382, "grad_norm": 0.7375881671905518, "learning_rate": 6.618661866186618e-05, "loss": 0.4316, "mean_token_accuracy": 0.8654433190822601, "step": 33820 }, { "epoch": 3.384, "grad_norm": 0.5335933566093445, "learning_rate": 6.616661666166616e-05, "loss": 0.2902, "mean_token_accuracy": 0.8624963611364365, "step": 33840 }, { "epoch": 3.386, "grad_norm": 0.5243452787399292, "learning_rate": 6.614661466146616e-05, "loss": 0.2614, "mean_token_accuracy": 0.8593473732471466, "step": 33860 }, { "epoch": 3.388, "grad_norm": 0.5765053033828735, "learning_rate": 6.612661266126613e-05, "loss": 0.2309, "mean_token_accuracy": 0.8749292016029357, "step": 33880 }, { "epoch": 3.39, "grad_norm": 0.5238656401634216, "learning_rate": 6.610661066106611e-05, "loss": 0.3839, "mean_token_accuracy": 0.8578035295009613, "step": 33900 }, { "epoch": 3.392, "grad_norm": 0.5194885730743408, "learning_rate": 6.608660866086608e-05, "loss": 0.2843, "mean_token_accuracy": 0.8666597902774811, "step": 33920 }, { "epoch": 3.394, "grad_norm": 0.8796253800392151, "learning_rate": 6.606660666066608e-05, "loss": 0.4228, "mean_token_accuracy": 0.8587622940540314, "step": 33940 }, { "epoch": 3.396, "grad_norm": 0.595947265625, "learning_rate": 6.604660466046605e-05, "loss": 0.2155, "mean_token_accuracy": 0.8602279901504517, "step": 33960 }, { "epoch": 3.398, "grad_norm": 0.7995667457580566, "learning_rate": 6.602660266026603e-05, "loss": 0.3435, "mean_token_accuracy": 0.8694554388523101, "step": 33980 }, { "epoch": 3.4, "grad_norm": 0.6083911657333374, "learning_rate": 6.600660066006602e-05, "loss": 0.4238, "mean_token_accuracy": 0.8784394323825836, "step": 34000 }, { "epoch": 3.402, "grad_norm": 0.5656099319458008, "learning_rate": 6.5986598659866e-05, "loss": 0.2854, "mean_token_accuracy": 0.8453126549720764, "step": 34020 }, { "epoch": 3.404, "grad_norm": 0.5517226457595825, "learning_rate": 6.596659665966597e-05, "loss": 0.342, "mean_token_accuracy": 0.870712673664093, "step": 34040 }, { "epoch": 3.406, "grad_norm": 0.4678800702095032, "learning_rate": 6.594659465946595e-05, "loss": 0.2408, "mean_token_accuracy": 0.8658047050237656, "step": 34060 }, { "epoch": 3.408, "grad_norm": 5.1341705322265625, "learning_rate": 6.592659265926593e-05, "loss": 0.3475, "mean_token_accuracy": 0.865521690249443, "step": 34080 }, { "epoch": 3.41, "grad_norm": 0.638600766658783, "learning_rate": 6.590659065906592e-05, "loss": 0.2467, "mean_token_accuracy": 0.8760639309883118, "step": 34100 }, { "epoch": 3.412, "grad_norm": 0.6711747050285339, "learning_rate": 6.588658865886589e-05, "loss": 0.2903, "mean_token_accuracy": 0.8721790164709091, "step": 34120 }, { "epoch": 3.414, "grad_norm": 0.6521173715591431, "learning_rate": 6.586658665866587e-05, "loss": 0.2741, "mean_token_accuracy": 0.8875161141157151, "step": 34140 }, { "epoch": 3.416, "grad_norm": 0.6799888014793396, "learning_rate": 6.584658465846585e-05, "loss": 0.2827, "mean_token_accuracy": 0.8631364911794662, "step": 34160 }, { "epoch": 3.418, "grad_norm": 1.0619354248046875, "learning_rate": 6.582658265826584e-05, "loss": 0.3112, "mean_token_accuracy": 0.8506942301988601, "step": 34180 }, { "epoch": 3.42, "grad_norm": 0.7244651913642883, "learning_rate": 6.58065806580658e-05, "loss": 0.2437, "mean_token_accuracy": 0.8715906828641892, "step": 34200 }, { "epoch": 3.422, "grad_norm": 0.6003289222717285, "learning_rate": 6.578657865786579e-05, "loss": 0.3064, "mean_token_accuracy": 0.8502597898244858, "step": 34220 }, { "epoch": 3.424, "grad_norm": 0.6277062296867371, "learning_rate": 6.576657665766577e-05, "loss": 0.3106, "mean_token_accuracy": 0.8641259133815765, "step": 34240 }, { "epoch": 3.426, "grad_norm": 0.5835585594177246, "learning_rate": 6.574657465746575e-05, "loss": 0.3285, "mean_token_accuracy": 0.8735463589429855, "step": 34260 }, { "epoch": 3.428, "grad_norm": 0.6368887424468994, "learning_rate": 6.572657265726572e-05, "loss": 0.29, "mean_token_accuracy": 0.8656401962041855, "step": 34280 }, { "epoch": 3.43, "grad_norm": 0.46675997972488403, "learning_rate": 6.570657065706571e-05, "loss": 0.303, "mean_token_accuracy": 0.8565243154764175, "step": 34300 }, { "epoch": 3.432, "grad_norm": 0.6070669889450073, "learning_rate": 6.568656865686569e-05, "loss": 0.2715, "mean_token_accuracy": 0.8813949882984161, "step": 34320 }, { "epoch": 3.434, "grad_norm": 0.5847573280334473, "learning_rate": 6.566656665666567e-05, "loss": 0.2619, "mean_token_accuracy": 0.8689747542142868, "step": 34340 }, { "epoch": 3.436, "grad_norm": 0.5459972620010376, "learning_rate": 6.564656465646564e-05, "loss": 0.2434, "mean_token_accuracy": 0.8824111193418502, "step": 34360 }, { "epoch": 3.438, "grad_norm": 0.8996637463569641, "learning_rate": 6.562656265626564e-05, "loss": 0.2738, "mean_token_accuracy": 0.87446628510952, "step": 34380 }, { "epoch": 3.44, "grad_norm": 0.609082043170929, "learning_rate": 6.560656065606561e-05, "loss": 0.3179, "mean_token_accuracy": 0.8645369648933411, "step": 34400 }, { "epoch": 3.442, "grad_norm": 0.9434329867362976, "learning_rate": 6.558655865586559e-05, "loss": 0.3939, "mean_token_accuracy": 0.8697970092296601, "step": 34420 }, { "epoch": 3.444, "grad_norm": 0.48531273007392883, "learning_rate": 6.556655665566556e-05, "loss": 0.2566, "mean_token_accuracy": 0.8703100442886352, "step": 34440 }, { "epoch": 3.446, "grad_norm": 1.7494908571243286, "learning_rate": 6.554655465546556e-05, "loss": 0.3689, "mean_token_accuracy": 0.8678005158901214, "step": 34460 }, { "epoch": 3.448, "grad_norm": 0.553388774394989, "learning_rate": 6.552655265526553e-05, "loss": 0.3776, "mean_token_accuracy": 0.8776224881410599, "step": 34480 }, { "epoch": 3.45, "grad_norm": 0.5605815649032593, "learning_rate": 6.550655065506551e-05, "loss": 0.4345, "mean_token_accuracy": 0.8637740969657898, "step": 34500 }, { "epoch": 3.452, "grad_norm": 0.6912214159965515, "learning_rate": 6.54865486548655e-05, "loss": 0.2892, "mean_token_accuracy": 0.8708735287189484, "step": 34520 }, { "epoch": 3.454, "grad_norm": 2.298866033554077, "learning_rate": 6.546654665466548e-05, "loss": 0.3705, "mean_token_accuracy": 0.8593005746603012, "step": 34540 }, { "epoch": 3.456, "grad_norm": 0.6084408760070801, "learning_rate": 6.544654465446545e-05, "loss": 0.4265, "mean_token_accuracy": 0.8581977754831314, "step": 34560 }, { "epoch": 3.458, "grad_norm": 0.47074976563453674, "learning_rate": 6.542654265426543e-05, "loss": 0.3764, "mean_token_accuracy": 0.8784455716609955, "step": 34580 }, { "epoch": 3.46, "grad_norm": 0.6409473419189453, "learning_rate": 6.540654065406541e-05, "loss": 0.4383, "mean_token_accuracy": 0.8671617835760117, "step": 34600 }, { "epoch": 3.462, "grad_norm": 0.48046717047691345, "learning_rate": 6.53865386538654e-05, "loss": 0.3071, "mean_token_accuracy": 0.8424535095691681, "step": 34620 }, { "epoch": 3.464, "grad_norm": 1.441306233406067, "learning_rate": 6.536653665366536e-05, "loss": 0.4458, "mean_token_accuracy": 0.8483067154884338, "step": 34640 }, { "epoch": 3.466, "grad_norm": 0.5927050113677979, "learning_rate": 6.534653465346535e-05, "loss": 0.3273, "mean_token_accuracy": 0.8623914629220962, "step": 34660 }, { "epoch": 3.468, "grad_norm": 0.6230330467224121, "learning_rate": 6.532653265326533e-05, "loss": 0.332, "mean_token_accuracy": 0.8728429198265075, "step": 34680 }, { "epoch": 3.4699999999999998, "grad_norm": 0.6794305443763733, "learning_rate": 6.530653065306531e-05, "loss": 0.2756, "mean_token_accuracy": 0.8792745053768158, "step": 34700 }, { "epoch": 3.472, "grad_norm": 0.6406999230384827, "learning_rate": 6.528652865286528e-05, "loss": 0.4134, "mean_token_accuracy": 0.8611472696065903, "step": 34720 }, { "epoch": 3.474, "grad_norm": 0.8929893970489502, "learning_rate": 6.526652665266527e-05, "loss": 0.3571, "mean_token_accuracy": 0.8711273550987244, "step": 34740 }, { "epoch": 3.476, "grad_norm": 0.6521641612052917, "learning_rate": 6.524652465246525e-05, "loss": 0.2522, "mean_token_accuracy": 0.86176777780056, "step": 34760 }, { "epoch": 3.4779999999999998, "grad_norm": 0.608278751373291, "learning_rate": 6.522652265226523e-05, "loss": 0.393, "mean_token_accuracy": 0.8652106672525406, "step": 34780 }, { "epoch": 3.48, "grad_norm": 0.5095530152320862, "learning_rate": 6.52065206520652e-05, "loss": 0.2855, "mean_token_accuracy": 0.8731456905603409, "step": 34800 }, { "epoch": 3.482, "grad_norm": 0.9158013463020325, "learning_rate": 6.518651865186518e-05, "loss": 0.3174, "mean_token_accuracy": 0.8776051521301269, "step": 34820 }, { "epoch": 3.484, "grad_norm": 0.567789614200592, "learning_rate": 6.516651665166517e-05, "loss": 0.3504, "mean_token_accuracy": 0.8833423972129821, "step": 34840 }, { "epoch": 3.4859999999999998, "grad_norm": 0.7128135561943054, "learning_rate": 6.514651465146515e-05, "loss": 0.3181, "mean_token_accuracy": 0.8746280819177628, "step": 34860 }, { "epoch": 3.488, "grad_norm": 0.5056888461112976, "learning_rate": 6.512651265126512e-05, "loss": 0.3802, "mean_token_accuracy": 0.8740082204341888, "step": 34880 }, { "epoch": 3.49, "grad_norm": 0.5616424679756165, "learning_rate": 6.510651065106512e-05, "loss": 0.2522, "mean_token_accuracy": 0.8729003012180329, "step": 34900 }, { "epoch": 3.492, "grad_norm": 0.720918595790863, "learning_rate": 6.508650865086509e-05, "loss": 0.3619, "mean_token_accuracy": 0.8797172844409943, "step": 34920 }, { "epoch": 3.4939999999999998, "grad_norm": 0.5628081560134888, "learning_rate": 6.506650665066507e-05, "loss": 0.2471, "mean_token_accuracy": 0.8755690276622772, "step": 34940 }, { "epoch": 3.496, "grad_norm": 0.608792781829834, "learning_rate": 6.504650465046504e-05, "loss": 0.2252, "mean_token_accuracy": 0.8758887022733688, "step": 34960 }, { "epoch": 3.498, "grad_norm": 0.9637975692749023, "learning_rate": 6.502650265026504e-05, "loss": 0.2501, "mean_token_accuracy": 0.8706470787525177, "step": 34980 }, { "epoch": 3.5, "grad_norm": 1.8350884914398193, "learning_rate": 6.5006500650065e-05, "loss": 0.3243, "mean_token_accuracy": 0.8741276562213898, "step": 35000 }, { "epoch": 3.502, "grad_norm": 0.6371554136276245, "learning_rate": 6.498649864986499e-05, "loss": 0.264, "mean_token_accuracy": 0.8668960630893707, "step": 35020 }, { "epoch": 3.504, "grad_norm": 0.7082345485687256, "learning_rate": 6.496649664966497e-05, "loss": 0.357, "mean_token_accuracy": 0.8528844833374023, "step": 35040 }, { "epoch": 3.5060000000000002, "grad_norm": 1.887665867805481, "learning_rate": 6.494649464946495e-05, "loss": 0.3197, "mean_token_accuracy": 0.8839955568313599, "step": 35060 }, { "epoch": 3.508, "grad_norm": 0.48946380615234375, "learning_rate": 6.492649264926492e-05, "loss": 0.3469, "mean_token_accuracy": 0.860028150677681, "step": 35080 }, { "epoch": 3.51, "grad_norm": 0.6366333365440369, "learning_rate": 6.490649064906491e-05, "loss": 0.2501, "mean_token_accuracy": 0.8762770026922226, "step": 35100 }, { "epoch": 3.512, "grad_norm": 0.5281057953834534, "learning_rate": 6.488648864886489e-05, "loss": 0.3488, "mean_token_accuracy": 0.8789265275001525, "step": 35120 }, { "epoch": 3.5140000000000002, "grad_norm": 2.3866395950317383, "learning_rate": 6.486648664866487e-05, "loss": 0.4137, "mean_token_accuracy": 0.8345100820064545, "step": 35140 }, { "epoch": 3.516, "grad_norm": 0.43953344225883484, "learning_rate": 6.484648464846484e-05, "loss": 0.2168, "mean_token_accuracy": 0.8654292970895767, "step": 35160 }, { "epoch": 3.518, "grad_norm": 1.5761680603027344, "learning_rate": 6.482648264826483e-05, "loss": 0.4269, "mean_token_accuracy": 0.8559231281280517, "step": 35180 }, { "epoch": 3.52, "grad_norm": 0.4813806712627411, "learning_rate": 6.480648064806481e-05, "loss": 0.3529, "mean_token_accuracy": 0.8605226039886474, "step": 35200 }, { "epoch": 3.5220000000000002, "grad_norm": 0.8426644206047058, "learning_rate": 6.478647864786479e-05, "loss": 0.2842, "mean_token_accuracy": 0.8854694187641143, "step": 35220 }, { "epoch": 3.524, "grad_norm": 0.6106650233268738, "learning_rate": 6.476647664766476e-05, "loss": 0.2066, "mean_token_accuracy": 0.883697259426117, "step": 35240 }, { "epoch": 3.526, "grad_norm": 0.601314127445221, "learning_rate": 6.474647464746474e-05, "loss": 0.3374, "mean_token_accuracy": 0.8742801517248153, "step": 35260 }, { "epoch": 3.528, "grad_norm": 1.536591649055481, "learning_rate": 6.472647264726473e-05, "loss": 0.3759, "mean_token_accuracy": 0.8596427589654922, "step": 35280 }, { "epoch": 3.5300000000000002, "grad_norm": 0.6199634075164795, "learning_rate": 6.470647064706471e-05, "loss": 0.2253, "mean_token_accuracy": 0.8660606920719147, "step": 35300 }, { "epoch": 3.532, "grad_norm": 0.8382868766784668, "learning_rate": 6.468646864686468e-05, "loss": 0.3363, "mean_token_accuracy": 0.8707746893167496, "step": 35320 }, { "epoch": 3.534, "grad_norm": 0.7127678990364075, "learning_rate": 6.466646664666466e-05, "loss": 0.3472, "mean_token_accuracy": 0.870624965429306, "step": 35340 }, { "epoch": 3.536, "grad_norm": 1.2820680141448975, "learning_rate": 6.464646464646466e-05, "loss": 0.2868, "mean_token_accuracy": 0.8736959904432297, "step": 35360 }, { "epoch": 3.5380000000000003, "grad_norm": 0.4816438555717468, "learning_rate": 6.462646264626463e-05, "loss": 0.3497, "mean_token_accuracy": 0.8711611509323121, "step": 35380 }, { "epoch": 3.54, "grad_norm": 0.9274947047233582, "learning_rate": 6.460646064606461e-05, "loss": 0.3625, "mean_token_accuracy": 0.8601684302091599, "step": 35400 }, { "epoch": 3.542, "grad_norm": 0.893025279045105, "learning_rate": 6.45864586458646e-05, "loss": 0.3868, "mean_token_accuracy": 0.8663959622383117, "step": 35420 }, { "epoch": 3.544, "grad_norm": 2.396207094192505, "learning_rate": 6.456645664566458e-05, "loss": 0.3782, "mean_token_accuracy": 0.8761404097080231, "step": 35440 }, { "epoch": 3.5460000000000003, "grad_norm": 1.016396403312683, "learning_rate": 6.454645464546455e-05, "loss": 0.2755, "mean_token_accuracy": 0.871134078502655, "step": 35460 }, { "epoch": 3.548, "grad_norm": 0.6450251340866089, "learning_rate": 6.452645264526453e-05, "loss": 0.3543, "mean_token_accuracy": 0.8704058498144149, "step": 35480 }, { "epoch": 3.55, "grad_norm": 0.6560280323028564, "learning_rate": 6.450645064506451e-05, "loss": 0.3126, "mean_token_accuracy": 0.8707493990659714, "step": 35500 }, { "epoch": 3.552, "grad_norm": 0.6127378940582275, "learning_rate": 6.44864486448645e-05, "loss": 0.2917, "mean_token_accuracy": 0.8630226284265519, "step": 35520 }, { "epoch": 3.5540000000000003, "grad_norm": 1.2316161394119263, "learning_rate": 6.446644664466447e-05, "loss": 0.3146, "mean_token_accuracy": 0.8519515216350555, "step": 35540 }, { "epoch": 3.556, "grad_norm": 0.6978626251220703, "learning_rate": 6.444644464446445e-05, "loss": 0.2752, "mean_token_accuracy": 0.8794277429580688, "step": 35560 }, { "epoch": 3.558, "grad_norm": 0.6713722348213196, "learning_rate": 6.442644264426443e-05, "loss": 0.1979, "mean_token_accuracy": 0.8905303716659546, "step": 35580 }, { "epoch": 3.56, "grad_norm": 0.5106977224349976, "learning_rate": 6.440644064406442e-05, "loss": 0.2884, "mean_token_accuracy": 0.8794292062520981, "step": 35600 }, { "epoch": 3.5620000000000003, "grad_norm": 0.5619730949401855, "learning_rate": 6.438643864386438e-05, "loss": 0.3858, "mean_token_accuracy": 0.8642650842666626, "step": 35620 }, { "epoch": 3.564, "grad_norm": 0.6372853517532349, "learning_rate": 6.436643664366437e-05, "loss": 0.321, "mean_token_accuracy": 0.8602515518665313, "step": 35640 }, { "epoch": 3.566, "grad_norm": 0.6180396676063538, "learning_rate": 6.434643464346435e-05, "loss": 0.236, "mean_token_accuracy": 0.8673213124275208, "step": 35660 }, { "epoch": 3.568, "grad_norm": 1.1392948627471924, "learning_rate": 6.432643264326433e-05, "loss": 0.3291, "mean_token_accuracy": 0.8656696051359176, "step": 35680 }, { "epoch": 3.57, "grad_norm": 1.5457830429077148, "learning_rate": 6.43064306430643e-05, "loss": 0.2616, "mean_token_accuracy": 0.8800959378480911, "step": 35700 }, { "epoch": 3.572, "grad_norm": 0.5180692076683044, "learning_rate": 6.42864286428643e-05, "loss": 0.3241, "mean_token_accuracy": 0.854593288898468, "step": 35720 }, { "epoch": 3.574, "grad_norm": 0.6379057168960571, "learning_rate": 6.426642664266427e-05, "loss": 0.2801, "mean_token_accuracy": 0.8715409368276597, "step": 35740 }, { "epoch": 3.576, "grad_norm": 0.49185195565223694, "learning_rate": 6.424642464246425e-05, "loss": 0.3457, "mean_token_accuracy": 0.8705912411212922, "step": 35760 }, { "epoch": 3.578, "grad_norm": 1.2083040475845337, "learning_rate": 6.422642264226422e-05, "loss": 0.4267, "mean_token_accuracy": 0.8795862108469009, "step": 35780 }, { "epoch": 3.58, "grad_norm": 0.6788333654403687, "learning_rate": 6.420642064206422e-05, "loss": 0.3279, "mean_token_accuracy": 0.8684213668107986, "step": 35800 }, { "epoch": 3.582, "grad_norm": 0.6079851388931274, "learning_rate": 6.418641864186419e-05, "loss": 0.3501, "mean_token_accuracy": 0.8460934877395629, "step": 35820 }, { "epoch": 3.584, "grad_norm": 0.5668812394142151, "learning_rate": 6.416641664166417e-05, "loss": 0.346, "mean_token_accuracy": 0.8747670769691467, "step": 35840 }, { "epoch": 3.586, "grad_norm": 0.5738574266433716, "learning_rate": 6.414641464146415e-05, "loss": 0.2273, "mean_token_accuracy": 0.8842083811759949, "step": 35860 }, { "epoch": 3.588, "grad_norm": 2.001695156097412, "learning_rate": 6.412641264126414e-05, "loss": 0.4822, "mean_token_accuracy": 0.8628922224044799, "step": 35880 }, { "epoch": 3.59, "grad_norm": 0.66736900806427, "learning_rate": 6.410641064106411e-05, "loss": 0.3804, "mean_token_accuracy": 0.845063892006874, "step": 35900 }, { "epoch": 3.592, "grad_norm": 0.6395388245582581, "learning_rate": 6.408640864086409e-05, "loss": 0.2418, "mean_token_accuracy": 0.8772603750228882, "step": 35920 }, { "epoch": 3.594, "grad_norm": 0.5625948309898376, "learning_rate": 6.406640664066407e-05, "loss": 0.3162, "mean_token_accuracy": 0.8675746351480484, "step": 35940 }, { "epoch": 3.596, "grad_norm": 0.695488691329956, "learning_rate": 6.404640464046406e-05, "loss": 0.2317, "mean_token_accuracy": 0.8695042669773102, "step": 35960 }, { "epoch": 3.598, "grad_norm": 0.5804022550582886, "learning_rate": 6.402640264026403e-05, "loss": 0.2603, "mean_token_accuracy": 0.859330353140831, "step": 35980 }, { "epoch": 3.6, "grad_norm": 0.5922414660453796, "learning_rate": 6.400640064006401e-05, "loss": 0.2569, "mean_token_accuracy": 0.8707299143075943, "step": 36000 }, { "epoch": 3.602, "grad_norm": 0.5631738305091858, "learning_rate": 6.398639863986399e-05, "loss": 0.2324, "mean_token_accuracy": 0.8663616061210633, "step": 36020 }, { "epoch": 3.604, "grad_norm": 22.22084617614746, "learning_rate": 6.396639663966397e-05, "loss": 0.2733, "mean_token_accuracy": 0.8793836414813996, "step": 36040 }, { "epoch": 3.606, "grad_norm": 0.5743826031684875, "learning_rate": 6.394639463946394e-05, "loss": 0.2718, "mean_token_accuracy": 0.8798131704330444, "step": 36060 }, { "epoch": 3.608, "grad_norm": 0.7400786280632019, "learning_rate": 6.392639263926393e-05, "loss": 0.3726, "mean_token_accuracy": 0.8710187584161758, "step": 36080 }, { "epoch": 3.61, "grad_norm": 0.611079216003418, "learning_rate": 6.390639063906391e-05, "loss": 0.2565, "mean_token_accuracy": 0.8847640514373779, "step": 36100 }, { "epoch": 3.612, "grad_norm": 0.8568688631057739, "learning_rate": 6.388638863886389e-05, "loss": 0.23, "mean_token_accuracy": 0.8621512591838837, "step": 36120 }, { "epoch": 3.614, "grad_norm": 0.7334113717079163, "learning_rate": 6.386638663866386e-05, "loss": 0.2739, "mean_token_accuracy": 0.8693646758794784, "step": 36140 }, { "epoch": 3.616, "grad_norm": 0.5725245475769043, "learning_rate": 6.384638463846385e-05, "loss": 0.2943, "mean_token_accuracy": 0.8773335903882981, "step": 36160 }, { "epoch": 3.618, "grad_norm": 0.5320279598236084, "learning_rate": 6.382638263826383e-05, "loss": 0.2067, "mean_token_accuracy": 0.8797772228717804, "step": 36180 }, { "epoch": 3.62, "grad_norm": 0.4630730450153351, "learning_rate": 6.380638063806381e-05, "loss": 0.2209, "mean_token_accuracy": 0.8674615412950516, "step": 36200 }, { "epoch": 3.622, "grad_norm": 0.5290582776069641, "learning_rate": 6.378637863786378e-05, "loss": 0.2911, "mean_token_accuracy": 0.8716132253408432, "step": 36220 }, { "epoch": 3.624, "grad_norm": 2.0019521713256836, "learning_rate": 6.376637663766378e-05, "loss": 0.2997, "mean_token_accuracy": 0.8634348720312118, "step": 36240 }, { "epoch": 3.626, "grad_norm": 0.6074441075325012, "learning_rate": 6.374637463746375e-05, "loss": 0.2658, "mean_token_accuracy": 0.8706897318363189, "step": 36260 }, { "epoch": 3.628, "grad_norm": 0.6137661337852478, "learning_rate": 6.372637263726373e-05, "loss": 0.3479, "mean_token_accuracy": 0.8525453448295593, "step": 36280 }, { "epoch": 3.63, "grad_norm": 0.6996994614601135, "learning_rate": 6.37063706370637e-05, "loss": 0.2092, "mean_token_accuracy": 0.8764907211065293, "step": 36300 }, { "epoch": 3.632, "grad_norm": 1.5749528408050537, "learning_rate": 6.36863686368637e-05, "loss": 0.4344, "mean_token_accuracy": 0.8628779858350754, "step": 36320 }, { "epoch": 3.634, "grad_norm": 0.5904160737991333, "learning_rate": 6.366636663666367e-05, "loss": 0.3944, "mean_token_accuracy": 0.8591307312250137, "step": 36340 }, { "epoch": 3.636, "grad_norm": 0.5639327168464661, "learning_rate": 6.364636463646365e-05, "loss": 0.2365, "mean_token_accuracy": 0.8780531227588654, "step": 36360 }, { "epoch": 3.638, "grad_norm": 0.8936943411827087, "learning_rate": 6.362636263626363e-05, "loss": 0.3211, "mean_token_accuracy": 0.8851422935724258, "step": 36380 }, { "epoch": 3.64, "grad_norm": 1.52871835231781, "learning_rate": 6.360636063606362e-05, "loss": 0.3596, "mean_token_accuracy": 0.8875949501991272, "step": 36400 }, { "epoch": 3.642, "grad_norm": 0.5356078147888184, "learning_rate": 6.358635863586358e-05, "loss": 0.3169, "mean_token_accuracy": 0.8709643512964249, "step": 36420 }, { "epoch": 3.644, "grad_norm": 0.9660807847976685, "learning_rate": 6.356635663566357e-05, "loss": 0.3898, "mean_token_accuracy": 0.8643285781145096, "step": 36440 }, { "epoch": 3.646, "grad_norm": 0.5478933453559875, "learning_rate": 6.354635463546355e-05, "loss": 0.2572, "mean_token_accuracy": 0.889742648601532, "step": 36460 }, { "epoch": 3.648, "grad_norm": 0.6140152215957642, "learning_rate": 6.352635263526353e-05, "loss": 0.2861, "mean_token_accuracy": 0.8679725706577301, "step": 36480 }, { "epoch": 3.65, "grad_norm": 1.2701501846313477, "learning_rate": 6.35063506350635e-05, "loss": 0.2711, "mean_token_accuracy": 0.8915522634983063, "step": 36500 }, { "epoch": 3.652, "grad_norm": 0.6399600505828857, "learning_rate": 6.348634863486349e-05, "loss": 0.2705, "mean_token_accuracy": 0.8717467308044433, "step": 36520 }, { "epoch": 3.654, "grad_norm": 0.7218167781829834, "learning_rate": 6.346634663466347e-05, "loss": 0.2483, "mean_token_accuracy": 0.8731810122728347, "step": 36540 }, { "epoch": 3.656, "grad_norm": 0.5605192184448242, "learning_rate": 6.344634463446345e-05, "loss": 0.3109, "mean_token_accuracy": 0.8710456818342209, "step": 36560 }, { "epoch": 3.658, "grad_norm": 1.6038073301315308, "learning_rate": 6.342634263426342e-05, "loss": 0.3213, "mean_token_accuracy": 0.8651738822460174, "step": 36580 }, { "epoch": 3.66, "grad_norm": 0.5034592151641846, "learning_rate": 6.34063406340634e-05, "loss": 0.36, "mean_token_accuracy": 0.8718470752239227, "step": 36600 }, { "epoch": 3.662, "grad_norm": 0.7382104992866516, "learning_rate": 6.338633863386339e-05, "loss": 0.3688, "mean_token_accuracy": 0.8822584897279739, "step": 36620 }, { "epoch": 3.664, "grad_norm": 1.0684707164764404, "learning_rate": 6.336633663366337e-05, "loss": 0.342, "mean_token_accuracy": 0.8516645491123199, "step": 36640 }, { "epoch": 3.666, "grad_norm": 0.6122945547103882, "learning_rate": 6.334633463346334e-05, "loss": 0.3033, "mean_token_accuracy": 0.8618218094110489, "step": 36660 }, { "epoch": 3.668, "grad_norm": 0.6085900068283081, "learning_rate": 6.332633263326332e-05, "loss": 0.2888, "mean_token_accuracy": 0.8776034027338028, "step": 36680 }, { "epoch": 3.67, "grad_norm": 0.5310750603675842, "learning_rate": 6.33063306330633e-05, "loss": 0.2861, "mean_token_accuracy": 0.8539076238870621, "step": 36700 }, { "epoch": 3.672, "grad_norm": 0.8019564151763916, "learning_rate": 6.328632863286329e-05, "loss": 0.4245, "mean_token_accuracy": 0.8736049205064773, "step": 36720 }, { "epoch": 3.674, "grad_norm": 0.772192656993866, "learning_rate": 6.326632663266326e-05, "loss": 0.2828, "mean_token_accuracy": 0.8714875519275666, "step": 36740 }, { "epoch": 3.676, "grad_norm": 0.4405321776866913, "learning_rate": 6.324632463246326e-05, "loss": 0.3453, "mean_token_accuracy": 0.8574615448713303, "step": 36760 }, { "epoch": 3.678, "grad_norm": 0.8477486371994019, "learning_rate": 6.322632263226323e-05, "loss": 0.4094, "mean_token_accuracy": 0.8803147941827774, "step": 36780 }, { "epoch": 3.68, "grad_norm": 0.5488203763961792, "learning_rate": 6.320632063206321e-05, "loss": 0.3747, "mean_token_accuracy": 0.8557728588581085, "step": 36800 }, { "epoch": 3.682, "grad_norm": 1.2774447202682495, "learning_rate": 6.318631863186318e-05, "loss": 0.3019, "mean_token_accuracy": 0.8889457076787949, "step": 36820 }, { "epoch": 3.684, "grad_norm": 0.8357100486755371, "learning_rate": 6.316631663166317e-05, "loss": 0.3738, "mean_token_accuracy": 0.8557890355587006, "step": 36840 }, { "epoch": 3.686, "grad_norm": 0.8444141745567322, "learning_rate": 6.314631463146316e-05, "loss": 0.3379, "mean_token_accuracy": 0.8752065390348435, "step": 36860 }, { "epoch": 3.6879999999999997, "grad_norm": 0.6483259201049805, "learning_rate": 6.312631263126313e-05, "loss": 0.2658, "mean_token_accuracy": 0.8663195937871933, "step": 36880 }, { "epoch": 3.69, "grad_norm": 0.645963191986084, "learning_rate": 6.310631063106311e-05, "loss": 0.3355, "mean_token_accuracy": 0.8662048488855362, "step": 36900 }, { "epoch": 3.692, "grad_norm": 0.5470776557922363, "learning_rate": 6.308630863086309e-05, "loss": 0.3645, "mean_token_accuracy": 0.8659660071134567, "step": 36920 }, { "epoch": 3.694, "grad_norm": 0.7384814620018005, "learning_rate": 6.306630663066308e-05, "loss": 0.3442, "mean_token_accuracy": 0.8662906557321548, "step": 36940 }, { "epoch": 3.6959999999999997, "grad_norm": 1.185265302658081, "learning_rate": 6.304630463046305e-05, "loss": 0.2534, "mean_token_accuracy": 0.8742549180984497, "step": 36960 }, { "epoch": 3.698, "grad_norm": 0.6041831970214844, "learning_rate": 6.302630263026303e-05, "loss": 0.2747, "mean_token_accuracy": 0.8836761236190795, "step": 36980 }, { "epoch": 3.7, "grad_norm": 0.7151323556900024, "learning_rate": 6.300630063006301e-05, "loss": 0.2109, "mean_token_accuracy": 0.8773964405059814, "step": 37000 }, { "epoch": 3.702, "grad_norm": 0.6741049289703369, "learning_rate": 6.2986298629863e-05, "loss": 0.2556, "mean_token_accuracy": 0.8694325834512711, "step": 37020 }, { "epoch": 3.7039999999999997, "grad_norm": 0.608410120010376, "learning_rate": 6.296629662966296e-05, "loss": 0.3693, "mean_token_accuracy": 0.8696465820074082, "step": 37040 }, { "epoch": 3.706, "grad_norm": 0.7116169333457947, "learning_rate": 6.294629462946295e-05, "loss": 0.2748, "mean_token_accuracy": 0.8817040115594864, "step": 37060 }, { "epoch": 3.708, "grad_norm": 0.5597010850906372, "learning_rate": 6.292629262926293e-05, "loss": 0.2524, "mean_token_accuracy": 0.8595593869686127, "step": 37080 }, { "epoch": 3.71, "grad_norm": 0.5205817222595215, "learning_rate": 6.290629062906291e-05, "loss": 0.2369, "mean_token_accuracy": 0.8785796403884888, "step": 37100 }, { "epoch": 3.7119999999999997, "grad_norm": 0.6231561303138733, "learning_rate": 6.288628862886288e-05, "loss": 0.3332, "mean_token_accuracy": 0.8702651858329773, "step": 37120 }, { "epoch": 3.714, "grad_norm": 0.6803618669509888, "learning_rate": 6.286628662866288e-05, "loss": 0.3462, "mean_token_accuracy": 0.8640334367752075, "step": 37140 }, { "epoch": 3.716, "grad_norm": 1.7439289093017578, "learning_rate": 6.284628462846285e-05, "loss": 0.3001, "mean_token_accuracy": 0.8802816778421402, "step": 37160 }, { "epoch": 3.718, "grad_norm": 0.5643773674964905, "learning_rate": 6.282628262826283e-05, "loss": 0.4264, "mean_token_accuracy": 0.8696710944175721, "step": 37180 }, { "epoch": 3.7199999999999998, "grad_norm": 0.5369650721549988, "learning_rate": 6.28062806280628e-05, "loss": 0.252, "mean_token_accuracy": 0.852923983335495, "step": 37200 }, { "epoch": 3.722, "grad_norm": 0.6918923854827881, "learning_rate": 6.27862786278628e-05, "loss": 0.3707, "mean_token_accuracy": 0.8707129418849945, "step": 37220 }, { "epoch": 3.724, "grad_norm": 0.6588964462280273, "learning_rate": 6.276627662766277e-05, "loss": 0.3818, "mean_token_accuracy": 0.8675257563591003, "step": 37240 }, { "epoch": 3.726, "grad_norm": 0.4852888286113739, "learning_rate": 6.274627462746275e-05, "loss": 0.2565, "mean_token_accuracy": 0.8713476717472076, "step": 37260 }, { "epoch": 3.7279999999999998, "grad_norm": 0.5562394857406616, "learning_rate": 6.272627262726273e-05, "loss": 0.378, "mean_token_accuracy": 0.8656119257211685, "step": 37280 }, { "epoch": 3.73, "grad_norm": 1.3802870512008667, "learning_rate": 6.270627062706272e-05, "loss": 0.2287, "mean_token_accuracy": 0.8695983350276947, "step": 37300 }, { "epoch": 3.732, "grad_norm": 0.5776583552360535, "learning_rate": 6.268626862686269e-05, "loss": 0.2436, "mean_token_accuracy": 0.876975628733635, "step": 37320 }, { "epoch": 3.734, "grad_norm": 0.5748469829559326, "learning_rate": 6.266626662666267e-05, "loss": 0.2697, "mean_token_accuracy": 0.8561839133501052, "step": 37340 }, { "epoch": 3.7359999999999998, "grad_norm": 0.5542618036270142, "learning_rate": 6.264626462646265e-05, "loss": 0.2728, "mean_token_accuracy": 0.8731607347726822, "step": 37360 }, { "epoch": 3.738, "grad_norm": 0.9513124823570251, "learning_rate": 6.262626262626264e-05, "loss": 0.3523, "mean_token_accuracy": 0.8763543337583541, "step": 37380 }, { "epoch": 3.74, "grad_norm": 0.43419981002807617, "learning_rate": 6.26062606260626e-05, "loss": 0.2988, "mean_token_accuracy": 0.8753682106733323, "step": 37400 }, { "epoch": 3.742, "grad_norm": 0.5429550409317017, "learning_rate": 6.258625862586259e-05, "loss": 0.2714, "mean_token_accuracy": 0.8733316361904144, "step": 37420 }, { "epoch": 3.7439999999999998, "grad_norm": 1.38466477394104, "learning_rate": 6.256625662566257e-05, "loss": 0.251, "mean_token_accuracy": 0.8716876029968261, "step": 37440 }, { "epoch": 3.746, "grad_norm": 0.6838926076889038, "learning_rate": 6.254625462546255e-05, "loss": 0.3585, "mean_token_accuracy": 0.884550753235817, "step": 37460 }, { "epoch": 3.748, "grad_norm": 0.6243214011192322, "learning_rate": 6.252625262526252e-05, "loss": 0.4475, "mean_token_accuracy": 0.8627159297466278, "step": 37480 }, { "epoch": 3.75, "grad_norm": 0.6300207376480103, "learning_rate": 6.25062506250625e-05, "loss": 0.1867, "mean_token_accuracy": 0.8799022108316421, "step": 37500 }, { "epoch": 3.752, "grad_norm": 0.6362948417663574, "learning_rate": 6.248624862486249e-05, "loss": 0.3834, "mean_token_accuracy": 0.8728847295045853, "step": 37520 }, { "epoch": 3.754, "grad_norm": 0.9515674710273743, "learning_rate": 6.246624662466247e-05, "loss": 0.2676, "mean_token_accuracy": 0.8667024523019791, "step": 37540 }, { "epoch": 3.7560000000000002, "grad_norm": 0.5715243816375732, "learning_rate": 6.244624462446244e-05, "loss": 0.2613, "mean_token_accuracy": 0.8752205610275269, "step": 37560 }, { "epoch": 3.758, "grad_norm": 0.6455516219139099, "learning_rate": 6.242624262426244e-05, "loss": 0.2288, "mean_token_accuracy": 0.8662695735692978, "step": 37580 }, { "epoch": 3.76, "grad_norm": 0.8637039661407471, "learning_rate": 6.240624062406241e-05, "loss": 0.345, "mean_token_accuracy": 0.8668742895126342, "step": 37600 }, { "epoch": 3.762, "grad_norm": 0.736375093460083, "learning_rate": 6.238623862386239e-05, "loss": 0.4104, "mean_token_accuracy": 0.8653882622718811, "step": 37620 }, { "epoch": 3.7640000000000002, "grad_norm": 0.8004177212715149, "learning_rate": 6.236623662366236e-05, "loss": 0.3988, "mean_token_accuracy": 0.8696412652730942, "step": 37640 }, { "epoch": 3.766, "grad_norm": 0.49216774106025696, "learning_rate": 6.234623462346236e-05, "loss": 0.2992, "mean_token_accuracy": 0.8647985905408859, "step": 37660 }, { "epoch": 3.768, "grad_norm": 0.5348808765411377, "learning_rate": 6.232623262326233e-05, "loss": 0.3084, "mean_token_accuracy": 0.870289397239685, "step": 37680 }, { "epoch": 3.77, "grad_norm": 0.6070945262908936, "learning_rate": 6.230623062306231e-05, "loss": 0.3087, "mean_token_accuracy": 0.8727643251419067, "step": 37700 }, { "epoch": 3.7720000000000002, "grad_norm": 0.5283859372138977, "learning_rate": 6.228622862286228e-05, "loss": 0.3097, "mean_token_accuracy": 0.8787392348051071, "step": 37720 }, { "epoch": 3.774, "grad_norm": 0.4858422577381134, "learning_rate": 6.226622662266228e-05, "loss": 0.2091, "mean_token_accuracy": 0.8737773597240448, "step": 37740 }, { "epoch": 3.776, "grad_norm": 0.6852453947067261, "learning_rate": 6.224622462246225e-05, "loss": 0.4255, "mean_token_accuracy": 0.8855845391750335, "step": 37760 }, { "epoch": 3.778, "grad_norm": 0.5940825343132019, "learning_rate": 6.222622262226223e-05, "loss": 0.5019, "mean_token_accuracy": 0.8572398245334625, "step": 37780 }, { "epoch": 3.7800000000000002, "grad_norm": 0.784761369228363, "learning_rate": 6.220622062206221e-05, "loss": 0.3779, "mean_token_accuracy": 0.8465296447277069, "step": 37800 }, { "epoch": 3.782, "grad_norm": 1.005854845046997, "learning_rate": 6.21862186218622e-05, "loss": 0.2988, "mean_token_accuracy": 0.8691680133342743, "step": 37820 }, { "epoch": 3.784, "grad_norm": 2.935957431793213, "learning_rate": 6.216621662166216e-05, "loss": 0.3196, "mean_token_accuracy": 0.8762942135334015, "step": 37840 }, { "epoch": 3.786, "grad_norm": 0.639076828956604, "learning_rate": 6.214621462146215e-05, "loss": 0.2457, "mean_token_accuracy": 0.8732437014579773, "step": 37860 }, { "epoch": 3.7880000000000003, "grad_norm": 0.562962532043457, "learning_rate": 6.212621262126213e-05, "loss": 0.2658, "mean_token_accuracy": 0.8756238520145416, "step": 37880 }, { "epoch": 3.79, "grad_norm": 0.6728877425193787, "learning_rate": 6.210621062106211e-05, "loss": 0.338, "mean_token_accuracy": 0.8719765335321427, "step": 37900 }, { "epoch": 3.792, "grad_norm": 0.7859980463981628, "learning_rate": 6.208620862086208e-05, "loss": 0.3816, "mean_token_accuracy": 0.8442730665206909, "step": 37920 }, { "epoch": 3.794, "grad_norm": 0.6737905740737915, "learning_rate": 6.206620662066207e-05, "loss": 0.2918, "mean_token_accuracy": 0.8644498527050019, "step": 37940 }, { "epoch": 3.7960000000000003, "grad_norm": 0.5277662873268127, "learning_rate": 6.204620462046205e-05, "loss": 0.2754, "mean_token_accuracy": 0.85609650015831, "step": 37960 }, { "epoch": 3.798, "grad_norm": 0.7912155389785767, "learning_rate": 6.202620262026203e-05, "loss": 0.3324, "mean_token_accuracy": 0.8579482018947602, "step": 37980 }, { "epoch": 3.8, "grad_norm": 0.6966779828071594, "learning_rate": 6.2006200620062e-05, "loss": 0.3079, "mean_token_accuracy": 0.8876460790634155, "step": 38000 }, { "epoch": 3.802, "grad_norm": 0.7252274751663208, "learning_rate": 6.198619861986198e-05, "loss": 0.2855, "mean_token_accuracy": 0.8883203506469727, "step": 38020 }, { "epoch": 3.8040000000000003, "grad_norm": 0.5773529410362244, "learning_rate": 6.196619661966197e-05, "loss": 0.2907, "mean_token_accuracy": 0.88181092441082, "step": 38040 }, { "epoch": 3.806, "grad_norm": 0.5423643589019775, "learning_rate": 6.194619461946195e-05, "loss": 0.3201, "mean_token_accuracy": 0.8827274531126023, "step": 38060 }, { "epoch": 3.808, "grad_norm": 0.5180025696754456, "learning_rate": 6.192619261926192e-05, "loss": 0.4792, "mean_token_accuracy": 0.8531189024448395, "step": 38080 }, { "epoch": 3.81, "grad_norm": 0.5337915420532227, "learning_rate": 6.190619061906192e-05, "loss": 0.2526, "mean_token_accuracy": 0.869611057639122, "step": 38100 }, { "epoch": 3.8120000000000003, "grad_norm": 0.5521311163902283, "learning_rate": 6.188618861886189e-05, "loss": 0.2037, "mean_token_accuracy": 0.8752965033054352, "step": 38120 }, { "epoch": 3.814, "grad_norm": 0.622378408908844, "learning_rate": 6.186618661866187e-05, "loss": 0.2366, "mean_token_accuracy": 0.8914768993854523, "step": 38140 }, { "epoch": 3.816, "grad_norm": 0.6798290014266968, "learning_rate": 6.184618461846184e-05, "loss": 0.3947, "mean_token_accuracy": 0.8644128262996673, "step": 38160 }, { "epoch": 3.818, "grad_norm": 0.7119019031524658, "learning_rate": 6.182618261826184e-05, "loss": 0.59, "mean_token_accuracy": 0.8617500424385071, "step": 38180 }, { "epoch": 3.82, "grad_norm": 0.6194971799850464, "learning_rate": 6.18061806180618e-05, "loss": 0.2956, "mean_token_accuracy": 0.8810172975063324, "step": 38200 }, { "epoch": 3.822, "grad_norm": 0.5795000195503235, "learning_rate": 6.178617861786179e-05, "loss": 0.3219, "mean_token_accuracy": 0.873500183224678, "step": 38220 }, { "epoch": 3.824, "grad_norm": 0.7810654640197754, "learning_rate": 6.176617661766176e-05, "loss": 0.1921, "mean_token_accuracy": 0.8875189960002899, "step": 38240 }, { "epoch": 3.826, "grad_norm": 0.6174477338790894, "learning_rate": 6.174617461746175e-05, "loss": 0.3204, "mean_token_accuracy": 0.88436039686203, "step": 38260 }, { "epoch": 3.828, "grad_norm": 0.7711799740791321, "learning_rate": 6.172617261726172e-05, "loss": 0.4903, "mean_token_accuracy": 0.8505413174629212, "step": 38280 }, { "epoch": 3.83, "grad_norm": 0.6188388466835022, "learning_rate": 6.17061706170617e-05, "loss": 0.2339, "mean_token_accuracy": 0.8870622903108597, "step": 38300 }, { "epoch": 3.832, "grad_norm": 0.8194860219955444, "learning_rate": 6.168616861686169e-05, "loss": 0.2316, "mean_token_accuracy": 0.8816296875476837, "step": 38320 }, { "epoch": 3.834, "grad_norm": 0.5578952431678772, "learning_rate": 6.166616661666167e-05, "loss": 0.2444, "mean_token_accuracy": 0.8721778720617295, "step": 38340 }, { "epoch": 3.836, "grad_norm": 0.6502073407173157, "learning_rate": 6.164616461646166e-05, "loss": 0.4148, "mean_token_accuracy": 0.8453341960906983, "step": 38360 }, { "epoch": 3.838, "grad_norm": 0.6690168380737305, "learning_rate": 6.162616261626163e-05, "loss": 0.3, "mean_token_accuracy": 0.8596091151237488, "step": 38380 }, { "epoch": 3.84, "grad_norm": 0.5337373614311218, "learning_rate": 6.160616061606161e-05, "loss": 0.1904, "mean_token_accuracy": 0.8697469621896744, "step": 38400 }, { "epoch": 3.842, "grad_norm": 1.4840962886810303, "learning_rate": 6.158615861586159e-05, "loss": 0.3775, "mean_token_accuracy": 0.8769190341234208, "step": 38420 }, { "epoch": 3.844, "grad_norm": 0.6534121036529541, "learning_rate": 6.156615661566157e-05, "loss": 0.2977, "mean_token_accuracy": 0.8798849642276764, "step": 38440 }, { "epoch": 3.846, "grad_norm": 0.7179723978042603, "learning_rate": 6.154615461546154e-05, "loss": 0.3444, "mean_token_accuracy": 0.8647358626127243, "step": 38460 }, { "epoch": 3.848, "grad_norm": 0.5466015338897705, "learning_rate": 6.152615261526154e-05, "loss": 0.2963, "mean_token_accuracy": 0.8813448071479797, "step": 38480 }, { "epoch": 3.85, "grad_norm": 0.5829864740371704, "learning_rate": 6.150615061506151e-05, "loss": 0.4132, "mean_token_accuracy": 0.8815576940774917, "step": 38500 }, { "epoch": 3.852, "grad_norm": 0.5904212594032288, "learning_rate": 6.148614861486149e-05, "loss": 0.4076, "mean_token_accuracy": 0.8710647732019424, "step": 38520 }, { "epoch": 3.854, "grad_norm": 0.9814632534980774, "learning_rate": 6.146614661466146e-05, "loss": 0.3758, "mean_token_accuracy": 0.87456716299057, "step": 38540 }, { "epoch": 3.856, "grad_norm": 0.6038378477096558, "learning_rate": 6.144614461446146e-05, "loss": 0.222, "mean_token_accuracy": 0.8926930397748947, "step": 38560 }, { "epoch": 3.858, "grad_norm": 0.6015466451644897, "learning_rate": 6.142614261426143e-05, "loss": 0.1705, "mean_token_accuracy": 0.8914864897727967, "step": 38580 }, { "epoch": 3.86, "grad_norm": 0.5414056777954102, "learning_rate": 6.140614061406141e-05, "loss": 0.1989, "mean_token_accuracy": 0.8837334901094437, "step": 38600 }, { "epoch": 3.862, "grad_norm": 9.175581932067871, "learning_rate": 6.13861386138614e-05, "loss": 0.3543, "mean_token_accuracy": 0.8712111949920655, "step": 38620 }, { "epoch": 3.864, "grad_norm": 0.48010626435279846, "learning_rate": 6.136613661366138e-05, "loss": 0.2327, "mean_token_accuracy": 0.8726304143667221, "step": 38640 }, { "epoch": 3.866, "grad_norm": 0.6079490780830383, "learning_rate": 6.134613461346135e-05, "loss": 0.3045, "mean_token_accuracy": 0.8548991560935975, "step": 38660 }, { "epoch": 3.868, "grad_norm": 0.4913569390773773, "learning_rate": 6.132613261326133e-05, "loss": 0.2782, "mean_token_accuracy": 0.8728037565946579, "step": 38680 }, { "epoch": 3.87, "grad_norm": 0.49279531836509705, "learning_rate": 6.130613061306131e-05, "loss": 0.3507, "mean_token_accuracy": 0.8656157433986664, "step": 38700 }, { "epoch": 3.872, "grad_norm": 0.641196608543396, "learning_rate": 6.12861286128613e-05, "loss": 0.346, "mean_token_accuracy": 0.8843702375888824, "step": 38720 }, { "epoch": 3.874, "grad_norm": 0.5836178660392761, "learning_rate": 6.126612661266127e-05, "loss": 0.4232, "mean_token_accuracy": 0.8695846408605575, "step": 38740 }, { "epoch": 3.876, "grad_norm": 0.8318409323692322, "learning_rate": 6.124612461246125e-05, "loss": 0.3961, "mean_token_accuracy": 0.8758835911750793, "step": 38760 }, { "epoch": 3.878, "grad_norm": 0.5151286125183105, "learning_rate": 6.122612261226123e-05, "loss": 0.3694, "mean_token_accuracy": 0.8664066851139068, "step": 38780 }, { "epoch": 3.88, "grad_norm": 0.620460569858551, "learning_rate": 6.120612061206121e-05, "loss": 0.2225, "mean_token_accuracy": 0.8888752371072769, "step": 38800 }, { "epoch": 3.882, "grad_norm": 0.6984573006629944, "learning_rate": 6.118611861186118e-05, "loss": 0.2272, "mean_token_accuracy": 0.8720823049545288, "step": 38820 }, { "epoch": 3.884, "grad_norm": 0.5988692045211792, "learning_rate": 6.116611661166117e-05, "loss": 0.4039, "mean_token_accuracy": 0.8577894747257233, "step": 38840 }, { "epoch": 3.886, "grad_norm": 0.5616875290870667, "learning_rate": 6.114611461146115e-05, "loss": 0.2493, "mean_token_accuracy": 0.868996399641037, "step": 38860 }, { "epoch": 3.888, "grad_norm": 0.6555869579315186, "learning_rate": 6.112611261126113e-05, "loss": 0.2399, "mean_token_accuracy": 0.8698376417160034, "step": 38880 }, { "epoch": 3.89, "grad_norm": 0.5255077481269836, "learning_rate": 6.11061106110611e-05, "loss": 0.4067, "mean_token_accuracy": 0.8638045221567154, "step": 38900 }, { "epoch": 3.892, "grad_norm": 0.5106098055839539, "learning_rate": 6.108610861086109e-05, "loss": 0.1989, "mean_token_accuracy": 0.8751670360565186, "step": 38920 }, { "epoch": 3.894, "grad_norm": 0.5743734240531921, "learning_rate": 6.106610661066107e-05, "loss": 0.2947, "mean_token_accuracy": 0.8668641209602356, "step": 38940 }, { "epoch": 3.896, "grad_norm": 0.42944279313087463, "learning_rate": 6.104610461046105e-05, "loss": 0.4089, "mean_token_accuracy": 0.8663593798875808, "step": 38960 }, { "epoch": 3.898, "grad_norm": 2.142580032348633, "learning_rate": 6.102610261026103e-05, "loss": 0.3752, "mean_token_accuracy": 0.8581943064928055, "step": 38980 }, { "epoch": 3.9, "grad_norm": 3.6973512172698975, "learning_rate": 6.100610061006101e-05, "loss": 0.3502, "mean_token_accuracy": 0.8715075880289078, "step": 39000 }, { "epoch": 3.902, "grad_norm": 4.878500938415527, "learning_rate": 6.098609860986099e-05, "loss": 0.2065, "mean_token_accuracy": 0.8810646384954453, "step": 39020 }, { "epoch": 3.904, "grad_norm": 5.934504508972168, "learning_rate": 6.096609660966097e-05, "loss": 0.3507, "mean_token_accuracy": 0.8782994717359542, "step": 39040 }, { "epoch": 3.906, "grad_norm": 0.7002901434898376, "learning_rate": 6.094609460946095e-05, "loss": 0.311, "mean_token_accuracy": 0.8699419945478439, "step": 39060 }, { "epoch": 3.908, "grad_norm": 0.4467146396636963, "learning_rate": 6.092609260926093e-05, "loss": 0.4255, "mean_token_accuracy": 0.8552658647298813, "step": 39080 }, { "epoch": 3.91, "grad_norm": 0.6588146686553955, "learning_rate": 6.0906090609060906e-05, "loss": 0.3955, "mean_token_accuracy": 0.8692861288785935, "step": 39100 }, { "epoch": 3.912, "grad_norm": 0.5640832185745239, "learning_rate": 6.088608860886089e-05, "loss": 0.2887, "mean_token_accuracy": 0.8790556818246842, "step": 39120 }, { "epoch": 3.914, "grad_norm": 0.6424194574356079, "learning_rate": 6.0866086608660866e-05, "loss": 0.2388, "mean_token_accuracy": 0.8659415811300277, "step": 39140 }, { "epoch": 3.916, "grad_norm": 0.5990620255470276, "learning_rate": 6.084608460846085e-05, "loss": 0.3251, "mean_token_accuracy": 0.8677189528942109, "step": 39160 }, { "epoch": 3.918, "grad_norm": 0.6320942640304565, "learning_rate": 6.0826082608260825e-05, "loss": 0.3263, "mean_token_accuracy": 0.8769141882658005, "step": 39180 }, { "epoch": 3.92, "grad_norm": 0.5531835556030273, "learning_rate": 6.080608060806081e-05, "loss": 0.2496, "mean_token_accuracy": 0.8531225472688675, "step": 39200 }, { "epoch": 3.922, "grad_norm": 0.6960066556930542, "learning_rate": 6.0786078607860784e-05, "loss": 0.4618, "mean_token_accuracy": 0.8715507388114929, "step": 39220 }, { "epoch": 3.924, "grad_norm": 0.6904236674308777, "learning_rate": 6.0766076607660774e-05, "loss": 0.2711, "mean_token_accuracy": 0.8802380353212357, "step": 39240 }, { "epoch": 3.926, "grad_norm": 0.6049536466598511, "learning_rate": 6.0746074607460744e-05, "loss": 0.2714, "mean_token_accuracy": 0.8800592243671417, "step": 39260 }, { "epoch": 3.928, "grad_norm": 0.5941533446311951, "learning_rate": 6.0726072607260733e-05, "loss": 0.289, "mean_token_accuracy": 0.8777857035398483, "step": 39280 }, { "epoch": 3.93, "grad_norm": 0.7025784850120544, "learning_rate": 6.07060706070607e-05, "loss": 0.3071, "mean_token_accuracy": 0.876789465546608, "step": 39300 }, { "epoch": 3.932, "grad_norm": 1.166749358177185, "learning_rate": 6.068606860686069e-05, "loss": 0.2699, "mean_token_accuracy": 0.8808971077203751, "step": 39320 }, { "epoch": 3.934, "grad_norm": 0.5680447220802307, "learning_rate": 6.066606660666066e-05, "loss": 0.262, "mean_token_accuracy": 0.892320254445076, "step": 39340 }, { "epoch": 3.936, "grad_norm": 2.2990829944610596, "learning_rate": 6.064606460646065e-05, "loss": 0.3526, "mean_token_accuracy": 0.8649907290935517, "step": 39360 }, { "epoch": 3.9379999999999997, "grad_norm": 2.06404709815979, "learning_rate": 6.062606260626063e-05, "loss": 0.3393, "mean_token_accuracy": 0.8812209129333496, "step": 39380 }, { "epoch": 3.94, "grad_norm": 0.7405892014503479, "learning_rate": 6.060606060606061e-05, "loss": 0.2392, "mean_token_accuracy": 0.8654718518257141, "step": 39400 }, { "epoch": 3.942, "grad_norm": 0.644312858581543, "learning_rate": 6.058605860586059e-05, "loss": 0.1919, "mean_token_accuracy": 0.8853428572416305, "step": 39420 }, { "epoch": 3.944, "grad_norm": 0.5864706039428711, "learning_rate": 6.056605660566057e-05, "loss": 0.2956, "mean_token_accuracy": 0.8664432644844056, "step": 39440 }, { "epoch": 3.9459999999999997, "grad_norm": 0.5767986178398132, "learning_rate": 6.054605460546055e-05, "loss": 0.2984, "mean_token_accuracy": 0.8708448380231857, "step": 39460 }, { "epoch": 3.948, "grad_norm": 0.750184178352356, "learning_rate": 6.052605260526053e-05, "loss": 0.3029, "mean_token_accuracy": 0.8798156529664993, "step": 39480 }, { "epoch": 3.95, "grad_norm": 0.5681413412094116, "learning_rate": 6.0506050605060506e-05, "loss": 0.2529, "mean_token_accuracy": 0.8779018461704254, "step": 39500 }, { "epoch": 3.952, "grad_norm": 2.986074924468994, "learning_rate": 6.048604860486049e-05, "loss": 0.3492, "mean_token_accuracy": 0.8796614557504654, "step": 39520 }, { "epoch": 3.9539999999999997, "grad_norm": 1.8619135618209839, "learning_rate": 6.0466046604660466e-05, "loss": 0.3996, "mean_token_accuracy": 0.8669161707162857, "step": 39540 }, { "epoch": 3.956, "grad_norm": 0.523206889629364, "learning_rate": 6.044604460446045e-05, "loss": 0.2536, "mean_token_accuracy": 0.8724441409111023, "step": 39560 }, { "epoch": 3.958, "grad_norm": 0.48148179054260254, "learning_rate": 6.0426042604260425e-05, "loss": 0.3364, "mean_token_accuracy": 0.8857713311910629, "step": 39580 }, { "epoch": 3.96, "grad_norm": 0.6755000948905945, "learning_rate": 6.040604060406041e-05, "loss": 0.2455, "mean_token_accuracy": 0.8786976218223572, "step": 39600 }, { "epoch": 3.9619999999999997, "grad_norm": 0.6397257447242737, "learning_rate": 6.0386038603860384e-05, "loss": 0.294, "mean_token_accuracy": 0.8775074124336243, "step": 39620 }, { "epoch": 3.964, "grad_norm": 2.2424724102020264, "learning_rate": 6.036603660366037e-05, "loss": 0.2767, "mean_token_accuracy": 0.8666448175907135, "step": 39640 }, { "epoch": 3.966, "grad_norm": 0.5477710366249084, "learning_rate": 6.0346034603460344e-05, "loss": 0.2676, "mean_token_accuracy": 0.8681154370307922, "step": 39660 }, { "epoch": 3.968, "grad_norm": 0.6751280426979065, "learning_rate": 6.032603260326033e-05, "loss": 0.3162, "mean_token_accuracy": 0.8813120663166046, "step": 39680 }, { "epoch": 3.9699999999999998, "grad_norm": 0.5646085143089294, "learning_rate": 6.03060306030603e-05, "loss": 0.2458, "mean_token_accuracy": 0.8866648882627487, "step": 39700 }, { "epoch": 3.972, "grad_norm": 0.5218909382820129, "learning_rate": 6.028602860286029e-05, "loss": 0.308, "mean_token_accuracy": 0.8761284679174424, "step": 39720 }, { "epoch": 3.974, "grad_norm": 0.823512077331543, "learning_rate": 6.026602660266026e-05, "loss": 0.2215, "mean_token_accuracy": 0.8751194626092911, "step": 39740 }, { "epoch": 3.976, "grad_norm": 1.223345398902893, "learning_rate": 6.024602460246025e-05, "loss": 0.2831, "mean_token_accuracy": 0.8784719586372376, "step": 39760 }, { "epoch": 3.9779999999999998, "grad_norm": 0.5631878972053528, "learning_rate": 6.022602260226022e-05, "loss": 0.2223, "mean_token_accuracy": 0.8794191867113114, "step": 39780 }, { "epoch": 3.98, "grad_norm": 0.6777649521827698, "learning_rate": 6.020602060206021e-05, "loss": 0.3959, "mean_token_accuracy": 0.8770450174808502, "step": 39800 }, { "epoch": 3.982, "grad_norm": 0.4794694781303406, "learning_rate": 6.018601860186018e-05, "loss": 0.2299, "mean_token_accuracy": 0.8969220131635666, "step": 39820 }, { "epoch": 3.984, "grad_norm": 1.0506629943847656, "learning_rate": 6.016601660166017e-05, "loss": 0.3091, "mean_token_accuracy": 0.8760922640562058, "step": 39840 }, { "epoch": 3.9859999999999998, "grad_norm": 2.684183359146118, "learning_rate": 6.0146014601460154e-05, "loss": 0.2749, "mean_token_accuracy": 0.8764826595783234, "step": 39860 }, { "epoch": 3.988, "grad_norm": 0.5487027764320374, "learning_rate": 6.012601260126013e-05, "loss": 0.3622, "mean_token_accuracy": 0.8704480051994323, "step": 39880 }, { "epoch": 3.99, "grad_norm": 0.552635908126831, "learning_rate": 6.010601060106011e-05, "loss": 0.3174, "mean_token_accuracy": 0.8690520465373993, "step": 39900 }, { "epoch": 3.992, "grad_norm": 0.6580628752708435, "learning_rate": 6.008600860086009e-05, "loss": 0.2751, "mean_token_accuracy": 0.8929239928722381, "step": 39920 }, { "epoch": 3.9939999999999998, "grad_norm": 0.6576511263847351, "learning_rate": 6.006600660066007e-05, "loss": 0.1864, "mean_token_accuracy": 0.8839194566011429, "step": 39940 }, { "epoch": 3.996, "grad_norm": 0.6553398966789246, "learning_rate": 6.004600460046005e-05, "loss": 0.2438, "mean_token_accuracy": 0.8805165141820908, "step": 39960 }, { "epoch": 3.998, "grad_norm": 0.8516300916671753, "learning_rate": 6.002600260026003e-05, "loss": 0.3511, "mean_token_accuracy": 0.8640880525112152, "step": 39980 }, { "epoch": 4.0, "grad_norm": 1.038821816444397, "learning_rate": 6.000600060006001e-05, "loss": 0.3003, "mean_token_accuracy": 0.8648482650518418, "step": 40000 }, { "epoch": 4.002, "grad_norm": 0.3695070147514343, "learning_rate": 5.998599859985999e-05, "loss": 0.1917, "mean_token_accuracy": 0.9404533237218857, "step": 40020 }, { "epoch": 4.004, "grad_norm": 0.4274654984474182, "learning_rate": 5.996599659965997e-05, "loss": 0.2178, "mean_token_accuracy": 0.9481073468923569, "step": 40040 }, { "epoch": 4.006, "grad_norm": 0.555463433265686, "learning_rate": 5.994599459945995e-05, "loss": 0.1916, "mean_token_accuracy": 0.9444965630769729, "step": 40060 }, { "epoch": 4.008, "grad_norm": 0.48068761825561523, "learning_rate": 5.9925992599259927e-05, "loss": 0.2579, "mean_token_accuracy": 0.9502566426992416, "step": 40080 }, { "epoch": 4.01, "grad_norm": 0.5315089225769043, "learning_rate": 5.9905990599059916e-05, "loss": 0.2063, "mean_token_accuracy": 0.952561205625534, "step": 40100 }, { "epoch": 4.012, "grad_norm": 0.4238646626472473, "learning_rate": 5.9885988598859886e-05, "loss": 0.2793, "mean_token_accuracy": 0.9392744719982147, "step": 40120 }, { "epoch": 4.014, "grad_norm": 0.6652076244354248, "learning_rate": 5.9865986598659876e-05, "loss": 0.2402, "mean_token_accuracy": 0.9399936378002167, "step": 40140 }, { "epoch": 4.016, "grad_norm": 1.3490166664123535, "learning_rate": 5.9845984598459845e-05, "loss": 0.2787, "mean_token_accuracy": 0.9426165699958802, "step": 40160 }, { "epoch": 4.018, "grad_norm": 0.6210604906082153, "learning_rate": 5.9825982598259835e-05, "loss": 0.2474, "mean_token_accuracy": 0.9455516546964645, "step": 40180 }, { "epoch": 4.02, "grad_norm": 0.7105346322059631, "learning_rate": 5.9805980598059805e-05, "loss": 0.2551, "mean_token_accuracy": 0.9448003321886063, "step": 40200 }, { "epoch": 4.022, "grad_norm": 0.41922876238822937, "learning_rate": 5.9785978597859794e-05, "loss": 0.3928, "mean_token_accuracy": 0.9403809428215026, "step": 40220 }, { "epoch": 4.024, "grad_norm": 0.8681653738021851, "learning_rate": 5.976597659765977e-05, "loss": 0.2846, "mean_token_accuracy": 0.9452485769987107, "step": 40240 }, { "epoch": 4.026, "grad_norm": 0.44276848435401917, "learning_rate": 5.9745974597459754e-05, "loss": 0.2479, "mean_token_accuracy": 0.955161651968956, "step": 40260 }, { "epoch": 4.028, "grad_norm": 0.52724289894104, "learning_rate": 5.972597259725973e-05, "loss": 0.2693, "mean_token_accuracy": 0.9502043187618255, "step": 40280 }, { "epoch": 4.03, "grad_norm": 0.3722785711288452, "learning_rate": 5.970597059705971e-05, "loss": 0.1744, "mean_token_accuracy": 0.9490847408771514, "step": 40300 }, { "epoch": 4.032, "grad_norm": 0.530300498008728, "learning_rate": 5.968596859685969e-05, "loss": 0.2589, "mean_token_accuracy": 0.9461315244436264, "step": 40320 }, { "epoch": 4.034, "grad_norm": 0.6968156695365906, "learning_rate": 5.966596659665967e-05, "loss": 0.2557, "mean_token_accuracy": 0.9391181409358978, "step": 40340 }, { "epoch": 4.036, "grad_norm": 2.73713755607605, "learning_rate": 5.964596459645965e-05, "loss": 0.3821, "mean_token_accuracy": 0.9411807268857956, "step": 40360 }, { "epoch": 4.038, "grad_norm": 0.36891821026802063, "learning_rate": 5.962596259625963e-05, "loss": 0.2107, "mean_token_accuracy": 0.9496043086051941, "step": 40380 }, { "epoch": 4.04, "grad_norm": 3.677866220474243, "learning_rate": 5.960596059605961e-05, "loss": 0.2708, "mean_token_accuracy": 0.9453651219606399, "step": 40400 }, { "epoch": 4.042, "grad_norm": 0.5516304969787598, "learning_rate": 5.958595859585959e-05, "loss": 0.2859, "mean_token_accuracy": 0.9424598783254623, "step": 40420 }, { "epoch": 4.044, "grad_norm": 0.5842597484588623, "learning_rate": 5.956595659565957e-05, "loss": 0.2377, "mean_token_accuracy": 0.9542440563440323, "step": 40440 }, { "epoch": 4.046, "grad_norm": 0.505531370639801, "learning_rate": 5.954595459545955e-05, "loss": 0.3277, "mean_token_accuracy": 0.9572823405265808, "step": 40460 }, { "epoch": 4.048, "grad_norm": 0.6434100270271301, "learning_rate": 5.9525952595259527e-05, "loss": 0.3066, "mean_token_accuracy": 0.9452645480632782, "step": 40480 }, { "epoch": 4.05, "grad_norm": 0.4394971430301666, "learning_rate": 5.950595059505951e-05, "loss": 0.2966, "mean_token_accuracy": 0.9550013482570648, "step": 40500 }, { "epoch": 4.052, "grad_norm": 1.4099290370941162, "learning_rate": 5.9485948594859486e-05, "loss": 0.3224, "mean_token_accuracy": 0.9457403510808945, "step": 40520 }, { "epoch": 4.054, "grad_norm": 1.963117003440857, "learning_rate": 5.946594659465947e-05, "loss": 0.2302, "mean_token_accuracy": 0.9575450122356415, "step": 40540 }, { "epoch": 4.056, "grad_norm": 2.311525821685791, "learning_rate": 5.9445944594459445e-05, "loss": 0.3363, "mean_token_accuracy": 0.9447369903326035, "step": 40560 }, { "epoch": 4.058, "grad_norm": 2.199549436569214, "learning_rate": 5.9425942594259435e-05, "loss": 0.486, "mean_token_accuracy": 0.9360175430774689, "step": 40580 }, { "epoch": 4.06, "grad_norm": 0.6077132821083069, "learning_rate": 5.9405940594059404e-05, "loss": 0.189, "mean_token_accuracy": 0.9548536151647568, "step": 40600 }, { "epoch": 4.062, "grad_norm": 0.46862196922302246, "learning_rate": 5.9385938593859394e-05, "loss": 0.2676, "mean_token_accuracy": 0.9454487234354019, "step": 40620 }, { "epoch": 4.064, "grad_norm": 0.4515359103679657, "learning_rate": 5.9365936593659364e-05, "loss": 0.2599, "mean_token_accuracy": 0.9531122028827668, "step": 40640 }, { "epoch": 4.066, "grad_norm": 0.4432629942893982, "learning_rate": 5.9345934593459354e-05, "loss": 0.2768, "mean_token_accuracy": 0.9551115602254867, "step": 40660 }, { "epoch": 4.068, "grad_norm": 0.8393521308898926, "learning_rate": 5.932593259325932e-05, "loss": 0.2777, "mean_token_accuracy": 0.9517722338438034, "step": 40680 }, { "epoch": 4.07, "grad_norm": 2.1338534355163574, "learning_rate": 5.930593059305931e-05, "loss": 0.2478, "mean_token_accuracy": 0.9515111416578292, "step": 40700 }, { "epoch": 4.072, "grad_norm": 0.5018056631088257, "learning_rate": 5.928592859285928e-05, "loss": 0.2297, "mean_token_accuracy": 0.9492642253637313, "step": 40720 }, { "epoch": 4.074, "grad_norm": 0.6028667092323303, "learning_rate": 5.926592659265927e-05, "loss": 0.2735, "mean_token_accuracy": 0.9516919165849685, "step": 40740 }, { "epoch": 4.076, "grad_norm": 1.074133038520813, "learning_rate": 5.924592459245925e-05, "loss": 0.2816, "mean_token_accuracy": 0.9518454015254975, "step": 40760 }, { "epoch": 4.078, "grad_norm": 0.5521485209465027, "learning_rate": 5.922592259225923e-05, "loss": 0.2866, "mean_token_accuracy": 0.9525439143180847, "step": 40780 }, { "epoch": 4.08, "grad_norm": 1.129496455192566, "learning_rate": 5.920592059205921e-05, "loss": 0.3261, "mean_token_accuracy": 0.9428776890039444, "step": 40800 }, { "epoch": 4.082, "grad_norm": 0.41810935735702515, "learning_rate": 5.918591859185919e-05, "loss": 0.23, "mean_token_accuracy": 0.9462844282388687, "step": 40820 }, { "epoch": 4.084, "grad_norm": 0.9623427987098694, "learning_rate": 5.916591659165917e-05, "loss": 0.3328, "mean_token_accuracy": 0.9435751646757126, "step": 40840 }, { "epoch": 4.086, "grad_norm": 0.5017719268798828, "learning_rate": 5.914591459145915e-05, "loss": 0.2397, "mean_token_accuracy": 0.9507381647825242, "step": 40860 }, { "epoch": 4.088, "grad_norm": 0.41046202182769775, "learning_rate": 5.9125912591259126e-05, "loss": 0.2239, "mean_token_accuracy": 0.9471152186393738, "step": 40880 }, { "epoch": 4.09, "grad_norm": 0.3734019100666046, "learning_rate": 5.910591059105911e-05, "loss": 0.2306, "mean_token_accuracy": 0.9525427967309952, "step": 40900 }, { "epoch": 4.092, "grad_norm": 0.627392053604126, "learning_rate": 5.9085908590859086e-05, "loss": 0.4339, "mean_token_accuracy": 0.9357777088880539, "step": 40920 }, { "epoch": 4.094, "grad_norm": 1.6327327489852905, "learning_rate": 5.906590659065907e-05, "loss": 0.2643, "mean_token_accuracy": 0.9464447617530822, "step": 40940 }, { "epoch": 4.096, "grad_norm": 1.093636155128479, "learning_rate": 5.9045904590459045e-05, "loss": 0.2928, "mean_token_accuracy": 0.9539493680000305, "step": 40960 }, { "epoch": 4.098, "grad_norm": 0.5414416790008545, "learning_rate": 5.902590259025903e-05, "loss": 0.2225, "mean_token_accuracy": 0.9507306277751922, "step": 40980 }, { "epoch": 4.1, "grad_norm": 0.5084402561187744, "learning_rate": 5.9005900590059004e-05, "loss": 0.1979, "mean_token_accuracy": 0.9363684922456741, "step": 41000 }, { "epoch": 4.102, "grad_norm": 0.6151275038719177, "learning_rate": 5.898589858985899e-05, "loss": 0.279, "mean_token_accuracy": 0.9473098874092102, "step": 41020 }, { "epoch": 4.104, "grad_norm": 0.6197025775909424, "learning_rate": 5.8965896589658964e-05, "loss": 0.3399, "mean_token_accuracy": 0.9423624455928803, "step": 41040 }, { "epoch": 4.106, "grad_norm": 0.5132684111595154, "learning_rate": 5.894589458945895e-05, "loss": 0.2245, "mean_token_accuracy": 0.957836365699768, "step": 41060 }, { "epoch": 4.108, "grad_norm": 0.5429067611694336, "learning_rate": 5.892589258925892e-05, "loss": 0.1785, "mean_token_accuracy": 0.952794435620308, "step": 41080 }, { "epoch": 4.11, "grad_norm": 0.34343841671943665, "learning_rate": 5.890589058905891e-05, "loss": 0.2222, "mean_token_accuracy": 0.9508093267679214, "step": 41100 }, { "epoch": 4.112, "grad_norm": 3.7678964138031006, "learning_rate": 5.888588858885888e-05, "loss": 0.2937, "mean_token_accuracy": 0.9535905420780182, "step": 41120 }, { "epoch": 4.114, "grad_norm": 1.5048037767410278, "learning_rate": 5.886588658865887e-05, "loss": 0.2627, "mean_token_accuracy": 0.940145394206047, "step": 41140 }, { "epoch": 4.116, "grad_norm": 0.4424203932285309, "learning_rate": 5.884588458845884e-05, "loss": 0.1902, "mean_token_accuracy": 0.9397420138120651, "step": 41160 }, { "epoch": 4.118, "grad_norm": 0.40979668498039246, "learning_rate": 5.882588258825883e-05, "loss": 0.2803, "mean_token_accuracy": 0.9494920372962952, "step": 41180 }, { "epoch": 4.12, "grad_norm": 1.2730212211608887, "learning_rate": 5.88058805880588e-05, "loss": 0.1699, "mean_token_accuracy": 0.9591596275568008, "step": 41200 }, { "epoch": 4.122, "grad_norm": 0.48263952136039734, "learning_rate": 5.878587858785879e-05, "loss": 0.1982, "mean_token_accuracy": 0.9488851070404053, "step": 41220 }, { "epoch": 4.124, "grad_norm": 0.5386667847633362, "learning_rate": 5.876587658765876e-05, "loss": 0.2925, "mean_token_accuracy": 0.9408364087343216, "step": 41240 }, { "epoch": 4.126, "grad_norm": 2.093743324279785, "learning_rate": 5.874587458745875e-05, "loss": 0.2634, "mean_token_accuracy": 0.948492294549942, "step": 41260 }, { "epoch": 4.128, "grad_norm": 0.5067076683044434, "learning_rate": 5.8725872587258726e-05, "loss": 0.1684, "mean_token_accuracy": 0.9501991450786591, "step": 41280 }, { "epoch": 4.13, "grad_norm": 1.084386944770813, "learning_rate": 5.870587058705871e-05, "loss": 0.2567, "mean_token_accuracy": 0.9525615811347962, "step": 41300 }, { "epoch": 4.132, "grad_norm": 1.0101174116134644, "learning_rate": 5.8685868586858686e-05, "loss": 0.225, "mean_token_accuracy": 0.9526930570602417, "step": 41320 }, { "epoch": 4.134, "grad_norm": 0.6198846697807312, "learning_rate": 5.866586658665867e-05, "loss": 0.2326, "mean_token_accuracy": 0.9482288390398026, "step": 41340 }, { "epoch": 4.136, "grad_norm": 0.43679600954055786, "learning_rate": 5.864586458645865e-05, "loss": 0.2054, "mean_token_accuracy": 0.9451482206583023, "step": 41360 }, { "epoch": 4.138, "grad_norm": 0.5862542390823364, "learning_rate": 5.862586258625863e-05, "loss": 0.1927, "mean_token_accuracy": 0.9510740399360657, "step": 41380 }, { "epoch": 4.14, "grad_norm": 0.5658684968948364, "learning_rate": 5.860586058605861e-05, "loss": 0.2502, "mean_token_accuracy": 0.9492620825767517, "step": 41400 }, { "epoch": 4.142, "grad_norm": 0.4736931622028351, "learning_rate": 5.858585858585859e-05, "loss": 0.2599, "mean_token_accuracy": 0.9400552958250046, "step": 41420 }, { "epoch": 4.144, "grad_norm": 0.5623666644096375, "learning_rate": 5.856585658565858e-05, "loss": 0.2377, "mean_token_accuracy": 0.948515522480011, "step": 41440 }, { "epoch": 4.146, "grad_norm": 0.6730065941810608, "learning_rate": 5.854585458545855e-05, "loss": 0.2648, "mean_token_accuracy": 0.9476835876703262, "step": 41460 }, { "epoch": 4.148, "grad_norm": 0.7132259011268616, "learning_rate": 5.8525852585258537e-05, "loss": 0.2739, "mean_token_accuracy": 0.9552204340696335, "step": 41480 }, { "epoch": 4.15, "grad_norm": 0.5503770709037781, "learning_rate": 5.8505850585058506e-05, "loss": 0.3107, "mean_token_accuracy": 0.9424478948116303, "step": 41500 }, { "epoch": 4.152, "grad_norm": 0.5040830373764038, "learning_rate": 5.8485848584858496e-05, "loss": 0.2689, "mean_token_accuracy": 0.946813914179802, "step": 41520 }, { "epoch": 4.154, "grad_norm": 0.43317776918411255, "learning_rate": 5.8465846584658465e-05, "loss": 0.3047, "mean_token_accuracy": 0.9451870590448379, "step": 41540 }, { "epoch": 4.156, "grad_norm": 0.5035208463668823, "learning_rate": 5.8445844584458455e-05, "loss": 0.2822, "mean_token_accuracy": 0.9492358267307281, "step": 41560 }, { "epoch": 4.158, "grad_norm": 0.4225292205810547, "learning_rate": 5.8425842584258425e-05, "loss": 0.2088, "mean_token_accuracy": 0.9297201365232468, "step": 41580 }, { "epoch": 4.16, "grad_norm": 1.4966809749603271, "learning_rate": 5.8405840584058415e-05, "loss": 0.4042, "mean_token_accuracy": 0.9450295776128769, "step": 41600 }, { "epoch": 4.162, "grad_norm": 0.46085113286972046, "learning_rate": 5.838583858385839e-05, "loss": 0.2157, "mean_token_accuracy": 0.951479709148407, "step": 41620 }, { "epoch": 4.164, "grad_norm": 0.8015799522399902, "learning_rate": 5.8365836583658374e-05, "loss": 0.1939, "mean_token_accuracy": 0.9546682685613632, "step": 41640 }, { "epoch": 4.166, "grad_norm": 0.5036529898643494, "learning_rate": 5.834583458345835e-05, "loss": 0.2522, "mean_token_accuracy": 0.9530010044574737, "step": 41660 }, { "epoch": 4.168, "grad_norm": 0.5836896896362305, "learning_rate": 5.832583258325833e-05, "loss": 0.2327, "mean_token_accuracy": 0.9572021007537842, "step": 41680 }, { "epoch": 4.17, "grad_norm": 2.6578755378723145, "learning_rate": 5.830583058305831e-05, "loss": 0.3173, "mean_token_accuracy": 0.9539605736732483, "step": 41700 }, { "epoch": 4.172, "grad_norm": 0.4493508040904999, "learning_rate": 5.828582858285829e-05, "loss": 0.3725, "mean_token_accuracy": 0.9420162886381149, "step": 41720 }, { "epoch": 4.174, "grad_norm": 0.7266096472740173, "learning_rate": 5.826582658265827e-05, "loss": 0.2946, "mean_token_accuracy": 0.9574705243110657, "step": 41740 }, { "epoch": 4.176, "grad_norm": 0.7140586376190186, "learning_rate": 5.824582458245825e-05, "loss": 0.2269, "mean_token_accuracy": 0.947677007317543, "step": 41760 }, { "epoch": 4.178, "grad_norm": 0.48984211683273315, "learning_rate": 5.822582258225823e-05, "loss": 0.2422, "mean_token_accuracy": 0.9478967368602753, "step": 41780 }, { "epoch": 4.18, "grad_norm": 0.5162603259086609, "learning_rate": 5.820582058205821e-05, "loss": 0.3799, "mean_token_accuracy": 0.9438677132129669, "step": 41800 }, { "epoch": 4.182, "grad_norm": 0.9083994030952454, "learning_rate": 5.818581858185819e-05, "loss": 0.2493, "mean_token_accuracy": 0.9529791593551635, "step": 41820 }, { "epoch": 4.184, "grad_norm": 0.805809497833252, "learning_rate": 5.816581658165817e-05, "loss": 0.2081, "mean_token_accuracy": 0.9519144028425217, "step": 41840 }, { "epoch": 4.186, "grad_norm": 0.370047926902771, "learning_rate": 5.814581458145815e-05, "loss": 0.1795, "mean_token_accuracy": 0.9461367756128312, "step": 41860 }, { "epoch": 4.188, "grad_norm": 0.40202564001083374, "learning_rate": 5.812581258125813e-05, "loss": 0.2422, "mean_token_accuracy": 0.9564118593931198, "step": 41880 }, { "epoch": 4.19, "grad_norm": 0.5427061319351196, "learning_rate": 5.8105810581058106e-05, "loss": 0.2436, "mean_token_accuracy": 0.953515213727951, "step": 41900 }, { "epoch": 4.192, "grad_norm": 0.4648411273956299, "learning_rate": 5.808580858085809e-05, "loss": 0.4277, "mean_token_accuracy": 0.9525188028812408, "step": 41920 }, { "epoch": 4.194, "grad_norm": 0.4529035985469818, "learning_rate": 5.8065806580658065e-05, "loss": 0.2365, "mean_token_accuracy": 0.9540094405412674, "step": 41940 }, { "epoch": 4.196, "grad_norm": 0.49893859028816223, "learning_rate": 5.8045804580458055e-05, "loss": 0.2622, "mean_token_accuracy": 0.9591177195310593, "step": 41960 }, { "epoch": 4.198, "grad_norm": 0.515582799911499, "learning_rate": 5.8025802580258025e-05, "loss": 0.2004, "mean_token_accuracy": 0.953780597448349, "step": 41980 }, { "epoch": 4.2, "grad_norm": 0.3742259740829468, "learning_rate": 5.8005800580058014e-05, "loss": 0.2923, "mean_token_accuracy": 0.9560854822397232, "step": 42000 }, { "epoch": 4.202, "grad_norm": 0.4680871367454529, "learning_rate": 5.7985798579857984e-05, "loss": 0.2834, "mean_token_accuracy": 0.9484363406896591, "step": 42020 }, { "epoch": 4.204, "grad_norm": 0.7428479790687561, "learning_rate": 5.7965796579657974e-05, "loss": 0.1718, "mean_token_accuracy": 0.9543842256069184, "step": 42040 }, { "epoch": 4.206, "grad_norm": 0.4246862530708313, "learning_rate": 5.794579457945794e-05, "loss": 0.3022, "mean_token_accuracy": 0.9478253453969956, "step": 42060 }, { "epoch": 4.208, "grad_norm": 0.5569894313812256, "learning_rate": 5.792579257925793e-05, "loss": 0.2575, "mean_token_accuracy": 0.9440230548381805, "step": 42080 }, { "epoch": 4.21, "grad_norm": 5.480184555053711, "learning_rate": 5.79057905790579e-05, "loss": 0.3168, "mean_token_accuracy": 0.9498941987752915, "step": 42100 }, { "epoch": 4.212, "grad_norm": 0.42477431893348694, "learning_rate": 5.788578857885789e-05, "loss": 0.2231, "mean_token_accuracy": 0.9401930451393128, "step": 42120 }, { "epoch": 4.214, "grad_norm": 0.4564965069293976, "learning_rate": 5.786578657865787e-05, "loss": 0.2292, "mean_token_accuracy": 0.9538596481084823, "step": 42140 }, { "epoch": 4.216, "grad_norm": 0.5547365546226501, "learning_rate": 5.784578457845785e-05, "loss": 0.1851, "mean_token_accuracy": 0.9508047968149185, "step": 42160 }, { "epoch": 4.218, "grad_norm": 0.5177942514419556, "learning_rate": 5.782578257825783e-05, "loss": 0.2316, "mean_token_accuracy": 0.949157327413559, "step": 42180 }, { "epoch": 4.22, "grad_norm": 2.3923521041870117, "learning_rate": 5.780578057805781e-05, "loss": 0.2563, "mean_token_accuracy": 0.9415371984243393, "step": 42200 }, { "epoch": 4.222, "grad_norm": 0.5691554546356201, "learning_rate": 5.778577857785779e-05, "loss": 0.3016, "mean_token_accuracy": 0.9451070100069046, "step": 42220 }, { "epoch": 4.224, "grad_norm": 0.5511828064918518, "learning_rate": 5.776577657765777e-05, "loss": 0.289, "mean_token_accuracy": 0.9434355020523071, "step": 42240 }, { "epoch": 4.226, "grad_norm": 0.5305318832397461, "learning_rate": 5.774577457745775e-05, "loss": 0.301, "mean_token_accuracy": 0.9544847935438157, "step": 42260 }, { "epoch": 4.228, "grad_norm": 0.9290055632591248, "learning_rate": 5.772577257725773e-05, "loss": 0.2893, "mean_token_accuracy": 0.9532301753759385, "step": 42280 }, { "epoch": 4.23, "grad_norm": 12.410210609436035, "learning_rate": 5.7705770577057706e-05, "loss": 0.3523, "mean_token_accuracy": 0.9480721682310105, "step": 42300 }, { "epoch": 4.232, "grad_norm": 5.813937187194824, "learning_rate": 5.768576857685769e-05, "loss": 0.314, "mean_token_accuracy": 0.9463520288467407, "step": 42320 }, { "epoch": 4.234, "grad_norm": 0.6079583168029785, "learning_rate": 5.7665766576657665e-05, "loss": 0.2577, "mean_token_accuracy": 0.9560761034488678, "step": 42340 }, { "epoch": 4.236, "grad_norm": 2.0771360397338867, "learning_rate": 5.764576457645765e-05, "loss": 0.2773, "mean_token_accuracy": 0.949495130777359, "step": 42360 }, { "epoch": 4.2379999999999995, "grad_norm": 0.544366180896759, "learning_rate": 5.7625762576257625e-05, "loss": 0.3556, "mean_token_accuracy": 0.950864651799202, "step": 42380 }, { "epoch": 4.24, "grad_norm": 0.4626167416572571, "learning_rate": 5.760576057605761e-05, "loss": 0.1978, "mean_token_accuracy": 0.9591055065393448, "step": 42400 }, { "epoch": 4.242, "grad_norm": 0.41763830184936523, "learning_rate": 5.7585758575857584e-05, "loss": 0.2363, "mean_token_accuracy": 0.9508707582950592, "step": 42420 }, { "epoch": 4.244, "grad_norm": 0.6205820441246033, "learning_rate": 5.756575657565757e-05, "loss": 0.2867, "mean_token_accuracy": 0.9552483439445496, "step": 42440 }, { "epoch": 4.246, "grad_norm": 0.5696792006492615, "learning_rate": 5.754575457545754e-05, "loss": 0.3228, "mean_token_accuracy": 0.9355537950992584, "step": 42460 }, { "epoch": 4.248, "grad_norm": 0.7173739671707153, "learning_rate": 5.752575257525753e-05, "loss": 0.2916, "mean_token_accuracy": 0.9484154850244522, "step": 42480 }, { "epoch": 4.25, "grad_norm": 0.7025670409202576, "learning_rate": 5.75057505750575e-05, "loss": 0.4432, "mean_token_accuracy": 0.937627774477005, "step": 42500 }, { "epoch": 4.252, "grad_norm": 0.6152819395065308, "learning_rate": 5.748574857485749e-05, "loss": 0.2476, "mean_token_accuracy": 0.9513394087553024, "step": 42520 }, { "epoch": 4.254, "grad_norm": 1.3019510507583618, "learning_rate": 5.746574657465746e-05, "loss": 0.2677, "mean_token_accuracy": 0.9488348543643952, "step": 42540 }, { "epoch": 4.256, "grad_norm": 0.4510754942893982, "learning_rate": 5.744574457445745e-05, "loss": 0.2593, "mean_token_accuracy": 0.9513588964939117, "step": 42560 }, { "epoch": 4.258, "grad_norm": 0.45911383628845215, "learning_rate": 5.742574257425742e-05, "loss": 0.2749, "mean_token_accuracy": 0.9505417823791504, "step": 42580 }, { "epoch": 4.26, "grad_norm": 0.3833797574043274, "learning_rate": 5.740574057405741e-05, "loss": 0.2607, "mean_token_accuracy": 0.9534167587757111, "step": 42600 }, { "epoch": 4.2620000000000005, "grad_norm": 0.7699018120765686, "learning_rate": 5.738573857385738e-05, "loss": 0.2279, "mean_token_accuracy": 0.9502296775579453, "step": 42620 }, { "epoch": 4.264, "grad_norm": 0.45561540126800537, "learning_rate": 5.736573657365737e-05, "loss": 0.3283, "mean_token_accuracy": 0.9474566906690598, "step": 42640 }, { "epoch": 4.266, "grad_norm": 0.5706644058227539, "learning_rate": 5.7345734573457347e-05, "loss": 0.2764, "mean_token_accuracy": 0.9374931544065476, "step": 42660 }, { "epoch": 4.268, "grad_norm": 1.09022855758667, "learning_rate": 5.732573257325733e-05, "loss": 0.2331, "mean_token_accuracy": 0.951859387755394, "step": 42680 }, { "epoch": 4.27, "grad_norm": 0.2916398048400879, "learning_rate": 5.7305730573057306e-05, "loss": 0.2356, "mean_token_accuracy": 0.9468427836894989, "step": 42700 }, { "epoch": 4.272, "grad_norm": 0.797853410243988, "learning_rate": 5.728572857285729e-05, "loss": 0.2492, "mean_token_accuracy": 0.9526392489671707, "step": 42720 }, { "epoch": 4.274, "grad_norm": 0.3814660608768463, "learning_rate": 5.7265726572657265e-05, "loss": 0.2777, "mean_token_accuracy": 0.9473558723926544, "step": 42740 }, { "epoch": 4.276, "grad_norm": 0.5550805926322937, "learning_rate": 5.724572457245725e-05, "loss": 0.2124, "mean_token_accuracy": 0.9413058757781982, "step": 42760 }, { "epoch": 4.2780000000000005, "grad_norm": 0.643577516078949, "learning_rate": 5.7225722572257225e-05, "loss": 0.3269, "mean_token_accuracy": 0.94885755777359, "step": 42780 }, { "epoch": 4.28, "grad_norm": 0.3156307637691498, "learning_rate": 5.720572057205721e-05, "loss": 0.1738, "mean_token_accuracy": 0.9460732132196427, "step": 42800 }, { "epoch": 4.282, "grad_norm": 0.415195107460022, "learning_rate": 5.7185718571857184e-05, "loss": 0.3149, "mean_token_accuracy": 0.9405674934387207, "step": 42820 }, { "epoch": 4.284, "grad_norm": 0.32569459080696106, "learning_rate": 5.716571657165717e-05, "loss": 0.4229, "mean_token_accuracy": 0.9442908793687821, "step": 42840 }, { "epoch": 4.286, "grad_norm": 0.44022640585899353, "learning_rate": 5.714571457145714e-05, "loss": 0.2891, "mean_token_accuracy": 0.9489710599184036, "step": 42860 }, { "epoch": 4.288, "grad_norm": 0.514324963092804, "learning_rate": 5.7125712571257126e-05, "loss": 0.2283, "mean_token_accuracy": 0.9449921220541, "step": 42880 }, { "epoch": 4.29, "grad_norm": 0.4084415137767792, "learning_rate": 5.7105710571057116e-05, "loss": 0.2789, "mean_token_accuracy": 0.9483839839696884, "step": 42900 }, { "epoch": 4.292, "grad_norm": 1.3273847103118896, "learning_rate": 5.7085708570857086e-05, "loss": 0.2883, "mean_token_accuracy": 0.9508475184440612, "step": 42920 }, { "epoch": 4.294, "grad_norm": 1.8245376348495483, "learning_rate": 5.7065706570657075e-05, "loss": 0.2438, "mean_token_accuracy": 0.9528135269880295, "step": 42940 }, { "epoch": 4.296, "grad_norm": 0.6946828961372375, "learning_rate": 5.7045704570457045e-05, "loss": 0.2977, "mean_token_accuracy": 0.9493881821632385, "step": 42960 }, { "epoch": 4.298, "grad_norm": 0.5459950566291809, "learning_rate": 5.7025702570257035e-05, "loss": 0.2122, "mean_token_accuracy": 0.9566958487033844, "step": 42980 }, { "epoch": 4.3, "grad_norm": 0.38602501153945923, "learning_rate": 5.700570057005701e-05, "loss": 0.2415, "mean_token_accuracy": 0.9593441307544708, "step": 43000 }, { "epoch": 4.302, "grad_norm": 0.4263528287410736, "learning_rate": 5.6985698569856994e-05, "loss": 0.2109, "mean_token_accuracy": 0.9437270581722259, "step": 43020 }, { "epoch": 4.304, "grad_norm": 0.6519758701324463, "learning_rate": 5.696569656965697e-05, "loss": 0.2215, "mean_token_accuracy": 0.9571756899356842, "step": 43040 }, { "epoch": 4.306, "grad_norm": 0.5619766116142273, "learning_rate": 5.694569456945695e-05, "loss": 0.2312, "mean_token_accuracy": 0.9561039924621582, "step": 43060 }, { "epoch": 4.308, "grad_norm": 0.43109509348869324, "learning_rate": 5.692569256925693e-05, "loss": 0.2961, "mean_token_accuracy": 0.95396488904953, "step": 43080 }, { "epoch": 4.31, "grad_norm": 0.7328852415084839, "learning_rate": 5.690569056905691e-05, "loss": 0.2426, "mean_token_accuracy": 0.9375558227300644, "step": 43100 }, { "epoch": 4.312, "grad_norm": 1.5356978178024292, "learning_rate": 5.688568856885689e-05, "loss": 0.2342, "mean_token_accuracy": 0.950673621892929, "step": 43120 }, { "epoch": 4.314, "grad_norm": 0.5724785327911377, "learning_rate": 5.686568656865687e-05, "loss": 0.3328, "mean_token_accuracy": 0.9443675726652145, "step": 43140 }, { "epoch": 4.316, "grad_norm": 0.5491111278533936, "learning_rate": 5.684568456845685e-05, "loss": 0.3331, "mean_token_accuracy": 0.9450695514678955, "step": 43160 }, { "epoch": 4.318, "grad_norm": 0.5358384847640991, "learning_rate": 5.682568256825683e-05, "loss": 0.2773, "mean_token_accuracy": 0.9463579028844833, "step": 43180 }, { "epoch": 4.32, "grad_norm": 0.5838424563407898, "learning_rate": 5.680568056805681e-05, "loss": 0.3199, "mean_token_accuracy": 0.9367333590984345, "step": 43200 }, { "epoch": 4.322, "grad_norm": 0.45379018783569336, "learning_rate": 5.678567856785679e-05, "loss": 0.2844, "mean_token_accuracy": 0.9521439433097839, "step": 43220 }, { "epoch": 4.324, "grad_norm": 0.5357297658920288, "learning_rate": 5.676567656765677e-05, "loss": 0.3991, "mean_token_accuracy": 0.9356542974710464, "step": 43240 }, { "epoch": 4.326, "grad_norm": 0.5156724452972412, "learning_rate": 5.674567456745675e-05, "loss": 0.1657, "mean_token_accuracy": 0.9574740171432495, "step": 43260 }, { "epoch": 4.328, "grad_norm": 0.43905726075172424, "learning_rate": 5.6725672567256726e-05, "loss": 0.171, "mean_token_accuracy": 0.9516792714595794, "step": 43280 }, { "epoch": 4.33, "grad_norm": 1.9446085691452026, "learning_rate": 5.670567056705671e-05, "loss": 0.2445, "mean_token_accuracy": 0.9399393945932388, "step": 43300 }, { "epoch": 4.332, "grad_norm": 0.4291374981403351, "learning_rate": 5.6685668566856685e-05, "loss": 0.2962, "mean_token_accuracy": 0.948683425784111, "step": 43320 }, { "epoch": 4.334, "grad_norm": 3.140251874923706, "learning_rate": 5.6665666566656675e-05, "loss": 0.3081, "mean_token_accuracy": 0.9564166605472565, "step": 43340 }, { "epoch": 4.336, "grad_norm": 0.537321150302887, "learning_rate": 5.6645664566456645e-05, "loss": 0.2291, "mean_token_accuracy": 0.9512440264225006, "step": 43360 }, { "epoch": 4.338, "grad_norm": 0.3375115990638733, "learning_rate": 5.6625662566256635e-05, "loss": 0.2091, "mean_token_accuracy": 0.9572125464677811, "step": 43380 }, { "epoch": 4.34, "grad_norm": 0.5528057813644409, "learning_rate": 5.6605660566056604e-05, "loss": 0.2253, "mean_token_accuracy": 0.9452495276927948, "step": 43400 }, { "epoch": 4.342, "grad_norm": 0.6303374171257019, "learning_rate": 5.6585658565856594e-05, "loss": 0.2344, "mean_token_accuracy": 0.9534285873174667, "step": 43420 }, { "epoch": 4.344, "grad_norm": 0.701657235622406, "learning_rate": 5.6565656565656563e-05, "loss": 0.164, "mean_token_accuracy": 0.9544565647840499, "step": 43440 }, { "epoch": 4.346, "grad_norm": 0.6154728531837463, "learning_rate": 5.654565456545655e-05, "loss": 0.1998, "mean_token_accuracy": 0.9571744620800018, "step": 43460 }, { "epoch": 4.348, "grad_norm": 0.401607483625412, "learning_rate": 5.652565256525652e-05, "loss": 0.2211, "mean_token_accuracy": 0.94394551217556, "step": 43480 }, { "epoch": 4.35, "grad_norm": 0.5385695695877075, "learning_rate": 5.650565056505651e-05, "loss": 0.2437, "mean_token_accuracy": 0.959339314699173, "step": 43500 }, { "epoch": 4.352, "grad_norm": 4.462339401245117, "learning_rate": 5.648564856485649e-05, "loss": 0.2553, "mean_token_accuracy": 0.9412847399711609, "step": 43520 }, { "epoch": 4.354, "grad_norm": 1.3696494102478027, "learning_rate": 5.646564656465647e-05, "loss": 0.1904, "mean_token_accuracy": 0.9527248114347457, "step": 43540 }, { "epoch": 4.356, "grad_norm": 0.4667107164859772, "learning_rate": 5.644564456445645e-05, "loss": 0.2265, "mean_token_accuracy": 0.947400838136673, "step": 43560 }, { "epoch": 4.358, "grad_norm": 0.4541124701499939, "learning_rate": 5.642564256425643e-05, "loss": 0.2499, "mean_token_accuracy": 0.9489735901355744, "step": 43580 }, { "epoch": 4.36, "grad_norm": 0.4087500274181366, "learning_rate": 5.640564056405641e-05, "loss": 0.1598, "mean_token_accuracy": 0.9575030833482743, "step": 43600 }, { "epoch": 4.362, "grad_norm": 0.5867382287979126, "learning_rate": 5.638563856385639e-05, "loss": 0.2464, "mean_token_accuracy": 0.9493691921234131, "step": 43620 }, { "epoch": 4.364, "grad_norm": 0.8415071368217468, "learning_rate": 5.636563656365637e-05, "loss": 0.3457, "mean_token_accuracy": 0.9443661063909531, "step": 43640 }, { "epoch": 4.366, "grad_norm": 0.529693067073822, "learning_rate": 5.634563456345635e-05, "loss": 0.1399, "mean_token_accuracy": 0.9418334841728211, "step": 43660 }, { "epoch": 4.368, "grad_norm": 0.4624863862991333, "learning_rate": 5.6325632563256326e-05, "loss": 0.2127, "mean_token_accuracy": 0.9531038075685501, "step": 43680 }, { "epoch": 4.37, "grad_norm": 0.4766046404838562, "learning_rate": 5.630563056305631e-05, "loss": 0.2477, "mean_token_accuracy": 0.9505458116531372, "step": 43700 }, { "epoch": 4.372, "grad_norm": 0.44884440302848816, "learning_rate": 5.6285628562856285e-05, "loss": 0.2545, "mean_token_accuracy": 0.9433784544467926, "step": 43720 }, { "epoch": 4.374, "grad_norm": 0.4839913249015808, "learning_rate": 5.626562656265627e-05, "loss": 0.2671, "mean_token_accuracy": 0.9503501147031784, "step": 43740 }, { "epoch": 4.376, "grad_norm": 0.4606853723526001, "learning_rate": 5.6245624562456245e-05, "loss": 0.3646, "mean_token_accuracy": 0.951855731010437, "step": 43760 }, { "epoch": 4.378, "grad_norm": 0.8597580790519714, "learning_rate": 5.622562256225623e-05, "loss": 0.2894, "mean_token_accuracy": 0.9532365471124649, "step": 43780 }, { "epoch": 4.38, "grad_norm": 0.5115299820899963, "learning_rate": 5.6205620562056204e-05, "loss": 0.2472, "mean_token_accuracy": 0.9520283043384552, "step": 43800 }, { "epoch": 4.382, "grad_norm": 1.8732072114944458, "learning_rate": 5.618561856185619e-05, "loss": 0.2202, "mean_token_accuracy": 0.9543524295091629, "step": 43820 }, { "epoch": 4.384, "grad_norm": 0.4040198028087616, "learning_rate": 5.616561656165616e-05, "loss": 0.241, "mean_token_accuracy": 0.9487646341323852, "step": 43840 }, { "epoch": 4.386, "grad_norm": 0.5425691604614258, "learning_rate": 5.614561456145615e-05, "loss": 0.2462, "mean_token_accuracy": 0.9562036544084549, "step": 43860 }, { "epoch": 4.388, "grad_norm": 0.48583173751831055, "learning_rate": 5.612561256125612e-05, "loss": 0.178, "mean_token_accuracy": 0.953959721326828, "step": 43880 }, { "epoch": 4.39, "grad_norm": 0.600719153881073, "learning_rate": 5.610561056105611e-05, "loss": 0.3255, "mean_token_accuracy": 0.9442650616168976, "step": 43900 }, { "epoch": 4.392, "grad_norm": 0.39495500922203064, "learning_rate": 5.608560856085608e-05, "loss": 0.2022, "mean_token_accuracy": 0.9500107377767563, "step": 43920 }, { "epoch": 4.394, "grad_norm": 0.4559065103530884, "learning_rate": 5.606560656065607e-05, "loss": 0.3765, "mean_token_accuracy": 0.9380100339651107, "step": 43940 }, { "epoch": 4.396, "grad_norm": 0.5745212435722351, "learning_rate": 5.604560456045604e-05, "loss": 0.2552, "mean_token_accuracy": 0.9434156149625779, "step": 43960 }, { "epoch": 4.398, "grad_norm": 0.6221771240234375, "learning_rate": 5.602560256025603e-05, "loss": 0.2556, "mean_token_accuracy": 0.9506140559911728, "step": 43980 }, { "epoch": 4.4, "grad_norm": 0.6060522198677063, "learning_rate": 5.600560056005601e-05, "loss": 0.3603, "mean_token_accuracy": 0.9487789452075959, "step": 44000 }, { "epoch": 4.402, "grad_norm": 0.3680565357208252, "learning_rate": 5.598559855985599e-05, "loss": 0.3746, "mean_token_accuracy": 0.9506963223218918, "step": 44020 }, { "epoch": 4.404, "grad_norm": 0.7484232783317566, "learning_rate": 5.596559655965597e-05, "loss": 0.2611, "mean_token_accuracy": 0.9507825195789337, "step": 44040 }, { "epoch": 4.406, "grad_norm": 0.5432941913604736, "learning_rate": 5.594559455945595e-05, "loss": 0.3439, "mean_token_accuracy": 0.9416364133358002, "step": 44060 }, { "epoch": 4.408, "grad_norm": 0.4499337077140808, "learning_rate": 5.5925592559255926e-05, "loss": 0.1841, "mean_token_accuracy": 0.9458434253931045, "step": 44080 }, { "epoch": 4.41, "grad_norm": 0.5463444590568542, "learning_rate": 5.590559055905591e-05, "loss": 0.1753, "mean_token_accuracy": 0.9542170822620392, "step": 44100 }, { "epoch": 4.412, "grad_norm": 0.5639700293540955, "learning_rate": 5.5885588558855885e-05, "loss": 0.1562, "mean_token_accuracy": 0.9478236347436905, "step": 44120 }, { "epoch": 4.414, "grad_norm": 0.5579461455345154, "learning_rate": 5.586558655865587e-05, "loss": 0.3331, "mean_token_accuracy": 0.9404454231262207, "step": 44140 }, { "epoch": 4.416, "grad_norm": 0.5288747549057007, "learning_rate": 5.5845584558455845e-05, "loss": 0.2601, "mean_token_accuracy": 0.9446564465761185, "step": 44160 }, { "epoch": 4.418, "grad_norm": 4.901015281677246, "learning_rate": 5.582558255825583e-05, "loss": 0.3154, "mean_token_accuracy": 0.9405314803123475, "step": 44180 }, { "epoch": 4.42, "grad_norm": 2.6376752853393555, "learning_rate": 5.5805580558055804e-05, "loss": 0.2989, "mean_token_accuracy": 0.9391706049442291, "step": 44200 }, { "epoch": 4.422, "grad_norm": 0.5209048986434937, "learning_rate": 5.578557855785579e-05, "loss": 0.205, "mean_token_accuracy": 0.9515407413244248, "step": 44220 }, { "epoch": 4.424, "grad_norm": 0.5449560284614563, "learning_rate": 5.576557655765576e-05, "loss": 0.3343, "mean_token_accuracy": 0.9519144028425217, "step": 44240 }, { "epoch": 4.426, "grad_norm": 0.5911797881126404, "learning_rate": 5.5745574557455746e-05, "loss": 0.4055, "mean_token_accuracy": 0.9447562277317048, "step": 44260 }, { "epoch": 4.428, "grad_norm": 0.492829293012619, "learning_rate": 5.572557255725572e-05, "loss": 0.4225, "mean_token_accuracy": 0.9454954475164413, "step": 44280 }, { "epoch": 4.43, "grad_norm": 0.5155919790267944, "learning_rate": 5.5705570557055706e-05, "loss": 0.2425, "mean_token_accuracy": 0.9448281317949295, "step": 44300 }, { "epoch": 4.432, "grad_norm": 3.6645145416259766, "learning_rate": 5.568556855685568e-05, "loss": 0.3729, "mean_token_accuracy": 0.9442078173160553, "step": 44320 }, { "epoch": 4.434, "grad_norm": 0.48425132036209106, "learning_rate": 5.5665566556655665e-05, "loss": 0.2621, "mean_token_accuracy": 0.943002125620842, "step": 44340 }, { "epoch": 4.436, "grad_norm": 0.5479345917701721, "learning_rate": 5.564556455645564e-05, "loss": 0.2025, "mean_token_accuracy": 0.9443637728691101, "step": 44360 }, { "epoch": 4.438, "grad_norm": 2.5827176570892334, "learning_rate": 5.562556255625563e-05, "loss": 0.2898, "mean_token_accuracy": 0.94475017786026, "step": 44380 }, { "epoch": 4.44, "grad_norm": 0.3915630280971527, "learning_rate": 5.5605560556055614e-05, "loss": 0.2704, "mean_token_accuracy": 0.9435435205698013, "step": 44400 }, { "epoch": 4.442, "grad_norm": 0.5002061724662781, "learning_rate": 5.558555855585559e-05, "loss": 0.3289, "mean_token_accuracy": 0.9495557934045792, "step": 44420 }, { "epoch": 4.444, "grad_norm": 0.7180065512657166, "learning_rate": 5.5565556555655573e-05, "loss": 0.1909, "mean_token_accuracy": 0.9544135898351669, "step": 44440 }, { "epoch": 4.446, "grad_norm": 1.0593832731246948, "learning_rate": 5.554555455545555e-05, "loss": 0.239, "mean_token_accuracy": 0.9582737386226654, "step": 44460 }, { "epoch": 4.448, "grad_norm": 0.4139302968978882, "learning_rate": 5.552555255525553e-05, "loss": 0.4027, "mean_token_accuracy": 0.9305232167243958, "step": 44480 }, { "epoch": 4.45, "grad_norm": 0.4892943501472473, "learning_rate": 5.550555055505551e-05, "loss": 0.244, "mean_token_accuracy": 0.9599599331617356, "step": 44500 }, { "epoch": 4.452, "grad_norm": 0.5320159792900085, "learning_rate": 5.548554855485549e-05, "loss": 0.1872, "mean_token_accuracy": 0.9529474198818206, "step": 44520 }, { "epoch": 4.454, "grad_norm": 0.5171449184417725, "learning_rate": 5.546554655465547e-05, "loss": 0.2371, "mean_token_accuracy": 0.9470263212919235, "step": 44540 }, { "epoch": 4.456, "grad_norm": 0.4821256697177887, "learning_rate": 5.544554455445545e-05, "loss": 0.2179, "mean_token_accuracy": 0.9542043507099152, "step": 44560 }, { "epoch": 4.458, "grad_norm": 0.4151834547519684, "learning_rate": 5.542554255425543e-05, "loss": 0.251, "mean_token_accuracy": 0.9541057914495468, "step": 44580 }, { "epoch": 4.46, "grad_norm": 0.5227431058883667, "learning_rate": 5.540554055405541e-05, "loss": 0.5408, "mean_token_accuracy": 0.9402148187160492, "step": 44600 }, { "epoch": 4.462, "grad_norm": 0.5250457525253296, "learning_rate": 5.538553855385539e-05, "loss": 0.2941, "mean_token_accuracy": 0.9520050346851349, "step": 44620 }, { "epoch": 4.464, "grad_norm": 0.8260389566421509, "learning_rate": 5.536553655365537e-05, "loss": 0.2861, "mean_token_accuracy": 0.9552899956703186, "step": 44640 }, { "epoch": 4.466, "grad_norm": 0.540470540523529, "learning_rate": 5.5345534553455346e-05, "loss": 0.2402, "mean_token_accuracy": 0.9436097890138626, "step": 44660 }, { "epoch": 4.468, "grad_norm": 0.5240906476974487, "learning_rate": 5.532553255325533e-05, "loss": 0.3148, "mean_token_accuracy": 0.9448235720396042, "step": 44680 }, { "epoch": 4.47, "grad_norm": 0.6227112412452698, "learning_rate": 5.5305530553055306e-05, "loss": 0.2471, "mean_token_accuracy": 0.9581371575593949, "step": 44700 }, { "epoch": 4.4719999999999995, "grad_norm": 0.46325749158859253, "learning_rate": 5.5285528552855295e-05, "loss": 0.2549, "mean_token_accuracy": 0.950892984867096, "step": 44720 }, { "epoch": 4.474, "grad_norm": 1.8632752895355225, "learning_rate": 5.5265526552655265e-05, "loss": 0.187, "mean_token_accuracy": 0.950791385769844, "step": 44740 }, { "epoch": 4.476, "grad_norm": 0.7479382157325745, "learning_rate": 5.5245524552455255e-05, "loss": 0.3812, "mean_token_accuracy": 0.950472629070282, "step": 44760 }, { "epoch": 4.478, "grad_norm": 0.47199249267578125, "learning_rate": 5.5225522552255224e-05, "loss": 0.3357, "mean_token_accuracy": 0.942318731546402, "step": 44780 }, { "epoch": 4.48, "grad_norm": 0.8228979110717773, "learning_rate": 5.5205520552055214e-05, "loss": 0.2747, "mean_token_accuracy": 0.9414202511310578, "step": 44800 }, { "epoch": 4.482, "grad_norm": 0.5188085436820984, "learning_rate": 5.5185518551855184e-05, "loss": 0.2914, "mean_token_accuracy": 0.9532766431570053, "step": 44820 }, { "epoch": 4.484, "grad_norm": 0.5744419097900391, "learning_rate": 5.5165516551655173e-05, "loss": 0.2402, "mean_token_accuracy": 0.9490375459194184, "step": 44840 }, { "epoch": 4.486, "grad_norm": 0.7943609356880188, "learning_rate": 5.514551455145515e-05, "loss": 0.4332, "mean_token_accuracy": 0.9532684236764908, "step": 44860 }, { "epoch": 4.4879999999999995, "grad_norm": 0.6814321279525757, "learning_rate": 5.512551255125513e-05, "loss": 0.1855, "mean_token_accuracy": 0.944426029920578, "step": 44880 }, { "epoch": 4.49, "grad_norm": 0.6985943913459778, "learning_rate": 5.510551055105511e-05, "loss": 0.2388, "mean_token_accuracy": 0.947109979391098, "step": 44900 }, { "epoch": 4.492, "grad_norm": 0.6005276441574097, "learning_rate": 5.508550855085509e-05, "loss": 0.207, "mean_token_accuracy": 0.9485108256340027, "step": 44920 }, { "epoch": 4.494, "grad_norm": 0.48027151823043823, "learning_rate": 5.506550655065507e-05, "loss": 0.2022, "mean_token_accuracy": 0.9533713161945343, "step": 44940 }, { "epoch": 4.496, "grad_norm": 0.464248925447464, "learning_rate": 5.504550455045505e-05, "loss": 0.3175, "mean_token_accuracy": 0.9460115164518357, "step": 44960 }, { "epoch": 4.498, "grad_norm": 0.530166745185852, "learning_rate": 5.502550255025503e-05, "loss": 0.3384, "mean_token_accuracy": 0.9461131513118743, "step": 44980 }, { "epoch": 4.5, "grad_norm": 0.47669851779937744, "learning_rate": 5.500550055005501e-05, "loss": 0.3491, "mean_token_accuracy": 0.9462127268314362, "step": 45000 }, { "epoch": 4.502, "grad_norm": 0.50117427110672, "learning_rate": 5.498549854985499e-05, "loss": 0.2451, "mean_token_accuracy": 0.9511123090982437, "step": 45020 }, { "epoch": 4.504, "grad_norm": 0.5778023600578308, "learning_rate": 5.496549654965497e-05, "loss": 0.2706, "mean_token_accuracy": 0.9564693659543991, "step": 45040 }, { "epoch": 4.506, "grad_norm": 0.4594550132751465, "learning_rate": 5.4945494549454946e-05, "loss": 0.3323, "mean_token_accuracy": 0.9478718966245652, "step": 45060 }, { "epoch": 4.508, "grad_norm": 0.621816873550415, "learning_rate": 5.492549254925493e-05, "loss": 0.3257, "mean_token_accuracy": 0.9395396918058395, "step": 45080 }, { "epoch": 4.51, "grad_norm": 0.6187944412231445, "learning_rate": 5.4905490549054906e-05, "loss": 0.2569, "mean_token_accuracy": 0.952342739701271, "step": 45100 }, { "epoch": 4.5120000000000005, "grad_norm": 0.43519413471221924, "learning_rate": 5.488548854885489e-05, "loss": 0.2302, "mean_token_accuracy": 0.9415944784879684, "step": 45120 }, { "epoch": 4.514, "grad_norm": 0.37049436569213867, "learning_rate": 5.4865486548654865e-05, "loss": 0.225, "mean_token_accuracy": 0.9485164761543274, "step": 45140 }, { "epoch": 4.516, "grad_norm": 0.5732784271240234, "learning_rate": 5.484548454845485e-05, "loss": 0.2368, "mean_token_accuracy": 0.9511556059122086, "step": 45160 }, { "epoch": 4.518, "grad_norm": 0.8531704545021057, "learning_rate": 5.4825482548254824e-05, "loss": 0.172, "mean_token_accuracy": 0.9507166266441345, "step": 45180 }, { "epoch": 4.52, "grad_norm": 0.3922710716724396, "learning_rate": 5.480548054805481e-05, "loss": 0.1459, "mean_token_accuracy": 0.9563775688409806, "step": 45200 }, { "epoch": 4.522, "grad_norm": 0.38241639733314514, "learning_rate": 5.4785478547854784e-05, "loss": 0.3659, "mean_token_accuracy": 0.9401970475912094, "step": 45220 }, { "epoch": 4.524, "grad_norm": 0.5706650614738464, "learning_rate": 5.476547654765477e-05, "loss": 0.2694, "mean_token_accuracy": 0.9475147128105164, "step": 45240 }, { "epoch": 4.526, "grad_norm": 1.600840449333191, "learning_rate": 5.474547454745474e-05, "loss": 0.332, "mean_token_accuracy": 0.959752231836319, "step": 45260 }, { "epoch": 4.5280000000000005, "grad_norm": 0.41582244634628296, "learning_rate": 5.472547254725473e-05, "loss": 0.196, "mean_token_accuracy": 0.9440909534692764, "step": 45280 }, { "epoch": 4.53, "grad_norm": 1.5277507305145264, "learning_rate": 5.47054705470547e-05, "loss": 0.2071, "mean_token_accuracy": 0.9509551912546158, "step": 45300 }, { "epoch": 4.532, "grad_norm": 1.3702914714813232, "learning_rate": 5.468546854685469e-05, "loss": 0.2108, "mean_token_accuracy": 0.9535886228084565, "step": 45320 }, { "epoch": 4.534, "grad_norm": 0.5348719358444214, "learning_rate": 5.466546654665466e-05, "loss": 0.3524, "mean_token_accuracy": 0.9472041964530945, "step": 45340 }, { "epoch": 4.536, "grad_norm": 0.4929870665073395, "learning_rate": 5.464546454645465e-05, "loss": 0.2499, "mean_token_accuracy": 0.9534244537353516, "step": 45360 }, { "epoch": 4.538, "grad_norm": 0.43807801604270935, "learning_rate": 5.462546254625463e-05, "loss": 0.2205, "mean_token_accuracy": 0.9502074301242829, "step": 45380 }, { "epoch": 4.54, "grad_norm": 0.491524875164032, "learning_rate": 5.460546054605461e-05, "loss": 0.2275, "mean_token_accuracy": 0.9521450728178025, "step": 45400 }, { "epoch": 4.542, "grad_norm": 0.40545904636383057, "learning_rate": 5.458545854585459e-05, "loss": 0.3363, "mean_token_accuracy": 0.9377736568450927, "step": 45420 }, { "epoch": 4.5440000000000005, "grad_norm": 0.760547399520874, "learning_rate": 5.456545654565457e-05, "loss": 0.2779, "mean_token_accuracy": 0.9565403938293457, "step": 45440 }, { "epoch": 4.546, "grad_norm": 0.6351425051689148, "learning_rate": 5.4545454545454546e-05, "loss": 0.263, "mean_token_accuracy": 0.9530819743871689, "step": 45460 }, { "epoch": 4.548, "grad_norm": 0.8572994470596313, "learning_rate": 5.452545254525453e-05, "loss": 0.2782, "mean_token_accuracy": 0.9530787378549576, "step": 45480 }, { "epoch": 4.55, "grad_norm": 0.5443311333656311, "learning_rate": 5.4505450545054506e-05, "loss": 0.2133, "mean_token_accuracy": 0.9539451897144318, "step": 45500 }, { "epoch": 4.552, "grad_norm": 0.43352392315864563, "learning_rate": 5.448544854485449e-05, "loss": 0.3501, "mean_token_accuracy": 0.9523571312427521, "step": 45520 }, { "epoch": 4.554, "grad_norm": 0.4121265113353729, "learning_rate": 5.4465446544654465e-05, "loss": 0.1486, "mean_token_accuracy": 0.9541467398405075, "step": 45540 }, { "epoch": 4.556, "grad_norm": 0.5954023599624634, "learning_rate": 5.444544454445445e-05, "loss": 0.2398, "mean_token_accuracy": 0.9390344947576523, "step": 45560 }, { "epoch": 4.558, "grad_norm": 0.9226826429367065, "learning_rate": 5.4425442544254424e-05, "loss": 0.2175, "mean_token_accuracy": 0.9481313258409501, "step": 45580 }, { "epoch": 4.5600000000000005, "grad_norm": 0.939606249332428, "learning_rate": 5.440544054405441e-05, "loss": 0.2424, "mean_token_accuracy": 0.9517650157213211, "step": 45600 }, { "epoch": 4.562, "grad_norm": 0.5062436461448669, "learning_rate": 5.4385438543854383e-05, "loss": 0.2203, "mean_token_accuracy": 0.9472890466451644, "step": 45620 }, { "epoch": 4.564, "grad_norm": 0.8488078713417053, "learning_rate": 5.4365436543654367e-05, "loss": 0.3579, "mean_token_accuracy": 0.956934580206871, "step": 45640 }, { "epoch": 4.566, "grad_norm": 0.4946453869342804, "learning_rate": 5.434543454345434e-05, "loss": 0.3797, "mean_token_accuracy": 0.9356310725212097, "step": 45660 }, { "epoch": 4.568, "grad_norm": 0.5200145244598389, "learning_rate": 5.4325432543254326e-05, "loss": 0.1954, "mean_token_accuracy": 0.9498382627964019, "step": 45680 }, { "epoch": 4.57, "grad_norm": 0.3912155032157898, "learning_rate": 5.43054305430543e-05, "loss": 0.2222, "mean_token_accuracy": 0.951872730255127, "step": 45700 }, { "epoch": 4.572, "grad_norm": 0.5132560729980469, "learning_rate": 5.428542854285429e-05, "loss": 0.3032, "mean_token_accuracy": 0.9358177900314331, "step": 45720 }, { "epoch": 4.574, "grad_norm": 0.36302903294563293, "learning_rate": 5.426542654265426e-05, "loss": 0.3192, "mean_token_accuracy": 0.9461145788431168, "step": 45740 }, { "epoch": 4.576, "grad_norm": 0.4604845345020294, "learning_rate": 5.424542454245425e-05, "loss": 0.2668, "mean_token_accuracy": 0.9393214851617813, "step": 45760 }, { "epoch": 4.578, "grad_norm": 105.40351104736328, "learning_rate": 5.422542254225422e-05, "loss": 0.3404, "mean_token_accuracy": 0.9484212458133697, "step": 45780 }, { "epoch": 4.58, "grad_norm": 1.040514349937439, "learning_rate": 5.420542054205421e-05, "loss": 0.2716, "mean_token_accuracy": 0.9450879752635956, "step": 45800 }, { "epoch": 4.582, "grad_norm": 0.44848453998565674, "learning_rate": 5.418541854185418e-05, "loss": 0.3202, "mean_token_accuracy": 0.9431906908750534, "step": 45820 }, { "epoch": 4.584, "grad_norm": 0.5581626296043396, "learning_rate": 5.416541654165417e-05, "loss": 0.2949, "mean_token_accuracy": 0.950212049484253, "step": 45840 }, { "epoch": 4.586, "grad_norm": 2.854315757751465, "learning_rate": 5.414541454145414e-05, "loss": 0.3658, "mean_token_accuracy": 0.9379511207342148, "step": 45860 }, { "epoch": 4.588, "grad_norm": 0.899622917175293, "learning_rate": 5.412541254125413e-05, "loss": 0.2242, "mean_token_accuracy": 0.9513896971940994, "step": 45880 }, { "epoch": 4.59, "grad_norm": 0.5616745352745056, "learning_rate": 5.410541054105411e-05, "loss": 0.1949, "mean_token_accuracy": 0.951519501209259, "step": 45900 }, { "epoch": 4.592, "grad_norm": 0.475714772939682, "learning_rate": 5.408540854085409e-05, "loss": 0.1959, "mean_token_accuracy": 0.9550049066543579, "step": 45920 }, { "epoch": 4.594, "grad_norm": 0.626192033290863, "learning_rate": 5.406540654065407e-05, "loss": 0.3674, "mean_token_accuracy": 0.9577156126499176, "step": 45940 }, { "epoch": 4.596, "grad_norm": 0.6244730949401855, "learning_rate": 5.404540454045405e-05, "loss": 0.2336, "mean_token_accuracy": 0.9529500633478165, "step": 45960 }, { "epoch": 4.598, "grad_norm": 0.4084434509277344, "learning_rate": 5.402540254025403e-05, "loss": 0.2753, "mean_token_accuracy": 0.9543772578239441, "step": 45980 }, { "epoch": 4.6, "grad_norm": 0.4994432032108307, "learning_rate": 5.400540054005401e-05, "loss": 0.2277, "mean_token_accuracy": 0.9503443837165833, "step": 46000 }, { "epoch": 4.602, "grad_norm": 0.5252498388290405, "learning_rate": 5.398539853985399e-05, "loss": 0.2642, "mean_token_accuracy": 0.9569629222154618, "step": 46020 }, { "epoch": 4.604, "grad_norm": 4.17717170715332, "learning_rate": 5.3965396539653966e-05, "loss": 0.3063, "mean_token_accuracy": 0.9383350580930709, "step": 46040 }, { "epoch": 4.606, "grad_norm": 0.33387184143066406, "learning_rate": 5.394539453945395e-05, "loss": 0.2544, "mean_token_accuracy": 0.9556968688964844, "step": 46060 }, { "epoch": 4.608, "grad_norm": 0.4983985722064972, "learning_rate": 5.3925392539253926e-05, "loss": 0.1903, "mean_token_accuracy": 0.9473049730062485, "step": 46080 }, { "epoch": 4.61, "grad_norm": 0.4840107560157776, "learning_rate": 5.3905390539053916e-05, "loss": 0.2812, "mean_token_accuracy": 0.9488168001174927, "step": 46100 }, { "epoch": 4.612, "grad_norm": 0.46381574869155884, "learning_rate": 5.3885388538853885e-05, "loss": 0.1779, "mean_token_accuracy": 0.9584474921226501, "step": 46120 }, { "epoch": 4.614, "grad_norm": 0.47688207030296326, "learning_rate": 5.3865386538653875e-05, "loss": 0.1653, "mean_token_accuracy": 0.9521658718585968, "step": 46140 }, { "epoch": 4.616, "grad_norm": 0.4309937357902527, "learning_rate": 5.3845384538453844e-05, "loss": 0.1606, "mean_token_accuracy": 0.9520042777061463, "step": 46160 }, { "epoch": 4.618, "grad_norm": 0.9907006025314331, "learning_rate": 5.3825382538253834e-05, "loss": 0.2033, "mean_token_accuracy": 0.9418475717306137, "step": 46180 }, { "epoch": 4.62, "grad_norm": 0.7348577380180359, "learning_rate": 5.3805380538053804e-05, "loss": 0.2679, "mean_token_accuracy": 0.9236943155527115, "step": 46200 }, { "epoch": 4.622, "grad_norm": 0.6719161868095398, "learning_rate": 5.3785378537853794e-05, "loss": 0.2093, "mean_token_accuracy": 0.9475183308124542, "step": 46220 }, { "epoch": 4.624, "grad_norm": 0.3608535826206207, "learning_rate": 5.376537653765377e-05, "loss": 0.2872, "mean_token_accuracy": 0.9525112032890319, "step": 46240 }, { "epoch": 4.626, "grad_norm": 0.31857365369796753, "learning_rate": 5.374537453745375e-05, "loss": 0.2474, "mean_token_accuracy": 0.9489248991012573, "step": 46260 }, { "epoch": 4.628, "grad_norm": 0.563237190246582, "learning_rate": 5.372537253725373e-05, "loss": 0.2659, "mean_token_accuracy": 0.9609324902296066, "step": 46280 }, { "epoch": 4.63, "grad_norm": 0.5096452832221985, "learning_rate": 5.370537053705371e-05, "loss": 0.1891, "mean_token_accuracy": 0.9496272534132004, "step": 46300 }, { "epoch": 4.632, "grad_norm": 0.6193784475326538, "learning_rate": 5.368536853685369e-05, "loss": 0.1975, "mean_token_accuracy": 0.9555294632911682, "step": 46320 }, { "epoch": 4.634, "grad_norm": 0.7048591375350952, "learning_rate": 5.366536653665367e-05, "loss": 0.1953, "mean_token_accuracy": 0.9549690783023834, "step": 46340 }, { "epoch": 4.636, "grad_norm": 0.4862920641899109, "learning_rate": 5.364536453645365e-05, "loss": 0.3203, "mean_token_accuracy": 0.9498288691043854, "step": 46360 }, { "epoch": 4.638, "grad_norm": 0.7168574929237366, "learning_rate": 5.362536253625363e-05, "loss": 0.1768, "mean_token_accuracy": 0.9531753808259964, "step": 46380 }, { "epoch": 4.64, "grad_norm": 0.8338634967803955, "learning_rate": 5.360536053605361e-05, "loss": 0.2831, "mean_token_accuracy": 0.9448576658964157, "step": 46400 }, { "epoch": 4.642, "grad_norm": 0.588050127029419, "learning_rate": 5.358535853585359e-05, "loss": 0.3237, "mean_token_accuracy": 0.9441344767808915, "step": 46420 }, { "epoch": 4.644, "grad_norm": 0.5282827615737915, "learning_rate": 5.3565356535653566e-05, "loss": 0.2564, "mean_token_accuracy": 0.9508712649345398, "step": 46440 }, { "epoch": 4.646, "grad_norm": 0.6140597462654114, "learning_rate": 5.354535453545355e-05, "loss": 0.1923, "mean_token_accuracy": 0.951283621788025, "step": 46460 }, { "epoch": 4.648, "grad_norm": 0.5938533544540405, "learning_rate": 5.3525352535253526e-05, "loss": 0.2067, "mean_token_accuracy": 0.951728817820549, "step": 46480 }, { "epoch": 4.65, "grad_norm": 0.4528265595436096, "learning_rate": 5.350535053505351e-05, "loss": 0.287, "mean_token_accuracy": 0.9362508952617645, "step": 46500 }, { "epoch": 4.652, "grad_norm": 0.6580877304077148, "learning_rate": 5.3485348534853485e-05, "loss": 0.1889, "mean_token_accuracy": 0.9543783992528916, "step": 46520 }, { "epoch": 4.654, "grad_norm": 0.8223935961723328, "learning_rate": 5.346534653465347e-05, "loss": 0.2522, "mean_token_accuracy": 0.9538104891777038, "step": 46540 }, { "epoch": 4.656, "grad_norm": 0.4057020843029022, "learning_rate": 5.3445344534453444e-05, "loss": 0.206, "mean_token_accuracy": 0.9385709494352341, "step": 46560 }, { "epoch": 4.658, "grad_norm": 0.6874830722808838, "learning_rate": 5.3425342534253434e-05, "loss": 0.3326, "mean_token_accuracy": 0.9564052194356918, "step": 46580 }, { "epoch": 4.66, "grad_norm": 0.5532559156417847, "learning_rate": 5.3405340534053404e-05, "loss": 0.2414, "mean_token_accuracy": 0.9560913860797882, "step": 46600 }, { "epoch": 4.662, "grad_norm": 0.41208696365356445, "learning_rate": 5.3385338533853394e-05, "loss": 0.1867, "mean_token_accuracy": 0.9565712034702301, "step": 46620 }, { "epoch": 4.664, "grad_norm": 0.44983693957328796, "learning_rate": 5.336533653365336e-05, "loss": 0.2277, "mean_token_accuracy": 0.9589067846536636, "step": 46640 }, { "epoch": 4.666, "grad_norm": 0.6511163115501404, "learning_rate": 5.334533453345335e-05, "loss": 0.1835, "mean_token_accuracy": 0.9507533192634583, "step": 46660 }, { "epoch": 4.668, "grad_norm": 0.5476688146591187, "learning_rate": 5.332533253325332e-05, "loss": 0.1647, "mean_token_accuracy": 0.9440673142671585, "step": 46680 }, { "epoch": 4.67, "grad_norm": 0.5490545034408569, "learning_rate": 5.330533053305331e-05, "loss": 0.3596, "mean_token_accuracy": 0.9515316843986511, "step": 46700 }, { "epoch": 4.672, "grad_norm": 0.49369874596595764, "learning_rate": 5.328532853285328e-05, "loss": 0.1696, "mean_token_accuracy": 0.9531889081001281, "step": 46720 }, { "epoch": 4.674, "grad_norm": 0.37914782762527466, "learning_rate": 5.326532653265327e-05, "loss": 0.3058, "mean_token_accuracy": 0.9437550038099289, "step": 46740 }, { "epoch": 4.676, "grad_norm": 0.53809654712677, "learning_rate": 5.324532453245325e-05, "loss": 0.2524, "mean_token_accuracy": 0.9573943883180618, "step": 46760 }, { "epoch": 4.678, "grad_norm": 0.47799453139305115, "learning_rate": 5.322532253225323e-05, "loss": 0.2048, "mean_token_accuracy": 0.9450800716876984, "step": 46780 }, { "epoch": 4.68, "grad_norm": 0.5393360257148743, "learning_rate": 5.320532053205321e-05, "loss": 0.2112, "mean_token_accuracy": 0.9524317502975463, "step": 46800 }, { "epoch": 4.682, "grad_norm": 0.6689404249191284, "learning_rate": 5.318531853185319e-05, "loss": 0.2985, "mean_token_accuracy": 0.9420739233493804, "step": 46820 }, { "epoch": 4.684, "grad_norm": 0.6001117825508118, "learning_rate": 5.3165316531653166e-05, "loss": 0.3035, "mean_token_accuracy": 0.9466839104890823, "step": 46840 }, { "epoch": 4.686, "grad_norm": 0.37779784202575684, "learning_rate": 5.314531453145315e-05, "loss": 0.2281, "mean_token_accuracy": 0.9528478622436524, "step": 46860 }, { "epoch": 4.688, "grad_norm": 0.6192914843559265, "learning_rate": 5.3125312531253126e-05, "loss": 0.385, "mean_token_accuracy": 0.9374649494886398, "step": 46880 }, { "epoch": 4.6899999999999995, "grad_norm": 0.5502698421478271, "learning_rate": 5.310531053105311e-05, "loss": 0.2029, "mean_token_accuracy": 0.9591533482074738, "step": 46900 }, { "epoch": 4.692, "grad_norm": 0.45706695318222046, "learning_rate": 5.3085308530853085e-05, "loss": 0.2825, "mean_token_accuracy": 0.9559469729661941, "step": 46920 }, { "epoch": 4.694, "grad_norm": 14.468315124511719, "learning_rate": 5.306530653065307e-05, "loss": 0.3682, "mean_token_accuracy": 0.9495856672525406, "step": 46940 }, { "epoch": 4.696, "grad_norm": 0.8042221665382385, "learning_rate": 5.3045304530453044e-05, "loss": 0.1975, "mean_token_accuracy": 0.9414919197559357, "step": 46960 }, { "epoch": 4.698, "grad_norm": 14.84186840057373, "learning_rate": 5.302530253025303e-05, "loss": 0.4505, "mean_token_accuracy": 0.9351680010557175, "step": 46980 }, { "epoch": 4.7, "grad_norm": 1.0577383041381836, "learning_rate": 5.3005300530053004e-05, "loss": 0.2373, "mean_token_accuracy": 0.9529698550701141, "step": 47000 }, { "epoch": 4.702, "grad_norm": 0.5605416893959045, "learning_rate": 5.298529852985299e-05, "loss": 0.2194, "mean_token_accuracy": 0.9502122700214386, "step": 47020 }, { "epoch": 4.704, "grad_norm": 0.9511723518371582, "learning_rate": 5.296529652965296e-05, "loss": 0.2144, "mean_token_accuracy": 0.9495195329189301, "step": 47040 }, { "epoch": 4.7059999999999995, "grad_norm": 2.491398334503174, "learning_rate": 5.2945294529452946e-05, "loss": 0.3593, "mean_token_accuracy": 0.9503291457891464, "step": 47060 }, { "epoch": 4.708, "grad_norm": 0.43021780252456665, "learning_rate": 5.292529252925292e-05, "loss": 0.2298, "mean_token_accuracy": 0.952723029255867, "step": 47080 }, { "epoch": 4.71, "grad_norm": 0.5322905778884888, "learning_rate": 5.290529052905291e-05, "loss": 0.1937, "mean_token_accuracy": 0.9489379972219467, "step": 47100 }, { "epoch": 4.712, "grad_norm": 1.0324047803878784, "learning_rate": 5.288528852885288e-05, "loss": 0.4045, "mean_token_accuracy": 0.9404559105634689, "step": 47120 }, { "epoch": 4.714, "grad_norm": 0.6325280666351318, "learning_rate": 5.286528652865287e-05, "loss": 0.244, "mean_token_accuracy": 0.9408183723688126, "step": 47140 }, { "epoch": 4.716, "grad_norm": 0.7272889614105225, "learning_rate": 5.284528452845284e-05, "loss": 0.2797, "mean_token_accuracy": 0.9443068832159043, "step": 47160 }, { "epoch": 4.718, "grad_norm": 0.4824579954147339, "learning_rate": 5.282528252825283e-05, "loss": 0.3686, "mean_token_accuracy": 0.935981172323227, "step": 47180 }, { "epoch": 4.72, "grad_norm": 2.028541088104248, "learning_rate": 5.28052805280528e-05, "loss": 0.3023, "mean_token_accuracy": 0.9450941115617753, "step": 47200 }, { "epoch": 4.7219999999999995, "grad_norm": 0.5112606883049011, "learning_rate": 5.278527852785279e-05, "loss": 0.2081, "mean_token_accuracy": 0.9597261041402817, "step": 47220 }, { "epoch": 4.724, "grad_norm": 0.4374748170375824, "learning_rate": 5.276527652765276e-05, "loss": 0.3162, "mean_token_accuracy": 0.9478707820177078, "step": 47240 }, { "epoch": 4.726, "grad_norm": 0.35925960540771484, "learning_rate": 5.274527452745275e-05, "loss": 0.2173, "mean_token_accuracy": 0.9474398672580719, "step": 47260 }, { "epoch": 4.728, "grad_norm": 0.5463429689407349, "learning_rate": 5.2725272527252726e-05, "loss": 0.2062, "mean_token_accuracy": 0.9570593029260636, "step": 47280 }, { "epoch": 4.73, "grad_norm": 0.533208429813385, "learning_rate": 5.270527052705271e-05, "loss": 0.3068, "mean_token_accuracy": 0.9411843657493592, "step": 47300 }, { "epoch": 4.732, "grad_norm": 0.6532982587814331, "learning_rate": 5.2685268526852685e-05, "loss": 0.3405, "mean_token_accuracy": 0.9523553162813186, "step": 47320 }, { "epoch": 4.734, "grad_norm": 0.48077282309532166, "learning_rate": 5.266526652665267e-05, "loss": 0.2665, "mean_token_accuracy": 0.9485032528638839, "step": 47340 }, { "epoch": 4.736, "grad_norm": 0.5606146454811096, "learning_rate": 5.2645264526452644e-05, "loss": 0.2394, "mean_token_accuracy": 0.949824896454811, "step": 47360 }, { "epoch": 4.7379999999999995, "grad_norm": 0.6052073240280151, "learning_rate": 5.262526252625263e-05, "loss": 0.1915, "mean_token_accuracy": 0.9524537444114685, "step": 47380 }, { "epoch": 4.74, "grad_norm": 0.6316732168197632, "learning_rate": 5.260526052605261e-05, "loss": 0.2567, "mean_token_accuracy": 0.9520655542612075, "step": 47400 }, { "epoch": 4.742, "grad_norm": 0.6271879076957703, "learning_rate": 5.258525852585259e-05, "loss": 0.4014, "mean_token_accuracy": 0.9491375058889389, "step": 47420 }, { "epoch": 4.744, "grad_norm": 0.5018371939659119, "learning_rate": 5.2565256525652576e-05, "loss": 0.2674, "mean_token_accuracy": 0.9502766549587249, "step": 47440 }, { "epoch": 4.746, "grad_norm": 0.5933026075363159, "learning_rate": 5.2545254525452546e-05, "loss": 0.2417, "mean_token_accuracy": 0.9547708690166473, "step": 47460 }, { "epoch": 4.748, "grad_norm": 0.6251583099365234, "learning_rate": 5.2525252525252536e-05, "loss": 0.2229, "mean_token_accuracy": 0.9555115789175034, "step": 47480 }, { "epoch": 4.75, "grad_norm": 63.94221496582031, "learning_rate": 5.2505250525052505e-05, "loss": 0.3758, "mean_token_accuracy": 0.9469183534383774, "step": 47500 }, { "epoch": 4.752, "grad_norm": 0.49736201763153076, "learning_rate": 5.2485248524852495e-05, "loss": 0.3302, "mean_token_accuracy": 0.9424886614084244, "step": 47520 }, { "epoch": 4.754, "grad_norm": 0.5101087689399719, "learning_rate": 5.2465246524652465e-05, "loss": 0.2681, "mean_token_accuracy": 0.9498222142457962, "step": 47540 }, { "epoch": 4.756, "grad_norm": 0.5674864053726196, "learning_rate": 5.2445244524452454e-05, "loss": 0.1905, "mean_token_accuracy": 0.9471674263477325, "step": 47560 }, { "epoch": 4.758, "grad_norm": 0.4083028733730316, "learning_rate": 5.2425242524252424e-05, "loss": 0.2704, "mean_token_accuracy": 0.9377876251935959, "step": 47580 }, { "epoch": 4.76, "grad_norm": 0.4600699245929718, "learning_rate": 5.2405240524052414e-05, "loss": 0.2212, "mean_token_accuracy": 0.9483680546283721, "step": 47600 }, { "epoch": 4.7620000000000005, "grad_norm": 0.5800397992134094, "learning_rate": 5.238523852385239e-05, "loss": 0.4088, "mean_token_accuracy": 0.9429655760526657, "step": 47620 }, { "epoch": 4.764, "grad_norm": 0.6108448505401611, "learning_rate": 5.236523652365237e-05, "loss": 0.2329, "mean_token_accuracy": 0.9368856817483902, "step": 47640 }, { "epoch": 4.766, "grad_norm": 0.3905455470085144, "learning_rate": 5.234523452345235e-05, "loss": 0.1968, "mean_token_accuracy": 0.9561373203992843, "step": 47660 }, { "epoch": 4.768, "grad_norm": 0.5872034430503845, "learning_rate": 5.232523252325233e-05, "loss": 0.2763, "mean_token_accuracy": 0.9486961722373962, "step": 47680 }, { "epoch": 4.77, "grad_norm": 0.5021838545799255, "learning_rate": 5.230523052305231e-05, "loss": 0.3566, "mean_token_accuracy": 0.9495103418827057, "step": 47700 }, { "epoch": 4.772, "grad_norm": 0.49191802740097046, "learning_rate": 5.228522852285229e-05, "loss": 0.2156, "mean_token_accuracy": 0.9519496858119965, "step": 47720 }, { "epoch": 4.774, "grad_norm": 0.4402620494365692, "learning_rate": 5.226522652265227e-05, "loss": 0.1742, "mean_token_accuracy": 0.938413941860199, "step": 47740 }, { "epoch": 4.776, "grad_norm": 0.49390122294425964, "learning_rate": 5.224522452245225e-05, "loss": 0.2509, "mean_token_accuracy": 0.9491738528013229, "step": 47760 }, { "epoch": 4.7780000000000005, "grad_norm": 0.4211030900478363, "learning_rate": 5.222522252225223e-05, "loss": 0.3533, "mean_token_accuracy": 0.9569147109985352, "step": 47780 }, { "epoch": 4.78, "grad_norm": 0.5308117866516113, "learning_rate": 5.220522052205221e-05, "loss": 0.3644, "mean_token_accuracy": 0.9575829118490219, "step": 47800 }, { "epoch": 4.782, "grad_norm": 0.4891054928302765, "learning_rate": 5.2185218521852187e-05, "loss": 0.2958, "mean_token_accuracy": 0.940354910492897, "step": 47820 }, { "epoch": 4.784, "grad_norm": 0.45875418186187744, "learning_rate": 5.216521652165217e-05, "loss": 0.3581, "mean_token_accuracy": 0.9394708067178726, "step": 47840 }, { "epoch": 4.786, "grad_norm": 0.45546430349349976, "learning_rate": 5.2145214521452146e-05, "loss": 0.1452, "mean_token_accuracy": 0.9445515245199203, "step": 47860 }, { "epoch": 4.788, "grad_norm": 0.5378508567810059, "learning_rate": 5.212521252125213e-05, "loss": 0.286, "mean_token_accuracy": 0.9538032561540604, "step": 47880 }, { "epoch": 4.79, "grad_norm": 0.46347910165786743, "learning_rate": 5.2105210521052105e-05, "loss": 0.196, "mean_token_accuracy": 0.9558185398578644, "step": 47900 }, { "epoch": 4.792, "grad_norm": 0.8137404918670654, "learning_rate": 5.208520852085209e-05, "loss": 0.2866, "mean_token_accuracy": 0.9491500228643417, "step": 47920 }, { "epoch": 4.7940000000000005, "grad_norm": 0.40665414929389954, "learning_rate": 5.2065206520652065e-05, "loss": 0.3852, "mean_token_accuracy": 0.9418047189712524, "step": 47940 }, { "epoch": 4.796, "grad_norm": 0.633590817451477, "learning_rate": 5.2045204520452054e-05, "loss": 0.3478, "mean_token_accuracy": 0.9498216509819031, "step": 47960 }, { "epoch": 4.798, "grad_norm": 0.5341655611991882, "learning_rate": 5.2025202520252024e-05, "loss": 0.2756, "mean_token_accuracy": 0.9448777973651886, "step": 47980 }, { "epoch": 4.8, "grad_norm": 3.9439520835876465, "learning_rate": 5.2005200520052014e-05, "loss": 0.3626, "mean_token_accuracy": 0.9469345599412918, "step": 48000 }, { "epoch": 4.802, "grad_norm": 0.5364279747009277, "learning_rate": 5.198519851985198e-05, "loss": 0.3787, "mean_token_accuracy": 0.956580075621605, "step": 48020 }, { "epoch": 4.804, "grad_norm": 0.5586780309677124, "learning_rate": 5.196519651965197e-05, "loss": 0.2583, "mean_token_accuracy": 0.9465888857841491, "step": 48040 }, { "epoch": 4.806, "grad_norm": 0.4727746546268463, "learning_rate": 5.194519451945194e-05, "loss": 0.2248, "mean_token_accuracy": 0.9449974119663238, "step": 48060 }, { "epoch": 4.808, "grad_norm": 0.5410430431365967, "learning_rate": 5.192519251925193e-05, "loss": 0.263, "mean_token_accuracy": 0.952611917257309, "step": 48080 }, { "epoch": 4.8100000000000005, "grad_norm": 0.5152536034584045, "learning_rate": 5.19051905190519e-05, "loss": 0.3439, "mean_token_accuracy": 0.9458228081464768, "step": 48100 }, { "epoch": 4.812, "grad_norm": 0.6991317272186279, "learning_rate": 5.188518851885189e-05, "loss": 0.2682, "mean_token_accuracy": 0.9503213286399841, "step": 48120 }, { "epoch": 4.814, "grad_norm": 0.5121229887008667, "learning_rate": 5.186518651865187e-05, "loss": 0.1916, "mean_token_accuracy": 0.9553158521652222, "step": 48140 }, { "epoch": 4.816, "grad_norm": 1.1031150817871094, "learning_rate": 5.184518451845185e-05, "loss": 0.18, "mean_token_accuracy": 0.9514229387044907, "step": 48160 }, { "epoch": 4.818, "grad_norm": 0.39852288365364075, "learning_rate": 5.182518251825183e-05, "loss": 0.2261, "mean_token_accuracy": 0.9557711124420166, "step": 48180 }, { "epoch": 4.82, "grad_norm": 0.463159441947937, "learning_rate": 5.180518051805181e-05, "loss": 0.2762, "mean_token_accuracy": 0.9482864618301392, "step": 48200 }, { "epoch": 4.822, "grad_norm": 0.39114177227020264, "learning_rate": 5.1785178517851787e-05, "loss": 0.2185, "mean_token_accuracy": 0.954434335231781, "step": 48220 }, { "epoch": 4.824, "grad_norm": 0.3981667160987854, "learning_rate": 5.176517651765177e-05, "loss": 0.2444, "mean_token_accuracy": 0.9482378840446473, "step": 48240 }, { "epoch": 4.826, "grad_norm": 0.4902900457382202, "learning_rate": 5.1745174517451746e-05, "loss": 0.2251, "mean_token_accuracy": 0.9537658005952835, "step": 48260 }, { "epoch": 4.828, "grad_norm": 0.389598548412323, "learning_rate": 5.172517251725173e-05, "loss": 0.2051, "mean_token_accuracy": 0.9451513290405273, "step": 48280 }, { "epoch": 4.83, "grad_norm": 0.40998849272727966, "learning_rate": 5.1705170517051705e-05, "loss": 0.2318, "mean_token_accuracy": 0.9454596966505051, "step": 48300 }, { "epoch": 4.832, "grad_norm": 3.0759084224700928, "learning_rate": 5.168516851685169e-05, "loss": 0.1576, "mean_token_accuracy": 0.943116956949234, "step": 48320 }, { "epoch": 4.834, "grad_norm": 0.4472881555557251, "learning_rate": 5.1665166516651664e-05, "loss": 0.2244, "mean_token_accuracy": 0.9624250203371048, "step": 48340 }, { "epoch": 4.836, "grad_norm": 0.48411089181900024, "learning_rate": 5.164516451645165e-05, "loss": 0.2771, "mean_token_accuracy": 0.9413780272006989, "step": 48360 }, { "epoch": 4.838, "grad_norm": 0.5015215277671814, "learning_rate": 5.1625162516251624e-05, "loss": 0.2544, "mean_token_accuracy": 0.9549166411161423, "step": 48380 }, { "epoch": 4.84, "grad_norm": 0.5699737071990967, "learning_rate": 5.160516051605161e-05, "loss": 0.2506, "mean_token_accuracy": 0.9481029361486435, "step": 48400 }, { "epoch": 4.842, "grad_norm": 79.3485107421875, "learning_rate": 5.158515851585158e-05, "loss": 0.3047, "mean_token_accuracy": 0.9471292525529862, "step": 48420 }, { "epoch": 4.844, "grad_norm": 0.6342761516571045, "learning_rate": 5.1565156515651566e-05, "loss": 0.2091, "mean_token_accuracy": 0.9553956001996994, "step": 48440 }, { "epoch": 4.846, "grad_norm": 0.8203365206718445, "learning_rate": 5.154515451545154e-05, "loss": 0.3492, "mean_token_accuracy": 0.9412745624780655, "step": 48460 }, { "epoch": 4.848, "grad_norm": 0.5130201578140259, "learning_rate": 5.152515251525153e-05, "loss": 0.1711, "mean_token_accuracy": 0.9555384933948516, "step": 48480 }, { "epoch": 4.85, "grad_norm": 0.40554821491241455, "learning_rate": 5.15051505150515e-05, "loss": 0.2563, "mean_token_accuracy": 0.9528710871934891, "step": 48500 }, { "epoch": 4.852, "grad_norm": 0.6018354296684265, "learning_rate": 5.148514851485149e-05, "loss": 0.152, "mean_token_accuracy": 0.9485453188419342, "step": 48520 }, { "epoch": 4.854, "grad_norm": 0.6289942860603333, "learning_rate": 5.146514651465146e-05, "loss": 0.3126, "mean_token_accuracy": 0.9539737313985824, "step": 48540 }, { "epoch": 4.856, "grad_norm": 0.6331518292427063, "learning_rate": 5.144514451445145e-05, "loss": 0.2895, "mean_token_accuracy": 0.9471724539995193, "step": 48560 }, { "epoch": 4.858, "grad_norm": 0.4339366853237152, "learning_rate": 5.142514251425142e-05, "loss": 0.1649, "mean_token_accuracy": 0.9508849442005157, "step": 48580 }, { "epoch": 4.86, "grad_norm": 0.530343234539032, "learning_rate": 5.140514051405141e-05, "loss": 0.2516, "mean_token_accuracy": 0.9524196416139603, "step": 48600 }, { "epoch": 4.862, "grad_norm": 0.3632861375808716, "learning_rate": 5.138513851385138e-05, "loss": 0.2697, "mean_token_accuracy": 0.9500267744064331, "step": 48620 }, { "epoch": 4.864, "grad_norm": 0.43549439311027527, "learning_rate": 5.136513651365137e-05, "loss": 0.3035, "mean_token_accuracy": 0.9365398555994033, "step": 48640 }, { "epoch": 4.866, "grad_norm": 0.4162615239620209, "learning_rate": 5.1345134513451346e-05, "loss": 0.218, "mean_token_accuracy": 0.9543356269598007, "step": 48660 }, { "epoch": 4.868, "grad_norm": 0.3640359342098236, "learning_rate": 5.132513251325133e-05, "loss": 0.2774, "mean_token_accuracy": 0.935717859864235, "step": 48680 }, { "epoch": 4.87, "grad_norm": 2.4561378955841064, "learning_rate": 5.1305130513051305e-05, "loss": 0.243, "mean_token_accuracy": 0.9509427338838577, "step": 48700 }, { "epoch": 4.872, "grad_norm": 0.7371290922164917, "learning_rate": 5.128512851285129e-05, "loss": 0.2319, "mean_token_accuracy": 0.9434944957494735, "step": 48720 }, { "epoch": 4.874, "grad_norm": 0.37297409772872925, "learning_rate": 5.1265126512651264e-05, "loss": 0.2521, "mean_token_accuracy": 0.9515521764755249, "step": 48740 }, { "epoch": 4.876, "grad_norm": 0.35099324584007263, "learning_rate": 5.124512451245125e-05, "loss": 0.1682, "mean_token_accuracy": 0.955020260810852, "step": 48760 }, { "epoch": 4.878, "grad_norm": 0.6713294386863708, "learning_rate": 5.1225122512251224e-05, "loss": 0.2203, "mean_token_accuracy": 0.9487522095441818, "step": 48780 }, { "epoch": 4.88, "grad_norm": 0.5888541340827942, "learning_rate": 5.120512051205121e-05, "loss": 0.2612, "mean_token_accuracy": 0.9533623576164245, "step": 48800 }, { "epoch": 4.882, "grad_norm": 2.3862147331237793, "learning_rate": 5.118511851185118e-05, "loss": 0.3174, "mean_token_accuracy": 0.949346736073494, "step": 48820 }, { "epoch": 4.884, "grad_norm": 0.5671187043190002, "learning_rate": 5.1165116511651166e-05, "loss": 0.2079, "mean_token_accuracy": 0.954522430896759, "step": 48840 }, { "epoch": 4.886, "grad_norm": 0.47548916935920715, "learning_rate": 5.114511451145114e-05, "loss": 0.1863, "mean_token_accuracy": 0.9456584066152572, "step": 48860 }, { "epoch": 4.888, "grad_norm": 0.5855345726013184, "learning_rate": 5.1125112511251125e-05, "loss": 0.2258, "mean_token_accuracy": 0.9424479931592942, "step": 48880 }, { "epoch": 4.89, "grad_norm": 0.8004392981529236, "learning_rate": 5.1105110511051115e-05, "loss": 0.2171, "mean_token_accuracy": 0.9570972174406052, "step": 48900 }, { "epoch": 4.892, "grad_norm": 0.4436773657798767, "learning_rate": 5.1085108510851085e-05, "loss": 0.2468, "mean_token_accuracy": 0.9533957779407501, "step": 48920 }, { "epoch": 4.894, "grad_norm": 0.4513472318649292, "learning_rate": 5.1065106510651075e-05, "loss": 0.2932, "mean_token_accuracy": 0.9434502422809601, "step": 48940 }, { "epoch": 4.896, "grad_norm": 0.5600302219390869, "learning_rate": 5.1045104510451044e-05, "loss": 0.1639, "mean_token_accuracy": 0.9503132343292237, "step": 48960 }, { "epoch": 4.898, "grad_norm": 0.5565479397773743, "learning_rate": 5.1025102510251034e-05, "loss": 0.1686, "mean_token_accuracy": 0.9526222109794616, "step": 48980 }, { "epoch": 4.9, "grad_norm": 0.3426266014575958, "learning_rate": 5.100510051005101e-05, "loss": 0.1709, "mean_token_accuracy": 0.9528955489397049, "step": 49000 }, { "epoch": 4.902, "grad_norm": 5.448521137237549, "learning_rate": 5.098509850985099e-05, "loss": 0.1841, "mean_token_accuracy": 0.9538695067167282, "step": 49020 }, { "epoch": 4.904, "grad_norm": 1.161231517791748, "learning_rate": 5.096509650965097e-05, "loss": 0.2821, "mean_token_accuracy": 0.9568229854106903, "step": 49040 }, { "epoch": 4.906, "grad_norm": 0.5325889587402344, "learning_rate": 5.094509450945095e-05, "loss": 0.2671, "mean_token_accuracy": 0.9533707052469254, "step": 49060 }, { "epoch": 4.908, "grad_norm": 0.5701935887336731, "learning_rate": 5.092509250925093e-05, "loss": 0.2341, "mean_token_accuracy": 0.952960392832756, "step": 49080 }, { "epoch": 4.91, "grad_norm": 0.5663413405418396, "learning_rate": 5.090509050905091e-05, "loss": 0.2, "mean_token_accuracy": 0.9459663718938828, "step": 49100 }, { "epoch": 4.912, "grad_norm": 0.5214759707450867, "learning_rate": 5.088508850885089e-05, "loss": 0.1882, "mean_token_accuracy": 0.9519005954265595, "step": 49120 }, { "epoch": 4.914, "grad_norm": 2.39339280128479, "learning_rate": 5.086508650865087e-05, "loss": 0.3373, "mean_token_accuracy": 0.9424140572547912, "step": 49140 }, { "epoch": 4.916, "grad_norm": 0.46898239850997925, "learning_rate": 5.084508450845085e-05, "loss": 0.2163, "mean_token_accuracy": 0.9586037874221802, "step": 49160 }, { "epoch": 4.918, "grad_norm": 1.0041149854660034, "learning_rate": 5.082508250825083e-05, "loss": 0.1853, "mean_token_accuracy": 0.953001520037651, "step": 49180 }, { "epoch": 4.92, "grad_norm": 1.2596195936203003, "learning_rate": 5.080508050805081e-05, "loss": 0.3473, "mean_token_accuracy": 0.9497013002634048, "step": 49200 }, { "epoch": 4.922, "grad_norm": 1.6006125211715698, "learning_rate": 5.078507850785079e-05, "loss": 0.1923, "mean_token_accuracy": 0.9553630083799363, "step": 49220 }, { "epoch": 4.924, "grad_norm": 0.4982592463493347, "learning_rate": 5.0765076507650766e-05, "loss": 0.2835, "mean_token_accuracy": 0.9456127017736435, "step": 49240 }, { "epoch": 4.926, "grad_norm": 0.5606324076652527, "learning_rate": 5.074507450745075e-05, "loss": 0.2384, "mean_token_accuracy": 0.9572379112243652, "step": 49260 }, { "epoch": 4.928, "grad_norm": 0.5015475749969482, "learning_rate": 5.0725072507250725e-05, "loss": 0.1788, "mean_token_accuracy": 0.9545013248920441, "step": 49280 }, { "epoch": 4.93, "grad_norm": 0.3830410838127136, "learning_rate": 5.070507050705071e-05, "loss": 0.2481, "mean_token_accuracy": 0.9517801374197006, "step": 49300 }, { "epoch": 4.932, "grad_norm": 0.3938472270965576, "learning_rate": 5.0685068506850685e-05, "loss": 0.3347, "mean_token_accuracy": 0.9463801771402359, "step": 49320 }, { "epoch": 4.934, "grad_norm": 0.7057716846466064, "learning_rate": 5.0665066506650675e-05, "loss": 0.3124, "mean_token_accuracy": 0.9503782331943512, "step": 49340 }, { "epoch": 4.936, "grad_norm": 0.3814490735530853, "learning_rate": 5.0645064506450644e-05, "loss": 0.3138, "mean_token_accuracy": 0.941419991850853, "step": 49360 }, { "epoch": 4.938, "grad_norm": 0.3844664394855499, "learning_rate": 5.0625062506250634e-05, "loss": 0.2755, "mean_token_accuracy": 0.9567446142435074, "step": 49380 }, { "epoch": 4.9399999999999995, "grad_norm": 0.5432105660438538, "learning_rate": 5.06050605060506e-05, "loss": 0.1914, "mean_token_accuracy": 0.9580394089221954, "step": 49400 }, { "epoch": 4.942, "grad_norm": 0.8787903189659119, "learning_rate": 5.058505850585059e-05, "loss": 0.3176, "mean_token_accuracy": 0.9558625251054764, "step": 49420 }, { "epoch": 4.944, "grad_norm": 0.5449715852737427, "learning_rate": 5.056505650565056e-05, "loss": 0.2084, "mean_token_accuracy": 0.9593677878379822, "step": 49440 }, { "epoch": 4.946, "grad_norm": 0.4743959605693817, "learning_rate": 5.054505450545055e-05, "loss": 0.168, "mean_token_accuracy": 0.9498600989580155, "step": 49460 }, { "epoch": 4.948, "grad_norm": 0.4441942274570465, "learning_rate": 5.052505250525052e-05, "loss": 0.2666, "mean_token_accuracy": 0.9500206649303437, "step": 49480 }, { "epoch": 4.95, "grad_norm": 0.5057803988456726, "learning_rate": 5.050505050505051e-05, "loss": 0.2085, "mean_token_accuracy": 0.94436716735363, "step": 49500 }, { "epoch": 4.952, "grad_norm": 0.4742594361305237, "learning_rate": 5.048504850485049e-05, "loss": 0.2974, "mean_token_accuracy": 0.9436679512262345, "step": 49520 }, { "epoch": 4.954, "grad_norm": 0.5170695185661316, "learning_rate": 5.046504650465047e-05, "loss": 0.1708, "mean_token_accuracy": 0.950748085975647, "step": 49540 }, { "epoch": 4.9559999999999995, "grad_norm": 0.5312256217002869, "learning_rate": 5.044504450445045e-05, "loss": 0.1663, "mean_token_accuracy": 0.9598763912916184, "step": 49560 }, { "epoch": 4.958, "grad_norm": 0.5866159200668335, "learning_rate": 5.042504250425043e-05, "loss": 0.3487, "mean_token_accuracy": 0.9561771839857102, "step": 49580 }, { "epoch": 4.96, "grad_norm": 0.4750876724720001, "learning_rate": 5.040504050405041e-05, "loss": 0.2115, "mean_token_accuracy": 0.9467152059078217, "step": 49600 }, { "epoch": 4.962, "grad_norm": 4.797390460968018, "learning_rate": 5.038503850385039e-05, "loss": 0.2494, "mean_token_accuracy": 0.9511096179485321, "step": 49620 }, { "epoch": 4.964, "grad_norm": 0.490823358297348, "learning_rate": 5.0365036503650366e-05, "loss": 0.2609, "mean_token_accuracy": 0.950966814160347, "step": 49640 }, { "epoch": 4.966, "grad_norm": 0.7950718998908997, "learning_rate": 5.034503450345035e-05, "loss": 0.2234, "mean_token_accuracy": 0.9465080201625824, "step": 49660 }, { "epoch": 4.968, "grad_norm": 1.2639607191085815, "learning_rate": 5.0325032503250325e-05, "loss": 0.2049, "mean_token_accuracy": 0.9535951375961303, "step": 49680 }, { "epoch": 4.97, "grad_norm": 0.4848184883594513, "learning_rate": 5.030503050305031e-05, "loss": 0.2226, "mean_token_accuracy": 0.9550231128931046, "step": 49700 }, { "epoch": 4.9719999999999995, "grad_norm": 0.47777459025382996, "learning_rate": 5.0285028502850285e-05, "loss": 0.2259, "mean_token_accuracy": 0.9483823537826538, "step": 49720 }, { "epoch": 4.974, "grad_norm": 0.5612962245941162, "learning_rate": 5.026502650265027e-05, "loss": 0.1613, "mean_token_accuracy": 0.952475979924202, "step": 49740 }, { "epoch": 4.976, "grad_norm": 0.3986077606678009, "learning_rate": 5.0245024502450244e-05, "loss": 0.2406, "mean_token_accuracy": 0.9436233729124069, "step": 49760 }, { "epoch": 4.978, "grad_norm": 3.255563735961914, "learning_rate": 5.022502250225023e-05, "loss": 0.3609, "mean_token_accuracy": 0.9486172586679459, "step": 49780 }, { "epoch": 4.98, "grad_norm": 0.7177601456642151, "learning_rate": 5.02050205020502e-05, "loss": 0.3021, "mean_token_accuracy": 0.9517813801765442, "step": 49800 }, { "epoch": 4.982, "grad_norm": 2.0596818923950195, "learning_rate": 5.0185018501850186e-05, "loss": 0.1951, "mean_token_accuracy": 0.9521042823791503, "step": 49820 }, { "epoch": 4.984, "grad_norm": 0.4007246792316437, "learning_rate": 5.016501650165016e-05, "loss": 0.2413, "mean_token_accuracy": 0.9509454041719436, "step": 49840 }, { "epoch": 4.986, "grad_norm": 0.5606411099433899, "learning_rate": 5.014501450145015e-05, "loss": 0.1601, "mean_token_accuracy": 0.9572587549686432, "step": 49860 }, { "epoch": 4.9879999999999995, "grad_norm": 0.4627227485179901, "learning_rate": 5.012501250125012e-05, "loss": 0.1639, "mean_token_accuracy": 0.9444790124893189, "step": 49880 }, { "epoch": 4.99, "grad_norm": 0.46379584074020386, "learning_rate": 5.010501050105011e-05, "loss": 0.2167, "mean_token_accuracy": 0.950873926281929, "step": 49900 }, { "epoch": 4.992, "grad_norm": 0.463895320892334, "learning_rate": 5.008500850085008e-05, "loss": 0.277, "mean_token_accuracy": 0.9624052286148072, "step": 49920 }, { "epoch": 4.994, "grad_norm": 0.5187451839447021, "learning_rate": 5.006500650065007e-05, "loss": 0.228, "mean_token_accuracy": 0.9554361909627914, "step": 49940 }, { "epoch": 4.996, "grad_norm": 0.5377447009086609, "learning_rate": 5.004500450045004e-05, "loss": 0.2298, "mean_token_accuracy": 0.9521589577198029, "step": 49960 }, { "epoch": 4.998, "grad_norm": 0.5080665946006775, "learning_rate": 5.002500250025003e-05, "loss": 0.3197, "mean_token_accuracy": 0.9538516759872436, "step": 49980 }, { "epoch": 5.0, "grad_norm": 0.4801624119281769, "learning_rate": 5.000500050005001e-05, "loss": 0.2183, "mean_token_accuracy": 0.9453055649995804, "step": 50000 }, { "epoch": 5.002, "grad_norm": 0.6982571482658386, "learning_rate": 4.998499849984999e-05, "loss": 0.2447, "mean_token_accuracy": 0.9779837101697921, "step": 50020 }, { "epoch": 5.004, "grad_norm": 3.462526559829712, "learning_rate": 4.9964996499649966e-05, "loss": 0.1916, "mean_token_accuracy": 0.9826926857233047, "step": 50040 }, { "epoch": 5.006, "grad_norm": 0.5367085337638855, "learning_rate": 4.994499449944995e-05, "loss": 0.2136, "mean_token_accuracy": 0.9776311874389648, "step": 50060 }, { "epoch": 5.008, "grad_norm": 0.3077492117881775, "learning_rate": 4.9924992499249925e-05, "loss": 0.1338, "mean_token_accuracy": 0.9770518004894256, "step": 50080 }, { "epoch": 5.01, "grad_norm": 5.277929782867432, "learning_rate": 4.990499049904991e-05, "loss": 0.2998, "mean_token_accuracy": 0.9810316085815429, "step": 50100 }, { "epoch": 5.012, "grad_norm": 0.42997094988822937, "learning_rate": 4.988498849884989e-05, "loss": 0.206, "mean_token_accuracy": 0.9779494017362594, "step": 50120 }, { "epoch": 5.014, "grad_norm": 0.36356741189956665, "learning_rate": 4.986498649864987e-05, "loss": 0.2315, "mean_token_accuracy": 0.9731970280408859, "step": 50140 }, { "epoch": 5.016, "grad_norm": 0.35556650161743164, "learning_rate": 4.984498449844985e-05, "loss": 0.3638, "mean_token_accuracy": 0.9805087238550186, "step": 50160 }, { "epoch": 5.018, "grad_norm": 0.7763292193412781, "learning_rate": 4.982498249824983e-05, "loss": 0.1811, "mean_token_accuracy": 0.9859559446573257, "step": 50180 }, { "epoch": 5.02, "grad_norm": 0.31531378626823425, "learning_rate": 4.980498049804981e-05, "loss": 0.2074, "mean_token_accuracy": 0.9795692384243011, "step": 50200 }, { "epoch": 5.022, "grad_norm": 3.410762310028076, "learning_rate": 4.9784978497849786e-05, "loss": 0.2042, "mean_token_accuracy": 0.9805530101060868, "step": 50220 }, { "epoch": 5.024, "grad_norm": 0.6069897413253784, "learning_rate": 4.976497649764977e-05, "loss": 0.2845, "mean_token_accuracy": 0.9742653995752335, "step": 50240 }, { "epoch": 5.026, "grad_norm": 0.5019215941429138, "learning_rate": 4.9744974497449746e-05, "loss": 0.2723, "mean_token_accuracy": 0.978281581401825, "step": 50260 }, { "epoch": 5.028, "grad_norm": 0.40884754061698914, "learning_rate": 4.972497249724973e-05, "loss": 0.1783, "mean_token_accuracy": 0.9813922017812728, "step": 50280 }, { "epoch": 5.03, "grad_norm": 0.3120260238647461, "learning_rate": 4.9704970497049705e-05, "loss": 0.1988, "mean_token_accuracy": 0.9808554857969284, "step": 50300 }, { "epoch": 5.032, "grad_norm": 0.4060530662536621, "learning_rate": 4.968496849684969e-05, "loss": 0.2128, "mean_token_accuracy": 0.9806118905544281, "step": 50320 }, { "epoch": 5.034, "grad_norm": 1.0851222276687622, "learning_rate": 4.9664966496649664e-05, "loss": 0.2897, "mean_token_accuracy": 0.979051023721695, "step": 50340 }, { "epoch": 5.036, "grad_norm": 0.3222222924232483, "learning_rate": 4.964496449644965e-05, "loss": 0.2409, "mean_token_accuracy": 0.9783275336027145, "step": 50360 }, { "epoch": 5.038, "grad_norm": 0.37747159600257874, "learning_rate": 4.962496249624963e-05, "loss": 0.2402, "mean_token_accuracy": 0.9825859874486923, "step": 50380 }, { "epoch": 5.04, "grad_norm": 0.34742066264152527, "learning_rate": 4.9604960496049607e-05, "loss": 0.1891, "mean_token_accuracy": 0.9787782251834869, "step": 50400 }, { "epoch": 5.042, "grad_norm": 0.3753032088279724, "learning_rate": 4.958495849584959e-05, "loss": 0.1678, "mean_token_accuracy": 0.9769984751939773, "step": 50420 }, { "epoch": 5.044, "grad_norm": 0.41712796688079834, "learning_rate": 4.9564956495649566e-05, "loss": 0.1703, "mean_token_accuracy": 0.9808734744787216, "step": 50440 }, { "epoch": 5.046, "grad_norm": 0.5172892212867737, "learning_rate": 4.954495449544955e-05, "loss": 0.2541, "mean_token_accuracy": 0.9828263282775879, "step": 50460 }, { "epoch": 5.048, "grad_norm": 0.7563605308532715, "learning_rate": 4.9524952495249525e-05, "loss": 0.1966, "mean_token_accuracy": 0.9792012959718704, "step": 50480 }, { "epoch": 5.05, "grad_norm": 0.3131413757801056, "learning_rate": 4.950495049504951e-05, "loss": 0.2355, "mean_token_accuracy": 0.974976658821106, "step": 50500 }, { "epoch": 5.052, "grad_norm": 0.2478647083044052, "learning_rate": 4.9484948494849485e-05, "loss": 0.2576, "mean_token_accuracy": 0.9826691538095474, "step": 50520 }, { "epoch": 5.054, "grad_norm": 3.0008797645568848, "learning_rate": 4.946494649464947e-05, "loss": 0.1928, "mean_token_accuracy": 0.9811894685029984, "step": 50540 }, { "epoch": 5.056, "grad_norm": 0.44542786478996277, "learning_rate": 4.9444944494449444e-05, "loss": 0.1801, "mean_token_accuracy": 0.9836760640144349, "step": 50560 }, { "epoch": 5.058, "grad_norm": 0.3673244118690491, "learning_rate": 4.942494249424943e-05, "loss": 0.2285, "mean_token_accuracy": 0.9822397410869599, "step": 50580 }, { "epoch": 5.06, "grad_norm": 0.27397000789642334, "learning_rate": 4.94049404940494e-05, "loss": 0.1782, "mean_token_accuracy": 0.9829358547925949, "step": 50600 }, { "epoch": 5.062, "grad_norm": 0.3818152844905853, "learning_rate": 4.9384938493849386e-05, "loss": 0.2471, "mean_token_accuracy": 0.9726565927267075, "step": 50620 }, { "epoch": 5.064, "grad_norm": 2.9921205043792725, "learning_rate": 4.936493649364937e-05, "loss": 0.2701, "mean_token_accuracy": 0.9725047588348389, "step": 50640 }, { "epoch": 5.066, "grad_norm": 0.4002314805984497, "learning_rate": 4.9344934493449346e-05, "loss": 0.1909, "mean_token_accuracy": 0.9799420684576035, "step": 50660 }, { "epoch": 5.068, "grad_norm": 0.6599588990211487, "learning_rate": 4.932493249324933e-05, "loss": 0.2119, "mean_token_accuracy": 0.9771903187036515, "step": 50680 }, { "epoch": 5.07, "grad_norm": 0.33810099959373474, "learning_rate": 4.9304930493049305e-05, "loss": 0.1814, "mean_token_accuracy": 0.9806728810071945, "step": 50700 }, { "epoch": 5.072, "grad_norm": 0.2487565279006958, "learning_rate": 4.928492849284929e-05, "loss": 0.2105, "mean_token_accuracy": 0.9810240268707275, "step": 50720 }, { "epoch": 5.074, "grad_norm": 0.3094072937965393, "learning_rate": 4.9264926492649264e-05, "loss": 0.2161, "mean_token_accuracy": 0.9720200657844543, "step": 50740 }, { "epoch": 5.076, "grad_norm": 0.36123931407928467, "learning_rate": 4.924492449244925e-05, "loss": 0.1258, "mean_token_accuracy": 0.9815755695104599, "step": 50760 }, { "epoch": 5.078, "grad_norm": 0.25893840193748474, "learning_rate": 4.9224922492249223e-05, "loss": 0.2824, "mean_token_accuracy": 0.9786216139793396, "step": 50780 }, { "epoch": 5.08, "grad_norm": 0.3028656244277954, "learning_rate": 4.9204920492049207e-05, "loss": 0.3332, "mean_token_accuracy": 0.9709639400243759, "step": 50800 }, { "epoch": 5.082, "grad_norm": 0.4152524471282959, "learning_rate": 4.918491849184918e-05, "loss": 0.1556, "mean_token_accuracy": 0.9782142877578736, "step": 50820 }, { "epoch": 5.084, "grad_norm": 0.24521347880363464, "learning_rate": 4.9164916491649166e-05, "loss": 0.1952, "mean_token_accuracy": 0.9747226476669312, "step": 50840 }, { "epoch": 5.086, "grad_norm": 0.3762798011302948, "learning_rate": 4.914491449144914e-05, "loss": 0.1988, "mean_token_accuracy": 0.9830172538757325, "step": 50860 }, { "epoch": 5.088, "grad_norm": 0.2505640983581543, "learning_rate": 4.9124912491249125e-05, "loss": 0.2566, "mean_token_accuracy": 0.9809012174606323, "step": 50880 }, { "epoch": 5.09, "grad_norm": 0.33433797955513, "learning_rate": 4.910491049104911e-05, "loss": 0.1689, "mean_token_accuracy": 0.9792927384376526, "step": 50900 }, { "epoch": 5.092, "grad_norm": 0.33575162291526794, "learning_rate": 4.9084908490849084e-05, "loss": 0.2561, "mean_token_accuracy": 0.9756198018789292, "step": 50920 }, { "epoch": 5.094, "grad_norm": 0.4663751721382141, "learning_rate": 4.906490649064907e-05, "loss": 0.1697, "mean_token_accuracy": 0.980192032456398, "step": 50940 }, { "epoch": 5.096, "grad_norm": 0.34543904662132263, "learning_rate": 4.9044904490449044e-05, "loss": 0.2953, "mean_token_accuracy": 0.9745420038700103, "step": 50960 }, { "epoch": 5.098, "grad_norm": 1.1109944581985474, "learning_rate": 4.902490249024903e-05, "loss": 0.2659, "mean_token_accuracy": 0.9779754728078842, "step": 50980 }, { "epoch": 5.1, "grad_norm": 0.5328553915023804, "learning_rate": 4.9004900490049e-05, "loss": 0.2323, "mean_token_accuracy": 0.983553209900856, "step": 51000 }, { "epoch": 5.102, "grad_norm": 0.5012798309326172, "learning_rate": 4.8984898489848986e-05, "loss": 0.1766, "mean_token_accuracy": 0.9830692827701568, "step": 51020 }, { "epoch": 5.104, "grad_norm": 0.28149086236953735, "learning_rate": 4.896489648964896e-05, "loss": 0.186, "mean_token_accuracy": 0.9746704280376435, "step": 51040 }, { "epoch": 5.106, "grad_norm": 0.2929629385471344, "learning_rate": 4.8944894489448945e-05, "loss": 0.2268, "mean_token_accuracy": 0.9784418106079101, "step": 51060 }, { "epoch": 5.108, "grad_norm": 0.7950888276100159, "learning_rate": 4.892489248924892e-05, "loss": 0.1345, "mean_token_accuracy": 0.9802918672561646, "step": 51080 }, { "epoch": 5.11, "grad_norm": 1.6839797496795654, "learning_rate": 4.8904890489048905e-05, "loss": 0.226, "mean_token_accuracy": 0.9745178461074829, "step": 51100 }, { "epoch": 5.112, "grad_norm": 0.38252711296081543, "learning_rate": 4.888488848884889e-05, "loss": 0.1914, "mean_token_accuracy": 0.9780081808567047, "step": 51120 }, { "epoch": 5.114, "grad_norm": 0.5129145979881287, "learning_rate": 4.886488648864887e-05, "loss": 0.1541, "mean_token_accuracy": 0.9748898118734359, "step": 51140 }, { "epoch": 5.116, "grad_norm": 0.25289779901504517, "learning_rate": 4.884488448844885e-05, "loss": 0.2584, "mean_token_accuracy": 0.9818875968456269, "step": 51160 }, { "epoch": 5.118, "grad_norm": 0.25186315178871155, "learning_rate": 4.882488248824883e-05, "loss": 0.1976, "mean_token_accuracy": 0.981270968914032, "step": 51180 }, { "epoch": 5.12, "grad_norm": 3.2822203636169434, "learning_rate": 4.8804880488048806e-05, "loss": 0.2914, "mean_token_accuracy": 0.9760506629943848, "step": 51200 }, { "epoch": 5.122, "grad_norm": 0.47284016013145447, "learning_rate": 4.878487848784879e-05, "loss": 0.1595, "mean_token_accuracy": 0.9837116956710815, "step": 51220 }, { "epoch": 5.124, "grad_norm": 0.6612608432769775, "learning_rate": 4.876487648764877e-05, "loss": 0.1653, "mean_token_accuracy": 0.9799239158630371, "step": 51240 }, { "epoch": 5.126, "grad_norm": 0.3415968418121338, "learning_rate": 4.874487448744875e-05, "loss": 0.3113, "mean_token_accuracy": 0.9715928137302399, "step": 51260 }, { "epoch": 5.128, "grad_norm": 0.25708791613578796, "learning_rate": 4.872487248724873e-05, "loss": 0.248, "mean_token_accuracy": 0.9817256391048431, "step": 51280 }, { "epoch": 5.13, "grad_norm": 0.31623125076293945, "learning_rate": 4.870487048704871e-05, "loss": 0.195, "mean_token_accuracy": 0.9807168781757355, "step": 51300 }, { "epoch": 5.132, "grad_norm": 0.8904235363006592, "learning_rate": 4.868486848684869e-05, "loss": 0.2696, "mean_token_accuracy": 0.9788030922412873, "step": 51320 }, { "epoch": 5.134, "grad_norm": 0.32543060183525085, "learning_rate": 4.866486648664867e-05, "loss": 0.1493, "mean_token_accuracy": 0.9746747761964798, "step": 51340 }, { "epoch": 5.136, "grad_norm": 0.46732962131500244, "learning_rate": 4.864486448644865e-05, "loss": 0.3154, "mean_token_accuracy": 0.9725378692150116, "step": 51360 }, { "epoch": 5.138, "grad_norm": 0.3435992896556854, "learning_rate": 4.862486248624863e-05, "loss": 0.1558, "mean_token_accuracy": 0.9806052446365356, "step": 51380 }, { "epoch": 5.14, "grad_norm": 0.4443340003490448, "learning_rate": 4.860486048604861e-05, "loss": 0.1927, "mean_token_accuracy": 0.9813200622797013, "step": 51400 }, { "epoch": 5.142, "grad_norm": 0.5917695760726929, "learning_rate": 4.8584858485848586e-05, "loss": 0.3698, "mean_token_accuracy": 0.9744754016399384, "step": 51420 }, { "epoch": 5.144, "grad_norm": 0.29646268486976624, "learning_rate": 4.856485648564857e-05, "loss": 0.2219, "mean_token_accuracy": 0.980954909324646, "step": 51440 }, { "epoch": 5.146, "grad_norm": 0.3776611089706421, "learning_rate": 4.8544854485448545e-05, "loss": 0.1993, "mean_token_accuracy": 0.9749773234128952, "step": 51460 }, { "epoch": 5.148, "grad_norm": 0.3404972553253174, "learning_rate": 4.852485248524853e-05, "loss": 0.1673, "mean_token_accuracy": 0.9829028904438019, "step": 51480 }, { "epoch": 5.15, "grad_norm": 0.36097824573516846, "learning_rate": 4.850485048504851e-05, "loss": 0.262, "mean_token_accuracy": 0.9863341212272644, "step": 51500 }, { "epoch": 5.152, "grad_norm": 0.3562147617340088, "learning_rate": 4.848484848484849e-05, "loss": 0.2551, "mean_token_accuracy": 0.9789530456066131, "step": 51520 }, { "epoch": 5.154, "grad_norm": 0.31639429926872253, "learning_rate": 4.846484648464847e-05, "loss": 0.1706, "mean_token_accuracy": 0.9798960745334625, "step": 51540 }, { "epoch": 5.156, "grad_norm": 0.3708163797855377, "learning_rate": 4.844484448444845e-05, "loss": 0.2351, "mean_token_accuracy": 0.978619521856308, "step": 51560 }, { "epoch": 5.158, "grad_norm": 0.4084985852241516, "learning_rate": 4.842484248424843e-05, "loss": 0.2615, "mean_token_accuracy": 0.9836827427148819, "step": 51580 }, { "epoch": 5.16, "grad_norm": 0.48485618829727173, "learning_rate": 4.8404840484048406e-05, "loss": 0.2489, "mean_token_accuracy": 0.9720619797706604, "step": 51600 }, { "epoch": 5.162, "grad_norm": 0.41916128993034363, "learning_rate": 4.838483848384839e-05, "loss": 0.2189, "mean_token_accuracy": 0.9824330240488053, "step": 51620 }, { "epoch": 5.164, "grad_norm": 0.4764823615550995, "learning_rate": 4.8364836483648366e-05, "loss": 0.2315, "mean_token_accuracy": 0.9783413946628571, "step": 51640 }, { "epoch": 5.166, "grad_norm": 0.5010421276092529, "learning_rate": 4.834483448344835e-05, "loss": 0.2308, "mean_token_accuracy": 0.9821982771158219, "step": 51660 }, { "epoch": 5.168, "grad_norm": 0.39313921332359314, "learning_rate": 4.8324832483248325e-05, "loss": 0.2636, "mean_token_accuracy": 0.9784429609775543, "step": 51680 }, { "epoch": 5.17, "grad_norm": 0.2674700617790222, "learning_rate": 4.830483048304831e-05, "loss": 0.2136, "mean_token_accuracy": 0.9808703511953354, "step": 51700 }, { "epoch": 5.172, "grad_norm": 0.23196619749069214, "learning_rate": 4.8284828482848284e-05, "loss": 0.2466, "mean_token_accuracy": 0.9809323161840439, "step": 51720 }, { "epoch": 5.174, "grad_norm": 0.3945944309234619, "learning_rate": 4.826482648264827e-05, "loss": 0.1503, "mean_token_accuracy": 0.9739989280700684, "step": 51740 }, { "epoch": 5.176, "grad_norm": 0.26027724146842957, "learning_rate": 4.824482448244825e-05, "loss": 0.3608, "mean_token_accuracy": 0.9709251046180725, "step": 51760 }, { "epoch": 5.178, "grad_norm": 0.3932245969772339, "learning_rate": 4.822482248224823e-05, "loss": 0.1604, "mean_token_accuracy": 0.9813632398843766, "step": 51780 }, { "epoch": 5.18, "grad_norm": 0.4270341098308563, "learning_rate": 4.820482048204821e-05, "loss": 0.1697, "mean_token_accuracy": 0.9801243513822555, "step": 51800 }, { "epoch": 5.182, "grad_norm": 0.3273424208164215, "learning_rate": 4.8184818481848186e-05, "loss": 0.1983, "mean_token_accuracy": 0.9762916892766953, "step": 51820 }, { "epoch": 5.184, "grad_norm": 0.462312251329422, "learning_rate": 4.816481648164817e-05, "loss": 0.16, "mean_token_accuracy": 0.9801619201898575, "step": 51840 }, { "epoch": 5.186, "grad_norm": 0.27926480770111084, "learning_rate": 4.8144814481448145e-05, "loss": 0.1637, "mean_token_accuracy": 0.9814834058284759, "step": 51860 }, { "epoch": 5.188, "grad_norm": 0.9240552186965942, "learning_rate": 4.812481248124813e-05, "loss": 0.2435, "mean_token_accuracy": 0.9819360822439194, "step": 51880 }, { "epoch": 5.19, "grad_norm": 0.4104342758655548, "learning_rate": 4.8104810481048105e-05, "loss": 0.1772, "mean_token_accuracy": 0.9819832414388656, "step": 51900 }, { "epoch": 5.192, "grad_norm": 0.3975203037261963, "learning_rate": 4.808480848084809e-05, "loss": 0.3916, "mean_token_accuracy": 0.978831672668457, "step": 51920 }, { "epoch": 5.194, "grad_norm": 0.27744072675704956, "learning_rate": 4.8064806480648064e-05, "loss": 0.2019, "mean_token_accuracy": 0.9755666464567184, "step": 51940 }, { "epoch": 5.196, "grad_norm": 0.41666167974472046, "learning_rate": 4.804480448044805e-05, "loss": 0.2079, "mean_token_accuracy": 0.9859124541282653, "step": 51960 }, { "epoch": 5.198, "grad_norm": 1.2902401685714722, "learning_rate": 4.802480248024803e-05, "loss": 0.2301, "mean_token_accuracy": 0.9802925944328308, "step": 51980 }, { "epoch": 5.2, "grad_norm": 0.45505979657173157, "learning_rate": 4.8004800480048006e-05, "loss": 0.2322, "mean_token_accuracy": 0.9829574823379517, "step": 52000 }, { "epoch": 5.202, "grad_norm": 7.558276176452637, "learning_rate": 4.798479847984799e-05, "loss": 0.1755, "mean_token_accuracy": 0.9822513908147812, "step": 52020 }, { "epoch": 5.204, "grad_norm": 0.24447529017925262, "learning_rate": 4.7964796479647966e-05, "loss": 0.1673, "mean_token_accuracy": 0.974457922577858, "step": 52040 }, { "epoch": 5.206, "grad_norm": 0.2590775191783905, "learning_rate": 4.794479447944795e-05, "loss": 0.2331, "mean_token_accuracy": 0.9754801303148269, "step": 52060 }, { "epoch": 5.208, "grad_norm": 0.37801316380500793, "learning_rate": 4.7924792479247925e-05, "loss": 0.2685, "mean_token_accuracy": 0.970135024189949, "step": 52080 }, { "epoch": 5.21, "grad_norm": 0.6817066073417664, "learning_rate": 4.790479047904791e-05, "loss": 0.2217, "mean_token_accuracy": 0.9825059801340104, "step": 52100 }, { "epoch": 5.212, "grad_norm": 0.3278687000274658, "learning_rate": 4.7884788478847884e-05, "loss": 0.218, "mean_token_accuracy": 0.9780094563961029, "step": 52120 }, { "epoch": 5.214, "grad_norm": 0.32052353024482727, "learning_rate": 4.786478647864787e-05, "loss": 0.2146, "mean_token_accuracy": 0.9798569172620774, "step": 52140 }, { "epoch": 5.216, "grad_norm": 0.24419263005256653, "learning_rate": 4.7844784478447844e-05, "loss": 0.126, "mean_token_accuracy": 0.9818146646022796, "step": 52160 }, { "epoch": 5.218, "grad_norm": 0.36759233474731445, "learning_rate": 4.782478247824783e-05, "loss": 0.1215, "mean_token_accuracy": 0.9834551751613617, "step": 52180 }, { "epoch": 5.22, "grad_norm": 0.24198931455612183, "learning_rate": 4.78047804780478e-05, "loss": 0.2004, "mean_token_accuracy": 0.9766809433698654, "step": 52200 }, { "epoch": 5.222, "grad_norm": 0.21472486853599548, "learning_rate": 4.7784778477847786e-05, "loss": 0.177, "mean_token_accuracy": 0.9830642223358155, "step": 52220 }, { "epoch": 5.224, "grad_norm": 0.7395322322845459, "learning_rate": 4.776477647764777e-05, "loss": 0.3385, "mean_token_accuracy": 0.9801821708679199, "step": 52240 }, { "epoch": 5.226, "grad_norm": 0.35265231132507324, "learning_rate": 4.7744774477447745e-05, "loss": 0.19, "mean_token_accuracy": 0.9787793248891831, "step": 52260 }, { "epoch": 5.228, "grad_norm": 0.6457259654998779, "learning_rate": 4.772477247724773e-05, "loss": 0.2836, "mean_token_accuracy": 0.9797410756349564, "step": 52280 }, { "epoch": 5.23, "grad_norm": 0.3330765664577484, "learning_rate": 4.7704770477047705e-05, "loss": 0.2107, "mean_token_accuracy": 0.985047098994255, "step": 52300 }, { "epoch": 5.232, "grad_norm": 0.2525431215763092, "learning_rate": 4.768476847684769e-05, "loss": 0.1784, "mean_token_accuracy": 0.9793926388025284, "step": 52320 }, { "epoch": 5.234, "grad_norm": 0.3704693019390106, "learning_rate": 4.7664766476647664e-05, "loss": 0.1985, "mean_token_accuracy": 0.9799446702003479, "step": 52340 }, { "epoch": 5.236, "grad_norm": 0.4348202347755432, "learning_rate": 4.764476447644765e-05, "loss": 0.1915, "mean_token_accuracy": 0.9803055971860886, "step": 52360 }, { "epoch": 5.2379999999999995, "grad_norm": 0.36757686734199524, "learning_rate": 4.762476247624762e-05, "loss": 0.2496, "mean_token_accuracy": 0.9773878812789917, "step": 52380 }, { "epoch": 5.24, "grad_norm": 0.37694188952445984, "learning_rate": 4.7604760476047606e-05, "loss": 0.1415, "mean_token_accuracy": 0.9815342038869858, "step": 52400 }, { "epoch": 5.242, "grad_norm": 0.18723544478416443, "learning_rate": 4.758475847584758e-05, "loss": 0.1652, "mean_token_accuracy": 0.9821707785129548, "step": 52420 }, { "epoch": 5.244, "grad_norm": 0.3788837194442749, "learning_rate": 4.7564756475647566e-05, "loss": 0.2571, "mean_token_accuracy": 0.9794730752706527, "step": 52440 }, { "epoch": 5.246, "grad_norm": 0.6089565753936768, "learning_rate": 4.754475447544754e-05, "loss": 0.2413, "mean_token_accuracy": 0.9730407148599625, "step": 52460 }, { "epoch": 5.248, "grad_norm": 2.1152517795562744, "learning_rate": 4.7524752475247525e-05, "loss": 0.2196, "mean_token_accuracy": 0.9708738684654236, "step": 52480 }, { "epoch": 5.25, "grad_norm": 1.3299182653427124, "learning_rate": 4.750475047504751e-05, "loss": 0.2462, "mean_token_accuracy": 0.9800086557865143, "step": 52500 }, { "epoch": 5.252, "grad_norm": 0.327279657125473, "learning_rate": 4.7484748474847484e-05, "loss": 0.1852, "mean_token_accuracy": 0.9781356662511825, "step": 52520 }, { "epoch": 5.254, "grad_norm": 0.39636361598968506, "learning_rate": 4.746474647464747e-05, "loss": 0.1643, "mean_token_accuracy": 0.9734513968229294, "step": 52540 }, { "epoch": 5.256, "grad_norm": 0.36810705065727234, "learning_rate": 4.7444744474447444e-05, "loss": 0.1794, "mean_token_accuracy": 0.9771336436271667, "step": 52560 }, { "epoch": 5.258, "grad_norm": 0.47226008772850037, "learning_rate": 4.742474247424743e-05, "loss": 0.2866, "mean_token_accuracy": 0.975891163945198, "step": 52580 }, { "epoch": 5.26, "grad_norm": 0.3356093168258667, "learning_rate": 4.74047404740474e-05, "loss": 0.2306, "mean_token_accuracy": 0.9728047460317611, "step": 52600 }, { "epoch": 5.2620000000000005, "grad_norm": 0.3352322280406952, "learning_rate": 4.7384738473847386e-05, "loss": 0.1892, "mean_token_accuracy": 0.9791698813438415, "step": 52620 }, { "epoch": 5.264, "grad_norm": 3.297248363494873, "learning_rate": 4.736473647364737e-05, "loss": 0.4591, "mean_token_accuracy": 0.9730041265487671, "step": 52640 }, { "epoch": 5.266, "grad_norm": 0.4363212585449219, "learning_rate": 4.734473447344735e-05, "loss": 0.2714, "mean_token_accuracy": 0.9800631374120712, "step": 52660 }, { "epoch": 5.268, "grad_norm": 0.3185291588306427, "learning_rate": 4.732473247324733e-05, "loss": 0.247, "mean_token_accuracy": 0.980915829539299, "step": 52680 }, { "epoch": 5.27, "grad_norm": 0.33638903498649597, "learning_rate": 4.730473047304731e-05, "loss": 0.1864, "mean_token_accuracy": 0.9750754773616791, "step": 52700 }, { "epoch": 5.272, "grad_norm": 1.1798157691955566, "learning_rate": 4.728472847284729e-05, "loss": 0.1799, "mean_token_accuracy": 0.9802220523357391, "step": 52720 }, { "epoch": 5.274, "grad_norm": 0.35023191571235657, "learning_rate": 4.726472647264727e-05, "loss": 0.1399, "mean_token_accuracy": 0.9808076322078705, "step": 52740 }, { "epoch": 5.276, "grad_norm": 0.2921421229839325, "learning_rate": 4.724472447244725e-05, "loss": 0.1679, "mean_token_accuracy": 0.9823468744754791, "step": 52760 }, { "epoch": 5.2780000000000005, "grad_norm": 0.3414359390735626, "learning_rate": 4.722472247224723e-05, "loss": 0.1544, "mean_token_accuracy": 0.9836916238069534, "step": 52780 }, { "epoch": 5.28, "grad_norm": 1.0787540674209595, "learning_rate": 4.7204720472047206e-05, "loss": 0.237, "mean_token_accuracy": 0.9759448766708374, "step": 52800 }, { "epoch": 5.282, "grad_norm": 0.332831472158432, "learning_rate": 4.718471847184719e-05, "loss": 0.1834, "mean_token_accuracy": 0.9808250814676285, "step": 52820 }, { "epoch": 5.284, "grad_norm": 0.28561288118362427, "learning_rate": 4.716471647164717e-05, "loss": 0.2165, "mean_token_accuracy": 0.9768164664506912, "step": 52840 }, { "epoch": 5.286, "grad_norm": 0.2622925937175751, "learning_rate": 4.714471447144715e-05, "loss": 0.1897, "mean_token_accuracy": 0.9711933106184005, "step": 52860 }, { "epoch": 5.288, "grad_norm": 0.4883337616920471, "learning_rate": 4.712471247124713e-05, "loss": 0.3731, "mean_token_accuracy": 0.9759874552488327, "step": 52880 }, { "epoch": 5.29, "grad_norm": 0.33436545729637146, "learning_rate": 4.710471047104711e-05, "loss": 0.1585, "mean_token_accuracy": 0.974277800321579, "step": 52900 }, { "epoch": 5.292, "grad_norm": 0.40300577878952026, "learning_rate": 4.708470847084709e-05, "loss": 0.2495, "mean_token_accuracy": 0.9818835169076919, "step": 52920 }, { "epoch": 5.294, "grad_norm": 0.23857352137565613, "learning_rate": 4.706470647064707e-05, "loss": 0.2878, "mean_token_accuracy": 0.979593300819397, "step": 52940 }, { "epoch": 5.296, "grad_norm": 0.9898882508277893, "learning_rate": 4.704470447044705e-05, "loss": 0.2164, "mean_token_accuracy": 0.9815830200910568, "step": 52960 }, { "epoch": 5.298, "grad_norm": 0.3458665609359741, "learning_rate": 4.7024702470247027e-05, "loss": 0.2175, "mean_token_accuracy": 0.9796840101480484, "step": 52980 }, { "epoch": 5.3, "grad_norm": 0.5314254760742188, "learning_rate": 4.700470047004701e-05, "loss": 0.1854, "mean_token_accuracy": 0.983336228132248, "step": 53000 }, { "epoch": 5.302, "grad_norm": 0.4657209515571594, "learning_rate": 4.6984698469846986e-05, "loss": 0.1673, "mean_token_accuracy": 0.9854593306779862, "step": 53020 }, { "epoch": 5.304, "grad_norm": 1.0586810111999512, "learning_rate": 4.696469646964697e-05, "loss": 0.1735, "mean_token_accuracy": 0.9803152918815613, "step": 53040 }, { "epoch": 5.306, "grad_norm": 0.3580987751483917, "learning_rate": 4.6944694469446945e-05, "loss": 0.2677, "mean_token_accuracy": 0.9833448052406311, "step": 53060 }, { "epoch": 5.308, "grad_norm": 0.864484965801239, "learning_rate": 4.692469246924693e-05, "loss": 0.2013, "mean_token_accuracy": 0.9813712298870086, "step": 53080 }, { "epoch": 5.31, "grad_norm": 0.5183056592941284, "learning_rate": 4.690469046904691e-05, "loss": 0.1613, "mean_token_accuracy": 0.9784564405679703, "step": 53100 }, { "epoch": 5.312, "grad_norm": 0.3336404860019684, "learning_rate": 4.688468846884689e-05, "loss": 0.2093, "mean_token_accuracy": 0.9781199097633362, "step": 53120 }, { "epoch": 5.314, "grad_norm": 0.35269540548324585, "learning_rate": 4.686468646864687e-05, "loss": 0.1866, "mean_token_accuracy": 0.9818723887205124, "step": 53140 }, { "epoch": 5.316, "grad_norm": 0.37119096517562866, "learning_rate": 4.684468446844685e-05, "loss": 0.2819, "mean_token_accuracy": 0.976993465423584, "step": 53160 }, { "epoch": 5.318, "grad_norm": 0.3353612720966339, "learning_rate": 4.682468246824683e-05, "loss": 0.3108, "mean_token_accuracy": 0.9754577875137329, "step": 53180 }, { "epoch": 5.32, "grad_norm": 3.1967267990112305, "learning_rate": 4.6804680468046806e-05, "loss": 0.2447, "mean_token_accuracy": 0.9809943288564682, "step": 53200 }, { "epoch": 5.322, "grad_norm": 0.2882085144519806, "learning_rate": 4.678467846784679e-05, "loss": 0.177, "mean_token_accuracy": 0.9764856636524201, "step": 53220 }, { "epoch": 5.324, "grad_norm": 0.3048015832901001, "learning_rate": 4.6764676467646766e-05, "loss": 0.1908, "mean_token_accuracy": 0.9766767978668213, "step": 53240 }, { "epoch": 5.326, "grad_norm": 18.67999839782715, "learning_rate": 4.674467446744675e-05, "loss": 0.167, "mean_token_accuracy": 0.977770259976387, "step": 53260 }, { "epoch": 5.328, "grad_norm": 0.33816176652908325, "learning_rate": 4.6724672467246725e-05, "loss": 0.3046, "mean_token_accuracy": 0.9806339412927627, "step": 53280 }, { "epoch": 5.33, "grad_norm": 0.7467902302742004, "learning_rate": 4.670467046704671e-05, "loss": 0.154, "mean_token_accuracy": 0.9802667379379273, "step": 53300 }, { "epoch": 5.332, "grad_norm": 0.3959951102733612, "learning_rate": 4.6684668466846684e-05, "loss": 0.248, "mean_token_accuracy": 0.9815631598234177, "step": 53320 }, { "epoch": 5.334, "grad_norm": 0.40341126918792725, "learning_rate": 4.666466646664667e-05, "loss": 0.2045, "mean_token_accuracy": 0.9731790661811829, "step": 53340 }, { "epoch": 5.336, "grad_norm": 0.36348551511764526, "learning_rate": 4.664466446644665e-05, "loss": 0.1853, "mean_token_accuracy": 0.9795125842094421, "step": 53360 }, { "epoch": 5.338, "grad_norm": 0.3122216463088989, "learning_rate": 4.6624662466246627e-05, "loss": 0.2016, "mean_token_accuracy": 0.9767312824726104, "step": 53380 }, { "epoch": 5.34, "grad_norm": 0.3081227242946625, "learning_rate": 4.660466046604661e-05, "loss": 0.1847, "mean_token_accuracy": 0.9752623409032821, "step": 53400 }, { "epoch": 5.342, "grad_norm": 0.2640342116355896, "learning_rate": 4.6584658465846586e-05, "loss": 0.172, "mean_token_accuracy": 0.977271169424057, "step": 53420 }, { "epoch": 5.344, "grad_norm": 0.4132683575153351, "learning_rate": 4.656465646564657e-05, "loss": 0.2221, "mean_token_accuracy": 0.9815975368022919, "step": 53440 }, { "epoch": 5.346, "grad_norm": 0.3736031651496887, "learning_rate": 4.6544654465446545e-05, "loss": 0.2175, "mean_token_accuracy": 0.9714543223381042, "step": 53460 }, { "epoch": 5.348, "grad_norm": 0.2629060447216034, "learning_rate": 4.652465246524653e-05, "loss": 0.2257, "mean_token_accuracy": 0.9783656001091003, "step": 53480 }, { "epoch": 5.35, "grad_norm": 0.3690784275531769, "learning_rate": 4.6504650465046504e-05, "loss": 0.2377, "mean_token_accuracy": 0.9707792311906814, "step": 53500 }, { "epoch": 5.352, "grad_norm": 0.5050088167190552, "learning_rate": 4.648464846484649e-05, "loss": 0.1604, "mean_token_accuracy": 0.9834676504135131, "step": 53520 }, { "epoch": 5.354, "grad_norm": 0.2605150043964386, "learning_rate": 4.6464646464646464e-05, "loss": 0.1993, "mean_token_accuracy": 0.9772179603576661, "step": 53540 }, { "epoch": 5.356, "grad_norm": 0.4532092213630676, "learning_rate": 4.644464446444645e-05, "loss": 0.1933, "mean_token_accuracy": 0.9783060878515244, "step": 53560 }, { "epoch": 5.358, "grad_norm": 0.5257278084754944, "learning_rate": 4.642464246424642e-05, "loss": 0.2503, "mean_token_accuracy": 0.9814976155757904, "step": 53580 }, { "epoch": 5.36, "grad_norm": 0.2886364758014679, "learning_rate": 4.6404640464046406e-05, "loss": 0.1727, "mean_token_accuracy": 0.97788507938385, "step": 53600 }, { "epoch": 5.362, "grad_norm": 0.403273344039917, "learning_rate": 4.638463846384639e-05, "loss": 0.1319, "mean_token_accuracy": 0.9807789534330368, "step": 53620 }, { "epoch": 5.364, "grad_norm": 0.34876468777656555, "learning_rate": 4.6364636463646365e-05, "loss": 0.2061, "mean_token_accuracy": 0.9806899130344391, "step": 53640 }, { "epoch": 5.366, "grad_norm": 3.815650463104248, "learning_rate": 4.634463446344635e-05, "loss": 0.221, "mean_token_accuracy": 0.9802520006895066, "step": 53660 }, { "epoch": 5.368, "grad_norm": 0.34827691316604614, "learning_rate": 4.6324632463246325e-05, "loss": 0.2963, "mean_token_accuracy": 0.9778765708208084, "step": 53680 }, { "epoch": 5.37, "grad_norm": 0.3634011149406433, "learning_rate": 4.630463046304631e-05, "loss": 0.1936, "mean_token_accuracy": 0.9778734624385834, "step": 53700 }, { "epoch": 5.372, "grad_norm": 1.3534226417541504, "learning_rate": 4.6284628462846284e-05, "loss": 0.1715, "mean_token_accuracy": 0.9817847907543182, "step": 53720 }, { "epoch": 5.374, "grad_norm": 0.313848614692688, "learning_rate": 4.626462646264627e-05, "loss": 0.2795, "mean_token_accuracy": 0.9755320310592651, "step": 53740 }, { "epoch": 5.376, "grad_norm": 0.4295641779899597, "learning_rate": 4.6244624462446243e-05, "loss": 0.2076, "mean_token_accuracy": 0.9792132109403611, "step": 53760 }, { "epoch": 5.378, "grad_norm": 0.36156097054481506, "learning_rate": 4.6224622462246226e-05, "loss": 0.2987, "mean_token_accuracy": 0.9652392745018006, "step": 53780 }, { "epoch": 5.38, "grad_norm": 0.4867600202560425, "learning_rate": 4.62046204620462e-05, "loss": 0.2037, "mean_token_accuracy": 0.9767342418432235, "step": 53800 }, { "epoch": 5.382, "grad_norm": 0.33150672912597656, "learning_rate": 4.6184618461846186e-05, "loss": 0.2522, "mean_token_accuracy": 0.980039831995964, "step": 53820 }, { "epoch": 5.384, "grad_norm": 0.3598178029060364, "learning_rate": 4.616461646164616e-05, "loss": 0.2072, "mean_token_accuracy": 0.9792965739965439, "step": 53840 }, { "epoch": 5.386, "grad_norm": 0.3861021399497986, "learning_rate": 4.6144614461446145e-05, "loss": 0.216, "mean_token_accuracy": 0.9816025137901306, "step": 53860 }, { "epoch": 5.388, "grad_norm": 0.3268469572067261, "learning_rate": 4.612461246124613e-05, "loss": 0.1375, "mean_token_accuracy": 0.9830299288034439, "step": 53880 }, { "epoch": 5.39, "grad_norm": 0.44071513414382935, "learning_rate": 4.6104610461046104e-05, "loss": 0.228, "mean_token_accuracy": 0.9814634531736374, "step": 53900 }, { "epoch": 5.392, "grad_norm": 0.28003764152526855, "learning_rate": 4.608460846084609e-05, "loss": 0.1392, "mean_token_accuracy": 0.9742489665746689, "step": 53920 }, { "epoch": 5.394, "grad_norm": 0.22718439996242523, "learning_rate": 4.6064606460646064e-05, "loss": 0.1547, "mean_token_accuracy": 0.980899977684021, "step": 53940 }, { "epoch": 5.396, "grad_norm": 0.34891608357429504, "learning_rate": 4.604460446044605e-05, "loss": 0.2608, "mean_token_accuracy": 0.9809980422258378, "step": 53960 }, { "epoch": 5.398, "grad_norm": 0.28281736373901367, "learning_rate": 4.602460246024602e-05, "loss": 0.2352, "mean_token_accuracy": 0.9810497403144837, "step": 53980 }, { "epoch": 5.4, "grad_norm": 0.3083273768424988, "learning_rate": 4.6004600460046006e-05, "loss": 0.2134, "mean_token_accuracy": 0.9796054422855377, "step": 54000 }, { "epoch": 5.402, "grad_norm": 0.33455726504325867, "learning_rate": 4.598459845984598e-05, "loss": 0.1901, "mean_token_accuracy": 0.9797475069761277, "step": 54020 }, { "epoch": 5.404, "grad_norm": 0.3573274314403534, "learning_rate": 4.5964596459645965e-05, "loss": 0.1761, "mean_token_accuracy": 0.9769040137529373, "step": 54040 }, { "epoch": 5.406, "grad_norm": 0.2984355390071869, "learning_rate": 4.594459445944594e-05, "loss": 0.2276, "mean_token_accuracy": 0.9788481026887894, "step": 54060 }, { "epoch": 5.408, "grad_norm": 0.30377885699272156, "learning_rate": 4.5924592459245925e-05, "loss": 0.132, "mean_token_accuracy": 0.9824376910924911, "step": 54080 }, { "epoch": 5.41, "grad_norm": 0.36450013518333435, "learning_rate": 4.59045904590459e-05, "loss": 0.2562, "mean_token_accuracy": 0.9788756400346756, "step": 54100 }, { "epoch": 5.412, "grad_norm": 0.4543377459049225, "learning_rate": 4.5884588458845884e-05, "loss": 0.3491, "mean_token_accuracy": 0.9773686677217484, "step": 54120 }, { "epoch": 5.414, "grad_norm": 0.3115576207637787, "learning_rate": 4.586458645864587e-05, "loss": 0.1499, "mean_token_accuracy": 0.9798816740512848, "step": 54140 }, { "epoch": 5.416, "grad_norm": 0.8316057324409485, "learning_rate": 4.584458445844585e-05, "loss": 0.1792, "mean_token_accuracy": 0.9851768225431442, "step": 54160 }, { "epoch": 5.418, "grad_norm": 2.3627700805664062, "learning_rate": 4.5824582458245826e-05, "loss": 0.1471, "mean_token_accuracy": 0.9803995162248611, "step": 54180 }, { "epoch": 5.42, "grad_norm": 0.5285453796386719, "learning_rate": 4.580458045804581e-05, "loss": 0.2101, "mean_token_accuracy": 0.9806190401315689, "step": 54200 }, { "epoch": 5.422, "grad_norm": 3.3297345638275146, "learning_rate": 4.578457845784579e-05, "loss": 0.2458, "mean_token_accuracy": 0.9805264800786972, "step": 54220 }, { "epoch": 5.424, "grad_norm": 0.49663472175598145, "learning_rate": 4.576457645764577e-05, "loss": 0.234, "mean_token_accuracy": 0.9740496933460235, "step": 54240 }, { "epoch": 5.426, "grad_norm": 0.3379811644554138, "learning_rate": 4.574457445744575e-05, "loss": 0.2362, "mean_token_accuracy": 0.9770863950252533, "step": 54260 }, { "epoch": 5.428, "grad_norm": 0.471243292093277, "learning_rate": 4.572457245724573e-05, "loss": 0.1758, "mean_token_accuracy": 0.9810789525508881, "step": 54280 }, { "epoch": 5.43, "grad_norm": 0.38110408186912537, "learning_rate": 4.570457045704571e-05, "loss": 0.2987, "mean_token_accuracy": 0.9819381147623062, "step": 54300 }, { "epoch": 5.432, "grad_norm": 0.28754475712776184, "learning_rate": 4.568456845684569e-05, "loss": 0.1632, "mean_token_accuracy": 0.9765417903661728, "step": 54320 }, { "epoch": 5.434, "grad_norm": 0.26820793747901917, "learning_rate": 4.566456645664567e-05, "loss": 0.1501, "mean_token_accuracy": 0.9812320530414581, "step": 54340 }, { "epoch": 5.436, "grad_norm": 0.3298952281475067, "learning_rate": 4.564456445644565e-05, "loss": 0.2806, "mean_token_accuracy": 0.9783230125904083, "step": 54360 }, { "epoch": 5.438, "grad_norm": 0.41673967242240906, "learning_rate": 4.562456245624563e-05, "loss": 0.2685, "mean_token_accuracy": 0.9740398079156876, "step": 54380 }, { "epoch": 5.44, "grad_norm": 0.31676968932151794, "learning_rate": 4.5604560456045606e-05, "loss": 0.1759, "mean_token_accuracy": 0.9751763463020324, "step": 54400 }, { "epoch": 5.442, "grad_norm": 0.4499315023422241, "learning_rate": 4.558455845584559e-05, "loss": 0.1493, "mean_token_accuracy": 0.9801555275917053, "step": 54420 }, { "epoch": 5.444, "grad_norm": 0.5331151485443115, "learning_rate": 4.5564556455645565e-05, "loss": 0.1777, "mean_token_accuracy": 0.9773860543966293, "step": 54440 }, { "epoch": 5.446, "grad_norm": 0.29704397916793823, "learning_rate": 4.554455445544555e-05, "loss": 0.2322, "mean_token_accuracy": 0.9781081676483154, "step": 54460 }, { "epoch": 5.448, "grad_norm": 0.3705858588218689, "learning_rate": 4.552455245524553e-05, "loss": 0.2129, "mean_token_accuracy": 0.9782767325639725, "step": 54480 }, { "epoch": 5.45, "grad_norm": 0.42845115065574646, "learning_rate": 4.550455045504551e-05, "loss": 0.2002, "mean_token_accuracy": 0.9787015587091445, "step": 54500 }, { "epoch": 5.452, "grad_norm": 0.39694666862487793, "learning_rate": 4.548454845484549e-05, "loss": 0.3098, "mean_token_accuracy": 0.9785013616085052, "step": 54520 }, { "epoch": 5.454, "grad_norm": 0.29602330923080444, "learning_rate": 4.546454645464547e-05, "loss": 0.2496, "mean_token_accuracy": 0.9780072063207627, "step": 54540 }, { "epoch": 5.456, "grad_norm": 0.38362476229667664, "learning_rate": 4.544454445444545e-05, "loss": 0.2274, "mean_token_accuracy": 0.9789666891098022, "step": 54560 }, { "epoch": 5.458, "grad_norm": 1.4138031005859375, "learning_rate": 4.5424542454245426e-05, "loss": 0.3021, "mean_token_accuracy": 0.9758157074451447, "step": 54580 }, { "epoch": 5.46, "grad_norm": 0.3561655580997467, "learning_rate": 4.540454045404541e-05, "loss": 0.253, "mean_token_accuracy": 0.9768504917621612, "step": 54600 }, { "epoch": 5.462, "grad_norm": 0.27754542231559753, "learning_rate": 4.5384538453845386e-05, "loss": 0.1365, "mean_token_accuracy": 0.9795873820781708, "step": 54620 }, { "epoch": 5.464, "grad_norm": 0.3136947453022003, "learning_rate": 4.536453645364537e-05, "loss": 0.1471, "mean_token_accuracy": 0.9808840751647949, "step": 54640 }, { "epoch": 5.466, "grad_norm": 0.3724389970302582, "learning_rate": 4.5344534453445345e-05, "loss": 0.153, "mean_token_accuracy": 0.9804904729127883, "step": 54660 }, { "epoch": 5.468, "grad_norm": 0.34824004769325256, "learning_rate": 4.532453245324533e-05, "loss": 0.228, "mean_token_accuracy": 0.977583384513855, "step": 54680 }, { "epoch": 5.47, "grad_norm": 2.023407220840454, "learning_rate": 4.5304530453045304e-05, "loss": 0.3582, "mean_token_accuracy": 0.9737777650356293, "step": 54700 }, { "epoch": 5.4719999999999995, "grad_norm": 0.3588600158691406, "learning_rate": 4.528452845284529e-05, "loss": 0.144, "mean_token_accuracy": 0.9785261750221252, "step": 54720 }, { "epoch": 5.474, "grad_norm": 0.40719088912010193, "learning_rate": 4.526452645264527e-05, "loss": 0.1297, "mean_token_accuracy": 0.9796646058559417, "step": 54740 }, { "epoch": 5.476, "grad_norm": 0.29258355498313904, "learning_rate": 4.524452445244525e-05, "loss": 0.1456, "mean_token_accuracy": 0.9830550163984298, "step": 54760 }, { "epoch": 5.478, "grad_norm": 0.39582014083862305, "learning_rate": 4.522452245224523e-05, "loss": 0.1697, "mean_token_accuracy": 0.9800173342227936, "step": 54780 }, { "epoch": 5.48, "grad_norm": 0.19482341408729553, "learning_rate": 4.5204520452045206e-05, "loss": 0.2204, "mean_token_accuracy": 0.9798511803150177, "step": 54800 }, { "epoch": 5.482, "grad_norm": 0.9808713793754578, "learning_rate": 4.518451845184519e-05, "loss": 0.2096, "mean_token_accuracy": 0.976726371049881, "step": 54820 }, { "epoch": 5.484, "grad_norm": 0.3118008077144623, "learning_rate": 4.5164516451645165e-05, "loss": 0.1855, "mean_token_accuracy": 0.976778045296669, "step": 54840 }, { "epoch": 5.486, "grad_norm": 0.3669881224632263, "learning_rate": 4.514451445144515e-05, "loss": 0.1919, "mean_token_accuracy": 0.9767959475517273, "step": 54860 }, { "epoch": 5.4879999999999995, "grad_norm": 0.3391495645046234, "learning_rate": 4.5124512451245125e-05, "loss": 0.1702, "mean_token_accuracy": 0.9808354258537293, "step": 54880 }, { "epoch": 5.49, "grad_norm": 0.34203335642814636, "learning_rate": 4.510451045104511e-05, "loss": 0.2628, "mean_token_accuracy": 0.9804449886083603, "step": 54900 }, { "epoch": 5.492, "grad_norm": 0.35187408328056335, "learning_rate": 4.5084508450845084e-05, "loss": 0.214, "mean_token_accuracy": 0.9800564974546433, "step": 54920 }, { "epoch": 5.494, "grad_norm": 0.38869065046310425, "learning_rate": 4.506450645064507e-05, "loss": 0.31, "mean_token_accuracy": 0.9773856550455093, "step": 54940 }, { "epoch": 5.496, "grad_norm": 0.35112959146499634, "learning_rate": 4.504450445044504e-05, "loss": 0.1739, "mean_token_accuracy": 0.9718163400888443, "step": 54960 }, { "epoch": 5.498, "grad_norm": 0.45618245005607605, "learning_rate": 4.5024502450245026e-05, "loss": 0.1993, "mean_token_accuracy": 0.9764459758996964, "step": 54980 }, { "epoch": 5.5, "grad_norm": 0.31004276871681213, "learning_rate": 4.500450045004501e-05, "loss": 0.4258, "mean_token_accuracy": 0.9736371248960495, "step": 55000 }, { "epoch": 5.502, "grad_norm": 0.37727126479148865, "learning_rate": 4.4984498449844986e-05, "loss": 0.3131, "mean_token_accuracy": 0.9770850211381912, "step": 55020 }, { "epoch": 5.504, "grad_norm": 0.6376299262046814, "learning_rate": 4.496449644964497e-05, "loss": 0.2064, "mean_token_accuracy": 0.9785244971513748, "step": 55040 }, { "epoch": 5.506, "grad_norm": 0.48406919836997986, "learning_rate": 4.4944494449444945e-05, "loss": 0.2758, "mean_token_accuracy": 0.9821186095476151, "step": 55060 }, { "epoch": 5.508, "grad_norm": 0.35804662108421326, "learning_rate": 4.492449244924493e-05, "loss": 0.2068, "mean_token_accuracy": 0.9784313350915909, "step": 55080 }, { "epoch": 5.51, "grad_norm": 0.28055185079574585, "learning_rate": 4.4904490449044904e-05, "loss": 0.1817, "mean_token_accuracy": 0.9787579953670502, "step": 55100 }, { "epoch": 5.5120000000000005, "grad_norm": 0.33029505610466003, "learning_rate": 4.488448844884489e-05, "loss": 0.1932, "mean_token_accuracy": 0.9824257522821427, "step": 55120 }, { "epoch": 5.514, "grad_norm": 0.388209730386734, "learning_rate": 4.4864486448644864e-05, "loss": 0.2158, "mean_token_accuracy": 0.9807987719774246, "step": 55140 }, { "epoch": 5.516, "grad_norm": 0.34081047773361206, "learning_rate": 4.484448444844485e-05, "loss": 0.3365, "mean_token_accuracy": 0.9785737484693527, "step": 55160 }, { "epoch": 5.518, "grad_norm": 0.3616770803928375, "learning_rate": 4.482448244824482e-05, "loss": 0.2392, "mean_token_accuracy": 0.9761659771203994, "step": 55180 }, { "epoch": 5.52, "grad_norm": 0.2877389192581177, "learning_rate": 4.4804480448044806e-05, "loss": 0.1506, "mean_token_accuracy": 0.9806297451257706, "step": 55200 }, { "epoch": 5.522, "grad_norm": 4.039425849914551, "learning_rate": 4.478447844784478e-05, "loss": 0.2135, "mean_token_accuracy": 0.9790421962738037, "step": 55220 }, { "epoch": 5.524, "grad_norm": 0.39750930666923523, "learning_rate": 4.4764476447644765e-05, "loss": 0.179, "mean_token_accuracy": 0.9724822908639907, "step": 55240 }, { "epoch": 5.526, "grad_norm": 0.560032844543457, "learning_rate": 4.474447444744475e-05, "loss": 0.1927, "mean_token_accuracy": 0.9730812102556229, "step": 55260 }, { "epoch": 5.5280000000000005, "grad_norm": 0.34663161635398865, "learning_rate": 4.4724472447244725e-05, "loss": 0.1598, "mean_token_accuracy": 0.9829982668161392, "step": 55280 }, { "epoch": 5.53, "grad_norm": 0.33829978108406067, "learning_rate": 4.470447044704471e-05, "loss": 0.2204, "mean_token_accuracy": 0.9765541702508926, "step": 55300 }, { "epoch": 5.532, "grad_norm": 0.3275889754295349, "learning_rate": 4.4684468446844684e-05, "loss": 0.1759, "mean_token_accuracy": 0.9756647169589996, "step": 55320 }, { "epoch": 5.534, "grad_norm": 0.40217092633247375, "learning_rate": 4.466446644664467e-05, "loss": 0.2154, "mean_token_accuracy": 0.9816769629716873, "step": 55340 }, { "epoch": 5.536, "grad_norm": 0.4656672179698944, "learning_rate": 4.464446444644464e-05, "loss": 0.1985, "mean_token_accuracy": 0.9817026168107986, "step": 55360 }, { "epoch": 5.538, "grad_norm": 0.5331560373306274, "learning_rate": 4.4624462446244626e-05, "loss": 0.2894, "mean_token_accuracy": 0.9764230132102967, "step": 55380 }, { "epoch": 5.54, "grad_norm": 1.044302225112915, "learning_rate": 4.46044604460446e-05, "loss": 0.3025, "mean_token_accuracy": 0.9818750143051147, "step": 55400 }, { "epoch": 5.542, "grad_norm": 0.32789406180381775, "learning_rate": 4.4584458445844586e-05, "loss": 0.2655, "mean_token_accuracy": 0.981196716427803, "step": 55420 }, { "epoch": 5.5440000000000005, "grad_norm": 0.31649020314216614, "learning_rate": 4.456445644564456e-05, "loss": 0.1621, "mean_token_accuracy": 0.9787332624197006, "step": 55440 }, { "epoch": 5.546, "grad_norm": 0.23607641458511353, "learning_rate": 4.4544454445444545e-05, "loss": 0.2887, "mean_token_accuracy": 0.972809785604477, "step": 55460 }, { "epoch": 5.548, "grad_norm": 1.146369218826294, "learning_rate": 4.452445244524452e-05, "loss": 0.177, "mean_token_accuracy": 0.9836510360240937, "step": 55480 }, { "epoch": 5.55, "grad_norm": 0.39392805099487305, "learning_rate": 4.4504450445044504e-05, "loss": 0.1952, "mean_token_accuracy": 0.9817927420139313, "step": 55500 }, { "epoch": 5.552, "grad_norm": 0.3308945298194885, "learning_rate": 4.448444844484449e-05, "loss": 0.2185, "mean_token_accuracy": 0.9808392882347107, "step": 55520 }, { "epoch": 5.554, "grad_norm": 1.7251638174057007, "learning_rate": 4.4464446444644464e-05, "loss": 0.2709, "mean_token_accuracy": 0.9797600597143173, "step": 55540 }, { "epoch": 5.556, "grad_norm": 0.32775217294692993, "learning_rate": 4.4444444444444447e-05, "loss": 0.244, "mean_token_accuracy": 0.9760156512260437, "step": 55560 }, { "epoch": 5.558, "grad_norm": 0.4880360960960388, "learning_rate": 4.442444244424442e-05, "loss": 0.216, "mean_token_accuracy": 0.9824769288301468, "step": 55580 }, { "epoch": 5.5600000000000005, "grad_norm": 0.2824866473674774, "learning_rate": 4.4404440444044406e-05, "loss": 0.1885, "mean_token_accuracy": 0.9803198337554931, "step": 55600 }, { "epoch": 5.562, "grad_norm": 0.4788757264614105, "learning_rate": 4.438443844384438e-05, "loss": 0.2312, "mean_token_accuracy": 0.976647213101387, "step": 55620 }, { "epoch": 5.564, "grad_norm": 0.2941882610321045, "learning_rate": 4.4364436443644365e-05, "loss": 0.2605, "mean_token_accuracy": 0.9837896347045898, "step": 55640 }, { "epoch": 5.566, "grad_norm": 0.39535605907440186, "learning_rate": 4.434443444344435e-05, "loss": 0.1355, "mean_token_accuracy": 0.9784809976816178, "step": 55660 }, { "epoch": 5.568, "grad_norm": 0.3228239119052887, "learning_rate": 4.432443244324433e-05, "loss": 0.1693, "mean_token_accuracy": 0.9762319535017013, "step": 55680 }, { "epoch": 5.57, "grad_norm": 0.411074697971344, "learning_rate": 4.430443044304431e-05, "loss": 0.2357, "mean_token_accuracy": 0.9765878796577454, "step": 55700 }, { "epoch": 5.572, "grad_norm": 0.8028706312179565, "learning_rate": 4.428442844284429e-05, "loss": 0.2215, "mean_token_accuracy": 0.9828480362892151, "step": 55720 }, { "epoch": 5.574, "grad_norm": 0.32578879594802856, "learning_rate": 4.426442644264427e-05, "loss": 0.1924, "mean_token_accuracy": 0.9792143076658248, "step": 55740 }, { "epoch": 5.576, "grad_norm": 0.5684463977813721, "learning_rate": 4.424442444244425e-05, "loss": 0.2314, "mean_token_accuracy": 0.9787602722644806, "step": 55760 }, { "epoch": 5.578, "grad_norm": 0.422922819852829, "learning_rate": 4.4224422442244226e-05, "loss": 0.1489, "mean_token_accuracy": 0.9771398544311524, "step": 55780 }, { "epoch": 5.58, "grad_norm": 0.3589858412742615, "learning_rate": 4.420442044204421e-05, "loss": 0.2046, "mean_token_accuracy": 0.9792209565639496, "step": 55800 }, { "epoch": 5.582, "grad_norm": 0.3582344949245453, "learning_rate": 4.4184418441844186e-05, "loss": 0.2089, "mean_token_accuracy": 0.9806838661432267, "step": 55820 }, { "epoch": 5.584, "grad_norm": 0.24583961069583893, "learning_rate": 4.416441644164417e-05, "loss": 0.1903, "mean_token_accuracy": 0.9794797718524932, "step": 55840 }, { "epoch": 5.586, "grad_norm": 1.2223669290542603, "learning_rate": 4.414441444144415e-05, "loss": 0.1803, "mean_token_accuracy": 0.9742105007171631, "step": 55860 }, { "epoch": 5.588, "grad_norm": 0.9492931962013245, "learning_rate": 4.412441244124413e-05, "loss": 0.3395, "mean_token_accuracy": 0.978675302863121, "step": 55880 }, { "epoch": 5.59, "grad_norm": 0.35025522112846375, "learning_rate": 4.410441044104411e-05, "loss": 0.1416, "mean_token_accuracy": 0.9758209109306335, "step": 55900 }, { "epoch": 5.592, "grad_norm": 0.4871929883956909, "learning_rate": 4.408440844084409e-05, "loss": 0.1772, "mean_token_accuracy": 0.9812789142131806, "step": 55920 }, { "epoch": 5.594, "grad_norm": 0.40865767002105713, "learning_rate": 4.406440644064407e-05, "loss": 0.2306, "mean_token_accuracy": 0.9765850305557251, "step": 55940 }, { "epoch": 5.596, "grad_norm": 0.326375275850296, "learning_rate": 4.4044404440444047e-05, "loss": 0.2426, "mean_token_accuracy": 0.9830195367336273, "step": 55960 }, { "epoch": 5.598, "grad_norm": 0.27891990542411804, "learning_rate": 4.402440244024403e-05, "loss": 0.1849, "mean_token_accuracy": 0.9779745191335678, "step": 55980 }, { "epoch": 5.6, "grad_norm": 0.2547537088394165, "learning_rate": 4.4004400440044006e-05, "loss": 0.2026, "mean_token_accuracy": 0.981719771027565, "step": 56000 }, { "epoch": 5.602, "grad_norm": 0.48127487301826477, "learning_rate": 4.398439843984399e-05, "loss": 0.2487, "mean_token_accuracy": 0.9785022497177124, "step": 56020 }, { "epoch": 5.604, "grad_norm": 0.35251471400260925, "learning_rate": 4.3964396439643965e-05, "loss": 0.2152, "mean_token_accuracy": 0.9791264176368714, "step": 56040 }, { "epoch": 5.606, "grad_norm": 0.3268388509750366, "learning_rate": 4.394439443944395e-05, "loss": 0.248, "mean_token_accuracy": 0.9814691185951233, "step": 56060 }, { "epoch": 5.608, "grad_norm": 0.400949627161026, "learning_rate": 4.3924392439243924e-05, "loss": 0.1462, "mean_token_accuracy": 0.9793224990367889, "step": 56080 }, { "epoch": 5.61, "grad_norm": 0.3834770917892456, "learning_rate": 4.390439043904391e-05, "loss": 0.1919, "mean_token_accuracy": 0.9811149537563324, "step": 56100 }, { "epoch": 5.612, "grad_norm": 0.33682534098625183, "learning_rate": 4.388438843884389e-05, "loss": 0.1515, "mean_token_accuracy": 0.9758371412754059, "step": 56120 }, { "epoch": 5.614, "grad_norm": 0.2042650729417801, "learning_rate": 4.386438643864387e-05, "loss": 0.1431, "mean_token_accuracy": 0.9847969859838486, "step": 56140 }, { "epoch": 5.616, "grad_norm": 0.9859150648117065, "learning_rate": 4.384438443844385e-05, "loss": 0.285, "mean_token_accuracy": 0.9782578259706497, "step": 56160 }, { "epoch": 5.618, "grad_norm": 0.3391551077365875, "learning_rate": 4.3824382438243826e-05, "loss": 0.1867, "mean_token_accuracy": 0.9744471907615662, "step": 56180 }, { "epoch": 5.62, "grad_norm": 0.5252114534378052, "learning_rate": 4.380438043804381e-05, "loss": 0.4716, "mean_token_accuracy": 0.9762915909290314, "step": 56200 }, { "epoch": 5.622, "grad_norm": 3.9747684001922607, "learning_rate": 4.3784378437843785e-05, "loss": 0.1631, "mean_token_accuracy": 0.9825115650892258, "step": 56220 }, { "epoch": 5.624, "grad_norm": 4.561916351318359, "learning_rate": 4.376437643764377e-05, "loss": 0.3358, "mean_token_accuracy": 0.9669644117355347, "step": 56240 }, { "epoch": 5.626, "grad_norm": 0.3444564640522003, "learning_rate": 4.3744374437443745e-05, "loss": 0.2039, "mean_token_accuracy": 0.9805388361215591, "step": 56260 }, { "epoch": 5.628, "grad_norm": 0.3305828273296356, "learning_rate": 4.372437243724373e-05, "loss": 0.2227, "mean_token_accuracy": 0.9824455827474594, "step": 56280 }, { "epoch": 5.63, "grad_norm": 0.4366151988506317, "learning_rate": 4.3704370437043704e-05, "loss": 0.1947, "mean_token_accuracy": 0.9776460379362106, "step": 56300 }, { "epoch": 5.632, "grad_norm": 1.1918121576309204, "learning_rate": 4.368436843684369e-05, "loss": 0.1217, "mean_token_accuracy": 0.9772636830806732, "step": 56320 }, { "epoch": 5.634, "grad_norm": 0.301749587059021, "learning_rate": 4.3664366436643663e-05, "loss": 0.1887, "mean_token_accuracy": 0.9836124002933502, "step": 56340 }, { "epoch": 5.636, "grad_norm": 0.36222946643829346, "learning_rate": 4.3644364436443646e-05, "loss": 0.248, "mean_token_accuracy": 0.983824822306633, "step": 56360 }, { "epoch": 5.638, "grad_norm": 0.23189443349838257, "learning_rate": 4.362436243624363e-05, "loss": 0.1941, "mean_token_accuracy": 0.9736188977956772, "step": 56380 }, { "epoch": 5.64, "grad_norm": 0.3843432068824768, "learning_rate": 4.3604360436043606e-05, "loss": 0.2122, "mean_token_accuracy": 0.9831967949867249, "step": 56400 }, { "epoch": 5.642, "grad_norm": 0.24772924184799194, "learning_rate": 4.358435843584359e-05, "loss": 0.1808, "mean_token_accuracy": 0.9800303280353546, "step": 56420 }, { "epoch": 5.644, "grad_norm": 0.3139965236186981, "learning_rate": 4.3564356435643565e-05, "loss": 0.2402, "mean_token_accuracy": 0.9757402032613754, "step": 56440 }, { "epoch": 5.646, "grad_norm": 0.3399488031864166, "learning_rate": 4.354435443544355e-05, "loss": 0.208, "mean_token_accuracy": 0.9800664693117142, "step": 56460 }, { "epoch": 5.648, "grad_norm": 3.107914686203003, "learning_rate": 4.3524352435243524e-05, "loss": 0.2514, "mean_token_accuracy": 0.9718166887760162, "step": 56480 }, { "epoch": 5.65, "grad_norm": 0.34249526262283325, "learning_rate": 4.350435043504351e-05, "loss": 0.3811, "mean_token_accuracy": 0.9717596054077149, "step": 56500 }, { "epoch": 5.652, "grad_norm": 0.25553175806999207, "learning_rate": 4.3484348434843484e-05, "loss": 0.1883, "mean_token_accuracy": 0.978977307677269, "step": 56520 }, { "epoch": 5.654, "grad_norm": 0.4816286861896515, "learning_rate": 4.346434643464347e-05, "loss": 0.1766, "mean_token_accuracy": 0.9817674785852433, "step": 56540 }, { "epoch": 5.656, "grad_norm": 0.6499930024147034, "learning_rate": 4.344434443444344e-05, "loss": 0.2549, "mean_token_accuracy": 0.9751752108335495, "step": 56560 }, { "epoch": 5.658, "grad_norm": 0.37031981348991394, "learning_rate": 4.3424342434243426e-05, "loss": 0.3263, "mean_token_accuracy": 0.9722284078598022, "step": 56580 }, { "epoch": 5.66, "grad_norm": 0.3418433666229248, "learning_rate": 4.34043404340434e-05, "loss": 0.2097, "mean_token_accuracy": 0.9792511969804764, "step": 56600 }, { "epoch": 5.662, "grad_norm": 0.2839960753917694, "learning_rate": 4.3384338433843385e-05, "loss": 0.2764, "mean_token_accuracy": 0.9785633862018586, "step": 56620 }, { "epoch": 5.664, "grad_norm": 0.33902469277381897, "learning_rate": 4.336433643364337e-05, "loss": 0.1725, "mean_token_accuracy": 0.9781011879444123, "step": 56640 }, { "epoch": 5.666, "grad_norm": 0.38872256875038147, "learning_rate": 4.3344334433443345e-05, "loss": 0.1701, "mean_token_accuracy": 0.9738655328750611, "step": 56660 }, { "epoch": 5.668, "grad_norm": 0.4539215862751007, "learning_rate": 4.332433243324333e-05, "loss": 0.1564, "mean_token_accuracy": 0.9825784713029861, "step": 56680 }, { "epoch": 5.67, "grad_norm": 0.39540308713912964, "learning_rate": 4.3304330433043304e-05, "loss": 0.2129, "mean_token_accuracy": 0.9748787164688111, "step": 56700 }, { "epoch": 5.672, "grad_norm": 0.42904624342918396, "learning_rate": 4.328432843284329e-05, "loss": 0.1676, "mean_token_accuracy": 0.9793749332427979, "step": 56720 }, { "epoch": 5.674, "grad_norm": 0.387501060962677, "learning_rate": 4.326432643264326e-05, "loss": 0.1454, "mean_token_accuracy": 0.9819690704345703, "step": 56740 }, { "epoch": 5.676, "grad_norm": 0.2668623626232147, "learning_rate": 4.3244324432443246e-05, "loss": 0.1786, "mean_token_accuracy": 0.9812593758106232, "step": 56760 }, { "epoch": 5.678, "grad_norm": 0.4106128513813019, "learning_rate": 4.322432243224322e-05, "loss": 0.19, "mean_token_accuracy": 0.983023265004158, "step": 56780 }, { "epoch": 5.68, "grad_norm": 0.30120575428009033, "learning_rate": 4.3204320432043206e-05, "loss": 0.2393, "mean_token_accuracy": 0.9752364307641983, "step": 56800 }, { "epoch": 5.682, "grad_norm": 0.38123539090156555, "learning_rate": 4.318431843184318e-05, "loss": 0.2337, "mean_token_accuracy": 0.9803376793861389, "step": 56820 }, { "epoch": 5.684, "grad_norm": 0.3045507073402405, "learning_rate": 4.3164316431643165e-05, "loss": 0.2564, "mean_token_accuracy": 0.9747986882925034, "step": 56840 }, { "epoch": 5.686, "grad_norm": 0.8786834478378296, "learning_rate": 4.314431443144314e-05, "loss": 0.2807, "mean_token_accuracy": 0.971024689078331, "step": 56860 }, { "epoch": 5.688, "grad_norm": 0.34173405170440674, "learning_rate": 4.3124312431243124e-05, "loss": 0.2821, "mean_token_accuracy": 0.9804856151342392, "step": 56880 }, { "epoch": 5.6899999999999995, "grad_norm": 0.2586211562156677, "learning_rate": 4.310431043104311e-05, "loss": 0.1333, "mean_token_accuracy": 0.9809400588274002, "step": 56900 }, { "epoch": 5.692, "grad_norm": 0.296110063791275, "learning_rate": 4.3084308430843084e-05, "loss": 0.1727, "mean_token_accuracy": 0.9795963406562805, "step": 56920 }, { "epoch": 5.694, "grad_norm": 0.3243914544582367, "learning_rate": 4.306430643064307e-05, "loss": 0.187, "mean_token_accuracy": 0.9740759164094925, "step": 56940 }, { "epoch": 5.696, "grad_norm": 0.35637280344963074, "learning_rate": 4.304430443044304e-05, "loss": 0.1729, "mean_token_accuracy": 0.978412976861, "step": 56960 }, { "epoch": 5.698, "grad_norm": 0.2721569538116455, "learning_rate": 4.3024302430243026e-05, "loss": 0.1475, "mean_token_accuracy": 0.9775982618331909, "step": 56980 }, { "epoch": 5.7, "grad_norm": 0.5282973051071167, "learning_rate": 4.3004300430043e-05, "loss": 0.1842, "mean_token_accuracy": 0.9793015718460083, "step": 57000 }, { "epoch": 5.702, "grad_norm": 0.3283464312553406, "learning_rate": 4.2984298429842985e-05, "loss": 0.2813, "mean_token_accuracy": 0.9710642129182816, "step": 57020 }, { "epoch": 5.704, "grad_norm": 0.3045545220375061, "learning_rate": 4.296429642964296e-05, "loss": 0.2913, "mean_token_accuracy": 0.9756669372320175, "step": 57040 }, { "epoch": 5.7059999999999995, "grad_norm": 0.4572155475616455, "learning_rate": 4.2944294429442945e-05, "loss": 0.2914, "mean_token_accuracy": 0.980001351237297, "step": 57060 }, { "epoch": 5.708, "grad_norm": 0.7172882556915283, "learning_rate": 4.292429242924292e-05, "loss": 0.2311, "mean_token_accuracy": 0.977746844291687, "step": 57080 }, { "epoch": 5.71, "grad_norm": 0.4142398536205292, "learning_rate": 4.2904290429042904e-05, "loss": 0.2723, "mean_token_accuracy": 0.9692842364311218, "step": 57100 }, { "epoch": 5.712, "grad_norm": 0.405947744846344, "learning_rate": 4.288428842884289e-05, "loss": 0.218, "mean_token_accuracy": 0.9803596198558807, "step": 57120 }, { "epoch": 5.714, "grad_norm": 0.3358750641345978, "learning_rate": 4.286428642864286e-05, "loss": 0.2475, "mean_token_accuracy": 0.9746206194162369, "step": 57140 }, { "epoch": 5.716, "grad_norm": 0.2815413177013397, "learning_rate": 4.2844284428442846e-05, "loss": 0.1431, "mean_token_accuracy": 0.9762867748737335, "step": 57160 }, { "epoch": 5.718, "grad_norm": 0.3676919639110565, "learning_rate": 4.282428242824283e-05, "loss": 0.2223, "mean_token_accuracy": 0.973769822716713, "step": 57180 }, { "epoch": 5.72, "grad_norm": 0.3937438130378723, "learning_rate": 4.2804280428042806e-05, "loss": 0.3195, "mean_token_accuracy": 0.9768236309289933, "step": 57200 }, { "epoch": 5.7219999999999995, "grad_norm": 0.4882102310657501, "learning_rate": 4.278427842784279e-05, "loss": 0.1979, "mean_token_accuracy": 0.978671869635582, "step": 57220 }, { "epoch": 5.724, "grad_norm": 3.339160919189453, "learning_rate": 4.276427642764277e-05, "loss": 0.2567, "mean_token_accuracy": 0.9760376214981079, "step": 57240 }, { "epoch": 5.726, "grad_norm": 0.3337726294994354, "learning_rate": 4.274427442744275e-05, "loss": 0.1651, "mean_token_accuracy": 0.9793857872486115, "step": 57260 }, { "epoch": 5.728, "grad_norm": 5.463102340698242, "learning_rate": 4.272427242724273e-05, "loss": 0.1455, "mean_token_accuracy": 0.9738187223672867, "step": 57280 }, { "epoch": 5.73, "grad_norm": 0.3984277844429016, "learning_rate": 4.270427042704271e-05, "loss": 0.2341, "mean_token_accuracy": 0.9769771695137024, "step": 57300 }, { "epoch": 5.732, "grad_norm": 0.25798875093460083, "learning_rate": 4.268426842684269e-05, "loss": 0.2775, "mean_token_accuracy": 0.9835726976394653, "step": 57320 }, { "epoch": 5.734, "grad_norm": 0.4196510910987854, "learning_rate": 4.266426642664267e-05, "loss": 0.2676, "mean_token_accuracy": 0.9792571991682053, "step": 57340 }, { "epoch": 5.736, "grad_norm": 0.3711874186992645, "learning_rate": 4.264426442644265e-05, "loss": 0.2068, "mean_token_accuracy": 0.9775934010744095, "step": 57360 }, { "epoch": 5.7379999999999995, "grad_norm": 0.43344220519065857, "learning_rate": 4.2624262426242626e-05, "loss": 0.2268, "mean_token_accuracy": 0.9819819033145905, "step": 57380 }, { "epoch": 5.74, "grad_norm": 0.47380512952804565, "learning_rate": 4.260426042604261e-05, "loss": 0.2132, "mean_token_accuracy": 0.9819375663995743, "step": 57400 }, { "epoch": 5.742, "grad_norm": 4.969549655914307, "learning_rate": 4.2584258425842585e-05, "loss": 0.2804, "mean_token_accuracy": 0.9797096818685531, "step": 57420 }, { "epoch": 5.744, "grad_norm": 0.3900277018547058, "learning_rate": 4.256425642564257e-05, "loss": 0.185, "mean_token_accuracy": 0.9821339160203933, "step": 57440 }, { "epoch": 5.746, "grad_norm": 0.3198936879634857, "learning_rate": 4.2544254425442545e-05, "loss": 0.1688, "mean_token_accuracy": 0.9826356172561646, "step": 57460 }, { "epoch": 5.748, "grad_norm": 0.2719173729419708, "learning_rate": 4.252425242524253e-05, "loss": 0.2847, "mean_token_accuracy": 0.9805544346570969, "step": 57480 }, { "epoch": 5.75, "grad_norm": 0.3485959768295288, "learning_rate": 4.250425042504251e-05, "loss": 0.113, "mean_token_accuracy": 0.9823164433240891, "step": 57500 }, { "epoch": 5.752, "grad_norm": 0.5338280200958252, "learning_rate": 4.248424842484249e-05, "loss": 0.1888, "mean_token_accuracy": 0.9817294985055923, "step": 57520 }, { "epoch": 5.754, "grad_norm": 0.6206852793693542, "learning_rate": 4.246424642464247e-05, "loss": 0.2608, "mean_token_accuracy": 0.9777007937431336, "step": 57540 }, { "epoch": 5.756, "grad_norm": 0.5298039317131042, "learning_rate": 4.2444244424442446e-05, "loss": 0.2688, "mean_token_accuracy": 0.9778906971216201, "step": 57560 }, { "epoch": 5.758, "grad_norm": 0.43006595969200134, "learning_rate": 4.242424242424243e-05, "loss": 0.4173, "mean_token_accuracy": 0.9645839601755142, "step": 57580 }, { "epoch": 5.76, "grad_norm": 0.4869430959224701, "learning_rate": 4.2404240424042406e-05, "loss": 0.2341, "mean_token_accuracy": 0.9792276531457901, "step": 57600 }, { "epoch": 5.7620000000000005, "grad_norm": 0.3400184214115143, "learning_rate": 4.238423842384239e-05, "loss": 0.1957, "mean_token_accuracy": 0.9814266949892044, "step": 57620 }, { "epoch": 5.764, "grad_norm": 1.1949034929275513, "learning_rate": 4.2364236423642365e-05, "loss": 0.1905, "mean_token_accuracy": 0.9779545098543168, "step": 57640 }, { "epoch": 5.766, "grad_norm": 0.25556454062461853, "learning_rate": 4.234423442344235e-05, "loss": 0.23, "mean_token_accuracy": 0.9792509883642196, "step": 57660 }, { "epoch": 5.768, "grad_norm": 0.3437163829803467, "learning_rate": 4.2324232423242324e-05, "loss": 0.1941, "mean_token_accuracy": 0.9800981819629669, "step": 57680 }, { "epoch": 5.77, "grad_norm": 0.2709636986255646, "learning_rate": 4.230423042304231e-05, "loss": 0.1349, "mean_token_accuracy": 0.9805484354496002, "step": 57700 }, { "epoch": 5.772, "grad_norm": 0.6583581566810608, "learning_rate": 4.2284228422842284e-05, "loss": 0.1692, "mean_token_accuracy": 0.9824007540941239, "step": 57720 }, { "epoch": 5.774, "grad_norm": 1.0048335790634155, "learning_rate": 4.226422642264227e-05, "loss": 0.2118, "mean_token_accuracy": 0.9789935797452927, "step": 57740 }, { "epoch": 5.776, "grad_norm": 4.217690944671631, "learning_rate": 4.224422442244225e-05, "loss": 0.1847, "mean_token_accuracy": 0.9731710076332092, "step": 57760 }, { "epoch": 5.7780000000000005, "grad_norm": 0.48924025893211365, "learning_rate": 4.2224222422242226e-05, "loss": 0.2385, "mean_token_accuracy": 0.9804710358381271, "step": 57780 }, { "epoch": 5.78, "grad_norm": 0.42472201585769653, "learning_rate": 4.220422042204221e-05, "loss": 0.197, "mean_token_accuracy": 0.9761794060468674, "step": 57800 }, { "epoch": 5.782, "grad_norm": 0.3961001932621002, "learning_rate": 4.2184218421842185e-05, "loss": 0.1478, "mean_token_accuracy": 0.9771145880222321, "step": 57820 }, { "epoch": 5.784, "grad_norm": 9.61920166015625, "learning_rate": 4.216421642164217e-05, "loss": 0.3265, "mean_token_accuracy": 0.9704391866922378, "step": 57840 }, { "epoch": 5.786, "grad_norm": 0.39987486600875854, "learning_rate": 4.2144214421442145e-05, "loss": 0.1702, "mean_token_accuracy": 0.9805747240781784, "step": 57860 }, { "epoch": 5.788, "grad_norm": 5.4189300537109375, "learning_rate": 4.212421242124213e-05, "loss": 0.3159, "mean_token_accuracy": 0.9708470284938813, "step": 57880 }, { "epoch": 5.79, "grad_norm": 0.6015975475311279, "learning_rate": 4.2104210421042104e-05, "loss": 0.286, "mean_token_accuracy": 0.9738795101642609, "step": 57900 }, { "epoch": 5.792, "grad_norm": 0.46912243962287903, "learning_rate": 4.208420842084209e-05, "loss": 0.2043, "mean_token_accuracy": 0.9805169612169266, "step": 57920 }, { "epoch": 5.7940000000000005, "grad_norm": 0.442431777715683, "learning_rate": 4.206420642064206e-05, "loss": 0.1976, "mean_token_accuracy": 0.9667463392019272, "step": 57940 }, { "epoch": 5.796, "grad_norm": 0.3509601354598999, "learning_rate": 4.2044204420442046e-05, "loss": 0.2019, "mean_token_accuracy": 0.9792567521333695, "step": 57960 }, { "epoch": 5.798, "grad_norm": 0.2954523265361786, "learning_rate": 4.202420242024203e-05, "loss": 0.2677, "mean_token_accuracy": 0.9791784644126892, "step": 57980 }, { "epoch": 5.8, "grad_norm": 0.37243539094924927, "learning_rate": 4.2004200420042006e-05, "loss": 0.1304, "mean_token_accuracy": 0.9778010964393615, "step": 58000 }, { "epoch": 5.802, "grad_norm": 0.8619077801704407, "learning_rate": 4.198419841984199e-05, "loss": 0.2266, "mean_token_accuracy": 0.9821074277162551, "step": 58020 }, { "epoch": 5.804, "grad_norm": 0.34314608573913574, "learning_rate": 4.1964196419641965e-05, "loss": 0.164, "mean_token_accuracy": 0.9794174253940582, "step": 58040 }, { "epoch": 5.806, "grad_norm": 0.26243263483047485, "learning_rate": 4.194419441944195e-05, "loss": 0.1944, "mean_token_accuracy": 0.9816887140274048, "step": 58060 }, { "epoch": 5.808, "grad_norm": 0.3434557020664215, "learning_rate": 4.1924192419241924e-05, "loss": 0.2589, "mean_token_accuracy": 0.979188296198845, "step": 58080 }, { "epoch": 5.8100000000000005, "grad_norm": 0.3604726493358612, "learning_rate": 4.190419041904191e-05, "loss": 0.3173, "mean_token_accuracy": 0.9691259235143661, "step": 58100 }, { "epoch": 5.812, "grad_norm": 0.4519491195678711, "learning_rate": 4.1884188418841884e-05, "loss": 0.2317, "mean_token_accuracy": 0.9812194615602493, "step": 58120 }, { "epoch": 5.814, "grad_norm": 0.3970159590244293, "learning_rate": 4.1864186418641867e-05, "loss": 0.273, "mean_token_accuracy": 0.9812477886676788, "step": 58140 }, { "epoch": 5.816, "grad_norm": 0.4387889802455902, "learning_rate": 4.184418441844184e-05, "loss": 0.2013, "mean_token_accuracy": 0.9699615985155106, "step": 58160 }, { "epoch": 5.818, "grad_norm": 0.29654425382614136, "learning_rate": 4.1824182418241826e-05, "loss": 0.2781, "mean_token_accuracy": 0.9756461054086685, "step": 58180 }, { "epoch": 5.82, "grad_norm": 0.42017045617103577, "learning_rate": 4.18041804180418e-05, "loss": 0.3446, "mean_token_accuracy": 0.979296612739563, "step": 58200 }, { "epoch": 5.822, "grad_norm": 0.448875367641449, "learning_rate": 4.1784178417841785e-05, "loss": 0.2512, "mean_token_accuracy": 0.9822266697883606, "step": 58220 }, { "epoch": 5.824, "grad_norm": 0.30783611536026, "learning_rate": 4.176417641764177e-05, "loss": 0.2409, "mean_token_accuracy": 0.9765777856111526, "step": 58240 }, { "epoch": 5.826, "grad_norm": 0.2846779525279999, "learning_rate": 4.1744174417441745e-05, "loss": 0.2812, "mean_token_accuracy": 0.979139867424965, "step": 58260 }, { "epoch": 5.828, "grad_norm": 0.41217830777168274, "learning_rate": 4.172417241724173e-05, "loss": 0.1784, "mean_token_accuracy": 0.9756014376878739, "step": 58280 }, { "epoch": 5.83, "grad_norm": 2.4871888160705566, "learning_rate": 4.1704170417041704e-05, "loss": 0.4633, "mean_token_accuracy": 0.9683480232954025, "step": 58300 }, { "epoch": 5.832, "grad_norm": 0.3771604597568512, "learning_rate": 4.168416841684169e-05, "loss": 0.2058, "mean_token_accuracy": 0.9751060307025909, "step": 58320 }, { "epoch": 5.834, "grad_norm": 1.824070930480957, "learning_rate": 4.166416641664166e-05, "loss": 0.154, "mean_token_accuracy": 0.9817645102739334, "step": 58340 }, { "epoch": 5.836, "grad_norm": 0.5995447635650635, "learning_rate": 4.1644164416441646e-05, "loss": 0.2809, "mean_token_accuracy": 0.9788348466157913, "step": 58360 }, { "epoch": 5.838, "grad_norm": 0.38588571548461914, "learning_rate": 4.162416241624162e-05, "loss": 0.2099, "mean_token_accuracy": 0.9817722052335739, "step": 58380 }, { "epoch": 5.84, "grad_norm": 0.2745191156864166, "learning_rate": 4.1604160416041606e-05, "loss": 0.1858, "mean_token_accuracy": 0.9822569668293, "step": 58400 }, { "epoch": 5.842, "grad_norm": 0.3728424608707428, "learning_rate": 4.158415841584158e-05, "loss": 0.1408, "mean_token_accuracy": 0.9790921986103058, "step": 58420 }, { "epoch": 5.844, "grad_norm": 0.3059096336364746, "learning_rate": 4.1564156415641565e-05, "loss": 0.2602, "mean_token_accuracy": 0.9790444612503052, "step": 58440 }, { "epoch": 5.846, "grad_norm": 0.33661532402038574, "learning_rate": 4.154415441544154e-05, "loss": 0.2461, "mean_token_accuracy": 0.9732171833515167, "step": 58460 }, { "epoch": 5.848, "grad_norm": 0.3447761535644531, "learning_rate": 4.1524152415241524e-05, "loss": 0.164, "mean_token_accuracy": 0.9838398575782776, "step": 58480 }, { "epoch": 5.85, "grad_norm": 0.2499307096004486, "learning_rate": 4.150415041504151e-05, "loss": 0.1351, "mean_token_accuracy": 0.9812704652547837, "step": 58500 }, { "epoch": 5.852, "grad_norm": 0.639296293258667, "learning_rate": 4.1484148414841483e-05, "loss": 0.2387, "mean_token_accuracy": 0.9784160017967224, "step": 58520 }, { "epoch": 5.854, "grad_norm": 0.2693125605583191, "learning_rate": 4.1464146414641467e-05, "loss": 0.2074, "mean_token_accuracy": 0.9784303069114685, "step": 58540 }, { "epoch": 5.856, "grad_norm": 0.3392106592655182, "learning_rate": 4.144414441444144e-05, "loss": 0.3062, "mean_token_accuracy": 0.9790849357843399, "step": 58560 }, { "epoch": 5.858, "grad_norm": 0.32460078597068787, "learning_rate": 4.1424142414241426e-05, "loss": 0.2329, "mean_token_accuracy": 0.9751298993825912, "step": 58580 }, { "epoch": 5.86, "grad_norm": 0.3284367620944977, "learning_rate": 4.14041404140414e-05, "loss": 0.1132, "mean_token_accuracy": 0.9810712456703186, "step": 58600 }, { "epoch": 5.862, "grad_norm": 0.3488309979438782, "learning_rate": 4.1384138413841385e-05, "loss": 0.1583, "mean_token_accuracy": 0.9817476391792297, "step": 58620 }, { "epoch": 5.864, "grad_norm": 0.2670525014400482, "learning_rate": 4.136413641364136e-05, "loss": 0.3225, "mean_token_accuracy": 0.9692124217748642, "step": 58640 }, { "epoch": 5.866, "grad_norm": 2.962106943130493, "learning_rate": 4.134413441344135e-05, "loss": 0.1525, "mean_token_accuracy": 0.97934590280056, "step": 58660 }, { "epoch": 5.868, "grad_norm": 0.715602695941925, "learning_rate": 4.132413241324133e-05, "loss": 0.1867, "mean_token_accuracy": 0.9769827753305436, "step": 58680 }, { "epoch": 5.87, "grad_norm": 0.33154165744781494, "learning_rate": 4.130413041304131e-05, "loss": 0.2315, "mean_token_accuracy": 0.9790841400623321, "step": 58700 }, { "epoch": 5.872, "grad_norm": 0.30396929383277893, "learning_rate": 4.128412841284129e-05, "loss": 0.2004, "mean_token_accuracy": 0.9793908953666687, "step": 58720 }, { "epoch": 5.874, "grad_norm": 0.23002536594867706, "learning_rate": 4.126412641264127e-05, "loss": 0.2231, "mean_token_accuracy": 0.9789889216423034, "step": 58740 }, { "epoch": 5.876, "grad_norm": 0.8979139924049377, "learning_rate": 4.1244124412441246e-05, "loss": 0.1556, "mean_token_accuracy": 0.9788487106561661, "step": 58760 }, { "epoch": 5.878, "grad_norm": 0.25696760416030884, "learning_rate": 4.122412241224123e-05, "loss": 0.1549, "mean_token_accuracy": 0.9801557064056396, "step": 58780 }, { "epoch": 5.88, "grad_norm": 0.3865380585193634, "learning_rate": 4.1204120412041205e-05, "loss": 0.2823, "mean_token_accuracy": 0.9746944606304169, "step": 58800 }, { "epoch": 5.882, "grad_norm": 0.2980695962905884, "learning_rate": 4.118411841184119e-05, "loss": 0.1701, "mean_token_accuracy": 0.97918621301651, "step": 58820 }, { "epoch": 5.884, "grad_norm": 0.5756284594535828, "learning_rate": 4.1164116411641165e-05, "loss": 0.2601, "mean_token_accuracy": 0.9772479832172394, "step": 58840 }, { "epoch": 5.886, "grad_norm": 0.42838114500045776, "learning_rate": 4.114411441144115e-05, "loss": 0.195, "mean_token_accuracy": 0.9803133279085159, "step": 58860 }, { "epoch": 5.888, "grad_norm": 0.37715017795562744, "learning_rate": 4.112411241124113e-05, "loss": 0.2519, "mean_token_accuracy": 0.9790619552135468, "step": 58880 }, { "epoch": 5.89, "grad_norm": 0.38071194291114807, "learning_rate": 4.110411041104111e-05, "loss": 0.1971, "mean_token_accuracy": 0.9762496441602707, "step": 58900 }, { "epoch": 5.892, "grad_norm": 0.3153488337993622, "learning_rate": 4.108410841084109e-05, "loss": 0.1718, "mean_token_accuracy": 0.9779014706611633, "step": 58920 }, { "epoch": 5.894, "grad_norm": 1.1034210920333862, "learning_rate": 4.1064106410641066e-05, "loss": 0.1752, "mean_token_accuracy": 0.9777091920375824, "step": 58940 }, { "epoch": 5.896, "grad_norm": 2.611356019973755, "learning_rate": 4.104410441044105e-05, "loss": 0.3026, "mean_token_accuracy": 0.9714400470256805, "step": 58960 }, { "epoch": 5.898, "grad_norm": 0.3762631118297577, "learning_rate": 4.1024102410241026e-05, "loss": 0.2406, "mean_token_accuracy": 0.9818399548530579, "step": 58980 }, { "epoch": 5.9, "grad_norm": 0.27014800906181335, "learning_rate": 4.100410041004101e-05, "loss": 0.1165, "mean_token_accuracy": 0.9824950903654098, "step": 59000 }, { "epoch": 5.902, "grad_norm": 0.4123614728450775, "learning_rate": 4.0984098409840985e-05, "loss": 0.2343, "mean_token_accuracy": 0.9773309111595154, "step": 59020 }, { "epoch": 5.904, "grad_norm": 0.21313205361366272, "learning_rate": 4.096409640964097e-05, "loss": 0.2694, "mean_token_accuracy": 0.9788897126913071, "step": 59040 }, { "epoch": 5.906, "grad_norm": 2.156609296798706, "learning_rate": 4.0944094409440944e-05, "loss": 0.152, "mean_token_accuracy": 0.9814854711294174, "step": 59060 }, { "epoch": 5.908, "grad_norm": 0.4635979235172272, "learning_rate": 4.092409240924093e-05, "loss": 0.1709, "mean_token_accuracy": 0.9745910614728928, "step": 59080 }, { "epoch": 5.91, "grad_norm": 0.40402334928512573, "learning_rate": 4.090409040904091e-05, "loss": 0.1957, "mean_token_accuracy": 0.976408663392067, "step": 59100 }, { "epoch": 5.912, "grad_norm": 0.26175162196159363, "learning_rate": 4.088408840884089e-05, "loss": 0.1745, "mean_token_accuracy": 0.9759087026119232, "step": 59120 }, { "epoch": 5.914, "grad_norm": 0.36112162470817566, "learning_rate": 4.086408640864087e-05, "loss": 0.2543, "mean_token_accuracy": 0.9774573683738709, "step": 59140 }, { "epoch": 5.916, "grad_norm": 5.1886138916015625, "learning_rate": 4.0844084408440846e-05, "loss": 0.1802, "mean_token_accuracy": 0.9762089848518372, "step": 59160 }, { "epoch": 5.918, "grad_norm": 1.1060951948165894, "learning_rate": 4.082408240824083e-05, "loss": 0.1559, "mean_token_accuracy": 0.9768004775047302, "step": 59180 }, { "epoch": 5.92, "grad_norm": 0.3918673098087311, "learning_rate": 4.0804080408040805e-05, "loss": 0.3855, "mean_token_accuracy": 0.96982461810112, "step": 59200 }, { "epoch": 5.922, "grad_norm": 0.3664752244949341, "learning_rate": 4.078407840784079e-05, "loss": 0.1663, "mean_token_accuracy": 0.98042533993721, "step": 59220 }, { "epoch": 5.924, "grad_norm": 0.37030676007270813, "learning_rate": 4.0764076407640765e-05, "loss": 0.2689, "mean_token_accuracy": 0.9783704340457916, "step": 59240 }, { "epoch": 5.926, "grad_norm": 0.3088802993297577, "learning_rate": 4.074407440744075e-05, "loss": 0.2165, "mean_token_accuracy": 0.980127477645874, "step": 59260 }, { "epoch": 5.928, "grad_norm": 0.3744543790817261, "learning_rate": 4.0724072407240724e-05, "loss": 0.2368, "mean_token_accuracy": 0.9805212497711182, "step": 59280 }, { "epoch": 5.93, "grad_norm": 0.6494140028953552, "learning_rate": 4.070407040704071e-05, "loss": 0.3896, "mean_token_accuracy": 0.9701353371143341, "step": 59300 }, { "epoch": 5.932, "grad_norm": 0.4938710331916809, "learning_rate": 4.068406840684068e-05, "loss": 0.2727, "mean_token_accuracy": 0.9768064528703689, "step": 59320 }, { "epoch": 5.934, "grad_norm": 0.7565509676933289, "learning_rate": 4.0664066406640666e-05, "loss": 0.1822, "mean_token_accuracy": 0.9811148077249527, "step": 59340 }, { "epoch": 5.936, "grad_norm": 0.39868396520614624, "learning_rate": 4.064406440644065e-05, "loss": 0.2456, "mean_token_accuracy": 0.9770483762025833, "step": 59360 }, { "epoch": 5.938, "grad_norm": 0.2918083965778351, "learning_rate": 4.0624062406240626e-05, "loss": 0.2075, "mean_token_accuracy": 0.9779973149299621, "step": 59380 }, { "epoch": 5.9399999999999995, "grad_norm": 0.2923343777656555, "learning_rate": 4.060406040604061e-05, "loss": 0.2622, "mean_token_accuracy": 0.9722013264894486, "step": 59400 }, { "epoch": 5.942, "grad_norm": 7.115927219390869, "learning_rate": 4.0584058405840585e-05, "loss": 0.2676, "mean_token_accuracy": 0.9727411389350891, "step": 59420 }, { "epoch": 5.944, "grad_norm": 0.27069392800331116, "learning_rate": 4.056405640564057e-05, "loss": 0.1824, "mean_token_accuracy": 0.9790448546409607, "step": 59440 }, { "epoch": 5.946, "grad_norm": 1.6045550107955933, "learning_rate": 4.0544054405440544e-05, "loss": 0.2015, "mean_token_accuracy": 0.982080751657486, "step": 59460 }, { "epoch": 5.948, "grad_norm": 7.861315727233887, "learning_rate": 4.052405240524053e-05, "loss": 0.3328, "mean_token_accuracy": 0.9726365268230438, "step": 59480 }, { "epoch": 5.95, "grad_norm": 0.3854958415031433, "learning_rate": 4.0504050405040504e-05, "loss": 0.2196, "mean_token_accuracy": 0.9796378433704376, "step": 59500 }, { "epoch": 5.952, "grad_norm": 0.30037084221839905, "learning_rate": 4.048404840484049e-05, "loss": 0.241, "mean_token_accuracy": 0.979366448521614, "step": 59520 }, { "epoch": 5.954, "grad_norm": 0.8395131230354309, "learning_rate": 4.046404640464046e-05, "loss": 0.1827, "mean_token_accuracy": 0.9798900038003922, "step": 59540 }, { "epoch": 5.9559999999999995, "grad_norm": 0.6276876926422119, "learning_rate": 4.0444044404440446e-05, "loss": 0.2185, "mean_token_accuracy": 0.9805685460567475, "step": 59560 }, { "epoch": 5.958, "grad_norm": 0.3671773374080658, "learning_rate": 4.042404240424042e-05, "loss": 0.2689, "mean_token_accuracy": 0.9810848563909531, "step": 59580 }, { "epoch": 5.96, "grad_norm": 0.2524561882019043, "learning_rate": 4.0404040404040405e-05, "loss": 0.2905, "mean_token_accuracy": 0.9788853466510773, "step": 59600 }, { "epoch": 5.962, "grad_norm": 0.9455273747444153, "learning_rate": 4.038403840384039e-05, "loss": 0.2989, "mean_token_accuracy": 0.9787728071212769, "step": 59620 }, { "epoch": 5.964, "grad_norm": 1.4238640069961548, "learning_rate": 4.0364036403640365e-05, "loss": 0.2386, "mean_token_accuracy": 0.9765239030122757, "step": 59640 }, { "epoch": 5.966, "grad_norm": 0.23513536155223846, "learning_rate": 4.034403440344035e-05, "loss": 0.1931, "mean_token_accuracy": 0.9796916007995605, "step": 59660 }, { "epoch": 5.968, "grad_norm": 0.5324181914329529, "learning_rate": 4.0324032403240324e-05, "loss": 0.245, "mean_token_accuracy": 0.9822176694869995, "step": 59680 }, { "epoch": 5.97, "grad_norm": 0.35216987133026123, "learning_rate": 4.030403040304031e-05, "loss": 0.1825, "mean_token_accuracy": 0.9798574864864349, "step": 59700 }, { "epoch": 5.9719999999999995, "grad_norm": 0.3704109191894531, "learning_rate": 4.028402840284028e-05, "loss": 0.1787, "mean_token_accuracy": 0.9825795978307724, "step": 59720 }, { "epoch": 5.974, "grad_norm": 0.3604222536087036, "learning_rate": 4.0264026402640266e-05, "loss": 0.2207, "mean_token_accuracy": 0.9774585425853729, "step": 59740 }, { "epoch": 5.976, "grad_norm": 0.44067153334617615, "learning_rate": 4.024402440244024e-05, "loss": 0.1691, "mean_token_accuracy": 0.9775527864694595, "step": 59760 }, { "epoch": 5.978, "grad_norm": 1.4003362655639648, "learning_rate": 4.0224022402240226e-05, "loss": 0.1902, "mean_token_accuracy": 0.972144877910614, "step": 59780 }, { "epoch": 5.98, "grad_norm": 0.5138097405433655, "learning_rate": 4.02040204020402e-05, "loss": 0.2231, "mean_token_accuracy": 0.9733028322458267, "step": 59800 }, { "epoch": 5.982, "grad_norm": 0.3646353781223297, "learning_rate": 4.0184018401840185e-05, "loss": 0.2016, "mean_token_accuracy": 0.9832718968391418, "step": 59820 }, { "epoch": 5.984, "grad_norm": 0.3697987198829651, "learning_rate": 4.016401640164016e-05, "loss": 0.1687, "mean_token_accuracy": 0.9782730787992477, "step": 59840 }, { "epoch": 5.986, "grad_norm": 0.3173183798789978, "learning_rate": 4.0144014401440144e-05, "loss": 0.2016, "mean_token_accuracy": 0.9822468876838684, "step": 59860 }, { "epoch": 5.9879999999999995, "grad_norm": 0.2644508183002472, "learning_rate": 4.012401240124013e-05, "loss": 0.3026, "mean_token_accuracy": 0.9744181096553802, "step": 59880 }, { "epoch": 5.99, "grad_norm": 9.607027053833008, "learning_rate": 4.0104010401040104e-05, "loss": 0.2278, "mean_token_accuracy": 0.9781644731760025, "step": 59900 }, { "epoch": 5.992, "grad_norm": 0.3456099033355713, "learning_rate": 4.008400840084009e-05, "loss": 0.2701, "mean_token_accuracy": 0.9768544584512711, "step": 59920 }, { "epoch": 5.994, "grad_norm": 0.36291465163230896, "learning_rate": 4.006400640064006e-05, "loss": 0.2596, "mean_token_accuracy": 0.970385617017746, "step": 59940 }, { "epoch": 5.996, "grad_norm": 0.5119251012802124, "learning_rate": 4.0044004400440046e-05, "loss": 0.1581, "mean_token_accuracy": 0.9764900892972946, "step": 59960 }, { "epoch": 5.998, "grad_norm": 0.26823684573173523, "learning_rate": 4.002400240024002e-05, "loss": 0.1211, "mean_token_accuracy": 0.9756813108921051, "step": 59980 }, { "epoch": 6.0, "grad_norm": 0.29217252135276794, "learning_rate": 4.0004000400040005e-05, "loss": 0.1555, "mean_token_accuracy": 0.9824453383684159, "step": 60000 }, { "epoch": 6.002, "grad_norm": 2.8349897861480713, "learning_rate": 3.998399839983998e-05, "loss": 0.2928, "mean_token_accuracy": 0.9841232001781464, "step": 60020 }, { "epoch": 6.004, "grad_norm": 3.24314284324646, "learning_rate": 3.9963996399639965e-05, "loss": 0.2972, "mean_token_accuracy": 0.9846493452787399, "step": 60040 }, { "epoch": 6.006, "grad_norm": 0.5767121315002441, "learning_rate": 3.994399439943994e-05, "loss": 0.1811, "mean_token_accuracy": 0.9858658462762833, "step": 60060 }, { "epoch": 6.008, "grad_norm": 0.5199044942855835, "learning_rate": 3.9923992399239924e-05, "loss": 0.1543, "mean_token_accuracy": 0.9882243782281875, "step": 60080 }, { "epoch": 6.01, "grad_norm": 0.27182093262672424, "learning_rate": 3.99039903990399e-05, "loss": 0.3772, "mean_token_accuracy": 0.9840344190597534, "step": 60100 }, { "epoch": 6.012, "grad_norm": 0.5764203667640686, "learning_rate": 3.988398839883988e-05, "loss": 0.1517, "mean_token_accuracy": 0.9881797164678574, "step": 60120 }, { "epoch": 6.014, "grad_norm": 2.4272429943084717, "learning_rate": 3.9863986398639866e-05, "loss": 0.1956, "mean_token_accuracy": 0.985997885465622, "step": 60140 }, { "epoch": 6.016, "grad_norm": 5.498265743255615, "learning_rate": 3.984398439843985e-05, "loss": 0.2333, "mean_token_accuracy": 0.9828183799982071, "step": 60160 }, { "epoch": 6.018, "grad_norm": 0.5033850073814392, "learning_rate": 3.9823982398239826e-05, "loss": 0.2182, "mean_token_accuracy": 0.9860900849103927, "step": 60180 }, { "epoch": 6.02, "grad_norm": 0.28949472308158875, "learning_rate": 3.980398039803981e-05, "loss": 0.1716, "mean_token_accuracy": 0.9884227871894836, "step": 60200 }, { "epoch": 6.022, "grad_norm": 0.3285590708255768, "learning_rate": 3.978397839783979e-05, "loss": 0.2658, "mean_token_accuracy": 0.98587207198143, "step": 60220 }, { "epoch": 6.024, "grad_norm": 0.3444896638393402, "learning_rate": 3.976397639763977e-05, "loss": 0.2023, "mean_token_accuracy": 0.9866574734449387, "step": 60240 }, { "epoch": 6.026, "grad_norm": 0.644788920879364, "learning_rate": 3.974397439743975e-05, "loss": 0.1729, "mean_token_accuracy": 0.9869831681251526, "step": 60260 }, { "epoch": 6.028, "grad_norm": 1.2193036079406738, "learning_rate": 3.972397239723973e-05, "loss": 0.4171, "mean_token_accuracy": 0.9758450448513031, "step": 60280 }, { "epoch": 6.03, "grad_norm": 0.31555309891700745, "learning_rate": 3.970397039703971e-05, "loss": 0.1554, "mean_token_accuracy": 0.9882366210222244, "step": 60300 }, { "epoch": 6.032, "grad_norm": 0.3988054692745209, "learning_rate": 3.968396839683969e-05, "loss": 0.126, "mean_token_accuracy": 0.9895609080791473, "step": 60320 }, { "epoch": 6.034, "grad_norm": 0.29976406693458557, "learning_rate": 3.966396639663967e-05, "loss": 0.1383, "mean_token_accuracy": 0.9871496587991715, "step": 60340 }, { "epoch": 6.036, "grad_norm": 0.3386934697628021, "learning_rate": 3.9643964396439646e-05, "loss": 0.1038, "mean_token_accuracy": 0.9867965698242187, "step": 60360 }, { "epoch": 6.038, "grad_norm": 0.3253265917301178, "learning_rate": 3.962396239623963e-05, "loss": 0.1788, "mean_token_accuracy": 0.9862632781267167, "step": 60380 }, { "epoch": 6.04, "grad_norm": 0.6599153876304626, "learning_rate": 3.9603960396039605e-05, "loss": 0.2289, "mean_token_accuracy": 0.9896980375051498, "step": 60400 }, { "epoch": 6.042, "grad_norm": 0.3152582347393036, "learning_rate": 3.958395839583959e-05, "loss": 0.1799, "mean_token_accuracy": 0.9866113483905792, "step": 60420 }, { "epoch": 6.044, "grad_norm": 0.2019403874874115, "learning_rate": 3.9563956395639565e-05, "loss": 0.2153, "mean_token_accuracy": 0.9856364607810975, "step": 60440 }, { "epoch": 6.046, "grad_norm": 0.8774945735931396, "learning_rate": 3.954395439543955e-05, "loss": 0.1612, "mean_token_accuracy": 0.989298889040947, "step": 60460 }, { "epoch": 6.048, "grad_norm": 0.27457866072654724, "learning_rate": 3.952395239523953e-05, "loss": 0.2001, "mean_token_accuracy": 0.9873542368412018, "step": 60480 }, { "epoch": 6.05, "grad_norm": 0.3444322943687439, "learning_rate": 3.950395039503951e-05, "loss": 0.2183, "mean_token_accuracy": 0.9899153470993042, "step": 60500 }, { "epoch": 6.052, "grad_norm": 4.590970039367676, "learning_rate": 3.948394839483949e-05, "loss": 0.3207, "mean_token_accuracy": 0.9866376489400863, "step": 60520 }, { "epoch": 6.054, "grad_norm": 0.2799339294433594, "learning_rate": 3.9463946394639466e-05, "loss": 0.2173, "mean_token_accuracy": 0.9845909863710404, "step": 60540 }, { "epoch": 6.056, "grad_norm": 0.26017916202545166, "learning_rate": 3.944394439443945e-05, "loss": 0.2975, "mean_token_accuracy": 0.9908647418022156, "step": 60560 }, { "epoch": 6.058, "grad_norm": 0.33854347467422485, "learning_rate": 3.9423942394239426e-05, "loss": 0.2068, "mean_token_accuracy": 0.9864837795495986, "step": 60580 }, { "epoch": 6.06, "grad_norm": 0.2496056854724884, "learning_rate": 3.940394039403941e-05, "loss": 0.2019, "mean_token_accuracy": 0.9881948202848434, "step": 60600 }, { "epoch": 6.062, "grad_norm": 0.2168179154396057, "learning_rate": 3.9383938393839385e-05, "loss": 0.2536, "mean_token_accuracy": 0.9858988642692565, "step": 60620 }, { "epoch": 6.064, "grad_norm": 0.30439600348472595, "learning_rate": 3.936393639363937e-05, "loss": 0.1967, "mean_token_accuracy": 0.9880341172218323, "step": 60640 }, { "epoch": 6.066, "grad_norm": 0.330769419670105, "learning_rate": 3.9343934393439344e-05, "loss": 0.1957, "mean_token_accuracy": 0.9810899317264556, "step": 60660 }, { "epoch": 6.068, "grad_norm": 0.23622800409793854, "learning_rate": 3.932393239323933e-05, "loss": 0.1592, "mean_token_accuracy": 0.990247493982315, "step": 60680 }, { "epoch": 6.07, "grad_norm": 0.3822772204875946, "learning_rate": 3.9303930393039304e-05, "loss": 0.154, "mean_token_accuracy": 0.9852812945842743, "step": 60700 }, { "epoch": 6.072, "grad_norm": 24.50721549987793, "learning_rate": 3.9283928392839287e-05, "loss": 0.2908, "mean_token_accuracy": 0.9810187071561813, "step": 60720 }, { "epoch": 6.074, "grad_norm": 0.2735440731048584, "learning_rate": 3.926392639263927e-05, "loss": 0.2061, "mean_token_accuracy": 0.9880321025848389, "step": 60740 }, { "epoch": 6.076, "grad_norm": 0.7103083729743958, "learning_rate": 3.9243924392439246e-05, "loss": 0.1605, "mean_token_accuracy": 0.9873922854661942, "step": 60760 }, { "epoch": 6.078, "grad_norm": 0.562482476234436, "learning_rate": 3.922392239223923e-05, "loss": 0.1795, "mean_token_accuracy": 0.9881090104579926, "step": 60780 }, { "epoch": 6.08, "grad_norm": 0.2599644362926483, "learning_rate": 3.9203920392039205e-05, "loss": 0.1828, "mean_token_accuracy": 0.9886240720748901, "step": 60800 }, { "epoch": 6.082, "grad_norm": 0.18064938485622406, "learning_rate": 3.918391839183919e-05, "loss": 0.15, "mean_token_accuracy": 0.9861800819635391, "step": 60820 }, { "epoch": 6.084, "grad_norm": 0.2978329658508301, "learning_rate": 3.9163916391639165e-05, "loss": 0.1302, "mean_token_accuracy": 0.9845503091812133, "step": 60840 }, { "epoch": 6.086, "grad_norm": 0.3058304488658905, "learning_rate": 3.914391439143915e-05, "loss": 0.1831, "mean_token_accuracy": 0.9863019496202469, "step": 60860 }, { "epoch": 6.088, "grad_norm": 2.2455854415893555, "learning_rate": 3.9123912391239124e-05, "loss": 0.2329, "mean_token_accuracy": 0.9853265762329102, "step": 60880 }, { "epoch": 6.09, "grad_norm": 0.455901175737381, "learning_rate": 3.910391039103911e-05, "loss": 0.2537, "mean_token_accuracy": 0.9857560813426971, "step": 60900 }, { "epoch": 6.092, "grad_norm": 0.3003759980201721, "learning_rate": 3.908390839083908e-05, "loss": 0.1721, "mean_token_accuracy": 0.9799523919820785, "step": 60920 }, { "epoch": 6.094, "grad_norm": 0.7577944397926331, "learning_rate": 3.9063906390639066e-05, "loss": 0.2633, "mean_token_accuracy": 0.9799831092357636, "step": 60940 }, { "epoch": 6.096, "grad_norm": 1.0013971328735352, "learning_rate": 3.904390439043904e-05, "loss": 0.1455, "mean_token_accuracy": 0.9863037168979645, "step": 60960 }, { "epoch": 6.098, "grad_norm": 0.2956526279449463, "learning_rate": 3.9023902390239026e-05, "loss": 0.1516, "mean_token_accuracy": 0.9879920810461045, "step": 60980 }, { "epoch": 6.1, "grad_norm": 0.36727219820022583, "learning_rate": 3.900390039003901e-05, "loss": 0.2033, "mean_token_accuracy": 0.9838875025510788, "step": 61000 }, { "epoch": 6.102, "grad_norm": 0.2867144048213959, "learning_rate": 3.8983898389838985e-05, "loss": 0.2004, "mean_token_accuracy": 0.9862304300069809, "step": 61020 }, { "epoch": 6.104, "grad_norm": 0.2776888608932495, "learning_rate": 3.896389638963897e-05, "loss": 0.2254, "mean_token_accuracy": 0.9874357253313064, "step": 61040 }, { "epoch": 6.106, "grad_norm": 0.2803952693939209, "learning_rate": 3.8943894389438944e-05, "loss": 0.2033, "mean_token_accuracy": 0.9842085719108582, "step": 61060 }, { "epoch": 6.108, "grad_norm": 0.6820498704910278, "learning_rate": 3.892389238923893e-05, "loss": 0.2255, "mean_token_accuracy": 0.986513900756836, "step": 61080 }, { "epoch": 6.11, "grad_norm": 0.2935560643672943, "learning_rate": 3.8903890389038903e-05, "loss": 0.1665, "mean_token_accuracy": 0.9885123372077942, "step": 61100 }, { "epoch": 6.112, "grad_norm": 0.20035983622074127, "learning_rate": 3.8883888388838887e-05, "loss": 0.2204, "mean_token_accuracy": 0.9859970062971115, "step": 61120 }, { "epoch": 6.114, "grad_norm": 0.27167174220085144, "learning_rate": 3.886388638863886e-05, "loss": 0.1864, "mean_token_accuracy": 0.9794614136219024, "step": 61140 }, { "epoch": 6.116, "grad_norm": 0.5772905349731445, "learning_rate": 3.8843884388438846e-05, "loss": 0.2172, "mean_token_accuracy": 0.9877961456775666, "step": 61160 }, { "epoch": 6.118, "grad_norm": 0.9239216446876526, "learning_rate": 3.882388238823882e-05, "loss": 0.1943, "mean_token_accuracy": 0.9878912508487702, "step": 61180 }, { "epoch": 6.12, "grad_norm": 0.2572813630104065, "learning_rate": 3.8803880388038805e-05, "loss": 0.2359, "mean_token_accuracy": 0.9868422329425812, "step": 61200 }, { "epoch": 6.122, "grad_norm": 0.20358359813690186, "learning_rate": 3.878387838783878e-05, "loss": 0.1593, "mean_token_accuracy": 0.9874232709407806, "step": 61220 }, { "epoch": 6.124, "grad_norm": 0.8377254605293274, "learning_rate": 3.8763876387638764e-05, "loss": 0.2869, "mean_token_accuracy": 0.9768101662397385, "step": 61240 }, { "epoch": 6.126, "grad_norm": 0.3732958734035492, "learning_rate": 3.874387438743875e-05, "loss": 0.2071, "mean_token_accuracy": 0.9871192395687103, "step": 61260 }, { "epoch": 6.128, "grad_norm": 0.41414928436279297, "learning_rate": 3.8723872387238724e-05, "loss": 0.2288, "mean_token_accuracy": 0.9862336754798889, "step": 61280 }, { "epoch": 6.13, "grad_norm": 0.2532384395599365, "learning_rate": 3.870387038703871e-05, "loss": 0.1625, "mean_token_accuracy": 0.9879456639289856, "step": 61300 }, { "epoch": 6.132, "grad_norm": 0.3606497645378113, "learning_rate": 3.868386838683868e-05, "loss": 0.27, "mean_token_accuracy": 0.9847257643938064, "step": 61320 }, { "epoch": 6.134, "grad_norm": 0.24546638131141663, "learning_rate": 3.8663866386638666e-05, "loss": 0.2854, "mean_token_accuracy": 0.9873810708522797, "step": 61340 }, { "epoch": 6.136, "grad_norm": 0.29023513197898865, "learning_rate": 3.864386438643864e-05, "loss": 0.1761, "mean_token_accuracy": 0.9867204189300537, "step": 61360 }, { "epoch": 6.138, "grad_norm": 0.33152514696121216, "learning_rate": 3.8623862386238625e-05, "loss": 0.2913, "mean_token_accuracy": 0.9867033332586288, "step": 61380 }, { "epoch": 6.14, "grad_norm": 0.2698994278907776, "learning_rate": 3.86038603860386e-05, "loss": 0.175, "mean_token_accuracy": 0.9861187368631363, "step": 61400 }, { "epoch": 6.142, "grad_norm": 0.5023800134658813, "learning_rate": 3.8583858385838585e-05, "loss": 0.1926, "mean_token_accuracy": 0.9846811085939408, "step": 61420 }, { "epoch": 6.144, "grad_norm": 0.2298969179391861, "learning_rate": 3.856385638563856e-05, "loss": 0.1242, "mean_token_accuracy": 0.9888104528188706, "step": 61440 }, { "epoch": 6.146, "grad_norm": 0.24800284206867218, "learning_rate": 3.8543854385438544e-05, "loss": 0.2186, "mean_token_accuracy": 0.9790783286094665, "step": 61460 }, { "epoch": 6.148, "grad_norm": 0.3960084021091461, "learning_rate": 3.852385238523852e-05, "loss": 0.1954, "mean_token_accuracy": 0.9883563071489334, "step": 61480 }, { "epoch": 6.15, "grad_norm": 0.2471463680267334, "learning_rate": 3.8503850385038503e-05, "loss": 0.1845, "mean_token_accuracy": 0.9869692653417588, "step": 61500 }, { "epoch": 6.152, "grad_norm": 0.7534583806991577, "learning_rate": 3.8483848384838486e-05, "loss": 0.1621, "mean_token_accuracy": 0.9827293068170547, "step": 61520 }, { "epoch": 6.154, "grad_norm": 0.3615434169769287, "learning_rate": 3.846384638463846e-05, "loss": 0.3554, "mean_token_accuracy": 0.987092599272728, "step": 61540 }, { "epoch": 6.156, "grad_norm": 0.19210593402385712, "learning_rate": 3.8443844384438446e-05, "loss": 0.1492, "mean_token_accuracy": 0.9867588102817535, "step": 61560 }, { "epoch": 6.158, "grad_norm": 3.938720703125, "learning_rate": 3.842384238423842e-05, "loss": 0.2606, "mean_token_accuracy": 0.9901694744825363, "step": 61580 }, { "epoch": 6.16, "grad_norm": 0.29663556814193726, "learning_rate": 3.8403840384038405e-05, "loss": 0.245, "mean_token_accuracy": 0.9876997143030166, "step": 61600 }, { "epoch": 6.162, "grad_norm": 0.34798192977905273, "learning_rate": 3.838383838383838e-05, "loss": 0.1678, "mean_token_accuracy": 0.9894169986248016, "step": 61620 }, { "epoch": 6.164, "grad_norm": 25.278345108032227, "learning_rate": 3.8363836383638364e-05, "loss": 0.2931, "mean_token_accuracy": 0.984656909108162, "step": 61640 }, { "epoch": 6.166, "grad_norm": 0.26606887578964233, "learning_rate": 3.834383438343835e-05, "loss": 0.1884, "mean_token_accuracy": 0.9852963238954544, "step": 61660 }, { "epoch": 6.168, "grad_norm": 0.5439487099647522, "learning_rate": 3.832383238323833e-05, "loss": 0.1718, "mean_token_accuracy": 0.9889381378889084, "step": 61680 }, { "epoch": 6.17, "grad_norm": 0.18621693551540375, "learning_rate": 3.830383038303831e-05, "loss": 0.1293, "mean_token_accuracy": 0.9873347401618957, "step": 61700 }, { "epoch": 6.172, "grad_norm": 2.8010315895080566, "learning_rate": 3.828382838283829e-05, "loss": 0.1857, "mean_token_accuracy": 0.9810226529836654, "step": 61720 }, { "epoch": 6.174, "grad_norm": 0.2100827991962433, "learning_rate": 3.8263826382638266e-05, "loss": 0.1225, "mean_token_accuracy": 0.9880403041839599, "step": 61740 }, { "epoch": 6.176, "grad_norm": 0.3092288672924042, "learning_rate": 3.824382438243825e-05, "loss": 0.2914, "mean_token_accuracy": 0.9840195387601852, "step": 61760 }, { "epoch": 6.178, "grad_norm": 0.2872907519340515, "learning_rate": 3.8223822382238225e-05, "loss": 0.1271, "mean_token_accuracy": 0.9867116242647171, "step": 61780 }, { "epoch": 6.18, "grad_norm": 0.21730320155620575, "learning_rate": 3.820382038203821e-05, "loss": 0.1099, "mean_token_accuracy": 0.9867184102535248, "step": 61800 }, { "epoch": 6.182, "grad_norm": 0.20728076994419098, "learning_rate": 3.8183818381838185e-05, "loss": 0.1625, "mean_token_accuracy": 0.9861974626779556, "step": 61820 }, { "epoch": 6.184, "grad_norm": 0.2671181857585907, "learning_rate": 3.816381638163817e-05, "loss": 0.1737, "mean_token_accuracy": 0.989355406165123, "step": 61840 }, { "epoch": 6.186, "grad_norm": 0.23032934963703156, "learning_rate": 3.814381438143815e-05, "loss": 0.2384, "mean_token_accuracy": 0.9851123452186584, "step": 61860 }, { "epoch": 6.188, "grad_norm": 0.3141525685787201, "learning_rate": 3.812381238123813e-05, "loss": 0.1193, "mean_token_accuracy": 0.9862658679485321, "step": 61880 }, { "epoch": 6.19, "grad_norm": 0.2801823914051056, "learning_rate": 3.810381038103811e-05, "loss": 0.1425, "mean_token_accuracy": 0.9852701425552368, "step": 61900 }, { "epoch": 6.192, "grad_norm": 0.30803078413009644, "learning_rate": 3.8083808380838086e-05, "loss": 0.1533, "mean_token_accuracy": 0.9888172149658203, "step": 61920 }, { "epoch": 6.194, "grad_norm": 0.2645457684993744, "learning_rate": 3.806380638063807e-05, "loss": 0.1614, "mean_token_accuracy": 0.9817666202783585, "step": 61940 }, { "epoch": 6.196, "grad_norm": 0.35079294443130493, "learning_rate": 3.8043804380438046e-05, "loss": 0.2451, "mean_token_accuracy": 0.9886845976114274, "step": 61960 }, { "epoch": 6.198, "grad_norm": 0.3997754156589508, "learning_rate": 3.802380238023803e-05, "loss": 0.2215, "mean_token_accuracy": 0.9906711518764496, "step": 61980 }, { "epoch": 6.2, "grad_norm": 0.39263400435447693, "learning_rate": 3.8003800380038005e-05, "loss": 0.1538, "mean_token_accuracy": 0.9861565798521041, "step": 62000 }, { "epoch": 6.202, "grad_norm": 0.8925210237503052, "learning_rate": 3.798379837983799e-05, "loss": 0.1165, "mean_token_accuracy": 0.9857583403587341, "step": 62020 }, { "epoch": 6.204, "grad_norm": 0.24952690303325653, "learning_rate": 3.7963796379637964e-05, "loss": 0.1719, "mean_token_accuracy": 0.9853226006031036, "step": 62040 }, { "epoch": 6.206, "grad_norm": 0.23889106512069702, "learning_rate": 3.794379437943795e-05, "loss": 0.2038, "mean_token_accuracy": 0.984839391708374, "step": 62060 }, { "epoch": 6.208, "grad_norm": 2.4647982120513916, "learning_rate": 3.7923792379237924e-05, "loss": 0.1653, "mean_token_accuracy": 0.9861971646547317, "step": 62080 }, { "epoch": 6.21, "grad_norm": 0.2447003275156021, "learning_rate": 3.790379037903791e-05, "loss": 0.2208, "mean_token_accuracy": 0.9824275523424149, "step": 62100 }, { "epoch": 6.212, "grad_norm": 0.317548006772995, "learning_rate": 3.788378837883789e-05, "loss": 0.2283, "mean_token_accuracy": 0.9879062086343765, "step": 62120 }, { "epoch": 6.214, "grad_norm": 0.3601900339126587, "learning_rate": 3.7863786378637866e-05, "loss": 0.1688, "mean_token_accuracy": 0.984816524386406, "step": 62140 }, { "epoch": 6.216, "grad_norm": 0.289646178483963, "learning_rate": 3.784378437843785e-05, "loss": 0.1276, "mean_token_accuracy": 0.9884780675172806, "step": 62160 }, { "epoch": 6.218, "grad_norm": 0.6798099875450134, "learning_rate": 3.7823782378237825e-05, "loss": 0.1355, "mean_token_accuracy": 0.9874667346477508, "step": 62180 }, { "epoch": 6.22, "grad_norm": 0.4942800998687744, "learning_rate": 3.780378037803781e-05, "loss": 0.1871, "mean_token_accuracy": 0.9879170477390289, "step": 62200 }, { "epoch": 6.222, "grad_norm": 0.28834426403045654, "learning_rate": 3.7783778377837785e-05, "loss": 0.2299, "mean_token_accuracy": 0.9828972369432449, "step": 62220 }, { "epoch": 6.224, "grad_norm": 0.25042468309402466, "learning_rate": 3.776377637763777e-05, "loss": 0.1626, "mean_token_accuracy": 0.986855012178421, "step": 62240 }, { "epoch": 6.226, "grad_norm": 0.27341896295547485, "learning_rate": 3.7743774377437744e-05, "loss": 0.3065, "mean_token_accuracy": 0.9825337499380111, "step": 62260 }, { "epoch": 6.228, "grad_norm": 0.3044675290584564, "learning_rate": 3.772377237723773e-05, "loss": 0.1302, "mean_token_accuracy": 0.9892271220684051, "step": 62280 }, { "epoch": 6.23, "grad_norm": 1.5155028104782104, "learning_rate": 3.77037703770377e-05, "loss": 0.239, "mean_token_accuracy": 0.9836427867412567, "step": 62300 }, { "epoch": 6.232, "grad_norm": 0.29185765981674194, "learning_rate": 3.7683768376837686e-05, "loss": 0.2139, "mean_token_accuracy": 0.9908135831356049, "step": 62320 }, { "epoch": 6.234, "grad_norm": 0.3019137382507324, "learning_rate": 3.766376637663766e-05, "loss": 0.1632, "mean_token_accuracy": 0.982847535610199, "step": 62340 }, { "epoch": 6.236, "grad_norm": 0.2872399389743805, "learning_rate": 3.7643764376437646e-05, "loss": 0.2785, "mean_token_accuracy": 0.9859834402799607, "step": 62360 }, { "epoch": 6.2379999999999995, "grad_norm": 0.4506475031375885, "learning_rate": 3.762376237623763e-05, "loss": 0.1256, "mean_token_accuracy": 0.9850576639175415, "step": 62380 }, { "epoch": 6.24, "grad_norm": 0.253010094165802, "learning_rate": 3.7603760376037605e-05, "loss": 0.1733, "mean_token_accuracy": 0.9858906477689743, "step": 62400 }, { "epoch": 6.242, "grad_norm": 0.288056343793869, "learning_rate": 3.758375837583759e-05, "loss": 0.1887, "mean_token_accuracy": 0.9877594441175461, "step": 62420 }, { "epoch": 6.244, "grad_norm": 0.2567198574542999, "learning_rate": 3.7563756375637564e-05, "loss": 0.1459, "mean_token_accuracy": 0.9851928055286407, "step": 62440 }, { "epoch": 6.246, "grad_norm": 0.28664809465408325, "learning_rate": 3.754375437543755e-05, "loss": 0.1255, "mean_token_accuracy": 0.9868131309747696, "step": 62460 }, { "epoch": 6.248, "grad_norm": 0.3060973286628723, "learning_rate": 3.7523752375237524e-05, "loss": 0.155, "mean_token_accuracy": 0.9860345602035523, "step": 62480 }, { "epoch": 6.25, "grad_norm": 0.2682327330112457, "learning_rate": 3.750375037503751e-05, "loss": 0.1443, "mean_token_accuracy": 0.989409762620926, "step": 62500 }, { "epoch": 6.252, "grad_norm": 0.2668534517288208, "learning_rate": 3.748374837483748e-05, "loss": 0.3494, "mean_token_accuracy": 0.9863003462553024, "step": 62520 }, { "epoch": 6.254, "grad_norm": 0.23468269407749176, "learning_rate": 3.7463746374637466e-05, "loss": 0.1357, "mean_token_accuracy": 0.9870061248540878, "step": 62540 }, { "epoch": 6.256, "grad_norm": 0.3547341525554657, "learning_rate": 3.744374437443744e-05, "loss": 0.2025, "mean_token_accuracy": 0.9847435086965561, "step": 62560 }, { "epoch": 6.258, "grad_norm": 0.21602943539619446, "learning_rate": 3.7423742374237425e-05, "loss": 0.1551, "mean_token_accuracy": 0.9843036025762558, "step": 62580 }, { "epoch": 6.26, "grad_norm": 0.2832258641719818, "learning_rate": 3.74037403740374e-05, "loss": 0.1499, "mean_token_accuracy": 0.9844323307275772, "step": 62600 }, { "epoch": 6.2620000000000005, "grad_norm": 0.285769522190094, "learning_rate": 3.7383738373837385e-05, "loss": 0.1819, "mean_token_accuracy": 0.9894167482852936, "step": 62620 }, { "epoch": 6.264, "grad_norm": 0.2690717875957489, "learning_rate": 3.736373637363737e-05, "loss": 0.2027, "mean_token_accuracy": 0.9897973626852036, "step": 62640 }, { "epoch": 6.266, "grad_norm": 0.19910606741905212, "learning_rate": 3.7343734373437344e-05, "loss": 0.2945, "mean_token_accuracy": 0.9853726297616958, "step": 62660 }, { "epoch": 6.268, "grad_norm": 1.0638004541397095, "learning_rate": 3.732373237323733e-05, "loss": 0.1437, "mean_token_accuracy": 0.9865536034107208, "step": 62680 }, { "epoch": 6.27, "grad_norm": 0.2996343970298767, "learning_rate": 3.73037303730373e-05, "loss": 0.211, "mean_token_accuracy": 0.9873775452375412, "step": 62700 }, { "epoch": 6.272, "grad_norm": 0.7304286956787109, "learning_rate": 3.7283728372837286e-05, "loss": 0.2143, "mean_token_accuracy": 0.9879233062267303, "step": 62720 }, { "epoch": 6.274, "grad_norm": 0.2869472801685333, "learning_rate": 3.726372637263726e-05, "loss": 0.2713, "mean_token_accuracy": 0.9766190975904465, "step": 62740 }, { "epoch": 6.276, "grad_norm": 0.2889515161514282, "learning_rate": 3.7243724372437246e-05, "loss": 0.2282, "mean_token_accuracy": 0.9872916787862778, "step": 62760 }, { "epoch": 6.2780000000000005, "grad_norm": 0.28663012385368347, "learning_rate": 3.722372237223722e-05, "loss": 0.332, "mean_token_accuracy": 0.9814169108867645, "step": 62780 }, { "epoch": 6.28, "grad_norm": 3.087033987045288, "learning_rate": 3.7203720372037205e-05, "loss": 0.2541, "mean_token_accuracy": 0.9853112459182739, "step": 62800 }, { "epoch": 6.282, "grad_norm": 0.25831443071365356, "learning_rate": 3.718371837183718e-05, "loss": 0.1676, "mean_token_accuracy": 0.9800544053316116, "step": 62820 }, { "epoch": 6.284, "grad_norm": 0.540744423866272, "learning_rate": 3.7163716371637164e-05, "loss": 0.168, "mean_token_accuracy": 0.9843910038471222, "step": 62840 }, { "epoch": 6.286, "grad_norm": 0.2725704610347748, "learning_rate": 3.714371437143714e-05, "loss": 0.2249, "mean_token_accuracy": 0.9890442371368409, "step": 62860 }, { "epoch": 6.288, "grad_norm": 0.2513914108276367, "learning_rate": 3.7123712371237124e-05, "loss": 0.1297, "mean_token_accuracy": 0.9873562276363372, "step": 62880 }, { "epoch": 6.29, "grad_norm": 0.29995450377464294, "learning_rate": 3.710371037103711e-05, "loss": 0.1472, "mean_token_accuracy": 0.9878587305545807, "step": 62900 }, { "epoch": 6.292, "grad_norm": 0.5282148122787476, "learning_rate": 3.708370837083708e-05, "loss": 0.1551, "mean_token_accuracy": 0.9866943627595901, "step": 62920 }, { "epoch": 6.294, "grad_norm": 0.20113453269004822, "learning_rate": 3.7063706370637066e-05, "loss": 0.099, "mean_token_accuracy": 0.9864253044128418, "step": 62940 }, { "epoch": 6.296, "grad_norm": 0.2500074803829193, "learning_rate": 3.704370437043704e-05, "loss": 0.1882, "mean_token_accuracy": 0.9851401716470718, "step": 62960 }, { "epoch": 6.298, "grad_norm": 0.2880468964576721, "learning_rate": 3.7023702370237025e-05, "loss": 0.1844, "mean_token_accuracy": 0.9871421366930008, "step": 62980 }, { "epoch": 6.3, "grad_norm": 0.42882657051086426, "learning_rate": 3.7003700370037e-05, "loss": 0.248, "mean_token_accuracy": 0.9804296493530273, "step": 63000 }, { "epoch": 6.302, "grad_norm": 0.27850204706192017, "learning_rate": 3.6983698369836985e-05, "loss": 0.1291, "mean_token_accuracy": 0.9894699275493621, "step": 63020 }, { "epoch": 6.304, "grad_norm": 0.18717224895954132, "learning_rate": 3.696369636963696e-05, "loss": 0.129, "mean_token_accuracy": 0.9883245587348938, "step": 63040 }, { "epoch": 6.306, "grad_norm": 4.695562362670898, "learning_rate": 3.6943694369436944e-05, "loss": 0.1648, "mean_token_accuracy": 0.9845078587532043, "step": 63060 }, { "epoch": 6.308, "grad_norm": 0.2938148081302643, "learning_rate": 3.692369236923692e-05, "loss": 0.1957, "mean_token_accuracy": 0.985721206665039, "step": 63080 }, { "epoch": 6.31, "grad_norm": 0.34214526414871216, "learning_rate": 3.69036903690369e-05, "loss": 0.2046, "mean_token_accuracy": 0.9840965420007706, "step": 63100 }, { "epoch": 6.312, "grad_norm": 0.35594576597213745, "learning_rate": 3.688368836883688e-05, "loss": 0.1762, "mean_token_accuracy": 0.9893361568450928, "step": 63120 }, { "epoch": 6.314, "grad_norm": 0.3166491389274597, "learning_rate": 3.686368636863686e-05, "loss": 0.1875, "mean_token_accuracy": 0.9848360389471054, "step": 63140 }, { "epoch": 6.316, "grad_norm": 0.3254336416721344, "learning_rate": 3.6843684368436846e-05, "loss": 0.1804, "mean_token_accuracy": 0.989296692609787, "step": 63160 }, { "epoch": 6.318, "grad_norm": 0.5822221040725708, "learning_rate": 3.682368236823683e-05, "loss": 0.2523, "mean_token_accuracy": 0.9815619558095932, "step": 63180 }, { "epoch": 6.32, "grad_norm": 0.21716171503067017, "learning_rate": 3.6803680368036805e-05, "loss": 0.1566, "mean_token_accuracy": 0.9870341271162033, "step": 63200 }, { "epoch": 6.322, "grad_norm": 0.26513251662254333, "learning_rate": 3.678367836783679e-05, "loss": 0.1586, "mean_token_accuracy": 0.9867132186889649, "step": 63220 }, { "epoch": 6.324, "grad_norm": 0.2676273286342621, "learning_rate": 3.676367636763677e-05, "loss": 0.1709, "mean_token_accuracy": 0.9823116719722748, "step": 63240 }, { "epoch": 6.326, "grad_norm": 0.2857803404331207, "learning_rate": 3.674367436743675e-05, "loss": 0.1665, "mean_token_accuracy": 0.9877039551734924, "step": 63260 }, { "epoch": 6.328, "grad_norm": 0.2515602707862854, "learning_rate": 3.672367236723673e-05, "loss": 0.1341, "mean_token_accuracy": 0.9862238258123398, "step": 63280 }, { "epoch": 6.33, "grad_norm": 3.864598512649536, "learning_rate": 3.6703670367036707e-05, "loss": 0.129, "mean_token_accuracy": 0.9866984337568283, "step": 63300 }, { "epoch": 6.332, "grad_norm": 0.37837889790534973, "learning_rate": 3.668366836683669e-05, "loss": 0.1106, "mean_token_accuracy": 0.987974151968956, "step": 63320 }, { "epoch": 6.334, "grad_norm": 0.2297341376543045, "learning_rate": 3.6663666366636666e-05, "loss": 0.1419, "mean_token_accuracy": 0.9854336172342301, "step": 63340 }, { "epoch": 6.336, "grad_norm": 0.23302625119686127, "learning_rate": 3.664366436643665e-05, "loss": 0.1749, "mean_token_accuracy": 0.9868430852890014, "step": 63360 }, { "epoch": 6.338, "grad_norm": 0.31222862005233765, "learning_rate": 3.6623662366236625e-05, "loss": 0.1459, "mean_token_accuracy": 0.9873465150594711, "step": 63380 }, { "epoch": 6.34, "grad_norm": 0.2970985770225525, "learning_rate": 3.660366036603661e-05, "loss": 0.2568, "mean_token_accuracy": 0.9847049981355667, "step": 63400 }, { "epoch": 6.342, "grad_norm": 0.38221853971481323, "learning_rate": 3.6583658365836585e-05, "loss": 0.1186, "mean_token_accuracy": 0.986903578042984, "step": 63420 }, { "epoch": 6.344, "grad_norm": 0.31918612122535706, "learning_rate": 3.656365636563657e-05, "loss": 0.1358, "mean_token_accuracy": 0.986939400434494, "step": 63440 }, { "epoch": 6.346, "grad_norm": 0.32745102047920227, "learning_rate": 3.6543654365436544e-05, "loss": 0.2313, "mean_token_accuracy": 0.9852112799882888, "step": 63460 }, { "epoch": 6.348, "grad_norm": 0.21951395273208618, "learning_rate": 3.652365236523653e-05, "loss": 0.2289, "mean_token_accuracy": 0.9830992132425308, "step": 63480 }, { "epoch": 6.35, "grad_norm": 0.2885410487651825, "learning_rate": 3.650365036503651e-05, "loss": 0.1738, "mean_token_accuracy": 0.9891485720872879, "step": 63500 }, { "epoch": 6.352, "grad_norm": 0.3092964291572571, "learning_rate": 3.6483648364836486e-05, "loss": 0.199, "mean_token_accuracy": 0.9829888284206391, "step": 63520 }, { "epoch": 6.354, "grad_norm": 0.27887654304504395, "learning_rate": 3.646364636463647e-05, "loss": 0.1728, "mean_token_accuracy": 0.985864719748497, "step": 63540 }, { "epoch": 6.356, "grad_norm": 0.24987158179283142, "learning_rate": 3.6443644364436446e-05, "loss": 0.1613, "mean_token_accuracy": 0.9841623425483703, "step": 63560 }, { "epoch": 6.358, "grad_norm": 0.25579455494880676, "learning_rate": 3.642364236423643e-05, "loss": 0.1004, "mean_token_accuracy": 0.9881481617689133, "step": 63580 }, { "epoch": 6.36, "grad_norm": 0.2385323941707611, "learning_rate": 3.6403640364036405e-05, "loss": 0.1532, "mean_token_accuracy": 0.9888777941465378, "step": 63600 }, { "epoch": 6.362, "grad_norm": 0.2895430028438568, "learning_rate": 3.638363836383639e-05, "loss": 0.2943, "mean_token_accuracy": 0.9841992139816285, "step": 63620 }, { "epoch": 6.364, "grad_norm": 0.5538919568061829, "learning_rate": 3.6363636363636364e-05, "loss": 0.285, "mean_token_accuracy": 0.9855029702186584, "step": 63640 }, { "epoch": 6.366, "grad_norm": 0.2655647397041321, "learning_rate": 3.634363436343635e-05, "loss": 0.1775, "mean_token_accuracy": 0.987720274925232, "step": 63660 }, { "epoch": 6.368, "grad_norm": 0.27572232484817505, "learning_rate": 3.6323632363236323e-05, "loss": 0.2347, "mean_token_accuracy": 0.9883319109678268, "step": 63680 }, { "epoch": 6.37, "grad_norm": 0.3754271864891052, "learning_rate": 3.6303630363036307e-05, "loss": 0.2256, "mean_token_accuracy": 0.982456773519516, "step": 63700 }, { "epoch": 6.372, "grad_norm": 0.2318781167268753, "learning_rate": 3.628362836283628e-05, "loss": 0.2051, "mean_token_accuracy": 0.9860905855894089, "step": 63720 }, { "epoch": 6.374, "grad_norm": 0.25896117091178894, "learning_rate": 3.6263626362636266e-05, "loss": 0.167, "mean_token_accuracy": 0.9823007017374039, "step": 63740 }, { "epoch": 6.376, "grad_norm": 0.4286016523838043, "learning_rate": 3.624362436243625e-05, "loss": 0.1692, "mean_token_accuracy": 0.9871737778186798, "step": 63760 }, { "epoch": 6.378, "grad_norm": 0.34764543175697327, "learning_rate": 3.6223622362236225e-05, "loss": 0.238, "mean_token_accuracy": 0.9875211685895919, "step": 63780 }, { "epoch": 6.38, "grad_norm": 0.4796670973300934, "learning_rate": 3.620362036203621e-05, "loss": 0.1355, "mean_token_accuracy": 0.9838226675987244, "step": 63800 }, { "epoch": 6.382, "grad_norm": 0.24762354791164398, "learning_rate": 3.6183618361836184e-05, "loss": 0.1474, "mean_token_accuracy": 0.985103240609169, "step": 63820 }, { "epoch": 6.384, "grad_norm": 0.5400233268737793, "learning_rate": 3.616361636163617e-05, "loss": 0.185, "mean_token_accuracy": 0.9902496129274369, "step": 63840 }, { "epoch": 6.386, "grad_norm": 0.6661818623542786, "learning_rate": 3.6143614361436144e-05, "loss": 0.1951, "mean_token_accuracy": 0.9840015888214111, "step": 63860 }, { "epoch": 6.388, "grad_norm": 0.2799980938434601, "learning_rate": 3.612361236123613e-05, "loss": 0.1407, "mean_token_accuracy": 0.9857525259256363, "step": 63880 }, { "epoch": 6.39, "grad_norm": 0.34568992257118225, "learning_rate": 3.61036103610361e-05, "loss": 0.1695, "mean_token_accuracy": 0.9880797564983368, "step": 63900 }, { "epoch": 6.392, "grad_norm": 0.2683306634426117, "learning_rate": 3.6083608360836086e-05, "loss": 0.1381, "mean_token_accuracy": 0.9880622118711472, "step": 63920 }, { "epoch": 6.394, "grad_norm": 0.258354514837265, "learning_rate": 3.606360636063606e-05, "loss": 0.1582, "mean_token_accuracy": 0.9867158532142639, "step": 63940 }, { "epoch": 6.396, "grad_norm": 0.2288939356803894, "learning_rate": 3.6043604360436045e-05, "loss": 0.2122, "mean_token_accuracy": 0.986763808131218, "step": 63960 }, { "epoch": 6.398, "grad_norm": 0.23877175152301788, "learning_rate": 3.602360236023602e-05, "loss": 0.1548, "mean_token_accuracy": 0.984081581234932, "step": 63980 }, { "epoch": 6.4, "grad_norm": 0.2783271074295044, "learning_rate": 3.6003600360036005e-05, "loss": 0.1334, "mean_token_accuracy": 0.9845703661441803, "step": 64000 }, { "epoch": 6.402, "grad_norm": 0.22530542314052582, "learning_rate": 3.598359835983599e-05, "loss": 0.2572, "mean_token_accuracy": 0.9859477162361145, "step": 64020 }, { "epoch": 6.404, "grad_norm": 0.2665806710720062, "learning_rate": 3.5963596359635964e-05, "loss": 0.1791, "mean_token_accuracy": 0.9871680557727813, "step": 64040 }, { "epoch": 6.406, "grad_norm": 0.3637467622756958, "learning_rate": 3.594359435943595e-05, "loss": 0.1698, "mean_token_accuracy": 0.9879029482603073, "step": 64060 }, { "epoch": 6.408, "grad_norm": 11.959065437316895, "learning_rate": 3.5923592359235923e-05, "loss": 0.1823, "mean_token_accuracy": 0.9818393081426621, "step": 64080 }, { "epoch": 6.41, "grad_norm": 0.5870586037635803, "learning_rate": 3.5903590359035906e-05, "loss": 0.1829, "mean_token_accuracy": 0.9823683321475982, "step": 64100 }, { "epoch": 6.412, "grad_norm": 0.3541073203086853, "learning_rate": 3.588358835883588e-05, "loss": 0.2413, "mean_token_accuracy": 0.9873243987560272, "step": 64120 }, { "epoch": 6.414, "grad_norm": 0.36593398451805115, "learning_rate": 3.5863586358635866e-05, "loss": 0.2304, "mean_token_accuracy": 0.9880183964967728, "step": 64140 }, { "epoch": 6.416, "grad_norm": 0.32528409361839294, "learning_rate": 3.584358435843584e-05, "loss": 0.1665, "mean_token_accuracy": 0.9887857288122177, "step": 64160 }, { "epoch": 6.418, "grad_norm": 0.2859201729297638, "learning_rate": 3.5823582358235825e-05, "loss": 0.1128, "mean_token_accuracy": 0.9856736958026886, "step": 64180 }, { "epoch": 6.42, "grad_norm": 0.4936062693595886, "learning_rate": 3.58035803580358e-05, "loss": 0.1299, "mean_token_accuracy": 0.9878364264965057, "step": 64200 }, { "epoch": 6.422, "grad_norm": 0.2617616653442383, "learning_rate": 3.5783578357835784e-05, "loss": 0.1843, "mean_token_accuracy": 0.9885910212993622, "step": 64220 }, { "epoch": 6.424, "grad_norm": 0.2544686496257782, "learning_rate": 3.576357635763577e-05, "loss": 0.239, "mean_token_accuracy": 0.9771861046552658, "step": 64240 }, { "epoch": 6.426, "grad_norm": 0.3257557153701782, "learning_rate": 3.5743574357435744e-05, "loss": 0.1601, "mean_token_accuracy": 0.9887812256813049, "step": 64260 }, { "epoch": 6.428, "grad_norm": 0.309286892414093, "learning_rate": 3.572357235723573e-05, "loss": 0.2889, "mean_token_accuracy": 0.9866312891244888, "step": 64280 }, { "epoch": 6.43, "grad_norm": 0.32245585322380066, "learning_rate": 3.57035703570357e-05, "loss": 0.1908, "mean_token_accuracy": 0.9884273558855057, "step": 64300 }, { "epoch": 6.432, "grad_norm": 2.329608917236328, "learning_rate": 3.5683568356835686e-05, "loss": 0.2355, "mean_token_accuracy": 0.9790932804346084, "step": 64320 }, { "epoch": 6.434, "grad_norm": 0.21288111805915833, "learning_rate": 3.566356635663566e-05, "loss": 0.2099, "mean_token_accuracy": 0.9871852606534958, "step": 64340 }, { "epoch": 6.436, "grad_norm": 0.2690618932247162, "learning_rate": 3.5643564356435645e-05, "loss": 0.2398, "mean_token_accuracy": 0.9869323968887329, "step": 64360 }, { "epoch": 6.438, "grad_norm": 0.24804188311100006, "learning_rate": 3.562356235623562e-05, "loss": 0.1388, "mean_token_accuracy": 0.9823918521404267, "step": 64380 }, { "epoch": 6.44, "grad_norm": 0.3203963339328766, "learning_rate": 3.5603560356035605e-05, "loss": 0.231, "mean_token_accuracy": 0.985198637843132, "step": 64400 }, { "epoch": 6.442, "grad_norm": 0.5585077404975891, "learning_rate": 3.558355835583558e-05, "loss": 0.2214, "mean_token_accuracy": 0.9900667667388916, "step": 64420 }, { "epoch": 6.444, "grad_norm": 0.2642773985862732, "learning_rate": 3.5563556355635564e-05, "loss": 0.1112, "mean_token_accuracy": 0.9878390461206437, "step": 64440 }, { "epoch": 6.446, "grad_norm": 0.2737801969051361, "learning_rate": 3.554355435543554e-05, "loss": 0.1276, "mean_token_accuracy": 0.9882962316274643, "step": 64460 }, { "epoch": 6.448, "grad_norm": 4.764594078063965, "learning_rate": 3.552355235523552e-05, "loss": 0.1896, "mean_token_accuracy": 0.9877075523138046, "step": 64480 }, { "epoch": 6.45, "grad_norm": 0.28715193271636963, "learning_rate": 3.5503550355035506e-05, "loss": 0.1722, "mean_token_accuracy": 0.9863953590393066, "step": 64500 }, { "epoch": 6.452, "grad_norm": 0.3426864445209503, "learning_rate": 3.548354835483548e-05, "loss": 0.1271, "mean_token_accuracy": 0.9892733126878739, "step": 64520 }, { "epoch": 6.454, "grad_norm": 0.297380656003952, "learning_rate": 3.5463546354635466e-05, "loss": 0.1615, "mean_token_accuracy": 0.9881208211183548, "step": 64540 }, { "epoch": 6.456, "grad_norm": 0.3012201189994812, "learning_rate": 3.544354435443544e-05, "loss": 0.236, "mean_token_accuracy": 0.9882899224758148, "step": 64560 }, { "epoch": 6.458, "grad_norm": 0.7689819931983948, "learning_rate": 3.5423542354235425e-05, "loss": 0.2369, "mean_token_accuracy": 0.9881375283002853, "step": 64580 }, { "epoch": 6.46, "grad_norm": 0.3618817925453186, "learning_rate": 3.54035403540354e-05, "loss": 0.1813, "mean_token_accuracy": 0.9850628793239593, "step": 64600 }, { "epoch": 6.462, "grad_norm": 0.20518335700035095, "learning_rate": 3.5383538353835384e-05, "loss": 0.1496, "mean_token_accuracy": 0.9866809070110321, "step": 64620 }, { "epoch": 6.464, "grad_norm": 0.8180327415466309, "learning_rate": 3.536353635363536e-05, "loss": 0.1372, "mean_token_accuracy": 0.9871083110570907, "step": 64640 }, { "epoch": 6.466, "grad_norm": 0.475656121969223, "learning_rate": 3.5343534353435344e-05, "loss": 0.1797, "mean_token_accuracy": 0.9878749370574951, "step": 64660 }, { "epoch": 6.468, "grad_norm": 0.28141477704048157, "learning_rate": 3.532353235323533e-05, "loss": 0.0987, "mean_token_accuracy": 0.9874036729335784, "step": 64680 }, { "epoch": 6.47, "grad_norm": 0.2420186549425125, "learning_rate": 3.530353035303531e-05, "loss": 0.1702, "mean_token_accuracy": 0.9884171426296234, "step": 64700 }, { "epoch": 6.4719999999999995, "grad_norm": 0.2372388243675232, "learning_rate": 3.5283528352835286e-05, "loss": 0.2173, "mean_token_accuracy": 0.9843870818614959, "step": 64720 }, { "epoch": 6.474, "grad_norm": 0.2746586203575134, "learning_rate": 3.526352635263527e-05, "loss": 0.1828, "mean_token_accuracy": 0.9879689812660217, "step": 64740 }, { "epoch": 6.476, "grad_norm": 0.2967888116836548, "learning_rate": 3.5243524352435245e-05, "loss": 0.2552, "mean_token_accuracy": 0.9858520418405533, "step": 64760 }, { "epoch": 6.478, "grad_norm": 0.349786639213562, "learning_rate": 3.522352235223523e-05, "loss": 0.1235, "mean_token_accuracy": 0.9878809750080109, "step": 64780 }, { "epoch": 6.48, "grad_norm": 0.27367645502090454, "learning_rate": 3.5203520352035205e-05, "loss": 0.2176, "mean_token_accuracy": 0.9866064608097076, "step": 64800 }, { "epoch": 6.482, "grad_norm": 0.2383221834897995, "learning_rate": 3.518351835183519e-05, "loss": 0.2171, "mean_token_accuracy": 0.9890417009592056, "step": 64820 }, { "epoch": 6.484, "grad_norm": 0.25220412015914917, "learning_rate": 3.5163516351635164e-05, "loss": 0.1399, "mean_token_accuracy": 0.9879716664552689, "step": 64840 }, { "epoch": 6.486, "grad_norm": 0.16845068335533142, "learning_rate": 3.514351435143515e-05, "loss": 0.1171, "mean_token_accuracy": 0.9891416251659393, "step": 64860 }, { "epoch": 6.4879999999999995, "grad_norm": 0.25492092967033386, "learning_rate": 3.512351235123513e-05, "loss": 0.3084, "mean_token_accuracy": 0.9904765665531159, "step": 64880 }, { "epoch": 6.49, "grad_norm": 0.4969242215156555, "learning_rate": 3.5103510351035106e-05, "loss": 0.1322, "mean_token_accuracy": 0.9865708947181702, "step": 64900 }, { "epoch": 6.492, "grad_norm": 0.44470611214637756, "learning_rate": 3.508350835083509e-05, "loss": 0.1113, "mean_token_accuracy": 0.9873694986104965, "step": 64920 }, { "epoch": 6.494, "grad_norm": 0.24348346889019012, "learning_rate": 3.5063506350635066e-05, "loss": 0.2178, "mean_token_accuracy": 0.9869630575180054, "step": 64940 }, { "epoch": 6.496, "grad_norm": 0.3169240653514862, "learning_rate": 3.504350435043505e-05, "loss": 0.2573, "mean_token_accuracy": 0.9852478832006455, "step": 64960 }, { "epoch": 6.498, "grad_norm": 0.5856525301933289, "learning_rate": 3.5023502350235025e-05, "loss": 0.3131, "mean_token_accuracy": 0.9794980108737945, "step": 64980 }, { "epoch": 6.5, "grad_norm": 0.2033928483724594, "learning_rate": 3.500350035003501e-05, "loss": 0.1034, "mean_token_accuracy": 0.9881351113319397, "step": 65000 }, { "epoch": 6.502, "grad_norm": 0.25470849871635437, "learning_rate": 3.4983498349834984e-05, "loss": 0.1954, "mean_token_accuracy": 0.989227831363678, "step": 65020 }, { "epoch": 6.504, "grad_norm": 0.2394360601902008, "learning_rate": 3.496349634963497e-05, "loss": 0.1781, "mean_token_accuracy": 0.985652819275856, "step": 65040 }, { "epoch": 6.506, "grad_norm": 0.1907133311033249, "learning_rate": 3.4943494349434944e-05, "loss": 0.1568, "mean_token_accuracy": 0.9887098580598831, "step": 65060 }, { "epoch": 6.508, "grad_norm": 0.6740947961807251, "learning_rate": 3.492349234923493e-05, "loss": 0.2318, "mean_token_accuracy": 0.9872565478086471, "step": 65080 }, { "epoch": 6.51, "grad_norm": 0.31721019744873047, "learning_rate": 3.490349034903491e-05, "loss": 0.1634, "mean_token_accuracy": 0.9840515375137329, "step": 65100 }, { "epoch": 6.5120000000000005, "grad_norm": 0.35921400785446167, "learning_rate": 3.4883488348834886e-05, "loss": 0.2144, "mean_token_accuracy": 0.9872325479984283, "step": 65120 }, { "epoch": 6.514, "grad_norm": 0.25063034892082214, "learning_rate": 3.486348634863487e-05, "loss": 0.1483, "mean_token_accuracy": 0.988674345612526, "step": 65140 }, { "epoch": 6.516, "grad_norm": 0.5194647908210754, "learning_rate": 3.4843484348434845e-05, "loss": 0.1237, "mean_token_accuracy": 0.9864558905363083, "step": 65160 }, { "epoch": 6.518, "grad_norm": 0.30012306571006775, "learning_rate": 3.482348234823483e-05, "loss": 0.1008, "mean_token_accuracy": 0.9855094760656357, "step": 65180 }, { "epoch": 6.52, "grad_norm": 0.2475326657295227, "learning_rate": 3.4803480348034805e-05, "loss": 0.1773, "mean_token_accuracy": 0.9887704491615296, "step": 65200 }, { "epoch": 6.522, "grad_norm": 0.3020193874835968, "learning_rate": 3.478347834783479e-05, "loss": 0.1853, "mean_token_accuracy": 0.9881248742341995, "step": 65220 }, { "epoch": 6.524, "grad_norm": 0.26361531019210815, "learning_rate": 3.4763476347634764e-05, "loss": 0.1496, "mean_token_accuracy": 0.986868703365326, "step": 65240 }, { "epoch": 6.526, "grad_norm": 0.34751096367836, "learning_rate": 3.474347434743475e-05, "loss": 0.121, "mean_token_accuracy": 0.9846732199192048, "step": 65260 }, { "epoch": 6.5280000000000005, "grad_norm": 0.2599371671676636, "learning_rate": 3.472347234723472e-05, "loss": 0.2068, "mean_token_accuracy": 0.9877131402492523, "step": 65280 }, { "epoch": 6.53, "grad_norm": 0.25878310203552246, "learning_rate": 3.4703470347034706e-05, "loss": 0.2145, "mean_token_accuracy": 0.9841939240694046, "step": 65300 }, { "epoch": 6.532, "grad_norm": 0.2514811158180237, "learning_rate": 3.468346834683468e-05, "loss": 0.1328, "mean_token_accuracy": 0.9876660674810409, "step": 65320 }, { "epoch": 6.534, "grad_norm": 0.2900117039680481, "learning_rate": 3.4663466346634666e-05, "loss": 0.1852, "mean_token_accuracy": 0.9841011852025986, "step": 65340 }, { "epoch": 6.536, "grad_norm": 9.43431282043457, "learning_rate": 3.464346434643465e-05, "loss": 0.1889, "mean_token_accuracy": 0.9860668867826462, "step": 65360 }, { "epoch": 6.538, "grad_norm": 3.867013454437256, "learning_rate": 3.4623462346234625e-05, "loss": 0.3195, "mean_token_accuracy": 0.9834309399127961, "step": 65380 }, { "epoch": 6.54, "grad_norm": 0.2462446689605713, "learning_rate": 3.460346034603461e-05, "loss": 0.2567, "mean_token_accuracy": 0.9813687205314636, "step": 65400 }, { "epoch": 6.542, "grad_norm": 0.28149789571762085, "learning_rate": 3.4583458345834584e-05, "loss": 0.1898, "mean_token_accuracy": 0.9887377411127091, "step": 65420 }, { "epoch": 6.5440000000000005, "grad_norm": 10.527507781982422, "learning_rate": 3.456345634563457e-05, "loss": 0.1562, "mean_token_accuracy": 0.9861860424280167, "step": 65440 }, { "epoch": 6.546, "grad_norm": 0.2952827513217926, "learning_rate": 3.4543454345434544e-05, "loss": 0.1504, "mean_token_accuracy": 0.986117422580719, "step": 65460 }, { "epoch": 6.548, "grad_norm": 0.30273720622062683, "learning_rate": 3.452345234523453e-05, "loss": 0.1509, "mean_token_accuracy": 0.9869212001562119, "step": 65480 }, { "epoch": 6.55, "grad_norm": 0.21065431833267212, "learning_rate": 3.45034503450345e-05, "loss": 0.2143, "mean_token_accuracy": 0.9835163921117782, "step": 65500 }, { "epoch": 6.552, "grad_norm": 0.2854054570198059, "learning_rate": 3.4483448344834486e-05, "loss": 0.1679, "mean_token_accuracy": 0.9887353748083114, "step": 65520 }, { "epoch": 6.554, "grad_norm": 0.28037339448928833, "learning_rate": 3.446344634463446e-05, "loss": 0.1569, "mean_token_accuracy": 0.9867696642875672, "step": 65540 }, { "epoch": 6.556, "grad_norm": 0.33940213918685913, "learning_rate": 3.4443444344434445e-05, "loss": 0.1584, "mean_token_accuracy": 0.9865298241376876, "step": 65560 }, { "epoch": 6.558, "grad_norm": 0.21554148197174072, "learning_rate": 3.442344234423442e-05, "loss": 0.1557, "mean_token_accuracy": 0.9880954951047898, "step": 65580 }, { "epoch": 6.5600000000000005, "grad_norm": 0.33988118171691895, "learning_rate": 3.4403440344034405e-05, "loss": 0.233, "mean_token_accuracy": 0.9869731038808822, "step": 65600 }, { "epoch": 6.562, "grad_norm": 0.22510764002799988, "learning_rate": 3.438343834383439e-05, "loss": 0.1558, "mean_token_accuracy": 0.9885844707489013, "step": 65620 }, { "epoch": 6.564, "grad_norm": 0.3186708688735962, "learning_rate": 3.4363436343634364e-05, "loss": 0.1514, "mean_token_accuracy": 0.9863959729671479, "step": 65640 }, { "epoch": 6.566, "grad_norm": 0.3562796711921692, "learning_rate": 3.434343434343435e-05, "loss": 0.1246, "mean_token_accuracy": 0.9841464042663575, "step": 65660 }, { "epoch": 6.568, "grad_norm": 0.26717644929885864, "learning_rate": 3.432343234323432e-05, "loss": 0.14, "mean_token_accuracy": 0.9864898949861527, "step": 65680 }, { "epoch": 6.57, "grad_norm": 0.9867539405822754, "learning_rate": 3.4303430343034306e-05, "loss": 0.1539, "mean_token_accuracy": 0.9889231055974961, "step": 65700 }, { "epoch": 6.572, "grad_norm": 0.28953826427459717, "learning_rate": 3.428342834283428e-05, "loss": 0.1446, "mean_token_accuracy": 0.9880344808101654, "step": 65720 }, { "epoch": 6.574, "grad_norm": 0.2548098564147949, "learning_rate": 3.4263426342634266e-05, "loss": 0.2924, "mean_token_accuracy": 0.9860222071409226, "step": 65740 }, { "epoch": 6.576, "grad_norm": 0.2960788309574127, "learning_rate": 3.424342434243424e-05, "loss": 0.1392, "mean_token_accuracy": 0.9848441511392594, "step": 65760 }, { "epoch": 6.578, "grad_norm": 0.37591812014579773, "learning_rate": 3.4223422342234225e-05, "loss": 0.2468, "mean_token_accuracy": 0.9831920832395553, "step": 65780 }, { "epoch": 6.58, "grad_norm": 3.008094310760498, "learning_rate": 3.42034203420342e-05, "loss": 0.1932, "mean_token_accuracy": 0.981335711479187, "step": 65800 }, { "epoch": 6.582, "grad_norm": 0.4146102964878082, "learning_rate": 3.4183418341834184e-05, "loss": 0.2408, "mean_token_accuracy": 0.9874918967485428, "step": 65820 }, { "epoch": 6.584, "grad_norm": 0.2505679726600647, "learning_rate": 3.416341634163416e-05, "loss": 0.213, "mean_token_accuracy": 0.9848211109638214, "step": 65840 }, { "epoch": 6.586, "grad_norm": 0.2763100266456604, "learning_rate": 3.4143414341434144e-05, "loss": 0.1467, "mean_token_accuracy": 0.9863694489002228, "step": 65860 }, { "epoch": 6.588, "grad_norm": 0.2883995771408081, "learning_rate": 3.4123412341234127e-05, "loss": 0.2814, "mean_token_accuracy": 0.985490944981575, "step": 65880 }, { "epoch": 6.59, "grad_norm": 0.3209068477153778, "learning_rate": 3.41034103410341e-05, "loss": 0.161, "mean_token_accuracy": 0.9883510082960129, "step": 65900 }, { "epoch": 6.592, "grad_norm": 0.3681967258453369, "learning_rate": 3.4083408340834086e-05, "loss": 0.1181, "mean_token_accuracy": 0.9851167440414429, "step": 65920 }, { "epoch": 6.594, "grad_norm": 0.23478251695632935, "learning_rate": 3.406340634063406e-05, "loss": 0.2797, "mean_token_accuracy": 0.9839485317468644, "step": 65940 }, { "epoch": 6.596, "grad_norm": 0.2355402559041977, "learning_rate": 3.4043404340434045e-05, "loss": 0.1612, "mean_token_accuracy": 0.9878865987062454, "step": 65960 }, { "epoch": 6.598, "grad_norm": 0.30481576919555664, "learning_rate": 3.402340234023402e-05, "loss": 0.1765, "mean_token_accuracy": 0.9871097505092621, "step": 65980 }, { "epoch": 6.6, "grad_norm": 0.5653330683708191, "learning_rate": 3.4003400340034005e-05, "loss": 0.2023, "mean_token_accuracy": 0.9877082020044327, "step": 66000 }, { "epoch": 6.602, "grad_norm": 0.3429393768310547, "learning_rate": 3.398339833983398e-05, "loss": 0.2024, "mean_token_accuracy": 0.9884268552064895, "step": 66020 }, { "epoch": 6.604, "grad_norm": 0.24607805907726288, "learning_rate": 3.3963396339633964e-05, "loss": 0.1702, "mean_token_accuracy": 0.9849891006946564, "step": 66040 }, { "epoch": 6.606, "grad_norm": 0.27575555443763733, "learning_rate": 3.394339433943394e-05, "loss": 0.205, "mean_token_accuracy": 0.9842185050249099, "step": 66060 }, { "epoch": 6.608, "grad_norm": 0.2980971932411194, "learning_rate": 3.392339233923392e-05, "loss": 0.1836, "mean_token_accuracy": 0.9884531289339066, "step": 66080 }, { "epoch": 6.61, "grad_norm": 0.25330203771591187, "learning_rate": 3.39033903390339e-05, "loss": 0.14, "mean_token_accuracy": 0.9857312232255936, "step": 66100 }, { "epoch": 6.612, "grad_norm": 0.27526959776878357, "learning_rate": 3.388338833883388e-05, "loss": 0.1325, "mean_token_accuracy": 0.988544425368309, "step": 66120 }, { "epoch": 6.614, "grad_norm": 0.2577066421508789, "learning_rate": 3.3863386338633866e-05, "loss": 0.2489, "mean_token_accuracy": 0.9873633593320846, "step": 66140 }, { "epoch": 6.616, "grad_norm": 0.8204617500305176, "learning_rate": 3.384338433843384e-05, "loss": 0.1461, "mean_token_accuracy": 0.9888998448848725, "step": 66160 }, { "epoch": 6.618, "grad_norm": 8.482430458068848, "learning_rate": 3.3823382338233825e-05, "loss": 0.2545, "mean_token_accuracy": 0.9846042037010193, "step": 66180 }, { "epoch": 6.62, "grad_norm": 0.3015037477016449, "learning_rate": 3.380338033803381e-05, "loss": 0.1456, "mean_token_accuracy": 0.9881495058536529, "step": 66200 }, { "epoch": 6.622, "grad_norm": 8.676420211791992, "learning_rate": 3.378337833783379e-05, "loss": 0.1976, "mean_token_accuracy": 0.9804931282997131, "step": 66220 }, { "epoch": 6.624, "grad_norm": 0.4197008013725281, "learning_rate": 3.376337633763377e-05, "loss": 0.2068, "mean_token_accuracy": 0.9863222807645797, "step": 66240 }, { "epoch": 6.626, "grad_norm": 0.22658655047416687, "learning_rate": 3.374337433743375e-05, "loss": 0.1414, "mean_token_accuracy": 0.9864144712686539, "step": 66260 }, { "epoch": 6.628, "grad_norm": 0.6414458751678467, "learning_rate": 3.3723372337233727e-05, "loss": 0.2526, "mean_token_accuracy": 0.9827042579650879, "step": 66280 }, { "epoch": 6.63, "grad_norm": 0.31275904178619385, "learning_rate": 3.370337033703371e-05, "loss": 0.1605, "mean_token_accuracy": 0.9863550662994385, "step": 66300 }, { "epoch": 6.632, "grad_norm": 0.24986162781715393, "learning_rate": 3.3683368336833686e-05, "loss": 0.1669, "mean_token_accuracy": 0.9855017274618149, "step": 66320 }, { "epoch": 6.634, "grad_norm": 0.8875067830085754, "learning_rate": 3.366336633663367e-05, "loss": 0.1764, "mean_token_accuracy": 0.9875601291656494, "step": 66340 }, { "epoch": 6.636, "grad_norm": 0.25811055302619934, "learning_rate": 3.3643364336433645e-05, "loss": 0.1541, "mean_token_accuracy": 0.9830195665359497, "step": 66360 }, { "epoch": 6.638, "grad_norm": 0.7111265063285828, "learning_rate": 3.362336233623363e-05, "loss": 0.16, "mean_token_accuracy": 0.9875197410583496, "step": 66380 }, { "epoch": 6.64, "grad_norm": 0.22530420124530792, "learning_rate": 3.3603360336033604e-05, "loss": 0.1109, "mean_token_accuracy": 0.9871774911880493, "step": 66400 }, { "epoch": 6.642, "grad_norm": 0.22556200623512268, "learning_rate": 3.358335833583359e-05, "loss": 0.2371, "mean_token_accuracy": 0.9874253988265991, "step": 66420 }, { "epoch": 6.644, "grad_norm": 0.6490240693092346, "learning_rate": 3.3563356335633564e-05, "loss": 0.1946, "mean_token_accuracy": 0.989126768708229, "step": 66440 }, { "epoch": 6.646, "grad_norm": 0.3039954900741577, "learning_rate": 3.354335433543355e-05, "loss": 0.1601, "mean_token_accuracy": 0.9868967175483704, "step": 66460 }, { "epoch": 6.648, "grad_norm": 0.26356399059295654, "learning_rate": 3.352335233523353e-05, "loss": 0.208, "mean_token_accuracy": 0.9877393305301666, "step": 66480 }, { "epoch": 6.65, "grad_norm": 0.2841387391090393, "learning_rate": 3.3503350335033506e-05, "loss": 0.1878, "mean_token_accuracy": 0.9862100094556808, "step": 66500 }, { "epoch": 6.652, "grad_norm": 0.35584843158721924, "learning_rate": 3.348334833483349e-05, "loss": 0.166, "mean_token_accuracy": 0.9884832799434662, "step": 66520 }, { "epoch": 6.654, "grad_norm": 0.2699015140533447, "learning_rate": 3.3463346334633465e-05, "loss": 0.2326, "mean_token_accuracy": 0.9873291581869126, "step": 66540 }, { "epoch": 6.656, "grad_norm": 0.2778572738170624, "learning_rate": 3.344334433443345e-05, "loss": 0.1573, "mean_token_accuracy": 0.9877862930297852, "step": 66560 }, { "epoch": 6.658, "grad_norm": 0.24280045926570892, "learning_rate": 3.3423342334233425e-05, "loss": 0.1436, "mean_token_accuracy": 0.9839800238609314, "step": 66580 }, { "epoch": 6.66, "grad_norm": 0.32988959550857544, "learning_rate": 3.340334033403341e-05, "loss": 0.2395, "mean_token_accuracy": 0.9891884714365006, "step": 66600 }, { "epoch": 6.662, "grad_norm": 0.19273823499679565, "learning_rate": 3.3383338333833384e-05, "loss": 0.1117, "mean_token_accuracy": 0.9866424381732941, "step": 66620 }, { "epoch": 6.664, "grad_norm": 22.25006675720215, "learning_rate": 3.336333633363337e-05, "loss": 0.192, "mean_token_accuracy": 0.9853893876075744, "step": 66640 }, { "epoch": 6.666, "grad_norm": 0.37878134846687317, "learning_rate": 3.3343334333433343e-05, "loss": 0.1784, "mean_token_accuracy": 0.9836116075515747, "step": 66660 }, { "epoch": 6.668, "grad_norm": 0.28100696206092834, "learning_rate": 3.3323332333233326e-05, "loss": 0.1834, "mean_token_accuracy": 0.9865212142467499, "step": 66680 }, { "epoch": 6.67, "grad_norm": 6.001819610595703, "learning_rate": 3.33033303330333e-05, "loss": 0.2986, "mean_token_accuracy": 0.9851398855447769, "step": 66700 }, { "epoch": 6.672, "grad_norm": 0.2698683738708496, "learning_rate": 3.3283328332833286e-05, "loss": 0.1842, "mean_token_accuracy": 0.98656947016716, "step": 66720 }, { "epoch": 6.674, "grad_norm": 1.235374093055725, "learning_rate": 3.326332633263327e-05, "loss": 0.2262, "mean_token_accuracy": 0.9889723300933838, "step": 66740 }, { "epoch": 6.676, "grad_norm": 0.31436681747436523, "learning_rate": 3.3243324332433245e-05, "loss": 0.1203, "mean_token_accuracy": 0.9895979076623916, "step": 66760 }, { "epoch": 6.678, "grad_norm": 0.3051697909832001, "learning_rate": 3.322332233223323e-05, "loss": 0.1844, "mean_token_accuracy": 0.9810862272977829, "step": 66780 }, { "epoch": 6.68, "grad_norm": 0.38624250888824463, "learning_rate": 3.3203320332033204e-05, "loss": 0.2051, "mean_token_accuracy": 0.9842140942811965, "step": 66800 }, { "epoch": 6.682, "grad_norm": 0.3333258032798767, "learning_rate": 3.318331833183319e-05, "loss": 0.2049, "mean_token_accuracy": 0.9828038513660431, "step": 66820 }, { "epoch": 6.684, "grad_norm": 0.3059077262878418, "learning_rate": 3.3163316331633164e-05, "loss": 0.3231, "mean_token_accuracy": 0.9819667905569076, "step": 66840 }, { "epoch": 6.686, "grad_norm": 0.23571985960006714, "learning_rate": 3.314331433143315e-05, "loss": 0.2766, "mean_token_accuracy": 0.9828603684902191, "step": 66860 }, { "epoch": 6.688, "grad_norm": 0.3341544568538666, "learning_rate": 3.312331233123312e-05, "loss": 0.2, "mean_token_accuracy": 0.981201457977295, "step": 66880 }, { "epoch": 6.6899999999999995, "grad_norm": 0.20517708361148834, "learning_rate": 3.3103310331033106e-05, "loss": 0.1359, "mean_token_accuracy": 0.9861009567975998, "step": 66900 }, { "epoch": 6.692, "grad_norm": 0.20901262760162354, "learning_rate": 3.308330833083308e-05, "loss": 0.1491, "mean_token_accuracy": 0.9805674880743027, "step": 66920 }, { "epoch": 6.694, "grad_norm": 0.38141506910324097, "learning_rate": 3.3063306330633065e-05, "loss": 0.2047, "mean_token_accuracy": 0.9824846893548965, "step": 66940 }, { "epoch": 6.696, "grad_norm": 0.32883167266845703, "learning_rate": 3.304330433043304e-05, "loss": 0.2109, "mean_token_accuracy": 0.9882402628660202, "step": 66960 }, { "epoch": 6.698, "grad_norm": 0.2746795415878296, "learning_rate": 3.3023302330233025e-05, "loss": 0.1618, "mean_token_accuracy": 0.9888942569494248, "step": 66980 }, { "epoch": 6.7, "grad_norm": 0.3388822376728058, "learning_rate": 3.300330033003301e-05, "loss": 0.2288, "mean_token_accuracy": 0.9852576136589051, "step": 67000 }, { "epoch": 6.702, "grad_norm": 0.23704639077186584, "learning_rate": 3.2983298329832984e-05, "loss": 0.1862, "mean_token_accuracy": 0.9825613260269165, "step": 67020 }, { "epoch": 6.704, "grad_norm": 0.23309941589832306, "learning_rate": 3.296329632963297e-05, "loss": 0.1943, "mean_token_accuracy": 0.9871392101049423, "step": 67040 }, { "epoch": 6.7059999999999995, "grad_norm": 0.3322908878326416, "learning_rate": 3.294329432943294e-05, "loss": 0.192, "mean_token_accuracy": 0.9876873016357421, "step": 67060 }, { "epoch": 6.708, "grad_norm": 0.368615061044693, "learning_rate": 3.2923292329232926e-05, "loss": 0.2952, "mean_token_accuracy": 0.9833579957485199, "step": 67080 }, { "epoch": 6.71, "grad_norm": 0.29024094343185425, "learning_rate": 3.29032903290329e-05, "loss": 0.1535, "mean_token_accuracy": 0.9885296612977982, "step": 67100 }, { "epoch": 6.712, "grad_norm": 0.3332720398902893, "learning_rate": 3.2883288328832886e-05, "loss": 0.1752, "mean_token_accuracy": 0.9885250508785248, "step": 67120 }, { "epoch": 6.714, "grad_norm": 0.31717562675476074, "learning_rate": 3.286328632863286e-05, "loss": 0.1627, "mean_token_accuracy": 0.9863089680671692, "step": 67140 }, { "epoch": 6.716, "grad_norm": 0.1809220165014267, "learning_rate": 3.2843284328432845e-05, "loss": 0.2254, "mean_token_accuracy": 0.9863983750343323, "step": 67160 }, { "epoch": 6.718, "grad_norm": 0.22698359191417694, "learning_rate": 3.282328232823282e-05, "loss": 0.1427, "mean_token_accuracy": 0.9863959521055221, "step": 67180 }, { "epoch": 6.72, "grad_norm": 0.37504521012306213, "learning_rate": 3.2803280328032804e-05, "loss": 0.2497, "mean_token_accuracy": 0.986793264746666, "step": 67200 }, { "epoch": 6.7219999999999995, "grad_norm": 0.22187499701976776, "learning_rate": 3.278327832783278e-05, "loss": 0.1311, "mean_token_accuracy": 0.9885304778814316, "step": 67220 }, { "epoch": 6.724, "grad_norm": 3.8319780826568604, "learning_rate": 3.2763276327632764e-05, "loss": 0.1758, "mean_token_accuracy": 0.9843073695898056, "step": 67240 }, { "epoch": 6.726, "grad_norm": 0.3025054931640625, "learning_rate": 3.274327432743275e-05, "loss": 0.2147, "mean_token_accuracy": 0.9839147478342056, "step": 67260 }, { "epoch": 6.728, "grad_norm": 0.2283153235912323, "learning_rate": 3.272327232723272e-05, "loss": 0.1584, "mean_token_accuracy": 0.9848929047584534, "step": 67280 }, { "epoch": 6.73, "grad_norm": 0.1698990762233734, "learning_rate": 3.2703270327032706e-05, "loss": 0.3386, "mean_token_accuracy": 0.9745432734489441, "step": 67300 }, { "epoch": 6.732, "grad_norm": 0.2796860635280609, "learning_rate": 3.268326832683268e-05, "loss": 0.1555, "mean_token_accuracy": 0.9873958885669708, "step": 67320 }, { "epoch": 6.734, "grad_norm": 0.31227266788482666, "learning_rate": 3.2663266326632665e-05, "loss": 0.1926, "mean_token_accuracy": 0.9855979651212692, "step": 67340 }, { "epoch": 6.736, "grad_norm": 0.28321322798728943, "learning_rate": 3.264326432643264e-05, "loss": 0.1684, "mean_token_accuracy": 0.9876653283834458, "step": 67360 }, { "epoch": 6.7379999999999995, "grad_norm": 0.1991456001996994, "learning_rate": 3.2623262326232625e-05, "loss": 0.1819, "mean_token_accuracy": 0.9871483951807022, "step": 67380 }, { "epoch": 6.74, "grad_norm": 0.25637978315353394, "learning_rate": 3.26032603260326e-05, "loss": 0.1423, "mean_token_accuracy": 0.9874047040939331, "step": 67400 }, { "epoch": 6.742, "grad_norm": 0.2250969558954239, "learning_rate": 3.2583258325832584e-05, "loss": 0.1912, "mean_token_accuracy": 0.9786381661891937, "step": 67420 }, { "epoch": 6.744, "grad_norm": 0.4206317365169525, "learning_rate": 3.256325632563256e-05, "loss": 0.1697, "mean_token_accuracy": 0.98397755920887, "step": 67440 }, { "epoch": 6.746, "grad_norm": 0.2994478940963745, "learning_rate": 3.254325432543254e-05, "loss": 0.1674, "mean_token_accuracy": 0.9887321650981903, "step": 67460 }, { "epoch": 6.748, "grad_norm": 0.30622997879981995, "learning_rate": 3.252325232523252e-05, "loss": 0.1292, "mean_token_accuracy": 0.9872935265302658, "step": 67480 }, { "epoch": 6.75, "grad_norm": 0.320875883102417, "learning_rate": 3.25032503250325e-05, "loss": 0.1152, "mean_token_accuracy": 0.9882820695638657, "step": 67500 }, { "epoch": 6.752, "grad_norm": 0.2548742890357971, "learning_rate": 3.2483248324832486e-05, "loss": 0.1639, "mean_token_accuracy": 0.9866363942623139, "step": 67520 }, { "epoch": 6.754, "grad_norm": 0.2628532350063324, "learning_rate": 3.246324632463246e-05, "loss": 0.328, "mean_token_accuracy": 0.9870528250932693, "step": 67540 }, { "epoch": 6.756, "grad_norm": 0.40560418367385864, "learning_rate": 3.2443244324432445e-05, "loss": 0.3059, "mean_token_accuracy": 0.9855130910873413, "step": 67560 }, { "epoch": 6.758, "grad_norm": 0.4384123384952545, "learning_rate": 3.242324232423242e-05, "loss": 0.2455, "mean_token_accuracy": 0.974761575460434, "step": 67580 }, { "epoch": 6.76, "grad_norm": 0.29215511679649353, "learning_rate": 3.2403240324032404e-05, "loss": 0.1726, "mean_token_accuracy": 0.990128967165947, "step": 67600 }, { "epoch": 6.7620000000000005, "grad_norm": 0.3757854700088501, "learning_rate": 3.238323832383238e-05, "loss": 0.1542, "mean_token_accuracy": 0.9849118947982788, "step": 67620 }, { "epoch": 6.764, "grad_norm": 0.4048632085323334, "learning_rate": 3.2363236323632364e-05, "loss": 0.2229, "mean_token_accuracy": 0.9861485362052917, "step": 67640 }, { "epoch": 6.766, "grad_norm": 0.41197705268859863, "learning_rate": 3.234323432343234e-05, "loss": 0.2416, "mean_token_accuracy": 0.9817881077528, "step": 67660 }, { "epoch": 6.768, "grad_norm": 0.334850549697876, "learning_rate": 3.232323232323233e-05, "loss": 0.1408, "mean_token_accuracy": 0.9879557847976684, "step": 67680 }, { "epoch": 6.77, "grad_norm": 0.27262434363365173, "learning_rate": 3.2303230323032306e-05, "loss": 0.188, "mean_token_accuracy": 0.9886330723762512, "step": 67700 }, { "epoch": 6.772, "grad_norm": 0.215382382273674, "learning_rate": 3.228322832283229e-05, "loss": 0.193, "mean_token_accuracy": 0.9867016077041626, "step": 67720 }, { "epoch": 6.774, "grad_norm": 0.37804049253463745, "learning_rate": 3.2263226322632265e-05, "loss": 0.2394, "mean_token_accuracy": 0.9865109175443649, "step": 67740 }, { "epoch": 6.776, "grad_norm": 0.2594779133796692, "learning_rate": 3.224322432243225e-05, "loss": 0.2369, "mean_token_accuracy": 0.983840137720108, "step": 67760 }, { "epoch": 6.7780000000000005, "grad_norm": 4.518535137176514, "learning_rate": 3.2223222322232225e-05, "loss": 0.2627, "mean_token_accuracy": 0.9824739277362824, "step": 67780 }, { "epoch": 6.78, "grad_norm": 0.2568599581718445, "learning_rate": 3.220322032203221e-05, "loss": 0.1308, "mean_token_accuracy": 0.9893053829669952, "step": 67800 }, { "epoch": 6.782, "grad_norm": 0.3522271513938904, "learning_rate": 3.2183218321832184e-05, "loss": 0.1906, "mean_token_accuracy": 0.9887803196907043, "step": 67820 }, { "epoch": 6.784, "grad_norm": 0.218488872051239, "learning_rate": 3.216321632163217e-05, "loss": 0.1109, "mean_token_accuracy": 0.9848962247371673, "step": 67840 }, { "epoch": 6.786, "grad_norm": 0.31839674711227417, "learning_rate": 3.214321432143215e-05, "loss": 0.2184, "mean_token_accuracy": 0.9863736361265183, "step": 67860 }, { "epoch": 6.788, "grad_norm": 1.6474993228912354, "learning_rate": 3.2123212321232126e-05, "loss": 0.1579, "mean_token_accuracy": 0.9856426000595093, "step": 67880 }, { "epoch": 6.79, "grad_norm": 0.23062625527381897, "learning_rate": 3.210321032103211e-05, "loss": 0.1282, "mean_token_accuracy": 0.9876395732164382, "step": 67900 }, { "epoch": 6.792, "grad_norm": 0.5541673302650452, "learning_rate": 3.2083208320832086e-05, "loss": 0.1702, "mean_token_accuracy": 0.9839796870946884, "step": 67920 }, { "epoch": 6.7940000000000005, "grad_norm": 0.19621090590953827, "learning_rate": 3.206320632063207e-05, "loss": 0.1875, "mean_token_accuracy": 0.9889530390501022, "step": 67940 }, { "epoch": 6.796, "grad_norm": 0.35208648443222046, "learning_rate": 3.2043204320432045e-05, "loss": 0.2, "mean_token_accuracy": 0.9877236157655715, "step": 67960 }, { "epoch": 6.798, "grad_norm": 0.4297434985637665, "learning_rate": 3.202320232023203e-05, "loss": 0.2156, "mean_token_accuracy": 0.9873518258333206, "step": 67980 }, { "epoch": 6.8, "grad_norm": 0.23863787949085236, "learning_rate": 3.2003200320032004e-05, "loss": 0.1143, "mean_token_accuracy": 0.9882578641176224, "step": 68000 }, { "epoch": 6.802, "grad_norm": 0.29143643379211426, "learning_rate": 3.198319831983199e-05, "loss": 0.19, "mean_token_accuracy": 0.9883712470531464, "step": 68020 }, { "epoch": 6.804, "grad_norm": 0.2614004909992218, "learning_rate": 3.1963196319631964e-05, "loss": 0.1429, "mean_token_accuracy": 0.988724285364151, "step": 68040 }, { "epoch": 6.806, "grad_norm": 0.9233289361000061, "learning_rate": 3.1943194319431947e-05, "loss": 0.1347, "mean_token_accuracy": 0.9890417784452439, "step": 68060 }, { "epoch": 6.808, "grad_norm": 0.6898273229598999, "learning_rate": 3.192319231923192e-05, "loss": 0.2051, "mean_token_accuracy": 0.9834380477666855, "step": 68080 }, { "epoch": 6.8100000000000005, "grad_norm": 0.9228523969650269, "learning_rate": 3.1903190319031906e-05, "loss": 0.1885, "mean_token_accuracy": 0.9817050814628601, "step": 68100 }, { "epoch": 6.812, "grad_norm": 0.2223232537508011, "learning_rate": 3.188318831883189e-05, "loss": 0.158, "mean_token_accuracy": 0.9888252437114715, "step": 68120 }, { "epoch": 6.814, "grad_norm": 0.22317834198474884, "learning_rate": 3.1863186318631865e-05, "loss": 0.176, "mean_token_accuracy": 0.9867993026971817, "step": 68140 }, { "epoch": 6.816, "grad_norm": 0.24395714700222015, "learning_rate": 3.184318431843185e-05, "loss": 0.2485, "mean_token_accuracy": 0.9864526867866517, "step": 68160 }, { "epoch": 6.818, "grad_norm": 0.21640793979167938, "learning_rate": 3.1823182318231825e-05, "loss": 0.1479, "mean_token_accuracy": 0.9871033281087875, "step": 68180 }, { "epoch": 6.82, "grad_norm": 0.22830717265605927, "learning_rate": 3.180318031803181e-05, "loss": 0.1491, "mean_token_accuracy": 0.9866976916790009, "step": 68200 }, { "epoch": 6.822, "grad_norm": 0.25865185260772705, "learning_rate": 3.1783178317831784e-05, "loss": 0.2021, "mean_token_accuracy": 0.9883598834276199, "step": 68220 }, { "epoch": 6.824, "grad_norm": 0.3863964378833771, "learning_rate": 3.176317631763177e-05, "loss": 0.2256, "mean_token_accuracy": 0.986169746518135, "step": 68240 }, { "epoch": 6.826, "grad_norm": 0.4281299114227295, "learning_rate": 3.174317431743174e-05, "loss": 0.1523, "mean_token_accuracy": 0.9874548286199569, "step": 68260 }, { "epoch": 6.828, "grad_norm": 0.9782912731170654, "learning_rate": 3.1723172317231726e-05, "loss": 0.2152, "mean_token_accuracy": 0.9869198113679886, "step": 68280 }, { "epoch": 6.83, "grad_norm": 0.2953052818775177, "learning_rate": 3.17031703170317e-05, "loss": 0.2568, "mean_token_accuracy": 0.9839261889457702, "step": 68300 }, { "epoch": 6.832, "grad_norm": 0.3077390789985657, "learning_rate": 3.1683168316831686e-05, "loss": 0.1455, "mean_token_accuracy": 0.9886887848377228, "step": 68320 }, { "epoch": 6.834, "grad_norm": 0.22098474204540253, "learning_rate": 3.166316631663166e-05, "loss": 0.1832, "mean_token_accuracy": 0.9886153310537338, "step": 68340 }, { "epoch": 6.836, "grad_norm": 0.3550775349140167, "learning_rate": 3.1643164316431645e-05, "loss": 0.1916, "mean_token_accuracy": 0.9854147911071778, "step": 68360 }, { "epoch": 6.838, "grad_norm": 0.21668541431427002, "learning_rate": 3.162316231623163e-05, "loss": 0.1994, "mean_token_accuracy": 0.9894854754209519, "step": 68380 }, { "epoch": 6.84, "grad_norm": 0.29914531111717224, "learning_rate": 3.1603160316031604e-05, "loss": 0.1813, "mean_token_accuracy": 0.9851445347070694, "step": 68400 }, { "epoch": 6.842, "grad_norm": 0.24209974706172943, "learning_rate": 3.158315831583159e-05, "loss": 0.2602, "mean_token_accuracy": 0.9856644958257675, "step": 68420 }, { "epoch": 6.844, "grad_norm": 0.20449785888195038, "learning_rate": 3.1563156315631564e-05, "loss": 0.2271, "mean_token_accuracy": 0.9863549023866653, "step": 68440 }, { "epoch": 6.846, "grad_norm": 0.540649950504303, "learning_rate": 3.1543154315431547e-05, "loss": 0.2811, "mean_token_accuracy": 0.9832740992307663, "step": 68460 }, { "epoch": 6.848, "grad_norm": 0.2816218435764313, "learning_rate": 3.152315231523152e-05, "loss": 0.1789, "mean_token_accuracy": 0.9863565534353256, "step": 68480 }, { "epoch": 6.85, "grad_norm": 0.28009310364723206, "learning_rate": 3.1503150315031506e-05, "loss": 0.1833, "mean_token_accuracy": 0.9888320088386535, "step": 68500 }, { "epoch": 6.852, "grad_norm": 0.37210869789123535, "learning_rate": 3.148314831483148e-05, "loss": 0.1887, "mean_token_accuracy": 0.9900014221668243, "step": 68520 }, { "epoch": 6.854, "grad_norm": 0.2786544859409332, "learning_rate": 3.1463146314631465e-05, "loss": 0.1818, "mean_token_accuracy": 0.9856808900833129, "step": 68540 }, { "epoch": 6.856, "grad_norm": 1.2909044027328491, "learning_rate": 3.144314431443144e-05, "loss": 0.3337, "mean_token_accuracy": 0.9796359777450562, "step": 68560 }, { "epoch": 6.858, "grad_norm": 0.2546485364437103, "learning_rate": 3.1423142314231425e-05, "loss": 0.2774, "mean_token_accuracy": 0.982107275724411, "step": 68580 }, { "epoch": 6.86, "grad_norm": 0.478765070438385, "learning_rate": 3.14031403140314e-05, "loss": 0.2948, "mean_token_accuracy": 0.9815145641565323, "step": 68600 }, { "epoch": 6.862, "grad_norm": 0.34715837240219116, "learning_rate": 3.1383138313831384e-05, "loss": 0.2787, "mean_token_accuracy": 0.9859073609113693, "step": 68620 }, { "epoch": 6.864, "grad_norm": 0.273219496011734, "learning_rate": 3.136313631363137e-05, "loss": 0.1491, "mean_token_accuracy": 0.9861700236797333, "step": 68640 }, { "epoch": 6.866, "grad_norm": 0.30269500613212585, "learning_rate": 3.134313431343134e-05, "loss": 0.1273, "mean_token_accuracy": 0.985478749871254, "step": 68660 }, { "epoch": 6.868, "grad_norm": 0.23566226661205292, "learning_rate": 3.1323132313231326e-05, "loss": 0.1583, "mean_token_accuracy": 0.9866245329380036, "step": 68680 }, { "epoch": 6.87, "grad_norm": 0.2886464595794678, "learning_rate": 3.13031303130313e-05, "loss": 0.1903, "mean_token_accuracy": 0.9876408457756043, "step": 68700 }, { "epoch": 6.872, "grad_norm": 0.30322951078414917, "learning_rate": 3.1283128312831286e-05, "loss": 0.2349, "mean_token_accuracy": 0.9768937677145004, "step": 68720 }, { "epoch": 6.874, "grad_norm": 0.23224574327468872, "learning_rate": 3.126312631263126e-05, "loss": 0.1159, "mean_token_accuracy": 0.9890233010053635, "step": 68740 }, { "epoch": 6.876, "grad_norm": 0.3122113347053528, "learning_rate": 3.1243124312431245e-05, "loss": 0.1353, "mean_token_accuracy": 0.9868241071701049, "step": 68760 }, { "epoch": 6.878, "grad_norm": 14.301602363586426, "learning_rate": 3.122312231223122e-05, "loss": 0.296, "mean_token_accuracy": 0.9779287964105606, "step": 68780 }, { "epoch": 6.88, "grad_norm": 0.860538899898529, "learning_rate": 3.1203120312031204e-05, "loss": 0.1709, "mean_token_accuracy": 0.988333448767662, "step": 68800 }, { "epoch": 6.882, "grad_norm": 0.34551170468330383, "learning_rate": 3.118311831183118e-05, "loss": 0.1729, "mean_token_accuracy": 0.9888276845216751, "step": 68820 }, { "epoch": 6.884, "grad_norm": 0.8343037962913513, "learning_rate": 3.1163116311631163e-05, "loss": 0.2107, "mean_token_accuracy": 0.9871218472719192, "step": 68840 }, { "epoch": 6.886, "grad_norm": 0.2624155879020691, "learning_rate": 3.114311431143114e-05, "loss": 0.2257, "mean_token_accuracy": 0.9854477882385254, "step": 68860 }, { "epoch": 6.888, "grad_norm": 0.3681867718696594, "learning_rate": 3.112311231123112e-05, "loss": 0.1165, "mean_token_accuracy": 0.9863348096609116, "step": 68880 }, { "epoch": 6.89, "grad_norm": 0.20479971170425415, "learning_rate": 3.1103110311031106e-05, "loss": 0.1992, "mean_token_accuracy": 0.9863291531801224, "step": 68900 }, { "epoch": 6.892, "grad_norm": 0.27564841508865356, "learning_rate": 3.108310831083108e-05, "loss": 0.1855, "mean_token_accuracy": 0.9868548721075058, "step": 68920 }, { "epoch": 6.894, "grad_norm": 0.4634324312210083, "learning_rate": 3.1063106310631065e-05, "loss": 0.1577, "mean_token_accuracy": 0.9873145699501038, "step": 68940 }, { "epoch": 6.896, "grad_norm": 0.35037001967430115, "learning_rate": 3.104310431043104e-05, "loss": 0.1972, "mean_token_accuracy": 0.9764459371566773, "step": 68960 }, { "epoch": 6.898, "grad_norm": 0.3132155239582062, "learning_rate": 3.1023102310231024e-05, "loss": 0.2176, "mean_token_accuracy": 0.9887728959321975, "step": 68980 }, { "epoch": 6.9, "grad_norm": 0.2379944771528244, "learning_rate": 3.1003100310031e-05, "loss": 0.1086, "mean_token_accuracy": 0.9866978317499161, "step": 69000 }, { "epoch": 6.902, "grad_norm": 0.16201472282409668, "learning_rate": 3.0983098309830984e-05, "loss": 0.1422, "mean_token_accuracy": 0.9882227867841721, "step": 69020 }, { "epoch": 6.904, "grad_norm": 0.26545456051826477, "learning_rate": 3.096309630963096e-05, "loss": 0.1486, "mean_token_accuracy": 0.9874608486890792, "step": 69040 }, { "epoch": 6.906, "grad_norm": 0.2523564100265503, "learning_rate": 3.094309430943094e-05, "loss": 0.152, "mean_token_accuracy": 0.9866646885871887, "step": 69060 }, { "epoch": 6.908, "grad_norm": 0.23957517743110657, "learning_rate": 3.092309230923092e-05, "loss": 0.1084, "mean_token_accuracy": 0.9886752992868424, "step": 69080 }, { "epoch": 6.91, "grad_norm": 0.29990366101264954, "learning_rate": 3.09030903090309e-05, "loss": 0.2276, "mean_token_accuracy": 0.9839458227157593, "step": 69100 }, { "epoch": 6.912, "grad_norm": 20.81692886352539, "learning_rate": 3.088308830883088e-05, "loss": 0.1704, "mean_token_accuracy": 0.9876567929983139, "step": 69120 }, { "epoch": 6.914, "grad_norm": 0.19382436573505402, "learning_rate": 3.086308630863086e-05, "loss": 0.2208, "mean_token_accuracy": 0.9885877698659897, "step": 69140 }, { "epoch": 6.916, "grad_norm": 0.42698726058006287, "learning_rate": 3.0843084308430845e-05, "loss": 0.1416, "mean_token_accuracy": 0.9838666409254074, "step": 69160 }, { "epoch": 6.918, "grad_norm": 0.28950241208076477, "learning_rate": 3.082308230823083e-05, "loss": 0.1512, "mean_token_accuracy": 0.9857223808765412, "step": 69180 }, { "epoch": 6.92, "grad_norm": 0.32689014077186584, "learning_rate": 3.0803080308030804e-05, "loss": 0.2636, "mean_token_accuracy": 0.9849308073520661, "step": 69200 }, { "epoch": 6.922, "grad_norm": 0.22183984518051147, "learning_rate": 3.078307830783079e-05, "loss": 0.2784, "mean_token_accuracy": 0.9841628253459931, "step": 69220 }, { "epoch": 6.924, "grad_norm": 0.5368547439575195, "learning_rate": 3.076307630763077e-05, "loss": 0.2192, "mean_token_accuracy": 0.9845182865858078, "step": 69240 }, { "epoch": 6.926, "grad_norm": 0.43609869480133057, "learning_rate": 3.0743074307430746e-05, "loss": 0.1838, "mean_token_accuracy": 0.9795934647321701, "step": 69260 }, { "epoch": 6.928, "grad_norm": 0.18525128066539764, "learning_rate": 3.072307230723073e-05, "loss": 0.1696, "mean_token_accuracy": 0.9878548800945282, "step": 69280 }, { "epoch": 6.93, "grad_norm": 0.20019426941871643, "learning_rate": 3.0703070307030706e-05, "loss": 0.1316, "mean_token_accuracy": 0.9871724486351013, "step": 69300 }, { "epoch": 6.932, "grad_norm": 0.2751573324203491, "learning_rate": 3.068306830683069e-05, "loss": 0.1631, "mean_token_accuracy": 0.9874874830245972, "step": 69320 }, { "epoch": 6.934, "grad_norm": 0.3051007390022278, "learning_rate": 3.0663066306630665e-05, "loss": 0.1469, "mean_token_accuracy": 0.9859991878271103, "step": 69340 }, { "epoch": 6.936, "grad_norm": 0.2559065818786621, "learning_rate": 3.064306430643065e-05, "loss": 0.2262, "mean_token_accuracy": 0.9814842760562896, "step": 69360 }, { "epoch": 6.938, "grad_norm": 0.2103295624256134, "learning_rate": 3.0623062306230624e-05, "loss": 0.1573, "mean_token_accuracy": 0.9880674064159394, "step": 69380 }, { "epoch": 6.9399999999999995, "grad_norm": 0.31433534622192383, "learning_rate": 3.060306030603061e-05, "loss": 0.1568, "mean_token_accuracy": 0.9827150464057922, "step": 69400 }, { "epoch": 6.942, "grad_norm": 0.17441219091415405, "learning_rate": 3.0583058305830584e-05, "loss": 0.228, "mean_token_accuracy": 0.9815599143505096, "step": 69420 }, { "epoch": 6.944, "grad_norm": 0.2455831915140152, "learning_rate": 3.056305630563057e-05, "loss": 0.2034, "mean_token_accuracy": 0.9816018372774125, "step": 69440 }, { "epoch": 6.946, "grad_norm": 0.25589051842689514, "learning_rate": 3.054305430543054e-05, "loss": 0.338, "mean_token_accuracy": 0.9870156973600388, "step": 69460 }, { "epoch": 6.948, "grad_norm": 0.20396576821804047, "learning_rate": 3.0523052305230526e-05, "loss": 0.1522, "mean_token_accuracy": 0.9864003121852875, "step": 69480 }, { "epoch": 6.95, "grad_norm": 0.28686144948005676, "learning_rate": 3.0503050305030506e-05, "loss": 0.2136, "mean_token_accuracy": 0.9871064603328705, "step": 69500 }, { "epoch": 6.952, "grad_norm": 0.26144638657569885, "learning_rate": 3.0483048304830485e-05, "loss": 0.2091, "mean_token_accuracy": 0.9844709366559983, "step": 69520 }, { "epoch": 6.954, "grad_norm": 0.30962127447128296, "learning_rate": 3.0463046304630465e-05, "loss": 0.2714, "mean_token_accuracy": 0.9869352042675018, "step": 69540 }, { "epoch": 6.9559999999999995, "grad_norm": 0.2319253385066986, "learning_rate": 3.0443044304430445e-05, "loss": 0.1752, "mean_token_accuracy": 0.9877912402153015, "step": 69560 }, { "epoch": 6.958, "grad_norm": 0.3258090913295746, "learning_rate": 3.0423042304230424e-05, "loss": 0.2044, "mean_token_accuracy": 0.9860555082559586, "step": 69580 }, { "epoch": 6.96, "grad_norm": 0.23980863392353058, "learning_rate": 3.0403040304030404e-05, "loss": 0.1359, "mean_token_accuracy": 0.9867068082094193, "step": 69600 }, { "epoch": 6.962, "grad_norm": 0.2290395349264145, "learning_rate": 3.0383038303830387e-05, "loss": 0.1788, "mean_token_accuracy": 0.9862692594528198, "step": 69620 }, { "epoch": 6.964, "grad_norm": 0.7723631262779236, "learning_rate": 3.0363036303630367e-05, "loss": 0.1802, "mean_token_accuracy": 0.9871639549732208, "step": 69640 }, { "epoch": 6.966, "grad_norm": 0.2767546474933624, "learning_rate": 3.0343034303430346e-05, "loss": 0.1764, "mean_token_accuracy": 0.9867234200239181, "step": 69660 }, { "epoch": 6.968, "grad_norm": 0.23288334906101227, "learning_rate": 3.0323032303230326e-05, "loss": 0.2698, "mean_token_accuracy": 0.9805299788713455, "step": 69680 }, { "epoch": 6.97, "grad_norm": 0.29315999150276184, "learning_rate": 3.0303030303030306e-05, "loss": 0.1557, "mean_token_accuracy": 0.9876276224851608, "step": 69700 }, { "epoch": 6.9719999999999995, "grad_norm": 0.27140623331069946, "learning_rate": 3.0283028302830285e-05, "loss": 0.2113, "mean_token_accuracy": 0.9869115114212036, "step": 69720 }, { "epoch": 6.974, "grad_norm": 0.2818463444709778, "learning_rate": 3.0263026302630265e-05, "loss": 0.1251, "mean_token_accuracy": 0.9901355177164077, "step": 69740 }, { "epoch": 6.976, "grad_norm": 0.4276299774646759, "learning_rate": 3.0243024302430245e-05, "loss": 0.1978, "mean_token_accuracy": 0.9869349122047424, "step": 69760 }, { "epoch": 6.978, "grad_norm": 0.2207600325345993, "learning_rate": 3.0223022302230224e-05, "loss": 0.2529, "mean_token_accuracy": 0.9875721603631973, "step": 69780 }, { "epoch": 6.98, "grad_norm": 0.21087269484996796, "learning_rate": 3.0203020302030204e-05, "loss": 0.1678, "mean_token_accuracy": 0.9863366395235061, "step": 69800 }, { "epoch": 6.982, "grad_norm": 0.2368898242712021, "learning_rate": 3.0183018301830184e-05, "loss": 0.127, "mean_token_accuracy": 0.9835608989000321, "step": 69820 }, { "epoch": 6.984, "grad_norm": 0.23772145807743073, "learning_rate": 3.0163016301630163e-05, "loss": 0.1122, "mean_token_accuracy": 0.9879611730575562, "step": 69840 }, { "epoch": 6.986, "grad_norm": 1.6670657396316528, "learning_rate": 3.0143014301430146e-05, "loss": 0.2291, "mean_token_accuracy": 0.9846957176923752, "step": 69860 }, { "epoch": 6.9879999999999995, "grad_norm": 0.23266828060150146, "learning_rate": 3.0123012301230126e-05, "loss": 0.15, "mean_token_accuracy": 0.9865671008825302, "step": 69880 }, { "epoch": 6.99, "grad_norm": 0.21664167940616608, "learning_rate": 3.0103010301030106e-05, "loss": 0.2618, "mean_token_accuracy": 0.985537189245224, "step": 69900 }, { "epoch": 6.992, "grad_norm": 4.031055450439453, "learning_rate": 3.0083008300830085e-05, "loss": 0.1381, "mean_token_accuracy": 0.9856501132249832, "step": 69920 }, { "epoch": 6.994, "grad_norm": 0.19343681633472443, "learning_rate": 3.0063006300630065e-05, "loss": 0.2625, "mean_token_accuracy": 0.9841539889574051, "step": 69940 }, { "epoch": 6.996, "grad_norm": 0.6522583961486816, "learning_rate": 3.0043004300430045e-05, "loss": 0.1635, "mean_token_accuracy": 0.9874361306428909, "step": 69960 }, { "epoch": 6.998, "grad_norm": 0.28200703859329224, "learning_rate": 3.0023002300230024e-05, "loss": 0.259, "mean_token_accuracy": 0.9861367017030715, "step": 69980 }, { "epoch": 7.0, "grad_norm": 0.9795888662338257, "learning_rate": 3.0003000300030004e-05, "loss": 0.1471, "mean_token_accuracy": 0.9875567227602005, "step": 70000 }, { "epoch": 7.002, "grad_norm": 0.2032836675643921, "learning_rate": 2.9982998299829984e-05, "loss": 0.1503, "mean_token_accuracy": 0.986686709523201, "step": 70020 }, { "epoch": 7.004, "grad_norm": 0.5300183892250061, "learning_rate": 2.9962996299629963e-05, "loss": 0.1504, "mean_token_accuracy": 0.990347895026207, "step": 70040 }, { "epoch": 7.006, "grad_norm": 0.923738956451416, "learning_rate": 2.9942994299429943e-05, "loss": 0.1835, "mean_token_accuracy": 0.9899703621864319, "step": 70060 }, { "epoch": 7.008, "grad_norm": 0.27765560150146484, "learning_rate": 2.9922992299229923e-05, "loss": 0.1109, "mean_token_accuracy": 0.9873852819204331, "step": 70080 }, { "epoch": 7.01, "grad_norm": 0.6907898187637329, "learning_rate": 2.9902990299029902e-05, "loss": 0.1679, "mean_token_accuracy": 0.9864766448736191, "step": 70100 }, { "epoch": 7.012, "grad_norm": 0.6382275819778442, "learning_rate": 2.9882988298829885e-05, "loss": 0.1476, "mean_token_accuracy": 0.9915387004613876, "step": 70120 }, { "epoch": 7.014, "grad_norm": 0.45420217514038086, "learning_rate": 2.9862986298629865e-05, "loss": 0.1141, "mean_token_accuracy": 0.9894871443510056, "step": 70140 }, { "epoch": 7.016, "grad_norm": 0.2645356357097626, "learning_rate": 2.9842984298429845e-05, "loss": 0.1174, "mean_token_accuracy": 0.984830230474472, "step": 70160 }, { "epoch": 7.018, "grad_norm": 0.3828628957271576, "learning_rate": 2.9822982298229824e-05, "loss": 0.1658, "mean_token_accuracy": 0.9896336466073989, "step": 70180 }, { "epoch": 7.02, "grad_norm": 0.35479456186294556, "learning_rate": 2.9802980298029804e-05, "loss": 0.3161, "mean_token_accuracy": 0.984274971485138, "step": 70200 }, { "epoch": 7.022, "grad_norm": 0.2492641806602478, "learning_rate": 2.9782978297829784e-05, "loss": 0.18, "mean_token_accuracy": 0.9880323022603988, "step": 70220 }, { "epoch": 7.024, "grad_norm": 0.25470560789108276, "learning_rate": 2.9762976297629763e-05, "loss": 0.169, "mean_token_accuracy": 0.9900365144014358, "step": 70240 }, { "epoch": 7.026, "grad_norm": 0.48732566833496094, "learning_rate": 2.9742974297429743e-05, "loss": 0.1219, "mean_token_accuracy": 0.9872163474559784, "step": 70260 }, { "epoch": 7.028, "grad_norm": 0.23377813398838043, "learning_rate": 2.9722972297229723e-05, "loss": 0.1485, "mean_token_accuracy": 0.9885260581970214, "step": 70280 }, { "epoch": 7.03, "grad_norm": 0.39890897274017334, "learning_rate": 2.9702970297029702e-05, "loss": 0.1925, "mean_token_accuracy": 0.9875983476638794, "step": 70300 }, { "epoch": 7.032, "grad_norm": 0.207405686378479, "learning_rate": 2.9682968296829682e-05, "loss": 0.2237, "mean_token_accuracy": 0.9857867330312728, "step": 70320 }, { "epoch": 7.034, "grad_norm": 0.21824681758880615, "learning_rate": 2.966296629662966e-05, "loss": 0.1512, "mean_token_accuracy": 0.9871966779232025, "step": 70340 }, { "epoch": 7.036, "grad_norm": 0.21276003122329712, "learning_rate": 2.964296429642964e-05, "loss": 0.2268, "mean_token_accuracy": 0.9924437671899795, "step": 70360 }, { "epoch": 7.038, "grad_norm": 0.37183961272239685, "learning_rate": 2.9622962296229624e-05, "loss": 0.1539, "mean_token_accuracy": 0.9872448444366455, "step": 70380 }, { "epoch": 7.04, "grad_norm": 0.279134064912796, "learning_rate": 2.9602960296029604e-05, "loss": 0.1896, "mean_token_accuracy": 0.9874804019927979, "step": 70400 }, { "epoch": 7.042, "grad_norm": 0.4229012131690979, "learning_rate": 2.9582958295829584e-05, "loss": 0.2226, "mean_token_accuracy": 0.9898339241743088, "step": 70420 }, { "epoch": 7.044, "grad_norm": 0.3871731758117676, "learning_rate": 2.9562956295629563e-05, "loss": 0.2033, "mean_token_accuracy": 0.9890114277601242, "step": 70440 }, { "epoch": 7.046, "grad_norm": 0.24057747423648834, "learning_rate": 2.9542954295429543e-05, "loss": 0.2198, "mean_token_accuracy": 0.9710906326770783, "step": 70460 }, { "epoch": 7.048, "grad_norm": 0.6838417053222656, "learning_rate": 2.9522952295229523e-05, "loss": 0.109, "mean_token_accuracy": 0.9882032305002213, "step": 70480 }, { "epoch": 7.05, "grad_norm": 0.24834579229354858, "learning_rate": 2.9502950295029502e-05, "loss": 0.1576, "mean_token_accuracy": 0.9910332560539246, "step": 70500 }, { "epoch": 7.052, "grad_norm": 0.2627458870410919, "learning_rate": 2.9482948294829482e-05, "loss": 0.0988, "mean_token_accuracy": 0.9903418987989425, "step": 70520 }, { "epoch": 7.054, "grad_norm": 0.3727167844772339, "learning_rate": 2.946294629462946e-05, "loss": 0.1438, "mean_token_accuracy": 0.9910328924655915, "step": 70540 }, { "epoch": 7.056, "grad_norm": 0.4475243091583252, "learning_rate": 2.944294429442944e-05, "loss": 0.1782, "mean_token_accuracy": 0.9904597252607346, "step": 70560 }, { "epoch": 7.058, "grad_norm": 0.23138169944286346, "learning_rate": 2.942294229422942e-05, "loss": 0.2694, "mean_token_accuracy": 0.9857080161571503, "step": 70580 }, { "epoch": 7.06, "grad_norm": 0.26962652802467346, "learning_rate": 2.94029402940294e-05, "loss": 0.1906, "mean_token_accuracy": 0.9891306191682816, "step": 70600 }, { "epoch": 7.062, "grad_norm": 0.45916229486465454, "learning_rate": 2.938293829382938e-05, "loss": 0.171, "mean_token_accuracy": 0.9902126073837281, "step": 70620 }, { "epoch": 7.064, "grad_norm": 0.29979559779167175, "learning_rate": 2.9362936293629363e-05, "loss": 0.1819, "mean_token_accuracy": 0.9904740393161774, "step": 70640 }, { "epoch": 7.066, "grad_norm": 0.4728180468082428, "learning_rate": 2.9342934293429343e-05, "loss": 0.1277, "mean_token_accuracy": 0.9904728889465332, "step": 70660 }, { "epoch": 7.068, "grad_norm": 0.3561569154262543, "learning_rate": 2.9322932293229326e-05, "loss": 0.1593, "mean_token_accuracy": 0.9872538536787033, "step": 70680 }, { "epoch": 7.07, "grad_norm": 0.22337721288204193, "learning_rate": 2.9302930293029306e-05, "loss": 0.1931, "mean_token_accuracy": 0.9853521138429642, "step": 70700 }, { "epoch": 7.072, "grad_norm": 0.32758423686027527, "learning_rate": 2.928292829282929e-05, "loss": 0.2926, "mean_token_accuracy": 0.9888787508010864, "step": 70720 }, { "epoch": 7.074, "grad_norm": 0.2531667649745941, "learning_rate": 2.9262926292629268e-05, "loss": 0.1465, "mean_token_accuracy": 0.9840857475996018, "step": 70740 }, { "epoch": 7.076, "grad_norm": 0.4359259605407715, "learning_rate": 2.9242924292429248e-05, "loss": 0.1575, "mean_token_accuracy": 0.9904336214065552, "step": 70760 }, { "epoch": 7.078, "grad_norm": 0.3066394627094269, "learning_rate": 2.9222922292229228e-05, "loss": 0.168, "mean_token_accuracy": 0.9886292725801468, "step": 70780 }, { "epoch": 7.08, "grad_norm": 0.2727543115615845, "learning_rate": 2.9202920292029207e-05, "loss": 0.2003, "mean_token_accuracy": 0.9869277894496917, "step": 70800 }, { "epoch": 7.082, "grad_norm": 0.23162348568439484, "learning_rate": 2.9182918291829187e-05, "loss": 0.1666, "mean_token_accuracy": 0.9889775454998017, "step": 70820 }, { "epoch": 7.084, "grad_norm": 0.7029260993003845, "learning_rate": 2.9162916291629167e-05, "loss": 0.1208, "mean_token_accuracy": 0.9897910416126251, "step": 70840 }, { "epoch": 7.086, "grad_norm": 0.4460585117340088, "learning_rate": 2.9142914291429146e-05, "loss": 0.1275, "mean_token_accuracy": 0.9894942790269852, "step": 70860 }, { "epoch": 7.088, "grad_norm": 0.26191234588623047, "learning_rate": 2.9122912291229126e-05, "loss": 0.1579, "mean_token_accuracy": 0.9904881447553635, "step": 70880 }, { "epoch": 7.09, "grad_norm": 0.547494113445282, "learning_rate": 2.9102910291029106e-05, "loss": 0.1422, "mean_token_accuracy": 0.9885366350412369, "step": 70900 }, { "epoch": 7.092, "grad_norm": 0.1583574414253235, "learning_rate": 2.9082908290829085e-05, "loss": 0.1045, "mean_token_accuracy": 0.9904805332422256, "step": 70920 }, { "epoch": 7.094, "grad_norm": 0.8738057017326355, "learning_rate": 2.9062906290629065e-05, "loss": 0.1142, "mean_token_accuracy": 0.9852668553590774, "step": 70940 }, { "epoch": 7.096, "grad_norm": 0.2610366642475128, "learning_rate": 2.9042904290429045e-05, "loss": 0.0974, "mean_token_accuracy": 0.9903378814458847, "step": 70960 }, { "epoch": 7.098, "grad_norm": 0.17115390300750732, "learning_rate": 2.9022902290229028e-05, "loss": 0.2109, "mean_token_accuracy": 0.9869120687246322, "step": 70980 }, { "epoch": 7.1, "grad_norm": 0.3043868839740753, "learning_rate": 2.9002900290029007e-05, "loss": 0.1095, "mean_token_accuracy": 0.9909919470548629, "step": 71000 }, { "epoch": 7.102, "grad_norm": 0.35026803612709045, "learning_rate": 2.8982898289828987e-05, "loss": 0.2398, "mean_token_accuracy": 0.9896520167589188, "step": 71020 }, { "epoch": 7.104, "grad_norm": 0.833526611328125, "learning_rate": 2.8962896289628967e-05, "loss": 0.1491, "mean_token_accuracy": 0.9908526778221131, "step": 71040 }, { "epoch": 7.106, "grad_norm": 0.6268945336341858, "learning_rate": 2.8942894289428946e-05, "loss": 0.1352, "mean_token_accuracy": 0.9899603813886643, "step": 71060 }, { "epoch": 7.108, "grad_norm": 0.7561474442481995, "learning_rate": 2.8922892289228926e-05, "loss": 0.1935, "mean_token_accuracy": 0.9889992654323578, "step": 71080 }, { "epoch": 7.11, "grad_norm": 0.21166473627090454, "learning_rate": 2.8902890289028906e-05, "loss": 0.2269, "mean_token_accuracy": 0.9850024461746216, "step": 71100 }, { "epoch": 7.112, "grad_norm": 0.30928322672843933, "learning_rate": 2.8882888288828885e-05, "loss": 0.1853, "mean_token_accuracy": 0.9911071747541428, "step": 71120 }, { "epoch": 7.114, "grad_norm": 0.2953234314918518, "learning_rate": 2.8862886288628865e-05, "loss": 0.2016, "mean_token_accuracy": 0.9788285672664643, "step": 71140 }, { "epoch": 7.116, "grad_norm": 0.29581642150878906, "learning_rate": 2.8842884288428845e-05, "loss": 0.1887, "mean_token_accuracy": 0.9868236899375915, "step": 71160 }, { "epoch": 7.118, "grad_norm": 0.31380903720855713, "learning_rate": 2.8822882288228824e-05, "loss": 0.1701, "mean_token_accuracy": 0.9893961429595948, "step": 71180 }, { "epoch": 7.12, "grad_norm": 1.2183904647827148, "learning_rate": 2.8802880288028804e-05, "loss": 0.1524, "mean_token_accuracy": 0.9894449084997177, "step": 71200 }, { "epoch": 7.122, "grad_norm": 0.41939908266067505, "learning_rate": 2.8782878287828783e-05, "loss": 0.2039, "mean_token_accuracy": 0.9909680724143982, "step": 71220 }, { "epoch": 7.124, "grad_norm": 0.27011480927467346, "learning_rate": 2.8762876287628767e-05, "loss": 0.1375, "mean_token_accuracy": 0.9890353351831436, "step": 71240 }, { "epoch": 7.126, "grad_norm": 0.3153814375400543, "learning_rate": 2.8742874287428746e-05, "loss": 0.126, "mean_token_accuracy": 0.9882440090179443, "step": 71260 }, { "epoch": 7.128, "grad_norm": 0.19448222219944, "learning_rate": 2.8722872287228726e-05, "loss": 0.1221, "mean_token_accuracy": 0.9884807586669921, "step": 71280 }, { "epoch": 7.13, "grad_norm": 0.23752643167972565, "learning_rate": 2.8702870287028706e-05, "loss": 0.1782, "mean_token_accuracy": 0.9910929799079895, "step": 71300 }, { "epoch": 7.132, "grad_norm": 0.26896607875823975, "learning_rate": 2.8682868286828685e-05, "loss": 0.2815, "mean_token_accuracy": 0.987887054681778, "step": 71320 }, { "epoch": 7.134, "grad_norm": 0.2225130945444107, "learning_rate": 2.8662866286628665e-05, "loss": 0.2942, "mean_token_accuracy": 0.9868027061223984, "step": 71340 }, { "epoch": 7.136, "grad_norm": 1.190095067024231, "learning_rate": 2.8642864286428644e-05, "loss": 0.2788, "mean_token_accuracy": 0.9800835698843002, "step": 71360 }, { "epoch": 7.138, "grad_norm": 0.2195560783147812, "learning_rate": 2.8622862286228624e-05, "loss": 0.1123, "mean_token_accuracy": 0.9894248664379119, "step": 71380 }, { "epoch": 7.14, "grad_norm": 0.7138532996177673, "learning_rate": 2.8602860286028604e-05, "loss": 0.2858, "mean_token_accuracy": 0.9827207684516907, "step": 71400 }, { "epoch": 7.142, "grad_norm": 0.31197190284729004, "learning_rate": 2.8582858285828583e-05, "loss": 0.1252, "mean_token_accuracy": 0.9877605587244034, "step": 71420 }, { "epoch": 7.144, "grad_norm": 0.26634711027145386, "learning_rate": 2.8562856285628563e-05, "loss": 0.1702, "mean_token_accuracy": 0.9880350977182388, "step": 71440 }, { "epoch": 7.146, "grad_norm": 0.8953298330307007, "learning_rate": 2.8542854285428543e-05, "loss": 0.1246, "mean_token_accuracy": 0.9905150711536408, "step": 71460 }, { "epoch": 7.148, "grad_norm": 0.1959136426448822, "learning_rate": 2.8522852285228522e-05, "loss": 0.1301, "mean_token_accuracy": 0.9879056513309479, "step": 71480 }, { "epoch": 7.15, "grad_norm": 0.27649328112602234, "learning_rate": 2.8502850285028505e-05, "loss": 0.2233, "mean_token_accuracy": 0.990865382552147, "step": 71500 }, { "epoch": 7.152, "grad_norm": 0.22841539978981018, "learning_rate": 2.8482848284828485e-05, "loss": 0.1442, "mean_token_accuracy": 0.9884066790342331, "step": 71520 }, { "epoch": 7.154, "grad_norm": 0.1822848916053772, "learning_rate": 2.8462846284628465e-05, "loss": 0.109, "mean_token_accuracy": 0.9904673010110855, "step": 71540 }, { "epoch": 7.156, "grad_norm": 0.313311368227005, "learning_rate": 2.8442844284428444e-05, "loss": 0.2371, "mean_token_accuracy": 0.9895942568778991, "step": 71560 }, { "epoch": 7.158, "grad_norm": 0.3252527713775635, "learning_rate": 2.8422842284228424e-05, "loss": 0.1889, "mean_token_accuracy": 0.9857772916555405, "step": 71580 }, { "epoch": 7.16, "grad_norm": 0.26228949427604675, "learning_rate": 2.8402840284028404e-05, "loss": 0.1935, "mean_token_accuracy": 0.9890711069107055, "step": 71600 }, { "epoch": 7.162, "grad_norm": 0.2743028998374939, "learning_rate": 2.8382838283828383e-05, "loss": 0.1505, "mean_token_accuracy": 0.9885980129241944, "step": 71620 }, { "epoch": 7.164, "grad_norm": 14.165108680725098, "learning_rate": 2.8362836283628363e-05, "loss": 0.1593, "mean_token_accuracy": 0.9895754784345627, "step": 71640 }, { "epoch": 7.166, "grad_norm": 0.22161270678043365, "learning_rate": 2.8342834283428343e-05, "loss": 0.1678, "mean_token_accuracy": 0.9852345257997512, "step": 71660 }, { "epoch": 7.168, "grad_norm": 0.25139808654785156, "learning_rate": 2.8322832283228322e-05, "loss": 0.1161, "mean_token_accuracy": 0.9902107566595078, "step": 71680 }, { "epoch": 7.17, "grad_norm": 0.20850321650505066, "learning_rate": 2.8302830283028302e-05, "loss": 0.1549, "mean_token_accuracy": 0.9909046113491058, "step": 71700 }, { "epoch": 7.172, "grad_norm": 1.1230722665786743, "learning_rate": 2.8282828282828282e-05, "loss": 0.2718, "mean_token_accuracy": 0.9889735460281373, "step": 71720 }, { "epoch": 7.174, "grad_norm": 0.2466428130865097, "learning_rate": 2.826282628262826e-05, "loss": 0.2003, "mean_token_accuracy": 0.9900189638137817, "step": 71740 }, { "epoch": 7.176, "grad_norm": 0.30201342701911926, "learning_rate": 2.8242824282428244e-05, "loss": 0.1104, "mean_token_accuracy": 0.9901454597711563, "step": 71760 }, { "epoch": 7.178, "grad_norm": 0.33753103017807007, "learning_rate": 2.8222822282228224e-05, "loss": 0.173, "mean_token_accuracy": 0.9883151888847351, "step": 71780 }, { "epoch": 7.18, "grad_norm": 0.985698401927948, "learning_rate": 2.8202820282028204e-05, "loss": 0.2033, "mean_token_accuracy": 0.9883380144834518, "step": 71800 }, { "epoch": 7.182, "grad_norm": 0.47725629806518555, "learning_rate": 2.8182818281828183e-05, "loss": 0.1439, "mean_token_accuracy": 0.9896139144897461, "step": 71820 }, { "epoch": 7.184, "grad_norm": 0.23128266632556915, "learning_rate": 2.8162816281628163e-05, "loss": 0.2413, "mean_token_accuracy": 0.9872489750385285, "step": 71840 }, { "epoch": 7.186, "grad_norm": 0.22333171963691711, "learning_rate": 2.8142814281428143e-05, "loss": 0.2449, "mean_token_accuracy": 0.9889802634716034, "step": 71860 }, { "epoch": 7.188, "grad_norm": 0.2209613174200058, "learning_rate": 2.8122812281228122e-05, "loss": 0.1587, "mean_token_accuracy": 0.989593043923378, "step": 71880 }, { "epoch": 7.19, "grad_norm": 0.2593700885772705, "learning_rate": 2.8102810281028102e-05, "loss": 0.1271, "mean_token_accuracy": 0.9884443312883378, "step": 71900 }, { "epoch": 7.192, "grad_norm": 0.21651506423950195, "learning_rate": 2.808280828082808e-05, "loss": 0.2218, "mean_token_accuracy": 0.9876300990581512, "step": 71920 }, { "epoch": 7.194, "grad_norm": 0.21634480357170105, "learning_rate": 2.806280628062806e-05, "loss": 0.1541, "mean_token_accuracy": 0.9845004171133042, "step": 71940 }, { "epoch": 7.196, "grad_norm": 0.22485274076461792, "learning_rate": 2.804280428042804e-05, "loss": 0.1298, "mean_token_accuracy": 0.989570289850235, "step": 71960 }, { "epoch": 7.198, "grad_norm": 0.2920265793800354, "learning_rate": 2.802280228022802e-05, "loss": 0.1322, "mean_token_accuracy": 0.9906474739313126, "step": 71980 }, { "epoch": 7.2, "grad_norm": 0.20543067157268524, "learning_rate": 2.8002800280028004e-05, "loss": 0.154, "mean_token_accuracy": 0.9903871089220047, "step": 72000 }, { "epoch": 7.202, "grad_norm": 0.1395280659198761, "learning_rate": 2.7982798279827983e-05, "loss": 0.1651, "mean_token_accuracy": 0.9892794728279114, "step": 72020 }, { "epoch": 7.204, "grad_norm": 0.20586223900318146, "learning_rate": 2.7962796279627963e-05, "loss": 0.1639, "mean_token_accuracy": 0.9900944292545318, "step": 72040 }, { "epoch": 7.206, "grad_norm": 0.25756996870040894, "learning_rate": 2.7942794279427943e-05, "loss": 0.0985, "mean_token_accuracy": 0.9883422583341599, "step": 72060 }, { "epoch": 7.208, "grad_norm": 0.22013095021247864, "learning_rate": 2.7922792279227922e-05, "loss": 0.1346, "mean_token_accuracy": 0.9892033994197845, "step": 72080 }, { "epoch": 7.21, "grad_norm": 0.25519442558288574, "learning_rate": 2.7902790279027902e-05, "loss": 0.1512, "mean_token_accuracy": 0.9872778534889222, "step": 72100 }, { "epoch": 7.212, "grad_norm": 0.20908278226852417, "learning_rate": 2.788278827882788e-05, "loss": 0.1511, "mean_token_accuracy": 0.9904956459999085, "step": 72120 }, { "epoch": 7.214, "grad_norm": 0.4655841290950775, "learning_rate": 2.786278627862786e-05, "loss": 0.2815, "mean_token_accuracy": 0.988070261478424, "step": 72140 }, { "epoch": 7.216, "grad_norm": 0.6415140628814697, "learning_rate": 2.784278427842784e-05, "loss": 0.1398, "mean_token_accuracy": 0.9898968815803528, "step": 72160 }, { "epoch": 7.218, "grad_norm": 0.17074550688266754, "learning_rate": 2.782278227822782e-05, "loss": 0.1242, "mean_token_accuracy": 0.9904469788074494, "step": 72180 }, { "epoch": 7.22, "grad_norm": 4.443115234375, "learning_rate": 2.7802780278027807e-05, "loss": 0.1771, "mean_token_accuracy": 0.9864508867263794, "step": 72200 }, { "epoch": 7.222, "grad_norm": 0.2227572649717331, "learning_rate": 2.7782778277827787e-05, "loss": 0.1407, "mean_token_accuracy": 0.9889877587556839, "step": 72220 }, { "epoch": 7.224, "grad_norm": 0.4777730703353882, "learning_rate": 2.7762776277627766e-05, "loss": 0.215, "mean_token_accuracy": 0.9834804832935333, "step": 72240 }, { "epoch": 7.226, "grad_norm": 0.2438010573387146, "learning_rate": 2.7742774277427746e-05, "loss": 0.1521, "mean_token_accuracy": 0.9884385526180267, "step": 72260 }, { "epoch": 7.228, "grad_norm": 0.23908589780330658, "learning_rate": 2.7722772277227726e-05, "loss": 0.1397, "mean_token_accuracy": 0.9908278793096542, "step": 72280 }, { "epoch": 7.23, "grad_norm": 6.445828914642334, "learning_rate": 2.7702770277027705e-05, "loss": 0.1959, "mean_token_accuracy": 0.9882095336914063, "step": 72300 }, { "epoch": 7.232, "grad_norm": 0.2668273448944092, "learning_rate": 2.7682768276827685e-05, "loss": 0.1337, "mean_token_accuracy": 0.9894245535135269, "step": 72320 }, { "epoch": 7.234, "grad_norm": 0.22844065725803375, "learning_rate": 2.7662766276627665e-05, "loss": 0.1302, "mean_token_accuracy": 0.9892031490802765, "step": 72340 }, { "epoch": 7.236, "grad_norm": 0.18159323930740356, "learning_rate": 2.7642764276427648e-05, "loss": 0.1532, "mean_token_accuracy": 0.988066217303276, "step": 72360 }, { "epoch": 7.2379999999999995, "grad_norm": 0.4657279849052429, "learning_rate": 2.7622762276227627e-05, "loss": 0.2535, "mean_token_accuracy": 0.9856504499912262, "step": 72380 }, { "epoch": 7.24, "grad_norm": 0.2305304855108261, "learning_rate": 2.7602760276027607e-05, "loss": 0.1701, "mean_token_accuracy": 0.989395159482956, "step": 72400 }, { "epoch": 7.242, "grad_norm": 0.214175283908844, "learning_rate": 2.7582758275827587e-05, "loss": 0.1551, "mean_token_accuracy": 0.9840266823768615, "step": 72420 }, { "epoch": 7.244, "grad_norm": 0.3568032383918762, "learning_rate": 2.7562756275627566e-05, "loss": 0.2114, "mean_token_accuracy": 0.9776964753866195, "step": 72440 }, { "epoch": 7.246, "grad_norm": 0.6460886597633362, "learning_rate": 2.7542754275427546e-05, "loss": 0.1991, "mean_token_accuracy": 0.9895922183990479, "step": 72460 }, { "epoch": 7.248, "grad_norm": 0.2591857314109802, "learning_rate": 2.7522752275227526e-05, "loss": 0.1642, "mean_token_accuracy": 0.9855939954519272, "step": 72480 }, { "epoch": 7.25, "grad_norm": 0.2485010027885437, "learning_rate": 2.7502750275027505e-05, "loss": 0.133, "mean_token_accuracy": 0.9875862568616867, "step": 72500 }, { "epoch": 7.252, "grad_norm": 2.226665735244751, "learning_rate": 2.7482748274827485e-05, "loss": 0.1253, "mean_token_accuracy": 0.9823168963193893, "step": 72520 }, { "epoch": 7.254, "grad_norm": 3.2521708011627197, "learning_rate": 2.7462746274627465e-05, "loss": 0.2311, "mean_token_accuracy": 0.9899189531803131, "step": 72540 }, { "epoch": 7.256, "grad_norm": 6.29935884475708, "learning_rate": 2.7442744274427444e-05, "loss": 0.142, "mean_token_accuracy": 0.9871420204639435, "step": 72560 }, { "epoch": 7.258, "grad_norm": 0.2163979411125183, "learning_rate": 2.7422742274227424e-05, "loss": 0.1409, "mean_token_accuracy": 0.9890915244817734, "step": 72580 }, { "epoch": 7.26, "grad_norm": 0.2902594804763794, "learning_rate": 2.7402740274027404e-05, "loss": 0.1147, "mean_token_accuracy": 0.9886566281318665, "step": 72600 }, { "epoch": 7.2620000000000005, "grad_norm": 0.6247573494911194, "learning_rate": 2.7382738273827387e-05, "loss": 0.1564, "mean_token_accuracy": 0.9874420255422592, "step": 72620 }, { "epoch": 7.264, "grad_norm": 0.3434240221977234, "learning_rate": 2.7362736273627366e-05, "loss": 0.1718, "mean_token_accuracy": 0.9892862975597382, "step": 72640 }, { "epoch": 7.266, "grad_norm": 0.2079593986272812, "learning_rate": 2.7342734273427346e-05, "loss": 0.187, "mean_token_accuracy": 0.9862428039312363, "step": 72660 }, { "epoch": 7.268, "grad_norm": 0.16971541941165924, "learning_rate": 2.7322732273227326e-05, "loss": 0.1167, "mean_token_accuracy": 0.9893717646598816, "step": 72680 }, { "epoch": 7.27, "grad_norm": 0.196394145488739, "learning_rate": 2.7302730273027305e-05, "loss": 0.1185, "mean_token_accuracy": 0.9906856745481492, "step": 72700 }, { "epoch": 7.272, "grad_norm": 0.2569144666194916, "learning_rate": 2.7282728272827285e-05, "loss": 0.1538, "mean_token_accuracy": 0.9886883705854416, "step": 72720 }, { "epoch": 7.274, "grad_norm": 0.24451161921024323, "learning_rate": 2.7262726272627265e-05, "loss": 0.2255, "mean_token_accuracy": 0.9895576506853103, "step": 72740 }, { "epoch": 7.276, "grad_norm": 0.25122857093811035, "learning_rate": 2.7242724272427244e-05, "loss": 0.1106, "mean_token_accuracy": 0.9909765005111695, "step": 72760 }, { "epoch": 7.2780000000000005, "grad_norm": 0.6019963622093201, "learning_rate": 2.7222722272227224e-05, "loss": 0.1101, "mean_token_accuracy": 0.990985119342804, "step": 72780 }, { "epoch": 7.28, "grad_norm": 0.20161688327789307, "learning_rate": 2.7202720272027204e-05, "loss": 0.1849, "mean_token_accuracy": 0.989641186594963, "step": 72800 }, { "epoch": 7.282, "grad_norm": 0.18933530151844025, "learning_rate": 2.7182718271827183e-05, "loss": 0.1889, "mean_token_accuracy": 0.9897954612970352, "step": 72820 }, { "epoch": 7.284, "grad_norm": 0.18460451066493988, "learning_rate": 2.7162716271627163e-05, "loss": 0.229, "mean_token_accuracy": 0.9840620428323745, "step": 72840 }, { "epoch": 7.286, "grad_norm": 0.23304088413715363, "learning_rate": 2.7142714271427146e-05, "loss": 0.2089, "mean_token_accuracy": 0.9876222789287568, "step": 72860 }, { "epoch": 7.288, "grad_norm": 0.26065924763679504, "learning_rate": 2.7122712271227126e-05, "loss": 0.1487, "mean_token_accuracy": 0.9914780139923096, "step": 72880 }, { "epoch": 7.29, "grad_norm": 0.4072655439376831, "learning_rate": 2.7102710271027105e-05, "loss": 0.1515, "mean_token_accuracy": 0.9909524530172348, "step": 72900 }, { "epoch": 7.292, "grad_norm": 0.22826722264289856, "learning_rate": 2.7082708270827085e-05, "loss": 0.2231, "mean_token_accuracy": 0.9887062698602677, "step": 72920 }, { "epoch": 7.294, "grad_norm": 0.660993754863739, "learning_rate": 2.7062706270627065e-05, "loss": 0.1743, "mean_token_accuracy": 0.9863274455070495, "step": 72940 }, { "epoch": 7.296, "grad_norm": 0.29583948850631714, "learning_rate": 2.7042704270427044e-05, "loss": 0.1259, "mean_token_accuracy": 0.9900993764400482, "step": 72960 }, { "epoch": 7.298, "grad_norm": 0.24169443547725677, "learning_rate": 2.7022702270227024e-05, "loss": 0.1466, "mean_token_accuracy": 0.9906796902418137, "step": 72980 }, { "epoch": 7.3, "grad_norm": 0.20686113834381104, "learning_rate": 2.7002700270027004e-05, "loss": 0.1504, "mean_token_accuracy": 0.9892458617687225, "step": 73000 }, { "epoch": 7.302, "grad_norm": 0.2699017822742462, "learning_rate": 2.6982698269826983e-05, "loss": 0.166, "mean_token_accuracy": 0.9870116174221039, "step": 73020 }, { "epoch": 7.304, "grad_norm": 1.232200026512146, "learning_rate": 2.6962696269626963e-05, "loss": 0.2299, "mean_token_accuracy": 0.9912352561950684, "step": 73040 }, { "epoch": 7.306, "grad_norm": 0.2847486138343811, "learning_rate": 2.6942694269426943e-05, "loss": 0.1919, "mean_token_accuracy": 0.9880786240100861, "step": 73060 }, { "epoch": 7.308, "grad_norm": 0.2301040142774582, "learning_rate": 2.6922692269226922e-05, "loss": 0.1541, "mean_token_accuracy": 0.9879134207963943, "step": 73080 }, { "epoch": 7.31, "grad_norm": 0.19887417554855347, "learning_rate": 2.6902690269026902e-05, "loss": 0.1696, "mean_token_accuracy": 0.9892732441425324, "step": 73100 }, { "epoch": 7.312, "grad_norm": 0.32632914185523987, "learning_rate": 2.6882688268826885e-05, "loss": 0.1741, "mean_token_accuracy": 0.9896240890026092, "step": 73120 }, { "epoch": 7.314, "grad_norm": 0.3358915150165558, "learning_rate": 2.6862686268626865e-05, "loss": 0.1417, "mean_token_accuracy": 0.9889935910701751, "step": 73140 }, { "epoch": 7.316, "grad_norm": 0.28708985447883606, "learning_rate": 2.6842684268426844e-05, "loss": 0.2156, "mean_token_accuracy": 0.9862085580825806, "step": 73160 }, { "epoch": 7.318, "grad_norm": 0.17546449601650238, "learning_rate": 2.6822682268226824e-05, "loss": 0.1313, "mean_token_accuracy": 0.9876793205738068, "step": 73180 }, { "epoch": 7.32, "grad_norm": 0.2861030399799347, "learning_rate": 2.6802680268026804e-05, "loss": 0.1865, "mean_token_accuracy": 0.9878284454345703, "step": 73200 }, { "epoch": 7.322, "grad_norm": 3.7280397415161133, "learning_rate": 2.6782678267826783e-05, "loss": 0.1437, "mean_token_accuracy": 0.9896204441785812, "step": 73220 }, { "epoch": 7.324, "grad_norm": 0.29134660959243774, "learning_rate": 2.6762676267626763e-05, "loss": 0.1368, "mean_token_accuracy": 0.98905348777771, "step": 73240 }, { "epoch": 7.326, "grad_norm": 0.26406389474868774, "learning_rate": 2.6742674267426743e-05, "loss": 0.1529, "mean_token_accuracy": 0.9869206577539444, "step": 73260 }, { "epoch": 7.328, "grad_norm": 0.908896267414093, "learning_rate": 2.6722672267226722e-05, "loss": 0.1927, "mean_token_accuracy": 0.9902439981698989, "step": 73280 }, { "epoch": 7.33, "grad_norm": 0.5426515340805054, "learning_rate": 2.6702670267026702e-05, "loss": 0.2021, "mean_token_accuracy": 0.9879167586565017, "step": 73300 }, { "epoch": 7.332, "grad_norm": 0.3907138407230377, "learning_rate": 2.668266826682668e-05, "loss": 0.1884, "mean_token_accuracy": 0.9876822859048844, "step": 73320 }, { "epoch": 7.334, "grad_norm": 0.724514365196228, "learning_rate": 2.666266626662666e-05, "loss": 0.1354, "mean_token_accuracy": 0.9884259968996048, "step": 73340 }, { "epoch": 7.336, "grad_norm": 0.21738263964653015, "learning_rate": 2.664266426642664e-05, "loss": 0.1298, "mean_token_accuracy": 0.9886239022016525, "step": 73360 }, { "epoch": 7.338, "grad_norm": 0.2688619792461395, "learning_rate": 2.6622662266226624e-05, "loss": 0.1973, "mean_token_accuracy": 0.9915250480175019, "step": 73380 }, { "epoch": 7.34, "grad_norm": 0.26005128026008606, "learning_rate": 2.6602660266026604e-05, "loss": 0.3184, "mean_token_accuracy": 0.9869996845722199, "step": 73400 }, { "epoch": 7.342, "grad_norm": 0.23421236872673035, "learning_rate": 2.6582658265826583e-05, "loss": 0.3147, "mean_token_accuracy": 0.9879982888698577, "step": 73420 }, { "epoch": 7.344, "grad_norm": 0.2757292687892914, "learning_rate": 2.6562656265626563e-05, "loss": 0.2215, "mean_token_accuracy": 0.9902210384607315, "step": 73440 }, { "epoch": 7.346, "grad_norm": 0.4532703757286072, "learning_rate": 2.6542654265426543e-05, "loss": 0.1292, "mean_token_accuracy": 0.9866806387901306, "step": 73460 }, { "epoch": 7.348, "grad_norm": 0.2528744339942932, "learning_rate": 2.6522652265226522e-05, "loss": 0.1643, "mean_token_accuracy": 0.9892407089471817, "step": 73480 }, { "epoch": 7.35, "grad_norm": 0.2714689373970032, "learning_rate": 2.6502650265026502e-05, "loss": 0.1474, "mean_token_accuracy": 0.9901845932006836, "step": 73500 }, { "epoch": 7.352, "grad_norm": 0.2149820774793625, "learning_rate": 2.648264826482648e-05, "loss": 0.2671, "mean_token_accuracy": 0.9875700443983078, "step": 73520 }, { "epoch": 7.354, "grad_norm": 0.2648971974849701, "learning_rate": 2.646264626462646e-05, "loss": 0.1827, "mean_token_accuracy": 0.9880949914455414, "step": 73540 }, { "epoch": 7.356, "grad_norm": 1.0744080543518066, "learning_rate": 2.644264426442644e-05, "loss": 0.1661, "mean_token_accuracy": 0.9888256460428237, "step": 73560 }, { "epoch": 7.358, "grad_norm": 0.29849180579185486, "learning_rate": 2.642264226422642e-05, "loss": 0.212, "mean_token_accuracy": 0.9838367164134979, "step": 73580 }, { "epoch": 7.36, "grad_norm": 0.22093772888183594, "learning_rate": 2.64026402640264e-05, "loss": 0.1106, "mean_token_accuracy": 0.9906867980957031, "step": 73600 }, { "epoch": 7.362, "grad_norm": 3.7124977111816406, "learning_rate": 2.638263826382638e-05, "loss": 0.1316, "mean_token_accuracy": 0.9855901420116424, "step": 73620 }, { "epoch": 7.364, "grad_norm": 0.24020422995090485, "learning_rate": 2.6362636263626363e-05, "loss": 0.1655, "mean_token_accuracy": 0.9894746661186218, "step": 73640 }, { "epoch": 7.366, "grad_norm": 0.28099325299263, "learning_rate": 2.6342634263426342e-05, "loss": 0.1758, "mean_token_accuracy": 0.9866327702999115, "step": 73660 }, { "epoch": 7.368, "grad_norm": 0.9258522391319275, "learning_rate": 2.6322632263226322e-05, "loss": 0.1978, "mean_token_accuracy": 0.9900214016437531, "step": 73680 }, { "epoch": 7.37, "grad_norm": 0.3170884847640991, "learning_rate": 2.6302630263026305e-05, "loss": 0.1566, "mean_token_accuracy": 0.9881828159093857, "step": 73700 }, { "epoch": 7.372, "grad_norm": 4.804331302642822, "learning_rate": 2.6282628262826288e-05, "loss": 0.3212, "mean_token_accuracy": 0.9845367044210434, "step": 73720 }, { "epoch": 7.374, "grad_norm": 0.2482161521911621, "learning_rate": 2.6262626262626268e-05, "loss": 0.1168, "mean_token_accuracy": 0.9904347509145737, "step": 73740 }, { "epoch": 7.376, "grad_norm": 0.21539030969142914, "learning_rate": 2.6242624262426248e-05, "loss": 0.1695, "mean_token_accuracy": 0.9873065054416656, "step": 73760 }, { "epoch": 7.378, "grad_norm": 0.32123318314552307, "learning_rate": 2.6222622262226227e-05, "loss": 0.1289, "mean_token_accuracy": 0.9894933551549911, "step": 73780 }, { "epoch": 7.38, "grad_norm": 0.9552251100540161, "learning_rate": 2.6202620262026207e-05, "loss": 0.2222, "mean_token_accuracy": 0.9873287111520768, "step": 73800 }, { "epoch": 7.382, "grad_norm": 0.7158880233764648, "learning_rate": 2.6182618261826187e-05, "loss": 0.2057, "mean_token_accuracy": 0.9866257041692734, "step": 73820 }, { "epoch": 7.384, "grad_norm": 1.7818965911865234, "learning_rate": 2.6162616261626166e-05, "loss": 0.1405, "mean_token_accuracy": 0.9884511679410934, "step": 73840 }, { "epoch": 7.386, "grad_norm": 0.2516486346721649, "learning_rate": 2.6142614261426146e-05, "loss": 0.1585, "mean_token_accuracy": 0.9892282783985138, "step": 73860 }, { "epoch": 7.388, "grad_norm": 0.2683066427707672, "learning_rate": 2.6122612261226125e-05, "loss": 0.1009, "mean_token_accuracy": 0.9874500393867492, "step": 73880 }, { "epoch": 7.39, "grad_norm": 0.1895657181739807, "learning_rate": 2.6102610261026105e-05, "loss": 0.1752, "mean_token_accuracy": 0.9875174343585968, "step": 73900 }, { "epoch": 7.392, "grad_norm": 0.26551303267478943, "learning_rate": 2.6082608260826085e-05, "loss": 0.161, "mean_token_accuracy": 0.9881728798151016, "step": 73920 }, { "epoch": 7.394, "grad_norm": 0.21998608112335205, "learning_rate": 2.6062606260626064e-05, "loss": 0.1142, "mean_token_accuracy": 0.9890188872814178, "step": 73940 }, { "epoch": 7.396, "grad_norm": 0.2339870035648346, "learning_rate": 2.6042604260426044e-05, "loss": 0.1177, "mean_token_accuracy": 0.9904817104339599, "step": 73960 }, { "epoch": 7.398, "grad_norm": 0.2664642930030823, "learning_rate": 2.6022602260226027e-05, "loss": 0.2131, "mean_token_accuracy": 0.9895089745521546, "step": 73980 }, { "epoch": 7.4, "grad_norm": 0.2208671271800995, "learning_rate": 2.6002600260026007e-05, "loss": 0.1596, "mean_token_accuracy": 0.9894811183214187, "step": 74000 } ], "logging_steps": 20, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.112144303919411e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }