diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4879 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2867, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017439832577607255, + "grad_norm": 5.828580856323242, + "learning_rate": 3.4843205574912896e-07, + "loss": 2.5765, + "mean_token_accuracy": 0.48322336077690126, + "step": 5 + }, + { + "epoch": 0.003487966515521451, + "grad_norm": 5.456836700439453, + "learning_rate": 6.968641114982579e-07, + "loss": 2.6299, + "mean_token_accuracy": 0.4750183284282684, + "step": 10 + }, + { + "epoch": 0.005231949773282177, + "grad_norm": 5.288783073425293, + "learning_rate": 1.045296167247387e-06, + "loss": 2.6085, + "mean_token_accuracy": 0.4762524425983429, + "step": 15 + }, + { + "epoch": 0.006975933031042902, + "grad_norm": 4.602453231811523, + "learning_rate": 1.3937282229965158e-06, + "loss": 2.5809, + "mean_token_accuracy": 0.4816410064697266, + "step": 20 + }, + { + "epoch": 0.008719916288803628, + "grad_norm": 4.758207321166992, + "learning_rate": 1.742160278745645e-06, + "loss": 2.6076, + "mean_token_accuracy": 0.4783724367618561, + "step": 25 + }, + { + "epoch": 0.010463899546564353, + "grad_norm": 5.43353271484375, + "learning_rate": 2.090592334494774e-06, + "loss": 2.5803, + "mean_token_accuracy": 0.47928274869918824, + "step": 30 + }, + { + "epoch": 0.012207882804325079, + "grad_norm": 5.942702293395996, + "learning_rate": 2.4390243902439027e-06, + "loss": 2.5573, + "mean_token_accuracy": 0.48284457325935365, + "step": 35 + }, + { + "epoch": 0.013951866062085804, + "grad_norm": 5.366197109222412, + "learning_rate": 2.7874564459930316e-06, + "loss": 2.5996, + "mean_token_accuracy": 0.4792888581752777, + "step": 40 + }, + { + "epoch": 0.01569584931984653, + "grad_norm": 4.370666980743408, + "learning_rate": 3.13588850174216e-06, + "loss": 2.5367, + "mean_token_accuracy": 0.4887096703052521, + "step": 45 + }, + { + "epoch": 0.017439832577607256, + "grad_norm": 3.6244709491729736, + "learning_rate": 3.48432055749129e-06, + "loss": 2.4902, + "mean_token_accuracy": 0.49310851097106934, + "step": 50 + }, + { + "epoch": 0.01918381583536798, + "grad_norm": 4.285321235656738, + "learning_rate": 3.832752613240418e-06, + "loss": 2.5491, + "mean_token_accuracy": 0.4849462330341339, + "step": 55 + }, + { + "epoch": 0.020927799093128707, + "grad_norm": 3.2961580753326416, + "learning_rate": 4.181184668989548e-06, + "loss": 2.5381, + "mean_token_accuracy": 0.48914956450462344, + "step": 60 + }, + { + "epoch": 0.02267178235088943, + "grad_norm": 2.5111758708953857, + "learning_rate": 4.529616724738676e-06, + "loss": 2.4718, + "mean_token_accuracy": 0.4966581165790558, + "step": 65 + }, + { + "epoch": 0.024415765608650157, + "grad_norm": 2.264514923095703, + "learning_rate": 4.8780487804878055e-06, + "loss": 2.4617, + "mean_token_accuracy": 0.4947519600391388, + "step": 70 + }, + { + "epoch": 0.026159748866410884, + "grad_norm": 2.5210282802581787, + "learning_rate": 5.226480836236935e-06, + "loss": 2.4443, + "mean_token_accuracy": 0.4989003002643585, + "step": 75 + }, + { + "epoch": 0.027903732124171608, + "grad_norm": 2.014357328414917, + "learning_rate": 5.574912891986063e-06, + "loss": 2.4272, + "mean_token_accuracy": 0.5005743026733398, + "step": 80 + }, + { + "epoch": 0.029647715381932335, + "grad_norm": 1.9813461303710938, + "learning_rate": 5.923344947735193e-06, + "loss": 2.3583, + "mean_token_accuracy": 0.5127138316631317, + "step": 85 + }, + { + "epoch": 0.03139169863969306, + "grad_norm": 2.2138099670410156, + "learning_rate": 6.27177700348432e-06, + "loss": 2.4124, + "mean_token_accuracy": 0.5018206298351288, + "step": 90 + }, + { + "epoch": 0.033135681897453785, + "grad_norm": 1.8536137342453003, + "learning_rate": 6.62020905923345e-06, + "loss": 2.3254, + "mean_token_accuracy": 0.5126893877983093, + "step": 95 + }, + { + "epoch": 0.03487966515521451, + "grad_norm": 1.7873475551605225, + "learning_rate": 6.96864111498258e-06, + "loss": 2.352, + "mean_token_accuracy": 0.5120723485946655, + "step": 100 + }, + { + "epoch": 0.03487966515521451, + "eval_loss": 2.2051470279693604, + "eval_mean_token_accuracy": 0.5369774165792741, + "eval_runtime": 2.2987, + "eval_samples_per_second": 238.829, + "eval_steps_per_second": 60.033, + "step": 100 + }, + { + "epoch": 0.03662364841297523, + "grad_norm": 1.9375139474868774, + "learning_rate": 7.317073170731707e-06, + "loss": 2.35, + "mean_token_accuracy": 0.5110764861106872, + "step": 105 + }, + { + "epoch": 0.03836763167073596, + "grad_norm": 1.6356894969940186, + "learning_rate": 7.665505226480837e-06, + "loss": 2.3122, + "mean_token_accuracy": 0.5163428664207459, + "step": 110 + }, + { + "epoch": 0.040111614928496686, + "grad_norm": 1.5576404333114624, + "learning_rate": 8.013937282229966e-06, + "loss": 2.2809, + "mean_token_accuracy": 0.5201857209205627, + "step": 115 + }, + { + "epoch": 0.04185559818625741, + "grad_norm": 1.5572550296783447, + "learning_rate": 8.362369337979095e-06, + "loss": 2.2641, + "mean_token_accuracy": 0.5205217599868774, + "step": 120 + }, + { + "epoch": 0.04359958144401814, + "grad_norm": 1.4597965478897095, + "learning_rate": 8.710801393728223e-06, + "loss": 2.2707, + "mean_token_accuracy": 0.5182001471519471, + "step": 125 + }, + { + "epoch": 0.04534356470177886, + "grad_norm": 1.3881841897964478, + "learning_rate": 9.059233449477352e-06, + "loss": 2.2115, + "mean_token_accuracy": 0.5258614301681519, + "step": 130 + }, + { + "epoch": 0.04708754795953959, + "grad_norm": 1.3807543516159058, + "learning_rate": 9.407665505226482e-06, + "loss": 2.2063, + "mean_token_accuracy": 0.5298081636428833, + "step": 135 + }, + { + "epoch": 0.048831531217300314, + "grad_norm": 1.3745309114456177, + "learning_rate": 9.756097560975611e-06, + "loss": 2.2417, + "mean_token_accuracy": 0.5213098764419556, + "step": 140 + }, + { + "epoch": 0.05057551447506104, + "grad_norm": 1.4421513080596924, + "learning_rate": 1.0104529616724739e-05, + "loss": 2.2577, + "mean_token_accuracy": 0.5173509359359741, + "step": 145 + }, + { + "epoch": 0.05231949773282177, + "grad_norm": 1.430609107017517, + "learning_rate": 1.045296167247387e-05, + "loss": 2.2354, + "mean_token_accuracy": 0.5197275161743165, + "step": 150 + }, + { + "epoch": 0.05406348099058249, + "grad_norm": 1.392878770828247, + "learning_rate": 1.0801393728222997e-05, + "loss": 2.1975, + "mean_token_accuracy": 0.5275171160697937, + "step": 155 + }, + { + "epoch": 0.055807464248343215, + "grad_norm": 1.5011759996414185, + "learning_rate": 1.1149825783972127e-05, + "loss": 2.2084, + "mean_token_accuracy": 0.5244867920875549, + "step": 160 + }, + { + "epoch": 0.05755144750610394, + "grad_norm": 1.346450924873352, + "learning_rate": 1.1498257839721256e-05, + "loss": 2.1895, + "mean_token_accuracy": 0.5289711594581604, + "step": 165 + }, + { + "epoch": 0.05929543076386467, + "grad_norm": 1.2974307537078857, + "learning_rate": 1.1846689895470385e-05, + "loss": 2.0789, + "mean_token_accuracy": 0.5457416892051696, + "step": 170 + }, + { + "epoch": 0.06103941402162539, + "grad_norm": 1.372033953666687, + "learning_rate": 1.2195121951219513e-05, + "loss": 2.188, + "mean_token_accuracy": 0.5286840319633483, + "step": 175 + }, + { + "epoch": 0.06278339727938612, + "grad_norm": 1.3679383993148804, + "learning_rate": 1.254355400696864e-05, + "loss": 2.1573, + "mean_token_accuracy": 0.5302480340003968, + "step": 180 + }, + { + "epoch": 0.06452738053714685, + "grad_norm": 1.52040433883667, + "learning_rate": 1.2891986062717772e-05, + "loss": 2.1298, + "mean_token_accuracy": 0.5355449676513672, + "step": 185 + }, + { + "epoch": 0.06627136379490757, + "grad_norm": 1.8917245864868164, + "learning_rate": 1.32404181184669e-05, + "loss": 2.1081, + "mean_token_accuracy": 0.5390640258789062, + "step": 190 + }, + { + "epoch": 0.06801534705266829, + "grad_norm": 1.5094658136367798, + "learning_rate": 1.3588850174216028e-05, + "loss": 2.087, + "mean_token_accuracy": 0.5443181872367859, + "step": 195 + }, + { + "epoch": 0.06975933031042902, + "grad_norm": 1.4626585245132446, + "learning_rate": 1.393728222996516e-05, + "loss": 2.1288, + "mean_token_accuracy": 0.5318609476089478, + "step": 200 + }, + { + "epoch": 0.06975933031042902, + "eval_loss": 2.009420871734619, + "eval_mean_token_accuracy": 0.5560248355934585, + "eval_runtime": 2.2797, + "eval_samples_per_second": 240.818, + "eval_steps_per_second": 60.533, + "step": 200 + }, + { + "epoch": 0.07150331356818974, + "grad_norm": 1.3244273662567139, + "learning_rate": 1.4285714285714287e-05, + "loss": 2.0375, + "mean_token_accuracy": 0.5483809828758239, + "step": 205 + }, + { + "epoch": 0.07324729682595046, + "grad_norm": 1.4732096195220947, + "learning_rate": 1.4634146341463415e-05, + "loss": 2.1141, + "mean_token_accuracy": 0.5372800588607788, + "step": 210 + }, + { + "epoch": 0.0749912800837112, + "grad_norm": 1.8651827573776245, + "learning_rate": 1.4982578397212544e-05, + "loss": 2.1044, + "mean_token_accuracy": 0.5341581106185913, + "step": 215 + }, + { + "epoch": 0.07673526334147192, + "grad_norm": 1.592007040977478, + "learning_rate": 1.5331010452961673e-05, + "loss": 2.1381, + "mean_token_accuracy": 0.5317448735237121, + "step": 220 + }, + { + "epoch": 0.07847924659923265, + "grad_norm": 1.4337468147277832, + "learning_rate": 1.5679442508710803e-05, + "loss": 2.074, + "mean_token_accuracy": 0.5427969336509705, + "step": 225 + }, + { + "epoch": 0.08022322985699337, + "grad_norm": 1.5800775289535522, + "learning_rate": 1.6027874564459932e-05, + "loss": 2.1234, + "mean_token_accuracy": 0.5346529722213745, + "step": 230 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 1.5034254789352417, + "learning_rate": 1.637630662020906e-05, + "loss": 2.0815, + "mean_token_accuracy": 0.539674985408783, + "step": 235 + }, + { + "epoch": 0.08371119637251483, + "grad_norm": 1.5010743141174316, + "learning_rate": 1.672473867595819e-05, + "loss": 2.0307, + "mean_token_accuracy": 0.5488391995429993, + "step": 240 + }, + { + "epoch": 0.08545517963027555, + "grad_norm": 1.3525656461715698, + "learning_rate": 1.7073170731707317e-05, + "loss": 2.0587, + "mean_token_accuracy": 0.5427052736282348, + "step": 245 + }, + { + "epoch": 0.08719916288803628, + "grad_norm": 1.4599875211715698, + "learning_rate": 1.7421602787456446e-05, + "loss": 2.0764, + "mean_token_accuracy": 0.5395100235939025, + "step": 250 + }, + { + "epoch": 0.088943146145797, + "grad_norm": 1.4865527153015137, + "learning_rate": 1.7770034843205575e-05, + "loss": 2.055, + "mean_token_accuracy": 0.5414956092834473, + "step": 255 + }, + { + "epoch": 0.09068712940355772, + "grad_norm": 1.7332290410995483, + "learning_rate": 1.8118466898954705e-05, + "loss": 2.0527, + "mean_token_accuracy": 0.5426930665969849, + "step": 260 + }, + { + "epoch": 0.09243111266131845, + "grad_norm": 1.6574803590774536, + "learning_rate": 1.8466898954703834e-05, + "loss": 2.0471, + "mean_token_accuracy": 0.5455828428268432, + "step": 265 + }, + { + "epoch": 0.09417509591907917, + "grad_norm": 1.5530742406845093, + "learning_rate": 1.8815331010452963e-05, + "loss": 2.0439, + "mean_token_accuracy": 0.5468413949012756, + "step": 270 + }, + { + "epoch": 0.09591907917683991, + "grad_norm": 1.3351445198059082, + "learning_rate": 1.9163763066202093e-05, + "loss": 2.0563, + "mean_token_accuracy": 0.5398888111114502, + "step": 275 + }, + { + "epoch": 0.09766306243460063, + "grad_norm": 1.4879651069641113, + "learning_rate": 1.9512195121951222e-05, + "loss": 2.0609, + "mean_token_accuracy": 0.5395711183547973, + "step": 280 + }, + { + "epoch": 0.09940704569236135, + "grad_norm": 1.465016484260559, + "learning_rate": 1.9860627177700348e-05, + "loss": 2.0231, + "mean_token_accuracy": 0.551362407207489, + "step": 285 + }, + { + "epoch": 0.10115102895012208, + "grad_norm": 1.564724326133728, + "learning_rate": 1.9999933277491715e-05, + "loss": 2.0238, + "mean_token_accuracy": 0.5468474984169006, + "step": 290 + }, + { + "epoch": 0.1028950122078828, + "grad_norm": 1.5854735374450684, + "learning_rate": 1.999952553205438e-05, + "loss": 2.0431, + "mean_token_accuracy": 0.542937445640564, + "step": 295 + }, + { + "epoch": 0.10463899546564354, + "grad_norm": 1.468493938446045, + "learning_rate": 1.999874712433585e-05, + "loss": 1.9805, + "mean_token_accuracy": 0.5586143612861634, + "step": 300 + }, + { + "epoch": 0.10463899546564354, + "eval_loss": 1.9213224649429321, + "eval_mean_token_accuracy": 0.5677391005598981, + "eval_runtime": 2.2832, + "eval_samples_per_second": 240.449, + "eval_steps_per_second": 60.441, + "step": 300 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 1.8331785202026367, + "learning_rate": 1.999759808319013e-05, + "loss": 2.0748, + "mean_token_accuracy": 0.5357649087905884, + "step": 305 + }, + { + "epoch": 0.10812696198116498, + "grad_norm": 1.4497989416122437, + "learning_rate": 1.9996078451209863e-05, + "loss": 1.9881, + "mean_token_accuracy": 0.5525659799575806, + "step": 310 + }, + { + "epoch": 0.10987094523892571, + "grad_norm": 1.3693747520446777, + "learning_rate": 1.999418828472475e-05, + "loss": 2.0294, + "mean_token_accuracy": 0.5477150559425354, + "step": 315 + }, + { + "epoch": 0.11161492849668643, + "grad_norm": 1.4497454166412354, + "learning_rate": 1.9991927653799458e-05, + "loss": 1.9793, + "mean_token_accuracy": 0.5588343024253846, + "step": 320 + }, + { + "epoch": 0.11335891175444716, + "grad_norm": 1.523069977760315, + "learning_rate": 1.998929664223102e-05, + "loss": 1.9743, + "mean_token_accuracy": 0.5582722306251526, + "step": 325 + }, + { + "epoch": 0.11510289501220788, + "grad_norm": 1.5147883892059326, + "learning_rate": 1.9986295347545738e-05, + "loss": 2.0142, + "mean_token_accuracy": 0.5505131959915162, + "step": 330 + }, + { + "epoch": 0.1168468782699686, + "grad_norm": 1.422179937362671, + "learning_rate": 1.998292388099557e-05, + "loss": 2.017, + "mean_token_accuracy": 0.5482038021087646, + "step": 335 + }, + { + "epoch": 0.11859086152772934, + "grad_norm": 1.5548981428146362, + "learning_rate": 1.9979182367553994e-05, + "loss": 1.983, + "mean_token_accuracy": 0.5520527839660645, + "step": 340 + }, + { + "epoch": 0.12033484478549006, + "grad_norm": 1.4368444681167603, + "learning_rate": 1.997507094591137e-05, + "loss": 1.9885, + "mean_token_accuracy": 0.5542216539382935, + "step": 345 + }, + { + "epoch": 0.12207882804325078, + "grad_norm": 1.4380384683609009, + "learning_rate": 1.9970589768469833e-05, + "loss": 2.0199, + "mean_token_accuracy": 0.544018828868866, + "step": 350 + }, + { + "epoch": 0.12382281130101151, + "grad_norm": 1.49617600440979, + "learning_rate": 1.996573900133761e-05, + "loss": 2.0006, + "mean_token_accuracy": 0.5538795113563537, + "step": 355 + }, + { + "epoch": 0.12556679455877223, + "grad_norm": 1.6664273738861084, + "learning_rate": 1.996051882432286e-05, + "loss": 1.9556, + "mean_token_accuracy": 0.5589076399803161, + "step": 360 + }, + { + "epoch": 0.12731077781653297, + "grad_norm": 1.6932154893875122, + "learning_rate": 1.995492943092705e-05, + "loss": 2.0146, + "mean_token_accuracy": 0.5482771396636963, + "step": 365 + }, + { + "epoch": 0.1290547610742937, + "grad_norm": 1.619291067123413, + "learning_rate": 1.9948971028337737e-05, + "loss": 2.0194, + "mean_token_accuracy": 0.5486803531646729, + "step": 370 + }, + { + "epoch": 0.1307987443320544, + "grad_norm": 1.5092412233352661, + "learning_rate": 1.9942643837420904e-05, + "loss": 2.0039, + "mean_token_accuracy": 0.5521933078765869, + "step": 375 + }, + { + "epoch": 0.13254272758981514, + "grad_norm": 1.4223320484161377, + "learning_rate": 1.9935948092712792e-05, + "loss": 1.9544, + "mean_token_accuracy": 0.5596468687057495, + "step": 380 + }, + { + "epoch": 0.13428671084757587, + "grad_norm": 1.8329344987869263, + "learning_rate": 1.992888404241117e-05, + "loss": 1.9691, + "mean_token_accuracy": 0.5566349029541016, + "step": 385 + }, + { + "epoch": 0.13603069410533658, + "grad_norm": 1.8566089868545532, + "learning_rate": 1.992145194836616e-05, + "loss": 1.9745, + "mean_token_accuracy": 0.5550647616386414, + "step": 390 + }, + { + "epoch": 0.13777467736309731, + "grad_norm": 1.2803272008895874, + "learning_rate": 1.9913652086070535e-05, + "loss": 1.9198, + "mean_token_accuracy": 0.5675464272499084, + "step": 395 + }, + { + "epoch": 0.13951866062085805, + "grad_norm": 1.56267249584198, + "learning_rate": 1.9905484744649484e-05, + "loss": 1.9312, + "mean_token_accuracy": 0.5651576161384583, + "step": 400 + }, + { + "epoch": 0.13951866062085805, + "eval_loss": 1.868219256401062, + "eval_mean_token_accuracy": 0.5745108853215757, + "eval_runtime": 2.2785, + "eval_samples_per_second": 240.945, + "eval_steps_per_second": 60.565, + "step": 400 + }, + { + "epoch": 0.14126264387861875, + "grad_norm": 1.577077031135559, + "learning_rate": 1.989695022684991e-05, + "loss": 1.9765, + "mean_token_accuracy": 0.5537695407867431, + "step": 405 + }, + { + "epoch": 0.1430066271363795, + "grad_norm": 1.4785982370376587, + "learning_rate": 1.988804884902921e-05, + "loss": 1.9474, + "mean_token_accuracy": 0.5616202354431152, + "step": 410 + }, + { + "epoch": 0.14475061039414022, + "grad_norm": 1.6787692308425903, + "learning_rate": 1.9878780941143538e-05, + "loss": 1.9546, + "mean_token_accuracy": 0.5621517539024353, + "step": 415 + }, + { + "epoch": 0.14649459365190093, + "grad_norm": 1.4049898386001587, + "learning_rate": 1.9869146846735576e-05, + "loss": 1.9216, + "mean_token_accuracy": 0.5653103590011597, + "step": 420 + }, + { + "epoch": 0.14823857690966166, + "grad_norm": 1.3464614152908325, + "learning_rate": 1.985914692292182e-05, + "loss": 1.954, + "mean_token_accuracy": 0.5586510181427002, + "step": 425 + }, + { + "epoch": 0.1499825601674224, + "grad_norm": 1.5550650358200073, + "learning_rate": 1.9848781540379312e-05, + "loss": 1.9868, + "mean_token_accuracy": 0.5543682813644409, + "step": 430 + }, + { + "epoch": 0.15172654342518313, + "grad_norm": 1.471214771270752, + "learning_rate": 1.983805108333191e-05, + "loss": 1.9541, + "mean_token_accuracy": 0.5577651381492614, + "step": 435 + }, + { + "epoch": 0.15347052668294384, + "grad_norm": 1.9721437692642212, + "learning_rate": 1.9826955949536062e-05, + "loss": 1.9664, + "mean_token_accuracy": 0.5577590346336365, + "step": 440 + }, + { + "epoch": 0.15521450994070457, + "grad_norm": 1.2202377319335938, + "learning_rate": 1.9815496550266036e-05, + "loss": 1.9541, + "mean_token_accuracy": 0.5553213596343994, + "step": 445 + }, + { + "epoch": 0.1569584931984653, + "grad_norm": 1.3433985710144043, + "learning_rate": 1.98036733102987e-05, + "loss": 1.9351, + "mean_token_accuracy": 0.5629887580871582, + "step": 450 + }, + { + "epoch": 0.158702476456226, + "grad_norm": 1.5679229497909546, + "learning_rate": 1.979148666789775e-05, + "loss": 1.9587, + "mean_token_accuracy": 0.5566532254219055, + "step": 455 + }, + { + "epoch": 0.16044645971398674, + "grad_norm": 1.6487351655960083, + "learning_rate": 1.9778937074797494e-05, + "loss": 1.9454, + "mean_token_accuracy": 0.5555962681770324, + "step": 460 + }, + { + "epoch": 0.16219044297174748, + "grad_norm": 1.4100677967071533, + "learning_rate": 1.976602499618608e-05, + "loss": 1.9281, + "mean_token_accuracy": 0.562218964099884, + "step": 465 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 1.6682672500610352, + "learning_rate": 1.9752750910688278e-05, + "loss": 2.0092, + "mean_token_accuracy": 0.5436400294303894, + "step": 470 + }, + { + "epoch": 0.16567840948726892, + "grad_norm": 1.5930486917495728, + "learning_rate": 1.9739115310347698e-05, + "loss": 1.9421, + "mean_token_accuracy": 0.5602822542190552, + "step": 475 + }, + { + "epoch": 0.16742239274502965, + "grad_norm": 1.325100302696228, + "learning_rate": 1.972511870060861e-05, + "loss": 1.9263, + "mean_token_accuracy": 0.5628787755966187, + "step": 480 + }, + { + "epoch": 0.1691663760027904, + "grad_norm": 1.2343814373016357, + "learning_rate": 1.9710761600297147e-05, + "loss": 1.8846, + "mean_token_accuracy": 0.5730755209922791, + "step": 485 + }, + { + "epoch": 0.1709103592605511, + "grad_norm": 1.2456448078155518, + "learning_rate": 1.9696044541602126e-05, + "loss": 1.9517, + "mean_token_accuracy": 0.5562744498252868, + "step": 490 + }, + { + "epoch": 0.17265434251831183, + "grad_norm": 1.6084667444229126, + "learning_rate": 1.968096807005528e-05, + "loss": 1.9199, + "mean_token_accuracy": 0.5609176397323609, + "step": 495 + }, + { + "epoch": 0.17439832577607256, + "grad_norm": 1.7223913669586182, + "learning_rate": 1.966553274451106e-05, + "loss": 1.9237, + "mean_token_accuracy": 0.5654142379760743, + "step": 500 + }, + { + "epoch": 0.17439832577607256, + "eval_loss": 1.833382248878479, + "eval_mean_token_accuracy": 0.579058465750321, + "eval_runtime": 2.2781, + "eval_samples_per_second": 240.986, + "eval_steps_per_second": 60.576, + "step": 500 + }, + { + "epoch": 0.17614230903383327, + "grad_norm": 1.495141625404358, + "learning_rate": 1.964973913712591e-05, + "loss": 1.9314, + "mean_token_accuracy": 0.564112913608551, + "step": 505 + }, + { + "epoch": 0.177886292291594, + "grad_norm": 1.5330617427825928, + "learning_rate": 1.9633587833337064e-05, + "loss": 1.9157, + "mean_token_accuracy": 0.5603983283042908, + "step": 510 + }, + { + "epoch": 0.17963027554935473, + "grad_norm": 1.6696357727050781, + "learning_rate": 1.961707943184083e-05, + "loss": 1.9172, + "mean_token_accuracy": 0.5651698470115661, + "step": 515 + }, + { + "epoch": 0.18137425880711544, + "grad_norm": 1.4351089000701904, + "learning_rate": 1.9600214544570432e-05, + "loss": 1.9497, + "mean_token_accuracy": 0.5564149498939515, + "step": 520 + }, + { + "epoch": 0.18311824206487617, + "grad_norm": 1.4948656558990479, + "learning_rate": 1.958299379667328e-05, + "loss": 1.9328, + "mean_token_accuracy": 0.5620662212371826, + "step": 525 + }, + { + "epoch": 0.1848622253226369, + "grad_norm": 1.8340569734573364, + "learning_rate": 1.9565417826487835e-05, + "loss": 1.9301, + "mean_token_accuracy": 0.5591153383255005, + "step": 530 + }, + { + "epoch": 0.18660620858039761, + "grad_norm": 1.3762108087539673, + "learning_rate": 1.9547487285519922e-05, + "loss": 1.9164, + "mean_token_accuracy": 0.5675342202186584, + "step": 535 + }, + { + "epoch": 0.18835019183815835, + "grad_norm": 1.4461901187896729, + "learning_rate": 1.952920283841861e-05, + "loss": 1.9354, + "mean_token_accuracy": 0.5584616422653198, + "step": 540 + }, + { + "epoch": 0.19009417509591908, + "grad_norm": 1.385931134223938, + "learning_rate": 1.9510565162951538e-05, + "loss": 1.9054, + "mean_token_accuracy": 0.5638990640640259, + "step": 545 + }, + { + "epoch": 0.19183815835367982, + "grad_norm": 1.3334605693817139, + "learning_rate": 1.9491574949979814e-05, + "loss": 1.9325, + "mean_token_accuracy": 0.560050117969513, + "step": 550 + }, + { + "epoch": 0.19358214161144052, + "grad_norm": 1.2869641780853271, + "learning_rate": 1.9472232903432406e-05, + "loss": 1.8869, + "mean_token_accuracy": 0.5688171982765198, + "step": 555 + }, + { + "epoch": 0.19532612486920126, + "grad_norm": 1.4418607950210571, + "learning_rate": 1.945253974028004e-05, + "loss": 1.9032, + "mean_token_accuracy": 0.5671615362167358, + "step": 560 + }, + { + "epoch": 0.197070108126962, + "grad_norm": 1.4661266803741455, + "learning_rate": 1.9432496190508633e-05, + "loss": 1.9086, + "mean_token_accuracy": 0.5675708889961243, + "step": 565 + }, + { + "epoch": 0.1988140913847227, + "grad_norm": 1.2697911262512207, + "learning_rate": 1.941210299709222e-05, + "loss": 1.9178, + "mean_token_accuracy": 0.5595613360404968, + "step": 570 + }, + { + "epoch": 0.20055807464248343, + "grad_norm": 1.3371995687484741, + "learning_rate": 1.9391360915965426e-05, + "loss": 1.918, + "mean_token_accuracy": 0.559781277179718, + "step": 575 + }, + { + "epoch": 0.20230205790024416, + "grad_norm": 1.170939564704895, + "learning_rate": 1.9370270715995447e-05, + "loss": 1.8858, + "mean_token_accuracy": 0.5686339259147644, + "step": 580 + }, + { + "epoch": 0.20404604115800487, + "grad_norm": 1.2868223190307617, + "learning_rate": 1.934883317895354e-05, + "loss": 1.9338, + "mean_token_accuracy": 0.5597551107406616, + "step": 585 + }, + { + "epoch": 0.2057900244157656, + "grad_norm": 1.2733184099197388, + "learning_rate": 1.932704909948604e-05, + "loss": 1.889, + "mean_token_accuracy": 0.5675036668777466, + "step": 590 + }, + { + "epoch": 0.20753400767352634, + "grad_norm": 1.2116005420684814, + "learning_rate": 1.930491928508492e-05, + "loss": 1.9222, + "mean_token_accuracy": 0.5653592348098755, + "step": 595 + }, + { + "epoch": 0.20927799093128707, + "grad_norm": 1.437216877937317, + "learning_rate": 1.9282444556057855e-05, + "loss": 1.9026, + "mean_token_accuracy": 0.5654997587203979, + "step": 600 + }, + { + "epoch": 0.20927799093128707, + "eval_loss": 1.8072869777679443, + "eval_mean_token_accuracy": 0.5830163489217344, + "eval_runtime": 2.28, + "eval_samples_per_second": 240.79, + "eval_steps_per_second": 60.527, + "step": 600 + }, + { + "epoch": 0.21102197418904778, + "grad_norm": 1.313175916671753, + "learning_rate": 1.9259625745497803e-05, + "loss": 1.9393, + "mean_token_accuracy": 0.5573802471160889, + "step": 605 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 1.419681191444397, + "learning_rate": 1.9236463699252136e-05, + "loss": 1.8977, + "mean_token_accuracy": 0.569849717617035, + "step": 610 + }, + { + "epoch": 0.21450994070456925, + "grad_norm": 1.399595856666565, + "learning_rate": 1.921295927589127e-05, + "loss": 1.8635, + "mean_token_accuracy": 0.5736925721168518, + "step": 615 + }, + { + "epoch": 0.21625392396232995, + "grad_norm": 1.3106194734573364, + "learning_rate": 1.9189113346676878e-05, + "loss": 1.8424, + "mean_token_accuracy": 0.5776759505271911, + "step": 620 + }, + { + "epoch": 0.2179979072200907, + "grad_norm": 1.4282829761505127, + "learning_rate": 1.916492679552954e-05, + "loss": 1.899, + "mean_token_accuracy": 0.566831636428833, + "step": 625 + }, + { + "epoch": 0.21974189047785142, + "grad_norm": 1.2370389699935913, + "learning_rate": 1.914040051899602e-05, + "loss": 1.8815, + "mean_token_accuracy": 0.57172532081604, + "step": 630 + }, + { + "epoch": 0.22148587373561213, + "grad_norm": 1.536375641822815, + "learning_rate": 1.9115535426216018e-05, + "loss": 1.8925, + "mean_token_accuracy": 0.5654631137847901, + "step": 635 + }, + { + "epoch": 0.22322985699337286, + "grad_norm": 1.4581001996994019, + "learning_rate": 1.9090332438888458e-05, + "loss": 1.8854, + "mean_token_accuracy": 0.5668621778488159, + "step": 640 + }, + { + "epoch": 0.2249738402511336, + "grad_norm": 1.3430296182632446, + "learning_rate": 1.906479249123735e-05, + "loss": 1.8894, + "mean_token_accuracy": 0.564418375492096, + "step": 645 + }, + { + "epoch": 0.22671782350889433, + "grad_norm": 1.3927807807922363, + "learning_rate": 1.9038916529977136e-05, + "loss": 1.8592, + "mean_token_accuracy": 0.5744745969772339, + "step": 650 + }, + { + "epoch": 0.22846180676665503, + "grad_norm": 1.538617491722107, + "learning_rate": 1.901270551427761e-05, + "loss": 1.8606, + "mean_token_accuracy": 0.5753482222557068, + "step": 655 + }, + { + "epoch": 0.23020579002441577, + "grad_norm": 1.2715994119644165, + "learning_rate": 1.898616041572836e-05, + "loss": 1.9148, + "mean_token_accuracy": 0.5637707591056824, + "step": 660 + }, + { + "epoch": 0.2319497732821765, + "grad_norm": 1.2087376117706299, + "learning_rate": 1.8959282218302746e-05, + "loss": 1.9056, + "mean_token_accuracy": 0.5649193525314331, + "step": 665 + }, + { + "epoch": 0.2336937565399372, + "grad_norm": 1.2960108518600464, + "learning_rate": 1.893207191832144e-05, + "loss": 1.863, + "mean_token_accuracy": 0.5746028780937195, + "step": 670 + }, + { + "epoch": 0.23543773979769794, + "grad_norm": 1.268133282661438, + "learning_rate": 1.8904530524415483e-05, + "loss": 1.8656, + "mean_token_accuracy": 0.5707600235939025, + "step": 675 + }, + { + "epoch": 0.23718172305545868, + "grad_norm": 1.2900458574295044, + "learning_rate": 1.8876659057488905e-05, + "loss": 1.8498, + "mean_token_accuracy": 0.5751710653305053, + "step": 680 + }, + { + "epoch": 0.23892570631321938, + "grad_norm": 1.218464970588684, + "learning_rate": 1.8848458550680875e-05, + "loss": 1.872, + "mean_token_accuracy": 0.5708394289016724, + "step": 685 + }, + { + "epoch": 0.24066968957098012, + "grad_norm": 1.2474693059921265, + "learning_rate": 1.8819930049327412e-05, + "loss": 1.8894, + "mean_token_accuracy": 0.5656585931777954, + "step": 690 + }, + { + "epoch": 0.24241367282874085, + "grad_norm": 1.2624021768569946, + "learning_rate": 1.8791074610922624e-05, + "loss": 1.8647, + "mean_token_accuracy": 0.5723240494728088, + "step": 695 + }, + { + "epoch": 0.24415765608650156, + "grad_norm": 1.1939747333526611, + "learning_rate": 1.8761893305079528e-05, + "loss": 1.8881, + "mean_token_accuracy": 0.5674242496490478, + "step": 700 + }, + { + "epoch": 0.24415765608650156, + "eval_loss": 1.7823907136917114, + "eval_mean_token_accuracy": 0.5864341168299966, + "eval_runtime": 2.2844, + "eval_samples_per_second": 240.327, + "eval_steps_per_second": 60.41, + "step": 700 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.16288161277771, + "learning_rate": 1.873238721349038e-05, + "loss": 1.9086, + "mean_token_accuracy": 0.5639662861824035, + "step": 705 + }, + { + "epoch": 0.24764562260202302, + "grad_norm": 1.2528672218322754, + "learning_rate": 1.8702557429886607e-05, + "loss": 1.8714, + "mean_token_accuracy": 0.5726111888885498, + "step": 710 + }, + { + "epoch": 0.24938960585978376, + "grad_norm": 1.3696616888046265, + "learning_rate": 1.8672405059998228e-05, + "loss": 1.897, + "mean_token_accuracy": 0.563336992263794, + "step": 715 + }, + { + "epoch": 0.25113358911754446, + "grad_norm": 1.215741515159607, + "learning_rate": 1.8641931221512895e-05, + "loss": 1.8601, + "mean_token_accuracy": 0.5752016067504883, + "step": 720 + }, + { + "epoch": 0.25287757237530517, + "grad_norm": 1.0280753374099731, + "learning_rate": 1.8611137044034454e-05, + "loss": 1.8595, + "mean_token_accuracy": 0.5699352383613586, + "step": 725 + }, + { + "epoch": 0.25462155563306593, + "grad_norm": 1.222464680671692, + "learning_rate": 1.858002366904107e-05, + "loss": 1.8792, + "mean_token_accuracy": 0.5669415950775146, + "step": 730 + }, + { + "epoch": 0.25636553889082664, + "grad_norm": 1.1792371273040771, + "learning_rate": 1.854859224984292e-05, + "loss": 1.8359, + "mean_token_accuracy": 0.5753787875175476, + "step": 735 + }, + { + "epoch": 0.2581095221485874, + "grad_norm": 1.4262481927871704, + "learning_rate": 1.851684395153944e-05, + "loss": 1.8534, + "mean_token_accuracy": 0.5753849029541016, + "step": 740 + }, + { + "epoch": 0.2598535054063481, + "grad_norm": 1.0678586959838867, + "learning_rate": 1.8484779950976133e-05, + "loss": 1.8112, + "mean_token_accuracy": 0.5816593408584595, + "step": 745 + }, + { + "epoch": 0.2615974886641088, + "grad_norm": 1.2717487812042236, + "learning_rate": 1.8452401436700954e-05, + "loss": 1.8438, + "mean_token_accuracy": 0.5754643201828002, + "step": 750 + }, + { + "epoch": 0.2633414719218696, + "grad_norm": 1.240116834640503, + "learning_rate": 1.8419709608920243e-05, + "loss": 1.8756, + "mean_token_accuracy": 0.5700452208518982, + "step": 755 + }, + { + "epoch": 0.2650854551796303, + "grad_norm": 1.175158143043518, + "learning_rate": 1.8386705679454243e-05, + "loss": 1.8375, + "mean_token_accuracy": 0.5759897470474243, + "step": 760 + }, + { + "epoch": 0.266829438437391, + "grad_norm": 1.1849920749664307, + "learning_rate": 1.8353390871692176e-05, + "loss": 1.8541, + "mean_token_accuracy": 0.5735887169837952, + "step": 765 + }, + { + "epoch": 0.26857342169515175, + "grad_norm": 1.1253867149353027, + "learning_rate": 1.8319766420546902e-05, + "loss": 1.8087, + "mean_token_accuracy": 0.5798264861106872, + "step": 770 + }, + { + "epoch": 0.27031740495291245, + "grad_norm": 1.216903567314148, + "learning_rate": 1.8285833572409135e-05, + "loss": 1.8711, + "mean_token_accuracy": 0.5681268334388733, + "step": 775 + }, + { + "epoch": 0.27206138821067316, + "grad_norm": 1.2326476573944092, + "learning_rate": 1.8251593585101243e-05, + "loss": 1.8454, + "mean_token_accuracy": 0.5726234197616578, + "step": 780 + }, + { + "epoch": 0.2738053714684339, + "grad_norm": 1.3498684167861938, + "learning_rate": 1.821704772783063e-05, + "loss": 1.8744, + "mean_token_accuracy": 0.5702773809432984, + "step": 785 + }, + { + "epoch": 0.27554935472619463, + "grad_norm": 1.2527179718017578, + "learning_rate": 1.818219728114267e-05, + "loss": 1.8247, + "mean_token_accuracy": 0.5808956503868103, + "step": 790 + }, + { + "epoch": 0.27729333798395533, + "grad_norm": 1.2847415208816528, + "learning_rate": 1.8147043536873275e-05, + "loss": 1.8515, + "mean_token_accuracy": 0.5718536376953125, + "step": 795 + }, + { + "epoch": 0.2790373212417161, + "grad_norm": 1.1496868133544922, + "learning_rate": 1.8111587798100974e-05, + "loss": 1.8424, + "mean_token_accuracy": 0.5737292289733886, + "step": 800 + }, + { + "epoch": 0.2790373212417161, + "eval_loss": 1.7646913528442383, + "eval_mean_token_accuracy": 0.5887769706871199, + "eval_runtime": 2.2882, + "eval_samples_per_second": 239.926, + "eval_steps_per_second": 60.309, + "step": 800 + }, + { + "epoch": 0.2807813044994768, + "grad_norm": 1.0835829973220825, + "learning_rate": 1.807583137909862e-05, + "loss": 1.858, + "mean_token_accuracy": 0.5735092878341674, + "step": 805 + }, + { + "epoch": 0.2825252877572375, + "grad_norm": 1.124707579612732, + "learning_rate": 1.8039775605284687e-05, + "loss": 1.8985, + "mean_token_accuracy": 0.5621334314346313, + "step": 810 + }, + { + "epoch": 0.28426927101499827, + "grad_norm": 1.149370789527893, + "learning_rate": 1.800342181317413e-05, + "loss": 1.8533, + "mean_token_accuracy": 0.5710105061531067, + "step": 815 + }, + { + "epoch": 0.286013254272759, + "grad_norm": 1.1378004550933838, + "learning_rate": 1.7966771350328825e-05, + "loss": 1.8015, + "mean_token_accuracy": 0.5808406591415405, + "step": 820 + }, + { + "epoch": 0.2877572375305197, + "grad_norm": 1.2488001585006714, + "learning_rate": 1.7929825575307665e-05, + "loss": 1.8474, + "mean_token_accuracy": 0.5742790818214416, + "step": 825 + }, + { + "epoch": 0.28950122078828044, + "grad_norm": 1.1303263902664185, + "learning_rate": 1.7892585857616144e-05, + "loss": 1.8558, + "mean_token_accuracy": 0.5733260035514831, + "step": 830 + }, + { + "epoch": 0.29124520404604115, + "grad_norm": 1.1966603994369507, + "learning_rate": 1.785505357765563e-05, + "loss": 1.8795, + "mean_token_accuracy": 0.5667155504226684, + "step": 835 + }, + { + "epoch": 0.29298918730380186, + "grad_norm": 1.3885133266448975, + "learning_rate": 1.781723012667218e-05, + "loss": 1.8459, + "mean_token_accuracy": 0.5731243848800659, + "step": 840 + }, + { + "epoch": 0.2947331705615626, + "grad_norm": 1.0937238931655884, + "learning_rate": 1.7779116906704986e-05, + "loss": 1.84, + "mean_token_accuracy": 0.5721102118492126, + "step": 845 + }, + { + "epoch": 0.2964771538193233, + "grad_norm": 1.1150310039520264, + "learning_rate": 1.7740715330534383e-05, + "loss": 1.8459, + "mean_token_accuracy": 0.572262954711914, + "step": 850 + }, + { + "epoch": 0.2982211370770841, + "grad_norm": 1.2256666421890259, + "learning_rate": 1.770202682162949e-05, + "loss": 1.875, + "mean_token_accuracy": 0.5647544026374817, + "step": 855 + }, + { + "epoch": 0.2999651203348448, + "grad_norm": 1.1450411081314087, + "learning_rate": 1.7663052814095447e-05, + "loss": 1.8503, + "mean_token_accuracy": 0.5749572515487671, + "step": 860 + }, + { + "epoch": 0.3017091035926055, + "grad_norm": 1.3011276721954346, + "learning_rate": 1.7623794752620255e-05, + "loss": 1.8717, + "mean_token_accuracy": 0.5700330018997193, + "step": 865 + }, + { + "epoch": 0.30345308685036626, + "grad_norm": 1.136769413948059, + "learning_rate": 1.7584254092421226e-05, + "loss": 1.8435, + "mean_token_accuracy": 0.5756720423698425, + "step": 870 + }, + { + "epoch": 0.30519707010812697, + "grad_norm": 1.0772783756256104, + "learning_rate": 1.754443229919103e-05, + "loss": 1.8665, + "mean_token_accuracy": 0.5729411005973816, + "step": 875 + }, + { + "epoch": 0.3069410533658877, + "grad_norm": 1.2176796197891235, + "learning_rate": 1.7504330849043373e-05, + "loss": 1.8326, + "mean_token_accuracy": 0.5759836196899414, + "step": 880 + }, + { + "epoch": 0.30868503662364843, + "grad_norm": 1.1939221620559692, + "learning_rate": 1.7463951228458288e-05, + "loss": 1.8605, + "mean_token_accuracy": 0.5705645203590393, + "step": 885 + }, + { + "epoch": 0.31042901988140914, + "grad_norm": 1.0732890367507935, + "learning_rate": 1.7423294934227017e-05, + "loss": 1.7695, + "mean_token_accuracy": 0.5898338317871094, + "step": 890 + }, + { + "epoch": 0.31217300313916985, + "grad_norm": 1.2261382341384888, + "learning_rate": 1.7382363473396543e-05, + "loss": 1.8425, + "mean_token_accuracy": 0.5737353324890136, + "step": 895 + }, + { + "epoch": 0.3139169863969306, + "grad_norm": 1.0703833103179932, + "learning_rate": 1.734115836321372e-05, + "loss": 1.8293, + "mean_token_accuracy": 0.5781097173690796, + "step": 900 + }, + { + "epoch": 0.3139169863969306, + "eval_loss": 1.7465689182281494, + "eval_mean_token_accuracy": 0.5915820192599642, + "eval_runtime": 2.2848, + "eval_samples_per_second": 240.284, + "eval_steps_per_second": 60.399, + "step": 900 + }, + { + "epoch": 0.3156609696546913, + "grad_norm": 1.038590908050537, + "learning_rate": 1.7299681131069026e-05, + "loss": 1.8501, + "mean_token_accuracy": 0.5711571335792541, + "step": 905 + }, + { + "epoch": 0.317404952912452, + "grad_norm": 1.0435972213745117, + "learning_rate": 1.725793331443996e-05, + "loss": 1.7811, + "mean_token_accuracy": 0.5848179459571838, + "step": 910 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 1.2644709348678589, + "learning_rate": 1.7215916460834048e-05, + "loss": 1.842, + "mean_token_accuracy": 0.5768450736999512, + "step": 915 + }, + { + "epoch": 0.3208929194279735, + "grad_norm": 1.1662228107452393, + "learning_rate": 1.7173632127731462e-05, + "loss": 1.8336, + "mean_token_accuracy": 0.5745356917381287, + "step": 920 + }, + { + "epoch": 0.3226369026857342, + "grad_norm": 1.094821572303772, + "learning_rate": 1.7131081882527305e-05, + "loss": 1.8453, + "mean_token_accuracy": 0.5741950988769531, + "step": 925 + }, + { + "epoch": 0.32438088594349496, + "grad_norm": 0.9720579981803894, + "learning_rate": 1.708826730247351e-05, + "loss": 1.8315, + "mean_token_accuracy": 0.5795026898384095, + "step": 930 + }, + { + "epoch": 0.32612486920125566, + "grad_norm": 1.1986461877822876, + "learning_rate": 1.704518997462037e-05, + "loss": 1.8025, + "mean_token_accuracy": 0.5812072396278382, + "step": 935 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 1.1447429656982422, + "learning_rate": 1.7001851495757708e-05, + "loss": 1.8188, + "mean_token_accuracy": 0.5802847027778626, + "step": 940 + }, + { + "epoch": 0.32961283571677713, + "grad_norm": 1.0322043895721436, + "learning_rate": 1.6958253472355687e-05, + "loss": 1.8232, + "mean_token_accuracy": 0.5813599824905396, + "step": 945 + }, + { + "epoch": 0.33135681897453784, + "grad_norm": 1.2150319814682007, + "learning_rate": 1.6914397520505267e-05, + "loss": 1.8198, + "mean_token_accuracy": 0.579056692123413, + "step": 950 + }, + { + "epoch": 0.33310080223229854, + "grad_norm": 1.033612847328186, + "learning_rate": 1.6870285265858298e-05, + "loss": 1.8582, + "mean_token_accuracy": 0.5732221484184266, + "step": 955 + }, + { + "epoch": 0.3348447854900593, + "grad_norm": 1.1266491413116455, + "learning_rate": 1.6825918343567257e-05, + "loss": 1.8096, + "mean_token_accuracy": 0.5805229663848877, + "step": 960 + }, + { + "epoch": 0.33658876874782, + "grad_norm": 1.1227892637252808, + "learning_rate": 1.678129839822463e-05, + "loss": 1.8081, + "mean_token_accuracy": 0.5752443790435791, + "step": 965 + }, + { + "epoch": 0.3383327520055808, + "grad_norm": 1.119611382484436, + "learning_rate": 1.673642708380198e-05, + "loss": 1.8719, + "mean_token_accuracy": 0.5680229783058166, + "step": 970 + }, + { + "epoch": 0.3400767352633415, + "grad_norm": 1.135180950164795, + "learning_rate": 1.6691306063588583e-05, + "loss": 1.8034, + "mean_token_accuracy": 0.5833944439888, + "step": 975 + }, + { + "epoch": 0.3418207185211022, + "grad_norm": 1.084303855895996, + "learning_rate": 1.6645937010129837e-05, + "loss": 1.8057, + "mean_token_accuracy": 0.5811583638191223, + "step": 980 + }, + { + "epoch": 0.34356470177886295, + "grad_norm": 1.0643222332000732, + "learning_rate": 1.660032160516522e-05, + "loss": 1.8034, + "mean_token_accuracy": 0.5824169039726257, + "step": 985 + }, + { + "epoch": 0.34530868503662365, + "grad_norm": 1.0902906656265259, + "learning_rate": 1.6554461539565953e-05, + "loss": 1.8502, + "mean_token_accuracy": 0.5699413418769836, + "step": 990 + }, + { + "epoch": 0.34705266829438436, + "grad_norm": 1.0563591718673706, + "learning_rate": 1.650835851327236e-05, + "loss": 1.8095, + "mean_token_accuracy": 0.5772666096687317, + "step": 995 + }, + { + "epoch": 0.3487966515521451, + "grad_norm": 1.1587867736816406, + "learning_rate": 1.6462014235230805e-05, + "loss": 1.8237, + "mean_token_accuracy": 0.5748350501060486, + "step": 1000 + }, + { + "epoch": 0.3487966515521451, + "eval_loss": 1.7325563430786133, + "eval_mean_token_accuracy": 0.5949785376804463, + "eval_runtime": 2.2839, + "eval_samples_per_second": 240.377, + "eval_steps_per_second": 60.423, + "step": 1000 + }, + { + "epoch": 0.3505406348099058, + "grad_norm": 0.9310490489006042, + "learning_rate": 1.641543042333038e-05, + "loss": 1.8168, + "mean_token_accuracy": 0.5771566510200501, + "step": 1005 + }, + { + "epoch": 0.35228461806766653, + "grad_norm": 1.0678431987762451, + "learning_rate": 1.636860880433922e-05, + "loss": 1.7982, + "mean_token_accuracy": 0.5835043907165527, + "step": 1010 + }, + { + "epoch": 0.3540286013254273, + "grad_norm": 0.9736899137496948, + "learning_rate": 1.632155111384047e-05, + "loss": 1.8001, + "mean_token_accuracy": 0.5807431936264038, + "step": 1015 + }, + { + "epoch": 0.355772584583188, + "grad_norm": 1.1601985692977905, + "learning_rate": 1.6274259096168e-05, + "loss": 1.7876, + "mean_token_accuracy": 0.5840786933898926, + "step": 1020 + }, + { + "epoch": 0.3575165678409487, + "grad_norm": 1.0871918201446533, + "learning_rate": 1.622673450434169e-05, + "loss": 1.8109, + "mean_token_accuracy": 0.5792338609695434, + "step": 1025 + }, + { + "epoch": 0.35926055109870947, + "grad_norm": 1.105723261833191, + "learning_rate": 1.6178979100002486e-05, + "loss": 1.7914, + "mean_token_accuracy": 0.5835716128349304, + "step": 1030 + }, + { + "epoch": 0.3610045343564702, + "grad_norm": 1.2562061548233032, + "learning_rate": 1.6130994653347096e-05, + "loss": 1.7972, + "mean_token_accuracy": 0.5833822131156922, + "step": 1035 + }, + { + "epoch": 0.3627485176142309, + "grad_norm": 1.0530085563659668, + "learning_rate": 1.6082782943062355e-05, + "loss": 1.7812, + "mean_token_accuracy": 0.5828384637832642, + "step": 1040 + }, + { + "epoch": 0.36449250087199164, + "grad_norm": 1.0546525716781616, + "learning_rate": 1.6034345756259303e-05, + "loss": 1.8259, + "mean_token_accuracy": 0.5737475514411926, + "step": 1045 + }, + { + "epoch": 0.36623648412975235, + "grad_norm": 1.234269618988037, + "learning_rate": 1.598568488840695e-05, + "loss": 1.8117, + "mean_token_accuracy": 0.5792827486991883, + "step": 1050 + }, + { + "epoch": 0.36798046738751305, + "grad_norm": 0.9675230979919434, + "learning_rate": 1.5936802143265708e-05, + "loss": 1.8184, + "mean_token_accuracy": 0.5783785462379456, + "step": 1055 + }, + { + "epoch": 0.3697244506452738, + "grad_norm": 1.098398208618164, + "learning_rate": 1.5887699332820527e-05, + "loss": 1.8039, + "mean_token_accuracy": 0.5803519010543823, + "step": 1060 + }, + { + "epoch": 0.3714684339030345, + "grad_norm": 1.1153620481491089, + "learning_rate": 1.5838378277213745e-05, + "loss": 1.8082, + "mean_token_accuracy": 0.580186951160431, + "step": 1065 + }, + { + "epoch": 0.37321241716079523, + "grad_norm": 1.088965654373169, + "learning_rate": 1.57888408046776e-05, + "loss": 1.8525, + "mean_token_accuracy": 0.5731427073478699, + "step": 1070 + }, + { + "epoch": 0.374956400418556, + "grad_norm": 1.0894255638122559, + "learning_rate": 1.573908875146648e-05, + "loss": 1.8001, + "mean_token_accuracy": 0.579618763923645, + "step": 1075 + }, + { + "epoch": 0.3767003836763167, + "grad_norm": 1.117111086845398, + "learning_rate": 1.5689123961788834e-05, + "loss": 1.7819, + "mean_token_accuracy": 0.5856304883956909, + "step": 1080 + }, + { + "epoch": 0.37844436693407746, + "grad_norm": 1.0354894399642944, + "learning_rate": 1.563894828773883e-05, + "loss": 1.8382, + "mean_token_accuracy": 0.5725928664207458, + "step": 1085 + }, + { + "epoch": 0.38018835019183816, + "grad_norm": 1.027478575706482, + "learning_rate": 1.55885635892277e-05, + "loss": 1.8209, + "mean_token_accuracy": 0.5758980989456177, + "step": 1090 + }, + { + "epoch": 0.38193233344959887, + "grad_norm": 0.991007924079895, + "learning_rate": 1.5537971733914784e-05, + "loss": 1.805, + "mean_token_accuracy": 0.5769305944442749, + "step": 1095 + }, + { + "epoch": 0.38367631670735963, + "grad_norm": 0.9640344381332397, + "learning_rate": 1.5487174597138314e-05, + "loss": 1.7925, + "mean_token_accuracy": 0.582343602180481, + "step": 1100 + }, + { + "epoch": 0.38367631670735963, + "eval_loss": 1.719460129737854, + "eval_mean_token_accuracy": 0.5949838507002678, + "eval_runtime": 2.2841, + "eval_samples_per_second": 240.354, + "eval_steps_per_second": 60.417, + "step": 1100 + }, + { + "epoch": 0.38542029996512034, + "grad_norm": 0.9246540069580078, + "learning_rate": 1.543617406184589e-05, + "loss": 1.7818, + "mean_token_accuracy": 0.5908357858657837, + "step": 1105 + }, + { + "epoch": 0.38716428322288104, + "grad_norm": 0.9606871008872986, + "learning_rate": 1.5384972018524678e-05, + "loss": 1.7716, + "mean_token_accuracy": 0.5862597703933716, + "step": 1110 + }, + { + "epoch": 0.3889082664806418, + "grad_norm": 1.079079508781433, + "learning_rate": 1.5333570365131353e-05, + "loss": 1.8139, + "mean_token_accuracy": 0.5795637726783752, + "step": 1115 + }, + { + "epoch": 0.3906522497384025, + "grad_norm": 1.0687153339385986, + "learning_rate": 1.5281971007021728e-05, + "loss": 1.8078, + "mean_token_accuracy": 0.5825513243675232, + "step": 1120 + }, + { + "epoch": 0.3923962329961632, + "grad_norm": 1.1391572952270508, + "learning_rate": 1.5230175856880132e-05, + "loss": 1.7842, + "mean_token_accuracy": 0.5824596881866455, + "step": 1125 + }, + { + "epoch": 0.394140216253924, + "grad_norm": 0.956125020980835, + "learning_rate": 1.5178186834648509e-05, + "loss": 1.8093, + "mean_token_accuracy": 0.5799975514411926, + "step": 1130 + }, + { + "epoch": 0.3958841995116847, + "grad_norm": 0.9353156685829163, + "learning_rate": 1.5126005867455256e-05, + "loss": 1.8135, + "mean_token_accuracy": 0.5779386639595032, + "step": 1135 + }, + { + "epoch": 0.3976281827694454, + "grad_norm": 1.022855281829834, + "learning_rate": 1.5073634889543778e-05, + "loss": 1.7999, + "mean_token_accuracy": 0.5833822011947631, + "step": 1140 + }, + { + "epoch": 0.39937216602720615, + "grad_norm": 0.9858539700508118, + "learning_rate": 1.5021075842200796e-05, + "loss": 1.811, + "mean_token_accuracy": 0.5799120187759399, + "step": 1145 + }, + { + "epoch": 0.40111614928496686, + "grad_norm": 0.9535442590713501, + "learning_rate": 1.4968330673684387e-05, + "loss": 1.8106, + "mean_token_accuracy": 0.579343843460083, + "step": 1150 + }, + { + "epoch": 0.40286013254272757, + "grad_norm": 0.9476344585418701, + "learning_rate": 1.4915401339151769e-05, + "loss": 1.8066, + "mean_token_accuracy": 0.5847305536270142, + "step": 1155 + }, + { + "epoch": 0.40460411580048833, + "grad_norm": 0.9653626680374146, + "learning_rate": 1.486228980058682e-05, + "loss": 1.8132, + "mean_token_accuracy": 0.5774804353713989, + "step": 1160 + }, + { + "epoch": 0.40634809905824903, + "grad_norm": 0.941003680229187, + "learning_rate": 1.4808998026727348e-05, + "loss": 1.794, + "mean_token_accuracy": 0.5829545378684997, + "step": 1165 + }, + { + "epoch": 0.40809208231600974, + "grad_norm": 0.9951907992362976, + "learning_rate": 1.4755527992992133e-05, + "loss": 1.8001, + "mean_token_accuracy": 0.584249758720398, + "step": 1170 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 1.161740779876709, + "learning_rate": 1.4701881681407684e-05, + "loss": 1.8273, + "mean_token_accuracy": 0.5732282638549805, + "step": 1175 + }, + { + "epoch": 0.4115800488315312, + "grad_norm": 0.98726487159729, + "learning_rate": 1.464806108053477e-05, + "loss": 1.7741, + "mean_token_accuracy": 0.5860153913497925, + "step": 1180 + }, + { + "epoch": 0.4133240320892919, + "grad_norm": 1.0420575141906738, + "learning_rate": 1.4594068185394723e-05, + "loss": 1.7744, + "mean_token_accuracy": 0.5826429724693298, + "step": 1185 + }, + { + "epoch": 0.4150680153470527, + "grad_norm": 1.0130770206451416, + "learning_rate": 1.4539904997395468e-05, + "loss": 1.8072, + "mean_token_accuracy": 0.578256344795227, + "step": 1190 + }, + { + "epoch": 0.4168119986048134, + "grad_norm": 1.1711149215698242, + "learning_rate": 1.448557352425735e-05, + "loss": 1.7783, + "mean_token_accuracy": 0.5852089285850525, + "step": 1195 + }, + { + "epoch": 0.41855598186257414, + "grad_norm": 1.0184165239334106, + "learning_rate": 1.44310757799387e-05, + "loss": 1.7711, + "mean_token_accuracy": 0.5863941788673401, + "step": 1200 + }, + { + "epoch": 0.41855598186257414, + "eval_loss": 1.7093919515609741, + "eval_mean_token_accuracy": 0.5984264102534972, + "eval_runtime": 2.2831, + "eval_samples_per_second": 240.462, + "eval_steps_per_second": 60.444, + "step": 1200 + }, + { + "epoch": 0.42029996512033485, + "grad_norm": 0.926988422870636, + "learning_rate": 1.437641378456119e-05, + "loss": 1.8055, + "mean_token_accuracy": 0.5789711475372314, + "step": 1205 + }, + { + "epoch": 0.42204394837809556, + "grad_norm": 1.0789161920547485, + "learning_rate": 1.4321589564334946e-05, + "loss": 1.7836, + "mean_token_accuracy": 0.5825513243675232, + "step": 1210 + }, + { + "epoch": 0.4237879316358563, + "grad_norm": 1.0459811687469482, + "learning_rate": 1.4266605151483444e-05, + "loss": 1.7751, + "mean_token_accuracy": 0.5854777693748474, + "step": 1215 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.9387654066085815, + "learning_rate": 1.4211462584168178e-05, + "loss": 1.7893, + "mean_token_accuracy": 0.5830950617790223, + "step": 1220 + }, + { + "epoch": 0.42727589815137773, + "grad_norm": 0.9399378299713135, + "learning_rate": 1.4156163906413113e-05, + "loss": 1.8074, + "mean_token_accuracy": 0.577999746799469, + "step": 1225 + }, + { + "epoch": 0.4290198814091385, + "grad_norm": 1.310066819190979, + "learning_rate": 1.4100711168028906e-05, + "loss": 1.7835, + "mean_token_accuracy": 0.5836021423339843, + "step": 1230 + }, + { + "epoch": 0.4307638646668992, + "grad_norm": 1.1233808994293213, + "learning_rate": 1.4045106424536938e-05, + "loss": 1.7972, + "mean_token_accuracy": 0.5836510300636292, + "step": 1235 + }, + { + "epoch": 0.4325078479246599, + "grad_norm": 0.9128984808921814, + "learning_rate": 1.398935173709311e-05, + "loss": 1.7921, + "mean_token_accuracy": 0.5823985695838928, + "step": 1240 + }, + { + "epoch": 0.43425183118242067, + "grad_norm": 0.9416753053665161, + "learning_rate": 1.3933449172411446e-05, + "loss": 1.7956, + "mean_token_accuracy": 0.5798387169837952, + "step": 1245 + }, + { + "epoch": 0.4359958144401814, + "grad_norm": 1.0303266048431396, + "learning_rate": 1.387740080268748e-05, + "loss": 1.757, + "mean_token_accuracy": 0.5885202884674072, + "step": 1250 + }, + { + "epoch": 0.4377397976979421, + "grad_norm": 0.8626015186309814, + "learning_rate": 1.3821208705521442e-05, + "loss": 1.7647, + "mean_token_accuracy": 0.5852578163146973, + "step": 1255 + }, + { + "epoch": 0.43948378095570284, + "grad_norm": 1.1387044191360474, + "learning_rate": 1.3764874963841255e-05, + "loss": 1.7862, + "mean_token_accuracy": 0.5823741555213928, + "step": 1260 + }, + { + "epoch": 0.44122776421346355, + "grad_norm": 0.9297892451286316, + "learning_rate": 1.3708401665825319e-05, + "loss": 1.7902, + "mean_token_accuracy": 0.5839259505271912, + "step": 1265 + }, + { + "epoch": 0.44297174747122425, + "grad_norm": 1.0010992288589478, + "learning_rate": 1.36517909048251e-05, + "loss": 1.7884, + "mean_token_accuracy": 0.5820075869560242, + "step": 1270 + }, + { + "epoch": 0.444715730728985, + "grad_norm": 1.074327826499939, + "learning_rate": 1.3595044779287543e-05, + "loss": 1.8309, + "mean_token_accuracy": 0.5718536138534546, + "step": 1275 + }, + { + "epoch": 0.4464597139867457, + "grad_norm": 1.0309607982635498, + "learning_rate": 1.3538165392677288e-05, + "loss": 1.7558, + "mean_token_accuracy": 0.5917277693748474, + "step": 1280 + }, + { + "epoch": 0.4482036972445064, + "grad_norm": 0.9470037221908569, + "learning_rate": 1.3481154853398686e-05, + "loss": 1.753, + "mean_token_accuracy": 0.5908663392066955, + "step": 1285 + }, + { + "epoch": 0.4499476805022672, + "grad_norm": 0.9484385251998901, + "learning_rate": 1.3424015274717665e-05, + "loss": 1.7697, + "mean_token_accuracy": 0.5895222306251526, + "step": 1290 + }, + { + "epoch": 0.4516916637600279, + "grad_norm": 1.1056883335113525, + "learning_rate": 1.3366748774683376e-05, + "loss": 1.7975, + "mean_token_accuracy": 0.5820259094238281, + "step": 1295 + }, + { + "epoch": 0.45343564701778866, + "grad_norm": 1.0223686695098877, + "learning_rate": 1.3309357476049686e-05, + "loss": 1.8295, + "mean_token_accuracy": 0.5727333903312684, + "step": 1300 + }, + { + "epoch": 0.45343564701778866, + "eval_loss": 1.700812578201294, + "eval_mean_token_accuracy": 0.5995987221814584, + "eval_runtime": 2.2821, + "eval_samples_per_second": 240.566, + "eval_steps_per_second": 60.47, + "step": 1300 + }, + { + "epoch": 0.45517963027554936, + "grad_norm": 0.9181007742881775, + "learning_rate": 1.3251843506196508e-05, + "loss": 1.7702, + "mean_token_accuracy": 0.5867607593536377, + "step": 1305 + }, + { + "epoch": 0.45692361353331007, + "grad_norm": 0.9574410319328308, + "learning_rate": 1.3194208997050915e-05, + "loss": 1.8047, + "mean_token_accuracy": 0.578793978691101, + "step": 1310 + }, + { + "epoch": 0.45866759679107083, + "grad_norm": 0.9232772588729858, + "learning_rate": 1.313645608500814e-05, + "loss": 1.7674, + "mean_token_accuracy": 0.5875427722930908, + "step": 1315 + }, + { + "epoch": 0.46041158004883154, + "grad_norm": 0.9460382461547852, + "learning_rate": 1.3078586910852364e-05, + "loss": 1.8047, + "mean_token_accuracy": 0.5798387169837952, + "step": 1320 + }, + { + "epoch": 0.46215556330659224, + "grad_norm": 0.9923214316368103, + "learning_rate": 1.3020603619677378e-05, + "loss": 1.7847, + "mean_token_accuracy": 0.585044002532959, + "step": 1325 + }, + { + "epoch": 0.463899546564353, + "grad_norm": 1.0260258913040161, + "learning_rate": 1.296250836080706e-05, + "loss": 1.809, + "mean_token_accuracy": 0.5806695938110351, + "step": 1330 + }, + { + "epoch": 0.4656435298221137, + "grad_norm": 1.2574996948242188, + "learning_rate": 1.2904303287715702e-05, + "loss": 1.7681, + "mean_token_accuracy": 0.5868768453598022, + "step": 1335 + }, + { + "epoch": 0.4673875130798744, + "grad_norm": 0.9898363351821899, + "learning_rate": 1.284599055794819e-05, + "loss": 1.7829, + "mean_token_accuracy": 0.5837426781654358, + "step": 1340 + }, + { + "epoch": 0.4691314963376352, + "grad_norm": 1.020398497581482, + "learning_rate": 1.2787572333040022e-05, + "loss": 1.7655, + "mean_token_accuracy": 0.5858993172645569, + "step": 1345 + }, + { + "epoch": 0.4708754795953959, + "grad_norm": 0.9708066582679749, + "learning_rate": 1.2729050778437197e-05, + "loss": 1.7848, + "mean_token_accuracy": 0.5809078574180603, + "step": 1350 + }, + { + "epoch": 0.4726194628531566, + "grad_norm": 0.9287837147712708, + "learning_rate": 1.2670428063415932e-05, + "loss": 1.7768, + "mean_token_accuracy": 0.586418628692627, + "step": 1355 + }, + { + "epoch": 0.47436344611091735, + "grad_norm": 0.928696870803833, + "learning_rate": 1.2611706361002254e-05, + "loss": 1.7344, + "mean_token_accuracy": 0.5956867098808288, + "step": 1360 + }, + { + "epoch": 0.47610742936867806, + "grad_norm": 0.9905332922935486, + "learning_rate": 1.2552887847891462e-05, + "loss": 1.804, + "mean_token_accuracy": 0.5773338317871094, + "step": 1365 + }, + { + "epoch": 0.47785141262643877, + "grad_norm": 0.9971755743026733, + "learning_rate": 1.2493974704367427e-05, + "loss": 1.8377, + "mean_token_accuracy": 0.5717558622360229, + "step": 1370 + }, + { + "epoch": 0.4795953958841995, + "grad_norm": 0.9887624979019165, + "learning_rate": 1.2434969114221777e-05, + "loss": 1.7546, + "mean_token_accuracy": 0.5923387050628662, + "step": 1375 + }, + { + "epoch": 0.48133937914196023, + "grad_norm": 0.9379517436027527, + "learning_rate": 1.237587326467296e-05, + "loss": 1.8012, + "mean_token_accuracy": 0.5816349029541016, + "step": 1380 + }, + { + "epoch": 0.48308336239972094, + "grad_norm": 0.8809686303138733, + "learning_rate": 1.2316689346285146e-05, + "loss": 1.7762, + "mean_token_accuracy": 0.5839442849159241, + "step": 1385 + }, + { + "epoch": 0.4848273456574817, + "grad_norm": 0.9739828109741211, + "learning_rate": 1.2257419552887047e-05, + "loss": 1.8035, + "mean_token_accuracy": 0.5774987816810608, + "step": 1390 + }, + { + "epoch": 0.4865713289152424, + "grad_norm": 1.0573025941848755, + "learning_rate": 1.2198066081490585e-05, + "loss": 1.7745, + "mean_token_accuracy": 0.5860153913497925, + "step": 1395 + }, + { + "epoch": 0.4883153121730031, + "grad_norm": 0.952739953994751, + "learning_rate": 1.213863113220946e-05, + "loss": 1.7634, + "mean_token_accuracy": 0.5889907002449035, + "step": 1400 + }, + { + "epoch": 0.4883153121730031, + "eval_loss": 1.6934020519256592, + "eval_mean_token_accuracy": 0.6011765607889148, + "eval_runtime": 2.2826, + "eval_samples_per_second": 240.513, + "eval_steps_per_second": 60.457, + "step": 1400 + }, + { + "epoch": 0.4900592954307639, + "grad_norm": 1.0023179054260254, + "learning_rate": 1.2079116908177592e-05, + "loss": 1.7869, + "mean_token_accuracy": 0.5814271688461303, + "step": 1405 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 1.0752602815628052, + "learning_rate": 1.2019525615467462e-05, + "loss": 1.7817, + "mean_token_accuracy": 0.5788734078407287, + "step": 1410 + }, + { + "epoch": 0.49354726194628534, + "grad_norm": 0.9689585566520691, + "learning_rate": 1.1959859463008316e-05, + "loss": 1.7643, + "mean_token_accuracy": 0.5866957783699036, + "step": 1415 + }, + { + "epoch": 0.49529124520404605, + "grad_norm": 1.0101878643035889, + "learning_rate": 1.1900120662504315e-05, + "loss": 1.7924, + "mean_token_accuracy": 0.5803335905075073, + "step": 1420 + }, + { + "epoch": 0.49703522846180676, + "grad_norm": 0.9992361068725586, + "learning_rate": 1.1840311428352536e-05, + "loss": 1.8082, + "mean_token_accuracy": 0.5811033606529236, + "step": 1425 + }, + { + "epoch": 0.4987792117195675, + "grad_norm": 0.9244523048400879, + "learning_rate": 1.1780433977560879e-05, + "loss": 1.7779, + "mean_token_accuracy": 0.5819770336151123, + "step": 1430 + }, + { + "epoch": 0.5005231949773282, + "grad_norm": 0.9137963056564331, + "learning_rate": 1.1720490529665904e-05, + "loss": 1.7764, + "mean_token_accuracy": 0.5843108534812927, + "step": 1435 + }, + { + "epoch": 0.5022671782350889, + "grad_norm": 1.0141640901565552, + "learning_rate": 1.1660483306650558e-05, + "loss": 1.7252, + "mean_token_accuracy": 0.5950757622718811, + "step": 1440 + }, + { + "epoch": 0.5040111614928496, + "grad_norm": 0.9211955666542053, + "learning_rate": 1.160041453286179e-05, + "loss": 1.7875, + "mean_token_accuracy": 0.5778897881507874, + "step": 1445 + }, + { + "epoch": 0.5057551447506103, + "grad_norm": 0.8747527599334717, + "learning_rate": 1.154028643492812e-05, + "loss": 1.767, + "mean_token_accuracy": 0.5832966804504395, + "step": 1450 + }, + { + "epoch": 0.5074991280083712, + "grad_norm": 1.0392370223999023, + "learning_rate": 1.1480101241677097e-05, + "loss": 1.7773, + "mean_token_accuracy": 0.5835349440574646, + "step": 1455 + }, + { + "epoch": 0.5092431112661319, + "grad_norm": 1.0041764974594116, + "learning_rate": 1.1419861184052669e-05, + "loss": 1.8024, + "mean_token_accuracy": 0.5797898292541503, + "step": 1460 + }, + { + "epoch": 0.5109870945238926, + "grad_norm": 0.9882892370223999, + "learning_rate": 1.1359568495032505e-05, + "loss": 1.7546, + "mean_token_accuracy": 0.5886791229248047, + "step": 1465 + }, + { + "epoch": 0.5127310777816533, + "grad_norm": 0.9167733192443848, + "learning_rate": 1.1299225409545207e-05, + "loss": 1.7963, + "mean_token_accuracy": 0.5816165804862976, + "step": 1470 + }, + { + "epoch": 0.514475061039414, + "grad_norm": 1.0606944561004639, + "learning_rate": 1.123883416438748e-05, + "loss": 1.8045, + "mean_token_accuracy": 0.5793132901191711, + "step": 1475 + }, + { + "epoch": 0.5162190442971748, + "grad_norm": 0.8895936608314514, + "learning_rate": 1.1178396998141206e-05, + "loss": 1.7426, + "mean_token_accuracy": 0.591837739944458, + "step": 1480 + }, + { + "epoch": 0.5179630275549355, + "grad_norm": 0.9077578186988831, + "learning_rate": 1.1117916151090469e-05, + "loss": 1.7847, + "mean_token_accuracy": 0.5825146555900573, + "step": 1485 + }, + { + "epoch": 0.5197070108126962, + "grad_norm": 0.8612576127052307, + "learning_rate": 1.1057393865138513e-05, + "loss": 1.8154, + "mean_token_accuracy": 0.575824785232544, + "step": 1490 + }, + { + "epoch": 0.5214509940704569, + "grad_norm": 0.9697543978691101, + "learning_rate": 1.099683238372464e-05, + "loss": 1.7774, + "mean_token_accuracy": 0.5810361742973328, + "step": 1495 + }, + { + "epoch": 0.5231949773282176, + "grad_norm": 0.929071843624115, + "learning_rate": 1.0936233951741052e-05, + "loss": 1.7377, + "mean_token_accuracy": 0.5958333253860474, + "step": 1500 + }, + { + "epoch": 0.5231949773282176, + "eval_loss": 1.6859747171401978, + "eval_mean_token_accuracy": 0.6019823041515074, + "eval_runtime": 2.2807, + "eval_samples_per_second": 240.715, + "eval_steps_per_second": 60.508, + "step": 1500 + }, + { + "epoch": 0.5249389605859783, + "grad_norm": 1.1334950923919678, + "learning_rate": 1.0875600815449624e-05, + "loss": 1.7675, + "mean_token_accuracy": 0.5857221364974976, + "step": 1505 + }, + { + "epoch": 0.5266829438437391, + "grad_norm": 0.8689450621604919, + "learning_rate": 1.081493522239866e-05, + "loss": 1.73, + "mean_token_accuracy": 0.5933406710624695, + "step": 1510 + }, + { + "epoch": 0.5284269271014999, + "grad_norm": 1.0081589221954346, + "learning_rate": 1.075423942133957e-05, + "loss": 1.7694, + "mean_token_accuracy": 0.5881414890289307, + "step": 1515 + }, + { + "epoch": 0.5301709103592606, + "grad_norm": 0.8938185572624207, + "learning_rate": 1.0693515662143505e-05, + "loss": 1.7686, + "mean_token_accuracy": 0.5840603470802307, + "step": 1520 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 0.950944185256958, + "learning_rate": 1.0632766195717979e-05, + "loss": 1.7725, + "mean_token_accuracy": 0.5843536019325256, + "step": 1525 + }, + { + "epoch": 0.533658876874782, + "grad_norm": 0.7957980632781982, + "learning_rate": 1.0571993273923412e-05, + "loss": 1.7376, + "mean_token_accuracy": 0.5929924249649048, + "step": 1530 + }, + { + "epoch": 0.5354028601325427, + "grad_norm": 0.9728139638900757, + "learning_rate": 1.0511199149489673e-05, + "loss": 1.7479, + "mean_token_accuracy": 0.5879215598106384, + "step": 1535 + }, + { + "epoch": 0.5371468433903035, + "grad_norm": 0.9370113611221313, + "learning_rate": 1.0450386075932571e-05, + "loss": 1.7848, + "mean_token_accuracy": 0.5811705827713013, + "step": 1540 + }, + { + "epoch": 0.5388908266480642, + "grad_norm": 0.9360505938529968, + "learning_rate": 1.0389556307470316e-05, + "loss": 1.7651, + "mean_token_accuracy": 0.5896260976791382, + "step": 1545 + }, + { + "epoch": 0.5406348099058249, + "grad_norm": 1.013931155204773, + "learning_rate": 1.0328712098939968e-05, + "loss": 1.8339, + "mean_token_accuracy": 0.570356798171997, + "step": 1550 + }, + { + "epoch": 0.5423787931635856, + "grad_norm": 0.8847731947898865, + "learning_rate": 1.0267855705713854e-05, + "loss": 1.7205, + "mean_token_accuracy": 0.5962793350219726, + "step": 1555 + }, + { + "epoch": 0.5441227764213463, + "grad_norm": 0.8559343218803406, + "learning_rate": 1.020698938361595e-05, + "loss": 1.8224, + "mean_token_accuracy": 0.5764296174049377, + "step": 1560 + }, + { + "epoch": 0.545866759679107, + "grad_norm": 0.94319087266922, + "learning_rate": 1.0146115388838293e-05, + "loss": 1.7453, + "mean_token_accuracy": 0.591049599647522, + "step": 1565 + }, + { + "epoch": 0.5476107429368678, + "grad_norm": 1.0683573484420776, + "learning_rate": 1.0085235977857322e-05, + "loss": 1.7492, + "mean_token_accuracy": 0.5862597942352294, + "step": 1570 + }, + { + "epoch": 0.5493547261946286, + "grad_norm": 0.8956939578056335, + "learning_rate": 1.002435340735024e-05, + "loss": 1.7363, + "mean_token_accuracy": 0.5916177988052368, + "step": 1575 + }, + { + "epoch": 0.5510987094523893, + "grad_norm": 0.9776235818862915, + "learning_rate": 9.963469934111374e-06, + "loss": 1.8291, + "mean_token_accuracy": 0.5710043907165527, + "step": 1580 + }, + { + "epoch": 0.55284269271015, + "grad_norm": 1.0536141395568848, + "learning_rate": 9.90258781496851e-06, + "loss": 1.7707, + "mean_token_accuracy": 0.5849829077720642, + "step": 1585 + }, + { + "epoch": 0.5545866759679107, + "grad_norm": 0.9491280317306519, + "learning_rate": 9.841709306699245e-06, + "loss": 1.716, + "mean_token_accuracy": 0.5943609595298767, + "step": 1590 + }, + { + "epoch": 0.5563306592256715, + "grad_norm": 1.028785228729248, + "learning_rate": 9.78083666594732e-06, + "loss": 1.7985, + "mean_token_accuracy": 0.5787267804145813, + "step": 1595 + }, + { + "epoch": 0.5580746424834322, + "grad_norm": 0.9020082950592041, + "learning_rate": 9.719972149138985e-06, + "loss": 1.7616, + "mean_token_accuracy": 0.5864491701126099, + "step": 1600 + }, + { + "epoch": 0.5580746424834322, + "eval_loss": 1.6814908981323242, + "eval_mean_token_accuracy": 0.6028889862523563, + "eval_runtime": 2.2845, + "eval_samples_per_second": 240.32, + "eval_steps_per_second": 60.408, + "step": 1600 + }, + { + "epoch": 0.5598186257411929, + "grad_norm": 0.844459593296051, + "learning_rate": 9.659118012399352e-06, + "loss": 1.7666, + "mean_token_accuracy": 0.585337245464325, + "step": 1605 + }, + { + "epoch": 0.5615626089989536, + "grad_norm": 1.0420951843261719, + "learning_rate": 9.598276511468763e-06, + "loss": 1.7888, + "mean_token_accuracy": 0.5808284401893615, + "step": 1610 + }, + { + "epoch": 0.5633065922567143, + "grad_norm": 0.8618496656417847, + "learning_rate": 9.537449901619174e-06, + "loss": 1.7231, + "mean_token_accuracy": 0.5956072926521301, + "step": 1615 + }, + { + "epoch": 0.565050575514475, + "grad_norm": 0.8843145966529846, + "learning_rate": 9.476640437570562e-06, + "loss": 1.7718, + "mean_token_accuracy": 0.5840725779533387, + "step": 1620 + }, + { + "epoch": 0.5667945587722358, + "grad_norm": 1.0477300882339478, + "learning_rate": 9.415850373407342e-06, + "loss": 1.7461, + "mean_token_accuracy": 0.5911412358283996, + "step": 1625 + }, + { + "epoch": 0.5685385420299965, + "grad_norm": 0.9050325155258179, + "learning_rate": 9.355081962494815e-06, + "loss": 1.7672, + "mean_token_accuracy": 0.582093107700348, + "step": 1630 + }, + { + "epoch": 0.5702825252877572, + "grad_norm": 0.919677734375, + "learning_rate": 9.294337457395638e-06, + "loss": 1.733, + "mean_token_accuracy": 0.5913611888885498, + "step": 1635 + }, + { + "epoch": 0.572026508545518, + "grad_norm": 0.8888424038887024, + "learning_rate": 9.233619109786332e-06, + "loss": 1.7136, + "mean_token_accuracy": 0.5980632901191711, + "step": 1640 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.9428724050521851, + "learning_rate": 9.172929170373804e-06, + "loss": 1.7515, + "mean_token_accuracy": 0.5902431607246399, + "step": 1645 + }, + { + "epoch": 0.5755144750610394, + "grad_norm": 0.9060950875282288, + "learning_rate": 9.112269888811934e-06, + "loss": 1.727, + "mean_token_accuracy": 0.5939760446548462, + "step": 1650 + }, + { + "epoch": 0.5772584583188002, + "grad_norm": 0.8351477980613708, + "learning_rate": 9.051643513618176e-06, + "loss": 1.7196, + "mean_token_accuracy": 0.5938905239105224, + "step": 1655 + }, + { + "epoch": 0.5790024415765609, + "grad_norm": 0.8715441823005676, + "learning_rate": 8.99105229209021e-06, + "loss": 1.7194, + "mean_token_accuracy": 0.5924608826637268, + "step": 1660 + }, + { + "epoch": 0.5807464248343216, + "grad_norm": 0.9747238159179688, + "learning_rate": 8.930498470222641e-06, + "loss": 1.758, + "mean_token_accuracy": 0.5870173454284668, + "step": 1665 + }, + { + "epoch": 0.5824904080920823, + "grad_norm": 0.9051499366760254, + "learning_rate": 8.86998429262374e-06, + "loss": 1.7501, + "mean_token_accuracy": 0.5860031723976136, + "step": 1670 + }, + { + "epoch": 0.584234391349843, + "grad_norm": 0.9120787382125854, + "learning_rate": 8.809512002432252e-06, + "loss": 1.7903, + "mean_token_accuracy": 0.5806634902954102, + "step": 1675 + }, + { + "epoch": 0.5859783746076037, + "grad_norm": 0.856230616569519, + "learning_rate": 8.749083841234235e-06, + "loss": 1.7594, + "mean_token_accuracy": 0.5864796996116638, + "step": 1680 + }, + { + "epoch": 0.5877223578653645, + "grad_norm": 0.9261459112167358, + "learning_rate": 8.688702048979974e-06, + "loss": 1.742, + "mean_token_accuracy": 0.588844096660614, + "step": 1685 + }, + { + "epoch": 0.5894663411231252, + "grad_norm": 0.9426308274269104, + "learning_rate": 8.628368863900954e-06, + "loss": 1.7619, + "mean_token_accuracy": 0.5845735549926758, + "step": 1690 + }, + { + "epoch": 0.5912103243808859, + "grad_norm": 0.9162436127662659, + "learning_rate": 8.568086522426884e-06, + "loss": 1.8073, + "mean_token_accuracy": 0.5761485934257508, + "step": 1695 + }, + { + "epoch": 0.5929543076386466, + "grad_norm": 0.9955480694770813, + "learning_rate": 8.507857259102814e-06, + "loss": 1.7923, + "mean_token_accuracy": 0.5799303531646729, + "step": 1700 + }, + { + "epoch": 0.5929543076386466, + "eval_loss": 1.6758500337600708, + "eval_mean_token_accuracy": 0.6034928508426832, + "eval_runtime": 2.2831, + "eval_samples_per_second": 240.467, + "eval_steps_per_second": 60.445, + "step": 1700 + }, + { + "epoch": 0.5946982908964074, + "grad_norm": 1.026448369026184, + "learning_rate": 8.447683306506279e-06, + "loss": 1.7804, + "mean_token_accuracy": 0.5816471219062805, + "step": 1705 + }, + { + "epoch": 0.5964422741541682, + "grad_norm": 0.9623015522956848, + "learning_rate": 8.387566895164566e-06, + "loss": 1.7791, + "mean_token_accuracy": 0.5825146555900573, + "step": 1710 + }, + { + "epoch": 0.5981862574119289, + "grad_norm": 0.9515750408172607, + "learning_rate": 8.327510253472023e-06, + "loss": 1.7715, + "mean_token_accuracy": 0.5841092467308044, + "step": 1715 + }, + { + "epoch": 0.5999302406696896, + "grad_norm": 0.9369364380836487, + "learning_rate": 8.267515607607458e-06, + "loss": 1.7576, + "mean_token_accuracy": 0.5836632370948791, + "step": 1720 + }, + { + "epoch": 0.6016742239274503, + "grad_norm": 0.9058694839477539, + "learning_rate": 8.207585181451611e-06, + "loss": 1.7514, + "mean_token_accuracy": 0.5888318538665771, + "step": 1725 + }, + { + "epoch": 0.603418207185211, + "grad_norm": 1.0317144393920898, + "learning_rate": 8.147721196504736e-06, + "loss": 1.6989, + "mean_token_accuracy": 0.6007025837898254, + "step": 1730 + }, + { + "epoch": 0.6051621904429717, + "grad_norm": 0.862755298614502, + "learning_rate": 8.08792587180424e-06, + "loss": 1.7155, + "mean_token_accuracy": 0.5944892525672912, + "step": 1735 + }, + { + "epoch": 0.6069061737007325, + "grad_norm": 0.8898607492446899, + "learning_rate": 8.028201423842437e-06, + "loss": 1.7209, + "mean_token_accuracy": 0.5962915420532227, + "step": 1740 + }, + { + "epoch": 0.6086501569584932, + "grad_norm": 0.9579831957817078, + "learning_rate": 7.96855006648438e-06, + "loss": 1.766, + "mean_token_accuracy": 0.5847262859344482, + "step": 1745 + }, + { + "epoch": 0.6103941402162539, + "grad_norm": 0.8500779271125793, + "learning_rate": 7.908974010885795e-06, + "loss": 1.7426, + "mean_token_accuracy": 0.592611289024353, + "step": 1750 + }, + { + "epoch": 0.6121381234740146, + "grad_norm": 0.9754694700241089, + "learning_rate": 7.849475465411136e-06, + "loss": 1.7492, + "mean_token_accuracy": 0.5904875278472901, + "step": 1755 + }, + { + "epoch": 0.6138821067317753, + "grad_norm": 0.8871049284934998, + "learning_rate": 7.790056635551704e-06, + "loss": 1.7559, + "mean_token_accuracy": 0.5869256973266601, + "step": 1760 + }, + { + "epoch": 0.615626089989536, + "grad_norm": 0.8794604539871216, + "learning_rate": 7.730719723843903e-06, + "loss": 1.736, + "mean_token_accuracy": 0.5898521661758422, + "step": 1765 + }, + { + "epoch": 0.6173700732472969, + "grad_norm": 0.8360015153884888, + "learning_rate": 7.671466929787598e-06, + "loss": 1.7542, + "mean_token_accuracy": 0.588312566280365, + "step": 1770 + }, + { + "epoch": 0.6191140565050576, + "grad_norm": 0.9423067569732666, + "learning_rate": 7.61230044976458e-06, + "loss": 1.7532, + "mean_token_accuracy": 0.5861534833908081, + "step": 1775 + }, + { + "epoch": 0.6208580397628183, + "grad_norm": 0.9311609268188477, + "learning_rate": 7.553222476957157e-06, + "loss": 1.766, + "mean_token_accuracy": 0.5862353444099426, + "step": 1780 + }, + { + "epoch": 0.622602023020579, + "grad_norm": 0.8329134583473206, + "learning_rate": 7.494235201266849e-06, + "loss": 1.7481, + "mean_token_accuracy": 0.5884714126586914, + "step": 1785 + }, + { + "epoch": 0.6243460062783397, + "grad_norm": 0.8583689332008362, + "learning_rate": 7.435340809233218e-06, + "loss": 1.7389, + "mean_token_accuracy": 0.589962112903595, + "step": 1790 + }, + { + "epoch": 0.6260899895361004, + "grad_norm": 0.9669222235679626, + "learning_rate": 7.376541483952811e-06, + "loss": 1.7661, + "mean_token_accuracy": 0.5822458386421203, + "step": 1795 + }, + { + "epoch": 0.6278339727938612, + "grad_norm": 0.8193614482879639, + "learning_rate": 7.3178394049982485e-06, + "loss": 1.7409, + "mean_token_accuracy": 0.591287887096405, + "step": 1800 + }, + { + "epoch": 0.6278339727938612, + "eval_loss": 1.6728371381759644, + "eval_mean_token_accuracy": 0.6040453612804413, + "eval_runtime": 2.2857, + "eval_samples_per_second": 240.187, + "eval_steps_per_second": 60.375, + "step": 1800 + }, + { + "epoch": 0.6295779560516219, + "grad_norm": 0.9367895722389221, + "learning_rate": 7.259236748337421e-06, + "loss": 1.7466, + "mean_token_accuracy": 0.5905180931091308, + "step": 1805 + }, + { + "epoch": 0.6313219393093826, + "grad_norm": 1.000838279724121, + "learning_rate": 7.20073568625284e-06, + "loss": 1.7839, + "mean_token_accuracy": 0.5814638137817383, + "step": 1810 + }, + { + "epoch": 0.6330659225671433, + "grad_norm": 0.7985386252403259, + "learning_rate": 7.1423383872611045e-06, + "loss": 1.7188, + "mean_token_accuracy": 0.5944403648376465, + "step": 1815 + }, + { + "epoch": 0.634809905824904, + "grad_norm": 0.9904494285583496, + "learning_rate": 7.084047016032528e-06, + "loss": 1.7236, + "mean_token_accuracy": 0.5956622838974, + "step": 1820 + }, + { + "epoch": 0.6365538890826649, + "grad_norm": 0.8954839110374451, + "learning_rate": 7.025863733310894e-06, + "loss": 1.8033, + "mean_token_accuracy": 0.5786290287971496, + "step": 1825 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 1.004160761833191, + "learning_rate": 6.967790695833363e-06, + "loss": 1.7626, + "mean_token_accuracy": 0.5868799567222596, + "step": 1830 + }, + { + "epoch": 0.6400418555981863, + "grad_norm": 0.9229363799095154, + "learning_rate": 6.909830056250527e-06, + "loss": 1.7143, + "mean_token_accuracy": 0.5966275453567504, + "step": 1835 + }, + { + "epoch": 0.641785838855947, + "grad_norm": 1.0013066530227661, + "learning_rate": 6.851983963046612e-06, + "loss": 1.744, + "mean_token_accuracy": 0.5903042435646058, + "step": 1840 + }, + { + "epoch": 0.6435298221137077, + "grad_norm": 0.8317592740058899, + "learning_rate": 6.794254560459843e-06, + "loss": 1.7359, + "mean_token_accuracy": 0.5928885579109192, + "step": 1845 + }, + { + "epoch": 0.6452738053714684, + "grad_norm": 0.9083294868469238, + "learning_rate": 6.736643988402958e-06, + "loss": 1.7409, + "mean_token_accuracy": 0.59159334897995, + "step": 1850 + }, + { + "epoch": 0.6470177886292292, + "grad_norm": 0.8991389274597168, + "learning_rate": 6.679154382383883e-06, + "loss": 1.7402, + "mean_token_accuracy": 0.5913489699363709, + "step": 1855 + }, + { + "epoch": 0.6487617718869899, + "grad_norm": 0.9679570198059082, + "learning_rate": 6.621787873426581e-06, + "loss": 1.7556, + "mean_token_accuracy": 0.5897360682487488, + "step": 1860 + }, + { + "epoch": 0.6505057551447506, + "grad_norm": 0.8622152805328369, + "learning_rate": 6.564546587992054e-06, + "loss": 1.7389, + "mean_token_accuracy": 0.5905425071716308, + "step": 1865 + }, + { + "epoch": 0.6522497384025113, + "grad_norm": 0.9783850312232971, + "learning_rate": 6.507432647899519e-06, + "loss": 1.7265, + "mean_token_accuracy": 0.5953873515129089, + "step": 1870 + }, + { + "epoch": 0.653993721660272, + "grad_norm": 0.8522977828979492, + "learning_rate": 6.450448170247757e-06, + "loss": 1.7357, + "mean_token_accuracy": 0.5928824543952942, + "step": 1875 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.8314604759216309, + "learning_rate": 6.393595267336639e-06, + "loss": 1.7436, + "mean_token_accuracy": 0.5882331252098083, + "step": 1880 + }, + { + "epoch": 0.6574816881757936, + "grad_norm": 0.8277537226676941, + "learning_rate": 6.3368760465888226e-06, + "loss": 1.789, + "mean_token_accuracy": 0.5794782400131225, + "step": 1885 + }, + { + "epoch": 0.6592256714335543, + "grad_norm": 0.9186983108520508, + "learning_rate": 6.280292610471639e-06, + "loss": 1.7609, + "mean_token_accuracy": 0.5859115362167359, + "step": 1890 + }, + { + "epoch": 0.660969654691315, + "grad_norm": 0.8679489493370056, + "learning_rate": 6.223847056419154e-06, + "loss": 1.7637, + "mean_token_accuracy": 0.5874450206756592, + "step": 1895 + }, + { + "epoch": 0.6627136379490757, + "grad_norm": 0.8838227987289429, + "learning_rate": 6.1675414767544285e-06, + "loss": 1.7348, + "mean_token_accuracy": 0.5937255620956421, + "step": 1900 + }, + { + "epoch": 0.6627136379490757, + "eval_loss": 1.6681548357009888, + "eval_mean_token_accuracy": 0.6047023521817249, + "eval_runtime": 2.2821, + "eval_samples_per_second": 240.564, + "eval_steps_per_second": 60.47, + "step": 1900 + }, + { + "epoch": 0.6644576212068364, + "grad_norm": 1.0021533966064453, + "learning_rate": 6.111377958611948e-06, + "loss": 1.7763, + "mean_token_accuracy": 0.5830941438674927, + "step": 1905 + }, + { + "epoch": 0.6662016044645971, + "grad_norm": 0.8784207105636597, + "learning_rate": 6.055358583860267e-06, + "loss": 1.7717, + "mean_token_accuracy": 0.5817021012306214, + "step": 1910 + }, + { + "epoch": 0.6679455877223579, + "grad_norm": 0.9345823526382446, + "learning_rate": 5.99948542902483e-06, + "loss": 1.7401, + "mean_token_accuracy": 0.5889540553092957, + "step": 1915 + }, + { + "epoch": 0.6696895709801186, + "grad_norm": 0.8546095490455627, + "learning_rate": 5.943760565211011e-06, + "loss": 1.708, + "mean_token_accuracy": 0.5965847969055176, + "step": 1920 + }, + { + "epoch": 0.6714335542378793, + "grad_norm": 0.8473442792892456, + "learning_rate": 5.8881860580273285e-06, + "loss": 1.7733, + "mean_token_accuracy": 0.5782319068908691, + "step": 1925 + }, + { + "epoch": 0.67317753749564, + "grad_norm": 0.8169350624084473, + "learning_rate": 5.832763967508885e-06, + "loss": 1.7744, + "mean_token_accuracy": 0.5818487286567688, + "step": 1930 + }, + { + "epoch": 0.6749215207534007, + "grad_norm": 0.8931566476821899, + "learning_rate": 5.777496348041009e-06, + "loss": 1.7253, + "mean_token_accuracy": 0.5921370983123779, + "step": 1935 + }, + { + "epoch": 0.6766655040111615, + "grad_norm": 0.8605539202690125, + "learning_rate": 5.722385248283092e-06, + "loss": 1.7848, + "mean_token_accuracy": 0.5819281578063965, + "step": 1940 + }, + { + "epoch": 0.6784094872689223, + "grad_norm": 0.8370801210403442, + "learning_rate": 5.667432711092651e-06, + "loss": 1.7845, + "mean_token_accuracy": 0.5804679870605469, + "step": 1945 + }, + { + "epoch": 0.680153470526683, + "grad_norm": 0.8503681421279907, + "learning_rate": 5.61264077344962e-06, + "loss": 1.7468, + "mean_token_accuracy": 0.5905364036560059, + "step": 1950 + }, + { + "epoch": 0.6818974537844437, + "grad_norm": 0.9331584572792053, + "learning_rate": 5.558011466380824e-06, + "loss": 1.7553, + "mean_token_accuracy": 0.5860276222229004, + "step": 1955 + }, + { + "epoch": 0.6836414370422044, + "grad_norm": 0.7952961325645447, + "learning_rate": 5.5035468148846926e-06, + "loss": 1.7585, + "mean_token_accuracy": 0.584536898136139, + "step": 1960 + }, + { + "epoch": 0.6853854202999651, + "grad_norm": 0.848480761051178, + "learning_rate": 5.449248837856224e-06, + "loss": 1.7403, + "mean_token_accuracy": 0.5923509240150452, + "step": 1965 + }, + { + "epoch": 0.6871294035577259, + "grad_norm": 0.9587191343307495, + "learning_rate": 5.395119548012112e-06, + "loss": 1.7304, + "mean_token_accuracy": 0.592375373840332, + "step": 1970 + }, + { + "epoch": 0.6888733868154866, + "grad_norm": 0.9457473754882812, + "learning_rate": 5.34116095181616e-06, + "loss": 1.7578, + "mean_token_accuracy": 0.5837732195854187, + "step": 1975 + }, + { + "epoch": 0.6906173700732473, + "grad_norm": 0.9263405799865723, + "learning_rate": 5.287375049404909e-06, + "loss": 1.762, + "mean_token_accuracy": 0.5890579223632812, + "step": 1980 + }, + { + "epoch": 0.692361353331008, + "grad_norm": 0.9388116598129272, + "learning_rate": 5.233763834513479e-06, + "loss": 1.7182, + "mean_token_accuracy": 0.592656409740448, + "step": 1985 + }, + { + "epoch": 0.6941053365887687, + "grad_norm": 0.8628541231155396, + "learning_rate": 5.180329294401685e-06, + "loss": 1.7436, + "mean_token_accuracy": 0.5878604531288147, + "step": 1990 + }, + { + "epoch": 0.6958493198465294, + "grad_norm": 0.8491072654724121, + "learning_rate": 5.127073409780352e-06, + "loss": 1.7338, + "mean_token_accuracy": 0.5916422247886658, + "step": 1995 + }, + { + "epoch": 0.6975933031042902, + "grad_norm": 0.8467431664466858, + "learning_rate": 5.0739981547379215e-06, + "loss": 1.7528, + "mean_token_accuracy": 0.5883736491203309, + "step": 2000 + }, + { + "epoch": 0.6975933031042902, + "eval_loss": 1.665104866027832, + "eval_mean_token_accuracy": 0.6051149640394293, + "eval_runtime": 2.2845, + "eval_samples_per_second": 240.312, + "eval_steps_per_second": 60.406, + "step": 2000 + }, + { + "epoch": 0.699337286362051, + "grad_norm": 1.043385624885559, + "learning_rate": 5.02110549666724e-06, + "loss": 1.7881, + "mean_token_accuracy": 0.5774926662445068, + "step": 2005 + }, + { + "epoch": 0.7010812696198117, + "grad_norm": 0.8518879413604736, + "learning_rate": 4.968397396192675e-06, + "loss": 1.7453, + "mean_token_accuracy": 0.5914296627044677, + "step": 2010 + }, + { + "epoch": 0.7028252528775724, + "grad_norm": 0.9270824790000916, + "learning_rate": 4.91587580709739e-06, + "loss": 1.7398, + "mean_token_accuracy": 0.5871517539024353, + "step": 2015 + }, + { + "epoch": 0.7045692361353331, + "grad_norm": 0.9412179589271545, + "learning_rate": 4.863542676250972e-06, + "loss": 1.785, + "mean_token_accuracy": 0.5836510181427002, + "step": 2020 + }, + { + "epoch": 0.7063132193930938, + "grad_norm": 0.8825154304504395, + "learning_rate": 4.811399943537223e-06, + "loss": 1.703, + "mean_token_accuracy": 0.5958150029182434, + "step": 2025 + }, + { + "epoch": 0.7080572026508546, + "grad_norm": 0.9375229477882385, + "learning_rate": 4.759449541782272e-06, + "loss": 1.8157, + "mean_token_accuracy": 0.5745540142059327, + "step": 2030 + }, + { + "epoch": 0.7098011859086153, + "grad_norm": 0.8242154121398926, + "learning_rate": 4.707693396682936e-06, + "loss": 1.7119, + "mean_token_accuracy": 0.5958577752113342, + "step": 2035 + }, + { + "epoch": 0.711545169166376, + "grad_norm": 0.8792186379432678, + "learning_rate": 4.656133426735315e-06, + "loss": 1.7451, + "mean_token_accuracy": 0.5919232726097107, + "step": 2040 + }, + { + "epoch": 0.7132891524241367, + "grad_norm": 0.8993618488311768, + "learning_rate": 4.604771543163706e-06, + "loss": 1.7514, + "mean_token_accuracy": 0.5879826545715332, + "step": 2045 + }, + { + "epoch": 0.7150331356818974, + "grad_norm": 0.8959999680519104, + "learning_rate": 4.5536096498497295e-06, + "loss": 1.7252, + "mean_token_accuracy": 0.5921065449714661, + "step": 2050 + }, + { + "epoch": 0.7167771189396582, + "grad_norm": 0.8765257000923157, + "learning_rate": 4.502649643261779e-06, + "loss": 1.7354, + "mean_token_accuracy": 0.5902675747871399, + "step": 2055 + }, + { + "epoch": 0.7185211021974189, + "grad_norm": 0.8657769560813904, + "learning_rate": 4.451893412384707e-06, + "loss": 1.7452, + "mean_token_accuracy": 0.5894855737686158, + "step": 2060 + }, + { + "epoch": 0.7202650854551796, + "grad_norm": 0.8636703491210938, + "learning_rate": 4.401342838649818e-06, + "loss": 1.7543, + "mean_token_accuracy": 0.5891617655754089, + "step": 2065 + }, + { + "epoch": 0.7220090687129403, + "grad_norm": 0.866857647895813, + "learning_rate": 4.350999795865109e-06, + "loss": 1.7584, + "mean_token_accuracy": 0.5861375808715821, + "step": 2070 + }, + { + "epoch": 0.7237530519707011, + "grad_norm": 0.8767180442810059, + "learning_rate": 4.300866150145837e-06, + "loss": 1.6975, + "mean_token_accuracy": 0.5987536549568176, + "step": 2075 + }, + { + "epoch": 0.7254970352284618, + "grad_norm": 0.8914757966995239, + "learning_rate": 4.250943759845316e-06, + "loss": 1.769, + "mean_token_accuracy": 0.5828201413154602, + "step": 2080 + }, + { + "epoch": 0.7272410184862226, + "grad_norm": 0.828929603099823, + "learning_rate": 4.201234475486063e-06, + "loss": 1.7937, + "mean_token_accuracy": 0.5783907532691955, + "step": 2085 + }, + { + "epoch": 0.7289850017439833, + "grad_norm": 0.8762986063957214, + "learning_rate": 4.1517401396911725e-06, + "loss": 1.7546, + "mean_token_accuracy": 0.5867057561874389, + "step": 2090 + }, + { + "epoch": 0.730728985001744, + "grad_norm": 0.8607875108718872, + "learning_rate": 4.1024625871160325e-06, + "loss": 1.7255, + "mean_token_accuracy": 0.5968353033065796, + "step": 2095 + }, + { + "epoch": 0.7324729682595047, + "grad_norm": 0.8799224495887756, + "learning_rate": 4.053403644380321e-06, + "loss": 1.7287, + "mean_token_accuracy": 0.590744137763977, + "step": 2100 + }, + { + "epoch": 0.7324729682595047, + "eval_loss": 1.6634001731872559, + "eval_mean_token_accuracy": 0.60536288437636, + "eval_runtime": 2.2873, + "eval_samples_per_second": 240.023, + "eval_steps_per_second": 60.334, + "step": 2100 + }, + { + "epoch": 0.7342169515172654, + "grad_norm": 0.9895725250244141, + "learning_rate": 4.004565130000277e-06, + "loss": 1.7311, + "mean_token_accuracy": 0.5927297115325928, + "step": 2105 + }, + { + "epoch": 0.7359609347750261, + "grad_norm": 0.8334611654281616, + "learning_rate": 3.955948854321321e-06, + "loss": 1.6875, + "mean_token_accuracy": 0.5993462920188903, + "step": 2110 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.8794987797737122, + "learning_rate": 3.907556619450909e-06, + "loss": 1.7689, + "mean_token_accuracy": 0.5833211183547974, + "step": 2115 + }, + { + "epoch": 0.7394489012905476, + "grad_norm": 0.8264355659484863, + "learning_rate": 3.859390219191775e-06, + "loss": 1.7188, + "mean_token_accuracy": 0.5973301649093627, + "step": 2120 + }, + { + "epoch": 0.7411928845483083, + "grad_norm": 0.8609153628349304, + "learning_rate": 3.8114514389754098e-06, + "loss": 1.7383, + "mean_token_accuracy": 0.5891617774963379, + "step": 2125 + }, + { + "epoch": 0.742936867806069, + "grad_norm": 0.9186127781867981, + "learning_rate": 3.7637420557958927e-06, + "loss": 1.7631, + "mean_token_accuracy": 0.5858076572418213, + "step": 2130 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 0.8366584181785583, + "learning_rate": 3.7162638381440077e-06, + "loss": 1.7288, + "mean_token_accuracy": 0.5968719363212586, + "step": 2135 + }, + { + "epoch": 0.7464248343215905, + "grad_norm": 0.8015324473381042, + "learning_rate": 3.6690185459417107e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.5873044848442077, + "step": 2140 + }, + { + "epoch": 0.7481688175793513, + "grad_norm": 0.9060224890708923, + "learning_rate": 3.622007930476865e-06, + "loss": 1.7097, + "mean_token_accuracy": 0.5956317067146302, + "step": 2145 + }, + { + "epoch": 0.749912800837112, + "grad_norm": 1.0230004787445068, + "learning_rate": 3.575233734338356e-06, + "loss": 1.7633, + "mean_token_accuracy": 0.584787392616272, + "step": 2150 + }, + { + "epoch": 0.7516567840948727, + "grad_norm": 0.7938732504844666, + "learning_rate": 3.528697691351465e-06, + "loss": 1.7227, + "mean_token_accuracy": 0.595906662940979, + "step": 2155 + }, + { + "epoch": 0.7534007673526334, + "grad_norm": 1.0092865228652954, + "learning_rate": 3.4824015265136278e-06, + "loss": 1.7149, + "mean_token_accuracy": 0.5933773279190063, + "step": 2160 + }, + { + "epoch": 0.7551447506103941, + "grad_norm": 0.8216748237609863, + "learning_rate": 3.436346955930472e-06, + "loss": 1.7639, + "mean_token_accuracy": 0.5871151089668274, + "step": 2165 + }, + { + "epoch": 0.7568887338681549, + "grad_norm": 0.8152060508728027, + "learning_rate": 3.3905356867522187e-06, + "loss": 1.726, + "mean_token_accuracy": 0.5919538021087647, + "step": 2170 + }, + { + "epoch": 0.7586327171259156, + "grad_norm": 0.8350356221199036, + "learning_rate": 3.344969417110391e-06, + "loss": 1.7168, + "mean_token_accuracy": 0.5934139847755432, + "step": 2175 + }, + { + "epoch": 0.7603767003836763, + "grad_norm": 0.9290037155151367, + "learning_rate": 3.29964983605487e-06, + "loss": 1.7197, + "mean_token_accuracy": 0.5946297645568848, + "step": 2180 + }, + { + "epoch": 0.762120683641437, + "grad_norm": 0.8726941347122192, + "learning_rate": 3.2545786234913e-06, + "loss": 1.7117, + "mean_token_accuracy": 0.5981060624122619, + "step": 2185 + }, + { + "epoch": 0.7638646668991977, + "grad_norm": 0.8974432349205017, + "learning_rate": 3.2097574501187877e-06, + "loss": 1.7744, + "mean_token_accuracy": 0.5832661271095276, + "step": 2190 + }, + { + "epoch": 0.7656086501569584, + "grad_norm": 0.9594095349311829, + "learning_rate": 3.165187977368007e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.5913917303085328, + "step": 2195 + }, + { + "epoch": 0.7673526334147193, + "grad_norm": 0.8100512623786926, + "learning_rate": 3.120871857339582e-06, + "loss": 1.7362, + "mean_token_accuracy": 0.5879093408584595, + "step": 2200 + }, + { + "epoch": 0.7673526334147193, + "eval_loss": 1.6623201370239258, + "eval_mean_token_accuracy": 0.6055824722068898, + "eval_runtime": 2.2847, + "eval_samples_per_second": 240.297, + "eval_steps_per_second": 60.403, + "step": 2200 + }, + { + "epoch": 0.76909661667248, + "grad_norm": 0.8855761885643005, + "learning_rate": 3.0768107327428766e-06, + "loss": 1.7189, + "mean_token_accuracy": 0.5938171982765198, + "step": 2205 + }, + { + "epoch": 0.7708405999302407, + "grad_norm": 0.8835321068763733, + "learning_rate": 3.033006236835071e-06, + "loss": 1.7513, + "mean_token_accuracy": 0.5863086462020874, + "step": 2210 + }, + { + "epoch": 0.7725845831880014, + "grad_norm": 0.8403179049491882, + "learning_rate": 2.9894599933606518e-06, + "loss": 1.7331, + "mean_token_accuracy": 0.5915383577346802, + "step": 2215 + }, + { + "epoch": 0.7743285664457621, + "grad_norm": 0.965403139591217, + "learning_rate": 2.9461736164911934e-06, + "loss": 1.7464, + "mean_token_accuracy": 0.589393949508667, + "step": 2220 + }, + { + "epoch": 0.7760725497035228, + "grad_norm": 0.9553238749504089, + "learning_rate": 2.903148710765552e-06, + "loss": 1.7225, + "mean_token_accuracy": 0.5911718010902405, + "step": 2225 + }, + { + "epoch": 0.7778165329612836, + "grad_norm": 0.9546641111373901, + "learning_rate": 2.8603868710303662e-06, + "loss": 1.7569, + "mean_token_accuracy": 0.5881659507751464, + "step": 2230 + }, + { + "epoch": 0.7795605162190443, + "grad_norm": 1.0061533451080322, + "learning_rate": 2.8178896823809465e-06, + "loss": 1.7705, + "mean_token_accuracy": 0.5832539081573487, + "step": 2235 + }, + { + "epoch": 0.781304499476805, + "grad_norm": 0.9131800532341003, + "learning_rate": 2.7756587201025297e-06, + "loss": 1.7649, + "mean_token_accuracy": 0.5844391345977783, + "step": 2240 + }, + { + "epoch": 0.7830484827345657, + "grad_norm": 0.8138077259063721, + "learning_rate": 2.7336955496118666e-06, + "loss": 1.7727, + "mean_token_accuracy": 0.5835838079452514, + "step": 2245 + }, + { + "epoch": 0.7847924659923264, + "grad_norm": 0.8967751264572144, + "learning_rate": 2.692001726399215e-06, + "loss": 1.7425, + "mean_token_accuracy": 0.5900721073150634, + "step": 2250 + }, + { + "epoch": 0.7865364492500871, + "grad_norm": 0.9141238331794739, + "learning_rate": 2.6505787959706607e-06, + "loss": 1.7522, + "mean_token_accuracy": 0.5897482872009278, + "step": 2255 + }, + { + "epoch": 0.788280432507848, + "grad_norm": 0.8366966247558594, + "learning_rate": 2.609428293790852e-06, + "loss": 1.7674, + "mean_token_accuracy": 0.5846468806266785, + "step": 2260 + }, + { + "epoch": 0.7900244157656087, + "grad_norm": 0.9955487251281738, + "learning_rate": 2.5685517452260566e-06, + "loss": 1.7447, + "mean_token_accuracy": 0.5857221484184265, + "step": 2265 + }, + { + "epoch": 0.7917683990233694, + "grad_norm": 0.870811402797699, + "learning_rate": 2.5279506654876473e-06, + "loss": 1.7609, + "mean_token_accuracy": 0.583962619304657, + "step": 2270 + }, + { + "epoch": 0.7935123822811301, + "grad_norm": 0.7584528923034668, + "learning_rate": 2.487626559575911e-06, + "loss": 1.7466, + "mean_token_accuracy": 0.5889173865318298, + "step": 2275 + }, + { + "epoch": 0.7952563655388908, + "grad_norm": 0.9040627479553223, + "learning_rate": 2.4475809222242775e-06, + "loss": 1.6884, + "mean_token_accuracy": 0.5985092759132385, + "step": 2280 + }, + { + "epoch": 0.7970003487966516, + "grad_norm": 0.8853275179862976, + "learning_rate": 2.4078152378439033e-06, + "loss": 1.7347, + "mean_token_accuracy": 0.5906036138534546, + "step": 2285 + }, + { + "epoch": 0.7987443320544123, + "grad_norm": 0.8405744433403015, + "learning_rate": 2.3683309804686604e-06, + "loss": 1.709, + "mean_token_accuracy": 0.5952712535858155, + "step": 2290 + }, + { + "epoch": 0.800488315312173, + "grad_norm": 0.8898788690567017, + "learning_rate": 2.329129613700478e-06, + "loss": 1.7429, + "mean_token_accuracy": 0.5902614951133728, + "step": 2295 + }, + { + "epoch": 0.8022322985699337, + "grad_norm": 0.8866685628890991, + "learning_rate": 2.29021259065511e-06, + "loss": 1.75, + "mean_token_accuracy": 0.5874266982078552, + "step": 2300 + }, + { + "epoch": 0.8022322985699337, + "eval_loss": 1.6609832048416138, + "eval_mean_token_accuracy": 0.6056586197320966, + "eval_runtime": 2.2958, + "eval_samples_per_second": 239.131, + "eval_steps_per_second": 60.109, + "step": 2300 + }, + { + "epoch": 0.8039762818276944, + "grad_norm": 0.8643705248832703, + "learning_rate": 2.251581353908252e-06, + "loss": 1.7428, + "mean_token_accuracy": 0.5863147497177124, + "step": 2305 + }, + { + "epoch": 0.8057202650854551, + "grad_norm": 0.8851569890975952, + "learning_rate": 2.2132373354420833e-06, + "loss": 1.7529, + "mean_token_accuracy": 0.5874816656112671, + "step": 2310 + }, + { + "epoch": 0.807464248343216, + "grad_norm": 0.9599546194076538, + "learning_rate": 2.1751819565921774e-06, + "loss": 1.6785, + "mean_token_accuracy": 0.6038734197616578, + "step": 2315 + }, + { + "epoch": 0.8092082316009767, + "grad_norm": 0.948011577129364, + "learning_rate": 2.137416627994814e-06, + "loss": 1.7415, + "mean_token_accuracy": 0.5893267393112183, + "step": 2320 + }, + { + "epoch": 0.8109522148587374, + "grad_norm": 0.9201554656028748, + "learning_rate": 2.0999427495347035e-06, + "loss": 1.7479, + "mean_token_accuracy": 0.5879765391349793, + "step": 2325 + }, + { + "epoch": 0.8126961981164981, + "grad_norm": 0.9085831642150879, + "learning_rate": 2.0627617102930753e-06, + "loss": 1.7508, + "mean_token_accuracy": 0.5871945261955261, + "step": 2330 + }, + { + "epoch": 0.8144401813742588, + "grad_norm": 0.8818920254707336, + "learning_rate": 2.02587488849621e-06, + "loss": 1.7743, + "mean_token_accuracy": 0.5828140258789063, + "step": 2335 + }, + { + "epoch": 0.8161841646320195, + "grad_norm": 0.8753697872161865, + "learning_rate": 1.989283651464329e-06, + "loss": 1.7635, + "mean_token_accuracy": 0.5849523544311523, + "step": 2340 + }, + { + "epoch": 0.8179281478897803, + "grad_norm": 0.8756415247917175, + "learning_rate": 1.952989355560929e-06, + "loss": 1.7443, + "mean_token_accuracy": 0.5903653383255005, + "step": 2345 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.8336009383201599, + "learning_rate": 1.9169933461424928e-06, + "loss": 1.7401, + "mean_token_accuracy": 0.5914039611816406, + "step": 2350 + }, + { + "epoch": 0.8214161144053017, + "grad_norm": 0.8844002485275269, + "learning_rate": 1.8812969575086272e-06, + "loss": 1.7482, + "mean_token_accuracy": 0.5864552736282349, + "step": 2355 + }, + { + "epoch": 0.8231600976630624, + "grad_norm": 0.9012787342071533, + "learning_rate": 1.8459015128525937e-06, + "loss": 1.7409, + "mean_token_accuracy": 0.5908663272857666, + "step": 2360 + }, + { + "epoch": 0.8249040809208231, + "grad_norm": 0.7956106662750244, + "learning_rate": 1.8108083242122764e-06, + "loss": 1.7255, + "mean_token_accuracy": 0.5906525015830993, + "step": 2365 + }, + { + "epoch": 0.8266480641785838, + "grad_norm": 0.8740471005439758, + "learning_rate": 1.7760186924215239e-06, + "loss": 1.7055, + "mean_token_accuracy": 0.5988453030586243, + "step": 2370 + }, + { + "epoch": 0.8283920474363446, + "grad_norm": 0.9480302929878235, + "learning_rate": 1.7415339070619586e-06, + "loss": 1.7627, + "mean_token_accuracy": 0.5840114951133728, + "step": 2375 + }, + { + "epoch": 0.8301360306941054, + "grad_norm": 0.8435900211334229, + "learning_rate": 1.7073552464151465e-06, + "loss": 1.7349, + "mean_token_accuracy": 0.5891129016876221, + "step": 2380 + }, + { + "epoch": 0.8318800139518661, + "grad_norm": 0.9360732436180115, + "learning_rate": 1.6734839774152322e-06, + "loss": 1.7482, + "mean_token_accuracy": 0.588336992263794, + "step": 2385 + }, + { + "epoch": 0.8336239972096268, + "grad_norm": 0.823639452457428, + "learning_rate": 1.6399213556019732e-06, + "loss": 1.728, + "mean_token_accuracy": 0.5900171041488648, + "step": 2390 + }, + { + "epoch": 0.8353679804673875, + "grad_norm": 0.8690071105957031, + "learning_rate": 1.6066686250741904e-06, + "loss": 1.7024, + "mean_token_accuracy": 0.5993401646614075, + "step": 2395 + }, + { + "epoch": 0.8371119637251483, + "grad_norm": 0.8038567900657654, + "learning_rate": 1.573727018443667e-06, + "loss": 1.7292, + "mean_token_accuracy": 0.5914956092834472, + "step": 2400 + }, + { + "epoch": 0.8371119637251483, + "eval_loss": 1.6601351499557495, + "eval_mean_token_accuracy": 0.6055771587551504, + "eval_runtime": 2.2875, + "eval_samples_per_second": 239.995, + "eval_steps_per_second": 60.327, + "step": 2400 + }, + { + "epoch": 0.838855946982909, + "grad_norm": 0.8841709494590759, + "learning_rate": 1.5410977567894403e-06, + "loss": 1.7341, + "mean_token_accuracy": 0.5907196998596191, + "step": 2405 + }, + { + "epoch": 0.8405999302406697, + "grad_norm": 0.8232508897781372, + "learning_rate": 1.5087820496125595e-06, + "loss": 1.7411, + "mean_token_accuracy": 0.5903592348098755, + "step": 2410 + }, + { + "epoch": 0.8423439134984304, + "grad_norm": 0.8263944983482361, + "learning_rate": 1.4767810947912275e-06, + "loss": 1.7275, + "mean_token_accuracy": 0.5903470039367675, + "step": 2415 + }, + { + "epoch": 0.8440878967561911, + "grad_norm": 0.8025116920471191, + "learning_rate": 1.4450960785364244e-06, + "loss": 1.7217, + "mean_token_accuracy": 0.5952101707458496, + "step": 2420 + }, + { + "epoch": 0.8458318800139518, + "grad_norm": 0.9063084125518799, + "learning_rate": 1.4137281753479092e-06, + "loss": 1.7581, + "mean_token_accuracy": 0.5873411655426025, + "step": 2425 + }, + { + "epoch": 0.8475758632717126, + "grad_norm": 0.8529554605484009, + "learning_rate": 1.3826785479707128e-06, + "loss": 1.6905, + "mean_token_accuracy": 0.6020161390304566, + "step": 2430 + }, + { + "epoch": 0.8493198465294733, + "grad_norm": 0.8058586120605469, + "learning_rate": 1.3519483473520124e-06, + "loss": 1.708, + "mean_token_accuracy": 0.5949108004570007, + "step": 2435 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.8389970660209656, + "learning_rate": 1.3215387125984813e-06, + "loss": 1.7444, + "mean_token_accuracy": 0.5873289108276367, + "step": 2440 + }, + { + "epoch": 0.8528078130449948, + "grad_norm": 0.8207646608352661, + "learning_rate": 1.2914507709340596e-06, + "loss": 1.7109, + "mean_token_accuracy": 0.6009653210639954, + "step": 2445 + }, + { + "epoch": 0.8545517963027555, + "grad_norm": 0.8824375867843628, + "learning_rate": 1.2616856376581766e-06, + "loss": 1.7679, + "mean_token_accuracy": 0.5860520601272583, + "step": 2450 + }, + { + "epoch": 0.8562957795605162, + "grad_norm": 0.9311790466308594, + "learning_rate": 1.2322444161044e-06, + "loss": 1.7405, + "mean_token_accuracy": 0.5889479517936707, + "step": 2455 + }, + { + "epoch": 0.858039762818277, + "grad_norm": 0.9297588467597961, + "learning_rate": 1.2031281975995467e-06, + "loss": 1.7474, + "mean_token_accuracy": 0.5883553385734558, + "step": 2460 + }, + { + "epoch": 0.8597837460760377, + "grad_norm": 0.8158808350563049, + "learning_rate": 1.1743380614232213e-06, + "loss": 1.7406, + "mean_token_accuracy": 0.5882514595985413, + "step": 2465 + }, + { + "epoch": 0.8615277293337984, + "grad_norm": 0.9283084273338318, + "learning_rate": 1.1458750747678105e-06, + "loss": 1.6958, + "mean_token_accuracy": 0.5967864155769348, + "step": 2470 + }, + { + "epoch": 0.8632717125915591, + "grad_norm": 0.8277420997619629, + "learning_rate": 1.1177402926989345e-06, + "loss": 1.7215, + "mean_token_accuracy": 0.5943853735923768, + "step": 2475 + }, + { + "epoch": 0.8650156958493198, + "grad_norm": 0.8195232152938843, + "learning_rate": 1.0899347581163222e-06, + "loss": 1.7171, + "mean_token_accuracy": 0.5963037490844727, + "step": 2480 + }, + { + "epoch": 0.8667596791070806, + "grad_norm": 0.822303056716919, + "learning_rate": 1.0624595017151685e-06, + "loss": 1.695, + "mean_token_accuracy": 0.6015273690223694, + "step": 2485 + }, + { + "epoch": 0.8685036623648413, + "grad_norm": 0.8165375590324402, + "learning_rate": 1.0353155419479122e-06, + "loss": 1.7527, + "mean_token_accuracy": 0.5851356267929078, + "step": 2490 + }, + { + "epoch": 0.870247645622602, + "grad_norm": 0.9221644997596741, + "learning_rate": 1.0085038849865025e-06, + "loss": 1.7328, + "mean_token_accuracy": 0.5928335785865784, + "step": 2495 + }, + { + "epoch": 0.8719916288803627, + "grad_norm": 0.8893325924873352, + "learning_rate": 9.820255246850853e-07, + "loss": 1.7685, + "mean_token_accuracy": 0.5828812479972839, + "step": 2500 + }, + { + "epoch": 0.8719916288803627, + "eval_loss": 1.6591064929962158, + "eval_mean_token_accuracy": 0.60604820864788, + "eval_runtime": 2.2827, + "eval_samples_per_second": 240.501, + "eval_steps_per_second": 60.454, + "step": 2500 + }, + { + "epoch": 0.8737356121381235, + "grad_norm": 0.848588228225708, + "learning_rate": 9.55881442543174e-07, + "loss": 1.7503, + "mean_token_accuracy": 0.5898521542549133, + "step": 2505 + }, + { + "epoch": 0.8754795953958842, + "grad_norm": 0.824779212474823, + "learning_rate": 9.30072607669259e-07, + "loss": 1.6952, + "mean_token_accuracy": 0.5995356798171997, + "step": 2510 + }, + { + "epoch": 0.877223578653645, + "grad_norm": 0.9452983736991882, + "learning_rate": 9.045999767448988e-07, + "loss": 1.7376, + "mean_token_accuracy": 0.5895466685295105, + "step": 2515 + }, + { + "epoch": 0.8789675619114057, + "grad_norm": 0.8861690759658813, + "learning_rate": 8.794644939892361e-07, + "loss": 1.7512, + "mean_token_accuracy": 0.5881637215614319, + "step": 2520 + }, + { + "epoch": 0.8807115451691664, + "grad_norm": 0.7995678782463074, + "learning_rate": 8.546670911240196e-07, + "loss": 1.7207, + "mean_token_accuracy": 0.5995723366737366, + "step": 2525 + }, + { + "epoch": 0.8824555284269271, + "grad_norm": 0.9331717491149902, + "learning_rate": 8.302086873390536e-07, + "loss": 1.7391, + "mean_token_accuracy": 0.5895955681800842, + "step": 2530 + }, + { + "epoch": 0.8841995116846878, + "grad_norm": 0.9418917298316956, + "learning_rate": 8.060901892581241e-07, + "loss": 1.766, + "mean_token_accuracy": 0.5847507357597351, + "step": 2535 + }, + { + "epoch": 0.8859434949424485, + "grad_norm": 0.9470700621604919, + "learning_rate": 7.82312490905407e-07, + "loss": 1.7325, + "mean_token_accuracy": 0.592769980430603, + "step": 2540 + }, + { + "epoch": 0.8876874782002093, + "grad_norm": 0.8495118021965027, + "learning_rate": 7.588764736723086e-07, + "loss": 1.7386, + "mean_token_accuracy": 0.5916666746139526, + "step": 2545 + }, + { + "epoch": 0.88943146145797, + "grad_norm": 0.8816931843757629, + "learning_rate": 7.357830062848114e-07, + "loss": 1.7324, + "mean_token_accuracy": 0.5908174395561219, + "step": 2550 + }, + { + "epoch": 0.8911754447157307, + "grad_norm": 0.8850387334823608, + "learning_rate": 7.130329447712581e-07, + "loss": 1.7421, + "mean_token_accuracy": 0.5845735549926758, + "step": 2555 + }, + { + "epoch": 0.8929194279734914, + "grad_norm": 0.9255213737487793, + "learning_rate": 6.906271324306335e-07, + "loss": 1.7584, + "mean_token_accuracy": 0.5835899114608765, + "step": 2560 + }, + { + "epoch": 0.8946634112312521, + "grad_norm": 0.8412609696388245, + "learning_rate": 6.685663998012926e-07, + "loss": 1.7253, + "mean_token_accuracy": 0.593218469619751, + "step": 2565 + }, + { + "epoch": 0.8964073944890129, + "grad_norm": 0.7967264652252197, + "learning_rate": 6.468515646301865e-07, + "loss": 1.6948, + "mean_token_accuracy": 0.599981677532196, + "step": 2570 + }, + { + "epoch": 0.8981513777467737, + "grad_norm": 0.8431442379951477, + "learning_rate": 6.254834318425363e-07, + "loss": 1.7389, + "mean_token_accuracy": 0.587487769126892, + "step": 2575 + }, + { + "epoch": 0.8998953610045344, + "grad_norm": 0.8867560625076294, + "learning_rate": 6.044627935120107e-07, + "loss": 1.7546, + "mean_token_accuracy": 0.586687433719635, + "step": 2580 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.8555639386177063, + "learning_rate": 5.837904288313545e-07, + "loss": 1.7537, + "mean_token_accuracy": 0.5884286522865295, + "step": 2585 + }, + { + "epoch": 0.9033833275200558, + "grad_norm": 0.8967999815940857, + "learning_rate": 5.634671040835104e-07, + "loss": 1.7668, + "mean_token_accuracy": 0.5837426662445069, + "step": 2590 + }, + { + "epoch": 0.9051273107778165, + "grad_norm": 0.9188566207885742, + "learning_rate": 5.43493572613214e-07, + "loss": 1.7575, + "mean_token_accuracy": 0.5860459446907044, + "step": 2595 + }, + { + "epoch": 0.9068712940355773, + "grad_norm": 0.8510751128196716, + "learning_rate": 5.238705747990669e-07, + "loss": 1.7189, + "mean_token_accuracy": 0.5938905239105224, + "step": 2600 + }, + { + "epoch": 0.9068712940355773, + "eval_loss": 1.6594796180725098, + "eval_mean_token_accuracy": 0.605966748966687, + "eval_runtime": 2.2855, + "eval_samples_per_second": 240.212, + "eval_steps_per_second": 60.381, + "step": 2600 + }, + { + "epoch": 0.908615277293338, + "grad_norm": 0.8285852670669556, + "learning_rate": 5.045988380260935e-07, + "loss": 1.762, + "mean_token_accuracy": 0.5864308476448059, + "step": 2605 + }, + { + "epoch": 0.9103592605510987, + "grad_norm": 0.8446671962738037, + "learning_rate": 4.856790766587815e-07, + "loss": 1.7301, + "mean_token_accuracy": 0.5945686578750611, + "step": 2610 + }, + { + "epoch": 0.9121032438088594, + "grad_norm": 0.8847333788871765, + "learning_rate": 4.6711199201459833e-07, + "loss": 1.7357, + "mean_token_accuracy": 0.5930405378341674, + "step": 2615 + }, + { + "epoch": 0.9138472270666201, + "grad_norm": 0.8858950138092041, + "learning_rate": 4.488982723379887e-07, + "loss": 1.7396, + "mean_token_accuracy": 0.5911412477493286, + "step": 2620 + }, + { + "epoch": 0.9155912103243808, + "grad_norm": 0.9765468835830688, + "learning_rate": 4.3103859277488056e-07, + "loss": 1.7825, + "mean_token_accuracy": 0.5797287344932556, + "step": 2625 + }, + { + "epoch": 0.9173351935821417, + "grad_norm": 0.872566282749176, + "learning_rate": 4.1353361534763657e-07, + "loss": 1.7548, + "mean_token_accuracy": 0.5853922247886658, + "step": 2630 + }, + { + "epoch": 0.9190791768399024, + "grad_norm": 0.8923954367637634, + "learning_rate": 3.963839889305343e-07, + "loss": 1.7308, + "mean_token_accuracy": 0.5930046439170837, + "step": 2635 + }, + { + "epoch": 0.9208231600976631, + "grad_norm": 0.9071829915046692, + "learning_rate": 3.7959034922569804e-07, + "loss": 1.688, + "mean_token_accuracy": 0.6026087522506713, + "step": 2640 + }, + { + "epoch": 0.9225671433554238, + "grad_norm": 0.8722803592681885, + "learning_rate": 3.631533187395453e-07, + "loss": 1.741, + "mean_token_accuracy": 0.590487539768219, + "step": 2645 + }, + { + "epoch": 0.9243111266131845, + "grad_norm": 0.8746562600135803, + "learning_rate": 3.470735067597053e-07, + "loss": 1.6987, + "mean_token_accuracy": 0.6006781339645386, + "step": 2650 + }, + { + "epoch": 0.9260551098709452, + "grad_norm": 0.8060262203216553, + "learning_rate": 3.313515093324393e-07, + "loss": 1.7085, + "mean_token_accuracy": 0.594287633895874, + "step": 2655 + }, + { + "epoch": 0.927799093128706, + "grad_norm": 0.9321186542510986, + "learning_rate": 3.1598790924053936e-07, + "loss": 1.7868, + "mean_token_accuracy": 0.5794293880462646, + "step": 2660 + }, + { + "epoch": 0.9295430763864667, + "grad_norm": 0.9236606955528259, + "learning_rate": 3.009832759817344e-07, + "loss": 1.7793, + "mean_token_accuracy": 0.5795698881149292, + "step": 2665 + }, + { + "epoch": 0.9312870596442274, + "grad_norm": 0.8090543150901794, + "learning_rate": 2.8633816574757166e-07, + "loss": 1.7138, + "mean_token_accuracy": 0.5982649207115174, + "step": 2670 + }, + { + "epoch": 0.9330310429019881, + "grad_norm": 0.9216322898864746, + "learning_rate": 2.720531214028055e-07, + "loss": 1.723, + "mean_token_accuracy": 0.5922653913497925, + "step": 2675 + }, + { + "epoch": 0.9347750261597488, + "grad_norm": 0.8106808066368103, + "learning_rate": 2.5812867246527207e-07, + "loss": 1.7209, + "mean_token_accuracy": 0.5942265391349792, + "step": 2680 + }, + { + "epoch": 0.9365190094175095, + "grad_norm": 0.9196311235427856, + "learning_rate": 2.445653350862609e-07, + "loss": 1.7327, + "mean_token_accuracy": 0.5905364036560059, + "step": 2685 + }, + { + "epoch": 0.9382629926752704, + "grad_norm": 0.8612604141235352, + "learning_rate": 2.3136361203138668e-07, + "loss": 1.7133, + "mean_token_accuracy": 0.59539954662323, + "step": 2690 + }, + { + "epoch": 0.9400069759330311, + "grad_norm": 0.8981013298034668, + "learning_rate": 2.1852399266194312e-07, + "loss": 1.7595, + "mean_token_accuracy": 0.5843658208847046, + "step": 2695 + }, + { + "epoch": 0.9417509591907918, + "grad_norm": 0.9147467613220215, + "learning_rate": 2.0604695291677523e-07, + "loss": 1.7001, + "mean_token_accuracy": 0.5955461859703064, + "step": 2700 + }, + { + "epoch": 0.9417509591907918, + "eval_loss": 1.6584773063659668, + "eval_mean_token_accuracy": 0.6060039370820143, + "eval_runtime": 2.2919, + "eval_samples_per_second": 239.541, + "eval_steps_per_second": 60.213, + "step": 2700 + }, + { + "epoch": 0.9434949424485525, + "grad_norm": 0.8913834691047668, + "learning_rate": 1.9393295529462674e-07, + "loss": 1.7178, + "mean_token_accuracy": 0.594287633895874, + "step": 2705 + }, + { + "epoch": 0.9452389257063132, + "grad_norm": 0.8741482496261597, + "learning_rate": 1.8218244883700386e-07, + "loss": 1.7434, + "mean_token_accuracy": 0.5877382636070252, + "step": 2710 + }, + { + "epoch": 0.946982908964074, + "grad_norm": 0.8018344044685364, + "learning_rate": 1.7079586911152413e-07, + "loss": 1.7282, + "mean_token_accuracy": 0.5907319188117981, + "step": 2715 + }, + { + "epoch": 0.9487268922218347, + "grad_norm": 0.8863992094993591, + "learning_rate": 1.597736381957782e-07, + "loss": 1.7402, + "mean_token_accuracy": 0.5870540142059326, + "step": 2720 + }, + { + "epoch": 0.9504708754795954, + "grad_norm": 0.8158055543899536, + "learning_rate": 1.4911616466167345e-07, + "loss": 1.7224, + "mean_token_accuracy": 0.5916422247886658, + "step": 2725 + }, + { + "epoch": 0.9522148587373561, + "grad_norm": 0.910216748714447, + "learning_rate": 1.3882384356030066e-07, + "loss": 1.7494, + "mean_token_accuracy": 0.5889174103736877, + "step": 2730 + }, + { + "epoch": 0.9539588419951168, + "grad_norm": 0.8515923023223877, + "learning_rate": 1.2889705640728445e-07, + "loss": 1.7571, + "mean_token_accuracy": 0.5835349559783936, + "step": 2735 + }, + { + "epoch": 0.9557028252528775, + "grad_norm": 0.9505128264427185, + "learning_rate": 1.1933617116863805e-07, + "loss": 1.7456, + "mean_token_accuracy": 0.587506103515625, + "step": 2740 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 0.8739469647407532, + "learning_rate": 1.1014154224713302e-07, + "loss": 1.6999, + "mean_token_accuracy": 0.5958394527435302, + "step": 2745 + }, + { + "epoch": 0.959190791768399, + "grad_norm": 0.8585278391838074, + "learning_rate": 1.0131351046915094e-07, + "loss": 1.7381, + "mean_token_accuracy": 0.591556704044342, + "step": 2750 + }, + { + "epoch": 0.9609347750261598, + "grad_norm": 0.906234860420227, + "learning_rate": 9.285240307206123e-08, + "loss": 1.769, + "mean_token_accuracy": 0.5836265802383422, + "step": 2755 + }, + { + "epoch": 0.9626787582839205, + "grad_norm": 0.8195945620536804, + "learning_rate": 8.475853369207753e-08, + "loss": 1.7143, + "mean_token_accuracy": 0.5954728722572327, + "step": 2760 + }, + { + "epoch": 0.9644227415416812, + "grad_norm": 1.0005578994750977, + "learning_rate": 7.703220235264708e-08, + "loss": 1.7157, + "mean_token_accuracy": 0.593475079536438, + "step": 2765 + }, + { + "epoch": 0.9661667247994419, + "grad_norm": 0.8547971844673157, + "learning_rate": 6.967369545331615e-08, + "loss": 1.7054, + "mean_token_accuracy": 0.5944586992263794, + "step": 2770 + }, + { + "epoch": 0.9679107080572027, + "grad_norm": 0.9394063353538513, + "learning_rate": 6.26832857591242e-08, + "loss": 1.7512, + "mean_token_accuracy": 0.5890349984169007, + "step": 2775 + }, + { + "epoch": 0.9696546913149634, + "grad_norm": 0.9623610377311707, + "learning_rate": 5.606123239048522e-08, + "loss": 1.7152, + "mean_token_accuracy": 0.5931390523910522, + "step": 2780 + }, + { + "epoch": 0.9713986745727241, + "grad_norm": 0.893081784248352, + "learning_rate": 4.9807780813586615e-08, + "loss": 1.744, + "mean_token_accuracy": 0.5872067332267761, + "step": 2785 + }, + { + "epoch": 0.9731426578304848, + "grad_norm": 0.8337393999099731, + "learning_rate": 4.392316283128861e-08, + "loss": 1.7563, + "mean_token_accuracy": 0.5852150678634643, + "step": 2790 + }, + { + "epoch": 0.9748866410882455, + "grad_norm": 0.8514472842216492, + "learning_rate": 3.840759657453452e-08, + "loss": 1.7353, + "mean_token_accuracy": 0.5920976281166077, + "step": 2795 + }, + { + "epoch": 0.9766306243460062, + "grad_norm": 0.912960410118103, + "learning_rate": 3.326128649426053e-08, + "loss": 1.7796, + "mean_token_accuracy": 0.5818365216255188, + "step": 2800 + }, + { + "epoch": 0.9766306243460062, + "eval_loss": 1.6588006019592285, + "eval_mean_token_accuracy": 0.6062022754247638, + "eval_runtime": 2.2914, + "eval_samples_per_second": 239.594, + "eval_steps_per_second": 60.226, + "step": 2800 + }, + { + "epoch": 0.978374607603767, + "grad_norm": 0.8588950037956238, + "learning_rate": 2.8484423353822842e-08, + "loss": 1.7226, + "mean_token_accuracy": 0.591306209564209, + "step": 2805 + }, + { + "epoch": 0.9801185908615277, + "grad_norm": 0.833301842212677, + "learning_rate": 2.4077184221920068e-08, + "loss": 1.7007, + "mean_token_accuracy": 0.5998289346694946, + "step": 2810 + }, + { + "epoch": 0.9818625741192885, + "grad_norm": 0.8208726048469543, + "learning_rate": 2.003973246603508e-08, + "loss": 1.7621, + "mean_token_accuracy": 0.5872918248176575, + "step": 2815 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.8127596378326416, + "learning_rate": 1.637221774637765e-08, + "loss": 1.742, + "mean_token_accuracy": 0.5922959446907043, + "step": 2820 + }, + { + "epoch": 0.9853505406348099, + "grad_norm": 0.8162915706634521, + "learning_rate": 1.3074776010334466e-08, + "loss": 1.7298, + "mean_token_accuracy": 0.590731930732727, + "step": 2825 + }, + { + "epoch": 0.9870945238925707, + "grad_norm": 0.8142114281654358, + "learning_rate": 1.0147529487432028e-08, + "loss": 1.7448, + "mean_token_accuracy": 0.5880193114280701, + "step": 2830 + }, + { + "epoch": 0.9888385071503314, + "grad_norm": 0.8614972233772278, + "learning_rate": 7.590586684805834e-09, + "loss": 1.7487, + "mean_token_accuracy": 0.5880315184593201, + "step": 2835 + }, + { + "epoch": 0.9905824904080921, + "grad_norm": 0.8733019232749939, + "learning_rate": 5.4040423831802635e-09, + "loss": 1.7423, + "mean_token_accuracy": 0.5866141200065613, + "step": 2840 + }, + { + "epoch": 0.9923264736658528, + "grad_norm": 0.7841631174087524, + "learning_rate": 3.587977633348061e-09, + "loss": 1.7832, + "mean_token_accuracy": 0.5818609476089478, + "step": 2845 + }, + { + "epoch": 0.9940704569236135, + "grad_norm": 0.8357582688331604, + "learning_rate": 2.1424597531749524e-09, + "loss": 1.7155, + "mean_token_accuracy": 0.5949352383613586, + "step": 2850 + }, + { + "epoch": 0.9958144401813742, + "grad_norm": 0.7859371304512024, + "learning_rate": 1.0675423250994244e-09, + "loss": 1.7464, + "mean_token_accuracy": 0.5891129016876221, + "step": 2855 + }, + { + "epoch": 0.997558423439135, + "grad_norm": 0.8230715990066528, + "learning_rate": 3.6326519414431327e-10, + "loss": 1.7622, + "mean_token_accuracy": 0.5866019129753113, + "step": 2860 + }, + { + "epoch": 0.9993024066968957, + "grad_norm": 0.7908604145050049, + "learning_rate": 2.965446644798142e-11, + "loss": 1.7485, + "mean_token_accuracy": 0.590188181400299, + "step": 2865 + }, + { + "epoch": 1.0, + "mean_token_accuracy": 0.5885422825813293, + "step": 2867, + "total_flos": 4.794029428021658e+16, + "train_loss": 1.842788679657142, + "train_runtime": 1095.7674, + "train_samples_per_second": 83.719, + "train_steps_per_second": 2.616 + } + ], + "logging_steps": 5, + "max_steps": 2867, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.794029428021658e+16, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}