{ "best_global_step": 9705, "best_metric": 0.5019899606704712, "best_model_checkpoint": "saves/p-tuning/llama-3-8b-instruct/train_stsb_1754652140/checkpoint-9705", "epoch": 10.0, "eval_steps": 647, "global_step": 12940, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038639876352395673, "grad_norm": 108.71379852294922, "learning_rate": 1.5455950540958268e-07, "loss": 6.562, "num_input_tokens_seen": 1728, "step": 5 }, { "epoch": 0.0077279752704791345, "grad_norm": 62.509613037109375, "learning_rate": 3.4775888717156104e-07, "loss": 7.1966, "num_input_tokens_seen": 3456, "step": 10 }, { "epoch": 0.011591962905718702, "grad_norm": 54.61553192138672, "learning_rate": 5.409582689335394e-07, "loss": 7.0239, "num_input_tokens_seen": 4992, "step": 15 }, { "epoch": 0.015455950540958269, "grad_norm": 54.90473937988281, "learning_rate": 7.341576506955178e-07, "loss": 6.9169, "num_input_tokens_seen": 6624, "step": 20 }, { "epoch": 0.019319938176197836, "grad_norm": 59.44583511352539, "learning_rate": 9.273570324574961e-07, "loss": 6.9838, "num_input_tokens_seen": 8384, "step": 25 }, { "epoch": 0.023183925811437404, "grad_norm": 47.32235336303711, "learning_rate": 1.1205564142194745e-06, "loss": 6.3589, "num_input_tokens_seen": 9856, "step": 30 }, { "epoch": 0.02704791344667697, "grad_norm": 52.24999237060547, "learning_rate": 1.313755795981453e-06, "loss": 6.5799, "num_input_tokens_seen": 11456, "step": 35 }, { "epoch": 0.030911901081916538, "grad_norm": 69.10556030273438, "learning_rate": 1.5069551777434314e-06, "loss": 5.7474, "num_input_tokens_seen": 13312, "step": 40 }, { "epoch": 0.0347758887171561, "grad_norm": 76.39933776855469, "learning_rate": 1.7001545595054098e-06, "loss": 5.9067, "num_input_tokens_seen": 14880, "step": 45 }, { "epoch": 0.03863987635239567, "grad_norm": 45.793357849121094, "learning_rate": 1.8933539412673882e-06, "loss": 5.4043, "num_input_tokens_seen": 16512, "step": 50 }, { "epoch": 0.04250386398763524, "grad_norm": 98.06501770019531, "learning_rate": 2.0865533230293665e-06, "loss": 5.6323, "num_input_tokens_seen": 17952, "step": 55 }, { "epoch": 0.04636785162287481, "grad_norm": 68.69889068603516, "learning_rate": 2.2797527047913445e-06, "loss": 4.6772, "num_input_tokens_seen": 19488, "step": 60 }, { "epoch": 0.05023183925811438, "grad_norm": 79.63642120361328, "learning_rate": 2.472952086553323e-06, "loss": 4.8897, "num_input_tokens_seen": 21184, "step": 65 }, { "epoch": 0.05409582689335394, "grad_norm": 78.30905151367188, "learning_rate": 2.6661514683153014e-06, "loss": 5.0602, "num_input_tokens_seen": 22976, "step": 70 }, { "epoch": 0.05795981452859351, "grad_norm": 62.087310791015625, "learning_rate": 2.85935085007728e-06, "loss": 4.904, "num_input_tokens_seen": 24672, "step": 75 }, { "epoch": 0.061823802163833076, "grad_norm": 31.626808166503906, "learning_rate": 3.0525502318392582e-06, "loss": 4.7113, "num_input_tokens_seen": 26496, "step": 80 }, { "epoch": 0.06568778979907264, "grad_norm": 52.603118896484375, "learning_rate": 3.2457496136012367e-06, "loss": 4.9226, "num_input_tokens_seen": 28160, "step": 85 }, { "epoch": 0.0695517774343122, "grad_norm": 52.18206787109375, "learning_rate": 3.438948995363215e-06, "loss": 4.5143, "num_input_tokens_seen": 29984, "step": 90 }, { "epoch": 0.07341576506955177, "grad_norm": 98.37920379638672, "learning_rate": 3.6321483771251936e-06, "loss": 4.5047, "num_input_tokens_seen": 31712, "step": 95 }, { "epoch": 0.07727975270479134, "grad_norm": 67.4117660522461, "learning_rate": 3.8253477588871716e-06, "loss": 4.5475, "num_input_tokens_seen": 33152, "step": 100 }, { "epoch": 0.08114374034003091, "grad_norm": 100.19391632080078, "learning_rate": 4.01854714064915e-06, "loss": 3.8478, "num_input_tokens_seen": 34816, "step": 105 }, { "epoch": 0.08500772797527048, "grad_norm": 47.59382247924805, "learning_rate": 4.2117465224111284e-06, "loss": 4.0702, "num_input_tokens_seen": 36416, "step": 110 }, { "epoch": 0.08887171561051005, "grad_norm": 75.25868225097656, "learning_rate": 4.404945904173107e-06, "loss": 3.1519, "num_input_tokens_seen": 38304, "step": 115 }, { "epoch": 0.09273570324574962, "grad_norm": 35.79936981201172, "learning_rate": 4.598145285935085e-06, "loss": 3.1418, "num_input_tokens_seen": 39744, "step": 120 }, { "epoch": 0.09659969088098919, "grad_norm": 66.84203338623047, "learning_rate": 4.791344667697063e-06, "loss": 3.3736, "num_input_tokens_seen": 41504, "step": 125 }, { "epoch": 0.10046367851622875, "grad_norm": 39.79071807861328, "learning_rate": 4.984544049459041e-06, "loss": 3.0209, "num_input_tokens_seen": 42944, "step": 130 }, { "epoch": 0.10432766615146831, "grad_norm": 33.28805923461914, "learning_rate": 5.17774343122102e-06, "loss": 2.9372, "num_input_tokens_seen": 44800, "step": 135 }, { "epoch": 0.10819165378670788, "grad_norm": 147.2544708251953, "learning_rate": 5.370942812982998e-06, "loss": 3.0455, "num_input_tokens_seen": 46528, "step": 140 }, { "epoch": 0.11205564142194745, "grad_norm": 82.80937194824219, "learning_rate": 5.564142194744977e-06, "loss": 2.4544, "num_input_tokens_seen": 48128, "step": 145 }, { "epoch": 0.11591962905718702, "grad_norm": 76.22148895263672, "learning_rate": 5.757341576506955e-06, "loss": 2.7769, "num_input_tokens_seen": 49952, "step": 150 }, { "epoch": 0.11978361669242658, "grad_norm": 41.61345291137695, "learning_rate": 5.9505409582689335e-06, "loss": 2.2783, "num_input_tokens_seen": 51648, "step": 155 }, { "epoch": 0.12364760432766615, "grad_norm": 54.1597900390625, "learning_rate": 6.143740340030912e-06, "loss": 2.4857, "num_input_tokens_seen": 53152, "step": 160 }, { "epoch": 0.1275115919629057, "grad_norm": 48.925392150878906, "learning_rate": 6.3369397217928904e-06, "loss": 1.6151, "num_input_tokens_seen": 54848, "step": 165 }, { "epoch": 0.13137557959814528, "grad_norm": 39.552711486816406, "learning_rate": 6.530139103554869e-06, "loss": 1.3151, "num_input_tokens_seen": 56224, "step": 170 }, { "epoch": 0.13523956723338484, "grad_norm": 36.534027099609375, "learning_rate": 6.723338485316847e-06, "loss": 1.8837, "num_input_tokens_seen": 57888, "step": 175 }, { "epoch": 0.1391035548686244, "grad_norm": 73.38136291503906, "learning_rate": 6.916537867078826e-06, "loss": 1.8041, "num_input_tokens_seen": 59776, "step": 180 }, { "epoch": 0.14296754250386398, "grad_norm": 74.02698516845703, "learning_rate": 7.109737248840804e-06, "loss": 1.3887, "num_input_tokens_seen": 61472, "step": 185 }, { "epoch": 0.14683153013910355, "grad_norm": 118.75630950927734, "learning_rate": 7.302936630602783e-06, "loss": 1.4309, "num_input_tokens_seen": 63232, "step": 190 }, { "epoch": 0.15069551777434312, "grad_norm": 142.51296997070312, "learning_rate": 7.496136012364761e-06, "loss": 1.7527, "num_input_tokens_seen": 64960, "step": 195 }, { "epoch": 0.1545595054095827, "grad_norm": 53.628116607666016, "learning_rate": 7.689335394126738e-06, "loss": 1.4131, "num_input_tokens_seen": 66528, "step": 200 }, { "epoch": 0.15842349304482226, "grad_norm": 29.120288848876953, "learning_rate": 7.882534775888716e-06, "loss": 1.7749, "num_input_tokens_seen": 68416, "step": 205 }, { "epoch": 0.16228748068006182, "grad_norm": 94.55474090576172, "learning_rate": 8.075734157650695e-06, "loss": 1.383, "num_input_tokens_seen": 70176, "step": 210 }, { "epoch": 0.1661514683153014, "grad_norm": 46.191646575927734, "learning_rate": 8.268933539412673e-06, "loss": 1.1297, "num_input_tokens_seen": 71648, "step": 215 }, { "epoch": 0.17001545595054096, "grad_norm": 26.59138298034668, "learning_rate": 8.462132921174652e-06, "loss": 1.1835, "num_input_tokens_seen": 73216, "step": 220 }, { "epoch": 0.17387944358578053, "grad_norm": 64.0928726196289, "learning_rate": 8.65533230293663e-06, "loss": 1.2361, "num_input_tokens_seen": 74912, "step": 225 }, { "epoch": 0.1777434312210201, "grad_norm": 36.164127349853516, "learning_rate": 8.848531684698608e-06, "loss": 1.2103, "num_input_tokens_seen": 76480, "step": 230 }, { "epoch": 0.18160741885625967, "grad_norm": 71.60681915283203, "learning_rate": 9.041731066460587e-06, "loss": 1.0928, "num_input_tokens_seen": 78016, "step": 235 }, { "epoch": 0.18547140649149924, "grad_norm": 56.944400787353516, "learning_rate": 9.234930448222565e-06, "loss": 1.1127, "num_input_tokens_seen": 79424, "step": 240 }, { "epoch": 0.1893353941267388, "grad_norm": 35.20863723754883, "learning_rate": 9.428129829984544e-06, "loss": 1.4106, "num_input_tokens_seen": 81440, "step": 245 }, { "epoch": 0.19319938176197837, "grad_norm": 57.1212043762207, "learning_rate": 9.621329211746522e-06, "loss": 0.9569, "num_input_tokens_seen": 82976, "step": 250 }, { "epoch": 0.19706336939721794, "grad_norm": 62.85688018798828, "learning_rate": 9.8145285935085e-06, "loss": 0.9041, "num_input_tokens_seen": 84576, "step": 255 }, { "epoch": 0.2009273570324575, "grad_norm": 23.02287483215332, "learning_rate": 1.0007727975270479e-05, "loss": 1.4116, "num_input_tokens_seen": 86368, "step": 260 }, { "epoch": 0.20479134466769705, "grad_norm": 346.6825256347656, "learning_rate": 1.0200927357032458e-05, "loss": 1.2984, "num_input_tokens_seen": 88064, "step": 265 }, { "epoch": 0.20865533230293662, "grad_norm": 33.03400802612305, "learning_rate": 1.0394126738794436e-05, "loss": 0.9413, "num_input_tokens_seen": 89664, "step": 270 }, { "epoch": 0.2125193199381762, "grad_norm": 283.82110595703125, "learning_rate": 1.0587326120556414e-05, "loss": 1.2537, "num_input_tokens_seen": 91200, "step": 275 }, { "epoch": 0.21638330757341576, "grad_norm": 42.77622604370117, "learning_rate": 1.0780525502318393e-05, "loss": 1.2736, "num_input_tokens_seen": 92576, "step": 280 }, { "epoch": 0.22024729520865532, "grad_norm": 91.24327087402344, "learning_rate": 1.0973724884080371e-05, "loss": 1.2562, "num_input_tokens_seen": 94336, "step": 285 }, { "epoch": 0.2241112828438949, "grad_norm": 21.53784942626953, "learning_rate": 1.116692426584235e-05, "loss": 1.559, "num_input_tokens_seen": 96256, "step": 290 }, { "epoch": 0.22797527047913446, "grad_norm": 47.53639221191406, "learning_rate": 1.1360123647604328e-05, "loss": 1.0915, "num_input_tokens_seen": 97920, "step": 295 }, { "epoch": 0.23183925811437403, "grad_norm": 30.942581176757812, "learning_rate": 1.1553323029366307e-05, "loss": 1.4491, "num_input_tokens_seen": 99712, "step": 300 }, { "epoch": 0.2357032457496136, "grad_norm": 28.2766056060791, "learning_rate": 1.1746522411128285e-05, "loss": 1.1427, "num_input_tokens_seen": 101280, "step": 305 }, { "epoch": 0.23956723338485317, "grad_norm": 48.791019439697266, "learning_rate": 1.1939721792890263e-05, "loss": 1.3584, "num_input_tokens_seen": 102944, "step": 310 }, { "epoch": 0.24343122102009274, "grad_norm": 34.396915435791016, "learning_rate": 1.2132921174652242e-05, "loss": 0.8836, "num_input_tokens_seen": 104896, "step": 315 }, { "epoch": 0.2472952086553323, "grad_norm": 35.14845657348633, "learning_rate": 1.232612055641422e-05, "loss": 0.7939, "num_input_tokens_seen": 106720, "step": 320 }, { "epoch": 0.2511591962905719, "grad_norm": 32.558349609375, "learning_rate": 1.2519319938176199e-05, "loss": 0.7042, "num_input_tokens_seen": 108544, "step": 325 }, { "epoch": 0.2550231839258114, "grad_norm": 31.770999908447266, "learning_rate": 1.2712519319938177e-05, "loss": 1.1622, "num_input_tokens_seen": 110176, "step": 330 }, { "epoch": 0.258887171561051, "grad_norm": 19.95728302001953, "learning_rate": 1.2905718701700156e-05, "loss": 0.8079, "num_input_tokens_seen": 111808, "step": 335 }, { "epoch": 0.26275115919629055, "grad_norm": 40.798580169677734, "learning_rate": 1.3098918083462134e-05, "loss": 0.9864, "num_input_tokens_seen": 113600, "step": 340 }, { "epoch": 0.26661514683153015, "grad_norm": 14.568097114562988, "learning_rate": 1.3292117465224113e-05, "loss": 0.831, "num_input_tokens_seen": 115456, "step": 345 }, { "epoch": 0.2704791344667697, "grad_norm": 37.26884078979492, "learning_rate": 1.3485316846986091e-05, "loss": 0.772, "num_input_tokens_seen": 116992, "step": 350 }, { "epoch": 0.2743431221020093, "grad_norm": 18.524816513061523, "learning_rate": 1.367851622874807e-05, "loss": 0.9897, "num_input_tokens_seen": 118880, "step": 355 }, { "epoch": 0.2782071097372488, "grad_norm": 22.43954849243164, "learning_rate": 1.3871715610510048e-05, "loss": 0.9943, "num_input_tokens_seen": 120416, "step": 360 }, { "epoch": 0.2820710973724884, "grad_norm": 35.5218391418457, "learning_rate": 1.4064914992272025e-05, "loss": 0.9037, "num_input_tokens_seen": 122080, "step": 365 }, { "epoch": 0.28593508500772796, "grad_norm": 19.117769241333008, "learning_rate": 1.4258114374034005e-05, "loss": 0.8019, "num_input_tokens_seen": 124064, "step": 370 }, { "epoch": 0.28979907264296756, "grad_norm": 27.269067764282227, "learning_rate": 1.4451313755795981e-05, "loss": 0.9046, "num_input_tokens_seen": 125600, "step": 375 }, { "epoch": 0.2936630602782071, "grad_norm": 26.24531364440918, "learning_rate": 1.4644513137557962e-05, "loss": 1.0584, "num_input_tokens_seen": 127168, "step": 380 }, { "epoch": 0.2975270479134467, "grad_norm": 16.09661293029785, "learning_rate": 1.4837712519319938e-05, "loss": 0.827, "num_input_tokens_seen": 128928, "step": 385 }, { "epoch": 0.30139103554868624, "grad_norm": 65.89136505126953, "learning_rate": 1.5030911901081918e-05, "loss": 0.7609, "num_input_tokens_seen": 130688, "step": 390 }, { "epoch": 0.30525502318392583, "grad_norm": 23.163131713867188, "learning_rate": 1.5224111282843895e-05, "loss": 0.9276, "num_input_tokens_seen": 132096, "step": 395 }, { "epoch": 0.3091190108191654, "grad_norm": 34.66181564331055, "learning_rate": 1.5417310664605875e-05, "loss": 1.2854, "num_input_tokens_seen": 133664, "step": 400 }, { "epoch": 0.31298299845440497, "grad_norm": 9.269013404846191, "learning_rate": 1.561051004636785e-05, "loss": 0.8838, "num_input_tokens_seen": 135616, "step": 405 }, { "epoch": 0.3168469860896445, "grad_norm": 19.139049530029297, "learning_rate": 1.5803709428129832e-05, "loss": 0.9386, "num_input_tokens_seen": 137248, "step": 410 }, { "epoch": 0.3207109737248841, "grad_norm": 27.688785552978516, "learning_rate": 1.5996908809891807e-05, "loss": 1.0209, "num_input_tokens_seen": 139040, "step": 415 }, { "epoch": 0.32457496136012365, "grad_norm": 17.730220794677734, "learning_rate": 1.619010819165379e-05, "loss": 0.6648, "num_input_tokens_seen": 140864, "step": 420 }, { "epoch": 0.3284389489953632, "grad_norm": 42.47673797607422, "learning_rate": 1.6383307573415764e-05, "loss": 0.6893, "num_input_tokens_seen": 142752, "step": 425 }, { "epoch": 0.3323029366306028, "grad_norm": 14.735602378845215, "learning_rate": 1.6576506955177746e-05, "loss": 0.792, "num_input_tokens_seen": 144576, "step": 430 }, { "epoch": 0.3361669242658423, "grad_norm": 14.783099174499512, "learning_rate": 1.676970633693972e-05, "loss": 0.9006, "num_input_tokens_seen": 146176, "step": 435 }, { "epoch": 0.3400309119010819, "grad_norm": 26.35193634033203, "learning_rate": 1.6962905718701703e-05, "loss": 0.9566, "num_input_tokens_seen": 147840, "step": 440 }, { "epoch": 0.34389489953632146, "grad_norm": 24.10528564453125, "learning_rate": 1.7156105100463678e-05, "loss": 0.8549, "num_input_tokens_seen": 149344, "step": 445 }, { "epoch": 0.34775888717156106, "grad_norm": 16.213115692138672, "learning_rate": 1.734930448222566e-05, "loss": 0.7894, "num_input_tokens_seen": 151040, "step": 450 }, { "epoch": 0.3516228748068006, "grad_norm": 16.681446075439453, "learning_rate": 1.7542503863987635e-05, "loss": 0.8001, "num_input_tokens_seen": 152768, "step": 455 }, { "epoch": 0.3554868624420402, "grad_norm": 34.44270324707031, "learning_rate": 1.7735703245749617e-05, "loss": 0.9346, "num_input_tokens_seen": 154176, "step": 460 }, { "epoch": 0.35935085007727974, "grad_norm": 21.283370971679688, "learning_rate": 1.792890262751159e-05, "loss": 0.8944, "num_input_tokens_seen": 155936, "step": 465 }, { "epoch": 0.36321483771251933, "grad_norm": 29.350069046020508, "learning_rate": 1.812210200927357e-05, "loss": 0.8513, "num_input_tokens_seen": 157568, "step": 470 }, { "epoch": 0.3670788253477589, "grad_norm": 23.542827606201172, "learning_rate": 1.831530139103555e-05, "loss": 0.8327, "num_input_tokens_seen": 159392, "step": 475 }, { "epoch": 0.37094281298299847, "grad_norm": 10.363747596740723, "learning_rate": 1.8508500772797527e-05, "loss": 0.7847, "num_input_tokens_seen": 161024, "step": 480 }, { "epoch": 0.374806800618238, "grad_norm": 22.1647891998291, "learning_rate": 1.8701700154559505e-05, "loss": 0.7509, "num_input_tokens_seen": 162560, "step": 485 }, { "epoch": 0.3786707882534776, "grad_norm": 19.166217803955078, "learning_rate": 1.8894899536321484e-05, "loss": 0.9651, "num_input_tokens_seen": 164224, "step": 490 }, { "epoch": 0.38253477588871715, "grad_norm": 28.60698699951172, "learning_rate": 1.9088098918083462e-05, "loss": 0.8269, "num_input_tokens_seen": 166144, "step": 495 }, { "epoch": 0.38639876352395675, "grad_norm": 40.16697692871094, "learning_rate": 1.928129829984544e-05, "loss": 1.1063, "num_input_tokens_seen": 167968, "step": 500 }, { "epoch": 0.3902627511591963, "grad_norm": 18.697368621826172, "learning_rate": 1.947449768160742e-05, "loss": 0.9268, "num_input_tokens_seen": 169728, "step": 505 }, { "epoch": 0.3941267387944359, "grad_norm": 15.886354446411133, "learning_rate": 1.9667697063369398e-05, "loss": 0.7076, "num_input_tokens_seen": 171232, "step": 510 }, { "epoch": 0.3979907264296754, "grad_norm": 47.13811111450195, "learning_rate": 1.9860896445131376e-05, "loss": 0.8998, "num_input_tokens_seen": 172960, "step": 515 }, { "epoch": 0.401854714064915, "grad_norm": 9.30449390411377, "learning_rate": 2.0054095826893355e-05, "loss": 0.8139, "num_input_tokens_seen": 174784, "step": 520 }, { "epoch": 0.40571870170015456, "grad_norm": 27.032848358154297, "learning_rate": 2.0247295208655333e-05, "loss": 0.6275, "num_input_tokens_seen": 176256, "step": 525 }, { "epoch": 0.4095826893353941, "grad_norm": 22.722747802734375, "learning_rate": 2.044049459041731e-05, "loss": 0.8347, "num_input_tokens_seen": 177760, "step": 530 }, { "epoch": 0.4134466769706337, "grad_norm": 13.308094024658203, "learning_rate": 2.063369397217929e-05, "loss": 0.6605, "num_input_tokens_seen": 179456, "step": 535 }, { "epoch": 0.41731066460587324, "grad_norm": 12.275449752807617, "learning_rate": 2.0826893353941268e-05, "loss": 0.7415, "num_input_tokens_seen": 181408, "step": 540 }, { "epoch": 0.42117465224111283, "grad_norm": 12.652037620544434, "learning_rate": 2.1020092735703247e-05, "loss": 0.7097, "num_input_tokens_seen": 182912, "step": 545 }, { "epoch": 0.4250386398763524, "grad_norm": 14.79366683959961, "learning_rate": 2.1213292117465225e-05, "loss": 0.7512, "num_input_tokens_seen": 184480, "step": 550 }, { "epoch": 0.42890262751159197, "grad_norm": 12.677210807800293, "learning_rate": 2.1406491499227204e-05, "loss": 0.7244, "num_input_tokens_seen": 186144, "step": 555 }, { "epoch": 0.4327666151468315, "grad_norm": 40.1728401184082, "learning_rate": 2.1599690880989182e-05, "loss": 1.265, "num_input_tokens_seen": 187776, "step": 560 }, { "epoch": 0.4366306027820711, "grad_norm": 22.25064468383789, "learning_rate": 2.179289026275116e-05, "loss": 0.6479, "num_input_tokens_seen": 189344, "step": 565 }, { "epoch": 0.44049459041731065, "grad_norm": 21.866174697875977, "learning_rate": 2.198608964451314e-05, "loss": 0.8867, "num_input_tokens_seen": 191072, "step": 570 }, { "epoch": 0.44435857805255025, "grad_norm": 20.108129501342773, "learning_rate": 2.2179289026275117e-05, "loss": 0.7265, "num_input_tokens_seen": 192896, "step": 575 }, { "epoch": 0.4482225656877898, "grad_norm": 26.60913848876953, "learning_rate": 2.2372488408037096e-05, "loss": 0.8289, "num_input_tokens_seen": 194624, "step": 580 }, { "epoch": 0.4520865533230294, "grad_norm": 20.259096145629883, "learning_rate": 2.2565687789799074e-05, "loss": 0.7206, "num_input_tokens_seen": 196224, "step": 585 }, { "epoch": 0.4559505409582689, "grad_norm": 10.611980438232422, "learning_rate": 2.2758887171561053e-05, "loss": 1.1316, "num_input_tokens_seen": 197984, "step": 590 }, { "epoch": 0.4598145285935085, "grad_norm": 15.928028106689453, "learning_rate": 2.295208655332303e-05, "loss": 0.8223, "num_input_tokens_seen": 199904, "step": 595 }, { "epoch": 0.46367851622874806, "grad_norm": 25.125158309936523, "learning_rate": 2.314528593508501e-05, "loss": 0.7832, "num_input_tokens_seen": 201472, "step": 600 }, { "epoch": 0.46754250386398766, "grad_norm": 10.892329216003418, "learning_rate": 2.3338485316846988e-05, "loss": 0.6911, "num_input_tokens_seen": 203040, "step": 605 }, { "epoch": 0.4714064914992272, "grad_norm": 15.637603759765625, "learning_rate": 2.3531684698608966e-05, "loss": 1.0943, "num_input_tokens_seen": 204608, "step": 610 }, { "epoch": 0.4752704791344668, "grad_norm": 7.646291732788086, "learning_rate": 2.3724884080370945e-05, "loss": 0.6316, "num_input_tokens_seen": 206144, "step": 615 }, { "epoch": 0.47913446676970634, "grad_norm": 16.112329483032227, "learning_rate": 2.3918083462132923e-05, "loss": 0.8129, "num_input_tokens_seen": 207904, "step": 620 }, { "epoch": 0.48299845440494593, "grad_norm": 10.188529968261719, "learning_rate": 2.4111282843894902e-05, "loss": 0.8244, "num_input_tokens_seen": 209696, "step": 625 }, { "epoch": 0.4868624420401855, "grad_norm": 15.559508323669434, "learning_rate": 2.430448222565688e-05, "loss": 0.6637, "num_input_tokens_seen": 211360, "step": 630 }, { "epoch": 0.490726429675425, "grad_norm": 9.22081184387207, "learning_rate": 2.449768160741886e-05, "loss": 0.8407, "num_input_tokens_seen": 213088, "step": 635 }, { "epoch": 0.4945904173106646, "grad_norm": 7.897764205932617, "learning_rate": 2.4690880989180837e-05, "loss": 0.6584, "num_input_tokens_seen": 214880, "step": 640 }, { "epoch": 0.49845440494590415, "grad_norm": 22.405506134033203, "learning_rate": 2.4884080370942815e-05, "loss": 0.7201, "num_input_tokens_seen": 216832, "step": 645 }, { "epoch": 0.5, "eval_loss": 0.921030580997467, "eval_runtime": 11.4414, "eval_samples_per_second": 50.256, "eval_steps_per_second": 12.586, "num_input_tokens_seen": 217472, "step": 647 }, { "epoch": 0.5023183925811437, "grad_norm": 18.4849910736084, "learning_rate": 2.507727975270479e-05, "loss": 1.0672, "num_input_tokens_seen": 218368, "step": 650 }, { "epoch": 0.5061823802163833, "grad_norm": 14.70101547241211, "learning_rate": 2.5270479134466772e-05, "loss": 0.7603, "num_input_tokens_seen": 219968, "step": 655 }, { "epoch": 0.5100463678516228, "grad_norm": 12.655029296875, "learning_rate": 2.546367851622875e-05, "loss": 1.2672, "num_input_tokens_seen": 221760, "step": 660 }, { "epoch": 0.5139103554868625, "grad_norm": 14.609688758850098, "learning_rate": 2.565687789799073e-05, "loss": 0.8704, "num_input_tokens_seen": 223648, "step": 665 }, { "epoch": 0.517774343122102, "grad_norm": 9.628323554992676, "learning_rate": 2.5850077279752704e-05, "loss": 1.0117, "num_input_tokens_seen": 225568, "step": 670 }, { "epoch": 0.5216383307573416, "grad_norm": 14.350893020629883, "learning_rate": 2.6043276661514683e-05, "loss": 0.6708, "num_input_tokens_seen": 226944, "step": 675 }, { "epoch": 0.5255023183925811, "grad_norm": 11.73587703704834, "learning_rate": 2.6236476043276665e-05, "loss": 0.8381, "num_input_tokens_seen": 228512, "step": 680 }, { "epoch": 0.5293663060278208, "grad_norm": 13.388036727905273, "learning_rate": 2.6429675425038643e-05, "loss": 0.644, "num_input_tokens_seen": 230208, "step": 685 }, { "epoch": 0.5332302936630603, "grad_norm": 9.25024700164795, "learning_rate": 2.6622874806800618e-05, "loss": 0.7761, "num_input_tokens_seen": 231840, "step": 690 }, { "epoch": 0.5370942812982998, "grad_norm": 12.96055793762207, "learning_rate": 2.6816074188562596e-05, "loss": 0.9471, "num_input_tokens_seen": 233792, "step": 695 }, { "epoch": 0.5409582689335394, "grad_norm": 12.044790267944336, "learning_rate": 2.700927357032458e-05, "loss": 0.8425, "num_input_tokens_seen": 235584, "step": 700 }, { "epoch": 0.544822256568779, "grad_norm": 9.95194149017334, "learning_rate": 2.7202472952086557e-05, "loss": 0.7119, "num_input_tokens_seen": 237376, "step": 705 }, { "epoch": 0.5486862442040186, "grad_norm": 20.56953239440918, "learning_rate": 2.7395672333848532e-05, "loss": 0.8352, "num_input_tokens_seen": 239008, "step": 710 }, { "epoch": 0.5525502318392581, "grad_norm": 9.253941535949707, "learning_rate": 2.758887171561051e-05, "loss": 0.956, "num_input_tokens_seen": 240576, "step": 715 }, { "epoch": 0.5564142194744977, "grad_norm": 7.416138172149658, "learning_rate": 2.7782071097372492e-05, "loss": 0.7096, "num_input_tokens_seen": 242208, "step": 720 }, { "epoch": 0.5602782071097373, "grad_norm": 11.301790237426758, "learning_rate": 2.797527047913447e-05, "loss": 0.8211, "num_input_tokens_seen": 243808, "step": 725 }, { "epoch": 0.5641421947449768, "grad_norm": 7.010188102722168, "learning_rate": 2.8168469860896446e-05, "loss": 0.9543, "num_input_tokens_seen": 245472, "step": 730 }, { "epoch": 0.5680061823802164, "grad_norm": 23.600738525390625, "learning_rate": 2.8361669242658424e-05, "loss": 0.8261, "num_input_tokens_seen": 247360, "step": 735 }, { "epoch": 0.5718701700154559, "grad_norm": 6.5511794090271, "learning_rate": 2.8554868624420406e-05, "loss": 0.6398, "num_input_tokens_seen": 249216, "step": 740 }, { "epoch": 0.5757341576506955, "grad_norm": 17.442710876464844, "learning_rate": 2.8748068006182384e-05, "loss": 0.8443, "num_input_tokens_seen": 250848, "step": 745 }, { "epoch": 0.5795981452859351, "grad_norm": 6.275444507598877, "learning_rate": 2.894126738794436e-05, "loss": 0.7121, "num_input_tokens_seen": 252544, "step": 750 }, { "epoch": 0.5834621329211747, "grad_norm": 5.052901744842529, "learning_rate": 2.9134466769706338e-05, "loss": 0.7355, "num_input_tokens_seen": 254304, "step": 755 }, { "epoch": 0.5873261205564142, "grad_norm": 6.318686485290527, "learning_rate": 2.932766615146832e-05, "loss": 0.7261, "num_input_tokens_seen": 255968, "step": 760 }, { "epoch": 0.5911901081916537, "grad_norm": 13.516203880310059, "learning_rate": 2.9520865533230298e-05, "loss": 0.7034, "num_input_tokens_seen": 257760, "step": 765 }, { "epoch": 0.5950540958268934, "grad_norm": 7.276726722717285, "learning_rate": 2.9714064914992273e-05, "loss": 0.6161, "num_input_tokens_seen": 259456, "step": 770 }, { "epoch": 0.5989180834621329, "grad_norm": 9.464397430419922, "learning_rate": 2.990726429675425e-05, "loss": 0.6975, "num_input_tokens_seen": 261312, "step": 775 }, { "epoch": 0.6027820710973725, "grad_norm": 7.1459832191467285, "learning_rate": 3.0100463678516227e-05, "loss": 0.6272, "num_input_tokens_seen": 262944, "step": 780 }, { "epoch": 0.606646058732612, "grad_norm": 6.500988006591797, "learning_rate": 3.0293663060278212e-05, "loss": 0.678, "num_input_tokens_seen": 264608, "step": 785 }, { "epoch": 0.6105100463678517, "grad_norm": 4.333360195159912, "learning_rate": 3.0486862442040187e-05, "loss": 0.5571, "num_input_tokens_seen": 266144, "step": 790 }, { "epoch": 0.6143740340030912, "grad_norm": 9.184494018554688, "learning_rate": 3.0680061823802165e-05, "loss": 0.775, "num_input_tokens_seen": 267680, "step": 795 }, { "epoch": 0.6182380216383307, "grad_norm": 7.026245594024658, "learning_rate": 3.0873261205564144e-05, "loss": 0.8627, "num_input_tokens_seen": 269312, "step": 800 }, { "epoch": 0.6221020092735703, "grad_norm": 6.15141487121582, "learning_rate": 3.106646058732612e-05, "loss": 0.8033, "num_input_tokens_seen": 270848, "step": 805 }, { "epoch": 0.6259659969088099, "grad_norm": 4.614773750305176, "learning_rate": 3.12596599690881e-05, "loss": 0.5587, "num_input_tokens_seen": 272672, "step": 810 }, { "epoch": 0.6298299845440495, "grad_norm": 6.07302188873291, "learning_rate": 3.145285935085008e-05, "loss": 0.6096, "num_input_tokens_seen": 274048, "step": 815 }, { "epoch": 0.633693972179289, "grad_norm": 6.035991668701172, "learning_rate": 3.164605873261206e-05, "loss": 0.5936, "num_input_tokens_seen": 275648, "step": 820 }, { "epoch": 0.6375579598145286, "grad_norm": 6.594690799713135, "learning_rate": 3.1839258114374036e-05, "loss": 0.6637, "num_input_tokens_seen": 277376, "step": 825 }, { "epoch": 0.6414219474497682, "grad_norm": 3.9405901432037354, "learning_rate": 3.2032457496136014e-05, "loss": 1.0109, "num_input_tokens_seen": 279104, "step": 830 }, { "epoch": 0.6452859350850078, "grad_norm": 11.118372917175293, "learning_rate": 3.222565687789799e-05, "loss": 0.8456, "num_input_tokens_seen": 280896, "step": 835 }, { "epoch": 0.6491499227202473, "grad_norm": 6.442062854766846, "learning_rate": 3.241885625965997e-05, "loss": 0.7265, "num_input_tokens_seen": 282720, "step": 840 }, { "epoch": 0.6530139103554868, "grad_norm": 10.887469291687012, "learning_rate": 3.261205564142195e-05, "loss": 0.7846, "num_input_tokens_seen": 284352, "step": 845 }, { "epoch": 0.6568778979907264, "grad_norm": 13.330336570739746, "learning_rate": 3.280525502318393e-05, "loss": 0.9807, "num_input_tokens_seen": 286080, "step": 850 }, { "epoch": 0.660741885625966, "grad_norm": 16.476484298706055, "learning_rate": 3.2998454404945907e-05, "loss": 0.7617, "num_input_tokens_seen": 287744, "step": 855 }, { "epoch": 0.6646058732612056, "grad_norm": 7.407608509063721, "learning_rate": 3.3191653786707885e-05, "loss": 0.9338, "num_input_tokens_seen": 289280, "step": 860 }, { "epoch": 0.6684698608964451, "grad_norm": 5.281447887420654, "learning_rate": 3.3384853168469863e-05, "loss": 0.7126, "num_input_tokens_seen": 290816, "step": 865 }, { "epoch": 0.6723338485316847, "grad_norm": 6.173165798187256, "learning_rate": 3.357805255023184e-05, "loss": 0.6428, "num_input_tokens_seen": 292480, "step": 870 }, { "epoch": 0.6761978361669243, "grad_norm": 6.420854568481445, "learning_rate": 3.377125193199382e-05, "loss": 0.7302, "num_input_tokens_seen": 294176, "step": 875 }, { "epoch": 0.6800618238021638, "grad_norm": 5.293971061706543, "learning_rate": 3.39644513137558e-05, "loss": 0.9809, "num_input_tokens_seen": 295776, "step": 880 }, { "epoch": 0.6839258114374034, "grad_norm": 4.780097007751465, "learning_rate": 3.415765069551777e-05, "loss": 0.7255, "num_input_tokens_seen": 297664, "step": 885 }, { "epoch": 0.6877897990726429, "grad_norm": 5.402822017669678, "learning_rate": 3.4350850077279756e-05, "loss": 0.6325, "num_input_tokens_seen": 299168, "step": 890 }, { "epoch": 0.6916537867078826, "grad_norm": 4.812047958374023, "learning_rate": 3.4544049459041734e-05, "loss": 0.5593, "num_input_tokens_seen": 300704, "step": 895 }, { "epoch": 0.6955177743431221, "grad_norm": 8.333353996276855, "learning_rate": 3.473724884080371e-05, "loss": 0.6934, "num_input_tokens_seen": 302432, "step": 900 }, { "epoch": 0.6993817619783617, "grad_norm": 7.964436054229736, "learning_rate": 3.4930448222565684e-05, "loss": 0.7894, "num_input_tokens_seen": 303936, "step": 905 }, { "epoch": 0.7032457496136012, "grad_norm": 20.27760887145996, "learning_rate": 3.512364760432767e-05, "loss": 0.6393, "num_input_tokens_seen": 305600, "step": 910 }, { "epoch": 0.7071097372488409, "grad_norm": 3.6520543098449707, "learning_rate": 3.531684698608965e-05, "loss": 0.8638, "num_input_tokens_seen": 307328, "step": 915 }, { "epoch": 0.7109737248840804, "grad_norm": 6.799697399139404, "learning_rate": 3.5510046367851626e-05, "loss": 0.6569, "num_input_tokens_seen": 309056, "step": 920 }, { "epoch": 0.7148377125193199, "grad_norm": 4.876644134521484, "learning_rate": 3.57032457496136e-05, "loss": 0.7661, "num_input_tokens_seen": 310592, "step": 925 }, { "epoch": 0.7187017001545595, "grad_norm": 6.81377649307251, "learning_rate": 3.589644513137558e-05, "loss": 0.846, "num_input_tokens_seen": 312128, "step": 930 }, { "epoch": 0.7225656877897991, "grad_norm": 6.32791805267334, "learning_rate": 3.608964451313756e-05, "loss": 1.4472, "num_input_tokens_seen": 313984, "step": 935 }, { "epoch": 0.7264296754250387, "grad_norm": 8.712087631225586, "learning_rate": 3.628284389489954e-05, "loss": 0.8607, "num_input_tokens_seen": 315872, "step": 940 }, { "epoch": 0.7302936630602782, "grad_norm": 4.0325727462768555, "learning_rate": 3.647604327666151e-05, "loss": 0.6691, "num_input_tokens_seen": 317792, "step": 945 }, { "epoch": 0.7341576506955177, "grad_norm": 8.846315383911133, "learning_rate": 3.66692426584235e-05, "loss": 0.6794, "num_input_tokens_seen": 319520, "step": 950 }, { "epoch": 0.7380216383307573, "grad_norm": 3.4582154750823975, "learning_rate": 3.6862442040185475e-05, "loss": 0.5463, "num_input_tokens_seen": 321024, "step": 955 }, { "epoch": 0.7418856259659969, "grad_norm": 9.707828521728516, "learning_rate": 3.7055641421947454e-05, "loss": 1.0942, "num_input_tokens_seen": 322656, "step": 960 }, { "epoch": 0.7457496136012365, "grad_norm": 8.417296409606934, "learning_rate": 3.7248840803709425e-05, "loss": 0.7958, "num_input_tokens_seen": 324672, "step": 965 }, { "epoch": 0.749613601236476, "grad_norm": 4.8317766189575195, "learning_rate": 3.744204018547141e-05, "loss": 0.759, "num_input_tokens_seen": 326432, "step": 970 }, { "epoch": 0.7534775888717156, "grad_norm": 9.190349578857422, "learning_rate": 3.763523956723339e-05, "loss": 0.8002, "num_input_tokens_seen": 327936, "step": 975 }, { "epoch": 0.7573415765069552, "grad_norm": 5.1527581214904785, "learning_rate": 3.782843894899537e-05, "loss": 0.6125, "num_input_tokens_seen": 329376, "step": 980 }, { "epoch": 0.7612055641421948, "grad_norm": 5.2063822746276855, "learning_rate": 3.802163833075734e-05, "loss": 0.8203, "num_input_tokens_seen": 330752, "step": 985 }, { "epoch": 0.7650695517774343, "grad_norm": 8.216318130493164, "learning_rate": 3.821483771251932e-05, "loss": 0.9451, "num_input_tokens_seen": 332512, "step": 990 }, { "epoch": 0.7689335394126738, "grad_norm": 5.421473026275635, "learning_rate": 3.84080370942813e-05, "loss": 0.6643, "num_input_tokens_seen": 334144, "step": 995 }, { "epoch": 0.7727975270479135, "grad_norm": 5.369229793548584, "learning_rate": 3.860123647604328e-05, "loss": 0.7194, "num_input_tokens_seen": 335808, "step": 1000 }, { "epoch": 0.776661514683153, "grad_norm": 4.9428253173828125, "learning_rate": 3.879443585780525e-05, "loss": 0.8215, "num_input_tokens_seen": 337344, "step": 1005 }, { "epoch": 0.7805255023183926, "grad_norm": 17.760059356689453, "learning_rate": 3.898763523956723e-05, "loss": 0.6701, "num_input_tokens_seen": 338976, "step": 1010 }, { "epoch": 0.7843894899536321, "grad_norm": 6.841742038726807, "learning_rate": 3.9180834621329217e-05, "loss": 0.7121, "num_input_tokens_seen": 340512, "step": 1015 }, { "epoch": 0.7882534775888718, "grad_norm": 5.968893051147461, "learning_rate": 3.9374034003091195e-05, "loss": 0.7951, "num_input_tokens_seen": 342432, "step": 1020 }, { "epoch": 0.7921174652241113, "grad_norm": 9.027800559997559, "learning_rate": 3.956723338485317e-05, "loss": 0.6313, "num_input_tokens_seen": 344096, "step": 1025 }, { "epoch": 0.7959814528593508, "grad_norm": 3.652646064758301, "learning_rate": 3.9760432766615145e-05, "loss": 0.5803, "num_input_tokens_seen": 345664, "step": 1030 }, { "epoch": 0.7998454404945904, "grad_norm": 2.4274933338165283, "learning_rate": 3.995363214837713e-05, "loss": 0.8859, "num_input_tokens_seen": 347232, "step": 1035 }, { "epoch": 0.80370942812983, "grad_norm": 3.748764753341675, "learning_rate": 4.014683153013911e-05, "loss": 0.6379, "num_input_tokens_seen": 348992, "step": 1040 }, { "epoch": 0.8075734157650696, "grad_norm": 3.5328495502471924, "learning_rate": 4.034003091190108e-05, "loss": 0.6299, "num_input_tokens_seen": 350592, "step": 1045 }, { "epoch": 0.8114374034003091, "grad_norm": 6.558239459991455, "learning_rate": 4.053323029366306e-05, "loss": 0.6831, "num_input_tokens_seen": 352352, "step": 1050 }, { "epoch": 0.8153013910355487, "grad_norm": 5.872474670410156, "learning_rate": 4.0726429675425044e-05, "loss": 0.9921, "num_input_tokens_seen": 353760, "step": 1055 }, { "epoch": 0.8191653786707882, "grad_norm": 5.27492094039917, "learning_rate": 4.091962905718702e-05, "loss": 0.6912, "num_input_tokens_seen": 355424, "step": 1060 }, { "epoch": 0.8230293663060279, "grad_norm": 3.3896567821502686, "learning_rate": 4.1112828438948994e-05, "loss": 0.7114, "num_input_tokens_seen": 357120, "step": 1065 }, { "epoch": 0.8268933539412674, "grad_norm": 6.957058429718018, "learning_rate": 4.130602782071097e-05, "loss": 0.961, "num_input_tokens_seen": 359168, "step": 1070 }, { "epoch": 0.8307573415765069, "grad_norm": 5.5103373527526855, "learning_rate": 4.149922720247296e-05, "loss": 0.8291, "num_input_tokens_seen": 361184, "step": 1075 }, { "epoch": 0.8346213292117465, "grad_norm": 3.2119452953338623, "learning_rate": 4.1692426584234936e-05, "loss": 0.53, "num_input_tokens_seen": 362848, "step": 1080 }, { "epoch": 0.8384853168469861, "grad_norm": 5.641288757324219, "learning_rate": 4.188562596599691e-05, "loss": 0.6337, "num_input_tokens_seen": 364640, "step": 1085 }, { "epoch": 0.8423493044822257, "grad_norm": 9.591625213623047, "learning_rate": 4.2078825347758886e-05, "loss": 1.0384, "num_input_tokens_seen": 366272, "step": 1090 }, { "epoch": 0.8462132921174652, "grad_norm": 3.2398712635040283, "learning_rate": 4.2272024729520865e-05, "loss": 0.7159, "num_input_tokens_seen": 367872, "step": 1095 }, { "epoch": 0.8500772797527048, "grad_norm": 2.9561452865600586, "learning_rate": 4.246522411128285e-05, "loss": 1.2057, "num_input_tokens_seen": 369792, "step": 1100 }, { "epoch": 0.8539412673879444, "grad_norm": 4.295708656311035, "learning_rate": 4.265842349304482e-05, "loss": 0.6116, "num_input_tokens_seen": 371232, "step": 1105 }, { "epoch": 0.8578052550231839, "grad_norm": 3.496870517730713, "learning_rate": 4.28516228748068e-05, "loss": 0.659, "num_input_tokens_seen": 372864, "step": 1110 }, { "epoch": 0.8616692426584235, "grad_norm": 3.823777675628662, "learning_rate": 4.304482225656878e-05, "loss": 0.6056, "num_input_tokens_seen": 374496, "step": 1115 }, { "epoch": 0.865533230293663, "grad_norm": 3.2684507369995117, "learning_rate": 4.3238021638330764e-05, "loss": 0.6129, "num_input_tokens_seen": 376256, "step": 1120 }, { "epoch": 0.8693972179289027, "grad_norm": 5.135096073150635, "learning_rate": 4.3431221020092735e-05, "loss": 0.6851, "num_input_tokens_seen": 377760, "step": 1125 }, { "epoch": 0.8732612055641422, "grad_norm": 5.1519012451171875, "learning_rate": 4.3624420401854714e-05, "loss": 0.8376, "num_input_tokens_seen": 379456, "step": 1130 }, { "epoch": 0.8771251931993818, "grad_norm": 2.19795560836792, "learning_rate": 4.381761978361669e-05, "loss": 0.5428, "num_input_tokens_seen": 381344, "step": 1135 }, { "epoch": 0.8809891808346213, "grad_norm": 3.002683639526367, "learning_rate": 4.401081916537868e-05, "loss": 0.6935, "num_input_tokens_seen": 383264, "step": 1140 }, { "epoch": 0.884853168469861, "grad_norm": 7.708734512329102, "learning_rate": 4.420401854714065e-05, "loss": 0.8678, "num_input_tokens_seen": 385056, "step": 1145 }, { "epoch": 0.8887171561051005, "grad_norm": 5.061878204345703, "learning_rate": 4.439721792890263e-05, "loss": 0.8629, "num_input_tokens_seen": 387072, "step": 1150 }, { "epoch": 0.89258114374034, "grad_norm": 2.0840277671813965, "learning_rate": 4.4590417310664606e-05, "loss": 0.591, "num_input_tokens_seen": 388864, "step": 1155 }, { "epoch": 0.8964451313755796, "grad_norm": 3.3418915271759033, "learning_rate": 4.478361669242659e-05, "loss": 1.0106, "num_input_tokens_seen": 390560, "step": 1160 }, { "epoch": 0.9003091190108191, "grad_norm": 3.4786694049835205, "learning_rate": 4.497681607418856e-05, "loss": 0.6011, "num_input_tokens_seen": 392128, "step": 1165 }, { "epoch": 0.9041731066460588, "grad_norm": 3.1302053928375244, "learning_rate": 4.517001545595054e-05, "loss": 0.8227, "num_input_tokens_seen": 393888, "step": 1170 }, { "epoch": 0.9080370942812983, "grad_norm": 4.063078880310059, "learning_rate": 4.536321483771252e-05, "loss": 0.616, "num_input_tokens_seen": 395776, "step": 1175 }, { "epoch": 0.9119010819165378, "grad_norm": 3.8050053119659424, "learning_rate": 4.5556414219474505e-05, "loss": 0.6952, "num_input_tokens_seen": 397248, "step": 1180 }, { "epoch": 0.9157650695517774, "grad_norm": 2.0413312911987305, "learning_rate": 4.574961360123648e-05, "loss": 1.0828, "num_input_tokens_seen": 399040, "step": 1185 }, { "epoch": 0.919629057187017, "grad_norm": 2.850386619567871, "learning_rate": 4.5942812982998455e-05, "loss": 0.7131, "num_input_tokens_seen": 400736, "step": 1190 }, { "epoch": 0.9234930448222566, "grad_norm": 2.374300479888916, "learning_rate": 4.6136012364760434e-05, "loss": 0.7001, "num_input_tokens_seen": 402528, "step": 1195 }, { "epoch": 0.9273570324574961, "grad_norm": 3.896080732345581, "learning_rate": 4.632921174652241e-05, "loss": 0.5905, "num_input_tokens_seen": 404032, "step": 1200 }, { "epoch": 0.9312210200927357, "grad_norm": 2.3679215908050537, "learning_rate": 4.652241112828439e-05, "loss": 0.9157, "num_input_tokens_seen": 405792, "step": 1205 }, { "epoch": 0.9350850077279753, "grad_norm": 3.943150758743286, "learning_rate": 4.671561051004637e-05, "loss": 0.6847, "num_input_tokens_seen": 407520, "step": 1210 }, { "epoch": 0.9389489953632149, "grad_norm": 2.8842110633850098, "learning_rate": 4.690880989180835e-05, "loss": 0.5979, "num_input_tokens_seen": 409152, "step": 1215 }, { "epoch": 0.9428129829984544, "grad_norm": 4.248842239379883, "learning_rate": 4.7102009273570326e-05, "loss": 0.5761, "num_input_tokens_seen": 410816, "step": 1220 }, { "epoch": 0.9466769706336939, "grad_norm": 4.930397987365723, "learning_rate": 4.7295208655332304e-05, "loss": 0.7806, "num_input_tokens_seen": 412480, "step": 1225 }, { "epoch": 0.9505409582689336, "grad_norm": 3.283113479614258, "learning_rate": 4.748840803709428e-05, "loss": 0.6914, "num_input_tokens_seen": 414080, "step": 1230 }, { "epoch": 0.9544049459041731, "grad_norm": 2.2087464332580566, "learning_rate": 4.768160741885626e-05, "loss": 0.707, "num_input_tokens_seen": 415872, "step": 1235 }, { "epoch": 0.9582689335394127, "grad_norm": 2.3423264026641846, "learning_rate": 4.787480680061824e-05, "loss": 0.6194, "num_input_tokens_seen": 417440, "step": 1240 }, { "epoch": 0.9621329211746522, "grad_norm": 3.836172342300415, "learning_rate": 4.806800618238022e-05, "loss": 0.6171, "num_input_tokens_seen": 419264, "step": 1245 }, { "epoch": 0.9659969088098919, "grad_norm": 3.2541263103485107, "learning_rate": 4.8261205564142196e-05, "loss": 0.6302, "num_input_tokens_seen": 420896, "step": 1250 }, { "epoch": 0.9698608964451314, "grad_norm": 3.7730610370635986, "learning_rate": 4.8454404945904175e-05, "loss": 0.8667, "num_input_tokens_seen": 422912, "step": 1255 }, { "epoch": 0.973724884080371, "grad_norm": 4.002993583679199, "learning_rate": 4.864760432766615e-05, "loss": 0.8096, "num_input_tokens_seen": 424512, "step": 1260 }, { "epoch": 0.9775888717156105, "grad_norm": 4.549214839935303, "learning_rate": 4.884080370942813e-05, "loss": 0.7455, "num_input_tokens_seen": 425984, "step": 1265 }, { "epoch": 0.98145285935085, "grad_norm": 2.7755491733551025, "learning_rate": 4.903400309119011e-05, "loss": 0.7464, "num_input_tokens_seen": 427904, "step": 1270 }, { "epoch": 0.9853168469860897, "grad_norm": 2.506521701812744, "learning_rate": 4.922720247295209e-05, "loss": 0.9315, "num_input_tokens_seen": 429920, "step": 1275 }, { "epoch": 0.9891808346213292, "grad_norm": 3.0748167037963867, "learning_rate": 4.942040185471407e-05, "loss": 0.6098, "num_input_tokens_seen": 431296, "step": 1280 }, { "epoch": 0.9930448222565688, "grad_norm": 2.220958948135376, "learning_rate": 4.9613601236476046e-05, "loss": 0.8557, "num_input_tokens_seen": 432800, "step": 1285 }, { "epoch": 0.9969088098918083, "grad_norm": 2.896904706954956, "learning_rate": 4.9806800618238024e-05, "loss": 0.6524, "num_input_tokens_seen": 434272, "step": 1290 }, { "epoch": 1.0, "eval_loss": 0.8727672696113586, "eval_runtime": 11.4469, "eval_samples_per_second": 50.232, "eval_steps_per_second": 12.58, "num_input_tokens_seen": 435488, "step": 1294 }, { "epoch": 1.000772797527048, "grad_norm": 5.569401264190674, "learning_rate": 5e-05, "loss": 0.6268, "num_input_tokens_seen": 435776, "step": 1295 }, { "epoch": 1.0046367851622875, "grad_norm": 5.1326093673706055, "learning_rate": 4.999997725970315e-05, "loss": 0.7275, "num_input_tokens_seen": 437696, "step": 1300 }, { "epoch": 1.008500772797527, "grad_norm": 2.7466344833374023, "learning_rate": 4.9999909038853964e-05, "loss": 0.6433, "num_input_tokens_seen": 439712, "step": 1305 }, { "epoch": 1.0123647604327666, "grad_norm": 6.306580066680908, "learning_rate": 4.9999795337576546e-05, "loss": 0.6256, "num_input_tokens_seen": 441504, "step": 1310 }, { "epoch": 1.0162287480680061, "grad_norm": 3.266453266143799, "learning_rate": 4.9999636156077754e-05, "loss": 0.6392, "num_input_tokens_seen": 443264, "step": 1315 }, { "epoch": 1.0200927357032457, "grad_norm": 2.966540813446045, "learning_rate": 4.999943149464716e-05, "loss": 0.6198, "num_input_tokens_seen": 444864, "step": 1320 }, { "epoch": 1.0239567233384854, "grad_norm": 2.5080649852752686, "learning_rate": 4.9999181353657115e-05, "loss": 0.8494, "num_input_tokens_seen": 446496, "step": 1325 }, { "epoch": 1.027820710973725, "grad_norm": 1.94948148727417, "learning_rate": 4.9998885733562664e-05, "loss": 0.5996, "num_input_tokens_seen": 448000, "step": 1330 }, { "epoch": 1.0316846986089645, "grad_norm": 6.69545841217041, "learning_rate": 4.99985446349016e-05, "loss": 0.7259, "num_input_tokens_seen": 449632, "step": 1335 }, { "epoch": 1.035548686244204, "grad_norm": 3.7938272953033447, "learning_rate": 4.9998158058294464e-05, "loss": 0.8377, "num_input_tokens_seen": 451424, "step": 1340 }, { "epoch": 1.0394126738794436, "grad_norm": 3.0293776988983154, "learning_rate": 4.999772600444453e-05, "loss": 0.6916, "num_input_tokens_seen": 453024, "step": 1345 }, { "epoch": 1.0432766615146831, "grad_norm": 3.521404504776001, "learning_rate": 4.9997248474137806e-05, "loss": 0.706, "num_input_tokens_seen": 454624, "step": 1350 }, { "epoch": 1.0471406491499227, "grad_norm": 1.9636902809143066, "learning_rate": 4.9996725468243e-05, "loss": 0.7512, "num_input_tokens_seen": 456544, "step": 1355 }, { "epoch": 1.0510046367851622, "grad_norm": 3.2807185649871826, "learning_rate": 4.999615698771161e-05, "loss": 0.6565, "num_input_tokens_seen": 457984, "step": 1360 }, { "epoch": 1.054868624420402, "grad_norm": 1.792867660522461, "learning_rate": 4.9995543033577806e-05, "loss": 0.6, "num_input_tokens_seen": 459808, "step": 1365 }, { "epoch": 1.0587326120556415, "grad_norm": 1.4668031930923462, "learning_rate": 4.999488360695852e-05, "loss": 0.8234, "num_input_tokens_seen": 461696, "step": 1370 }, { "epoch": 1.062596599690881, "grad_norm": 4.267587661743164, "learning_rate": 4.9994178709053384e-05, "loss": 0.6224, "num_input_tokens_seen": 463424, "step": 1375 }, { "epoch": 1.0664605873261206, "grad_norm": 1.4623420238494873, "learning_rate": 4.999342834114479e-05, "loss": 0.6578, "num_input_tokens_seen": 465088, "step": 1380 }, { "epoch": 1.0703245749613601, "grad_norm": 3.5440890789031982, "learning_rate": 4.999263250459779e-05, "loss": 0.8016, "num_input_tokens_seen": 466592, "step": 1385 }, { "epoch": 1.0741885625965997, "grad_norm": 2.654419183731079, "learning_rate": 4.9991791200860216e-05, "loss": 0.8327, "num_input_tokens_seen": 468032, "step": 1390 }, { "epoch": 1.0780525502318392, "grad_norm": 3.996605157852173, "learning_rate": 4.9990904431462585e-05, "loss": 1.0091, "num_input_tokens_seen": 469568, "step": 1395 }, { "epoch": 1.0819165378670788, "grad_norm": 1.8970979452133179, "learning_rate": 4.998997219801811e-05, "loss": 0.6183, "num_input_tokens_seen": 471072, "step": 1400 }, { "epoch": 1.0857805255023183, "grad_norm": 2.055800199508667, "learning_rate": 4.998899450222275e-05, "loss": 0.778, "num_input_tokens_seen": 472768, "step": 1405 }, { "epoch": 1.089644513137558, "grad_norm": 1.651590347290039, "learning_rate": 4.998797134585516e-05, "loss": 0.6458, "num_input_tokens_seen": 474592, "step": 1410 }, { "epoch": 1.0935085007727976, "grad_norm": 1.8094594478607178, "learning_rate": 4.9986902730776665e-05, "loss": 0.6048, "num_input_tokens_seen": 476320, "step": 1415 }, { "epoch": 1.0973724884080371, "grad_norm": 1.8772789239883423, "learning_rate": 4.998578865893133e-05, "loss": 0.6161, "num_input_tokens_seen": 478144, "step": 1420 }, { "epoch": 1.1012364760432767, "grad_norm": 1.6253868341445923, "learning_rate": 4.99846291323459e-05, "loss": 0.7987, "num_input_tokens_seen": 479808, "step": 1425 }, { "epoch": 1.1051004636785162, "grad_norm": 4.032077789306641, "learning_rate": 4.998342415312981e-05, "loss": 0.5778, "num_input_tokens_seen": 481376, "step": 1430 }, { "epoch": 1.1089644513137558, "grad_norm": 2.202702760696411, "learning_rate": 4.998217372347519e-05, "loss": 0.7242, "num_input_tokens_seen": 483072, "step": 1435 }, { "epoch": 1.1128284389489953, "grad_norm": 2.925344705581665, "learning_rate": 4.998087784565685e-05, "loss": 0.697, "num_input_tokens_seen": 484800, "step": 1440 }, { "epoch": 1.1166924265842348, "grad_norm": 2.1689670085906982, "learning_rate": 4.9979536522032284e-05, "loss": 0.8073, "num_input_tokens_seen": 486496, "step": 1445 }, { "epoch": 1.1205564142194744, "grad_norm": 1.4746007919311523, "learning_rate": 4.997814975504166e-05, "loss": 0.6072, "num_input_tokens_seen": 488192, "step": 1450 }, { "epoch": 1.1244204018547141, "grad_norm": 7.89620304107666, "learning_rate": 4.9976717547207806e-05, "loss": 0.9425, "num_input_tokens_seen": 489792, "step": 1455 }, { "epoch": 1.1282843894899537, "grad_norm": 1.903210997581482, "learning_rate": 4.9975239901136244e-05, "loss": 0.7361, "num_input_tokens_seen": 491616, "step": 1460 }, { "epoch": 1.1321483771251932, "grad_norm": 2.7027246952056885, "learning_rate": 4.997371681951514e-05, "loss": 0.5481, "num_input_tokens_seen": 493088, "step": 1465 }, { "epoch": 1.1360123647604328, "grad_norm": 1.4442485570907593, "learning_rate": 4.9972148305115315e-05, "loss": 0.6156, "num_input_tokens_seen": 494592, "step": 1470 }, { "epoch": 1.1398763523956723, "grad_norm": 1.7765363454818726, "learning_rate": 4.997053436079025e-05, "loss": 0.7854, "num_input_tokens_seen": 496352, "step": 1475 }, { "epoch": 1.1437403400309119, "grad_norm": 4.410486221313477, "learning_rate": 4.996887498947606e-05, "loss": 0.9784, "num_input_tokens_seen": 498016, "step": 1480 }, { "epoch": 1.1476043276661514, "grad_norm": 1.5805256366729736, "learning_rate": 4.996717019419153e-05, "loss": 0.56, "num_input_tokens_seen": 499584, "step": 1485 }, { "epoch": 1.1514683153013912, "grad_norm": 5.529237747192383, "learning_rate": 4.9965419978038064e-05, "loss": 1.2734, "num_input_tokens_seen": 501472, "step": 1490 }, { "epoch": 1.1553323029366307, "grad_norm": 1.8511298894882202, "learning_rate": 4.9963624344199674e-05, "loss": 0.5698, "num_input_tokens_seen": 503136, "step": 1495 }, { "epoch": 1.1591962905718702, "grad_norm": 1.8273423910140991, "learning_rate": 4.996178329594305e-05, "loss": 0.7142, "num_input_tokens_seen": 504704, "step": 1500 }, { "epoch": 1.1630602782071098, "grad_norm": 4.462129592895508, "learning_rate": 4.9959896836617456e-05, "loss": 0.7227, "num_input_tokens_seen": 506496, "step": 1505 }, { "epoch": 1.1669242658423493, "grad_norm": 1.992414951324463, "learning_rate": 4.995796496965478e-05, "loss": 0.6845, "num_input_tokens_seen": 508064, "step": 1510 }, { "epoch": 1.1707882534775889, "grad_norm": 6.661719799041748, "learning_rate": 4.995598769856952e-05, "loss": 0.7697, "num_input_tokens_seen": 509984, "step": 1515 }, { "epoch": 1.1746522411128284, "grad_norm": 3.6906778812408447, "learning_rate": 4.995396502695878e-05, "loss": 0.6151, "num_input_tokens_seen": 511392, "step": 1520 }, { "epoch": 1.178516228748068, "grad_norm": 2.444606065750122, "learning_rate": 4.9951896958502254e-05, "loss": 0.6603, "num_input_tokens_seen": 512960, "step": 1525 }, { "epoch": 1.1823802163833075, "grad_norm": 3.751354455947876, "learning_rate": 4.994978349696222e-05, "loss": 0.5742, "num_input_tokens_seen": 514656, "step": 1530 }, { "epoch": 1.1862442040185472, "grad_norm": 7.15833854675293, "learning_rate": 4.9947624646183534e-05, "loss": 0.8928, "num_input_tokens_seen": 516512, "step": 1535 }, { "epoch": 1.1901081916537868, "grad_norm": 2.145423412322998, "learning_rate": 4.994542041009362e-05, "loss": 0.7131, "num_input_tokens_seen": 518112, "step": 1540 }, { "epoch": 1.1939721792890263, "grad_norm": 1.7351375818252563, "learning_rate": 4.9943170792702503e-05, "loss": 0.6112, "num_input_tokens_seen": 519616, "step": 1545 }, { "epoch": 1.1978361669242659, "grad_norm": 2.646552324295044, "learning_rate": 4.994087579810271e-05, "loss": 0.5835, "num_input_tokens_seen": 521312, "step": 1550 }, { "epoch": 1.2017001545595054, "grad_norm": 3.1838555335998535, "learning_rate": 4.993853543046938e-05, "loss": 0.7569, "num_input_tokens_seen": 523008, "step": 1555 }, { "epoch": 1.205564142194745, "grad_norm": 2.347683906555176, "learning_rate": 4.9936149694060144e-05, "loss": 0.649, "num_input_tokens_seen": 524768, "step": 1560 }, { "epoch": 1.2094281298299845, "grad_norm": 5.678030967712402, "learning_rate": 4.993371859321518e-05, "loss": 0.7668, "num_input_tokens_seen": 526400, "step": 1565 }, { "epoch": 1.213292117465224, "grad_norm": 5.765255928039551, "learning_rate": 4.9931242132357235e-05, "loss": 0.6162, "num_input_tokens_seen": 528352, "step": 1570 }, { "epoch": 1.2171561051004636, "grad_norm": 3.321552276611328, "learning_rate": 4.9928720315991526e-05, "loss": 0.5717, "num_input_tokens_seen": 529888, "step": 1575 }, { "epoch": 1.2210200927357033, "grad_norm": 2.712909460067749, "learning_rate": 4.992615314870581e-05, "loss": 0.7121, "num_input_tokens_seen": 531840, "step": 1580 }, { "epoch": 1.2248840803709429, "grad_norm": 1.9446531534194946, "learning_rate": 4.9923540635170336e-05, "loss": 0.5354, "num_input_tokens_seen": 533408, "step": 1585 }, { "epoch": 1.2287480680061824, "grad_norm": 4.157997131347656, "learning_rate": 4.9920882780137845e-05, "loss": 0.7504, "num_input_tokens_seen": 534944, "step": 1590 }, { "epoch": 1.232612055641422, "grad_norm": 3.130476236343384, "learning_rate": 4.991817958844357e-05, "loss": 0.6563, "num_input_tokens_seen": 536416, "step": 1595 }, { "epoch": 1.2364760432766615, "grad_norm": 1.9648979902267456, "learning_rate": 4.991543106500523e-05, "loss": 0.5442, "num_input_tokens_seen": 538208, "step": 1600 }, { "epoch": 1.240340030911901, "grad_norm": 2.1644175052642822, "learning_rate": 4.9912637214823e-05, "loss": 0.7044, "num_input_tokens_seen": 539712, "step": 1605 }, { "epoch": 1.2442040185471406, "grad_norm": 3.38248348236084, "learning_rate": 4.990979804297952e-05, "loss": 0.999, "num_input_tokens_seen": 541536, "step": 1610 }, { "epoch": 1.2480680061823803, "grad_norm": 3.0010507106781006, "learning_rate": 4.990691355463987e-05, "loss": 0.6695, "num_input_tokens_seen": 542912, "step": 1615 }, { "epoch": 1.2519319938176197, "grad_norm": 2.5331642627716064, "learning_rate": 4.990398375505159e-05, "loss": 0.858, "num_input_tokens_seen": 544576, "step": 1620 }, { "epoch": 1.2557959814528594, "grad_norm": 1.7367154359817505, "learning_rate": 4.990100864954463e-05, "loss": 0.7172, "num_input_tokens_seen": 546432, "step": 1625 }, { "epoch": 1.259659969088099, "grad_norm": 2.087881088256836, "learning_rate": 4.9897988243531394e-05, "loss": 0.5547, "num_input_tokens_seen": 548032, "step": 1630 }, { "epoch": 1.2635239567233385, "grad_norm": 1.8679537773132324, "learning_rate": 4.989492254250666e-05, "loss": 0.6132, "num_input_tokens_seen": 549664, "step": 1635 }, { "epoch": 1.267387944358578, "grad_norm": 1.5577925443649292, "learning_rate": 4.9891811552047624e-05, "loss": 0.6207, "num_input_tokens_seen": 551552, "step": 1640 }, { "epoch": 1.2712519319938176, "grad_norm": 1.0598777532577515, "learning_rate": 4.988865527781388e-05, "loss": 0.8395, "num_input_tokens_seen": 553344, "step": 1645 }, { "epoch": 1.2751159196290571, "grad_norm": 2.1903908252716064, "learning_rate": 4.988545372554738e-05, "loss": 0.5821, "num_input_tokens_seen": 554880, "step": 1650 }, { "epoch": 1.2789799072642967, "grad_norm": 3.2598509788513184, "learning_rate": 4.9882206901072495e-05, "loss": 0.6931, "num_input_tokens_seen": 556544, "step": 1655 }, { "epoch": 1.2828438948995364, "grad_norm": 2.190890073776245, "learning_rate": 4.9878914810295894e-05, "loss": 0.5642, "num_input_tokens_seen": 558144, "step": 1660 }, { "epoch": 1.286707882534776, "grad_norm": 3.237941265106201, "learning_rate": 4.9875577459206645e-05, "loss": 0.7097, "num_input_tokens_seen": 559680, "step": 1665 }, { "epoch": 1.2905718701700155, "grad_norm": 2.03079891204834, "learning_rate": 4.987219485387614e-05, "loss": 0.5467, "num_input_tokens_seen": 561184, "step": 1670 }, { "epoch": 1.294435857805255, "grad_norm": 1.6131632328033447, "learning_rate": 4.986876700045808e-05, "loss": 0.5441, "num_input_tokens_seen": 562624, "step": 1675 }, { "epoch": 1.2982998454404946, "grad_norm": 2.096224069595337, "learning_rate": 4.98652939051885e-05, "loss": 0.7396, "num_input_tokens_seen": 564320, "step": 1680 }, { "epoch": 1.3021638330757341, "grad_norm": 3.965817451477051, "learning_rate": 4.986177557438575e-05, "loss": 0.566, "num_input_tokens_seen": 566112, "step": 1685 }, { "epoch": 1.3060278207109737, "grad_norm": 1.4071474075317383, "learning_rate": 4.9858212014450446e-05, "loss": 0.5658, "num_input_tokens_seen": 567584, "step": 1690 }, { "epoch": 1.3098918083462132, "grad_norm": 3.1105756759643555, "learning_rate": 4.98546032318655e-05, "loss": 0.6203, "num_input_tokens_seen": 569344, "step": 1695 }, { "epoch": 1.3137557959814528, "grad_norm": 2.441549301147461, "learning_rate": 4.985094923319612e-05, "loss": 0.6293, "num_input_tokens_seen": 571328, "step": 1700 }, { "epoch": 1.3176197836166925, "grad_norm": 1.4593091011047363, "learning_rate": 4.984725002508972e-05, "loss": 0.5344, "num_input_tokens_seen": 572864, "step": 1705 }, { "epoch": 1.321483771251932, "grad_norm": 1.7467397451400757, "learning_rate": 4.9843505614275996e-05, "loss": 0.5442, "num_input_tokens_seen": 574656, "step": 1710 }, { "epoch": 1.3253477588871716, "grad_norm": 1.3995556831359863, "learning_rate": 4.983971600756687e-05, "loss": 0.6802, "num_input_tokens_seen": 576192, "step": 1715 }, { "epoch": 1.3292117465224111, "grad_norm": 1.6219801902770996, "learning_rate": 4.983588121185648e-05, "loss": 0.5188, "num_input_tokens_seen": 577792, "step": 1720 }, { "epoch": 1.3330757341576507, "grad_norm": 2.064866304397583, "learning_rate": 4.983200123412119e-05, "loss": 0.9419, "num_input_tokens_seen": 579360, "step": 1725 }, { "epoch": 1.3369397217928902, "grad_norm": 2.1432583332061768, "learning_rate": 4.982807608141954e-05, "loss": 0.5847, "num_input_tokens_seen": 580896, "step": 1730 }, { "epoch": 1.3408037094281298, "grad_norm": 1.1927039623260498, "learning_rate": 4.9824105760892254e-05, "loss": 0.5277, "num_input_tokens_seen": 582656, "step": 1735 }, { "epoch": 1.3446676970633695, "grad_norm": 2.8567543029785156, "learning_rate": 4.982009027976224e-05, "loss": 0.5426, "num_input_tokens_seen": 584192, "step": 1740 }, { "epoch": 1.3485316846986088, "grad_norm": 1.3339349031448364, "learning_rate": 4.981602964533456e-05, "loss": 0.5191, "num_input_tokens_seen": 586080, "step": 1745 }, { "epoch": 1.3523956723338486, "grad_norm": 1.8199747800827026, "learning_rate": 4.981192386499641e-05, "loss": 0.5787, "num_input_tokens_seen": 587680, "step": 1750 }, { "epoch": 1.3562596599690881, "grad_norm": 3.670886278152466, "learning_rate": 4.980777294621712e-05, "loss": 0.5434, "num_input_tokens_seen": 589408, "step": 1755 }, { "epoch": 1.3601236476043277, "grad_norm": 2.6732687950134277, "learning_rate": 4.9803576896548155e-05, "loss": 0.5971, "num_input_tokens_seen": 591072, "step": 1760 }, { "epoch": 1.3639876352395672, "grad_norm": 2.198974609375, "learning_rate": 4.979933572362306e-05, "loss": 0.7074, "num_input_tokens_seen": 592960, "step": 1765 }, { "epoch": 1.3678516228748068, "grad_norm": 2.396319627761841, "learning_rate": 4.979504943515747e-05, "loss": 0.7182, "num_input_tokens_seen": 594656, "step": 1770 }, { "epoch": 1.3717156105100463, "grad_norm": 4.786496162414551, "learning_rate": 4.9790718038949116e-05, "loss": 0.9055, "num_input_tokens_seen": 596352, "step": 1775 }, { "epoch": 1.3755795981452859, "grad_norm": 1.2780112028121948, "learning_rate": 4.978634154287777e-05, "loss": 0.6312, "num_input_tokens_seen": 598080, "step": 1780 }, { "epoch": 1.3794435857805256, "grad_norm": 3.2135121822357178, "learning_rate": 4.978191995490525e-05, "loss": 0.6296, "num_input_tokens_seen": 599968, "step": 1785 }, { "epoch": 1.383307573415765, "grad_norm": 11.160727500915527, "learning_rate": 4.977745328307543e-05, "loss": 0.941, "num_input_tokens_seen": 602016, "step": 1790 }, { "epoch": 1.3871715610510047, "grad_norm": 2.0388169288635254, "learning_rate": 4.977294153551417e-05, "loss": 0.8757, "num_input_tokens_seen": 603904, "step": 1795 }, { "epoch": 1.3910355486862442, "grad_norm": 4.4117431640625, "learning_rate": 4.9768384720429364e-05, "loss": 0.7394, "num_input_tokens_seen": 605472, "step": 1800 }, { "epoch": 1.3948995363214838, "grad_norm": 1.575762391090393, "learning_rate": 4.9763782846110864e-05, "loss": 0.7237, "num_input_tokens_seen": 606944, "step": 1805 }, { "epoch": 1.3987635239567233, "grad_norm": 1.6657627820968628, "learning_rate": 4.975913592093052e-05, "loss": 0.6869, "num_input_tokens_seen": 608288, "step": 1810 }, { "epoch": 1.4026275115919629, "grad_norm": 1.4919757843017578, "learning_rate": 4.975444395334212e-05, "loss": 0.7597, "num_input_tokens_seen": 610048, "step": 1815 }, { "epoch": 1.4064914992272024, "grad_norm": 1.7389391660690308, "learning_rate": 4.97497069518814e-05, "loss": 0.5489, "num_input_tokens_seen": 611904, "step": 1820 }, { "epoch": 1.410355486862442, "grad_norm": 3.4058728218078613, "learning_rate": 4.974492492516604e-05, "loss": 0.8457, "num_input_tokens_seen": 613568, "step": 1825 }, { "epoch": 1.4142194744976817, "grad_norm": 2.62445068359375, "learning_rate": 4.974009788189561e-05, "loss": 0.6575, "num_input_tokens_seen": 615232, "step": 1830 }, { "epoch": 1.4180834621329212, "grad_norm": 2.078320026397705, "learning_rate": 4.9735225830851575e-05, "loss": 0.5366, "num_input_tokens_seen": 616896, "step": 1835 }, { "epoch": 1.4219474497681608, "grad_norm": 1.4576778411865234, "learning_rate": 4.9730308780897295e-05, "loss": 0.6029, "num_input_tokens_seen": 618624, "step": 1840 }, { "epoch": 1.4258114374034003, "grad_norm": 1.4914380311965942, "learning_rate": 4.9725346740977974e-05, "loss": 0.6073, "num_input_tokens_seen": 620352, "step": 1845 }, { "epoch": 1.4296754250386399, "grad_norm": 3.763982057571411, "learning_rate": 4.9720339720120685e-05, "loss": 0.6675, "num_input_tokens_seen": 621888, "step": 1850 }, { "epoch": 1.4335394126738794, "grad_norm": 6.968742370605469, "learning_rate": 4.9715287727434314e-05, "loss": 0.9175, "num_input_tokens_seen": 623680, "step": 1855 }, { "epoch": 1.437403400309119, "grad_norm": 1.6241474151611328, "learning_rate": 4.971019077210955e-05, "loss": 0.5913, "num_input_tokens_seen": 625280, "step": 1860 }, { "epoch": 1.4412673879443587, "grad_norm": 2.0128719806671143, "learning_rate": 4.970504886341893e-05, "loss": 0.6703, "num_input_tokens_seen": 627264, "step": 1865 }, { "epoch": 1.445131375579598, "grad_norm": 2.1391568183898926, "learning_rate": 4.969986201071671e-05, "loss": 0.5598, "num_input_tokens_seen": 628736, "step": 1870 }, { "epoch": 1.4489953632148378, "grad_norm": 1.2016927003860474, "learning_rate": 4.969463022343894e-05, "loss": 0.5574, "num_input_tokens_seen": 630304, "step": 1875 }, { "epoch": 1.4528593508500773, "grad_norm": 1.4031468629837036, "learning_rate": 4.968935351110342e-05, "loss": 0.6695, "num_input_tokens_seen": 632256, "step": 1880 }, { "epoch": 1.4567233384853169, "grad_norm": 1.94596266746521, "learning_rate": 4.968403188330966e-05, "loss": 0.5356, "num_input_tokens_seen": 633856, "step": 1885 }, { "epoch": 1.4605873261205564, "grad_norm": 1.2916264533996582, "learning_rate": 4.96786653497389e-05, "loss": 0.5662, "num_input_tokens_seen": 635584, "step": 1890 }, { "epoch": 1.464451313755796, "grad_norm": 1.5977596044540405, "learning_rate": 4.967325392015406e-05, "loss": 0.6932, "num_input_tokens_seen": 637344, "step": 1895 }, { "epoch": 1.4683153013910355, "grad_norm": 1.2034029960632324, "learning_rate": 4.966779760439974e-05, "loss": 0.609, "num_input_tokens_seen": 638880, "step": 1900 }, { "epoch": 1.472179289026275, "grad_norm": 1.9132407903671265, "learning_rate": 4.966229641240221e-05, "loss": 0.54, "num_input_tokens_seen": 640736, "step": 1905 }, { "epoch": 1.4760432766615148, "grad_norm": 1.2859302759170532, "learning_rate": 4.965675035416935e-05, "loss": 0.5368, "num_input_tokens_seen": 642336, "step": 1910 }, { "epoch": 1.4799072642967541, "grad_norm": 3.091049909591675, "learning_rate": 4.96511594397907e-05, "loss": 0.6105, "num_input_tokens_seen": 644096, "step": 1915 }, { "epoch": 1.4837712519319939, "grad_norm": 2.7636842727661133, "learning_rate": 4.964552367943737e-05, "loss": 0.7898, "num_input_tokens_seen": 645760, "step": 1920 }, { "epoch": 1.4876352395672334, "grad_norm": 1.1487714052200317, "learning_rate": 4.963984308336208e-05, "loss": 0.5801, "num_input_tokens_seen": 647744, "step": 1925 }, { "epoch": 1.491499227202473, "grad_norm": 1.0016406774520874, "learning_rate": 4.96341176618991e-05, "loss": 0.7508, "num_input_tokens_seen": 649280, "step": 1930 }, { "epoch": 1.4953632148377125, "grad_norm": 1.397398591041565, "learning_rate": 4.962834742546425e-05, "loss": 0.567, "num_input_tokens_seen": 650688, "step": 1935 }, { "epoch": 1.499227202472952, "grad_norm": 1.1556490659713745, "learning_rate": 4.9622532384554885e-05, "loss": 0.5352, "num_input_tokens_seen": 652224, "step": 1940 }, { "epoch": 1.5, "eval_loss": 0.7178723812103271, "eval_runtime": 11.4653, "eval_samples_per_second": 50.151, "eval_steps_per_second": 12.56, "num_input_tokens_seen": 652480, "step": 1941 }, { "epoch": 1.5030911901081918, "grad_norm": 1.4895243644714355, "learning_rate": 4.961667254974987e-05, "loss": 0.8016, "num_input_tokens_seen": 653920, "step": 1945 }, { "epoch": 1.5069551777434311, "grad_norm": 1.201518177986145, "learning_rate": 4.961076793170954e-05, "loss": 0.5986, "num_input_tokens_seen": 655520, "step": 1950 }, { "epoch": 1.510819165378671, "grad_norm": 1.5094408988952637, "learning_rate": 4.960481854117574e-05, "loss": 0.5324, "num_input_tokens_seen": 657152, "step": 1955 }, { "epoch": 1.5146831530139102, "grad_norm": 1.6980327367782593, "learning_rate": 4.959882438897172e-05, "loss": 0.5252, "num_input_tokens_seen": 659008, "step": 1960 }, { "epoch": 1.51854714064915, "grad_norm": 1.7007030248641968, "learning_rate": 4.9592785486002206e-05, "loss": 0.6947, "num_input_tokens_seen": 660800, "step": 1965 }, { "epoch": 1.5224111282843895, "grad_norm": 2.073425531387329, "learning_rate": 4.958670184325328e-05, "loss": 0.6421, "num_input_tokens_seen": 662560, "step": 1970 }, { "epoch": 1.526275115919629, "grad_norm": 2.339219570159912, "learning_rate": 4.958057347179249e-05, "loss": 0.6091, "num_input_tokens_seen": 664512, "step": 1975 }, { "epoch": 1.5301391035548686, "grad_norm": 1.5169576406478882, "learning_rate": 4.957440038276869e-05, "loss": 0.5775, "num_input_tokens_seen": 666016, "step": 1980 }, { "epoch": 1.5340030911901081, "grad_norm": 2.30753755569458, "learning_rate": 4.956818258741211e-05, "loss": 0.5121, "num_input_tokens_seen": 667744, "step": 1985 }, { "epoch": 1.537867078825348, "grad_norm": 3.5178563594818115, "learning_rate": 4.956192009703432e-05, "loss": 0.6319, "num_input_tokens_seen": 669600, "step": 1990 }, { "epoch": 1.5417310664605872, "grad_norm": 1.688891887664795, "learning_rate": 4.955561292302819e-05, "loss": 0.5181, "num_input_tokens_seen": 671200, "step": 1995 }, { "epoch": 1.545595054095827, "grad_norm": 2.825032949447632, "learning_rate": 4.954926107686788e-05, "loss": 0.714, "num_input_tokens_seen": 672832, "step": 2000 }, { "epoch": 1.5494590417310663, "grad_norm": 1.632403016090393, "learning_rate": 4.954286457010881e-05, "loss": 0.8074, "num_input_tokens_seen": 674560, "step": 2005 }, { "epoch": 1.553323029366306, "grad_norm": 1.81997549533844, "learning_rate": 4.9536423414387666e-05, "loss": 0.6823, "num_input_tokens_seen": 676192, "step": 2010 }, { "epoch": 1.5571870170015456, "grad_norm": 2.0194883346557617, "learning_rate": 4.952993762142236e-05, "loss": 1.0348, "num_input_tokens_seen": 678080, "step": 2015 }, { "epoch": 1.5610510046367851, "grad_norm": 1.3360929489135742, "learning_rate": 4.9523407203011985e-05, "loss": 0.8976, "num_input_tokens_seen": 679776, "step": 2020 }, { "epoch": 1.5649149922720247, "grad_norm": 1.007196068763733, "learning_rate": 4.951683217103684e-05, "loss": 0.5572, "num_input_tokens_seen": 681312, "step": 2025 }, { "epoch": 1.5687789799072642, "grad_norm": 1.0449111461639404, "learning_rate": 4.9510212537458375e-05, "loss": 0.7436, "num_input_tokens_seen": 683232, "step": 2030 }, { "epoch": 1.572642967542504, "grad_norm": 2.2628791332244873, "learning_rate": 4.9503548314319184e-05, "loss": 0.789, "num_input_tokens_seen": 685440, "step": 2035 }, { "epoch": 1.5765069551777433, "grad_norm": 4.169325828552246, "learning_rate": 4.9496839513742996e-05, "loss": 0.6536, "num_input_tokens_seen": 687072, "step": 2040 }, { "epoch": 1.580370942812983, "grad_norm": 1.56338369846344, "learning_rate": 4.94900861479346e-05, "loss": 0.572, "num_input_tokens_seen": 688640, "step": 2045 }, { "epoch": 1.5842349304482226, "grad_norm": 1.6310685873031616, "learning_rate": 4.948328822917989e-05, "loss": 0.6328, "num_input_tokens_seen": 690112, "step": 2050 }, { "epoch": 1.5880989180834622, "grad_norm": 4.043805122375488, "learning_rate": 4.9476445769845805e-05, "loss": 0.809, "num_input_tokens_seen": 691744, "step": 2055 }, { "epoch": 1.5919629057187017, "grad_norm": 3.5313925743103027, "learning_rate": 4.94695587823803e-05, "loss": 0.7261, "num_input_tokens_seen": 693760, "step": 2060 }, { "epoch": 1.5958268933539412, "grad_norm": 2.2301182746887207, "learning_rate": 4.9462627279312343e-05, "loss": 0.8492, "num_input_tokens_seen": 695328, "step": 2065 }, { "epoch": 1.599690880989181, "grad_norm": 2.0581419467926025, "learning_rate": 4.94556512732519e-05, "loss": 0.6053, "num_input_tokens_seen": 696704, "step": 2070 }, { "epoch": 1.6035548686244203, "grad_norm": 1.5733519792556763, "learning_rate": 4.944863077688989e-05, "loss": 0.8057, "num_input_tokens_seen": 698496, "step": 2075 }, { "epoch": 1.60741885625966, "grad_norm": 1.232012152671814, "learning_rate": 4.944156580299815e-05, "loss": 0.4819, "num_input_tokens_seen": 700096, "step": 2080 }, { "epoch": 1.6112828438948994, "grad_norm": 1.2082643508911133, "learning_rate": 4.943445636442946e-05, "loss": 0.5121, "num_input_tokens_seen": 701920, "step": 2085 }, { "epoch": 1.6151468315301392, "grad_norm": 2.1775283813476562, "learning_rate": 4.942730247411748e-05, "loss": 0.7415, "num_input_tokens_seen": 703392, "step": 2090 }, { "epoch": 1.6190108191653787, "grad_norm": 2.449903726577759, "learning_rate": 4.9420104145076736e-05, "loss": 0.6899, "num_input_tokens_seen": 705152, "step": 2095 }, { "epoch": 1.6228748068006182, "grad_norm": 1.4219528436660767, "learning_rate": 4.941286139040259e-05, "loss": 0.88, "num_input_tokens_seen": 706624, "step": 2100 }, { "epoch": 1.6267387944358578, "grad_norm": 1.6299880743026733, "learning_rate": 4.9405574223271246e-05, "loss": 0.7582, "num_input_tokens_seen": 708352, "step": 2105 }, { "epoch": 1.6306027820710973, "grad_norm": 1.3721541166305542, "learning_rate": 4.9398242656939684e-05, "loss": 0.5373, "num_input_tokens_seen": 709664, "step": 2110 }, { "epoch": 1.634466769706337, "grad_norm": 5.045022964477539, "learning_rate": 4.939086670474568e-05, "loss": 0.6916, "num_input_tokens_seen": 711552, "step": 2115 }, { "epoch": 1.6383307573415764, "grad_norm": 1.1204551458358765, "learning_rate": 4.938344638010771e-05, "loss": 0.5635, "num_input_tokens_seen": 713088, "step": 2120 }, { "epoch": 1.6421947449768162, "grad_norm": 2.263209581375122, "learning_rate": 4.937598169652502e-05, "loss": 0.5816, "num_input_tokens_seen": 714464, "step": 2125 }, { "epoch": 1.6460587326120555, "grad_norm": 1.2780593633651733, "learning_rate": 4.936847266757755e-05, "loss": 0.6572, "num_input_tokens_seen": 716096, "step": 2130 }, { "epoch": 1.6499227202472952, "grad_norm": 1.6126980781555176, "learning_rate": 4.936091930692589e-05, "loss": 0.6172, "num_input_tokens_seen": 717760, "step": 2135 }, { "epoch": 1.6537867078825348, "grad_norm": 0.9015939831733704, "learning_rate": 4.9353321628311305e-05, "loss": 0.5145, "num_input_tokens_seen": 719392, "step": 2140 }, { "epoch": 1.6576506955177743, "grad_norm": 1.415062665939331, "learning_rate": 4.9345679645555664e-05, "loss": 0.6166, "num_input_tokens_seen": 720992, "step": 2145 }, { "epoch": 1.6615146831530139, "grad_norm": 1.2511241436004639, "learning_rate": 4.933799337256144e-05, "loss": 0.5967, "num_input_tokens_seen": 722752, "step": 2150 }, { "epoch": 1.6653786707882534, "grad_norm": 1.9728535413742065, "learning_rate": 4.93302628233117e-05, "loss": 0.7035, "num_input_tokens_seen": 724448, "step": 2155 }, { "epoch": 1.6692426584234932, "grad_norm": 2.9270153045654297, "learning_rate": 4.932248801187003e-05, "loss": 0.6664, "num_input_tokens_seen": 726240, "step": 2160 }, { "epoch": 1.6731066460587325, "grad_norm": 1.7510836124420166, "learning_rate": 4.931466895238054e-05, "loss": 0.6057, "num_input_tokens_seen": 727872, "step": 2165 }, { "epoch": 1.6769706336939723, "grad_norm": 1.840891718864441, "learning_rate": 4.930680565906787e-05, "loss": 0.5325, "num_input_tokens_seen": 729856, "step": 2170 }, { "epoch": 1.6808346213292118, "grad_norm": 1.7051429748535156, "learning_rate": 4.9298898146237105e-05, "loss": 0.6419, "num_input_tokens_seen": 731840, "step": 2175 }, { "epoch": 1.6846986089644513, "grad_norm": 1.7991974353790283, "learning_rate": 4.9290946428273775e-05, "loss": 0.5799, "num_input_tokens_seen": 733632, "step": 2180 }, { "epoch": 1.6885625965996909, "grad_norm": 0.9840047359466553, "learning_rate": 4.9282950519643836e-05, "loss": 0.4441, "num_input_tokens_seen": 735168, "step": 2185 }, { "epoch": 1.6924265842349304, "grad_norm": 2.117263078689575, "learning_rate": 4.927491043489363e-05, "loss": 0.7129, "num_input_tokens_seen": 736992, "step": 2190 }, { "epoch": 1.69629057187017, "grad_norm": 1.047438621520996, "learning_rate": 4.926682618864988e-05, "loss": 0.5175, "num_input_tokens_seen": 738624, "step": 2195 }, { "epoch": 1.7001545595054095, "grad_norm": 1.2048122882843018, "learning_rate": 4.9258697795619635e-05, "loss": 0.5705, "num_input_tokens_seen": 740064, "step": 2200 }, { "epoch": 1.7040185471406493, "grad_norm": 1.0884156227111816, "learning_rate": 4.925052527059026e-05, "loss": 0.5158, "num_input_tokens_seen": 741792, "step": 2205 }, { "epoch": 1.7078825347758886, "grad_norm": 1.945084810256958, "learning_rate": 4.92423086284294e-05, "loss": 0.4966, "num_input_tokens_seen": 743488, "step": 2210 }, { "epoch": 1.7117465224111283, "grad_norm": 1.8525573015213013, "learning_rate": 4.923404788408498e-05, "loss": 0.5936, "num_input_tokens_seen": 745184, "step": 2215 }, { "epoch": 1.7156105100463679, "grad_norm": 1.7582542896270752, "learning_rate": 4.922574305258513e-05, "loss": 0.5492, "num_input_tokens_seen": 746784, "step": 2220 }, { "epoch": 1.7194744976816074, "grad_norm": 1.2979035377502441, "learning_rate": 4.9217394149038196e-05, "loss": 0.5972, "num_input_tokens_seen": 748288, "step": 2225 }, { "epoch": 1.723338485316847, "grad_norm": 1.1435630321502686, "learning_rate": 4.920900118863271e-05, "loss": 0.5307, "num_input_tokens_seen": 750048, "step": 2230 }, { "epoch": 1.7272024729520865, "grad_norm": 1.1659036874771118, "learning_rate": 4.9200564186637346e-05, "loss": 0.7984, "num_input_tokens_seen": 751904, "step": 2235 }, { "epoch": 1.7310664605873263, "grad_norm": 1.19560706615448, "learning_rate": 4.919208315840088e-05, "loss": 0.5349, "num_input_tokens_seen": 753536, "step": 2240 }, { "epoch": 1.7349304482225656, "grad_norm": 1.7486205101013184, "learning_rate": 4.918355811935223e-05, "loss": 0.4997, "num_input_tokens_seen": 755136, "step": 2245 }, { "epoch": 1.7387944358578054, "grad_norm": 1.0370196104049683, "learning_rate": 4.9174989085000325e-05, "loss": 0.5857, "num_input_tokens_seen": 756544, "step": 2250 }, { "epoch": 1.7426584234930447, "grad_norm": 1.5363653898239136, "learning_rate": 4.916637607093416e-05, "loss": 0.7711, "num_input_tokens_seen": 758048, "step": 2255 }, { "epoch": 1.7465224111282844, "grad_norm": 1.4228001832962036, "learning_rate": 4.9157719092822747e-05, "loss": 0.954, "num_input_tokens_seen": 759872, "step": 2260 }, { "epoch": 1.750386398763524, "grad_norm": 1.3623327016830444, "learning_rate": 4.914901816641505e-05, "loss": 0.4764, "num_input_tokens_seen": 761344, "step": 2265 }, { "epoch": 1.7542503863987635, "grad_norm": 1.5220723152160645, "learning_rate": 4.914027330754002e-05, "loss": 0.6934, "num_input_tokens_seen": 763136, "step": 2270 }, { "epoch": 1.758114374034003, "grad_norm": 2.120134115219116, "learning_rate": 4.913148453210649e-05, "loss": 0.6201, "num_input_tokens_seen": 764576, "step": 2275 }, { "epoch": 1.7619783616692426, "grad_norm": 5.282575607299805, "learning_rate": 4.9122651856103216e-05, "loss": 0.9926, "num_input_tokens_seen": 766240, "step": 2280 }, { "epoch": 1.7658423493044824, "grad_norm": 1.25540030002594, "learning_rate": 4.911377529559883e-05, "loss": 0.5174, "num_input_tokens_seen": 767936, "step": 2285 }, { "epoch": 1.7697063369397217, "grad_norm": 1.6631218194961548, "learning_rate": 4.9104854866741756e-05, "loss": 0.5089, "num_input_tokens_seen": 770144, "step": 2290 }, { "epoch": 1.7735703245749614, "grad_norm": 1.3036575317382812, "learning_rate": 4.9095890585760266e-05, "loss": 0.5412, "num_input_tokens_seen": 772096, "step": 2295 }, { "epoch": 1.7774343122102008, "grad_norm": 4.666076183319092, "learning_rate": 4.908688246896238e-05, "loss": 0.6275, "num_input_tokens_seen": 773664, "step": 2300 }, { "epoch": 1.7812982998454405, "grad_norm": 3.210664749145508, "learning_rate": 4.90778305327359e-05, "loss": 0.7936, "num_input_tokens_seen": 775168, "step": 2305 }, { "epoch": 1.78516228748068, "grad_norm": 1.333380103111267, "learning_rate": 4.90687347935483e-05, "loss": 0.511, "num_input_tokens_seen": 776960, "step": 2310 }, { "epoch": 1.7890262751159196, "grad_norm": 2.0277512073516846, "learning_rate": 4.905959526794678e-05, "loss": 0.5565, "num_input_tokens_seen": 778432, "step": 2315 }, { "epoch": 1.7928902627511591, "grad_norm": 0.9740800857543945, "learning_rate": 4.905041197255817e-05, "loss": 0.5641, "num_input_tokens_seen": 779968, "step": 2320 }, { "epoch": 1.7967542503863987, "grad_norm": 1.8244127035140991, "learning_rate": 4.904118492408895e-05, "loss": 0.5639, "num_input_tokens_seen": 781536, "step": 2325 }, { "epoch": 1.8006182380216385, "grad_norm": 1.283334732055664, "learning_rate": 4.9031914139325175e-05, "loss": 0.5199, "num_input_tokens_seen": 783296, "step": 2330 }, { "epoch": 1.8044822256568778, "grad_norm": 1.422148585319519, "learning_rate": 4.902259963513248e-05, "loss": 0.7572, "num_input_tokens_seen": 785088, "step": 2335 }, { "epoch": 1.8083462132921175, "grad_norm": 1.6225029230117798, "learning_rate": 4.901324142845605e-05, "loss": 0.5553, "num_input_tokens_seen": 786816, "step": 2340 }, { "epoch": 1.812210200927357, "grad_norm": 1.2208060026168823, "learning_rate": 4.900383953632053e-05, "loss": 0.7162, "num_input_tokens_seen": 788576, "step": 2345 }, { "epoch": 1.8160741885625966, "grad_norm": 1.295591115951538, "learning_rate": 4.8994393975830074e-05, "loss": 1.0837, "num_input_tokens_seen": 790656, "step": 2350 }, { "epoch": 1.8199381761978362, "grad_norm": 1.4950356483459473, "learning_rate": 4.898490476416828e-05, "loss": 0.5341, "num_input_tokens_seen": 792480, "step": 2355 }, { "epoch": 1.8238021638330757, "grad_norm": 1.8285695314407349, "learning_rate": 4.8975371918598144e-05, "loss": 0.5305, "num_input_tokens_seen": 794272, "step": 2360 }, { "epoch": 1.8276661514683155, "grad_norm": 1.110389232635498, "learning_rate": 4.896579545646204e-05, "loss": 0.7142, "num_input_tokens_seen": 795904, "step": 2365 }, { "epoch": 1.8315301391035548, "grad_norm": 3.7162725925445557, "learning_rate": 4.895617539518169e-05, "loss": 0.8098, "num_input_tokens_seen": 797632, "step": 2370 }, { "epoch": 1.8353941267387945, "grad_norm": 0.951755940914154, "learning_rate": 4.894651175225815e-05, "loss": 0.5534, "num_input_tokens_seen": 799648, "step": 2375 }, { "epoch": 1.8392581143740339, "grad_norm": 3.052557945251465, "learning_rate": 4.893680454527174e-05, "loss": 0.5402, "num_input_tokens_seen": 801216, "step": 2380 }, { "epoch": 1.8431221020092736, "grad_norm": 1.1122266054153442, "learning_rate": 4.892705379188205e-05, "loss": 0.6495, "num_input_tokens_seen": 802848, "step": 2385 }, { "epoch": 1.8469860896445132, "grad_norm": 1.3800560235977173, "learning_rate": 4.891725950982787e-05, "loss": 0.5758, "num_input_tokens_seen": 804672, "step": 2390 }, { "epoch": 1.8508500772797527, "grad_norm": 1.9103162288665771, "learning_rate": 4.890742171692721e-05, "loss": 0.7216, "num_input_tokens_seen": 806496, "step": 2395 }, { "epoch": 1.8547140649149922, "grad_norm": 1.295331358909607, "learning_rate": 4.889754043107719e-05, "loss": 0.5205, "num_input_tokens_seen": 808288, "step": 2400 }, { "epoch": 1.8585780525502318, "grad_norm": 0.8636698126792908, "learning_rate": 4.8887615670254105e-05, "loss": 0.5587, "num_input_tokens_seen": 809696, "step": 2405 }, { "epoch": 1.8624420401854715, "grad_norm": 1.10353684425354, "learning_rate": 4.8877647452513294e-05, "loss": 0.7025, "num_input_tokens_seen": 811392, "step": 2410 }, { "epoch": 1.8663060278207109, "grad_norm": 5.125211238861084, "learning_rate": 4.88676357959892e-05, "loss": 0.5943, "num_input_tokens_seen": 813248, "step": 2415 }, { "epoch": 1.8701700154559506, "grad_norm": 2.0516300201416016, "learning_rate": 4.885758071889524e-05, "loss": 0.8003, "num_input_tokens_seen": 814688, "step": 2420 }, { "epoch": 1.87403400309119, "grad_norm": 1.0114511251449585, "learning_rate": 4.884748223952387e-05, "loss": 0.485, "num_input_tokens_seen": 816096, "step": 2425 }, { "epoch": 1.8778979907264297, "grad_norm": 2.534055709838867, "learning_rate": 4.883734037624647e-05, "loss": 0.6158, "num_input_tokens_seen": 817888, "step": 2430 }, { "epoch": 1.8817619783616693, "grad_norm": 2.3783583641052246, "learning_rate": 4.882715514751336e-05, "loss": 0.7092, "num_input_tokens_seen": 819808, "step": 2435 }, { "epoch": 1.8856259659969088, "grad_norm": 2.3611772060394287, "learning_rate": 4.881692657185376e-05, "loss": 0.7409, "num_input_tokens_seen": 821376, "step": 2440 }, { "epoch": 1.8894899536321483, "grad_norm": 2.441654682159424, "learning_rate": 4.8806654667875723e-05, "loss": 0.5961, "num_input_tokens_seen": 823136, "step": 2445 }, { "epoch": 1.8933539412673879, "grad_norm": 4.523927688598633, "learning_rate": 4.879633945426615e-05, "loss": 0.5504, "num_input_tokens_seen": 824576, "step": 2450 }, { "epoch": 1.8972179289026276, "grad_norm": 1.3558104038238525, "learning_rate": 4.8785980949790724e-05, "loss": 0.5375, "num_input_tokens_seen": 826240, "step": 2455 }, { "epoch": 1.901081916537867, "grad_norm": 2.5606250762939453, "learning_rate": 4.8775579173293885e-05, "loss": 0.6093, "num_input_tokens_seen": 828288, "step": 2460 }, { "epoch": 1.9049459041731067, "grad_norm": 1.609434723854065, "learning_rate": 4.876513414369878e-05, "loss": 0.5719, "num_input_tokens_seen": 829984, "step": 2465 }, { "epoch": 1.9088098918083463, "grad_norm": 1.238630771636963, "learning_rate": 4.875464588000727e-05, "loss": 0.5684, "num_input_tokens_seen": 831680, "step": 2470 }, { "epoch": 1.9126738794435858, "grad_norm": 1.4387930631637573, "learning_rate": 4.8744114401299834e-05, "loss": 0.4893, "num_input_tokens_seen": 833376, "step": 2475 }, { "epoch": 1.9165378670788253, "grad_norm": 1.6219137907028198, "learning_rate": 4.87335397267356e-05, "loss": 0.7982, "num_input_tokens_seen": 835232, "step": 2480 }, { "epoch": 1.9204018547140649, "grad_norm": 4.36570930480957, "learning_rate": 4.872292187555227e-05, "loss": 0.9341, "num_input_tokens_seen": 836864, "step": 2485 }, { "epoch": 1.9242658423493046, "grad_norm": 3.037836790084839, "learning_rate": 4.8712260867066085e-05, "loss": 0.5121, "num_input_tokens_seen": 838624, "step": 2490 }, { "epoch": 1.928129829984544, "grad_norm": 0.9014259576797485, "learning_rate": 4.87015567206718e-05, "loss": 0.4966, "num_input_tokens_seen": 840544, "step": 2495 }, { "epoch": 1.9319938176197837, "grad_norm": 1.091304898262024, "learning_rate": 4.8690809455842676e-05, "loss": 0.7852, "num_input_tokens_seen": 842272, "step": 2500 }, { "epoch": 1.935857805255023, "grad_norm": 2.407442092895508, "learning_rate": 4.8680019092130364e-05, "loss": 0.5832, "num_input_tokens_seen": 844000, "step": 2505 }, { "epoch": 1.9397217928902628, "grad_norm": 1.0977860689163208, "learning_rate": 4.8669185649164964e-05, "loss": 0.5185, "num_input_tokens_seen": 845408, "step": 2510 }, { "epoch": 1.9435857805255023, "grad_norm": 1.2953741550445557, "learning_rate": 4.865830914665493e-05, "loss": 0.8511, "num_input_tokens_seen": 847136, "step": 2515 }, { "epoch": 1.947449768160742, "grad_norm": 1.7437061071395874, "learning_rate": 4.864738960438705e-05, "loss": 0.736, "num_input_tokens_seen": 848640, "step": 2520 }, { "epoch": 1.9513137557959814, "grad_norm": 1.528610110282898, "learning_rate": 4.863642704222642e-05, "loss": 0.5788, "num_input_tokens_seen": 850432, "step": 2525 }, { "epoch": 1.955177743431221, "grad_norm": 1.3274085521697998, "learning_rate": 4.8625421480116395e-05, "loss": 0.5532, "num_input_tokens_seen": 852128, "step": 2530 }, { "epoch": 1.9590417310664607, "grad_norm": 1.860080361366272, "learning_rate": 4.861437293807856e-05, "loss": 0.5771, "num_input_tokens_seen": 853888, "step": 2535 }, { "epoch": 1.9629057187017, "grad_norm": 1.230636715888977, "learning_rate": 4.860328143621267e-05, "loss": 0.4457, "num_input_tokens_seen": 855584, "step": 2540 }, { "epoch": 1.9667697063369398, "grad_norm": 0.9437137246131897, "learning_rate": 4.8592146994696646e-05, "loss": 0.6668, "num_input_tokens_seen": 857280, "step": 2545 }, { "epoch": 1.9706336939721791, "grad_norm": 1.012313723564148, "learning_rate": 4.8580969633786544e-05, "loss": 0.6828, "num_input_tokens_seen": 859104, "step": 2550 }, { "epoch": 1.974497681607419, "grad_norm": 1.7987966537475586, "learning_rate": 4.856974937381648e-05, "loss": 0.6001, "num_input_tokens_seen": 860768, "step": 2555 }, { "epoch": 1.9783616692426584, "grad_norm": 1.3273541927337646, "learning_rate": 4.855848623519862e-05, "loss": 0.6573, "num_input_tokens_seen": 862496, "step": 2560 }, { "epoch": 1.982225656877898, "grad_norm": 1.7044663429260254, "learning_rate": 4.854718023842312e-05, "loss": 0.5552, "num_input_tokens_seen": 864512, "step": 2565 }, { "epoch": 1.9860896445131375, "grad_norm": 1.6660404205322266, "learning_rate": 4.853583140405813e-05, "loss": 0.5192, "num_input_tokens_seen": 865856, "step": 2570 }, { "epoch": 1.989953632148377, "grad_norm": 1.586242914199829, "learning_rate": 4.8524439752749716e-05, "loss": 0.7794, "num_input_tokens_seen": 867552, "step": 2575 }, { "epoch": 1.9938176197836168, "grad_norm": 0.8288000226020813, "learning_rate": 4.8513005305221846e-05, "loss": 0.7336, "num_input_tokens_seen": 868928, "step": 2580 }, { "epoch": 1.9976816074188561, "grad_norm": 1.2148245573043823, "learning_rate": 4.850152808227633e-05, "loss": 0.5722, "num_input_tokens_seen": 870432, "step": 2585 }, { "epoch": 2.0, "eval_loss": 0.6825047731399536, "eval_runtime": 11.445, "eval_samples_per_second": 50.24, "eval_steps_per_second": 12.582, "num_input_tokens_seen": 871200, "step": 2588 }, { "epoch": 2.001545595054096, "grad_norm": 1.8192992210388184, "learning_rate": 4.849000810479281e-05, "loss": 0.6712, "num_input_tokens_seen": 871808, "step": 2590 }, { "epoch": 2.0054095826893352, "grad_norm": 1.0573179721832275, "learning_rate": 4.8478445393728686e-05, "loss": 0.6894, "num_input_tokens_seen": 873440, "step": 2595 }, { "epoch": 2.009273570324575, "grad_norm": 1.2139945030212402, "learning_rate": 4.846683997011914e-05, "loss": 0.6208, "num_input_tokens_seen": 875104, "step": 2600 }, { "epoch": 2.0131375579598147, "grad_norm": 1.540300726890564, "learning_rate": 4.845519185507703e-05, "loss": 0.5693, "num_input_tokens_seen": 876992, "step": 2605 }, { "epoch": 2.017001545595054, "grad_norm": 0.9763923287391663, "learning_rate": 4.844350106979287e-05, "loss": 0.7365, "num_input_tokens_seen": 878816, "step": 2610 }, { "epoch": 2.020865533230294, "grad_norm": 0.7553500533103943, "learning_rate": 4.843176763553483e-05, "loss": 0.5576, "num_input_tokens_seen": 880480, "step": 2615 }, { "epoch": 2.024729520865533, "grad_norm": 0.9088889956474304, "learning_rate": 4.8419991573648635e-05, "loss": 0.4629, "num_input_tokens_seen": 882208, "step": 2620 }, { "epoch": 2.028593508500773, "grad_norm": 1.296615719795227, "learning_rate": 4.8408172905557596e-05, "loss": 0.5641, "num_input_tokens_seen": 884192, "step": 2625 }, { "epoch": 2.0324574961360122, "grad_norm": 1.0783467292785645, "learning_rate": 4.8396311652762505e-05, "loss": 0.6194, "num_input_tokens_seen": 885856, "step": 2630 }, { "epoch": 2.036321483771252, "grad_norm": 1.8956995010375977, "learning_rate": 4.8384407836841626e-05, "loss": 0.5267, "num_input_tokens_seen": 887488, "step": 2635 }, { "epoch": 2.0401854714064913, "grad_norm": 4.462429523468018, "learning_rate": 4.837246147945068e-05, "loss": 0.7846, "num_input_tokens_seen": 889184, "step": 2640 }, { "epoch": 2.044049459041731, "grad_norm": 0.9090690612792969, "learning_rate": 4.836047260232276e-05, "loss": 0.4908, "num_input_tokens_seen": 890624, "step": 2645 }, { "epoch": 2.047913446676971, "grad_norm": 2.114497184753418, "learning_rate": 4.83484412272683e-05, "loss": 0.5044, "num_input_tokens_seen": 892224, "step": 2650 }, { "epoch": 2.05177743431221, "grad_norm": 1.7265336513519287, "learning_rate": 4.833636737617509e-05, "loss": 0.6586, "num_input_tokens_seen": 893824, "step": 2655 }, { "epoch": 2.05564142194745, "grad_norm": 1.4411967992782593, "learning_rate": 4.832425107100814e-05, "loss": 0.517, "num_input_tokens_seen": 895840, "step": 2660 }, { "epoch": 2.0595054095826892, "grad_norm": 0.601601779460907, "learning_rate": 4.8312092333809734e-05, "loss": 0.6538, "num_input_tokens_seen": 897568, "step": 2665 }, { "epoch": 2.063369397217929, "grad_norm": 1.0393946170806885, "learning_rate": 4.829989118669933e-05, "loss": 0.5438, "num_input_tokens_seen": 899328, "step": 2670 }, { "epoch": 2.0672333848531683, "grad_norm": 5.023001670837402, "learning_rate": 4.8287647651873565e-05, "loss": 1.051, "num_input_tokens_seen": 900992, "step": 2675 }, { "epoch": 2.071097372488408, "grad_norm": 1.298365831375122, "learning_rate": 4.827536175160614e-05, "loss": 0.527, "num_input_tokens_seen": 902752, "step": 2680 }, { "epoch": 2.0749613601236474, "grad_norm": 0.8913055658340454, "learning_rate": 4.826303350824787e-05, "loss": 0.5988, "num_input_tokens_seen": 904736, "step": 2685 }, { "epoch": 2.078825347758887, "grad_norm": 0.9280687570571899, "learning_rate": 4.825066294422659e-05, "loss": 0.4682, "num_input_tokens_seen": 906144, "step": 2690 }, { "epoch": 2.082689335394127, "grad_norm": 0.8446508049964905, "learning_rate": 4.8238250082047106e-05, "loss": 0.6254, "num_input_tokens_seen": 907616, "step": 2695 }, { "epoch": 2.0865533230293662, "grad_norm": 0.9633538722991943, "learning_rate": 4.822579494429123e-05, "loss": 0.574, "num_input_tokens_seen": 909408, "step": 2700 }, { "epoch": 2.090417310664606, "grad_norm": 3.4377827644348145, "learning_rate": 4.82132975536176e-05, "loss": 0.6627, "num_input_tokens_seen": 910848, "step": 2705 }, { "epoch": 2.0942812982998453, "grad_norm": 1.255698323249817, "learning_rate": 4.8200757932761786e-05, "loss": 0.5943, "num_input_tokens_seen": 912480, "step": 2710 }, { "epoch": 2.098145285935085, "grad_norm": 1.1742775440216064, "learning_rate": 4.818817610453617e-05, "loss": 0.5842, "num_input_tokens_seen": 913952, "step": 2715 }, { "epoch": 2.1020092735703244, "grad_norm": 1.3470114469528198, "learning_rate": 4.8175552091829914e-05, "loss": 0.5332, "num_input_tokens_seen": 915744, "step": 2720 }, { "epoch": 2.105873261205564, "grad_norm": 1.1415218114852905, "learning_rate": 4.816288591760891e-05, "loss": 0.5053, "num_input_tokens_seen": 917376, "step": 2725 }, { "epoch": 2.109737248840804, "grad_norm": 1.4444624185562134, "learning_rate": 4.8150177604915764e-05, "loss": 0.6624, "num_input_tokens_seen": 919168, "step": 2730 }, { "epoch": 2.1136012364760433, "grad_norm": 1.0789060592651367, "learning_rate": 4.813742717686974e-05, "loss": 1.0457, "num_input_tokens_seen": 920864, "step": 2735 }, { "epoch": 2.117465224111283, "grad_norm": 1.158774971961975, "learning_rate": 4.8124634656666734e-05, "loss": 0.8069, "num_input_tokens_seen": 922688, "step": 2740 }, { "epoch": 2.1213292117465223, "grad_norm": 0.8617830276489258, "learning_rate": 4.811180006757919e-05, "loss": 0.5576, "num_input_tokens_seen": 924512, "step": 2745 }, { "epoch": 2.125193199381762, "grad_norm": 2.3595569133758545, "learning_rate": 4.80989234329561e-05, "loss": 0.5214, "num_input_tokens_seen": 926272, "step": 2750 }, { "epoch": 2.1290571870170014, "grad_norm": 8.54576301574707, "learning_rate": 4.808600477622293e-05, "loss": 0.6211, "num_input_tokens_seen": 928032, "step": 2755 }, { "epoch": 2.132921174652241, "grad_norm": 1.6093125343322754, "learning_rate": 4.807304412088163e-05, "loss": 0.4489, "num_input_tokens_seen": 929696, "step": 2760 }, { "epoch": 2.1367851622874805, "grad_norm": 1.1189583539962769, "learning_rate": 4.806004149051052e-05, "loss": 0.6793, "num_input_tokens_seen": 931072, "step": 2765 }, { "epoch": 2.1406491499227203, "grad_norm": 2.444523572921753, "learning_rate": 4.80469969087643e-05, "loss": 0.6529, "num_input_tokens_seen": 932672, "step": 2770 }, { "epoch": 2.1445131375579596, "grad_norm": 2.017971992492676, "learning_rate": 4.803391039937397e-05, "loss": 0.6467, "num_input_tokens_seen": 934240, "step": 2775 }, { "epoch": 2.1483771251931993, "grad_norm": 1.8557348251342773, "learning_rate": 4.8020781986146834e-05, "loss": 0.5908, "num_input_tokens_seen": 935904, "step": 2780 }, { "epoch": 2.152241112828439, "grad_norm": 0.7478421330451965, "learning_rate": 4.80076116929664e-05, "loss": 0.5127, "num_input_tokens_seen": 937568, "step": 2785 }, { "epoch": 2.1561051004636784, "grad_norm": 0.6839173436164856, "learning_rate": 4.799439954379239e-05, "loss": 0.5207, "num_input_tokens_seen": 939136, "step": 2790 }, { "epoch": 2.159969088098918, "grad_norm": 1.3487695455551147, "learning_rate": 4.798114556266066e-05, "loss": 0.5444, "num_input_tokens_seen": 940640, "step": 2795 }, { "epoch": 2.1638330757341575, "grad_norm": 1.5758495330810547, "learning_rate": 4.7967849773683146e-05, "loss": 0.575, "num_input_tokens_seen": 942528, "step": 2800 }, { "epoch": 2.1676970633693973, "grad_norm": 1.2912291288375854, "learning_rate": 4.795451220104789e-05, "loss": 0.737, "num_input_tokens_seen": 944000, "step": 2805 }, { "epoch": 2.1715610510046366, "grad_norm": 0.8708459734916687, "learning_rate": 4.79411328690189e-05, "loss": 0.4491, "num_input_tokens_seen": 945760, "step": 2810 }, { "epoch": 2.1754250386398764, "grad_norm": 1.695089340209961, "learning_rate": 4.792771180193619e-05, "loss": 0.6017, "num_input_tokens_seen": 947424, "step": 2815 }, { "epoch": 2.179289026275116, "grad_norm": 2.9894039630889893, "learning_rate": 4.7914249024215686e-05, "loss": 0.6651, "num_input_tokens_seen": 949088, "step": 2820 }, { "epoch": 2.1831530139103554, "grad_norm": 1.1626811027526855, "learning_rate": 4.790074456034918e-05, "loss": 0.4824, "num_input_tokens_seen": 950784, "step": 2825 }, { "epoch": 2.187017001545595, "grad_norm": 1.2852622270584106, "learning_rate": 4.788719843490431e-05, "loss": 0.5166, "num_input_tokens_seen": 952768, "step": 2830 }, { "epoch": 2.1908809891808345, "grad_norm": 1.2360841035842896, "learning_rate": 4.787361067252452e-05, "loss": 0.5076, "num_input_tokens_seen": 954528, "step": 2835 }, { "epoch": 2.1947449768160743, "grad_norm": 1.0477244853973389, "learning_rate": 4.7859981297928986e-05, "loss": 0.5535, "num_input_tokens_seen": 956384, "step": 2840 }, { "epoch": 2.1986089644513136, "grad_norm": 1.3799697160720825, "learning_rate": 4.784631033591259e-05, "loss": 0.5914, "num_input_tokens_seen": 957920, "step": 2845 }, { "epoch": 2.2024729520865534, "grad_norm": 1.26620352268219, "learning_rate": 4.783259781134588e-05, "loss": 0.5603, "num_input_tokens_seen": 959552, "step": 2850 }, { "epoch": 2.206336939721793, "grad_norm": 1.0855884552001953, "learning_rate": 4.781884374917499e-05, "loss": 0.5791, "num_input_tokens_seen": 960896, "step": 2855 }, { "epoch": 2.2102009273570324, "grad_norm": 1.286551833152771, "learning_rate": 4.780504817442165e-05, "loss": 0.8503, "num_input_tokens_seen": 962528, "step": 2860 }, { "epoch": 2.214064914992272, "grad_norm": 2.7074520587921143, "learning_rate": 4.7791211112183084e-05, "loss": 0.8775, "num_input_tokens_seen": 964128, "step": 2865 }, { "epoch": 2.2179289026275115, "grad_norm": 2.6111092567443848, "learning_rate": 4.777733258763202e-05, "loss": 0.6214, "num_input_tokens_seen": 965792, "step": 2870 }, { "epoch": 2.2217928902627513, "grad_norm": 1.2884844541549683, "learning_rate": 4.7763412626016597e-05, "loss": 0.507, "num_input_tokens_seen": 967584, "step": 2875 }, { "epoch": 2.2256568778979906, "grad_norm": 1.6022179126739502, "learning_rate": 4.7749451252660326e-05, "loss": 0.6846, "num_input_tokens_seen": 969440, "step": 2880 }, { "epoch": 2.2295208655332304, "grad_norm": 1.491121768951416, "learning_rate": 4.773544849296208e-05, "loss": 0.8272, "num_input_tokens_seen": 971296, "step": 2885 }, { "epoch": 2.2333848531684697, "grad_norm": 1.0879337787628174, "learning_rate": 4.772140437239601e-05, "loss": 0.5236, "num_input_tokens_seen": 972800, "step": 2890 }, { "epoch": 2.2372488408037094, "grad_norm": 1.114122986793518, "learning_rate": 4.7707318916511515e-05, "loss": 0.5042, "num_input_tokens_seen": 974592, "step": 2895 }, { "epoch": 2.2411128284389488, "grad_norm": 4.874577522277832, "learning_rate": 4.769319215093318e-05, "loss": 0.7654, "num_input_tokens_seen": 976224, "step": 2900 }, { "epoch": 2.2449768160741885, "grad_norm": 1.145654559135437, "learning_rate": 4.767902410136076e-05, "loss": 0.6464, "num_input_tokens_seen": 978144, "step": 2905 }, { "epoch": 2.2488408037094283, "grad_norm": 1.882925033569336, "learning_rate": 4.7664814793569125e-05, "loss": 0.6286, "num_input_tokens_seen": 979808, "step": 2910 }, { "epoch": 2.2527047913446676, "grad_norm": 1.5039573907852173, "learning_rate": 4.765056425340815e-05, "loss": 0.6282, "num_input_tokens_seen": 981504, "step": 2915 }, { "epoch": 2.2565687789799074, "grad_norm": 1.3709388971328735, "learning_rate": 4.7636272506802784e-05, "loss": 0.5423, "num_input_tokens_seen": 983104, "step": 2920 }, { "epoch": 2.2604327666151467, "grad_norm": 0.9909651875495911, "learning_rate": 4.7621939579752906e-05, "loss": 0.6022, "num_input_tokens_seen": 984704, "step": 2925 }, { "epoch": 2.2642967542503865, "grad_norm": 1.050881266593933, "learning_rate": 4.760756549833331e-05, "loss": 0.4194, "num_input_tokens_seen": 986432, "step": 2930 }, { "epoch": 2.2681607418856258, "grad_norm": 2.409635066986084, "learning_rate": 4.7593150288693664e-05, "loss": 0.5873, "num_input_tokens_seen": 987904, "step": 2935 }, { "epoch": 2.2720247295208655, "grad_norm": 1.4359625577926636, "learning_rate": 4.7578693977058474e-05, "loss": 0.9243, "num_input_tokens_seen": 989472, "step": 2940 }, { "epoch": 2.2758887171561053, "grad_norm": 1.4288301467895508, "learning_rate": 4.7564196589727e-05, "loss": 0.758, "num_input_tokens_seen": 991040, "step": 2945 }, { "epoch": 2.2797527047913446, "grad_norm": 1.1762726306915283, "learning_rate": 4.754965815307323e-05, "loss": 0.5541, "num_input_tokens_seen": 992672, "step": 2950 }, { "epoch": 2.2836166924265844, "grad_norm": 1.1025105714797974, "learning_rate": 4.7535078693545825e-05, "loss": 0.5197, "num_input_tokens_seen": 994304, "step": 2955 }, { "epoch": 2.2874806800618237, "grad_norm": 0.9903162717819214, "learning_rate": 4.75204582376681e-05, "loss": 0.5427, "num_input_tokens_seen": 995872, "step": 2960 }, { "epoch": 2.2913446676970635, "grad_norm": 1.1935129165649414, "learning_rate": 4.750579681203792e-05, "loss": 0.6144, "num_input_tokens_seen": 997632, "step": 2965 }, { "epoch": 2.295208655332303, "grad_norm": 1.0572192668914795, "learning_rate": 4.749109444332772e-05, "loss": 0.4866, "num_input_tokens_seen": 999648, "step": 2970 }, { "epoch": 2.2990726429675425, "grad_norm": 1.1438264846801758, "learning_rate": 4.7476351158284374e-05, "loss": 0.5031, "num_input_tokens_seen": 1001760, "step": 2975 }, { "epoch": 2.3029366306027823, "grad_norm": 1.91819429397583, "learning_rate": 4.746156698372923e-05, "loss": 0.7069, "num_input_tokens_seen": 1003168, "step": 2980 }, { "epoch": 2.3068006182380216, "grad_norm": 2.564328193664551, "learning_rate": 4.744674194655802e-05, "loss": 0.6263, "num_input_tokens_seen": 1004736, "step": 2985 }, { "epoch": 2.3106646058732614, "grad_norm": 1.0680264234542847, "learning_rate": 4.7431876073740774e-05, "loss": 0.556, "num_input_tokens_seen": 1006464, "step": 2990 }, { "epoch": 2.3145285935085007, "grad_norm": 2.1349689960479736, "learning_rate": 4.741696939232186e-05, "loss": 0.529, "num_input_tokens_seen": 1008064, "step": 2995 }, { "epoch": 2.3183925811437405, "grad_norm": 2.053471565246582, "learning_rate": 4.740202192941987e-05, "loss": 0.5402, "num_input_tokens_seen": 1009824, "step": 3000 }, { "epoch": 2.32225656877898, "grad_norm": 0.718292772769928, "learning_rate": 4.738703371222758e-05, "loss": 0.6005, "num_input_tokens_seen": 1011584, "step": 3005 }, { "epoch": 2.3261205564142196, "grad_norm": 1.3756283521652222, "learning_rate": 4.73720047680119e-05, "loss": 0.7874, "num_input_tokens_seen": 1013312, "step": 3010 }, { "epoch": 2.329984544049459, "grad_norm": 1.2997392416000366, "learning_rate": 4.7356935124113864e-05, "loss": 0.8504, "num_input_tokens_seen": 1015296, "step": 3015 }, { "epoch": 2.3338485316846986, "grad_norm": 1.0843547582626343, "learning_rate": 4.73418248079485e-05, "loss": 0.5073, "num_input_tokens_seen": 1017184, "step": 3020 }, { "epoch": 2.337712519319938, "grad_norm": 0.8252149224281311, "learning_rate": 4.7326673847004875e-05, "loss": 0.465, "num_input_tokens_seen": 1018848, "step": 3025 }, { "epoch": 2.3415765069551777, "grad_norm": 1.1008614301681519, "learning_rate": 4.731148226884596e-05, "loss": 0.5322, "num_input_tokens_seen": 1020544, "step": 3030 }, { "epoch": 2.3454404945904175, "grad_norm": 1.865654468536377, "learning_rate": 4.7296250101108656e-05, "loss": 0.6214, "num_input_tokens_seen": 1022272, "step": 3035 }, { "epoch": 2.349304482225657, "grad_norm": 1.440487027168274, "learning_rate": 4.7280977371503664e-05, "loss": 0.5946, "num_input_tokens_seen": 1023936, "step": 3040 }, { "epoch": 2.3531684698608966, "grad_norm": 1.2520562410354614, "learning_rate": 4.726566410781551e-05, "loss": 0.5528, "num_input_tokens_seen": 1025664, "step": 3045 }, { "epoch": 2.357032457496136, "grad_norm": 1.1526507139205933, "learning_rate": 4.725031033790244e-05, "loss": 0.5422, "num_input_tokens_seen": 1027264, "step": 3050 }, { "epoch": 2.3608964451313756, "grad_norm": 1.4851676225662231, "learning_rate": 4.723491608969639e-05, "loss": 0.4899, "num_input_tokens_seen": 1028768, "step": 3055 }, { "epoch": 2.364760432766615, "grad_norm": 3.2803032398223877, "learning_rate": 4.721948139120296e-05, "loss": 0.6426, "num_input_tokens_seen": 1030720, "step": 3060 }, { "epoch": 2.3686244204018547, "grad_norm": 2.041292190551758, "learning_rate": 4.720400627050131e-05, "loss": 0.4543, "num_input_tokens_seen": 1032352, "step": 3065 }, { "epoch": 2.3724884080370945, "grad_norm": 1.9266728162765503, "learning_rate": 4.7188490755744144e-05, "loss": 0.6006, "num_input_tokens_seen": 1034080, "step": 3070 }, { "epoch": 2.376352395672334, "grad_norm": 1.0572469234466553, "learning_rate": 4.717293487515765e-05, "loss": 0.5869, "num_input_tokens_seen": 1035616, "step": 3075 }, { "epoch": 2.3802163833075736, "grad_norm": 2.0563671588897705, "learning_rate": 4.715733865704147e-05, "loss": 0.5736, "num_input_tokens_seen": 1037312, "step": 3080 }, { "epoch": 2.384080370942813, "grad_norm": 1.2802661657333374, "learning_rate": 4.7141702129768614e-05, "loss": 0.7178, "num_input_tokens_seen": 1038816, "step": 3085 }, { "epoch": 2.3879443585780527, "grad_norm": 1.312269926071167, "learning_rate": 4.712602532178541e-05, "loss": 0.6118, "num_input_tokens_seen": 1040672, "step": 3090 }, { "epoch": 2.391808346213292, "grad_norm": 1.3565399646759033, "learning_rate": 4.711030826161149e-05, "loss": 0.6771, "num_input_tokens_seen": 1042016, "step": 3095 }, { "epoch": 2.3956723338485317, "grad_norm": 1.0816421508789062, "learning_rate": 4.70945509778397e-05, "loss": 0.4734, "num_input_tokens_seen": 1043712, "step": 3100 }, { "epoch": 2.3995363214837715, "grad_norm": 0.9340289235115051, "learning_rate": 4.7078753499136064e-05, "loss": 0.5705, "num_input_tokens_seen": 1045312, "step": 3105 }, { "epoch": 2.403400309119011, "grad_norm": 1.1149224042892456, "learning_rate": 4.7062915854239735e-05, "loss": 0.6851, "num_input_tokens_seen": 1047136, "step": 3110 }, { "epoch": 2.4072642967542506, "grad_norm": 4.080916881561279, "learning_rate": 4.7047038071962926e-05, "loss": 0.8024, "num_input_tokens_seen": 1048640, "step": 3115 }, { "epoch": 2.41112828438949, "grad_norm": 2.120288133621216, "learning_rate": 4.703112018119088e-05, "loss": 0.5338, "num_input_tokens_seen": 1050240, "step": 3120 }, { "epoch": 2.4149922720247297, "grad_norm": 0.8035090565681458, "learning_rate": 4.70151622108818e-05, "loss": 0.5901, "num_input_tokens_seen": 1052000, "step": 3125 }, { "epoch": 2.418856259659969, "grad_norm": 6.286470413208008, "learning_rate": 4.6999164190066804e-05, "loss": 0.8324, "num_input_tokens_seen": 1053824, "step": 3130 }, { "epoch": 2.4227202472952087, "grad_norm": 1.0757136344909668, "learning_rate": 4.698312614784988e-05, "loss": 0.5594, "num_input_tokens_seen": 1055392, "step": 3135 }, { "epoch": 2.426584234930448, "grad_norm": 1.2404892444610596, "learning_rate": 4.6967048113407795e-05, "loss": 0.7476, "num_input_tokens_seen": 1057056, "step": 3140 }, { "epoch": 2.430448222565688, "grad_norm": 0.7119969129562378, "learning_rate": 4.695093011599011e-05, "loss": 0.4932, "num_input_tokens_seen": 1058624, "step": 3145 }, { "epoch": 2.434312210200927, "grad_norm": 1.0174546241760254, "learning_rate": 4.693477218491907e-05, "loss": 0.7281, "num_input_tokens_seen": 1060288, "step": 3150 }, { "epoch": 2.438176197836167, "grad_norm": 4.06351900100708, "learning_rate": 4.691857434958955e-05, "loss": 0.7164, "num_input_tokens_seen": 1062208, "step": 3155 }, { "epoch": 2.4420401854714067, "grad_norm": 1.5657566785812378, "learning_rate": 4.6902336639469046e-05, "loss": 0.749, "num_input_tokens_seen": 1063808, "step": 3160 }, { "epoch": 2.445904173106646, "grad_norm": 1.4192901849746704, "learning_rate": 4.688605908409759e-05, "loss": 0.531, "num_input_tokens_seen": 1065600, "step": 3165 }, { "epoch": 2.4497681607418857, "grad_norm": 0.961067259311676, "learning_rate": 4.68697417130877e-05, "loss": 0.619, "num_input_tokens_seen": 1067328, "step": 3170 }, { "epoch": 2.453632148377125, "grad_norm": 0.9534875750541687, "learning_rate": 4.685338455612431e-05, "loss": 0.5333, "num_input_tokens_seen": 1069184, "step": 3175 }, { "epoch": 2.457496136012365, "grad_norm": 1.7152059078216553, "learning_rate": 4.683698764296476e-05, "loss": 0.5383, "num_input_tokens_seen": 1071040, "step": 3180 }, { "epoch": 2.461360123647604, "grad_norm": 1.8059871196746826, "learning_rate": 4.682055100343869e-05, "loss": 0.5421, "num_input_tokens_seen": 1072672, "step": 3185 }, { "epoch": 2.465224111282844, "grad_norm": 1.0954240560531616, "learning_rate": 4.680407466744805e-05, "loss": 0.5935, "num_input_tokens_seen": 1074240, "step": 3190 }, { "epoch": 2.4690880989180837, "grad_norm": 0.8433399200439453, "learning_rate": 4.678755866496696e-05, "loss": 0.5495, "num_input_tokens_seen": 1076000, "step": 3195 }, { "epoch": 2.472952086553323, "grad_norm": 1.0151426792144775, "learning_rate": 4.6771003026041735e-05, "loss": 0.6899, "num_input_tokens_seen": 1077504, "step": 3200 }, { "epoch": 2.4768160741885628, "grad_norm": 0.8818321824073792, "learning_rate": 4.6754407780790776e-05, "loss": 0.5006, "num_input_tokens_seen": 1079232, "step": 3205 }, { "epoch": 2.480680061823802, "grad_norm": 0.7308194637298584, "learning_rate": 4.6737772959404555e-05, "loss": 0.8693, "num_input_tokens_seen": 1080960, "step": 3210 }, { "epoch": 2.484544049459042, "grad_norm": 1.4986399412155151, "learning_rate": 4.672109859214554e-05, "loss": 0.7419, "num_input_tokens_seen": 1082560, "step": 3215 }, { "epoch": 2.488408037094281, "grad_norm": 0.8562100529670715, "learning_rate": 4.6704384709348124e-05, "loss": 0.6214, "num_input_tokens_seen": 1084128, "step": 3220 }, { "epoch": 2.492272024729521, "grad_norm": 1.2434278726577759, "learning_rate": 4.66876313414186e-05, "loss": 0.5303, "num_input_tokens_seen": 1085760, "step": 3225 }, { "epoch": 2.4961360123647607, "grad_norm": 1.391172170639038, "learning_rate": 4.66708385188351e-05, "loss": 0.5517, "num_input_tokens_seen": 1087456, "step": 3230 }, { "epoch": 2.5, "grad_norm": 1.020952582359314, "learning_rate": 4.665400627214753e-05, "loss": 0.486, "num_input_tokens_seen": 1089120, "step": 3235 }, { "epoch": 2.5, "eval_loss": 0.668274462223053, "eval_runtime": 11.472, "eval_samples_per_second": 50.122, "eval_steps_per_second": 12.552, "num_input_tokens_seen": 1089120, "step": 3235 }, { "epoch": 2.5038639876352393, "grad_norm": 2.3255228996276855, "learning_rate": 4.66371346319775e-05, "loss": 0.5947, "num_input_tokens_seen": 1090784, "step": 3240 }, { "epoch": 2.507727975270479, "grad_norm": 1.9303598403930664, "learning_rate": 4.662022362901831e-05, "loss": 0.5911, "num_input_tokens_seen": 1092608, "step": 3245 }, { "epoch": 2.511591962905719, "grad_norm": 1.2284221649169922, "learning_rate": 4.6603273294034844e-05, "loss": 0.5847, "num_input_tokens_seen": 1094528, "step": 3250 }, { "epoch": 2.515455950540958, "grad_norm": 1.049053430557251, "learning_rate": 4.6586283657863566e-05, "loss": 0.5192, "num_input_tokens_seen": 1096352, "step": 3255 }, { "epoch": 2.519319938176198, "grad_norm": 1.3961013555526733, "learning_rate": 4.656925475141243e-05, "loss": 0.5417, "num_input_tokens_seen": 1098176, "step": 3260 }, { "epoch": 2.5231839258114372, "grad_norm": 0.8182019591331482, "learning_rate": 4.6552186605660805e-05, "loss": 0.8753, "num_input_tokens_seen": 1099840, "step": 3265 }, { "epoch": 2.527047913446677, "grad_norm": 1.6517817974090576, "learning_rate": 4.653507925165948e-05, "loss": 0.6258, "num_input_tokens_seen": 1101568, "step": 3270 }, { "epoch": 2.5309119010819163, "grad_norm": 1.6704210042953491, "learning_rate": 4.6517932720530574e-05, "loss": 0.5764, "num_input_tokens_seen": 1103296, "step": 3275 }, { "epoch": 2.534775888717156, "grad_norm": 1.2822444438934326, "learning_rate": 4.650074704346744e-05, "loss": 0.5177, "num_input_tokens_seen": 1104800, "step": 3280 }, { "epoch": 2.538639876352396, "grad_norm": 2.190788507461548, "learning_rate": 4.648352225173469e-05, "loss": 0.5535, "num_input_tokens_seen": 1106400, "step": 3285 }, { "epoch": 2.542503863987635, "grad_norm": 1.430443286895752, "learning_rate": 4.646625837666806e-05, "loss": 0.5502, "num_input_tokens_seen": 1108064, "step": 3290 }, { "epoch": 2.546367851622875, "grad_norm": 1.0539770126342773, "learning_rate": 4.64489554496744e-05, "loss": 0.4826, "num_input_tokens_seen": 1109504, "step": 3295 }, { "epoch": 2.5502318392581143, "grad_norm": 1.459738850593567, "learning_rate": 4.643161350223162e-05, "loss": 0.6888, "num_input_tokens_seen": 1111168, "step": 3300 }, { "epoch": 2.554095826893354, "grad_norm": 2.4266860485076904, "learning_rate": 4.641423256588859e-05, "loss": 0.8653, "num_input_tokens_seen": 1112864, "step": 3305 }, { "epoch": 2.5579598145285933, "grad_norm": 2.701082468032837, "learning_rate": 4.639681267226513e-05, "loss": 0.6033, "num_input_tokens_seen": 1114848, "step": 3310 }, { "epoch": 2.561823802163833, "grad_norm": 0.8932035565376282, "learning_rate": 4.637935385305191e-05, "loss": 0.5538, "num_input_tokens_seen": 1116448, "step": 3315 }, { "epoch": 2.565687789799073, "grad_norm": 1.9617201089859009, "learning_rate": 4.6361856140010454e-05, "loss": 0.5475, "num_input_tokens_seen": 1118016, "step": 3320 }, { "epoch": 2.569551777434312, "grad_norm": 1.036487340927124, "learning_rate": 4.6344319564972997e-05, "loss": 0.4987, "num_input_tokens_seen": 1119648, "step": 3325 }, { "epoch": 2.573415765069552, "grad_norm": 1.877293348312378, "learning_rate": 4.6326744159842495e-05, "loss": 0.5514, "num_input_tokens_seen": 1121280, "step": 3330 }, { "epoch": 2.5772797527047913, "grad_norm": 3.866645097732544, "learning_rate": 4.6309129956592545e-05, "loss": 0.5507, "num_input_tokens_seen": 1123008, "step": 3335 }, { "epoch": 2.581143740340031, "grad_norm": 3.5748002529144287, "learning_rate": 4.6291476987267325e-05, "loss": 0.6061, "num_input_tokens_seen": 1124672, "step": 3340 }, { "epoch": 2.5850077279752703, "grad_norm": 1.9211492538452148, "learning_rate": 4.627378528398153e-05, "loss": 0.6613, "num_input_tokens_seen": 1126240, "step": 3345 }, { "epoch": 2.58887171561051, "grad_norm": 1.789670467376709, "learning_rate": 4.625605487892034e-05, "loss": 0.5642, "num_input_tokens_seen": 1127744, "step": 3350 }, { "epoch": 2.59273570324575, "grad_norm": 1.1578205823898315, "learning_rate": 4.623828580433932e-05, "loss": 0.5476, "num_input_tokens_seen": 1129472, "step": 3355 }, { "epoch": 2.596599690880989, "grad_norm": 1.4665238857269287, "learning_rate": 4.622047809256439e-05, "loss": 0.4922, "num_input_tokens_seen": 1131200, "step": 3360 }, { "epoch": 2.6004636785162285, "grad_norm": 2.1001505851745605, "learning_rate": 4.620263177599177e-05, "loss": 0.6043, "num_input_tokens_seen": 1132832, "step": 3365 }, { "epoch": 2.6043276661514683, "grad_norm": 2.3926289081573486, "learning_rate": 4.618474688708789e-05, "loss": 0.5341, "num_input_tokens_seen": 1134432, "step": 3370 }, { "epoch": 2.608191653786708, "grad_norm": 1.2728084325790405, "learning_rate": 4.6166823458389385e-05, "loss": 0.4747, "num_input_tokens_seen": 1136256, "step": 3375 }, { "epoch": 2.6120556414219473, "grad_norm": 1.206154704093933, "learning_rate": 4.614886152250296e-05, "loss": 0.6293, "num_input_tokens_seen": 1138016, "step": 3380 }, { "epoch": 2.615919629057187, "grad_norm": 1.149579644203186, "learning_rate": 4.6130861112105416e-05, "loss": 0.7112, "num_input_tokens_seen": 1139712, "step": 3385 }, { "epoch": 2.6197836166924264, "grad_norm": 2.651641607284546, "learning_rate": 4.611282225994351e-05, "loss": 0.5774, "num_input_tokens_seen": 1141408, "step": 3390 }, { "epoch": 2.623647604327666, "grad_norm": 4.051639080047607, "learning_rate": 4.609474499883396e-05, "loss": 0.5171, "num_input_tokens_seen": 1143168, "step": 3395 }, { "epoch": 2.6275115919629055, "grad_norm": 1.1607474088668823, "learning_rate": 4.607662936166334e-05, "loss": 0.5908, "num_input_tokens_seen": 1144640, "step": 3400 }, { "epoch": 2.6313755795981453, "grad_norm": 4.535083293914795, "learning_rate": 4.6058475381388055e-05, "loss": 0.7344, "num_input_tokens_seen": 1146208, "step": 3405 }, { "epoch": 2.635239567233385, "grad_norm": 1.6206036806106567, "learning_rate": 4.6040283091034263e-05, "loss": 0.4538, "num_input_tokens_seen": 1147648, "step": 3410 }, { "epoch": 2.6391035548686244, "grad_norm": 1.333551287651062, "learning_rate": 4.6022052523697794e-05, "loss": 1.0622, "num_input_tokens_seen": 1149408, "step": 3415 }, { "epoch": 2.642967542503864, "grad_norm": 2.1299562454223633, "learning_rate": 4.600378371254414e-05, "loss": 0.6358, "num_input_tokens_seen": 1151168, "step": 3420 }, { "epoch": 2.6468315301391034, "grad_norm": 4.0673017501831055, "learning_rate": 4.5985476690808355e-05, "loss": 0.6778, "num_input_tokens_seen": 1152736, "step": 3425 }, { "epoch": 2.650695517774343, "grad_norm": 0.7632556557655334, "learning_rate": 4.596713149179501e-05, "loss": 0.6841, "num_input_tokens_seen": 1154624, "step": 3430 }, { "epoch": 2.6545595054095825, "grad_norm": 1.3216108083724976, "learning_rate": 4.594874814887811e-05, "loss": 0.5417, "num_input_tokens_seen": 1156480, "step": 3435 }, { "epoch": 2.6584234930448223, "grad_norm": 2.1246426105499268, "learning_rate": 4.593032669550109e-05, "loss": 0.5522, "num_input_tokens_seen": 1158176, "step": 3440 }, { "epoch": 2.662287480680062, "grad_norm": 2.0757579803466797, "learning_rate": 4.591186716517669e-05, "loss": 0.5284, "num_input_tokens_seen": 1160032, "step": 3445 }, { "epoch": 2.6661514683153014, "grad_norm": 1.3203383684158325, "learning_rate": 4.589336959148692e-05, "loss": 0.4886, "num_input_tokens_seen": 1161824, "step": 3450 }, { "epoch": 2.6700154559505407, "grad_norm": 4.877880096435547, "learning_rate": 4.5874834008083015e-05, "loss": 0.691, "num_input_tokens_seen": 1163520, "step": 3455 }, { "epoch": 2.6738794435857804, "grad_norm": 1.8425902128219604, "learning_rate": 4.585626044868534e-05, "loss": 0.5202, "num_input_tokens_seen": 1165408, "step": 3460 }, { "epoch": 2.67774343122102, "grad_norm": 1.5242557525634766, "learning_rate": 4.5837648947083355e-05, "loss": 0.527, "num_input_tokens_seen": 1167104, "step": 3465 }, { "epoch": 2.6816074188562595, "grad_norm": 1.8297951221466064, "learning_rate": 4.5818999537135556e-05, "loss": 0.5593, "num_input_tokens_seen": 1168960, "step": 3470 }, { "epoch": 2.6854714064914993, "grad_norm": 1.8081016540527344, "learning_rate": 4.580031225276937e-05, "loss": 0.5157, "num_input_tokens_seen": 1170656, "step": 3475 }, { "epoch": 2.689335394126739, "grad_norm": 2.4675021171569824, "learning_rate": 4.5781587127981176e-05, "loss": 0.5054, "num_input_tokens_seen": 1172064, "step": 3480 }, { "epoch": 2.6931993817619784, "grad_norm": 1.4511276483535767, "learning_rate": 4.576282419683615e-05, "loss": 0.91, "num_input_tokens_seen": 1173632, "step": 3485 }, { "epoch": 2.6970633693972177, "grad_norm": 1.9782909154891968, "learning_rate": 4.574402349346827e-05, "loss": 0.6688, "num_input_tokens_seen": 1175456, "step": 3490 }, { "epoch": 2.7009273570324575, "grad_norm": 1.7608928680419922, "learning_rate": 4.572518505208021e-05, "loss": 0.5408, "num_input_tokens_seen": 1177024, "step": 3495 }, { "epoch": 2.704791344667697, "grad_norm": 1.2236762046813965, "learning_rate": 4.570630890694332e-05, "loss": 0.575, "num_input_tokens_seen": 1178752, "step": 3500 }, { "epoch": 2.7086553323029365, "grad_norm": 1.363706111907959, "learning_rate": 4.568739509239753e-05, "loss": 0.6752, "num_input_tokens_seen": 1180352, "step": 3505 }, { "epoch": 2.7125193199381763, "grad_norm": 3.2164578437805176, "learning_rate": 4.56684436428513e-05, "loss": 0.7305, "num_input_tokens_seen": 1182176, "step": 3510 }, { "epoch": 2.7163833075734156, "grad_norm": 1.36143958568573, "learning_rate": 4.5649454592781546e-05, "loss": 0.5915, "num_input_tokens_seen": 1183872, "step": 3515 }, { "epoch": 2.7202472952086554, "grad_norm": 1.3736883401870728, "learning_rate": 4.5630427976733624e-05, "loss": 0.5289, "num_input_tokens_seen": 1185792, "step": 3520 }, { "epoch": 2.7241112828438947, "grad_norm": 2.0808115005493164, "learning_rate": 4.5611363829321184e-05, "loss": 0.4745, "num_input_tokens_seen": 1187328, "step": 3525 }, { "epoch": 2.7279752704791345, "grad_norm": 1.7596118450164795, "learning_rate": 4.559226218522617e-05, "loss": 0.5984, "num_input_tokens_seen": 1188800, "step": 3530 }, { "epoch": 2.7318392581143742, "grad_norm": 1.4933277368545532, "learning_rate": 4.557312307919878e-05, "loss": 0.4772, "num_input_tokens_seen": 1190464, "step": 3535 }, { "epoch": 2.7357032457496135, "grad_norm": 3.4922046661376953, "learning_rate": 4.55539465460573e-05, "loss": 0.6257, "num_input_tokens_seen": 1192192, "step": 3540 }, { "epoch": 2.7395672333848533, "grad_norm": 1.4701374769210815, "learning_rate": 4.553473262068815e-05, "loss": 0.4605, "num_input_tokens_seen": 1193984, "step": 3545 }, { "epoch": 2.7434312210200926, "grad_norm": 1.2330809831619263, "learning_rate": 4.551548133804575e-05, "loss": 0.6886, "num_input_tokens_seen": 1195616, "step": 3550 }, { "epoch": 2.7472952086553324, "grad_norm": 1.6165212392807007, "learning_rate": 4.549619273315249e-05, "loss": 0.5561, "num_input_tokens_seen": 1197344, "step": 3555 }, { "epoch": 2.7511591962905717, "grad_norm": 1.1340265274047852, "learning_rate": 4.547686684109867e-05, "loss": 0.4766, "num_input_tokens_seen": 1199040, "step": 3560 }, { "epoch": 2.7550231839258115, "grad_norm": 2.1234073638916016, "learning_rate": 4.545750369704239e-05, "loss": 0.5231, "num_input_tokens_seen": 1200448, "step": 3565 }, { "epoch": 2.7588871715610512, "grad_norm": 1.1463847160339355, "learning_rate": 4.5438103336209565e-05, "loss": 0.6582, "num_input_tokens_seen": 1202304, "step": 3570 }, { "epoch": 2.7627511591962906, "grad_norm": 2.0229923725128174, "learning_rate": 4.5418665793893777e-05, "loss": 0.6884, "num_input_tokens_seen": 1203840, "step": 3575 }, { "epoch": 2.76661514683153, "grad_norm": 1.6085162162780762, "learning_rate": 4.539919110545627e-05, "loss": 0.5793, "num_input_tokens_seen": 1205568, "step": 3580 }, { "epoch": 2.7704791344667696, "grad_norm": 1.719744324684143, "learning_rate": 4.537967930632587e-05, "loss": 0.5243, "num_input_tokens_seen": 1207072, "step": 3585 }, { "epoch": 2.7743431221020094, "grad_norm": 4.771373271942139, "learning_rate": 4.536013043199889e-05, "loss": 0.6496, "num_input_tokens_seen": 1209184, "step": 3590 }, { "epoch": 2.7782071097372487, "grad_norm": 2.4482030868530273, "learning_rate": 4.534054451803911e-05, "loss": 0.5303, "num_input_tokens_seen": 1211168, "step": 3595 }, { "epoch": 2.7820710973724885, "grad_norm": 1.8389033079147339, "learning_rate": 4.5320921600077685e-05, "loss": 0.5431, "num_input_tokens_seen": 1212864, "step": 3600 }, { "epoch": 2.7859350850077282, "grad_norm": 3.1525673866271973, "learning_rate": 4.530126171381311e-05, "loss": 0.5224, "num_input_tokens_seen": 1214784, "step": 3605 }, { "epoch": 2.7897990726429676, "grad_norm": 1.304419994354248, "learning_rate": 4.5281564895011094e-05, "loss": 0.5501, "num_input_tokens_seen": 1216768, "step": 3610 }, { "epoch": 2.793663060278207, "grad_norm": 2.0876269340515137, "learning_rate": 4.526183117950459e-05, "loss": 0.5021, "num_input_tokens_seen": 1218432, "step": 3615 }, { "epoch": 2.7975270479134466, "grad_norm": 1.413348913192749, "learning_rate": 4.5242060603193604e-05, "loss": 0.4511, "num_input_tokens_seen": 1220064, "step": 3620 }, { "epoch": 2.8013910355486864, "grad_norm": 1.5908048152923584, "learning_rate": 4.522225320204526e-05, "loss": 0.6544, "num_input_tokens_seen": 1221632, "step": 3625 }, { "epoch": 2.8052550231839257, "grad_norm": 1.3921207189559937, "learning_rate": 4.520240901209365e-05, "loss": 0.4101, "num_input_tokens_seen": 1223232, "step": 3630 }, { "epoch": 2.8091190108191655, "grad_norm": 3.274709701538086, "learning_rate": 4.51825280694398e-05, "loss": 0.7381, "num_input_tokens_seen": 1225120, "step": 3635 }, { "epoch": 2.812982998454405, "grad_norm": 2.3160617351531982, "learning_rate": 4.516261041025158e-05, "loss": 0.458, "num_input_tokens_seen": 1227008, "step": 3640 }, { "epoch": 2.8168469860896446, "grad_norm": 2.004870653152466, "learning_rate": 4.514265607076368e-05, "loss": 0.5521, "num_input_tokens_seen": 1228768, "step": 3645 }, { "epoch": 2.820710973724884, "grad_norm": 1.5570169687271118, "learning_rate": 4.51226650872775e-05, "loss": 0.4213, "num_input_tokens_seen": 1230368, "step": 3650 }, { "epoch": 2.8245749613601236, "grad_norm": 3.569948196411133, "learning_rate": 4.5102637496161115e-05, "loss": 0.6639, "num_input_tokens_seen": 1232000, "step": 3655 }, { "epoch": 2.8284389489953634, "grad_norm": 5.539769172668457, "learning_rate": 4.50825733338492e-05, "loss": 0.8101, "num_input_tokens_seen": 1233600, "step": 3660 }, { "epoch": 2.8323029366306027, "grad_norm": 2.22871994972229, "learning_rate": 4.506247263684294e-05, "loss": 0.6222, "num_input_tokens_seen": 1235072, "step": 3665 }, { "epoch": 2.8361669242658425, "grad_norm": 1.6910781860351562, "learning_rate": 4.5042335441710026e-05, "loss": 0.5559, "num_input_tokens_seen": 1236832, "step": 3670 }, { "epoch": 2.840030911901082, "grad_norm": 2.0096123218536377, "learning_rate": 4.5022161785084506e-05, "loss": 0.4591, "num_input_tokens_seen": 1238464, "step": 3675 }, { "epoch": 2.8438948995363216, "grad_norm": 3.118959665298462, "learning_rate": 4.5001951703666766e-05, "loss": 0.6422, "num_input_tokens_seen": 1239936, "step": 3680 }, { "epoch": 2.847758887171561, "grad_norm": 1.5835235118865967, "learning_rate": 4.498170523422348e-05, "loss": 0.6457, "num_input_tokens_seen": 1241856, "step": 3685 }, { "epoch": 2.8516228748068007, "grad_norm": 3.609930992126465, "learning_rate": 4.4961422413587506e-05, "loss": 0.5294, "num_input_tokens_seen": 1243520, "step": 3690 }, { "epoch": 2.8554868624420404, "grad_norm": 5.147461891174316, "learning_rate": 4.4941103278657825e-05, "loss": 0.5428, "num_input_tokens_seen": 1245472, "step": 3695 }, { "epoch": 2.8593508500772797, "grad_norm": 7.061466217041016, "learning_rate": 4.4920747866399495e-05, "loss": 0.6778, "num_input_tokens_seen": 1247392, "step": 3700 }, { "epoch": 2.863214837712519, "grad_norm": 5.441125392913818, "learning_rate": 4.4900356213843565e-05, "loss": 0.6167, "num_input_tokens_seen": 1249056, "step": 3705 }, { "epoch": 2.867078825347759, "grad_norm": 1.8006725311279297, "learning_rate": 4.487992835808701e-05, "loss": 0.6663, "num_input_tokens_seen": 1250880, "step": 3710 }, { "epoch": 2.8709428129829986, "grad_norm": 2.437763214111328, "learning_rate": 4.4859464336292674e-05, "loss": 0.5584, "num_input_tokens_seen": 1252448, "step": 3715 }, { "epoch": 2.874806800618238, "grad_norm": 1.9037609100341797, "learning_rate": 4.483896418568919e-05, "loss": 0.4547, "num_input_tokens_seen": 1254208, "step": 3720 }, { "epoch": 2.8786707882534777, "grad_norm": 1.7170246839523315, "learning_rate": 4.4818427943570924e-05, "loss": 0.4723, "num_input_tokens_seen": 1255840, "step": 3725 }, { "epoch": 2.8825347758887174, "grad_norm": 3.8570783138275146, "learning_rate": 4.4797855647297886e-05, "loss": 0.4699, "num_input_tokens_seen": 1257408, "step": 3730 }, { "epoch": 2.8863987635239567, "grad_norm": 1.133665919303894, "learning_rate": 4.477724733429569e-05, "loss": 0.5358, "num_input_tokens_seen": 1258944, "step": 3735 }, { "epoch": 2.890262751159196, "grad_norm": 1.6512043476104736, "learning_rate": 4.4756603042055476e-05, "loss": 0.4889, "num_input_tokens_seen": 1260640, "step": 3740 }, { "epoch": 2.894126738794436, "grad_norm": 2.5813212394714355, "learning_rate": 4.4735922808133826e-05, "loss": 0.6085, "num_input_tokens_seen": 1262432, "step": 3745 }, { "epoch": 2.8979907264296756, "grad_norm": 2.055209159851074, "learning_rate": 4.47152066701527e-05, "loss": 0.6599, "num_input_tokens_seen": 1264096, "step": 3750 }, { "epoch": 2.901854714064915, "grad_norm": 2.6289875507354736, "learning_rate": 4.4694454665799404e-05, "loss": 0.4919, "num_input_tokens_seen": 1265568, "step": 3755 }, { "epoch": 2.9057187017001547, "grad_norm": 2.2878947257995605, "learning_rate": 4.467366683282647e-05, "loss": 0.5619, "num_input_tokens_seen": 1267200, "step": 3760 }, { "epoch": 2.909582689335394, "grad_norm": 1.2125518321990967, "learning_rate": 4.465284320905162e-05, "loss": 0.7503, "num_input_tokens_seen": 1268800, "step": 3765 }, { "epoch": 2.9134466769706338, "grad_norm": 1.128667950630188, "learning_rate": 4.4631983832357684e-05, "loss": 0.4215, "num_input_tokens_seen": 1270464, "step": 3770 }, { "epoch": 2.917310664605873, "grad_norm": 1.32891845703125, "learning_rate": 4.461108874069254e-05, "loss": 0.5859, "num_input_tokens_seen": 1272000, "step": 3775 }, { "epoch": 2.921174652241113, "grad_norm": 4.825840473175049, "learning_rate": 4.4590157972069025e-05, "loss": 0.7787, "num_input_tokens_seen": 1273664, "step": 3780 }, { "epoch": 2.9250386398763526, "grad_norm": 3.4125306606292725, "learning_rate": 4.45691915645649e-05, "loss": 0.604, "num_input_tokens_seen": 1275072, "step": 3785 }, { "epoch": 2.928902627511592, "grad_norm": 4.5419158935546875, "learning_rate": 4.454818955632274e-05, "loss": 0.4611, "num_input_tokens_seen": 1276768, "step": 3790 }, { "epoch": 2.9327666151468317, "grad_norm": 1.679552674293518, "learning_rate": 4.452715198554991e-05, "loss": 0.4884, "num_input_tokens_seen": 1278432, "step": 3795 }, { "epoch": 2.936630602782071, "grad_norm": 3.3888494968414307, "learning_rate": 4.450607889051846e-05, "loss": 0.678, "num_input_tokens_seen": 1280448, "step": 3800 }, { "epoch": 2.9404945904173108, "grad_norm": 2.1231448650360107, "learning_rate": 4.4484970309565046e-05, "loss": 0.5854, "num_input_tokens_seen": 1282368, "step": 3805 }, { "epoch": 2.94435857805255, "grad_norm": 2.608415365219116, "learning_rate": 4.4463826281090916e-05, "loss": 0.5438, "num_input_tokens_seen": 1284064, "step": 3810 }, { "epoch": 2.94822256568779, "grad_norm": 3.572591543197632, "learning_rate": 4.4442646843561785e-05, "loss": 0.8952, "num_input_tokens_seen": 1285728, "step": 3815 }, { "epoch": 2.9520865533230296, "grad_norm": 5.7818403244018555, "learning_rate": 4.4421432035507786e-05, "loss": 0.889, "num_input_tokens_seen": 1287392, "step": 3820 }, { "epoch": 2.955950540958269, "grad_norm": 5.005038261413574, "learning_rate": 4.4400181895523405e-05, "loss": 0.744, "num_input_tokens_seen": 1289312, "step": 3825 }, { "epoch": 2.9598145285935082, "grad_norm": 2.026059627532959, "learning_rate": 4.43788964622674e-05, "loss": 0.5456, "num_input_tokens_seen": 1290752, "step": 3830 }, { "epoch": 2.963678516228748, "grad_norm": 3.3033807277679443, "learning_rate": 4.4357575774462744e-05, "loss": 0.7589, "num_input_tokens_seen": 1292288, "step": 3835 }, { "epoch": 2.9675425038639878, "grad_norm": 1.0577995777130127, "learning_rate": 4.4336219870896525e-05, "loss": 0.5948, "num_input_tokens_seen": 1293856, "step": 3840 }, { "epoch": 2.971406491499227, "grad_norm": 4.597434043884277, "learning_rate": 4.4314828790419916e-05, "loss": 0.4489, "num_input_tokens_seen": 1295424, "step": 3845 }, { "epoch": 2.975270479134467, "grad_norm": 1.4607815742492676, "learning_rate": 4.429340257194807e-05, "loss": 0.4694, "num_input_tokens_seen": 1297280, "step": 3850 }, { "epoch": 2.9791344667697066, "grad_norm": 4.757996082305908, "learning_rate": 4.427194125446009e-05, "loss": 0.5582, "num_input_tokens_seen": 1298816, "step": 3855 }, { "epoch": 2.982998454404946, "grad_norm": 6.332582950592041, "learning_rate": 4.4250444876998906e-05, "loss": 0.7156, "num_input_tokens_seen": 1300512, "step": 3860 }, { "epoch": 2.9868624420401853, "grad_norm": 1.7305604219436646, "learning_rate": 4.422891347867124e-05, "loss": 0.8584, "num_input_tokens_seen": 1302432, "step": 3865 }, { "epoch": 2.990726429675425, "grad_norm": 2.401588201522827, "learning_rate": 4.4207347098647525e-05, "loss": 0.4862, "num_input_tokens_seen": 1304032, "step": 3870 }, { "epoch": 2.9945904173106648, "grad_norm": 1.447226881980896, "learning_rate": 4.418574577616182e-05, "loss": 0.5239, "num_input_tokens_seen": 1305664, "step": 3875 }, { "epoch": 2.998454404945904, "grad_norm": 1.4749029874801636, "learning_rate": 4.4164109550511775e-05, "loss": 0.6085, "num_input_tokens_seen": 1307456, "step": 3880 }, { "epoch": 3.0, "eval_loss": 0.5941787362098694, "eval_runtime": 11.4603, "eval_samples_per_second": 50.173, "eval_steps_per_second": 12.565, "num_input_tokens_seen": 1307968, "step": 3882 }, { "epoch": 3.002318392581144, "grad_norm": 2.5790042877197266, "learning_rate": 4.414243846105852e-05, "loss": 0.7192, "num_input_tokens_seen": 1309088, "step": 3885 }, { "epoch": 3.006182380216383, "grad_norm": 1.3487333059310913, "learning_rate": 4.412073254722662e-05, "loss": 0.4674, "num_input_tokens_seen": 1310560, "step": 3890 }, { "epoch": 3.010046367851623, "grad_norm": 2.1629276275634766, "learning_rate": 4.409899184850399e-05, "loss": 0.8152, "num_input_tokens_seen": 1312480, "step": 3895 }, { "epoch": 3.0139103554868623, "grad_norm": 2.835353374481201, "learning_rate": 4.407721640444181e-05, "loss": 0.8317, "num_input_tokens_seen": 1313856, "step": 3900 }, { "epoch": 3.017774343122102, "grad_norm": 4.20973539352417, "learning_rate": 4.405540625465451e-05, "loss": 0.4737, "num_input_tokens_seen": 1315328, "step": 3905 }, { "epoch": 3.021638330757342, "grad_norm": 3.189619302749634, "learning_rate": 4.403356143881961e-05, "loss": 0.5799, "num_input_tokens_seen": 1316960, "step": 3910 }, { "epoch": 3.025502318392581, "grad_norm": 1.3044224977493286, "learning_rate": 4.4011681996677735e-05, "loss": 0.5666, "num_input_tokens_seen": 1318560, "step": 3915 }, { "epoch": 3.029366306027821, "grad_norm": 1.4026161432266235, "learning_rate": 4.398976796803248e-05, "loss": 0.4501, "num_input_tokens_seen": 1320384, "step": 3920 }, { "epoch": 3.03323029366306, "grad_norm": 1.637884497642517, "learning_rate": 4.396781939275036e-05, "loss": 0.6848, "num_input_tokens_seen": 1322112, "step": 3925 }, { "epoch": 3.0370942812983, "grad_norm": 1.6982097625732422, "learning_rate": 4.394583631076075e-05, "loss": 0.4298, "num_input_tokens_seen": 1323712, "step": 3930 }, { "epoch": 3.0409582689335393, "grad_norm": 1.6389122009277344, "learning_rate": 4.3923818762055794e-05, "loss": 0.4464, "num_input_tokens_seen": 1325440, "step": 3935 }, { "epoch": 3.044822256568779, "grad_norm": 3.021299362182617, "learning_rate": 4.3901766786690334e-05, "loss": 0.5063, "num_input_tokens_seen": 1327200, "step": 3940 }, { "epoch": 3.0486862442040183, "grad_norm": 1.1507476568222046, "learning_rate": 4.387968042478186e-05, "loss": 0.6143, "num_input_tokens_seen": 1328704, "step": 3945 }, { "epoch": 3.052550231839258, "grad_norm": 1.446286678314209, "learning_rate": 4.3857559716510394e-05, "loss": 0.4979, "num_input_tokens_seen": 1330528, "step": 3950 }, { "epoch": 3.056414219474498, "grad_norm": 4.553092956542969, "learning_rate": 4.383540470211846e-05, "loss": 0.471, "num_input_tokens_seen": 1332288, "step": 3955 }, { "epoch": 3.060278207109737, "grad_norm": 2.48172926902771, "learning_rate": 4.3813215421910984e-05, "loss": 0.4475, "num_input_tokens_seen": 1334080, "step": 3960 }, { "epoch": 3.064142194744977, "grad_norm": 1.475191593170166, "learning_rate": 4.3790991916255234e-05, "loss": 0.4058, "num_input_tokens_seen": 1335680, "step": 3965 }, { "epoch": 3.0680061823802163, "grad_norm": 2.3719654083251953, "learning_rate": 4.376873422558073e-05, "loss": 0.6201, "num_input_tokens_seen": 1337248, "step": 3970 }, { "epoch": 3.071870170015456, "grad_norm": 1.9354842901229858, "learning_rate": 4.374644239037921e-05, "loss": 0.5352, "num_input_tokens_seen": 1338752, "step": 3975 }, { "epoch": 3.0757341576506954, "grad_norm": 2.903547525405884, "learning_rate": 4.37241164512045e-05, "loss": 0.4253, "num_input_tokens_seen": 1340256, "step": 3980 }, { "epoch": 3.079598145285935, "grad_norm": 1.3258841037750244, "learning_rate": 4.370175644867247e-05, "loss": 0.4356, "num_input_tokens_seen": 1342016, "step": 3985 }, { "epoch": 3.0834621329211744, "grad_norm": 2.4656431674957275, "learning_rate": 4.367936242346098e-05, "loss": 0.5112, "num_input_tokens_seen": 1343616, "step": 3990 }, { "epoch": 3.087326120556414, "grad_norm": 1.2736636400222778, "learning_rate": 4.365693441630976e-05, "loss": 0.5505, "num_input_tokens_seen": 1345408, "step": 3995 }, { "epoch": 3.091190108191654, "grad_norm": 1.9504047632217407, "learning_rate": 4.3634472468020385e-05, "loss": 0.4933, "num_input_tokens_seen": 1347296, "step": 4000 }, { "epoch": 3.0950540958268933, "grad_norm": 1.2948801517486572, "learning_rate": 4.361197661945616e-05, "loss": 0.7142, "num_input_tokens_seen": 1348800, "step": 4005 }, { "epoch": 3.098918083462133, "grad_norm": 1.4261538982391357, "learning_rate": 4.3589446911542074e-05, "loss": 0.4338, "num_input_tokens_seen": 1350656, "step": 4010 }, { "epoch": 3.1027820710973724, "grad_norm": 2.2387986183166504, "learning_rate": 4.3566883385264694e-05, "loss": 0.4229, "num_input_tokens_seen": 1352128, "step": 4015 }, { "epoch": 3.106646058732612, "grad_norm": 2.272691011428833, "learning_rate": 4.3544286081672134e-05, "loss": 0.4568, "num_input_tokens_seen": 1354240, "step": 4020 }, { "epoch": 3.1105100463678514, "grad_norm": 2.714954137802124, "learning_rate": 4.352165504187394e-05, "loss": 0.564, "num_input_tokens_seen": 1356128, "step": 4025 }, { "epoch": 3.114374034003091, "grad_norm": 2.0696589946746826, "learning_rate": 4.349899030704103e-05, "loss": 0.6418, "num_input_tokens_seen": 1357952, "step": 4030 }, { "epoch": 3.118238021638331, "grad_norm": 3.6651084423065186, "learning_rate": 4.347629191840564e-05, "loss": 0.6449, "num_input_tokens_seen": 1359584, "step": 4035 }, { "epoch": 3.1221020092735703, "grad_norm": 1.5164068937301636, "learning_rate": 4.3453559917261216e-05, "loss": 0.5472, "num_input_tokens_seen": 1361056, "step": 4040 }, { "epoch": 3.12596599690881, "grad_norm": 2.3051259517669678, "learning_rate": 4.343079434496235e-05, "loss": 0.5589, "num_input_tokens_seen": 1362848, "step": 4045 }, { "epoch": 3.1298299845440494, "grad_norm": 1.5726701021194458, "learning_rate": 4.3407995242924714e-05, "loss": 0.5117, "num_input_tokens_seen": 1364512, "step": 4050 }, { "epoch": 3.133693972179289, "grad_norm": 1.5542783737182617, "learning_rate": 4.338516265262497e-05, "loss": 0.6942, "num_input_tokens_seen": 1366304, "step": 4055 }, { "epoch": 3.1375579598145285, "grad_norm": 2.2469356060028076, "learning_rate": 4.3362296615600716e-05, "loss": 0.517, "num_input_tokens_seen": 1367872, "step": 4060 }, { "epoch": 3.141421947449768, "grad_norm": 1.7843961715698242, "learning_rate": 4.333939717345039e-05, "loss": 0.4825, "num_input_tokens_seen": 1369504, "step": 4065 }, { "epoch": 3.1452859350850075, "grad_norm": 1.1871178150177002, "learning_rate": 4.33164643678332e-05, "loss": 0.4818, "num_input_tokens_seen": 1371328, "step": 4070 }, { "epoch": 3.1491499227202473, "grad_norm": 2.2114365100860596, "learning_rate": 4.3293498240469045e-05, "loss": 0.5847, "num_input_tokens_seen": 1372800, "step": 4075 }, { "epoch": 3.153013910355487, "grad_norm": 4.519014358520508, "learning_rate": 4.327049883313845e-05, "loss": 0.6299, "num_input_tokens_seen": 1374496, "step": 4080 }, { "epoch": 3.1568778979907264, "grad_norm": 2.8311524391174316, "learning_rate": 4.324746618768249e-05, "loss": 0.4825, "num_input_tokens_seen": 1376064, "step": 4085 }, { "epoch": 3.160741885625966, "grad_norm": 2.63315486907959, "learning_rate": 4.3224400346002705e-05, "loss": 0.4185, "num_input_tokens_seen": 1377728, "step": 4090 }, { "epoch": 3.1646058732612055, "grad_norm": 5.207010269165039, "learning_rate": 4.3201301350060996e-05, "loss": 0.5275, "num_input_tokens_seen": 1379744, "step": 4095 }, { "epoch": 3.1684698608964452, "grad_norm": 1.3192821741104126, "learning_rate": 4.317816924187964e-05, "loss": 0.5118, "num_input_tokens_seen": 1381536, "step": 4100 }, { "epoch": 3.1723338485316845, "grad_norm": 1.5576586723327637, "learning_rate": 4.3155004063541085e-05, "loss": 0.4267, "num_input_tokens_seen": 1383264, "step": 4105 }, { "epoch": 3.1761978361669243, "grad_norm": 2.440986394882202, "learning_rate": 4.3131805857188e-05, "loss": 0.4167, "num_input_tokens_seen": 1384928, "step": 4110 }, { "epoch": 3.1800618238021636, "grad_norm": 1.3095035552978516, "learning_rate": 4.3108574665023095e-05, "loss": 0.7152, "num_input_tokens_seen": 1386528, "step": 4115 }, { "epoch": 3.1839258114374034, "grad_norm": 1.7267659902572632, "learning_rate": 4.308531052930911e-05, "loss": 0.5949, "num_input_tokens_seen": 1388480, "step": 4120 }, { "epoch": 3.187789799072643, "grad_norm": 2.4541592597961426, "learning_rate": 4.306201349236872e-05, "loss": 0.5537, "num_input_tokens_seen": 1390272, "step": 4125 }, { "epoch": 3.1916537867078825, "grad_norm": 1.8966723680496216, "learning_rate": 4.303868359658445e-05, "loss": 0.5065, "num_input_tokens_seen": 1391936, "step": 4130 }, { "epoch": 3.1955177743431222, "grad_norm": 2.047074317932129, "learning_rate": 4.301532088439859e-05, "loss": 0.5692, "num_input_tokens_seen": 1393536, "step": 4135 }, { "epoch": 3.1993817619783615, "grad_norm": 4.05169153213501, "learning_rate": 4.299192539831315e-05, "loss": 0.5246, "num_input_tokens_seen": 1395168, "step": 4140 }, { "epoch": 3.2032457496136013, "grad_norm": 1.3212201595306396, "learning_rate": 4.2968497180889746e-05, "loss": 0.447, "num_input_tokens_seen": 1396672, "step": 4145 }, { "epoch": 3.2071097372488406, "grad_norm": 6.214070796966553, "learning_rate": 4.294503627474955e-05, "loss": 0.7184, "num_input_tokens_seen": 1398496, "step": 4150 }, { "epoch": 3.2109737248840804, "grad_norm": 1.56643545627594, "learning_rate": 4.292154272257319e-05, "loss": 0.4908, "num_input_tokens_seen": 1400160, "step": 4155 }, { "epoch": 3.21483771251932, "grad_norm": 1.6522753238677979, "learning_rate": 4.289801656710072e-05, "loss": 0.6311, "num_input_tokens_seen": 1401792, "step": 4160 }, { "epoch": 3.2187017001545595, "grad_norm": 1.3291428089141846, "learning_rate": 4.287445785113146e-05, "loss": 0.7471, "num_input_tokens_seen": 1403680, "step": 4165 }, { "epoch": 3.2225656877897992, "grad_norm": 3.1879682540893555, "learning_rate": 4.2850866617524e-05, "loss": 0.4275, "num_input_tokens_seen": 1405408, "step": 4170 }, { "epoch": 3.2264296754250386, "grad_norm": 1.5519782304763794, "learning_rate": 4.282724290919605e-05, "loss": 0.6756, "num_input_tokens_seen": 1407168, "step": 4175 }, { "epoch": 3.2302936630602783, "grad_norm": 4.281375408172607, "learning_rate": 4.2803586769124435e-05, "loss": 0.8003, "num_input_tokens_seen": 1409088, "step": 4180 }, { "epoch": 3.2341576506955176, "grad_norm": 3.2866830825805664, "learning_rate": 4.277989824034497e-05, "loss": 0.4712, "num_input_tokens_seen": 1410880, "step": 4185 }, { "epoch": 3.2380216383307574, "grad_norm": 2.1629323959350586, "learning_rate": 4.2756177365952385e-05, "loss": 0.6833, "num_input_tokens_seen": 1412704, "step": 4190 }, { "epoch": 3.2418856259659967, "grad_norm": 4.5756425857543945, "learning_rate": 4.2732424189100265e-05, "loss": 0.6252, "num_input_tokens_seen": 1414336, "step": 4195 }, { "epoch": 3.2457496136012365, "grad_norm": 1.2097439765930176, "learning_rate": 4.270863875300094e-05, "loss": 0.3978, "num_input_tokens_seen": 1416128, "step": 4200 }, { "epoch": 3.2496136012364762, "grad_norm": 1.6057111024856567, "learning_rate": 4.2684821100925445e-05, "loss": 0.4285, "num_input_tokens_seen": 1417568, "step": 4205 }, { "epoch": 3.2534775888717156, "grad_norm": 2.921102523803711, "learning_rate": 4.266097127620342e-05, "loss": 0.4632, "num_input_tokens_seen": 1419296, "step": 4210 }, { "epoch": 3.2573415765069553, "grad_norm": 1.616066336631775, "learning_rate": 4.263708932222303e-05, "loss": 0.4386, "num_input_tokens_seen": 1420768, "step": 4215 }, { "epoch": 3.2612055641421946, "grad_norm": 5.963724136352539, "learning_rate": 4.2613175282430904e-05, "loss": 0.9767, "num_input_tokens_seen": 1422560, "step": 4220 }, { "epoch": 3.2650695517774344, "grad_norm": 1.9723445177078247, "learning_rate": 4.258922920033203e-05, "loss": 0.4984, "num_input_tokens_seen": 1424288, "step": 4225 }, { "epoch": 3.2689335394126737, "grad_norm": 1.67899751663208, "learning_rate": 4.256525111948967e-05, "loss": 0.456, "num_input_tokens_seen": 1426048, "step": 4230 }, { "epoch": 3.2727975270479135, "grad_norm": 1.1183730363845825, "learning_rate": 4.254124108352533e-05, "loss": 0.4807, "num_input_tokens_seen": 1427712, "step": 4235 }, { "epoch": 3.276661514683153, "grad_norm": 2.273562431335449, "learning_rate": 4.251719913611863e-05, "loss": 0.4839, "num_input_tokens_seen": 1429248, "step": 4240 }, { "epoch": 3.2805255023183926, "grad_norm": 1.0710886716842651, "learning_rate": 4.2493125321007276e-05, "loss": 0.4457, "num_input_tokens_seen": 1431456, "step": 4245 }, { "epoch": 3.2843894899536323, "grad_norm": 1.5294173955917358, "learning_rate": 4.24690196819869e-05, "loss": 0.4029, "num_input_tokens_seen": 1433120, "step": 4250 }, { "epoch": 3.2882534775888717, "grad_norm": 3.687967300415039, "learning_rate": 4.2444882262911066e-05, "loss": 0.5114, "num_input_tokens_seen": 1434720, "step": 4255 }, { "epoch": 3.2921174652241114, "grad_norm": 1.689640998840332, "learning_rate": 4.2420713107691126e-05, "loss": 0.7594, "num_input_tokens_seen": 1436320, "step": 4260 }, { "epoch": 3.2959814528593507, "grad_norm": 1.7179181575775146, "learning_rate": 4.239651226029619e-05, "loss": 0.4091, "num_input_tokens_seen": 1437888, "step": 4265 }, { "epoch": 3.2998454404945905, "grad_norm": 3.5392544269561768, "learning_rate": 4.237227976475302e-05, "loss": 0.6408, "num_input_tokens_seen": 1439616, "step": 4270 }, { "epoch": 3.30370942812983, "grad_norm": 5.957282543182373, "learning_rate": 4.234801566514595e-05, "loss": 0.6225, "num_input_tokens_seen": 1441344, "step": 4275 }, { "epoch": 3.3075734157650696, "grad_norm": 1.6967871189117432, "learning_rate": 4.232372000561678e-05, "loss": 0.4752, "num_input_tokens_seen": 1443168, "step": 4280 }, { "epoch": 3.3114374034003093, "grad_norm": 2.8163554668426514, "learning_rate": 4.229939283036478e-05, "loss": 0.4348, "num_input_tokens_seen": 1444960, "step": 4285 }, { "epoch": 3.3153013910355487, "grad_norm": 3.1443917751312256, "learning_rate": 4.22750341836465e-05, "loss": 0.683, "num_input_tokens_seen": 1446624, "step": 4290 }, { "epoch": 3.3191653786707884, "grad_norm": 2.657972574234009, "learning_rate": 4.2250644109775794e-05, "loss": 0.4559, "num_input_tokens_seen": 1448416, "step": 4295 }, { "epoch": 3.3230293663060277, "grad_norm": 2.382570266723633, "learning_rate": 4.2226222653123645e-05, "loss": 0.5667, "num_input_tokens_seen": 1450048, "step": 4300 }, { "epoch": 3.3268933539412675, "grad_norm": 1.6241313219070435, "learning_rate": 4.220176985811816e-05, "loss": 0.4621, "num_input_tokens_seen": 1451520, "step": 4305 }, { "epoch": 3.330757341576507, "grad_norm": 1.8125404119491577, "learning_rate": 4.2177285769244436e-05, "loss": 0.9889, "num_input_tokens_seen": 1453376, "step": 4310 }, { "epoch": 3.3346213292117466, "grad_norm": 1.4280503988265991, "learning_rate": 4.215277043104451e-05, "loss": 0.6442, "num_input_tokens_seen": 1455104, "step": 4315 }, { "epoch": 3.338485316846986, "grad_norm": 1.2660709619522095, "learning_rate": 4.212822388811726e-05, "loss": 0.3997, "num_input_tokens_seen": 1456928, "step": 4320 }, { "epoch": 3.3423493044822257, "grad_norm": 2.803041696548462, "learning_rate": 4.210364618511836e-05, "loss": 0.416, "num_input_tokens_seen": 1458432, "step": 4325 }, { "epoch": 3.346213292117465, "grad_norm": 1.404082179069519, "learning_rate": 4.207903736676013e-05, "loss": 0.4604, "num_input_tokens_seen": 1460128, "step": 4330 }, { "epoch": 3.3500772797527048, "grad_norm": 4.871484279632568, "learning_rate": 4.2054397477811536e-05, "loss": 0.6844, "num_input_tokens_seen": 1461792, "step": 4335 }, { "epoch": 3.3539412673879445, "grad_norm": 4.9135332107543945, "learning_rate": 4.202972656309803e-05, "loss": 0.5035, "num_input_tokens_seen": 1463360, "step": 4340 }, { "epoch": 3.357805255023184, "grad_norm": 2.795682668685913, "learning_rate": 4.200502466750154e-05, "loss": 0.467, "num_input_tokens_seen": 1465056, "step": 4345 }, { "epoch": 3.3616692426584236, "grad_norm": 2.718587636947632, "learning_rate": 4.1980291835960326e-05, "loss": 0.4763, "num_input_tokens_seen": 1466560, "step": 4350 }, { "epoch": 3.365533230293663, "grad_norm": 3.083667516708374, "learning_rate": 4.1955528113468966e-05, "loss": 0.6261, "num_input_tokens_seen": 1468448, "step": 4355 }, { "epoch": 3.3693972179289027, "grad_norm": 1.9831490516662598, "learning_rate": 4.1930733545078195e-05, "loss": 0.7059, "num_input_tokens_seen": 1470048, "step": 4360 }, { "epoch": 3.373261205564142, "grad_norm": 2.567063093185425, "learning_rate": 4.190590817589488e-05, "loss": 0.4517, "num_input_tokens_seen": 1471744, "step": 4365 }, { "epoch": 3.3771251931993818, "grad_norm": 1.5277791023254395, "learning_rate": 4.1881052051081924e-05, "loss": 0.4074, "num_input_tokens_seen": 1473568, "step": 4370 }, { "epoch": 3.3809891808346215, "grad_norm": 2.6930196285247803, "learning_rate": 4.1856165215858186e-05, "loss": 0.542, "num_input_tokens_seen": 1475616, "step": 4375 }, { "epoch": 3.384853168469861, "grad_norm": 5.092924118041992, "learning_rate": 4.183124771549839e-05, "loss": 0.5881, "num_input_tokens_seen": 1477376, "step": 4380 }, { "epoch": 3.3887171561051006, "grad_norm": 1.5484986305236816, "learning_rate": 4.180629959533303e-05, "loss": 0.5945, "num_input_tokens_seen": 1478976, "step": 4385 }, { "epoch": 3.39258114374034, "grad_norm": 1.5277379751205444, "learning_rate": 4.178132090074833e-05, "loss": 0.4819, "num_input_tokens_seen": 1480608, "step": 4390 }, { "epoch": 3.3964451313755797, "grad_norm": 2.247413396835327, "learning_rate": 4.175631167718612e-05, "loss": 0.446, "num_input_tokens_seen": 1482304, "step": 4395 }, { "epoch": 3.400309119010819, "grad_norm": 1.4215114116668701, "learning_rate": 4.173127197014377e-05, "loss": 0.4332, "num_input_tokens_seen": 1483808, "step": 4400 }, { "epoch": 3.4041731066460588, "grad_norm": 1.7902250289916992, "learning_rate": 4.170620182517412e-05, "loss": 0.3805, "num_input_tokens_seen": 1485408, "step": 4405 }, { "epoch": 3.4080370942812985, "grad_norm": 2.5509159564971924, "learning_rate": 4.168110128788536e-05, "loss": 0.5283, "num_input_tokens_seen": 1487552, "step": 4410 }, { "epoch": 3.411901081916538, "grad_norm": 2.78304386138916, "learning_rate": 4.1655970403941e-05, "loss": 0.4209, "num_input_tokens_seen": 1489152, "step": 4415 }, { "epoch": 3.4157650695517776, "grad_norm": 3.472442626953125, "learning_rate": 4.163080921905972e-05, "loss": 0.3722, "num_input_tokens_seen": 1491104, "step": 4420 }, { "epoch": 3.419629057187017, "grad_norm": 6.401125431060791, "learning_rate": 4.160561777901537e-05, "loss": 0.4593, "num_input_tokens_seen": 1492768, "step": 4425 }, { "epoch": 3.4234930448222567, "grad_norm": 5.42356538772583, "learning_rate": 4.15803961296368e-05, "loss": 0.6266, "num_input_tokens_seen": 1494304, "step": 4430 }, { "epoch": 3.427357032457496, "grad_norm": 2.0095651149749756, "learning_rate": 4.155514431680783e-05, "loss": 0.5026, "num_input_tokens_seen": 1496000, "step": 4435 }, { "epoch": 3.4312210200927358, "grad_norm": 2.9705262184143066, "learning_rate": 4.1529862386467164e-05, "loss": 0.4165, "num_input_tokens_seen": 1497952, "step": 4440 }, { "epoch": 3.435085007727975, "grad_norm": 2.1989333629608154, "learning_rate": 4.15045503846083e-05, "loss": 0.4317, "num_input_tokens_seen": 1499904, "step": 4445 }, { "epoch": 3.438948995363215, "grad_norm": 1.4867478609085083, "learning_rate": 4.147920835727941e-05, "loss": 0.4319, "num_input_tokens_seen": 1501536, "step": 4450 }, { "epoch": 3.442812982998454, "grad_norm": 3.191936731338501, "learning_rate": 4.145383635058333e-05, "loss": 0.6941, "num_input_tokens_seen": 1503296, "step": 4455 }, { "epoch": 3.446676970633694, "grad_norm": 1.6645140647888184, "learning_rate": 4.142843441067742e-05, "loss": 0.3973, "num_input_tokens_seen": 1505056, "step": 4460 }, { "epoch": 3.4505409582689337, "grad_norm": 1.8803516626358032, "learning_rate": 4.140300258377347e-05, "loss": 0.4814, "num_input_tokens_seen": 1506624, "step": 4465 }, { "epoch": 3.454404945904173, "grad_norm": 1.8822559118270874, "learning_rate": 4.137754091613768e-05, "loss": 0.6442, "num_input_tokens_seen": 1508256, "step": 4470 }, { "epoch": 3.458268933539413, "grad_norm": 1.5514893531799316, "learning_rate": 4.135204945409052e-05, "loss": 0.4404, "num_input_tokens_seen": 1510112, "step": 4475 }, { "epoch": 3.462132921174652, "grad_norm": 3.1590003967285156, "learning_rate": 4.1326528244006666e-05, "loss": 0.5779, "num_input_tokens_seen": 1511904, "step": 4480 }, { "epoch": 3.465996908809892, "grad_norm": 3.6076033115386963, "learning_rate": 4.1300977332314905e-05, "loss": 0.5021, "num_input_tokens_seen": 1513600, "step": 4485 }, { "epoch": 3.469860896445131, "grad_norm": 4.361085414886475, "learning_rate": 4.1275396765498054e-05, "loss": 0.5611, "num_input_tokens_seen": 1515424, "step": 4490 }, { "epoch": 3.473724884080371, "grad_norm": 3.2283270359039307, "learning_rate": 4.1249786590092903e-05, "loss": 0.5902, "num_input_tokens_seen": 1517280, "step": 4495 }, { "epoch": 3.4775888717156107, "grad_norm": 1.8131355047225952, "learning_rate": 4.122414685269008e-05, "loss": 0.442, "num_input_tokens_seen": 1519072, "step": 4500 }, { "epoch": 3.48145285935085, "grad_norm": 4.0203447341918945, "learning_rate": 4.119847759993401e-05, "loss": 0.4543, "num_input_tokens_seen": 1520800, "step": 4505 }, { "epoch": 3.48531684698609, "grad_norm": 4.155941009521484, "learning_rate": 4.11727788785228e-05, "loss": 0.4921, "num_input_tokens_seen": 1522496, "step": 4510 }, { "epoch": 3.489180834621329, "grad_norm": 3.253438949584961, "learning_rate": 4.1147050735208194e-05, "loss": 0.4736, "num_input_tokens_seen": 1524384, "step": 4515 }, { "epoch": 3.493044822256569, "grad_norm": 3.498032331466675, "learning_rate": 4.112129321679541e-05, "loss": 0.4239, "num_input_tokens_seen": 1525792, "step": 4520 }, { "epoch": 3.496908809891808, "grad_norm": 2.2438278198242188, "learning_rate": 4.109550637014317e-05, "loss": 0.5084, "num_input_tokens_seen": 1527712, "step": 4525 }, { "epoch": 3.5, "eval_loss": 0.550849974155426, "eval_runtime": 11.5017, "eval_samples_per_second": 49.993, "eval_steps_per_second": 12.52, "num_input_tokens_seen": 1529024, "step": 4529 }, { "epoch": 3.500772797527048, "grad_norm": 3.2917258739471436, "learning_rate": 4.1069690242163484e-05, "loss": 0.5329, "num_input_tokens_seen": 1529312, "step": 4530 }, { "epoch": 3.5046367851622877, "grad_norm": 1.4445055723190308, "learning_rate": 4.104384487982169e-05, "loss": 0.406, "num_input_tokens_seen": 1531136, "step": 4535 }, { "epoch": 3.508500772797527, "grad_norm": 7.345132350921631, "learning_rate": 4.101797033013628e-05, "loss": 0.7543, "num_input_tokens_seen": 1532896, "step": 4540 }, { "epoch": 3.5123647604327664, "grad_norm": 2.069902181625366, "learning_rate": 4.099206664017885e-05, "loss": 0.4918, "num_input_tokens_seen": 1534880, "step": 4545 }, { "epoch": 3.516228748068006, "grad_norm": 2.2228307723999023, "learning_rate": 4.0966133857074005e-05, "loss": 0.5312, "num_input_tokens_seen": 1536448, "step": 4550 }, { "epoch": 3.520092735703246, "grad_norm": 4.5451273918151855, "learning_rate": 4.094017202799928e-05, "loss": 0.4103, "num_input_tokens_seen": 1538048, "step": 4555 }, { "epoch": 3.523956723338485, "grad_norm": 2.577434539794922, "learning_rate": 4.091418120018504e-05, "loss": 0.5364, "num_input_tokens_seen": 1539584, "step": 4560 }, { "epoch": 3.527820710973725, "grad_norm": 1.8120495080947876, "learning_rate": 4.088816142091444e-05, "loss": 0.4179, "num_input_tokens_seen": 1541152, "step": 4565 }, { "epoch": 3.5316846986089647, "grad_norm": 2.412081480026245, "learning_rate": 4.086211273752326e-05, "loss": 0.5683, "num_input_tokens_seen": 1542752, "step": 4570 }, { "epoch": 3.535548686244204, "grad_norm": 3.0846896171569824, "learning_rate": 4.08360351973999e-05, "loss": 0.499, "num_input_tokens_seen": 1544384, "step": 4575 }, { "epoch": 3.5394126738794434, "grad_norm": 2.0819833278656006, "learning_rate": 4.080992884798522e-05, "loss": 0.4929, "num_input_tokens_seen": 1546112, "step": 4580 }, { "epoch": 3.543276661514683, "grad_norm": 1.5553834438323975, "learning_rate": 4.0783793736772536e-05, "loss": 0.5546, "num_input_tokens_seen": 1547776, "step": 4585 }, { "epoch": 3.547140649149923, "grad_norm": 2.897662401199341, "learning_rate": 4.075762991130744e-05, "loss": 0.4462, "num_input_tokens_seen": 1549504, "step": 4590 }, { "epoch": 3.551004636785162, "grad_norm": 4.422805309295654, "learning_rate": 4.07314374191878e-05, "loss": 0.5088, "num_input_tokens_seen": 1551296, "step": 4595 }, { "epoch": 3.554868624420402, "grad_norm": 3.1693060398101807, "learning_rate": 4.07052163080636e-05, "loss": 0.5185, "num_input_tokens_seen": 1552896, "step": 4600 }, { "epoch": 3.5587326120556413, "grad_norm": 2.3071799278259277, "learning_rate": 4.067896662563694e-05, "loss": 0.4561, "num_input_tokens_seen": 1554688, "step": 4605 }, { "epoch": 3.562596599690881, "grad_norm": 2.039830446243286, "learning_rate": 4.0652688419661835e-05, "loss": 0.4922, "num_input_tokens_seen": 1556192, "step": 4610 }, { "epoch": 3.5664605873261204, "grad_norm": 1.5888996124267578, "learning_rate": 4.0626381737944234e-05, "loss": 0.4065, "num_input_tokens_seen": 1557600, "step": 4615 }, { "epoch": 3.57032457496136, "grad_norm": 2.1365628242492676, "learning_rate": 4.060004662834188e-05, "loss": 0.4474, "num_input_tokens_seen": 1559296, "step": 4620 }, { "epoch": 3.5741885625966, "grad_norm": 2.5520541667938232, "learning_rate": 4.057368313876423e-05, "loss": 0.6656, "num_input_tokens_seen": 1560864, "step": 4625 }, { "epoch": 3.578052550231839, "grad_norm": 1.28108549118042, "learning_rate": 4.0547291317172354e-05, "loss": 0.4686, "num_input_tokens_seen": 1562592, "step": 4630 }, { "epoch": 3.581916537867079, "grad_norm": 3.876394271850586, "learning_rate": 4.0520871211578904e-05, "loss": 0.5314, "num_input_tokens_seen": 1564160, "step": 4635 }, { "epoch": 3.5857805255023183, "grad_norm": 1.4100345373153687, "learning_rate": 4.049442287004795e-05, "loss": 0.4764, "num_input_tokens_seen": 1565664, "step": 4640 }, { "epoch": 3.589644513137558, "grad_norm": 5.910153865814209, "learning_rate": 4.046794634069495e-05, "loss": 0.4983, "num_input_tokens_seen": 1567264, "step": 4645 }, { "epoch": 3.5935085007727974, "grad_norm": 2.807605266571045, "learning_rate": 4.0441441671686625e-05, "loss": 0.5281, "num_input_tokens_seen": 1568992, "step": 4650 }, { "epoch": 3.597372488408037, "grad_norm": 1.1230469942092896, "learning_rate": 4.041490891124091e-05, "loss": 0.4257, "num_input_tokens_seen": 1571104, "step": 4655 }, { "epoch": 3.601236476043277, "grad_norm": 2.148188352584839, "learning_rate": 4.038834810762682e-05, "loss": 0.5095, "num_input_tokens_seen": 1572640, "step": 4660 }, { "epoch": 3.605100463678516, "grad_norm": 2.283417224884033, "learning_rate": 4.0361759309164415e-05, "loss": 0.3919, "num_input_tokens_seen": 1574240, "step": 4665 }, { "epoch": 3.6089644513137555, "grad_norm": 2.0869781970977783, "learning_rate": 4.0335142564224646e-05, "loss": 0.6335, "num_input_tokens_seen": 1575968, "step": 4670 }, { "epoch": 3.6128284389489953, "grad_norm": 1.627146601676941, "learning_rate": 4.030849792122936e-05, "loss": 0.457, "num_input_tokens_seen": 1578112, "step": 4675 }, { "epoch": 3.616692426584235, "grad_norm": 1.6836369037628174, "learning_rate": 4.02818254286511e-05, "loss": 0.7873, "num_input_tokens_seen": 1579552, "step": 4680 }, { "epoch": 3.6205564142194744, "grad_norm": 2.647073745727539, "learning_rate": 4.02551251350131e-05, "loss": 0.4497, "num_input_tokens_seen": 1580960, "step": 4685 }, { "epoch": 3.624420401854714, "grad_norm": 1.9089688062667847, "learning_rate": 4.022839708888918e-05, "loss": 0.4338, "num_input_tokens_seen": 1582496, "step": 4690 }, { "epoch": 3.628284389489954, "grad_norm": 1.430537462234497, "learning_rate": 4.020164133890362e-05, "loss": 0.427, "num_input_tokens_seen": 1584288, "step": 4695 }, { "epoch": 3.6321483771251932, "grad_norm": 1.7731047868728638, "learning_rate": 4.017485793373112e-05, "loss": 0.4109, "num_input_tokens_seen": 1586080, "step": 4700 }, { "epoch": 3.6360123647604325, "grad_norm": 3.2447168827056885, "learning_rate": 4.0148046922096704e-05, "loss": 0.473, "num_input_tokens_seen": 1587744, "step": 4705 }, { "epoch": 3.6398763523956723, "grad_norm": 3.6456520557403564, "learning_rate": 4.012120835277559e-05, "loss": 0.5806, "num_input_tokens_seen": 1589568, "step": 4710 }, { "epoch": 3.643740340030912, "grad_norm": 1.5505635738372803, "learning_rate": 4.0094342274593135e-05, "loss": 0.3988, "num_input_tokens_seen": 1591168, "step": 4715 }, { "epoch": 3.6476043276661514, "grad_norm": 3.324371099472046, "learning_rate": 4.006744873642474e-05, "loss": 0.6921, "num_input_tokens_seen": 1592864, "step": 4720 }, { "epoch": 3.651468315301391, "grad_norm": 1.9796169996261597, "learning_rate": 4.004052778719579e-05, "loss": 0.4841, "num_input_tokens_seen": 1594400, "step": 4725 }, { "epoch": 3.6553323029366305, "grad_norm": 4.565962791442871, "learning_rate": 4.00135794758815e-05, "loss": 0.6156, "num_input_tokens_seen": 1596416, "step": 4730 }, { "epoch": 3.6591962905718702, "grad_norm": 3.4246714115142822, "learning_rate": 3.998660385150688e-05, "loss": 0.4792, "num_input_tokens_seen": 1597952, "step": 4735 }, { "epoch": 3.6630602782071096, "grad_norm": 1.9151349067687988, "learning_rate": 3.995960096314663e-05, "loss": 0.4115, "num_input_tokens_seen": 1599456, "step": 4740 }, { "epoch": 3.6669242658423493, "grad_norm": 5.913809776306152, "learning_rate": 3.993257085992505e-05, "loss": 0.4771, "num_input_tokens_seen": 1601216, "step": 4745 }, { "epoch": 3.670788253477589, "grad_norm": 3.0290043354034424, "learning_rate": 3.990551359101593e-05, "loss": 0.4592, "num_input_tokens_seen": 1602912, "step": 4750 }, { "epoch": 3.6746522411128284, "grad_norm": 10.1158447265625, "learning_rate": 3.987842920564251e-05, "loss": 0.6198, "num_input_tokens_seen": 1604704, "step": 4755 }, { "epoch": 3.678516228748068, "grad_norm": 4.170707702636719, "learning_rate": 3.9851317753077345e-05, "loss": 0.4648, "num_input_tokens_seen": 1606240, "step": 4760 }, { "epoch": 3.6823802163833075, "grad_norm": 0.9760448932647705, "learning_rate": 3.982417928264223e-05, "loss": 0.5706, "num_input_tokens_seen": 1607968, "step": 4765 }, { "epoch": 3.6862442040185472, "grad_norm": 1.4732191562652588, "learning_rate": 3.979701384370812e-05, "loss": 0.512, "num_input_tokens_seen": 1609696, "step": 4770 }, { "epoch": 3.6901081916537866, "grad_norm": 3.5462565422058105, "learning_rate": 3.976982148569502e-05, "loss": 0.7374, "num_input_tokens_seen": 1611488, "step": 4775 }, { "epoch": 3.6939721792890263, "grad_norm": 2.7321224212646484, "learning_rate": 3.974260225807192e-05, "loss": 0.5906, "num_input_tokens_seen": 1612928, "step": 4780 }, { "epoch": 3.697836166924266, "grad_norm": 1.3831275701522827, "learning_rate": 3.971535621035667e-05, "loss": 0.5917, "num_input_tokens_seen": 1614752, "step": 4785 }, { "epoch": 3.7017001545595054, "grad_norm": 1.7038514614105225, "learning_rate": 3.968808339211595e-05, "loss": 0.6131, "num_input_tokens_seen": 1616256, "step": 4790 }, { "epoch": 3.7055641421947447, "grad_norm": 2.353518009185791, "learning_rate": 3.96607838529651e-05, "loss": 0.9057, "num_input_tokens_seen": 1617952, "step": 4795 }, { "epoch": 3.7094281298299845, "grad_norm": 2.5694596767425537, "learning_rate": 3.963345764256809e-05, "loss": 0.3776, "num_input_tokens_seen": 1619744, "step": 4800 }, { "epoch": 3.7132921174652243, "grad_norm": 1.4358855485916138, "learning_rate": 3.960610481063744e-05, "loss": 0.5013, "num_input_tokens_seen": 1621472, "step": 4805 }, { "epoch": 3.7171561051004636, "grad_norm": 1.8375383615493774, "learning_rate": 3.957872540693404e-05, "loss": 0.5038, "num_input_tokens_seen": 1623040, "step": 4810 }, { "epoch": 3.7210200927357033, "grad_norm": 1.322088599205017, "learning_rate": 3.955131948126716e-05, "loss": 0.4809, "num_input_tokens_seen": 1624704, "step": 4815 }, { "epoch": 3.7248840803709427, "grad_norm": 3.1233975887298584, "learning_rate": 3.95238870834943e-05, "loss": 0.5791, "num_input_tokens_seen": 1626400, "step": 4820 }, { "epoch": 3.7287480680061824, "grad_norm": 5.707540035247803, "learning_rate": 3.949642826352116e-05, "loss": 0.4698, "num_input_tokens_seen": 1628160, "step": 4825 }, { "epoch": 3.7326120556414217, "grad_norm": 2.3650286197662354, "learning_rate": 3.946894307130145e-05, "loss": 0.4444, "num_input_tokens_seen": 1629696, "step": 4830 }, { "epoch": 3.7364760432766615, "grad_norm": 1.3059293031692505, "learning_rate": 3.9441431556836894e-05, "loss": 0.5745, "num_input_tokens_seen": 1631392, "step": 4835 }, { "epoch": 3.7403400309119013, "grad_norm": 2.6042678356170654, "learning_rate": 3.941389377017709e-05, "loss": 0.9394, "num_input_tokens_seen": 1633056, "step": 4840 }, { "epoch": 3.7442040185471406, "grad_norm": 1.1605465412139893, "learning_rate": 3.938632976141944e-05, "loss": 0.5431, "num_input_tokens_seen": 1634624, "step": 4845 }, { "epoch": 3.7480680061823803, "grad_norm": 1.0997707843780518, "learning_rate": 3.935873958070904e-05, "loss": 0.6423, "num_input_tokens_seen": 1636512, "step": 4850 }, { "epoch": 3.7519319938176197, "grad_norm": 1.4378665685653687, "learning_rate": 3.933112327823859e-05, "loss": 0.4012, "num_input_tokens_seen": 1637920, "step": 4855 }, { "epoch": 3.7557959814528594, "grad_norm": 2.507401704788208, "learning_rate": 3.930348090424834e-05, "loss": 0.8409, "num_input_tokens_seen": 1639616, "step": 4860 }, { "epoch": 3.7596599690880987, "grad_norm": 2.645219564437866, "learning_rate": 3.927581250902595e-05, "loss": 0.4849, "num_input_tokens_seen": 1641664, "step": 4865 }, { "epoch": 3.7635239567233385, "grad_norm": 1.3951945304870605, "learning_rate": 3.9248118142906415e-05, "loss": 0.4124, "num_input_tokens_seen": 1643392, "step": 4870 }, { "epoch": 3.7673879443585783, "grad_norm": 1.9504481554031372, "learning_rate": 3.922039785627198e-05, "loss": 0.6406, "num_input_tokens_seen": 1645216, "step": 4875 }, { "epoch": 3.7712519319938176, "grad_norm": 2.486978054046631, "learning_rate": 3.919265169955207e-05, "loss": 0.4944, "num_input_tokens_seen": 1646816, "step": 4880 }, { "epoch": 3.7751159196290573, "grad_norm": 1.270833969116211, "learning_rate": 3.916487972322313e-05, "loss": 0.3723, "num_input_tokens_seen": 1648640, "step": 4885 }, { "epoch": 3.7789799072642967, "grad_norm": 1.8053007125854492, "learning_rate": 3.913708197780861e-05, "loss": 0.4793, "num_input_tokens_seen": 1650368, "step": 4890 }, { "epoch": 3.7828438948995364, "grad_norm": 2.050266981124878, "learning_rate": 3.910925851387882e-05, "loss": 0.6278, "num_input_tokens_seen": 1652160, "step": 4895 }, { "epoch": 3.7867078825347757, "grad_norm": 4.034176826477051, "learning_rate": 3.9081409382050876e-05, "loss": 0.4873, "num_input_tokens_seen": 1654112, "step": 4900 }, { "epoch": 3.7905718701700155, "grad_norm": 1.6664047241210938, "learning_rate": 3.9053534632988576e-05, "loss": 0.5267, "num_input_tokens_seen": 1655744, "step": 4905 }, { "epoch": 3.7944358578052553, "grad_norm": 1.5538736581802368, "learning_rate": 3.902563431740233e-05, "loss": 0.3813, "num_input_tokens_seen": 1657408, "step": 4910 }, { "epoch": 3.7982998454404946, "grad_norm": 1.5436431169509888, "learning_rate": 3.899770848604904e-05, "loss": 0.6297, "num_input_tokens_seen": 1659232, "step": 4915 }, { "epoch": 3.802163833075734, "grad_norm": 1.6372663974761963, "learning_rate": 3.8969757189732056e-05, "loss": 0.5873, "num_input_tokens_seen": 1660928, "step": 4920 }, { "epoch": 3.8060278207109737, "grad_norm": 1.490795612335205, "learning_rate": 3.894178047930104e-05, "loss": 0.9154, "num_input_tokens_seen": 1662720, "step": 4925 }, { "epoch": 3.8098918083462134, "grad_norm": 1.2131822109222412, "learning_rate": 3.891377840565188e-05, "loss": 0.4745, "num_input_tokens_seen": 1664352, "step": 4930 }, { "epoch": 3.8137557959814528, "grad_norm": 3.2687582969665527, "learning_rate": 3.888575101972661e-05, "loss": 0.3969, "num_input_tokens_seen": 1665952, "step": 4935 }, { "epoch": 3.8176197836166925, "grad_norm": 1.911726713180542, "learning_rate": 3.8857698372513334e-05, "loss": 0.4365, "num_input_tokens_seen": 1667456, "step": 4940 }, { "epoch": 3.821483771251932, "grad_norm": 2.6701714992523193, "learning_rate": 3.882962051504608e-05, "loss": 0.6456, "num_input_tokens_seen": 1668992, "step": 4945 }, { "epoch": 3.8253477588871716, "grad_norm": 2.351722240447998, "learning_rate": 3.8801517498404755e-05, "loss": 0.6494, "num_input_tokens_seen": 1670496, "step": 4950 }, { "epoch": 3.829211746522411, "grad_norm": 3.0430142879486084, "learning_rate": 3.877338937371504e-05, "loss": 0.5158, "num_input_tokens_seen": 1672096, "step": 4955 }, { "epoch": 3.8330757341576507, "grad_norm": 4.256603240966797, "learning_rate": 3.8745236192148274e-05, "loss": 0.4942, "num_input_tokens_seen": 1673856, "step": 4960 }, { "epoch": 3.8369397217928904, "grad_norm": 2.4980695247650146, "learning_rate": 3.871705800492141e-05, "loss": 0.6823, "num_input_tokens_seen": 1675424, "step": 4965 }, { "epoch": 3.8408037094281298, "grad_norm": 1.4811455011367798, "learning_rate": 3.868885486329686e-05, "loss": 0.4987, "num_input_tokens_seen": 1677216, "step": 4970 }, { "epoch": 3.8446676970633695, "grad_norm": 3.4305388927459717, "learning_rate": 3.866062681858247e-05, "loss": 0.5322, "num_input_tokens_seen": 1678784, "step": 4975 }, { "epoch": 3.848531684698609, "grad_norm": 1.5239381790161133, "learning_rate": 3.863237392213135e-05, "loss": 0.3984, "num_input_tokens_seen": 1680448, "step": 4980 }, { "epoch": 3.8523956723338486, "grad_norm": 2.3725664615631104, "learning_rate": 3.860409622534184e-05, "loss": 0.453, "num_input_tokens_seen": 1682112, "step": 4985 }, { "epoch": 3.856259659969088, "grad_norm": 1.3638439178466797, "learning_rate": 3.857579377965741e-05, "loss": 0.4427, "num_input_tokens_seen": 1683680, "step": 4990 }, { "epoch": 3.8601236476043277, "grad_norm": 1.5944757461547852, "learning_rate": 3.8547466636566534e-05, "loss": 0.415, "num_input_tokens_seen": 1685248, "step": 4995 }, { "epoch": 3.8639876352395675, "grad_norm": 1.265494704246521, "learning_rate": 3.851911484760262e-05, "loss": 0.4696, "num_input_tokens_seen": 1686880, "step": 5000 }, { "epoch": 3.8678516228748068, "grad_norm": 1.9884569644927979, "learning_rate": 3.849073846434393e-05, "loss": 0.4823, "num_input_tokens_seen": 1688768, "step": 5005 }, { "epoch": 3.871715610510046, "grad_norm": 3.226297378540039, "learning_rate": 3.846233753841344e-05, "loss": 0.5847, "num_input_tokens_seen": 1690400, "step": 5010 }, { "epoch": 3.875579598145286, "grad_norm": 2.414877414703369, "learning_rate": 3.8433912121478786e-05, "loss": 0.5417, "num_input_tokens_seen": 1692192, "step": 5015 }, { "epoch": 3.8794435857805256, "grad_norm": 1.3912780284881592, "learning_rate": 3.840546226525218e-05, "loss": 0.4024, "num_input_tokens_seen": 1693760, "step": 5020 }, { "epoch": 3.883307573415765, "grad_norm": 1.4946731328964233, "learning_rate": 3.8376988021490265e-05, "loss": 0.3425, "num_input_tokens_seen": 1695328, "step": 5025 }, { "epoch": 3.8871715610510047, "grad_norm": 4.56956672668457, "learning_rate": 3.834848944199406e-05, "loss": 0.6259, "num_input_tokens_seen": 1696960, "step": 5030 }, { "epoch": 3.8910355486862445, "grad_norm": 3.488584518432617, "learning_rate": 3.8319966578608856e-05, "loss": 0.6062, "num_input_tokens_seen": 1698688, "step": 5035 }, { "epoch": 3.894899536321484, "grad_norm": 1.4802907705307007, "learning_rate": 3.829141948322412e-05, "loss": 0.3409, "num_input_tokens_seen": 1700480, "step": 5040 }, { "epoch": 3.898763523956723, "grad_norm": 1.796071171760559, "learning_rate": 3.8262848207773435e-05, "loss": 0.3787, "num_input_tokens_seen": 1702240, "step": 5045 }, { "epoch": 3.902627511591963, "grad_norm": 2.786713123321533, "learning_rate": 3.82342528042343e-05, "loss": 0.8356, "num_input_tokens_seen": 1703712, "step": 5050 }, { "epoch": 3.9064914992272026, "grad_norm": 1.9073864221572876, "learning_rate": 3.8205633324628185e-05, "loss": 0.6845, "num_input_tokens_seen": 1705536, "step": 5055 }, { "epoch": 3.910355486862442, "grad_norm": 1.0513015985488892, "learning_rate": 3.8176989821020315e-05, "loss": 0.4755, "num_input_tokens_seen": 1707136, "step": 5060 }, { "epoch": 3.9142194744976817, "grad_norm": 4.305631160736084, "learning_rate": 3.814832234551963e-05, "loss": 0.4821, "num_input_tokens_seen": 1708512, "step": 5065 }, { "epoch": 3.918083462132921, "grad_norm": 1.9722061157226562, "learning_rate": 3.811963095027868e-05, "loss": 0.5965, "num_input_tokens_seen": 1710400, "step": 5070 }, { "epoch": 3.921947449768161, "grad_norm": 1.7969028949737549, "learning_rate": 3.8090915687493545e-05, "loss": 0.3895, "num_input_tokens_seen": 1712032, "step": 5075 }, { "epoch": 3.9258114374034, "grad_norm": 2.7149131298065186, "learning_rate": 3.806217660940371e-05, "loss": 0.5084, "num_input_tokens_seen": 1714016, "step": 5080 }, { "epoch": 3.92967542503864, "grad_norm": 1.920792818069458, "learning_rate": 3.803341376829197e-05, "loss": 0.4416, "num_input_tokens_seen": 1715456, "step": 5085 }, { "epoch": 3.9335394126738796, "grad_norm": 4.611039638519287, "learning_rate": 3.800462721648439e-05, "loss": 0.5159, "num_input_tokens_seen": 1717280, "step": 5090 }, { "epoch": 3.937403400309119, "grad_norm": 2.226130485534668, "learning_rate": 3.797581700635014e-05, "loss": 0.3892, "num_input_tokens_seen": 1718816, "step": 5095 }, { "epoch": 3.9412673879443587, "grad_norm": 1.4658949375152588, "learning_rate": 3.7946983190301437e-05, "loss": 0.4441, "num_input_tokens_seen": 1720320, "step": 5100 }, { "epoch": 3.945131375579598, "grad_norm": 1.7567282915115356, "learning_rate": 3.7918125820793446e-05, "loss": 0.477, "num_input_tokens_seen": 1722016, "step": 5105 }, { "epoch": 3.948995363214838, "grad_norm": 3.1927144527435303, "learning_rate": 3.788924495032419e-05, "loss": 0.5519, "num_input_tokens_seen": 1723904, "step": 5110 }, { "epoch": 3.952859350850077, "grad_norm": 1.3572742938995361, "learning_rate": 3.7860340631434416e-05, "loss": 0.6373, "num_input_tokens_seen": 1725568, "step": 5115 }, { "epoch": 3.956723338485317, "grad_norm": 1.7375192642211914, "learning_rate": 3.783141291670755e-05, "loss": 0.5365, "num_input_tokens_seen": 1727424, "step": 5120 }, { "epoch": 3.9605873261205566, "grad_norm": 1.8609740734100342, "learning_rate": 3.780246185876959e-05, "loss": 0.3948, "num_input_tokens_seen": 1728896, "step": 5125 }, { "epoch": 3.964451313755796, "grad_norm": 2.1998870372772217, "learning_rate": 3.7773487510288986e-05, "loss": 0.4837, "num_input_tokens_seen": 1730624, "step": 5130 }, { "epoch": 3.9683153013910353, "grad_norm": 1.0121334791183472, "learning_rate": 3.774448992397655e-05, "loss": 0.3668, "num_input_tokens_seen": 1732192, "step": 5135 }, { "epoch": 3.972179289026275, "grad_norm": 1.7320263385772705, "learning_rate": 3.771546915258538e-05, "loss": 0.3561, "num_input_tokens_seen": 1733728, "step": 5140 }, { "epoch": 3.976043276661515, "grad_norm": 1.5379325151443481, "learning_rate": 3.768642524891076e-05, "loss": 0.4193, "num_input_tokens_seen": 1735136, "step": 5145 }, { "epoch": 3.979907264296754, "grad_norm": 2.691956043243408, "learning_rate": 3.765735826579006e-05, "loss": 0.4148, "num_input_tokens_seen": 1736704, "step": 5150 }, { "epoch": 3.983771251931994, "grad_norm": 1.503035306930542, "learning_rate": 3.762826825610259e-05, "loss": 0.4015, "num_input_tokens_seen": 1738240, "step": 5155 }, { "epoch": 3.9876352395672336, "grad_norm": 4.403746128082275, "learning_rate": 3.759915527276963e-05, "loss": 0.4945, "num_input_tokens_seen": 1740032, "step": 5160 }, { "epoch": 3.991499227202473, "grad_norm": 0.9850881695747375, "learning_rate": 3.757001936875418e-05, "loss": 0.5916, "num_input_tokens_seen": 1741824, "step": 5165 }, { "epoch": 3.9953632148377123, "grad_norm": 2.4471755027770996, "learning_rate": 3.754086059706099e-05, "loss": 0.3647, "num_input_tokens_seen": 1743488, "step": 5170 }, { "epoch": 3.999227202472952, "grad_norm": 1.7338567972183228, "learning_rate": 3.7511679010736364e-05, "loss": 0.3727, "num_input_tokens_seen": 1745344, "step": 5175 }, { "epoch": 4.0, "eval_loss": 0.5626021027565002, "eval_runtime": 11.5023, "eval_samples_per_second": 49.99, "eval_steps_per_second": 12.519, "num_input_tokens_seen": 1745568, "step": 5176 }, { "epoch": 4.003091190108192, "grad_norm": 1.6083275079727173, "learning_rate": 3.748247466286816e-05, "loss": 0.4206, "num_input_tokens_seen": 1747200, "step": 5180 }, { "epoch": 4.006955177743431, "grad_norm": 2.635146141052246, "learning_rate": 3.745324760658561e-05, "loss": 0.4985, "num_input_tokens_seen": 1748736, "step": 5185 }, { "epoch": 4.0108191653786704, "grad_norm": 2.378819465637207, "learning_rate": 3.742399789505928e-05, "loss": 0.456, "num_input_tokens_seen": 1750496, "step": 5190 }, { "epoch": 4.014683153013911, "grad_norm": 1.5652929544448853, "learning_rate": 3.739472558150092e-05, "loss": 0.4508, "num_input_tokens_seen": 1752544, "step": 5195 }, { "epoch": 4.01854714064915, "grad_norm": 1.6911450624465942, "learning_rate": 3.736543071916344e-05, "loss": 0.3542, "num_input_tokens_seen": 1754144, "step": 5200 }, { "epoch": 4.022411128284389, "grad_norm": 2.4116127490997314, "learning_rate": 3.733611336134073e-05, "loss": 0.4294, "num_input_tokens_seen": 1755904, "step": 5205 }, { "epoch": 4.0262751159196295, "grad_norm": 3.8481626510620117, "learning_rate": 3.730677356136763e-05, "loss": 0.6862, "num_input_tokens_seen": 1757312, "step": 5210 }, { "epoch": 4.030139103554869, "grad_norm": 2.2477214336395264, "learning_rate": 3.7277411372619816e-05, "loss": 0.3901, "num_input_tokens_seen": 1759200, "step": 5215 }, { "epoch": 4.034003091190108, "grad_norm": 3.578573226928711, "learning_rate": 3.724802684851365e-05, "loss": 0.5504, "num_input_tokens_seen": 1760800, "step": 5220 }, { "epoch": 4.0378670788253475, "grad_norm": 3.8079307079315186, "learning_rate": 3.7218620042506185e-05, "loss": 0.549, "num_input_tokens_seen": 1762560, "step": 5225 }, { "epoch": 4.041731066460588, "grad_norm": 4.72942590713501, "learning_rate": 3.718919100809497e-05, "loss": 0.5943, "num_input_tokens_seen": 1764192, "step": 5230 }, { "epoch": 4.045595054095827, "grad_norm": 3.5094168186187744, "learning_rate": 3.715973979881799e-05, "loss": 0.5788, "num_input_tokens_seen": 1765792, "step": 5235 }, { "epoch": 4.049459041731066, "grad_norm": 3.629093647003174, "learning_rate": 3.71302664682536e-05, "loss": 0.6048, "num_input_tokens_seen": 1767424, "step": 5240 }, { "epoch": 4.053323029366306, "grad_norm": 2.0797622203826904, "learning_rate": 3.710077107002039e-05, "loss": 0.4141, "num_input_tokens_seen": 1768960, "step": 5245 }, { "epoch": 4.057187017001546, "grad_norm": 4.990087032318115, "learning_rate": 3.707125365777707e-05, "loss": 0.4889, "num_input_tokens_seen": 1770752, "step": 5250 }, { "epoch": 4.061051004636785, "grad_norm": 1.9308886528015137, "learning_rate": 3.704171428522242e-05, "loss": 0.4512, "num_input_tokens_seen": 1772352, "step": 5255 }, { "epoch": 4.0649149922720245, "grad_norm": 3.0602550506591797, "learning_rate": 3.7012153006095184e-05, "loss": 0.3791, "num_input_tokens_seen": 1773984, "step": 5260 }, { "epoch": 4.068778979907265, "grad_norm": 1.8320131301879883, "learning_rate": 3.698256987417392e-05, "loss": 0.595, "num_input_tokens_seen": 1775904, "step": 5265 }, { "epoch": 4.072642967542504, "grad_norm": 1.5172168016433716, "learning_rate": 3.695296494327698e-05, "loss": 0.4099, "num_input_tokens_seen": 1777568, "step": 5270 }, { "epoch": 4.076506955177743, "grad_norm": 1.1387360095977783, "learning_rate": 3.692333826726236e-05, "loss": 0.4512, "num_input_tokens_seen": 1779232, "step": 5275 }, { "epoch": 4.080370942812983, "grad_norm": 4.887117385864258, "learning_rate": 3.68936899000276e-05, "loss": 0.4706, "num_input_tokens_seen": 1780928, "step": 5280 }, { "epoch": 4.084234930448223, "grad_norm": 3.288454532623291, "learning_rate": 3.6864019895509724e-05, "loss": 0.5698, "num_input_tokens_seen": 1782656, "step": 5285 }, { "epoch": 4.088098918083462, "grad_norm": 3.245821952819824, "learning_rate": 3.68343283076851e-05, "loss": 0.4182, "num_input_tokens_seen": 1784096, "step": 5290 }, { "epoch": 4.0919629057187015, "grad_norm": 1.9624673128128052, "learning_rate": 3.680461519056938e-05, "loss": 0.455, "num_input_tokens_seen": 1786048, "step": 5295 }, { "epoch": 4.095826893353942, "grad_norm": 1.4714199304580688, "learning_rate": 3.677488059821736e-05, "loss": 0.5277, "num_input_tokens_seen": 1787616, "step": 5300 }, { "epoch": 4.099690880989181, "grad_norm": 1.3340038061141968, "learning_rate": 3.674512458472293e-05, "loss": 0.4945, "num_input_tokens_seen": 1789248, "step": 5305 }, { "epoch": 4.10355486862442, "grad_norm": 1.0320912599563599, "learning_rate": 3.6715347204218924e-05, "loss": 0.4587, "num_input_tokens_seen": 1790944, "step": 5310 }, { "epoch": 4.10741885625966, "grad_norm": 2.636582136154175, "learning_rate": 3.6685548510877064e-05, "loss": 0.426, "num_input_tokens_seen": 1792672, "step": 5315 }, { "epoch": 4.1112828438949, "grad_norm": 2.8037703037261963, "learning_rate": 3.665572855890785e-05, "loss": 0.4266, "num_input_tokens_seen": 1794528, "step": 5320 }, { "epoch": 4.115146831530139, "grad_norm": 2.9024786949157715, "learning_rate": 3.6625887402560434e-05, "loss": 0.5252, "num_input_tokens_seen": 1796064, "step": 5325 }, { "epoch": 4.1190108191653785, "grad_norm": 4.277598857879639, "learning_rate": 3.659602509612257e-05, "loss": 0.6438, "num_input_tokens_seen": 1797952, "step": 5330 }, { "epoch": 4.122874806800619, "grad_norm": 1.3794749975204468, "learning_rate": 3.6566141693920454e-05, "loss": 0.3591, "num_input_tokens_seen": 1799648, "step": 5335 }, { "epoch": 4.126738794435858, "grad_norm": 1.9509167671203613, "learning_rate": 3.65362372503187e-05, "loss": 0.5205, "num_input_tokens_seen": 1801248, "step": 5340 }, { "epoch": 4.130602782071097, "grad_norm": 1.911092758178711, "learning_rate": 3.650631181972018e-05, "loss": 0.6353, "num_input_tokens_seen": 1802848, "step": 5345 }, { "epoch": 4.134466769706337, "grad_norm": 1.6311448812484741, "learning_rate": 3.647636545656593e-05, "loss": 0.487, "num_input_tokens_seen": 1804704, "step": 5350 }, { "epoch": 4.138330757341577, "grad_norm": 1.9933803081512451, "learning_rate": 3.6446398215335106e-05, "loss": 0.4414, "num_input_tokens_seen": 1806560, "step": 5355 }, { "epoch": 4.142194744976816, "grad_norm": 1.720354676246643, "learning_rate": 3.641641015054482e-05, "loss": 0.8598, "num_input_tokens_seen": 1808288, "step": 5360 }, { "epoch": 4.1460587326120555, "grad_norm": 1.2043862342834473, "learning_rate": 3.638640131675006e-05, "loss": 0.4483, "num_input_tokens_seen": 1810080, "step": 5365 }, { "epoch": 4.149922720247295, "grad_norm": 2.231757640838623, "learning_rate": 3.6356371768543626e-05, "loss": 0.4138, "num_input_tokens_seen": 1811584, "step": 5370 }, { "epoch": 4.153786707882535, "grad_norm": 3.17195200920105, "learning_rate": 3.632632156055597e-05, "loss": 0.3739, "num_input_tokens_seen": 1813344, "step": 5375 }, { "epoch": 4.157650695517774, "grad_norm": 1.5969557762145996, "learning_rate": 3.6296250747455154e-05, "loss": 0.3946, "num_input_tokens_seen": 1815232, "step": 5380 }, { "epoch": 4.161514683153014, "grad_norm": 1.3702871799468994, "learning_rate": 3.6266159383946716e-05, "loss": 0.446, "num_input_tokens_seen": 1817088, "step": 5385 }, { "epoch": 4.165378670788254, "grad_norm": 2.7975103855133057, "learning_rate": 3.6236047524773576e-05, "loss": 0.357, "num_input_tokens_seen": 1818624, "step": 5390 }, { "epoch": 4.169242658423493, "grad_norm": 2.289538860321045, "learning_rate": 3.6205915224715936e-05, "loss": 0.457, "num_input_tokens_seen": 1820416, "step": 5395 }, { "epoch": 4.1731066460587325, "grad_norm": 4.094809055328369, "learning_rate": 3.617576253859121e-05, "loss": 0.4585, "num_input_tokens_seen": 1822464, "step": 5400 }, { "epoch": 4.176970633693972, "grad_norm": 3.0370328426361084, "learning_rate": 3.6145589521253855e-05, "loss": 0.464, "num_input_tokens_seen": 1824416, "step": 5405 }, { "epoch": 4.180834621329212, "grad_norm": 3.9973275661468506, "learning_rate": 3.611539622759536e-05, "loss": 0.5359, "num_input_tokens_seen": 1826080, "step": 5410 }, { "epoch": 4.184698608964451, "grad_norm": 3.594961404800415, "learning_rate": 3.608518271254409e-05, "loss": 0.5141, "num_input_tokens_seen": 1827744, "step": 5415 }, { "epoch": 4.188562596599691, "grad_norm": 1.256643295288086, "learning_rate": 3.605494903106516e-05, "loss": 0.3862, "num_input_tokens_seen": 1829664, "step": 5420 }, { "epoch": 4.192426584234931, "grad_norm": 2.4838547706604004, "learning_rate": 3.6024695238160424e-05, "loss": 0.473, "num_input_tokens_seen": 1831392, "step": 5425 }, { "epoch": 4.19629057187017, "grad_norm": 2.0141899585723877, "learning_rate": 3.599442138886829e-05, "loss": 0.841, "num_input_tokens_seen": 1833120, "step": 5430 }, { "epoch": 4.2001545595054095, "grad_norm": 3.1595821380615234, "learning_rate": 3.596412753826367e-05, "loss": 0.5744, "num_input_tokens_seen": 1834816, "step": 5435 }, { "epoch": 4.204018547140649, "grad_norm": 1.6489276885986328, "learning_rate": 3.593381374145785e-05, "loss": 0.3898, "num_input_tokens_seen": 1836800, "step": 5440 }, { "epoch": 4.207882534775889, "grad_norm": 2.6347107887268066, "learning_rate": 3.590348005359841e-05, "loss": 0.5196, "num_input_tokens_seen": 1838528, "step": 5445 }, { "epoch": 4.211746522411128, "grad_norm": 2.6562795639038086, "learning_rate": 3.5873126529869136e-05, "loss": 0.4635, "num_input_tokens_seen": 1840256, "step": 5450 }, { "epoch": 4.215610510046368, "grad_norm": 4.593440055847168, "learning_rate": 3.584275322548985e-05, "loss": 0.557, "num_input_tokens_seen": 1841664, "step": 5455 }, { "epoch": 4.219474497681608, "grad_norm": 1.4001777172088623, "learning_rate": 3.581236019571641e-05, "loss": 0.4588, "num_input_tokens_seen": 1843136, "step": 5460 }, { "epoch": 4.223338485316847, "grad_norm": 1.3685240745544434, "learning_rate": 3.578194749584051e-05, "loss": 0.7368, "num_input_tokens_seen": 1844832, "step": 5465 }, { "epoch": 4.2272024729520865, "grad_norm": 2.245004892349243, "learning_rate": 3.5751515181189684e-05, "loss": 0.6633, "num_input_tokens_seen": 1846592, "step": 5470 }, { "epoch": 4.231066460587326, "grad_norm": 1.1230907440185547, "learning_rate": 3.572106330712711e-05, "loss": 0.42, "num_input_tokens_seen": 1848160, "step": 5475 }, { "epoch": 4.234930448222566, "grad_norm": 3.3352210521698, "learning_rate": 3.569059192905156e-05, "loss": 0.6017, "num_input_tokens_seen": 1849696, "step": 5480 }, { "epoch": 4.238794435857805, "grad_norm": 2.7249703407287598, "learning_rate": 3.5660101102397295e-05, "loss": 0.3956, "num_input_tokens_seen": 1851136, "step": 5485 }, { "epoch": 4.242658423493045, "grad_norm": 1.4421058893203735, "learning_rate": 3.5629590882633946e-05, "loss": 0.4177, "num_input_tokens_seen": 1852896, "step": 5490 }, { "epoch": 4.246522411128284, "grad_norm": 2.9525179862976074, "learning_rate": 3.559906132526643e-05, "loss": 0.4343, "num_input_tokens_seen": 1854464, "step": 5495 }, { "epoch": 4.250386398763524, "grad_norm": 2.6593546867370605, "learning_rate": 3.556851248583483e-05, "loss": 0.4603, "num_input_tokens_seen": 1856160, "step": 5500 }, { "epoch": 4.2542503863987635, "grad_norm": 2.115382432937622, "learning_rate": 3.553794441991434e-05, "loss": 0.4979, "num_input_tokens_seen": 1857728, "step": 5505 }, { "epoch": 4.258114374034003, "grad_norm": 2.2695376873016357, "learning_rate": 3.550735718311511e-05, "loss": 0.3571, "num_input_tokens_seen": 1859232, "step": 5510 }, { "epoch": 4.261978361669243, "grad_norm": 1.78935968875885, "learning_rate": 3.5476750831082153e-05, "loss": 0.4052, "num_input_tokens_seen": 1860768, "step": 5515 }, { "epoch": 4.265842349304482, "grad_norm": 1.4328893423080444, "learning_rate": 3.5446125419495284e-05, "loss": 0.3386, "num_input_tokens_seen": 1862464, "step": 5520 }, { "epoch": 4.269706336939722, "grad_norm": 2.5695245265960693, "learning_rate": 3.541548100406897e-05, "loss": 0.5776, "num_input_tokens_seen": 1864320, "step": 5525 }, { "epoch": 4.273570324574961, "grad_norm": 1.5962872505187988, "learning_rate": 3.538481764055227e-05, "loss": 0.4307, "num_input_tokens_seen": 1865792, "step": 5530 }, { "epoch": 4.277434312210201, "grad_norm": 2.6499252319335938, "learning_rate": 3.53541353847287e-05, "loss": 0.4901, "num_input_tokens_seen": 1867424, "step": 5535 }, { "epoch": 4.2812982998454405, "grad_norm": 1.3021180629730225, "learning_rate": 3.5323434292416144e-05, "loss": 0.5903, "num_input_tokens_seen": 1869248, "step": 5540 }, { "epoch": 4.28516228748068, "grad_norm": 1.277121663093567, "learning_rate": 3.5292714419466755e-05, "loss": 0.4238, "num_input_tokens_seen": 1870784, "step": 5545 }, { "epoch": 4.289026275115919, "grad_norm": 1.9529911279678345, "learning_rate": 3.526197582176687e-05, "loss": 0.6352, "num_input_tokens_seen": 1872640, "step": 5550 }, { "epoch": 4.292890262751159, "grad_norm": 3.6996700763702393, "learning_rate": 3.523121855523686e-05, "loss": 0.4403, "num_input_tokens_seen": 1874272, "step": 5555 }, { "epoch": 4.296754250386399, "grad_norm": 1.8225107192993164, "learning_rate": 3.520044267583108e-05, "loss": 0.4329, "num_input_tokens_seen": 1876384, "step": 5560 }, { "epoch": 4.300618238021638, "grad_norm": 2.1841535568237305, "learning_rate": 3.516964823953775e-05, "loss": 0.356, "num_input_tokens_seen": 1878080, "step": 5565 }, { "epoch": 4.304482225656878, "grad_norm": 2.7712604999542236, "learning_rate": 3.513883530237882e-05, "loss": 0.475, "num_input_tokens_seen": 1880128, "step": 5570 }, { "epoch": 4.3083462132921175, "grad_norm": 2.3475849628448486, "learning_rate": 3.5108003920409926e-05, "loss": 0.4985, "num_input_tokens_seen": 1881632, "step": 5575 }, { "epoch": 4.312210200927357, "grad_norm": 5.666539192199707, "learning_rate": 3.507715414972027e-05, "loss": 0.4817, "num_input_tokens_seen": 1883328, "step": 5580 }, { "epoch": 4.316074188562597, "grad_norm": 3.3880727291107178, "learning_rate": 3.5046286046432454e-05, "loss": 0.4936, "num_input_tokens_seen": 1884832, "step": 5585 }, { "epoch": 4.319938176197836, "grad_norm": 2.7358365058898926, "learning_rate": 3.50153996667025e-05, "loss": 0.5, "num_input_tokens_seen": 1886432, "step": 5590 }, { "epoch": 4.323802163833076, "grad_norm": 2.80517578125, "learning_rate": 3.49844950667196e-05, "loss": 0.48, "num_input_tokens_seen": 1888096, "step": 5595 }, { "epoch": 4.327666151468315, "grad_norm": 1.9607690572738647, "learning_rate": 3.495357230270618e-05, "loss": 0.4237, "num_input_tokens_seen": 1889824, "step": 5600 }, { "epoch": 4.331530139103555, "grad_norm": 1.3611797094345093, "learning_rate": 3.492263143091764e-05, "loss": 0.4522, "num_input_tokens_seen": 1891584, "step": 5605 }, { "epoch": 4.3353941267387945, "grad_norm": 2.843477487564087, "learning_rate": 3.4891672507642356e-05, "loss": 0.5144, "num_input_tokens_seen": 1893248, "step": 5610 }, { "epoch": 4.339258114374034, "grad_norm": 1.9079909324645996, "learning_rate": 3.4860695589201535e-05, "loss": 0.539, "num_input_tokens_seen": 1894944, "step": 5615 }, { "epoch": 4.343122102009273, "grad_norm": 4.61338996887207, "learning_rate": 3.482970073194913e-05, "loss": 0.7057, "num_input_tokens_seen": 1896544, "step": 5620 }, { "epoch": 4.346986089644513, "grad_norm": 4.267466068267822, "learning_rate": 3.479868799227171e-05, "loss": 0.5031, "num_input_tokens_seen": 1898080, "step": 5625 }, { "epoch": 4.350850077279753, "grad_norm": 2.472168207168579, "learning_rate": 3.4767657426588396e-05, "loss": 0.5398, "num_input_tokens_seen": 1899648, "step": 5630 }, { "epoch": 4.354714064914992, "grad_norm": 3.175891399383545, "learning_rate": 3.473660909135073e-05, "loss": 0.5261, "num_input_tokens_seen": 1901568, "step": 5635 }, { "epoch": 4.358578052550232, "grad_norm": 3.0510663986206055, "learning_rate": 3.4705543043042575e-05, "loss": 0.4276, "num_input_tokens_seen": 1903264, "step": 5640 }, { "epoch": 4.3624420401854715, "grad_norm": 1.4388536214828491, "learning_rate": 3.467445933818002e-05, "loss": 0.4551, "num_input_tokens_seen": 1904928, "step": 5645 }, { "epoch": 4.366306027820711, "grad_norm": 1.8934731483459473, "learning_rate": 3.4643358033311305e-05, "loss": 0.442, "num_input_tokens_seen": 1906336, "step": 5650 }, { "epoch": 4.37017001545595, "grad_norm": 1.743368148803711, "learning_rate": 3.461223918501662e-05, "loss": 0.4982, "num_input_tokens_seen": 1908128, "step": 5655 }, { "epoch": 4.37403400309119, "grad_norm": 1.7443641424179077, "learning_rate": 3.458110284990815e-05, "loss": 0.4289, "num_input_tokens_seen": 1909856, "step": 5660 }, { "epoch": 4.37789799072643, "grad_norm": 1.1184145212173462, "learning_rate": 3.454994908462983e-05, "loss": 0.4137, "num_input_tokens_seen": 1911232, "step": 5665 }, { "epoch": 4.381761978361669, "grad_norm": 1.4508219957351685, "learning_rate": 3.4518777945857355e-05, "loss": 0.5663, "num_input_tokens_seen": 1912960, "step": 5670 }, { "epoch": 4.385625965996908, "grad_norm": 1.663489818572998, "learning_rate": 3.448758949029798e-05, "loss": 0.5805, "num_input_tokens_seen": 1914816, "step": 5675 }, { "epoch": 4.3894899536321486, "grad_norm": 1.8397408723831177, "learning_rate": 3.445638377469049e-05, "loss": 0.3784, "num_input_tokens_seen": 1916320, "step": 5680 }, { "epoch": 4.393353941267388, "grad_norm": 2.5360963344573975, "learning_rate": 3.442516085580507e-05, "loss": 0.3954, "num_input_tokens_seen": 1917984, "step": 5685 }, { "epoch": 4.397217928902627, "grad_norm": 1.7258542776107788, "learning_rate": 3.439392079044319e-05, "loss": 0.7749, "num_input_tokens_seen": 1920256, "step": 5690 }, { "epoch": 4.401081916537867, "grad_norm": 3.0073068141937256, "learning_rate": 3.436266363543751e-05, "loss": 0.5476, "num_input_tokens_seen": 1922176, "step": 5695 }, { "epoch": 4.404945904173107, "grad_norm": 1.7911351919174194, "learning_rate": 3.4331389447651815e-05, "loss": 0.5841, "num_input_tokens_seen": 1923776, "step": 5700 }, { "epoch": 4.408809891808346, "grad_norm": 4.255690574645996, "learning_rate": 3.4300098283980814e-05, "loss": 0.4153, "num_input_tokens_seen": 1925600, "step": 5705 }, { "epoch": 4.412673879443586, "grad_norm": 4.344098091125488, "learning_rate": 3.426879020135017e-05, "loss": 0.4368, "num_input_tokens_seen": 1927456, "step": 5710 }, { "epoch": 4.416537867078826, "grad_norm": 1.5204336643218994, "learning_rate": 3.423746525671626e-05, "loss": 0.5684, "num_input_tokens_seen": 1928960, "step": 5715 }, { "epoch": 4.420401854714065, "grad_norm": 2.755342483520508, "learning_rate": 3.420612350706619e-05, "loss": 0.4552, "num_input_tokens_seen": 1930784, "step": 5720 }, { "epoch": 4.424265842349304, "grad_norm": 1.0981614589691162, "learning_rate": 3.417476500941761e-05, "loss": 0.481, "num_input_tokens_seen": 1932320, "step": 5725 }, { "epoch": 4.428129829984544, "grad_norm": 1.8948336839675903, "learning_rate": 3.414338982081863e-05, "loss": 0.4828, "num_input_tokens_seen": 1933984, "step": 5730 }, { "epoch": 4.431993817619784, "grad_norm": 1.1052440404891968, "learning_rate": 3.411199799834776e-05, "loss": 0.3639, "num_input_tokens_seen": 1935584, "step": 5735 }, { "epoch": 4.435857805255023, "grad_norm": 2.3502819538116455, "learning_rate": 3.408058959911373e-05, "loss": 0.6078, "num_input_tokens_seen": 1937408, "step": 5740 }, { "epoch": 4.439721792890262, "grad_norm": 1.1913543939590454, "learning_rate": 3.404916468025546e-05, "loss": 0.5986, "num_input_tokens_seen": 1938944, "step": 5745 }, { "epoch": 4.443585780525503, "grad_norm": 2.4197134971618652, "learning_rate": 3.40177232989419e-05, "loss": 0.4058, "num_input_tokens_seen": 1940896, "step": 5750 }, { "epoch": 4.447449768160742, "grad_norm": 1.2846109867095947, "learning_rate": 3.398626551237196e-05, "loss": 0.4262, "num_input_tokens_seen": 1942560, "step": 5755 }, { "epoch": 4.451313755795981, "grad_norm": 1.2992697954177856, "learning_rate": 3.395479137777439e-05, "loss": 0.8142, "num_input_tokens_seen": 1944672, "step": 5760 }, { "epoch": 4.455177743431221, "grad_norm": 1.4086815118789673, "learning_rate": 3.3923300952407675e-05, "loss": 0.5721, "num_input_tokens_seen": 1946432, "step": 5765 }, { "epoch": 4.459041731066461, "grad_norm": 2.7060818672180176, "learning_rate": 3.389179429355997e-05, "loss": 0.6362, "num_input_tokens_seen": 1948320, "step": 5770 }, { "epoch": 4.4629057187017, "grad_norm": 1.309862494468689, "learning_rate": 3.386027145854892e-05, "loss": 0.3871, "num_input_tokens_seen": 1949856, "step": 5775 }, { "epoch": 4.466769706336939, "grad_norm": 1.1722899675369263, "learning_rate": 3.3828732504721615e-05, "loss": 0.6651, "num_input_tokens_seen": 1951648, "step": 5780 }, { "epoch": 4.47063369397218, "grad_norm": 3.3884117603302, "learning_rate": 3.379717748945447e-05, "loss": 0.4428, "num_input_tokens_seen": 1953152, "step": 5785 }, { "epoch": 4.474497681607419, "grad_norm": 1.4230624437332153, "learning_rate": 3.376560647015312e-05, "loss": 0.4349, "num_input_tokens_seen": 1954912, "step": 5790 }, { "epoch": 4.478361669242658, "grad_norm": 1.6572484970092773, "learning_rate": 3.373401950425231e-05, "loss": 0.3937, "num_input_tokens_seen": 1956512, "step": 5795 }, { "epoch": 4.4822256568778975, "grad_norm": 2.3996033668518066, "learning_rate": 3.3702416649215804e-05, "loss": 0.8273, "num_input_tokens_seen": 1958080, "step": 5800 }, { "epoch": 4.486089644513138, "grad_norm": 1.7978805303573608, "learning_rate": 3.367079796253626e-05, "loss": 0.5422, "num_input_tokens_seen": 1959744, "step": 5805 }, { "epoch": 4.489953632148377, "grad_norm": 0.9261136054992676, "learning_rate": 3.363916350173515e-05, "loss": 0.3753, "num_input_tokens_seen": 1961504, "step": 5810 }, { "epoch": 4.493817619783616, "grad_norm": 1.7501429319381714, "learning_rate": 3.360751332436263e-05, "loss": 0.4334, "num_input_tokens_seen": 1963360, "step": 5815 }, { "epoch": 4.497681607418857, "grad_norm": 1.7899534702301025, "learning_rate": 3.357584748799746e-05, "loss": 0.4597, "num_input_tokens_seen": 1964800, "step": 5820 }, { "epoch": 4.5, "eval_loss": 0.5266133546829224, "eval_runtime": 11.4483, "eval_samples_per_second": 50.226, "eval_steps_per_second": 12.578, "num_input_tokens_seen": 1965984, "step": 5823 }, { "epoch": 4.501545595054096, "grad_norm": 1.6489028930664062, "learning_rate": 3.354416605024687e-05, "loss": 0.4061, "num_input_tokens_seen": 1966816, "step": 5825 }, { "epoch": 4.505409582689335, "grad_norm": 1.3318084478378296, "learning_rate": 3.3512469068746505e-05, "loss": 0.3564, "num_input_tokens_seen": 1968608, "step": 5830 }, { "epoch": 4.509273570324575, "grad_norm": 2.9608864784240723, "learning_rate": 3.348075660116025e-05, "loss": 0.4682, "num_input_tokens_seen": 1970080, "step": 5835 }, { "epoch": 4.513137557959815, "grad_norm": 1.5503572225570679, "learning_rate": 3.344902870518018e-05, "loss": 0.4123, "num_input_tokens_seen": 1971776, "step": 5840 }, { "epoch": 4.517001545595054, "grad_norm": 2.3591766357421875, "learning_rate": 3.3417285438526434e-05, "loss": 0.4053, "num_input_tokens_seen": 1973472, "step": 5845 }, { "epoch": 4.520865533230293, "grad_norm": 4.294093132019043, "learning_rate": 3.3385526858947135e-05, "loss": 0.4324, "num_input_tokens_seen": 1975072, "step": 5850 }, { "epoch": 4.524729520865534, "grad_norm": 1.548915147781372, "learning_rate": 3.335375302421823e-05, "loss": 0.4055, "num_input_tokens_seen": 1976800, "step": 5855 }, { "epoch": 4.528593508500773, "grad_norm": 2.098775625228882, "learning_rate": 3.3321963992143436e-05, "loss": 0.3733, "num_input_tokens_seen": 1978336, "step": 5860 }, { "epoch": 4.532457496136012, "grad_norm": 2.3612637519836426, "learning_rate": 3.329015982055411e-05, "loss": 0.4304, "num_input_tokens_seen": 1980192, "step": 5865 }, { "epoch": 4.5363214837712516, "grad_norm": 2.1739308834075928, "learning_rate": 3.325834056730917e-05, "loss": 0.6352, "num_input_tokens_seen": 1981728, "step": 5870 }, { "epoch": 4.540185471406492, "grad_norm": 1.4305938482284546, "learning_rate": 3.322650629029494e-05, "loss": 0.4517, "num_input_tokens_seen": 1983392, "step": 5875 }, { "epoch": 4.544049459041731, "grad_norm": 3.7588274478912354, "learning_rate": 3.3194657047425115e-05, "loss": 0.5663, "num_input_tokens_seen": 1985056, "step": 5880 }, { "epoch": 4.54791344667697, "grad_norm": 1.1768568754196167, "learning_rate": 3.316279289664057e-05, "loss": 0.4615, "num_input_tokens_seen": 1986656, "step": 5885 }, { "epoch": 4.551777434312211, "grad_norm": 1.1840425729751587, "learning_rate": 3.3130913895909344e-05, "loss": 0.4567, "num_input_tokens_seen": 1988640, "step": 5890 }, { "epoch": 4.55564142194745, "grad_norm": 1.034289836883545, "learning_rate": 3.309902010322646e-05, "loss": 0.3618, "num_input_tokens_seen": 1990208, "step": 5895 }, { "epoch": 4.559505409582689, "grad_norm": 1.853190541267395, "learning_rate": 3.306711157661387e-05, "loss": 0.5251, "num_input_tokens_seen": 1991744, "step": 5900 }, { "epoch": 4.563369397217929, "grad_norm": 1.02388334274292, "learning_rate": 3.303518837412031e-05, "loss": 0.4132, "num_input_tokens_seen": 1993376, "step": 5905 }, { "epoch": 4.567233384853169, "grad_norm": 2.7188773155212402, "learning_rate": 3.3003250553821243e-05, "loss": 0.4624, "num_input_tokens_seen": 1994816, "step": 5910 }, { "epoch": 4.571097372488408, "grad_norm": 1.1583328247070312, "learning_rate": 3.297129817381871e-05, "loss": 0.3723, "num_input_tokens_seen": 1996608, "step": 5915 }, { "epoch": 4.574961360123647, "grad_norm": 1.699422001838684, "learning_rate": 3.293933129224124e-05, "loss": 0.4584, "num_input_tokens_seen": 1998496, "step": 5920 }, { "epoch": 4.578825347758887, "grad_norm": 3.1988747119903564, "learning_rate": 3.290734996724373e-05, "loss": 0.6979, "num_input_tokens_seen": 2000320, "step": 5925 }, { "epoch": 4.582689335394127, "grad_norm": 2.098971128463745, "learning_rate": 3.2875354257007375e-05, "loss": 0.4698, "num_input_tokens_seen": 2001984, "step": 5930 }, { "epoch": 4.586553323029366, "grad_norm": 1.800074815750122, "learning_rate": 3.284334421973953e-05, "loss": 0.8102, "num_input_tokens_seen": 2003936, "step": 5935 }, { "epoch": 4.590417310664606, "grad_norm": 2.4791886806488037, "learning_rate": 3.281131991367362e-05, "loss": 0.4185, "num_input_tokens_seen": 2005472, "step": 5940 }, { "epoch": 4.594281298299846, "grad_norm": 2.4546985626220703, "learning_rate": 3.277928139706901e-05, "loss": 0.406, "num_input_tokens_seen": 2007360, "step": 5945 }, { "epoch": 4.598145285935085, "grad_norm": 3.145899772644043, "learning_rate": 3.274722872821094e-05, "loss": 0.5785, "num_input_tokens_seen": 2009344, "step": 5950 }, { "epoch": 4.602009273570324, "grad_norm": 5.838898181915283, "learning_rate": 3.2715161965410384e-05, "loss": 0.5604, "num_input_tokens_seen": 2011168, "step": 5955 }, { "epoch": 4.605873261205565, "grad_norm": 2.388483762741089, "learning_rate": 3.268308116700397e-05, "loss": 0.3881, "num_input_tokens_seen": 2012736, "step": 5960 }, { "epoch": 4.609737248840804, "grad_norm": 1.6627204418182373, "learning_rate": 3.2650986391353824e-05, "loss": 0.4209, "num_input_tokens_seen": 2014432, "step": 5965 }, { "epoch": 4.613601236476043, "grad_norm": 1.7052208185195923, "learning_rate": 3.261887769684755e-05, "loss": 0.3695, "num_input_tokens_seen": 2016192, "step": 5970 }, { "epoch": 4.617465224111283, "grad_norm": 1.7042388916015625, "learning_rate": 3.258675514189802e-05, "loss": 0.3951, "num_input_tokens_seen": 2017760, "step": 5975 }, { "epoch": 4.621329211746523, "grad_norm": 2.721357822418213, "learning_rate": 3.2554618784943377e-05, "loss": 0.8654, "num_input_tokens_seen": 2019456, "step": 5980 }, { "epoch": 4.625193199381762, "grad_norm": 2.5729732513427734, "learning_rate": 3.252246868444683e-05, "loss": 0.3995, "num_input_tokens_seen": 2021248, "step": 5985 }, { "epoch": 4.629057187017001, "grad_norm": 3.386798858642578, "learning_rate": 3.24903048988966e-05, "loss": 0.4268, "num_input_tokens_seen": 2023072, "step": 5990 }, { "epoch": 4.632921174652241, "grad_norm": 3.35546612739563, "learning_rate": 3.245812748680582e-05, "loss": 0.5536, "num_input_tokens_seen": 2024704, "step": 5995 }, { "epoch": 4.636785162287481, "grad_norm": 1.3850666284561157, "learning_rate": 3.24259365067124e-05, "loss": 0.5939, "num_input_tokens_seen": 2026368, "step": 6000 }, { "epoch": 4.64064914992272, "grad_norm": 2.02775502204895, "learning_rate": 3.2393732017178935e-05, "loss": 0.6384, "num_input_tokens_seen": 2028000, "step": 6005 }, { "epoch": 4.64451313755796, "grad_norm": 1.243200421333313, "learning_rate": 3.23615140767926e-05, "loss": 0.482, "num_input_tokens_seen": 2029952, "step": 6010 }, { "epoch": 4.6483771251932, "grad_norm": 3.6516103744506836, "learning_rate": 3.2329282744165024e-05, "loss": 0.6145, "num_input_tokens_seen": 2031392, "step": 6015 }, { "epoch": 4.652241112828439, "grad_norm": 2.83798885345459, "learning_rate": 3.229703807793223e-05, "loss": 0.4435, "num_input_tokens_seen": 2032928, "step": 6020 }, { "epoch": 4.656105100463678, "grad_norm": 3.3334436416625977, "learning_rate": 3.226478013675447e-05, "loss": 0.6313, "num_input_tokens_seen": 2034368, "step": 6025 }, { "epoch": 4.659969088098918, "grad_norm": 1.4367032051086426, "learning_rate": 3.2232508979316165e-05, "loss": 0.3857, "num_input_tokens_seen": 2035968, "step": 6030 }, { "epoch": 4.663833075734158, "grad_norm": 1.7258646488189697, "learning_rate": 3.2200224664325755e-05, "loss": 0.4403, "num_input_tokens_seen": 2037728, "step": 6035 }, { "epoch": 4.667697063369397, "grad_norm": 1.4606680870056152, "learning_rate": 3.216792725051565e-05, "loss": 0.3665, "num_input_tokens_seen": 2039328, "step": 6040 }, { "epoch": 4.671561051004637, "grad_norm": 1.2159589529037476, "learning_rate": 3.2135616796642064e-05, "loss": 0.4587, "num_input_tokens_seen": 2040928, "step": 6045 }, { "epoch": 4.675425038639876, "grad_norm": 1.5690428018569946, "learning_rate": 3.210329336148494e-05, "loss": 0.4912, "num_input_tokens_seen": 2042464, "step": 6050 }, { "epoch": 4.679289026275116, "grad_norm": 1.914422869682312, "learning_rate": 3.2070957003847855e-05, "loss": 0.4948, "num_input_tokens_seen": 2043968, "step": 6055 }, { "epoch": 4.683153013910355, "grad_norm": 1.826066493988037, "learning_rate": 3.203860778255785e-05, "loss": 0.4079, "num_input_tokens_seen": 2045760, "step": 6060 }, { "epoch": 4.687017001545595, "grad_norm": 2.9646708965301514, "learning_rate": 3.200624575646542e-05, "loss": 0.4418, "num_input_tokens_seen": 2047424, "step": 6065 }, { "epoch": 4.690880989180835, "grad_norm": 2.1444318294525146, "learning_rate": 3.1973870984444324e-05, "loss": 0.4382, "num_input_tokens_seen": 2049088, "step": 6070 }, { "epoch": 4.694744976816074, "grad_norm": 3.119870662689209, "learning_rate": 3.194148352539152e-05, "loss": 0.4502, "num_input_tokens_seen": 2050496, "step": 6075 }, { "epoch": 4.698608964451314, "grad_norm": 3.6775271892547607, "learning_rate": 3.190908343822703e-05, "loss": 0.5265, "num_input_tokens_seen": 2051872, "step": 6080 }, { "epoch": 4.702472952086554, "grad_norm": 3.8508172035217285, "learning_rate": 3.187667078189388e-05, "loss": 0.4787, "num_input_tokens_seen": 2053600, "step": 6085 }, { "epoch": 4.706336939721793, "grad_norm": 3.627692461013794, "learning_rate": 3.184424561535793e-05, "loss": 0.5398, "num_input_tokens_seen": 2055232, "step": 6090 }, { "epoch": 4.710200927357032, "grad_norm": 2.0941224098205566, "learning_rate": 3.181180799760782e-05, "loss": 0.5876, "num_input_tokens_seen": 2056768, "step": 6095 }, { "epoch": 4.714064914992272, "grad_norm": 1.352728247642517, "learning_rate": 3.177935798765483e-05, "loss": 0.3717, "num_input_tokens_seen": 2058464, "step": 6100 }, { "epoch": 4.717928902627512, "grad_norm": 7.1874284744262695, "learning_rate": 3.174689564453279e-05, "loss": 0.541, "num_input_tokens_seen": 2060256, "step": 6105 }, { "epoch": 4.721792890262751, "grad_norm": 1.4282821416854858, "learning_rate": 3.171442102729797e-05, "loss": 0.4327, "num_input_tokens_seen": 2061696, "step": 6110 }, { "epoch": 4.725656877897991, "grad_norm": 2.12247896194458, "learning_rate": 3.168193419502896e-05, "loss": 0.5842, "num_input_tokens_seen": 2063488, "step": 6115 }, { "epoch": 4.72952086553323, "grad_norm": 1.969909429550171, "learning_rate": 3.164943520682658e-05, "loss": 0.4762, "num_input_tokens_seen": 2065152, "step": 6120 }, { "epoch": 4.73338485316847, "grad_norm": 2.6020612716674805, "learning_rate": 3.1616924121813764e-05, "loss": 0.454, "num_input_tokens_seen": 2066688, "step": 6125 }, { "epoch": 4.7372488408037094, "grad_norm": 2.31280255317688, "learning_rate": 3.1584400999135435e-05, "loss": 0.4169, "num_input_tokens_seen": 2068512, "step": 6130 }, { "epoch": 4.741112828438949, "grad_norm": 1.6236844062805176, "learning_rate": 3.155186589795844e-05, "loss": 0.7757, "num_input_tokens_seen": 2070176, "step": 6135 }, { "epoch": 4.744976816074189, "grad_norm": 4.767031192779541, "learning_rate": 3.151931887747141e-05, "loss": 0.8705, "num_input_tokens_seen": 2071808, "step": 6140 }, { "epoch": 4.748840803709428, "grad_norm": 1.7173705101013184, "learning_rate": 3.148675999688467e-05, "loss": 0.3352, "num_input_tokens_seen": 2073536, "step": 6145 }, { "epoch": 4.752704791344668, "grad_norm": 3.0689008235931396, "learning_rate": 3.145418931543008e-05, "loss": 0.3809, "num_input_tokens_seen": 2075168, "step": 6150 }, { "epoch": 4.756568778979907, "grad_norm": 1.4752572774887085, "learning_rate": 3.142160689236102e-05, "loss": 0.5827, "num_input_tokens_seen": 2076448, "step": 6155 }, { "epoch": 4.760432766615147, "grad_norm": 1.6265274286270142, "learning_rate": 3.1389012786952196e-05, "loss": 0.2832, "num_input_tokens_seen": 2077920, "step": 6160 }, { "epoch": 4.7642967542503865, "grad_norm": 19.983156204223633, "learning_rate": 3.135640705849958e-05, "loss": 0.501, "num_input_tokens_seen": 2079296, "step": 6165 }, { "epoch": 4.768160741885626, "grad_norm": 9.127872467041016, "learning_rate": 3.132378976632029e-05, "loss": 0.4528, "num_input_tokens_seen": 2081056, "step": 6170 }, { "epoch": 4.772024729520865, "grad_norm": 2.952507257461548, "learning_rate": 3.1291160969752483e-05, "loss": 0.5087, "num_input_tokens_seen": 2082752, "step": 6175 }, { "epoch": 4.775888717156105, "grad_norm": 3.445777654647827, "learning_rate": 3.1258520728155234e-05, "loss": 0.6294, "num_input_tokens_seen": 2084256, "step": 6180 }, { "epoch": 4.779752704791345, "grad_norm": 1.7061655521392822, "learning_rate": 3.1225869100908455e-05, "loss": 0.4425, "num_input_tokens_seen": 2085984, "step": 6185 }, { "epoch": 4.783616692426584, "grad_norm": 1.6150684356689453, "learning_rate": 3.119320614741274e-05, "loss": 0.5254, "num_input_tokens_seen": 2087744, "step": 6190 }, { "epoch": 4.787480680061824, "grad_norm": 2.7770533561706543, "learning_rate": 3.116053192708933e-05, "loss": 0.4226, "num_input_tokens_seen": 2089184, "step": 6195 }, { "epoch": 4.7913446676970635, "grad_norm": 3.697547435760498, "learning_rate": 3.1127846499379946e-05, "loss": 0.4365, "num_input_tokens_seen": 2091008, "step": 6200 }, { "epoch": 4.795208655332303, "grad_norm": 4.367976665496826, "learning_rate": 3.109514992374667e-05, "loss": 0.6944, "num_input_tokens_seen": 2092736, "step": 6205 }, { "epoch": 4.799072642967543, "grad_norm": 2.570645332336426, "learning_rate": 3.106244225967191e-05, "loss": 0.5198, "num_input_tokens_seen": 2094240, "step": 6210 }, { "epoch": 4.802936630602782, "grad_norm": 3.9730732440948486, "learning_rate": 3.102972356665822e-05, "loss": 0.7098, "num_input_tokens_seen": 2095968, "step": 6215 }, { "epoch": 4.806800618238022, "grad_norm": 1.2165952920913696, "learning_rate": 3.099699390422822e-05, "loss": 0.6078, "num_input_tokens_seen": 2097728, "step": 6220 }, { "epoch": 4.810664605873261, "grad_norm": 1.5308771133422852, "learning_rate": 3.09642533319245e-05, "loss": 0.3856, "num_input_tokens_seen": 2099360, "step": 6225 }, { "epoch": 4.814528593508501, "grad_norm": 2.68088436126709, "learning_rate": 3.093150190930946e-05, "loss": 0.4177, "num_input_tokens_seen": 2100864, "step": 6230 }, { "epoch": 4.8183925811437405, "grad_norm": 1.1877896785736084, "learning_rate": 3.089873969596531e-05, "loss": 0.3595, "num_input_tokens_seen": 2102560, "step": 6235 }, { "epoch": 4.82225656877898, "grad_norm": 3.0046000480651855, "learning_rate": 3.0865966751493796e-05, "loss": 0.5132, "num_input_tokens_seen": 2104352, "step": 6240 }, { "epoch": 4.826120556414219, "grad_norm": 2.732020378112793, "learning_rate": 3.083318313551628e-05, "loss": 0.4682, "num_input_tokens_seen": 2105760, "step": 6245 }, { "epoch": 4.829984544049459, "grad_norm": 1.377872109413147, "learning_rate": 3.0800388907673464e-05, "loss": 0.4081, "num_input_tokens_seen": 2107392, "step": 6250 }, { "epoch": 4.833848531684699, "grad_norm": 1.4164525270462036, "learning_rate": 3.07675841276254e-05, "loss": 0.4166, "num_input_tokens_seen": 2109024, "step": 6255 }, { "epoch": 4.837712519319938, "grad_norm": 1.3521472215652466, "learning_rate": 3.073476885505132e-05, "loss": 0.6077, "num_input_tokens_seen": 2110912, "step": 6260 }, { "epoch": 4.841576506955178, "grad_norm": 1.7542312145233154, "learning_rate": 3.070194314964956e-05, "loss": 0.4219, "num_input_tokens_seen": 2112480, "step": 6265 }, { "epoch": 4.8454404945904175, "grad_norm": 3.142608165740967, "learning_rate": 3.06691070711374e-05, "loss": 0.4751, "num_input_tokens_seen": 2114336, "step": 6270 }, { "epoch": 4.849304482225657, "grad_norm": 4.036103248596191, "learning_rate": 3.063626067925103e-05, "loss": 0.5279, "num_input_tokens_seen": 2115936, "step": 6275 }, { "epoch": 4.853168469860896, "grad_norm": 2.419321298599243, "learning_rate": 3.0603404033745376e-05, "loss": 0.4968, "num_input_tokens_seen": 2117504, "step": 6280 }, { "epoch": 4.857032457496136, "grad_norm": 1.7963231801986694, "learning_rate": 3.057053719439404e-05, "loss": 0.6428, "num_input_tokens_seen": 2119104, "step": 6285 }, { "epoch": 4.860896445131376, "grad_norm": 1.9570949077606201, "learning_rate": 3.053766022098914e-05, "loss": 0.4422, "num_input_tokens_seen": 2120832, "step": 6290 }, { "epoch": 4.864760432766615, "grad_norm": 3.4772584438323975, "learning_rate": 3.0504773173341267e-05, "loss": 0.4257, "num_input_tokens_seen": 2122720, "step": 6295 }, { "epoch": 4.868624420401854, "grad_norm": 1.8129602670669556, "learning_rate": 3.0471876111279295e-05, "loss": 0.4276, "num_input_tokens_seen": 2124416, "step": 6300 }, { "epoch": 4.8724884080370945, "grad_norm": 3.9768106937408447, "learning_rate": 3.043896909465037e-05, "loss": 0.3721, "num_input_tokens_seen": 2125952, "step": 6305 }, { "epoch": 4.876352395672334, "grad_norm": 1.561616063117981, "learning_rate": 3.0406052183319694e-05, "loss": 0.3706, "num_input_tokens_seen": 2127616, "step": 6310 }, { "epoch": 4.880216383307573, "grad_norm": 1.7910728454589844, "learning_rate": 3.0373125437170507e-05, "loss": 0.4687, "num_input_tokens_seen": 2129600, "step": 6315 }, { "epoch": 4.884080370942813, "grad_norm": 2.8902840614318848, "learning_rate": 3.0340188916103912e-05, "loss": 0.4708, "num_input_tokens_seen": 2131168, "step": 6320 }, { "epoch": 4.887944358578053, "grad_norm": 4.103988170623779, "learning_rate": 3.0307242680038828e-05, "loss": 0.4143, "num_input_tokens_seen": 2132832, "step": 6325 }, { "epoch": 4.891808346213292, "grad_norm": 1.5730876922607422, "learning_rate": 3.0274286788911827e-05, "loss": 0.3709, "num_input_tokens_seen": 2134592, "step": 6330 }, { "epoch": 4.895672333848532, "grad_norm": 1.2012859582901, "learning_rate": 3.0241321302677038e-05, "loss": 0.3851, "num_input_tokens_seen": 2136256, "step": 6335 }, { "epoch": 4.8995363214837715, "grad_norm": 1.3624320030212402, "learning_rate": 3.0208346281306065e-05, "loss": 0.5208, "num_input_tokens_seen": 2138208, "step": 6340 }, { "epoch": 4.903400309119011, "grad_norm": 2.1991524696350098, "learning_rate": 3.017536178478785e-05, "loss": 0.4069, "num_input_tokens_seen": 2139968, "step": 6345 }, { "epoch": 4.90726429675425, "grad_norm": 4.771265983581543, "learning_rate": 3.014236787312857e-05, "loss": 0.5786, "num_input_tokens_seen": 2141536, "step": 6350 }, { "epoch": 4.91112828438949, "grad_norm": 4.137334823608398, "learning_rate": 3.0109364606351532e-05, "loss": 0.5007, "num_input_tokens_seen": 2143296, "step": 6355 }, { "epoch": 4.91499227202473, "grad_norm": 1.7097389698028564, "learning_rate": 3.0076352044497057e-05, "loss": 0.4478, "num_input_tokens_seen": 2145088, "step": 6360 }, { "epoch": 4.918856259659969, "grad_norm": 2.635242462158203, "learning_rate": 3.0043330247622404e-05, "loss": 0.446, "num_input_tokens_seen": 2147136, "step": 6365 }, { "epoch": 4.922720247295208, "grad_norm": 2.827590227127075, "learning_rate": 3.0010299275801578e-05, "loss": 0.4381, "num_input_tokens_seen": 2148736, "step": 6370 }, { "epoch": 4.9265842349304485, "grad_norm": 2.2049901485443115, "learning_rate": 2.997725918912534e-05, "loss": 0.445, "num_input_tokens_seen": 2150592, "step": 6375 }, { "epoch": 4.930448222565688, "grad_norm": 2.028599977493286, "learning_rate": 2.994421004770097e-05, "loss": 0.4147, "num_input_tokens_seen": 2152352, "step": 6380 }, { "epoch": 4.934312210200927, "grad_norm": 2.511997938156128, "learning_rate": 2.9911151911652273e-05, "loss": 0.3723, "num_input_tokens_seen": 2154016, "step": 6385 }, { "epoch": 4.938176197836167, "grad_norm": 1.3972994089126587, "learning_rate": 2.9878084841119387e-05, "loss": 0.5791, "num_input_tokens_seen": 2155616, "step": 6390 }, { "epoch": 4.942040185471407, "grad_norm": 3.6761727333068848, "learning_rate": 2.984500889625871e-05, "loss": 0.4778, "num_input_tokens_seen": 2157088, "step": 6395 }, { "epoch": 4.945904173106646, "grad_norm": 1.2899342775344849, "learning_rate": 2.9811924137242787e-05, "loss": 0.4111, "num_input_tokens_seen": 2158848, "step": 6400 }, { "epoch": 4.949768160741885, "grad_norm": 2.155632257461548, "learning_rate": 2.97788306242602e-05, "loss": 0.4114, "num_input_tokens_seen": 2160384, "step": 6405 }, { "epoch": 4.9536321483771255, "grad_norm": 1.6184748411178589, "learning_rate": 2.9745728417515457e-05, "loss": 0.3724, "num_input_tokens_seen": 2162432, "step": 6410 }, { "epoch": 4.957496136012365, "grad_norm": 2.532302141189575, "learning_rate": 2.9712617577228875e-05, "loss": 0.3611, "num_input_tokens_seen": 2164000, "step": 6415 }, { "epoch": 4.961360123647604, "grad_norm": 4.6155104637146, "learning_rate": 2.9679498163636466e-05, "loss": 0.4014, "num_input_tokens_seen": 2165600, "step": 6420 }, { "epoch": 4.9652241112828435, "grad_norm": 1.4873557090759277, "learning_rate": 2.9646370236989884e-05, "loss": 0.4425, "num_input_tokens_seen": 2167168, "step": 6425 }, { "epoch": 4.969088098918084, "grad_norm": 3.399523973464966, "learning_rate": 2.9613233857556217e-05, "loss": 0.798, "num_input_tokens_seen": 2168928, "step": 6430 }, { "epoch": 4.972952086553323, "grad_norm": 2.215195655822754, "learning_rate": 2.9580089085617963e-05, "loss": 0.5089, "num_input_tokens_seen": 2170720, "step": 6435 }, { "epoch": 4.976816074188562, "grad_norm": 2.9037482738494873, "learning_rate": 2.9546935981472867e-05, "loss": 0.5653, "num_input_tokens_seen": 2172288, "step": 6440 }, { "epoch": 4.9806800618238025, "grad_norm": 3.481429100036621, "learning_rate": 2.9513774605433862e-05, "loss": 0.7702, "num_input_tokens_seen": 2173856, "step": 6445 }, { "epoch": 4.984544049459042, "grad_norm": 1.9054514169692993, "learning_rate": 2.94806050178289e-05, "loss": 0.4248, "num_input_tokens_seen": 2175680, "step": 6450 }, { "epoch": 4.988408037094281, "grad_norm": 2.937561511993408, "learning_rate": 2.9447427279000878e-05, "loss": 0.4361, "num_input_tokens_seen": 2177152, "step": 6455 }, { "epoch": 4.992272024729521, "grad_norm": 4.291450500488281, "learning_rate": 2.9414241449307535e-05, "loss": 0.5757, "num_input_tokens_seen": 2179040, "step": 6460 }, { "epoch": 4.996136012364761, "grad_norm": 1.0137884616851807, "learning_rate": 2.938104758912132e-05, "loss": 0.5382, "num_input_tokens_seen": 2180704, "step": 6465 }, { "epoch": 5.0, "grad_norm": 3.6628150939941406, "learning_rate": 2.9347845758829286e-05, "loss": 0.7069, "num_input_tokens_seen": 2182352, "step": 6470 }, { "epoch": 5.0, "eval_loss": 0.5203819870948792, "eval_runtime": 11.4655, "eval_samples_per_second": 50.15, "eval_steps_per_second": 12.559, "num_input_tokens_seen": 2182352, "step": 6470 }, { "epoch": 5.003863987635239, "grad_norm": 1.3343318700790405, "learning_rate": 2.9314636018832992e-05, "loss": 0.4307, "num_input_tokens_seen": 2184176, "step": 6475 }, { "epoch": 5.0077279752704795, "grad_norm": 1.3096975088119507, "learning_rate": 2.928141842954839e-05, "loss": 0.3531, "num_input_tokens_seen": 2185936, "step": 6480 }, { "epoch": 5.011591962905719, "grad_norm": 1.7781199216842651, "learning_rate": 2.9248193051405704e-05, "loss": 0.5944, "num_input_tokens_seen": 2187568, "step": 6485 }, { "epoch": 5.015455950540958, "grad_norm": 2.773961305618286, "learning_rate": 2.9214959944849324e-05, "loss": 0.4802, "num_input_tokens_seen": 2189392, "step": 6490 }, { "epoch": 5.0193199381761975, "grad_norm": 2.751554012298584, "learning_rate": 2.918171917033773e-05, "loss": 0.4491, "num_input_tokens_seen": 2191312, "step": 6495 }, { "epoch": 5.023183925811438, "grad_norm": 1.0919901132583618, "learning_rate": 2.9148470788343303e-05, "loss": 0.3533, "num_input_tokens_seen": 2193200, "step": 6500 }, { "epoch": 5.027047913446677, "grad_norm": 3.064990282058716, "learning_rate": 2.9115214859352303e-05, "loss": 0.4277, "num_input_tokens_seen": 2195024, "step": 6505 }, { "epoch": 5.030911901081916, "grad_norm": 1.4972394704818726, "learning_rate": 2.90819514438647e-05, "loss": 0.6131, "num_input_tokens_seen": 2196720, "step": 6510 }, { "epoch": 5.0347758887171565, "grad_norm": 3.8576323986053467, "learning_rate": 2.9048680602394085e-05, "loss": 0.4893, "num_input_tokens_seen": 2198416, "step": 6515 }, { "epoch": 5.038639876352396, "grad_norm": 1.615434169769287, "learning_rate": 2.901540239546758e-05, "loss": 0.3492, "num_input_tokens_seen": 2200112, "step": 6520 }, { "epoch": 5.042503863987635, "grad_norm": 2.2512423992156982, "learning_rate": 2.8982116883625677e-05, "loss": 0.5728, "num_input_tokens_seen": 2201840, "step": 6525 }, { "epoch": 5.0463678516228745, "grad_norm": 1.7042887210845947, "learning_rate": 2.8948824127422163e-05, "loss": 0.584, "num_input_tokens_seen": 2203440, "step": 6530 }, { "epoch": 5.050231839258115, "grad_norm": 2.447293281555176, "learning_rate": 2.8915524187424027e-05, "loss": 0.4023, "num_input_tokens_seen": 2204976, "step": 6535 }, { "epoch": 5.054095826893354, "grad_norm": 2.527125358581543, "learning_rate": 2.8882217124211296e-05, "loss": 0.409, "num_input_tokens_seen": 2206576, "step": 6540 }, { "epoch": 5.057959814528593, "grad_norm": 2.826016902923584, "learning_rate": 2.8848902998376986e-05, "loss": 0.4034, "num_input_tokens_seen": 2208240, "step": 6545 }, { "epoch": 5.061823802163833, "grad_norm": 1.4586350917816162, "learning_rate": 2.8815581870526924e-05, "loss": 0.4129, "num_input_tokens_seen": 2209872, "step": 6550 }, { "epoch": 5.065687789799073, "grad_norm": 2.9700448513031006, "learning_rate": 2.8782253801279723e-05, "loss": 0.4045, "num_input_tokens_seen": 2211632, "step": 6555 }, { "epoch": 5.069551777434312, "grad_norm": 2.8647780418395996, "learning_rate": 2.8748918851266575e-05, "loss": 0.467, "num_input_tokens_seen": 2213552, "step": 6560 }, { "epoch": 5.0734157650695515, "grad_norm": 1.40973699092865, "learning_rate": 2.8715577081131235e-05, "loss": 0.4085, "num_input_tokens_seen": 2215280, "step": 6565 }, { "epoch": 5.077279752704792, "grad_norm": 2.5421524047851562, "learning_rate": 2.8682228551529818e-05, "loss": 0.3547, "num_input_tokens_seen": 2216752, "step": 6570 }, { "epoch": 5.081143740340031, "grad_norm": 2.5357840061187744, "learning_rate": 2.8648873323130776e-05, "loss": 0.548, "num_input_tokens_seen": 2218448, "step": 6575 }, { "epoch": 5.08500772797527, "grad_norm": 3.408264636993408, "learning_rate": 2.8615511456614734e-05, "loss": 0.6266, "num_input_tokens_seen": 2220112, "step": 6580 }, { "epoch": 5.08887171561051, "grad_norm": 4.657522201538086, "learning_rate": 2.858214301267439e-05, "loss": 0.5483, "num_input_tokens_seen": 2221968, "step": 6585 }, { "epoch": 5.09273570324575, "grad_norm": 1.5772881507873535, "learning_rate": 2.8548768052014402e-05, "loss": 0.3299, "num_input_tokens_seen": 2223472, "step": 6590 }, { "epoch": 5.096599690880989, "grad_norm": 1.5591304302215576, "learning_rate": 2.85153866353513e-05, "loss": 0.3687, "num_input_tokens_seen": 2225104, "step": 6595 }, { "epoch": 5.1004636785162285, "grad_norm": 2.0694234371185303, "learning_rate": 2.8481998823413346e-05, "loss": 0.3778, "num_input_tokens_seen": 2226576, "step": 6600 }, { "epoch": 5.104327666151469, "grad_norm": 2.3040473461151123, "learning_rate": 2.8448604676940442e-05, "loss": 0.4123, "num_input_tokens_seen": 2227888, "step": 6605 }, { "epoch": 5.108191653786708, "grad_norm": 2.666635274887085, "learning_rate": 2.8415204256684013e-05, "loss": 0.4464, "num_input_tokens_seen": 2229648, "step": 6610 }, { "epoch": 5.112055641421947, "grad_norm": 2.389350414276123, "learning_rate": 2.8381797623406893e-05, "loss": 0.5282, "num_input_tokens_seen": 2231152, "step": 6615 }, { "epoch": 5.115919629057187, "grad_norm": 2.99371600151062, "learning_rate": 2.8348384837883224e-05, "loss": 0.3803, "num_input_tokens_seen": 2232944, "step": 6620 }, { "epoch": 5.119783616692427, "grad_norm": 4.345697402954102, "learning_rate": 2.8314965960898343e-05, "loss": 0.5488, "num_input_tokens_seen": 2234608, "step": 6625 }, { "epoch": 5.123647604327666, "grad_norm": 2.3153653144836426, "learning_rate": 2.8281541053248657e-05, "loss": 0.5006, "num_input_tokens_seen": 2236304, "step": 6630 }, { "epoch": 5.1275115919629055, "grad_norm": 2.8753747940063477, "learning_rate": 2.824811017574156e-05, "loss": 0.37, "num_input_tokens_seen": 2237840, "step": 6635 }, { "epoch": 5.131375579598146, "grad_norm": 1.4589475393295288, "learning_rate": 2.821467338919529e-05, "loss": 0.3521, "num_input_tokens_seen": 2239504, "step": 6640 }, { "epoch": 5.135239567233385, "grad_norm": 3.31514048576355, "learning_rate": 2.8181230754438852e-05, "loss": 0.3469, "num_input_tokens_seen": 2241168, "step": 6645 }, { "epoch": 5.139103554868624, "grad_norm": 1.3874328136444092, "learning_rate": 2.814778233231188e-05, "loss": 0.3518, "num_input_tokens_seen": 2242800, "step": 6650 }, { "epoch": 5.142967542503864, "grad_norm": 2.5542140007019043, "learning_rate": 2.811432818366453e-05, "loss": 0.4987, "num_input_tokens_seen": 2244592, "step": 6655 }, { "epoch": 5.146831530139104, "grad_norm": 2.223381519317627, "learning_rate": 2.8080868369357387e-05, "loss": 0.3715, "num_input_tokens_seen": 2246128, "step": 6660 }, { "epoch": 5.150695517774343, "grad_norm": 2.9401803016662598, "learning_rate": 2.804740295026134e-05, "loss": 0.6536, "num_input_tokens_seen": 2247856, "step": 6665 }, { "epoch": 5.1545595054095825, "grad_norm": 4.895540714263916, "learning_rate": 2.801393198725748e-05, "loss": 0.4377, "num_input_tokens_seen": 2249488, "step": 6670 }, { "epoch": 5.158423493044822, "grad_norm": 2.3026952743530273, "learning_rate": 2.7980455541236978e-05, "loss": 0.4367, "num_input_tokens_seen": 2251184, "step": 6675 }, { "epoch": 5.162287480680062, "grad_norm": 1.891308069229126, "learning_rate": 2.7946973673100964e-05, "loss": 0.4691, "num_input_tokens_seen": 2252720, "step": 6680 }, { "epoch": 5.166151468315301, "grad_norm": 1.4433339834213257, "learning_rate": 2.7913486443760468e-05, "loss": 0.3654, "num_input_tokens_seen": 2254512, "step": 6685 }, { "epoch": 5.170015455950541, "grad_norm": 2.3554067611694336, "learning_rate": 2.787999391413623e-05, "loss": 0.4456, "num_input_tokens_seen": 2256496, "step": 6690 }, { "epoch": 5.173879443585781, "grad_norm": 3.09601092338562, "learning_rate": 2.7846496145158683e-05, "loss": 0.4856, "num_input_tokens_seen": 2258256, "step": 6695 }, { "epoch": 5.17774343122102, "grad_norm": 2.013200283050537, "learning_rate": 2.7812993197767747e-05, "loss": 0.456, "num_input_tokens_seen": 2259920, "step": 6700 }, { "epoch": 5.1816074188562595, "grad_norm": 2.1086056232452393, "learning_rate": 2.7779485132912774e-05, "loss": 0.4349, "num_input_tokens_seen": 2261712, "step": 6705 }, { "epoch": 5.185471406491499, "grad_norm": 2.8244903087615967, "learning_rate": 2.7745972011552447e-05, "loss": 0.4142, "num_input_tokens_seen": 2263312, "step": 6710 }, { "epoch": 5.189335394126739, "grad_norm": 2.160449743270874, "learning_rate": 2.771245389465462e-05, "loss": 0.5495, "num_input_tokens_seen": 2264656, "step": 6715 }, { "epoch": 5.193199381761978, "grad_norm": 2.161191701889038, "learning_rate": 2.767893084319626e-05, "loss": 0.4996, "num_input_tokens_seen": 2266416, "step": 6720 }, { "epoch": 5.197063369397218, "grad_norm": 1.8190447092056274, "learning_rate": 2.764540291816328e-05, "loss": 0.4366, "num_input_tokens_seen": 2268016, "step": 6725 }, { "epoch": 5.200927357032458, "grad_norm": 2.215989351272583, "learning_rate": 2.7611870180550497e-05, "loss": 0.3824, "num_input_tokens_seen": 2269648, "step": 6730 }, { "epoch": 5.204791344667697, "grad_norm": 3.583143472671509, "learning_rate": 2.757833269136145e-05, "loss": 0.4059, "num_input_tokens_seen": 2271152, "step": 6735 }, { "epoch": 5.2086553323029365, "grad_norm": 2.8992366790771484, "learning_rate": 2.754479051160833e-05, "loss": 0.4729, "num_input_tokens_seen": 2272976, "step": 6740 }, { "epoch": 5.212519319938176, "grad_norm": 3.8541111946105957, "learning_rate": 2.7511243702311896e-05, "loss": 0.6037, "num_input_tokens_seen": 2274832, "step": 6745 }, { "epoch": 5.216383307573416, "grad_norm": 1.3532594442367554, "learning_rate": 2.747769232450127e-05, "loss": 0.469, "num_input_tokens_seen": 2276752, "step": 6750 }, { "epoch": 5.220247295208655, "grad_norm": 1.9801243543624878, "learning_rate": 2.744413643921393e-05, "loss": 0.3837, "num_input_tokens_seen": 2278128, "step": 6755 }, { "epoch": 5.224111282843895, "grad_norm": 7.630372047424316, "learning_rate": 2.7410576107495534e-05, "loss": 0.683, "num_input_tokens_seen": 2279952, "step": 6760 }, { "epoch": 5.227975270479135, "grad_norm": 1.263055682182312, "learning_rate": 2.7377011390399832e-05, "loss": 0.5795, "num_input_tokens_seen": 2282064, "step": 6765 }, { "epoch": 5.231839258114374, "grad_norm": 2.936452865600586, "learning_rate": 2.7343442348988567e-05, "loss": 0.4537, "num_input_tokens_seen": 2283824, "step": 6770 }, { "epoch": 5.2357032457496135, "grad_norm": 3.007610559463501, "learning_rate": 2.7309869044331325e-05, "loss": 0.4736, "num_input_tokens_seen": 2285616, "step": 6775 }, { "epoch": 5.239567233384853, "grad_norm": 3.7912161350250244, "learning_rate": 2.7276291537505462e-05, "loss": 0.4135, "num_input_tokens_seen": 2287184, "step": 6780 }, { "epoch": 5.243431221020093, "grad_norm": 2.8133254051208496, "learning_rate": 2.7242709889595974e-05, "loss": 0.4559, "num_input_tokens_seen": 2288848, "step": 6785 }, { "epoch": 5.247295208655332, "grad_norm": 1.9333399534225464, "learning_rate": 2.7209124161695397e-05, "loss": 0.3918, "num_input_tokens_seen": 2290832, "step": 6790 }, { "epoch": 5.251159196290572, "grad_norm": 3.4220242500305176, "learning_rate": 2.7175534414903687e-05, "loss": 0.623, "num_input_tokens_seen": 2292496, "step": 6795 }, { "epoch": 5.255023183925811, "grad_norm": 2.105346441268921, "learning_rate": 2.7141940710328095e-05, "loss": 0.4804, "num_input_tokens_seen": 2294032, "step": 6800 }, { "epoch": 5.258887171561051, "grad_norm": 2.6382694244384766, "learning_rate": 2.7108343109083102e-05, "loss": 0.4969, "num_input_tokens_seen": 2295440, "step": 6805 }, { "epoch": 5.2627511591962906, "grad_norm": 1.4646965265274048, "learning_rate": 2.707474167229025e-05, "loss": 0.4304, "num_input_tokens_seen": 2297168, "step": 6810 }, { "epoch": 5.26661514683153, "grad_norm": 2.7402801513671875, "learning_rate": 2.7041136461078088e-05, "loss": 0.4202, "num_input_tokens_seen": 2298864, "step": 6815 }, { "epoch": 5.27047913446677, "grad_norm": 2.6538312435150146, "learning_rate": 2.7007527536581988e-05, "loss": 0.4568, "num_input_tokens_seen": 2300752, "step": 6820 }, { "epoch": 5.274343122102009, "grad_norm": 1.8084245920181274, "learning_rate": 2.697391495994413e-05, "loss": 0.3944, "num_input_tokens_seen": 2302416, "step": 6825 }, { "epoch": 5.278207109737249, "grad_norm": 1.5680086612701416, "learning_rate": 2.694029879231329e-05, "loss": 0.4424, "num_input_tokens_seen": 2304208, "step": 6830 }, { "epoch": 5.282071097372488, "grad_norm": 2.3948569297790527, "learning_rate": 2.690667909484481e-05, "loss": 0.533, "num_input_tokens_seen": 2306000, "step": 6835 }, { "epoch": 5.285935085007728, "grad_norm": 4.123690605163574, "learning_rate": 2.6873055928700448e-05, "loss": 0.4085, "num_input_tokens_seen": 2307440, "step": 6840 }, { "epoch": 5.289799072642968, "grad_norm": 1.1739540100097656, "learning_rate": 2.683942935504825e-05, "loss": 0.3915, "num_input_tokens_seen": 2309232, "step": 6845 }, { "epoch": 5.293663060278207, "grad_norm": 3.4295406341552734, "learning_rate": 2.6805799435062496e-05, "loss": 0.5154, "num_input_tokens_seen": 2310608, "step": 6850 }, { "epoch": 5.297527047913447, "grad_norm": 2.8699684143066406, "learning_rate": 2.6772166229923517e-05, "loss": 0.3998, "num_input_tokens_seen": 2312208, "step": 6855 }, { "epoch": 5.301391035548686, "grad_norm": 4.125344276428223, "learning_rate": 2.673852980081765e-05, "loss": 0.4289, "num_input_tokens_seen": 2313840, "step": 6860 }, { "epoch": 5.305255023183926, "grad_norm": 4.093509674072266, "learning_rate": 2.6704890208937084e-05, "loss": 0.492, "num_input_tokens_seen": 2315600, "step": 6865 }, { "epoch": 5.309119010819165, "grad_norm": 2.6216723918914795, "learning_rate": 2.6671247515479754e-05, "loss": 0.8029, "num_input_tokens_seen": 2317040, "step": 6870 }, { "epoch": 5.312982998454405, "grad_norm": 2.504993438720703, "learning_rate": 2.6637601781649264e-05, "loss": 0.438, "num_input_tokens_seen": 2318736, "step": 6875 }, { "epoch": 5.316846986089645, "grad_norm": 1.7629039287567139, "learning_rate": 2.6603953068654713e-05, "loss": 0.385, "num_input_tokens_seen": 2320240, "step": 6880 }, { "epoch": 5.320710973724884, "grad_norm": 1.9617925882339478, "learning_rate": 2.657030143771066e-05, "loss": 0.5326, "num_input_tokens_seen": 2322064, "step": 6885 }, { "epoch": 5.324574961360124, "grad_norm": 1.744212031364441, "learning_rate": 2.653664695003693e-05, "loss": 0.5329, "num_input_tokens_seen": 2323632, "step": 6890 }, { "epoch": 5.328438948995363, "grad_norm": 1.428815484046936, "learning_rate": 2.650298966685858e-05, "loss": 0.4577, "num_input_tokens_seen": 2325232, "step": 6895 }, { "epoch": 5.332302936630603, "grad_norm": 1.5497790575027466, "learning_rate": 2.6469329649405738e-05, "loss": 0.4761, "num_input_tokens_seen": 2326928, "step": 6900 }, { "epoch": 5.336166924265842, "grad_norm": 1.085188865661621, "learning_rate": 2.6435666958913496e-05, "loss": 0.3886, "num_input_tokens_seen": 2328560, "step": 6905 }, { "epoch": 5.340030911901082, "grad_norm": 1.221995234489441, "learning_rate": 2.6402001656621833e-05, "loss": 0.3733, "num_input_tokens_seen": 2330160, "step": 6910 }, { "epoch": 5.343894899536322, "grad_norm": 2.2027313709259033, "learning_rate": 2.6368333803775462e-05, "loss": 0.6133, "num_input_tokens_seen": 2331792, "step": 6915 }, { "epoch": 5.347758887171561, "grad_norm": 1.5610952377319336, "learning_rate": 2.6334663461623736e-05, "loss": 0.3568, "num_input_tokens_seen": 2333232, "step": 6920 }, { "epoch": 5.3516228748068, "grad_norm": 5.310863494873047, "learning_rate": 2.630099069142055e-05, "loss": 0.5385, "num_input_tokens_seen": 2334928, "step": 6925 }, { "epoch": 5.35548686244204, "grad_norm": 1.4741277694702148, "learning_rate": 2.6267315554424192e-05, "loss": 0.4023, "num_input_tokens_seen": 2336944, "step": 6930 }, { "epoch": 5.35935085007728, "grad_norm": 1.4989711046218872, "learning_rate": 2.6233638111897297e-05, "loss": 0.5214, "num_input_tokens_seen": 2338736, "step": 6935 }, { "epoch": 5.363214837712519, "grad_norm": 2.7617201805114746, "learning_rate": 2.619995842510664e-05, "loss": 0.3747, "num_input_tokens_seen": 2340336, "step": 6940 }, { "epoch": 5.367078825347759, "grad_norm": 2.392116069793701, "learning_rate": 2.6166276555323126e-05, "loss": 0.5497, "num_input_tokens_seen": 2341936, "step": 6945 }, { "epoch": 5.370942812982999, "grad_norm": 1.7703044414520264, "learning_rate": 2.6132592563821602e-05, "loss": 0.459, "num_input_tokens_seen": 2343536, "step": 6950 }, { "epoch": 5.374806800618238, "grad_norm": 3.0312530994415283, "learning_rate": 2.6098906511880782e-05, "loss": 0.5652, "num_input_tokens_seen": 2345264, "step": 6955 }, { "epoch": 5.378670788253477, "grad_norm": 1.1657016277313232, "learning_rate": 2.6065218460783153e-05, "loss": 0.4157, "num_input_tokens_seen": 2346768, "step": 6960 }, { "epoch": 5.382534775888717, "grad_norm": 1.4460991621017456, "learning_rate": 2.6031528471814793e-05, "loss": 0.467, "num_input_tokens_seen": 2348592, "step": 6965 }, { "epoch": 5.386398763523957, "grad_norm": 3.3861093521118164, "learning_rate": 2.5997836606265345e-05, "loss": 0.4395, "num_input_tokens_seen": 2350288, "step": 6970 }, { "epoch": 5.390262751159196, "grad_norm": 3.0496408939361572, "learning_rate": 2.5964142925427838e-05, "loss": 0.4414, "num_input_tokens_seen": 2351920, "step": 6975 }, { "epoch": 5.394126738794436, "grad_norm": 3.624948024749756, "learning_rate": 2.593044749059863e-05, "loss": 0.4253, "num_input_tokens_seen": 2353648, "step": 6980 }, { "epoch": 5.397990726429676, "grad_norm": 3.7667341232299805, "learning_rate": 2.5896750363077253e-05, "loss": 0.4353, "num_input_tokens_seen": 2355312, "step": 6985 }, { "epoch": 5.401854714064915, "grad_norm": 3.5511577129364014, "learning_rate": 2.5863051604166317e-05, "loss": 0.4063, "num_input_tokens_seen": 2357104, "step": 6990 }, { "epoch": 5.405718701700154, "grad_norm": 3.8116276264190674, "learning_rate": 2.5829351275171406e-05, "loss": 0.6033, "num_input_tokens_seen": 2358768, "step": 6995 }, { "epoch": 5.409582689335394, "grad_norm": 1.6815975904464722, "learning_rate": 2.5795649437400958e-05, "loss": 0.6073, "num_input_tokens_seen": 2360592, "step": 7000 }, { "epoch": 5.413446676970634, "grad_norm": 1.674458622932434, "learning_rate": 2.576194615216616e-05, "loss": 0.6296, "num_input_tokens_seen": 2362224, "step": 7005 }, { "epoch": 5.417310664605873, "grad_norm": 2.036038875579834, "learning_rate": 2.5728241480780828e-05, "loss": 0.4039, "num_input_tokens_seen": 2363664, "step": 7010 }, { "epoch": 5.421174652241113, "grad_norm": 3.3004720211029053, "learning_rate": 2.5694535484561306e-05, "loss": 0.6743, "num_input_tokens_seen": 2365296, "step": 7015 }, { "epoch": 5.425038639876353, "grad_norm": 1.3936387300491333, "learning_rate": 2.566082822482633e-05, "loss": 0.6095, "num_input_tokens_seen": 2366992, "step": 7020 }, { "epoch": 5.428902627511592, "grad_norm": 1.349830150604248, "learning_rate": 2.5627119762896944e-05, "loss": 0.4783, "num_input_tokens_seen": 2368496, "step": 7025 }, { "epoch": 5.432766615146831, "grad_norm": 1.1225706338882446, "learning_rate": 2.559341016009641e-05, "loss": 0.4806, "num_input_tokens_seen": 2370128, "step": 7030 }, { "epoch": 5.436630602782071, "grad_norm": 2.719331979751587, "learning_rate": 2.5559699477750004e-05, "loss": 0.493, "num_input_tokens_seen": 2371888, "step": 7035 }, { "epoch": 5.440494590417311, "grad_norm": 2.7249393463134766, "learning_rate": 2.5525987777185035e-05, "loss": 0.4293, "num_input_tokens_seen": 2373552, "step": 7040 }, { "epoch": 5.44435857805255, "grad_norm": 1.0913643836975098, "learning_rate": 2.5492275119730592e-05, "loss": 0.3464, "num_input_tokens_seen": 2375376, "step": 7045 }, { "epoch": 5.448222565687789, "grad_norm": 1.5798609256744385, "learning_rate": 2.5458561566717572e-05, "loss": 0.3411, "num_input_tokens_seen": 2377232, "step": 7050 }, { "epoch": 5.45208655332303, "grad_norm": 1.7708474397659302, "learning_rate": 2.5424847179478454e-05, "loss": 0.3826, "num_input_tokens_seen": 2378736, "step": 7055 }, { "epoch": 5.455950540958269, "grad_norm": 3.5022737979888916, "learning_rate": 2.5391132019347265e-05, "loss": 0.5033, "num_input_tokens_seen": 2380816, "step": 7060 }, { "epoch": 5.459814528593508, "grad_norm": 1.1604989767074585, "learning_rate": 2.5357416147659407e-05, "loss": 0.3477, "num_input_tokens_seen": 2382736, "step": 7065 }, { "epoch": 5.4636785162287484, "grad_norm": 1.6591569185256958, "learning_rate": 2.5323699625751614e-05, "loss": 0.4442, "num_input_tokens_seen": 2384144, "step": 7070 }, { "epoch": 5.467542503863988, "grad_norm": 4.04130220413208, "learning_rate": 2.5289982514961775e-05, "loss": 0.4658, "num_input_tokens_seen": 2385648, "step": 7075 }, { "epoch": 5.471406491499227, "grad_norm": 2.4919183254241943, "learning_rate": 2.525626487662885e-05, "loss": 0.3905, "num_input_tokens_seen": 2387376, "step": 7080 }, { "epoch": 5.475270479134466, "grad_norm": 2.3465774059295654, "learning_rate": 2.5222546772092775e-05, "loss": 0.4612, "num_input_tokens_seen": 2389104, "step": 7085 }, { "epoch": 5.479134466769707, "grad_norm": 2.071312427520752, "learning_rate": 2.518882826269434e-05, "loss": 0.4839, "num_input_tokens_seen": 2390736, "step": 7090 }, { "epoch": 5.482998454404946, "grad_norm": 3.0940301418304443, "learning_rate": 2.5155109409775035e-05, "loss": 0.5205, "num_input_tokens_seen": 2392496, "step": 7095 }, { "epoch": 5.486862442040185, "grad_norm": 2.6808643341064453, "learning_rate": 2.5121390274677015e-05, "loss": 0.4595, "num_input_tokens_seen": 2394000, "step": 7100 }, { "epoch": 5.490726429675425, "grad_norm": 2.328744888305664, "learning_rate": 2.5087670918742917e-05, "loss": 0.4439, "num_input_tokens_seen": 2395664, "step": 7105 }, { "epoch": 5.494590417310665, "grad_norm": 2.0269453525543213, "learning_rate": 2.5053951403315802e-05, "loss": 0.8543, "num_input_tokens_seen": 2397680, "step": 7110 }, { "epoch": 5.498454404945904, "grad_norm": 1.6339777708053589, "learning_rate": 2.5020231789739018e-05, "loss": 0.4146, "num_input_tokens_seen": 2399248, "step": 7115 }, { "epoch": 5.5, "eval_loss": 0.5106875896453857, "eval_runtime": 11.4458, "eval_samples_per_second": 50.237, "eval_steps_per_second": 12.581, "num_input_tokens_seen": 2399760, "step": 7117 }, { "epoch": 5.502318392581143, "grad_norm": 1.470063328742981, "learning_rate": 2.498651213935607e-05, "loss": 0.5081, "num_input_tokens_seen": 2400848, "step": 7120 }, { "epoch": 5.506182380216384, "grad_norm": 3.0022106170654297, "learning_rate": 2.4952792513510574e-05, "loss": 0.4943, "num_input_tokens_seen": 2402704, "step": 7125 }, { "epoch": 5.510046367851623, "grad_norm": 2.3131802082061768, "learning_rate": 2.4919072973546056e-05, "loss": 0.554, "num_input_tokens_seen": 2404208, "step": 7130 }, { "epoch": 5.513910355486862, "grad_norm": 1.458678960800171, "learning_rate": 2.4885353580805903e-05, "loss": 0.3939, "num_input_tokens_seen": 2406032, "step": 7135 }, { "epoch": 5.5177743431221025, "grad_norm": 2.514805555343628, "learning_rate": 2.4851634396633234e-05, "loss": 0.5401, "num_input_tokens_seen": 2407856, "step": 7140 }, { "epoch": 5.521638330757342, "grad_norm": 3.4082999229431152, "learning_rate": 2.4817915482370796e-05, "loss": 0.3949, "num_input_tokens_seen": 2409328, "step": 7145 }, { "epoch": 5.525502318392581, "grad_norm": 1.3418668508529663, "learning_rate": 2.4784196899360845e-05, "loss": 0.4034, "num_input_tokens_seen": 2411248, "step": 7150 }, { "epoch": 5.52936630602782, "grad_norm": 2.642627477645874, "learning_rate": 2.4750478708945014e-05, "loss": 0.452, "num_input_tokens_seen": 2413104, "step": 7155 }, { "epoch": 5.533230293663061, "grad_norm": 2.851623773574829, "learning_rate": 2.4716760972464236e-05, "loss": 0.4578, "num_input_tokens_seen": 2414864, "step": 7160 }, { "epoch": 5.5370942812983, "grad_norm": 1.4435594081878662, "learning_rate": 2.468304375125863e-05, "loss": 0.432, "num_input_tokens_seen": 2416560, "step": 7165 }, { "epoch": 5.540958268933539, "grad_norm": 2.5016367435455322, "learning_rate": 2.4649327106667356e-05, "loss": 0.5898, "num_input_tokens_seen": 2418096, "step": 7170 }, { "epoch": 5.544822256568779, "grad_norm": 2.0207748413085938, "learning_rate": 2.4615611100028545e-05, "loss": 0.4125, "num_input_tokens_seen": 2419664, "step": 7175 }, { "epoch": 5.548686244204019, "grad_norm": 1.5536314249038696, "learning_rate": 2.4581895792679137e-05, "loss": 0.3693, "num_input_tokens_seen": 2421392, "step": 7180 }, { "epoch": 5.552550231839258, "grad_norm": 1.729184627532959, "learning_rate": 2.454818124595484e-05, "loss": 0.326, "num_input_tokens_seen": 2423024, "step": 7185 }, { "epoch": 5.556414219474497, "grad_norm": 4.388607978820801, "learning_rate": 2.4514467521189947e-05, "loss": 0.4144, "num_input_tokens_seen": 2424688, "step": 7190 }, { "epoch": 5.560278207109738, "grad_norm": 1.262351393699646, "learning_rate": 2.448075467971727e-05, "loss": 0.4241, "num_input_tokens_seen": 2426192, "step": 7195 }, { "epoch": 5.564142194744977, "grad_norm": 2.4862890243530273, "learning_rate": 2.444704278286801e-05, "loss": 0.3684, "num_input_tokens_seen": 2427728, "step": 7200 }, { "epoch": 5.568006182380216, "grad_norm": 1.825525164604187, "learning_rate": 2.4413331891971657e-05, "loss": 0.3692, "num_input_tokens_seen": 2429648, "step": 7205 }, { "epoch": 5.571870170015456, "grad_norm": 2.3688621520996094, "learning_rate": 2.4379622068355857e-05, "loss": 0.4842, "num_input_tokens_seen": 2431152, "step": 7210 }, { "epoch": 5.575734157650696, "grad_norm": 3.3010635375976562, "learning_rate": 2.434591337334632e-05, "loss": 0.4197, "num_input_tokens_seen": 2432624, "step": 7215 }, { "epoch": 5.579598145285935, "grad_norm": 4.189604759216309, "learning_rate": 2.4312205868266702e-05, "loss": 0.3155, "num_input_tokens_seen": 2434224, "step": 7220 }, { "epoch": 5.583462132921174, "grad_norm": 1.898262858390808, "learning_rate": 2.4278499614438512e-05, "loss": 0.357, "num_input_tokens_seen": 2435664, "step": 7225 }, { "epoch": 5.587326120556414, "grad_norm": 1.4496287107467651, "learning_rate": 2.424479467318096e-05, "loss": 0.3553, "num_input_tokens_seen": 2437008, "step": 7230 }, { "epoch": 5.591190108191654, "grad_norm": 2.0585246086120605, "learning_rate": 2.421109110581087e-05, "loss": 0.3833, "num_input_tokens_seen": 2438800, "step": 7235 }, { "epoch": 5.595054095826893, "grad_norm": 2.027984857559204, "learning_rate": 2.4177388973642564e-05, "loss": 0.3539, "num_input_tokens_seen": 2440528, "step": 7240 }, { "epoch": 5.598918083462133, "grad_norm": 2.333355188369751, "learning_rate": 2.4143688337987784e-05, "loss": 0.5671, "num_input_tokens_seen": 2442320, "step": 7245 }, { "epoch": 5.602782071097373, "grad_norm": 1.4469507932662964, "learning_rate": 2.4109989260155512e-05, "loss": 0.4563, "num_input_tokens_seen": 2444208, "step": 7250 }, { "epoch": 5.606646058732612, "grad_norm": 4.0677056312561035, "learning_rate": 2.4076291801451923e-05, "loss": 0.4693, "num_input_tokens_seen": 2445808, "step": 7255 }, { "epoch": 5.6105100463678514, "grad_norm": 1.8120524883270264, "learning_rate": 2.4042596023180207e-05, "loss": 0.4041, "num_input_tokens_seen": 2447184, "step": 7260 }, { "epoch": 5.614374034003092, "grad_norm": 3.391011953353882, "learning_rate": 2.4008901986640558e-05, "loss": 0.4169, "num_input_tokens_seen": 2448880, "step": 7265 }, { "epoch": 5.618238021638331, "grad_norm": 1.4708333015441895, "learning_rate": 2.3975209753129945e-05, "loss": 0.3625, "num_input_tokens_seen": 2450576, "step": 7270 }, { "epoch": 5.62210200927357, "grad_norm": 4.932965278625488, "learning_rate": 2.394151938394209e-05, "loss": 0.6672, "num_input_tokens_seen": 2452144, "step": 7275 }, { "epoch": 5.62596599690881, "grad_norm": 2.4243006706237793, "learning_rate": 2.39078309403673e-05, "loss": 0.3427, "num_input_tokens_seen": 2453936, "step": 7280 }, { "epoch": 5.62982998454405, "grad_norm": 1.3920552730560303, "learning_rate": 2.3874144483692407e-05, "loss": 0.3936, "num_input_tokens_seen": 2455408, "step": 7285 }, { "epoch": 5.633693972179289, "grad_norm": 2.8159396648406982, "learning_rate": 2.3840460075200612e-05, "loss": 0.3475, "num_input_tokens_seen": 2457136, "step": 7290 }, { "epoch": 5.6375579598145285, "grad_norm": 1.0237571001052856, "learning_rate": 2.3806777776171378e-05, "loss": 0.3339, "num_input_tokens_seen": 2459024, "step": 7295 }, { "epoch": 5.641421947449768, "grad_norm": 3.0729734897613525, "learning_rate": 2.377309764788035e-05, "loss": 0.3843, "num_input_tokens_seen": 2460752, "step": 7300 }, { "epoch": 5.645285935085008, "grad_norm": 1.3521244525909424, "learning_rate": 2.373941975159922e-05, "loss": 0.5995, "num_input_tokens_seen": 2462288, "step": 7305 }, { "epoch": 5.649149922720247, "grad_norm": 2.1334340572357178, "learning_rate": 2.3705744148595613e-05, "loss": 0.4579, "num_input_tokens_seen": 2464112, "step": 7310 }, { "epoch": 5.653013910355487, "grad_norm": 2.9102959632873535, "learning_rate": 2.3672070900132993e-05, "loss": 0.3808, "num_input_tokens_seen": 2465808, "step": 7315 }, { "epoch": 5.656877897990727, "grad_norm": 3.407919406890869, "learning_rate": 2.363840006747051e-05, "loss": 0.5163, "num_input_tokens_seen": 2467632, "step": 7320 }, { "epoch": 5.660741885625966, "grad_norm": 1.7968811988830566, "learning_rate": 2.360473171186298e-05, "loss": 0.4324, "num_input_tokens_seen": 2469072, "step": 7325 }, { "epoch": 5.6646058732612055, "grad_norm": 1.473459243774414, "learning_rate": 2.357106589456065e-05, "loss": 0.4432, "num_input_tokens_seen": 2470512, "step": 7330 }, { "epoch": 5.668469860896445, "grad_norm": 2.424107789993286, "learning_rate": 2.3537402676809176e-05, "loss": 0.4522, "num_input_tokens_seen": 2471920, "step": 7335 }, { "epoch": 5.672333848531685, "grad_norm": 1.3309507369995117, "learning_rate": 2.350374211984948e-05, "loss": 0.3449, "num_input_tokens_seen": 2473648, "step": 7340 }, { "epoch": 5.676197836166924, "grad_norm": 1.2430710792541504, "learning_rate": 2.3470084284917662e-05, "loss": 0.3897, "num_input_tokens_seen": 2475568, "step": 7345 }, { "epoch": 5.680061823802164, "grad_norm": 3.8188552856445312, "learning_rate": 2.343642923324485e-05, "loss": 0.4146, "num_input_tokens_seen": 2477168, "step": 7350 }, { "epoch": 5.683925811437403, "grad_norm": 1.3100895881652832, "learning_rate": 2.34027770260571e-05, "loss": 0.3345, "num_input_tokens_seen": 2478736, "step": 7355 }, { "epoch": 5.687789799072643, "grad_norm": 1.7636620998382568, "learning_rate": 2.3369127724575314e-05, "loss": 0.3469, "num_input_tokens_seen": 2480400, "step": 7360 }, { "epoch": 5.6916537867078825, "grad_norm": 5.119137287139893, "learning_rate": 2.3335481390015106e-05, "loss": 0.5877, "num_input_tokens_seen": 2482064, "step": 7365 }, { "epoch": 5.695517774343122, "grad_norm": 1.4903818368911743, "learning_rate": 2.3301838083586683e-05, "loss": 0.4431, "num_input_tokens_seen": 2483952, "step": 7370 }, { "epoch": 5.699381761978362, "grad_norm": 1.7128351926803589, "learning_rate": 2.326819786649475e-05, "loss": 0.3951, "num_input_tokens_seen": 2485648, "step": 7375 }, { "epoch": 5.703245749613601, "grad_norm": 2.9785680770874023, "learning_rate": 2.323456079993837e-05, "loss": 0.3778, "num_input_tokens_seen": 2487536, "step": 7380 }, { "epoch": 5.707109737248841, "grad_norm": 3.494973659515381, "learning_rate": 2.320092694511093e-05, "loss": 0.3809, "num_input_tokens_seen": 2489136, "step": 7385 }, { "epoch": 5.710973724884081, "grad_norm": 6.06226110458374, "learning_rate": 2.3167296363199905e-05, "loss": 0.6094, "num_input_tokens_seen": 2490960, "step": 7390 }, { "epoch": 5.71483771251932, "grad_norm": 1.7797526121139526, "learning_rate": 2.313366911538686e-05, "loss": 0.3292, "num_input_tokens_seen": 2492560, "step": 7395 }, { "epoch": 5.7187017001545595, "grad_norm": 1.393591284751892, "learning_rate": 2.310004526284728e-05, "loss": 0.4084, "num_input_tokens_seen": 2494320, "step": 7400 }, { "epoch": 5.722565687789799, "grad_norm": 2.627948760986328, "learning_rate": 2.306642486675048e-05, "loss": 0.4046, "num_input_tokens_seen": 2495760, "step": 7405 }, { "epoch": 5.726429675425039, "grad_norm": 1.6019299030303955, "learning_rate": 2.303280798825949e-05, "loss": 0.3637, "num_input_tokens_seen": 2497360, "step": 7410 }, { "epoch": 5.730293663060278, "grad_norm": 2.552898645401001, "learning_rate": 2.2999194688530916e-05, "loss": 0.3787, "num_input_tokens_seen": 2498960, "step": 7415 }, { "epoch": 5.734157650695518, "grad_norm": 2.3085010051727295, "learning_rate": 2.296558502871488e-05, "loss": 0.6072, "num_input_tokens_seen": 2500496, "step": 7420 }, { "epoch": 5.738021638330757, "grad_norm": 2.4248433113098145, "learning_rate": 2.293197906995487e-05, "loss": 0.529, "num_input_tokens_seen": 2502128, "step": 7425 }, { "epoch": 5.741885625965997, "grad_norm": 2.8930540084838867, "learning_rate": 2.2898376873387654e-05, "loss": 0.3522, "num_input_tokens_seen": 2503600, "step": 7430 }, { "epoch": 5.7457496136012365, "grad_norm": 4.589396953582764, "learning_rate": 2.2864778500143148e-05, "loss": 0.3687, "num_input_tokens_seen": 2505136, "step": 7435 }, { "epoch": 5.749613601236476, "grad_norm": 4.079711437225342, "learning_rate": 2.283118401134428e-05, "loss": 0.4621, "num_input_tokens_seen": 2506640, "step": 7440 }, { "epoch": 5.753477588871716, "grad_norm": 4.975756645202637, "learning_rate": 2.2797593468106986e-05, "loss": 0.5593, "num_input_tokens_seen": 2508560, "step": 7445 }, { "epoch": 5.757341576506955, "grad_norm": 1.7670661211013794, "learning_rate": 2.276400693153995e-05, "loss": 0.3598, "num_input_tokens_seen": 2510160, "step": 7450 }, { "epoch": 5.761205564142195, "grad_norm": 1.3692739009857178, "learning_rate": 2.2730424462744603e-05, "loss": 0.3613, "num_input_tokens_seen": 2511664, "step": 7455 }, { "epoch": 5.765069551777434, "grad_norm": 1.6344503164291382, "learning_rate": 2.2696846122814968e-05, "loss": 0.4265, "num_input_tokens_seen": 2513168, "step": 7460 }, { "epoch": 5.768933539412674, "grad_norm": 2.770636558532715, "learning_rate": 2.2663271972837568e-05, "loss": 0.5108, "num_input_tokens_seen": 2514960, "step": 7465 }, { "epoch": 5.7727975270479135, "grad_norm": 3.953547477722168, "learning_rate": 2.2629702073891292e-05, "loss": 0.4401, "num_input_tokens_seen": 2516848, "step": 7470 }, { "epoch": 5.776661514683153, "grad_norm": 2.2921345233917236, "learning_rate": 2.259613648704729e-05, "loss": 0.3278, "num_input_tokens_seen": 2518704, "step": 7475 }, { "epoch": 5.780525502318392, "grad_norm": 2.4287383556365967, "learning_rate": 2.256257527336887e-05, "loss": 0.4064, "num_input_tokens_seen": 2520400, "step": 7480 }, { "epoch": 5.784389489953632, "grad_norm": 5.3875627517700195, "learning_rate": 2.2529018493911406e-05, "loss": 0.3333, "num_input_tokens_seen": 2522160, "step": 7485 }, { "epoch": 5.788253477588872, "grad_norm": 1.780412197113037, "learning_rate": 2.249546620972218e-05, "loss": 0.447, "num_input_tokens_seen": 2524304, "step": 7490 }, { "epoch": 5.792117465224111, "grad_norm": 2.242182970046997, "learning_rate": 2.2461918481840305e-05, "loss": 0.5539, "num_input_tokens_seen": 2526256, "step": 7495 }, { "epoch": 5.795981452859351, "grad_norm": 5.32750940322876, "learning_rate": 2.2428375371296596e-05, "loss": 0.6499, "num_input_tokens_seen": 2527856, "step": 7500 }, { "epoch": 5.7998454404945905, "grad_norm": 3.381195306777954, "learning_rate": 2.2394836939113485e-05, "loss": 0.4959, "num_input_tokens_seen": 2529488, "step": 7505 }, { "epoch": 5.80370942812983, "grad_norm": 1.6119499206542969, "learning_rate": 2.2361303246304883e-05, "loss": 0.6351, "num_input_tokens_seen": 2531312, "step": 7510 }, { "epoch": 5.80757341576507, "grad_norm": 5.122296333312988, "learning_rate": 2.232777435387608e-05, "loss": 0.5165, "num_input_tokens_seen": 2533200, "step": 7515 }, { "epoch": 5.811437403400309, "grad_norm": 1.8853216171264648, "learning_rate": 2.229425032282363e-05, "loss": 0.539, "num_input_tokens_seen": 2534992, "step": 7520 }, { "epoch": 5.815301391035549, "grad_norm": 4.4248480796813965, "learning_rate": 2.2260731214135255e-05, "loss": 0.5355, "num_input_tokens_seen": 2536560, "step": 7525 }, { "epoch": 5.819165378670788, "grad_norm": 2.0735669136047363, "learning_rate": 2.2227217088789706e-05, "loss": 0.4336, "num_input_tokens_seen": 2538512, "step": 7530 }, { "epoch": 5.823029366306028, "grad_norm": 2.3263323307037354, "learning_rate": 2.2193708007756676e-05, "loss": 0.4281, "num_input_tokens_seen": 2540400, "step": 7535 }, { "epoch": 5.8268933539412675, "grad_norm": 3.326758623123169, "learning_rate": 2.216020403199668e-05, "loss": 0.5394, "num_input_tokens_seen": 2542096, "step": 7540 }, { "epoch": 5.830757341576507, "grad_norm": 3.840872287750244, "learning_rate": 2.2126705222460952e-05, "loss": 0.5065, "num_input_tokens_seen": 2543760, "step": 7545 }, { "epoch": 5.834621329211746, "grad_norm": 1.7489333152770996, "learning_rate": 2.2093211640091327e-05, "loss": 0.3538, "num_input_tokens_seen": 2545456, "step": 7550 }, { "epoch": 5.838485316846986, "grad_norm": 2.1975533962249756, "learning_rate": 2.205972334582011e-05, "loss": 0.6201, "num_input_tokens_seen": 2547216, "step": 7555 }, { "epoch": 5.842349304482226, "grad_norm": 1.8377741575241089, "learning_rate": 2.2026240400570004e-05, "loss": 0.4373, "num_input_tokens_seen": 2548944, "step": 7560 }, { "epoch": 5.846213292117465, "grad_norm": 1.8556621074676514, "learning_rate": 2.1992762865253985e-05, "loss": 0.5738, "num_input_tokens_seen": 2550704, "step": 7565 }, { "epoch": 5.850077279752705, "grad_norm": 2.2353157997131348, "learning_rate": 2.195929080077518e-05, "loss": 0.4368, "num_input_tokens_seen": 2553008, "step": 7570 }, { "epoch": 5.8539412673879445, "grad_norm": 1.1346403360366821, "learning_rate": 2.192582426802677e-05, "loss": 0.5431, "num_input_tokens_seen": 2554832, "step": 7575 }, { "epoch": 5.857805255023184, "grad_norm": 1.8562389612197876, "learning_rate": 2.1892363327891837e-05, "loss": 0.418, "num_input_tokens_seen": 2556592, "step": 7580 }, { "epoch": 5.861669242658423, "grad_norm": 3.9730231761932373, "learning_rate": 2.185890804124336e-05, "loss": 0.5711, "num_input_tokens_seen": 2558160, "step": 7585 }, { "epoch": 5.865533230293663, "grad_norm": 4.950126647949219, "learning_rate": 2.182545846894396e-05, "loss": 0.5903, "num_input_tokens_seen": 2559792, "step": 7590 }, { "epoch": 5.869397217928903, "grad_norm": 1.8499499559402466, "learning_rate": 2.1792014671845902e-05, "loss": 0.3365, "num_input_tokens_seen": 2561584, "step": 7595 }, { "epoch": 5.873261205564142, "grad_norm": 5.474534511566162, "learning_rate": 2.1758576710790936e-05, "loss": 0.574, "num_input_tokens_seen": 2563280, "step": 7600 }, { "epoch": 5.877125193199381, "grad_norm": 2.270703077316284, "learning_rate": 2.17251446466102e-05, "loss": 0.4026, "num_input_tokens_seen": 2564944, "step": 7605 }, { "epoch": 5.8809891808346215, "grad_norm": 1.5792685747146606, "learning_rate": 2.1691718540124098e-05, "loss": 0.5666, "num_input_tokens_seen": 2566736, "step": 7610 }, { "epoch": 5.884853168469861, "grad_norm": 1.6319199800491333, "learning_rate": 2.165829845214219e-05, "loss": 0.6197, "num_input_tokens_seen": 2568528, "step": 7615 }, { "epoch": 5.8887171561051, "grad_norm": 1.9117602109909058, "learning_rate": 2.1624884443463093e-05, "loss": 0.5094, "num_input_tokens_seen": 2570384, "step": 7620 }, { "epoch": 5.89258114374034, "grad_norm": 2.548082113265991, "learning_rate": 2.1591476574874372e-05, "loss": 0.6216, "num_input_tokens_seen": 2572208, "step": 7625 }, { "epoch": 5.89644513137558, "grad_norm": 1.6887456178665161, "learning_rate": 2.1558074907152413e-05, "loss": 0.3582, "num_input_tokens_seen": 2573872, "step": 7630 }, { "epoch": 5.900309119010819, "grad_norm": 4.13362455368042, "learning_rate": 2.152467950106233e-05, "loss": 0.5216, "num_input_tokens_seen": 2575664, "step": 7635 }, { "epoch": 5.904173106646059, "grad_norm": 4.826321125030518, "learning_rate": 2.1491290417357814e-05, "loss": 0.49, "num_input_tokens_seen": 2577040, "step": 7640 }, { "epoch": 5.9080370942812985, "grad_norm": 1.922561526298523, "learning_rate": 2.1457907716781112e-05, "loss": 0.4602, "num_input_tokens_seen": 2578832, "step": 7645 }, { "epoch": 5.911901081916538, "grad_norm": 4.076058387756348, "learning_rate": 2.14245314600628e-05, "loss": 0.818, "num_input_tokens_seen": 2580656, "step": 7650 }, { "epoch": 5.915765069551777, "grad_norm": 1.9119442701339722, "learning_rate": 2.1391161707921775e-05, "loss": 0.4102, "num_input_tokens_seen": 2582448, "step": 7655 }, { "epoch": 5.919629057187017, "grad_norm": 2.425401449203491, "learning_rate": 2.1357798521065066e-05, "loss": 0.4295, "num_input_tokens_seen": 2584048, "step": 7660 }, { "epoch": 5.923493044822257, "grad_norm": 3.137676239013672, "learning_rate": 2.1324441960187785e-05, "loss": 0.456, "num_input_tokens_seen": 2585872, "step": 7665 }, { "epoch": 5.927357032457496, "grad_norm": 1.1759079694747925, "learning_rate": 2.129109208597299e-05, "loss": 0.3907, "num_input_tokens_seen": 2587472, "step": 7670 }, { "epoch": 5.931221020092735, "grad_norm": 2.0755224227905273, "learning_rate": 2.1257748959091543e-05, "loss": 0.4098, "num_input_tokens_seen": 2589168, "step": 7675 }, { "epoch": 5.9350850077279755, "grad_norm": 4.329780578613281, "learning_rate": 2.1224412640202053e-05, "loss": 0.3844, "num_input_tokens_seen": 2591248, "step": 7680 }, { "epoch": 5.938948995363215, "grad_norm": 2.712210178375244, "learning_rate": 2.119108318995076e-05, "loss": 0.4464, "num_input_tokens_seen": 2592976, "step": 7685 }, { "epoch": 5.942812982998454, "grad_norm": 1.7012650966644287, "learning_rate": 2.1157760668971382e-05, "loss": 0.442, "num_input_tokens_seen": 2594640, "step": 7690 }, { "epoch": 5.946676970633694, "grad_norm": 3.0868141651153564, "learning_rate": 2.1124445137885048e-05, "loss": 0.4795, "num_input_tokens_seen": 2596272, "step": 7695 }, { "epoch": 5.950540958268934, "grad_norm": 1.5817790031433105, "learning_rate": 2.1091136657300143e-05, "loss": 0.4545, "num_input_tokens_seen": 2598128, "step": 7700 }, { "epoch": 5.954404945904173, "grad_norm": 2.888702630996704, "learning_rate": 2.105783528781227e-05, "loss": 0.5164, "num_input_tokens_seen": 2600048, "step": 7705 }, { "epoch": 5.958268933539412, "grad_norm": 2.2859480381011963, "learning_rate": 2.102454109000406e-05, "loss": 0.8073, "num_input_tokens_seen": 2601616, "step": 7710 }, { "epoch": 5.9621329211746525, "grad_norm": 1.99262273311615, "learning_rate": 2.0991254124445106e-05, "loss": 0.433, "num_input_tokens_seen": 2603504, "step": 7715 }, { "epoch": 5.965996908809892, "grad_norm": 1.3591032028198242, "learning_rate": 2.0957974451691846e-05, "loss": 0.3739, "num_input_tokens_seen": 2605200, "step": 7720 }, { "epoch": 5.969860896445131, "grad_norm": 1.7795175313949585, "learning_rate": 2.092470213228746e-05, "loss": 0.5172, "num_input_tokens_seen": 2606928, "step": 7725 }, { "epoch": 5.9737248840803705, "grad_norm": 1.9887458086013794, "learning_rate": 2.089143722676174e-05, "loss": 0.5573, "num_input_tokens_seen": 2608848, "step": 7730 }, { "epoch": 5.977588871715611, "grad_norm": 2.0472915172576904, "learning_rate": 2.0858179795630982e-05, "loss": 0.4375, "num_input_tokens_seen": 2610704, "step": 7735 }, { "epoch": 5.98145285935085, "grad_norm": 1.5997027158737183, "learning_rate": 2.0824929899397897e-05, "loss": 0.7267, "num_input_tokens_seen": 2612272, "step": 7740 }, { "epoch": 5.985316846986089, "grad_norm": 1.519073486328125, "learning_rate": 2.0791687598551493e-05, "loss": 0.348, "num_input_tokens_seen": 2613776, "step": 7745 }, { "epoch": 5.9891808346213296, "grad_norm": 1.9857628345489502, "learning_rate": 2.075845295356695e-05, "loss": 0.7761, "num_input_tokens_seen": 2615472, "step": 7750 }, { "epoch": 5.993044822256569, "grad_norm": 1.6122026443481445, "learning_rate": 2.072522602490553e-05, "loss": 0.6966, "num_input_tokens_seen": 2617136, "step": 7755 }, { "epoch": 5.996908809891808, "grad_norm": 2.0206170082092285, "learning_rate": 2.0692006873014427e-05, "loss": 0.5929, "num_input_tokens_seen": 2618800, "step": 7760 }, { "epoch": 6.0, "eval_loss": 0.5058012008666992, "eval_runtime": 11.4329, "eval_samples_per_second": 50.294, "eval_steps_per_second": 12.595, "num_input_tokens_seen": 2619888, "step": 7764 }, { "epoch": 6.0007727975270475, "grad_norm": 1.746410846710205, "learning_rate": 2.0658795558326743e-05, "loss": 0.4249, "num_input_tokens_seen": 2620208, "step": 7765 }, { "epoch": 6.004636785162288, "grad_norm": 2.5654444694519043, "learning_rate": 2.0625592141261273e-05, "loss": 0.5408, "num_input_tokens_seen": 2621584, "step": 7770 }, { "epoch": 6.008500772797527, "grad_norm": 1.7383391857147217, "learning_rate": 2.059239668222246e-05, "loss": 0.3904, "num_input_tokens_seen": 2623280, "step": 7775 }, { "epoch": 6.012364760432766, "grad_norm": 1.9031904935836792, "learning_rate": 2.055920924160027e-05, "loss": 0.4315, "num_input_tokens_seen": 2624784, "step": 7780 }, { "epoch": 6.016228748068007, "grad_norm": 1.529939889907837, "learning_rate": 2.0526029879770095e-05, "loss": 0.527, "num_input_tokens_seen": 2626320, "step": 7785 }, { "epoch": 6.020092735703246, "grad_norm": 1.3323259353637695, "learning_rate": 2.049285865709262e-05, "loss": 0.3252, "num_input_tokens_seen": 2628144, "step": 7790 }, { "epoch": 6.023956723338485, "grad_norm": 2.057878255844116, "learning_rate": 2.04596956339137e-05, "loss": 0.3475, "num_input_tokens_seen": 2630000, "step": 7795 }, { "epoch": 6.0278207109737245, "grad_norm": 2.483901262283325, "learning_rate": 2.042654087056431e-05, "loss": 0.3607, "num_input_tokens_seen": 2631664, "step": 7800 }, { "epoch": 6.031684698608965, "grad_norm": 2.1016666889190674, "learning_rate": 2.039339442736038e-05, "loss": 0.4665, "num_input_tokens_seen": 2633392, "step": 7805 }, { "epoch": 6.035548686244204, "grad_norm": 1.5706173181533813, "learning_rate": 2.0360256364602705e-05, "loss": 0.4626, "num_input_tokens_seen": 2635248, "step": 7810 }, { "epoch": 6.039412673879443, "grad_norm": 2.6190474033355713, "learning_rate": 2.0327126742576846e-05, "loss": 0.5925, "num_input_tokens_seen": 2636912, "step": 7815 }, { "epoch": 6.043276661514684, "grad_norm": 2.51755428314209, "learning_rate": 2.0294005621552976e-05, "loss": 0.4313, "num_input_tokens_seen": 2638736, "step": 7820 }, { "epoch": 6.047140649149923, "grad_norm": 1.4556899070739746, "learning_rate": 2.0260893061785845e-05, "loss": 0.3716, "num_input_tokens_seen": 2640144, "step": 7825 }, { "epoch": 6.051004636785162, "grad_norm": 2.679481029510498, "learning_rate": 2.02277891235146e-05, "loss": 0.5199, "num_input_tokens_seen": 2641936, "step": 7830 }, { "epoch": 6.0548686244204015, "grad_norm": 2.136150360107422, "learning_rate": 2.0194693866962717e-05, "loss": 0.3708, "num_input_tokens_seen": 2643472, "step": 7835 }, { "epoch": 6.058732612055642, "grad_norm": 1.613372802734375, "learning_rate": 2.0161607352337862e-05, "loss": 0.4605, "num_input_tokens_seen": 2645136, "step": 7840 }, { "epoch": 6.062596599690881, "grad_norm": 2.3679583072662354, "learning_rate": 2.0128529639831824e-05, "loss": 0.4078, "num_input_tokens_seen": 2646864, "step": 7845 }, { "epoch": 6.06646058732612, "grad_norm": 1.553654670715332, "learning_rate": 2.0095460789620348e-05, "loss": 0.4802, "num_input_tokens_seen": 2648304, "step": 7850 }, { "epoch": 6.07032457496136, "grad_norm": 2.022379159927368, "learning_rate": 2.0062400861863077e-05, "loss": 0.3695, "num_input_tokens_seen": 2649808, "step": 7855 }, { "epoch": 6.0741885625966, "grad_norm": 2.0507311820983887, "learning_rate": 2.0029349916703412e-05, "loss": 0.3526, "num_input_tokens_seen": 2651440, "step": 7860 }, { "epoch": 6.078052550231839, "grad_norm": 3.1862809658050537, "learning_rate": 1.999630801426843e-05, "loss": 0.3982, "num_input_tokens_seen": 2653232, "step": 7865 }, { "epoch": 6.0819165378670785, "grad_norm": 1.6611305475234985, "learning_rate": 1.996327521466874e-05, "loss": 0.4182, "num_input_tokens_seen": 2654896, "step": 7870 }, { "epoch": 6.085780525502319, "grad_norm": 2.2540652751922607, "learning_rate": 1.993025157799839e-05, "loss": 0.4211, "num_input_tokens_seen": 2656400, "step": 7875 }, { "epoch": 6.089644513137558, "grad_norm": 2.8165814876556396, "learning_rate": 1.9897237164334767e-05, "loss": 0.3997, "num_input_tokens_seen": 2657968, "step": 7880 }, { "epoch": 6.093508500772797, "grad_norm": 1.2180988788604736, "learning_rate": 1.986423203373847e-05, "loss": 0.5186, "num_input_tokens_seen": 2659760, "step": 7885 }, { "epoch": 6.097372488408037, "grad_norm": 1.8162685632705688, "learning_rate": 1.9831236246253226e-05, "loss": 0.3837, "num_input_tokens_seen": 2661552, "step": 7890 }, { "epoch": 6.101236476043277, "grad_norm": 1.3324226140975952, "learning_rate": 1.979824986190576e-05, "loss": 0.3185, "num_input_tokens_seen": 2663120, "step": 7895 }, { "epoch": 6.105100463678516, "grad_norm": 3.5165412425994873, "learning_rate": 1.976527294070567e-05, "loss": 0.4453, "num_input_tokens_seen": 2665040, "step": 7900 }, { "epoch": 6.1089644513137555, "grad_norm": 2.869887351989746, "learning_rate": 1.9732305542645357e-05, "loss": 0.4454, "num_input_tokens_seen": 2666576, "step": 7905 }, { "epoch": 6.112828438948996, "grad_norm": 3.3376667499542236, "learning_rate": 1.96993477276999e-05, "loss": 0.4748, "num_input_tokens_seen": 2668176, "step": 7910 }, { "epoch": 6.116692426584235, "grad_norm": 3.3883323669433594, "learning_rate": 1.9666399555826946e-05, "loss": 0.3587, "num_input_tokens_seen": 2669680, "step": 7915 }, { "epoch": 6.120556414219474, "grad_norm": 2.8434324264526367, "learning_rate": 1.9633461086966585e-05, "loss": 0.3617, "num_input_tokens_seen": 2671312, "step": 7920 }, { "epoch": 6.124420401854714, "grad_norm": 2.2145442962646484, "learning_rate": 1.960053238104124e-05, "loss": 0.4242, "num_input_tokens_seen": 2673072, "step": 7925 }, { "epoch": 6.128284389489954, "grad_norm": 3.113286256790161, "learning_rate": 1.9567613497955638e-05, "loss": 0.3576, "num_input_tokens_seen": 2674992, "step": 7930 }, { "epoch": 6.132148377125193, "grad_norm": 1.6584659814834595, "learning_rate": 1.953470449759656e-05, "loss": 0.3704, "num_input_tokens_seen": 2676400, "step": 7935 }, { "epoch": 6.1360123647604325, "grad_norm": 2.313167095184326, "learning_rate": 1.9501805439832856e-05, "loss": 0.387, "num_input_tokens_seen": 2678448, "step": 7940 }, { "epoch": 6.139876352395673, "grad_norm": 5.073770046234131, "learning_rate": 1.9468916384515258e-05, "loss": 0.4503, "num_input_tokens_seen": 2680080, "step": 7945 }, { "epoch": 6.143740340030912, "grad_norm": 3.103668689727783, "learning_rate": 1.9436037391476337e-05, "loss": 0.5139, "num_input_tokens_seen": 2681808, "step": 7950 }, { "epoch": 6.147604327666151, "grad_norm": 2.309156894683838, "learning_rate": 1.940316852053033e-05, "loss": 0.4643, "num_input_tokens_seen": 2683760, "step": 7955 }, { "epoch": 6.151468315301391, "grad_norm": 4.084736347198486, "learning_rate": 1.9370309831473058e-05, "loss": 0.3804, "num_input_tokens_seen": 2685424, "step": 7960 }, { "epoch": 6.155332302936631, "grad_norm": 4.28459358215332, "learning_rate": 1.933746138408183e-05, "loss": 0.4263, "num_input_tokens_seen": 2687376, "step": 7965 }, { "epoch": 6.15919629057187, "grad_norm": 3.454132556915283, "learning_rate": 1.9304623238115335e-05, "loss": 0.3575, "num_input_tokens_seen": 2689040, "step": 7970 }, { "epoch": 6.1630602782071096, "grad_norm": 2.488379716873169, "learning_rate": 1.92717954533135e-05, "loss": 0.6355, "num_input_tokens_seen": 2690608, "step": 7975 }, { "epoch": 6.166924265842349, "grad_norm": 3.25065541267395, "learning_rate": 1.9238978089397416e-05, "loss": 0.4807, "num_input_tokens_seen": 2692368, "step": 7980 }, { "epoch": 6.170788253477589, "grad_norm": 2.601442813873291, "learning_rate": 1.920617120606919e-05, "loss": 0.3658, "num_input_tokens_seen": 2693936, "step": 7985 }, { "epoch": 6.174652241112828, "grad_norm": 2.225637674331665, "learning_rate": 1.9173374863011916e-05, "loss": 0.4601, "num_input_tokens_seen": 2695600, "step": 7990 }, { "epoch": 6.178516228748068, "grad_norm": 2.3177599906921387, "learning_rate": 1.9140589119889458e-05, "loss": 0.4978, "num_input_tokens_seen": 2697648, "step": 7995 }, { "epoch": 6.182380216383308, "grad_norm": 3.512901544570923, "learning_rate": 1.910781403634642e-05, "loss": 0.4347, "num_input_tokens_seen": 2699536, "step": 8000 }, { "epoch": 6.186244204018547, "grad_norm": 1.6675437688827515, "learning_rate": 1.9075049672008012e-05, "loss": 0.3741, "num_input_tokens_seen": 2701232, "step": 8005 }, { "epoch": 6.190108191653787, "grad_norm": 4.206269264221191, "learning_rate": 1.9042296086479954e-05, "loss": 0.4403, "num_input_tokens_seen": 2702928, "step": 8010 }, { "epoch": 6.193972179289026, "grad_norm": 4.506382465362549, "learning_rate": 1.9009553339348348e-05, "loss": 0.4567, "num_input_tokens_seen": 2704656, "step": 8015 }, { "epoch": 6.197836166924266, "grad_norm": 3.009315252304077, "learning_rate": 1.8976821490179557e-05, "loss": 0.4166, "num_input_tokens_seen": 2706160, "step": 8020 }, { "epoch": 6.201700154559505, "grad_norm": 2.8650965690612793, "learning_rate": 1.8944100598520152e-05, "loss": 0.4272, "num_input_tokens_seen": 2707856, "step": 8025 }, { "epoch": 6.205564142194745, "grad_norm": 1.6196969747543335, "learning_rate": 1.891139072389676e-05, "loss": 0.3829, "num_input_tokens_seen": 2709808, "step": 8030 }, { "epoch": 6.209428129829985, "grad_norm": 1.5261790752410889, "learning_rate": 1.8878691925815954e-05, "loss": 0.4423, "num_input_tokens_seen": 2711440, "step": 8035 }, { "epoch": 6.213292117465224, "grad_norm": 2.8401732444763184, "learning_rate": 1.8846004263764172e-05, "loss": 0.4341, "num_input_tokens_seen": 2713008, "step": 8040 }, { "epoch": 6.217156105100464, "grad_norm": 1.5713393688201904, "learning_rate": 1.8813327797207564e-05, "loss": 0.5629, "num_input_tokens_seen": 2714480, "step": 8045 }, { "epoch": 6.221020092735703, "grad_norm": 2.1650500297546387, "learning_rate": 1.878066258559197e-05, "loss": 0.3839, "num_input_tokens_seen": 2716016, "step": 8050 }, { "epoch": 6.224884080370943, "grad_norm": 1.9237794876098633, "learning_rate": 1.874800868834269e-05, "loss": 0.359, "num_input_tokens_seen": 2717456, "step": 8055 }, { "epoch": 6.228748068006182, "grad_norm": 3.5979835987091064, "learning_rate": 1.8715366164864474e-05, "loss": 0.7436, "num_input_tokens_seen": 2719312, "step": 8060 }, { "epoch": 6.232612055641422, "grad_norm": 1.981261968612671, "learning_rate": 1.868273507454138e-05, "loss": 0.4104, "num_input_tokens_seen": 2721008, "step": 8065 }, { "epoch": 6.236476043276662, "grad_norm": 2.015746593475342, "learning_rate": 1.8650115476736662e-05, "loss": 0.4741, "num_input_tokens_seen": 2722768, "step": 8070 }, { "epoch": 6.240340030911901, "grad_norm": 2.0581960678100586, "learning_rate": 1.8617507430792678e-05, "loss": 0.464, "num_input_tokens_seen": 2724496, "step": 8075 }, { "epoch": 6.244204018547141, "grad_norm": 1.582340121269226, "learning_rate": 1.858491099603074e-05, "loss": 0.4809, "num_input_tokens_seen": 2726288, "step": 8080 }, { "epoch": 6.24806800618238, "grad_norm": 3.649815320968628, "learning_rate": 1.855232623175106e-05, "loss": 0.5323, "num_input_tokens_seen": 2727888, "step": 8085 }, { "epoch": 6.25193199381762, "grad_norm": 1.7664064168930054, "learning_rate": 1.8519753197232625e-05, "loss": 0.3435, "num_input_tokens_seen": 2729744, "step": 8090 }, { "epoch": 6.255795981452859, "grad_norm": 1.541700839996338, "learning_rate": 1.8487191951733067e-05, "loss": 0.4267, "num_input_tokens_seen": 2731312, "step": 8095 }, { "epoch": 6.259659969088099, "grad_norm": 3.3778581619262695, "learning_rate": 1.8454642554488587e-05, "loss": 0.4408, "num_input_tokens_seen": 2733008, "step": 8100 }, { "epoch": 6.263523956723338, "grad_norm": 1.193193793296814, "learning_rate": 1.8422105064713796e-05, "loss": 0.4604, "num_input_tokens_seen": 2734576, "step": 8105 }, { "epoch": 6.267387944358578, "grad_norm": 1.463523507118225, "learning_rate": 1.83895795416017e-05, "loss": 0.519, "num_input_tokens_seen": 2736336, "step": 8110 }, { "epoch": 6.271251931993818, "grad_norm": 2.0038065910339355, "learning_rate": 1.835706604432348e-05, "loss": 0.431, "num_input_tokens_seen": 2738128, "step": 8115 }, { "epoch": 6.275115919629057, "grad_norm": 2.0884125232696533, "learning_rate": 1.832456463202847e-05, "loss": 0.3829, "num_input_tokens_seen": 2739696, "step": 8120 }, { "epoch": 6.278979907264297, "grad_norm": 2.0732226371765137, "learning_rate": 1.8292075363844008e-05, "loss": 0.4275, "num_input_tokens_seen": 2741232, "step": 8125 }, { "epoch": 6.282843894899536, "grad_norm": 1.1720120906829834, "learning_rate": 1.8259598298875347e-05, "loss": 0.3511, "num_input_tokens_seen": 2742992, "step": 8130 }, { "epoch": 6.286707882534776, "grad_norm": 1.7388063669204712, "learning_rate": 1.8227133496205542e-05, "loss": 0.4092, "num_input_tokens_seen": 2744720, "step": 8135 }, { "epoch": 6.290571870170015, "grad_norm": 2.538116216659546, "learning_rate": 1.8194681014895317e-05, "loss": 0.3552, "num_input_tokens_seen": 2746320, "step": 8140 }, { "epoch": 6.294435857805255, "grad_norm": 2.1238465309143066, "learning_rate": 1.8162240913983e-05, "loss": 0.3764, "num_input_tokens_seen": 2748304, "step": 8145 }, { "epoch": 6.298299845440495, "grad_norm": 1.5364316701889038, "learning_rate": 1.81298132524844e-05, "loss": 0.5106, "num_input_tokens_seen": 2749872, "step": 8150 }, { "epoch": 6.302163833075734, "grad_norm": 2.2663774490356445, "learning_rate": 1.8097398089392688e-05, "loss": 0.4242, "num_input_tokens_seen": 2751536, "step": 8155 }, { "epoch": 6.306027820710974, "grad_norm": 3.9114458560943604, "learning_rate": 1.80649954836783e-05, "loss": 0.4217, "num_input_tokens_seen": 2753488, "step": 8160 }, { "epoch": 6.309891808346213, "grad_norm": 1.7841675281524658, "learning_rate": 1.803260549428881e-05, "loss": 0.3952, "num_input_tokens_seen": 2755344, "step": 8165 }, { "epoch": 6.313755795981453, "grad_norm": 4.208410263061523, "learning_rate": 1.800022818014888e-05, "loss": 0.5248, "num_input_tokens_seen": 2757232, "step": 8170 }, { "epoch": 6.317619783616692, "grad_norm": 1.739138126373291, "learning_rate": 1.7967863600160072e-05, "loss": 0.4665, "num_input_tokens_seen": 2758768, "step": 8175 }, { "epoch": 6.321483771251932, "grad_norm": 4.118843078613281, "learning_rate": 1.7935511813200803e-05, "loss": 0.4986, "num_input_tokens_seen": 2760336, "step": 8180 }, { "epoch": 6.325347758887172, "grad_norm": 2.773054838180542, "learning_rate": 1.7903172878126207e-05, "loss": 0.6473, "num_input_tokens_seen": 2762160, "step": 8185 }, { "epoch": 6.329211746522411, "grad_norm": 3.472121000289917, "learning_rate": 1.7870846853768053e-05, "loss": 0.4269, "num_input_tokens_seen": 2763888, "step": 8190 }, { "epoch": 6.333075734157651, "grad_norm": 2.2150962352752686, "learning_rate": 1.7838533798934614e-05, "loss": 0.386, "num_input_tokens_seen": 2765648, "step": 8195 }, { "epoch": 6.3369397217928904, "grad_norm": 1.6835377216339111, "learning_rate": 1.7806233772410553e-05, "loss": 0.3425, "num_input_tokens_seen": 2767248, "step": 8200 }, { "epoch": 6.34080370942813, "grad_norm": 2.291408061981201, "learning_rate": 1.777394683295685e-05, "loss": 0.5221, "num_input_tokens_seen": 2768688, "step": 8205 }, { "epoch": 6.344667697063369, "grad_norm": 1.1376018524169922, "learning_rate": 1.7741673039310676e-05, "loss": 0.5119, "num_input_tokens_seen": 2770416, "step": 8210 }, { "epoch": 6.348531684698609, "grad_norm": 2.227490186691284, "learning_rate": 1.7709412450185285e-05, "loss": 0.641, "num_input_tokens_seen": 2772368, "step": 8215 }, { "epoch": 6.352395672333849, "grad_norm": 2.5135841369628906, "learning_rate": 1.7677165124269906e-05, "loss": 0.3986, "num_input_tokens_seen": 2774000, "step": 8220 }, { "epoch": 6.356259659969088, "grad_norm": 2.611180543899536, "learning_rate": 1.7644931120229628e-05, "loss": 0.4086, "num_input_tokens_seen": 2775728, "step": 8225 }, { "epoch": 6.360123647604327, "grad_norm": 1.7609162330627441, "learning_rate": 1.761271049670533e-05, "loss": 0.3582, "num_input_tokens_seen": 2777168, "step": 8230 }, { "epoch": 6.3639876352395675, "grad_norm": 1.5621509552001953, "learning_rate": 1.758050331231353e-05, "loss": 0.3683, "num_input_tokens_seen": 2778928, "step": 8235 }, { "epoch": 6.367851622874807, "grad_norm": 1.7128218412399292, "learning_rate": 1.7548309625646305e-05, "loss": 0.3441, "num_input_tokens_seen": 2780400, "step": 8240 }, { "epoch": 6.371715610510046, "grad_norm": 2.135310411453247, "learning_rate": 1.7516129495271168e-05, "loss": 0.5151, "num_input_tokens_seen": 2781872, "step": 8245 }, { "epoch": 6.375579598145286, "grad_norm": 3.1962969303131104, "learning_rate": 1.7483962979730996e-05, "loss": 0.5694, "num_input_tokens_seen": 2783728, "step": 8250 }, { "epoch": 6.379443585780526, "grad_norm": 3.711057186126709, "learning_rate": 1.7451810137543855e-05, "loss": 0.4864, "num_input_tokens_seen": 2785392, "step": 8255 }, { "epoch": 6.383307573415765, "grad_norm": 2.3682785034179688, "learning_rate": 1.7419671027202967e-05, "loss": 0.4476, "num_input_tokens_seen": 2787280, "step": 8260 }, { "epoch": 6.387171561051004, "grad_norm": 2.626344680786133, "learning_rate": 1.738754570717656e-05, "loss": 0.2842, "num_input_tokens_seen": 2788752, "step": 8265 }, { "epoch": 6.3910355486862445, "grad_norm": 1.9660841226577759, "learning_rate": 1.7355434235907793e-05, "loss": 0.3625, "num_input_tokens_seen": 2790128, "step": 8270 }, { "epoch": 6.394899536321484, "grad_norm": 3.188807725906372, "learning_rate": 1.732333667181461e-05, "loss": 0.536, "num_input_tokens_seen": 2791792, "step": 8275 }, { "epoch": 6.398763523956723, "grad_norm": 4.390456676483154, "learning_rate": 1.729125307328966e-05, "loss": 0.6132, "num_input_tokens_seen": 2793424, "step": 8280 }, { "epoch": 6.402627511591963, "grad_norm": 2.962252140045166, "learning_rate": 1.7259183498700178e-05, "loss": 0.3931, "num_input_tokens_seen": 2795056, "step": 8285 }, { "epoch": 6.406491499227203, "grad_norm": 1.3487548828125, "learning_rate": 1.7227128006387912e-05, "loss": 0.3631, "num_input_tokens_seen": 2796848, "step": 8290 }, { "epoch": 6.410355486862442, "grad_norm": 4.480303764343262, "learning_rate": 1.7195086654668972e-05, "loss": 0.4514, "num_input_tokens_seen": 2798800, "step": 8295 }, { "epoch": 6.414219474497681, "grad_norm": 1.7091830968856812, "learning_rate": 1.7163059501833746e-05, "loss": 0.4716, "num_input_tokens_seen": 2800688, "step": 8300 }, { "epoch": 6.4180834621329215, "grad_norm": 2.4581546783447266, "learning_rate": 1.7131046606146774e-05, "loss": 0.4583, "num_input_tokens_seen": 2802480, "step": 8305 }, { "epoch": 6.421947449768161, "grad_norm": 1.6526386737823486, "learning_rate": 1.70990480258467e-05, "loss": 0.4433, "num_input_tokens_seen": 2804144, "step": 8310 }, { "epoch": 6.4258114374034, "grad_norm": 1.1776213645935059, "learning_rate": 1.706706381914609e-05, "loss": 0.4602, "num_input_tokens_seen": 2805808, "step": 8315 }, { "epoch": 6.42967542503864, "grad_norm": 2.0500807762145996, "learning_rate": 1.703509404423137e-05, "loss": 0.455, "num_input_tokens_seen": 2807472, "step": 8320 }, { "epoch": 6.43353941267388, "grad_norm": 1.210785984992981, "learning_rate": 1.7003138759262716e-05, "loss": 0.4321, "num_input_tokens_seen": 2809200, "step": 8325 }, { "epoch": 6.437403400309119, "grad_norm": 1.729555606842041, "learning_rate": 1.6971198022373944e-05, "loss": 0.5201, "num_input_tokens_seen": 2810960, "step": 8330 }, { "epoch": 6.441267387944358, "grad_norm": 2.516319990158081, "learning_rate": 1.6939271891672403e-05, "loss": 0.3907, "num_input_tokens_seen": 2812912, "step": 8335 }, { "epoch": 6.4451313755795985, "grad_norm": 1.4284123182296753, "learning_rate": 1.6907360425238865e-05, "loss": 0.5817, "num_input_tokens_seen": 2814480, "step": 8340 }, { "epoch": 6.448995363214838, "grad_norm": 1.7362911701202393, "learning_rate": 1.687546368112742e-05, "loss": 0.359, "num_input_tokens_seen": 2815856, "step": 8345 }, { "epoch": 6.452859350850077, "grad_norm": 1.9168951511383057, "learning_rate": 1.6843581717365394e-05, "loss": 0.5529, "num_input_tokens_seen": 2817488, "step": 8350 }, { "epoch": 6.456723338485316, "grad_norm": 4.74972677230835, "learning_rate": 1.6811714591953206e-05, "loss": 0.5533, "num_input_tokens_seen": 2819312, "step": 8355 }, { "epoch": 6.460587326120557, "grad_norm": 2.666496992111206, "learning_rate": 1.6779862362864294e-05, "loss": 0.4252, "num_input_tokens_seen": 2820944, "step": 8360 }, { "epoch": 6.464451313755796, "grad_norm": 1.2217291593551636, "learning_rate": 1.6748025088044973e-05, "loss": 0.3453, "num_input_tokens_seen": 2822512, "step": 8365 }, { "epoch": 6.468315301391035, "grad_norm": 1.5577659606933594, "learning_rate": 1.67162028254144e-05, "loss": 0.5691, "num_input_tokens_seen": 2824144, "step": 8370 }, { "epoch": 6.4721792890262755, "grad_norm": 4.350645065307617, "learning_rate": 1.668439563286436e-05, "loss": 0.6262, "num_input_tokens_seen": 2825904, "step": 8375 }, { "epoch": 6.476043276661515, "grad_norm": 1.6922991275787354, "learning_rate": 1.665260356825927e-05, "loss": 0.3748, "num_input_tokens_seen": 2827568, "step": 8380 }, { "epoch": 6.479907264296754, "grad_norm": 1.7214277982711792, "learning_rate": 1.6620826689436e-05, "loss": 0.4783, "num_input_tokens_seen": 2829232, "step": 8385 }, { "epoch": 6.483771251931993, "grad_norm": 1.485700011253357, "learning_rate": 1.6589065054203808e-05, "loss": 0.3675, "num_input_tokens_seen": 2830928, "step": 8390 }, { "epoch": 6.487635239567234, "grad_norm": 3.227741241455078, "learning_rate": 1.6557318720344222e-05, "loss": 0.388, "num_input_tokens_seen": 2832432, "step": 8395 }, { "epoch": 6.491499227202473, "grad_norm": 1.5531940460205078, "learning_rate": 1.652558774561091e-05, "loss": 0.3579, "num_input_tokens_seen": 2833840, "step": 8400 }, { "epoch": 6.495363214837712, "grad_norm": 2.312067985534668, "learning_rate": 1.649387218772962e-05, "loss": 0.4418, "num_input_tokens_seen": 2835728, "step": 8405 }, { "epoch": 6.4992272024729525, "grad_norm": 1.5273774862289429, "learning_rate": 1.646217210439806e-05, "loss": 0.5626, "num_input_tokens_seen": 2837520, "step": 8410 }, { "epoch": 6.5, "eval_loss": 0.5047187209129333, "eval_runtime": 11.4571, "eval_samples_per_second": 50.187, "eval_steps_per_second": 12.569, "num_input_tokens_seen": 2837808, "step": 8411 }, { "epoch": 6.503091190108192, "grad_norm": 3.183053970336914, "learning_rate": 1.6430487553285762e-05, "loss": 0.3922, "num_input_tokens_seen": 2839344, "step": 8415 }, { "epoch": 6.506955177743431, "grad_norm": 6.910478115081787, "learning_rate": 1.6398818592034026e-05, "loss": 0.7643, "num_input_tokens_seen": 2840976, "step": 8420 }, { "epoch": 6.5108191653786704, "grad_norm": 2.6739814281463623, "learning_rate": 1.6367165278255753e-05, "loss": 0.3967, "num_input_tokens_seen": 2842608, "step": 8425 }, { "epoch": 6.514683153013911, "grad_norm": 1.7727009057998657, "learning_rate": 1.633552766953543e-05, "loss": 0.3381, "num_input_tokens_seen": 2844112, "step": 8430 }, { "epoch": 6.51854714064915, "grad_norm": 2.5339133739471436, "learning_rate": 1.630390582342894e-05, "loss": 0.3918, "num_input_tokens_seen": 2845520, "step": 8435 }, { "epoch": 6.522411128284389, "grad_norm": 1.7501474618911743, "learning_rate": 1.627229979746349e-05, "loss": 0.5056, "num_input_tokens_seen": 2847216, "step": 8440 }, { "epoch": 6.5262751159196295, "grad_norm": 2.7982568740844727, "learning_rate": 1.624070964913751e-05, "loss": 0.4179, "num_input_tokens_seen": 2848944, "step": 8445 }, { "epoch": 6.530139103554869, "grad_norm": 1.5035661458969116, "learning_rate": 1.620913543592056e-05, "loss": 0.3539, "num_input_tokens_seen": 2850608, "step": 8450 }, { "epoch": 6.534003091190108, "grad_norm": 2.3156399726867676, "learning_rate": 1.6177577215253193e-05, "loss": 0.4865, "num_input_tokens_seen": 2852304, "step": 8455 }, { "epoch": 6.5378670788253475, "grad_norm": 1.9137687683105469, "learning_rate": 1.614603504454687e-05, "loss": 0.4488, "num_input_tokens_seen": 2853936, "step": 8460 }, { "epoch": 6.541731066460588, "grad_norm": 1.6659657955169678, "learning_rate": 1.6114508981183855e-05, "loss": 0.3574, "num_input_tokens_seen": 2855568, "step": 8465 }, { "epoch": 6.545595054095827, "grad_norm": 2.8926591873168945, "learning_rate": 1.6082999082517122e-05, "loss": 0.4263, "num_input_tokens_seen": 2857456, "step": 8470 }, { "epoch": 6.549459041731066, "grad_norm": 1.975269079208374, "learning_rate": 1.605150540587022e-05, "loss": 0.3475, "num_input_tokens_seen": 2859024, "step": 8475 }, { "epoch": 6.553323029366306, "grad_norm": 1.6827679872512817, "learning_rate": 1.6020028008537203e-05, "loss": 0.3384, "num_input_tokens_seen": 2860624, "step": 8480 }, { "epoch": 6.557187017001546, "grad_norm": 3.86124324798584, "learning_rate": 1.598856694778247e-05, "loss": 0.6598, "num_input_tokens_seen": 2862128, "step": 8485 }, { "epoch": 6.561051004636785, "grad_norm": 2.2235267162323, "learning_rate": 1.595712228084077e-05, "loss": 0.4967, "num_input_tokens_seen": 2863760, "step": 8490 }, { "epoch": 6.5649149922720245, "grad_norm": 4.438138961791992, "learning_rate": 1.5925694064916963e-05, "loss": 0.6926, "num_input_tokens_seen": 2865488, "step": 8495 }, { "epoch": 6.568778979907265, "grad_norm": 2.8417093753814697, "learning_rate": 1.589428235718601e-05, "loss": 0.3812, "num_input_tokens_seen": 2867184, "step": 8500 }, { "epoch": 6.572642967542504, "grad_norm": 1.6543086767196655, "learning_rate": 1.5862887214792832e-05, "loss": 0.4299, "num_input_tokens_seen": 2868912, "step": 8505 }, { "epoch": 6.576506955177743, "grad_norm": 3.04862642288208, "learning_rate": 1.583150869485223e-05, "loss": 0.4079, "num_input_tokens_seen": 2870544, "step": 8510 }, { "epoch": 6.580370942812983, "grad_norm": 1.9609928131103516, "learning_rate": 1.5800146854448752e-05, "loss": 0.7085, "num_input_tokens_seen": 2872368, "step": 8515 }, { "epoch": 6.584234930448223, "grad_norm": 3.7250828742980957, "learning_rate": 1.5768801750636593e-05, "loss": 0.3773, "num_input_tokens_seen": 2874064, "step": 8520 }, { "epoch": 6.588098918083462, "grad_norm": 5.39861536026001, "learning_rate": 1.57374734404395e-05, "loss": 0.6512, "num_input_tokens_seen": 2875856, "step": 8525 }, { "epoch": 6.5919629057187015, "grad_norm": 1.6593221426010132, "learning_rate": 1.5706161980850704e-05, "loss": 0.5151, "num_input_tokens_seen": 2877584, "step": 8530 }, { "epoch": 6.595826893353941, "grad_norm": 4.2509026527404785, "learning_rate": 1.5674867428832745e-05, "loss": 0.6018, "num_input_tokens_seen": 2879216, "step": 8535 }, { "epoch": 6.599690880989181, "grad_norm": 1.3194881677627563, "learning_rate": 1.564358984131742e-05, "loss": 0.4069, "num_input_tokens_seen": 2880976, "step": 8540 }, { "epoch": 6.60355486862442, "grad_norm": 2.0142319202423096, "learning_rate": 1.561232927520564e-05, "loss": 0.4354, "num_input_tokens_seen": 2882576, "step": 8545 }, { "epoch": 6.60741885625966, "grad_norm": 2.735901117324829, "learning_rate": 1.558108578736739e-05, "loss": 0.3543, "num_input_tokens_seen": 2884432, "step": 8550 }, { "epoch": 6.6112828438949, "grad_norm": 1.8466538190841675, "learning_rate": 1.5549859434641556e-05, "loss": 0.3943, "num_input_tokens_seen": 2885936, "step": 8555 }, { "epoch": 6.615146831530139, "grad_norm": 1.1523810625076294, "learning_rate": 1.5518650273835866e-05, "loss": 0.3746, "num_input_tokens_seen": 2887664, "step": 8560 }, { "epoch": 6.6190108191653785, "grad_norm": 3.1131138801574707, "learning_rate": 1.5487458361726754e-05, "loss": 0.3716, "num_input_tokens_seen": 2889488, "step": 8565 }, { "epoch": 6.622874806800619, "grad_norm": 1.7499080896377563, "learning_rate": 1.5456283755059308e-05, "loss": 0.4708, "num_input_tokens_seen": 2890992, "step": 8570 }, { "epoch": 6.626738794435858, "grad_norm": 3.4902966022491455, "learning_rate": 1.54251265105471e-05, "loss": 0.4119, "num_input_tokens_seen": 2892528, "step": 8575 }, { "epoch": 6.630602782071097, "grad_norm": 2.894718647003174, "learning_rate": 1.5393986684872125e-05, "loss": 0.3519, "num_input_tokens_seen": 2894320, "step": 8580 }, { "epoch": 6.634466769706337, "grad_norm": 2.5835232734680176, "learning_rate": 1.5362864334684696e-05, "loss": 0.4768, "num_input_tokens_seen": 2896176, "step": 8585 }, { "epoch": 6.638330757341577, "grad_norm": 3.5658438205718994, "learning_rate": 1.5331759516603337e-05, "loss": 0.4682, "num_input_tokens_seen": 2897840, "step": 8590 }, { "epoch": 6.642194744976816, "grad_norm": 3.124873399734497, "learning_rate": 1.5300672287214674e-05, "loss": 0.4123, "num_input_tokens_seen": 2899728, "step": 8595 }, { "epoch": 6.6460587326120555, "grad_norm": 3.7263433933258057, "learning_rate": 1.5269602703073324e-05, "loss": 0.4315, "num_input_tokens_seen": 2901328, "step": 8600 }, { "epoch": 6.649922720247295, "grad_norm": 1.9488509893417358, "learning_rate": 1.523855082070181e-05, "loss": 0.3937, "num_input_tokens_seen": 2902896, "step": 8605 }, { "epoch": 6.653786707882535, "grad_norm": 3.0745115280151367, "learning_rate": 1.5207516696590468e-05, "loss": 0.5301, "num_input_tokens_seen": 2904528, "step": 8610 }, { "epoch": 6.657650695517774, "grad_norm": 1.7952618598937988, "learning_rate": 1.5176500387197301e-05, "loss": 0.3792, "num_input_tokens_seen": 2906352, "step": 8615 }, { "epoch": 6.661514683153014, "grad_norm": 1.89523446559906, "learning_rate": 1.5145501948947926e-05, "loss": 0.3884, "num_input_tokens_seen": 2907920, "step": 8620 }, { "epoch": 6.665378670788254, "grad_norm": 2.576425790786743, "learning_rate": 1.5114521438235417e-05, "loss": 0.5573, "num_input_tokens_seen": 2909424, "step": 8625 }, { "epoch": 6.669242658423493, "grad_norm": 3.539116144180298, "learning_rate": 1.508355891142029e-05, "loss": 0.4816, "num_input_tokens_seen": 2911024, "step": 8630 }, { "epoch": 6.6731066460587325, "grad_norm": 2.6953797340393066, "learning_rate": 1.505261442483028e-05, "loss": 0.3448, "num_input_tokens_seen": 2913040, "step": 8635 }, { "epoch": 6.676970633693972, "grad_norm": 2.7413885593414307, "learning_rate": 1.5021688034760343e-05, "loss": 0.4373, "num_input_tokens_seen": 2914832, "step": 8640 }, { "epoch": 6.680834621329212, "grad_norm": 1.8563615083694458, "learning_rate": 1.4990779797472494e-05, "loss": 0.3478, "num_input_tokens_seen": 2916432, "step": 8645 }, { "epoch": 6.684698608964451, "grad_norm": 2.2493228912353516, "learning_rate": 1.4959889769195743e-05, "loss": 0.4381, "num_input_tokens_seen": 2918128, "step": 8650 }, { "epoch": 6.688562596599691, "grad_norm": 1.7549382448196411, "learning_rate": 1.4929018006125964e-05, "loss": 0.3906, "num_input_tokens_seen": 2919792, "step": 8655 }, { "epoch": 6.69242658423493, "grad_norm": 3.6498241424560547, "learning_rate": 1.489816456442579e-05, "loss": 0.3789, "num_input_tokens_seen": 2921296, "step": 8660 }, { "epoch": 6.69629057187017, "grad_norm": 2.6413142681121826, "learning_rate": 1.4867329500224536e-05, "loss": 0.3677, "num_input_tokens_seen": 2922768, "step": 8665 }, { "epoch": 6.7001545595054095, "grad_norm": 1.8952198028564453, "learning_rate": 1.4836512869618097e-05, "loss": 0.4943, "num_input_tokens_seen": 2924464, "step": 8670 }, { "epoch": 6.704018547140649, "grad_norm": 1.7828210592269897, "learning_rate": 1.4805714728668807e-05, "loss": 0.3356, "num_input_tokens_seen": 2926032, "step": 8675 }, { "epoch": 6.707882534775889, "grad_norm": 1.5816617012023926, "learning_rate": 1.4774935133405382e-05, "loss": 0.3658, "num_input_tokens_seen": 2927696, "step": 8680 }, { "epoch": 6.711746522411128, "grad_norm": 1.8697855472564697, "learning_rate": 1.4744174139822775e-05, "loss": 0.7895, "num_input_tokens_seen": 2929488, "step": 8685 }, { "epoch": 6.715610510046368, "grad_norm": 1.892409086227417, "learning_rate": 1.471343180388215e-05, "loss": 0.4852, "num_input_tokens_seen": 2931216, "step": 8690 }, { "epoch": 6.719474497681608, "grad_norm": 3.1919102668762207, "learning_rate": 1.4682708181510663e-05, "loss": 0.4102, "num_input_tokens_seen": 2932784, "step": 8695 }, { "epoch": 6.723338485316847, "grad_norm": 3.2203376293182373, "learning_rate": 1.4652003328601475e-05, "loss": 0.3667, "num_input_tokens_seen": 2934288, "step": 8700 }, { "epoch": 6.7272024729520865, "grad_norm": 1.972880482673645, "learning_rate": 1.4621317301013571e-05, "loss": 0.3341, "num_input_tokens_seen": 2935952, "step": 8705 }, { "epoch": 6.731066460587326, "grad_norm": 2.1940972805023193, "learning_rate": 1.4590650154571716e-05, "loss": 0.5225, "num_input_tokens_seen": 2937456, "step": 8710 }, { "epoch": 6.734930448222566, "grad_norm": 3.7750885486602783, "learning_rate": 1.4560001945066298e-05, "loss": 0.3954, "num_input_tokens_seen": 2939184, "step": 8715 }, { "epoch": 6.738794435857805, "grad_norm": 3.980043649673462, "learning_rate": 1.452937272825328e-05, "loss": 0.4148, "num_input_tokens_seen": 2941136, "step": 8720 }, { "epoch": 6.742658423493045, "grad_norm": 1.8707879781723022, "learning_rate": 1.4498762559854043e-05, "loss": 0.3989, "num_input_tokens_seen": 2942704, "step": 8725 }, { "epoch": 6.746522411128284, "grad_norm": 2.619156837463379, "learning_rate": 1.4468171495555356e-05, "loss": 0.6086, "num_input_tokens_seen": 2944368, "step": 8730 }, { "epoch": 6.750386398763524, "grad_norm": 3.8591370582580566, "learning_rate": 1.4437599591009181e-05, "loss": 0.5073, "num_input_tokens_seen": 2945936, "step": 8735 }, { "epoch": 6.7542503863987635, "grad_norm": 3.0425291061401367, "learning_rate": 1.4407046901832683e-05, "loss": 0.5843, "num_input_tokens_seen": 2947664, "step": 8740 }, { "epoch": 6.758114374034003, "grad_norm": 4.0033955574035645, "learning_rate": 1.4376513483608012e-05, "loss": 0.4003, "num_input_tokens_seen": 2949392, "step": 8745 }, { "epoch": 6.761978361669243, "grad_norm": 2.7341015338897705, "learning_rate": 1.4345999391882297e-05, "loss": 0.4577, "num_input_tokens_seen": 2951248, "step": 8750 }, { "epoch": 6.765842349304482, "grad_norm": 2.04049015045166, "learning_rate": 1.4315504682167518e-05, "loss": 0.3429, "num_input_tokens_seen": 2953072, "step": 8755 }, { "epoch": 6.769706336939722, "grad_norm": 3.119964599609375, "learning_rate": 1.4285029409940359e-05, "loss": 0.3616, "num_input_tokens_seen": 2954512, "step": 8760 }, { "epoch": 6.773570324574961, "grad_norm": 1.7895478010177612, "learning_rate": 1.4254573630642143e-05, "loss": 0.3836, "num_input_tokens_seen": 2956048, "step": 8765 }, { "epoch": 6.777434312210201, "grad_norm": 2.014575958251953, "learning_rate": 1.4224137399678767e-05, "loss": 0.3824, "num_input_tokens_seen": 2957680, "step": 8770 }, { "epoch": 6.7812982998454405, "grad_norm": 1.91763436794281, "learning_rate": 1.4193720772420552e-05, "loss": 0.3815, "num_input_tokens_seen": 2959472, "step": 8775 }, { "epoch": 6.78516228748068, "grad_norm": 2.3600988388061523, "learning_rate": 1.4163323804202137e-05, "loss": 0.6185, "num_input_tokens_seen": 2961168, "step": 8780 }, { "epoch": 6.789026275115919, "grad_norm": 2.0091166496276855, "learning_rate": 1.4132946550322387e-05, "loss": 0.4006, "num_input_tokens_seen": 2962704, "step": 8785 }, { "epoch": 6.792890262751159, "grad_norm": 3.711113452911377, "learning_rate": 1.4102589066044366e-05, "loss": 0.3458, "num_input_tokens_seen": 2964464, "step": 8790 }, { "epoch": 6.796754250386399, "grad_norm": 1.7318971157073975, "learning_rate": 1.407225140659511e-05, "loss": 0.5886, "num_input_tokens_seen": 2966416, "step": 8795 }, { "epoch": 6.800618238021638, "grad_norm": 2.5620882511138916, "learning_rate": 1.4041933627165602e-05, "loss": 0.4412, "num_input_tokens_seen": 2967984, "step": 8800 }, { "epoch": 6.804482225656878, "grad_norm": 3.1111576557159424, "learning_rate": 1.4011635782910673e-05, "loss": 0.3932, "num_input_tokens_seen": 2969712, "step": 8805 }, { "epoch": 6.8083462132921175, "grad_norm": 3.3577871322631836, "learning_rate": 1.398135792894889e-05, "loss": 1.0223, "num_input_tokens_seen": 2971312, "step": 8810 }, { "epoch": 6.812210200927357, "grad_norm": 1.3942831754684448, "learning_rate": 1.3951100120362438e-05, "loss": 0.3457, "num_input_tokens_seen": 2973040, "step": 8815 }, { "epoch": 6.816074188562597, "grad_norm": 4.24552583694458, "learning_rate": 1.3920862412197027e-05, "loss": 0.4082, "num_input_tokens_seen": 2974704, "step": 8820 }, { "epoch": 6.819938176197836, "grad_norm": 2.4689886569976807, "learning_rate": 1.3890644859461837e-05, "loss": 0.4854, "num_input_tokens_seen": 2976368, "step": 8825 }, { "epoch": 6.823802163833076, "grad_norm": 1.9238961935043335, "learning_rate": 1.3860447517129332e-05, "loss": 0.3751, "num_input_tokens_seen": 2978000, "step": 8830 }, { "epoch": 6.827666151468315, "grad_norm": 3.040152072906494, "learning_rate": 1.3830270440135253e-05, "loss": 0.4508, "num_input_tokens_seen": 2979888, "step": 8835 }, { "epoch": 6.831530139103555, "grad_norm": 1.6698857545852661, "learning_rate": 1.3800113683378458e-05, "loss": 0.513, "num_input_tokens_seen": 2981776, "step": 8840 }, { "epoch": 6.8353941267387945, "grad_norm": 1.351758360862732, "learning_rate": 1.376997730172083e-05, "loss": 0.3296, "num_input_tokens_seen": 2983536, "step": 8845 }, { "epoch": 6.839258114374034, "grad_norm": 2.8224449157714844, "learning_rate": 1.3739861349987177e-05, "loss": 0.5962, "num_input_tokens_seen": 2985136, "step": 8850 }, { "epoch": 6.843122102009273, "grad_norm": 3.50205135345459, "learning_rate": 1.3709765882965162e-05, "loss": 0.3655, "num_input_tokens_seen": 2987184, "step": 8855 }, { "epoch": 6.846986089644513, "grad_norm": 3.9578466415405273, "learning_rate": 1.3679690955405187e-05, "loss": 0.3642, "num_input_tokens_seen": 2989232, "step": 8860 }, { "epoch": 6.850850077279753, "grad_norm": 2.787277936935425, "learning_rate": 1.3649636622020262e-05, "loss": 0.4926, "num_input_tokens_seen": 2990832, "step": 8865 }, { "epoch": 6.854714064914992, "grad_norm": 3.258970260620117, "learning_rate": 1.3619602937485925e-05, "loss": 0.3996, "num_input_tokens_seen": 2992592, "step": 8870 }, { "epoch": 6.858578052550232, "grad_norm": 3.534452199935913, "learning_rate": 1.3589589956440208e-05, "loss": 0.4806, "num_input_tokens_seen": 2994448, "step": 8875 }, { "epoch": 6.8624420401854715, "grad_norm": 1.5600824356079102, "learning_rate": 1.355959773348342e-05, "loss": 0.4927, "num_input_tokens_seen": 2996304, "step": 8880 }, { "epoch": 6.866306027820711, "grad_norm": 2.8018884658813477, "learning_rate": 1.3529626323178116e-05, "loss": 0.3892, "num_input_tokens_seen": 2998256, "step": 8885 }, { "epoch": 6.87017001545595, "grad_norm": 4.343911170959473, "learning_rate": 1.3499675780049003e-05, "loss": 0.5378, "num_input_tokens_seen": 3000144, "step": 8890 }, { "epoch": 6.87403400309119, "grad_norm": 1.4759547710418701, "learning_rate": 1.3469746158582835e-05, "loss": 0.3715, "num_input_tokens_seen": 3002064, "step": 8895 }, { "epoch": 6.87789799072643, "grad_norm": 1.9287984371185303, "learning_rate": 1.3439837513228277e-05, "loss": 0.446, "num_input_tokens_seen": 3003760, "step": 8900 }, { "epoch": 6.881761978361669, "grad_norm": 2.1115024089813232, "learning_rate": 1.340994989839584e-05, "loss": 0.4278, "num_input_tokens_seen": 3005488, "step": 8905 }, { "epoch": 6.885625965996908, "grad_norm": 1.430432677268982, "learning_rate": 1.3380083368457788e-05, "loss": 0.5711, "num_input_tokens_seen": 3006896, "step": 8910 }, { "epoch": 6.8894899536321486, "grad_norm": 2.1285295486450195, "learning_rate": 1.3350237977748037e-05, "loss": 0.5514, "num_input_tokens_seen": 3008464, "step": 8915 }, { "epoch": 6.893353941267388, "grad_norm": 1.4700311422348022, "learning_rate": 1.3320413780562006e-05, "loss": 0.3378, "num_input_tokens_seen": 3010000, "step": 8920 }, { "epoch": 6.897217928902627, "grad_norm": 1.7409234046936035, "learning_rate": 1.3290610831156606e-05, "loss": 0.5905, "num_input_tokens_seen": 3011792, "step": 8925 }, { "epoch": 6.901081916537867, "grad_norm": 1.672373652458191, "learning_rate": 1.3260829183750045e-05, "loss": 0.3462, "num_input_tokens_seen": 3013584, "step": 8930 }, { "epoch": 6.904945904173107, "grad_norm": 1.4137651920318604, "learning_rate": 1.323106889252183e-05, "loss": 0.3704, "num_input_tokens_seen": 3015184, "step": 8935 }, { "epoch": 6.908809891808346, "grad_norm": 1.6452449560165405, "learning_rate": 1.320133001161257e-05, "loss": 0.333, "num_input_tokens_seen": 3016944, "step": 8940 }, { "epoch": 6.912673879443586, "grad_norm": 3.3206825256347656, "learning_rate": 1.3171612595123955e-05, "loss": 0.5012, "num_input_tokens_seen": 3018512, "step": 8945 }, { "epoch": 6.916537867078826, "grad_norm": 1.3988215923309326, "learning_rate": 1.3141916697118606e-05, "loss": 0.3022, "num_input_tokens_seen": 3020400, "step": 8950 }, { "epoch": 6.920401854714065, "grad_norm": 2.695338010787964, "learning_rate": 1.3112242371620004e-05, "loss": 0.3592, "num_input_tokens_seen": 3022288, "step": 8955 }, { "epoch": 6.924265842349304, "grad_norm": 4.296004295349121, "learning_rate": 1.3082589672612405e-05, "loss": 0.4072, "num_input_tokens_seen": 3024112, "step": 8960 }, { "epoch": 6.928129829984544, "grad_norm": 1.6558641195297241, "learning_rate": 1.3052958654040692e-05, "loss": 0.4612, "num_input_tokens_seen": 3026096, "step": 8965 }, { "epoch": 6.931993817619784, "grad_norm": 3.5653750896453857, "learning_rate": 1.3023349369810306e-05, "loss": 0.3816, "num_input_tokens_seen": 3027728, "step": 8970 }, { "epoch": 6.935857805255023, "grad_norm": 2.08186936378479, "learning_rate": 1.299376187378717e-05, "loss": 0.3748, "num_input_tokens_seen": 3029488, "step": 8975 }, { "epoch": 6.939721792890262, "grad_norm": 3.3812241554260254, "learning_rate": 1.296419621979757e-05, "loss": 0.551, "num_input_tokens_seen": 3031344, "step": 8980 }, { "epoch": 6.943585780525503, "grad_norm": 1.872578740119934, "learning_rate": 1.2934652461628034e-05, "loss": 0.3535, "num_input_tokens_seen": 3032880, "step": 8985 }, { "epoch": 6.947449768160742, "grad_norm": 5.0612101554870605, "learning_rate": 1.290513065302525e-05, "loss": 0.5355, "num_input_tokens_seen": 3034352, "step": 8990 }, { "epoch": 6.951313755795981, "grad_norm": 2.196460247039795, "learning_rate": 1.2875630847696035e-05, "loss": 0.3792, "num_input_tokens_seen": 3035920, "step": 8995 }, { "epoch": 6.955177743431221, "grad_norm": 1.5696874856948853, "learning_rate": 1.2846153099307116e-05, "loss": 0.4014, "num_input_tokens_seen": 3037744, "step": 9000 }, { "epoch": 6.959041731066461, "grad_norm": 1.923835277557373, "learning_rate": 1.2816697461485097e-05, "loss": 0.4758, "num_input_tokens_seen": 3039312, "step": 9005 }, { "epoch": 6.9629057187017, "grad_norm": 3.89986252784729, "learning_rate": 1.2787263987816388e-05, "loss": 0.3843, "num_input_tokens_seen": 3041296, "step": 9010 }, { "epoch": 6.966769706336939, "grad_norm": 1.1973180770874023, "learning_rate": 1.2757852731847075e-05, "loss": 0.4861, "num_input_tokens_seen": 3042992, "step": 9015 }, { "epoch": 6.97063369397218, "grad_norm": 1.7698969841003418, "learning_rate": 1.2728463747082803e-05, "loss": 0.4972, "num_input_tokens_seen": 3044528, "step": 9020 }, { "epoch": 6.974497681607419, "grad_norm": 2.3640801906585693, "learning_rate": 1.2699097086988698e-05, "loss": 0.3213, "num_input_tokens_seen": 3046096, "step": 9025 }, { "epoch": 6.978361669242658, "grad_norm": 3.747734546661377, "learning_rate": 1.2669752804989294e-05, "loss": 0.6303, "num_input_tokens_seen": 3047760, "step": 9030 }, { "epoch": 6.9822256568778975, "grad_norm": 3.4997925758361816, "learning_rate": 1.2640430954468421e-05, "loss": 0.4466, "num_input_tokens_seen": 3049264, "step": 9035 }, { "epoch": 6.986089644513138, "grad_norm": 1.8814175128936768, "learning_rate": 1.2611131588769064e-05, "loss": 0.3472, "num_input_tokens_seen": 3050864, "step": 9040 }, { "epoch": 6.989953632148377, "grad_norm": 4.281711578369141, "learning_rate": 1.2581854761193342e-05, "loss": 0.5333, "num_input_tokens_seen": 3052624, "step": 9045 }, { "epoch": 6.993817619783616, "grad_norm": 3.9346790313720703, "learning_rate": 1.255260052500234e-05, "loss": 0.4164, "num_input_tokens_seen": 3054544, "step": 9050 }, { "epoch": 6.997681607418857, "grad_norm": 2.0704174041748047, "learning_rate": 1.2523368933416075e-05, "loss": 0.3735, "num_input_tokens_seen": 3056272, "step": 9055 }, { "epoch": 7.0, "eval_loss": 0.509260356426239, "eval_runtime": 11.4855, "eval_samples_per_second": 50.063, "eval_steps_per_second": 12.538, "num_input_tokens_seen": 3057216, "step": 9058 }, { "epoch": 7.001545595054096, "grad_norm": 3.0974295139312744, "learning_rate": 1.249416003961333e-05, "loss": 0.6626, "num_input_tokens_seen": 3057952, "step": 9060 }, { "epoch": 7.005409582689335, "grad_norm": 1.0747054815292358, "learning_rate": 1.246497389673165e-05, "loss": 0.3556, "num_input_tokens_seen": 3059360, "step": 9065 }, { "epoch": 7.0092735703245745, "grad_norm": 2.012049436569214, "learning_rate": 1.2435810557867125e-05, "loss": 0.3842, "num_input_tokens_seen": 3060864, "step": 9070 }, { "epoch": 7.013137557959815, "grad_norm": 3.26240873336792, "learning_rate": 1.2406670076074415e-05, "loss": 0.4978, "num_input_tokens_seen": 3062528, "step": 9075 }, { "epoch": 7.017001545595054, "grad_norm": 3.626330852508545, "learning_rate": 1.2377552504366577e-05, "loss": 0.346, "num_input_tokens_seen": 3064384, "step": 9080 }, { "epoch": 7.020865533230293, "grad_norm": 1.9356229305267334, "learning_rate": 1.2348457895714985e-05, "loss": 0.5624, "num_input_tokens_seen": 3066016, "step": 9085 }, { "epoch": 7.024729520865534, "grad_norm": 2.9129693508148193, "learning_rate": 1.2319386303049233e-05, "loss": 0.3885, "num_input_tokens_seen": 3067616, "step": 9090 }, { "epoch": 7.028593508500773, "grad_norm": 2.1344552040100098, "learning_rate": 1.2290337779257058e-05, "loss": 0.3772, "num_input_tokens_seen": 3069248, "step": 9095 }, { "epoch": 7.032457496136012, "grad_norm": 2.8879215717315674, "learning_rate": 1.2261312377184239e-05, "loss": 0.4999, "num_input_tokens_seen": 3070880, "step": 9100 }, { "epoch": 7.0363214837712516, "grad_norm": 1.9931401014328003, "learning_rate": 1.2232310149634462e-05, "loss": 0.3756, "num_input_tokens_seen": 3072800, "step": 9105 }, { "epoch": 7.040185471406492, "grad_norm": 2.1334798336029053, "learning_rate": 1.2203331149369263e-05, "loss": 0.4418, "num_input_tokens_seen": 3074240, "step": 9110 }, { "epoch": 7.044049459041731, "grad_norm": 4.429936408996582, "learning_rate": 1.2174375429107934e-05, "loss": 0.6601, "num_input_tokens_seen": 3075744, "step": 9115 }, { "epoch": 7.04791344667697, "grad_norm": 3.2546322345733643, "learning_rate": 1.2145443041527418e-05, "loss": 0.5041, "num_input_tokens_seen": 3077248, "step": 9120 }, { "epoch": 7.051777434312211, "grad_norm": 4.045584201812744, "learning_rate": 1.2116534039262187e-05, "loss": 0.5925, "num_input_tokens_seen": 3078880, "step": 9125 }, { "epoch": 7.05564142194745, "grad_norm": 1.2837884426116943, "learning_rate": 1.208764847490419e-05, "loss": 0.325, "num_input_tokens_seen": 3080544, "step": 9130 }, { "epoch": 7.059505409582689, "grad_norm": 3.396745443344116, "learning_rate": 1.2058786401002742e-05, "loss": 0.6145, "num_input_tokens_seen": 3082112, "step": 9135 }, { "epoch": 7.063369397217929, "grad_norm": 3.1542437076568604, "learning_rate": 1.2029947870064403e-05, "loss": 0.3824, "num_input_tokens_seen": 3083872, "step": 9140 }, { "epoch": 7.067233384853169, "grad_norm": 2.8281660079956055, "learning_rate": 1.2001132934552906e-05, "loss": 0.4107, "num_input_tokens_seen": 3085696, "step": 9145 }, { "epoch": 7.071097372488408, "grad_norm": 2.10642671585083, "learning_rate": 1.1972341646889073e-05, "loss": 0.3921, "num_input_tokens_seen": 3087456, "step": 9150 }, { "epoch": 7.074961360123647, "grad_norm": 1.251649022102356, "learning_rate": 1.194357405945071e-05, "loss": 0.3589, "num_input_tokens_seen": 3089056, "step": 9155 }, { "epoch": 7.078825347758887, "grad_norm": 2.960524082183838, "learning_rate": 1.1914830224572485e-05, "loss": 0.4307, "num_input_tokens_seen": 3090560, "step": 9160 }, { "epoch": 7.082689335394127, "grad_norm": 2.3570780754089355, "learning_rate": 1.1886110194545855e-05, "loss": 0.6107, "num_input_tokens_seen": 3092384, "step": 9165 }, { "epoch": 7.086553323029366, "grad_norm": 2.849635601043701, "learning_rate": 1.185741402161899e-05, "loss": 0.3464, "num_input_tokens_seen": 3093984, "step": 9170 }, { "epoch": 7.090417310664606, "grad_norm": 5.125016689300537, "learning_rate": 1.1828741757996662e-05, "loss": 0.4882, "num_input_tokens_seen": 3095616, "step": 9175 }, { "epoch": 7.094281298299846, "grad_norm": 2.246525764465332, "learning_rate": 1.1800093455840108e-05, "loss": 0.3171, "num_input_tokens_seen": 3097248, "step": 9180 }, { "epoch": 7.098145285935085, "grad_norm": 2.653876543045044, "learning_rate": 1.1771469167267027e-05, "loss": 0.5105, "num_input_tokens_seen": 3099104, "step": 9185 }, { "epoch": 7.102009273570324, "grad_norm": 4.304300785064697, "learning_rate": 1.174286894435138e-05, "loss": 0.5865, "num_input_tokens_seen": 3100640, "step": 9190 }, { "epoch": 7.105873261205564, "grad_norm": 3.418368101119995, "learning_rate": 1.171429283912339e-05, "loss": 0.6305, "num_input_tokens_seen": 3102368, "step": 9195 }, { "epoch": 7.109737248840804, "grad_norm": 2.5747087001800537, "learning_rate": 1.168574090356937e-05, "loss": 0.3336, "num_input_tokens_seen": 3104096, "step": 9200 }, { "epoch": 7.113601236476043, "grad_norm": 3.525707960128784, "learning_rate": 1.16572131896317e-05, "loss": 0.4886, "num_input_tokens_seen": 3105856, "step": 9205 }, { "epoch": 7.117465224111283, "grad_norm": 4.712860584259033, "learning_rate": 1.1628709749208652e-05, "loss": 0.527, "num_input_tokens_seen": 3107840, "step": 9210 }, { "epoch": 7.121329211746523, "grad_norm": 2.5591368675231934, "learning_rate": 1.1600230634154372e-05, "loss": 0.3878, "num_input_tokens_seen": 3109504, "step": 9215 }, { "epoch": 7.125193199381762, "grad_norm": 3.4065284729003906, "learning_rate": 1.1571775896278753e-05, "loss": 0.4826, "num_input_tokens_seen": 3111200, "step": 9220 }, { "epoch": 7.129057187017001, "grad_norm": 4.212822914123535, "learning_rate": 1.1543345587347318e-05, "loss": 0.5307, "num_input_tokens_seen": 3112864, "step": 9225 }, { "epoch": 7.132921174652241, "grad_norm": 4.731650352478027, "learning_rate": 1.1514939759081153e-05, "loss": 0.4065, "num_input_tokens_seen": 3114464, "step": 9230 }, { "epoch": 7.136785162287481, "grad_norm": 2.389244556427002, "learning_rate": 1.1486558463156827e-05, "loss": 0.3669, "num_input_tokens_seen": 3116192, "step": 9235 }, { "epoch": 7.14064914992272, "grad_norm": 2.6830830574035645, "learning_rate": 1.1458201751206271e-05, "loss": 0.4271, "num_input_tokens_seen": 3117728, "step": 9240 }, { "epoch": 7.14451313755796, "grad_norm": 2.8105056285858154, "learning_rate": 1.1429869674816684e-05, "loss": 0.4299, "num_input_tokens_seen": 3119296, "step": 9245 }, { "epoch": 7.1483771251932, "grad_norm": 3.0470306873321533, "learning_rate": 1.1401562285530435e-05, "loss": 0.3848, "num_input_tokens_seen": 3120768, "step": 9250 }, { "epoch": 7.152241112828439, "grad_norm": 3.768038272857666, "learning_rate": 1.137327963484503e-05, "loss": 0.4136, "num_input_tokens_seen": 3122656, "step": 9255 }, { "epoch": 7.156105100463678, "grad_norm": 2.065084934234619, "learning_rate": 1.1345021774212925e-05, "loss": 0.3465, "num_input_tokens_seen": 3124064, "step": 9260 }, { "epoch": 7.159969088098918, "grad_norm": 2.61859130859375, "learning_rate": 1.1316788755041476e-05, "loss": 0.3965, "num_input_tokens_seen": 3125856, "step": 9265 }, { "epoch": 7.163833075734158, "grad_norm": 3.161231279373169, "learning_rate": 1.1288580628692883e-05, "loss": 0.4052, "num_input_tokens_seen": 3127648, "step": 9270 }, { "epoch": 7.167697063369397, "grad_norm": 2.8089349269866943, "learning_rate": 1.1260397446484036e-05, "loss": 0.5316, "num_input_tokens_seen": 3129216, "step": 9275 }, { "epoch": 7.171561051004637, "grad_norm": 3.387049674987793, "learning_rate": 1.1232239259686447e-05, "loss": 0.4098, "num_input_tokens_seen": 3131008, "step": 9280 }, { "epoch": 7.175425038639876, "grad_norm": 2.7599880695343018, "learning_rate": 1.1204106119526147e-05, "loss": 0.3936, "num_input_tokens_seen": 3132832, "step": 9285 }, { "epoch": 7.179289026275116, "grad_norm": 2.8573827743530273, "learning_rate": 1.1175998077183621e-05, "loss": 0.3759, "num_input_tokens_seen": 3134336, "step": 9290 }, { "epoch": 7.183153013910355, "grad_norm": 1.593995451927185, "learning_rate": 1.11479151837937e-05, "loss": 0.355, "num_input_tokens_seen": 3136224, "step": 9295 }, { "epoch": 7.187017001545595, "grad_norm": 1.7038973569869995, "learning_rate": 1.1119857490445428e-05, "loss": 0.3641, "num_input_tokens_seen": 3138272, "step": 9300 }, { "epoch": 7.190880989180835, "grad_norm": 2.157503128051758, "learning_rate": 1.1091825048182047e-05, "loss": 0.4694, "num_input_tokens_seen": 3140224, "step": 9305 }, { "epoch": 7.194744976816074, "grad_norm": 6.473684310913086, "learning_rate": 1.1063817908000826e-05, "loss": 0.6285, "num_input_tokens_seen": 3142048, "step": 9310 }, { "epoch": 7.198608964451314, "grad_norm": 2.346626043319702, "learning_rate": 1.1035836120853033e-05, "loss": 0.4393, "num_input_tokens_seen": 3143712, "step": 9315 }, { "epoch": 7.202472952086553, "grad_norm": 3.698164939880371, "learning_rate": 1.1007879737643783e-05, "loss": 0.5222, "num_input_tokens_seen": 3145440, "step": 9320 }, { "epoch": 7.206336939721793, "grad_norm": 3.5885770320892334, "learning_rate": 1.0979948809232013e-05, "loss": 0.6089, "num_input_tokens_seen": 3146816, "step": 9325 }, { "epoch": 7.210200927357032, "grad_norm": 5.091559410095215, "learning_rate": 1.095204338643031e-05, "loss": 0.4349, "num_input_tokens_seen": 3148448, "step": 9330 }, { "epoch": 7.214064914992272, "grad_norm": 3.46703839302063, "learning_rate": 1.0924163520004895e-05, "loss": 0.5041, "num_input_tokens_seen": 3150336, "step": 9335 }, { "epoch": 7.217928902627512, "grad_norm": 3.0702860355377197, "learning_rate": 1.0896309260675488e-05, "loss": 0.6073, "num_input_tokens_seen": 3152000, "step": 9340 }, { "epoch": 7.221792890262751, "grad_norm": 2.3972418308258057, "learning_rate": 1.086848065911521e-05, "loss": 0.3867, "num_input_tokens_seen": 3153760, "step": 9345 }, { "epoch": 7.225656877897991, "grad_norm": 1.8204187154769897, "learning_rate": 1.0840677765950508e-05, "loss": 0.3057, "num_input_tokens_seen": 3155296, "step": 9350 }, { "epoch": 7.22952086553323, "grad_norm": 2.031524658203125, "learning_rate": 1.0812900631761072e-05, "loss": 0.4081, "num_input_tokens_seen": 3157152, "step": 9355 }, { "epoch": 7.23338485316847, "grad_norm": 3.056143283843994, "learning_rate": 1.0785149307079733e-05, "loss": 0.3904, "num_input_tokens_seen": 3158720, "step": 9360 }, { "epoch": 7.2372488408037094, "grad_norm": 2.131702184677124, "learning_rate": 1.0757423842392353e-05, "loss": 0.4054, "num_input_tokens_seen": 3160352, "step": 9365 }, { "epoch": 7.241112828438949, "grad_norm": 2.120316982269287, "learning_rate": 1.0729724288137736e-05, "loss": 0.4588, "num_input_tokens_seen": 3161856, "step": 9370 }, { "epoch": 7.244976816074189, "grad_norm": 1.4198640584945679, "learning_rate": 1.0702050694707604e-05, "loss": 0.3288, "num_input_tokens_seen": 3163680, "step": 9375 }, { "epoch": 7.248840803709428, "grad_norm": 4.408343315124512, "learning_rate": 1.0674403112446401e-05, "loss": 0.4146, "num_input_tokens_seen": 3165408, "step": 9380 }, { "epoch": 7.252704791344668, "grad_norm": 2.1034207344055176, "learning_rate": 1.0646781591651256e-05, "loss": 0.397, "num_input_tokens_seen": 3167104, "step": 9385 }, { "epoch": 7.256568778979907, "grad_norm": 3.9215645790100098, "learning_rate": 1.06191861825719e-05, "loss": 0.3347, "num_input_tokens_seen": 3168672, "step": 9390 }, { "epoch": 7.260432766615147, "grad_norm": 2.408116579055786, "learning_rate": 1.059161693541057e-05, "loss": 0.4235, "num_input_tokens_seen": 3170304, "step": 9395 }, { "epoch": 7.2642967542503865, "grad_norm": 3.3670716285705566, "learning_rate": 1.0564073900321883e-05, "loss": 0.4369, "num_input_tokens_seen": 3172096, "step": 9400 }, { "epoch": 7.268160741885626, "grad_norm": 2.3424758911132812, "learning_rate": 1.0536557127412774e-05, "loss": 0.3752, "num_input_tokens_seen": 3173856, "step": 9405 }, { "epoch": 7.272024729520865, "grad_norm": 2.776566982269287, "learning_rate": 1.0509066666742421e-05, "loss": 0.3969, "num_input_tokens_seen": 3175648, "step": 9410 }, { "epoch": 7.275888717156105, "grad_norm": 3.738474130630493, "learning_rate": 1.048160256832213e-05, "loss": 0.454, "num_input_tokens_seen": 3177728, "step": 9415 }, { "epoch": 7.279752704791345, "grad_norm": 3.2426416873931885, "learning_rate": 1.0454164882115225e-05, "loss": 0.4723, "num_input_tokens_seen": 3179328, "step": 9420 }, { "epoch": 7.283616692426584, "grad_norm": 2.8302993774414062, "learning_rate": 1.0426753658037015e-05, "loss": 0.3886, "num_input_tokens_seen": 3180928, "step": 9425 }, { "epoch": 7.287480680061824, "grad_norm": 2.246079206466675, "learning_rate": 1.0399368945954623e-05, "loss": 0.3606, "num_input_tokens_seen": 3182560, "step": 9430 }, { "epoch": 7.2913446676970635, "grad_norm": 4.000402450561523, "learning_rate": 1.0372010795686998e-05, "loss": 0.3701, "num_input_tokens_seen": 3184320, "step": 9435 }, { "epoch": 7.295208655332303, "grad_norm": 3.678452730178833, "learning_rate": 1.0344679257004713e-05, "loss": 0.4264, "num_input_tokens_seen": 3186016, "step": 9440 }, { "epoch": 7.299072642967542, "grad_norm": 2.2071473598480225, "learning_rate": 1.0317374379629968e-05, "loss": 0.509, "num_input_tokens_seen": 3187840, "step": 9445 }, { "epoch": 7.302936630602782, "grad_norm": 1.5556858777999878, "learning_rate": 1.029009621323643e-05, "loss": 0.3215, "num_input_tokens_seen": 3189696, "step": 9450 }, { "epoch": 7.306800618238022, "grad_norm": 3.6619036197662354, "learning_rate": 1.0262844807449203e-05, "loss": 0.6476, "num_input_tokens_seen": 3191328, "step": 9455 }, { "epoch": 7.310664605873261, "grad_norm": 1.6162028312683105, "learning_rate": 1.0235620211844677e-05, "loss": 0.3539, "num_input_tokens_seen": 3193312, "step": 9460 }, { "epoch": 7.314528593508501, "grad_norm": 2.0255489349365234, "learning_rate": 1.0208422475950494e-05, "loss": 0.3908, "num_input_tokens_seen": 3195360, "step": 9465 }, { "epoch": 7.3183925811437405, "grad_norm": 3.2613561153411865, "learning_rate": 1.018125164924541e-05, "loss": 0.3714, "num_input_tokens_seen": 3196896, "step": 9470 }, { "epoch": 7.32225656877898, "grad_norm": 2.9479241371154785, "learning_rate": 1.0154107781159245e-05, "loss": 0.5206, "num_input_tokens_seen": 3198560, "step": 9475 }, { "epoch": 7.326120556414219, "grad_norm": 2.7474865913391113, "learning_rate": 1.0126990921072774e-05, "loss": 0.4976, "num_input_tokens_seen": 3200320, "step": 9480 }, { "epoch": 7.329984544049459, "grad_norm": 2.3555893898010254, "learning_rate": 1.0099901118317629e-05, "loss": 0.3523, "num_input_tokens_seen": 3201952, "step": 9485 }, { "epoch": 7.333848531684699, "grad_norm": 5.306360244750977, "learning_rate": 1.0072838422176208e-05, "loss": 0.4253, "num_input_tokens_seen": 3203392, "step": 9490 }, { "epoch": 7.337712519319938, "grad_norm": 2.1444122791290283, "learning_rate": 1.0045802881881628e-05, "loss": 0.3788, "num_input_tokens_seen": 3205056, "step": 9495 }, { "epoch": 7.341576506955178, "grad_norm": 2.4545509815216064, "learning_rate": 1.0018794546617588e-05, "loss": 0.3625, "num_input_tokens_seen": 3206592, "step": 9500 }, { "epoch": 7.3454404945904175, "grad_norm": 2.842144012451172, "learning_rate": 9.991813465518285e-06, "loss": 0.3121, "num_input_tokens_seen": 3208032, "step": 9505 }, { "epoch": 7.349304482225657, "grad_norm": 1.7607749700546265, "learning_rate": 9.964859687668324e-06, "loss": 0.3744, "num_input_tokens_seen": 3209792, "step": 9510 }, { "epoch": 7.353168469860896, "grad_norm": 2.1454930305480957, "learning_rate": 9.937933262102698e-06, "loss": 0.3316, "num_input_tokens_seen": 3211424, "step": 9515 }, { "epoch": 7.357032457496136, "grad_norm": 3.1706037521362305, "learning_rate": 9.911034237806574e-06, "loss": 0.3821, "num_input_tokens_seen": 3212928, "step": 9520 }, { "epoch": 7.360896445131376, "grad_norm": 2.168855905532837, "learning_rate": 9.884162663715294e-06, "loss": 0.3748, "num_input_tokens_seen": 3214752, "step": 9525 }, { "epoch": 7.364760432766615, "grad_norm": 3.445265054702759, "learning_rate": 9.85731858871427e-06, "loss": 0.5313, "num_input_tokens_seen": 3216576, "step": 9530 }, { "epoch": 7.368624420401854, "grad_norm": 1.5941659212112427, "learning_rate": 9.830502061638891e-06, "loss": 0.4983, "num_input_tokens_seen": 3218208, "step": 9535 }, { "epoch": 7.3724884080370945, "grad_norm": 4.025851249694824, "learning_rate": 9.803713131274411e-06, "loss": 0.4841, "num_input_tokens_seen": 3219872, "step": 9540 }, { "epoch": 7.376352395672334, "grad_norm": 2.592628240585327, "learning_rate": 9.776951846355876e-06, "loss": 0.3619, "num_input_tokens_seen": 3221632, "step": 9545 }, { "epoch": 7.380216383307573, "grad_norm": 3.1934759616851807, "learning_rate": 9.750218255568064e-06, "loss": 0.5158, "num_input_tokens_seen": 3223264, "step": 9550 }, { "epoch": 7.384080370942813, "grad_norm": 2.578735113143921, "learning_rate": 9.723512407545363e-06, "loss": 0.4346, "num_input_tokens_seen": 3224992, "step": 9555 }, { "epoch": 7.387944358578053, "grad_norm": 3.5882208347320557, "learning_rate": 9.696834350871672e-06, "loss": 0.3791, "num_input_tokens_seen": 3226528, "step": 9560 }, { "epoch": 7.391808346213292, "grad_norm": 2.3617842197418213, "learning_rate": 9.670184134080363e-06, "loss": 0.4524, "num_input_tokens_seen": 3228320, "step": 9565 }, { "epoch": 7.395672333848531, "grad_norm": 2.0838723182678223, "learning_rate": 9.643561805654123e-06, "loss": 0.4885, "num_input_tokens_seen": 3229920, "step": 9570 }, { "epoch": 7.3995363214837715, "grad_norm": 2.4097766876220703, "learning_rate": 9.616967414024942e-06, "loss": 0.7566, "num_input_tokens_seen": 3231488, "step": 9575 }, { "epoch": 7.403400309119011, "grad_norm": 6.22976016998291, "learning_rate": 9.590401007573952e-06, "loss": 0.608, "num_input_tokens_seen": 3233312, "step": 9580 }, { "epoch": 7.40726429675425, "grad_norm": 1.381002426147461, "learning_rate": 9.563862634631407e-06, "loss": 0.4288, "num_input_tokens_seen": 3235072, "step": 9585 }, { "epoch": 7.41112828438949, "grad_norm": 2.944422483444214, "learning_rate": 9.537352343476532e-06, "loss": 0.3753, "num_input_tokens_seen": 3236672, "step": 9590 }, { "epoch": 7.41499227202473, "grad_norm": 1.8368598222732544, "learning_rate": 9.510870182337483e-06, "loss": 0.3622, "num_input_tokens_seen": 3238560, "step": 9595 }, { "epoch": 7.418856259659969, "grad_norm": 2.3920998573303223, "learning_rate": 9.484416199391246e-06, "loss": 0.3652, "num_input_tokens_seen": 3240256, "step": 9600 }, { "epoch": 7.422720247295208, "grad_norm": 3.2996020317077637, "learning_rate": 9.457990442763526e-06, "loss": 0.5607, "num_input_tokens_seen": 3241952, "step": 9605 }, { "epoch": 7.4265842349304485, "grad_norm": 2.6925697326660156, "learning_rate": 9.431592960528678e-06, "loss": 0.4919, "num_input_tokens_seen": 3243616, "step": 9610 }, { "epoch": 7.430448222565688, "grad_norm": 2.3321962356567383, "learning_rate": 9.405223800709643e-06, "loss": 0.3222, "num_input_tokens_seen": 3245280, "step": 9615 }, { "epoch": 7.434312210200927, "grad_norm": 3.234734058380127, "learning_rate": 9.37888301127782e-06, "loss": 0.4241, "num_input_tokens_seen": 3246816, "step": 9620 }, { "epoch": 7.438176197836167, "grad_norm": 3.549264669418335, "learning_rate": 9.352570640152999e-06, "loss": 0.3324, "num_input_tokens_seen": 3248480, "step": 9625 }, { "epoch": 7.442040185471407, "grad_norm": 3.0262434482574463, "learning_rate": 9.326286735203252e-06, "loss": 0.3836, "num_input_tokens_seen": 3250336, "step": 9630 }, { "epoch": 7.445904173106646, "grad_norm": 1.8162050247192383, "learning_rate": 9.300031344244914e-06, "loss": 0.3479, "num_input_tokens_seen": 3252192, "step": 9635 }, { "epoch": 7.449768160741885, "grad_norm": 3.4954307079315186, "learning_rate": 9.273804515042404e-06, "loss": 0.4219, "num_input_tokens_seen": 3254080, "step": 9640 }, { "epoch": 7.4536321483771255, "grad_norm": 1.6925381422042847, "learning_rate": 9.247606295308178e-06, "loss": 0.4707, "num_input_tokens_seen": 3255808, "step": 9645 }, { "epoch": 7.457496136012365, "grad_norm": 2.7990715503692627, "learning_rate": 9.22143673270267e-06, "loss": 0.3829, "num_input_tokens_seen": 3257824, "step": 9650 }, { "epoch": 7.461360123647604, "grad_norm": 3.550091028213501, "learning_rate": 9.195295874834181e-06, "loss": 0.6728, "num_input_tokens_seen": 3259328, "step": 9655 }, { "epoch": 7.4652241112828435, "grad_norm": 1.7208251953125, "learning_rate": 9.169183769258769e-06, "loss": 0.5269, "num_input_tokens_seen": 3261152, "step": 9660 }, { "epoch": 7.469088098918084, "grad_norm": 1.8515422344207764, "learning_rate": 9.14310046348019e-06, "loss": 0.4027, "num_input_tokens_seen": 3262624, "step": 9665 }, { "epoch": 7.472952086553323, "grad_norm": 4.34616756439209, "learning_rate": 9.11704600494982e-06, "loss": 0.3508, "num_input_tokens_seen": 3264256, "step": 9670 }, { "epoch": 7.476816074188562, "grad_norm": 1.636586308479309, "learning_rate": 9.091020441066559e-06, "loss": 0.3648, "num_input_tokens_seen": 3265824, "step": 9675 }, { "epoch": 7.4806800618238025, "grad_norm": 1.4778701066970825, "learning_rate": 9.06502381917671e-06, "loss": 0.4791, "num_input_tokens_seen": 3267296, "step": 9680 }, { "epoch": 7.484544049459042, "grad_norm": 3.3353161811828613, "learning_rate": 9.03905618657397e-06, "loss": 0.4246, "num_input_tokens_seen": 3269152, "step": 9685 }, { "epoch": 7.488408037094281, "grad_norm": 3.121706485748291, "learning_rate": 9.013117590499246e-06, "loss": 0.4098, "num_input_tokens_seen": 3270976, "step": 9690 }, { "epoch": 7.4922720247295205, "grad_norm": 2.354724645614624, "learning_rate": 8.987208078140671e-06, "loss": 0.3993, "num_input_tokens_seen": 3272736, "step": 9695 }, { "epoch": 7.496136012364761, "grad_norm": 1.8900988101959229, "learning_rate": 8.961327696633429e-06, "loss": 0.3418, "num_input_tokens_seen": 3274304, "step": 9700 }, { "epoch": 7.5, "grad_norm": 3.7214386463165283, "learning_rate": 8.935476493059736e-06, "loss": 0.3429, "num_input_tokens_seen": 3275904, "step": 9705 }, { "epoch": 7.5, "eval_loss": 0.5019899606704712, "eval_runtime": 11.5486, "eval_samples_per_second": 49.79, "eval_steps_per_second": 12.469, "num_input_tokens_seen": 3275904, "step": 9705 }, { "epoch": 7.503863987635239, "grad_norm": 2.173996925354004, "learning_rate": 8.909654514448704e-06, "loss": 0.388, "num_input_tokens_seen": 3277824, "step": 9710 }, { "epoch": 7.507727975270479, "grad_norm": 3.7044551372528076, "learning_rate": 8.883861807776297e-06, "loss": 0.5044, "num_input_tokens_seen": 3279552, "step": 9715 }, { "epoch": 7.511591962905719, "grad_norm": 3.3015599250793457, "learning_rate": 8.858098419965227e-06, "loss": 0.3384, "num_input_tokens_seen": 3281152, "step": 9720 }, { "epoch": 7.515455950540958, "grad_norm": 3.749021291732788, "learning_rate": 8.832364397884852e-06, "loss": 0.3736, "num_input_tokens_seen": 3282528, "step": 9725 }, { "epoch": 7.5193199381761975, "grad_norm": 2.212509870529175, "learning_rate": 8.80665978835111e-06, "loss": 0.3597, "num_input_tokens_seen": 3284416, "step": 9730 }, { "epoch": 7.523183925811438, "grad_norm": 2.328566312789917, "learning_rate": 8.78098463812644e-06, "loss": 0.4668, "num_input_tokens_seen": 3286304, "step": 9735 }, { "epoch": 7.527047913446677, "grad_norm": 1.5004889965057373, "learning_rate": 8.755338993919702e-06, "loss": 0.3343, "num_input_tokens_seen": 3287936, "step": 9740 }, { "epoch": 7.530911901081916, "grad_norm": 1.5239245891571045, "learning_rate": 8.729722902386044e-06, "loss": 0.357, "num_input_tokens_seen": 3289696, "step": 9745 }, { "epoch": 7.5347758887171565, "grad_norm": 3.740729331970215, "learning_rate": 8.704136410126867e-06, "loss": 0.3906, "num_input_tokens_seen": 3291328, "step": 9750 }, { "epoch": 7.538639876352396, "grad_norm": 1.9513980150222778, "learning_rate": 8.67857956368973e-06, "loss": 0.3947, "num_input_tokens_seen": 3292960, "step": 9755 }, { "epoch": 7.542503863987635, "grad_norm": 4.727505207061768, "learning_rate": 8.65305240956826e-06, "loss": 0.5166, "num_input_tokens_seen": 3294624, "step": 9760 }, { "epoch": 7.5463678516228745, "grad_norm": 1.7074130773544312, "learning_rate": 8.62755499420205e-06, "loss": 0.4007, "num_input_tokens_seen": 3296448, "step": 9765 }, { "epoch": 7.550231839258115, "grad_norm": 3.308751106262207, "learning_rate": 8.602087363976622e-06, "loss": 0.3665, "num_input_tokens_seen": 3298368, "step": 9770 }, { "epoch": 7.554095826893354, "grad_norm": 1.9069124460220337, "learning_rate": 8.576649565223272e-06, "loss": 0.3737, "num_input_tokens_seen": 3299936, "step": 9775 }, { "epoch": 7.557959814528593, "grad_norm": 3.4255731105804443, "learning_rate": 8.55124164421907e-06, "loss": 0.323, "num_input_tokens_seen": 3301664, "step": 9780 }, { "epoch": 7.561823802163833, "grad_norm": 5.141074180603027, "learning_rate": 8.525863647186683e-06, "loss": 0.394, "num_input_tokens_seen": 3303616, "step": 9785 }, { "epoch": 7.565687789799073, "grad_norm": 1.99637770652771, "learning_rate": 8.500515620294391e-06, "loss": 0.4119, "num_input_tokens_seen": 3305344, "step": 9790 }, { "epoch": 7.569551777434312, "grad_norm": 2.38132905960083, "learning_rate": 8.475197609655911e-06, "loss": 0.4051, "num_input_tokens_seen": 3306944, "step": 9795 }, { "epoch": 7.5734157650695515, "grad_norm": 2.9932432174682617, "learning_rate": 8.449909661330372e-06, "loss": 0.3369, "num_input_tokens_seen": 3308576, "step": 9800 }, { "epoch": 7.577279752704792, "grad_norm": 4.754586219787598, "learning_rate": 8.42465182132222e-06, "loss": 0.3903, "num_input_tokens_seen": 3310304, "step": 9805 }, { "epoch": 7.581143740340031, "grad_norm": 2.2513694763183594, "learning_rate": 8.399424135581108e-06, "loss": 0.3411, "num_input_tokens_seen": 3311840, "step": 9810 }, { "epoch": 7.58500772797527, "grad_norm": 2.334599018096924, "learning_rate": 8.374226650001835e-06, "loss": 0.3201, "num_input_tokens_seen": 3313344, "step": 9815 }, { "epoch": 7.58887171561051, "grad_norm": 4.07456636428833, "learning_rate": 8.349059410424274e-06, "loss": 0.4519, "num_input_tokens_seen": 3315008, "step": 9820 }, { "epoch": 7.59273570324575, "grad_norm": 1.3044357299804688, "learning_rate": 8.323922462633272e-06, "loss": 0.3539, "num_input_tokens_seen": 3316672, "step": 9825 }, { "epoch": 7.596599690880989, "grad_norm": 4.955140590667725, "learning_rate": 8.298815852358552e-06, "loss": 0.5218, "num_input_tokens_seen": 3318272, "step": 9830 }, { "epoch": 7.6004636785162285, "grad_norm": 2.262714147567749, "learning_rate": 8.273739625274648e-06, "loss": 0.4718, "num_input_tokens_seen": 3320096, "step": 9835 }, { "epoch": 7.604327666151468, "grad_norm": 3.6358449459075928, "learning_rate": 8.248693827000837e-06, "loss": 0.402, "num_input_tokens_seen": 3321888, "step": 9840 }, { "epoch": 7.608191653786708, "grad_norm": 2.6541318893432617, "learning_rate": 8.223678503101037e-06, "loss": 0.5672, "num_input_tokens_seen": 3323776, "step": 9845 }, { "epoch": 7.612055641421947, "grad_norm": 1.7500377893447876, "learning_rate": 8.198693699083707e-06, "loss": 0.4102, "num_input_tokens_seen": 3325632, "step": 9850 }, { "epoch": 7.615919629057187, "grad_norm": 1.964523196220398, "learning_rate": 8.173739460401797e-06, "loss": 0.3772, "num_input_tokens_seen": 3327104, "step": 9855 }, { "epoch": 7.619783616692427, "grad_norm": 3.151820659637451, "learning_rate": 8.148815832452664e-06, "loss": 0.5356, "num_input_tokens_seen": 3328800, "step": 9860 }, { "epoch": 7.623647604327666, "grad_norm": 2.7200474739074707, "learning_rate": 8.123922860577952e-06, "loss": 0.4271, "num_input_tokens_seen": 3330528, "step": 9865 }, { "epoch": 7.6275115919629055, "grad_norm": 2.7136292457580566, "learning_rate": 8.099060590063543e-06, "loss": 0.4091, "num_input_tokens_seen": 3332640, "step": 9870 }, { "epoch": 7.631375579598146, "grad_norm": 4.60878849029541, "learning_rate": 8.074229066139474e-06, "loss": 0.4263, "num_input_tokens_seen": 3334592, "step": 9875 }, { "epoch": 7.635239567233385, "grad_norm": 3.3110218048095703, "learning_rate": 8.049428333979853e-06, "loss": 0.3816, "num_input_tokens_seen": 3336192, "step": 9880 }, { "epoch": 7.639103554868624, "grad_norm": 2.394339084625244, "learning_rate": 8.024658438702754e-06, "loss": 0.3813, "num_input_tokens_seen": 3338112, "step": 9885 }, { "epoch": 7.642967542503864, "grad_norm": 4.033210277557373, "learning_rate": 7.999919425370147e-06, "loss": 0.4924, "num_input_tokens_seen": 3339968, "step": 9890 }, { "epoch": 7.646831530139104, "grad_norm": 1.1272213459014893, "learning_rate": 7.975211338987845e-06, "loss": 0.465, "num_input_tokens_seen": 3341664, "step": 9895 }, { "epoch": 7.650695517774343, "grad_norm": 2.4584059715270996, "learning_rate": 7.95053422450539e-06, "loss": 0.4264, "num_input_tokens_seen": 3343264, "step": 9900 }, { "epoch": 7.6545595054095825, "grad_norm": 1.5783307552337646, "learning_rate": 7.92588812681596e-06, "loss": 0.3335, "num_input_tokens_seen": 3344992, "step": 9905 }, { "epoch": 7.658423493044822, "grad_norm": 1.8588451147079468, "learning_rate": 7.901273090756339e-06, "loss": 0.431, "num_input_tokens_seen": 3346560, "step": 9910 }, { "epoch": 7.662287480680062, "grad_norm": 3.723414897918701, "learning_rate": 7.876689161106768e-06, "loss": 0.5137, "num_input_tokens_seen": 3348256, "step": 9915 }, { "epoch": 7.666151468315301, "grad_norm": 1.5488903522491455, "learning_rate": 7.85213638259093e-06, "loss": 0.3073, "num_input_tokens_seen": 3349984, "step": 9920 }, { "epoch": 7.670015455950541, "grad_norm": 2.9740357398986816, "learning_rate": 7.827614799875807e-06, "loss": 0.4471, "num_input_tokens_seen": 3351808, "step": 9925 }, { "epoch": 7.673879443585781, "grad_norm": 2.40726637840271, "learning_rate": 7.80312445757166e-06, "loss": 0.3962, "num_input_tokens_seen": 3353408, "step": 9930 }, { "epoch": 7.67774343122102, "grad_norm": 2.2615129947662354, "learning_rate": 7.77866540023189e-06, "loss": 0.789, "num_input_tokens_seen": 3355584, "step": 9935 }, { "epoch": 7.6816074188562595, "grad_norm": 1.7667981386184692, "learning_rate": 7.754237672352999e-06, "loss": 0.3656, "num_input_tokens_seen": 3357376, "step": 9940 }, { "epoch": 7.685471406491499, "grad_norm": 3.370790719985962, "learning_rate": 7.729841318374498e-06, "loss": 0.5061, "num_input_tokens_seen": 3358944, "step": 9945 }, { "epoch": 7.689335394126739, "grad_norm": 3.2846944332122803, "learning_rate": 7.705476382678808e-06, "loss": 0.4072, "num_input_tokens_seen": 3360416, "step": 9950 }, { "epoch": 7.693199381761978, "grad_norm": 1.7826608419418335, "learning_rate": 7.681142909591186e-06, "loss": 0.3382, "num_input_tokens_seen": 3362176, "step": 9955 }, { "epoch": 7.697063369397218, "grad_norm": 2.330399751663208, "learning_rate": 7.656840943379676e-06, "loss": 0.3832, "num_input_tokens_seen": 3363904, "step": 9960 }, { "epoch": 7.700927357032457, "grad_norm": 2.429669141769409, "learning_rate": 7.632570528254998e-06, "loss": 0.2964, "num_input_tokens_seen": 3365248, "step": 9965 }, { "epoch": 7.704791344667697, "grad_norm": 2.208981990814209, "learning_rate": 7.608331708370462e-06, "loss": 0.3702, "num_input_tokens_seen": 3367136, "step": 9970 }, { "epoch": 7.7086553323029365, "grad_norm": 3.8133530616760254, "learning_rate": 7.584124527821884e-06, "loss": 0.4786, "num_input_tokens_seen": 3368736, "step": 9975 }, { "epoch": 7.712519319938176, "grad_norm": 2.4777417182922363, "learning_rate": 7.559949030647581e-06, "loss": 0.3994, "num_input_tokens_seen": 3370272, "step": 9980 }, { "epoch": 7.716383307573416, "grad_norm": 2.30204439163208, "learning_rate": 7.535805260828169e-06, "loss": 0.4633, "num_input_tokens_seen": 3372064, "step": 9985 }, { "epoch": 7.720247295208655, "grad_norm": 3.1605591773986816, "learning_rate": 7.511693262286562e-06, "loss": 0.3726, "num_input_tokens_seen": 3373792, "step": 9990 }, { "epoch": 7.724111282843895, "grad_norm": 4.219874858856201, "learning_rate": 7.48761307888789e-06, "loss": 0.6382, "num_input_tokens_seen": 3375520, "step": 9995 }, { "epoch": 7.727975270479135, "grad_norm": 2.410836935043335, "learning_rate": 7.463564754439397e-06, "loss": 0.3907, "num_input_tokens_seen": 3377376, "step": 10000 }, { "epoch": 7.731839258114374, "grad_norm": 5.588906288146973, "learning_rate": 7.439548332690363e-06, "loss": 0.3829, "num_input_tokens_seen": 3379040, "step": 10005 }, { "epoch": 7.7357032457496135, "grad_norm": 2.999885320663452, "learning_rate": 7.415563857332025e-06, "loss": 0.6436, "num_input_tokens_seen": 3380800, "step": 10010 }, { "epoch": 7.739567233384853, "grad_norm": 2.3598828315734863, "learning_rate": 7.391611371997517e-06, "loss": 0.3915, "num_input_tokens_seen": 3382400, "step": 10015 }, { "epoch": 7.743431221020093, "grad_norm": 1.9081141948699951, "learning_rate": 7.367690920261777e-06, "loss": 0.4557, "num_input_tokens_seen": 3383936, "step": 10020 }, { "epoch": 7.747295208655332, "grad_norm": 3.187286376953125, "learning_rate": 7.34380254564144e-06, "loss": 0.3851, "num_input_tokens_seen": 3385664, "step": 10025 }, { "epoch": 7.751159196290572, "grad_norm": 2.135157823562622, "learning_rate": 7.319946291594826e-06, "loss": 0.3115, "num_input_tokens_seen": 3387104, "step": 10030 }, { "epoch": 7.755023183925811, "grad_norm": 4.586842060089111, "learning_rate": 7.29612220152178e-06, "loss": 0.615, "num_input_tokens_seen": 3388640, "step": 10035 }, { "epoch": 7.758887171561051, "grad_norm": 5.767529487609863, "learning_rate": 7.2723303187636645e-06, "loss": 0.4599, "num_input_tokens_seen": 3390368, "step": 10040 }, { "epoch": 7.7627511591962906, "grad_norm": 3.6441547870635986, "learning_rate": 7.248570686603229e-06, "loss": 0.4211, "num_input_tokens_seen": 3392288, "step": 10045 }, { "epoch": 7.76661514683153, "grad_norm": 2.0557754039764404, "learning_rate": 7.224843348264573e-06, "loss": 0.2792, "num_input_tokens_seen": 3393760, "step": 10050 }, { "epoch": 7.77047913446677, "grad_norm": 2.456514358520508, "learning_rate": 7.201148346913017e-06, "loss": 0.5156, "num_input_tokens_seen": 3395360, "step": 10055 }, { "epoch": 7.774343122102009, "grad_norm": 1.8686484098434448, "learning_rate": 7.1774857256550784e-06, "loss": 0.389, "num_input_tokens_seen": 3396896, "step": 10060 }, { "epoch": 7.778207109737249, "grad_norm": 2.3659634590148926, "learning_rate": 7.1538555275383694e-06, "loss": 0.3923, "num_input_tokens_seen": 3398496, "step": 10065 }, { "epoch": 7.782071097372488, "grad_norm": 2.090318202972412, "learning_rate": 7.130257795551501e-06, "loss": 0.3878, "num_input_tokens_seen": 3400064, "step": 10070 }, { "epoch": 7.785935085007728, "grad_norm": 3.088559150695801, "learning_rate": 7.106692572624016e-06, "loss": 0.4876, "num_input_tokens_seen": 3401696, "step": 10075 }, { "epoch": 7.789799072642968, "grad_norm": 7.531291961669922, "learning_rate": 7.083159901626335e-06, "loss": 0.526, "num_input_tokens_seen": 3403456, "step": 10080 }, { "epoch": 7.793663060278207, "grad_norm": 4.726618766784668, "learning_rate": 7.059659825369664e-06, "loss": 0.5268, "num_input_tokens_seen": 3405088, "step": 10085 }, { "epoch": 7.797527047913446, "grad_norm": 5.844390392303467, "learning_rate": 7.036192386605888e-06, "loss": 0.3745, "num_input_tokens_seen": 3406464, "step": 10090 }, { "epoch": 7.801391035548686, "grad_norm": 3.3183538913726807, "learning_rate": 7.012757628027517e-06, "loss": 0.5989, "num_input_tokens_seen": 3408192, "step": 10095 }, { "epoch": 7.805255023183926, "grad_norm": 2.697873592376709, "learning_rate": 6.9893555922676465e-06, "loss": 0.3928, "num_input_tokens_seen": 3409728, "step": 10100 }, { "epoch": 7.809119010819165, "grad_norm": 2.348828077316284, "learning_rate": 6.965986321899806e-06, "loss": 0.3243, "num_input_tokens_seen": 3411456, "step": 10105 }, { "epoch": 7.812982998454405, "grad_norm": 2.4893381595611572, "learning_rate": 6.942649859437916e-06, "loss": 0.3931, "num_input_tokens_seen": 3412896, "step": 10110 }, { "epoch": 7.816846986089645, "grad_norm": 2.620225429534912, "learning_rate": 6.919346247336231e-06, "loss": 0.3578, "num_input_tokens_seen": 3414336, "step": 10115 }, { "epoch": 7.820710973724884, "grad_norm": 2.041069746017456, "learning_rate": 6.896075527989243e-06, "loss": 0.3598, "num_input_tokens_seen": 3416128, "step": 10120 }, { "epoch": 7.824574961360124, "grad_norm": 3.521493911743164, "learning_rate": 6.8728377437315916e-06, "loss": 0.3405, "num_input_tokens_seen": 3417760, "step": 10125 }, { "epoch": 7.828438948995363, "grad_norm": 5.186826229095459, "learning_rate": 6.8496329368379965e-06, "loss": 0.5488, "num_input_tokens_seen": 3419424, "step": 10130 }, { "epoch": 7.832302936630603, "grad_norm": 1.7082468271255493, "learning_rate": 6.8264611495232e-06, "loss": 0.3334, "num_input_tokens_seen": 3421184, "step": 10135 }, { "epoch": 7.836166924265842, "grad_norm": 2.658151626586914, "learning_rate": 6.80332242394188e-06, "loss": 0.5248, "num_input_tokens_seen": 3422656, "step": 10140 }, { "epoch": 7.840030911901082, "grad_norm": 4.800131320953369, "learning_rate": 6.780216802188532e-06, "loss": 0.5281, "num_input_tokens_seen": 3424192, "step": 10145 }, { "epoch": 7.843894899536322, "grad_norm": 3.8047196865081787, "learning_rate": 6.757144326297476e-06, "loss": 0.4974, "num_input_tokens_seen": 3425888, "step": 10150 }, { "epoch": 7.847758887171561, "grad_norm": 3.8972294330596924, "learning_rate": 6.734105038242691e-06, "loss": 0.4543, "num_input_tokens_seen": 3427456, "step": 10155 }, { "epoch": 7.8516228748068, "grad_norm": 6.19030237197876, "learning_rate": 6.7110989799378076e-06, "loss": 0.4379, "num_input_tokens_seen": 3429120, "step": 10160 }, { "epoch": 7.85548686244204, "grad_norm": 1.674472689628601, "learning_rate": 6.6881261932359825e-06, "loss": 0.3481, "num_input_tokens_seen": 3430912, "step": 10165 }, { "epoch": 7.85935085007728, "grad_norm": 3.640812635421753, "learning_rate": 6.665186719929867e-06, "loss": 0.4582, "num_input_tokens_seen": 3432608, "step": 10170 }, { "epoch": 7.863214837712519, "grad_norm": 1.7687418460845947, "learning_rate": 6.642280601751483e-06, "loss": 0.3023, "num_input_tokens_seen": 3434240, "step": 10175 }, { "epoch": 7.867078825347759, "grad_norm": 2.0260345935821533, "learning_rate": 6.6194078803722e-06, "loss": 0.3195, "num_input_tokens_seen": 3435936, "step": 10180 }, { "epoch": 7.870942812982999, "grad_norm": 4.330557346343994, "learning_rate": 6.596568597402597e-06, "loss": 0.4029, "num_input_tokens_seen": 3437472, "step": 10185 }, { "epoch": 7.874806800618238, "grad_norm": 2.0488340854644775, "learning_rate": 6.57376279439246e-06, "loss": 0.3331, "num_input_tokens_seen": 3439168, "step": 10190 }, { "epoch": 7.878670788253477, "grad_norm": 2.382809638977051, "learning_rate": 6.550990512830629e-06, "loss": 0.3693, "num_input_tokens_seen": 3440672, "step": 10195 }, { "epoch": 7.882534775888717, "grad_norm": 3.9801318645477295, "learning_rate": 6.5282517941449876e-06, "loss": 0.3694, "num_input_tokens_seen": 3442432, "step": 10200 }, { "epoch": 7.886398763523957, "grad_norm": 2.184739589691162, "learning_rate": 6.5055466797023625e-06, "loss": 0.4045, "num_input_tokens_seen": 3443904, "step": 10205 }, { "epoch": 7.890262751159196, "grad_norm": 3.528862237930298, "learning_rate": 6.482875210808426e-06, "loss": 0.5348, "num_input_tokens_seen": 3445728, "step": 10210 }, { "epoch": 7.894126738794435, "grad_norm": 5.425795555114746, "learning_rate": 6.46023742870765e-06, "loss": 0.4598, "num_input_tokens_seen": 3447552, "step": 10215 }, { "epoch": 7.897990726429676, "grad_norm": 5.208679676055908, "learning_rate": 6.437633374583226e-06, "loss": 0.5992, "num_input_tokens_seen": 3449184, "step": 10220 }, { "epoch": 7.901854714064915, "grad_norm": 2.0160460472106934, "learning_rate": 6.415063089557e-06, "loss": 0.3984, "num_input_tokens_seen": 3451008, "step": 10225 }, { "epoch": 7.905718701700154, "grad_norm": 1.4675148725509644, "learning_rate": 6.392526614689359e-06, "loss": 0.4507, "num_input_tokens_seen": 3452704, "step": 10230 }, { "epoch": 7.909582689335394, "grad_norm": 3.5498476028442383, "learning_rate": 6.370023990979182e-06, "loss": 0.4163, "num_input_tokens_seen": 3454336, "step": 10235 }, { "epoch": 7.913446676970634, "grad_norm": 2.5818939208984375, "learning_rate": 6.347555259363802e-06, "loss": 0.4054, "num_input_tokens_seen": 3456000, "step": 10240 }, { "epoch": 7.917310664605873, "grad_norm": 1.7408181428909302, "learning_rate": 6.325120460718858e-06, "loss": 0.3832, "num_input_tokens_seen": 3457792, "step": 10245 }, { "epoch": 7.921174652241113, "grad_norm": 2.363948106765747, "learning_rate": 6.302719635858259e-06, "loss": 0.5335, "num_input_tokens_seen": 3459456, "step": 10250 }, { "epoch": 7.925038639876353, "grad_norm": 2.5908329486846924, "learning_rate": 6.280352825534125e-06, "loss": 0.4512, "num_input_tokens_seen": 3461152, "step": 10255 }, { "epoch": 7.928902627511592, "grad_norm": 3.075213670730591, "learning_rate": 6.258020070436696e-06, "loss": 0.3731, "num_input_tokens_seen": 3462944, "step": 10260 }, { "epoch": 7.932766615146831, "grad_norm": 3.288975715637207, "learning_rate": 6.2357214111942426e-06, "loss": 0.3526, "num_input_tokens_seen": 3464448, "step": 10265 }, { "epoch": 7.936630602782071, "grad_norm": 2.6713340282440186, "learning_rate": 6.213456888373012e-06, "loss": 0.5731, "num_input_tokens_seen": 3466016, "step": 10270 }, { "epoch": 7.940494590417311, "grad_norm": 1.8247013092041016, "learning_rate": 6.191226542477155e-06, "loss": 0.4284, "num_input_tokens_seen": 3467680, "step": 10275 }, { "epoch": 7.94435857805255, "grad_norm": 2.2879669666290283, "learning_rate": 6.169030413948651e-06, "loss": 0.4584, "num_input_tokens_seen": 3469344, "step": 10280 }, { "epoch": 7.948222565687789, "grad_norm": 2.860065221786499, "learning_rate": 6.1468685431672165e-06, "loss": 0.2848, "num_input_tokens_seen": 3470976, "step": 10285 }, { "epoch": 7.95208655332303, "grad_norm": 2.217526912689209, "learning_rate": 6.1247409704502626e-06, "loss": 0.365, "num_input_tokens_seen": 3472608, "step": 10290 }, { "epoch": 7.955950540958269, "grad_norm": 2.1700079441070557, "learning_rate": 6.10264773605278e-06, "loss": 0.3652, "num_input_tokens_seen": 3474368, "step": 10295 }, { "epoch": 7.959814528593508, "grad_norm": 2.582806348800659, "learning_rate": 6.080588880167323e-06, "loss": 0.3993, "num_input_tokens_seen": 3476000, "step": 10300 }, { "epoch": 7.9636785162287484, "grad_norm": 1.530856728553772, "learning_rate": 6.058564442923873e-06, "loss": 0.5395, "num_input_tokens_seen": 3477536, "step": 10305 }, { "epoch": 7.967542503863988, "grad_norm": 2.691877841949463, "learning_rate": 6.036574464389816e-06, "loss": 0.5089, "num_input_tokens_seen": 3479232, "step": 10310 }, { "epoch": 7.971406491499227, "grad_norm": 2.3611013889312744, "learning_rate": 6.0146189845698366e-06, "loss": 0.3414, "num_input_tokens_seen": 3480928, "step": 10315 }, { "epoch": 7.975270479134466, "grad_norm": 3.4590344429016113, "learning_rate": 5.992698043405867e-06, "loss": 0.3523, "num_input_tokens_seen": 3482432, "step": 10320 }, { "epoch": 7.979134466769707, "grad_norm": 1.5800747871398926, "learning_rate": 5.970811680777017e-06, "loss": 0.3016, "num_input_tokens_seen": 3484480, "step": 10325 }, { "epoch": 7.982998454404946, "grad_norm": 5.198740482330322, "learning_rate": 5.948959936499463e-06, "loss": 0.3973, "num_input_tokens_seen": 3486176, "step": 10330 }, { "epoch": 7.986862442040185, "grad_norm": 4.349247455596924, "learning_rate": 5.927142850326414e-06, "loss": 0.4729, "num_input_tokens_seen": 3488032, "step": 10335 }, { "epoch": 7.990726429675425, "grad_norm": 3.02638840675354, "learning_rate": 5.90536046194804e-06, "loss": 0.3301, "num_input_tokens_seen": 3489504, "step": 10340 }, { "epoch": 7.994590417310665, "grad_norm": 2.1727259159088135, "learning_rate": 5.883612810991382e-06, "loss": 0.4091, "num_input_tokens_seen": 3491584, "step": 10345 }, { "epoch": 7.998454404945904, "grad_norm": 4.164893627166748, "learning_rate": 5.8618999370202795e-06, "loss": 0.4286, "num_input_tokens_seen": 3493120, "step": 10350 }, { "epoch": 8.0, "eval_loss": 0.5119831562042236, "eval_runtime": 11.4297, "eval_samples_per_second": 50.308, "eval_steps_per_second": 12.599, "num_input_tokens_seen": 3493600, "step": 10352 }, { "epoch": 8.002318392581143, "grad_norm": 2.4714415073394775, "learning_rate": 5.840221879535293e-06, "loss": 0.4108, "num_input_tokens_seen": 3494592, "step": 10355 }, { "epoch": 8.006182380216384, "grad_norm": 2.3069677352905273, "learning_rate": 5.818578677973688e-06, "loss": 0.3519, "num_input_tokens_seen": 3496128, "step": 10360 }, { "epoch": 8.010046367851622, "grad_norm": 3.2760424613952637, "learning_rate": 5.796970371709276e-06, "loss": 0.429, "num_input_tokens_seen": 3497952, "step": 10365 }, { "epoch": 8.013910355486862, "grad_norm": 2.921566963195801, "learning_rate": 5.775397000052396e-06, "loss": 0.3379, "num_input_tokens_seen": 3499584, "step": 10370 }, { "epoch": 8.017774343122102, "grad_norm": 1.8068013191223145, "learning_rate": 5.753858602249842e-06, "loss": 0.3974, "num_input_tokens_seen": 3501536, "step": 10375 }, { "epoch": 8.021638330757341, "grad_norm": 2.8224196434020996, "learning_rate": 5.732355217484786e-06, "loss": 0.3256, "num_input_tokens_seen": 3503008, "step": 10380 }, { "epoch": 8.025502318392581, "grad_norm": 1.8929178714752197, "learning_rate": 5.710886884876693e-06, "loss": 0.3857, "num_input_tokens_seen": 3504864, "step": 10385 }, { "epoch": 8.029366306027821, "grad_norm": 5.892085552215576, "learning_rate": 5.689453643481252e-06, "loss": 0.4145, "num_input_tokens_seen": 3506464, "step": 10390 }, { "epoch": 8.03323029366306, "grad_norm": 2.9718658924102783, "learning_rate": 5.668055532290334e-06, "loss": 0.362, "num_input_tokens_seen": 3508000, "step": 10395 }, { "epoch": 8.0370942812983, "grad_norm": 4.003775596618652, "learning_rate": 5.6466925902319e-06, "loss": 0.4198, "num_input_tokens_seen": 3509504, "step": 10400 }, { "epoch": 8.04095826893354, "grad_norm": 2.7956979274749756, "learning_rate": 5.625364856169901e-06, "loss": 0.3679, "num_input_tokens_seen": 3511104, "step": 10405 }, { "epoch": 8.044822256568779, "grad_norm": 4.315577507019043, "learning_rate": 5.604072368904279e-06, "loss": 0.384, "num_input_tokens_seen": 3512800, "step": 10410 }, { "epoch": 8.048686244204019, "grad_norm": 3.8754422664642334, "learning_rate": 5.582815167170816e-06, "loss": 0.4688, "num_input_tokens_seen": 3514624, "step": 10415 }, { "epoch": 8.052550231839259, "grad_norm": 1.90727698802948, "learning_rate": 5.5615932896411285e-06, "loss": 0.6794, "num_input_tokens_seen": 3516320, "step": 10420 }, { "epoch": 8.056414219474497, "grad_norm": 1.5020394325256348, "learning_rate": 5.540406774922549e-06, "loss": 0.3573, "num_input_tokens_seen": 3518016, "step": 10425 }, { "epoch": 8.060278207109738, "grad_norm": 2.324713945388794, "learning_rate": 5.519255661558104e-06, "loss": 0.4958, "num_input_tokens_seen": 3519616, "step": 10430 }, { "epoch": 8.064142194744976, "grad_norm": 3.004500150680542, "learning_rate": 5.498139988026385e-06, "loss": 0.3359, "num_input_tokens_seen": 3521024, "step": 10435 }, { "epoch": 8.068006182380216, "grad_norm": 3.028193473815918, "learning_rate": 5.477059792741532e-06, "loss": 0.5773, "num_input_tokens_seen": 3522816, "step": 10440 }, { "epoch": 8.071870170015456, "grad_norm": 1.3574358224868774, "learning_rate": 5.456015114053148e-06, "loss": 0.5339, "num_input_tokens_seen": 3524384, "step": 10445 }, { "epoch": 8.075734157650695, "grad_norm": 2.771818161010742, "learning_rate": 5.435005990246203e-06, "loss": 0.3239, "num_input_tokens_seen": 3525824, "step": 10450 }, { "epoch": 8.079598145285935, "grad_norm": 1.9324572086334229, "learning_rate": 5.414032459540985e-06, "loss": 0.3533, "num_input_tokens_seen": 3527520, "step": 10455 }, { "epoch": 8.083462132921175, "grad_norm": 1.5582910776138306, "learning_rate": 5.393094560093051e-06, "loss": 0.3598, "num_input_tokens_seen": 3529184, "step": 10460 }, { "epoch": 8.087326120556414, "grad_norm": 3.5524046421051025, "learning_rate": 5.372192329993128e-06, "loss": 0.6167, "num_input_tokens_seen": 3530880, "step": 10465 }, { "epoch": 8.091190108191654, "grad_norm": 2.401829481124878, "learning_rate": 5.351325807267047e-06, "loss": 0.4509, "num_input_tokens_seen": 3532736, "step": 10470 }, { "epoch": 8.095054095826894, "grad_norm": 1.8269903659820557, "learning_rate": 5.33049502987566e-06, "loss": 0.321, "num_input_tokens_seen": 3534592, "step": 10475 }, { "epoch": 8.098918083462133, "grad_norm": 2.1082043647766113, "learning_rate": 5.309700035714845e-06, "loss": 0.451, "num_input_tokens_seen": 3536384, "step": 10480 }, { "epoch": 8.102782071097373, "grad_norm": 2.2108561992645264, "learning_rate": 5.288940862615335e-06, "loss": 0.6534, "num_input_tokens_seen": 3538144, "step": 10485 }, { "epoch": 8.106646058732611, "grad_norm": 3.908557891845703, "learning_rate": 5.268217548342702e-06, "loss": 0.4571, "num_input_tokens_seen": 3540064, "step": 10490 }, { "epoch": 8.110510046367851, "grad_norm": 2.5133674144744873, "learning_rate": 5.247530130597303e-06, "loss": 0.3489, "num_input_tokens_seen": 3541568, "step": 10495 }, { "epoch": 8.114374034003092, "grad_norm": 4.029519557952881, "learning_rate": 5.22687864701418e-06, "loss": 0.5231, "num_input_tokens_seen": 3543424, "step": 10500 }, { "epoch": 8.11823802163833, "grad_norm": 2.8610763549804688, "learning_rate": 5.206263135162997e-06, "loss": 0.3523, "num_input_tokens_seen": 3545280, "step": 10505 }, { "epoch": 8.12210200927357, "grad_norm": 2.5223186016082764, "learning_rate": 5.185683632547983e-06, "loss": 0.7724, "num_input_tokens_seen": 3547040, "step": 10510 }, { "epoch": 8.12596599690881, "grad_norm": 5.750840187072754, "learning_rate": 5.165140176607855e-06, "loss": 0.5257, "num_input_tokens_seen": 3548704, "step": 10515 }, { "epoch": 8.129829984544049, "grad_norm": 2.912219524383545, "learning_rate": 5.144632804715768e-06, "loss": 0.4159, "num_input_tokens_seen": 3550336, "step": 10520 }, { "epoch": 8.13369397217929, "grad_norm": 1.8155642747879028, "learning_rate": 5.124161554179205e-06, "loss": 0.3534, "num_input_tokens_seen": 3551936, "step": 10525 }, { "epoch": 8.13755795981453, "grad_norm": 2.15326189994812, "learning_rate": 5.103726462239966e-06, "loss": 0.3458, "num_input_tokens_seen": 3553440, "step": 10530 }, { "epoch": 8.141421947449768, "grad_norm": 4.054314136505127, "learning_rate": 5.083327566074042e-06, "loss": 0.3594, "num_input_tokens_seen": 3555072, "step": 10535 }, { "epoch": 8.145285935085008, "grad_norm": 2.0219459533691406, "learning_rate": 5.0629649027916e-06, "loss": 0.3372, "num_input_tokens_seen": 3556928, "step": 10540 }, { "epoch": 8.149149922720248, "grad_norm": 1.9022960662841797, "learning_rate": 5.04263850943687e-06, "loss": 0.4648, "num_input_tokens_seen": 3558624, "step": 10545 }, { "epoch": 8.153013910355487, "grad_norm": 2.699125289916992, "learning_rate": 5.022348422988121e-06, "loss": 0.3281, "num_input_tokens_seen": 3560256, "step": 10550 }, { "epoch": 8.156877897990727, "grad_norm": 1.5513676404953003, "learning_rate": 5.00209468035755e-06, "loss": 0.4041, "num_input_tokens_seen": 3561632, "step": 10555 }, { "epoch": 8.160741885625965, "grad_norm": 2.2729995250701904, "learning_rate": 4.981877318391254e-06, "loss": 0.4181, "num_input_tokens_seen": 3563360, "step": 10560 }, { "epoch": 8.164605873261205, "grad_norm": 1.6029103994369507, "learning_rate": 4.961696373869132e-06, "loss": 0.3271, "num_input_tokens_seen": 3564960, "step": 10565 }, { "epoch": 8.168469860896446, "grad_norm": 3.8041954040527344, "learning_rate": 4.941551883504844e-06, "loss": 0.3984, "num_input_tokens_seen": 3566656, "step": 10570 }, { "epoch": 8.172333848531684, "grad_norm": 1.848907709121704, "learning_rate": 4.9214438839457174e-06, "loss": 0.4864, "num_input_tokens_seen": 3568352, "step": 10575 }, { "epoch": 8.176197836166924, "grad_norm": 2.127819776535034, "learning_rate": 4.901372411772706e-06, "loss": 0.3905, "num_input_tokens_seen": 3570080, "step": 10580 }, { "epoch": 8.180061823802165, "grad_norm": 2.222987174987793, "learning_rate": 4.8813375035003165e-06, "loss": 0.3694, "num_input_tokens_seen": 3571616, "step": 10585 }, { "epoch": 8.183925811437403, "grad_norm": 2.488675594329834, "learning_rate": 4.861339195576522e-06, "loss": 0.5152, "num_input_tokens_seen": 3573408, "step": 10590 }, { "epoch": 8.187789799072643, "grad_norm": 3.4817850589752197, "learning_rate": 4.841377524382715e-06, "loss": 0.37, "num_input_tokens_seen": 3575264, "step": 10595 }, { "epoch": 8.191653786707883, "grad_norm": 3.2797694206237793, "learning_rate": 4.821452526233647e-06, "loss": 0.5023, "num_input_tokens_seen": 3577024, "step": 10600 }, { "epoch": 8.195517774343122, "grad_norm": 4.8245744705200195, "learning_rate": 4.801564237377351e-06, "loss": 0.3899, "num_input_tokens_seen": 3578816, "step": 10605 }, { "epoch": 8.199381761978362, "grad_norm": 3.846759080886841, "learning_rate": 4.7817126939950745e-06, "loss": 0.5876, "num_input_tokens_seen": 3580480, "step": 10610 }, { "epoch": 8.2032457496136, "grad_norm": 3.0043540000915527, "learning_rate": 4.761897932201201e-06, "loss": 0.3959, "num_input_tokens_seen": 3582112, "step": 10615 }, { "epoch": 8.20710973724884, "grad_norm": 4.031718730926514, "learning_rate": 4.742119988043237e-06, "loss": 0.6575, "num_input_tokens_seen": 3583744, "step": 10620 }, { "epoch": 8.21097372488408, "grad_norm": 1.5009039640426636, "learning_rate": 4.722378897501684e-06, "loss": 0.3617, "num_input_tokens_seen": 3585184, "step": 10625 }, { "epoch": 8.21483771251932, "grad_norm": 4.194557189941406, "learning_rate": 4.702674696489986e-06, "loss": 0.3165, "num_input_tokens_seen": 3587360, "step": 10630 }, { "epoch": 8.21870170015456, "grad_norm": 1.8200429677963257, "learning_rate": 4.68300742085451e-06, "loss": 0.3107, "num_input_tokens_seen": 3589024, "step": 10635 }, { "epoch": 8.2225656877898, "grad_norm": 1.9723803997039795, "learning_rate": 4.663377106374431e-06, "loss": 0.2955, "num_input_tokens_seen": 3590752, "step": 10640 }, { "epoch": 8.226429675425038, "grad_norm": 2.9578683376312256, "learning_rate": 4.643783788761677e-06, "loss": 0.4829, "num_input_tokens_seen": 3592608, "step": 10645 }, { "epoch": 8.230293663060278, "grad_norm": 1.748741626739502, "learning_rate": 4.624227503660875e-06, "loss": 0.3197, "num_input_tokens_seen": 3594304, "step": 10650 }, { "epoch": 8.234157650695519, "grad_norm": 2.098125696182251, "learning_rate": 4.604708286649284e-06, "loss": 0.423, "num_input_tokens_seen": 3595936, "step": 10655 }, { "epoch": 8.238021638330757, "grad_norm": 2.864014148712158, "learning_rate": 4.585226173236737e-06, "loss": 0.386, "num_input_tokens_seen": 3597728, "step": 10660 }, { "epoch": 8.241885625965997, "grad_norm": 3.822622537612915, "learning_rate": 4.565781198865543e-06, "loss": 0.4404, "num_input_tokens_seen": 3599392, "step": 10665 }, { "epoch": 8.245749613601237, "grad_norm": 2.086388349533081, "learning_rate": 4.546373398910475e-06, "loss": 0.3325, "num_input_tokens_seen": 3601184, "step": 10670 }, { "epoch": 8.249613601236476, "grad_norm": 3.6695127487182617, "learning_rate": 4.527002808678649e-06, "loss": 0.422, "num_input_tokens_seen": 3602880, "step": 10675 }, { "epoch": 8.253477588871716, "grad_norm": 3.956144094467163, "learning_rate": 4.507669463409519e-06, "loss": 0.4205, "num_input_tokens_seen": 3604704, "step": 10680 }, { "epoch": 8.257341576506954, "grad_norm": 3.2939915657043457, "learning_rate": 4.488373398274751e-06, "loss": 0.5256, "num_input_tokens_seen": 3606400, "step": 10685 }, { "epoch": 8.261205564142195, "grad_norm": 3.392853260040283, "learning_rate": 4.469114648378217e-06, "loss": 0.47, "num_input_tokens_seen": 3607872, "step": 10690 }, { "epoch": 8.265069551777435, "grad_norm": 2.83211088180542, "learning_rate": 4.449893248755885e-06, "loss": 0.3828, "num_input_tokens_seen": 3609632, "step": 10695 }, { "epoch": 8.268933539412673, "grad_norm": 3.452181100845337, "learning_rate": 4.430709234375771e-06, "loss": 0.3993, "num_input_tokens_seen": 3611200, "step": 10700 }, { "epoch": 8.272797527047913, "grad_norm": 3.4486305713653564, "learning_rate": 4.4115626401379175e-06, "loss": 0.4116, "num_input_tokens_seen": 3613024, "step": 10705 }, { "epoch": 8.276661514683154, "grad_norm": 3.0359511375427246, "learning_rate": 4.392453500874244e-06, "loss": 0.3966, "num_input_tokens_seen": 3614368, "step": 10710 }, { "epoch": 8.280525502318392, "grad_norm": 5.227890968322754, "learning_rate": 4.373381851348549e-06, "loss": 0.8085, "num_input_tokens_seen": 3616384, "step": 10715 }, { "epoch": 8.284389489953632, "grad_norm": 2.099980592727661, "learning_rate": 4.3543477262564355e-06, "loss": 0.3803, "num_input_tokens_seen": 3617984, "step": 10720 }, { "epoch": 8.288253477588873, "grad_norm": 2.491089344024658, "learning_rate": 4.335351160225243e-06, "loss": 0.3598, "num_input_tokens_seen": 3619776, "step": 10725 }, { "epoch": 8.292117465224111, "grad_norm": 3.5521655082702637, "learning_rate": 4.316392187813967e-06, "loss": 0.3893, "num_input_tokens_seen": 3621408, "step": 10730 }, { "epoch": 8.295981452859351, "grad_norm": 3.2083287239074707, "learning_rate": 4.297470843513216e-06, "loss": 0.3762, "num_input_tokens_seen": 3623136, "step": 10735 }, { "epoch": 8.29984544049459, "grad_norm": 1.4046692848205566, "learning_rate": 4.278587161745154e-06, "loss": 0.5965, "num_input_tokens_seen": 3624928, "step": 10740 }, { "epoch": 8.30370942812983, "grad_norm": 4.14080810546875, "learning_rate": 4.25974117686343e-06, "loss": 0.3908, "num_input_tokens_seen": 3626528, "step": 10745 }, { "epoch": 8.30757341576507, "grad_norm": 1.8381528854370117, "learning_rate": 4.240932923153093e-06, "loss": 0.5198, "num_input_tokens_seen": 3628192, "step": 10750 }, { "epoch": 8.311437403400308, "grad_norm": 3.6159591674804688, "learning_rate": 4.222162434830582e-06, "loss": 0.5385, "num_input_tokens_seen": 3629632, "step": 10755 }, { "epoch": 8.315301391035549, "grad_norm": 4.0893659591674805, "learning_rate": 4.2034297460436e-06, "loss": 0.3939, "num_input_tokens_seen": 3631392, "step": 10760 }, { "epoch": 8.319165378670789, "grad_norm": 3.2511396408081055, "learning_rate": 4.184734890871112e-06, "loss": 0.3886, "num_input_tokens_seen": 3633024, "step": 10765 }, { "epoch": 8.323029366306027, "grad_norm": 2.491684675216675, "learning_rate": 4.166077903323226e-06, "loss": 0.2952, "num_input_tokens_seen": 3634560, "step": 10770 }, { "epoch": 8.326893353941268, "grad_norm": 2.431828022003174, "learning_rate": 4.147458817341196e-06, "loss": 0.4971, "num_input_tokens_seen": 3636448, "step": 10775 }, { "epoch": 8.330757341576508, "grad_norm": 5.320882320404053, "learning_rate": 4.12887766679729e-06, "loss": 0.458, "num_input_tokens_seen": 3638080, "step": 10780 }, { "epoch": 8.334621329211746, "grad_norm": 4.053700923919678, "learning_rate": 4.110334485494782e-06, "loss": 0.3926, "num_input_tokens_seen": 3639776, "step": 10785 }, { "epoch": 8.338485316846986, "grad_norm": 3.4381229877471924, "learning_rate": 4.091829307167877e-06, "loss": 0.3205, "num_input_tokens_seen": 3641440, "step": 10790 }, { "epoch": 8.342349304482227, "grad_norm": 2.6003925800323486, "learning_rate": 4.0733621654816286e-06, "loss": 0.3837, "num_input_tokens_seen": 3643104, "step": 10795 }, { "epoch": 8.346213292117465, "grad_norm": 6.354546546936035, "learning_rate": 4.054933094031893e-06, "loss": 0.4269, "num_input_tokens_seen": 3644992, "step": 10800 }, { "epoch": 8.350077279752705, "grad_norm": 2.1376519203186035, "learning_rate": 4.036542126345283e-06, "loss": 0.3791, "num_input_tokens_seen": 3646656, "step": 10805 }, { "epoch": 8.353941267387944, "grad_norm": 3.269439220428467, "learning_rate": 4.018189295879085e-06, "loss": 0.3177, "num_input_tokens_seen": 3648032, "step": 10810 }, { "epoch": 8.357805255023184, "grad_norm": 3.402838945388794, "learning_rate": 3.9998746360212e-06, "loss": 0.5201, "num_input_tokens_seen": 3649856, "step": 10815 }, { "epoch": 8.361669242658424, "grad_norm": 2.629776954650879, "learning_rate": 3.981598180090082e-06, "loss": 0.323, "num_input_tokens_seen": 3651648, "step": 10820 }, { "epoch": 8.365533230293662, "grad_norm": 2.8184189796447754, "learning_rate": 3.963359961334714e-06, "loss": 0.3556, "num_input_tokens_seen": 3653568, "step": 10825 }, { "epoch": 8.369397217928903, "grad_norm": 2.5979747772216797, "learning_rate": 3.94516001293449e-06, "loss": 0.3821, "num_input_tokens_seen": 3655488, "step": 10830 }, { "epoch": 8.373261205564143, "grad_norm": 1.7496354579925537, "learning_rate": 3.9269983679991765e-06, "loss": 0.3102, "num_input_tokens_seen": 3657344, "step": 10835 }, { "epoch": 8.377125193199381, "grad_norm": 2.301877737045288, "learning_rate": 3.9088750595688786e-06, "loss": 0.3523, "num_input_tokens_seen": 3658976, "step": 10840 }, { "epoch": 8.380989180834622, "grad_norm": 2.785252809524536, "learning_rate": 3.890790120613955e-06, "loss": 0.428, "num_input_tokens_seen": 3660704, "step": 10845 }, { "epoch": 8.384853168469862, "grad_norm": 2.8665072917938232, "learning_rate": 3.872743584034952e-06, "loss": 0.3814, "num_input_tokens_seen": 3662208, "step": 10850 }, { "epoch": 8.3887171561051, "grad_norm": 2.4041948318481445, "learning_rate": 3.854735482662547e-06, "loss": 0.5776, "num_input_tokens_seen": 3663936, "step": 10855 }, { "epoch": 8.39258114374034, "grad_norm": 2.3379931449890137, "learning_rate": 3.8367658492575145e-06, "loss": 0.336, "num_input_tokens_seen": 3665504, "step": 10860 }, { "epoch": 8.396445131375579, "grad_norm": 2.406830310821533, "learning_rate": 3.818834716510644e-06, "loss": 0.3304, "num_input_tokens_seen": 3666944, "step": 10865 }, { "epoch": 8.400309119010819, "grad_norm": 3.9672744274139404, "learning_rate": 3.800942117042669e-06, "loss": 0.37, "num_input_tokens_seen": 3668640, "step": 10870 }, { "epoch": 8.40417310664606, "grad_norm": 2.0406599044799805, "learning_rate": 3.7830880834042375e-06, "loss": 0.3071, "num_input_tokens_seen": 3670144, "step": 10875 }, { "epoch": 8.408037094281298, "grad_norm": 2.1534011363983154, "learning_rate": 3.765272648075821e-06, "loss": 0.5299, "num_input_tokens_seen": 3671808, "step": 10880 }, { "epoch": 8.411901081916538, "grad_norm": 3.24815034866333, "learning_rate": 3.7474958434676994e-06, "loss": 0.3452, "num_input_tokens_seen": 3673408, "step": 10885 }, { "epoch": 8.415765069551778, "grad_norm": 2.7671926021575928, "learning_rate": 3.7297577019198393e-06, "loss": 0.3529, "num_input_tokens_seen": 3674784, "step": 10890 }, { "epoch": 8.419629057187016, "grad_norm": 3.2292258739471436, "learning_rate": 3.7120582557019056e-06, "loss": 0.4141, "num_input_tokens_seen": 3676736, "step": 10895 }, { "epoch": 8.423493044822257, "grad_norm": 1.9701787233352661, "learning_rate": 3.6943975370131388e-06, "loss": 0.33, "num_input_tokens_seen": 3678272, "step": 10900 }, { "epoch": 8.427357032457497, "grad_norm": 7.934124946594238, "learning_rate": 3.6767755779823472e-06, "loss": 0.5786, "num_input_tokens_seen": 3680128, "step": 10905 }, { "epoch": 8.431221020092735, "grad_norm": 2.1799187660217285, "learning_rate": 3.6591924106678057e-06, "loss": 0.3477, "num_input_tokens_seen": 3681792, "step": 10910 }, { "epoch": 8.435085007727976, "grad_norm": 1.800135612487793, "learning_rate": 3.6416480670572433e-06, "loss": 0.3363, "num_input_tokens_seen": 3683296, "step": 10915 }, { "epoch": 8.438948995363216, "grad_norm": 2.4710590839385986, "learning_rate": 3.6241425790677353e-06, "loss": 0.3095, "num_input_tokens_seen": 3685088, "step": 10920 }, { "epoch": 8.442812982998454, "grad_norm": 1.7486380338668823, "learning_rate": 3.6066759785456845e-06, "loss": 0.3257, "num_input_tokens_seen": 3686944, "step": 10925 }, { "epoch": 8.446676970633694, "grad_norm": 3.887223958969116, "learning_rate": 3.5892482972667536e-06, "loss": 0.4981, "num_input_tokens_seen": 3688768, "step": 10930 }, { "epoch": 8.450540958268933, "grad_norm": 2.715413808822632, "learning_rate": 3.571859566935787e-06, "loss": 0.5887, "num_input_tokens_seen": 3690432, "step": 10935 }, { "epoch": 8.454404945904173, "grad_norm": 3.34333872795105, "learning_rate": 3.5545098191867714e-06, "loss": 0.3709, "num_input_tokens_seen": 3692000, "step": 10940 }, { "epoch": 8.458268933539413, "grad_norm": 2.017627000808716, "learning_rate": 3.5371990855827817e-06, "loss": 0.437, "num_input_tokens_seen": 3694016, "step": 10945 }, { "epoch": 8.462132921174652, "grad_norm": 4.78721284866333, "learning_rate": 3.519927397615927e-06, "loss": 0.4472, "num_input_tokens_seen": 3695616, "step": 10950 }, { "epoch": 8.465996908809892, "grad_norm": 1.5930720567703247, "learning_rate": 3.502694786707264e-06, "loss": 0.6257, "num_input_tokens_seen": 3697056, "step": 10955 }, { "epoch": 8.469860896445132, "grad_norm": 4.5601091384887695, "learning_rate": 3.4855012842067547e-06, "loss": 0.4631, "num_input_tokens_seen": 3698848, "step": 10960 }, { "epoch": 8.47372488408037, "grad_norm": 2.3582167625427246, "learning_rate": 3.4683469213932534e-06, "loss": 0.3493, "num_input_tokens_seen": 3700448, "step": 10965 }, { "epoch": 8.47758887171561, "grad_norm": 1.8430407047271729, "learning_rate": 3.451231729474369e-06, "loss": 0.7085, "num_input_tokens_seen": 3702080, "step": 10970 }, { "epoch": 8.481452859350851, "grad_norm": 5.280860424041748, "learning_rate": 3.434155739586464e-06, "loss": 0.6172, "num_input_tokens_seen": 3703808, "step": 10975 }, { "epoch": 8.48531684698609, "grad_norm": 1.7467107772827148, "learning_rate": 3.417118982794584e-06, "loss": 0.496, "num_input_tokens_seen": 3705472, "step": 10980 }, { "epoch": 8.48918083462133, "grad_norm": 3.2739968299865723, "learning_rate": 3.4001214900924144e-06, "loss": 0.3303, "num_input_tokens_seen": 3707072, "step": 10985 }, { "epoch": 8.493044822256568, "grad_norm": 2.3903024196624756, "learning_rate": 3.383163292402186e-06, "loss": 0.4232, "num_input_tokens_seen": 3708928, "step": 10990 }, { "epoch": 8.496908809891808, "grad_norm": 1.5319230556488037, "learning_rate": 3.3662444205746545e-06, "loss": 0.4575, "num_input_tokens_seen": 3710976, "step": 10995 }, { "epoch": 8.5, "eval_loss": 0.5040460824966431, "eval_runtime": 11.4851, "eval_samples_per_second": 50.065, "eval_steps_per_second": 12.538, "num_input_tokens_seen": 3712320, "step": 10999 }, { "epoch": 8.500772797527048, "grad_norm": 4.072979927062988, "learning_rate": 3.3493649053890326e-06, "loss": 0.3441, "num_input_tokens_seen": 3712576, "step": 11000 }, { "epoch": 8.504636785162287, "grad_norm": 4.6860527992248535, "learning_rate": 3.3325247775529476e-06, "loss": 0.4551, "num_input_tokens_seen": 3714304, "step": 11005 }, { "epoch": 8.508500772797527, "grad_norm": 4.6044511795043945, "learning_rate": 3.3157240677023445e-06, "loss": 0.4315, "num_input_tokens_seen": 3716320, "step": 11010 }, { "epoch": 8.512364760432767, "grad_norm": 4.804174900054932, "learning_rate": 3.2989628064014844e-06, "loss": 0.3582, "num_input_tokens_seen": 3717696, "step": 11015 }, { "epoch": 8.516228748068006, "grad_norm": 3.698613166809082, "learning_rate": 3.282241024142843e-06, "loss": 0.3497, "num_input_tokens_seen": 3719456, "step": 11020 }, { "epoch": 8.520092735703246, "grad_norm": 3.722111463546753, "learning_rate": 3.2655587513470947e-06, "loss": 0.3911, "num_input_tokens_seen": 3721248, "step": 11025 }, { "epoch": 8.523956723338486, "grad_norm": 1.8001970052719116, "learning_rate": 3.248916018363013e-06, "loss": 0.338, "num_input_tokens_seen": 3722880, "step": 11030 }, { "epoch": 8.527820710973725, "grad_norm": 1.9601612091064453, "learning_rate": 3.232312855467465e-06, "loss": 0.3105, "num_input_tokens_seen": 3724416, "step": 11035 }, { "epoch": 8.531684698608965, "grad_norm": 9.029346466064453, "learning_rate": 3.215749292865311e-06, "loss": 0.5264, "num_input_tokens_seen": 3725856, "step": 11040 }, { "epoch": 8.535548686244205, "grad_norm": 3.7650084495544434, "learning_rate": 3.1992253606893784e-06, "loss": 0.4492, "num_input_tokens_seen": 3727712, "step": 11045 }, { "epoch": 8.539412673879443, "grad_norm": 2.77941632270813, "learning_rate": 3.1827410890004073e-06, "loss": 0.5143, "num_input_tokens_seen": 3729440, "step": 11050 }, { "epoch": 8.543276661514684, "grad_norm": 1.6151832342147827, "learning_rate": 3.1662965077869646e-06, "loss": 0.3445, "num_input_tokens_seen": 3730976, "step": 11055 }, { "epoch": 8.547140649149922, "grad_norm": 3.6886699199676514, "learning_rate": 3.149891646965422e-06, "loss": 0.4345, "num_input_tokens_seen": 3732704, "step": 11060 }, { "epoch": 8.551004636785162, "grad_norm": 2.797335624694824, "learning_rate": 3.1335265363798967e-06, "loss": 0.6031, "num_input_tokens_seen": 3734432, "step": 11065 }, { "epoch": 8.554868624420402, "grad_norm": 2.6657540798187256, "learning_rate": 3.1172012058021923e-06, "loss": 0.3875, "num_input_tokens_seen": 3736064, "step": 11070 }, { "epoch": 8.55873261205564, "grad_norm": 2.3393394947052, "learning_rate": 3.1009156849317294e-06, "loss": 0.3074, "num_input_tokens_seen": 3737696, "step": 11075 }, { "epoch": 8.562596599690881, "grad_norm": 2.2053110599517822, "learning_rate": 3.084670003395507e-06, "loss": 0.3338, "num_input_tokens_seen": 3739520, "step": 11080 }, { "epoch": 8.566460587326121, "grad_norm": 3.5123589038848877, "learning_rate": 3.0684641907480773e-06, "loss": 0.5077, "num_input_tokens_seen": 3741408, "step": 11085 }, { "epoch": 8.57032457496136, "grad_norm": 1.7851293087005615, "learning_rate": 3.0522982764714243e-06, "loss": 0.3376, "num_input_tokens_seen": 3742848, "step": 11090 }, { "epoch": 8.5741885625966, "grad_norm": 2.8182179927825928, "learning_rate": 3.036172289974959e-06, "loss": 0.3235, "num_input_tokens_seen": 3744480, "step": 11095 }, { "epoch": 8.578052550231838, "grad_norm": 1.5331194400787354, "learning_rate": 3.0200862605954674e-06, "loss": 0.3733, "num_input_tokens_seen": 3746176, "step": 11100 }, { "epoch": 8.581916537867079, "grad_norm": 1.9983080625534058, "learning_rate": 3.004040217597037e-06, "loss": 0.4278, "num_input_tokens_seen": 3748064, "step": 11105 }, { "epoch": 8.585780525502319, "grad_norm": 6.214904308319092, "learning_rate": 2.98803419017101e-06, "loss": 0.5485, "num_input_tokens_seen": 3749984, "step": 11110 }, { "epoch": 8.589644513137557, "grad_norm": 3.4705450534820557, "learning_rate": 2.972068207435924e-06, "loss": 0.2975, "num_input_tokens_seen": 3751680, "step": 11115 }, { "epoch": 8.593508500772797, "grad_norm": 2.080606698989868, "learning_rate": 2.956142298437478e-06, "loss": 0.3454, "num_input_tokens_seen": 3753344, "step": 11120 }, { "epoch": 8.597372488408038, "grad_norm": 2.581493854522705, "learning_rate": 2.940256492148469e-06, "loss": 0.4815, "num_input_tokens_seen": 3755008, "step": 11125 }, { "epoch": 8.601236476043276, "grad_norm": 1.6936612129211426, "learning_rate": 2.9244108174687257e-06, "loss": 0.4076, "num_input_tokens_seen": 3756608, "step": 11130 }, { "epoch": 8.605100463678516, "grad_norm": 2.0995686054229736, "learning_rate": 2.9086053032250815e-06, "loss": 0.3037, "num_input_tokens_seen": 3758592, "step": 11135 }, { "epoch": 8.608964451313756, "grad_norm": 2.863210678100586, "learning_rate": 2.8928399781712993e-06, "loss": 0.4207, "num_input_tokens_seen": 3760608, "step": 11140 }, { "epoch": 8.612828438948995, "grad_norm": 1.914675235748291, "learning_rate": 2.8771148709880376e-06, "loss": 0.4828, "num_input_tokens_seen": 3762304, "step": 11145 }, { "epoch": 8.616692426584235, "grad_norm": 1.6017825603485107, "learning_rate": 2.8614300102827757e-06, "loss": 0.4896, "num_input_tokens_seen": 3763872, "step": 11150 }, { "epoch": 8.620556414219475, "grad_norm": 1.9888170957565308, "learning_rate": 2.8457854245897975e-06, "loss": 0.6044, "num_input_tokens_seen": 3765696, "step": 11155 }, { "epoch": 8.624420401854714, "grad_norm": 4.9019670486450195, "learning_rate": 2.830181142370092e-06, "loss": 0.361, "num_input_tokens_seen": 3767360, "step": 11160 }, { "epoch": 8.628284389489954, "grad_norm": 4.115179061889648, "learning_rate": 2.814617192011346e-06, "loss": 0.4336, "num_input_tokens_seen": 3769088, "step": 11165 }, { "epoch": 8.632148377125194, "grad_norm": 3.925102710723877, "learning_rate": 2.799093601827871e-06, "loss": 0.5473, "num_input_tokens_seen": 3770656, "step": 11170 }, { "epoch": 8.636012364760433, "grad_norm": 2.9603769779205322, "learning_rate": 2.783610400060549e-06, "loss": 0.3424, "num_input_tokens_seen": 3772320, "step": 11175 }, { "epoch": 8.639876352395673, "grad_norm": 3.5191426277160645, "learning_rate": 2.7681676148767795e-06, "loss": 0.3766, "num_input_tokens_seen": 3773920, "step": 11180 }, { "epoch": 8.643740340030911, "grad_norm": 4.770402908325195, "learning_rate": 2.752765274370453e-06, "loss": 0.4916, "num_input_tokens_seen": 3775840, "step": 11185 }, { "epoch": 8.647604327666151, "grad_norm": 2.3528311252593994, "learning_rate": 2.737403406561878e-06, "loss": 0.3931, "num_input_tokens_seen": 3777376, "step": 11190 }, { "epoch": 8.651468315301392, "grad_norm": 2.242156744003296, "learning_rate": 2.722082039397722e-06, "loss": 0.6988, "num_input_tokens_seen": 3778912, "step": 11195 }, { "epoch": 8.65533230293663, "grad_norm": 2.778029441833496, "learning_rate": 2.7068012007509745e-06, "loss": 0.3859, "num_input_tokens_seen": 3780672, "step": 11200 }, { "epoch": 8.65919629057187, "grad_norm": 2.5363404750823975, "learning_rate": 2.6915609184209043e-06, "loss": 0.2979, "num_input_tokens_seen": 3782272, "step": 11205 }, { "epoch": 8.66306027820711, "grad_norm": 2.036832094192505, "learning_rate": 2.6763612201330056e-06, "loss": 0.3031, "num_input_tokens_seen": 3783936, "step": 11210 }, { "epoch": 8.666924265842349, "grad_norm": 1.9083291292190552, "learning_rate": 2.6612021335389146e-06, "loss": 0.3538, "num_input_tokens_seen": 3785792, "step": 11215 }, { "epoch": 8.670788253477589, "grad_norm": 3.2325806617736816, "learning_rate": 2.646083686216408e-06, "loss": 0.5171, "num_input_tokens_seen": 3787488, "step": 11220 }, { "epoch": 8.674652241112828, "grad_norm": 2.0868630409240723, "learning_rate": 2.6310059056693314e-06, "loss": 0.4737, "num_input_tokens_seen": 3789088, "step": 11225 }, { "epoch": 8.678516228748068, "grad_norm": 2.088674306869507, "learning_rate": 2.6159688193275355e-06, "loss": 0.4102, "num_input_tokens_seen": 3790848, "step": 11230 }, { "epoch": 8.682380216383308, "grad_norm": 3.2685530185699463, "learning_rate": 2.6009724545468377e-06, "loss": 0.4459, "num_input_tokens_seen": 3792416, "step": 11235 }, { "epoch": 8.686244204018546, "grad_norm": 1.8989554643630981, "learning_rate": 2.58601683860899e-06, "loss": 0.2804, "num_input_tokens_seen": 3793952, "step": 11240 }, { "epoch": 8.690108191653787, "grad_norm": 3.65527081489563, "learning_rate": 2.5711019987216035e-06, "loss": 0.343, "num_input_tokens_seen": 3795488, "step": 11245 }, { "epoch": 8.693972179289027, "grad_norm": 6.783175945281982, "learning_rate": 2.5562279620181098e-06, "loss": 0.5417, "num_input_tokens_seen": 3797056, "step": 11250 }, { "epoch": 8.697836166924265, "grad_norm": 1.7768375873565674, "learning_rate": 2.5413947555576993e-06, "loss": 0.3542, "num_input_tokens_seen": 3798624, "step": 11255 }, { "epoch": 8.701700154559505, "grad_norm": 2.6253719329833984, "learning_rate": 2.5266024063253014e-06, "loss": 0.3484, "num_input_tokens_seen": 3800064, "step": 11260 }, { "epoch": 8.705564142194746, "grad_norm": 3.143209457397461, "learning_rate": 2.511850941231517e-06, "loss": 0.4638, "num_input_tokens_seen": 3802112, "step": 11265 }, { "epoch": 8.709428129829984, "grad_norm": 2.1550936698913574, "learning_rate": 2.4971403871125455e-06, "loss": 0.4611, "num_input_tokens_seen": 3803744, "step": 11270 }, { "epoch": 8.713292117465224, "grad_norm": 1.9111055135726929, "learning_rate": 2.4824707707301915e-06, "loss": 0.346, "num_input_tokens_seen": 3805312, "step": 11275 }, { "epoch": 8.717156105100464, "grad_norm": 2.2528412342071533, "learning_rate": 2.467842118771754e-06, "loss": 0.459, "num_input_tokens_seen": 3807296, "step": 11280 }, { "epoch": 8.721020092735703, "grad_norm": 3.561589241027832, "learning_rate": 2.4532544578500405e-06, "loss": 0.4941, "num_input_tokens_seen": 3809248, "step": 11285 }, { "epoch": 8.724884080370943, "grad_norm": 2.3085572719573975, "learning_rate": 2.4387078145032577e-06, "loss": 0.3632, "num_input_tokens_seen": 3810752, "step": 11290 }, { "epoch": 8.728748068006183, "grad_norm": 2.0806076526641846, "learning_rate": 2.424202215195012e-06, "loss": 0.386, "num_input_tokens_seen": 3812352, "step": 11295 }, { "epoch": 8.732612055641422, "grad_norm": 1.8107737302780151, "learning_rate": 2.4097376863142286e-06, "loss": 0.2972, "num_input_tokens_seen": 3813920, "step": 11300 }, { "epoch": 8.736476043276662, "grad_norm": 1.7933536767959595, "learning_rate": 2.395314254175124e-06, "loss": 0.3308, "num_input_tokens_seen": 3815488, "step": 11305 }, { "epoch": 8.7403400309119, "grad_norm": 3.021625518798828, "learning_rate": 2.3809319450171534e-06, "loss": 0.4137, "num_input_tokens_seen": 3816928, "step": 11310 }, { "epoch": 8.74420401854714, "grad_norm": 3.169037342071533, "learning_rate": 2.366590785004949e-06, "loss": 0.3882, "num_input_tokens_seen": 3818560, "step": 11315 }, { "epoch": 8.74806800618238, "grad_norm": 3.8616044521331787, "learning_rate": 2.3522908002282906e-06, "loss": 0.4236, "num_input_tokens_seen": 3820288, "step": 11320 }, { "epoch": 8.75193199381762, "grad_norm": 2.4322664737701416, "learning_rate": 2.3380320167020463e-06, "loss": 0.6892, "num_input_tokens_seen": 3821728, "step": 11325 }, { "epoch": 8.75579598145286, "grad_norm": 1.6659396886825562, "learning_rate": 2.323814460366147e-06, "loss": 0.4218, "num_input_tokens_seen": 3823392, "step": 11330 }, { "epoch": 8.7596599690881, "grad_norm": 2.270264148712158, "learning_rate": 2.3096381570855e-06, "loss": 0.4158, "num_input_tokens_seen": 3825184, "step": 11335 }, { "epoch": 8.763523956723338, "grad_norm": 3.069082498550415, "learning_rate": 2.295503132649962e-06, "loss": 0.3925, "num_input_tokens_seen": 3826816, "step": 11340 }, { "epoch": 8.767387944358578, "grad_norm": 1.8417996168136597, "learning_rate": 2.2814094127743267e-06, "loss": 0.3437, "num_input_tokens_seen": 3828576, "step": 11345 }, { "epoch": 8.771251931993817, "grad_norm": 1.7009248733520508, "learning_rate": 2.2673570230982128e-06, "loss": 0.3269, "num_input_tokens_seen": 3830016, "step": 11350 }, { "epoch": 8.775115919629057, "grad_norm": 3.3610799312591553, "learning_rate": 2.253345989186059e-06, "loss": 0.3682, "num_input_tokens_seen": 3831392, "step": 11355 }, { "epoch": 8.778979907264297, "grad_norm": 2.88627290725708, "learning_rate": 2.2393763365270716e-06, "loss": 0.3685, "num_input_tokens_seen": 3833216, "step": 11360 }, { "epoch": 8.782843894899536, "grad_norm": 1.7363401651382446, "learning_rate": 2.2254480905351843e-06, "loss": 0.4846, "num_input_tokens_seen": 3834752, "step": 11365 }, { "epoch": 8.786707882534776, "grad_norm": 3.537994623184204, "learning_rate": 2.2115612765489848e-06, "loss": 0.3637, "num_input_tokens_seen": 3836416, "step": 11370 }, { "epoch": 8.790571870170016, "grad_norm": 3.2509188652038574, "learning_rate": 2.197715919831689e-06, "loss": 0.3714, "num_input_tokens_seen": 3838048, "step": 11375 }, { "epoch": 8.794435857805254, "grad_norm": 1.8677805662155151, "learning_rate": 2.1839120455711072e-06, "loss": 0.3084, "num_input_tokens_seen": 3839712, "step": 11380 }, { "epoch": 8.798299845440495, "grad_norm": 2.615952730178833, "learning_rate": 2.1701496788795766e-06, "loss": 0.3047, "num_input_tokens_seen": 3841216, "step": 11385 }, { "epoch": 8.802163833075735, "grad_norm": 2.95133900642395, "learning_rate": 2.1564288447939145e-06, "loss": 0.3615, "num_input_tokens_seen": 3842688, "step": 11390 }, { "epoch": 8.806027820710973, "grad_norm": 3.2093923091888428, "learning_rate": 2.1427495682753984e-06, "loss": 0.4182, "num_input_tokens_seen": 3844608, "step": 11395 }, { "epoch": 8.809891808346213, "grad_norm": 2.4294750690460205, "learning_rate": 2.1291118742096798e-06, "loss": 0.4139, "num_input_tokens_seen": 3846464, "step": 11400 }, { "epoch": 8.813755795981454, "grad_norm": 4.110589981079102, "learning_rate": 2.1155157874067856e-06, "loss": 0.3999, "num_input_tokens_seen": 3847904, "step": 11405 }, { "epoch": 8.817619783616692, "grad_norm": 2.185783624649048, "learning_rate": 2.101961332601035e-06, "loss": 0.3991, "num_input_tokens_seen": 3849600, "step": 11410 }, { "epoch": 8.821483771251932, "grad_norm": 2.6697134971618652, "learning_rate": 2.088448534451018e-06, "loss": 0.3013, "num_input_tokens_seen": 3851008, "step": 11415 }, { "epoch": 8.825347758887172, "grad_norm": 1.6395169496536255, "learning_rate": 2.0749774175395335e-06, "loss": 0.2897, "num_input_tokens_seen": 3852736, "step": 11420 }, { "epoch": 8.829211746522411, "grad_norm": 2.8095085620880127, "learning_rate": 2.0615480063735563e-06, "loss": 0.2951, "num_input_tokens_seen": 3854176, "step": 11425 }, { "epoch": 8.833075734157651, "grad_norm": 3.304903268814087, "learning_rate": 2.048160325384199e-06, "loss": 0.3599, "num_input_tokens_seen": 3855872, "step": 11430 }, { "epoch": 8.83693972179289, "grad_norm": 2.7724812030792236, "learning_rate": 2.0348143989266434e-06, "loss": 0.317, "num_input_tokens_seen": 3857280, "step": 11435 }, { "epoch": 8.84080370942813, "grad_norm": 1.7249308824539185, "learning_rate": 2.021510251280109e-06, "loss": 0.3635, "num_input_tokens_seen": 3858976, "step": 11440 }, { "epoch": 8.84466769706337, "grad_norm": 6.058446407318115, "learning_rate": 2.0082479066478256e-06, "loss": 0.5161, "num_input_tokens_seen": 3860704, "step": 11445 }, { "epoch": 8.848531684698608, "grad_norm": 4.247896671295166, "learning_rate": 1.995027389156967e-06, "loss": 0.5712, "num_input_tokens_seen": 3862528, "step": 11450 }, { "epoch": 8.852395672333849, "grad_norm": 3.312727928161621, "learning_rate": 1.981848722858609e-06, "loss": 0.3854, "num_input_tokens_seen": 3864064, "step": 11455 }, { "epoch": 8.856259659969089, "grad_norm": 4.02152156829834, "learning_rate": 1.9687119317276873e-06, "loss": 0.4313, "num_input_tokens_seen": 3865792, "step": 11460 }, { "epoch": 8.860123647604327, "grad_norm": 2.2630362510681152, "learning_rate": 1.955617039662977e-06, "loss": 0.3512, "num_input_tokens_seen": 3867264, "step": 11465 }, { "epoch": 8.863987635239567, "grad_norm": 2.2299959659576416, "learning_rate": 1.9425640704870128e-06, "loss": 0.4089, "num_input_tokens_seen": 3869152, "step": 11470 }, { "epoch": 8.867851622874806, "grad_norm": 4.027388095855713, "learning_rate": 1.9295530479460614e-06, "loss": 0.3639, "num_input_tokens_seen": 3870784, "step": 11475 }, { "epoch": 8.871715610510046, "grad_norm": 3.08253812789917, "learning_rate": 1.9165839957100835e-06, "loss": 0.3529, "num_input_tokens_seen": 3872384, "step": 11480 }, { "epoch": 8.875579598145286, "grad_norm": 2.334501028060913, "learning_rate": 1.903656937372697e-06, "loss": 0.3749, "num_input_tokens_seen": 3873888, "step": 11485 }, { "epoch": 8.879443585780525, "grad_norm": 2.949016571044922, "learning_rate": 1.8907718964511074e-06, "loss": 0.4648, "num_input_tokens_seen": 3875616, "step": 11490 }, { "epoch": 8.883307573415765, "grad_norm": 2.594677686691284, "learning_rate": 1.8779288963860802e-06, "loss": 0.3621, "num_input_tokens_seen": 3877216, "step": 11495 }, { "epoch": 8.887171561051005, "grad_norm": 3.7975666522979736, "learning_rate": 1.8651279605419153e-06, "loss": 0.4363, "num_input_tokens_seen": 3878720, "step": 11500 }, { "epoch": 8.891035548686244, "grad_norm": 2.834266185760498, "learning_rate": 1.8523691122063785e-06, "loss": 0.4096, "num_input_tokens_seen": 3880416, "step": 11505 }, { "epoch": 8.894899536321484, "grad_norm": 3.3913493156433105, "learning_rate": 1.839652374590664e-06, "loss": 0.4428, "num_input_tokens_seen": 3882208, "step": 11510 }, { "epoch": 8.898763523956724, "grad_norm": 3.684934616088867, "learning_rate": 1.8269777708293718e-06, "loss": 0.4006, "num_input_tokens_seen": 3884224, "step": 11515 }, { "epoch": 8.902627511591962, "grad_norm": 1.772722840309143, "learning_rate": 1.814345323980432e-06, "loss": 0.5116, "num_input_tokens_seen": 3885920, "step": 11520 }, { "epoch": 8.906491499227203, "grad_norm": 2.1869184970855713, "learning_rate": 1.8017550570251007e-06, "loss": 0.3045, "num_input_tokens_seen": 3887712, "step": 11525 }, { "epoch": 8.910355486862443, "grad_norm": 3.834782123565674, "learning_rate": 1.789206992867881e-06, "loss": 0.3323, "num_input_tokens_seen": 3889504, "step": 11530 }, { "epoch": 8.914219474497681, "grad_norm": 2.1423606872558594, "learning_rate": 1.7767011543365213e-06, "loss": 0.4589, "num_input_tokens_seen": 3891040, "step": 11535 }, { "epoch": 8.918083462132921, "grad_norm": 6.152333736419678, "learning_rate": 1.7642375641819282e-06, "loss": 0.4442, "num_input_tokens_seen": 3892704, "step": 11540 }, { "epoch": 8.921947449768162, "grad_norm": 5.500743865966797, "learning_rate": 1.7518162450781638e-06, "loss": 0.4226, "num_input_tokens_seen": 3894368, "step": 11545 }, { "epoch": 8.9258114374034, "grad_norm": 3.509204149246216, "learning_rate": 1.7394372196223973e-06, "loss": 0.6163, "num_input_tokens_seen": 3896096, "step": 11550 }, { "epoch": 8.92967542503864, "grad_norm": 3.4085299968719482, "learning_rate": 1.7271005103348337e-06, "loss": 0.4173, "num_input_tokens_seen": 3897760, "step": 11555 }, { "epoch": 8.933539412673879, "grad_norm": 2.52606463432312, "learning_rate": 1.714806139658709e-06, "loss": 0.4115, "num_input_tokens_seen": 3899744, "step": 11560 }, { "epoch": 8.937403400309119, "grad_norm": 5.94285774230957, "learning_rate": 1.7025541299602376e-06, "loss": 0.4228, "num_input_tokens_seen": 3901280, "step": 11565 }, { "epoch": 8.94126738794436, "grad_norm": 2.227673053741455, "learning_rate": 1.690344503528571e-06, "loss": 0.3844, "num_input_tokens_seen": 3903008, "step": 11570 }, { "epoch": 8.945131375579598, "grad_norm": 2.2772319316864014, "learning_rate": 1.6781772825757436e-06, "loss": 0.3051, "num_input_tokens_seen": 3904576, "step": 11575 }, { "epoch": 8.948995363214838, "grad_norm": 3.325108528137207, "learning_rate": 1.6660524892366525e-06, "loss": 0.344, "num_input_tokens_seen": 3905984, "step": 11580 }, { "epoch": 8.952859350850078, "grad_norm": 2.3473012447357178, "learning_rate": 1.6539701455690115e-06, "loss": 0.335, "num_input_tokens_seen": 3907712, "step": 11585 }, { "epoch": 8.956723338485316, "grad_norm": 3.022437810897827, "learning_rate": 1.6419302735533132e-06, "loss": 0.4511, "num_input_tokens_seen": 3909440, "step": 11590 }, { "epoch": 8.960587326120557, "grad_norm": 5.26802396774292, "learning_rate": 1.62993289509277e-06, "loss": 0.3286, "num_input_tokens_seen": 3910880, "step": 11595 }, { "epoch": 8.964451313755795, "grad_norm": 1.7059409618377686, "learning_rate": 1.6179780320132981e-06, "loss": 0.6068, "num_input_tokens_seen": 3912896, "step": 11600 }, { "epoch": 8.968315301391035, "grad_norm": 4.357231616973877, "learning_rate": 1.6060657060634782e-06, "loss": 0.3839, "num_input_tokens_seen": 3914624, "step": 11605 }, { "epoch": 8.972179289026275, "grad_norm": 5.337675094604492, "learning_rate": 1.5941959389144917e-06, "loss": 0.4217, "num_input_tokens_seen": 3916416, "step": 11610 }, { "epoch": 8.976043276661514, "grad_norm": 4.774530410766602, "learning_rate": 1.5823687521600933e-06, "loss": 0.3993, "num_input_tokens_seen": 3918144, "step": 11615 }, { "epoch": 8.979907264296754, "grad_norm": 3.0430338382720947, "learning_rate": 1.5705841673165884e-06, "loss": 0.3931, "num_input_tokens_seen": 3919712, "step": 11620 }, { "epoch": 8.983771251931994, "grad_norm": 3.7800376415252686, "learning_rate": 1.5588422058227747e-06, "loss": 0.4399, "num_input_tokens_seen": 3921472, "step": 11625 }, { "epoch": 8.987635239567233, "grad_norm": 2.5596067905426025, "learning_rate": 1.547142889039907e-06, "loss": 0.4213, "num_input_tokens_seen": 3923040, "step": 11630 }, { "epoch": 8.991499227202473, "grad_norm": 3.2150039672851562, "learning_rate": 1.535486238251657e-06, "loss": 0.4813, "num_input_tokens_seen": 3925024, "step": 11635 }, { "epoch": 8.995363214837713, "grad_norm": 2.735646963119507, "learning_rate": 1.5238722746640844e-06, "loss": 0.4076, "num_input_tokens_seen": 3927040, "step": 11640 }, { "epoch": 8.999227202472952, "grad_norm": 3.541363000869751, "learning_rate": 1.512301019405582e-06, "loss": 0.3782, "num_input_tokens_seen": 3928576, "step": 11645 }, { "epoch": 9.0, "eval_loss": 0.5048345923423767, "eval_runtime": 11.4878, "eval_samples_per_second": 50.053, "eval_steps_per_second": 12.535, "num_input_tokens_seen": 3928704, "step": 11646 }, { "epoch": 9.003091190108192, "grad_norm": 1.822711706161499, "learning_rate": 1.5007724935268557e-06, "loss": 0.415, "num_input_tokens_seen": 3929984, "step": 11650 }, { "epoch": 9.006955177743432, "grad_norm": 2.6063244342803955, "learning_rate": 1.489286718000879e-06, "loss": 0.3829, "num_input_tokens_seen": 3931968, "step": 11655 }, { "epoch": 9.01081916537867, "grad_norm": 3.6095988750457764, "learning_rate": 1.477843713722843e-06, "loss": 0.572, "num_input_tokens_seen": 3933728, "step": 11660 }, { "epoch": 9.01468315301391, "grad_norm": 2.662745475769043, "learning_rate": 1.4664435015101258e-06, "loss": 0.4516, "num_input_tokens_seen": 3935488, "step": 11665 }, { "epoch": 9.018547140649149, "grad_norm": 4.9169487953186035, "learning_rate": 1.455086102102271e-06, "loss": 0.344, "num_input_tokens_seen": 3937056, "step": 11670 }, { "epoch": 9.02241112828439, "grad_norm": 2.532947301864624, "learning_rate": 1.4437715361609345e-06, "loss": 0.3834, "num_input_tokens_seen": 3938976, "step": 11675 }, { "epoch": 9.02627511591963, "grad_norm": 3.1448171138763428, "learning_rate": 1.4324998242698368e-06, "loss": 0.2955, "num_input_tokens_seen": 3940384, "step": 11680 }, { "epoch": 9.030139103554868, "grad_norm": 3.0978505611419678, "learning_rate": 1.4212709869347363e-06, "loss": 0.4866, "num_input_tokens_seen": 3942176, "step": 11685 }, { "epoch": 9.034003091190108, "grad_norm": 1.8106416463851929, "learning_rate": 1.4100850445834175e-06, "loss": 0.3119, "num_input_tokens_seen": 3943776, "step": 11690 }, { "epoch": 9.037867078825348, "grad_norm": 3.06313419342041, "learning_rate": 1.3989420175656049e-06, "loss": 0.4354, "num_input_tokens_seen": 3945664, "step": 11695 }, { "epoch": 9.041731066460587, "grad_norm": 2.656055450439453, "learning_rate": 1.3878419261529525e-06, "loss": 0.3953, "num_input_tokens_seen": 3947584, "step": 11700 }, { "epoch": 9.045595054095827, "grad_norm": 2.215618371963501, "learning_rate": 1.3767847905390124e-06, "loss": 0.4879, "num_input_tokens_seen": 3949344, "step": 11705 }, { "epoch": 9.049459041731067, "grad_norm": 2.9614014625549316, "learning_rate": 1.3657706308391971e-06, "loss": 0.3548, "num_input_tokens_seen": 3950880, "step": 11710 }, { "epoch": 9.053323029366306, "grad_norm": 3.0618507862091064, "learning_rate": 1.35479946709072e-06, "loss": 0.3711, "num_input_tokens_seen": 3952928, "step": 11715 }, { "epoch": 9.057187017001546, "grad_norm": 5.065474033355713, "learning_rate": 1.3438713192525793e-06, "loss": 0.4858, "num_input_tokens_seen": 3954624, "step": 11720 }, { "epoch": 9.061051004636786, "grad_norm": 5.287033557891846, "learning_rate": 1.332986207205525e-06, "loss": 0.3906, "num_input_tokens_seen": 3956352, "step": 11725 }, { "epoch": 9.064914992272024, "grad_norm": 2.099496841430664, "learning_rate": 1.3221441507520167e-06, "loss": 0.5325, "num_input_tokens_seen": 3958144, "step": 11730 }, { "epoch": 9.068778979907265, "grad_norm": 1.3744266033172607, "learning_rate": 1.311345169616171e-06, "loss": 0.4584, "num_input_tokens_seen": 3959712, "step": 11735 }, { "epoch": 9.072642967542503, "grad_norm": 3.2077858448028564, "learning_rate": 1.3005892834437594e-06, "loss": 0.3971, "num_input_tokens_seen": 3961312, "step": 11740 }, { "epoch": 9.076506955177743, "grad_norm": 3.6630074977874756, "learning_rate": 1.2898765118021406e-06, "loss": 0.3496, "num_input_tokens_seen": 3962976, "step": 11745 }, { "epoch": 9.080370942812984, "grad_norm": 2.745992660522461, "learning_rate": 1.279206874180247e-06, "loss": 0.4629, "num_input_tokens_seen": 3964480, "step": 11750 }, { "epoch": 9.084234930448222, "grad_norm": 2.331529140472412, "learning_rate": 1.2685803899885301e-06, "loss": 0.3183, "num_input_tokens_seen": 3966368, "step": 11755 }, { "epoch": 9.088098918083462, "grad_norm": 2.7681639194488525, "learning_rate": 1.2579970785589507e-06, "loss": 0.3382, "num_input_tokens_seen": 3967968, "step": 11760 }, { "epoch": 9.091962905718702, "grad_norm": 4.4319539070129395, "learning_rate": 1.247456959144913e-06, "loss": 0.3175, "num_input_tokens_seen": 3969824, "step": 11765 }, { "epoch": 9.09582689335394, "grad_norm": 2.806130886077881, "learning_rate": 1.2369600509212543e-06, "loss": 0.3196, "num_input_tokens_seen": 3971296, "step": 11770 }, { "epoch": 9.099690880989181, "grad_norm": 5.190746784210205, "learning_rate": 1.2265063729842068e-06, "loss": 0.5826, "num_input_tokens_seen": 3972992, "step": 11775 }, { "epoch": 9.103554868624421, "grad_norm": 1.5593712329864502, "learning_rate": 1.216095944351342e-06, "loss": 0.459, "num_input_tokens_seen": 3974784, "step": 11780 }, { "epoch": 9.10741885625966, "grad_norm": 4.48858642578125, "learning_rate": 1.2057287839615544e-06, "loss": 0.6045, "num_input_tokens_seen": 3976512, "step": 11785 }, { "epoch": 9.1112828438949, "grad_norm": 3.192274808883667, "learning_rate": 1.195404910675038e-06, "loss": 0.3237, "num_input_tokens_seen": 3978240, "step": 11790 }, { "epoch": 9.115146831530138, "grad_norm": 3.311108350753784, "learning_rate": 1.1851243432732278e-06, "loss": 0.6548, "num_input_tokens_seen": 3980192, "step": 11795 }, { "epoch": 9.119010819165378, "grad_norm": 3.051602840423584, "learning_rate": 1.1748871004587714e-06, "loss": 0.3307, "num_input_tokens_seen": 3981696, "step": 11800 }, { "epoch": 9.122874806800619, "grad_norm": 3.512928009033203, "learning_rate": 1.164693200855499e-06, "loss": 0.3915, "num_input_tokens_seen": 3983104, "step": 11805 }, { "epoch": 9.126738794435857, "grad_norm": 3.818406581878662, "learning_rate": 1.1545426630084093e-06, "loss": 0.3583, "num_input_tokens_seen": 3984544, "step": 11810 }, { "epoch": 9.130602782071097, "grad_norm": 3.2931342124938965, "learning_rate": 1.1444355053835926e-06, "loss": 0.3939, "num_input_tokens_seen": 3986176, "step": 11815 }, { "epoch": 9.134466769706338, "grad_norm": 5.674528121948242, "learning_rate": 1.134371746368229e-06, "loss": 0.3638, "num_input_tokens_seen": 3987872, "step": 11820 }, { "epoch": 9.138330757341576, "grad_norm": 1.630340576171875, "learning_rate": 1.1243514042705517e-06, "loss": 0.3699, "num_input_tokens_seen": 3989696, "step": 11825 }, { "epoch": 9.142194744976816, "grad_norm": 3.1280972957611084, "learning_rate": 1.1143744973198062e-06, "loss": 0.32, "num_input_tokens_seen": 3991328, "step": 11830 }, { "epoch": 9.146058732612056, "grad_norm": 3.1116559505462646, "learning_rate": 1.1044410436662156e-06, "loss": 0.4116, "num_input_tokens_seen": 3992960, "step": 11835 }, { "epoch": 9.149922720247295, "grad_norm": 6.649744510650635, "learning_rate": 1.0945510613809523e-06, "loss": 0.4835, "num_input_tokens_seen": 3994560, "step": 11840 }, { "epoch": 9.153786707882535, "grad_norm": 4.428999423980713, "learning_rate": 1.0847045684561076e-06, "loss": 0.395, "num_input_tokens_seen": 3996480, "step": 11845 }, { "epoch": 9.157650695517773, "grad_norm": 2.757153272628784, "learning_rate": 1.0749015828046632e-06, "loss": 0.4228, "num_input_tokens_seen": 3998016, "step": 11850 }, { "epoch": 9.161514683153014, "grad_norm": 4.228115081787109, "learning_rate": 1.0651421222604346e-06, "loss": 0.3089, "num_input_tokens_seen": 3999712, "step": 11855 }, { "epoch": 9.165378670788254, "grad_norm": 3.5834105014801025, "learning_rate": 1.0554262045780694e-06, "loss": 0.4504, "num_input_tokens_seen": 4001408, "step": 11860 }, { "epoch": 9.169242658423492, "grad_norm": 4.680394172668457, "learning_rate": 1.0457538474329897e-06, "loss": 0.3183, "num_input_tokens_seen": 4003072, "step": 11865 }, { "epoch": 9.173106646058732, "grad_norm": 3.191155433654785, "learning_rate": 1.0361250684213868e-06, "loss": 0.5185, "num_input_tokens_seen": 4004576, "step": 11870 }, { "epoch": 9.176970633693973, "grad_norm": 4.595128059387207, "learning_rate": 1.0265398850601544e-06, "loss": 0.4194, "num_input_tokens_seen": 4006016, "step": 11875 }, { "epoch": 9.180834621329211, "grad_norm": 4.950393199920654, "learning_rate": 1.0169983147868934e-06, "loss": 0.3538, "num_input_tokens_seen": 4007456, "step": 11880 }, { "epoch": 9.184698608964451, "grad_norm": 3.9157052040100098, "learning_rate": 1.0075003749598467e-06, "loss": 0.3572, "num_input_tokens_seen": 4009280, "step": 11885 }, { "epoch": 9.188562596599692, "grad_norm": 4.167463302612305, "learning_rate": 9.98046082857898e-07, "loss": 0.406, "num_input_tokens_seen": 4010688, "step": 11890 }, { "epoch": 9.19242658423493, "grad_norm": 1.9562163352966309, "learning_rate": 9.886354556805227e-07, "loss": 0.3241, "num_input_tokens_seen": 4012480, "step": 11895 }, { "epoch": 9.19629057187017, "grad_norm": 2.914872407913208, "learning_rate": 9.792685105477512e-07, "loss": 0.3106, "num_input_tokens_seen": 4013920, "step": 11900 }, { "epoch": 9.20015455950541, "grad_norm": 2.0528273582458496, "learning_rate": 9.69945264500155e-07, "loss": 0.3213, "num_input_tokens_seen": 4015456, "step": 11905 }, { "epoch": 9.204018547140649, "grad_norm": 3.3772757053375244, "learning_rate": 9.606657344988028e-07, "loss": 0.3601, "num_input_tokens_seen": 4017088, "step": 11910 }, { "epoch": 9.207882534775889, "grad_norm": 3.077155828475952, "learning_rate": 9.514299374252384e-07, "loss": 0.4321, "num_input_tokens_seen": 4018688, "step": 11915 }, { "epoch": 9.211746522411127, "grad_norm": 3.03593111038208, "learning_rate": 9.42237890081446e-07, "loss": 0.2887, "num_input_tokens_seen": 4020352, "step": 11920 }, { "epoch": 9.215610510046368, "grad_norm": 3.83441162109375, "learning_rate": 9.330896091898078e-07, "loss": 0.4599, "num_input_tokens_seen": 4021984, "step": 11925 }, { "epoch": 9.219474497681608, "grad_norm": 2.964686870574951, "learning_rate": 9.239851113930997e-07, "loss": 0.351, "num_input_tokens_seen": 4023872, "step": 11930 }, { "epoch": 9.223338485316846, "grad_norm": 2.173654079437256, "learning_rate": 9.149244132544393e-07, "loss": 0.288, "num_input_tokens_seen": 4025536, "step": 11935 }, { "epoch": 9.227202472952087, "grad_norm": 2.555044174194336, "learning_rate": 9.059075312572635e-07, "loss": 0.3241, "num_input_tokens_seen": 4027040, "step": 11940 }, { "epoch": 9.231066460587327, "grad_norm": 2.0183846950531006, "learning_rate": 8.969344818052977e-07, "loss": 0.3883, "num_input_tokens_seen": 4028960, "step": 11945 }, { "epoch": 9.234930448222565, "grad_norm": 4.130044937133789, "learning_rate": 8.880052812225314e-07, "loss": 0.4248, "num_input_tokens_seen": 4030688, "step": 11950 }, { "epoch": 9.238794435857805, "grad_norm": 3.130122184753418, "learning_rate": 8.791199457531735e-07, "loss": 0.6813, "num_input_tokens_seen": 4032128, "step": 11955 }, { "epoch": 9.242658423493046, "grad_norm": 2.1162891387939453, "learning_rate": 8.702784915616352e-07, "loss": 0.346, "num_input_tokens_seen": 4033952, "step": 11960 }, { "epoch": 9.246522411128284, "grad_norm": 2.069443702697754, "learning_rate": 8.614809347325031e-07, "loss": 0.333, "num_input_tokens_seen": 4035744, "step": 11965 }, { "epoch": 9.250386398763524, "grad_norm": 3.1797425746917725, "learning_rate": 8.527272912705025e-07, "loss": 0.4332, "num_input_tokens_seen": 4037440, "step": 11970 }, { "epoch": 9.254250386398763, "grad_norm": 4.089958667755127, "learning_rate": 8.440175771004699e-07, "loss": 0.5924, "num_input_tokens_seen": 4039072, "step": 11975 }, { "epoch": 9.258114374034003, "grad_norm": 2.8615880012512207, "learning_rate": 8.353518080673195e-07, "loss": 0.3603, "num_input_tokens_seen": 4040832, "step": 11980 }, { "epoch": 9.261978361669243, "grad_norm": 2.3594863414764404, "learning_rate": 8.267299999360267e-07, "loss": 0.3178, "num_input_tokens_seen": 4042528, "step": 11985 }, { "epoch": 9.265842349304481, "grad_norm": 4.334838390350342, "learning_rate": 8.181521683915921e-07, "loss": 0.3962, "num_input_tokens_seen": 4044224, "step": 11990 }, { "epoch": 9.269706336939722, "grad_norm": 4.061600685119629, "learning_rate": 8.096183290390053e-07, "loss": 0.4961, "num_input_tokens_seen": 4045920, "step": 11995 }, { "epoch": 9.273570324574962, "grad_norm": 1.912658452987671, "learning_rate": 8.011284974032363e-07, "loss": 0.4487, "num_input_tokens_seen": 4047584, "step": 12000 }, { "epoch": 9.2774343122102, "grad_norm": 1.3699579238891602, "learning_rate": 7.926826889291833e-07, "loss": 0.285, "num_input_tokens_seen": 4049216, "step": 12005 }, { "epoch": 9.28129829984544, "grad_norm": 4.001827239990234, "learning_rate": 7.842809189816641e-07, "loss": 0.3205, "num_input_tokens_seen": 4050912, "step": 12010 }, { "epoch": 9.28516228748068, "grad_norm": 3.851135492324829, "learning_rate": 7.759232028453744e-07, "loss": 0.3262, "num_input_tokens_seen": 4052704, "step": 12015 }, { "epoch": 9.28902627511592, "grad_norm": 3.818401575088501, "learning_rate": 7.676095557248769e-07, "loss": 0.4121, "num_input_tokens_seen": 4054368, "step": 12020 }, { "epoch": 9.29289026275116, "grad_norm": 2.084644317626953, "learning_rate": 7.593399927445454e-07, "loss": 0.3629, "num_input_tokens_seen": 4056000, "step": 12025 }, { "epoch": 9.2967542503864, "grad_norm": 3.1913681030273438, "learning_rate": 7.511145289485738e-07, "loss": 0.3439, "num_input_tokens_seen": 4057824, "step": 12030 }, { "epoch": 9.300618238021638, "grad_norm": 4.76110315322876, "learning_rate": 7.429331793009225e-07, "loss": 0.5464, "num_input_tokens_seen": 4059744, "step": 12035 }, { "epoch": 9.304482225656878, "grad_norm": 2.769914150238037, "learning_rate": 7.347959586852915e-07, "loss": 0.3348, "num_input_tokens_seen": 4061312, "step": 12040 }, { "epoch": 9.308346213292117, "grad_norm": 3.5497872829437256, "learning_rate": 7.267028819051058e-07, "loss": 0.5156, "num_input_tokens_seen": 4062752, "step": 12045 }, { "epoch": 9.312210200927357, "grad_norm": 3.605487823486328, "learning_rate": 7.186539636834855e-07, "loss": 0.33, "num_input_tokens_seen": 4064224, "step": 12050 }, { "epoch": 9.316074188562597, "grad_norm": 4.251762866973877, "learning_rate": 7.106492186632147e-07, "loss": 0.4544, "num_input_tokens_seen": 4065920, "step": 12055 }, { "epoch": 9.319938176197835, "grad_norm": 4.534819602966309, "learning_rate": 7.026886614067141e-07, "loss": 0.4436, "num_input_tokens_seen": 4067744, "step": 12060 }, { "epoch": 9.323802163833076, "grad_norm": 3.716170072555542, "learning_rate": 6.947723063960132e-07, "loss": 0.3134, "num_input_tokens_seen": 4069152, "step": 12065 }, { "epoch": 9.327666151468316, "grad_norm": 1.9188189506530762, "learning_rate": 6.869001680327447e-07, "loss": 0.4172, "num_input_tokens_seen": 4070976, "step": 12070 }, { "epoch": 9.331530139103554, "grad_norm": 3.902869462966919, "learning_rate": 6.790722606380806e-07, "loss": 0.3875, "num_input_tokens_seen": 4072416, "step": 12075 }, { "epoch": 9.335394126738795, "grad_norm": 2.409397602081299, "learning_rate": 6.712885984527378e-07, "loss": 0.4109, "num_input_tokens_seen": 4074208, "step": 12080 }, { "epoch": 9.339258114374035, "grad_norm": 4.041140556335449, "learning_rate": 6.635491956369394e-07, "loss": 0.4116, "num_input_tokens_seen": 4075904, "step": 12085 }, { "epoch": 9.343122102009273, "grad_norm": 2.0667660236358643, "learning_rate": 6.558540662703922e-07, "loss": 0.3383, "num_input_tokens_seen": 4077568, "step": 12090 }, { "epoch": 9.346986089644513, "grad_norm": 2.5909202098846436, "learning_rate": 6.482032243522618e-07, "loss": 0.3387, "num_input_tokens_seen": 4079520, "step": 12095 }, { "epoch": 9.350850077279752, "grad_norm": 5.10789680480957, "learning_rate": 6.405966838011313e-07, "loss": 0.567, "num_input_tokens_seen": 4081280, "step": 12100 }, { "epoch": 9.354714064914992, "grad_norm": 2.1008901596069336, "learning_rate": 6.330344584550063e-07, "loss": 0.3708, "num_input_tokens_seen": 4082976, "step": 12105 }, { "epoch": 9.358578052550232, "grad_norm": 4.845943450927734, "learning_rate": 6.255165620712711e-07, "loss": 0.3918, "num_input_tokens_seen": 4084704, "step": 12110 }, { "epoch": 9.36244204018547, "grad_norm": 4.144754409790039, "learning_rate": 6.18043008326652e-07, "loss": 0.3938, "num_input_tokens_seen": 4086368, "step": 12115 }, { "epoch": 9.36630602782071, "grad_norm": 2.661565065383911, "learning_rate": 6.106138108172233e-07, "loss": 0.3442, "num_input_tokens_seen": 4088192, "step": 12120 }, { "epoch": 9.370170015455951, "grad_norm": 3.908684253692627, "learning_rate": 6.032289830583515e-07, "loss": 0.3421, "num_input_tokens_seen": 4089824, "step": 12125 }, { "epoch": 9.37403400309119, "grad_norm": 2.5917980670928955, "learning_rate": 5.958885384846958e-07, "loss": 0.5586, "num_input_tokens_seen": 4091552, "step": 12130 }, { "epoch": 9.37789799072643, "grad_norm": 4.4656291007995605, "learning_rate": 5.885924904501627e-07, "loss": 0.5366, "num_input_tokens_seen": 4093312, "step": 12135 }, { "epoch": 9.38176197836167, "grad_norm": 2.406142473220825, "learning_rate": 5.81340852227899e-07, "loss": 0.3096, "num_input_tokens_seen": 4095040, "step": 12140 }, { "epoch": 9.385625965996908, "grad_norm": 2.6893672943115234, "learning_rate": 5.741336370102573e-07, "loss": 0.3299, "num_input_tokens_seen": 4096704, "step": 12145 }, { "epoch": 9.389489953632149, "grad_norm": 2.4548213481903076, "learning_rate": 5.669708579087718e-07, "loss": 0.4277, "num_input_tokens_seen": 4098432, "step": 12150 }, { "epoch": 9.393353941267389, "grad_norm": 3.3300273418426514, "learning_rate": 5.598525279541438e-07, "loss": 0.6135, "num_input_tokens_seen": 4099968, "step": 12155 }, { "epoch": 9.397217928902627, "grad_norm": 3.5510623455047607, "learning_rate": 5.527786600962093e-07, "loss": 0.6291, "num_input_tokens_seen": 4102016, "step": 12160 }, { "epoch": 9.401081916537867, "grad_norm": 1.9719479084014893, "learning_rate": 5.4574926720391e-07, "loss": 0.3263, "num_input_tokens_seen": 4103616, "step": 12165 }, { "epoch": 9.404945904173106, "grad_norm": 3.7611582279205322, "learning_rate": 5.387643620652888e-07, "loss": 0.4533, "num_input_tokens_seen": 4105376, "step": 12170 }, { "epoch": 9.408809891808346, "grad_norm": 4.240932464599609, "learning_rate": 5.318239573874534e-07, "loss": 0.5429, "num_input_tokens_seen": 4107296, "step": 12175 }, { "epoch": 9.412673879443586, "grad_norm": 4.189836502075195, "learning_rate": 5.24928065796551e-07, "loss": 0.3542, "num_input_tokens_seen": 4109056, "step": 12180 }, { "epoch": 9.416537867078825, "grad_norm": 5.266158103942871, "learning_rate": 5.18076699837744e-07, "loss": 0.569, "num_input_tokens_seen": 4110816, "step": 12185 }, { "epoch": 9.420401854714065, "grad_norm": 2.48465895652771, "learning_rate": 5.112698719752119e-07, "loss": 0.3652, "num_input_tokens_seen": 4112416, "step": 12190 }, { "epoch": 9.424265842349305, "grad_norm": 3.184039831161499, "learning_rate": 5.045075945920935e-07, "loss": 0.4423, "num_input_tokens_seen": 4114112, "step": 12195 }, { "epoch": 9.428129829984544, "grad_norm": 4.348435878753662, "learning_rate": 4.97789879990479e-07, "loss": 0.4525, "num_input_tokens_seen": 4115872, "step": 12200 }, { "epoch": 9.431993817619784, "grad_norm": 4.336721897125244, "learning_rate": 4.911167403913979e-07, "loss": 0.3924, "num_input_tokens_seen": 4117600, "step": 12205 }, { "epoch": 9.435857805255024, "grad_norm": 2.2060012817382812, "learning_rate": 4.844881879347896e-07, "loss": 0.4197, "num_input_tokens_seen": 4119360, "step": 12210 }, { "epoch": 9.439721792890262, "grad_norm": 1.9773855209350586, "learning_rate": 4.779042346794688e-07, "loss": 0.4111, "num_input_tokens_seen": 4121088, "step": 12215 }, { "epoch": 9.443585780525503, "grad_norm": 3.0531575679779053, "learning_rate": 4.7136489260311553e-07, "loss": 0.3731, "num_input_tokens_seen": 4122592, "step": 12220 }, { "epoch": 9.447449768160741, "grad_norm": 1.9099849462509155, "learning_rate": 4.648701736022637e-07, "loss": 0.3655, "num_input_tokens_seen": 4124160, "step": 12225 }, { "epoch": 9.451313755795981, "grad_norm": 2.5135865211486816, "learning_rate": 4.5842008949225913e-07, "loss": 0.38, "num_input_tokens_seen": 4125920, "step": 12230 }, { "epoch": 9.455177743431221, "grad_norm": 1.4923733472824097, "learning_rate": 4.5201465200724327e-07, "loss": 0.3092, "num_input_tokens_seen": 4127808, "step": 12235 }, { "epoch": 9.45904173106646, "grad_norm": 2.92545485496521, "learning_rate": 4.456538728001475e-07, "loss": 0.3973, "num_input_tokens_seen": 4129760, "step": 12240 }, { "epoch": 9.4629057187017, "grad_norm": 1.9812688827514648, "learning_rate": 4.393377634426432e-07, "loss": 0.5579, "num_input_tokens_seen": 4131264, "step": 12245 }, { "epoch": 9.46676970633694, "grad_norm": 2.297494411468506, "learning_rate": 4.3306633542515574e-07, "loss": 0.3081, "num_input_tokens_seen": 4133152, "step": 12250 }, { "epoch": 9.470633693972179, "grad_norm": 3.959681987762451, "learning_rate": 4.268396001568087e-07, "loss": 0.4624, "num_input_tokens_seen": 4134816, "step": 12255 }, { "epoch": 9.474497681607419, "grad_norm": 3.995067596435547, "learning_rate": 4.206575689654324e-07, "loss": 0.3868, "num_input_tokens_seen": 4136576, "step": 12260 }, { "epoch": 9.478361669242659, "grad_norm": 3.359642267227173, "learning_rate": 4.1452025309751673e-07, "loss": 0.535, "num_input_tokens_seen": 4138208, "step": 12265 }, { "epoch": 9.482225656877898, "grad_norm": 5.941317558288574, "learning_rate": 4.0842766371822216e-07, "loss": 0.5027, "num_input_tokens_seen": 4139552, "step": 12270 }, { "epoch": 9.486089644513138, "grad_norm": 5.412103176116943, "learning_rate": 4.02379811911327e-07, "loss": 0.3818, "num_input_tokens_seen": 4141216, "step": 12275 }, { "epoch": 9.489953632148378, "grad_norm": 3.9041712284088135, "learning_rate": 3.963767086792275e-07, "loss": 0.4078, "num_input_tokens_seen": 4142848, "step": 12280 }, { "epoch": 9.493817619783616, "grad_norm": 2.5263516902923584, "learning_rate": 3.9041836494291275e-07, "loss": 0.4355, "num_input_tokens_seen": 4144416, "step": 12285 }, { "epoch": 9.497681607418857, "grad_norm": 3.3815720081329346, "learning_rate": 3.845047915419397e-07, "loss": 0.3722, "num_input_tokens_seen": 4146112, "step": 12290 }, { "epoch": 9.5, "eval_loss": 0.5038707852363586, "eval_runtime": 11.5527, "eval_samples_per_second": 49.772, "eval_steps_per_second": 12.465, "num_input_tokens_seen": 4147200, "step": 12293 }, { "epoch": 9.501545595054095, "grad_norm": 2.7004220485687256, "learning_rate": 3.7863599923442516e-07, "loss": 0.2858, "num_input_tokens_seen": 4148032, "step": 12295 }, { "epoch": 9.505409582689335, "grad_norm": 2.811464309692383, "learning_rate": 3.728119986970147e-07, "loss": 0.3949, "num_input_tokens_seen": 4149664, "step": 12300 }, { "epoch": 9.509273570324575, "grad_norm": 2.1385436058044434, "learning_rate": 3.670328005248663e-07, "loss": 0.3249, "num_input_tokens_seen": 4151328, "step": 12305 }, { "epoch": 9.513137557959814, "grad_norm": 4.42088508605957, "learning_rate": 3.612984152316368e-07, "loss": 0.3484, "num_input_tokens_seen": 4152928, "step": 12310 }, { "epoch": 9.517001545595054, "grad_norm": 5.489293575286865, "learning_rate": 3.5560885324945614e-07, "loss": 0.5367, "num_input_tokens_seen": 4154656, "step": 12315 }, { "epoch": 9.520865533230294, "grad_norm": 2.163701295852661, "learning_rate": 3.499641249289087e-07, "loss": 0.3529, "num_input_tokens_seen": 4156384, "step": 12320 }, { "epoch": 9.524729520865533, "grad_norm": 2.338674306869507, "learning_rate": 3.443642405390191e-07, "loss": 0.3302, "num_input_tokens_seen": 4158112, "step": 12325 }, { "epoch": 9.528593508500773, "grad_norm": 3.0639963150024414, "learning_rate": 3.3880921026723243e-07, "loss": 0.3465, "num_input_tokens_seen": 4159808, "step": 12330 }, { "epoch": 9.532457496136013, "grad_norm": 4.385707855224609, "learning_rate": 3.332990442193901e-07, "loss": 0.3361, "num_input_tokens_seen": 4161568, "step": 12335 }, { "epoch": 9.536321483771252, "grad_norm": 2.2154011726379395, "learning_rate": 3.2783375241971223e-07, "loss": 0.3253, "num_input_tokens_seen": 4163200, "step": 12340 }, { "epoch": 9.540185471406492, "grad_norm": 3.0670483112335205, "learning_rate": 3.2241334481079576e-07, "loss": 0.4767, "num_input_tokens_seen": 4164544, "step": 12345 }, { "epoch": 9.54404945904173, "grad_norm": 2.232095956802368, "learning_rate": 3.170378312535693e-07, "loss": 0.3247, "num_input_tokens_seen": 4166368, "step": 12350 }, { "epoch": 9.54791344667697, "grad_norm": 3.4633028507232666, "learning_rate": 3.1170722152729925e-07, "loss": 0.4064, "num_input_tokens_seen": 4168160, "step": 12355 }, { "epoch": 9.55177743431221, "grad_norm": 2.169146776199341, "learning_rate": 3.064215253295505e-07, "loss": 0.4454, "num_input_tokens_seen": 4169760, "step": 12360 }, { "epoch": 9.555641421947449, "grad_norm": 2.7106435298919678, "learning_rate": 3.011807522761922e-07, "loss": 0.3249, "num_input_tokens_seen": 4171072, "step": 12365 }, { "epoch": 9.55950540958269, "grad_norm": 3.6266634464263916, "learning_rate": 2.9598491190136467e-07, "loss": 0.3822, "num_input_tokens_seen": 4172736, "step": 12370 }, { "epoch": 9.56336939721793, "grad_norm": 3.3651204109191895, "learning_rate": 2.908340136574594e-07, "loss": 0.3543, "num_input_tokens_seen": 4174240, "step": 12375 }, { "epoch": 9.567233384853168, "grad_norm": 4.2870049476623535, "learning_rate": 2.857280669151141e-07, "loss": 0.4579, "num_input_tokens_seen": 4176224, "step": 12380 }, { "epoch": 9.571097372488408, "grad_norm": 2.2344555854797363, "learning_rate": 2.8066708096319005e-07, "loss": 0.4261, "num_input_tokens_seen": 4177792, "step": 12385 }, { "epoch": 9.574961360123648, "grad_norm": 3.547192096710205, "learning_rate": 2.756510650087557e-07, "loss": 0.5223, "num_input_tokens_seen": 4179744, "step": 12390 }, { "epoch": 9.578825347758887, "grad_norm": 3.632986068725586, "learning_rate": 2.706800281770588e-07, "loss": 0.4787, "num_input_tokens_seen": 4181376, "step": 12395 }, { "epoch": 9.582689335394127, "grad_norm": 3.0051348209381104, "learning_rate": 2.6575397951153213e-07, "loss": 0.4241, "num_input_tokens_seen": 4183136, "step": 12400 }, { "epoch": 9.586553323029367, "grad_norm": 3.9939866065979004, "learning_rate": 2.6087292797375983e-07, "loss": 0.5363, "num_input_tokens_seen": 4184704, "step": 12405 }, { "epoch": 9.590417310664606, "grad_norm": 5.929431915283203, "learning_rate": 2.5603688244346947e-07, "loss": 0.4596, "num_input_tokens_seen": 4186304, "step": 12410 }, { "epoch": 9.594281298299846, "grad_norm": 2.3288938999176025, "learning_rate": 2.512458517185068e-07, "loss": 0.6847, "num_input_tokens_seen": 4187840, "step": 12415 }, { "epoch": 9.598145285935084, "grad_norm": 3.3580892086029053, "learning_rate": 2.4649984451482753e-07, "loss": 0.3525, "num_input_tokens_seen": 4189376, "step": 12420 }, { "epoch": 9.602009273570324, "grad_norm": 4.576455116271973, "learning_rate": 2.417988694664836e-07, "loss": 0.2785, "num_input_tokens_seen": 4190816, "step": 12425 }, { "epoch": 9.605873261205565, "grad_norm": 1.87228524684906, "learning_rate": 2.3714293512560048e-07, "loss": 0.3433, "num_input_tokens_seen": 4192800, "step": 12430 }, { "epoch": 9.609737248840803, "grad_norm": 1.7355762720108032, "learning_rate": 2.3253204996236122e-07, "loss": 0.351, "num_input_tokens_seen": 4194304, "step": 12435 }, { "epoch": 9.613601236476043, "grad_norm": 2.8246798515319824, "learning_rate": 2.279662223650031e-07, "loss": 0.6185, "num_input_tokens_seen": 4196128, "step": 12440 }, { "epoch": 9.617465224111283, "grad_norm": 3.6127450466156006, "learning_rate": 2.23445460639779e-07, "loss": 0.4445, "num_input_tokens_seen": 4197760, "step": 12445 }, { "epoch": 9.621329211746522, "grad_norm": 1.552925705909729, "learning_rate": 2.1896977301097686e-07, "loss": 0.4241, "num_input_tokens_seen": 4199520, "step": 12450 }, { "epoch": 9.625193199381762, "grad_norm": 1.9134658575057983, "learning_rate": 2.145391676208669e-07, "loss": 0.3857, "num_input_tokens_seen": 4201152, "step": 12455 }, { "epoch": 9.629057187017002, "grad_norm": 4.811909198760986, "learning_rate": 2.1015365252971265e-07, "loss": 0.4717, "num_input_tokens_seen": 4202944, "step": 12460 }, { "epoch": 9.63292117465224, "grad_norm": 2.0131659507751465, "learning_rate": 2.0581323571574885e-07, "loss": 0.5431, "num_input_tokens_seen": 4204640, "step": 12465 }, { "epoch": 9.636785162287481, "grad_norm": 3.1481714248657227, "learning_rate": 2.0151792507516754e-07, "loss": 0.3901, "num_input_tokens_seen": 4206304, "step": 12470 }, { "epoch": 9.64064914992272, "grad_norm": 2.3905346393585205, "learning_rate": 1.9726772842209307e-07, "loss": 0.3175, "num_input_tokens_seen": 4207968, "step": 12475 }, { "epoch": 9.64451313755796, "grad_norm": 3.0238401889801025, "learning_rate": 1.9306265348858766e-07, "loss": 0.3083, "num_input_tokens_seen": 4209568, "step": 12480 }, { "epoch": 9.6483771251932, "grad_norm": 3.815396547317505, "learning_rate": 1.889027079246236e-07, "loss": 0.4579, "num_input_tokens_seen": 4211200, "step": 12485 }, { "epoch": 9.652241112828438, "grad_norm": 3.164646625518799, "learning_rate": 1.8478789929807505e-07, "loss": 0.4784, "num_input_tokens_seen": 4212800, "step": 12490 }, { "epoch": 9.656105100463678, "grad_norm": 2.745734214782715, "learning_rate": 1.8071823509469288e-07, "loss": 0.3265, "num_input_tokens_seen": 4214240, "step": 12495 }, { "epoch": 9.659969088098919, "grad_norm": 2.863051652908325, "learning_rate": 1.7669372271811313e-07, "loss": 0.2973, "num_input_tokens_seen": 4215808, "step": 12500 }, { "epoch": 9.663833075734157, "grad_norm": 1.6896435022354126, "learning_rate": 1.7271436948982368e-07, "loss": 0.2802, "num_input_tokens_seen": 4217280, "step": 12505 }, { "epoch": 9.667697063369397, "grad_norm": 4.163987636566162, "learning_rate": 1.6878018264915584e-07, "loss": 0.4277, "num_input_tokens_seen": 4218944, "step": 12510 }, { "epoch": 9.671561051004637, "grad_norm": 2.701277732849121, "learning_rate": 1.6489116935327343e-07, "loss": 0.3073, "num_input_tokens_seen": 4220640, "step": 12515 }, { "epoch": 9.675425038639876, "grad_norm": 2.0621275901794434, "learning_rate": 1.6104733667716975e-07, "loss": 0.4403, "num_input_tokens_seen": 4222464, "step": 12520 }, { "epoch": 9.679289026275116, "grad_norm": 1.9660770893096924, "learning_rate": 1.57248691613629e-07, "loss": 0.3775, "num_input_tokens_seen": 4224352, "step": 12525 }, { "epoch": 9.683153013910356, "grad_norm": 4.617716312408447, "learning_rate": 1.5349524107323988e-07, "loss": 0.5308, "num_input_tokens_seen": 4226304, "step": 12530 }, { "epoch": 9.687017001545595, "grad_norm": 3.228199005126953, "learning_rate": 1.4978699188437083e-07, "loss": 0.3465, "num_input_tokens_seen": 4227872, "step": 12535 }, { "epoch": 9.690880989180835, "grad_norm": 2.934948682785034, "learning_rate": 1.4612395079315334e-07, "loss": 0.3727, "num_input_tokens_seen": 4229472, "step": 12540 }, { "epoch": 9.694744976816073, "grad_norm": 4.764428615570068, "learning_rate": 1.4250612446347622e-07, "loss": 0.582, "num_input_tokens_seen": 4231360, "step": 12545 }, { "epoch": 9.698608964451314, "grad_norm": 1.8488270044326782, "learning_rate": 1.389335194769803e-07, "loss": 0.2928, "num_input_tokens_seen": 4233056, "step": 12550 }, { "epoch": 9.702472952086554, "grad_norm": 2.9798285961151123, "learning_rate": 1.354061423330333e-07, "loss": 0.3887, "num_input_tokens_seen": 4234784, "step": 12555 }, { "epoch": 9.706336939721792, "grad_norm": 5.29259729385376, "learning_rate": 1.3192399944872148e-07, "loss": 0.4355, "num_input_tokens_seen": 4236416, "step": 12560 }, { "epoch": 9.710200927357032, "grad_norm": 2.224881649017334, "learning_rate": 1.2848709715884145e-07, "loss": 0.4145, "num_input_tokens_seen": 4237856, "step": 12565 }, { "epoch": 9.714064914992273, "grad_norm": 4.5395684242248535, "learning_rate": 1.2509544171588893e-07, "loss": 0.4684, "num_input_tokens_seen": 4239616, "step": 12570 }, { "epoch": 9.717928902627511, "grad_norm": 2.8548245429992676, "learning_rate": 1.217490392900422e-07, "loss": 0.3258, "num_input_tokens_seen": 4241440, "step": 12575 }, { "epoch": 9.721792890262751, "grad_norm": 2.956843376159668, "learning_rate": 1.1844789596915651e-07, "loss": 0.2769, "num_input_tokens_seen": 4243136, "step": 12580 }, { "epoch": 9.725656877897991, "grad_norm": 3.3378677368164062, "learning_rate": 1.1519201775875288e-07, "loss": 0.4726, "num_input_tokens_seen": 4244608, "step": 12585 }, { "epoch": 9.72952086553323, "grad_norm": 3.0103256702423096, "learning_rate": 1.1198141058199885e-07, "loss": 0.378, "num_input_tokens_seen": 4245984, "step": 12590 }, { "epoch": 9.73338485316847, "grad_norm": 3.071077346801758, "learning_rate": 1.0881608027971114e-07, "loss": 0.4027, "num_input_tokens_seen": 4247776, "step": 12595 }, { "epoch": 9.737248840803709, "grad_norm": 2.644495964050293, "learning_rate": 1.056960326103279e-07, "loss": 0.4663, "num_input_tokens_seen": 4249376, "step": 12600 }, { "epoch": 9.741112828438949, "grad_norm": 2.9050793647766113, "learning_rate": 1.0262127324991988e-07, "loss": 0.3626, "num_input_tokens_seen": 4250944, "step": 12605 }, { "epoch": 9.744976816074189, "grad_norm": 2.609511613845825, "learning_rate": 9.959180779215704e-08, "loss": 0.3471, "num_input_tokens_seen": 4253056, "step": 12610 }, { "epoch": 9.748840803709427, "grad_norm": 3.760219097137451, "learning_rate": 9.660764174831971e-08, "loss": 0.3606, "num_input_tokens_seen": 4254624, "step": 12615 }, { "epoch": 9.752704791344668, "grad_norm": 2.3010716438293457, "learning_rate": 9.366878054727079e-08, "loss": 0.3397, "num_input_tokens_seen": 4256352, "step": 12620 }, { "epoch": 9.756568778979908, "grad_norm": 2.6140429973602295, "learning_rate": 9.077522953545859e-08, "loss": 0.3416, "num_input_tokens_seen": 4258112, "step": 12625 }, { "epoch": 9.760432766615146, "grad_norm": 2.1192972660064697, "learning_rate": 8.79269939768973e-08, "loss": 0.3426, "num_input_tokens_seen": 4259776, "step": 12630 }, { "epoch": 9.764296754250386, "grad_norm": 3.0398504734039307, "learning_rate": 8.512407905316432e-08, "loss": 0.2975, "num_input_tokens_seen": 4261568, "step": 12635 }, { "epoch": 9.768160741885627, "grad_norm": 3.8385674953460693, "learning_rate": 8.23664898633919e-08, "loss": 0.6986, "num_input_tokens_seen": 4263200, "step": 12640 }, { "epoch": 9.772024729520865, "grad_norm": 3.394974946975708, "learning_rate": 7.965423142425044e-08, "loss": 0.5382, "num_input_tokens_seen": 4264800, "step": 12645 }, { "epoch": 9.775888717156105, "grad_norm": 5.671842575073242, "learning_rate": 7.698730866994575e-08, "loss": 0.6548, "num_input_tokens_seen": 4266464, "step": 12650 }, { "epoch": 9.779752704791346, "grad_norm": 3.1489827632904053, "learning_rate": 7.436572645220519e-08, "loss": 0.582, "num_input_tokens_seen": 4268000, "step": 12655 }, { "epoch": 9.783616692426584, "grad_norm": 2.5151143074035645, "learning_rate": 7.178948954027209e-08, "loss": 0.4289, "num_input_tokens_seen": 4269632, "step": 12660 }, { "epoch": 9.787480680061824, "grad_norm": 2.244499683380127, "learning_rate": 6.925860262090301e-08, "loss": 0.3258, "num_input_tokens_seen": 4271616, "step": 12665 }, { "epoch": 9.791344667697063, "grad_norm": 3.0234575271606445, "learning_rate": 6.677307029834267e-08, "loss": 0.3312, "num_input_tokens_seen": 4273280, "step": 12670 }, { "epoch": 9.795208655332303, "grad_norm": 1.9545496702194214, "learning_rate": 6.43328970943352e-08, "loss": 0.3729, "num_input_tokens_seen": 4275136, "step": 12675 }, { "epoch": 9.799072642967543, "grad_norm": 4.744884967803955, "learning_rate": 6.193808744809626e-08, "loss": 0.3585, "num_input_tokens_seen": 4276736, "step": 12680 }, { "epoch": 9.802936630602781, "grad_norm": 4.335810661315918, "learning_rate": 5.9588645716324164e-08, "loss": 0.3921, "num_input_tokens_seen": 4278144, "step": 12685 }, { "epoch": 9.806800618238022, "grad_norm": 1.6805505752563477, "learning_rate": 5.728457617317773e-08, "loss": 0.3523, "num_input_tokens_seen": 4279904, "step": 12690 }, { "epoch": 9.810664605873262, "grad_norm": 2.5937626361846924, "learning_rate": 5.502588301027345e-08, "loss": 0.3226, "num_input_tokens_seen": 4281600, "step": 12695 }, { "epoch": 9.8145285935085, "grad_norm": 2.872339963912964, "learning_rate": 5.281257033668552e-08, "loss": 0.4694, "num_input_tokens_seen": 4283168, "step": 12700 }, { "epoch": 9.81839258114374, "grad_norm": 2.3865396976470947, "learning_rate": 5.064464217891807e-08, "loss": 0.5221, "num_input_tokens_seen": 4284512, "step": 12705 }, { "epoch": 9.82225656877898, "grad_norm": 4.482175827026367, "learning_rate": 4.852210248091904e-08, "loss": 0.3996, "num_input_tokens_seen": 4286400, "step": 12710 }, { "epoch": 9.826120556414219, "grad_norm": 2.052034378051758, "learning_rate": 4.644495510406632e-08, "loss": 0.4497, "num_input_tokens_seen": 4288320, "step": 12715 }, { "epoch": 9.82998454404946, "grad_norm": 2.5211973190307617, "learning_rate": 4.441320382715108e-08, "loss": 0.3119, "num_input_tokens_seen": 4290080, "step": 12720 }, { "epoch": 9.833848531684698, "grad_norm": 3.225955009460449, "learning_rate": 4.242685234638888e-08, "loss": 0.3016, "num_input_tokens_seen": 4291808, "step": 12725 }, { "epoch": 9.837712519319938, "grad_norm": 3.4123611450195312, "learning_rate": 4.0485904275391897e-08, "loss": 0.4407, "num_input_tokens_seen": 4293504, "step": 12730 }, { "epoch": 9.841576506955178, "grad_norm": 3.4620604515075684, "learning_rate": 3.859036314518283e-08, "loss": 0.353, "num_input_tokens_seen": 4295232, "step": 12735 }, { "epoch": 9.845440494590417, "grad_norm": 2.629182815551758, "learning_rate": 3.6740232404175454e-08, "loss": 0.3825, "num_input_tokens_seen": 4296928, "step": 12740 }, { "epoch": 9.849304482225657, "grad_norm": 2.251512050628662, "learning_rate": 3.4935515418169085e-08, "loss": 0.3618, "num_input_tokens_seen": 4298848, "step": 12745 }, { "epoch": 9.853168469860897, "grad_norm": 1.867733359336853, "learning_rate": 3.3176215470348546e-08, "loss": 0.4566, "num_input_tokens_seen": 4300768, "step": 12750 }, { "epoch": 9.857032457496135, "grad_norm": 2.154043197631836, "learning_rate": 3.146233576127311e-08, "loss": 0.3259, "num_input_tokens_seen": 4302304, "step": 12755 }, { "epoch": 9.860896445131376, "grad_norm": 2.700408935546875, "learning_rate": 2.979387940887646e-08, "loss": 0.3904, "num_input_tokens_seen": 4303936, "step": 12760 }, { "epoch": 9.864760432766616, "grad_norm": 1.5906521081924438, "learning_rate": 2.817084944845283e-08, "loss": 0.3409, "num_input_tokens_seen": 4305536, "step": 12765 }, { "epoch": 9.868624420401854, "grad_norm": 2.562021017074585, "learning_rate": 2.6593248832654237e-08, "loss": 0.388, "num_input_tokens_seen": 4307136, "step": 12770 }, { "epoch": 9.872488408037094, "grad_norm": 3.066938638687134, "learning_rate": 2.506108043149047e-08, "loss": 0.4761, "num_input_tokens_seen": 4309088, "step": 12775 }, { "epoch": 9.876352395672335, "grad_norm": 3.498051881790161, "learning_rate": 2.3574347032320754e-08, "loss": 0.3983, "num_input_tokens_seen": 4310592, "step": 12780 }, { "epoch": 9.880216383307573, "grad_norm": 2.5193159580230713, "learning_rate": 2.2133051339842668e-08, "loss": 0.347, "num_input_tokens_seen": 4312256, "step": 12785 }, { "epoch": 9.884080370942813, "grad_norm": 3.0472331047058105, "learning_rate": 2.0737195976100464e-08, "loss": 0.329, "num_input_tokens_seen": 4313888, "step": 12790 }, { "epoch": 9.887944358578052, "grad_norm": 2.916365146636963, "learning_rate": 1.938678348046008e-08, "loss": 0.4648, "num_input_tokens_seen": 4315552, "step": 12795 }, { "epoch": 9.891808346213292, "grad_norm": 2.862055540084839, "learning_rate": 1.808181630963135e-08, "loss": 0.2715, "num_input_tokens_seen": 4317344, "step": 12800 }, { "epoch": 9.895672333848532, "grad_norm": 2.0580503940582275, "learning_rate": 1.6822296837637474e-08, "loss": 0.4197, "num_input_tokens_seen": 4319008, "step": 12805 }, { "epoch": 9.89953632148377, "grad_norm": 2.9503679275512695, "learning_rate": 1.5608227355826123e-08, "loss": 0.3993, "num_input_tokens_seen": 4320640, "step": 12810 }, { "epoch": 9.90340030911901, "grad_norm": 3.6856019496917725, "learning_rate": 1.4439610072863874e-08, "loss": 0.3689, "num_input_tokens_seen": 4322752, "step": 12815 }, { "epoch": 9.907264296754251, "grad_norm": 1.490685224533081, "learning_rate": 1.3316447114725129e-08, "loss": 0.2754, "num_input_tokens_seen": 4324288, "step": 12820 }, { "epoch": 9.91112828438949, "grad_norm": 1.670583724975586, "learning_rate": 1.2238740524692094e-08, "loss": 0.2842, "num_input_tokens_seen": 4325728, "step": 12825 }, { "epoch": 9.91499227202473, "grad_norm": 4.434152603149414, "learning_rate": 1.1206492263360347e-08, "loss": 0.4791, "num_input_tokens_seen": 4327424, "step": 12830 }, { "epoch": 9.91885625965997, "grad_norm": 3.289130687713623, "learning_rate": 1.0219704208616621e-08, "loss": 0.3733, "num_input_tokens_seen": 4328928, "step": 12835 }, { "epoch": 9.922720247295208, "grad_norm": 1.6618568897247314, "learning_rate": 9.278378155647138e-09, "loss": 0.4155, "num_input_tokens_seen": 4330944, "step": 12840 }, { "epoch": 9.926584234930449, "grad_norm": 2.3654398918151855, "learning_rate": 8.382515816937609e-09, "loss": 0.2891, "num_input_tokens_seen": 4332864, "step": 12845 }, { "epoch": 9.930448222565687, "grad_norm": 2.3738040924072266, "learning_rate": 7.532118822262124e-09, "loss": 0.3042, "num_input_tokens_seen": 4334432, "step": 12850 }, { "epoch": 9.934312210200927, "grad_norm": 2.8485097885131836, "learning_rate": 6.727188718683164e-09, "loss": 0.3138, "num_input_tokens_seen": 4336096, "step": 12855 }, { "epoch": 9.938176197836167, "grad_norm": 3.2798142433166504, "learning_rate": 5.967726970548815e-09, "loss": 0.3776, "num_input_tokens_seen": 4337728, "step": 12860 }, { "epoch": 9.942040185471406, "grad_norm": 2.3865997791290283, "learning_rate": 5.2537349594872224e-09, "loss": 0.3127, "num_input_tokens_seen": 4339360, "step": 12865 }, { "epoch": 9.945904173106646, "grad_norm": 1.958700180053711, "learning_rate": 4.58521398441214e-09, "loss": 0.3483, "num_input_tokens_seen": 4341184, "step": 12870 }, { "epoch": 9.949768160741886, "grad_norm": 3.0867204666137695, "learning_rate": 3.9621652615118295e-09, "loss": 0.5156, "num_input_tokens_seen": 4342848, "step": 12875 }, { "epoch": 9.953632148377125, "grad_norm": 4.302017688751221, "learning_rate": 3.3845899242546108e-09, "loss": 0.4649, "num_input_tokens_seen": 4344672, "step": 12880 }, { "epoch": 9.957496136012365, "grad_norm": 3.5528242588043213, "learning_rate": 2.8524890233722066e-09, "loss": 0.3251, "num_input_tokens_seen": 4346272, "step": 12885 }, { "epoch": 9.961360123647605, "grad_norm": 1.7357161045074463, "learning_rate": 2.3658635268819507e-09, "loss": 0.5271, "num_input_tokens_seen": 4348224, "step": 12890 }, { "epoch": 9.965224111282843, "grad_norm": 5.57891321182251, "learning_rate": 1.9247143200618043e-09, "loss": 0.4358, "num_input_tokens_seen": 4349984, "step": 12895 }, { "epoch": 9.969088098918084, "grad_norm": 2.2650599479675293, "learning_rate": 1.529042205458686e-09, "loss": 0.3581, "num_input_tokens_seen": 4351488, "step": 12900 }, { "epoch": 9.972952086553324, "grad_norm": 6.011088848114014, "learning_rate": 1.1788479028940203e-09, "loss": 0.3865, "num_input_tokens_seen": 4353024, "step": 12905 }, { "epoch": 9.976816074188562, "grad_norm": 5.385447025299072, "learning_rate": 8.741320494443095e-10, "loss": 0.5942, "num_input_tokens_seen": 4354560, "step": 12910 }, { "epoch": 9.980680061823803, "grad_norm": 3.2540972232818604, "learning_rate": 6.148951994577878e-10, "loss": 0.3823, "num_input_tokens_seen": 4356128, "step": 12915 }, { "epoch": 9.984544049459041, "grad_norm": 2.619534969329834, "learning_rate": 4.0113782454609395e-10, "loss": 0.4517, "num_input_tokens_seen": 4357888, "step": 12920 }, { "epoch": 9.988408037094281, "grad_norm": 3.0035240650177, "learning_rate": 2.3286031357871994e-10, "loss": 0.3333, "num_input_tokens_seen": 4359456, "step": 12925 }, { "epoch": 9.992272024729521, "grad_norm": 2.3697316646575928, "learning_rate": 1.1006297269411381e-10, "loss": 0.4075, "num_input_tokens_seen": 4361152, "step": 12930 }, { "epoch": 9.99613601236476, "grad_norm": 2.007124185562134, "learning_rate": 3.274602528302584e-11, "loss": 0.734, "num_input_tokens_seen": 4362496, "step": 12935 }, { "epoch": 10.0, "grad_norm": 5.1930389404296875, "learning_rate": 9.096120051621526e-13, "loss": 0.4486, "num_input_tokens_seen": 4364240, "step": 12940 }, { "epoch": 10.0, "eval_loss": 0.5031372904777527, "eval_runtime": 11.5556, "eval_samples_per_second": 49.759, "eval_steps_per_second": 12.461, "num_input_tokens_seen": 4364240, "step": 12940 }, { "epoch": 10.0, "num_input_tokens_seen": 4364240, "step": 12940, "total_flos": 1.9651975470317568e+17, "train_loss": 0.5767820302172458, "train_runtime": 2616.0084, "train_samples_per_second": 19.778, "train_steps_per_second": 4.946 } ], "logging_steps": 5, "max_steps": 12940, "num_input_tokens_seen": 4364240, "num_train_epochs": 10, "save_steps": 647, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9651975470317568e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }