diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,41632 @@ +{ + "best_global_step": 21998, + "best_metric": 0.4446313679218292, + "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_stsb_456_1760637814/checkpoint-21998", + "epoch": 20.0, + "eval_steps": 1294, + "global_step": 25880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0038639876352395673, + "grad_norm": 3.6805174350738525, + "learning_rate": 7.727975270479134e-08, + "loss": 4.9406, + "num_input_tokens_seen": 1728, + "step": 5 + }, + { + "epoch": 0.0077279752704791345, + "grad_norm": 4.186256408691406, + "learning_rate": 1.7387944358578052e-07, + "loss": 4.6607, + "num_input_tokens_seen": 3520, + "step": 10 + }, + { + "epoch": 0.011591962905718702, + "grad_norm": 5.091475009918213, + "learning_rate": 2.704791344667697e-07, + "loss": 4.8512, + "num_input_tokens_seen": 5120, + "step": 15 + }, + { + "epoch": 0.015455950540958269, + "grad_norm": 4.099307060241699, + "learning_rate": 3.670788253477589e-07, + "loss": 5.4205, + "num_input_tokens_seen": 6720, + "step": 20 + }, + { + "epoch": 0.019319938176197836, + "grad_norm": 6.88045072555542, + "learning_rate": 4.636785162287481e-07, + "loss": 4.7156, + "num_input_tokens_seen": 8576, + "step": 25 + }, + { + "epoch": 0.023183925811437404, + "grad_norm": 4.43112325668335, + "learning_rate": 5.602782071097372e-07, + "loss": 4.7277, + "num_input_tokens_seen": 10240, + "step": 30 + }, + { + "epoch": 0.02704791344667697, + "grad_norm": 3.841653347015381, + "learning_rate": 6.568778979907265e-07, + "loss": 4.577, + "num_input_tokens_seen": 12032, + "step": 35 + }, + { + "epoch": 0.030911901081916538, + "grad_norm": 5.664732933044434, + "learning_rate": 7.534775888717157e-07, + "loss": 5.1788, + "num_input_tokens_seen": 13856, + "step": 40 + }, + { + "epoch": 0.0347758887171561, + "grad_norm": 3.5554909706115723, + "learning_rate": 8.500772797527049e-07, + "loss": 4.7527, + "num_input_tokens_seen": 15392, + "step": 45 + }, + { + "epoch": 0.03863987635239567, + "grad_norm": 5.954793453216553, + "learning_rate": 9.466769706336941e-07, + "loss": 5.2011, + "num_input_tokens_seen": 17088, + "step": 50 + }, + { + "epoch": 0.04250386398763524, + "grad_norm": 4.14451789855957, + "learning_rate": 1.0432766615146832e-06, + "loss": 5.1104, + "num_input_tokens_seen": 18528, + "step": 55 + }, + { + "epoch": 0.04636785162287481, + "grad_norm": 5.51561164855957, + "learning_rate": 1.1398763523956722e-06, + "loss": 5.1824, + "num_input_tokens_seen": 20128, + "step": 60 + }, + { + "epoch": 0.05023183925811438, + "grad_norm": 4.798835754394531, + "learning_rate": 1.2364760432766615e-06, + "loss": 4.8092, + "num_input_tokens_seen": 21728, + "step": 65 + }, + { + "epoch": 0.05409582689335394, + "grad_norm": 3.721454620361328, + "learning_rate": 1.3330757341576507e-06, + "loss": 4.7296, + "num_input_tokens_seen": 23680, + "step": 70 + }, + { + "epoch": 0.05795981452859351, + "grad_norm": 5.345273494720459, + "learning_rate": 1.42967542503864e-06, + "loss": 5.2396, + "num_input_tokens_seen": 25376, + "step": 75 + }, + { + "epoch": 0.061823802163833076, + "grad_norm": 5.0508270263671875, + "learning_rate": 1.5262751159196291e-06, + "loss": 5.0289, + "num_input_tokens_seen": 27008, + "step": 80 + }, + { + "epoch": 0.06568778979907264, + "grad_norm": 3.4617745876312256, + "learning_rate": 1.6228748068006183e-06, + "loss": 4.8012, + "num_input_tokens_seen": 28736, + "step": 85 + }, + { + "epoch": 0.0695517774343122, + "grad_norm": 6.158999919891357, + "learning_rate": 1.7194744976816076e-06, + "loss": 4.905, + "num_input_tokens_seen": 30528, + "step": 90 + }, + { + "epoch": 0.07341576506955177, + "grad_norm": 3.679497480392456, + "learning_rate": 1.8160741885625968e-06, + "loss": 5.1084, + "num_input_tokens_seen": 32352, + "step": 95 + }, + { + "epoch": 0.07727975270479134, + "grad_norm": 4.463337421417236, + "learning_rate": 1.9126738794435858e-06, + "loss": 4.7781, + "num_input_tokens_seen": 34240, + "step": 100 + }, + { + "epoch": 0.08114374034003091, + "grad_norm": 3.6712703704833984, + "learning_rate": 2.009273570324575e-06, + "loss": 4.9044, + "num_input_tokens_seen": 35968, + "step": 105 + }, + { + "epoch": 0.08500772797527048, + "grad_norm": 4.574020862579346, + "learning_rate": 2.1058732612055642e-06, + "loss": 4.9974, + "num_input_tokens_seen": 37376, + "step": 110 + }, + { + "epoch": 0.08887171561051005, + "grad_norm": 4.872241497039795, + "learning_rate": 2.2024729520865534e-06, + "loss": 4.7524, + "num_input_tokens_seen": 38944, + "step": 115 + }, + { + "epoch": 0.09273570324574962, + "grad_norm": 4.872554779052734, + "learning_rate": 2.2990726429675427e-06, + "loss": 4.7485, + "num_input_tokens_seen": 40768, + "step": 120 + }, + { + "epoch": 0.09659969088098919, + "grad_norm": 3.6612114906311035, + "learning_rate": 2.3956723338485315e-06, + "loss": 4.9618, + "num_input_tokens_seen": 42240, + "step": 125 + }, + { + "epoch": 0.10046367851622875, + "grad_norm": 4.086244106292725, + "learning_rate": 2.4922720247295207e-06, + "loss": 4.616, + "num_input_tokens_seen": 44128, + "step": 130 + }, + { + "epoch": 0.10432766615146831, + "grad_norm": 5.514211177825928, + "learning_rate": 2.58887171561051e-06, + "loss": 4.9914, + "num_input_tokens_seen": 45728, + "step": 135 + }, + { + "epoch": 0.10819165378670788, + "grad_norm": 4.810798168182373, + "learning_rate": 2.685471406491499e-06, + "loss": 4.6327, + "num_input_tokens_seen": 47200, + "step": 140 + }, + { + "epoch": 0.11205564142194745, + "grad_norm": 3.7766611576080322, + "learning_rate": 2.7820710973724883e-06, + "loss": 4.7806, + "num_input_tokens_seen": 49056, + "step": 145 + }, + { + "epoch": 0.11591962905718702, + "grad_norm": 5.025806427001953, + "learning_rate": 2.8786707882534776e-06, + "loss": 5.0558, + "num_input_tokens_seen": 50880, + "step": 150 + }, + { + "epoch": 0.11978361669242658, + "grad_norm": 3.660550594329834, + "learning_rate": 2.9752704791344668e-06, + "loss": 5.2515, + "num_input_tokens_seen": 52384, + "step": 155 + }, + { + "epoch": 0.12364760432766615, + "grad_norm": 4.906343460083008, + "learning_rate": 3.071870170015456e-06, + "loss": 4.9144, + "num_input_tokens_seen": 54048, + "step": 160 + }, + { + "epoch": 0.1275115919629057, + "grad_norm": 4.28169584274292, + "learning_rate": 3.1684698608964452e-06, + "loss": 4.9702, + "num_input_tokens_seen": 55584, + "step": 165 + }, + { + "epoch": 0.13137557959814528, + "grad_norm": 6.88645076751709, + "learning_rate": 3.2650695517774344e-06, + "loss": 5.0235, + "num_input_tokens_seen": 57408, + "step": 170 + }, + { + "epoch": 0.13523956723338484, + "grad_norm": 4.4947404861450195, + "learning_rate": 3.3616692426584237e-06, + "loss": 5.362, + "num_input_tokens_seen": 59072, + "step": 175 + }, + { + "epoch": 0.1391035548686244, + "grad_norm": 3.896484136581421, + "learning_rate": 3.458268933539413e-06, + "loss": 4.7239, + "num_input_tokens_seen": 60672, + "step": 180 + }, + { + "epoch": 0.14296754250386398, + "grad_norm": 5.122371196746826, + "learning_rate": 3.554868624420402e-06, + "loss": 5.5912, + "num_input_tokens_seen": 62144, + "step": 185 + }, + { + "epoch": 0.14683153013910355, + "grad_norm": 6.12204647064209, + "learning_rate": 3.6514683153013913e-06, + "loss": 5.1848, + "num_input_tokens_seen": 63776, + "step": 190 + }, + { + "epoch": 0.15069551777434312, + "grad_norm": 3.8710076808929443, + "learning_rate": 3.7480680061823805e-06, + "loss": 5.1779, + "num_input_tokens_seen": 65504, + "step": 195 + }, + { + "epoch": 0.1545595054095827, + "grad_norm": 4.689063549041748, + "learning_rate": 3.844667697063369e-06, + "loss": 4.5636, + "num_input_tokens_seen": 67136, + "step": 200 + }, + { + "epoch": 0.15842349304482226, + "grad_norm": 3.481635332107544, + "learning_rate": 3.941267387944358e-06, + "loss": 5.062, + "num_input_tokens_seen": 69120, + "step": 205 + }, + { + "epoch": 0.16228748068006182, + "grad_norm": 3.804504156112671, + "learning_rate": 4.037867078825347e-06, + "loss": 4.7703, + "num_input_tokens_seen": 70688, + "step": 210 + }, + { + "epoch": 0.1661514683153014, + "grad_norm": 5.110154151916504, + "learning_rate": 4.1344667697063366e-06, + "loss": 4.857, + "num_input_tokens_seen": 72192, + "step": 215 + }, + { + "epoch": 0.17001545595054096, + "grad_norm": 4.826644420623779, + "learning_rate": 4.231066460587326e-06, + "loss": 4.855, + "num_input_tokens_seen": 73600, + "step": 220 + }, + { + "epoch": 0.17387944358578053, + "grad_norm": 4.60107421875, + "learning_rate": 4.327666151468315e-06, + "loss": 4.757, + "num_input_tokens_seen": 75296, + "step": 225 + }, + { + "epoch": 0.1777434312210201, + "grad_norm": 3.3634376525878906, + "learning_rate": 4.424265842349304e-06, + "loss": 4.6414, + "num_input_tokens_seen": 76864, + "step": 230 + }, + { + "epoch": 0.18160741885625967, + "grad_norm": 5.0371928215026855, + "learning_rate": 4.5208655332302934e-06, + "loss": 5.2475, + "num_input_tokens_seen": 78432, + "step": 235 + }, + { + "epoch": 0.18547140649149924, + "grad_norm": 4.673345565795898, + "learning_rate": 4.617465224111283e-06, + "loss": 4.8825, + "num_input_tokens_seen": 79904, + "step": 240 + }, + { + "epoch": 0.1893353941267388, + "grad_norm": 5.228572368621826, + "learning_rate": 4.714064914992272e-06, + "loss": 4.8198, + "num_input_tokens_seen": 81472, + "step": 245 + }, + { + "epoch": 0.19319938176197837, + "grad_norm": 5.808956146240234, + "learning_rate": 4.810664605873261e-06, + "loss": 5.1266, + "num_input_tokens_seen": 83232, + "step": 250 + }, + { + "epoch": 0.19706336939721794, + "grad_norm": 5.369587421417236, + "learning_rate": 4.90726429675425e-06, + "loss": 4.9931, + "num_input_tokens_seen": 84640, + "step": 255 + }, + { + "epoch": 0.2009273570324575, + "grad_norm": 4.453649520874023, + "learning_rate": 5.0038639876352395e-06, + "loss": 4.9528, + "num_input_tokens_seen": 86400, + "step": 260 + }, + { + "epoch": 0.20479134466769705, + "grad_norm": 6.332608699798584, + "learning_rate": 5.100463678516229e-06, + "loss": 5.0311, + "num_input_tokens_seen": 88032, + "step": 265 + }, + { + "epoch": 0.20865533230293662, + "grad_norm": 4.272708892822266, + "learning_rate": 5.197063369397218e-06, + "loss": 4.6926, + "num_input_tokens_seen": 89920, + "step": 270 + }, + { + "epoch": 0.2125193199381762, + "grad_norm": 7.445973873138428, + "learning_rate": 5.293663060278207e-06, + "loss": 4.6278, + "num_input_tokens_seen": 91520, + "step": 275 + }, + { + "epoch": 0.21638330757341576, + "grad_norm": 4.215386867523193, + "learning_rate": 5.390262751159196e-06, + "loss": 4.7017, + "num_input_tokens_seen": 93312, + "step": 280 + }, + { + "epoch": 0.22024729520865532, + "grad_norm": 4.505995273590088, + "learning_rate": 5.486862442040186e-06, + "loss": 5.2456, + "num_input_tokens_seen": 94848, + "step": 285 + }, + { + "epoch": 0.2241112828438949, + "grad_norm": 3.680572032928467, + "learning_rate": 5.583462132921175e-06, + "loss": 4.5175, + "num_input_tokens_seen": 96480, + "step": 290 + }, + { + "epoch": 0.22797527047913446, + "grad_norm": 4.184290885925293, + "learning_rate": 5.680061823802164e-06, + "loss": 4.958, + "num_input_tokens_seen": 98112, + "step": 295 + }, + { + "epoch": 0.23183925811437403, + "grad_norm": 4.326183319091797, + "learning_rate": 5.776661514683153e-06, + "loss": 4.865, + "num_input_tokens_seen": 99552, + "step": 300 + }, + { + "epoch": 0.2357032457496136, + "grad_norm": 3.9850432872772217, + "learning_rate": 5.8732612055641425e-06, + "loss": 5.0237, + "num_input_tokens_seen": 101184, + "step": 305 + }, + { + "epoch": 0.23956723338485317, + "grad_norm": 4.029183864593506, + "learning_rate": 5.969860896445132e-06, + "loss": 4.8201, + "num_input_tokens_seen": 102912, + "step": 310 + }, + { + "epoch": 0.24343122102009274, + "grad_norm": 4.530503273010254, + "learning_rate": 6.066460587326121e-06, + "loss": 5.0544, + "num_input_tokens_seen": 104352, + "step": 315 + }, + { + "epoch": 0.2472952086553323, + "grad_norm": 4.8182148933410645, + "learning_rate": 6.16306027820711e-06, + "loss": 5.0646, + "num_input_tokens_seen": 106272, + "step": 320 + }, + { + "epoch": 0.2511591962905719, + "grad_norm": 4.47132682800293, + "learning_rate": 6.259659969088099e-06, + "loss": 5.1606, + "num_input_tokens_seen": 107776, + "step": 325 + }, + { + "epoch": 0.2550231839258114, + "grad_norm": 3.988560914993286, + "learning_rate": 6.356259659969089e-06, + "loss": 4.8144, + "num_input_tokens_seen": 109312, + "step": 330 + }, + { + "epoch": 0.258887171561051, + "grad_norm": 4.715147972106934, + "learning_rate": 6.452859350850078e-06, + "loss": 5.3652, + "num_input_tokens_seen": 111040, + "step": 335 + }, + { + "epoch": 0.26275115919629055, + "grad_norm": 5.515060901641846, + "learning_rate": 6.549459041731067e-06, + "loss": 5.0706, + "num_input_tokens_seen": 112448, + "step": 340 + }, + { + "epoch": 0.26661514683153015, + "grad_norm": 4.515748977661133, + "learning_rate": 6.646058732612056e-06, + "loss": 4.6075, + "num_input_tokens_seen": 114368, + "step": 345 + }, + { + "epoch": 0.2704791344667697, + "grad_norm": 3.5851542949676514, + "learning_rate": 6.7426584234930455e-06, + "loss": 4.812, + "num_input_tokens_seen": 116256, + "step": 350 + }, + { + "epoch": 0.2743431221020093, + "grad_norm": 3.7061409950256348, + "learning_rate": 6.839258114374035e-06, + "loss": 5.0968, + "num_input_tokens_seen": 117568, + "step": 355 + }, + { + "epoch": 0.2782071097372488, + "grad_norm": 6.0998148918151855, + "learning_rate": 6.935857805255024e-06, + "loss": 5.1112, + "num_input_tokens_seen": 119168, + "step": 360 + }, + { + "epoch": 0.2820710973724884, + "grad_norm": 7.725158214569092, + "learning_rate": 7.032457496136012e-06, + "loss": 4.9412, + "num_input_tokens_seen": 120992, + "step": 365 + }, + { + "epoch": 0.28593508500772796, + "grad_norm": 4.401492118835449, + "learning_rate": 7.129057187017002e-06, + "loss": 4.7925, + "num_input_tokens_seen": 122528, + "step": 370 + }, + { + "epoch": 0.28979907264296756, + "grad_norm": 5.568647384643555, + "learning_rate": 7.225656877897991e-06, + "loss": 5.2659, + "num_input_tokens_seen": 124096, + "step": 375 + }, + { + "epoch": 0.2936630602782071, + "grad_norm": 3.326474905014038, + "learning_rate": 7.322256568778981e-06, + "loss": 4.8918, + "num_input_tokens_seen": 126016, + "step": 380 + }, + { + "epoch": 0.2975270479134467, + "grad_norm": 3.3305108547210693, + "learning_rate": 7.418856259659969e-06, + "loss": 4.8946, + "num_input_tokens_seen": 127648, + "step": 385 + }, + { + "epoch": 0.30139103554868624, + "grad_norm": 4.971543788909912, + "learning_rate": 7.515455950540959e-06, + "loss": 4.9667, + "num_input_tokens_seen": 129408, + "step": 390 + }, + { + "epoch": 0.30525502318392583, + "grad_norm": 3.410069465637207, + "learning_rate": 7.612055641421948e-06, + "loss": 5.1623, + "num_input_tokens_seen": 131168, + "step": 395 + }, + { + "epoch": 0.3091190108191654, + "grad_norm": 3.749352216720581, + "learning_rate": 7.708655332302938e-06, + "loss": 4.7188, + "num_input_tokens_seen": 132992, + "step": 400 + }, + { + "epoch": 0.31298299845440497, + "grad_norm": 5.316215991973877, + "learning_rate": 7.805255023183925e-06, + "loss": 4.896, + "num_input_tokens_seen": 134784, + "step": 405 + }, + { + "epoch": 0.3168469860896445, + "grad_norm": 6.16975212097168, + "learning_rate": 7.901854714064916e-06, + "loss": 4.903, + "num_input_tokens_seen": 136192, + "step": 410 + }, + { + "epoch": 0.3207109737248841, + "grad_norm": 4.628363609313965, + "learning_rate": 7.998454404945904e-06, + "loss": 4.5454, + "num_input_tokens_seen": 138048, + "step": 415 + }, + { + "epoch": 0.32457496136012365, + "grad_norm": 3.7655742168426514, + "learning_rate": 8.095054095826895e-06, + "loss": 4.7332, + "num_input_tokens_seen": 139520, + "step": 420 + }, + { + "epoch": 0.3284389489953632, + "grad_norm": 4.0720438957214355, + "learning_rate": 8.191653786707882e-06, + "loss": 4.7965, + "num_input_tokens_seen": 141280, + "step": 425 + }, + { + "epoch": 0.3323029366306028, + "grad_norm": 4.058412551879883, + "learning_rate": 8.288253477588873e-06, + "loss": 4.6925, + "num_input_tokens_seen": 142848, + "step": 430 + }, + { + "epoch": 0.3361669242658423, + "grad_norm": 3.7655951976776123, + "learning_rate": 8.38485316846986e-06, + "loss": 4.9576, + "num_input_tokens_seen": 144256, + "step": 435 + }, + { + "epoch": 0.3400309119010819, + "grad_norm": 3.907864809036255, + "learning_rate": 8.481452859350851e-06, + "loss": 4.7943, + "num_input_tokens_seen": 146080, + "step": 440 + }, + { + "epoch": 0.34389489953632146, + "grad_norm": 3.8371055126190186, + "learning_rate": 8.578052550231839e-06, + "loss": 4.9278, + "num_input_tokens_seen": 147776, + "step": 445 + }, + { + "epoch": 0.34775888717156106, + "grad_norm": 4.158056735992432, + "learning_rate": 8.67465224111283e-06, + "loss": 5.1833, + "num_input_tokens_seen": 149632, + "step": 450 + }, + { + "epoch": 0.3516228748068006, + "grad_norm": 5.039457321166992, + "learning_rate": 8.771251931993817e-06, + "loss": 4.7911, + "num_input_tokens_seen": 151264, + "step": 455 + }, + { + "epoch": 0.3554868624420402, + "grad_norm": 4.278141021728516, + "learning_rate": 8.867851622874808e-06, + "loss": 4.9534, + "num_input_tokens_seen": 153056, + "step": 460 + }, + { + "epoch": 0.35935085007727974, + "grad_norm": 3.9893436431884766, + "learning_rate": 8.964451313755796e-06, + "loss": 4.8729, + "num_input_tokens_seen": 154560, + "step": 465 + }, + { + "epoch": 0.36321483771251933, + "grad_norm": 4.37880802154541, + "learning_rate": 9.061051004636785e-06, + "loss": 4.7666, + "num_input_tokens_seen": 156320, + "step": 470 + }, + { + "epoch": 0.3670788253477589, + "grad_norm": 3.7590625286102295, + "learning_rate": 9.157650695517774e-06, + "loss": 4.9472, + "num_input_tokens_seen": 158080, + "step": 475 + }, + { + "epoch": 0.37094281298299847, + "grad_norm": 3.8684964179992676, + "learning_rate": 9.254250386398764e-06, + "loss": 4.6479, + "num_input_tokens_seen": 159808, + "step": 480 + }, + { + "epoch": 0.374806800618238, + "grad_norm": 3.970315456390381, + "learning_rate": 9.350850077279753e-06, + "loss": 4.6125, + "num_input_tokens_seen": 161536, + "step": 485 + }, + { + "epoch": 0.3786707882534776, + "grad_norm": 3.9221572875976562, + "learning_rate": 9.447449768160742e-06, + "loss": 4.9294, + "num_input_tokens_seen": 163328, + "step": 490 + }, + { + "epoch": 0.38253477588871715, + "grad_norm": 4.067615509033203, + "learning_rate": 9.544049459041731e-06, + "loss": 5.0473, + "num_input_tokens_seen": 164832, + "step": 495 + }, + { + "epoch": 0.38639876352395675, + "grad_norm": 3.307046890258789, + "learning_rate": 9.64064914992272e-06, + "loss": 4.4512, + "num_input_tokens_seen": 166464, + "step": 500 + }, + { + "epoch": 0.3902627511591963, + "grad_norm": 4.478376388549805, + "learning_rate": 9.73724884080371e-06, + "loss": 4.4699, + "num_input_tokens_seen": 168000, + "step": 505 + }, + { + "epoch": 0.3941267387944359, + "grad_norm": 5.069242477416992, + "learning_rate": 9.833848531684699e-06, + "loss": 4.4894, + "num_input_tokens_seen": 169664, + "step": 510 + }, + { + "epoch": 0.3979907264296754, + "grad_norm": 3.819711923599243, + "learning_rate": 9.930448222565688e-06, + "loss": 4.2944, + "num_input_tokens_seen": 171232, + "step": 515 + }, + { + "epoch": 0.401854714064915, + "grad_norm": 4.125909805297852, + "learning_rate": 1.0027047913446677e-05, + "loss": 4.4943, + "num_input_tokens_seen": 172928, + "step": 520 + }, + { + "epoch": 0.40571870170015456, + "grad_norm": 3.4513394832611084, + "learning_rate": 1.0123647604327666e-05, + "loss": 4.3343, + "num_input_tokens_seen": 174656, + "step": 525 + }, + { + "epoch": 0.4095826893353941, + "grad_norm": 4.693109035491943, + "learning_rate": 1.0220247295208656e-05, + "loss": 4.1091, + "num_input_tokens_seen": 176448, + "step": 530 + }, + { + "epoch": 0.4134466769706337, + "grad_norm": 3.3794727325439453, + "learning_rate": 1.0316846986089645e-05, + "loss": 4.563, + "num_input_tokens_seen": 177920, + "step": 535 + }, + { + "epoch": 0.41731066460587324, + "grad_norm": 3.4081645011901855, + "learning_rate": 1.0413446676970634e-05, + "loss": 4.1849, + "num_input_tokens_seen": 179456, + "step": 540 + }, + { + "epoch": 0.42117465224111283, + "grad_norm": 4.791799068450928, + "learning_rate": 1.0510046367851623e-05, + "loss": 3.8085, + "num_input_tokens_seen": 181120, + "step": 545 + }, + { + "epoch": 0.4250386398763524, + "grad_norm": 3.795644998550415, + "learning_rate": 1.0606646058732613e-05, + "loss": 4.0528, + "num_input_tokens_seen": 182656, + "step": 550 + }, + { + "epoch": 0.42890262751159197, + "grad_norm": 2.8246705532073975, + "learning_rate": 1.0703245749613602e-05, + "loss": 4.0321, + "num_input_tokens_seen": 184384, + "step": 555 + }, + { + "epoch": 0.4327666151468315, + "grad_norm": 2.7717151641845703, + "learning_rate": 1.0799845440494591e-05, + "loss": 4.0111, + "num_input_tokens_seen": 186016, + "step": 560 + }, + { + "epoch": 0.4366306027820711, + "grad_norm": 2.9911084175109863, + "learning_rate": 1.089644513137558e-05, + "loss": 3.7839, + "num_input_tokens_seen": 187872, + "step": 565 + }, + { + "epoch": 0.44049459041731065, + "grad_norm": 2.9883596897125244, + "learning_rate": 1.099304482225657e-05, + "loss": 3.6635, + "num_input_tokens_seen": 189376, + "step": 570 + }, + { + "epoch": 0.44435857805255025, + "grad_norm": 2.9561715126037598, + "learning_rate": 1.1089644513137559e-05, + "loss": 3.4506, + "num_input_tokens_seen": 190944, + "step": 575 + }, + { + "epoch": 0.4482225656877898, + "grad_norm": 2.991717576980591, + "learning_rate": 1.1186244204018548e-05, + "loss": 3.5431, + "num_input_tokens_seen": 193024, + "step": 580 + }, + { + "epoch": 0.4520865533230294, + "grad_norm": 3.1864638328552246, + "learning_rate": 1.1282843894899537e-05, + "loss": 3.4831, + "num_input_tokens_seen": 194816, + "step": 585 + }, + { + "epoch": 0.4559505409582689, + "grad_norm": 3.0469613075256348, + "learning_rate": 1.1379443585780526e-05, + "loss": 3.1789, + "num_input_tokens_seen": 196288, + "step": 590 + }, + { + "epoch": 0.4598145285935085, + "grad_norm": 3.118184804916382, + "learning_rate": 1.1476043276661516e-05, + "loss": 3.4682, + "num_input_tokens_seen": 197952, + "step": 595 + }, + { + "epoch": 0.46367851622874806, + "grad_norm": 2.7087576389312744, + "learning_rate": 1.1572642967542505e-05, + "loss": 3.138, + "num_input_tokens_seen": 199552, + "step": 600 + }, + { + "epoch": 0.46754250386398766, + "grad_norm": 4.019400596618652, + "learning_rate": 1.1669242658423494e-05, + "loss": 3.1462, + "num_input_tokens_seen": 201088, + "step": 605 + }, + { + "epoch": 0.4714064914992272, + "grad_norm": 2.884896755218506, + "learning_rate": 1.1765842349304483e-05, + "loss": 3.1362, + "num_input_tokens_seen": 202656, + "step": 610 + }, + { + "epoch": 0.4752704791344668, + "grad_norm": 2.6672661304473877, + "learning_rate": 1.1862442040185472e-05, + "loss": 3.2543, + "num_input_tokens_seen": 204480, + "step": 615 + }, + { + "epoch": 0.47913446676970634, + "grad_norm": 2.749147415161133, + "learning_rate": 1.1959041731066462e-05, + "loss": 3.4195, + "num_input_tokens_seen": 205984, + "step": 620 + }, + { + "epoch": 0.48299845440494593, + "grad_norm": 3.34041690826416, + "learning_rate": 1.2055641421947451e-05, + "loss": 3.0572, + "num_input_tokens_seen": 207616, + "step": 625 + }, + { + "epoch": 0.4868624420401855, + "grad_norm": 2.419572353363037, + "learning_rate": 1.215224111282844e-05, + "loss": 3.0094, + "num_input_tokens_seen": 209408, + "step": 630 + }, + { + "epoch": 0.490726429675425, + "grad_norm": 2.502732992172241, + "learning_rate": 1.224884080370943e-05, + "loss": 2.8415, + "num_input_tokens_seen": 210944, + "step": 635 + }, + { + "epoch": 0.4945904173106646, + "grad_norm": 3.2450432777404785, + "learning_rate": 1.2345440494590419e-05, + "loss": 2.9567, + "num_input_tokens_seen": 212608, + "step": 640 + }, + { + "epoch": 0.49845440494590415, + "grad_norm": 2.9201743602752686, + "learning_rate": 1.2442040185471408e-05, + "loss": 2.8341, + "num_input_tokens_seen": 214368, + "step": 645 + }, + { + "epoch": 0.5023183925811437, + "grad_norm": 2.6117916107177734, + "learning_rate": 1.2538639876352395e-05, + "loss": 3.0014, + "num_input_tokens_seen": 216064, + "step": 650 + }, + { + "epoch": 0.5061823802163833, + "grad_norm": 3.230393409729004, + "learning_rate": 1.2635239567233386e-05, + "loss": 2.4902, + "num_input_tokens_seen": 217664, + "step": 655 + }, + { + "epoch": 0.5100463678516228, + "grad_norm": 2.9872682094573975, + "learning_rate": 1.2731839258114375e-05, + "loss": 2.8583, + "num_input_tokens_seen": 219200, + "step": 660 + }, + { + "epoch": 0.5139103554868625, + "grad_norm": 2.5046072006225586, + "learning_rate": 1.2828438948995365e-05, + "loss": 2.9773, + "num_input_tokens_seen": 220832, + "step": 665 + }, + { + "epoch": 0.517774343122102, + "grad_norm": 2.4766368865966797, + "learning_rate": 1.2925038639876352e-05, + "loss": 2.5416, + "num_input_tokens_seen": 222528, + "step": 670 + }, + { + "epoch": 0.5216383307573416, + "grad_norm": 3.704665184020996, + "learning_rate": 1.3021638330757341e-05, + "loss": 3.1747, + "num_input_tokens_seen": 224000, + "step": 675 + }, + { + "epoch": 0.5255023183925811, + "grad_norm": 2.735163927078247, + "learning_rate": 1.3118238021638332e-05, + "loss": 2.5039, + "num_input_tokens_seen": 225984, + "step": 680 + }, + { + "epoch": 0.5293663060278208, + "grad_norm": 3.165984630584717, + "learning_rate": 1.3214837712519322e-05, + "loss": 2.4288, + "num_input_tokens_seen": 227616, + "step": 685 + }, + { + "epoch": 0.5332302936630603, + "grad_norm": 2.6482503414154053, + "learning_rate": 1.3311437403400309e-05, + "loss": 2.8146, + "num_input_tokens_seen": 229632, + "step": 690 + }, + { + "epoch": 0.5370942812982998, + "grad_norm": 2.448072910308838, + "learning_rate": 1.3408037094281298e-05, + "loss": 2.9714, + "num_input_tokens_seen": 231136, + "step": 695 + }, + { + "epoch": 0.5409582689335394, + "grad_norm": 2.695878267288208, + "learning_rate": 1.350463678516229e-05, + "loss": 2.268, + "num_input_tokens_seen": 233056, + "step": 700 + }, + { + "epoch": 0.544822256568779, + "grad_norm": 3.0769786834716797, + "learning_rate": 1.3601236476043278e-05, + "loss": 2.5508, + "num_input_tokens_seen": 234912, + "step": 705 + }, + { + "epoch": 0.5486862442040186, + "grad_norm": 2.682255744934082, + "learning_rate": 1.3697836166924266e-05, + "loss": 2.48, + "num_input_tokens_seen": 236480, + "step": 710 + }, + { + "epoch": 0.5525502318392581, + "grad_norm": 2.5016865730285645, + "learning_rate": 1.3794435857805255e-05, + "loss": 2.1732, + "num_input_tokens_seen": 238048, + "step": 715 + }, + { + "epoch": 0.5564142194744977, + "grad_norm": 2.401695728302002, + "learning_rate": 1.3891035548686246e-05, + "loss": 2.7279, + "num_input_tokens_seen": 239968, + "step": 720 + }, + { + "epoch": 0.5602782071097373, + "grad_norm": 2.802387237548828, + "learning_rate": 1.3987635239567235e-05, + "loss": 2.7073, + "num_input_tokens_seen": 241664, + "step": 725 + }, + { + "epoch": 0.5641421947449768, + "grad_norm": 2.573873996734619, + "learning_rate": 1.4084234930448223e-05, + "loss": 2.2728, + "num_input_tokens_seen": 243424, + "step": 730 + }, + { + "epoch": 0.5680061823802164, + "grad_norm": 2.4397976398468018, + "learning_rate": 1.4180834621329212e-05, + "loss": 2.4815, + "num_input_tokens_seen": 244992, + "step": 735 + }, + { + "epoch": 0.5718701700154559, + "grad_norm": 1.9084938764572144, + "learning_rate": 1.4277434312210203e-05, + "loss": 2.3776, + "num_input_tokens_seen": 246688, + "step": 740 + }, + { + "epoch": 0.5757341576506955, + "grad_norm": 2.897129535675049, + "learning_rate": 1.4374034003091192e-05, + "loss": 2.5601, + "num_input_tokens_seen": 248288, + "step": 745 + }, + { + "epoch": 0.5795981452859351, + "grad_norm": 2.8136632442474365, + "learning_rate": 1.447063369397218e-05, + "loss": 2.58, + "num_input_tokens_seen": 250144, + "step": 750 + }, + { + "epoch": 0.5834621329211747, + "grad_norm": 2.5259265899658203, + "learning_rate": 1.4567233384853169e-05, + "loss": 2.3088, + "num_input_tokens_seen": 251776, + "step": 755 + }, + { + "epoch": 0.5873261205564142, + "grad_norm": 3.675905466079712, + "learning_rate": 1.466383307573416e-05, + "loss": 2.6533, + "num_input_tokens_seen": 253376, + "step": 760 + }, + { + "epoch": 0.5911901081916537, + "grad_norm": 2.3405559062957764, + "learning_rate": 1.4760432766615149e-05, + "loss": 2.605, + "num_input_tokens_seen": 254976, + "step": 765 + }, + { + "epoch": 0.5950540958268934, + "grad_norm": 3.094191312789917, + "learning_rate": 1.4857032457496137e-05, + "loss": 2.181, + "num_input_tokens_seen": 256704, + "step": 770 + }, + { + "epoch": 0.5989180834621329, + "grad_norm": 2.3928945064544678, + "learning_rate": 1.4953632148377126e-05, + "loss": 2.3367, + "num_input_tokens_seen": 258496, + "step": 775 + }, + { + "epoch": 0.6027820710973725, + "grad_norm": 2.263355016708374, + "learning_rate": 1.5050231839258113e-05, + "loss": 2.0375, + "num_input_tokens_seen": 260224, + "step": 780 + }, + { + "epoch": 0.606646058732612, + "grad_norm": 2.8704583644866943, + "learning_rate": 1.5146831530139106e-05, + "loss": 2.6148, + "num_input_tokens_seen": 262016, + "step": 785 + }, + { + "epoch": 0.6105100463678517, + "grad_norm": 2.299835443496704, + "learning_rate": 1.5243431221020093e-05, + "loss": 2.2299, + "num_input_tokens_seen": 264096, + "step": 790 + }, + { + "epoch": 0.6143740340030912, + "grad_norm": 2.230757236480713, + "learning_rate": 1.5340030911901083e-05, + "loss": 2.7957, + "num_input_tokens_seen": 265632, + "step": 795 + }, + { + "epoch": 0.6182380216383307, + "grad_norm": 2.318157434463501, + "learning_rate": 1.5436630602782072e-05, + "loss": 2.198, + "num_input_tokens_seen": 267488, + "step": 800 + }, + { + "epoch": 0.6221020092735703, + "grad_norm": 2.869781732559204, + "learning_rate": 1.553323029366306e-05, + "loss": 2.3341, + "num_input_tokens_seen": 269248, + "step": 805 + }, + { + "epoch": 0.6259659969088099, + "grad_norm": 2.393261432647705, + "learning_rate": 1.562982998454405e-05, + "loss": 2.1675, + "num_input_tokens_seen": 270848, + "step": 810 + }, + { + "epoch": 0.6298299845440495, + "grad_norm": 2.6343114376068115, + "learning_rate": 1.572642967542504e-05, + "loss": 2.1174, + "num_input_tokens_seen": 272640, + "step": 815 + }, + { + "epoch": 0.633693972179289, + "grad_norm": 2.0376734733581543, + "learning_rate": 1.582302936630603e-05, + "loss": 1.9669, + "num_input_tokens_seen": 274528, + "step": 820 + }, + { + "epoch": 0.6375579598145286, + "grad_norm": 2.7749550342559814, + "learning_rate": 1.5919629057187018e-05, + "loss": 2.2926, + "num_input_tokens_seen": 276192, + "step": 825 + }, + { + "epoch": 0.6414219474497682, + "grad_norm": 2.08839750289917, + "learning_rate": 1.6016228748068007e-05, + "loss": 2.3585, + "num_input_tokens_seen": 277856, + "step": 830 + }, + { + "epoch": 0.6452859350850078, + "grad_norm": 2.258702516555786, + "learning_rate": 1.6112828438948996e-05, + "loss": 2.1608, + "num_input_tokens_seen": 279680, + "step": 835 + }, + { + "epoch": 0.6491499227202473, + "grad_norm": 2.055309772491455, + "learning_rate": 1.6209428129829986e-05, + "loss": 2.273, + "num_input_tokens_seen": 281440, + "step": 840 + }, + { + "epoch": 0.6530139103554868, + "grad_norm": 2.6223304271698, + "learning_rate": 1.6306027820710975e-05, + "loss": 2.6457, + "num_input_tokens_seen": 283072, + "step": 845 + }, + { + "epoch": 0.6568778979907264, + "grad_norm": 2.2198402881622314, + "learning_rate": 1.6402627511591964e-05, + "loss": 2.0739, + "num_input_tokens_seen": 284832, + "step": 850 + }, + { + "epoch": 0.660741885625966, + "grad_norm": 2.4770278930664062, + "learning_rate": 1.6499227202472953e-05, + "loss": 2.1028, + "num_input_tokens_seen": 286528, + "step": 855 + }, + { + "epoch": 0.6646058732612056, + "grad_norm": 2.2246718406677246, + "learning_rate": 1.6595826893353942e-05, + "loss": 1.8553, + "num_input_tokens_seen": 288192, + "step": 860 + }, + { + "epoch": 0.6684698608964451, + "grad_norm": 2.0216636657714844, + "learning_rate": 1.6692426584234932e-05, + "loss": 2.1973, + "num_input_tokens_seen": 289856, + "step": 865 + }, + { + "epoch": 0.6723338485316847, + "grad_norm": 2.053602457046509, + "learning_rate": 1.678902627511592e-05, + "loss": 2.497, + "num_input_tokens_seen": 291552, + "step": 870 + }, + { + "epoch": 0.6761978361669243, + "grad_norm": 2.5723683834075928, + "learning_rate": 1.688562596599691e-05, + "loss": 2.1789, + "num_input_tokens_seen": 293120, + "step": 875 + }, + { + "epoch": 0.6800618238021638, + "grad_norm": 1.7902522087097168, + "learning_rate": 1.69822256568779e-05, + "loss": 1.958, + "num_input_tokens_seen": 294720, + "step": 880 + }, + { + "epoch": 0.6839258114374034, + "grad_norm": 2.063884973526001, + "learning_rate": 1.7078825347758885e-05, + "loss": 2.1074, + "num_input_tokens_seen": 296384, + "step": 885 + }, + { + "epoch": 0.6877897990726429, + "grad_norm": 2.207538604736328, + "learning_rate": 1.7175425038639878e-05, + "loss": 1.8564, + "num_input_tokens_seen": 297824, + "step": 890 + }, + { + "epoch": 0.6916537867078826, + "grad_norm": 1.9068430662155151, + "learning_rate": 1.7272024729520867e-05, + "loss": 1.8601, + "num_input_tokens_seen": 299488, + "step": 895 + }, + { + "epoch": 0.6955177743431221, + "grad_norm": 2.0543270111083984, + "learning_rate": 1.7368624420401856e-05, + "loss": 2.0481, + "num_input_tokens_seen": 301280, + "step": 900 + }, + { + "epoch": 0.6993817619783617, + "grad_norm": 2.044473886489868, + "learning_rate": 1.7465224111282842e-05, + "loss": 2.0931, + "num_input_tokens_seen": 302944, + "step": 905 + }, + { + "epoch": 0.7032457496136012, + "grad_norm": 2.361893892288208, + "learning_rate": 1.7561823802163835e-05, + "loss": 2.2002, + "num_input_tokens_seen": 304928, + "step": 910 + }, + { + "epoch": 0.7071097372488409, + "grad_norm": 1.342307686805725, + "learning_rate": 1.7658423493044824e-05, + "loss": 1.7323, + "num_input_tokens_seen": 306656, + "step": 915 + }, + { + "epoch": 0.7109737248840804, + "grad_norm": 2.0836408138275146, + "learning_rate": 1.7755023183925813e-05, + "loss": 1.8292, + "num_input_tokens_seen": 308384, + "step": 920 + }, + { + "epoch": 0.7148377125193199, + "grad_norm": 1.6608034372329712, + "learning_rate": 1.78516228748068e-05, + "loss": 1.6646, + "num_input_tokens_seen": 310144, + "step": 925 + }, + { + "epoch": 0.7187017001545595, + "grad_norm": 2.147900342941284, + "learning_rate": 1.794822256568779e-05, + "loss": 1.8185, + "num_input_tokens_seen": 311904, + "step": 930 + }, + { + "epoch": 0.7225656877897991, + "grad_norm": 2.032017469406128, + "learning_rate": 1.804482225656878e-05, + "loss": 1.6086, + "num_input_tokens_seen": 313600, + "step": 935 + }, + { + "epoch": 0.7264296754250387, + "grad_norm": 1.7658897638320923, + "learning_rate": 1.814142194744977e-05, + "loss": 1.7894, + "num_input_tokens_seen": 315104, + "step": 940 + }, + { + "epoch": 0.7302936630602782, + "grad_norm": 1.8841807842254639, + "learning_rate": 1.8238021638330756e-05, + "loss": 1.9533, + "num_input_tokens_seen": 316480, + "step": 945 + }, + { + "epoch": 0.7341576506955177, + "grad_norm": 1.8372564315795898, + "learning_rate": 1.833462132921175e-05, + "loss": 1.5692, + "num_input_tokens_seen": 317888, + "step": 950 + }, + { + "epoch": 0.7380216383307573, + "grad_norm": 1.691672921180725, + "learning_rate": 1.8431221020092738e-05, + "loss": 1.7776, + "num_input_tokens_seen": 319744, + "step": 955 + }, + { + "epoch": 0.7418856259659969, + "grad_norm": 1.5950008630752563, + "learning_rate": 1.8527820710973727e-05, + "loss": 1.6131, + "num_input_tokens_seen": 321440, + "step": 960 + }, + { + "epoch": 0.7457496136012365, + "grad_norm": 2.1656761169433594, + "learning_rate": 1.8624420401854713e-05, + "loss": 1.6691, + "num_input_tokens_seen": 323232, + "step": 965 + }, + { + "epoch": 0.749613601236476, + "grad_norm": 1.2306383848190308, + "learning_rate": 1.8721020092735705e-05, + "loss": 1.7332, + "num_input_tokens_seen": 324960, + "step": 970 + }, + { + "epoch": 0.7534775888717156, + "grad_norm": 2.245983362197876, + "learning_rate": 1.8817619783616695e-05, + "loss": 1.8248, + "num_input_tokens_seen": 326688, + "step": 975 + }, + { + "epoch": 0.7573415765069552, + "grad_norm": 1.9021021127700806, + "learning_rate": 1.8914219474497684e-05, + "loss": 1.7382, + "num_input_tokens_seen": 328416, + "step": 980 + }, + { + "epoch": 0.7612055641421948, + "grad_norm": 1.340457797050476, + "learning_rate": 1.901081916537867e-05, + "loss": 1.9714, + "num_input_tokens_seen": 329888, + "step": 985 + }, + { + "epoch": 0.7650695517774343, + "grad_norm": 1.8272640705108643, + "learning_rate": 1.910741885625966e-05, + "loss": 1.5439, + "num_input_tokens_seen": 331712, + "step": 990 + }, + { + "epoch": 0.7689335394126738, + "grad_norm": 1.5553293228149414, + "learning_rate": 1.920401854714065e-05, + "loss": 1.4105, + "num_input_tokens_seen": 333664, + "step": 995 + }, + { + "epoch": 0.7727975270479135, + "grad_norm": 1.6645678281784058, + "learning_rate": 1.930061823802164e-05, + "loss": 1.4528, + "num_input_tokens_seen": 335424, + "step": 1000 + }, + { + "epoch": 0.776661514683153, + "grad_norm": 1.477904200553894, + "learning_rate": 1.9397217928902626e-05, + "loss": 1.4044, + "num_input_tokens_seen": 337376, + "step": 1005 + }, + { + "epoch": 0.7805255023183926, + "grad_norm": 1.6714982986450195, + "learning_rate": 1.9493817619783616e-05, + "loss": 1.4953, + "num_input_tokens_seen": 339136, + "step": 1010 + }, + { + "epoch": 0.7843894899536321, + "grad_norm": 1.7422798871994019, + "learning_rate": 1.9590417310664608e-05, + "loss": 1.5242, + "num_input_tokens_seen": 340736, + "step": 1015 + }, + { + "epoch": 0.7882534775888718, + "grad_norm": 0.9459184408187866, + "learning_rate": 1.9687017001545598e-05, + "loss": 1.4444, + "num_input_tokens_seen": 342496, + "step": 1020 + }, + { + "epoch": 0.7921174652241113, + "grad_norm": 1.40915846824646, + "learning_rate": 1.9783616692426583e-05, + "loss": 1.4012, + "num_input_tokens_seen": 344352, + "step": 1025 + }, + { + "epoch": 0.7959814528593508, + "grad_norm": 1.3908138275146484, + "learning_rate": 1.9880216383307573e-05, + "loss": 1.3634, + "num_input_tokens_seen": 345856, + "step": 1030 + }, + { + "epoch": 0.7998454404945904, + "grad_norm": 1.1213754415512085, + "learning_rate": 1.9976816074188565e-05, + "loss": 1.1757, + "num_input_tokens_seen": 347712, + "step": 1035 + }, + { + "epoch": 0.80370942812983, + "grad_norm": 1.2550541162490845, + "learning_rate": 2.0073415765069554e-05, + "loss": 1.384, + "num_input_tokens_seen": 349440, + "step": 1040 + }, + { + "epoch": 0.8075734157650696, + "grad_norm": 2.008554220199585, + "learning_rate": 2.017001545595054e-05, + "loss": 1.666, + "num_input_tokens_seen": 351136, + "step": 1045 + }, + { + "epoch": 0.8114374034003091, + "grad_norm": 1.2999014854431152, + "learning_rate": 2.026661514683153e-05, + "loss": 1.4654, + "num_input_tokens_seen": 352576, + "step": 1050 + }, + { + "epoch": 0.8153013910355487, + "grad_norm": 1.5222673416137695, + "learning_rate": 2.0363214837712522e-05, + "loss": 1.4969, + "num_input_tokens_seen": 353952, + "step": 1055 + }, + { + "epoch": 0.8191653786707882, + "grad_norm": 1.2305611371994019, + "learning_rate": 2.045981452859351e-05, + "loss": 1.2079, + "num_input_tokens_seen": 355904, + "step": 1060 + }, + { + "epoch": 0.8230293663060279, + "grad_norm": 1.217146396636963, + "learning_rate": 2.0556414219474497e-05, + "loss": 1.256, + "num_input_tokens_seen": 357600, + "step": 1065 + }, + { + "epoch": 0.8268933539412674, + "grad_norm": 1.514101505279541, + "learning_rate": 2.0653013910355486e-05, + "loss": 1.3303, + "num_input_tokens_seen": 359776, + "step": 1070 + }, + { + "epoch": 0.8307573415765069, + "grad_norm": 1.0365791320800781, + "learning_rate": 2.074961360123648e-05, + "loss": 1.2087, + "num_input_tokens_seen": 361312, + "step": 1075 + }, + { + "epoch": 0.8346213292117465, + "grad_norm": 0.9848813414573669, + "learning_rate": 2.0846213292117468e-05, + "loss": 1.2778, + "num_input_tokens_seen": 362944, + "step": 1080 + }, + { + "epoch": 0.8384853168469861, + "grad_norm": 0.8346930742263794, + "learning_rate": 2.0942812982998454e-05, + "loss": 1.2258, + "num_input_tokens_seen": 364736, + "step": 1085 + }, + { + "epoch": 0.8423493044822257, + "grad_norm": 1.9258639812469482, + "learning_rate": 2.1039412673879443e-05, + "loss": 1.5704, + "num_input_tokens_seen": 366304, + "step": 1090 + }, + { + "epoch": 0.8462132921174652, + "grad_norm": 1.3529118299484253, + "learning_rate": 2.1136012364760432e-05, + "loss": 1.4254, + "num_input_tokens_seen": 368096, + "step": 1095 + }, + { + "epoch": 0.8500772797527048, + "grad_norm": 1.3674811124801636, + "learning_rate": 2.1232612055641425e-05, + "loss": 1.3614, + "num_input_tokens_seen": 369536, + "step": 1100 + }, + { + "epoch": 0.8539412673879444, + "grad_norm": 1.0025947093963623, + "learning_rate": 2.132921174652241e-05, + "loss": 1.2013, + "num_input_tokens_seen": 371200, + "step": 1105 + }, + { + "epoch": 0.8578052550231839, + "grad_norm": 1.2838925123214722, + "learning_rate": 2.14258114374034e-05, + "loss": 1.4188, + "num_input_tokens_seen": 372768, + "step": 1110 + }, + { + "epoch": 0.8616692426584235, + "grad_norm": 1.1110647916793823, + "learning_rate": 2.152241112828439e-05, + "loss": 1.1571, + "num_input_tokens_seen": 374432, + "step": 1115 + }, + { + "epoch": 0.865533230293663, + "grad_norm": 1.1052254438400269, + "learning_rate": 2.1619010819165382e-05, + "loss": 1.4895, + "num_input_tokens_seen": 376320, + "step": 1120 + }, + { + "epoch": 0.8693972179289027, + "grad_norm": 1.0645831823349, + "learning_rate": 2.1715610510046368e-05, + "loss": 1.2112, + "num_input_tokens_seen": 377952, + "step": 1125 + }, + { + "epoch": 0.8732612055641422, + "grad_norm": 0.9325103163719177, + "learning_rate": 2.1812210200927357e-05, + "loss": 1.1703, + "num_input_tokens_seen": 379360, + "step": 1130 + }, + { + "epoch": 0.8771251931993818, + "grad_norm": 1.43287193775177, + "learning_rate": 2.1908809891808346e-05, + "loss": 1.2756, + "num_input_tokens_seen": 381088, + "step": 1135 + }, + { + "epoch": 0.8809891808346213, + "grad_norm": 0.841060221195221, + "learning_rate": 2.200540958268934e-05, + "loss": 1.0208, + "num_input_tokens_seen": 382656, + "step": 1140 + }, + { + "epoch": 0.884853168469861, + "grad_norm": 1.6885892152786255, + "learning_rate": 2.2102009273570325e-05, + "loss": 1.089, + "num_input_tokens_seen": 384320, + "step": 1145 + }, + { + "epoch": 0.8887171561051005, + "grad_norm": 1.3935961723327637, + "learning_rate": 2.2198608964451314e-05, + "loss": 1.4137, + "num_input_tokens_seen": 386080, + "step": 1150 + }, + { + "epoch": 0.89258114374034, + "grad_norm": 1.3351738452911377, + "learning_rate": 2.2295208655332303e-05, + "loss": 1.2112, + "num_input_tokens_seen": 387904, + "step": 1155 + }, + { + "epoch": 0.8964451313755796, + "grad_norm": 0.807194709777832, + "learning_rate": 2.2391808346213296e-05, + "loss": 1.0659, + "num_input_tokens_seen": 389504, + "step": 1160 + }, + { + "epoch": 0.9003091190108191, + "grad_norm": 0.6564492583274841, + "learning_rate": 2.248840803709428e-05, + "loss": 1.0306, + "num_input_tokens_seen": 390944, + "step": 1165 + }, + { + "epoch": 0.9041731066460588, + "grad_norm": 1.3586955070495605, + "learning_rate": 2.258500772797527e-05, + "loss": 1.3617, + "num_input_tokens_seen": 392704, + "step": 1170 + }, + { + "epoch": 0.9080370942812983, + "grad_norm": 0.8704670667648315, + "learning_rate": 2.268160741885626e-05, + "loss": 1.1064, + "num_input_tokens_seen": 394528, + "step": 1175 + }, + { + "epoch": 0.9119010819165378, + "grad_norm": 1.9945849180221558, + "learning_rate": 2.2778207109737253e-05, + "loss": 1.5028, + "num_input_tokens_seen": 395872, + "step": 1180 + }, + { + "epoch": 0.9157650695517774, + "grad_norm": 0.9600248336791992, + "learning_rate": 2.287480680061824e-05, + "loss": 1.2965, + "num_input_tokens_seen": 397728, + "step": 1185 + }, + { + "epoch": 0.919629057187017, + "grad_norm": 1.0356462001800537, + "learning_rate": 2.2971406491499228e-05, + "loss": 1.1715, + "num_input_tokens_seen": 399264, + "step": 1190 + }, + { + "epoch": 0.9234930448222566, + "grad_norm": 1.3660202026367188, + "learning_rate": 2.3068006182380217e-05, + "loss": 1.0048, + "num_input_tokens_seen": 400864, + "step": 1195 + }, + { + "epoch": 0.9273570324574961, + "grad_norm": 0.5440446138381958, + "learning_rate": 2.3164605873261206e-05, + "loss": 1.1676, + "num_input_tokens_seen": 402432, + "step": 1200 + }, + { + "epoch": 0.9312210200927357, + "grad_norm": 1.0392385721206665, + "learning_rate": 2.3261205564142195e-05, + "loss": 1.0711, + "num_input_tokens_seen": 404000, + "step": 1205 + }, + { + "epoch": 0.9350850077279753, + "grad_norm": 1.1492503881454468, + "learning_rate": 2.3357805255023184e-05, + "loss": 1.029, + "num_input_tokens_seen": 405632, + "step": 1210 + }, + { + "epoch": 0.9389489953632149, + "grad_norm": 1.0171775817871094, + "learning_rate": 2.3454404945904174e-05, + "loss": 1.1775, + "num_input_tokens_seen": 407328, + "step": 1215 + }, + { + "epoch": 0.9428129829984544, + "grad_norm": 0.6542432308197021, + "learning_rate": 2.3551004636785163e-05, + "loss": 0.9562, + "num_input_tokens_seen": 409088, + "step": 1220 + }, + { + "epoch": 0.9466769706336939, + "grad_norm": 1.2752037048339844, + "learning_rate": 2.3647604327666152e-05, + "loss": 1.0574, + "num_input_tokens_seen": 410752, + "step": 1225 + }, + { + "epoch": 0.9505409582689336, + "grad_norm": 1.1602507829666138, + "learning_rate": 2.374420401854714e-05, + "loss": 1.3267, + "num_input_tokens_seen": 412512, + "step": 1230 + }, + { + "epoch": 0.9544049459041731, + "grad_norm": 1.4732599258422852, + "learning_rate": 2.384080370942813e-05, + "loss": 1.5001, + "num_input_tokens_seen": 414336, + "step": 1235 + }, + { + "epoch": 0.9582689335394127, + "grad_norm": 0.7993325591087341, + "learning_rate": 2.393740340030912e-05, + "loss": 1.1248, + "num_input_tokens_seen": 416672, + "step": 1240 + }, + { + "epoch": 0.9621329211746522, + "grad_norm": 1.8325283527374268, + "learning_rate": 2.403400309119011e-05, + "loss": 1.2869, + "num_input_tokens_seen": 418592, + "step": 1245 + }, + { + "epoch": 0.9659969088098919, + "grad_norm": 1.0647400617599487, + "learning_rate": 2.4130602782071098e-05, + "loss": 0.9643, + "num_input_tokens_seen": 420288, + "step": 1250 + }, + { + "epoch": 0.9698608964451314, + "grad_norm": 0.8173490762710571, + "learning_rate": 2.4227202472952087e-05, + "loss": 0.9955, + "num_input_tokens_seen": 422208, + "step": 1255 + }, + { + "epoch": 0.973724884080371, + "grad_norm": 0.798651933670044, + "learning_rate": 2.4323802163833077e-05, + "loss": 1.1829, + "num_input_tokens_seen": 423712, + "step": 1260 + }, + { + "epoch": 0.9775888717156105, + "grad_norm": 0.8122785687446594, + "learning_rate": 2.4420401854714066e-05, + "loss": 1.2709, + "num_input_tokens_seen": 425664, + "step": 1265 + }, + { + "epoch": 0.98145285935085, + "grad_norm": 0.6492089033126831, + "learning_rate": 2.4517001545595055e-05, + "loss": 1.5446, + "num_input_tokens_seen": 427456, + "step": 1270 + }, + { + "epoch": 0.9853168469860897, + "grad_norm": 0.9390770792961121, + "learning_rate": 2.4613601236476044e-05, + "loss": 1.0293, + "num_input_tokens_seen": 429152, + "step": 1275 + }, + { + "epoch": 0.9891808346213292, + "grad_norm": 1.5338762998580933, + "learning_rate": 2.4710200927357034e-05, + "loss": 1.3239, + "num_input_tokens_seen": 430944, + "step": 1280 + }, + { + "epoch": 0.9930448222565688, + "grad_norm": 1.0827213525772095, + "learning_rate": 2.4806800618238023e-05, + "loss": 1.1952, + "num_input_tokens_seen": 432544, + "step": 1285 + }, + { + "epoch": 0.9969088098918083, + "grad_norm": 0.5509246587753296, + "learning_rate": 2.4903400309119012e-05, + "loss": 1.0243, + "num_input_tokens_seen": 434144, + "step": 1290 + }, + { + "epoch": 1.0, + "eval_loss": 1.0522016286849976, + "eval_runtime": 6.2518, + "eval_samples_per_second": 91.974, + "eval_steps_per_second": 23.033, + "num_input_tokens_seen": 435104, + "step": 1294 + }, + { + "epoch": 1.000772797527048, + "grad_norm": 1.112228512763977, + "learning_rate": 2.5e-05, + "loss": 1.0292, + "num_input_tokens_seen": 435488, + "step": 1295 + }, + { + "epoch": 1.0046367851622875, + "grad_norm": 1.0624983310699463, + "learning_rate": 2.509659969088099e-05, + "loss": 1.1734, + "num_input_tokens_seen": 436928, + "step": 1300 + }, + { + "epoch": 1.008500772797527, + "grad_norm": 0.633518636226654, + "learning_rate": 2.519319938176198e-05, + "loss": 1.1229, + "num_input_tokens_seen": 438784, + "step": 1305 + }, + { + "epoch": 1.0123647604327666, + "grad_norm": 1.2441682815551758, + "learning_rate": 2.5289799072642965e-05, + "loss": 1.215, + "num_input_tokens_seen": 440640, + "step": 1310 + }, + { + "epoch": 1.0162287480680061, + "grad_norm": 0.9142671227455139, + "learning_rate": 2.5386398763523955e-05, + "loss": 0.9233, + "num_input_tokens_seen": 442240, + "step": 1315 + }, + { + "epoch": 1.0200927357032457, + "grad_norm": 1.0873416662216187, + "learning_rate": 2.548299845440495e-05, + "loss": 1.1925, + "num_input_tokens_seen": 443808, + "step": 1320 + }, + { + "epoch": 1.0239567233384854, + "grad_norm": 0.6712993383407593, + "learning_rate": 2.5579598145285937e-05, + "loss": 0.9134, + "num_input_tokens_seen": 445504, + "step": 1325 + }, + { + "epoch": 1.027820710973725, + "grad_norm": 0.7719399929046631, + "learning_rate": 2.5676197836166926e-05, + "loss": 0.9898, + "num_input_tokens_seen": 447456, + "step": 1330 + }, + { + "epoch": 1.0316846986089645, + "grad_norm": 1.076035737991333, + "learning_rate": 2.5772797527047915e-05, + "loss": 0.9594, + "num_input_tokens_seen": 449184, + "step": 1335 + }, + { + "epoch": 1.035548686244204, + "grad_norm": 0.9154300689697266, + "learning_rate": 2.5869397217928904e-05, + "loss": 0.8621, + "num_input_tokens_seen": 450688, + "step": 1340 + }, + { + "epoch": 1.0394126738794436, + "grad_norm": 0.929857611656189, + "learning_rate": 2.5965996908809893e-05, + "loss": 1.1922, + "num_input_tokens_seen": 452224, + "step": 1345 + }, + { + "epoch": 1.0432766615146831, + "grad_norm": 1.3176270723342896, + "learning_rate": 2.606259659969088e-05, + "loss": 1.0127, + "num_input_tokens_seen": 453664, + "step": 1350 + }, + { + "epoch": 1.0471406491499227, + "grad_norm": 0.9433643817901611, + "learning_rate": 2.615919629057187e-05, + "loss": 1.1003, + "num_input_tokens_seen": 455264, + "step": 1355 + }, + { + "epoch": 1.0510046367851622, + "grad_norm": 0.7905700206756592, + "learning_rate": 2.6255795981452864e-05, + "loss": 0.9443, + "num_input_tokens_seen": 457120, + "step": 1360 + }, + { + "epoch": 1.054868624420402, + "grad_norm": 0.8720052242279053, + "learning_rate": 2.635239567233385e-05, + "loss": 0.9186, + "num_input_tokens_seen": 458816, + "step": 1365 + }, + { + "epoch": 1.0587326120556415, + "grad_norm": 1.454528570175171, + "learning_rate": 2.644899536321484e-05, + "loss": 1.1797, + "num_input_tokens_seen": 460544, + "step": 1370 + }, + { + "epoch": 1.062596599690881, + "grad_norm": 0.8374622464179993, + "learning_rate": 2.654559505409583e-05, + "loss": 0.8181, + "num_input_tokens_seen": 462080, + "step": 1375 + }, + { + "epoch": 1.0664605873261206, + "grad_norm": 1.0908383131027222, + "learning_rate": 2.6642194744976818e-05, + "loss": 1.5218, + "num_input_tokens_seen": 463712, + "step": 1380 + }, + { + "epoch": 1.0703245749613601, + "grad_norm": 1.052261233329773, + "learning_rate": 2.6738794435857807e-05, + "loss": 0.9599, + "num_input_tokens_seen": 465600, + "step": 1385 + }, + { + "epoch": 1.0741885625965997, + "grad_norm": 0.7347571849822998, + "learning_rate": 2.6835394126738793e-05, + "loss": 0.9205, + "num_input_tokens_seen": 467296, + "step": 1390 + }, + { + "epoch": 1.0780525502318392, + "grad_norm": 1.076370358467102, + "learning_rate": 2.6931993817619782e-05, + "loss": 1.0122, + "num_input_tokens_seen": 469088, + "step": 1395 + }, + { + "epoch": 1.0819165378670788, + "grad_norm": 0.9912744760513306, + "learning_rate": 2.702859350850077e-05, + "loss": 0.9191, + "num_input_tokens_seen": 470688, + "step": 1400 + }, + { + "epoch": 1.0857805255023183, + "grad_norm": 0.7536038160324097, + "learning_rate": 2.7125193199381764e-05, + "loss": 0.9392, + "num_input_tokens_seen": 472160, + "step": 1405 + }, + { + "epoch": 1.089644513137558, + "grad_norm": 0.6524299383163452, + "learning_rate": 2.7221792890262753e-05, + "loss": 1.1699, + "num_input_tokens_seen": 474016, + "step": 1410 + }, + { + "epoch": 1.0935085007727976, + "grad_norm": 0.8446447253227234, + "learning_rate": 2.7318392581143742e-05, + "loss": 1.0126, + "num_input_tokens_seen": 475744, + "step": 1415 + }, + { + "epoch": 1.0973724884080371, + "grad_norm": 1.3406308889389038, + "learning_rate": 2.741499227202473e-05, + "loss": 0.9272, + "num_input_tokens_seen": 477248, + "step": 1420 + }, + { + "epoch": 1.1012364760432767, + "grad_norm": 0.8957698345184326, + "learning_rate": 2.751159196290572e-05, + "loss": 1.0436, + "num_input_tokens_seen": 479264, + "step": 1425 + }, + { + "epoch": 1.1051004636785162, + "grad_norm": 0.6078705191612244, + "learning_rate": 2.7608191653786707e-05, + "loss": 0.8956, + "num_input_tokens_seen": 480992, + "step": 1430 + }, + { + "epoch": 1.1089644513137558, + "grad_norm": 0.7974660396575928, + "learning_rate": 2.7704791344667696e-05, + "loss": 0.9303, + "num_input_tokens_seen": 482784, + "step": 1435 + }, + { + "epoch": 1.1128284389489953, + "grad_norm": 0.9706776142120361, + "learning_rate": 2.7801391035548685e-05, + "loss": 1.1394, + "num_input_tokens_seen": 484448, + "step": 1440 + }, + { + "epoch": 1.1166924265842348, + "grad_norm": 0.762624204158783, + "learning_rate": 2.7897990726429678e-05, + "loss": 1.0394, + "num_input_tokens_seen": 486016, + "step": 1445 + }, + { + "epoch": 1.1205564142194744, + "grad_norm": 0.9965543746948242, + "learning_rate": 2.7994590417310667e-05, + "loss": 0.9055, + "num_input_tokens_seen": 487584, + "step": 1450 + }, + { + "epoch": 1.1244204018547141, + "grad_norm": 1.415420413017273, + "learning_rate": 2.8091190108191656e-05, + "loss": 1.1619, + "num_input_tokens_seen": 489440, + "step": 1455 + }, + { + "epoch": 1.1282843894899537, + "grad_norm": 1.0789326429367065, + "learning_rate": 2.8187789799072645e-05, + "loss": 0.8821, + "num_input_tokens_seen": 491008, + "step": 1460 + }, + { + "epoch": 1.1321483771251932, + "grad_norm": 0.6616756319999695, + "learning_rate": 2.8284389489953635e-05, + "loss": 0.8536, + "num_input_tokens_seen": 492864, + "step": 1465 + }, + { + "epoch": 1.1360123647604328, + "grad_norm": 0.6885191798210144, + "learning_rate": 2.838098918083462e-05, + "loss": 0.9157, + "num_input_tokens_seen": 494368, + "step": 1470 + }, + { + "epoch": 1.1398763523956723, + "grad_norm": 1.6894086599349976, + "learning_rate": 2.847758887171561e-05, + "loss": 0.9173, + "num_input_tokens_seen": 495872, + "step": 1475 + }, + { + "epoch": 1.1437403400309119, + "grad_norm": 0.9888666868209839, + "learning_rate": 2.85741885625966e-05, + "loss": 0.8997, + "num_input_tokens_seen": 497472, + "step": 1480 + }, + { + "epoch": 1.1476043276661514, + "grad_norm": 0.7453562617301941, + "learning_rate": 2.867078825347759e-05, + "loss": 1.006, + "num_input_tokens_seen": 498848, + "step": 1485 + }, + { + "epoch": 1.1514683153013912, + "grad_norm": 1.1218395233154297, + "learning_rate": 2.876738794435858e-05, + "loss": 0.8425, + "num_input_tokens_seen": 501120, + "step": 1490 + }, + { + "epoch": 1.1553323029366307, + "grad_norm": 1.9403176307678223, + "learning_rate": 2.886398763523957e-05, + "loss": 1.1247, + "num_input_tokens_seen": 502912, + "step": 1495 + }, + { + "epoch": 1.1591962905718702, + "grad_norm": 0.776176393032074, + "learning_rate": 2.896058732612056e-05, + "loss": 0.8422, + "num_input_tokens_seen": 504672, + "step": 1500 + }, + { + "epoch": 1.1630602782071098, + "grad_norm": 0.9120940566062927, + "learning_rate": 2.905718701700155e-05, + "loss": 0.9891, + "num_input_tokens_seen": 506304, + "step": 1505 + }, + { + "epoch": 1.1669242658423493, + "grad_norm": 1.2828611135482788, + "learning_rate": 2.9153786707882534e-05, + "loss": 0.8399, + "num_input_tokens_seen": 508032, + "step": 1510 + }, + { + "epoch": 1.1707882534775889, + "grad_norm": 1.0802677869796753, + "learning_rate": 2.9250386398763523e-05, + "loss": 1.3458, + "num_input_tokens_seen": 509536, + "step": 1515 + }, + { + "epoch": 1.1746522411128284, + "grad_norm": 0.9446490406990051, + "learning_rate": 2.9346986089644513e-05, + "loss": 0.8942, + "num_input_tokens_seen": 511040, + "step": 1520 + }, + { + "epoch": 1.178516228748068, + "grad_norm": 0.7290072441101074, + "learning_rate": 2.9443585780525502e-05, + "loss": 0.905, + "num_input_tokens_seen": 512704, + "step": 1525 + }, + { + "epoch": 1.1823802163833075, + "grad_norm": 0.7489339709281921, + "learning_rate": 2.9540185471406495e-05, + "loss": 0.8696, + "num_input_tokens_seen": 514272, + "step": 1530 + }, + { + "epoch": 1.1862442040185472, + "grad_norm": 0.964288592338562, + "learning_rate": 2.9636785162287484e-05, + "loss": 0.8311, + "num_input_tokens_seen": 515872, + "step": 1535 + }, + { + "epoch": 1.1901081916537868, + "grad_norm": 0.8200055360794067, + "learning_rate": 2.9733384853168473e-05, + "loss": 0.8782, + "num_input_tokens_seen": 517568, + "step": 1540 + }, + { + "epoch": 1.1939721792890263, + "grad_norm": 0.9377257227897644, + "learning_rate": 2.9829984544049462e-05, + "loss": 0.9411, + "num_input_tokens_seen": 519328, + "step": 1545 + }, + { + "epoch": 1.1978361669242659, + "grad_norm": 0.9676443934440613, + "learning_rate": 2.9926584234930448e-05, + "loss": 0.976, + "num_input_tokens_seen": 520864, + "step": 1550 + }, + { + "epoch": 1.2017001545595054, + "grad_norm": 0.9027989506721497, + "learning_rate": 3.0023183925811437e-05, + "loss": 0.8696, + "num_input_tokens_seen": 522432, + "step": 1555 + }, + { + "epoch": 1.205564142194745, + "grad_norm": 0.7568979263305664, + "learning_rate": 3.0119783616692426e-05, + "loss": 0.8333, + "num_input_tokens_seen": 524192, + "step": 1560 + }, + { + "epoch": 1.2094281298299845, + "grad_norm": 0.56147700548172, + "learning_rate": 3.0216383307573416e-05, + "loss": 0.8984, + "num_input_tokens_seen": 525856, + "step": 1565 + }, + { + "epoch": 1.213292117465224, + "grad_norm": 1.0240583419799805, + "learning_rate": 3.0312982998454408e-05, + "loss": 0.8853, + "num_input_tokens_seen": 527648, + "step": 1570 + }, + { + "epoch": 1.2171561051004636, + "grad_norm": 1.3248554468154907, + "learning_rate": 3.0409582689335397e-05, + "loss": 0.8698, + "num_input_tokens_seen": 529216, + "step": 1575 + }, + { + "epoch": 1.2210200927357033, + "grad_norm": 0.6531286239624023, + "learning_rate": 3.0506182380216387e-05, + "loss": 0.8175, + "num_input_tokens_seen": 531104, + "step": 1580 + }, + { + "epoch": 1.2248840803709429, + "grad_norm": 1.5755037069320679, + "learning_rate": 3.060278207109737e-05, + "loss": 0.9968, + "num_input_tokens_seen": 532736, + "step": 1585 + }, + { + "epoch": 1.2287480680061824, + "grad_norm": 1.6489534378051758, + "learning_rate": 3.0699381761978365e-05, + "loss": 1.0906, + "num_input_tokens_seen": 534432, + "step": 1590 + }, + { + "epoch": 1.232612055641422, + "grad_norm": 1.2310640811920166, + "learning_rate": 3.079598145285935e-05, + "loss": 0.9987, + "num_input_tokens_seen": 535840, + "step": 1595 + }, + { + "epoch": 1.2364760432766615, + "grad_norm": 0.9531770944595337, + "learning_rate": 3.089258114374034e-05, + "loss": 0.8139, + "num_input_tokens_seen": 537280, + "step": 1600 + }, + { + "epoch": 1.240340030911901, + "grad_norm": 0.7372135519981384, + "learning_rate": 3.098918083462133e-05, + "loss": 0.8664, + "num_input_tokens_seen": 538656, + "step": 1605 + }, + { + "epoch": 1.2442040185471406, + "grad_norm": 1.3575127124786377, + "learning_rate": 3.1085780525502315e-05, + "loss": 0.845, + "num_input_tokens_seen": 540384, + "step": 1610 + }, + { + "epoch": 1.2480680061823803, + "grad_norm": 1.2221133708953857, + "learning_rate": 3.118238021638331e-05, + "loss": 0.7824, + "num_input_tokens_seen": 541984, + "step": 1615 + }, + { + "epoch": 1.2519319938176197, + "grad_norm": 0.9198204278945923, + "learning_rate": 3.12789799072643e-05, + "loss": 1.0202, + "num_input_tokens_seen": 543712, + "step": 1620 + }, + { + "epoch": 1.2557959814528594, + "grad_norm": 0.7343609929084778, + "learning_rate": 3.1375579598145286e-05, + "loss": 0.8377, + "num_input_tokens_seen": 545376, + "step": 1625 + }, + { + "epoch": 1.259659969088099, + "grad_norm": 1.8325625658035278, + "learning_rate": 3.147217928902628e-05, + "loss": 0.8812, + "num_input_tokens_seen": 546880, + "step": 1630 + }, + { + "epoch": 1.2635239567233385, + "grad_norm": 0.9037417769432068, + "learning_rate": 3.1568778979907265e-05, + "loss": 0.8096, + "num_input_tokens_seen": 548640, + "step": 1635 + }, + { + "epoch": 1.267387944358578, + "grad_norm": 0.9189191460609436, + "learning_rate": 3.166537867078825e-05, + "loss": 0.9779, + "num_input_tokens_seen": 550368, + "step": 1640 + }, + { + "epoch": 1.2712519319938176, + "grad_norm": 0.8031083345413208, + "learning_rate": 3.176197836166924e-05, + "loss": 1.1747, + "num_input_tokens_seen": 552000, + "step": 1645 + }, + { + "epoch": 1.2751159196290571, + "grad_norm": 0.6790215969085693, + "learning_rate": 3.185857805255023e-05, + "loss": 0.8978, + "num_input_tokens_seen": 553952, + "step": 1650 + }, + { + "epoch": 1.2789799072642967, + "grad_norm": 0.5857919454574585, + "learning_rate": 3.195517774343122e-05, + "loss": 0.7381, + "num_input_tokens_seen": 555616, + "step": 1655 + }, + { + "epoch": 1.2828438948995364, + "grad_norm": 2.29451584815979, + "learning_rate": 3.2051777434312214e-05, + "loss": 1.162, + "num_input_tokens_seen": 557472, + "step": 1660 + }, + { + "epoch": 1.286707882534776, + "grad_norm": 1.0227851867675781, + "learning_rate": 3.21483771251932e-05, + "loss": 0.7998, + "num_input_tokens_seen": 559328, + "step": 1665 + }, + { + "epoch": 1.2905718701700155, + "grad_norm": 0.8425119519233704, + "learning_rate": 3.224497681607419e-05, + "loss": 0.7038, + "num_input_tokens_seen": 560896, + "step": 1670 + }, + { + "epoch": 1.294435857805255, + "grad_norm": 0.7514940500259399, + "learning_rate": 3.234157650695518e-05, + "loss": 0.7266, + "num_input_tokens_seen": 562688, + "step": 1675 + }, + { + "epoch": 1.2982998454404946, + "grad_norm": 0.8277683854103088, + "learning_rate": 3.2438176197836164e-05, + "loss": 0.6971, + "num_input_tokens_seen": 564576, + "step": 1680 + }, + { + "epoch": 1.3021638330757341, + "grad_norm": 0.794466495513916, + "learning_rate": 3.253477588871716e-05, + "loss": 1.0273, + "num_input_tokens_seen": 566144, + "step": 1685 + }, + { + "epoch": 1.3060278207109737, + "grad_norm": 0.7259617447853088, + "learning_rate": 3.263137557959814e-05, + "loss": 1.1412, + "num_input_tokens_seen": 567712, + "step": 1690 + }, + { + "epoch": 1.3098918083462132, + "grad_norm": 1.6643370389938354, + "learning_rate": 3.2727975270479135e-05, + "loss": 1.039, + "num_input_tokens_seen": 569344, + "step": 1695 + }, + { + "epoch": 1.3137557959814528, + "grad_norm": 1.134089469909668, + "learning_rate": 3.282457496136013e-05, + "loss": 0.8104, + "num_input_tokens_seen": 570976, + "step": 1700 + }, + { + "epoch": 1.3176197836166925, + "grad_norm": 0.8344258069992065, + "learning_rate": 3.2921174652241114e-05, + "loss": 0.9187, + "num_input_tokens_seen": 572384, + "step": 1705 + }, + { + "epoch": 1.321483771251932, + "grad_norm": 1.0735094547271729, + "learning_rate": 3.3017774343122106e-05, + "loss": 0.8814, + "num_input_tokens_seen": 574432, + "step": 1710 + }, + { + "epoch": 1.3253477588871716, + "grad_norm": 1.3040337562561035, + "learning_rate": 3.311437403400309e-05, + "loss": 1.0067, + "num_input_tokens_seen": 576160, + "step": 1715 + }, + { + "epoch": 1.3292117465224111, + "grad_norm": 1.2635324001312256, + "learning_rate": 3.321097372488408e-05, + "loss": 0.9739, + "num_input_tokens_seen": 577632, + "step": 1720 + }, + { + "epoch": 1.3330757341576507, + "grad_norm": 0.6480897665023804, + "learning_rate": 3.330757341576507e-05, + "loss": 0.8179, + "num_input_tokens_seen": 579520, + "step": 1725 + }, + { + "epoch": 1.3369397217928902, + "grad_norm": 0.5738648176193237, + "learning_rate": 3.3404173106646057e-05, + "loss": 0.7893, + "num_input_tokens_seen": 581088, + "step": 1730 + }, + { + "epoch": 1.3408037094281298, + "grad_norm": 0.6505524516105652, + "learning_rate": 3.350077279752705e-05, + "loss": 0.8287, + "num_input_tokens_seen": 583200, + "step": 1735 + }, + { + "epoch": 1.3446676970633695, + "grad_norm": 0.7261303067207336, + "learning_rate": 3.359737248840804e-05, + "loss": 0.7571, + "num_input_tokens_seen": 585216, + "step": 1740 + }, + { + "epoch": 1.3485316846986088, + "grad_norm": 0.8606592416763306, + "learning_rate": 3.369397217928903e-05, + "loss": 0.9432, + "num_input_tokens_seen": 586848, + "step": 1745 + }, + { + "epoch": 1.3523956723338486, + "grad_norm": 0.6878827214241028, + "learning_rate": 3.379057187017002e-05, + "loss": 0.794, + "num_input_tokens_seen": 588640, + "step": 1750 + }, + { + "epoch": 1.3562596599690881, + "grad_norm": 0.5911726355552673, + "learning_rate": 3.3887171561051006e-05, + "loss": 0.6874, + "num_input_tokens_seen": 590592, + "step": 1755 + }, + { + "epoch": 1.3601236476043277, + "grad_norm": 0.8146846294403076, + "learning_rate": 3.398377125193199e-05, + "loss": 1.2325, + "num_input_tokens_seen": 592192, + "step": 1760 + }, + { + "epoch": 1.3639876352395672, + "grad_norm": 0.5287665128707886, + "learning_rate": 3.4080370942812984e-05, + "loss": 0.7493, + "num_input_tokens_seen": 594016, + "step": 1765 + }, + { + "epoch": 1.3678516228748068, + "grad_norm": 1.0378830432891846, + "learning_rate": 3.417697063369397e-05, + "loss": 0.9563, + "num_input_tokens_seen": 595968, + "step": 1770 + }, + { + "epoch": 1.3717156105100463, + "grad_norm": 1.5728830099105835, + "learning_rate": 3.427357032457496e-05, + "loss": 1.371, + "num_input_tokens_seen": 597824, + "step": 1775 + }, + { + "epoch": 1.3755795981452859, + "grad_norm": 0.7860745191574097, + "learning_rate": 3.4370170015455955e-05, + "loss": 0.7226, + "num_input_tokens_seen": 599456, + "step": 1780 + }, + { + "epoch": 1.3794435857805256, + "grad_norm": 0.731859564781189, + "learning_rate": 3.446676970633694e-05, + "loss": 0.7808, + "num_input_tokens_seen": 601248, + "step": 1785 + }, + { + "epoch": 1.383307573415765, + "grad_norm": 0.6981267333030701, + "learning_rate": 3.4563369397217934e-05, + "loss": 0.7595, + "num_input_tokens_seen": 602944, + "step": 1790 + }, + { + "epoch": 1.3871715610510047, + "grad_norm": 1.8271600008010864, + "learning_rate": 3.465996908809892e-05, + "loss": 0.8096, + "num_input_tokens_seen": 604448, + "step": 1795 + }, + { + "epoch": 1.3910355486862442, + "grad_norm": 0.7971770167350769, + "learning_rate": 3.4756568778979906e-05, + "loss": 1.0319, + "num_input_tokens_seen": 606400, + "step": 1800 + }, + { + "epoch": 1.3948995363214838, + "grad_norm": 0.7875288724899292, + "learning_rate": 3.48531684698609e-05, + "loss": 0.8676, + "num_input_tokens_seen": 608000, + "step": 1805 + }, + { + "epoch": 1.3987635239567233, + "grad_norm": 0.6518684029579163, + "learning_rate": 3.4949768160741884e-05, + "loss": 0.7362, + "num_input_tokens_seen": 609600, + "step": 1810 + }, + { + "epoch": 1.4026275115919629, + "grad_norm": 0.5009527802467346, + "learning_rate": 3.504636785162288e-05, + "loss": 0.6744, + "num_input_tokens_seen": 611136, + "step": 1815 + }, + { + "epoch": 1.4064914992272024, + "grad_norm": 1.9694048166275024, + "learning_rate": 3.514296754250386e-05, + "loss": 1.0783, + "num_input_tokens_seen": 612832, + "step": 1820 + }, + { + "epoch": 1.410355486862442, + "grad_norm": 0.5805385112762451, + "learning_rate": 3.5239567233384855e-05, + "loss": 0.7124, + "num_input_tokens_seen": 614272, + "step": 1825 + }, + { + "epoch": 1.4142194744976817, + "grad_norm": 1.3927682638168335, + "learning_rate": 3.533616692426585e-05, + "loss": 0.8516, + "num_input_tokens_seen": 616000, + "step": 1830 + }, + { + "epoch": 1.4180834621329212, + "grad_norm": 0.6991333961486816, + "learning_rate": 3.5432766615146834e-05, + "loss": 0.7407, + "num_input_tokens_seen": 617664, + "step": 1835 + }, + { + "epoch": 1.4219474497681608, + "grad_norm": 0.656047523021698, + "learning_rate": 3.552936630602782e-05, + "loss": 0.6608, + "num_input_tokens_seen": 619808, + "step": 1840 + }, + { + "epoch": 1.4258114374034003, + "grad_norm": 1.240949034690857, + "learning_rate": 3.562596599690881e-05, + "loss": 0.7588, + "num_input_tokens_seen": 621408, + "step": 1845 + }, + { + "epoch": 1.4296754250386399, + "grad_norm": 0.7090048789978027, + "learning_rate": 3.57225656877898e-05, + "loss": 0.714, + "num_input_tokens_seen": 623008, + "step": 1850 + }, + { + "epoch": 1.4335394126738794, + "grad_norm": 1.8029797077178955, + "learning_rate": 3.581916537867079e-05, + "loss": 0.8207, + "num_input_tokens_seen": 624608, + "step": 1855 + }, + { + "epoch": 1.437403400309119, + "grad_norm": 0.5100898742675781, + "learning_rate": 3.5915765069551776e-05, + "loss": 0.7533, + "num_input_tokens_seen": 626304, + "step": 1860 + }, + { + "epoch": 1.4412673879443587, + "grad_norm": 1.3387116193771362, + "learning_rate": 3.601236476043277e-05, + "loss": 1.0317, + "num_input_tokens_seen": 628288, + "step": 1865 + }, + { + "epoch": 1.445131375579598, + "grad_norm": 1.1769415140151978, + "learning_rate": 3.610896445131376e-05, + "loss": 0.8312, + "num_input_tokens_seen": 630016, + "step": 1870 + }, + { + "epoch": 1.4489953632148378, + "grad_norm": 0.6714016795158386, + "learning_rate": 3.620556414219475e-05, + "loss": 0.8343, + "num_input_tokens_seen": 631552, + "step": 1875 + }, + { + "epoch": 1.4528593508500773, + "grad_norm": 0.9646749496459961, + "learning_rate": 3.630216383307573e-05, + "loss": 0.6758, + "num_input_tokens_seen": 633216, + "step": 1880 + }, + { + "epoch": 1.4567233384853169, + "grad_norm": 1.0558754205703735, + "learning_rate": 3.6398763523956726e-05, + "loss": 0.7768, + "num_input_tokens_seen": 634624, + "step": 1885 + }, + { + "epoch": 1.4605873261205564, + "grad_norm": 1.128220558166504, + "learning_rate": 3.649536321483771e-05, + "loss": 0.7441, + "num_input_tokens_seen": 636544, + "step": 1890 + }, + { + "epoch": 1.464451313755796, + "grad_norm": 1.9872965812683105, + "learning_rate": 3.6591962905718704e-05, + "loss": 0.869, + "num_input_tokens_seen": 638112, + "step": 1895 + }, + { + "epoch": 1.4683153013910355, + "grad_norm": 1.0502241849899292, + "learning_rate": 3.668856259659969e-05, + "loss": 0.8401, + "num_input_tokens_seen": 639872, + "step": 1900 + }, + { + "epoch": 1.472179289026275, + "grad_norm": 1.2619127035140991, + "learning_rate": 3.678516228748068e-05, + "loss": 1.0022, + "num_input_tokens_seen": 641280, + "step": 1905 + }, + { + "epoch": 1.4760432766615148, + "grad_norm": 1.1419217586517334, + "learning_rate": 3.6881761978361675e-05, + "loss": 0.874, + "num_input_tokens_seen": 642976, + "step": 1910 + }, + { + "epoch": 1.4799072642967541, + "grad_norm": 0.75091952085495, + "learning_rate": 3.697836166924266e-05, + "loss": 0.7839, + "num_input_tokens_seen": 644512, + "step": 1915 + }, + { + "epoch": 1.4837712519319939, + "grad_norm": 0.7873895168304443, + "learning_rate": 3.707496136012365e-05, + "loss": 0.6582, + "num_input_tokens_seen": 645792, + "step": 1920 + }, + { + "epoch": 1.4876352395672334, + "grad_norm": 1.0174239873886108, + "learning_rate": 3.717156105100464e-05, + "loss": 1.0286, + "num_input_tokens_seen": 647456, + "step": 1925 + }, + { + "epoch": 1.491499227202473, + "grad_norm": 0.7032561898231506, + "learning_rate": 3.7268160741885625e-05, + "loss": 0.9471, + "num_input_tokens_seen": 649184, + "step": 1930 + }, + { + "epoch": 1.4953632148377125, + "grad_norm": 2.31429123878479, + "learning_rate": 3.736476043276662e-05, + "loss": 0.9549, + "num_input_tokens_seen": 650880, + "step": 1935 + }, + { + "epoch": 1.499227202472952, + "grad_norm": 1.2286158800125122, + "learning_rate": 3.7461360123647604e-05, + "loss": 0.7629, + "num_input_tokens_seen": 652320, + "step": 1940 + }, + { + "epoch": 1.5030911901081918, + "grad_norm": 0.8777549862861633, + "learning_rate": 3.755795981452859e-05, + "loss": 0.6581, + "num_input_tokens_seen": 654048, + "step": 1945 + }, + { + "epoch": 1.5069551777434311, + "grad_norm": 0.6985889077186584, + "learning_rate": 3.765455950540959e-05, + "loss": 0.7782, + "num_input_tokens_seen": 655616, + "step": 1950 + }, + { + "epoch": 1.510819165378671, + "grad_norm": 0.6610350012779236, + "learning_rate": 3.7751159196290575e-05, + "loss": 0.6026, + "num_input_tokens_seen": 656992, + "step": 1955 + }, + { + "epoch": 1.5146831530139102, + "grad_norm": 0.7594138979911804, + "learning_rate": 3.784775888717156e-05, + "loss": 0.6703, + "num_input_tokens_seen": 658784, + "step": 1960 + }, + { + "epoch": 1.51854714064915, + "grad_norm": 1.1340800523757935, + "learning_rate": 3.794435857805255e-05, + "loss": 0.6998, + "num_input_tokens_seen": 660640, + "step": 1965 + }, + { + "epoch": 1.5224111282843895, + "grad_norm": 0.62305748462677, + "learning_rate": 3.804095826893354e-05, + "loss": 0.7457, + "num_input_tokens_seen": 662496, + "step": 1970 + }, + { + "epoch": 1.526275115919629, + "grad_norm": 0.8767487406730652, + "learning_rate": 3.813755795981453e-05, + "loss": 0.6222, + "num_input_tokens_seen": 664032, + "step": 1975 + }, + { + "epoch": 1.5301391035548686, + "grad_norm": 0.9841179847717285, + "learning_rate": 3.823415765069552e-05, + "loss": 0.6176, + "num_input_tokens_seen": 665664, + "step": 1980 + }, + { + "epoch": 1.5340030911901081, + "grad_norm": 1.1869940757751465, + "learning_rate": 3.83307573415765e-05, + "loss": 0.5718, + "num_input_tokens_seen": 667296, + "step": 1985 + }, + { + "epoch": 1.537867078825348, + "grad_norm": 0.5355455875396729, + "learning_rate": 3.84273570324575e-05, + "loss": 0.6594, + "num_input_tokens_seen": 668768, + "step": 1990 + }, + { + "epoch": 1.5417310664605872, + "grad_norm": 0.5825943946838379, + "learning_rate": 3.852395672333849e-05, + "loss": 0.693, + "num_input_tokens_seen": 670400, + "step": 1995 + }, + { + "epoch": 1.545595054095827, + "grad_norm": 0.750137209892273, + "learning_rate": 3.8620556414219474e-05, + "loss": 0.7012, + "num_input_tokens_seen": 672096, + "step": 2000 + }, + { + "epoch": 1.5494590417310663, + "grad_norm": 0.9335042834281921, + "learning_rate": 3.871715610510047e-05, + "loss": 0.7194, + "num_input_tokens_seen": 673632, + "step": 2005 + }, + { + "epoch": 1.553323029366306, + "grad_norm": 0.469318687915802, + "learning_rate": 3.881375579598145e-05, + "loss": 0.7306, + "num_input_tokens_seen": 675328, + "step": 2010 + }, + { + "epoch": 1.5571870170015456, + "grad_norm": 0.7021000981330872, + "learning_rate": 3.8910355486862445e-05, + "loss": 0.6166, + "num_input_tokens_seen": 676832, + "step": 2015 + }, + { + "epoch": 1.5610510046367851, + "grad_norm": 0.6276777386665344, + "learning_rate": 3.900695517774343e-05, + "loss": 0.6585, + "num_input_tokens_seen": 678560, + "step": 2020 + }, + { + "epoch": 1.5649149922720247, + "grad_norm": 1.0510494709014893, + "learning_rate": 3.910355486862442e-05, + "loss": 0.6907, + "num_input_tokens_seen": 680512, + "step": 2025 + }, + { + "epoch": 1.5687789799072642, + "grad_norm": 1.123730182647705, + "learning_rate": 3.920015455950541e-05, + "loss": 0.5599, + "num_input_tokens_seen": 682304, + "step": 2030 + }, + { + "epoch": 1.572642967542504, + "grad_norm": 0.6475264430046082, + "learning_rate": 3.92967542503864e-05, + "loss": 0.6415, + "num_input_tokens_seen": 683936, + "step": 2035 + }, + { + "epoch": 1.5765069551777433, + "grad_norm": 0.6031140685081482, + "learning_rate": 3.939335394126739e-05, + "loss": 0.9224, + "num_input_tokens_seen": 685952, + "step": 2040 + }, + { + "epoch": 1.580370942812983, + "grad_norm": 1.4751993417739868, + "learning_rate": 3.948995363214838e-05, + "loss": 0.8975, + "num_input_tokens_seen": 687776, + "step": 2045 + }, + { + "epoch": 1.5842349304482226, + "grad_norm": 0.6555319428443909, + "learning_rate": 3.9586553323029367e-05, + "loss": 0.6382, + "num_input_tokens_seen": 689248, + "step": 2050 + }, + { + "epoch": 1.5880989180834622, + "grad_norm": 1.041896939277649, + "learning_rate": 3.968315301391036e-05, + "loss": 0.6223, + "num_input_tokens_seen": 691296, + "step": 2055 + }, + { + "epoch": 1.5919629057187017, + "grad_norm": 1.0585960149765015, + "learning_rate": 3.9779752704791345e-05, + "loss": 0.587, + "num_input_tokens_seen": 692800, + "step": 2060 + }, + { + "epoch": 1.5958268933539412, + "grad_norm": 0.8075536489486694, + "learning_rate": 3.987635239567233e-05, + "loss": 0.6831, + "num_input_tokens_seen": 694624, + "step": 2065 + }, + { + "epoch": 1.599690880989181, + "grad_norm": 0.7926710844039917, + "learning_rate": 3.9972952086553323e-05, + "loss": 0.6457, + "num_input_tokens_seen": 696384, + "step": 2070 + }, + { + "epoch": 1.6035548686244203, + "grad_norm": 1.0036762952804565, + "learning_rate": 4.0069551777434316e-05, + "loss": 0.8157, + "num_input_tokens_seen": 697952, + "step": 2075 + }, + { + "epoch": 1.60741885625966, + "grad_norm": 0.9002162218093872, + "learning_rate": 4.01661514683153e-05, + "loss": 0.576, + "num_input_tokens_seen": 699520, + "step": 2080 + }, + { + "epoch": 1.6112828438948994, + "grad_norm": 0.9870691895484924, + "learning_rate": 4.0262751159196294e-05, + "loss": 0.6883, + "num_input_tokens_seen": 701088, + "step": 2085 + }, + { + "epoch": 1.6151468315301392, + "grad_norm": 0.7843000292778015, + "learning_rate": 4.035935085007728e-05, + "loss": 0.6181, + "num_input_tokens_seen": 702880, + "step": 2090 + }, + { + "epoch": 1.6190108191653787, + "grad_norm": 0.5101130604743958, + "learning_rate": 4.045595054095827e-05, + "loss": 0.6127, + "num_input_tokens_seen": 704544, + "step": 2095 + }, + { + "epoch": 1.6228748068006182, + "grad_norm": 0.6765264868736267, + "learning_rate": 4.055255023183926e-05, + "loss": 0.6367, + "num_input_tokens_seen": 706240, + "step": 2100 + }, + { + "epoch": 1.6267387944358578, + "grad_norm": 0.7340437769889832, + "learning_rate": 4.0649149922720245e-05, + "loss": 0.5887, + "num_input_tokens_seen": 707936, + "step": 2105 + }, + { + "epoch": 1.6306027820710973, + "grad_norm": 0.6624929308891296, + "learning_rate": 4.074574961360124e-05, + "loss": 0.8491, + "num_input_tokens_seen": 709280, + "step": 2110 + }, + { + "epoch": 1.634466769706337, + "grad_norm": 0.6220786571502686, + "learning_rate": 4.084234930448223e-05, + "loss": 0.5848, + "num_input_tokens_seen": 711360, + "step": 2115 + }, + { + "epoch": 1.6383307573415764, + "grad_norm": 0.7684192061424255, + "learning_rate": 4.0938948995363216e-05, + "loss": 0.6742, + "num_input_tokens_seen": 712992, + "step": 2120 + }, + { + "epoch": 1.6421947449768162, + "grad_norm": 0.5980483889579773, + "learning_rate": 4.103554868624421e-05, + "loss": 0.7731, + "num_input_tokens_seen": 714528, + "step": 2125 + }, + { + "epoch": 1.6460587326120555, + "grad_norm": 0.4681437611579895, + "learning_rate": 4.1132148377125194e-05, + "loss": 0.5215, + "num_input_tokens_seen": 716128, + "step": 2130 + }, + { + "epoch": 1.6499227202472952, + "grad_norm": 1.1008296012878418, + "learning_rate": 4.122874806800619e-05, + "loss": 0.9039, + "num_input_tokens_seen": 717920, + "step": 2135 + }, + { + "epoch": 1.6537867078825348, + "grad_norm": 1.6555272340774536, + "learning_rate": 4.132534775888717e-05, + "loss": 0.8357, + "num_input_tokens_seen": 719520, + "step": 2140 + }, + { + "epoch": 1.6576506955177743, + "grad_norm": 0.6261450052261353, + "learning_rate": 4.142194744976816e-05, + "loss": 0.5692, + "num_input_tokens_seen": 721152, + "step": 2145 + }, + { + "epoch": 1.6615146831530139, + "grad_norm": 1.0228190422058105, + "learning_rate": 4.151854714064915e-05, + "loss": 0.7445, + "num_input_tokens_seen": 722848, + "step": 2150 + }, + { + "epoch": 1.6653786707882534, + "grad_norm": 0.6840324997901917, + "learning_rate": 4.161514683153014e-05, + "loss": 0.5596, + "num_input_tokens_seen": 724544, + "step": 2155 + }, + { + "epoch": 1.6692426584234932, + "grad_norm": 0.9689034223556519, + "learning_rate": 4.171174652241113e-05, + "loss": 0.8037, + "num_input_tokens_seen": 726080, + "step": 2160 + }, + { + "epoch": 1.6731066460587325, + "grad_norm": 0.720514714717865, + "learning_rate": 4.180834621329212e-05, + "loss": 0.5986, + "num_input_tokens_seen": 728032, + "step": 2165 + }, + { + "epoch": 1.6769706336939723, + "grad_norm": 0.6230344176292419, + "learning_rate": 4.190494590417311e-05, + "loss": 0.6947, + "num_input_tokens_seen": 729920, + "step": 2170 + }, + { + "epoch": 1.6808346213292118, + "grad_norm": 1.5009418725967407, + "learning_rate": 4.20015455950541e-05, + "loss": 1.0231, + "num_input_tokens_seen": 731680, + "step": 2175 + }, + { + "epoch": 1.6846986089644513, + "grad_norm": 0.9078333377838135, + "learning_rate": 4.2098145285935086e-05, + "loss": 1.0198, + "num_input_tokens_seen": 733248, + "step": 2180 + }, + { + "epoch": 1.6885625965996909, + "grad_norm": 0.4843917787075043, + "learning_rate": 4.219474497681607e-05, + "loss": 0.9743, + "num_input_tokens_seen": 734816, + "step": 2185 + }, + { + "epoch": 1.6924265842349304, + "grad_norm": 0.9047601819038391, + "learning_rate": 4.2291344667697065e-05, + "loss": 0.6422, + "num_input_tokens_seen": 736256, + "step": 2190 + }, + { + "epoch": 1.69629057187017, + "grad_norm": 0.6598555445671082, + "learning_rate": 4.238794435857805e-05, + "loss": 0.6, + "num_input_tokens_seen": 737792, + "step": 2195 + }, + { + "epoch": 1.7001545595054095, + "grad_norm": 0.9328449368476868, + "learning_rate": 4.248454404945904e-05, + "loss": 0.8453, + "num_input_tokens_seen": 739488, + "step": 2200 + }, + { + "epoch": 1.7040185471406493, + "grad_norm": 0.6362072229385376, + "learning_rate": 4.2581143740340036e-05, + "loss": 0.6881, + "num_input_tokens_seen": 740960, + "step": 2205 + }, + { + "epoch": 1.7078825347758886, + "grad_norm": 0.6023277044296265, + "learning_rate": 4.267774343122102e-05, + "loss": 0.7452, + "num_input_tokens_seen": 742848, + "step": 2210 + }, + { + "epoch": 1.7117465224111283, + "grad_norm": 0.7039755582809448, + "learning_rate": 4.2774343122102014e-05, + "loss": 0.7954, + "num_input_tokens_seen": 744480, + "step": 2215 + }, + { + "epoch": 1.7156105100463679, + "grad_norm": 0.6024516820907593, + "learning_rate": 4.2870942812983e-05, + "loss": 0.6584, + "num_input_tokens_seen": 746208, + "step": 2220 + }, + { + "epoch": 1.7194744976816074, + "grad_norm": 0.6322065591812134, + "learning_rate": 4.2967542503863986e-05, + "loss": 0.7213, + "num_input_tokens_seen": 747808, + "step": 2225 + }, + { + "epoch": 1.723338485316847, + "grad_norm": 0.6870982646942139, + "learning_rate": 4.306414219474498e-05, + "loss": 0.5327, + "num_input_tokens_seen": 749568, + "step": 2230 + }, + { + "epoch": 1.7272024729520865, + "grad_norm": 0.6395605206489563, + "learning_rate": 4.3160741885625964e-05, + "loss": 0.5211, + "num_input_tokens_seen": 751264, + "step": 2235 + }, + { + "epoch": 1.7310664605873263, + "grad_norm": 0.7244870066642761, + "learning_rate": 4.325734157650696e-05, + "loss": 0.6987, + "num_input_tokens_seen": 752896, + "step": 2240 + }, + { + "epoch": 1.7349304482225656, + "grad_norm": 0.5896536111831665, + "learning_rate": 4.335394126738795e-05, + "loss": 0.6639, + "num_input_tokens_seen": 754592, + "step": 2245 + }, + { + "epoch": 1.7387944358578054, + "grad_norm": 0.9131588935852051, + "learning_rate": 4.3450540958268935e-05, + "loss": 0.7041, + "num_input_tokens_seen": 756192, + "step": 2250 + }, + { + "epoch": 1.7426584234930447, + "grad_norm": 0.6544005274772644, + "learning_rate": 4.354714064914993e-05, + "loss": 0.621, + "num_input_tokens_seen": 757888, + "step": 2255 + }, + { + "epoch": 1.7465224111282844, + "grad_norm": 1.3426380157470703, + "learning_rate": 4.3643740340030914e-05, + "loss": 0.6954, + "num_input_tokens_seen": 759808, + "step": 2260 + }, + { + "epoch": 1.750386398763524, + "grad_norm": 0.5650304555892944, + "learning_rate": 4.37403400309119e-05, + "loss": 0.6413, + "num_input_tokens_seen": 761440, + "step": 2265 + }, + { + "epoch": 1.7542503863987635, + "grad_norm": 0.846203625202179, + "learning_rate": 4.383693972179289e-05, + "loss": 0.8699, + "num_input_tokens_seen": 763136, + "step": 2270 + }, + { + "epoch": 1.758114374034003, + "grad_norm": 0.46728819608688354, + "learning_rate": 4.393353941267388e-05, + "loss": 0.5083, + "num_input_tokens_seen": 764928, + "step": 2275 + }, + { + "epoch": 1.7619783616692426, + "grad_norm": 0.7498624920845032, + "learning_rate": 4.403013910355487e-05, + "loss": 0.5555, + "num_input_tokens_seen": 766592, + "step": 2280 + }, + { + "epoch": 1.7658423493044824, + "grad_norm": 0.908178985118866, + "learning_rate": 4.412673879443586e-05, + "loss": 0.7076, + "num_input_tokens_seen": 768352, + "step": 2285 + }, + { + "epoch": 1.7697063369397217, + "grad_norm": 0.724461555480957, + "learning_rate": 4.422333848531685e-05, + "loss": 0.6597, + "num_input_tokens_seen": 770240, + "step": 2290 + }, + { + "epoch": 1.7735703245749614, + "grad_norm": 0.7264003753662109, + "learning_rate": 4.431993817619784e-05, + "loss": 0.5552, + "num_input_tokens_seen": 771904, + "step": 2295 + }, + { + "epoch": 1.7774343122102008, + "grad_norm": 0.7689129710197449, + "learning_rate": 4.441653786707883e-05, + "loss": 0.6948, + "num_input_tokens_seen": 773344, + "step": 2300 + }, + { + "epoch": 1.7812982998454405, + "grad_norm": 0.3946194052696228, + "learning_rate": 4.451313755795981e-05, + "loss": 0.5415, + "num_input_tokens_seen": 775040, + "step": 2305 + }, + { + "epoch": 1.78516228748068, + "grad_norm": 0.9400131702423096, + "learning_rate": 4.4609737248840806e-05, + "loss": 0.6877, + "num_input_tokens_seen": 776800, + "step": 2310 + }, + { + "epoch": 1.7890262751159196, + "grad_norm": 0.5155462026596069, + "learning_rate": 4.470633693972179e-05, + "loss": 0.5115, + "num_input_tokens_seen": 778336, + "step": 2315 + }, + { + "epoch": 1.7928902627511591, + "grad_norm": 1.4037015438079834, + "learning_rate": 4.4802936630602784e-05, + "loss": 0.762, + "num_input_tokens_seen": 779904, + "step": 2320 + }, + { + "epoch": 1.7967542503863987, + "grad_norm": 0.6965145468711853, + "learning_rate": 4.489953632148378e-05, + "loss": 0.518, + "num_input_tokens_seen": 781600, + "step": 2325 + }, + { + "epoch": 1.8006182380216385, + "grad_norm": 1.046779751777649, + "learning_rate": 4.499613601236476e-05, + "loss": 0.5863, + "num_input_tokens_seen": 783168, + "step": 2330 + }, + { + "epoch": 1.8044822256568778, + "grad_norm": 0.46384111046791077, + "learning_rate": 4.5092735703245755e-05, + "loss": 0.5546, + "num_input_tokens_seen": 784704, + "step": 2335 + }, + { + "epoch": 1.8083462132921175, + "grad_norm": 0.5563401579856873, + "learning_rate": 4.518933539412674e-05, + "loss": 0.5419, + "num_input_tokens_seen": 786400, + "step": 2340 + }, + { + "epoch": 1.812210200927357, + "grad_norm": 0.9527631998062134, + "learning_rate": 4.528593508500773e-05, + "loss": 0.6787, + "num_input_tokens_seen": 787968, + "step": 2345 + }, + { + "epoch": 1.8160741885625966, + "grad_norm": 0.7986218333244324, + "learning_rate": 4.538253477588872e-05, + "loss": 0.6584, + "num_input_tokens_seen": 789920, + "step": 2350 + }, + { + "epoch": 1.8199381761978362, + "grad_norm": 0.6329674124717712, + "learning_rate": 4.5479134466769706e-05, + "loss": 0.6351, + "num_input_tokens_seen": 791520, + "step": 2355 + }, + { + "epoch": 1.8238021638330757, + "grad_norm": 0.6726483106613159, + "learning_rate": 4.55757341576507e-05, + "loss": 0.5074, + "num_input_tokens_seen": 793216, + "step": 2360 + }, + { + "epoch": 1.8276661514683155, + "grad_norm": 0.6967073678970337, + "learning_rate": 4.5672333848531684e-05, + "loss": 0.7225, + "num_input_tokens_seen": 795040, + "step": 2365 + }, + { + "epoch": 1.8315301391035548, + "grad_norm": 0.659583568572998, + "learning_rate": 4.576893353941268e-05, + "loss": 0.898, + "num_input_tokens_seen": 796608, + "step": 2370 + }, + { + "epoch": 1.8353941267387945, + "grad_norm": 1.0558751821517944, + "learning_rate": 4.586553323029367e-05, + "loss": 0.8349, + "num_input_tokens_seen": 798336, + "step": 2375 + }, + { + "epoch": 1.8392581143740339, + "grad_norm": 0.8489737510681152, + "learning_rate": 4.5962132921174655e-05, + "loss": 0.6322, + "num_input_tokens_seen": 800000, + "step": 2380 + }, + { + "epoch": 1.8431221020092736, + "grad_norm": 0.9816057682037354, + "learning_rate": 4.605873261205564e-05, + "loss": 0.7285, + "num_input_tokens_seen": 801856, + "step": 2385 + }, + { + "epoch": 1.8469860896445132, + "grad_norm": 0.4874454140663147, + "learning_rate": 4.6155332302936633e-05, + "loss": 0.6341, + "num_input_tokens_seen": 803584, + "step": 2390 + }, + { + "epoch": 1.8508500772797527, + "grad_norm": 0.489947110414505, + "learning_rate": 4.625193199381762e-05, + "loss": 0.8318, + "num_input_tokens_seen": 805440, + "step": 2395 + }, + { + "epoch": 1.8547140649149922, + "grad_norm": 0.7590018510818481, + "learning_rate": 4.634853168469861e-05, + "loss": 0.7359, + "num_input_tokens_seen": 807296, + "step": 2400 + }, + { + "epoch": 1.8585780525502318, + "grad_norm": 1.214059829711914, + "learning_rate": 4.64451313755796e-05, + "loss": 0.5976, + "num_input_tokens_seen": 808864, + "step": 2405 + }, + { + "epoch": 1.8624420401854715, + "grad_norm": 0.8193029165267944, + "learning_rate": 4.654173106646059e-05, + "loss": 0.8666, + "num_input_tokens_seen": 810400, + "step": 2410 + }, + { + "epoch": 1.8663060278207109, + "grad_norm": 0.7902681827545166, + "learning_rate": 4.663833075734158e-05, + "loss": 0.6305, + "num_input_tokens_seen": 812096, + "step": 2415 + }, + { + "epoch": 1.8701700154559506, + "grad_norm": 0.9720790982246399, + "learning_rate": 4.673493044822257e-05, + "loss": 0.8353, + "num_input_tokens_seen": 813920, + "step": 2420 + }, + { + "epoch": 1.87403400309119, + "grad_norm": 1.3100781440734863, + "learning_rate": 4.6831530139103555e-05, + "loss": 0.9363, + "num_input_tokens_seen": 815840, + "step": 2425 + }, + { + "epoch": 1.8778979907264297, + "grad_norm": 0.5756559371948242, + "learning_rate": 4.692812982998455e-05, + "loss": 0.5537, + "num_input_tokens_seen": 817600, + "step": 2430 + }, + { + "epoch": 1.8817619783616693, + "grad_norm": 0.5974310040473938, + "learning_rate": 4.702472952086553e-05, + "loss": 0.4818, + "num_input_tokens_seen": 819008, + "step": 2435 + }, + { + "epoch": 1.8856259659969088, + "grad_norm": 0.5591315031051636, + "learning_rate": 4.7121329211746526e-05, + "loss": 0.5018, + "num_input_tokens_seen": 820480, + "step": 2440 + }, + { + "epoch": 1.8894899536321483, + "grad_norm": 0.8397132754325867, + "learning_rate": 4.721792890262751e-05, + "loss": 0.5765, + "num_input_tokens_seen": 822144, + "step": 2445 + }, + { + "epoch": 1.8933539412673879, + "grad_norm": 0.625138521194458, + "learning_rate": 4.73145285935085e-05, + "loss": 0.595, + "num_input_tokens_seen": 823936, + "step": 2450 + }, + { + "epoch": 1.8972179289026276, + "grad_norm": 0.5920037031173706, + "learning_rate": 4.74111282843895e-05, + "loss": 0.598, + "num_input_tokens_seen": 825696, + "step": 2455 + }, + { + "epoch": 1.901081916537867, + "grad_norm": 0.8419007658958435, + "learning_rate": 4.750772797527048e-05, + "loss": 0.5513, + "num_input_tokens_seen": 827424, + "step": 2460 + }, + { + "epoch": 1.9049459041731067, + "grad_norm": 1.2198939323425293, + "learning_rate": 4.760432766615147e-05, + "loss": 0.6997, + "num_input_tokens_seen": 829152, + "step": 2465 + }, + { + "epoch": 1.9088098918083463, + "grad_norm": 0.6312524080276489, + "learning_rate": 4.770092735703246e-05, + "loss": 0.4774, + "num_input_tokens_seen": 830784, + "step": 2470 + }, + { + "epoch": 1.9126738794435858, + "grad_norm": 0.7124226689338684, + "learning_rate": 4.779752704791345e-05, + "loss": 0.5202, + "num_input_tokens_seen": 832224, + "step": 2475 + }, + { + "epoch": 1.9165378670788253, + "grad_norm": 0.6577274799346924, + "learning_rate": 4.789412673879444e-05, + "loss": 0.5338, + "num_input_tokens_seen": 833920, + "step": 2480 + }, + { + "epoch": 1.9204018547140649, + "grad_norm": 2.0061707496643066, + "learning_rate": 4.7990726429675425e-05, + "loss": 0.6653, + "num_input_tokens_seen": 835552, + "step": 2485 + }, + { + "epoch": 1.9242658423493046, + "grad_norm": 0.5817850232124329, + "learning_rate": 4.808732612055641e-05, + "loss": 0.6936, + "num_input_tokens_seen": 837216, + "step": 2490 + }, + { + "epoch": 1.928129829984544, + "grad_norm": 0.8049213886260986, + "learning_rate": 4.818392581143741e-05, + "loss": 0.65, + "num_input_tokens_seen": 838720, + "step": 2495 + }, + { + "epoch": 1.9319938176197837, + "grad_norm": 1.250211477279663, + "learning_rate": 4.8280525502318396e-05, + "loss": 1.0047, + "num_input_tokens_seen": 840480, + "step": 2500 + }, + { + "epoch": 1.935857805255023, + "grad_norm": 1.018958568572998, + "learning_rate": 4.837712519319938e-05, + "loss": 0.7184, + "num_input_tokens_seen": 842144, + "step": 2505 + }, + { + "epoch": 1.9397217928902628, + "grad_norm": 0.4374702274799347, + "learning_rate": 4.8473724884080375e-05, + "loss": 0.5198, + "num_input_tokens_seen": 843648, + "step": 2510 + }, + { + "epoch": 1.9435857805255023, + "grad_norm": 0.70224529504776, + "learning_rate": 4.857032457496136e-05, + "loss": 0.7089, + "num_input_tokens_seen": 845344, + "step": 2515 + }, + { + "epoch": 1.947449768160742, + "grad_norm": 0.7834856510162354, + "learning_rate": 4.866692426584235e-05, + "loss": 0.5349, + "num_input_tokens_seen": 847136, + "step": 2520 + }, + { + "epoch": 1.9513137557959814, + "grad_norm": 0.5413835644721985, + "learning_rate": 4.876352395672334e-05, + "loss": 0.8294, + "num_input_tokens_seen": 848512, + "step": 2525 + }, + { + "epoch": 1.955177743431221, + "grad_norm": 0.9647769331932068, + "learning_rate": 4.8860123647604325e-05, + "loss": 0.6453, + "num_input_tokens_seen": 850304, + "step": 2530 + }, + { + "epoch": 1.9590417310664607, + "grad_norm": 1.002647876739502, + "learning_rate": 4.8956723338485324e-05, + "loss": 0.5986, + "num_input_tokens_seen": 851840, + "step": 2535 + }, + { + "epoch": 1.9629057187017, + "grad_norm": 0.598667323589325, + "learning_rate": 4.905332302936631e-05, + "loss": 0.7546, + "num_input_tokens_seen": 853696, + "step": 2540 + }, + { + "epoch": 1.9667697063369398, + "grad_norm": 0.5221814513206482, + "learning_rate": 4.9149922720247296e-05, + "loss": 0.4835, + "num_input_tokens_seen": 855424, + "step": 2545 + }, + { + "epoch": 1.9706336939721791, + "grad_norm": 0.6796756386756897, + "learning_rate": 4.924652241112829e-05, + "loss": 0.5489, + "num_input_tokens_seen": 857152, + "step": 2550 + }, + { + "epoch": 1.974497681607419, + "grad_norm": 0.4885543882846832, + "learning_rate": 4.9343122102009274e-05, + "loss": 0.5367, + "num_input_tokens_seen": 858976, + "step": 2555 + }, + { + "epoch": 1.9783616692426584, + "grad_norm": 0.7015871405601501, + "learning_rate": 4.943972179289027e-05, + "loss": 0.5073, + "num_input_tokens_seen": 860576, + "step": 2560 + }, + { + "epoch": 1.982225656877898, + "grad_norm": 0.8274369239807129, + "learning_rate": 4.953632148377125e-05, + "loss": 0.6635, + "num_input_tokens_seen": 862368, + "step": 2565 + }, + { + "epoch": 1.9860896445131375, + "grad_norm": 0.632579505443573, + "learning_rate": 4.963292117465224e-05, + "loss": 0.6429, + "num_input_tokens_seen": 864288, + "step": 2570 + }, + { + "epoch": 1.989953632148377, + "grad_norm": 1.7991975545883179, + "learning_rate": 4.972952086553323e-05, + "loss": 0.7924, + "num_input_tokens_seen": 865984, + "step": 2575 + }, + { + "epoch": 1.9938176197836168, + "grad_norm": 1.0472413301467896, + "learning_rate": 4.9826120556414224e-05, + "loss": 0.6769, + "num_input_tokens_seen": 867424, + "step": 2580 + }, + { + "epoch": 1.9976816074188561, + "grad_norm": 0.6398449540138245, + "learning_rate": 4.992272024729521e-05, + "loss": 0.5072, + "num_input_tokens_seen": 869312, + "step": 2585 + }, + { + "epoch": 2.0, + "eval_loss": 0.61175537109375, + "eval_runtime": 6.257, + "eval_samples_per_second": 91.897, + "eval_steps_per_second": 23.014, + "num_input_tokens_seen": 870112, + "step": 2588 + }, + { + "epoch": 2.001545595054096, + "grad_norm": 0.5872021317481995, + "learning_rate": 4.9999999772597e-05, + "loss": 0.5958, + "num_input_tokens_seen": 870784, + "step": 2590 + }, + { + "epoch": 2.0054095826893352, + "grad_norm": 0.6273124814033508, + "learning_rate": 4.9999991813492344e-05, + "loss": 0.4975, + "num_input_tokens_seen": 872320, + "step": 2595 + }, + { + "epoch": 2.009273570324575, + "grad_norm": 0.9390838742256165, + "learning_rate": 4.999997248424169e-05, + "loss": 0.56, + "num_input_tokens_seen": 874208, + "step": 2600 + }, + { + "epoch": 2.0131375579598147, + "grad_norm": 0.8733283281326294, + "learning_rate": 4.9999941784853825e-05, + "loss": 0.7414, + "num_input_tokens_seen": 875712, + "step": 2605 + }, + { + "epoch": 2.017001545595054, + "grad_norm": 0.6319810748100281, + "learning_rate": 4.999989971534272e-05, + "loss": 0.5822, + "num_input_tokens_seen": 877312, + "step": 2610 + }, + { + "epoch": 2.020865533230294, + "grad_norm": 0.7517837882041931, + "learning_rate": 4.9999846275727515e-05, + "loss": 0.694, + "num_input_tokens_seen": 878880, + "step": 2615 + }, + { + "epoch": 2.024729520865533, + "grad_norm": 0.7612168192863464, + "learning_rate": 4.99997814660325e-05, + "loss": 0.7902, + "num_input_tokens_seen": 880704, + "step": 2620 + }, + { + "epoch": 2.028593508500773, + "grad_norm": 0.6238552331924438, + "learning_rate": 4.999970528628716e-05, + "loss": 0.48, + "num_input_tokens_seen": 882400, + "step": 2625 + }, + { + "epoch": 2.0324574961360122, + "grad_norm": 0.7304152250289917, + "learning_rate": 4.999961773652613e-05, + "loss": 0.9398, + "num_input_tokens_seen": 884160, + "step": 2630 + }, + { + "epoch": 2.036321483771252, + "grad_norm": 1.2428723573684692, + "learning_rate": 4.999951881678924e-05, + "loss": 0.6789, + "num_input_tokens_seen": 885760, + "step": 2635 + }, + { + "epoch": 2.0401854714064913, + "grad_norm": 0.6586730480194092, + "learning_rate": 4.9999408527121474e-05, + "loss": 0.6134, + "num_input_tokens_seen": 887392, + "step": 2640 + }, + { + "epoch": 2.044049459041731, + "grad_norm": 0.491776704788208, + "learning_rate": 4.9999286867573004e-05, + "loss": 0.729, + "num_input_tokens_seen": 888992, + "step": 2645 + }, + { + "epoch": 2.047913446676971, + "grad_norm": 0.7134327292442322, + "learning_rate": 4.9999153838199144e-05, + "loss": 0.6026, + "num_input_tokens_seen": 890720, + "step": 2650 + }, + { + "epoch": 2.05177743431221, + "grad_norm": 0.6755256652832031, + "learning_rate": 4.999900943906041e-05, + "loss": 0.4828, + "num_input_tokens_seen": 892032, + "step": 2655 + }, + { + "epoch": 2.05564142194745, + "grad_norm": 0.8085938692092896, + "learning_rate": 4.9998853670222454e-05, + "loss": 0.5773, + "num_input_tokens_seen": 893600, + "step": 2660 + }, + { + "epoch": 2.0595054095826892, + "grad_norm": 1.0310643911361694, + "learning_rate": 4.999868653175616e-05, + "loss": 0.4788, + "num_input_tokens_seen": 895360, + "step": 2665 + }, + { + "epoch": 2.063369397217929, + "grad_norm": 0.9009783864021301, + "learning_rate": 4.99985080237375e-05, + "loss": 0.7232, + "num_input_tokens_seen": 896928, + "step": 2670 + }, + { + "epoch": 2.0672333848531683, + "grad_norm": 1.3625503778457642, + "learning_rate": 4.9998318146247694e-05, + "loss": 0.7546, + "num_input_tokens_seen": 898624, + "step": 2675 + }, + { + "epoch": 2.071097372488408, + "grad_norm": 0.5553984642028809, + "learning_rate": 4.9998116899373073e-05, + "loss": 0.6329, + "num_input_tokens_seen": 900320, + "step": 2680 + }, + { + "epoch": 2.0749613601236474, + "grad_norm": 0.6911149621009827, + "learning_rate": 4.999790428320519e-05, + "loss": 0.7347, + "num_input_tokens_seen": 902144, + "step": 2685 + }, + { + "epoch": 2.078825347758887, + "grad_norm": 0.7244237661361694, + "learning_rate": 4.9997680297840734e-05, + "loss": 0.58, + "num_input_tokens_seen": 904064, + "step": 2690 + }, + { + "epoch": 2.082689335394127, + "grad_norm": 0.6023887991905212, + "learning_rate": 4.9997444943381566e-05, + "loss": 0.5101, + "num_input_tokens_seen": 905952, + "step": 2695 + }, + { + "epoch": 2.0865533230293662, + "grad_norm": 1.1316512823104858, + "learning_rate": 4.999719821993473e-05, + "loss": 0.9466, + "num_input_tokens_seen": 907648, + "step": 2700 + }, + { + "epoch": 2.090417310664606, + "grad_norm": 0.8259800672531128, + "learning_rate": 4.9996940127612444e-05, + "loss": 0.6849, + "num_input_tokens_seen": 909440, + "step": 2705 + }, + { + "epoch": 2.0942812982998453, + "grad_norm": 1.321529507637024, + "learning_rate": 4.9996670666532096e-05, + "loss": 0.8106, + "num_input_tokens_seen": 911104, + "step": 2710 + }, + { + "epoch": 2.098145285935085, + "grad_norm": 0.7501927018165588, + "learning_rate": 4.999638983681622e-05, + "loss": 0.5262, + "num_input_tokens_seen": 912896, + "step": 2715 + }, + { + "epoch": 2.1020092735703244, + "grad_norm": 0.6282104253768921, + "learning_rate": 4.999609763859255e-05, + "loss": 0.7535, + "num_input_tokens_seen": 914368, + "step": 2720 + }, + { + "epoch": 2.105873261205564, + "grad_norm": 1.3721965551376343, + "learning_rate": 4.999579407199398e-05, + "loss": 0.6246, + "num_input_tokens_seen": 916128, + "step": 2725 + }, + { + "epoch": 2.109737248840804, + "grad_norm": 0.476744681596756, + "learning_rate": 4.9995479137158577e-05, + "loss": 0.4902, + "num_input_tokens_seen": 917664, + "step": 2730 + }, + { + "epoch": 2.1136012364760433, + "grad_norm": 0.7200570702552795, + "learning_rate": 4.9995152834229564e-05, + "loss": 0.6671, + "num_input_tokens_seen": 919392, + "step": 2735 + }, + { + "epoch": 2.117465224111283, + "grad_norm": 0.8603159785270691, + "learning_rate": 4.999481516335536e-05, + "loss": 0.5691, + "num_input_tokens_seen": 921184, + "step": 2740 + }, + { + "epoch": 2.1213292117465223, + "grad_norm": 0.5382753014564514, + "learning_rate": 4.999446612468952e-05, + "loss": 0.5521, + "num_input_tokens_seen": 923104, + "step": 2745 + }, + { + "epoch": 2.125193199381762, + "grad_norm": 0.9608195424079895, + "learning_rate": 4.9994105718390804e-05, + "loss": 0.5977, + "num_input_tokens_seen": 924832, + "step": 2750 + }, + { + "epoch": 2.1290571870170014, + "grad_norm": 0.7191005349159241, + "learning_rate": 4.9993733944623136e-05, + "loss": 0.688, + "num_input_tokens_seen": 926592, + "step": 2755 + }, + { + "epoch": 2.132921174652241, + "grad_norm": 0.8546885848045349, + "learning_rate": 4.999335080355557e-05, + "loss": 0.6058, + "num_input_tokens_seen": 928192, + "step": 2760 + }, + { + "epoch": 2.1367851622874805, + "grad_norm": 0.8555856347084045, + "learning_rate": 4.9992956295362395e-05, + "loss": 0.4935, + "num_input_tokens_seen": 929984, + "step": 2765 + }, + { + "epoch": 2.1406491499227203, + "grad_norm": 0.6710054874420166, + "learning_rate": 4.9992550420223e-05, + "loss": 0.5776, + "num_input_tokens_seen": 931520, + "step": 2770 + }, + { + "epoch": 2.1445131375579596, + "grad_norm": 0.8033020496368408, + "learning_rate": 4.999213317832202e-05, + "loss": 0.5412, + "num_input_tokens_seen": 933344, + "step": 2775 + }, + { + "epoch": 2.1483771251931993, + "grad_norm": 0.7143843173980713, + "learning_rate": 4.999170456984918e-05, + "loss": 0.633, + "num_input_tokens_seen": 935040, + "step": 2780 + }, + { + "epoch": 2.152241112828439, + "grad_norm": 0.9021994471549988, + "learning_rate": 4.999126459499945e-05, + "loss": 0.5539, + "num_input_tokens_seen": 936704, + "step": 2785 + }, + { + "epoch": 2.1561051004636784, + "grad_norm": 0.48500338196754456, + "learning_rate": 4.999081325397291e-05, + "loss": 0.7501, + "num_input_tokens_seen": 938560, + "step": 2790 + }, + { + "epoch": 2.159969088098918, + "grad_norm": 0.6963142156600952, + "learning_rate": 4.999035054697483e-05, + "loss": 0.71, + "num_input_tokens_seen": 940160, + "step": 2795 + }, + { + "epoch": 2.1638330757341575, + "grad_norm": 0.5665738582611084, + "learning_rate": 4.9989876474215666e-05, + "loss": 0.7278, + "num_input_tokens_seen": 941760, + "step": 2800 + }, + { + "epoch": 2.1676970633693973, + "grad_norm": 0.6060354709625244, + "learning_rate": 4.998939103591103e-05, + "loss": 0.4882, + "num_input_tokens_seen": 943680, + "step": 2805 + }, + { + "epoch": 2.1715610510046366, + "grad_norm": 0.5473504066467285, + "learning_rate": 4.998889423228168e-05, + "loss": 0.7771, + "num_input_tokens_seen": 945312, + "step": 2810 + }, + { + "epoch": 2.1754250386398764, + "grad_norm": 0.4546893239021301, + "learning_rate": 4.998838606355359e-05, + "loss": 0.4962, + "num_input_tokens_seen": 946976, + "step": 2815 + }, + { + "epoch": 2.179289026275116, + "grad_norm": 0.7495536208152771, + "learning_rate": 4.998786652995787e-05, + "loss": 0.5072, + "num_input_tokens_seen": 948544, + "step": 2820 + }, + { + "epoch": 2.1831530139103554, + "grad_norm": 0.8430508375167847, + "learning_rate": 4.99873356317308e-05, + "loss": 0.8445, + "num_input_tokens_seen": 950112, + "step": 2825 + }, + { + "epoch": 2.187017001545595, + "grad_norm": 0.6894240379333496, + "learning_rate": 4.9986793369113846e-05, + "loss": 0.52, + "num_input_tokens_seen": 951680, + "step": 2830 + }, + { + "epoch": 2.1908809891808345, + "grad_norm": 0.829939067363739, + "learning_rate": 4.9986239742353627e-05, + "loss": 0.5309, + "num_input_tokens_seen": 953280, + "step": 2835 + }, + { + "epoch": 2.1947449768160743, + "grad_norm": 0.5590940713882446, + "learning_rate": 4.998567475170193e-05, + "loss": 0.4697, + "num_input_tokens_seen": 954784, + "step": 2840 + }, + { + "epoch": 2.1986089644513136, + "grad_norm": 0.8005167245864868, + "learning_rate": 4.998509839741573e-05, + "loss": 0.5636, + "num_input_tokens_seen": 956128, + "step": 2845 + }, + { + "epoch": 2.2024729520865534, + "grad_norm": 0.6016167998313904, + "learning_rate": 4.998451067975714e-05, + "loss": 0.4815, + "num_input_tokens_seen": 957536, + "step": 2850 + }, + { + "epoch": 2.206336939721793, + "grad_norm": 0.7607539296150208, + "learning_rate": 4.998391159899348e-05, + "loss": 0.4808, + "num_input_tokens_seen": 959424, + "step": 2855 + }, + { + "epoch": 2.2102009273570324, + "grad_norm": 0.9889962077140808, + "learning_rate": 4.9983301155397195e-05, + "loss": 0.6232, + "num_input_tokens_seen": 961088, + "step": 2860 + }, + { + "epoch": 2.214064914992272, + "grad_norm": 1.0142638683319092, + "learning_rate": 4.998267934924593e-05, + "loss": 0.6547, + "num_input_tokens_seen": 962464, + "step": 2865 + }, + { + "epoch": 2.2179289026275115, + "grad_norm": 1.0872156620025635, + "learning_rate": 4.9982046180822475e-05, + "loss": 0.743, + "num_input_tokens_seen": 964096, + "step": 2870 + }, + { + "epoch": 2.2217928902627513, + "grad_norm": 0.5526584386825562, + "learning_rate": 4.9981401650414806e-05, + "loss": 0.5439, + "num_input_tokens_seen": 965664, + "step": 2875 + }, + { + "epoch": 2.2256568778979906, + "grad_norm": 0.581208348274231, + "learning_rate": 4.998074575831606e-05, + "loss": 0.5096, + "num_input_tokens_seen": 967232, + "step": 2880 + }, + { + "epoch": 2.2295208655332304, + "grad_norm": 1.8638406991958618, + "learning_rate": 4.998007850482454e-05, + "loss": 0.611, + "num_input_tokens_seen": 968928, + "step": 2885 + }, + { + "epoch": 2.2333848531684697, + "grad_norm": 0.774884819984436, + "learning_rate": 4.997939989024372e-05, + "loss": 0.7866, + "num_input_tokens_seen": 970752, + "step": 2890 + }, + { + "epoch": 2.2372488408037094, + "grad_norm": 0.7684866786003113, + "learning_rate": 4.9978709914882225e-05, + "loss": 0.6098, + "num_input_tokens_seen": 972288, + "step": 2895 + }, + { + "epoch": 2.2411128284389488, + "grad_norm": 0.5281456112861633, + "learning_rate": 4.997800857905387e-05, + "loss": 0.5363, + "num_input_tokens_seen": 973920, + "step": 2900 + }, + { + "epoch": 2.2449768160741885, + "grad_norm": 1.035874605178833, + "learning_rate": 4.9977295883077634e-05, + "loss": 0.6322, + "num_input_tokens_seen": 975776, + "step": 2905 + }, + { + "epoch": 2.2488408037094283, + "grad_norm": 0.8213813900947571, + "learning_rate": 4.997657182727764e-05, + "loss": 0.6555, + "num_input_tokens_seen": 977504, + "step": 2910 + }, + { + "epoch": 2.2527047913446676, + "grad_norm": 0.6057813763618469, + "learning_rate": 4.997583641198321e-05, + "loss": 0.5256, + "num_input_tokens_seen": 979072, + "step": 2915 + }, + { + "epoch": 2.2565687789799074, + "grad_norm": 0.8608366847038269, + "learning_rate": 4.997508963752879e-05, + "loss": 0.5075, + "num_input_tokens_seen": 981120, + "step": 2920 + }, + { + "epoch": 2.2604327666151467, + "grad_norm": 0.889830470085144, + "learning_rate": 4.9974331504254047e-05, + "loss": 0.4298, + "num_input_tokens_seen": 982656, + "step": 2925 + }, + { + "epoch": 2.2642967542503865, + "grad_norm": 0.6961929202079773, + "learning_rate": 4.997356201250376e-05, + "loss": 0.6168, + "num_input_tokens_seen": 984448, + "step": 2930 + }, + { + "epoch": 2.2681607418856258, + "grad_norm": 0.7187016010284424, + "learning_rate": 4.997278116262792e-05, + "loss": 0.73, + "num_input_tokens_seen": 986112, + "step": 2935 + }, + { + "epoch": 2.2720247295208655, + "grad_norm": 0.6469072699546814, + "learning_rate": 4.997198895498164e-05, + "loss": 0.4935, + "num_input_tokens_seen": 987648, + "step": 2940 + }, + { + "epoch": 2.2758887171561053, + "grad_norm": 0.48280394077301025, + "learning_rate": 4.997118538992524e-05, + "loss": 0.5306, + "num_input_tokens_seen": 989344, + "step": 2945 + }, + { + "epoch": 2.2797527047913446, + "grad_norm": 1.1507192850112915, + "learning_rate": 4.9970370467824174e-05, + "loss": 0.7862, + "num_input_tokens_seen": 991072, + "step": 2950 + }, + { + "epoch": 2.2836166924265844, + "grad_norm": 0.7736374735832214, + "learning_rate": 4.996954418904908e-05, + "loss": 0.6067, + "num_input_tokens_seen": 992640, + "step": 2955 + }, + { + "epoch": 2.2874806800618237, + "grad_norm": 0.4766157865524292, + "learning_rate": 4.9968706553975754e-05, + "loss": 0.5707, + "num_input_tokens_seen": 994272, + "step": 2960 + }, + { + "epoch": 2.2913446676970635, + "grad_norm": 0.5963830351829529, + "learning_rate": 4.996785756298514e-05, + "loss": 0.6471, + "num_input_tokens_seen": 996128, + "step": 2965 + }, + { + "epoch": 2.295208655332303, + "grad_norm": 0.7155900001525879, + "learning_rate": 4.996699721646339e-05, + "loss": 0.6961, + "num_input_tokens_seen": 997888, + "step": 2970 + }, + { + "epoch": 2.2990726429675425, + "grad_norm": 0.5099900364875793, + "learning_rate": 4.99661255148018e-05, + "loss": 0.6499, + "num_input_tokens_seen": 999360, + "step": 2975 + }, + { + "epoch": 2.3029366306027823, + "grad_norm": 1.2854065895080566, + "learning_rate": 4.996524245839679e-05, + "loss": 0.5526, + "num_input_tokens_seen": 1000928, + "step": 2980 + }, + { + "epoch": 2.3068006182380216, + "grad_norm": 0.3984777629375458, + "learning_rate": 4.9964348047650004e-05, + "loss": 0.594, + "num_input_tokens_seen": 1002464, + "step": 2985 + }, + { + "epoch": 2.3106646058732614, + "grad_norm": 0.942324161529541, + "learning_rate": 4.996344228296822e-05, + "loss": 0.6004, + "num_input_tokens_seen": 1004352, + "step": 2990 + }, + { + "epoch": 2.3145285935085007, + "grad_norm": 0.8272197246551514, + "learning_rate": 4.996252516476339e-05, + "loss": 0.7062, + "num_input_tokens_seen": 1005984, + "step": 2995 + }, + { + "epoch": 2.3183925811437405, + "grad_norm": 0.6900989413261414, + "learning_rate": 4.9961596693452615e-05, + "loss": 0.5208, + "num_input_tokens_seen": 1007552, + "step": 3000 + }, + { + "epoch": 2.32225656877898, + "grad_norm": 1.1386860609054565, + "learning_rate": 4.9960656869458176e-05, + "loss": 0.4625, + "num_input_tokens_seen": 1009120, + "step": 3005 + }, + { + "epoch": 2.3261205564142196, + "grad_norm": 0.9937919974327087, + "learning_rate": 4.995970569320752e-05, + "loss": 0.912, + "num_input_tokens_seen": 1010912, + "step": 3010 + }, + { + "epoch": 2.329984544049459, + "grad_norm": 0.6439319252967834, + "learning_rate": 4.995874316513322e-05, + "loss": 0.5501, + "num_input_tokens_seen": 1012480, + "step": 3015 + }, + { + "epoch": 2.3338485316846986, + "grad_norm": 0.6188773512840271, + "learning_rate": 4.995776928567306e-05, + "loss": 0.4705, + "num_input_tokens_seen": 1013952, + "step": 3020 + }, + { + "epoch": 2.337712519319938, + "grad_norm": 0.8019397258758545, + "learning_rate": 4.995678405526997e-05, + "loss": 0.8688, + "num_input_tokens_seen": 1015680, + "step": 3025 + }, + { + "epoch": 2.3415765069551777, + "grad_norm": 0.5938063859939575, + "learning_rate": 4.995578747437203e-05, + "loss": 0.5407, + "num_input_tokens_seen": 1017408, + "step": 3030 + }, + { + "epoch": 2.3454404945904175, + "grad_norm": 0.764053463935852, + "learning_rate": 4.995477954343249e-05, + "loss": 0.5387, + "num_input_tokens_seen": 1019168, + "step": 3035 + }, + { + "epoch": 2.349304482225657, + "grad_norm": 0.8432363271713257, + "learning_rate": 4.995376026290976e-05, + "loss": 0.5022, + "num_input_tokens_seen": 1020832, + "step": 3040 + }, + { + "epoch": 2.3531684698608966, + "grad_norm": 0.5417863726615906, + "learning_rate": 4.9952729633267425e-05, + "loss": 0.7155, + "num_input_tokens_seen": 1022336, + "step": 3045 + }, + { + "epoch": 2.357032457496136, + "grad_norm": 0.6004450917243958, + "learning_rate": 4.995168765497422e-05, + "loss": 0.5082, + "num_input_tokens_seen": 1023904, + "step": 3050 + }, + { + "epoch": 2.3608964451313756, + "grad_norm": 0.4357922375202179, + "learning_rate": 4.995063432850403e-05, + "loss": 0.6062, + "num_input_tokens_seen": 1025568, + "step": 3055 + }, + { + "epoch": 2.364760432766615, + "grad_norm": 0.7089182138442993, + "learning_rate": 4.9949569654335936e-05, + "loss": 1.0264, + "num_input_tokens_seen": 1027296, + "step": 3060 + }, + { + "epoch": 2.3686244204018547, + "grad_norm": 0.8174355626106262, + "learning_rate": 4.9948493632954144e-05, + "loss": 0.4941, + "num_input_tokens_seen": 1028896, + "step": 3065 + }, + { + "epoch": 2.3724884080370945, + "grad_norm": 1.60548996925354, + "learning_rate": 4.994740626484803e-05, + "loss": 0.5207, + "num_input_tokens_seen": 1030656, + "step": 3070 + }, + { + "epoch": 2.376352395672334, + "grad_norm": 0.6952435970306396, + "learning_rate": 4.994630755051214e-05, + "loss": 0.6018, + "num_input_tokens_seen": 1032128, + "step": 3075 + }, + { + "epoch": 2.3802163833075736, + "grad_norm": 0.5937443375587463, + "learning_rate": 4.9945197490446194e-05, + "loss": 0.6647, + "num_input_tokens_seen": 1033504, + "step": 3080 + }, + { + "epoch": 2.384080370942813, + "grad_norm": 0.4947974681854248, + "learning_rate": 4.9944076085155024e-05, + "loss": 0.5289, + "num_input_tokens_seen": 1035104, + "step": 3085 + }, + { + "epoch": 2.3879443585780527, + "grad_norm": 0.503840982913971, + "learning_rate": 4.9942943335148674e-05, + "loss": 0.4536, + "num_input_tokens_seen": 1036736, + "step": 3090 + }, + { + "epoch": 2.391808346213292, + "grad_norm": 0.41519609093666077, + "learning_rate": 4.994179924094231e-05, + "loss": 0.4534, + "num_input_tokens_seen": 1038368, + "step": 3095 + }, + { + "epoch": 2.3956723338485317, + "grad_norm": 0.9600173234939575, + "learning_rate": 4.994064380305629e-05, + "loss": 0.6686, + "num_input_tokens_seen": 1039808, + "step": 3100 + }, + { + "epoch": 2.3995363214837715, + "grad_norm": 0.753653347492218, + "learning_rate": 4.99394770220161e-05, + "loss": 0.7933, + "num_input_tokens_seen": 1041440, + "step": 3105 + }, + { + "epoch": 2.403400309119011, + "grad_norm": 1.2706314325332642, + "learning_rate": 4.99382988983524e-05, + "loss": 0.4731, + "num_input_tokens_seen": 1042912, + "step": 3110 + }, + { + "epoch": 2.4072642967542506, + "grad_norm": 0.7539973258972168, + "learning_rate": 4.993710943260102e-05, + "loss": 0.4803, + "num_input_tokens_seen": 1044832, + "step": 3115 + }, + { + "epoch": 2.41112828438949, + "grad_norm": 0.6897740960121155, + "learning_rate": 4.993590862530292e-05, + "loss": 0.7461, + "num_input_tokens_seen": 1046496, + "step": 3120 + }, + { + "epoch": 2.4149922720247297, + "grad_norm": 0.5748365521430969, + "learning_rate": 4.993469647700425e-05, + "loss": 0.386, + "num_input_tokens_seen": 1048096, + "step": 3125 + }, + { + "epoch": 2.418856259659969, + "grad_norm": 0.7032409310340881, + "learning_rate": 4.993347298825629e-05, + "loss": 0.7707, + "num_input_tokens_seen": 1049888, + "step": 3130 + }, + { + "epoch": 2.4227202472952087, + "grad_norm": 0.805913507938385, + "learning_rate": 4.993223815961549e-05, + "loss": 0.7291, + "num_input_tokens_seen": 1051392, + "step": 3135 + }, + { + "epoch": 2.426584234930448, + "grad_norm": 0.7348105311393738, + "learning_rate": 4.993099199164347e-05, + "loss": 0.4463, + "num_input_tokens_seen": 1052768, + "step": 3140 + }, + { + "epoch": 2.430448222565688, + "grad_norm": 1.2092549800872803, + "learning_rate": 4.992973448490698e-05, + "loss": 0.8441, + "num_input_tokens_seen": 1054528, + "step": 3145 + }, + { + "epoch": 2.434312210200927, + "grad_norm": 0.8649962544441223, + "learning_rate": 4.992846563997795e-05, + "loss": 0.6313, + "num_input_tokens_seen": 1056224, + "step": 3150 + }, + { + "epoch": 2.438176197836167, + "grad_norm": 0.6945915818214417, + "learning_rate": 4.992718545743346e-05, + "loss": 0.539, + "num_input_tokens_seen": 1057728, + "step": 3155 + }, + { + "epoch": 2.4420401854714067, + "grad_norm": 1.2351760864257812, + "learning_rate": 4.9925893937855726e-05, + "loss": 0.5803, + "num_input_tokens_seen": 1059360, + "step": 3160 + }, + { + "epoch": 2.445904173106646, + "grad_norm": 0.4700799882411957, + "learning_rate": 4.992459108183217e-05, + "loss": 0.4731, + "num_input_tokens_seen": 1061216, + "step": 3165 + }, + { + "epoch": 2.4497681607418857, + "grad_norm": 0.9827234745025635, + "learning_rate": 4.9923276889955317e-05, + "loss": 0.8128, + "num_input_tokens_seen": 1062656, + "step": 3170 + }, + { + "epoch": 2.453632148377125, + "grad_norm": 0.8874033093452454, + "learning_rate": 4.992195136282287e-05, + "loss": 0.4273, + "num_input_tokens_seen": 1064256, + "step": 3175 + }, + { + "epoch": 2.457496136012365, + "grad_norm": 0.7011842727661133, + "learning_rate": 4.99206145010377e-05, + "loss": 0.6537, + "num_input_tokens_seen": 1066016, + "step": 3180 + }, + { + "epoch": 2.461360123647604, + "grad_norm": 1.0611852407455444, + "learning_rate": 4.9919266305207806e-05, + "loss": 0.4558, + "num_input_tokens_seen": 1067328, + "step": 3185 + }, + { + "epoch": 2.465224111282844, + "grad_norm": 0.5809113383293152, + "learning_rate": 4.9917906775946366e-05, + "loss": 0.5496, + "num_input_tokens_seen": 1069152, + "step": 3190 + }, + { + "epoch": 2.4690880989180837, + "grad_norm": 0.5682278871536255, + "learning_rate": 4.9916535913871685e-05, + "loss": 0.5413, + "num_input_tokens_seen": 1070752, + "step": 3195 + }, + { + "epoch": 2.472952086553323, + "grad_norm": 0.7924808263778687, + "learning_rate": 4.9915153719607266e-05, + "loss": 0.6313, + "num_input_tokens_seen": 1072512, + "step": 3200 + }, + { + "epoch": 2.4768160741885628, + "grad_norm": 0.9254430532455444, + "learning_rate": 4.991376019378172e-05, + "loss": 0.5873, + "num_input_tokens_seen": 1074048, + "step": 3205 + }, + { + "epoch": 2.480680061823802, + "grad_norm": 0.8778023719787598, + "learning_rate": 4.991235533702883e-05, + "loss": 0.7182, + "num_input_tokens_seen": 1075616, + "step": 3210 + }, + { + "epoch": 2.484544049459042, + "grad_norm": 0.8999536037445068, + "learning_rate": 4.991093914998754e-05, + "loss": 0.5438, + "num_input_tokens_seen": 1077088, + "step": 3215 + }, + { + "epoch": 2.488408037094281, + "grad_norm": 1.107157588005066, + "learning_rate": 4.990951163330194e-05, + "loss": 0.6349, + "num_input_tokens_seen": 1078784, + "step": 3220 + }, + { + "epoch": 2.492272024729521, + "grad_norm": 0.8261156678199768, + "learning_rate": 4.990807278762127e-05, + "loss": 0.5067, + "num_input_tokens_seen": 1080288, + "step": 3225 + }, + { + "epoch": 2.4961360123647607, + "grad_norm": 1.0929076671600342, + "learning_rate": 4.990662261359993e-05, + "loss": 0.7751, + "num_input_tokens_seen": 1081984, + "step": 3230 + }, + { + "epoch": 2.5, + "grad_norm": 0.948244035243988, + "learning_rate": 4.990516111189747e-05, + "loss": 0.4887, + "num_input_tokens_seen": 1083616, + "step": 3235 + }, + { + "epoch": 2.5038639876352393, + "grad_norm": 0.9620707035064697, + "learning_rate": 4.990368828317857e-05, + "loss": 0.7591, + "num_input_tokens_seen": 1085056, + "step": 3240 + }, + { + "epoch": 2.507727975270479, + "grad_norm": 0.9530314803123474, + "learning_rate": 4.990220412811311e-05, + "loss": 0.4344, + "num_input_tokens_seen": 1086624, + "step": 3245 + }, + { + "epoch": 2.511591962905719, + "grad_norm": 0.7230897545814514, + "learning_rate": 4.990070864737608e-05, + "loss": 0.5694, + "num_input_tokens_seen": 1088480, + "step": 3250 + }, + { + "epoch": 2.515455950540958, + "grad_norm": 0.7740578651428223, + "learning_rate": 4.989920184164763e-05, + "loss": 0.6602, + "num_input_tokens_seen": 1090208, + "step": 3255 + }, + { + "epoch": 2.519319938176198, + "grad_norm": 0.6022875905036926, + "learning_rate": 4.989768371161306e-05, + "loss": 0.8741, + "num_input_tokens_seen": 1091904, + "step": 3260 + }, + { + "epoch": 2.5231839258114372, + "grad_norm": 1.0473324060440063, + "learning_rate": 4.989615425796283e-05, + "loss": 0.6522, + "num_input_tokens_seen": 1093536, + "step": 3265 + }, + { + "epoch": 2.527047913446677, + "grad_norm": 0.736672580242157, + "learning_rate": 4.989461348139256e-05, + "loss": 0.6856, + "num_input_tokens_seen": 1095328, + "step": 3270 + }, + { + "epoch": 2.5309119010819163, + "grad_norm": 0.6277773380279541, + "learning_rate": 4.9893061382602985e-05, + "loss": 0.5194, + "num_input_tokens_seen": 1097280, + "step": 3275 + }, + { + "epoch": 2.534775888717156, + "grad_norm": 0.6538275480270386, + "learning_rate": 4.9891497962300017e-05, + "loss": 0.6996, + "num_input_tokens_seen": 1099008, + "step": 3280 + }, + { + "epoch": 2.538639876352396, + "grad_norm": 0.6664966344833374, + "learning_rate": 4.98899232211947e-05, + "loss": 0.5471, + "num_input_tokens_seen": 1100832, + "step": 3285 + }, + { + "epoch": 2.542503863987635, + "grad_norm": 0.8980090618133545, + "learning_rate": 4.988833716000324e-05, + "loss": 0.472, + "num_input_tokens_seen": 1102528, + "step": 3290 + }, + { + "epoch": 2.546367851622875, + "grad_norm": 0.703944981098175, + "learning_rate": 4.9886739779447e-05, + "loss": 0.4711, + "num_input_tokens_seen": 1103968, + "step": 3295 + }, + { + "epoch": 2.5502318392581143, + "grad_norm": 0.4714563190937042, + "learning_rate": 4.9885131080252454e-05, + "loss": 0.5026, + "num_input_tokens_seen": 1105472, + "step": 3300 + }, + { + "epoch": 2.554095826893354, + "grad_norm": 1.4230692386627197, + "learning_rate": 4.9883511063151274e-05, + "loss": 0.7676, + "num_input_tokens_seen": 1107104, + "step": 3305 + }, + { + "epoch": 2.5579598145285933, + "grad_norm": 0.7487415075302124, + "learning_rate": 4.988187972888023e-05, + "loss": 0.5811, + "num_input_tokens_seen": 1108672, + "step": 3310 + }, + { + "epoch": 2.561823802163833, + "grad_norm": 0.6060789823532104, + "learning_rate": 4.988023707818129e-05, + "loss": 0.5203, + "num_input_tokens_seen": 1110432, + "step": 3315 + }, + { + "epoch": 2.565687789799073, + "grad_norm": 0.6036278605461121, + "learning_rate": 4.9878583111801506e-05, + "loss": 0.4597, + "num_input_tokens_seen": 1112032, + "step": 3320 + }, + { + "epoch": 2.569551777434312, + "grad_norm": 0.7548378705978394, + "learning_rate": 4.987691783049314e-05, + "loss": 0.4952, + "num_input_tokens_seen": 1113600, + "step": 3325 + }, + { + "epoch": 2.573415765069552, + "grad_norm": 0.6918869614601135, + "learning_rate": 4.9875241235013566e-05, + "loss": 0.4705, + "num_input_tokens_seen": 1115456, + "step": 3330 + }, + { + "epoch": 2.5772797527047913, + "grad_norm": 0.569870114326477, + "learning_rate": 4.98735533261253e-05, + "loss": 0.4764, + "num_input_tokens_seen": 1117408, + "step": 3335 + }, + { + "epoch": 2.581143740340031, + "grad_norm": 1.6201627254486084, + "learning_rate": 4.987185410459602e-05, + "loss": 0.8497, + "num_input_tokens_seen": 1119264, + "step": 3340 + }, + { + "epoch": 2.5850077279752703, + "grad_norm": 0.5430865287780762, + "learning_rate": 4.9870143571198545e-05, + "loss": 0.6704, + "num_input_tokens_seen": 1121056, + "step": 3345 + }, + { + "epoch": 2.58887171561051, + "grad_norm": 1.0482897758483887, + "learning_rate": 4.986842172671083e-05, + "loss": 0.5865, + "num_input_tokens_seen": 1122848, + "step": 3350 + }, + { + "epoch": 2.59273570324575, + "grad_norm": 0.7275490760803223, + "learning_rate": 4.9866688571915984e-05, + "loss": 0.5537, + "num_input_tokens_seen": 1124448, + "step": 3355 + }, + { + "epoch": 2.596599690880989, + "grad_norm": 1.354857087135315, + "learning_rate": 4.986494410760225e-05, + "loss": 0.4684, + "num_input_tokens_seen": 1125824, + "step": 3360 + }, + { + "epoch": 2.6004636785162285, + "grad_norm": 0.49000343680381775, + "learning_rate": 4.986318833456303e-05, + "loss": 0.6479, + "num_input_tokens_seen": 1127488, + "step": 3365 + }, + { + "epoch": 2.6043276661514683, + "grad_norm": 0.6173970699310303, + "learning_rate": 4.9861421253596854e-05, + "loss": 0.9323, + "num_input_tokens_seen": 1129248, + "step": 3370 + }, + { + "epoch": 2.608191653786708, + "grad_norm": 0.8981128334999084, + "learning_rate": 4.985964286550741e-05, + "loss": 0.556, + "num_input_tokens_seen": 1130944, + "step": 3375 + }, + { + "epoch": 2.6120556414219473, + "grad_norm": 0.7053803205490112, + "learning_rate": 4.98578531711035e-05, + "loss": 0.4881, + "num_input_tokens_seen": 1132672, + "step": 3380 + }, + { + "epoch": 2.615919629057187, + "grad_norm": 0.4407755136489868, + "learning_rate": 4.985605217119911e-05, + "loss": 0.4705, + "num_input_tokens_seen": 1134208, + "step": 3385 + }, + { + "epoch": 2.6197836166924264, + "grad_norm": 0.5171101093292236, + "learning_rate": 4.985423986661333e-05, + "loss": 0.4539, + "num_input_tokens_seen": 1136128, + "step": 3390 + }, + { + "epoch": 2.623647604327666, + "grad_norm": 0.6036444902420044, + "learning_rate": 4.985241625817041e-05, + "loss": 0.7747, + "num_input_tokens_seen": 1138048, + "step": 3395 + }, + { + "epoch": 2.6275115919629055, + "grad_norm": 0.513672947883606, + "learning_rate": 4.985058134669975e-05, + "loss": 0.6689, + "num_input_tokens_seen": 1139776, + "step": 3400 + }, + { + "epoch": 2.6313755795981453, + "grad_norm": 0.6637961864471436, + "learning_rate": 4.984873513303586e-05, + "loss": 0.6595, + "num_input_tokens_seen": 1141440, + "step": 3405 + }, + { + "epoch": 2.635239567233385, + "grad_norm": 0.6176183819770813, + "learning_rate": 4.984687761801842e-05, + "loss": 0.5305, + "num_input_tokens_seen": 1143072, + "step": 3410 + }, + { + "epoch": 2.6391035548686244, + "grad_norm": 0.7208863496780396, + "learning_rate": 4.9845008802492245e-05, + "loss": 0.4168, + "num_input_tokens_seen": 1144704, + "step": 3415 + }, + { + "epoch": 2.642967542503864, + "grad_norm": 0.9012816548347473, + "learning_rate": 4.984312868730727e-05, + "loss": 0.8528, + "num_input_tokens_seen": 1146560, + "step": 3420 + }, + { + "epoch": 2.6468315301391034, + "grad_norm": 0.4904487133026123, + "learning_rate": 4.984123727331859e-05, + "loss": 0.4344, + "num_input_tokens_seen": 1148160, + "step": 3425 + }, + { + "epoch": 2.650695517774343, + "grad_norm": 0.7047887444496155, + "learning_rate": 4.983933456138642e-05, + "loss": 0.4703, + "num_input_tokens_seen": 1149792, + "step": 3430 + }, + { + "epoch": 2.6545595054095825, + "grad_norm": 0.582136869430542, + "learning_rate": 4.9837420552376144e-05, + "loss": 0.5547, + "num_input_tokens_seen": 1151488, + "step": 3435 + }, + { + "epoch": 2.6584234930448223, + "grad_norm": 0.8566999435424805, + "learning_rate": 4.983549524715825e-05, + "loss": 0.6812, + "num_input_tokens_seen": 1153280, + "step": 3440 + }, + { + "epoch": 2.662287480680062, + "grad_norm": 0.7863365411758423, + "learning_rate": 4.983355864660839e-05, + "loss": 0.5784, + "num_input_tokens_seen": 1155104, + "step": 3445 + }, + { + "epoch": 2.6661514683153014, + "grad_norm": 0.6080079674720764, + "learning_rate": 4.983161075160733e-05, + "loss": 0.7269, + "num_input_tokens_seen": 1156928, + "step": 3450 + }, + { + "epoch": 2.6700154559505407, + "grad_norm": 0.7496310472488403, + "learning_rate": 4.982965156304099e-05, + "loss": 0.5926, + "num_input_tokens_seen": 1158560, + "step": 3455 + }, + { + "epoch": 2.6738794435857804, + "grad_norm": 1.6163119077682495, + "learning_rate": 4.9827681081800423e-05, + "loss": 0.69, + "num_input_tokens_seen": 1160224, + "step": 3460 + }, + { + "epoch": 2.67774343122102, + "grad_norm": 1.4075508117675781, + "learning_rate": 4.982569930878181e-05, + "loss": 0.6658, + "num_input_tokens_seen": 1161824, + "step": 3465 + }, + { + "epoch": 2.6816074188562595, + "grad_norm": 0.6033924221992493, + "learning_rate": 4.982370624488648e-05, + "loss": 0.5296, + "num_input_tokens_seen": 1163520, + "step": 3470 + }, + { + "epoch": 2.6854714064914993, + "grad_norm": 0.8615813255310059, + "learning_rate": 4.9821701891020887e-05, + "loss": 0.595, + "num_input_tokens_seen": 1165280, + "step": 3475 + }, + { + "epoch": 2.689335394126739, + "grad_norm": 0.48313701152801514, + "learning_rate": 4.981968624809662e-05, + "loss": 0.4806, + "num_input_tokens_seen": 1167168, + "step": 3480 + }, + { + "epoch": 2.6931993817619784, + "grad_norm": 0.41629400849342346, + "learning_rate": 4.981765931703041e-05, + "loss": 0.828, + "num_input_tokens_seen": 1168768, + "step": 3485 + }, + { + "epoch": 2.6970633693972177, + "grad_norm": 1.0048731565475464, + "learning_rate": 4.9815621098744115e-05, + "loss": 0.7113, + "num_input_tokens_seen": 1170688, + "step": 3490 + }, + { + "epoch": 2.7009273570324575, + "grad_norm": 0.5707192420959473, + "learning_rate": 4.9813571594164726e-05, + "loss": 0.5938, + "num_input_tokens_seen": 1172480, + "step": 3495 + }, + { + "epoch": 2.704791344667697, + "grad_norm": 0.575785756111145, + "learning_rate": 4.981151080422437e-05, + "loss": 0.5226, + "num_input_tokens_seen": 1174304, + "step": 3500 + }, + { + "epoch": 2.7086553323029365, + "grad_norm": 0.9024950265884399, + "learning_rate": 4.980943872986033e-05, + "loss": 0.4805, + "num_input_tokens_seen": 1175840, + "step": 3505 + }, + { + "epoch": 2.7125193199381763, + "grad_norm": 0.6177178025245667, + "learning_rate": 4.980735537201495e-05, + "loss": 0.678, + "num_input_tokens_seen": 1177312, + "step": 3510 + }, + { + "epoch": 2.7163833075734156, + "grad_norm": 1.1158596277236938, + "learning_rate": 4.9805260731635794e-05, + "loss": 0.5806, + "num_input_tokens_seen": 1178944, + "step": 3515 + }, + { + "epoch": 2.7202472952086554, + "grad_norm": 1.0008052587509155, + "learning_rate": 4.980315480967551e-05, + "loss": 0.6121, + "num_input_tokens_seen": 1180768, + "step": 3520 + }, + { + "epoch": 2.7241112828438947, + "grad_norm": 0.6258446574211121, + "learning_rate": 4.980103760709187e-05, + "loss": 0.4913, + "num_input_tokens_seen": 1182272, + "step": 3525 + }, + { + "epoch": 2.7279752704791345, + "grad_norm": 0.7314649224281311, + "learning_rate": 4.9798909124847804e-05, + "loss": 0.4913, + "num_input_tokens_seen": 1184256, + "step": 3530 + }, + { + "epoch": 2.7318392581143742, + "grad_norm": 0.58454430103302, + "learning_rate": 4.979676936391135e-05, + "loss": 0.4956, + "num_input_tokens_seen": 1185856, + "step": 3535 + }, + { + "epoch": 2.7357032457496135, + "grad_norm": 0.6129636764526367, + "learning_rate": 4.979461832525569e-05, + "loss": 0.5726, + "num_input_tokens_seen": 1187936, + "step": 3540 + }, + { + "epoch": 2.7395672333848533, + "grad_norm": 0.5418773293495178, + "learning_rate": 4.9792456009859126e-05, + "loss": 0.5228, + "num_input_tokens_seen": 1189632, + "step": 3545 + }, + { + "epoch": 2.7434312210200926, + "grad_norm": 1.0541815757751465, + "learning_rate": 4.979028241870509e-05, + "loss": 0.7103, + "num_input_tokens_seen": 1191168, + "step": 3550 + }, + { + "epoch": 2.7472952086553324, + "grad_norm": 1.0986506938934326, + "learning_rate": 4.978809755278215e-05, + "loss": 0.5688, + "num_input_tokens_seen": 1192800, + "step": 3555 + }, + { + "epoch": 2.7511591962905717, + "grad_norm": 0.710797905921936, + "learning_rate": 4.978590141308399e-05, + "loss": 0.5105, + "num_input_tokens_seen": 1194656, + "step": 3560 + }, + { + "epoch": 2.7550231839258115, + "grad_norm": 0.6524711847305298, + "learning_rate": 4.978369400060943e-05, + "loss": 0.6534, + "num_input_tokens_seen": 1196544, + "step": 3565 + }, + { + "epoch": 2.7588871715610512, + "grad_norm": 1.7446621656417847, + "learning_rate": 4.978147531636241e-05, + "loss": 0.5965, + "num_input_tokens_seen": 1198176, + "step": 3570 + }, + { + "epoch": 2.7627511591962906, + "grad_norm": 1.1870133876800537, + "learning_rate": 4.977924536135202e-05, + "loss": 0.4673, + "num_input_tokens_seen": 1199904, + "step": 3575 + }, + { + "epoch": 2.76661514683153, + "grad_norm": 0.5083221793174744, + "learning_rate": 4.977700413659243e-05, + "loss": 0.4984, + "num_input_tokens_seen": 1201280, + "step": 3580 + }, + { + "epoch": 2.7704791344667696, + "grad_norm": 0.708891749382019, + "learning_rate": 4.977475164310298e-05, + "loss": 0.5916, + "num_input_tokens_seen": 1202784, + "step": 3585 + }, + { + "epoch": 2.7743431221020094, + "grad_norm": 0.6648864150047302, + "learning_rate": 4.9772487881908115e-05, + "loss": 0.5632, + "num_input_tokens_seen": 1204224, + "step": 3590 + }, + { + "epoch": 2.7782071097372487, + "grad_norm": 0.5736276507377625, + "learning_rate": 4.97702128540374e-05, + "loss": 0.5059, + "num_input_tokens_seen": 1205824, + "step": 3595 + }, + { + "epoch": 2.7820710973724885, + "grad_norm": 0.6951351761817932, + "learning_rate": 4.9767926560525536e-05, + "loss": 0.4632, + "num_input_tokens_seen": 1207648, + "step": 3600 + }, + { + "epoch": 2.7859350850077282, + "grad_norm": 0.560763418674469, + "learning_rate": 4.9765629002412346e-05, + "loss": 0.3997, + "num_input_tokens_seen": 1209344, + "step": 3605 + }, + { + "epoch": 2.7897990726429676, + "grad_norm": 1.414608120918274, + "learning_rate": 4.976332018074277e-05, + "loss": 0.6072, + "num_input_tokens_seen": 1211232, + "step": 3610 + }, + { + "epoch": 2.793663060278207, + "grad_norm": 0.7670677900314331, + "learning_rate": 4.976100009656687e-05, + "loss": 0.4592, + "num_input_tokens_seen": 1212768, + "step": 3615 + }, + { + "epoch": 2.7975270479134466, + "grad_norm": 0.5580670833587646, + "learning_rate": 4.975866875093984e-05, + "loss": 0.6771, + "num_input_tokens_seen": 1214432, + "step": 3620 + }, + { + "epoch": 2.8013910355486864, + "grad_norm": 0.8309062719345093, + "learning_rate": 4.975632614492199e-05, + "loss": 0.7496, + "num_input_tokens_seen": 1216256, + "step": 3625 + }, + { + "epoch": 2.8052550231839257, + "grad_norm": 1.0229021310806274, + "learning_rate": 4.975397227957875e-05, + "loss": 0.6686, + "num_input_tokens_seen": 1218112, + "step": 3630 + }, + { + "epoch": 2.8091190108191655, + "grad_norm": 1.0659948587417603, + "learning_rate": 4.9751607155980676e-05, + "loss": 0.6601, + "num_input_tokens_seen": 1219712, + "step": 3635 + }, + { + "epoch": 2.812982998454405, + "grad_norm": 0.771364152431488, + "learning_rate": 4.9749230775203425e-05, + "loss": 0.6338, + "num_input_tokens_seen": 1221408, + "step": 3640 + }, + { + "epoch": 2.8168469860896446, + "grad_norm": 0.5099998712539673, + "learning_rate": 4.9746843138327806e-05, + "loss": 0.6108, + "num_input_tokens_seen": 1223200, + "step": 3645 + }, + { + "epoch": 2.820710973724884, + "grad_norm": 1.1135658025741577, + "learning_rate": 4.974444424643973e-05, + "loss": 0.5762, + "num_input_tokens_seen": 1224832, + "step": 3650 + }, + { + "epoch": 2.8245749613601236, + "grad_norm": 1.1114290952682495, + "learning_rate": 4.974203410063021e-05, + "loss": 0.6953, + "num_input_tokens_seen": 1226528, + "step": 3655 + }, + { + "epoch": 2.8284389489953634, + "grad_norm": 0.7879988551139832, + "learning_rate": 4.9739612701995414e-05, + "loss": 0.5983, + "num_input_tokens_seen": 1228096, + "step": 3660 + }, + { + "epoch": 2.8323029366306027, + "grad_norm": 0.4986858665943146, + "learning_rate": 4.97371800516366e-05, + "loss": 0.5082, + "num_input_tokens_seen": 1230016, + "step": 3665 + }, + { + "epoch": 2.8361669242658425, + "grad_norm": 0.725866973400116, + "learning_rate": 4.973473615066015e-05, + "loss": 0.5721, + "num_input_tokens_seen": 1231680, + "step": 3670 + }, + { + "epoch": 2.840030911901082, + "grad_norm": 0.5717026591300964, + "learning_rate": 4.973228100017757e-05, + "loss": 0.4641, + "num_input_tokens_seen": 1233568, + "step": 3675 + }, + { + "epoch": 2.8438948995363216, + "grad_norm": 0.6371144652366638, + "learning_rate": 4.972981460130548e-05, + "loss": 0.54, + "num_input_tokens_seen": 1235584, + "step": 3680 + }, + { + "epoch": 2.847758887171561, + "grad_norm": 0.8305729031562805, + "learning_rate": 4.9727336955165606e-05, + "loss": 0.6131, + "num_input_tokens_seen": 1237216, + "step": 3685 + }, + { + "epoch": 2.8516228748068007, + "grad_norm": 0.7461141347885132, + "learning_rate": 4.97248480628848e-05, + "loss": 0.4331, + "num_input_tokens_seen": 1238976, + "step": 3690 + }, + { + "epoch": 2.8554868624420404, + "grad_norm": 0.8147376775741577, + "learning_rate": 4.972234792559503e-05, + "loss": 0.8297, + "num_input_tokens_seen": 1241248, + "step": 3695 + }, + { + "epoch": 2.8593508500772797, + "grad_norm": 0.7809189558029175, + "learning_rate": 4.971983654443335e-05, + "loss": 0.7031, + "num_input_tokens_seen": 1242880, + "step": 3700 + }, + { + "epoch": 2.863214837712519, + "grad_norm": 0.615028977394104, + "learning_rate": 4.971731392054198e-05, + "loss": 0.4065, + "num_input_tokens_seen": 1244512, + "step": 3705 + }, + { + "epoch": 2.867078825347759, + "grad_norm": 1.0226339101791382, + "learning_rate": 4.971478005506821e-05, + "loss": 0.4689, + "num_input_tokens_seen": 1245984, + "step": 3710 + }, + { + "epoch": 2.8709428129829986, + "grad_norm": 0.6471822261810303, + "learning_rate": 4.971223494916446e-05, + "loss": 0.5711, + "num_input_tokens_seen": 1247680, + "step": 3715 + }, + { + "epoch": 2.874806800618238, + "grad_norm": 0.4454590976238251, + "learning_rate": 4.970967860398825e-05, + "loss": 0.4669, + "num_input_tokens_seen": 1249312, + "step": 3720 + }, + { + "epoch": 2.8786707882534777, + "grad_norm": 0.5417119860649109, + "learning_rate": 4.9707111020702245e-05, + "loss": 0.4825, + "num_input_tokens_seen": 1251008, + "step": 3725 + }, + { + "epoch": 2.8825347758887174, + "grad_norm": 0.6805733442306519, + "learning_rate": 4.970453220047417e-05, + "loss": 0.4737, + "num_input_tokens_seen": 1252736, + "step": 3730 + }, + { + "epoch": 2.8863987635239567, + "grad_norm": 0.897691011428833, + "learning_rate": 4.970194214447691e-05, + "loss": 0.5614, + "num_input_tokens_seen": 1254368, + "step": 3735 + }, + { + "epoch": 2.890262751159196, + "grad_norm": 0.48385271430015564, + "learning_rate": 4.9699340853888435e-05, + "loss": 0.4543, + "num_input_tokens_seen": 1256384, + "step": 3740 + }, + { + "epoch": 2.894126738794436, + "grad_norm": 0.7606428861618042, + "learning_rate": 4.9696728329891806e-05, + "loss": 0.6595, + "num_input_tokens_seen": 1258080, + "step": 3745 + }, + { + "epoch": 2.8979907264296756, + "grad_norm": 0.7689409852027893, + "learning_rate": 4.9694104573675236e-05, + "loss": 0.6246, + "num_input_tokens_seen": 1259776, + "step": 3750 + }, + { + "epoch": 2.901854714064915, + "grad_norm": 0.6122153401374817, + "learning_rate": 4.9691469586432025e-05, + "loss": 0.767, + "num_input_tokens_seen": 1261376, + "step": 3755 + }, + { + "epoch": 2.9057187017001547, + "grad_norm": 0.5611766576766968, + "learning_rate": 4.968882336936056e-05, + "loss": 0.5527, + "num_input_tokens_seen": 1263072, + "step": 3760 + }, + { + "epoch": 2.909582689335394, + "grad_norm": 0.8958523273468018, + "learning_rate": 4.968616592366439e-05, + "loss": 0.4474, + "num_input_tokens_seen": 1264736, + "step": 3765 + }, + { + "epoch": 2.9134466769706338, + "grad_norm": 0.8942545652389526, + "learning_rate": 4.96834972505521e-05, + "loss": 0.5625, + "num_input_tokens_seen": 1266560, + "step": 3770 + }, + { + "epoch": 2.917310664605873, + "grad_norm": 0.879321277141571, + "learning_rate": 4.968081735123745e-05, + "loss": 0.3957, + "num_input_tokens_seen": 1268192, + "step": 3775 + }, + { + "epoch": 2.921174652241113, + "grad_norm": 0.7475071549415588, + "learning_rate": 4.9678126226939255e-05, + "loss": 0.5358, + "num_input_tokens_seen": 1269728, + "step": 3780 + }, + { + "epoch": 2.9250386398763526, + "grad_norm": 0.5887591242790222, + "learning_rate": 4.967542387888146e-05, + "loss": 0.7014, + "num_input_tokens_seen": 1271264, + "step": 3785 + }, + { + "epoch": 2.928902627511592, + "grad_norm": 0.5189406871795654, + "learning_rate": 4.9672710308293115e-05, + "loss": 0.4839, + "num_input_tokens_seen": 1272928, + "step": 3790 + }, + { + "epoch": 2.9327666151468317, + "grad_norm": 0.5869417190551758, + "learning_rate": 4.966998551640836e-05, + "loss": 0.4849, + "num_input_tokens_seen": 1274560, + "step": 3795 + }, + { + "epoch": 2.936630602782071, + "grad_norm": 0.6148399114608765, + "learning_rate": 4.966724950446644e-05, + "loss": 0.5829, + "num_input_tokens_seen": 1276640, + "step": 3800 + }, + { + "epoch": 2.9404945904173108, + "grad_norm": 0.5286694765090942, + "learning_rate": 4.9664502273711735e-05, + "loss": 0.4567, + "num_input_tokens_seen": 1278368, + "step": 3805 + }, + { + "epoch": 2.94435857805255, + "grad_norm": 0.7245681881904602, + "learning_rate": 4.966174382539367e-05, + "loss": 0.4586, + "num_input_tokens_seen": 1280320, + "step": 3810 + }, + { + "epoch": 2.94822256568779, + "grad_norm": 0.6545013785362244, + "learning_rate": 4.965897416076683e-05, + "loss": 0.4411, + "num_input_tokens_seen": 1282080, + "step": 3815 + }, + { + "epoch": 2.9520865533230296, + "grad_norm": 1.2088731527328491, + "learning_rate": 4.965619328109086e-05, + "loss": 0.9783, + "num_input_tokens_seen": 1283648, + "step": 3820 + }, + { + "epoch": 2.955950540958269, + "grad_norm": 0.5256138443946838, + "learning_rate": 4.9653401187630535e-05, + "loss": 0.8446, + "num_input_tokens_seen": 1285440, + "step": 3825 + }, + { + "epoch": 2.9598145285935082, + "grad_norm": 0.616633951663971, + "learning_rate": 4.965059788165569e-05, + "loss": 0.8153, + "num_input_tokens_seen": 1287072, + "step": 3830 + }, + { + "epoch": 2.963678516228748, + "grad_norm": 0.4775032699108124, + "learning_rate": 4.9647783364441315e-05, + "loss": 0.4496, + "num_input_tokens_seen": 1288704, + "step": 3835 + }, + { + "epoch": 2.9675425038639878, + "grad_norm": 1.3060977458953857, + "learning_rate": 4.964495763726745e-05, + "loss": 0.6103, + "num_input_tokens_seen": 1290688, + "step": 3840 + }, + { + "epoch": 2.971406491499227, + "grad_norm": 0.9492464661598206, + "learning_rate": 4.964212070141927e-05, + "loss": 0.7124, + "num_input_tokens_seen": 1292416, + "step": 3845 + }, + { + "epoch": 2.975270479134467, + "grad_norm": 1.1862841844558716, + "learning_rate": 4.963927255818701e-05, + "loss": 0.6133, + "num_input_tokens_seen": 1294048, + "step": 3850 + }, + { + "epoch": 2.9791344667697066, + "grad_norm": 1.4498687982559204, + "learning_rate": 4.9636413208866026e-05, + "loss": 0.8269, + "num_input_tokens_seen": 1295776, + "step": 3855 + }, + { + "epoch": 2.982998454404946, + "grad_norm": 0.6608230471611023, + "learning_rate": 4.963354265475678e-05, + "loss": 0.5003, + "num_input_tokens_seen": 1297600, + "step": 3860 + }, + { + "epoch": 2.9868624420401853, + "grad_norm": 0.8368356227874756, + "learning_rate": 4.963066089716481e-05, + "loss": 0.5202, + "num_input_tokens_seen": 1299232, + "step": 3865 + }, + { + "epoch": 2.990726429675425, + "grad_norm": 0.8017939329147339, + "learning_rate": 4.9627767937400754e-05, + "loss": 0.5971, + "num_input_tokens_seen": 1300928, + "step": 3870 + }, + { + "epoch": 2.9945904173106648, + "grad_norm": 0.6762252449989319, + "learning_rate": 4.962486377678035e-05, + "loss": 0.3965, + "num_input_tokens_seen": 1302560, + "step": 3875 + }, + { + "epoch": 2.998454404945904, + "grad_norm": 0.6706210970878601, + "learning_rate": 4.962194841662443e-05, + "loss": 0.4585, + "num_input_tokens_seen": 1304384, + "step": 3880 + }, + { + "epoch": 3.0, + "eval_loss": 0.5376895666122437, + "eval_runtime": 6.2198, + "eval_samples_per_second": 92.447, + "eval_steps_per_second": 23.152, + "num_input_tokens_seen": 1305024, + "step": 3882 + }, + { + "epoch": 3.002318392581144, + "grad_norm": 0.6692524552345276, + "learning_rate": 4.961902185825892e-05, + "loss": 0.5, + "num_input_tokens_seen": 1306144, + "step": 3885 + }, + { + "epoch": 3.006182380216383, + "grad_norm": 0.6399476528167725, + "learning_rate": 4.961608410301482e-05, + "loss": 0.7209, + "num_input_tokens_seen": 1307936, + "step": 3890 + }, + { + "epoch": 3.010046367851623, + "grad_norm": 0.9378648400306702, + "learning_rate": 4.961313515222826e-05, + "loss": 0.4909, + "num_input_tokens_seen": 1309472, + "step": 3895 + }, + { + "epoch": 3.0139103554868623, + "grad_norm": 0.9244621992111206, + "learning_rate": 4.9610175007240424e-05, + "loss": 0.5549, + "num_input_tokens_seen": 1311168, + "step": 3900 + }, + { + "epoch": 3.017774343122102, + "grad_norm": 0.6442186236381531, + "learning_rate": 4.960720366939762e-05, + "loss": 0.7648, + "num_input_tokens_seen": 1312896, + "step": 3905 + }, + { + "epoch": 3.021638330757342, + "grad_norm": 0.5467321276664734, + "learning_rate": 4.960422114005121e-05, + "loss": 0.4793, + "num_input_tokens_seen": 1314464, + "step": 3910 + }, + { + "epoch": 3.025502318392581, + "grad_norm": 0.7016013860702515, + "learning_rate": 4.9601227420557675e-05, + "loss": 0.79, + "num_input_tokens_seen": 1316064, + "step": 3915 + }, + { + "epoch": 3.029366306027821, + "grad_norm": 0.9105775952339172, + "learning_rate": 4.959822251227858e-05, + "loss": 0.6247, + "num_input_tokens_seen": 1317888, + "step": 3920 + }, + { + "epoch": 3.03323029366306, + "grad_norm": 1.062925934791565, + "learning_rate": 4.959520641658058e-05, + "loss": 0.5484, + "num_input_tokens_seen": 1319872, + "step": 3925 + }, + { + "epoch": 3.0370942812983, + "grad_norm": 0.6849232912063599, + "learning_rate": 4.9592179134835406e-05, + "loss": 0.5046, + "num_input_tokens_seen": 1321536, + "step": 3930 + }, + { + "epoch": 3.0409582689335393, + "grad_norm": 0.9092586636543274, + "learning_rate": 4.958914066841988e-05, + "loss": 0.5194, + "num_input_tokens_seen": 1323424, + "step": 3935 + }, + { + "epoch": 3.044822256568779, + "grad_norm": 0.5917360186576843, + "learning_rate": 4.9586091018715916e-05, + "loss": 0.4613, + "num_input_tokens_seen": 1325024, + "step": 3940 + }, + { + "epoch": 3.0486862442040183, + "grad_norm": 1.1864020824432373, + "learning_rate": 4.9583030187110525e-05, + "loss": 0.6035, + "num_input_tokens_seen": 1326720, + "step": 3945 + }, + { + "epoch": 3.052550231839258, + "grad_norm": 0.5389000773429871, + "learning_rate": 4.957995817499578e-05, + "loss": 0.4536, + "num_input_tokens_seen": 1328096, + "step": 3950 + }, + { + "epoch": 3.056414219474498, + "grad_norm": 1.2998607158660889, + "learning_rate": 4.957687498376886e-05, + "loss": 0.5597, + "num_input_tokens_seen": 1329664, + "step": 3955 + }, + { + "epoch": 3.060278207109737, + "grad_norm": 1.22916579246521, + "learning_rate": 4.9573780614832e-05, + "loss": 0.6893, + "num_input_tokens_seen": 1331296, + "step": 3960 + }, + { + "epoch": 3.064142194744977, + "grad_norm": 1.2127755880355835, + "learning_rate": 4.9570675069592553e-05, + "loss": 0.4611, + "num_input_tokens_seen": 1332896, + "step": 3965 + }, + { + "epoch": 3.0680061823802163, + "grad_norm": 0.8859630823135376, + "learning_rate": 4.956755834946294e-05, + "loss": 0.4614, + "num_input_tokens_seen": 1334688, + "step": 3970 + }, + { + "epoch": 3.071870170015456, + "grad_norm": 0.8405247926712036, + "learning_rate": 4.9564430455860655e-05, + "loss": 0.4601, + "num_input_tokens_seen": 1336576, + "step": 3975 + }, + { + "epoch": 3.0757341576506954, + "grad_norm": 0.8325818181037903, + "learning_rate": 4.95612913902083e-05, + "loss": 0.6677, + "num_input_tokens_seen": 1338016, + "step": 3980 + }, + { + "epoch": 3.079598145285935, + "grad_norm": 1.2888367176055908, + "learning_rate": 4.9558141153933515e-05, + "loss": 0.9182, + "num_input_tokens_seen": 1339424, + "step": 3985 + }, + { + "epoch": 3.0834621329211744, + "grad_norm": 0.9293464422225952, + "learning_rate": 4.955497974846907e-05, + "loss": 0.7447, + "num_input_tokens_seen": 1341152, + "step": 3990 + }, + { + "epoch": 3.087326120556414, + "grad_norm": 0.9788671731948853, + "learning_rate": 4.955180717525277e-05, + "loss": 0.6094, + "num_input_tokens_seen": 1343136, + "step": 3995 + }, + { + "epoch": 3.091190108191654, + "grad_norm": 0.3800913095474243, + "learning_rate": 4.954862343572755e-05, + "loss": 0.4768, + "num_input_tokens_seen": 1344864, + "step": 4000 + }, + { + "epoch": 3.0950540958268933, + "grad_norm": 1.0462840795516968, + "learning_rate": 4.954542853134136e-05, + "loss": 0.5344, + "num_input_tokens_seen": 1346400, + "step": 4005 + }, + { + "epoch": 3.098918083462133, + "grad_norm": 0.9114061594009399, + "learning_rate": 4.9542222463547286e-05, + "loss": 0.5235, + "num_input_tokens_seen": 1348128, + "step": 4010 + }, + { + "epoch": 3.1027820710973724, + "grad_norm": 1.2791626453399658, + "learning_rate": 4.953900523380345e-05, + "loss": 0.5158, + "num_input_tokens_seen": 1349824, + "step": 4015 + }, + { + "epoch": 3.106646058732612, + "grad_norm": 0.8000045418739319, + "learning_rate": 4.953577684357308e-05, + "loss": 0.5788, + "num_input_tokens_seen": 1351680, + "step": 4020 + }, + { + "epoch": 3.1105100463678514, + "grad_norm": 0.5990275144577026, + "learning_rate": 4.9532537294324456e-05, + "loss": 0.41, + "num_input_tokens_seen": 1353504, + "step": 4025 + }, + { + "epoch": 3.114374034003091, + "grad_norm": 0.6965031027793884, + "learning_rate": 4.9529286587530955e-05, + "loss": 0.4189, + "num_input_tokens_seen": 1355168, + "step": 4030 + }, + { + "epoch": 3.118238021638331, + "grad_norm": 0.9108599424362183, + "learning_rate": 4.9526024724671014e-05, + "loss": 0.4967, + "num_input_tokens_seen": 1357056, + "step": 4035 + }, + { + "epoch": 3.1221020092735703, + "grad_norm": 0.5733317136764526, + "learning_rate": 4.952275170722815e-05, + "loss": 0.5076, + "num_input_tokens_seen": 1358720, + "step": 4040 + }, + { + "epoch": 3.12596599690881, + "grad_norm": 0.6941772699356079, + "learning_rate": 4.951946753669095e-05, + "loss": 0.5101, + "num_input_tokens_seen": 1360544, + "step": 4045 + }, + { + "epoch": 3.1298299845440494, + "grad_norm": 0.5935850143432617, + "learning_rate": 4.951617221455307e-05, + "loss": 0.6383, + "num_input_tokens_seen": 1362240, + "step": 4050 + }, + { + "epoch": 3.133693972179289, + "grad_norm": 0.6881517767906189, + "learning_rate": 4.951286574231325e-05, + "loss": 0.8708, + "num_input_tokens_seen": 1364064, + "step": 4055 + }, + { + "epoch": 3.1375579598145285, + "grad_norm": 0.8645124435424805, + "learning_rate": 4.950954812147528e-05, + "loss": 0.478, + "num_input_tokens_seen": 1366016, + "step": 4060 + }, + { + "epoch": 3.141421947449768, + "grad_norm": 0.722486674785614, + "learning_rate": 4.9506219353548045e-05, + "loss": 0.4613, + "num_input_tokens_seen": 1367840, + "step": 4065 + }, + { + "epoch": 3.1452859350850075, + "grad_norm": 1.1016780138015747, + "learning_rate": 4.9502879440045494e-05, + "loss": 0.6869, + "num_input_tokens_seen": 1369536, + "step": 4070 + }, + { + "epoch": 3.1491499227202473, + "grad_norm": 0.6162155270576477, + "learning_rate": 4.9499528382486624e-05, + "loss": 0.4922, + "num_input_tokens_seen": 1371136, + "step": 4075 + }, + { + "epoch": 3.153013910355487, + "grad_norm": 0.5458917021751404, + "learning_rate": 4.949616618239552e-05, + "loss": 0.5596, + "num_input_tokens_seen": 1372768, + "step": 4080 + }, + { + "epoch": 3.1568778979907264, + "grad_norm": 0.7317019701004028, + "learning_rate": 4.949279284130134e-05, + "loss": 0.5243, + "num_input_tokens_seen": 1374208, + "step": 4085 + }, + { + "epoch": 3.160741885625966, + "grad_norm": 0.936220109462738, + "learning_rate": 4.94894083607383e-05, + "loss": 0.4849, + "num_input_tokens_seen": 1376000, + "step": 4090 + }, + { + "epoch": 3.1646058732612055, + "grad_norm": 0.9624898433685303, + "learning_rate": 4.948601274224567e-05, + "loss": 0.7673, + "num_input_tokens_seen": 1377664, + "step": 4095 + }, + { + "epoch": 3.1684698608964452, + "grad_norm": 1.0114758014678955, + "learning_rate": 4.94826059873678e-05, + "loss": 0.6, + "num_input_tokens_seen": 1379136, + "step": 4100 + }, + { + "epoch": 3.1723338485316845, + "grad_norm": 0.47681665420532227, + "learning_rate": 4.947918809765411e-05, + "loss": 0.4648, + "num_input_tokens_seen": 1380736, + "step": 4105 + }, + { + "epoch": 3.1761978361669243, + "grad_norm": 0.43349337577819824, + "learning_rate": 4.9475759074659076e-05, + "loss": 0.5106, + "num_input_tokens_seen": 1382400, + "step": 4110 + }, + { + "epoch": 3.1800618238021636, + "grad_norm": 0.9777429699897766, + "learning_rate": 4.947231891994223e-05, + "loss": 0.3934, + "num_input_tokens_seen": 1384576, + "step": 4115 + }, + { + "epoch": 3.1839258114374034, + "grad_norm": 0.6951308250427246, + "learning_rate": 4.946886763506818e-05, + "loss": 0.4922, + "num_input_tokens_seen": 1386272, + "step": 4120 + }, + { + "epoch": 3.187789799072643, + "grad_norm": 0.6913830637931824, + "learning_rate": 4.94654052216066e-05, + "loss": 0.4729, + "num_input_tokens_seen": 1388000, + "step": 4125 + }, + { + "epoch": 3.1916537867078825, + "grad_norm": 0.5160238146781921, + "learning_rate": 4.94619316811322e-05, + "loss": 0.4791, + "num_input_tokens_seen": 1389952, + "step": 4130 + }, + { + "epoch": 3.1955177743431222, + "grad_norm": 0.5371643900871277, + "learning_rate": 4.9458447015224776e-05, + "loss": 0.5745, + "num_input_tokens_seen": 1391744, + "step": 4135 + }, + { + "epoch": 3.1993817619783615, + "grad_norm": 1.170280933380127, + "learning_rate": 4.945495122546917e-05, + "loss": 0.8978, + "num_input_tokens_seen": 1393408, + "step": 4140 + }, + { + "epoch": 3.2032457496136013, + "grad_norm": 0.7129823565483093, + "learning_rate": 4.9451444313455295e-05, + "loss": 0.5957, + "num_input_tokens_seen": 1395104, + "step": 4145 + }, + { + "epoch": 3.2071097372488406, + "grad_norm": 0.45300835371017456, + "learning_rate": 4.944792628077811e-05, + "loss": 0.7408, + "num_input_tokens_seen": 1396832, + "step": 4150 + }, + { + "epoch": 3.2109737248840804, + "grad_norm": 1.0709989070892334, + "learning_rate": 4.9444397129037645e-05, + "loss": 0.5875, + "num_input_tokens_seen": 1398432, + "step": 4155 + }, + { + "epoch": 3.21483771251932, + "grad_norm": 0.75050288438797, + "learning_rate": 4.944085685983898e-05, + "loss": 0.434, + "num_input_tokens_seen": 1400288, + "step": 4160 + }, + { + "epoch": 3.2187017001545595, + "grad_norm": 0.8618924021720886, + "learning_rate": 4.9437305474792225e-05, + "loss": 0.4928, + "num_input_tokens_seen": 1401888, + "step": 4165 + }, + { + "epoch": 3.2225656877897992, + "grad_norm": 0.9864931702613831, + "learning_rate": 4.943374297551261e-05, + "loss": 0.8027, + "num_input_tokens_seen": 1403840, + "step": 4170 + }, + { + "epoch": 3.2264296754250386, + "grad_norm": 0.7010461091995239, + "learning_rate": 4.943016936362035e-05, + "loss": 0.7147, + "num_input_tokens_seen": 1405408, + "step": 4175 + }, + { + "epoch": 3.2302936630602783, + "grad_norm": 1.1368600130081177, + "learning_rate": 4.942658464074076e-05, + "loss": 0.4853, + "num_input_tokens_seen": 1407232, + "step": 4180 + }, + { + "epoch": 3.2341576506955176, + "grad_norm": 0.8179258108139038, + "learning_rate": 4.942298880850419e-05, + "loss": 0.5301, + "num_input_tokens_seen": 1409280, + "step": 4185 + }, + { + "epoch": 3.2380216383307574, + "grad_norm": 0.581548273563385, + "learning_rate": 4.941938186854605e-05, + "loss": 0.4547, + "num_input_tokens_seen": 1410848, + "step": 4190 + }, + { + "epoch": 3.2418856259659967, + "grad_norm": 0.7356603741645813, + "learning_rate": 4.941576382250679e-05, + "loss": 0.6567, + "num_input_tokens_seen": 1412480, + "step": 4195 + }, + { + "epoch": 3.2457496136012365, + "grad_norm": 0.7031446695327759, + "learning_rate": 4.941213467203193e-05, + "loss": 0.506, + "num_input_tokens_seen": 1414080, + "step": 4200 + }, + { + "epoch": 3.2496136012364762, + "grad_norm": 0.8540570139884949, + "learning_rate": 4.940849441877201e-05, + "loss": 0.4623, + "num_input_tokens_seen": 1415616, + "step": 4205 + }, + { + "epoch": 3.2534775888717156, + "grad_norm": 1.1943670511245728, + "learning_rate": 4.940484306438266e-05, + "loss": 0.521, + "num_input_tokens_seen": 1417312, + "step": 4210 + }, + { + "epoch": 3.2573415765069553, + "grad_norm": 1.0022284984588623, + "learning_rate": 4.940118061052453e-05, + "loss": 0.591, + "num_input_tokens_seen": 1419040, + "step": 4215 + }, + { + "epoch": 3.2612055641421946, + "grad_norm": 1.1619199514389038, + "learning_rate": 4.939750705886332e-05, + "loss": 0.5013, + "num_input_tokens_seen": 1420640, + "step": 4220 + }, + { + "epoch": 3.2650695517774344, + "grad_norm": 0.836362898349762, + "learning_rate": 4.9393822411069794e-05, + "loss": 0.6146, + "num_input_tokens_seen": 1422144, + "step": 4225 + }, + { + "epoch": 3.2689335394126737, + "grad_norm": 0.7009983062744141, + "learning_rate": 4.939012666881975e-05, + "loss": 0.4666, + "num_input_tokens_seen": 1423808, + "step": 4230 + }, + { + "epoch": 3.2727975270479135, + "grad_norm": 0.7430618405342102, + "learning_rate": 4.938641983379402e-05, + "loss": 0.5426, + "num_input_tokens_seen": 1425600, + "step": 4235 + }, + { + "epoch": 3.276661514683153, + "grad_norm": 0.7547100186347961, + "learning_rate": 4.9382701907678514e-05, + "loss": 0.4921, + "num_input_tokens_seen": 1427296, + "step": 4240 + }, + { + "epoch": 3.2805255023183926, + "grad_norm": 0.8652698397636414, + "learning_rate": 4.9378972892164156e-05, + "loss": 0.4784, + "num_input_tokens_seen": 1429024, + "step": 4245 + }, + { + "epoch": 3.2843894899536323, + "grad_norm": 1.3067700862884521, + "learning_rate": 4.9375232788946926e-05, + "loss": 0.7541, + "num_input_tokens_seen": 1430752, + "step": 4250 + }, + { + "epoch": 3.2882534775888717, + "grad_norm": 0.5043982267379761, + "learning_rate": 4.937148159972784e-05, + "loss": 0.5059, + "num_input_tokens_seen": 1432576, + "step": 4255 + }, + { + "epoch": 3.2921174652241114, + "grad_norm": 1.186449646949768, + "learning_rate": 4.936771932621297e-05, + "loss": 0.5129, + "num_input_tokens_seen": 1434592, + "step": 4260 + }, + { + "epoch": 3.2959814528593507, + "grad_norm": 0.8943009376525879, + "learning_rate": 4.936394597011342e-05, + "loss": 0.4657, + "num_input_tokens_seen": 1436192, + "step": 4265 + }, + { + "epoch": 3.2998454404945905, + "grad_norm": 0.6977298855781555, + "learning_rate": 4.936016153314534e-05, + "loss": 0.6493, + "num_input_tokens_seen": 1438112, + "step": 4270 + }, + { + "epoch": 3.30370942812983, + "grad_norm": 1.0526195764541626, + "learning_rate": 4.9356366017029897e-05, + "loss": 0.5851, + "num_input_tokens_seen": 1439808, + "step": 4275 + }, + { + "epoch": 3.3075734157650696, + "grad_norm": 0.7907839417457581, + "learning_rate": 4.9352559423493326e-05, + "loss": 0.516, + "num_input_tokens_seen": 1441376, + "step": 4280 + }, + { + "epoch": 3.3114374034003093, + "grad_norm": 0.6684276461601257, + "learning_rate": 4.934874175426689e-05, + "loss": 0.9539, + "num_input_tokens_seen": 1443136, + "step": 4285 + }, + { + "epoch": 3.3153013910355487, + "grad_norm": 0.9006250500679016, + "learning_rate": 4.9344913011086894e-05, + "loss": 1.0196, + "num_input_tokens_seen": 1444896, + "step": 4290 + }, + { + "epoch": 3.3191653786707884, + "grad_norm": 0.5442780256271362, + "learning_rate": 4.934107319569465e-05, + "loss": 0.4997, + "num_input_tokens_seen": 1446368, + "step": 4295 + }, + { + "epoch": 3.3230293663060277, + "grad_norm": 0.5007539987564087, + "learning_rate": 4.9337222309836554e-05, + "loss": 0.4224, + "num_input_tokens_seen": 1448064, + "step": 4300 + }, + { + "epoch": 3.3268933539412675, + "grad_norm": 0.7580357789993286, + "learning_rate": 4.933336035526399e-05, + "loss": 0.4395, + "num_input_tokens_seen": 1449504, + "step": 4305 + }, + { + "epoch": 3.330757341576507, + "grad_norm": 0.5057467222213745, + "learning_rate": 4.932948733373342e-05, + "loss": 0.4697, + "num_input_tokens_seen": 1451296, + "step": 4310 + }, + { + "epoch": 3.3346213292117466, + "grad_norm": 0.7365680932998657, + "learning_rate": 4.93256032470063e-05, + "loss": 0.4175, + "num_input_tokens_seen": 1452992, + "step": 4315 + }, + { + "epoch": 3.338485316846986, + "grad_norm": 0.767287015914917, + "learning_rate": 4.932170809684915e-05, + "loss": 0.4897, + "num_input_tokens_seen": 1454688, + "step": 4320 + }, + { + "epoch": 3.3423493044822257, + "grad_norm": 0.6868230700492859, + "learning_rate": 4.93178018850335e-05, + "loss": 0.6058, + "num_input_tokens_seen": 1456448, + "step": 4325 + }, + { + "epoch": 3.346213292117465, + "grad_norm": 0.6111182570457458, + "learning_rate": 4.931388461333591e-05, + "loss": 0.53, + "num_input_tokens_seen": 1458048, + "step": 4330 + }, + { + "epoch": 3.3500772797527048, + "grad_norm": 0.7401806712150574, + "learning_rate": 4.9309956283538e-05, + "loss": 0.4184, + "num_input_tokens_seen": 1459584, + "step": 4335 + }, + { + "epoch": 3.3539412673879445, + "grad_norm": 0.6967880725860596, + "learning_rate": 4.9306016897426375e-05, + "loss": 0.507, + "num_input_tokens_seen": 1461248, + "step": 4340 + }, + { + "epoch": 3.357805255023184, + "grad_norm": 0.5394587516784668, + "learning_rate": 4.930206645679271e-05, + "loss": 0.5622, + "num_input_tokens_seen": 1462784, + "step": 4345 + }, + { + "epoch": 3.3616692426584236, + "grad_norm": 0.7040392756462097, + "learning_rate": 4.929810496343368e-05, + "loss": 0.5584, + "num_input_tokens_seen": 1464384, + "step": 4350 + }, + { + "epoch": 3.365533230293663, + "grad_norm": 0.7832682728767395, + "learning_rate": 4.9294132419150995e-05, + "loss": 0.5428, + "num_input_tokens_seen": 1466304, + "step": 4355 + }, + { + "epoch": 3.3693972179289027, + "grad_norm": 0.904850959777832, + "learning_rate": 4.929014882575139e-05, + "loss": 0.5137, + "num_input_tokens_seen": 1468000, + "step": 4360 + }, + { + "epoch": 3.373261205564142, + "grad_norm": 1.0855481624603271, + "learning_rate": 4.928615418504664e-05, + "loss": 0.6377, + "num_input_tokens_seen": 1469792, + "step": 4365 + }, + { + "epoch": 3.3771251931993818, + "grad_norm": 0.6517446637153625, + "learning_rate": 4.9282148498853513e-05, + "loss": 0.4677, + "num_input_tokens_seen": 1471488, + "step": 4370 + }, + { + "epoch": 3.3809891808346215, + "grad_norm": 0.7726370692253113, + "learning_rate": 4.927813176899383e-05, + "loss": 0.4381, + "num_input_tokens_seen": 1473024, + "step": 4375 + }, + { + "epoch": 3.384853168469861, + "grad_norm": 0.6817806363105774, + "learning_rate": 4.927410399729443e-05, + "loss": 0.5104, + "num_input_tokens_seen": 1474688, + "step": 4380 + }, + { + "epoch": 3.3887171561051006, + "grad_norm": 0.792678713798523, + "learning_rate": 4.9270065185587154e-05, + "loss": 0.4554, + "num_input_tokens_seen": 1476288, + "step": 4385 + }, + { + "epoch": 3.39258114374034, + "grad_norm": 0.5130274891853333, + "learning_rate": 4.9266015335708884e-05, + "loss": 0.4816, + "num_input_tokens_seen": 1477632, + "step": 4390 + }, + { + "epoch": 3.3964451313755797, + "grad_norm": 1.5030620098114014, + "learning_rate": 4.9261954449501525e-05, + "loss": 0.6966, + "num_input_tokens_seen": 1479328, + "step": 4395 + }, + { + "epoch": 3.400309119010819, + "grad_norm": 0.6817244291305542, + "learning_rate": 4.925788252881197e-05, + "loss": 0.4868, + "num_input_tokens_seen": 1480928, + "step": 4400 + }, + { + "epoch": 3.4041731066460588, + "grad_norm": 0.7191501259803772, + "learning_rate": 4.925379957549217e-05, + "loss": 0.4042, + "num_input_tokens_seen": 1482720, + "step": 4405 + }, + { + "epoch": 3.4080370942812985, + "grad_norm": 1.0146466493606567, + "learning_rate": 4.924970559139908e-05, + "loss": 0.7505, + "num_input_tokens_seen": 1484384, + "step": 4410 + }, + { + "epoch": 3.411901081916538, + "grad_norm": 0.8537068367004395, + "learning_rate": 4.9245600578394654e-05, + "loss": 0.6523, + "num_input_tokens_seen": 1486208, + "step": 4415 + }, + { + "epoch": 3.4157650695517776, + "grad_norm": 0.6270790100097656, + "learning_rate": 4.9241484538345887e-05, + "loss": 0.4586, + "num_input_tokens_seen": 1487712, + "step": 4420 + }, + { + "epoch": 3.419629057187017, + "grad_norm": 1.2254841327667236, + "learning_rate": 4.923735747312477e-05, + "loss": 0.6089, + "num_input_tokens_seen": 1489536, + "step": 4425 + }, + { + "epoch": 3.4234930448222567, + "grad_norm": 0.7544763088226318, + "learning_rate": 4.923321938460833e-05, + "loss": 0.3997, + "num_input_tokens_seen": 1491200, + "step": 4430 + }, + { + "epoch": 3.427357032457496, + "grad_norm": 0.5670571327209473, + "learning_rate": 4.922907027467858e-05, + "loss": 0.3837, + "num_input_tokens_seen": 1492928, + "step": 4435 + }, + { + "epoch": 3.4312210200927358, + "grad_norm": 0.9199605584144592, + "learning_rate": 4.922491014522257e-05, + "loss": 0.6508, + "num_input_tokens_seen": 1494432, + "step": 4440 + }, + { + "epoch": 3.435085007727975, + "grad_norm": 0.5446799397468567, + "learning_rate": 4.922073899813235e-05, + "loss": 0.488, + "num_input_tokens_seen": 1496096, + "step": 4445 + }, + { + "epoch": 3.438948995363215, + "grad_norm": 0.9909490346908569, + "learning_rate": 4.9216556835304975e-05, + "loss": 0.6295, + "num_input_tokens_seen": 1497984, + "step": 4450 + }, + { + "epoch": 3.442812982998454, + "grad_norm": 1.2625200748443604, + "learning_rate": 4.9212363658642536e-05, + "loss": 0.5598, + "num_input_tokens_seen": 1499584, + "step": 4455 + }, + { + "epoch": 3.446676970633694, + "grad_norm": 0.7942723631858826, + "learning_rate": 4.920815947005209e-05, + "loss": 0.3948, + "num_input_tokens_seen": 1501184, + "step": 4460 + }, + { + "epoch": 3.4505409582689337, + "grad_norm": 0.572092592716217, + "learning_rate": 4.920394427144575e-05, + "loss": 0.5431, + "num_input_tokens_seen": 1502848, + "step": 4465 + }, + { + "epoch": 3.454404945904173, + "grad_norm": 0.7701777219772339, + "learning_rate": 4.91997180647406e-05, + "loss": 0.356, + "num_input_tokens_seen": 1504544, + "step": 4470 + }, + { + "epoch": 3.458268933539413, + "grad_norm": 0.5603532791137695, + "learning_rate": 4.9195480851858743e-05, + "loss": 0.444, + "num_input_tokens_seen": 1506048, + "step": 4475 + }, + { + "epoch": 3.462132921174652, + "grad_norm": 0.5866876244544983, + "learning_rate": 4.91912326347273e-05, + "loss": 0.5008, + "num_input_tokens_seen": 1507936, + "step": 4480 + }, + { + "epoch": 3.465996908809892, + "grad_norm": 1.1248518228530884, + "learning_rate": 4.9186973415278375e-05, + "loss": 0.4907, + "num_input_tokens_seen": 1509504, + "step": 4485 + }, + { + "epoch": 3.469860896445131, + "grad_norm": 0.5839817523956299, + "learning_rate": 4.918270319544909e-05, + "loss": 0.4779, + "num_input_tokens_seen": 1511072, + "step": 4490 + }, + { + "epoch": 3.473724884080371, + "grad_norm": 0.5727483630180359, + "learning_rate": 4.917842197718157e-05, + "loss": 0.4274, + "num_input_tokens_seen": 1512704, + "step": 4495 + }, + { + "epoch": 3.4775888717156107, + "grad_norm": 0.8275544047355652, + "learning_rate": 4.917412976242294e-05, + "loss": 0.6999, + "num_input_tokens_seen": 1514368, + "step": 4500 + }, + { + "epoch": 3.48145285935085, + "grad_norm": 0.5195562243461609, + "learning_rate": 4.916982655312532e-05, + "loss": 0.5876, + "num_input_tokens_seen": 1516064, + "step": 4505 + }, + { + "epoch": 3.48531684698609, + "grad_norm": 1.0977343320846558, + "learning_rate": 4.916551235124582e-05, + "loss": 0.4103, + "num_input_tokens_seen": 1517792, + "step": 4510 + }, + { + "epoch": 3.489180834621329, + "grad_norm": 0.6532158255577087, + "learning_rate": 4.91611871587466e-05, + "loss": 0.3732, + "num_input_tokens_seen": 1519584, + "step": 4515 + }, + { + "epoch": 3.493044822256569, + "grad_norm": 0.7309983968734741, + "learning_rate": 4.915685097759476e-05, + "loss": 0.4762, + "num_input_tokens_seen": 1521472, + "step": 4520 + }, + { + "epoch": 3.496908809891808, + "grad_norm": 0.9904599785804749, + "learning_rate": 4.915250380976242e-05, + "loss": 0.458, + "num_input_tokens_seen": 1523264, + "step": 4525 + }, + { + "epoch": 3.500772797527048, + "grad_norm": 0.8072605133056641, + "learning_rate": 4.914814565722671e-05, + "loss": 0.4754, + "num_input_tokens_seen": 1524672, + "step": 4530 + }, + { + "epoch": 3.5046367851622877, + "grad_norm": 1.0266181230545044, + "learning_rate": 4.914377652196973e-05, + "loss": 0.5407, + "num_input_tokens_seen": 1526496, + "step": 4535 + }, + { + "epoch": 3.508500772797527, + "grad_norm": 0.429245263338089, + "learning_rate": 4.9139396405978604e-05, + "loss": 0.7704, + "num_input_tokens_seen": 1528096, + "step": 4540 + }, + { + "epoch": 3.5123647604327664, + "grad_norm": 1.5166023969650269, + "learning_rate": 4.913500531124543e-05, + "loss": 1.054, + "num_input_tokens_seen": 1529920, + "step": 4545 + }, + { + "epoch": 3.516228748068006, + "grad_norm": 1.737676978111267, + "learning_rate": 4.9130603239767294e-05, + "loss": 0.7654, + "num_input_tokens_seen": 1531456, + "step": 4550 + }, + { + "epoch": 3.520092735703246, + "grad_norm": 2.1113433837890625, + "learning_rate": 4.912619019354629e-05, + "loss": 0.4555, + "num_input_tokens_seen": 1533312, + "step": 4555 + }, + { + "epoch": 3.523956723338485, + "grad_norm": 1.0889248847961426, + "learning_rate": 4.912176617458951e-05, + "loss": 1.0186, + "num_input_tokens_seen": 1535040, + "step": 4560 + }, + { + "epoch": 3.527820710973725, + "grad_norm": 0.5419684052467346, + "learning_rate": 4.911733118490901e-05, + "loss": 0.4567, + "num_input_tokens_seen": 1536864, + "step": 4565 + }, + { + "epoch": 3.5316846986089647, + "grad_norm": 1.129286289215088, + "learning_rate": 4.9112885226521846e-05, + "loss": 0.532, + "num_input_tokens_seen": 1538688, + "step": 4570 + }, + { + "epoch": 3.535548686244204, + "grad_norm": 0.5378193855285645, + "learning_rate": 4.9108428301450084e-05, + "loss": 0.5461, + "num_input_tokens_seen": 1540256, + "step": 4575 + }, + { + "epoch": 3.5394126738794434, + "grad_norm": 0.8871030807495117, + "learning_rate": 4.9103960411720754e-05, + "loss": 0.5066, + "num_input_tokens_seen": 1541984, + "step": 4580 + }, + { + "epoch": 3.543276661514683, + "grad_norm": 1.8419796228408813, + "learning_rate": 4.909948155936587e-05, + "loss": 0.5434, + "num_input_tokens_seen": 1543648, + "step": 4585 + }, + { + "epoch": 3.547140649149923, + "grad_norm": 1.0311172008514404, + "learning_rate": 4.9094991746422434e-05, + "loss": 0.5051, + "num_input_tokens_seen": 1545344, + "step": 4590 + }, + { + "epoch": 3.551004636785162, + "grad_norm": 0.6786298751831055, + "learning_rate": 4.909049097493247e-05, + "loss": 0.4819, + "num_input_tokens_seen": 1547392, + "step": 4595 + }, + { + "epoch": 3.554868624420402, + "grad_norm": 1.4390928745269775, + "learning_rate": 4.9085979246942935e-05, + "loss": 0.6938, + "num_input_tokens_seen": 1549152, + "step": 4600 + }, + { + "epoch": 3.5587326120556413, + "grad_norm": 0.5312060713768005, + "learning_rate": 4.908145656450579e-05, + "loss": 0.431, + "num_input_tokens_seen": 1550784, + "step": 4605 + }, + { + "epoch": 3.562596599690881, + "grad_norm": 0.8115399479866028, + "learning_rate": 4.9076922929677984e-05, + "loss": 0.5195, + "num_input_tokens_seen": 1552512, + "step": 4610 + }, + { + "epoch": 3.5664605873261204, + "grad_norm": 0.6858322620391846, + "learning_rate": 4.907237834452144e-05, + "loss": 0.4761, + "num_input_tokens_seen": 1553952, + "step": 4615 + }, + { + "epoch": 3.57032457496136, + "grad_norm": 0.7551806569099426, + "learning_rate": 4.9067822811103055e-05, + "loss": 0.7577, + "num_input_tokens_seen": 1555936, + "step": 4620 + }, + { + "epoch": 3.5741885625966, + "grad_norm": 1.0317721366882324, + "learning_rate": 4.906325633149472e-05, + "loss": 0.7331, + "num_input_tokens_seen": 1557408, + "step": 4625 + }, + { + "epoch": 3.578052550231839, + "grad_norm": 0.6299299597740173, + "learning_rate": 4.9058678907773305e-05, + "loss": 0.4446, + "num_input_tokens_seen": 1559104, + "step": 4630 + }, + { + "epoch": 3.581916537867079, + "grad_norm": 0.579904317855835, + "learning_rate": 4.905409054202063e-05, + "loss": 0.7364, + "num_input_tokens_seen": 1560896, + "step": 4635 + }, + { + "epoch": 3.5857805255023183, + "grad_norm": 0.6001703143119812, + "learning_rate": 4.904949123632353e-05, + "loss": 0.4695, + "num_input_tokens_seen": 1562656, + "step": 4640 + }, + { + "epoch": 3.589644513137558, + "grad_norm": 0.5586643815040588, + "learning_rate": 4.9044880992773776e-05, + "loss": 0.5396, + "num_input_tokens_seen": 1564608, + "step": 4645 + }, + { + "epoch": 3.5935085007727974, + "grad_norm": 0.7880778908729553, + "learning_rate": 4.904025981346816e-05, + "loss": 0.4962, + "num_input_tokens_seen": 1566176, + "step": 4650 + }, + { + "epoch": 3.597372488408037, + "grad_norm": 0.6998445987701416, + "learning_rate": 4.903562770050841e-05, + "loss": 0.5006, + "num_input_tokens_seen": 1567808, + "step": 4655 + }, + { + "epoch": 3.601236476043277, + "grad_norm": 0.47474539279937744, + "learning_rate": 4.9030984656001236e-05, + "loss": 0.4594, + "num_input_tokens_seen": 1569408, + "step": 4660 + }, + { + "epoch": 3.605100463678516, + "grad_norm": 0.700725793838501, + "learning_rate": 4.9026330682058316e-05, + "loss": 0.6331, + "num_input_tokens_seen": 1570912, + "step": 4665 + }, + { + "epoch": 3.6089644513137555, + "grad_norm": 1.2765792608261108, + "learning_rate": 4.902166578079633e-05, + "loss": 0.5014, + "num_input_tokens_seen": 1572480, + "step": 4670 + }, + { + "epoch": 3.6128284389489953, + "grad_norm": 0.9706118106842041, + "learning_rate": 4.9016989954336875e-05, + "loss": 0.5445, + "num_input_tokens_seen": 1574240, + "step": 4675 + }, + { + "epoch": 3.616692426584235, + "grad_norm": 0.6145281195640564, + "learning_rate": 4.9012303204806556e-05, + "loss": 0.4383, + "num_input_tokens_seen": 1575744, + "step": 4680 + }, + { + "epoch": 3.6205564142194744, + "grad_norm": 0.5513226985931396, + "learning_rate": 4.900760553433694e-05, + "loss": 0.5806, + "num_input_tokens_seen": 1577600, + "step": 4685 + }, + { + "epoch": 3.624420401854714, + "grad_norm": 1.1943448781967163, + "learning_rate": 4.900289694506455e-05, + "loss": 0.5453, + "num_input_tokens_seen": 1579360, + "step": 4690 + }, + { + "epoch": 3.628284389489954, + "grad_norm": 0.5047418475151062, + "learning_rate": 4.899817743913088e-05, + "loss": 0.6098, + "num_input_tokens_seen": 1581152, + "step": 4695 + }, + { + "epoch": 3.6321483771251932, + "grad_norm": 0.6173385977745056, + "learning_rate": 4.8993447018682395e-05, + "loss": 0.4896, + "num_input_tokens_seen": 1582848, + "step": 4700 + }, + { + "epoch": 3.6360123647604325, + "grad_norm": 1.1088685989379883, + "learning_rate": 4.898870568587051e-05, + "loss": 1.0363, + "num_input_tokens_seen": 1584512, + "step": 4705 + }, + { + "epoch": 3.6398763523956723, + "grad_norm": 0.9079142808914185, + "learning_rate": 4.898395344285162e-05, + "loss": 0.5687, + "num_input_tokens_seen": 1586560, + "step": 4710 + }, + { + "epoch": 3.643740340030912, + "grad_norm": 0.7804763913154602, + "learning_rate": 4.897919029178707e-05, + "loss": 0.4417, + "num_input_tokens_seen": 1588416, + "step": 4715 + }, + { + "epoch": 3.6476043276661514, + "grad_norm": 1.6030346155166626, + "learning_rate": 4.8974416234843165e-05, + "loss": 0.511, + "num_input_tokens_seen": 1590208, + "step": 4720 + }, + { + "epoch": 3.651468315301391, + "grad_norm": 1.220643401145935, + "learning_rate": 4.896963127419118e-05, + "loss": 0.4922, + "num_input_tokens_seen": 1591840, + "step": 4725 + }, + { + "epoch": 3.6553323029366305, + "grad_norm": 1.9330768585205078, + "learning_rate": 4.896483541200735e-05, + "loss": 0.5417, + "num_input_tokens_seen": 1593664, + "step": 4730 + }, + { + "epoch": 3.6591962905718702, + "grad_norm": 1.3645195960998535, + "learning_rate": 4.896002865047285e-05, + "loss": 0.6047, + "num_input_tokens_seen": 1595264, + "step": 4735 + }, + { + "epoch": 3.6630602782071096, + "grad_norm": 0.7741415500640869, + "learning_rate": 4.8955210991773825e-05, + "loss": 0.4931, + "num_input_tokens_seen": 1596992, + "step": 4740 + }, + { + "epoch": 3.6669242658423493, + "grad_norm": 0.8445459008216858, + "learning_rate": 4.895038243810138e-05, + "loss": 0.4933, + "num_input_tokens_seen": 1598656, + "step": 4745 + }, + { + "epoch": 3.670788253477589, + "grad_norm": 1.9139037132263184, + "learning_rate": 4.8945542991651574e-05, + "loss": 0.4437, + "num_input_tokens_seen": 1600128, + "step": 4750 + }, + { + "epoch": 3.6746522411128284, + "grad_norm": 0.6072549223899841, + "learning_rate": 4.894069265462542e-05, + "loss": 0.7284, + "num_input_tokens_seen": 1601888, + "step": 4755 + }, + { + "epoch": 3.678516228748068, + "grad_norm": 0.8049539923667908, + "learning_rate": 4.893583142922885e-05, + "loss": 0.5041, + "num_input_tokens_seen": 1603424, + "step": 4760 + }, + { + "epoch": 3.6823802163833075, + "grad_norm": 0.9471360445022583, + "learning_rate": 4.893095931767281e-05, + "loss": 0.6634, + "num_input_tokens_seen": 1605120, + "step": 4765 + }, + { + "epoch": 3.6862442040185472, + "grad_norm": 0.6487666964530945, + "learning_rate": 4.8926076322173156e-05, + "loss": 0.4395, + "num_input_tokens_seen": 1606976, + "step": 4770 + }, + { + "epoch": 3.6901081916537866, + "grad_norm": 0.9448869824409485, + "learning_rate": 4.892118244495071e-05, + "loss": 0.728, + "num_input_tokens_seen": 1608896, + "step": 4775 + }, + { + "epoch": 3.6939721792890263, + "grad_norm": 0.6307618021965027, + "learning_rate": 4.891627768823122e-05, + "loss": 0.4757, + "num_input_tokens_seen": 1610496, + "step": 4780 + }, + { + "epoch": 3.697836166924266, + "grad_norm": 0.7095080018043518, + "learning_rate": 4.8911362054245416e-05, + "loss": 0.5003, + "num_input_tokens_seen": 1612256, + "step": 4785 + }, + { + "epoch": 3.7017001545595054, + "grad_norm": 1.097547173500061, + "learning_rate": 4.890643554522894e-05, + "loss": 0.5536, + "num_input_tokens_seen": 1613792, + "step": 4790 + }, + { + "epoch": 3.7055641421947447, + "grad_norm": 0.9648586511611938, + "learning_rate": 4.890149816342241e-05, + "loss": 0.5185, + "num_input_tokens_seen": 1615488, + "step": 4795 + }, + { + "epoch": 3.7094281298299845, + "grad_norm": 0.9328175187110901, + "learning_rate": 4.889654991107138e-05, + "loss": 0.7335, + "num_input_tokens_seen": 1617088, + "step": 4800 + }, + { + "epoch": 3.7132921174652243, + "grad_norm": 1.3313509225845337, + "learning_rate": 4.889159079042634e-05, + "loss": 0.6064, + "num_input_tokens_seen": 1618752, + "step": 4805 + }, + { + "epoch": 3.7171561051004636, + "grad_norm": 1.5336239337921143, + "learning_rate": 4.888662080374272e-05, + "loss": 0.5346, + "num_input_tokens_seen": 1620416, + "step": 4810 + }, + { + "epoch": 3.7210200927357033, + "grad_norm": 0.5737871527671814, + "learning_rate": 4.8881639953280914e-05, + "loss": 0.453, + "num_input_tokens_seen": 1622016, + "step": 4815 + }, + { + "epoch": 3.7248840803709427, + "grad_norm": 0.7984122633934021, + "learning_rate": 4.887664824130623e-05, + "loss": 0.4282, + "num_input_tokens_seen": 1623616, + "step": 4820 + }, + { + "epoch": 3.7287480680061824, + "grad_norm": 0.815140426158905, + "learning_rate": 4.887164567008894e-05, + "loss": 0.5725, + "num_input_tokens_seen": 1625216, + "step": 4825 + }, + { + "epoch": 3.7326120556414217, + "grad_norm": 0.6559157371520996, + "learning_rate": 4.886663224190424e-05, + "loss": 0.7283, + "num_input_tokens_seen": 1626944, + "step": 4830 + }, + { + "epoch": 3.7364760432766615, + "grad_norm": 1.4241310358047485, + "learning_rate": 4.886160795903226e-05, + "loss": 0.5998, + "num_input_tokens_seen": 1628704, + "step": 4835 + }, + { + "epoch": 3.7403400309119013, + "grad_norm": 0.9293547868728638, + "learning_rate": 4.885657282375808e-05, + "loss": 0.4721, + "num_input_tokens_seen": 1630336, + "step": 4840 + }, + { + "epoch": 3.7442040185471406, + "grad_norm": 0.5614771246910095, + "learning_rate": 4.8851526838371706e-05, + "loss": 0.472, + "num_input_tokens_seen": 1631968, + "step": 4845 + }, + { + "epoch": 3.7480680061823803, + "grad_norm": 1.5944970846176147, + "learning_rate": 4.8846470005168085e-05, + "loss": 0.6301, + "num_input_tokens_seen": 1633440, + "step": 4850 + }, + { + "epoch": 3.7519319938176197, + "grad_norm": 0.721488893032074, + "learning_rate": 4.8841402326447096e-05, + "loss": 0.4989, + "num_input_tokens_seen": 1635040, + "step": 4855 + }, + { + "epoch": 3.7557959814528594, + "grad_norm": 0.46438825130462646, + "learning_rate": 4.883632380451355e-05, + "loss": 0.3934, + "num_input_tokens_seen": 1636736, + "step": 4860 + }, + { + "epoch": 3.7596599690880987, + "grad_norm": 0.6369669437408447, + "learning_rate": 4.8831234441677186e-05, + "loss": 0.4395, + "num_input_tokens_seen": 1638336, + "step": 4865 + }, + { + "epoch": 3.7635239567233385, + "grad_norm": 0.6508880853652954, + "learning_rate": 4.882613424025267e-05, + "loss": 0.6238, + "num_input_tokens_seen": 1639968, + "step": 4870 + }, + { + "epoch": 3.7673879443585783, + "grad_norm": 0.9201847910881042, + "learning_rate": 4.8821023202559624e-05, + "loss": 0.4608, + "num_input_tokens_seen": 1641760, + "step": 4875 + }, + { + "epoch": 3.7712519319938176, + "grad_norm": 1.4950332641601562, + "learning_rate": 4.881590133092256e-05, + "loss": 0.6623, + "num_input_tokens_seen": 1643200, + "step": 4880 + }, + { + "epoch": 3.7751159196290573, + "grad_norm": 1.5492192506790161, + "learning_rate": 4.881076862767095e-05, + "loss": 0.6934, + "num_input_tokens_seen": 1645024, + "step": 4885 + }, + { + "epoch": 3.7789799072642967, + "grad_norm": 1.849652647972107, + "learning_rate": 4.880562509513917e-05, + "loss": 0.6824, + "num_input_tokens_seen": 1646944, + "step": 4890 + }, + { + "epoch": 3.7828438948995364, + "grad_norm": 0.9685112237930298, + "learning_rate": 4.8800470735666525e-05, + "loss": 0.4572, + "num_input_tokens_seen": 1648704, + "step": 4895 + }, + { + "epoch": 3.7867078825347757, + "grad_norm": 1.117743968963623, + "learning_rate": 4.879530555159726e-05, + "loss": 0.6315, + "num_input_tokens_seen": 1650272, + "step": 4900 + }, + { + "epoch": 3.7905718701700155, + "grad_norm": 0.7051230669021606, + "learning_rate": 4.8790129545280514e-05, + "loss": 0.4453, + "num_input_tokens_seen": 1652032, + "step": 4905 + }, + { + "epoch": 3.7944358578052553, + "grad_norm": 0.7262768745422363, + "learning_rate": 4.8784942719070395e-05, + "loss": 0.6504, + "num_input_tokens_seen": 1653600, + "step": 4910 + }, + { + "epoch": 3.7982998454404946, + "grad_norm": 0.60104900598526, + "learning_rate": 4.8779745075325874e-05, + "loss": 0.4165, + "num_input_tokens_seen": 1655712, + "step": 4915 + }, + { + "epoch": 3.802163833075734, + "grad_norm": 1.2906758785247803, + "learning_rate": 4.8774536616410884e-05, + "loss": 0.6572, + "num_input_tokens_seen": 1657248, + "step": 4920 + }, + { + "epoch": 3.8060278207109737, + "grad_norm": 1.2297672033309937, + "learning_rate": 4.876931734469425e-05, + "loss": 0.6897, + "num_input_tokens_seen": 1658912, + "step": 4925 + }, + { + "epoch": 3.8098918083462134, + "grad_norm": 0.9836766123771667, + "learning_rate": 4.876408726254975e-05, + "loss": 0.4555, + "num_input_tokens_seen": 1660800, + "step": 4930 + }, + { + "epoch": 3.8137557959814528, + "grad_norm": 1.0727016925811768, + "learning_rate": 4.875884637235605e-05, + "loss": 0.5346, + "num_input_tokens_seen": 1662592, + "step": 4935 + }, + { + "epoch": 3.8176197836166925, + "grad_norm": 0.8303393721580505, + "learning_rate": 4.8753594676496725e-05, + "loss": 0.496, + "num_input_tokens_seen": 1664288, + "step": 4940 + }, + { + "epoch": 3.821483771251932, + "grad_norm": 1.1210566759109497, + "learning_rate": 4.874833217736029e-05, + "loss": 0.4883, + "num_input_tokens_seen": 1665824, + "step": 4945 + }, + { + "epoch": 3.8253477588871716, + "grad_norm": 0.6832841634750366, + "learning_rate": 4.874305887734016e-05, + "loss": 0.487, + "num_input_tokens_seen": 1667712, + "step": 4950 + }, + { + "epoch": 3.829211746522411, + "grad_norm": 0.7186324596405029, + "learning_rate": 4.8737774778834654e-05, + "loss": 0.4008, + "num_input_tokens_seen": 1669248, + "step": 4955 + }, + { + "epoch": 3.8330757341576507, + "grad_norm": 0.5219962000846863, + "learning_rate": 4.8732479884247025e-05, + "loss": 0.4946, + "num_input_tokens_seen": 1670656, + "step": 4960 + }, + { + "epoch": 3.8369397217928904, + "grad_norm": 0.5625699758529663, + "learning_rate": 4.872717419598541e-05, + "loss": 0.5, + "num_input_tokens_seen": 1672256, + "step": 4965 + }, + { + "epoch": 3.8408037094281298, + "grad_norm": 0.8781502842903137, + "learning_rate": 4.872185771646288e-05, + "loss": 0.4667, + "num_input_tokens_seen": 1674080, + "step": 4970 + }, + { + "epoch": 3.8446676970633695, + "grad_norm": 0.5027551054954529, + "learning_rate": 4.87165304480974e-05, + "loss": 0.4794, + "num_input_tokens_seen": 1675904, + "step": 4975 + }, + { + "epoch": 3.848531684698609, + "grad_norm": 0.7467421889305115, + "learning_rate": 4.871119239331183e-05, + "loss": 0.4326, + "num_input_tokens_seen": 1677472, + "step": 4980 + }, + { + "epoch": 3.8523956723338486, + "grad_norm": 1.0676099061965942, + "learning_rate": 4.870584355453396e-05, + "loss": 0.582, + "num_input_tokens_seen": 1679168, + "step": 4985 + }, + { + "epoch": 3.856259659969088, + "grad_norm": 0.954439103603363, + "learning_rate": 4.870048393419647e-05, + "loss": 0.783, + "num_input_tokens_seen": 1680736, + "step": 4990 + }, + { + "epoch": 3.8601236476043277, + "grad_norm": 0.7265281677246094, + "learning_rate": 4.869511353473696e-05, + "loss": 0.5242, + "num_input_tokens_seen": 1682624, + "step": 4995 + }, + { + "epoch": 3.8639876352395675, + "grad_norm": 0.7775354385375977, + "learning_rate": 4.868973235859791e-05, + "loss": 0.4554, + "num_input_tokens_seen": 1684288, + "step": 5000 + }, + { + "epoch": 3.8678516228748068, + "grad_norm": 0.669346272945404, + "learning_rate": 4.8684340408226696e-05, + "loss": 0.6692, + "num_input_tokens_seen": 1685952, + "step": 5005 + }, + { + "epoch": 3.871715610510046, + "grad_norm": 1.3222934007644653, + "learning_rate": 4.867893768607564e-05, + "loss": 0.6605, + "num_input_tokens_seen": 1687520, + "step": 5010 + }, + { + "epoch": 3.875579598145286, + "grad_norm": 0.6794452667236328, + "learning_rate": 4.86735241946019e-05, + "loss": 0.5503, + "num_input_tokens_seen": 1689248, + "step": 5015 + }, + { + "epoch": 3.8794435857805256, + "grad_norm": 0.6168376803398132, + "learning_rate": 4.86680999362676e-05, + "loss": 0.4413, + "num_input_tokens_seen": 1690848, + "step": 5020 + }, + { + "epoch": 3.883307573415765, + "grad_norm": 0.6392268538475037, + "learning_rate": 4.86626649135397e-05, + "loss": 0.4698, + "num_input_tokens_seen": 1692512, + "step": 5025 + }, + { + "epoch": 3.8871715610510047, + "grad_norm": 0.7058766484260559, + "learning_rate": 4.865721912889009e-05, + "loss": 0.3981, + "num_input_tokens_seen": 1694080, + "step": 5030 + }, + { + "epoch": 3.8910355486862445, + "grad_norm": 1.0588736534118652, + "learning_rate": 4.8651762584795535e-05, + "loss": 0.5091, + "num_input_tokens_seen": 1695680, + "step": 5035 + }, + { + "epoch": 3.894899536321484, + "grad_norm": 1.4063360691070557, + "learning_rate": 4.864629528373771e-05, + "loss": 0.5177, + "num_input_tokens_seen": 1697504, + "step": 5040 + }, + { + "epoch": 3.898763523956723, + "grad_norm": 1.232316255569458, + "learning_rate": 4.864081722820318e-05, + "loss": 0.5856, + "num_input_tokens_seen": 1699328, + "step": 5045 + }, + { + "epoch": 3.902627511591963, + "grad_norm": 0.8070915937423706, + "learning_rate": 4.86353284206834e-05, + "loss": 0.5083, + "num_input_tokens_seen": 1700896, + "step": 5050 + }, + { + "epoch": 3.9064914992272026, + "grad_norm": 0.6913489699363708, + "learning_rate": 4.862982886367471e-05, + "loss": 0.4487, + "num_input_tokens_seen": 1702304, + "step": 5055 + }, + { + "epoch": 3.910355486862442, + "grad_norm": 1.1003528833389282, + "learning_rate": 4.862431855967833e-05, + "loss": 0.6143, + "num_input_tokens_seen": 1703680, + "step": 5060 + }, + { + "epoch": 3.9142194744976817, + "grad_norm": 1.1706411838531494, + "learning_rate": 4.86187975112004e-05, + "loss": 0.6927, + "num_input_tokens_seen": 1705152, + "step": 5065 + }, + { + "epoch": 3.918083462132921, + "grad_norm": 0.7192736864089966, + "learning_rate": 4.8613265720751904e-05, + "loss": 0.5089, + "num_input_tokens_seen": 1706944, + "step": 5070 + }, + { + "epoch": 3.921947449768161, + "grad_norm": 0.6184355020523071, + "learning_rate": 4.860772319084875e-05, + "loss": 0.63, + "num_input_tokens_seen": 1708736, + "step": 5075 + }, + { + "epoch": 3.9258114374034, + "grad_norm": 0.7095210552215576, + "learning_rate": 4.8602169924011703e-05, + "loss": 0.6847, + "num_input_tokens_seen": 1710560, + "step": 5080 + }, + { + "epoch": 3.92967542503864, + "grad_norm": 1.4186955690383911, + "learning_rate": 4.859660592276643e-05, + "loss": 0.5203, + "num_input_tokens_seen": 1712224, + "step": 5085 + }, + { + "epoch": 3.9335394126738796, + "grad_norm": 0.584677517414093, + "learning_rate": 4.859103118964347e-05, + "loss": 0.5329, + "num_input_tokens_seen": 1713984, + "step": 5090 + }, + { + "epoch": 3.937403400309119, + "grad_norm": 0.5140814185142517, + "learning_rate": 4.858544572717824e-05, + "loss": 0.4901, + "num_input_tokens_seen": 1715712, + "step": 5095 + }, + { + "epoch": 3.9412673879443587, + "grad_norm": 1.4119728803634644, + "learning_rate": 4.857984953791105e-05, + "loss": 0.5722, + "num_input_tokens_seen": 1717472, + "step": 5100 + }, + { + "epoch": 3.945131375579598, + "grad_norm": 0.6006684303283691, + "learning_rate": 4.8574242624387066e-05, + "loss": 0.7609, + "num_input_tokens_seen": 1719328, + "step": 5105 + }, + { + "epoch": 3.948995363214838, + "grad_norm": 1.698248267173767, + "learning_rate": 4.856862498915637e-05, + "loss": 0.4814, + "num_input_tokens_seen": 1720768, + "step": 5110 + }, + { + "epoch": 3.952859350850077, + "grad_norm": 0.6417241096496582, + "learning_rate": 4.8562996634773875e-05, + "loss": 0.417, + "num_input_tokens_seen": 1722080, + "step": 5115 + }, + { + "epoch": 3.956723338485317, + "grad_norm": 1.0710422992706299, + "learning_rate": 4.85573575637994e-05, + "loss": 0.4701, + "num_input_tokens_seen": 1723680, + "step": 5120 + }, + { + "epoch": 3.9605873261205566, + "grad_norm": 0.7768991589546204, + "learning_rate": 4.855170777879762e-05, + "loss": 0.4538, + "num_input_tokens_seen": 1725216, + "step": 5125 + }, + { + "epoch": 3.964451313755796, + "grad_norm": 0.6799469590187073, + "learning_rate": 4.8546047282338105e-05, + "loss": 0.5332, + "num_input_tokens_seen": 1727040, + "step": 5130 + }, + { + "epoch": 3.9683153013910353, + "grad_norm": 0.4697803556919098, + "learning_rate": 4.854037607699526e-05, + "loss": 0.3606, + "num_input_tokens_seen": 1728736, + "step": 5135 + }, + { + "epoch": 3.972179289026275, + "grad_norm": 0.6561422348022461, + "learning_rate": 4.853469416534841e-05, + "loss": 0.6772, + "num_input_tokens_seen": 1730432, + "step": 5140 + }, + { + "epoch": 3.976043276661515, + "grad_norm": 1.0014511346817017, + "learning_rate": 4.85290015499817e-05, + "loss": 0.5033, + "num_input_tokens_seen": 1732064, + "step": 5145 + }, + { + "epoch": 3.979907264296754, + "grad_norm": 1.2340372800827026, + "learning_rate": 4.852329823348419e-05, + "loss": 0.5889, + "num_input_tokens_seen": 1733632, + "step": 5150 + }, + { + "epoch": 3.983771251931994, + "grad_norm": 0.5345836281776428, + "learning_rate": 4.851758421844976e-05, + "loss": 0.4449, + "num_input_tokens_seen": 1735424, + "step": 5155 + }, + { + "epoch": 3.9876352395672336, + "grad_norm": 0.9518256187438965, + "learning_rate": 4.8511859507477185e-05, + "loss": 0.4602, + "num_input_tokens_seen": 1736800, + "step": 5160 + }, + { + "epoch": 3.991499227202473, + "grad_norm": 0.5176656246185303, + "learning_rate": 4.8506124103170096e-05, + "loss": 0.3963, + "num_input_tokens_seen": 1738496, + "step": 5165 + }, + { + "epoch": 3.9953632148377123, + "grad_norm": 1.2297601699829102, + "learning_rate": 4.850037800813699e-05, + "loss": 0.5349, + "num_input_tokens_seen": 1740256, + "step": 5170 + }, + { + "epoch": 3.999227202472952, + "grad_norm": 0.7904630899429321, + "learning_rate": 4.849462122499124e-05, + "loss": 0.4735, + "num_input_tokens_seen": 1741888, + "step": 5175 + }, + { + "epoch": 4.0, + "eval_loss": 0.5073549747467041, + "eval_runtime": 6.2739, + "eval_samples_per_second": 91.65, + "eval_steps_per_second": 22.952, + "num_input_tokens_seen": 1742048, + "step": 5176 + }, + { + "epoch": 4.003091190108192, + "grad_norm": 0.5853859186172485, + "learning_rate": 4.848885375635105e-05, + "loss": 0.4106, + "num_input_tokens_seen": 1743232, + "step": 5180 + }, + { + "epoch": 4.006955177743431, + "grad_norm": 0.4420335292816162, + "learning_rate": 4.84830756048395e-05, + "loss": 0.4541, + "num_input_tokens_seen": 1744896, + "step": 5185 + }, + { + "epoch": 4.0108191653786704, + "grad_norm": 0.5625958442687988, + "learning_rate": 4.847728677308453e-05, + "loss": 0.6266, + "num_input_tokens_seen": 1746432, + "step": 5190 + }, + { + "epoch": 4.014683153013911, + "grad_norm": 0.5528249144554138, + "learning_rate": 4.847148726371893e-05, + "loss": 0.4667, + "num_input_tokens_seen": 1748320, + "step": 5195 + }, + { + "epoch": 4.01854714064915, + "grad_norm": 0.6384862661361694, + "learning_rate": 4.846567707938036e-05, + "loss": 0.3992, + "num_input_tokens_seen": 1749824, + "step": 5200 + }, + { + "epoch": 4.022411128284389, + "grad_norm": 0.9269810318946838, + "learning_rate": 4.845985622271133e-05, + "loss": 0.5107, + "num_input_tokens_seen": 1751392, + "step": 5205 + }, + { + "epoch": 4.0262751159196295, + "grad_norm": 0.46799835562705994, + "learning_rate": 4.845402469635919e-05, + "loss": 0.7123, + "num_input_tokens_seen": 1753120, + "step": 5210 + }, + { + "epoch": 4.030139103554869, + "grad_norm": 1.068146824836731, + "learning_rate": 4.844818250297616e-05, + "loss": 0.683, + "num_input_tokens_seen": 1754816, + "step": 5215 + }, + { + "epoch": 4.034003091190108, + "grad_norm": 0.7642729878425598, + "learning_rate": 4.84423296452193e-05, + "loss": 0.5809, + "num_input_tokens_seen": 1756480, + "step": 5220 + }, + { + "epoch": 4.0378670788253475, + "grad_norm": 0.775524377822876, + "learning_rate": 4.843646612575052e-05, + "loss": 0.4498, + "num_input_tokens_seen": 1757920, + "step": 5225 + }, + { + "epoch": 4.041731066460588, + "grad_norm": 1.279876470565796, + "learning_rate": 4.8430591947236605e-05, + "loss": 0.8108, + "num_input_tokens_seen": 1759616, + "step": 5230 + }, + { + "epoch": 4.045595054095827, + "grad_norm": 0.6826686263084412, + "learning_rate": 4.842470711234914e-05, + "loss": 0.6556, + "num_input_tokens_seen": 1761376, + "step": 5235 + }, + { + "epoch": 4.049459041731066, + "grad_norm": 0.7730547189712524, + "learning_rate": 4.84188116237646e-05, + "loss": 0.5442, + "num_input_tokens_seen": 1763072, + "step": 5240 + }, + { + "epoch": 4.053323029366306, + "grad_norm": 1.4525196552276611, + "learning_rate": 4.841290548416428e-05, + "loss": 0.8317, + "num_input_tokens_seen": 1764928, + "step": 5245 + }, + { + "epoch": 4.057187017001546, + "grad_norm": 0.5560858845710754, + "learning_rate": 4.8406988696234336e-05, + "loss": 0.5156, + "num_input_tokens_seen": 1766592, + "step": 5250 + }, + { + "epoch": 4.061051004636785, + "grad_norm": 0.9475115537643433, + "learning_rate": 4.840106126266575e-05, + "loss": 0.5474, + "num_input_tokens_seen": 1768224, + "step": 5255 + }, + { + "epoch": 4.0649149922720245, + "grad_norm": 0.5945225954055786, + "learning_rate": 4.8395123186154365e-05, + "loss": 0.5023, + "num_input_tokens_seen": 1769760, + "step": 5260 + }, + { + "epoch": 4.068778979907265, + "grad_norm": 0.6572440266609192, + "learning_rate": 4.838917446940084e-05, + "loss": 0.5792, + "num_input_tokens_seen": 1771552, + "step": 5265 + }, + { + "epoch": 4.072642967542504, + "grad_norm": 0.9557105898857117, + "learning_rate": 4.83832151151107e-05, + "loss": 0.4063, + "num_input_tokens_seen": 1773248, + "step": 5270 + }, + { + "epoch": 4.076506955177743, + "grad_norm": 0.8478562831878662, + "learning_rate": 4.8377245125994285e-05, + "loss": 0.8795, + "num_input_tokens_seen": 1774720, + "step": 5275 + }, + { + "epoch": 4.080370942812983, + "grad_norm": 1.5660381317138672, + "learning_rate": 4.837126450476679e-05, + "loss": 0.5888, + "num_input_tokens_seen": 1776416, + "step": 5280 + }, + { + "epoch": 4.084234930448223, + "grad_norm": 1.0994998216629028, + "learning_rate": 4.8365273254148226e-05, + "loss": 0.4487, + "num_input_tokens_seen": 1778176, + "step": 5285 + }, + { + "epoch": 4.088098918083462, + "grad_norm": 0.5971387624740601, + "learning_rate": 4.8359271376863454e-05, + "loss": 0.5011, + "num_input_tokens_seen": 1779744, + "step": 5290 + }, + { + "epoch": 4.0919629057187015, + "grad_norm": 0.7462543249130249, + "learning_rate": 4.8353258875642185e-05, + "loss": 0.4872, + "num_input_tokens_seen": 1781536, + "step": 5295 + }, + { + "epoch": 4.095826893353942, + "grad_norm": 0.6857373714447021, + "learning_rate": 4.8347235753218904e-05, + "loss": 0.5137, + "num_input_tokens_seen": 1783328, + "step": 5300 + }, + { + "epoch": 4.099690880989181, + "grad_norm": 1.3147252798080444, + "learning_rate": 4.8341202012332985e-05, + "loss": 0.5352, + "num_input_tokens_seen": 1784928, + "step": 5305 + }, + { + "epoch": 4.10355486862442, + "grad_norm": 0.5121661424636841, + "learning_rate": 4.833515765572861e-05, + "loss": 0.4246, + "num_input_tokens_seen": 1786560, + "step": 5310 + }, + { + "epoch": 4.10741885625966, + "grad_norm": 1.296246886253357, + "learning_rate": 4.8329102686154784e-05, + "loss": 0.4116, + "num_input_tokens_seen": 1788192, + "step": 5315 + }, + { + "epoch": 4.1112828438949, + "grad_norm": 0.5399512648582458, + "learning_rate": 4.832303710636535e-05, + "loss": 0.759, + "num_input_tokens_seen": 1790080, + "step": 5320 + }, + { + "epoch": 4.115146831530139, + "grad_norm": 0.8607836961746216, + "learning_rate": 4.831696091911895e-05, + "loss": 0.4359, + "num_input_tokens_seen": 1791456, + "step": 5325 + }, + { + "epoch": 4.1190108191653785, + "grad_norm": 0.6372464299201965, + "learning_rate": 4.83108741271791e-05, + "loss": 0.5062, + "num_input_tokens_seen": 1793216, + "step": 5330 + }, + { + "epoch": 4.122874806800619, + "grad_norm": 0.7408809065818787, + "learning_rate": 4.8304776733314085e-05, + "loss": 0.4515, + "num_input_tokens_seen": 1794592, + "step": 5335 + }, + { + "epoch": 4.126738794435858, + "grad_norm": 0.5640397071838379, + "learning_rate": 4.8298668740297036e-05, + "loss": 0.3919, + "num_input_tokens_seen": 1796128, + "step": 5340 + }, + { + "epoch": 4.130602782071097, + "grad_norm": 1.1042429208755493, + "learning_rate": 4.829255015090592e-05, + "loss": 0.6716, + "num_input_tokens_seen": 1797728, + "step": 5345 + }, + { + "epoch": 4.134466769706337, + "grad_norm": 0.8067267537117004, + "learning_rate": 4.828642096792351e-05, + "loss": 0.5892, + "num_input_tokens_seen": 1799392, + "step": 5350 + }, + { + "epoch": 4.138330757341577, + "grad_norm": 0.7686164379119873, + "learning_rate": 4.828028119413738e-05, + "loss": 0.4751, + "num_input_tokens_seen": 1801024, + "step": 5355 + }, + { + "epoch": 4.142194744976816, + "grad_norm": 0.7502079010009766, + "learning_rate": 4.827413083233995e-05, + "loss": 0.4175, + "num_input_tokens_seen": 1803072, + "step": 5360 + }, + { + "epoch": 4.1460587326120555, + "grad_norm": 0.6895010471343994, + "learning_rate": 4.8267969885328426e-05, + "loss": 0.6206, + "num_input_tokens_seen": 1804608, + "step": 5365 + }, + { + "epoch": 4.149922720247295, + "grad_norm": 1.6396863460540771, + "learning_rate": 4.8261798355904854e-05, + "loss": 0.3796, + "num_input_tokens_seen": 1806432, + "step": 5370 + }, + { + "epoch": 4.153786707882535, + "grad_norm": 0.6023380160331726, + "learning_rate": 4.825561624687608e-05, + "loss": 0.6849, + "num_input_tokens_seen": 1808224, + "step": 5375 + }, + { + "epoch": 4.157650695517774, + "grad_norm": 1.042598009109497, + "learning_rate": 4.824942356105376e-05, + "loss": 0.6342, + "num_input_tokens_seen": 1809792, + "step": 5380 + }, + { + "epoch": 4.161514683153014, + "grad_norm": 0.8586552739143372, + "learning_rate": 4.8243220301254377e-05, + "loss": 0.5005, + "num_input_tokens_seen": 1811648, + "step": 5385 + }, + { + "epoch": 4.165378670788254, + "grad_norm": 0.9107403755187988, + "learning_rate": 4.8237006470299197e-05, + "loss": 0.4026, + "num_input_tokens_seen": 1813312, + "step": 5390 + }, + { + "epoch": 4.169242658423493, + "grad_norm": 0.7511790990829468, + "learning_rate": 4.823078207101431e-05, + "loss": 0.4206, + "num_input_tokens_seen": 1814944, + "step": 5395 + }, + { + "epoch": 4.1731066460587325, + "grad_norm": 0.7631901502609253, + "learning_rate": 4.8224547106230624e-05, + "loss": 0.419, + "num_input_tokens_seen": 1816672, + "step": 5400 + }, + { + "epoch": 4.176970633693972, + "grad_norm": 0.709194004535675, + "learning_rate": 4.821830157878382e-05, + "loss": 0.4788, + "num_input_tokens_seen": 1818464, + "step": 5405 + }, + { + "epoch": 4.180834621329212, + "grad_norm": 0.9640382528305054, + "learning_rate": 4.821204549151441e-05, + "loss": 0.5578, + "num_input_tokens_seen": 1820000, + "step": 5410 + }, + { + "epoch": 4.184698608964451, + "grad_norm": 0.7411308288574219, + "learning_rate": 4.82057788472677e-05, + "loss": 0.4096, + "num_input_tokens_seen": 1821856, + "step": 5415 + }, + { + "epoch": 4.188562596599691, + "grad_norm": 1.0046608448028564, + "learning_rate": 4.81995016488938e-05, + "loss": 0.5117, + "num_input_tokens_seen": 1823808, + "step": 5420 + }, + { + "epoch": 4.192426584234931, + "grad_norm": 0.4360809922218323, + "learning_rate": 4.8193213899247616e-05, + "loss": 0.4234, + "num_input_tokens_seen": 1825312, + "step": 5425 + }, + { + "epoch": 4.19629057187017, + "grad_norm": 0.6449095010757446, + "learning_rate": 4.818691560118884e-05, + "loss": 0.633, + "num_input_tokens_seen": 1826976, + "step": 5430 + }, + { + "epoch": 4.2001545595054095, + "grad_norm": 0.584163248538971, + "learning_rate": 4.8180606757582e-05, + "loss": 0.4383, + "num_input_tokens_seen": 1828704, + "step": 5435 + }, + { + "epoch": 4.204018547140649, + "grad_norm": 0.6105682253837585, + "learning_rate": 4.817428737129638e-05, + "loss": 0.4655, + "num_input_tokens_seen": 1830432, + "step": 5440 + }, + { + "epoch": 4.207882534775889, + "grad_norm": 1.5213561058044434, + "learning_rate": 4.8167957445206066e-05, + "loss": 0.6276, + "num_input_tokens_seen": 1831968, + "step": 5445 + }, + { + "epoch": 4.211746522411128, + "grad_norm": 0.7355697751045227, + "learning_rate": 4.816161698218997e-05, + "loss": 0.4984, + "num_input_tokens_seen": 1833696, + "step": 5450 + }, + { + "epoch": 4.215610510046368, + "grad_norm": 0.8915214538574219, + "learning_rate": 4.8155265985131756e-05, + "loss": 0.6208, + "num_input_tokens_seen": 1835264, + "step": 5455 + }, + { + "epoch": 4.219474497681608, + "grad_norm": 0.9177555441856384, + "learning_rate": 4.814890445691991e-05, + "loss": 0.7879, + "num_input_tokens_seen": 1836672, + "step": 5460 + }, + { + "epoch": 4.223338485316847, + "grad_norm": 0.7319304943084717, + "learning_rate": 4.8142532400447676e-05, + "loss": 0.4978, + "num_input_tokens_seen": 1838336, + "step": 5465 + }, + { + "epoch": 4.2272024729520865, + "grad_norm": 0.8910127878189087, + "learning_rate": 4.813614981861311e-05, + "loss": 0.5218, + "num_input_tokens_seen": 1840128, + "step": 5470 + }, + { + "epoch": 4.231066460587326, + "grad_norm": 0.9812374711036682, + "learning_rate": 4.8129756714319053e-05, + "loss": 0.5767, + "num_input_tokens_seen": 1841920, + "step": 5475 + }, + { + "epoch": 4.234930448222566, + "grad_norm": 0.7736693620681763, + "learning_rate": 4.812335309047312e-05, + "loss": 0.6088, + "num_input_tokens_seen": 1843616, + "step": 5480 + }, + { + "epoch": 4.238794435857805, + "grad_norm": 0.8185803890228271, + "learning_rate": 4.811693894998773e-05, + "loss": 0.385, + "num_input_tokens_seen": 1845344, + "step": 5485 + }, + { + "epoch": 4.242658423493045, + "grad_norm": 1.1114150285720825, + "learning_rate": 4.8110514295780054e-05, + "loss": 0.5074, + "num_input_tokens_seen": 1846880, + "step": 5490 + }, + { + "epoch": 4.246522411128284, + "grad_norm": 0.6866536140441895, + "learning_rate": 4.810407913077208e-05, + "loss": 0.6003, + "num_input_tokens_seen": 1848576, + "step": 5495 + }, + { + "epoch": 4.250386398763524, + "grad_norm": 1.7095365524291992, + "learning_rate": 4.809763345789054e-05, + "loss": 0.4893, + "num_input_tokens_seen": 1849984, + "step": 5500 + }, + { + "epoch": 4.2542503863987635, + "grad_norm": 0.5302792191505432, + "learning_rate": 4.809117728006699e-05, + "loss": 0.424, + "num_input_tokens_seen": 1851712, + "step": 5505 + }, + { + "epoch": 4.258114374034003, + "grad_norm": 1.6755445003509521, + "learning_rate": 4.8084710600237726e-05, + "loss": 0.6562, + "num_input_tokens_seen": 1853440, + "step": 5510 + }, + { + "epoch": 4.261978361669243, + "grad_norm": 0.6352584362030029, + "learning_rate": 4.807823342134382e-05, + "loss": 0.5878, + "num_input_tokens_seen": 1855136, + "step": 5515 + }, + { + "epoch": 4.265842349304482, + "grad_norm": 0.6280697584152222, + "learning_rate": 4.807174574633115e-05, + "loss": 0.7679, + "num_input_tokens_seen": 1856704, + "step": 5520 + }, + { + "epoch": 4.269706336939722, + "grad_norm": 0.6635041236877441, + "learning_rate": 4.806524757815035e-05, + "loss": 0.5538, + "num_input_tokens_seen": 1858304, + "step": 5525 + }, + { + "epoch": 4.273570324574961, + "grad_norm": 0.9485375881195068, + "learning_rate": 4.8058738919756816e-05, + "loss": 0.5176, + "num_input_tokens_seen": 1860128, + "step": 5530 + }, + { + "epoch": 4.277434312210201, + "grad_norm": 0.7338131666183472, + "learning_rate": 4.805221977411072e-05, + "loss": 0.4805, + "num_input_tokens_seen": 1861920, + "step": 5535 + }, + { + "epoch": 4.2812982998454405, + "grad_norm": 1.1070585250854492, + "learning_rate": 4.804569014417703e-05, + "loss": 0.7656, + "num_input_tokens_seen": 1863360, + "step": 5540 + }, + { + "epoch": 4.28516228748068, + "grad_norm": 0.5890947580337524, + "learning_rate": 4.8039150032925433e-05, + "loss": 0.4697, + "num_input_tokens_seen": 1865056, + "step": 5545 + }, + { + "epoch": 4.289026275115919, + "grad_norm": 0.6082859039306641, + "learning_rate": 4.803259944333043e-05, + "loss": 0.7285, + "num_input_tokens_seen": 1866912, + "step": 5550 + }, + { + "epoch": 4.292890262751159, + "grad_norm": 1.0738953351974487, + "learning_rate": 4.8026038378371265e-05, + "loss": 0.5939, + "num_input_tokens_seen": 1868768, + "step": 5555 + }, + { + "epoch": 4.296754250386399, + "grad_norm": 0.6753178238868713, + "learning_rate": 4.8019466841031946e-05, + "loss": 0.5297, + "num_input_tokens_seen": 1870496, + "step": 5560 + }, + { + "epoch": 4.300618238021638, + "grad_norm": 1.3797118663787842, + "learning_rate": 4.8012884834301255e-05, + "loss": 0.4473, + "num_input_tokens_seen": 1872000, + "step": 5565 + }, + { + "epoch": 4.304482225656878, + "grad_norm": 1.437566876411438, + "learning_rate": 4.800629236117272e-05, + "loss": 0.5477, + "num_input_tokens_seen": 1873504, + "step": 5570 + }, + { + "epoch": 4.3083462132921175, + "grad_norm": 1.2575228214263916, + "learning_rate": 4.799968942464463e-05, + "loss": 0.5278, + "num_input_tokens_seen": 1875136, + "step": 5575 + }, + { + "epoch": 4.312210200927357, + "grad_norm": 0.7300248742103577, + "learning_rate": 4.799307602772006e-05, + "loss": 0.4188, + "num_input_tokens_seen": 1876608, + "step": 5580 + }, + { + "epoch": 4.316074188562597, + "grad_norm": 0.7400534749031067, + "learning_rate": 4.7986452173406815e-05, + "loss": 0.5795, + "num_input_tokens_seen": 1878240, + "step": 5585 + }, + { + "epoch": 4.319938176197836, + "grad_norm": 0.7248276472091675, + "learning_rate": 4.797981786471746e-05, + "loss": 0.4447, + "num_input_tokens_seen": 1879904, + "step": 5590 + }, + { + "epoch": 4.323802163833076, + "grad_norm": 0.9609296321868896, + "learning_rate": 4.7973173104669314e-05, + "loss": 0.495, + "num_input_tokens_seen": 1881504, + "step": 5595 + }, + { + "epoch": 4.327666151468315, + "grad_norm": 0.7334603071212769, + "learning_rate": 4.796651789628446e-05, + "loss": 0.5441, + "num_input_tokens_seen": 1883168, + "step": 5600 + }, + { + "epoch": 4.331530139103555, + "grad_norm": 1.1103813648223877, + "learning_rate": 4.795985224258973e-05, + "loss": 0.5046, + "num_input_tokens_seen": 1884704, + "step": 5605 + }, + { + "epoch": 4.3353941267387945, + "grad_norm": 0.6759241223335266, + "learning_rate": 4.7953176146616695e-05, + "loss": 0.6068, + "num_input_tokens_seen": 1886304, + "step": 5610 + }, + { + "epoch": 4.339258114374034, + "grad_norm": 0.7870677709579468, + "learning_rate": 4.794648961140169e-05, + "loss": 0.6217, + "num_input_tokens_seen": 1887936, + "step": 5615 + }, + { + "epoch": 4.343122102009273, + "grad_norm": 1.0998455286026, + "learning_rate": 4.793979263998578e-05, + "loss": 0.5478, + "num_input_tokens_seen": 1889760, + "step": 5620 + }, + { + "epoch": 4.346986089644513, + "grad_norm": 0.7486963868141174, + "learning_rate": 4.793308523541481e-05, + "loss": 0.6202, + "num_input_tokens_seen": 1891296, + "step": 5625 + }, + { + "epoch": 4.350850077279753, + "grad_norm": 0.7745601534843445, + "learning_rate": 4.792636740073932e-05, + "loss": 0.4741, + "num_input_tokens_seen": 1892928, + "step": 5630 + }, + { + "epoch": 4.354714064914992, + "grad_norm": 1.4769309759140015, + "learning_rate": 4.791963913901465e-05, + "loss": 0.6511, + "num_input_tokens_seen": 1894368, + "step": 5635 + }, + { + "epoch": 4.358578052550232, + "grad_norm": 0.9295845031738281, + "learning_rate": 4.791290045330083e-05, + "loss": 0.4236, + "num_input_tokens_seen": 1895968, + "step": 5640 + }, + { + "epoch": 4.3624420401854715, + "grad_norm": 0.8909996151924133, + "learning_rate": 4.7906151346662665e-05, + "loss": 0.3754, + "num_input_tokens_seen": 1897472, + "step": 5645 + }, + { + "epoch": 4.366306027820711, + "grad_norm": 1.0325114727020264, + "learning_rate": 4.7899391822169684e-05, + "loss": 0.5863, + "num_input_tokens_seen": 1899264, + "step": 5650 + }, + { + "epoch": 4.37017001545595, + "grad_norm": 0.8115125894546509, + "learning_rate": 4.7892621882896173e-05, + "loss": 0.4735, + "num_input_tokens_seen": 1901056, + "step": 5655 + }, + { + "epoch": 4.37403400309119, + "grad_norm": 0.7526588439941406, + "learning_rate": 4.7885841531921126e-05, + "loss": 0.47, + "num_input_tokens_seen": 1903040, + "step": 5660 + }, + { + "epoch": 4.37789799072643, + "grad_norm": 1.2085556983947754, + "learning_rate": 4.787905077232829e-05, + "loss": 0.4685, + "num_input_tokens_seen": 1904448, + "step": 5665 + }, + { + "epoch": 4.381761978361669, + "grad_norm": 0.9949292540550232, + "learning_rate": 4.7872249607206146e-05, + "loss": 0.5195, + "num_input_tokens_seen": 1906208, + "step": 5670 + }, + { + "epoch": 4.385625965996908, + "grad_norm": 0.7965425252914429, + "learning_rate": 4.7865438039647906e-05, + "loss": 0.7185, + "num_input_tokens_seen": 1907904, + "step": 5675 + }, + { + "epoch": 4.3894899536321486, + "grad_norm": 0.5617475509643555, + "learning_rate": 4.785861607275152e-05, + "loss": 0.4224, + "num_input_tokens_seen": 1909568, + "step": 5680 + }, + { + "epoch": 4.393353941267388, + "grad_norm": 0.7053916454315186, + "learning_rate": 4.7851783709619634e-05, + "loss": 0.8165, + "num_input_tokens_seen": 1911296, + "step": 5685 + }, + { + "epoch": 4.397217928902627, + "grad_norm": 0.8158603310585022, + "learning_rate": 4.784494095335966e-05, + "loss": 0.4257, + "num_input_tokens_seen": 1912832, + "step": 5690 + }, + { + "epoch": 4.401081916537867, + "grad_norm": 1.1881541013717651, + "learning_rate": 4.783808780708374e-05, + "loss": 0.5752, + "num_input_tokens_seen": 1914592, + "step": 5695 + }, + { + "epoch": 4.404945904173107, + "grad_norm": 1.3444620370864868, + "learning_rate": 4.783122427390871e-05, + "loss": 0.4048, + "num_input_tokens_seen": 1915968, + "step": 5700 + }, + { + "epoch": 4.408809891808346, + "grad_norm": 1.2901983261108398, + "learning_rate": 4.782435035695615e-05, + "loss": 0.525, + "num_input_tokens_seen": 1917568, + "step": 5705 + }, + { + "epoch": 4.412673879443586, + "grad_norm": 0.7288272976875305, + "learning_rate": 4.781746605935236e-05, + "loss": 0.5811, + "num_input_tokens_seen": 1919616, + "step": 5710 + }, + { + "epoch": 4.416537867078826, + "grad_norm": 0.6769952774047852, + "learning_rate": 4.781057138422835e-05, + "loss": 0.5084, + "num_input_tokens_seen": 1921632, + "step": 5715 + }, + { + "epoch": 4.420401854714065, + "grad_norm": 0.6132363677024841, + "learning_rate": 4.780366633471987e-05, + "loss": 0.9552, + "num_input_tokens_seen": 1923552, + "step": 5720 + }, + { + "epoch": 4.424265842349304, + "grad_norm": 0.7942855358123779, + "learning_rate": 4.7796750913967374e-05, + "loss": 0.4292, + "num_input_tokens_seen": 1925280, + "step": 5725 + }, + { + "epoch": 4.428129829984544, + "grad_norm": 0.5419499278068542, + "learning_rate": 4.778982512511604e-05, + "loss": 0.417, + "num_input_tokens_seen": 1927200, + "step": 5730 + }, + { + "epoch": 4.431993817619784, + "grad_norm": 0.5551798939704895, + "learning_rate": 4.778288897131576e-05, + "loss": 0.4203, + "num_input_tokens_seen": 1928768, + "step": 5735 + }, + { + "epoch": 4.435857805255023, + "grad_norm": 0.4460228979587555, + "learning_rate": 4.777594245572113e-05, + "loss": 0.5191, + "num_input_tokens_seen": 1930496, + "step": 5740 + }, + { + "epoch": 4.439721792890262, + "grad_norm": 0.8118734359741211, + "learning_rate": 4.7768985581491474e-05, + "loss": 0.5989, + "num_input_tokens_seen": 1932320, + "step": 5745 + }, + { + "epoch": 4.443585780525503, + "grad_norm": 0.5759095549583435, + "learning_rate": 4.776201835179082e-05, + "loss": 0.4143, + "num_input_tokens_seen": 1933856, + "step": 5750 + }, + { + "epoch": 4.447449768160742, + "grad_norm": 0.8807557821273804, + "learning_rate": 4.7755040769787895e-05, + "loss": 0.4349, + "num_input_tokens_seen": 1935360, + "step": 5755 + }, + { + "epoch": 4.451313755795981, + "grad_norm": 1.6015528440475464, + "learning_rate": 4.774805283865616e-05, + "loss": 0.6177, + "num_input_tokens_seen": 1937056, + "step": 5760 + }, + { + "epoch": 4.455177743431221, + "grad_norm": 0.526136577129364, + "learning_rate": 4.774105456157375e-05, + "loss": 0.4764, + "num_input_tokens_seen": 1938656, + "step": 5765 + }, + { + "epoch": 4.459041731066461, + "grad_norm": 0.774795413017273, + "learning_rate": 4.773404594172355e-05, + "loss": 0.5172, + "num_input_tokens_seen": 1940320, + "step": 5770 + }, + { + "epoch": 4.4629057187017, + "grad_norm": 1.014614224433899, + "learning_rate": 4.77270269822931e-05, + "loss": 0.437, + "num_input_tokens_seen": 1942368, + "step": 5775 + }, + { + "epoch": 4.466769706336939, + "grad_norm": 1.0157074928283691, + "learning_rate": 4.771999768647467e-05, + "loss": 0.7174, + "num_input_tokens_seen": 1944000, + "step": 5780 + }, + { + "epoch": 4.47063369397218, + "grad_norm": 0.7078235149383545, + "learning_rate": 4.771295805746523e-05, + "loss": 0.4957, + "num_input_tokens_seen": 1945824, + "step": 5785 + }, + { + "epoch": 4.474497681607419, + "grad_norm": 1.1728479862213135, + "learning_rate": 4.770590809846644e-05, + "loss": 0.4761, + "num_input_tokens_seen": 1947424, + "step": 5790 + }, + { + "epoch": 4.478361669242658, + "grad_norm": 0.7324485778808594, + "learning_rate": 4.7698847812684663e-05, + "loss": 0.4212, + "num_input_tokens_seen": 1948992, + "step": 5795 + }, + { + "epoch": 4.4822256568778975, + "grad_norm": 0.7609859108924866, + "learning_rate": 4.769177720333097e-05, + "loss": 0.4603, + "num_input_tokens_seen": 1950784, + "step": 5800 + }, + { + "epoch": 4.486089644513138, + "grad_norm": 0.5169820785522461, + "learning_rate": 4.768469627362111e-05, + "loss": 0.466, + "num_input_tokens_seen": 1952352, + "step": 5805 + }, + { + "epoch": 4.489953632148377, + "grad_norm": 1.12014639377594, + "learning_rate": 4.767760502677553e-05, + "loss": 0.4225, + "num_input_tokens_seen": 1953792, + "step": 5810 + }, + { + "epoch": 4.493817619783616, + "grad_norm": 0.6306894421577454, + "learning_rate": 4.767050346601937e-05, + "loss": 0.6148, + "num_input_tokens_seen": 1955360, + "step": 5815 + }, + { + "epoch": 4.497681607418857, + "grad_norm": 0.7548155188560486, + "learning_rate": 4.7663391594582465e-05, + "loss": 0.5994, + "num_input_tokens_seen": 1957120, + "step": 5820 + }, + { + "epoch": 4.501545595054096, + "grad_norm": 1.1185754537582397, + "learning_rate": 4.7656269415699344e-05, + "loss": 0.5367, + "num_input_tokens_seen": 1958912, + "step": 5825 + }, + { + "epoch": 4.505409582689335, + "grad_norm": 0.7881764769554138, + "learning_rate": 4.7649136932609204e-05, + "loss": 0.4706, + "num_input_tokens_seen": 1960736, + "step": 5830 + }, + { + "epoch": 4.509273570324575, + "grad_norm": 0.6836611032485962, + "learning_rate": 4.7641994148555944e-05, + "loss": 0.4199, + "num_input_tokens_seen": 1962368, + "step": 5835 + }, + { + "epoch": 4.513137557959815, + "grad_norm": 1.0671296119689941, + "learning_rate": 4.7634841066788154e-05, + "loss": 0.4396, + "num_input_tokens_seen": 1964000, + "step": 5840 + }, + { + "epoch": 4.517001545595054, + "grad_norm": 0.924066424369812, + "learning_rate": 4.762767769055909e-05, + "loss": 0.4306, + "num_input_tokens_seen": 1965728, + "step": 5845 + }, + { + "epoch": 4.520865533230293, + "grad_norm": 0.6996140480041504, + "learning_rate": 4.7620504023126697e-05, + "loss": 0.7366, + "num_input_tokens_seen": 1967520, + "step": 5850 + }, + { + "epoch": 4.524729520865534, + "grad_norm": 0.7458701133728027, + "learning_rate": 4.761332006775361e-05, + "loss": 0.3934, + "num_input_tokens_seen": 1969184, + "step": 5855 + }, + { + "epoch": 4.528593508500773, + "grad_norm": 1.0512794256210327, + "learning_rate": 4.7606125827707125e-05, + "loss": 0.4561, + "num_input_tokens_seen": 1970880, + "step": 5860 + }, + { + "epoch": 4.532457496136012, + "grad_norm": 1.4399982690811157, + "learning_rate": 4.7598921306259236e-05, + "loss": 0.6722, + "num_input_tokens_seen": 1972320, + "step": 5865 + }, + { + "epoch": 4.5363214837712516, + "grad_norm": 0.6652366518974304, + "learning_rate": 4.7591706506686595e-05, + "loss": 0.4312, + "num_input_tokens_seen": 1973888, + "step": 5870 + }, + { + "epoch": 4.540185471406492, + "grad_norm": 0.8163452744483948, + "learning_rate": 4.7584481432270545e-05, + "loss": 0.5556, + "num_input_tokens_seen": 1975680, + "step": 5875 + }, + { + "epoch": 4.544049459041731, + "grad_norm": 0.7258128523826599, + "learning_rate": 4.757724608629708e-05, + "loss": 0.5197, + "num_input_tokens_seen": 1977728, + "step": 5880 + }, + { + "epoch": 4.54791344667697, + "grad_norm": 0.9643172025680542, + "learning_rate": 4.757000047205688e-05, + "loss": 0.5024, + "num_input_tokens_seen": 1979552, + "step": 5885 + }, + { + "epoch": 4.551777434312211, + "grad_norm": 1.0939429998397827, + "learning_rate": 4.756274459284531e-05, + "loss": 0.9044, + "num_input_tokens_seen": 1981312, + "step": 5890 + }, + { + "epoch": 4.55564142194745, + "grad_norm": 0.780862033367157, + "learning_rate": 4.755547845196236e-05, + "loss": 0.4951, + "num_input_tokens_seen": 1982848, + "step": 5895 + }, + { + "epoch": 4.559505409582689, + "grad_norm": 0.6893169283866882, + "learning_rate": 4.754820205271275e-05, + "loss": 0.4244, + "num_input_tokens_seen": 1984480, + "step": 5900 + }, + { + "epoch": 4.563369397217929, + "grad_norm": 1.1471503973007202, + "learning_rate": 4.75409153984058e-05, + "loss": 0.403, + "num_input_tokens_seen": 1986112, + "step": 5905 + }, + { + "epoch": 4.567233384853169, + "grad_norm": 0.9241186380386353, + "learning_rate": 4.753361849235554e-05, + "loss": 0.3476, + "num_input_tokens_seen": 1987616, + "step": 5910 + }, + { + "epoch": 4.571097372488408, + "grad_norm": 0.7196730375289917, + "learning_rate": 4.752631133788064e-05, + "loss": 0.5639, + "num_input_tokens_seen": 1989088, + "step": 5915 + }, + { + "epoch": 4.574961360123647, + "grad_norm": 0.5545175075531006, + "learning_rate": 4.751899393830443e-05, + "loss": 0.4566, + "num_input_tokens_seen": 1990784, + "step": 5920 + }, + { + "epoch": 4.578825347758887, + "grad_norm": 1.0132020711898804, + "learning_rate": 4.751166629695492e-05, + "loss": 0.6005, + "num_input_tokens_seen": 1992192, + "step": 5925 + }, + { + "epoch": 4.582689335394127, + "grad_norm": 0.9583152532577515, + "learning_rate": 4.7504328417164765e-05, + "loss": 0.4466, + "num_input_tokens_seen": 1994048, + "step": 5930 + }, + { + "epoch": 4.586553323029366, + "grad_norm": 0.75163334608078, + "learning_rate": 4.749698030227127e-05, + "loss": 0.8701, + "num_input_tokens_seen": 1995744, + "step": 5935 + }, + { + "epoch": 4.590417310664606, + "grad_norm": 1.2434847354888916, + "learning_rate": 4.74896219556164e-05, + "loss": 0.6964, + "num_input_tokens_seen": 1997568, + "step": 5940 + }, + { + "epoch": 4.594281298299846, + "grad_norm": 0.812267005443573, + "learning_rate": 4.748225338054679e-05, + "loss": 0.4638, + "num_input_tokens_seen": 1999424, + "step": 5945 + }, + { + "epoch": 4.598145285935085, + "grad_norm": 0.585430920124054, + "learning_rate": 4.74748745804137e-05, + "loss": 0.5168, + "num_input_tokens_seen": 2001344, + "step": 5950 + }, + { + "epoch": 4.602009273570324, + "grad_norm": 0.481047123670578, + "learning_rate": 4.746748555857304e-05, + "loss": 0.3747, + "num_input_tokens_seen": 2002944, + "step": 5955 + }, + { + "epoch": 4.605873261205565, + "grad_norm": 0.658066987991333, + "learning_rate": 4.746008631838541e-05, + "loss": 0.5088, + "num_input_tokens_seen": 2004672, + "step": 5960 + }, + { + "epoch": 4.609737248840804, + "grad_norm": 1.4388642311096191, + "learning_rate": 4.7452676863216015e-05, + "loss": 0.8646, + "num_input_tokens_seen": 2006336, + "step": 5965 + }, + { + "epoch": 4.613601236476043, + "grad_norm": 0.7448330521583557, + "learning_rate": 4.744525719643471e-05, + "loss": 0.3864, + "num_input_tokens_seen": 2007904, + "step": 5970 + }, + { + "epoch": 4.617465224111283, + "grad_norm": 0.589579164981842, + "learning_rate": 4.743782732141602e-05, + "loss": 0.572, + "num_input_tokens_seen": 2009440, + "step": 5975 + }, + { + "epoch": 4.621329211746523, + "grad_norm": 0.9281731247901917, + "learning_rate": 4.7430387241539085e-05, + "loss": 0.4726, + "num_input_tokens_seen": 2011168, + "step": 5980 + }, + { + "epoch": 4.625193199381762, + "grad_norm": 0.7120820879936218, + "learning_rate": 4.74229369601877e-05, + "loss": 0.6939, + "num_input_tokens_seen": 2012896, + "step": 5985 + }, + { + "epoch": 4.629057187017001, + "grad_norm": 0.7161136865615845, + "learning_rate": 4.74154764807503e-05, + "loss": 0.4291, + "num_input_tokens_seen": 2014752, + "step": 5990 + }, + { + "epoch": 4.632921174652241, + "grad_norm": 0.6599791646003723, + "learning_rate": 4.740800580661996e-05, + "loss": 0.4602, + "num_input_tokens_seen": 2016672, + "step": 5995 + }, + { + "epoch": 4.636785162287481, + "grad_norm": 0.7856085896492004, + "learning_rate": 4.740052494119439e-05, + "loss": 0.4769, + "num_input_tokens_seen": 2018368, + "step": 6000 + }, + { + "epoch": 4.64064914992272, + "grad_norm": 0.959252655506134, + "learning_rate": 4.7393033887875916e-05, + "loss": 0.6088, + "num_input_tokens_seen": 2020032, + "step": 6005 + }, + { + "epoch": 4.64451313755796, + "grad_norm": 0.6184768080711365, + "learning_rate": 4.738553265007152e-05, + "loss": 0.3747, + "num_input_tokens_seen": 2021600, + "step": 6010 + }, + { + "epoch": 4.6483771251932, + "grad_norm": 1.4959540367126465, + "learning_rate": 4.7378021231192815e-05, + "loss": 0.6711, + "num_input_tokens_seen": 2023488, + "step": 6015 + }, + { + "epoch": 4.652241112828439, + "grad_norm": 1.0032861232757568, + "learning_rate": 4.737049963465604e-05, + "loss": 0.6, + "num_input_tokens_seen": 2024992, + "step": 6020 + }, + { + "epoch": 4.656105100463678, + "grad_norm": 1.7425754070281982, + "learning_rate": 4.7362967863882056e-05, + "loss": 0.4947, + "num_input_tokens_seen": 2026592, + "step": 6025 + }, + { + "epoch": 4.659969088098918, + "grad_norm": 0.5893528461456299, + "learning_rate": 4.7355425922296364e-05, + "loss": 0.4275, + "num_input_tokens_seen": 2028320, + "step": 6030 + }, + { + "epoch": 4.663833075734158, + "grad_norm": 0.5791584253311157, + "learning_rate": 4.734787381332908e-05, + "loss": 0.5647, + "num_input_tokens_seen": 2030080, + "step": 6035 + }, + { + "epoch": 4.667697063369397, + "grad_norm": 1.1000739336013794, + "learning_rate": 4.734031154041495e-05, + "loss": 0.7042, + "num_input_tokens_seen": 2031584, + "step": 6040 + }, + { + "epoch": 4.671561051004637, + "grad_norm": 1.087368130683899, + "learning_rate": 4.733273910699334e-05, + "loss": 0.4788, + "num_input_tokens_seen": 2033184, + "step": 6045 + }, + { + "epoch": 4.675425038639876, + "grad_norm": 0.6357202529907227, + "learning_rate": 4.732515651650824e-05, + "loss": 0.4838, + "num_input_tokens_seen": 2035168, + "step": 6050 + }, + { + "epoch": 4.679289026275116, + "grad_norm": 1.1320610046386719, + "learning_rate": 4.7317563772408255e-05, + "loss": 0.4528, + "num_input_tokens_seen": 2036864, + "step": 6055 + }, + { + "epoch": 4.683153013910355, + "grad_norm": 0.8737779855728149, + "learning_rate": 4.730996087814662e-05, + "loss": 0.5022, + "num_input_tokens_seen": 2038656, + "step": 6060 + }, + { + "epoch": 4.687017001545595, + "grad_norm": 0.6491118669509888, + "learning_rate": 4.730234783718116e-05, + "loss": 0.4915, + "num_input_tokens_seen": 2040352, + "step": 6065 + }, + { + "epoch": 4.690880989180835, + "grad_norm": 0.784200131893158, + "learning_rate": 4.729472465297434e-05, + "loss": 0.464, + "num_input_tokens_seen": 2041952, + "step": 6070 + }, + { + "epoch": 4.694744976816074, + "grad_norm": 0.7120278477668762, + "learning_rate": 4.7287091328993226e-05, + "loss": 0.4815, + "num_input_tokens_seen": 2043648, + "step": 6075 + }, + { + "epoch": 4.698608964451314, + "grad_norm": 0.7722207903862, + "learning_rate": 4.727944786870951e-05, + "loss": 0.4559, + "num_input_tokens_seen": 2045216, + "step": 6080 + }, + { + "epoch": 4.702472952086554, + "grad_norm": 0.7104645371437073, + "learning_rate": 4.7271794275599477e-05, + "loss": 0.6762, + "num_input_tokens_seen": 2046976, + "step": 6085 + }, + { + "epoch": 4.706336939721793, + "grad_norm": 0.6591759324073792, + "learning_rate": 4.726413055314403e-05, + "loss": 0.3634, + "num_input_tokens_seen": 2048672, + "step": 6090 + }, + { + "epoch": 4.710200927357032, + "grad_norm": 0.5596489310264587, + "learning_rate": 4.725645670482866e-05, + "loss": 0.3685, + "num_input_tokens_seen": 2050304, + "step": 6095 + }, + { + "epoch": 4.714064914992272, + "grad_norm": 0.9198327660560608, + "learning_rate": 4.72487727341435e-05, + "loss": 0.5287, + "num_input_tokens_seen": 2052032, + "step": 6100 + }, + { + "epoch": 4.717928902627512, + "grad_norm": 0.591156542301178, + "learning_rate": 4.724107864458326e-05, + "loss": 0.4883, + "num_input_tokens_seen": 2053952, + "step": 6105 + }, + { + "epoch": 4.721792890262751, + "grad_norm": 1.0726152658462524, + "learning_rate": 4.723337443964725e-05, + "loss": 0.4998, + "num_input_tokens_seen": 2055520, + "step": 6110 + }, + { + "epoch": 4.725656877897991, + "grad_norm": 0.8542501330375671, + "learning_rate": 4.7225660122839396e-05, + "loss": 0.5708, + "num_input_tokens_seen": 2057376, + "step": 6115 + }, + { + "epoch": 4.72952086553323, + "grad_norm": 0.8091809749603271, + "learning_rate": 4.721793569766822e-05, + "loss": 0.4418, + "num_input_tokens_seen": 2059104, + "step": 6120 + }, + { + "epoch": 4.73338485316847, + "grad_norm": 0.9023493528366089, + "learning_rate": 4.721020116764683e-05, + "loss": 0.5774, + "num_input_tokens_seen": 2060672, + "step": 6125 + }, + { + "epoch": 4.7372488408037094, + "grad_norm": 0.7874920964241028, + "learning_rate": 4.720245653629293e-05, + "loss": 0.4954, + "num_input_tokens_seen": 2062400, + "step": 6130 + }, + { + "epoch": 4.741112828438949, + "grad_norm": 1.0422172546386719, + "learning_rate": 4.719470180712884e-05, + "loss": 0.4465, + "num_input_tokens_seen": 2064256, + "step": 6135 + }, + { + "epoch": 4.744976816074189, + "grad_norm": 0.6198582053184509, + "learning_rate": 4.718693698368144e-05, + "loss": 0.4685, + "num_input_tokens_seen": 2065920, + "step": 6140 + }, + { + "epoch": 4.748840803709428, + "grad_norm": 0.6313183903694153, + "learning_rate": 4.717916206948223e-05, + "loss": 0.4892, + "num_input_tokens_seen": 2067712, + "step": 6145 + }, + { + "epoch": 4.752704791344668, + "grad_norm": 0.8163815140724182, + "learning_rate": 4.7171377068067294e-05, + "loss": 0.557, + "num_input_tokens_seen": 2069536, + "step": 6150 + }, + { + "epoch": 4.756568778979907, + "grad_norm": 0.6101660132408142, + "learning_rate": 4.716358198297728e-05, + "loss": 0.4789, + "num_input_tokens_seen": 2071104, + "step": 6155 + }, + { + "epoch": 4.760432766615147, + "grad_norm": 1.0347535610198975, + "learning_rate": 4.715577681775744e-05, + "loss": 0.5559, + "num_input_tokens_seen": 2072896, + "step": 6160 + }, + { + "epoch": 4.7642967542503865, + "grad_norm": 0.9800088405609131, + "learning_rate": 4.714796157595763e-05, + "loss": 0.3695, + "num_input_tokens_seen": 2074656, + "step": 6165 + }, + { + "epoch": 4.768160741885626, + "grad_norm": 0.8175681829452515, + "learning_rate": 4.714013626113226e-05, + "loss": 0.4261, + "num_input_tokens_seen": 2076320, + "step": 6170 + }, + { + "epoch": 4.772024729520865, + "grad_norm": 1.4571336507797241, + "learning_rate": 4.713230087684032e-05, + "loss": 0.4935, + "num_input_tokens_seen": 2078016, + "step": 6175 + }, + { + "epoch": 4.775888717156105, + "grad_norm": 1.3035660982131958, + "learning_rate": 4.7124455426645396e-05, + "loss": 0.7497, + "num_input_tokens_seen": 2079904, + "step": 6180 + }, + { + "epoch": 4.779752704791345, + "grad_norm": 0.692700207233429, + "learning_rate": 4.7116599914115645e-05, + "loss": 0.6001, + "num_input_tokens_seen": 2081664, + "step": 6185 + }, + { + "epoch": 4.783616692426584, + "grad_norm": 0.6380177736282349, + "learning_rate": 4.7108734342823803e-05, + "loss": 0.5524, + "num_input_tokens_seen": 2083360, + "step": 6190 + }, + { + "epoch": 4.787480680061824, + "grad_norm": 1.0023027658462524, + "learning_rate": 4.7100858716347175e-05, + "loss": 0.6439, + "num_input_tokens_seen": 2085184, + "step": 6195 + }, + { + "epoch": 4.7913446676970635, + "grad_norm": 1.0515997409820557, + "learning_rate": 4.709297303826765e-05, + "loss": 0.4996, + "num_input_tokens_seen": 2087040, + "step": 6200 + }, + { + "epoch": 4.795208655332303, + "grad_norm": 1.0972572565078735, + "learning_rate": 4.708507731217168e-05, + "loss": 0.4404, + "num_input_tokens_seen": 2088832, + "step": 6205 + }, + { + "epoch": 4.799072642967543, + "grad_norm": 0.6143618226051331, + "learning_rate": 4.707717154165028e-05, + "loss": 0.4844, + "num_input_tokens_seen": 2090560, + "step": 6210 + }, + { + "epoch": 4.802936630602782, + "grad_norm": 0.8239488005638123, + "learning_rate": 4.7069255730299044e-05, + "loss": 0.5792, + "num_input_tokens_seen": 2092256, + "step": 6215 + }, + { + "epoch": 4.806800618238022, + "grad_norm": 0.8728181719779968, + "learning_rate": 4.706132988171814e-05, + "loss": 0.5839, + "num_input_tokens_seen": 2094080, + "step": 6220 + }, + { + "epoch": 4.810664605873261, + "grad_norm": 0.6484626531600952, + "learning_rate": 4.705339399951229e-05, + "loss": 0.4567, + "num_input_tokens_seen": 2096032, + "step": 6225 + }, + { + "epoch": 4.814528593508501, + "grad_norm": 1.6561169624328613, + "learning_rate": 4.7045448087290763e-05, + "loss": 0.6993, + "num_input_tokens_seen": 2097600, + "step": 6230 + }, + { + "epoch": 4.8183925811437405, + "grad_norm": 1.175653100013733, + "learning_rate": 4.703749214866744e-05, + "loss": 0.4833, + "num_input_tokens_seen": 2099296, + "step": 6235 + }, + { + "epoch": 4.82225656877898, + "grad_norm": 1.6277350187301636, + "learning_rate": 4.7029526187260694e-05, + "loss": 0.5363, + "num_input_tokens_seen": 2100896, + "step": 6240 + }, + { + "epoch": 4.826120556414219, + "grad_norm": 0.7049555778503418, + "learning_rate": 4.702155020669352e-05, + "loss": 0.6339, + "num_input_tokens_seen": 2102848, + "step": 6245 + }, + { + "epoch": 4.829984544049459, + "grad_norm": 0.5586385130882263, + "learning_rate": 4.701356421059342e-05, + "loss": 0.4749, + "num_input_tokens_seen": 2104480, + "step": 6250 + }, + { + "epoch": 4.833848531684699, + "grad_norm": 1.0357210636138916, + "learning_rate": 4.700556820259249e-05, + "loss": 0.6698, + "num_input_tokens_seen": 2106240, + "step": 6255 + }, + { + "epoch": 4.837712519319938, + "grad_norm": 1.1988939046859741, + "learning_rate": 4.6997562186327355e-05, + "loss": 0.6157, + "num_input_tokens_seen": 2107648, + "step": 6260 + }, + { + "epoch": 4.841576506955178, + "grad_norm": 0.6032547354698181, + "learning_rate": 4.6989546165439196e-05, + "loss": 0.445, + "num_input_tokens_seen": 2109280, + "step": 6265 + }, + { + "epoch": 4.8454404945904175, + "grad_norm": 0.8753820657730103, + "learning_rate": 4.698152014357376e-05, + "loss": 0.5083, + "num_input_tokens_seen": 2110752, + "step": 6270 + }, + { + "epoch": 4.849304482225657, + "grad_norm": 1.1751611232757568, + "learning_rate": 4.697348412438131e-05, + "loss": 0.5211, + "num_input_tokens_seen": 2112480, + "step": 6275 + }, + { + "epoch": 4.853168469860896, + "grad_norm": 0.7211741209030151, + "learning_rate": 4.6965438111516685e-05, + "loss": 0.591, + "num_input_tokens_seen": 2114272, + "step": 6280 + }, + { + "epoch": 4.857032457496136, + "grad_norm": 0.6560580730438232, + "learning_rate": 4.695738210863926e-05, + "loss": 0.4594, + "num_input_tokens_seen": 2116032, + "step": 6285 + }, + { + "epoch": 4.860896445131376, + "grad_norm": 0.9903709888458252, + "learning_rate": 4.694931611941297e-05, + "loss": 0.6325, + "num_input_tokens_seen": 2117632, + "step": 6290 + }, + { + "epoch": 4.864760432766615, + "grad_norm": 0.942850649356842, + "learning_rate": 4.694124014750624e-05, + "loss": 0.4016, + "num_input_tokens_seen": 2119296, + "step": 6295 + }, + { + "epoch": 4.868624420401854, + "grad_norm": 0.7375192046165466, + "learning_rate": 4.69331541965921e-05, + "loss": 0.6515, + "num_input_tokens_seen": 2120992, + "step": 6300 + }, + { + "epoch": 4.8724884080370945, + "grad_norm": 0.8455231189727783, + "learning_rate": 4.6925058270348076e-05, + "loss": 0.4591, + "num_input_tokens_seen": 2122688, + "step": 6305 + }, + { + "epoch": 4.876352395672334, + "grad_norm": 0.546376645565033, + "learning_rate": 4.691695237245625e-05, + "loss": 0.4427, + "num_input_tokens_seen": 2124512, + "step": 6310 + }, + { + "epoch": 4.880216383307573, + "grad_norm": 0.6278719305992126, + "learning_rate": 4.690883650660323e-05, + "loss": 0.5717, + "num_input_tokens_seen": 2126144, + "step": 6315 + }, + { + "epoch": 4.884080370942813, + "grad_norm": 0.6174848079681396, + "learning_rate": 4.690071067648016e-05, + "loss": 0.4169, + "num_input_tokens_seen": 2127616, + "step": 6320 + }, + { + "epoch": 4.887944358578053, + "grad_norm": 0.48595789074897766, + "learning_rate": 4.6892574885782714e-05, + "loss": 0.4251, + "num_input_tokens_seen": 2129120, + "step": 6325 + }, + { + "epoch": 4.891808346213292, + "grad_norm": 0.6716845631599426, + "learning_rate": 4.68844291382111e-05, + "loss": 0.4345, + "num_input_tokens_seen": 2130880, + "step": 6330 + }, + { + "epoch": 4.895672333848532, + "grad_norm": 0.7391566634178162, + "learning_rate": 4.687627343747005e-05, + "loss": 0.5073, + "num_input_tokens_seen": 2132480, + "step": 6335 + }, + { + "epoch": 4.8995363214837715, + "grad_norm": 1.4256454706192017, + "learning_rate": 4.6868107787268835e-05, + "loss": 0.5249, + "num_input_tokens_seen": 2134304, + "step": 6340 + }, + { + "epoch": 4.903400309119011, + "grad_norm": 0.7005415558815002, + "learning_rate": 4.685993219132123e-05, + "loss": 0.4338, + "num_input_tokens_seen": 2135712, + "step": 6345 + }, + { + "epoch": 4.90726429675425, + "grad_norm": 0.7871833443641663, + "learning_rate": 4.685174665334556e-05, + "loss": 0.4364, + "num_input_tokens_seen": 2137216, + "step": 6350 + }, + { + "epoch": 4.91112828438949, + "grad_norm": 0.5518500804901123, + "learning_rate": 4.684355117706464e-05, + "loss": 0.3411, + "num_input_tokens_seen": 2138880, + "step": 6355 + }, + { + "epoch": 4.91499227202473, + "grad_norm": 0.5899852514266968, + "learning_rate": 4.683534576620583e-05, + "loss": 0.4009, + "num_input_tokens_seen": 2140576, + "step": 6360 + }, + { + "epoch": 4.918856259659969, + "grad_norm": 0.4277820289134979, + "learning_rate": 4.6827130424501e-05, + "loss": 0.4516, + "num_input_tokens_seen": 2141952, + "step": 6365 + }, + { + "epoch": 4.922720247295208, + "grad_norm": 0.6031973361968994, + "learning_rate": 4.6818905155686526e-05, + "loss": 0.4242, + "num_input_tokens_seen": 2143488, + "step": 6370 + }, + { + "epoch": 4.9265842349304485, + "grad_norm": 0.6769505143165588, + "learning_rate": 4.681066996350333e-05, + "loss": 0.5661, + "num_input_tokens_seen": 2145120, + "step": 6375 + }, + { + "epoch": 4.930448222565688, + "grad_norm": 1.3897219896316528, + "learning_rate": 4.6802424851696816e-05, + "loss": 0.6618, + "num_input_tokens_seen": 2146624, + "step": 6380 + }, + { + "epoch": 4.934312210200927, + "grad_norm": 0.8296585083007812, + "learning_rate": 4.6794169824016896e-05, + "loss": 0.3937, + "num_input_tokens_seen": 2148384, + "step": 6385 + }, + { + "epoch": 4.938176197836167, + "grad_norm": 0.4110202193260193, + "learning_rate": 4.678590488421803e-05, + "loss": 0.3935, + "num_input_tokens_seen": 2150112, + "step": 6390 + }, + { + "epoch": 4.942040185471407, + "grad_norm": 0.9150643348693848, + "learning_rate": 4.6777630036059154e-05, + "loss": 0.4592, + "num_input_tokens_seen": 2151808, + "step": 6395 + }, + { + "epoch": 4.945904173106646, + "grad_norm": 1.215728759765625, + "learning_rate": 4.676934528330371e-05, + "loss": 0.4512, + "num_input_tokens_seen": 2153216, + "step": 6400 + }, + { + "epoch": 4.949768160741885, + "grad_norm": 1.1124768257141113, + "learning_rate": 4.676105062971967e-05, + "loss": 0.4238, + "num_input_tokens_seen": 2155072, + "step": 6405 + }, + { + "epoch": 4.9536321483771255, + "grad_norm": 0.7619708180427551, + "learning_rate": 4.675274607907947e-05, + "loss": 0.4979, + "num_input_tokens_seen": 2156736, + "step": 6410 + }, + { + "epoch": 4.957496136012365, + "grad_norm": 0.8756499886512756, + "learning_rate": 4.6744431635160094e-05, + "loss": 0.5718, + "num_input_tokens_seen": 2158368, + "step": 6415 + }, + { + "epoch": 4.961360123647604, + "grad_norm": 1.1871299743652344, + "learning_rate": 4.673610730174298e-05, + "loss": 0.422, + "num_input_tokens_seen": 2159872, + "step": 6420 + }, + { + "epoch": 4.9652241112828435, + "grad_norm": 0.8678427338600159, + "learning_rate": 4.672777308261409e-05, + "loss": 0.3749, + "num_input_tokens_seen": 2161376, + "step": 6425 + }, + { + "epoch": 4.969088098918084, + "grad_norm": 1.5428732633590698, + "learning_rate": 4.6719428981563885e-05, + "loss": 0.6177, + "num_input_tokens_seen": 2163232, + "step": 6430 + }, + { + "epoch": 4.972952086553323, + "grad_norm": 1.4038500785827637, + "learning_rate": 4.6711075002387304e-05, + "loss": 0.6011, + "num_input_tokens_seen": 2164960, + "step": 6435 + }, + { + "epoch": 4.976816074188562, + "grad_norm": 0.7594925761222839, + "learning_rate": 4.6702711148883794e-05, + "loss": 0.6413, + "num_input_tokens_seen": 2166528, + "step": 6440 + }, + { + "epoch": 4.9806800618238025, + "grad_norm": 0.9281092882156372, + "learning_rate": 4.669433742485727e-05, + "loss": 0.3581, + "num_input_tokens_seen": 2168256, + "step": 6445 + }, + { + "epoch": 4.984544049459042, + "grad_norm": 0.8751462697982788, + "learning_rate": 4.668595383411617e-05, + "loss": 1.0258, + "num_input_tokens_seen": 2170176, + "step": 6450 + }, + { + "epoch": 4.988408037094281, + "grad_norm": 0.5335502028465271, + "learning_rate": 4.6677560380473396e-05, + "loss": 0.5336, + "num_input_tokens_seen": 2171936, + "step": 6455 + }, + { + "epoch": 4.992272024729521, + "grad_norm": 0.6274211406707764, + "learning_rate": 4.666915706774634e-05, + "loss": 0.5134, + "num_input_tokens_seen": 2173760, + "step": 6460 + }, + { + "epoch": 4.996136012364761, + "grad_norm": 0.8410266637802124, + "learning_rate": 4.6660743899756875e-05, + "loss": 0.4036, + "num_input_tokens_seen": 2175264, + "step": 6465 + }, + { + "epoch": 5.0, + "grad_norm": 0.9543517827987671, + "learning_rate": 4.665232088033136e-05, + "loss": 0.4916, + "num_input_tokens_seen": 2176672, + "step": 6470 + }, + { + "epoch": 5.0, + "eval_loss": 0.4901926815509796, + "eval_runtime": 6.2375, + "eval_samples_per_second": 92.184, + "eval_steps_per_second": 23.086, + "num_input_tokens_seen": 2176672, + "step": 6470 + }, + { + "epoch": 5.003863987635239, + "grad_norm": 0.637384295463562, + "learning_rate": 4.664388801330064e-05, + "loss": 0.4742, + "num_input_tokens_seen": 2178304, + "step": 6475 + }, + { + "epoch": 5.0077279752704795, + "grad_norm": 0.7223641872406006, + "learning_rate": 4.663544530250004e-05, + "loss": 0.3876, + "num_input_tokens_seen": 2179744, + "step": 6480 + }, + { + "epoch": 5.011591962905719, + "grad_norm": 1.2495654821395874, + "learning_rate": 4.662699275176934e-05, + "loss": 0.4473, + "num_input_tokens_seen": 2181344, + "step": 6485 + }, + { + "epoch": 5.015455950540958, + "grad_norm": 0.5381432771682739, + "learning_rate": 4.661853036495281e-05, + "loss": 0.5056, + "num_input_tokens_seen": 2183232, + "step": 6490 + }, + { + "epoch": 5.0193199381761975, + "grad_norm": 1.11477792263031, + "learning_rate": 4.661005814589921e-05, + "loss": 0.4976, + "num_input_tokens_seen": 2184832, + "step": 6495 + }, + { + "epoch": 5.023183925811438, + "grad_norm": 1.5717432498931885, + "learning_rate": 4.660157609846175e-05, + "loss": 0.5363, + "num_input_tokens_seen": 2186400, + "step": 6500 + }, + { + "epoch": 5.027047913446677, + "grad_norm": 0.9892496466636658, + "learning_rate": 4.659308422649811e-05, + "loss": 0.7615, + "num_input_tokens_seen": 2188256, + "step": 6505 + }, + { + "epoch": 5.030911901081916, + "grad_norm": 0.9862325191497803, + "learning_rate": 4.6584582533870445e-05, + "loss": 0.4709, + "num_input_tokens_seen": 2189728, + "step": 6510 + }, + { + "epoch": 5.0347758887171565, + "grad_norm": 0.7465859651565552, + "learning_rate": 4.657607102444538e-05, + "loss": 0.5111, + "num_input_tokens_seen": 2191328, + "step": 6515 + }, + { + "epoch": 5.038639876352396, + "grad_norm": 0.6720333099365234, + "learning_rate": 4.656754970209401e-05, + "loss": 0.4655, + "num_input_tokens_seen": 2193184, + "step": 6520 + }, + { + "epoch": 5.042503863987635, + "grad_norm": 0.5696495175361633, + "learning_rate": 4.655901857069186e-05, + "loss": 0.4921, + "num_input_tokens_seen": 2194912, + "step": 6525 + }, + { + "epoch": 5.0463678516228745, + "grad_norm": 1.9702743291854858, + "learning_rate": 4.655047763411895e-05, + "loss": 0.476, + "num_input_tokens_seen": 2196640, + "step": 6530 + }, + { + "epoch": 5.050231839258115, + "grad_norm": 1.277234435081482, + "learning_rate": 4.654192689625976e-05, + "loss": 0.5544, + "num_input_tokens_seen": 2198496, + "step": 6535 + }, + { + "epoch": 5.054095826893354, + "grad_norm": 0.7035170197486877, + "learning_rate": 4.6533366361003204e-05, + "loss": 0.6522, + "num_input_tokens_seen": 2200096, + "step": 6540 + }, + { + "epoch": 5.057959814528593, + "grad_norm": 0.9198494553565979, + "learning_rate": 4.652479603224267e-05, + "loss": 0.4015, + "num_input_tokens_seen": 2201728, + "step": 6545 + }, + { + "epoch": 5.061823802163833, + "grad_norm": 0.7470571398735046, + "learning_rate": 4.651621591387599e-05, + "loss": 0.4464, + "num_input_tokens_seen": 2203200, + "step": 6550 + }, + { + "epoch": 5.065687789799073, + "grad_norm": 0.6377326250076294, + "learning_rate": 4.650762600980546e-05, + "loss": 0.6176, + "num_input_tokens_seen": 2204832, + "step": 6555 + }, + { + "epoch": 5.069551777434312, + "grad_norm": 0.9444475769996643, + "learning_rate": 4.6499026323937824e-05, + "loss": 0.5913, + "num_input_tokens_seen": 2206656, + "step": 6560 + }, + { + "epoch": 5.0734157650695515, + "grad_norm": 0.7857267260551453, + "learning_rate": 4.649041686018425e-05, + "loss": 0.8552, + "num_input_tokens_seen": 2208224, + "step": 6565 + }, + { + "epoch": 5.077279752704792, + "grad_norm": 0.9105510711669922, + "learning_rate": 4.6481797622460394e-05, + "loss": 0.5728, + "num_input_tokens_seen": 2210112, + "step": 6570 + }, + { + "epoch": 5.081143740340031, + "grad_norm": 0.5312409400939941, + "learning_rate": 4.647316861468633e-05, + "loss": 0.3943, + "num_input_tokens_seen": 2211872, + "step": 6575 + }, + { + "epoch": 5.08500772797527, + "grad_norm": 0.9020838141441345, + "learning_rate": 4.646452984078658e-05, + "loss": 0.4425, + "num_input_tokens_seen": 2213536, + "step": 6580 + }, + { + "epoch": 5.08887171561051, + "grad_norm": 0.5658096075057983, + "learning_rate": 4.6455881304690116e-05, + "loss": 0.5904, + "num_input_tokens_seen": 2215328, + "step": 6585 + }, + { + "epoch": 5.09273570324575, + "grad_norm": 1.4443256855010986, + "learning_rate": 4.6447223010330334e-05, + "loss": 0.5004, + "num_input_tokens_seen": 2216992, + "step": 6590 + }, + { + "epoch": 5.096599690880989, + "grad_norm": 0.8069081902503967, + "learning_rate": 4.6438554961645084e-05, + "loss": 0.456, + "num_input_tokens_seen": 2218880, + "step": 6595 + }, + { + "epoch": 5.1004636785162285, + "grad_norm": 0.9157733917236328, + "learning_rate": 4.642987716257665e-05, + "loss": 0.5837, + "num_input_tokens_seen": 2220608, + "step": 6600 + }, + { + "epoch": 5.104327666151469, + "grad_norm": 1.1800764799118042, + "learning_rate": 4.6421189617071754e-05, + "loss": 0.3967, + "num_input_tokens_seen": 2222176, + "step": 6605 + }, + { + "epoch": 5.108191653786708, + "grad_norm": 0.5795477628707886, + "learning_rate": 4.6412492329081524e-05, + "loss": 0.3831, + "num_input_tokens_seen": 2223872, + "step": 6610 + }, + { + "epoch": 5.112055641421947, + "grad_norm": 0.6586823463439941, + "learning_rate": 4.640378530256155e-05, + "loss": 0.8554, + "num_input_tokens_seen": 2225664, + "step": 6615 + }, + { + "epoch": 5.115919629057187, + "grad_norm": 0.8910622000694275, + "learning_rate": 4.6395068541471834e-05, + "loss": 0.4766, + "num_input_tokens_seen": 2227424, + "step": 6620 + }, + { + "epoch": 5.119783616692427, + "grad_norm": 1.2056357860565186, + "learning_rate": 4.638634204977682e-05, + "loss": 0.6007, + "num_input_tokens_seen": 2228992, + "step": 6625 + }, + { + "epoch": 5.123647604327666, + "grad_norm": 0.48523539304733276, + "learning_rate": 4.637760583144536e-05, + "loss": 0.5574, + "num_input_tokens_seen": 2230688, + "step": 6630 + }, + { + "epoch": 5.1275115919629055, + "grad_norm": 0.7610316276550293, + "learning_rate": 4.636885989045074e-05, + "loss": 0.4852, + "num_input_tokens_seen": 2232640, + "step": 6635 + }, + { + "epoch": 5.131375579598146, + "grad_norm": 1.0388654470443726, + "learning_rate": 4.6360104230770685e-05, + "loss": 0.4581, + "num_input_tokens_seen": 2234304, + "step": 6640 + }, + { + "epoch": 5.135239567233385, + "grad_norm": 0.5474745035171509, + "learning_rate": 4.635133885638729e-05, + "loss": 0.4938, + "num_input_tokens_seen": 2236032, + "step": 6645 + }, + { + "epoch": 5.139103554868624, + "grad_norm": 1.3278520107269287, + "learning_rate": 4.634256377128712e-05, + "loss": 0.5085, + "num_input_tokens_seen": 2237856, + "step": 6650 + }, + { + "epoch": 5.142967542503864, + "grad_norm": 0.6041468977928162, + "learning_rate": 4.633377897946113e-05, + "loss": 0.4657, + "num_input_tokens_seen": 2239552, + "step": 6655 + }, + { + "epoch": 5.146831530139104, + "grad_norm": 0.6758517026901245, + "learning_rate": 4.6324984484904696e-05, + "loss": 0.4647, + "num_input_tokens_seen": 2241184, + "step": 6660 + }, + { + "epoch": 5.150695517774343, + "grad_norm": 1.0143979787826538, + "learning_rate": 4.631618029161761e-05, + "loss": 0.5194, + "num_input_tokens_seen": 2243168, + "step": 6665 + }, + { + "epoch": 5.1545595054095825, + "grad_norm": 0.7706741690635681, + "learning_rate": 4.630736640360407e-05, + "loss": 0.497, + "num_input_tokens_seen": 2244736, + "step": 6670 + }, + { + "epoch": 5.158423493044822, + "grad_norm": 1.547992467880249, + "learning_rate": 4.629854282487268e-05, + "loss": 0.4434, + "num_input_tokens_seen": 2246336, + "step": 6675 + }, + { + "epoch": 5.162287480680062, + "grad_norm": 1.0905689001083374, + "learning_rate": 4.6289709559436466e-05, + "loss": 0.4803, + "num_input_tokens_seen": 2248192, + "step": 6680 + }, + { + "epoch": 5.166151468315301, + "grad_norm": 1.1617836952209473, + "learning_rate": 4.6280866611312846e-05, + "loss": 0.716, + "num_input_tokens_seen": 2249824, + "step": 6685 + }, + { + "epoch": 5.170015455950541, + "grad_norm": 0.7571743726730347, + "learning_rate": 4.627201398452364e-05, + "loss": 0.3678, + "num_input_tokens_seen": 2251552, + "step": 6690 + }, + { + "epoch": 5.173879443585781, + "grad_norm": 1.6077580451965332, + "learning_rate": 4.626315168309509e-05, + "loss": 0.7336, + "num_input_tokens_seen": 2253504, + "step": 6695 + }, + { + "epoch": 5.17774343122102, + "grad_norm": 0.7982884645462036, + "learning_rate": 4.6254279711057804e-05, + "loss": 0.3929, + "num_input_tokens_seen": 2255328, + "step": 6700 + }, + { + "epoch": 5.1816074188562595, + "grad_norm": 1.0296738147735596, + "learning_rate": 4.624539807244682e-05, + "loss": 0.5167, + "num_input_tokens_seen": 2257152, + "step": 6705 + }, + { + "epoch": 5.185471406491499, + "grad_norm": 0.7312260866165161, + "learning_rate": 4.623650677130157e-05, + "loss": 0.3731, + "num_input_tokens_seen": 2258752, + "step": 6710 + }, + { + "epoch": 5.189335394126739, + "grad_norm": 0.6812884211540222, + "learning_rate": 4.622760581166585e-05, + "loss": 0.4088, + "num_input_tokens_seen": 2260512, + "step": 6715 + }, + { + "epoch": 5.193199381761978, + "grad_norm": 0.8354362845420837, + "learning_rate": 4.621869519758788e-05, + "loss": 0.6786, + "num_input_tokens_seen": 2262144, + "step": 6720 + }, + { + "epoch": 5.197063369397218, + "grad_norm": 1.2651112079620361, + "learning_rate": 4.620977493312026e-05, + "loss": 0.5836, + "num_input_tokens_seen": 2263872, + "step": 6725 + }, + { + "epoch": 5.200927357032458, + "grad_norm": 0.925995945930481, + "learning_rate": 4.6200845022319985e-05, + "loss": 0.5703, + "num_input_tokens_seen": 2265600, + "step": 6730 + }, + { + "epoch": 5.204791344667697, + "grad_norm": 0.5206258893013, + "learning_rate": 4.619190546924843e-05, + "loss": 0.3849, + "num_input_tokens_seen": 2267296, + "step": 6735 + }, + { + "epoch": 5.2086553323029365, + "grad_norm": 0.6949443221092224, + "learning_rate": 4.6182956277971346e-05, + "loss": 0.4529, + "num_input_tokens_seen": 2268960, + "step": 6740 + }, + { + "epoch": 5.212519319938176, + "grad_norm": 0.6662328839302063, + "learning_rate": 4.617399745255889e-05, + "loss": 0.4112, + "num_input_tokens_seen": 2270592, + "step": 6745 + }, + { + "epoch": 5.216383307573416, + "grad_norm": 1.0872700214385986, + "learning_rate": 4.616502899708558e-05, + "loss": 0.5097, + "num_input_tokens_seen": 2272160, + "step": 6750 + }, + { + "epoch": 5.220247295208655, + "grad_norm": 0.7396667003631592, + "learning_rate": 4.615605091563033e-05, + "loss": 0.4758, + "num_input_tokens_seen": 2274144, + "step": 6755 + }, + { + "epoch": 5.224111282843895, + "grad_norm": 0.5548027753829956, + "learning_rate": 4.614706321227644e-05, + "loss": 0.4154, + "num_input_tokens_seen": 2275680, + "step": 6760 + }, + { + "epoch": 5.227975270479135, + "grad_norm": 0.6944026350975037, + "learning_rate": 4.613806589111155e-05, + "loss": 0.708, + "num_input_tokens_seen": 2277568, + "step": 6765 + }, + { + "epoch": 5.231839258114374, + "grad_norm": 1.3733302354812622, + "learning_rate": 4.6129058956227695e-05, + "loss": 0.5536, + "num_input_tokens_seen": 2279168, + "step": 6770 + }, + { + "epoch": 5.2357032457496135, + "grad_norm": 1.8016326427459717, + "learning_rate": 4.612004241172129e-05, + "loss": 0.4451, + "num_input_tokens_seen": 2280768, + "step": 6775 + }, + { + "epoch": 5.239567233384853, + "grad_norm": 0.7341510057449341, + "learning_rate": 4.6111016261693116e-05, + "loss": 0.6375, + "num_input_tokens_seen": 2282400, + "step": 6780 + }, + { + "epoch": 5.243431221020093, + "grad_norm": 1.2387148141860962, + "learning_rate": 4.610198051024832e-05, + "loss": 0.6141, + "num_input_tokens_seen": 2283968, + "step": 6785 + }, + { + "epoch": 5.247295208655332, + "grad_norm": 1.1520726680755615, + "learning_rate": 4.609293516149641e-05, + "loss": 0.6812, + "num_input_tokens_seen": 2285760, + "step": 6790 + }, + { + "epoch": 5.251159196290572, + "grad_norm": 0.9339457154273987, + "learning_rate": 4.6083880219551265e-05, + "loss": 0.4841, + "num_input_tokens_seen": 2287296, + "step": 6795 + }, + { + "epoch": 5.255023183925811, + "grad_norm": 0.8143020868301392, + "learning_rate": 4.607481568853114e-05, + "loss": 0.4307, + "num_input_tokens_seen": 2288768, + "step": 6800 + }, + { + "epoch": 5.258887171561051, + "grad_norm": 0.8493896126747131, + "learning_rate": 4.6065741572558616e-05, + "loss": 0.454, + "num_input_tokens_seen": 2290496, + "step": 6805 + }, + { + "epoch": 5.2627511591962906, + "grad_norm": 0.6462670564651489, + "learning_rate": 4.605665787576068e-05, + "loss": 0.4476, + "num_input_tokens_seen": 2292352, + "step": 6810 + }, + { + "epoch": 5.26661514683153, + "grad_norm": 1.8766515254974365, + "learning_rate": 4.6047564602268626e-05, + "loss": 0.4779, + "num_input_tokens_seen": 2293952, + "step": 6815 + }, + { + "epoch": 5.27047913446677, + "grad_norm": 0.9049794673919678, + "learning_rate": 4.603846175621816e-05, + "loss": 0.4173, + "num_input_tokens_seen": 2295712, + "step": 6820 + }, + { + "epoch": 5.274343122102009, + "grad_norm": 0.958241879940033, + "learning_rate": 4.602934934174927e-05, + "loss": 0.6858, + "num_input_tokens_seen": 2297600, + "step": 6825 + }, + { + "epoch": 5.278207109737249, + "grad_norm": 0.7042433619499207, + "learning_rate": 4.6020227363006375e-05, + "loss": 0.58, + "num_input_tokens_seen": 2299296, + "step": 6830 + }, + { + "epoch": 5.282071097372488, + "grad_norm": 0.7877497673034668, + "learning_rate": 4.601109582413818e-05, + "loss": 0.5968, + "num_input_tokens_seen": 2300928, + "step": 6835 + }, + { + "epoch": 5.285935085007728, + "grad_norm": 1.3019059896469116, + "learning_rate": 4.600195472929778e-05, + "loss": 0.4908, + "num_input_tokens_seen": 2302624, + "step": 6840 + }, + { + "epoch": 5.289799072642968, + "grad_norm": 1.271315097808838, + "learning_rate": 4.5992804082642594e-05, + "loss": 0.6237, + "num_input_tokens_seen": 2304448, + "step": 6845 + }, + { + "epoch": 5.293663060278207, + "grad_norm": 0.5036696195602417, + "learning_rate": 4.5983643888334385e-05, + "loss": 0.465, + "num_input_tokens_seen": 2306048, + "step": 6850 + }, + { + "epoch": 5.297527047913447, + "grad_norm": 0.7724022269248962, + "learning_rate": 4.597447415053927e-05, + "loss": 0.558, + "num_input_tokens_seen": 2307584, + "step": 6855 + }, + { + "epoch": 5.301391035548686, + "grad_norm": 0.9255558848381042, + "learning_rate": 4.59652948734277e-05, + "loss": 0.4375, + "num_input_tokens_seen": 2309216, + "step": 6860 + }, + { + "epoch": 5.305255023183926, + "grad_norm": 0.7825157046318054, + "learning_rate": 4.5956106061174476e-05, + "loss": 0.396, + "num_input_tokens_seen": 2310912, + "step": 6865 + }, + { + "epoch": 5.309119010819165, + "grad_norm": 1.220924735069275, + "learning_rate": 4.59469077179587e-05, + "loss": 0.5359, + "num_input_tokens_seen": 2312544, + "step": 6870 + }, + { + "epoch": 5.312982998454405, + "grad_norm": 0.8394623398780823, + "learning_rate": 4.593769984796385e-05, + "loss": 0.4714, + "num_input_tokens_seen": 2314336, + "step": 6875 + }, + { + "epoch": 5.316846986089645, + "grad_norm": 1.0271354913711548, + "learning_rate": 4.592848245537773e-05, + "loss": 0.419, + "num_input_tokens_seen": 2316064, + "step": 6880 + }, + { + "epoch": 5.320710973724884, + "grad_norm": 1.188844084739685, + "learning_rate": 4.591925554439244e-05, + "loss": 0.4724, + "num_input_tokens_seen": 2317824, + "step": 6885 + }, + { + "epoch": 5.324574961360124, + "grad_norm": 0.8580700159072876, + "learning_rate": 4.5910019119204456e-05, + "loss": 0.9671, + "num_input_tokens_seen": 2319424, + "step": 6890 + }, + { + "epoch": 5.328438948995363, + "grad_norm": 0.6981192827224731, + "learning_rate": 4.5900773184014546e-05, + "loss": 0.4278, + "num_input_tokens_seen": 2321024, + "step": 6895 + }, + { + "epoch": 5.332302936630603, + "grad_norm": 0.6112363934516907, + "learning_rate": 4.5891517743027824e-05, + "loss": 0.4959, + "num_input_tokens_seen": 2322464, + "step": 6900 + }, + { + "epoch": 5.336166924265842, + "grad_norm": 0.5031453371047974, + "learning_rate": 4.5882252800453726e-05, + "loss": 0.3718, + "num_input_tokens_seen": 2324448, + "step": 6905 + }, + { + "epoch": 5.340030911901082, + "grad_norm": 1.0443317890167236, + "learning_rate": 4.587297836050598e-05, + "loss": 0.4994, + "num_input_tokens_seen": 2326080, + "step": 6910 + }, + { + "epoch": 5.343894899536322, + "grad_norm": 0.9828284978866577, + "learning_rate": 4.5863694427402684e-05, + "loss": 0.6157, + "num_input_tokens_seen": 2327680, + "step": 6915 + }, + { + "epoch": 5.347758887171561, + "grad_norm": 0.5356892943382263, + "learning_rate": 4.5854401005366206e-05, + "loss": 0.414, + "num_input_tokens_seen": 2329280, + "step": 6920 + }, + { + "epoch": 5.3516228748068, + "grad_norm": 0.5890492796897888, + "learning_rate": 4.584509809862327e-05, + "loss": 0.6457, + "num_input_tokens_seen": 2330944, + "step": 6925 + }, + { + "epoch": 5.35548686244204, + "grad_norm": 0.990821123123169, + "learning_rate": 4.583578571140488e-05, + "loss": 0.5109, + "num_input_tokens_seen": 2332448, + "step": 6930 + }, + { + "epoch": 5.35935085007728, + "grad_norm": 0.7367907762527466, + "learning_rate": 4.582646384794636e-05, + "loss": 0.4072, + "num_input_tokens_seen": 2334112, + "step": 6935 + }, + { + "epoch": 5.363214837712519, + "grad_norm": 1.0571459531784058, + "learning_rate": 4.581713251248736e-05, + "loss": 0.5245, + "num_input_tokens_seen": 2335904, + "step": 6940 + }, + { + "epoch": 5.367078825347759, + "grad_norm": 0.4937739074230194, + "learning_rate": 4.580779170927183e-05, + "loss": 0.4596, + "num_input_tokens_seen": 2337472, + "step": 6945 + }, + { + "epoch": 5.370942812982999, + "grad_norm": 0.6879020929336548, + "learning_rate": 4.5798441442548014e-05, + "loss": 0.6551, + "num_input_tokens_seen": 2339040, + "step": 6950 + }, + { + "epoch": 5.374806800618238, + "grad_norm": 0.8427323698997498, + "learning_rate": 4.5789081716568474e-05, + "loss": 0.6561, + "num_input_tokens_seen": 2340608, + "step": 6955 + }, + { + "epoch": 5.378670788253477, + "grad_norm": 1.59781014919281, + "learning_rate": 4.577971253559006e-05, + "loss": 0.538, + "num_input_tokens_seen": 2342208, + "step": 6960 + }, + { + "epoch": 5.382534775888717, + "grad_norm": 0.684333324432373, + "learning_rate": 4.5770333903873955e-05, + "loss": 0.4183, + "num_input_tokens_seen": 2343904, + "step": 6965 + }, + { + "epoch": 5.386398763523957, + "grad_norm": 0.8780426383018494, + "learning_rate": 4.576094582568558e-05, + "loss": 0.3908, + "num_input_tokens_seen": 2345600, + "step": 6970 + }, + { + "epoch": 5.390262751159196, + "grad_norm": 1.0178335905075073, + "learning_rate": 4.575154830529473e-05, + "loss": 0.4334, + "num_input_tokens_seen": 2347456, + "step": 6975 + }, + { + "epoch": 5.394126738794436, + "grad_norm": 0.7265796661376953, + "learning_rate": 4.574214134697543e-05, + "loss": 0.4039, + "num_input_tokens_seen": 2349664, + "step": 6980 + }, + { + "epoch": 5.397990726429676, + "grad_norm": 0.9111684560775757, + "learning_rate": 4.573272495500602e-05, + "loss": 0.7575, + "num_input_tokens_seen": 2351168, + "step": 6985 + }, + { + "epoch": 5.401854714064915, + "grad_norm": 0.9152219295501709, + "learning_rate": 4.572329913366915e-05, + "loss": 0.4621, + "num_input_tokens_seen": 2353024, + "step": 6990 + }, + { + "epoch": 5.405718701700154, + "grad_norm": 0.7298804521560669, + "learning_rate": 4.571386388725172e-05, + "loss": 0.4544, + "num_input_tokens_seen": 2354624, + "step": 6995 + }, + { + "epoch": 5.409582689335394, + "grad_norm": 0.9615209102630615, + "learning_rate": 4.570441922004494e-05, + "loss": 0.5168, + "num_input_tokens_seen": 2356064, + "step": 7000 + }, + { + "epoch": 5.413446676970634, + "grad_norm": 1.141760230064392, + "learning_rate": 4.5694965136344305e-05, + "loss": 0.4938, + "num_input_tokens_seen": 2357824, + "step": 7005 + }, + { + "epoch": 5.417310664605873, + "grad_norm": 0.43453657627105713, + "learning_rate": 4.568550164044959e-05, + "loss": 1.026, + "num_input_tokens_seen": 2359584, + "step": 7010 + }, + { + "epoch": 5.421174652241113, + "grad_norm": 0.7597192525863647, + "learning_rate": 4.567602873666486e-05, + "loss": 0.4619, + "num_input_tokens_seen": 2361280, + "step": 7015 + }, + { + "epoch": 5.425038639876353, + "grad_norm": 0.6004551649093628, + "learning_rate": 4.5666546429298415e-05, + "loss": 0.3911, + "num_input_tokens_seen": 2362912, + "step": 7020 + }, + { + "epoch": 5.428902627511592, + "grad_norm": 0.5512177348136902, + "learning_rate": 4.56570547226629e-05, + "loss": 0.4108, + "num_input_tokens_seen": 2364352, + "step": 7025 + }, + { + "epoch": 5.432766615146831, + "grad_norm": 1.436280608177185, + "learning_rate": 4.5647553621075184e-05, + "loss": 0.6674, + "num_input_tokens_seen": 2366080, + "step": 7030 + }, + { + "epoch": 5.436630602782071, + "grad_norm": 1.33902108669281, + "learning_rate": 4.5638043128856436e-05, + "loss": 0.5468, + "num_input_tokens_seen": 2367712, + "step": 7035 + }, + { + "epoch": 5.440494590417311, + "grad_norm": 0.7962291836738586, + "learning_rate": 4.5628523250332065e-05, + "loss": 0.4623, + "num_input_tokens_seen": 2369280, + "step": 7040 + }, + { + "epoch": 5.44435857805255, + "grad_norm": 0.6936927437782288, + "learning_rate": 4.5618993989831785e-05, + "loss": 0.4373, + "num_input_tokens_seen": 2370688, + "step": 7045 + }, + { + "epoch": 5.448222565687789, + "grad_norm": 1.6880217790603638, + "learning_rate": 4.560945535168956e-05, + "loss": 0.7123, + "num_input_tokens_seen": 2372448, + "step": 7050 + }, + { + "epoch": 5.45208655332303, + "grad_norm": 0.7533669471740723, + "learning_rate": 4.559990734024361e-05, + "loss": 0.6549, + "num_input_tokens_seen": 2374080, + "step": 7055 + }, + { + "epoch": 5.455950540958269, + "grad_norm": 0.8275957703590393, + "learning_rate": 4.559034995983643e-05, + "loss": 0.4955, + "num_input_tokens_seen": 2375808, + "step": 7060 + }, + { + "epoch": 5.459814528593508, + "grad_norm": 0.9064179062843323, + "learning_rate": 4.558078321481478e-05, + "loss": 0.6043, + "num_input_tokens_seen": 2377312, + "step": 7065 + }, + { + "epoch": 5.4636785162287484, + "grad_norm": 0.7251551747322083, + "learning_rate": 4.557120710952968e-05, + "loss": 0.4856, + "num_input_tokens_seen": 2379104, + "step": 7070 + }, + { + "epoch": 5.467542503863988, + "grad_norm": 1.4006905555725098, + "learning_rate": 4.556162164833638e-05, + "loss": 0.5583, + "num_input_tokens_seen": 2380608, + "step": 7075 + }, + { + "epoch": 5.471406491499227, + "grad_norm": 0.781794011592865, + "learning_rate": 4.5552026835594416e-05, + "loss": 0.6079, + "num_input_tokens_seen": 2382400, + "step": 7080 + }, + { + "epoch": 5.475270479134466, + "grad_norm": 1.0673385858535767, + "learning_rate": 4.554242267566757e-05, + "loss": 0.4251, + "num_input_tokens_seen": 2384320, + "step": 7085 + }, + { + "epoch": 5.479134466769707, + "grad_norm": 0.8862619996070862, + "learning_rate": 4.553280917292387e-05, + "loss": 0.403, + "num_input_tokens_seen": 2386048, + "step": 7090 + }, + { + "epoch": 5.482998454404946, + "grad_norm": 0.6930130124092102, + "learning_rate": 4.552318633173559e-05, + "loss": 0.3917, + "num_input_tokens_seen": 2387744, + "step": 7095 + }, + { + "epoch": 5.486862442040185, + "grad_norm": 0.7275981903076172, + "learning_rate": 4.551355415647925e-05, + "loss": 0.5046, + "num_input_tokens_seen": 2389280, + "step": 7100 + }, + { + "epoch": 5.490726429675425, + "grad_norm": 0.754836916923523, + "learning_rate": 4.550391265153564e-05, + "loss": 0.6437, + "num_input_tokens_seen": 2390816, + "step": 7105 + }, + { + "epoch": 5.494590417310665, + "grad_norm": 0.6371362209320068, + "learning_rate": 4.5494261821289755e-05, + "loss": 0.3722, + "num_input_tokens_seen": 2392224, + "step": 7110 + }, + { + "epoch": 5.498454404945904, + "grad_norm": 0.5433828830718994, + "learning_rate": 4.548460167013086e-05, + "loss": 0.4499, + "num_input_tokens_seen": 2394112, + "step": 7115 + }, + { + "epoch": 5.502318392581143, + "grad_norm": 0.9806631207466125, + "learning_rate": 4.547493220245245e-05, + "loss": 0.4578, + "num_input_tokens_seen": 2396000, + "step": 7120 + }, + { + "epoch": 5.506182380216384, + "grad_norm": 0.7135473489761353, + "learning_rate": 4.5465253422652254e-05, + "loss": 0.4271, + "num_input_tokens_seen": 2397664, + "step": 7125 + }, + { + "epoch": 5.510046367851623, + "grad_norm": 1.4158446788787842, + "learning_rate": 4.545556533513224e-05, + "loss": 0.4771, + "num_input_tokens_seen": 2399392, + "step": 7130 + }, + { + "epoch": 5.513910355486862, + "grad_norm": 0.9029484987258911, + "learning_rate": 4.54458679442986e-05, + "loss": 0.4138, + "num_input_tokens_seen": 2401152, + "step": 7135 + }, + { + "epoch": 5.5177743431221025, + "grad_norm": 0.7756376266479492, + "learning_rate": 4.543616125456179e-05, + "loss": 0.389, + "num_input_tokens_seen": 2402656, + "step": 7140 + }, + { + "epoch": 5.521638330757342, + "grad_norm": 0.5488321781158447, + "learning_rate": 4.5426445270336446e-05, + "loss": 0.3753, + "num_input_tokens_seen": 2404224, + "step": 7145 + }, + { + "epoch": 5.525502318392581, + "grad_norm": 0.7776462435722351, + "learning_rate": 4.5416719996041466e-05, + "loss": 0.5858, + "num_input_tokens_seen": 2405920, + "step": 7150 + }, + { + "epoch": 5.52936630602782, + "grad_norm": 0.578097939491272, + "learning_rate": 4.5406985436099954e-05, + "loss": 0.3852, + "num_input_tokens_seen": 2407872, + "step": 7155 + }, + { + "epoch": 5.533230293663061, + "grad_norm": 0.8192168474197388, + "learning_rate": 4.539724159493926e-05, + "loss": 0.493, + "num_input_tokens_seen": 2409632, + "step": 7160 + }, + { + "epoch": 5.5370942812983, + "grad_norm": 0.5948290228843689, + "learning_rate": 4.538748847699092e-05, + "loss": 0.5104, + "num_input_tokens_seen": 2411616, + "step": 7165 + }, + { + "epoch": 5.540958268933539, + "grad_norm": 1.755394697189331, + "learning_rate": 4.537772608669074e-05, + "loss": 0.4425, + "num_input_tokens_seen": 2413440, + "step": 7170 + }, + { + "epoch": 5.544822256568779, + "grad_norm": 1.1829092502593994, + "learning_rate": 4.5367954428478695e-05, + "loss": 0.7462, + "num_input_tokens_seen": 2415104, + "step": 7175 + }, + { + "epoch": 5.548686244204019, + "grad_norm": 0.8582594394683838, + "learning_rate": 4.5358173506799e-05, + "loss": 0.493, + "num_input_tokens_seen": 2416800, + "step": 7180 + }, + { + "epoch": 5.552550231839258, + "grad_norm": 0.5626686811447144, + "learning_rate": 4.5348383326100076e-05, + "loss": 0.4307, + "num_input_tokens_seen": 2418496, + "step": 7185 + }, + { + "epoch": 5.556414219474497, + "grad_norm": 0.6191037893295288, + "learning_rate": 4.533858389083454e-05, + "loss": 0.6318, + "num_input_tokens_seen": 2420032, + "step": 7190 + }, + { + "epoch": 5.560278207109738, + "grad_norm": 1.1482634544372559, + "learning_rate": 4.5328775205459256e-05, + "loss": 0.5129, + "num_input_tokens_seen": 2421632, + "step": 7195 + }, + { + "epoch": 5.564142194744977, + "grad_norm": 0.8675625920295715, + "learning_rate": 4.5318957274435266e-05, + "loss": 0.4162, + "num_input_tokens_seen": 2423296, + "step": 7200 + }, + { + "epoch": 5.568006182380216, + "grad_norm": 0.7550934553146362, + "learning_rate": 4.530913010222782e-05, + "loss": 0.4085, + "num_input_tokens_seen": 2425088, + "step": 7205 + }, + { + "epoch": 5.571870170015456, + "grad_norm": 0.8684558868408203, + "learning_rate": 4.529929369330638e-05, + "loss": 0.449, + "num_input_tokens_seen": 2426880, + "step": 7210 + }, + { + "epoch": 5.575734157650696, + "grad_norm": 0.6908988952636719, + "learning_rate": 4.528944805214459e-05, + "loss": 0.3844, + "num_input_tokens_seen": 2428288, + "step": 7215 + }, + { + "epoch": 5.579598145285935, + "grad_norm": 1.16767418384552, + "learning_rate": 4.527959318322033e-05, + "loss": 0.6576, + "num_input_tokens_seen": 2429920, + "step": 7220 + }, + { + "epoch": 5.583462132921174, + "grad_norm": 0.6822666525840759, + "learning_rate": 4.526972909101563e-05, + "loss": 0.4235, + "num_input_tokens_seen": 2431456, + "step": 7225 + }, + { + "epoch": 5.587326120556414, + "grad_norm": 0.5859559178352356, + "learning_rate": 4.525985578001676e-05, + "loss": 0.5314, + "num_input_tokens_seen": 2433056, + "step": 7230 + }, + { + "epoch": 5.591190108191654, + "grad_norm": 0.7277525067329407, + "learning_rate": 4.524997325471414e-05, + "loss": 0.3987, + "num_input_tokens_seen": 2434528, + "step": 7235 + }, + { + "epoch": 5.595054095826893, + "grad_norm": 0.8197094798088074, + "learning_rate": 4.5240081519602416e-05, + "loss": 0.5953, + "num_input_tokens_seen": 2436096, + "step": 7240 + }, + { + "epoch": 5.598918083462133, + "grad_norm": 0.9463561177253723, + "learning_rate": 4.5230180579180405e-05, + "loss": 0.4826, + "num_input_tokens_seen": 2437760, + "step": 7245 + }, + { + "epoch": 5.602782071097373, + "grad_norm": 1.0100539922714233, + "learning_rate": 4.5220270437951104e-05, + "loss": 0.5985, + "num_input_tokens_seen": 2439584, + "step": 7250 + }, + { + "epoch": 5.606646058732612, + "grad_norm": 0.7744545340538025, + "learning_rate": 4.521035110042172e-05, + "loss": 0.3721, + "num_input_tokens_seen": 2441152, + "step": 7255 + }, + { + "epoch": 5.6105100463678514, + "grad_norm": 1.331579327583313, + "learning_rate": 4.5200422571103625e-05, + "loss": 0.4717, + "num_input_tokens_seen": 2442720, + "step": 7260 + }, + { + "epoch": 5.614374034003092, + "grad_norm": 0.6740531921386719, + "learning_rate": 4.519048485451236e-05, + "loss": 0.4191, + "num_input_tokens_seen": 2444192, + "step": 7265 + }, + { + "epoch": 5.618238021638331, + "grad_norm": 0.7355599999427795, + "learning_rate": 4.518053795516768e-05, + "loss": 0.5797, + "num_input_tokens_seen": 2445792, + "step": 7270 + }, + { + "epoch": 5.62210200927357, + "grad_norm": 0.6499999761581421, + "learning_rate": 4.517058187759347e-05, + "loss": 0.368, + "num_input_tokens_seen": 2447424, + "step": 7275 + }, + { + "epoch": 5.62596599690881, + "grad_norm": 0.5235801935195923, + "learning_rate": 4.5160616626317825e-05, + "loss": 0.4255, + "num_input_tokens_seen": 2449120, + "step": 7280 + }, + { + "epoch": 5.62982998454405, + "grad_norm": 0.8751282691955566, + "learning_rate": 4.515064220587301e-05, + "loss": 0.4398, + "num_input_tokens_seen": 2450944, + "step": 7285 + }, + { + "epoch": 5.633693972179289, + "grad_norm": 1.2340474128723145, + "learning_rate": 4.5140658620795426e-05, + "loss": 0.6988, + "num_input_tokens_seen": 2452480, + "step": 7290 + }, + { + "epoch": 5.6375579598145285, + "grad_norm": 0.9758970141410828, + "learning_rate": 4.51306658756257e-05, + "loss": 0.5913, + "num_input_tokens_seen": 2454112, + "step": 7295 + }, + { + "epoch": 5.641421947449768, + "grad_norm": 0.6659824848175049, + "learning_rate": 4.512066397490857e-05, + "loss": 0.5409, + "num_input_tokens_seen": 2455936, + "step": 7300 + }, + { + "epoch": 5.645285935085008, + "grad_norm": 1.8431038856506348, + "learning_rate": 4.511065292319296e-05, + "loss": 0.6675, + "num_input_tokens_seen": 2457568, + "step": 7305 + }, + { + "epoch": 5.649149922720247, + "grad_norm": 0.7612937092781067, + "learning_rate": 4.510063272503198e-05, + "loss": 0.4143, + "num_input_tokens_seen": 2459200, + "step": 7310 + }, + { + "epoch": 5.653013910355487, + "grad_norm": 0.7089970707893372, + "learning_rate": 4.5090603384982844e-05, + "loss": 0.5397, + "num_input_tokens_seen": 2460992, + "step": 7315 + }, + { + "epoch": 5.656877897990727, + "grad_norm": 0.561152994632721, + "learning_rate": 4.508056490760697e-05, + "loss": 0.5053, + "num_input_tokens_seen": 2462560, + "step": 7320 + }, + { + "epoch": 5.660741885625966, + "grad_norm": 0.9099323749542236, + "learning_rate": 4.507051729746993e-05, + "loss": 0.4526, + "num_input_tokens_seen": 2464032, + "step": 7325 + }, + { + "epoch": 5.6646058732612055, + "grad_norm": 0.9175041913986206, + "learning_rate": 4.5060460559141414e-05, + "loss": 0.4248, + "num_input_tokens_seen": 2465792, + "step": 7330 + }, + { + "epoch": 5.668469860896445, + "grad_norm": 0.5624104142189026, + "learning_rate": 4.5050394697195294e-05, + "loss": 0.5935, + "num_input_tokens_seen": 2467392, + "step": 7335 + }, + { + "epoch": 5.672333848531685, + "grad_norm": 0.8147039413452148, + "learning_rate": 4.5040319716209605e-05, + "loss": 0.4822, + "num_input_tokens_seen": 2468960, + "step": 7340 + }, + { + "epoch": 5.676197836166924, + "grad_norm": 0.9573432803153992, + "learning_rate": 4.503023562076648e-05, + "loss": 0.6937, + "num_input_tokens_seen": 2470432, + "step": 7345 + }, + { + "epoch": 5.680061823802164, + "grad_norm": 0.9767632484436035, + "learning_rate": 4.502014241545225e-05, + "loss": 0.4199, + "num_input_tokens_seen": 2472256, + "step": 7350 + }, + { + "epoch": 5.683925811437403, + "grad_norm": 0.6808250546455383, + "learning_rate": 4.501004010485734e-05, + "loss": 0.3675, + "num_input_tokens_seen": 2473760, + "step": 7355 + }, + { + "epoch": 5.687789799072643, + "grad_norm": 0.9592287540435791, + "learning_rate": 4.499992869357637e-05, + "loss": 0.8108, + "num_input_tokens_seen": 2475296, + "step": 7360 + }, + { + "epoch": 5.6916537867078825, + "grad_norm": 0.9849753379821777, + "learning_rate": 4.498980818620804e-05, + "loss": 0.4007, + "num_input_tokens_seen": 2477056, + "step": 7365 + }, + { + "epoch": 5.695517774343122, + "grad_norm": 0.7885463833808899, + "learning_rate": 4.4979678587355236e-05, + "loss": 0.5278, + "num_input_tokens_seen": 2478816, + "step": 7370 + }, + { + "epoch": 5.699381761978362, + "grad_norm": 1.0332342386245728, + "learning_rate": 4.496953990162496e-05, + "loss": 0.4073, + "num_input_tokens_seen": 2480384, + "step": 7375 + }, + { + "epoch": 5.703245749613601, + "grad_norm": 0.7696319222450256, + "learning_rate": 4.4959392133628345e-05, + "loss": 0.4958, + "num_input_tokens_seen": 2482080, + "step": 7380 + }, + { + "epoch": 5.707109737248841, + "grad_norm": 0.5671315789222717, + "learning_rate": 4.4949235287980654e-05, + "loss": 0.5323, + "num_input_tokens_seen": 2483776, + "step": 7385 + }, + { + "epoch": 5.710973724884081, + "grad_norm": 0.7349467277526855, + "learning_rate": 4.493906936930128e-05, + "loss": 0.5557, + "num_input_tokens_seen": 2485760, + "step": 7390 + }, + { + "epoch": 5.71483771251932, + "grad_norm": 0.5011353492736816, + "learning_rate": 4.492889438221375e-05, + "loss": 0.4493, + "num_input_tokens_seen": 2487584, + "step": 7395 + }, + { + "epoch": 5.7187017001545595, + "grad_norm": 1.6512941122055054, + "learning_rate": 4.491871033134571e-05, + "loss": 0.4087, + "num_input_tokens_seen": 2489248, + "step": 7400 + }, + { + "epoch": 5.722565687789799, + "grad_norm": 0.8349025845527649, + "learning_rate": 4.4908517221328915e-05, + "loss": 0.4763, + "num_input_tokens_seen": 2490944, + "step": 7405 + }, + { + "epoch": 5.726429675425039, + "grad_norm": 1.516764521598816, + "learning_rate": 4.489831505679927e-05, + "loss": 0.4307, + "num_input_tokens_seen": 2492640, + "step": 7410 + }, + { + "epoch": 5.730293663060278, + "grad_norm": 1.43715238571167, + "learning_rate": 4.488810384239675e-05, + "loss": 0.4798, + "num_input_tokens_seen": 2494752, + "step": 7415 + }, + { + "epoch": 5.734157650695518, + "grad_norm": 0.7379100918769836, + "learning_rate": 4.487788358276552e-05, + "loss": 0.5607, + "num_input_tokens_seen": 2496320, + "step": 7420 + }, + { + "epoch": 5.738021638330757, + "grad_norm": 0.7501834034919739, + "learning_rate": 4.4867654282553784e-05, + "loss": 0.4428, + "num_input_tokens_seen": 2497952, + "step": 7425 + }, + { + "epoch": 5.741885625965997, + "grad_norm": 0.521294891834259, + "learning_rate": 4.4857415946413896e-05, + "loss": 0.4451, + "num_input_tokens_seen": 2499968, + "step": 7430 + }, + { + "epoch": 5.7457496136012365, + "grad_norm": 0.88023442029953, + "learning_rate": 4.484716857900232e-05, + "loss": 0.4415, + "num_input_tokens_seen": 2501792, + "step": 7435 + }, + { + "epoch": 5.749613601236476, + "grad_norm": 1.3212934732437134, + "learning_rate": 4.4836912184979606e-05, + "loss": 0.4975, + "num_input_tokens_seen": 2503616, + "step": 7440 + }, + { + "epoch": 5.753477588871716, + "grad_norm": 0.843912661075592, + "learning_rate": 4.482664676901043e-05, + "loss": 0.4308, + "num_input_tokens_seen": 2505184, + "step": 7445 + }, + { + "epoch": 5.757341576506955, + "grad_norm": 1.136330485343933, + "learning_rate": 4.481637233576358e-05, + "loss": 0.4542, + "num_input_tokens_seen": 2507040, + "step": 7450 + }, + { + "epoch": 5.761205564142195, + "grad_norm": 1.1890625953674316, + "learning_rate": 4.48060888899119e-05, + "loss": 0.5815, + "num_input_tokens_seen": 2508608, + "step": 7455 + }, + { + "epoch": 5.765069551777434, + "grad_norm": 0.46369966864585876, + "learning_rate": 4.4795796436132384e-05, + "loss": 0.4496, + "num_input_tokens_seen": 2510144, + "step": 7460 + }, + { + "epoch": 5.768933539412674, + "grad_norm": 0.5415319800376892, + "learning_rate": 4.47854949791061e-05, + "loss": 0.4178, + "num_input_tokens_seen": 2511840, + "step": 7465 + }, + { + "epoch": 5.7727975270479135, + "grad_norm": 1.0110344886779785, + "learning_rate": 4.477518452351821e-05, + "loss": 0.4655, + "num_input_tokens_seen": 2513472, + "step": 7470 + }, + { + "epoch": 5.776661514683153, + "grad_norm": 0.7727068662643433, + "learning_rate": 4.4764865074057974e-05, + "loss": 0.5357, + "num_input_tokens_seen": 2515136, + "step": 7475 + }, + { + "epoch": 5.780525502318392, + "grad_norm": 0.7683701515197754, + "learning_rate": 4.4754536635418725e-05, + "loss": 0.411, + "num_input_tokens_seen": 2516928, + "step": 7480 + }, + { + "epoch": 5.784389489953632, + "grad_norm": 1.5530121326446533, + "learning_rate": 4.4744199212297914e-05, + "loss": 0.7345, + "num_input_tokens_seen": 2518560, + "step": 7485 + }, + { + "epoch": 5.788253477588872, + "grad_norm": 0.5988789200782776, + "learning_rate": 4.473385280939706e-05, + "loss": 0.4772, + "num_input_tokens_seen": 2520192, + "step": 7490 + }, + { + "epoch": 5.792117465224111, + "grad_norm": 1.215806484222412, + "learning_rate": 4.4723497431421756e-05, + "loss": 0.6742, + "num_input_tokens_seen": 2521728, + "step": 7495 + }, + { + "epoch": 5.795981452859351, + "grad_norm": 1.3820728063583374, + "learning_rate": 4.4713133083081715e-05, + "loss": 0.5182, + "num_input_tokens_seen": 2523488, + "step": 7500 + }, + { + "epoch": 5.7998454404945905, + "grad_norm": 0.986339807510376, + "learning_rate": 4.470275976909068e-05, + "loss": 0.5142, + "num_input_tokens_seen": 2524960, + "step": 7505 + }, + { + "epoch": 5.80370942812983, + "grad_norm": 0.8885478973388672, + "learning_rate": 4.469237749416651e-05, + "loss": 0.5038, + "num_input_tokens_seen": 2526720, + "step": 7510 + }, + { + "epoch": 5.80757341576507, + "grad_norm": 0.8221397995948792, + "learning_rate": 4.4681986263031125e-05, + "loss": 0.451, + "num_input_tokens_seen": 2528512, + "step": 7515 + }, + { + "epoch": 5.811437403400309, + "grad_norm": 1.8666731119155884, + "learning_rate": 4.467158608041051e-05, + "loss": 0.6676, + "num_input_tokens_seen": 2530176, + "step": 7520 + }, + { + "epoch": 5.815301391035549, + "grad_norm": 0.6179053783416748, + "learning_rate": 4.466117695103474e-05, + "loss": 0.5736, + "num_input_tokens_seen": 2531968, + "step": 7525 + }, + { + "epoch": 5.819165378670788, + "grad_norm": 0.9896422624588013, + "learning_rate": 4.465075887963796e-05, + "loss": 0.4425, + "num_input_tokens_seen": 2533856, + "step": 7530 + }, + { + "epoch": 5.823029366306028, + "grad_norm": 0.9209406971931458, + "learning_rate": 4.464033187095834e-05, + "loss": 0.4054, + "num_input_tokens_seen": 2535488, + "step": 7535 + }, + { + "epoch": 5.8268933539412675, + "grad_norm": 0.7848178744316101, + "learning_rate": 4.462989592973817e-05, + "loss": 0.4613, + "num_input_tokens_seen": 2537120, + "step": 7540 + }, + { + "epoch": 5.830757341576507, + "grad_norm": 0.903702437877655, + "learning_rate": 4.461945106072377e-05, + "loss": 0.4374, + "num_input_tokens_seen": 2538656, + "step": 7545 + }, + { + "epoch": 5.834621329211746, + "grad_norm": 0.5934814810752869, + "learning_rate": 4.460899726866554e-05, + "loss": 0.3675, + "num_input_tokens_seen": 2540096, + "step": 7550 + }, + { + "epoch": 5.838485316846986, + "grad_norm": 0.6356049180030823, + "learning_rate": 4.459853455831791e-05, + "loss": 0.5362, + "num_input_tokens_seen": 2541760, + "step": 7555 + }, + { + "epoch": 5.842349304482226, + "grad_norm": 0.672792375087738, + "learning_rate": 4.458806293443939e-05, + "loss": 0.6006, + "num_input_tokens_seen": 2543328, + "step": 7560 + }, + { + "epoch": 5.846213292117465, + "grad_norm": 0.7509033679962158, + "learning_rate": 4.457758240179255e-05, + "loss": 0.4141, + "num_input_tokens_seen": 2545056, + "step": 7565 + }, + { + "epoch": 5.850077279752705, + "grad_norm": 1.127813458442688, + "learning_rate": 4.4567092965143974e-05, + "loss": 0.4775, + "num_input_tokens_seen": 2546720, + "step": 7570 + }, + { + "epoch": 5.8539412673879445, + "grad_norm": 0.8243081569671631, + "learning_rate": 4.455659462926435e-05, + "loss": 0.4762, + "num_input_tokens_seen": 2548512, + "step": 7575 + }, + { + "epoch": 5.857805255023184, + "grad_norm": 0.46655091643333435, + "learning_rate": 4.454608739892836e-05, + "loss": 0.4708, + "num_input_tokens_seen": 2550368, + "step": 7580 + }, + { + "epoch": 5.861669242658423, + "grad_norm": 0.7779222726821899, + "learning_rate": 4.4535571278914765e-05, + "loss": 0.456, + "num_input_tokens_seen": 2552000, + "step": 7585 + }, + { + "epoch": 5.865533230293663, + "grad_norm": 0.7458489537239075, + "learning_rate": 4.452504627400635e-05, + "loss": 0.4031, + "num_input_tokens_seen": 2553696, + "step": 7590 + }, + { + "epoch": 5.869397217928903, + "grad_norm": 1.4066767692565918, + "learning_rate": 4.451451238898997e-05, + "loss": 0.4468, + "num_input_tokens_seen": 2555488, + "step": 7595 + }, + { + "epoch": 5.873261205564142, + "grad_norm": 1.2928438186645508, + "learning_rate": 4.4503969628656484e-05, + "loss": 0.4377, + "num_input_tokens_seen": 2557056, + "step": 7600 + }, + { + "epoch": 5.877125193199381, + "grad_norm": 0.7343565225601196, + "learning_rate": 4.449341799780081e-05, + "loss": 0.6771, + "num_input_tokens_seen": 2558816, + "step": 7605 + }, + { + "epoch": 5.8809891808346215, + "grad_norm": 0.6515517830848694, + "learning_rate": 4.448285750122188e-05, + "loss": 0.377, + "num_input_tokens_seen": 2560672, + "step": 7610 + }, + { + "epoch": 5.884853168469861, + "grad_norm": 1.5162814855575562, + "learning_rate": 4.44722881437227e-05, + "loss": 0.5334, + "num_input_tokens_seen": 2562400, + "step": 7615 + }, + { + "epoch": 5.8887171561051, + "grad_norm": 1.3403422832489014, + "learning_rate": 4.4461709930110236e-05, + "loss": 0.5034, + "num_input_tokens_seen": 2564384, + "step": 7620 + }, + { + "epoch": 5.89258114374034, + "grad_norm": 0.6364073157310486, + "learning_rate": 4.445112286519555e-05, + "loss": 0.4494, + "num_input_tokens_seen": 2566144, + "step": 7625 + }, + { + "epoch": 5.89644513137558, + "grad_norm": 1.0185022354125977, + "learning_rate": 4.44405269537937e-05, + "loss": 0.3917, + "num_input_tokens_seen": 2568096, + "step": 7630 + }, + { + "epoch": 5.900309119010819, + "grad_norm": 0.7399530410766602, + "learning_rate": 4.442992220072376e-05, + "loss": 0.5456, + "num_input_tokens_seen": 2569664, + "step": 7635 + }, + { + "epoch": 5.904173106646059, + "grad_norm": 0.7676199674606323, + "learning_rate": 4.4419308610808854e-05, + "loss": 0.499, + "num_input_tokens_seen": 2571520, + "step": 7640 + }, + { + "epoch": 5.9080370942812985, + "grad_norm": 1.2224538326263428, + "learning_rate": 4.440868618887608e-05, + "loss": 0.5733, + "num_input_tokens_seen": 2573120, + "step": 7645 + }, + { + "epoch": 5.911901081916538, + "grad_norm": 0.7079630494117737, + "learning_rate": 4.4398054939756606e-05, + "loss": 0.4191, + "num_input_tokens_seen": 2574784, + "step": 7650 + }, + { + "epoch": 5.915765069551777, + "grad_norm": 0.806405246257782, + "learning_rate": 4.4387414868285566e-05, + "loss": 0.5487, + "num_input_tokens_seen": 2576384, + "step": 7655 + }, + { + "epoch": 5.919629057187017, + "grad_norm": 0.9857746362686157, + "learning_rate": 4.437676597930214e-05, + "loss": 0.4399, + "num_input_tokens_seen": 2577984, + "step": 7660 + }, + { + "epoch": 5.923493044822257, + "grad_norm": 0.703589916229248, + "learning_rate": 4.436610827764951e-05, + "loss": 0.5454, + "num_input_tokens_seen": 2579712, + "step": 7665 + }, + { + "epoch": 5.927357032457496, + "grad_norm": 0.8538737893104553, + "learning_rate": 4.435544176817484e-05, + "loss": 0.7725, + "num_input_tokens_seen": 2581536, + "step": 7670 + }, + { + "epoch": 5.931221020092735, + "grad_norm": 0.9143222570419312, + "learning_rate": 4.4344766455729357e-05, + "loss": 0.4427, + "num_input_tokens_seen": 2583392, + "step": 7675 + }, + { + "epoch": 5.9350850077279755, + "grad_norm": 0.8711961507797241, + "learning_rate": 4.433408234516823e-05, + "loss": 0.5186, + "num_input_tokens_seen": 2584864, + "step": 7680 + }, + { + "epoch": 5.938948995363215, + "grad_norm": 0.6641161441802979, + "learning_rate": 4.4323389441350664e-05, + "loss": 0.4275, + "num_input_tokens_seen": 2586496, + "step": 7685 + }, + { + "epoch": 5.942812982998454, + "grad_norm": 1.0838510990142822, + "learning_rate": 4.4312687749139857e-05, + "loss": 0.4362, + "num_input_tokens_seen": 2588224, + "step": 7690 + }, + { + "epoch": 5.946676970633694, + "grad_norm": 0.6036345958709717, + "learning_rate": 4.4301977273403005e-05, + "loss": 0.4275, + "num_input_tokens_seen": 2590144, + "step": 7695 + }, + { + "epoch": 5.950540958268934, + "grad_norm": 0.9225648045539856, + "learning_rate": 4.4291258019011294e-05, + "loss": 0.6645, + "num_input_tokens_seen": 2591840, + "step": 7700 + }, + { + "epoch": 5.954404945904173, + "grad_norm": 0.8475850820541382, + "learning_rate": 4.42805299908399e-05, + "loss": 0.4697, + "num_input_tokens_seen": 2593472, + "step": 7705 + }, + { + "epoch": 5.958268933539412, + "grad_norm": 1.2149916887283325, + "learning_rate": 4.426979319376801e-05, + "loss": 0.5682, + "num_input_tokens_seen": 2595168, + "step": 7710 + }, + { + "epoch": 5.9621329211746525, + "grad_norm": 1.1008656024932861, + "learning_rate": 4.425904763267877e-05, + "loss": 0.5362, + "num_input_tokens_seen": 2596672, + "step": 7715 + }, + { + "epoch": 5.965996908809892, + "grad_norm": 0.9169328808784485, + "learning_rate": 4.424829331245932e-05, + "loss": 0.3957, + "num_input_tokens_seen": 2598432, + "step": 7720 + }, + { + "epoch": 5.969860896445131, + "grad_norm": 0.49159717559814453, + "learning_rate": 4.423753023800081e-05, + "loss": 0.6274, + "num_input_tokens_seen": 2600160, + "step": 7725 + }, + { + "epoch": 5.9737248840803705, + "grad_norm": 0.916046679019928, + "learning_rate": 4.4226758414198325e-05, + "loss": 0.7316, + "num_input_tokens_seen": 2601824, + "step": 7730 + }, + { + "epoch": 5.977588871715611, + "grad_norm": 1.0995888710021973, + "learning_rate": 4.421597784595098e-05, + "loss": 0.4408, + "num_input_tokens_seen": 2603648, + "step": 7735 + }, + { + "epoch": 5.98145285935085, + "grad_norm": 0.6648110747337341, + "learning_rate": 4.420518853816182e-05, + "loss": 0.384, + "num_input_tokens_seen": 2605440, + "step": 7740 + }, + { + "epoch": 5.985316846986089, + "grad_norm": 2.1021459102630615, + "learning_rate": 4.4194390495737915e-05, + "loss": 0.4883, + "num_input_tokens_seen": 2607264, + "step": 7745 + }, + { + "epoch": 5.9891808346213296, + "grad_norm": 0.8763461112976074, + "learning_rate": 4.418358372359025e-05, + "loss": 0.5948, + "num_input_tokens_seen": 2608992, + "step": 7750 + }, + { + "epoch": 5.993044822256569, + "grad_norm": 1.4945613145828247, + "learning_rate": 4.417276822663382e-05, + "loss": 0.6044, + "num_input_tokens_seen": 2610592, + "step": 7755 + }, + { + "epoch": 5.996908809891808, + "grad_norm": 1.093934178352356, + "learning_rate": 4.416194400978758e-05, + "loss": 0.4197, + "num_input_tokens_seen": 2612416, + "step": 7760 + }, + { + "epoch": 6.0, + "eval_loss": 0.4767547845840454, + "eval_runtime": 6.2477, + "eval_samples_per_second": 92.034, + "eval_steps_per_second": 23.049, + "num_input_tokens_seen": 2613648, + "step": 7764 + }, + { + "epoch": 6.0007727975270475, + "grad_norm": 0.9511370062828064, + "learning_rate": 4.415111107797445e-05, + "loss": 0.4822, + "num_input_tokens_seen": 2613968, + "step": 7765 + }, + { + "epoch": 6.004636785162288, + "grad_norm": 0.6964737176895142, + "learning_rate": 4.414026943612132e-05, + "loss": 0.4089, + "num_input_tokens_seen": 2615760, + "step": 7770 + }, + { + "epoch": 6.008500772797527, + "grad_norm": 0.7107896208763123, + "learning_rate": 4.412941908915901e-05, + "loss": 0.3892, + "num_input_tokens_seen": 2617104, + "step": 7775 + }, + { + "epoch": 6.012364760432766, + "grad_norm": 1.1768757104873657, + "learning_rate": 4.411856004202234e-05, + "loss": 0.5547, + "num_input_tokens_seen": 2618640, + "step": 7780 + }, + { + "epoch": 6.016228748068007, + "grad_norm": 0.5865057706832886, + "learning_rate": 4.4107692299650064e-05, + "loss": 0.4041, + "num_input_tokens_seen": 2620176, + "step": 7785 + }, + { + "epoch": 6.020092735703246, + "grad_norm": 0.6723847985267639, + "learning_rate": 4.4096815866984905e-05, + "loss": 0.3718, + "num_input_tokens_seen": 2621776, + "step": 7790 + }, + { + "epoch": 6.023956723338485, + "grad_norm": 1.3268084526062012, + "learning_rate": 4.408593074897352e-05, + "loss": 0.5021, + "num_input_tokens_seen": 2623568, + "step": 7795 + }, + { + "epoch": 6.0278207109737245, + "grad_norm": 0.7014374136924744, + "learning_rate": 4.407503695056653e-05, + "loss": 0.5732, + "num_input_tokens_seen": 2625200, + "step": 7800 + }, + { + "epoch": 6.031684698608965, + "grad_norm": 0.5426621437072754, + "learning_rate": 4.40641344767185e-05, + "loss": 0.4094, + "num_input_tokens_seen": 2626672, + "step": 7805 + }, + { + "epoch": 6.035548686244204, + "grad_norm": 0.682293713092804, + "learning_rate": 4.4053223332387936e-05, + "loss": 0.3883, + "num_input_tokens_seen": 2628272, + "step": 7810 + }, + { + "epoch": 6.039412673879443, + "grad_norm": 0.7965031862258911, + "learning_rate": 4.40423035225373e-05, + "loss": 0.4905, + "num_input_tokens_seen": 2629904, + "step": 7815 + }, + { + "epoch": 6.043276661514684, + "grad_norm": 0.6119529008865356, + "learning_rate": 4.403137505213297e-05, + "loss": 0.6375, + "num_input_tokens_seen": 2631408, + "step": 7820 + }, + { + "epoch": 6.047140649149923, + "grad_norm": 1.1031341552734375, + "learning_rate": 4.402043792614531e-05, + "loss": 0.4872, + "num_input_tokens_seen": 2633136, + "step": 7825 + }, + { + "epoch": 6.051004636785162, + "grad_norm": 0.8657554388046265, + "learning_rate": 4.400949214954856e-05, + "loss": 0.4523, + "num_input_tokens_seen": 2634832, + "step": 7830 + }, + { + "epoch": 6.0548686244204015, + "grad_norm": 0.5638367533683777, + "learning_rate": 4.3998537727320944e-05, + "loss": 0.4549, + "num_input_tokens_seen": 2636496, + "step": 7835 + }, + { + "epoch": 6.058732612055642, + "grad_norm": 0.7747939825057983, + "learning_rate": 4.398757466444459e-05, + "loss": 0.4757, + "num_input_tokens_seen": 2638192, + "step": 7840 + }, + { + "epoch": 6.062596599690881, + "grad_norm": 0.7880018949508667, + "learning_rate": 4.397660296590556e-05, + "loss": 0.5168, + "num_input_tokens_seen": 2639760, + "step": 7845 + }, + { + "epoch": 6.06646058732612, + "grad_norm": 0.8906763792037964, + "learning_rate": 4.396562263669386e-05, + "loss": 0.5341, + "num_input_tokens_seen": 2641264, + "step": 7850 + }, + { + "epoch": 6.07032457496136, + "grad_norm": 0.7394819855690002, + "learning_rate": 4.39546336818034e-05, + "loss": 0.3899, + "num_input_tokens_seen": 2642768, + "step": 7855 + }, + { + "epoch": 6.0741885625966, + "grad_norm": 0.5608252882957458, + "learning_rate": 4.394363610623203e-05, + "loss": 0.422, + "num_input_tokens_seen": 2644368, + "step": 7860 + }, + { + "epoch": 6.078052550231839, + "grad_norm": 1.342433214187622, + "learning_rate": 4.393262991498151e-05, + "loss": 0.58, + "num_input_tokens_seen": 2646096, + "step": 7865 + }, + { + "epoch": 6.0819165378670785, + "grad_norm": 1.8481080532073975, + "learning_rate": 4.3921615113057524e-05, + "loss": 0.5962, + "num_input_tokens_seen": 2647696, + "step": 7870 + }, + { + "epoch": 6.085780525502319, + "grad_norm": 1.1681493520736694, + "learning_rate": 4.391059170546966e-05, + "loss": 0.5796, + "num_input_tokens_seen": 2649200, + "step": 7875 + }, + { + "epoch": 6.089644513137558, + "grad_norm": 1.3246541023254395, + "learning_rate": 4.389955969723144e-05, + "loss": 0.4309, + "num_input_tokens_seen": 2650896, + "step": 7880 + }, + { + "epoch": 6.093508500772797, + "grad_norm": 1.2794638872146606, + "learning_rate": 4.3888519093360294e-05, + "loss": 0.4484, + "num_input_tokens_seen": 2652560, + "step": 7885 + }, + { + "epoch": 6.097372488408037, + "grad_norm": 0.7746421098709106, + "learning_rate": 4.387746989887753e-05, + "loss": 0.6395, + "num_input_tokens_seen": 2654544, + "step": 7890 + }, + { + "epoch": 6.101236476043277, + "grad_norm": 1.2523189783096313, + "learning_rate": 4.386641211880842e-05, + "loss": 0.7521, + "num_input_tokens_seen": 2656368, + "step": 7895 + }, + { + "epoch": 6.105100463678516, + "grad_norm": 1.2294808626174927, + "learning_rate": 4.385534575818208e-05, + "loss": 0.3963, + "num_input_tokens_seen": 2657808, + "step": 7900 + }, + { + "epoch": 6.1089644513137555, + "grad_norm": 1.2084044218063354, + "learning_rate": 4.384427082203157e-05, + "loss": 0.5087, + "num_input_tokens_seen": 2659792, + "step": 7905 + }, + { + "epoch": 6.112828438948996, + "grad_norm": 0.8015757203102112, + "learning_rate": 4.383318731539384e-05, + "loss": 0.3652, + "num_input_tokens_seen": 2661648, + "step": 7910 + }, + { + "epoch": 6.116692426584235, + "grad_norm": 0.9788172841072083, + "learning_rate": 4.3822095243309734e-05, + "loss": 0.5691, + "num_input_tokens_seen": 2663280, + "step": 7915 + }, + { + "epoch": 6.120556414219474, + "grad_norm": 0.6039725542068481, + "learning_rate": 4.381099461082399e-05, + "loss": 0.4072, + "num_input_tokens_seen": 2664944, + "step": 7920 + }, + { + "epoch": 6.124420401854714, + "grad_norm": 1.1409680843353271, + "learning_rate": 4.3799885422985234e-05, + "loss": 0.5852, + "num_input_tokens_seen": 2666768, + "step": 7925 + }, + { + "epoch": 6.128284389489954, + "grad_norm": 0.6797494888305664, + "learning_rate": 4.3788767684846e-05, + "loss": 0.5318, + "num_input_tokens_seen": 2668432, + "step": 7930 + }, + { + "epoch": 6.132148377125193, + "grad_norm": 0.7862567901611328, + "learning_rate": 4.377764140146271e-05, + "loss": 0.6507, + "num_input_tokens_seen": 2670160, + "step": 7935 + }, + { + "epoch": 6.1360123647604325, + "grad_norm": 0.9382320642471313, + "learning_rate": 4.3766506577895646e-05, + "loss": 0.5208, + "num_input_tokens_seen": 2671824, + "step": 7940 + }, + { + "epoch": 6.139876352395673, + "grad_norm": 0.8186858892440796, + "learning_rate": 4.375536321920901e-05, + "loss": 0.5101, + "num_input_tokens_seen": 2673456, + "step": 7945 + }, + { + "epoch": 6.143740340030912, + "grad_norm": 0.7574723362922668, + "learning_rate": 4.374421133047086e-05, + "loss": 0.4209, + "num_input_tokens_seen": 2674896, + "step": 7950 + }, + { + "epoch": 6.147604327666151, + "grad_norm": 0.7758065462112427, + "learning_rate": 4.373305091675314e-05, + "loss": 0.5038, + "num_input_tokens_seen": 2676208, + "step": 7955 + }, + { + "epoch": 6.151468315301391, + "grad_norm": 1.1338006258010864, + "learning_rate": 4.3721881983131674e-05, + "loss": 0.4374, + "num_input_tokens_seen": 2678000, + "step": 7960 + }, + { + "epoch": 6.155332302936631, + "grad_norm": 0.8261665105819702, + "learning_rate": 4.3710704534686166e-05, + "loss": 0.5159, + "num_input_tokens_seen": 2679664, + "step": 7965 + }, + { + "epoch": 6.15919629057187, + "grad_norm": 0.6478418707847595, + "learning_rate": 4.369951857650018e-05, + "loss": 0.4277, + "num_input_tokens_seen": 2681168, + "step": 7970 + }, + { + "epoch": 6.1630602782071096, + "grad_norm": 0.7758233547210693, + "learning_rate": 4.368832411366115e-05, + "loss": 0.4216, + "num_input_tokens_seen": 2682512, + "step": 7975 + }, + { + "epoch": 6.166924265842349, + "grad_norm": 0.7054232358932495, + "learning_rate": 4.36771211512604e-05, + "loss": 0.4393, + "num_input_tokens_seen": 2684176, + "step": 7980 + }, + { + "epoch": 6.170788253477589, + "grad_norm": 0.901898205280304, + "learning_rate": 4.36659096943931e-05, + "loss": 0.4579, + "num_input_tokens_seen": 2685968, + "step": 7985 + }, + { + "epoch": 6.174652241112828, + "grad_norm": 0.6477704644203186, + "learning_rate": 4.365468974815828e-05, + "loss": 0.4309, + "num_input_tokens_seen": 2687568, + "step": 7990 + }, + { + "epoch": 6.178516228748068, + "grad_norm": 0.7741856575012207, + "learning_rate": 4.3643461317658846e-05, + "loss": 0.453, + "num_input_tokens_seen": 2689456, + "step": 7995 + }, + { + "epoch": 6.182380216383308, + "grad_norm": 0.7519932389259338, + "learning_rate": 4.363222440800155e-05, + "loss": 0.5363, + "num_input_tokens_seen": 2690960, + "step": 8000 + }, + { + "epoch": 6.186244204018547, + "grad_norm": 0.6219892501831055, + "learning_rate": 4.3620979024297015e-05, + "loss": 0.4623, + "num_input_tokens_seen": 2692624, + "step": 8005 + }, + { + "epoch": 6.190108191653787, + "grad_norm": 1.075729250907898, + "learning_rate": 4.3609725171659696e-05, + "loss": 0.5182, + "num_input_tokens_seen": 2694288, + "step": 8010 + }, + { + "epoch": 6.193972179289026, + "grad_norm": 0.9659681916236877, + "learning_rate": 4.3598462855207935e-05, + "loss": 0.4549, + "num_input_tokens_seen": 2696048, + "step": 8015 + }, + { + "epoch": 6.197836166924266, + "grad_norm": 0.8298275470733643, + "learning_rate": 4.358719208006387e-05, + "loss": 0.4074, + "num_input_tokens_seen": 2697840, + "step": 8020 + }, + { + "epoch": 6.201700154559505, + "grad_norm": 0.6171674132347107, + "learning_rate": 4.357591285135354e-05, + "loss": 0.4231, + "num_input_tokens_seen": 2699280, + "step": 8025 + }, + { + "epoch": 6.205564142194745, + "grad_norm": 0.7163162231445312, + "learning_rate": 4.3564625174206794e-05, + "loss": 0.4336, + "num_input_tokens_seen": 2700592, + "step": 8030 + }, + { + "epoch": 6.209428129829985, + "grad_norm": 0.7469598054885864, + "learning_rate": 4.355332905375734e-05, + "loss": 0.4666, + "num_input_tokens_seen": 2702480, + "step": 8035 + }, + { + "epoch": 6.213292117465224, + "grad_norm": 0.526566743850708, + "learning_rate": 4.354202449514273e-05, + "loss": 0.4121, + "num_input_tokens_seen": 2704272, + "step": 8040 + }, + { + "epoch": 6.217156105100464, + "grad_norm": 0.6400463581085205, + "learning_rate": 4.3530711503504326e-05, + "loss": 0.4728, + "num_input_tokens_seen": 2706256, + "step": 8045 + }, + { + "epoch": 6.221020092735703, + "grad_norm": 1.2546371221542358, + "learning_rate": 4.351939008398736e-05, + "loss": 0.4845, + "num_input_tokens_seen": 2707888, + "step": 8050 + }, + { + "epoch": 6.224884080370943, + "grad_norm": 1.594642996788025, + "learning_rate": 4.350806024174087e-05, + "loss": 0.5633, + "num_input_tokens_seen": 2709840, + "step": 8055 + }, + { + "epoch": 6.228748068006182, + "grad_norm": 0.809033215045929, + "learning_rate": 4.3496721981917744e-05, + "loss": 0.6285, + "num_input_tokens_seen": 2711504, + "step": 8060 + }, + { + "epoch": 6.232612055641422, + "grad_norm": 0.9115098714828491, + "learning_rate": 4.3485375309674683e-05, + "loss": 0.4241, + "num_input_tokens_seen": 2713072, + "step": 8065 + }, + { + "epoch": 6.236476043276662, + "grad_norm": 0.8143134713172913, + "learning_rate": 4.347402023017223e-05, + "loss": 0.5038, + "num_input_tokens_seen": 2714512, + "step": 8070 + }, + { + "epoch": 6.240340030911901, + "grad_norm": 0.8577076196670532, + "learning_rate": 4.3462656748574745e-05, + "loss": 0.5146, + "num_input_tokens_seen": 2716144, + "step": 8075 + }, + { + "epoch": 6.244204018547141, + "grad_norm": 0.6553500890731812, + "learning_rate": 4.34512848700504e-05, + "loss": 0.5372, + "num_input_tokens_seen": 2717904, + "step": 8080 + }, + { + "epoch": 6.24806800618238, + "grad_norm": 0.8019552826881409, + "learning_rate": 4.34399045997712e-05, + "loss": 0.4686, + "num_input_tokens_seen": 2719792, + "step": 8085 + }, + { + "epoch": 6.25193199381762, + "grad_norm": 0.6724003553390503, + "learning_rate": 4.342851594291294e-05, + "loss": 0.5875, + "num_input_tokens_seen": 2722160, + "step": 8090 + }, + { + "epoch": 6.255795981452859, + "grad_norm": 0.6861489415168762, + "learning_rate": 4.341711890465528e-05, + "loss": 0.5755, + "num_input_tokens_seen": 2723920, + "step": 8095 + }, + { + "epoch": 6.259659969088099, + "grad_norm": 0.7811505198478699, + "learning_rate": 4.3405713490181645e-05, + "loss": 0.4038, + "num_input_tokens_seen": 2725776, + "step": 8100 + }, + { + "epoch": 6.263523956723338, + "grad_norm": 0.6824927926063538, + "learning_rate": 4.339429970467928e-05, + "loss": 0.4413, + "num_input_tokens_seen": 2727632, + "step": 8105 + }, + { + "epoch": 6.267387944358578, + "grad_norm": 0.5180096626281738, + "learning_rate": 4.338287755333925e-05, + "loss": 0.3672, + "num_input_tokens_seen": 2729264, + "step": 8110 + }, + { + "epoch": 6.271251931993818, + "grad_norm": 1.0090491771697998, + "learning_rate": 4.337144704135643e-05, + "loss": 0.3908, + "num_input_tokens_seen": 2730896, + "step": 8115 + }, + { + "epoch": 6.275115919629057, + "grad_norm": 0.578727126121521, + "learning_rate": 4.3360008173929454e-05, + "loss": 0.6331, + "num_input_tokens_seen": 2732752, + "step": 8120 + }, + { + "epoch": 6.278979907264297, + "grad_norm": 1.370762586593628, + "learning_rate": 4.3348560956260825e-05, + "loss": 0.8123, + "num_input_tokens_seen": 2734544, + "step": 8125 + }, + { + "epoch": 6.282843894899536, + "grad_norm": 0.5715421438217163, + "learning_rate": 4.333710539355678e-05, + "loss": 0.6331, + "num_input_tokens_seen": 2736176, + "step": 8130 + }, + { + "epoch": 6.286707882534776, + "grad_norm": 0.9384443163871765, + "learning_rate": 4.332564149102739e-05, + "loss": 0.4799, + "num_input_tokens_seen": 2737776, + "step": 8135 + }, + { + "epoch": 6.290571870170015, + "grad_norm": 0.902693510055542, + "learning_rate": 4.331416925388649e-05, + "loss": 0.5685, + "num_input_tokens_seen": 2739184, + "step": 8140 + }, + { + "epoch": 6.294435857805255, + "grad_norm": 1.246533751487732, + "learning_rate": 4.330268868735174e-05, + "loss": 0.4713, + "num_input_tokens_seen": 2740912, + "step": 8145 + }, + { + "epoch": 6.298299845440495, + "grad_norm": 0.9309785962104797, + "learning_rate": 4.329119979664457e-05, + "loss": 0.6015, + "num_input_tokens_seen": 2742608, + "step": 8150 + }, + { + "epoch": 6.302163833075734, + "grad_norm": 1.0002663135528564, + "learning_rate": 4.327970258699019e-05, + "loss": 0.3935, + "num_input_tokens_seen": 2744400, + "step": 8155 + }, + { + "epoch": 6.306027820710974, + "grad_norm": 0.6984794735908508, + "learning_rate": 4.32681970636176e-05, + "loss": 0.459, + "num_input_tokens_seen": 2746128, + "step": 8160 + }, + { + "epoch": 6.309891808346213, + "grad_norm": 0.6819292902946472, + "learning_rate": 4.3256683231759574e-05, + "loss": 0.4777, + "num_input_tokens_seen": 2747632, + "step": 8165 + }, + { + "epoch": 6.313755795981453, + "grad_norm": 0.707101047039032, + "learning_rate": 4.3245161096652684e-05, + "loss": 0.5478, + "num_input_tokens_seen": 2749200, + "step": 8170 + }, + { + "epoch": 6.317619783616692, + "grad_norm": 0.5957554578781128, + "learning_rate": 4.323363066353727e-05, + "loss": 0.3649, + "num_input_tokens_seen": 2750736, + "step": 8175 + }, + { + "epoch": 6.321483771251932, + "grad_norm": 0.8291443586349487, + "learning_rate": 4.322209193765742e-05, + "loss": 0.3667, + "num_input_tokens_seen": 2752336, + "step": 8180 + }, + { + "epoch": 6.325347758887172, + "grad_norm": 0.6070759892463684, + "learning_rate": 4.321054492426103e-05, + "loss": 0.5785, + "num_input_tokens_seen": 2754032, + "step": 8185 + }, + { + "epoch": 6.329211746522411, + "grad_norm": 1.2539927959442139, + "learning_rate": 4.319898962859976e-05, + "loss": 0.3916, + "num_input_tokens_seen": 2755728, + "step": 8190 + }, + { + "epoch": 6.333075734157651, + "grad_norm": 0.7654266357421875, + "learning_rate": 4.3187426055929006e-05, + "loss": 0.4145, + "num_input_tokens_seen": 2757296, + "step": 8195 + }, + { + "epoch": 6.3369397217928904, + "grad_norm": 0.9207203984260559, + "learning_rate": 4.317585421150797e-05, + "loss": 0.4968, + "num_input_tokens_seen": 2759056, + "step": 8200 + }, + { + "epoch": 6.34080370942813, + "grad_norm": 0.7546350955963135, + "learning_rate": 4.316427410059959e-05, + "loss": 0.4522, + "num_input_tokens_seen": 2760688, + "step": 8205 + }, + { + "epoch": 6.344667697063369, + "grad_norm": 0.9424193501472473, + "learning_rate": 4.315268572847056e-05, + "loss": 0.5274, + "num_input_tokens_seen": 2762416, + "step": 8210 + }, + { + "epoch": 6.348531684698609, + "grad_norm": 1.0519219636917114, + "learning_rate": 4.314108910039135e-05, + "loss": 0.5591, + "num_input_tokens_seen": 2764208, + "step": 8215 + }, + { + "epoch": 6.352395672333849, + "grad_norm": 0.5551214218139648, + "learning_rate": 4.3129484221636176e-05, + "loss": 0.3997, + "num_input_tokens_seen": 2765936, + "step": 8220 + }, + { + "epoch": 6.356259659969088, + "grad_norm": 0.7650814056396484, + "learning_rate": 4.3117871097483e-05, + "loss": 0.6127, + "num_input_tokens_seen": 2767600, + "step": 8225 + }, + { + "epoch": 6.360123647604327, + "grad_norm": 1.101555585861206, + "learning_rate": 4.310624973321355e-05, + "loss": 0.4058, + "num_input_tokens_seen": 2769616, + "step": 8230 + }, + { + "epoch": 6.3639876352395675, + "grad_norm": 1.479472041130066, + "learning_rate": 4.309462013411328e-05, + "loss": 0.4635, + "num_input_tokens_seen": 2771472, + "step": 8235 + }, + { + "epoch": 6.367851622874807, + "grad_norm": 1.6455119848251343, + "learning_rate": 4.308298230547142e-05, + "loss": 0.6223, + "num_input_tokens_seen": 2773136, + "step": 8240 + }, + { + "epoch": 6.371715610510046, + "grad_norm": 0.7940467596054077, + "learning_rate": 4.307133625258091e-05, + "loss": 0.6539, + "num_input_tokens_seen": 2774736, + "step": 8245 + }, + { + "epoch": 6.375579598145286, + "grad_norm": 0.9641621708869934, + "learning_rate": 4.3059681980738445e-05, + "loss": 0.5695, + "num_input_tokens_seen": 2776400, + "step": 8250 + }, + { + "epoch": 6.379443585780526, + "grad_norm": 0.9667071104049683, + "learning_rate": 4.304801949524446e-05, + "loss": 0.5175, + "num_input_tokens_seen": 2778064, + "step": 8255 + }, + { + "epoch": 6.383307573415765, + "grad_norm": 0.8663448095321655, + "learning_rate": 4.303634880140312e-05, + "loss": 0.6107, + "num_input_tokens_seen": 2779728, + "step": 8260 + }, + { + "epoch": 6.387171561051004, + "grad_norm": 0.8490527868270874, + "learning_rate": 4.302466990452233e-05, + "loss": 0.4014, + "num_input_tokens_seen": 2781168, + "step": 8265 + }, + { + "epoch": 6.3910355486862445, + "grad_norm": 1.053136944770813, + "learning_rate": 4.301298280991373e-05, + "loss": 0.4264, + "num_input_tokens_seen": 2782896, + "step": 8270 + }, + { + "epoch": 6.394899536321484, + "grad_norm": 1.0793218612670898, + "learning_rate": 4.3001287522892665e-05, + "loss": 0.4306, + "num_input_tokens_seen": 2784624, + "step": 8275 + }, + { + "epoch": 6.398763523956723, + "grad_norm": 0.9694398641586304, + "learning_rate": 4.298958404877823e-05, + "loss": 0.4365, + "num_input_tokens_seen": 2786352, + "step": 8280 + }, + { + "epoch": 6.402627511591963, + "grad_norm": 0.7393090724945068, + "learning_rate": 4.2977872392893235e-05, + "loss": 0.637, + "num_input_tokens_seen": 2787888, + "step": 8285 + }, + { + "epoch": 6.406491499227203, + "grad_norm": 0.6244948506355286, + "learning_rate": 4.296615256056421e-05, + "loss": 0.7178, + "num_input_tokens_seen": 2789840, + "step": 8290 + }, + { + "epoch": 6.410355486862442, + "grad_norm": 0.813159704208374, + "learning_rate": 4.295442455712141e-05, + "loss": 0.4296, + "num_input_tokens_seen": 2791408, + "step": 8295 + }, + { + "epoch": 6.414219474497681, + "grad_norm": 0.6073101758956909, + "learning_rate": 4.294268838789879e-05, + "loss": 0.3383, + "num_input_tokens_seen": 2793136, + "step": 8300 + }, + { + "epoch": 6.4180834621329215, + "grad_norm": 0.796737551689148, + "learning_rate": 4.293094405823404e-05, + "loss": 0.4049, + "num_input_tokens_seen": 2794928, + "step": 8305 + }, + { + "epoch": 6.421947449768161, + "grad_norm": 1.370376706123352, + "learning_rate": 4.2919191573468555e-05, + "loss": 0.7932, + "num_input_tokens_seen": 2796688, + "step": 8310 + }, + { + "epoch": 6.4258114374034, + "grad_norm": 1.2308462858200073, + "learning_rate": 4.290743093894742e-05, + "loss": 0.6041, + "num_input_tokens_seen": 2798576, + "step": 8315 + }, + { + "epoch": 6.42967542503864, + "grad_norm": 0.8360562324523926, + "learning_rate": 4.2895662160019444e-05, + "loss": 0.4442, + "num_input_tokens_seen": 2800464, + "step": 8320 + }, + { + "epoch": 6.43353941267388, + "grad_norm": 0.6387249231338501, + "learning_rate": 4.288388524203716e-05, + "loss": 0.4282, + "num_input_tokens_seen": 2802096, + "step": 8325 + }, + { + "epoch": 6.437403400309119, + "grad_norm": 0.6584174633026123, + "learning_rate": 4.2872100190356756e-05, + "loss": 0.4983, + "num_input_tokens_seen": 2803728, + "step": 8330 + }, + { + "epoch": 6.441267387944358, + "grad_norm": 0.6397181153297424, + "learning_rate": 4.286030701033815e-05, + "loss": 0.4294, + "num_input_tokens_seen": 2805168, + "step": 8335 + }, + { + "epoch": 6.4451313755795985, + "grad_norm": 0.8850130438804626, + "learning_rate": 4.2848505707344965e-05, + "loss": 0.4906, + "num_input_tokens_seen": 2806672, + "step": 8340 + }, + { + "epoch": 6.448995363214838, + "grad_norm": 0.6843503713607788, + "learning_rate": 4.283669628674449e-05, + "loss": 0.6889, + "num_input_tokens_seen": 2808944, + "step": 8345 + }, + { + "epoch": 6.452859350850077, + "grad_norm": 0.6080131530761719, + "learning_rate": 4.282487875390772e-05, + "loss": 0.3966, + "num_input_tokens_seen": 2810736, + "step": 8350 + }, + { + "epoch": 6.456723338485316, + "grad_norm": 0.7903328537940979, + "learning_rate": 4.2813053114209345e-05, + "loss": 0.4682, + "num_input_tokens_seen": 2812432, + "step": 8355 + }, + { + "epoch": 6.460587326120557, + "grad_norm": 0.7933758497238159, + "learning_rate": 4.280121937302774e-05, + "loss": 0.4064, + "num_input_tokens_seen": 2814192, + "step": 8360 + }, + { + "epoch": 6.464451313755796, + "grad_norm": 0.7146584987640381, + "learning_rate": 4.2789377535744955e-05, + "loss": 0.4035, + "num_input_tokens_seen": 2815856, + "step": 8365 + }, + { + "epoch": 6.468315301391035, + "grad_norm": 1.0519225597381592, + "learning_rate": 4.2777527607746725e-05, + "loss": 0.5777, + "num_input_tokens_seen": 2817584, + "step": 8370 + }, + { + "epoch": 6.4721792890262755, + "grad_norm": 0.9685955047607422, + "learning_rate": 4.2765669594422486e-05, + "loss": 0.4632, + "num_input_tokens_seen": 2819216, + "step": 8375 + }, + { + "epoch": 6.476043276661515, + "grad_norm": 0.7288817167282104, + "learning_rate": 4.2753803501165304e-05, + "loss": 0.4398, + "num_input_tokens_seen": 2820976, + "step": 8380 + }, + { + "epoch": 6.479907264296754, + "grad_norm": 1.104540228843689, + "learning_rate": 4.2741929333371986e-05, + "loss": 0.5529, + "num_input_tokens_seen": 2822736, + "step": 8385 + }, + { + "epoch": 6.483771251931993, + "grad_norm": 0.757056474685669, + "learning_rate": 4.2730047096442935e-05, + "loss": 0.4757, + "num_input_tokens_seen": 2824272, + "step": 8390 + }, + { + "epoch": 6.487635239567234, + "grad_norm": 0.8333421945571899, + "learning_rate": 4.271815679578229e-05, + "loss": 0.3978, + "num_input_tokens_seen": 2825808, + "step": 8395 + }, + { + "epoch": 6.491499227202473, + "grad_norm": 0.724192202091217, + "learning_rate": 4.270625843679783e-05, + "loss": 0.4235, + "num_input_tokens_seen": 2827504, + "step": 8400 + }, + { + "epoch": 6.495363214837712, + "grad_norm": 0.8652176856994629, + "learning_rate": 4.2694352024901e-05, + "loss": 0.5377, + "num_input_tokens_seen": 2829264, + "step": 8405 + }, + { + "epoch": 6.4992272024729525, + "grad_norm": 0.6312440037727356, + "learning_rate": 4.268243756550689e-05, + "loss": 0.5196, + "num_input_tokens_seen": 2831088, + "step": 8410 + }, + { + "epoch": 6.503091190108192, + "grad_norm": 0.9621128439903259, + "learning_rate": 4.267051506403428e-05, + "loss": 0.4047, + "num_input_tokens_seen": 2832528, + "step": 8415 + }, + { + "epoch": 6.506955177743431, + "grad_norm": 1.2427406311035156, + "learning_rate": 4.26585845259056e-05, + "loss": 0.5454, + "num_input_tokens_seen": 2834000, + "step": 8420 + }, + { + "epoch": 6.5108191653786704, + "grad_norm": 0.7084592580795288, + "learning_rate": 4.264664595654692e-05, + "loss": 0.6171, + "num_input_tokens_seen": 2835440, + "step": 8425 + }, + { + "epoch": 6.514683153013911, + "grad_norm": 0.695348858833313, + "learning_rate": 4.263469936138797e-05, + "loss": 0.5387, + "num_input_tokens_seen": 2837296, + "step": 8430 + }, + { + "epoch": 6.51854714064915, + "grad_norm": 0.8794716000556946, + "learning_rate": 4.2622744745862154e-05, + "loss": 0.5656, + "num_input_tokens_seen": 2839120, + "step": 8435 + }, + { + "epoch": 6.522411128284389, + "grad_norm": 0.6547197103500366, + "learning_rate": 4.2610782115406483e-05, + "loss": 0.4026, + "num_input_tokens_seen": 2840848, + "step": 8440 + }, + { + "epoch": 6.5262751159196295, + "grad_norm": 0.6912714242935181, + "learning_rate": 4.259881147546164e-05, + "loss": 0.4302, + "num_input_tokens_seen": 2842224, + "step": 8445 + }, + { + "epoch": 6.530139103554869, + "grad_norm": 0.7575278878211975, + "learning_rate": 4.258683283147195e-05, + "loss": 0.4564, + "num_input_tokens_seen": 2843856, + "step": 8450 + }, + { + "epoch": 6.534003091190108, + "grad_norm": 0.7221788167953491, + "learning_rate": 4.2574846188885356e-05, + "loss": 0.4089, + "num_input_tokens_seen": 2845520, + "step": 8455 + }, + { + "epoch": 6.5378670788253475, + "grad_norm": 0.9054480791091919, + "learning_rate": 4.256285155315346e-05, + "loss": 0.4264, + "num_input_tokens_seen": 2847056, + "step": 8460 + }, + { + "epoch": 6.541731066460588, + "grad_norm": 1.0897403955459595, + "learning_rate": 4.25508489297315e-05, + "loss": 0.5265, + "num_input_tokens_seen": 2848976, + "step": 8465 + }, + { + "epoch": 6.545595054095827, + "grad_norm": 0.8980346918106079, + "learning_rate": 4.253883832407835e-05, + "loss": 0.4398, + "num_input_tokens_seen": 2850672, + "step": 8470 + }, + { + "epoch": 6.549459041731066, + "grad_norm": 1.4755980968475342, + "learning_rate": 4.2526819741656485e-05, + "loss": 0.5826, + "num_input_tokens_seen": 2852240, + "step": 8475 + }, + { + "epoch": 6.553323029366306, + "grad_norm": 0.9522770047187805, + "learning_rate": 4.2514793187932036e-05, + "loss": 0.5088, + "num_input_tokens_seen": 2853808, + "step": 8480 + }, + { + "epoch": 6.557187017001546, + "grad_norm": 0.6687210202217102, + "learning_rate": 4.250275866837475e-05, + "loss": 0.3806, + "num_input_tokens_seen": 2855472, + "step": 8485 + }, + { + "epoch": 6.561051004636785, + "grad_norm": 0.6765424013137817, + "learning_rate": 4.2490716188458014e-05, + "loss": 0.4258, + "num_input_tokens_seen": 2857296, + "step": 8490 + }, + { + "epoch": 6.5649149922720245, + "grad_norm": 0.7947444319725037, + "learning_rate": 4.2478665753658794e-05, + "loss": 0.4297, + "num_input_tokens_seen": 2858800, + "step": 8495 + }, + { + "epoch": 6.568778979907265, + "grad_norm": 0.7555139660835266, + "learning_rate": 4.246660736945773e-05, + "loss": 0.4771, + "num_input_tokens_seen": 2860496, + "step": 8500 + }, + { + "epoch": 6.572642967542504, + "grad_norm": 0.9810084104537964, + "learning_rate": 4.2454541041339027e-05, + "loss": 0.5246, + "num_input_tokens_seen": 2862416, + "step": 8505 + }, + { + "epoch": 6.576506955177743, + "grad_norm": 1.0549488067626953, + "learning_rate": 4.2442466774790516e-05, + "loss": 0.6272, + "num_input_tokens_seen": 2864112, + "step": 8510 + }, + { + "epoch": 6.580370942812983, + "grad_norm": 0.8754095435142517, + "learning_rate": 4.243038457530366e-05, + "loss": 0.4945, + "num_input_tokens_seen": 2865808, + "step": 8515 + }, + { + "epoch": 6.584234930448223, + "grad_norm": 0.8457027077674866, + "learning_rate": 4.241829444837352e-05, + "loss": 0.3815, + "num_input_tokens_seen": 2867472, + "step": 8520 + }, + { + "epoch": 6.588098918083462, + "grad_norm": 0.6663288474082947, + "learning_rate": 4.240619639949874e-05, + "loss": 0.4389, + "num_input_tokens_seen": 2869104, + "step": 8525 + }, + { + "epoch": 6.5919629057187015, + "grad_norm": 1.1967979669570923, + "learning_rate": 4.239409043418161e-05, + "loss": 0.5479, + "num_input_tokens_seen": 2870832, + "step": 8530 + }, + { + "epoch": 6.595826893353941, + "grad_norm": 0.838874876499176, + "learning_rate": 4.2381976557927974e-05, + "loss": 0.4079, + "num_input_tokens_seen": 2872304, + "step": 8535 + }, + { + "epoch": 6.599690880989181, + "grad_norm": 0.8656036257743835, + "learning_rate": 4.2369854776247295e-05, + "loss": 0.3803, + "num_input_tokens_seen": 2873968, + "step": 8540 + }, + { + "epoch": 6.60355486862442, + "grad_norm": 1.0225456953048706, + "learning_rate": 4.235772509465266e-05, + "loss": 0.5844, + "num_input_tokens_seen": 2875728, + "step": 8545 + }, + { + "epoch": 6.60741885625966, + "grad_norm": 0.760628342628479, + "learning_rate": 4.234558751866068e-05, + "loss": 0.6693, + "num_input_tokens_seen": 2877488, + "step": 8550 + }, + { + "epoch": 6.6112828438949, + "grad_norm": 0.6566743850708008, + "learning_rate": 4.2333442053791625e-05, + "loss": 0.5518, + "num_input_tokens_seen": 2879216, + "step": 8555 + }, + { + "epoch": 6.615146831530139, + "grad_norm": 0.5627212524414062, + "learning_rate": 4.2321288705569315e-05, + "loss": 0.4608, + "num_input_tokens_seen": 2880912, + "step": 8560 + }, + { + "epoch": 6.6190108191653785, + "grad_norm": 0.755752444267273, + "learning_rate": 4.230912747952118e-05, + "loss": 0.5016, + "num_input_tokens_seen": 2882768, + "step": 8565 + }, + { + "epoch": 6.622874806800619, + "grad_norm": 1.6357156038284302, + "learning_rate": 4.22969583811782e-05, + "loss": 0.4838, + "num_input_tokens_seen": 2884560, + "step": 8570 + }, + { + "epoch": 6.626738794435858, + "grad_norm": 0.5755021572113037, + "learning_rate": 4.228478141607496e-05, + "loss": 0.463, + "num_input_tokens_seen": 2886064, + "step": 8575 + }, + { + "epoch": 6.630602782071097, + "grad_norm": 0.8826261162757874, + "learning_rate": 4.227259658974961e-05, + "loss": 0.4947, + "num_input_tokens_seen": 2887664, + "step": 8580 + }, + { + "epoch": 6.634466769706337, + "grad_norm": 0.7268012762069702, + "learning_rate": 4.2260403907743906e-05, + "loss": 0.5441, + "num_input_tokens_seen": 2889328, + "step": 8585 + }, + { + "epoch": 6.638330757341577, + "grad_norm": 0.8044947385787964, + "learning_rate": 4.224820337560313e-05, + "loss": 0.3971, + "num_input_tokens_seen": 2891440, + "step": 8590 + }, + { + "epoch": 6.642194744976816, + "grad_norm": 1.3200652599334717, + "learning_rate": 4.2235994998876156e-05, + "loss": 0.5341, + "num_input_tokens_seen": 2893264, + "step": 8595 + }, + { + "epoch": 6.6460587326120555, + "grad_norm": 0.48483628034591675, + "learning_rate": 4.222377878311544e-05, + "loss": 0.4373, + "num_input_tokens_seen": 2894768, + "step": 8600 + }, + { + "epoch": 6.649922720247295, + "grad_norm": 0.6385105848312378, + "learning_rate": 4.2211554733876984e-05, + "loss": 0.6484, + "num_input_tokens_seen": 2896624, + "step": 8605 + }, + { + "epoch": 6.653786707882535, + "grad_norm": 1.8929475545883179, + "learning_rate": 4.2199322856720356e-05, + "loss": 0.5133, + "num_input_tokens_seen": 2898608, + "step": 8610 + }, + { + "epoch": 6.657650695517774, + "grad_norm": 1.4383915662765503, + "learning_rate": 4.218708315720869e-05, + "loss": 0.5632, + "num_input_tokens_seen": 2900368, + "step": 8615 + }, + { + "epoch": 6.661514683153014, + "grad_norm": 0.9022157788276672, + "learning_rate": 4.217483564090868e-05, + "loss": 0.5908, + "num_input_tokens_seen": 2902128, + "step": 8620 + }, + { + "epoch": 6.665378670788254, + "grad_norm": 0.8761652708053589, + "learning_rate": 4.216258031339056e-05, + "loss": 0.4389, + "num_input_tokens_seen": 2903792, + "step": 8625 + }, + { + "epoch": 6.669242658423493, + "grad_norm": 0.6691360473632812, + "learning_rate": 4.2150317180228135e-05, + "loss": 0.3563, + "num_input_tokens_seen": 2905232, + "step": 8630 + }, + { + "epoch": 6.6731066460587325, + "grad_norm": 0.8297411203384399, + "learning_rate": 4.2138046246998746e-05, + "loss": 0.3639, + "num_input_tokens_seen": 2906864, + "step": 8635 + }, + { + "epoch": 6.676970633693972, + "grad_norm": 0.6346117854118347, + "learning_rate": 4.2125767519283285e-05, + "loss": 0.4541, + "num_input_tokens_seen": 2908816, + "step": 8640 + }, + { + "epoch": 6.680834621329212, + "grad_norm": 0.9302797317504883, + "learning_rate": 4.21134810026662e-05, + "loss": 0.5643, + "num_input_tokens_seen": 2910544, + "step": 8645 + }, + { + "epoch": 6.684698608964451, + "grad_norm": 0.8519047498703003, + "learning_rate": 4.210118670273546e-05, + "loss": 0.5365, + "num_input_tokens_seen": 2912048, + "step": 8650 + }, + { + "epoch": 6.688562596599691, + "grad_norm": 0.6449496746063232, + "learning_rate": 4.20888846250826e-05, + "loss": 0.412, + "num_input_tokens_seen": 2913968, + "step": 8655 + }, + { + "epoch": 6.69242658423493, + "grad_norm": 0.9333447813987732, + "learning_rate": 4.2076574775302665e-05, + "loss": 0.3991, + "num_input_tokens_seen": 2915632, + "step": 8660 + }, + { + "epoch": 6.69629057187017, + "grad_norm": 0.8085376024246216, + "learning_rate": 4.206425715899425e-05, + "loss": 0.4635, + "num_input_tokens_seen": 2917168, + "step": 8665 + }, + { + "epoch": 6.7001545595054095, + "grad_norm": 1.0905505418777466, + "learning_rate": 4.205193178175949e-05, + "loss": 0.4281, + "num_input_tokens_seen": 2918928, + "step": 8670 + }, + { + "epoch": 6.704018547140649, + "grad_norm": 1.9249273538589478, + "learning_rate": 4.203959864920404e-05, + "loss": 0.657, + "num_input_tokens_seen": 2920688, + "step": 8675 + }, + { + "epoch": 6.707882534775889, + "grad_norm": 2.2109618186950684, + "learning_rate": 4.202725776693707e-05, + "loss": 0.5467, + "num_input_tokens_seen": 2922320, + "step": 8680 + }, + { + "epoch": 6.711746522411128, + "grad_norm": 1.1500701904296875, + "learning_rate": 4.2014909140571305e-05, + "loss": 0.5754, + "num_input_tokens_seen": 2923920, + "step": 8685 + }, + { + "epoch": 6.715610510046368, + "grad_norm": 1.2891563177108765, + "learning_rate": 4.2002552775722956e-05, + "loss": 0.413, + "num_input_tokens_seen": 2925552, + "step": 8690 + }, + { + "epoch": 6.719474497681608, + "grad_norm": 0.8528351187705994, + "learning_rate": 4.199018867801179e-05, + "loss": 0.4949, + "num_input_tokens_seen": 2927280, + "step": 8695 + }, + { + "epoch": 6.723338485316847, + "grad_norm": 1.287847638130188, + "learning_rate": 4.197781685306105e-05, + "loss": 0.6078, + "num_input_tokens_seen": 2928944, + "step": 8700 + }, + { + "epoch": 6.7272024729520865, + "grad_norm": 0.8117544651031494, + "learning_rate": 4.196543730649754e-05, + "loss": 0.5858, + "num_input_tokens_seen": 2930704, + "step": 8705 + }, + { + "epoch": 6.731066460587326, + "grad_norm": 0.723794162273407, + "learning_rate": 4.1953050043951537e-05, + "loss": 0.4139, + "num_input_tokens_seen": 2932432, + "step": 8710 + }, + { + "epoch": 6.734930448222566, + "grad_norm": 0.5914732813835144, + "learning_rate": 4.194065507105685e-05, + "loss": 0.4896, + "num_input_tokens_seen": 2934160, + "step": 8715 + }, + { + "epoch": 6.738794435857805, + "grad_norm": 0.8705365657806396, + "learning_rate": 4.192825239345077e-05, + "loss": 0.5418, + "num_input_tokens_seen": 2935760, + "step": 8720 + }, + { + "epoch": 6.742658423493045, + "grad_norm": 1.0129063129425049, + "learning_rate": 4.191584201677414e-05, + "loss": 0.5112, + "num_input_tokens_seen": 2937552, + "step": 8725 + }, + { + "epoch": 6.746522411128284, + "grad_norm": 1.112492561340332, + "learning_rate": 4.190342394667124e-05, + "loss": 0.4522, + "num_input_tokens_seen": 2939152, + "step": 8730 + }, + { + "epoch": 6.750386398763524, + "grad_norm": 1.3260986804962158, + "learning_rate": 4.189099818878991e-05, + "loss": 0.583, + "num_input_tokens_seen": 2940880, + "step": 8735 + }, + { + "epoch": 6.7542503863987635, + "grad_norm": 0.6910870671272278, + "learning_rate": 4.1878564748781446e-05, + "loss": 0.4325, + "num_input_tokens_seen": 2942544, + "step": 8740 + }, + { + "epoch": 6.758114374034003, + "grad_norm": 0.5814184546470642, + "learning_rate": 4.186612363230065e-05, + "loss": 0.4057, + "num_input_tokens_seen": 2944208, + "step": 8745 + }, + { + "epoch": 6.761978361669243, + "grad_norm": 1.0353422164916992, + "learning_rate": 4.185367484500582e-05, + "loss": 0.6035, + "num_input_tokens_seen": 2946000, + "step": 8750 + }, + { + "epoch": 6.765842349304482, + "grad_norm": 0.8195891976356506, + "learning_rate": 4.184121839255873e-05, + "loss": 0.3968, + "num_input_tokens_seen": 2947504, + "step": 8755 + }, + { + "epoch": 6.769706336939722, + "grad_norm": 0.8832127451896667, + "learning_rate": 4.182875428062467e-05, + "loss": 0.5021, + "num_input_tokens_seen": 2949328, + "step": 8760 + }, + { + "epoch": 6.773570324574961, + "grad_norm": 0.7900398373603821, + "learning_rate": 4.181628251487237e-05, + "loss": 0.4996, + "num_input_tokens_seen": 2951024, + "step": 8765 + }, + { + "epoch": 6.777434312210201, + "grad_norm": 0.8847339153289795, + "learning_rate": 4.1803803100974075e-05, + "loss": 0.4298, + "num_input_tokens_seen": 2952656, + "step": 8770 + }, + { + "epoch": 6.7812982998454405, + "grad_norm": 0.8251826167106628, + "learning_rate": 4.17913160446055e-05, + "loss": 0.4984, + "num_input_tokens_seen": 2954416, + "step": 8775 + }, + { + "epoch": 6.78516228748068, + "grad_norm": 1.0853108167648315, + "learning_rate": 4.177882135144582e-05, + "loss": 0.4092, + "num_input_tokens_seen": 2955792, + "step": 8780 + }, + { + "epoch": 6.789026275115919, + "grad_norm": 0.9221647381782532, + "learning_rate": 4.1766319027177715e-05, + "loss": 0.4725, + "num_input_tokens_seen": 2957488, + "step": 8785 + }, + { + "epoch": 6.792890262751159, + "grad_norm": 0.9622830748558044, + "learning_rate": 4.1753809077487304e-05, + "loss": 0.4044, + "num_input_tokens_seen": 2958992, + "step": 8790 + }, + { + "epoch": 6.796754250386399, + "grad_norm": 0.5433503985404968, + "learning_rate": 4.174129150806419e-05, + "loss": 0.351, + "num_input_tokens_seen": 2960624, + "step": 8795 + }, + { + "epoch": 6.800618238021638, + "grad_norm": 0.9602607488632202, + "learning_rate": 4.172876632460143e-05, + "loss": 0.4613, + "num_input_tokens_seen": 2962256, + "step": 8800 + }, + { + "epoch": 6.804482225656878, + "grad_norm": 1.047360897064209, + "learning_rate": 4.1716233532795564e-05, + "loss": 0.575, + "num_input_tokens_seen": 2963824, + "step": 8805 + }, + { + "epoch": 6.8083462132921175, + "grad_norm": 0.8497563600540161, + "learning_rate": 4.170369313834659e-05, + "loss": 0.4859, + "num_input_tokens_seen": 2965648, + "step": 8810 + }, + { + "epoch": 6.812210200927357, + "grad_norm": 0.8853704929351807, + "learning_rate": 4.1691145146957934e-05, + "loss": 0.3571, + "num_input_tokens_seen": 2967312, + "step": 8815 + }, + { + "epoch": 6.816074188562597, + "grad_norm": 1.377980351448059, + "learning_rate": 4.16785895643365e-05, + "loss": 0.5866, + "num_input_tokens_seen": 2969104, + "step": 8820 + }, + { + "epoch": 6.819938176197836, + "grad_norm": 0.9950586557388306, + "learning_rate": 4.1666026396192656e-05, + "loss": 0.5049, + "num_input_tokens_seen": 2970736, + "step": 8825 + }, + { + "epoch": 6.823802163833076, + "grad_norm": 1.242020845413208, + "learning_rate": 4.16534556482402e-05, + "loss": 0.38, + "num_input_tokens_seen": 2972336, + "step": 8830 + }, + { + "epoch": 6.827666151468315, + "grad_norm": 0.5972198843955994, + "learning_rate": 4.164087732619637e-05, + "loss": 0.5279, + "num_input_tokens_seen": 2973776, + "step": 8835 + }, + { + "epoch": 6.831530139103555, + "grad_norm": 1.3315448760986328, + "learning_rate": 4.162829143578189e-05, + "loss": 0.846, + "num_input_tokens_seen": 2975344, + "step": 8840 + }, + { + "epoch": 6.8353941267387945, + "grad_norm": 2.340120315551758, + "learning_rate": 4.161569798272087e-05, + "loss": 0.7415, + "num_input_tokens_seen": 2976944, + "step": 8845 + }, + { + "epoch": 6.839258114374034, + "grad_norm": 0.7845291495323181, + "learning_rate": 4.16030969727409e-05, + "loss": 0.4658, + "num_input_tokens_seen": 2978416, + "step": 8850 + }, + { + "epoch": 6.843122102009273, + "grad_norm": 1.4571284055709839, + "learning_rate": 4.1590488411573006e-05, + "loss": 0.6137, + "num_input_tokens_seen": 2980240, + "step": 8855 + }, + { + "epoch": 6.846986089644513, + "grad_norm": 0.6776977181434631, + "learning_rate": 4.157787230495161e-05, + "loss": 0.3709, + "num_input_tokens_seen": 2981968, + "step": 8860 + }, + { + "epoch": 6.850850077279753, + "grad_norm": 1.3310073614120483, + "learning_rate": 4.156524865861462e-05, + "loss": 0.4012, + "num_input_tokens_seen": 2983536, + "step": 8865 + }, + { + "epoch": 6.854714064914992, + "grad_norm": 0.6591601371765137, + "learning_rate": 4.155261747830332e-05, + "loss": 0.4019, + "num_input_tokens_seen": 2985072, + "step": 8870 + }, + { + "epoch": 6.858578052550232, + "grad_norm": 1.9005932807922363, + "learning_rate": 4.153997876976248e-05, + "loss": 0.4897, + "num_input_tokens_seen": 2986704, + "step": 8875 + }, + { + "epoch": 6.8624420401854715, + "grad_norm": 0.9107431769371033, + "learning_rate": 4.152733253874023e-05, + "loss": 0.5082, + "num_input_tokens_seen": 2988304, + "step": 8880 + }, + { + "epoch": 6.866306027820711, + "grad_norm": 0.8762601613998413, + "learning_rate": 4.151467879098817e-05, + "loss": 0.5711, + "num_input_tokens_seen": 2989904, + "step": 8885 + }, + { + "epoch": 6.87017001545595, + "grad_norm": 0.8641261458396912, + "learning_rate": 4.150201753226129e-05, + "loss": 0.526, + "num_input_tokens_seen": 2991504, + "step": 8890 + }, + { + "epoch": 6.87403400309119, + "grad_norm": 0.5810456275939941, + "learning_rate": 4.148934876831801e-05, + "loss": 0.3998, + "num_input_tokens_seen": 2993232, + "step": 8895 + }, + { + "epoch": 6.87789799072643, + "grad_norm": 0.6742425560951233, + "learning_rate": 4.1476672504920154e-05, + "loss": 0.4927, + "num_input_tokens_seen": 2994864, + "step": 8900 + }, + { + "epoch": 6.881761978361669, + "grad_norm": 1.8628501892089844, + "learning_rate": 4.146398874783297e-05, + "loss": 0.5474, + "num_input_tokens_seen": 2996592, + "step": 8905 + }, + { + "epoch": 6.885625965996908, + "grad_norm": 1.5452954769134521, + "learning_rate": 4.1451297502825116e-05, + "loss": 0.4614, + "num_input_tokens_seen": 2998544, + "step": 8910 + }, + { + "epoch": 6.8894899536321486, + "grad_norm": 0.5614861249923706, + "learning_rate": 4.143859877566863e-05, + "loss": 0.366, + "num_input_tokens_seen": 3000112, + "step": 8915 + }, + { + "epoch": 6.893353941267388, + "grad_norm": 0.811635434627533, + "learning_rate": 4.1425892572138966e-05, + "loss": 0.4935, + "num_input_tokens_seen": 3001712, + "step": 8920 + }, + { + "epoch": 6.897217928902627, + "grad_norm": 0.6274958252906799, + "learning_rate": 4.141317889801499e-05, + "loss": 0.4643, + "num_input_tokens_seen": 3003312, + "step": 8925 + }, + { + "epoch": 6.901081916537867, + "grad_norm": 0.816335916519165, + "learning_rate": 4.140045775907896e-05, + "loss": 0.4005, + "num_input_tokens_seen": 3005008, + "step": 8930 + }, + { + "epoch": 6.904945904173107, + "grad_norm": 0.8399748206138611, + "learning_rate": 4.138772916111653e-05, + "loss": 0.5514, + "num_input_tokens_seen": 3006864, + "step": 8935 + }, + { + "epoch": 6.908809891808346, + "grad_norm": 1.609634280204773, + "learning_rate": 4.137499310991672e-05, + "loss": 0.4646, + "num_input_tokens_seen": 3008848, + "step": 8940 + }, + { + "epoch": 6.912673879443586, + "grad_norm": 1.333967924118042, + "learning_rate": 4.136224961127199e-05, + "loss": 0.4646, + "num_input_tokens_seen": 3010800, + "step": 8945 + }, + { + "epoch": 6.916537867078826, + "grad_norm": 0.8874390721321106, + "learning_rate": 4.1349498670978134e-05, + "loss": 0.4931, + "num_input_tokens_seen": 3012624, + "step": 8950 + }, + { + "epoch": 6.920401854714065, + "grad_norm": 0.6711632013320923, + "learning_rate": 4.1336740294834384e-05, + "loss": 0.4286, + "num_input_tokens_seen": 3014256, + "step": 8955 + }, + { + "epoch": 6.924265842349304, + "grad_norm": 0.6471444368362427, + "learning_rate": 4.13239744886433e-05, + "loss": 0.5766, + "num_input_tokens_seen": 3016048, + "step": 8960 + }, + { + "epoch": 6.928129829984544, + "grad_norm": 0.7349487543106079, + "learning_rate": 4.1311201258210867e-05, + "loss": 0.3671, + "num_input_tokens_seen": 3017648, + "step": 8965 + }, + { + "epoch": 6.931993817619784, + "grad_norm": 0.6894106268882751, + "learning_rate": 4.1298420609346414e-05, + "loss": 0.605, + "num_input_tokens_seen": 3019472, + "step": 8970 + }, + { + "epoch": 6.935857805255023, + "grad_norm": 1.1568249464035034, + "learning_rate": 4.128563254786266e-05, + "loss": 0.6173, + "num_input_tokens_seen": 3021136, + "step": 8975 + }, + { + "epoch": 6.939721792890262, + "grad_norm": 1.5250325202941895, + "learning_rate": 4.1272837079575686e-05, + "loss": 0.6211, + "num_input_tokens_seen": 3023120, + "step": 8980 + }, + { + "epoch": 6.943585780525503, + "grad_norm": 0.5724444389343262, + "learning_rate": 4.1260034210304966e-05, + "loss": 0.4034, + "num_input_tokens_seen": 3024912, + "step": 8985 + }, + { + "epoch": 6.947449768160742, + "grad_norm": 0.7211331129074097, + "learning_rate": 4.1247223945873307e-05, + "loss": 0.4776, + "num_input_tokens_seen": 3026576, + "step": 8990 + }, + { + "epoch": 6.951313755795981, + "grad_norm": 1.1877089738845825, + "learning_rate": 4.123440629210689e-05, + "loss": 0.6378, + "num_input_tokens_seen": 3028176, + "step": 8995 + }, + { + "epoch": 6.955177743431221, + "grad_norm": 0.8114324808120728, + "learning_rate": 4.1221581254835276e-05, + "loss": 0.517, + "num_input_tokens_seen": 3030096, + "step": 9000 + }, + { + "epoch": 6.959041731066461, + "grad_norm": 0.987675666809082, + "learning_rate": 4.120874883989135e-05, + "loss": 0.4373, + "num_input_tokens_seen": 3031568, + "step": 9005 + }, + { + "epoch": 6.9629057187017, + "grad_norm": 0.915221095085144, + "learning_rate": 4.1195909053111386e-05, + "loss": 0.4748, + "num_input_tokens_seen": 3033264, + "step": 9010 + }, + { + "epoch": 6.966769706336939, + "grad_norm": 0.7201360464096069, + "learning_rate": 4.1183061900335e-05, + "loss": 0.4458, + "num_input_tokens_seen": 3034864, + "step": 9015 + }, + { + "epoch": 6.97063369397218, + "grad_norm": 0.7520477771759033, + "learning_rate": 4.117020738740512e-05, + "loss": 0.558, + "num_input_tokens_seen": 3036560, + "step": 9020 + }, + { + "epoch": 6.974497681607419, + "grad_norm": 0.8088432550430298, + "learning_rate": 4.1157345520168106e-05, + "loss": 0.468, + "num_input_tokens_seen": 3038000, + "step": 9025 + }, + { + "epoch": 6.978361669242658, + "grad_norm": 1.2135518789291382, + "learning_rate": 4.1144476304473575e-05, + "loss": 0.5339, + "num_input_tokens_seen": 3039824, + "step": 9030 + }, + { + "epoch": 6.9822256568778975, + "grad_norm": 0.914861798286438, + "learning_rate": 4.113159974617454e-05, + "loss": 0.3857, + "num_input_tokens_seen": 3041520, + "step": 9035 + }, + { + "epoch": 6.986089644513138, + "grad_norm": 0.6380100846290588, + "learning_rate": 4.111871585112733e-05, + "loss": 0.5147, + "num_input_tokens_seen": 3043664, + "step": 9040 + }, + { + "epoch": 6.989953632148377, + "grad_norm": 1.2169189453125, + "learning_rate": 4.1105824625191624e-05, + "loss": 0.4325, + "num_input_tokens_seen": 3045232, + "step": 9045 + }, + { + "epoch": 6.993817619783616, + "grad_norm": 0.6298080086708069, + "learning_rate": 4.109292607423042e-05, + "loss": 0.4413, + "num_input_tokens_seen": 3047184, + "step": 9050 + }, + { + "epoch": 6.997681607418857, + "grad_norm": 0.4174688756465912, + "learning_rate": 4.108002020411006e-05, + "loss": 0.5125, + "num_input_tokens_seen": 3048624, + "step": 9055 + }, + { + "epoch": 7.0, + "eval_loss": 0.4705168306827545, + "eval_runtime": 6.2587, + "eval_samples_per_second": 91.873, + "eval_steps_per_second": 23.008, + "num_input_tokens_seen": 3049776, + "step": 9058 + }, + { + "epoch": 7.001545595054096, + "grad_norm": 0.9280962944030762, + "learning_rate": 4.1067107020700216e-05, + "loss": 0.71, + "num_input_tokens_seen": 3050608, + "step": 9060 + }, + { + "epoch": 7.005409582689335, + "grad_norm": 1.1673401594161987, + "learning_rate": 4.105418652987387e-05, + "loss": 0.4196, + "num_input_tokens_seen": 3052304, + "step": 9065 + }, + { + "epoch": 7.0092735703245745, + "grad_norm": 1.1409801244735718, + "learning_rate": 4.1041258737507347e-05, + "loss": 0.5217, + "num_input_tokens_seen": 3053968, + "step": 9070 + }, + { + "epoch": 7.013137557959815, + "grad_norm": 0.9148547649383545, + "learning_rate": 4.102832364948029e-05, + "loss": 0.4176, + "num_input_tokens_seen": 3055728, + "step": 9075 + }, + { + "epoch": 7.017001545595054, + "grad_norm": 1.314579725265503, + "learning_rate": 4.101538127167564e-05, + "loss": 0.5741, + "num_input_tokens_seen": 3057296, + "step": 9080 + }, + { + "epoch": 7.020865533230293, + "grad_norm": 0.7044819593429565, + "learning_rate": 4.100243160997968e-05, + "loss": 0.5717, + "num_input_tokens_seen": 3058960, + "step": 9085 + }, + { + "epoch": 7.024729520865534, + "grad_norm": 0.9583092927932739, + "learning_rate": 4.0989474670281986e-05, + "loss": 0.4876, + "num_input_tokens_seen": 3060528, + "step": 9090 + }, + { + "epoch": 7.028593508500773, + "grad_norm": 0.9028961062431335, + "learning_rate": 4.097651045847546e-05, + "loss": 0.4115, + "num_input_tokens_seen": 3062128, + "step": 9095 + }, + { + "epoch": 7.032457496136012, + "grad_norm": 1.1360951662063599, + "learning_rate": 4.096353898045628e-05, + "loss": 0.4559, + "num_input_tokens_seen": 3063920, + "step": 9100 + }, + { + "epoch": 7.0363214837712516, + "grad_norm": 0.959746778011322, + "learning_rate": 4.095056024212399e-05, + "loss": 0.6253, + "num_input_tokens_seen": 3065552, + "step": 9105 + }, + { + "epoch": 7.040185471406492, + "grad_norm": 0.9316136240959167, + "learning_rate": 4.0937574249381375e-05, + "loss": 0.4028, + "num_input_tokens_seen": 3067056, + "step": 9110 + }, + { + "epoch": 7.044049459041731, + "grad_norm": 0.8003732562065125, + "learning_rate": 4.0924581008134544e-05, + "loss": 0.4953, + "num_input_tokens_seen": 3068784, + "step": 9115 + }, + { + "epoch": 7.04791344667697, + "grad_norm": 0.8434422016143799, + "learning_rate": 4.091158052429289e-05, + "loss": 0.4341, + "num_input_tokens_seen": 3070512, + "step": 9120 + }, + { + "epoch": 7.051777434312211, + "grad_norm": 1.0823076963424683, + "learning_rate": 4.089857280376914e-05, + "loss": 0.619, + "num_input_tokens_seen": 3072432, + "step": 9125 + }, + { + "epoch": 7.05564142194745, + "grad_norm": 0.7390895485877991, + "learning_rate": 4.088555785247925e-05, + "loss": 0.4321, + "num_input_tokens_seen": 3074352, + "step": 9130 + }, + { + "epoch": 7.059505409582689, + "grad_norm": 0.8409620523452759, + "learning_rate": 4.087253567634253e-05, + "loss": 0.7314, + "num_input_tokens_seen": 3075984, + "step": 9135 + }, + { + "epoch": 7.063369397217929, + "grad_norm": 1.060449242591858, + "learning_rate": 4.085950628128151e-05, + "loss": 0.4029, + "num_input_tokens_seen": 3077744, + "step": 9140 + }, + { + "epoch": 7.067233384853169, + "grad_norm": 1.3264496326446533, + "learning_rate": 4.084646967322206e-05, + "loss": 0.4904, + "num_input_tokens_seen": 3079568, + "step": 9145 + }, + { + "epoch": 7.071097372488408, + "grad_norm": 0.6658898591995239, + "learning_rate": 4.083342585809331e-05, + "loss": 0.3855, + "num_input_tokens_seen": 3081264, + "step": 9150 + }, + { + "epoch": 7.074961360123647, + "grad_norm": 0.5356266498565674, + "learning_rate": 4.082037484182766e-05, + "loss": 0.5374, + "num_input_tokens_seen": 3082800, + "step": 9155 + }, + { + "epoch": 7.078825347758887, + "grad_norm": 1.1030722856521606, + "learning_rate": 4.080731663036077e-05, + "loss": 0.4056, + "num_input_tokens_seen": 3084656, + "step": 9160 + }, + { + "epoch": 7.082689335394127, + "grad_norm": 1.0373103618621826, + "learning_rate": 4.0794251229631624e-05, + "loss": 0.4682, + "num_input_tokens_seen": 3086192, + "step": 9165 + }, + { + "epoch": 7.086553323029366, + "grad_norm": 1.100117802619934, + "learning_rate": 4.078117864558243e-05, + "loss": 0.4699, + "num_input_tokens_seen": 3087920, + "step": 9170 + }, + { + "epoch": 7.090417310664606, + "grad_norm": 0.6488837003707886, + "learning_rate": 4.0768098884158674e-05, + "loss": 0.3994, + "num_input_tokens_seen": 3089424, + "step": 9175 + }, + { + "epoch": 7.094281298299846, + "grad_norm": 0.9212996959686279, + "learning_rate": 4.0755011951309115e-05, + "loss": 0.662, + "num_input_tokens_seen": 3091248, + "step": 9180 + }, + { + "epoch": 7.098145285935085, + "grad_norm": 0.8406440019607544, + "learning_rate": 4.074191785298577e-05, + "loss": 0.6252, + "num_input_tokens_seen": 3092976, + "step": 9185 + }, + { + "epoch": 7.102009273570324, + "grad_norm": 0.5849459171295166, + "learning_rate": 4.07288165951439e-05, + "loss": 0.3781, + "num_input_tokens_seen": 3094384, + "step": 9190 + }, + { + "epoch": 7.105873261205564, + "grad_norm": 0.9501202702522278, + "learning_rate": 4.071570818374206e-05, + "loss": 0.5421, + "num_input_tokens_seen": 3096112, + "step": 9195 + }, + { + "epoch": 7.109737248840804, + "grad_norm": 1.0923726558685303, + "learning_rate": 4.070259262474201e-05, + "loss": 0.5404, + "num_input_tokens_seen": 3098032, + "step": 9200 + }, + { + "epoch": 7.113601236476043, + "grad_norm": 1.0773900747299194, + "learning_rate": 4.0689469924108804e-05, + "loss": 0.6635, + "num_input_tokens_seen": 3099728, + "step": 9205 + }, + { + "epoch": 7.117465224111283, + "grad_norm": 0.6345598697662354, + "learning_rate": 4.067634008781072e-05, + "loss": 0.4562, + "num_input_tokens_seen": 3101648, + "step": 9210 + }, + { + "epoch": 7.121329211746523, + "grad_norm": 0.7845984697341919, + "learning_rate": 4.066320312181927e-05, + "loss": 0.383, + "num_input_tokens_seen": 3103120, + "step": 9215 + }, + { + "epoch": 7.125193199381762, + "grad_norm": 0.8858041763305664, + "learning_rate": 4.065005903210923e-05, + "loss": 0.3833, + "num_input_tokens_seen": 3104624, + "step": 9220 + }, + { + "epoch": 7.129057187017001, + "grad_norm": 1.157418966293335, + "learning_rate": 4.063690782465863e-05, + "loss": 0.4178, + "num_input_tokens_seen": 3106096, + "step": 9225 + }, + { + "epoch": 7.132921174652241, + "grad_norm": 0.9097228050231934, + "learning_rate": 4.062374950544871e-05, + "loss": 0.343, + "num_input_tokens_seen": 3107888, + "step": 9230 + }, + { + "epoch": 7.136785162287481, + "grad_norm": 1.0499883890151978, + "learning_rate": 4.0610584080463946e-05, + "loss": 0.3852, + "num_input_tokens_seen": 3109392, + "step": 9235 + }, + { + "epoch": 7.14064914992272, + "grad_norm": 0.8650587201118469, + "learning_rate": 4.0597411555692044e-05, + "loss": 0.6619, + "num_input_tokens_seen": 3110992, + "step": 9240 + }, + { + "epoch": 7.14451313755796, + "grad_norm": 1.3933100700378418, + "learning_rate": 4.058423193712397e-05, + "loss": 0.5282, + "num_input_tokens_seen": 3112464, + "step": 9245 + }, + { + "epoch": 7.1483771251932, + "grad_norm": 0.7903625965118408, + "learning_rate": 4.057104523075387e-05, + "loss": 0.4048, + "num_input_tokens_seen": 3114288, + "step": 9250 + }, + { + "epoch": 7.152241112828439, + "grad_norm": 0.7414626479148865, + "learning_rate": 4.055785144257915e-05, + "loss": 0.4971, + "num_input_tokens_seen": 3116016, + "step": 9255 + }, + { + "epoch": 7.156105100463678, + "grad_norm": 1.2043973207473755, + "learning_rate": 4.054465057860043e-05, + "loss": 0.6163, + "num_input_tokens_seen": 3117776, + "step": 9260 + }, + { + "epoch": 7.159969088098918, + "grad_norm": 0.7665708661079407, + "learning_rate": 4.053144264482153e-05, + "loss": 0.4837, + "num_input_tokens_seen": 3119440, + "step": 9265 + }, + { + "epoch": 7.163833075734158, + "grad_norm": 0.9684591293334961, + "learning_rate": 4.0518227647249495e-05, + "loss": 0.4827, + "num_input_tokens_seen": 3121104, + "step": 9270 + }, + { + "epoch": 7.167697063369397, + "grad_norm": 0.8196788430213928, + "learning_rate": 4.0505005591894595e-05, + "loss": 0.6181, + "num_input_tokens_seen": 3122928, + "step": 9275 + }, + { + "epoch": 7.171561051004637, + "grad_norm": 0.7989099025726318, + "learning_rate": 4.04917764847703e-05, + "loss": 0.5062, + "num_input_tokens_seen": 3124848, + "step": 9280 + }, + { + "epoch": 7.175425038639876, + "grad_norm": 0.7682555913925171, + "learning_rate": 4.0478540331893263e-05, + "loss": 0.5386, + "num_input_tokens_seen": 3126352, + "step": 9285 + }, + { + "epoch": 7.179289026275116, + "grad_norm": 0.9222993850708008, + "learning_rate": 4.0465297139283406e-05, + "loss": 0.3615, + "num_input_tokens_seen": 3127920, + "step": 9290 + }, + { + "epoch": 7.183153013910355, + "grad_norm": 1.0482041835784912, + "learning_rate": 4.0452046912963794e-05, + "loss": 0.4307, + "num_input_tokens_seen": 3129776, + "step": 9295 + }, + { + "epoch": 7.187017001545595, + "grad_norm": 1.1884597539901733, + "learning_rate": 4.04387896589607e-05, + "loss": 0.4163, + "num_input_tokens_seen": 3131504, + "step": 9300 + }, + { + "epoch": 7.190880989180835, + "grad_norm": 0.555387556552887, + "learning_rate": 4.042552538330361e-05, + "loss": 0.4409, + "num_input_tokens_seen": 3133392, + "step": 9305 + }, + { + "epoch": 7.194744976816074, + "grad_norm": 1.1998738050460815, + "learning_rate": 4.041225409202519e-05, + "loss": 0.4303, + "num_input_tokens_seen": 3135152, + "step": 9310 + }, + { + "epoch": 7.198608964451314, + "grad_norm": 0.5302574038505554, + "learning_rate": 4.039897579116132e-05, + "loss": 0.4827, + "num_input_tokens_seen": 3136848, + "step": 9315 + }, + { + "epoch": 7.202472952086553, + "grad_norm": 0.656398355960846, + "learning_rate": 4.038569048675103e-05, + "loss": 0.4122, + "num_input_tokens_seen": 3138768, + "step": 9320 + }, + { + "epoch": 7.206336939721793, + "grad_norm": 2.553516387939453, + "learning_rate": 4.037239818483657e-05, + "loss": 0.5207, + "num_input_tokens_seen": 3140432, + "step": 9325 + }, + { + "epoch": 7.210200927357032, + "grad_norm": 0.9409830570220947, + "learning_rate": 4.0359098891463355e-05, + "loss": 0.5794, + "num_input_tokens_seen": 3142000, + "step": 9330 + }, + { + "epoch": 7.214064914992272, + "grad_norm": 0.9150859713554382, + "learning_rate": 4.034579261267998e-05, + "loss": 0.4465, + "num_input_tokens_seen": 3143728, + "step": 9335 + }, + { + "epoch": 7.217928902627512, + "grad_norm": 0.782433807849884, + "learning_rate": 4.033247935453822e-05, + "loss": 0.6928, + "num_input_tokens_seen": 3145680, + "step": 9340 + }, + { + "epoch": 7.221792890262751, + "grad_norm": 1.0952417850494385, + "learning_rate": 4.031915912309303e-05, + "loss": 0.3889, + "num_input_tokens_seen": 3147280, + "step": 9345 + }, + { + "epoch": 7.225656877897991, + "grad_norm": 0.876739501953125, + "learning_rate": 4.030583192440253e-05, + "loss": 0.3659, + "num_input_tokens_seen": 3149040, + "step": 9350 + }, + { + "epoch": 7.22952086553323, + "grad_norm": 1.9194996356964111, + "learning_rate": 4.0292497764528e-05, + "loss": 0.4917, + "num_input_tokens_seen": 3150928, + "step": 9355 + }, + { + "epoch": 7.23338485316847, + "grad_norm": 0.8128892779350281, + "learning_rate": 4.027915664953391e-05, + "loss": 0.4359, + "num_input_tokens_seen": 3152592, + "step": 9360 + }, + { + "epoch": 7.2372488408037094, + "grad_norm": 1.2413538694381714, + "learning_rate": 4.0265808585487874e-05, + "loss": 0.7204, + "num_input_tokens_seen": 3154320, + "step": 9365 + }, + { + "epoch": 7.241112828438949, + "grad_norm": 0.6518245935440063, + "learning_rate": 4.0252453578460666e-05, + "loss": 0.3419, + "num_input_tokens_seen": 3156144, + "step": 9370 + }, + { + "epoch": 7.244976816074189, + "grad_norm": 1.1237682104110718, + "learning_rate": 4.023909163452623e-05, + "loss": 0.4684, + "num_input_tokens_seen": 3158000, + "step": 9375 + }, + { + "epoch": 7.248840803709428, + "grad_norm": 0.9546007513999939, + "learning_rate": 4.0225722759761656e-05, + "loss": 0.3862, + "num_input_tokens_seen": 3159632, + "step": 9380 + }, + { + "epoch": 7.252704791344668, + "grad_norm": 0.8679941892623901, + "learning_rate": 4.021234696024718e-05, + "loss": 0.44, + "num_input_tokens_seen": 3161360, + "step": 9385 + }, + { + "epoch": 7.256568778979907, + "grad_norm": 0.9105110168457031, + "learning_rate": 4.0198964242066215e-05, + "loss": 0.6608, + "num_input_tokens_seen": 3162992, + "step": 9390 + }, + { + "epoch": 7.260432766615147, + "grad_norm": 1.1811500787734985, + "learning_rate": 4.018557461130528e-05, + "loss": 0.4423, + "num_input_tokens_seen": 3164656, + "step": 9395 + }, + { + "epoch": 7.2642967542503865, + "grad_norm": 0.7704162001609802, + "learning_rate": 4.017217807405407e-05, + "loss": 0.369, + "num_input_tokens_seen": 3166608, + "step": 9400 + }, + { + "epoch": 7.268160741885626, + "grad_norm": 1.000335931777954, + "learning_rate": 4.015877463640542e-05, + "loss": 0.4151, + "num_input_tokens_seen": 3168560, + "step": 9405 + }, + { + "epoch": 7.272024729520865, + "grad_norm": 1.2230581045150757, + "learning_rate": 4.014536430445527e-05, + "loss": 0.4858, + "num_input_tokens_seen": 3170352, + "step": 9410 + }, + { + "epoch": 7.275888717156105, + "grad_norm": 1.263555884361267, + "learning_rate": 4.013194708430273e-05, + "loss": 0.6096, + "num_input_tokens_seen": 3172432, + "step": 9415 + }, + { + "epoch": 7.279752704791345, + "grad_norm": 1.258428692817688, + "learning_rate": 4.0118522982050045e-05, + "loss": 0.4692, + "num_input_tokens_seen": 3174064, + "step": 9420 + }, + { + "epoch": 7.283616692426584, + "grad_norm": 0.8337683081626892, + "learning_rate": 4.010509200380257e-05, + "loss": 0.4174, + "num_input_tokens_seen": 3175920, + "step": 9425 + }, + { + "epoch": 7.287480680061824, + "grad_norm": 1.0306439399719238, + "learning_rate": 4.009165415566878e-05, + "loss": 0.4064, + "num_input_tokens_seen": 3177552, + "step": 9430 + }, + { + "epoch": 7.2913446676970635, + "grad_norm": 1.4661468267440796, + "learning_rate": 4.007820944376031e-05, + "loss": 0.4331, + "num_input_tokens_seen": 3179728, + "step": 9435 + }, + { + "epoch": 7.295208655332303, + "grad_norm": 0.8655324578285217, + "learning_rate": 4.006475787419187e-05, + "loss": 0.459, + "num_input_tokens_seen": 3181648, + "step": 9440 + }, + { + "epoch": 7.299072642967542, + "grad_norm": 0.6795656681060791, + "learning_rate": 4.0051299453081337e-05, + "loss": 0.5116, + "num_input_tokens_seen": 3183408, + "step": 9445 + }, + { + "epoch": 7.302936630602782, + "grad_norm": 0.8673535585403442, + "learning_rate": 4.003783418654968e-05, + "loss": 0.6015, + "num_input_tokens_seen": 3185008, + "step": 9450 + }, + { + "epoch": 7.306800618238022, + "grad_norm": 0.4937629997730255, + "learning_rate": 4.0024362080720964e-05, + "loss": 0.3723, + "num_input_tokens_seen": 3186928, + "step": 9455 + }, + { + "epoch": 7.310664605873261, + "grad_norm": 0.9435788989067078, + "learning_rate": 4.001088314172241e-05, + "loss": 0.4896, + "num_input_tokens_seen": 3188976, + "step": 9460 + }, + { + "epoch": 7.314528593508501, + "grad_norm": 0.8359382748603821, + "learning_rate": 3.9997397375684295e-05, + "loss": 0.4316, + "num_input_tokens_seen": 3190448, + "step": 9465 + }, + { + "epoch": 7.3183925811437405, + "grad_norm": 0.46068376302719116, + "learning_rate": 3.9983904788740044e-05, + "loss": 0.3575, + "num_input_tokens_seen": 3191952, + "step": 9470 + }, + { + "epoch": 7.32225656877898, + "grad_norm": 0.9593237638473511, + "learning_rate": 3.9970405387026165e-05, + "loss": 0.446, + "num_input_tokens_seen": 3193712, + "step": 9475 + }, + { + "epoch": 7.326120556414219, + "grad_norm": 1.4203895330429077, + "learning_rate": 3.995689917668225e-05, + "loss": 0.4695, + "num_input_tokens_seen": 3195600, + "step": 9480 + }, + { + "epoch": 7.329984544049459, + "grad_norm": 1.3504276275634766, + "learning_rate": 3.9943386163851025e-05, + "loss": 0.5867, + "num_input_tokens_seen": 3197200, + "step": 9485 + }, + { + "epoch": 7.333848531684699, + "grad_norm": 0.8455384969711304, + "learning_rate": 3.992986635467828e-05, + "loss": 0.4123, + "num_input_tokens_seen": 3198928, + "step": 9490 + }, + { + "epoch": 7.337712519319938, + "grad_norm": 1.1959726810455322, + "learning_rate": 3.99163397553129e-05, + "loss": 0.4781, + "num_input_tokens_seen": 3200304, + "step": 9495 + }, + { + "epoch": 7.341576506955178, + "grad_norm": 1.097657322883606, + "learning_rate": 3.9902806371906875e-05, + "loss": 0.4288, + "num_input_tokens_seen": 3201968, + "step": 9500 + }, + { + "epoch": 7.3454404945904175, + "grad_norm": 0.9043393135070801, + "learning_rate": 3.988926621061526e-05, + "loss": 0.5958, + "num_input_tokens_seen": 3203856, + "step": 9505 + }, + { + "epoch": 7.349304482225657, + "grad_norm": 0.5301652550697327, + "learning_rate": 3.9875719277596204e-05, + "loss": 0.4302, + "num_input_tokens_seen": 3205456, + "step": 9510 + }, + { + "epoch": 7.353168469860896, + "grad_norm": 0.9669467210769653, + "learning_rate": 3.986216557901094e-05, + "loss": 0.3794, + "num_input_tokens_seen": 3207184, + "step": 9515 + }, + { + "epoch": 7.357032457496136, + "grad_norm": 0.7884445190429688, + "learning_rate": 3.9848605121023746e-05, + "loss": 0.3762, + "num_input_tokens_seen": 3208752, + "step": 9520 + }, + { + "epoch": 7.360896445131376, + "grad_norm": 1.268231749534607, + "learning_rate": 3.9835037909802034e-05, + "loss": 0.43, + "num_input_tokens_seen": 3210192, + "step": 9525 + }, + { + "epoch": 7.364760432766615, + "grad_norm": 0.8803391456604004, + "learning_rate": 3.9821463951516234e-05, + "loss": 0.3798, + "num_input_tokens_seen": 3211824, + "step": 9530 + }, + { + "epoch": 7.368624420401854, + "grad_norm": 0.9189442992210388, + "learning_rate": 3.980788325233986e-05, + "loss": 0.3992, + "num_input_tokens_seen": 3213584, + "step": 9535 + }, + { + "epoch": 7.3724884080370945, + "grad_norm": 2.113792896270752, + "learning_rate": 3.9794295818449515e-05, + "loss": 0.5919, + "num_input_tokens_seen": 3215312, + "step": 9540 + }, + { + "epoch": 7.376352395672334, + "grad_norm": 0.8288263082504272, + "learning_rate": 3.9780701656024815e-05, + "loss": 0.3378, + "num_input_tokens_seen": 3216944, + "step": 9545 + }, + { + "epoch": 7.380216383307573, + "grad_norm": 0.9887806177139282, + "learning_rate": 3.97671007712485e-05, + "loss": 0.403, + "num_input_tokens_seen": 3218704, + "step": 9550 + }, + { + "epoch": 7.384080370942813, + "grad_norm": 0.8950603604316711, + "learning_rate": 3.9753493170306314e-05, + "loss": 0.4268, + "num_input_tokens_seen": 3220304, + "step": 9555 + }, + { + "epoch": 7.387944358578053, + "grad_norm": 0.7787631154060364, + "learning_rate": 3.973987885938707e-05, + "loss": 0.4559, + "num_input_tokens_seen": 3222128, + "step": 9560 + }, + { + "epoch": 7.391808346213292, + "grad_norm": 1.0664417743682861, + "learning_rate": 3.972625784468264e-05, + "loss": 0.4732, + "num_input_tokens_seen": 3223760, + "step": 9565 + }, + { + "epoch": 7.395672333848531, + "grad_norm": 0.7609180808067322, + "learning_rate": 3.9712630132387975e-05, + "loss": 0.3994, + "num_input_tokens_seen": 3225424, + "step": 9570 + }, + { + "epoch": 7.3995363214837715, + "grad_norm": 0.9744125008583069, + "learning_rate": 3.969899572870101e-05, + "loss": 0.4673, + "num_input_tokens_seen": 3227024, + "step": 9575 + }, + { + "epoch": 7.403400309119011, + "grad_norm": 0.610955536365509, + "learning_rate": 3.968535463982275e-05, + "loss": 0.476, + "num_input_tokens_seen": 3228624, + "step": 9580 + }, + { + "epoch": 7.40726429675425, + "grad_norm": 0.8166180849075317, + "learning_rate": 3.967170687195725e-05, + "loss": 0.406, + "num_input_tokens_seen": 3230320, + "step": 9585 + }, + { + "epoch": 7.41112828438949, + "grad_norm": 0.7300429344177246, + "learning_rate": 3.96580524313116e-05, + "loss": 0.4084, + "num_input_tokens_seen": 3231824, + "step": 9590 + }, + { + "epoch": 7.41499227202473, + "grad_norm": 0.8919588923454285, + "learning_rate": 3.9644391324095925e-05, + "loss": 0.3635, + "num_input_tokens_seen": 3233520, + "step": 9595 + }, + { + "epoch": 7.418856259659969, + "grad_norm": 0.5415533185005188, + "learning_rate": 3.963072355652338e-05, + "loss": 0.4424, + "num_input_tokens_seen": 3235248, + "step": 9600 + }, + { + "epoch": 7.422720247295208, + "grad_norm": 1.1733537912368774, + "learning_rate": 3.961704913481012e-05, + "loss": 0.4145, + "num_input_tokens_seen": 3236944, + "step": 9605 + }, + { + "epoch": 7.4265842349304485, + "grad_norm": 0.7117109298706055, + "learning_rate": 3.96033680651754e-05, + "loss": 0.4686, + "num_input_tokens_seen": 3238608, + "step": 9610 + }, + { + "epoch": 7.430448222565688, + "grad_norm": 0.8218488097190857, + "learning_rate": 3.958968035384142e-05, + "loss": 0.5036, + "num_input_tokens_seen": 3240400, + "step": 9615 + }, + { + "epoch": 7.434312210200927, + "grad_norm": 1.0563310384750366, + "learning_rate": 3.957598600703344e-05, + "loss": 0.6241, + "num_input_tokens_seen": 3242128, + "step": 9620 + }, + { + "epoch": 7.438176197836167, + "grad_norm": 0.6147783994674683, + "learning_rate": 3.9562285030979727e-05, + "loss": 0.4149, + "num_input_tokens_seen": 3243888, + "step": 9625 + }, + { + "epoch": 7.442040185471407, + "grad_norm": 0.7115205526351929, + "learning_rate": 3.954857743191157e-05, + "loss": 0.435, + "num_input_tokens_seen": 3245616, + "step": 9630 + }, + { + "epoch": 7.445904173106646, + "grad_norm": 0.9470164179801941, + "learning_rate": 3.953486321606328e-05, + "loss": 0.6355, + "num_input_tokens_seen": 3247376, + "step": 9635 + }, + { + "epoch": 7.449768160741885, + "grad_norm": 0.6451857686042786, + "learning_rate": 3.952114238967215e-05, + "loss": 0.3573, + "num_input_tokens_seen": 3248912, + "step": 9640 + }, + { + "epoch": 7.4536321483771255, + "grad_norm": 0.5877951979637146, + "learning_rate": 3.9507414958978494e-05, + "loss": 0.5888, + "num_input_tokens_seen": 3250832, + "step": 9645 + }, + { + "epoch": 7.457496136012365, + "grad_norm": 1.5845133066177368, + "learning_rate": 3.9493680930225626e-05, + "loss": 0.688, + "num_input_tokens_seen": 3252656, + "step": 9650 + }, + { + "epoch": 7.461360123647604, + "grad_norm": 1.06534743309021, + "learning_rate": 3.947994030965989e-05, + "loss": 0.625, + "num_input_tokens_seen": 3254160, + "step": 9655 + }, + { + "epoch": 7.4652241112828435, + "grad_norm": 0.7112820148468018, + "learning_rate": 3.946619310353057e-05, + "loss": 0.5689, + "num_input_tokens_seen": 3255856, + "step": 9660 + }, + { + "epoch": 7.469088098918084, + "grad_norm": 0.8947231769561768, + "learning_rate": 3.945243931809e-05, + "loss": 0.413, + "num_input_tokens_seen": 3257424, + "step": 9665 + }, + { + "epoch": 7.472952086553323, + "grad_norm": 0.5371418595314026, + "learning_rate": 3.943867895959347e-05, + "loss": 0.3573, + "num_input_tokens_seen": 3258992, + "step": 9670 + }, + { + "epoch": 7.476816074188562, + "grad_norm": 0.6968140602111816, + "learning_rate": 3.942491203429928e-05, + "loss": 0.5201, + "num_input_tokens_seen": 3260784, + "step": 9675 + }, + { + "epoch": 7.4806800618238025, + "grad_norm": 0.8625325560569763, + "learning_rate": 3.941113854846871e-05, + "loss": 0.6224, + "num_input_tokens_seen": 3262288, + "step": 9680 + }, + { + "epoch": 7.484544049459042, + "grad_norm": 1.0516620874404907, + "learning_rate": 3.939735850836602e-05, + "loss": 0.6196, + "num_input_tokens_seen": 3263696, + "step": 9685 + }, + { + "epoch": 7.488408037094281, + "grad_norm": 0.6250459551811218, + "learning_rate": 3.9383571920258465e-05, + "loss": 0.4116, + "num_input_tokens_seen": 3265424, + "step": 9690 + }, + { + "epoch": 7.4922720247295205, + "grad_norm": 1.1165794134140015, + "learning_rate": 3.936977879041626e-05, + "loss": 0.435, + "num_input_tokens_seen": 3267120, + "step": 9695 + }, + { + "epoch": 7.496136012364761, + "grad_norm": 0.9347681403160095, + "learning_rate": 3.935597912511261e-05, + "loss": 0.4688, + "num_input_tokens_seen": 3268720, + "step": 9700 + }, + { + "epoch": 7.5, + "grad_norm": 0.8035856485366821, + "learning_rate": 3.934217293062367e-05, + "loss": 0.3799, + "num_input_tokens_seen": 3270256, + "step": 9705 + }, + { + "epoch": 7.503863987635239, + "grad_norm": 0.9083061814308167, + "learning_rate": 3.932836021322859e-05, + "loss": 0.393, + "num_input_tokens_seen": 3271792, + "step": 9710 + }, + { + "epoch": 7.507727975270479, + "grad_norm": 0.7576149106025696, + "learning_rate": 3.931454097920947e-05, + "loss": 0.4892, + "num_input_tokens_seen": 3273392, + "step": 9715 + }, + { + "epoch": 7.511591962905719, + "grad_norm": 0.9501492977142334, + "learning_rate": 3.930071523485139e-05, + "loss": 0.3917, + "num_input_tokens_seen": 3274992, + "step": 9720 + }, + { + "epoch": 7.515455950540958, + "grad_norm": 1.0212472677230835, + "learning_rate": 3.928688298644238e-05, + "loss": 0.4163, + "num_input_tokens_seen": 3277008, + "step": 9725 + }, + { + "epoch": 7.5193199381761975, + "grad_norm": 1.0352874994277954, + "learning_rate": 3.9273044240273434e-05, + "loss": 0.4447, + "num_input_tokens_seen": 3278608, + "step": 9730 + }, + { + "epoch": 7.523183925811438, + "grad_norm": 0.7807784080505371, + "learning_rate": 3.925919900263848e-05, + "loss": 0.4759, + "num_input_tokens_seen": 3280240, + "step": 9735 + }, + { + "epoch": 7.527047913446677, + "grad_norm": 2.3276798725128174, + "learning_rate": 3.924534727983443e-05, + "loss": 0.4521, + "num_input_tokens_seen": 3281968, + "step": 9740 + }, + { + "epoch": 7.530911901081916, + "grad_norm": 1.3307292461395264, + "learning_rate": 3.923148907816112e-05, + "loss": 0.4211, + "num_input_tokens_seen": 3283568, + "step": 9745 + }, + { + "epoch": 7.5347758887171565, + "grad_norm": 1.3753581047058105, + "learning_rate": 3.921762440392135e-05, + "loss": 0.4437, + "num_input_tokens_seen": 3285200, + "step": 9750 + }, + { + "epoch": 7.538639876352396, + "grad_norm": 0.7683613896369934, + "learning_rate": 3.9203753263420854e-05, + "loss": 0.5832, + "num_input_tokens_seen": 3287024, + "step": 9755 + }, + { + "epoch": 7.542503863987635, + "grad_norm": 0.8539109826087952, + "learning_rate": 3.918987566296831e-05, + "loss": 0.4653, + "num_input_tokens_seen": 3288944, + "step": 9760 + }, + { + "epoch": 7.5463678516228745, + "grad_norm": 0.8324636220932007, + "learning_rate": 3.917599160887534e-05, + "loss": 0.602, + "num_input_tokens_seen": 3290736, + "step": 9765 + }, + { + "epoch": 7.550231839258115, + "grad_norm": 1.3089796304702759, + "learning_rate": 3.916210110745648e-05, + "loss": 0.4818, + "num_input_tokens_seen": 3292432, + "step": 9770 + }, + { + "epoch": 7.554095826893354, + "grad_norm": 1.0206376314163208, + "learning_rate": 3.9148204165029235e-05, + "loss": 0.5888, + "num_input_tokens_seen": 3293968, + "step": 9775 + }, + { + "epoch": 7.557959814528593, + "grad_norm": 0.46485087275505066, + "learning_rate": 3.9134300787914e-05, + "loss": 0.544, + "num_input_tokens_seen": 3295440, + "step": 9780 + }, + { + "epoch": 7.561823802163833, + "grad_norm": 1.0918030738830566, + "learning_rate": 3.912039098243412e-05, + "loss": 0.4844, + "num_input_tokens_seen": 3297008, + "step": 9785 + }, + { + "epoch": 7.565687789799073, + "grad_norm": 1.1341018676757812, + "learning_rate": 3.9106474754915856e-05, + "loss": 0.4341, + "num_input_tokens_seen": 3298832, + "step": 9790 + }, + { + "epoch": 7.569551777434312, + "grad_norm": 0.9529621601104736, + "learning_rate": 3.90925521116884e-05, + "loss": 0.3556, + "num_input_tokens_seen": 3300496, + "step": 9795 + }, + { + "epoch": 7.5734157650695515, + "grad_norm": 1.067521572113037, + "learning_rate": 3.907862305908384e-05, + "loss": 0.3856, + "num_input_tokens_seen": 3302096, + "step": 9800 + }, + { + "epoch": 7.577279752704792, + "grad_norm": 1.0680949687957764, + "learning_rate": 3.90646876034372e-05, + "loss": 0.4967, + "num_input_tokens_seen": 3303984, + "step": 9805 + }, + { + "epoch": 7.581143740340031, + "grad_norm": 0.8159752488136292, + "learning_rate": 3.905074575108641e-05, + "loss": 0.4051, + "num_input_tokens_seen": 3305552, + "step": 9810 + }, + { + "epoch": 7.58500772797527, + "grad_norm": 1.0230821371078491, + "learning_rate": 3.9036797508372306e-05, + "loss": 0.422, + "num_input_tokens_seen": 3307248, + "step": 9815 + }, + { + "epoch": 7.58887171561051, + "grad_norm": 1.1921089887619019, + "learning_rate": 3.9022842881638624e-05, + "loss": 0.5307, + "num_input_tokens_seen": 3309232, + "step": 9820 + }, + { + "epoch": 7.59273570324575, + "grad_norm": 0.7388320565223694, + "learning_rate": 3.900888187723203e-05, + "loss": 0.469, + "num_input_tokens_seen": 3310928, + "step": 9825 + }, + { + "epoch": 7.596599690880989, + "grad_norm": 0.7417129278182983, + "learning_rate": 3.899491450150206e-05, + "loss": 0.3716, + "num_input_tokens_seen": 3312560, + "step": 9830 + }, + { + "epoch": 7.6004636785162285, + "grad_norm": 0.9002905488014221, + "learning_rate": 3.898094076080115e-05, + "loss": 0.6126, + "num_input_tokens_seen": 3314032, + "step": 9835 + }, + { + "epoch": 7.604327666151468, + "grad_norm": 0.5943809747695923, + "learning_rate": 3.8966960661484666e-05, + "loss": 0.5571, + "num_input_tokens_seen": 3315632, + "step": 9840 + }, + { + "epoch": 7.608191653786708, + "grad_norm": 0.7216756939888, + "learning_rate": 3.895297420991083e-05, + "loss": 0.3643, + "num_input_tokens_seen": 3317424, + "step": 9845 + }, + { + "epoch": 7.612055641421947, + "grad_norm": 0.7214248180389404, + "learning_rate": 3.8938981412440755e-05, + "loss": 0.4003, + "num_input_tokens_seen": 3319056, + "step": 9850 + }, + { + "epoch": 7.615919629057187, + "grad_norm": 1.454262614250183, + "learning_rate": 3.892498227543846e-05, + "loss": 0.6902, + "num_input_tokens_seen": 3321200, + "step": 9855 + }, + { + "epoch": 7.619783616692427, + "grad_norm": 0.9143612384796143, + "learning_rate": 3.8910976805270837e-05, + "loss": 0.4501, + "num_input_tokens_seen": 3322832, + "step": 9860 + }, + { + "epoch": 7.623647604327666, + "grad_norm": 0.677970826625824, + "learning_rate": 3.8896965008307646e-05, + "loss": 0.3549, + "num_input_tokens_seen": 3324336, + "step": 9865 + }, + { + "epoch": 7.6275115919629055, + "grad_norm": 1.1094694137573242, + "learning_rate": 3.8882946890921545e-05, + "loss": 0.6829, + "num_input_tokens_seen": 3326064, + "step": 9870 + }, + { + "epoch": 7.631375579598146, + "grad_norm": 1.2796393632888794, + "learning_rate": 3.886892245948806e-05, + "loss": 0.4862, + "num_input_tokens_seen": 3327632, + "step": 9875 + }, + { + "epoch": 7.635239567233385, + "grad_norm": 0.6388095021247864, + "learning_rate": 3.885489172038558e-05, + "loss": 0.4773, + "num_input_tokens_seen": 3329104, + "step": 9880 + }, + { + "epoch": 7.639103554868624, + "grad_norm": 1.381230354309082, + "learning_rate": 3.884085467999537e-05, + "loss": 0.9275, + "num_input_tokens_seen": 3330448, + "step": 9885 + }, + { + "epoch": 7.642967542503864, + "grad_norm": 2.089945077896118, + "learning_rate": 3.8826811344701565e-05, + "loss": 1.051, + "num_input_tokens_seen": 3332048, + "step": 9890 + }, + { + "epoch": 7.646831530139104, + "grad_norm": 1.3151627779006958, + "learning_rate": 3.8812761720891156e-05, + "loss": 0.6459, + "num_input_tokens_seen": 3333808, + "step": 9895 + }, + { + "epoch": 7.650695517774343, + "grad_norm": 1.2516531944274902, + "learning_rate": 3.879870581495399e-05, + "loss": 0.4388, + "num_input_tokens_seen": 3335536, + "step": 9900 + }, + { + "epoch": 7.6545595054095825, + "grad_norm": 0.8650918006896973, + "learning_rate": 3.878464363328279e-05, + "loss": 0.4339, + "num_input_tokens_seen": 3337424, + "step": 9905 + }, + { + "epoch": 7.658423493044822, + "grad_norm": 1.0637725591659546, + "learning_rate": 3.8770575182273104e-05, + "loss": 0.4922, + "num_input_tokens_seen": 3339408, + "step": 9910 + }, + { + "epoch": 7.662287480680062, + "grad_norm": 0.7054779529571533, + "learning_rate": 3.8756500468323365e-05, + "loss": 0.4908, + "num_input_tokens_seen": 3341008, + "step": 9915 + }, + { + "epoch": 7.666151468315301, + "grad_norm": 0.5956030488014221, + "learning_rate": 3.874241949783483e-05, + "loss": 0.4071, + "num_input_tokens_seen": 3342640, + "step": 9920 + }, + { + "epoch": 7.670015455950541, + "grad_norm": 1.1557046175003052, + "learning_rate": 3.872833227721159e-05, + "loss": 0.7432, + "num_input_tokens_seen": 3344240, + "step": 9925 + }, + { + "epoch": 7.673879443585781, + "grad_norm": 1.0521539449691772, + "learning_rate": 3.871423881286062e-05, + "loss": 0.4209, + "num_input_tokens_seen": 3345776, + "step": 9930 + }, + { + "epoch": 7.67774343122102, + "grad_norm": 0.7300729751586914, + "learning_rate": 3.870013911119172e-05, + "loss": 0.4363, + "num_input_tokens_seen": 3347440, + "step": 9935 + }, + { + "epoch": 7.6816074188562595, + "grad_norm": 1.255725383758545, + "learning_rate": 3.8686033178617496e-05, + "loss": 0.5395, + "num_input_tokens_seen": 3349168, + "step": 9940 + }, + { + "epoch": 7.685471406491499, + "grad_norm": 1.2178806066513062, + "learning_rate": 3.8671921021553427e-05, + "loss": 0.6204, + "num_input_tokens_seen": 3350928, + "step": 9945 + }, + { + "epoch": 7.689335394126739, + "grad_norm": 0.7551068067550659, + "learning_rate": 3.865780264641778e-05, + "loss": 0.5486, + "num_input_tokens_seen": 3352752, + "step": 9950 + }, + { + "epoch": 7.693199381761978, + "grad_norm": 1.399586796760559, + "learning_rate": 3.864367805963172e-05, + "loss": 0.4829, + "num_input_tokens_seen": 3354448, + "step": 9955 + }, + { + "epoch": 7.697063369397218, + "grad_norm": 0.7167933583259583, + "learning_rate": 3.862954726761916e-05, + "loss": 0.3831, + "num_input_tokens_seen": 3356240, + "step": 9960 + }, + { + "epoch": 7.700927357032457, + "grad_norm": 1.062625527381897, + "learning_rate": 3.8615410276806874e-05, + "loss": 0.4567, + "num_input_tokens_seen": 3358096, + "step": 9965 + }, + { + "epoch": 7.704791344667697, + "grad_norm": 0.7838727831840515, + "learning_rate": 3.860126709362446e-05, + "loss": 0.5561, + "num_input_tokens_seen": 3359632, + "step": 9970 + }, + { + "epoch": 7.7086553323029365, + "grad_norm": 0.7545928955078125, + "learning_rate": 3.858711772450432e-05, + "loss": 0.426, + "num_input_tokens_seen": 3361232, + "step": 9975 + }, + { + "epoch": 7.712519319938176, + "grad_norm": 1.0302640199661255, + "learning_rate": 3.857296217588167e-05, + "loss": 0.3561, + "num_input_tokens_seen": 3362864, + "step": 9980 + }, + { + "epoch": 7.716383307573416, + "grad_norm": 1.1473032236099243, + "learning_rate": 3.8558800454194524e-05, + "loss": 0.6484, + "num_input_tokens_seen": 3364880, + "step": 9985 + }, + { + "epoch": 7.720247295208655, + "grad_norm": 0.8283623456954956, + "learning_rate": 3.854463256588374e-05, + "loss": 0.5524, + "num_input_tokens_seen": 3366576, + "step": 9990 + }, + { + "epoch": 7.724111282843895, + "grad_norm": 0.7660837173461914, + "learning_rate": 3.853045851739295e-05, + "loss": 0.4908, + "num_input_tokens_seen": 3368496, + "step": 9995 + }, + { + "epoch": 7.727975270479135, + "grad_norm": 0.7256408929824829, + "learning_rate": 3.851627831516859e-05, + "loss": 0.4788, + "num_input_tokens_seen": 3370160, + "step": 10000 + }, + { + "epoch": 7.731839258114374, + "grad_norm": 1.735221028327942, + "learning_rate": 3.850209196565991e-05, + "loss": 0.5383, + "num_input_tokens_seen": 3371856, + "step": 10005 + }, + { + "epoch": 7.7357032457496135, + "grad_norm": 0.48557278513908386, + "learning_rate": 3.848789947531895e-05, + "loss": 0.5243, + "num_input_tokens_seen": 3373232, + "step": 10010 + }, + { + "epoch": 7.739567233384853, + "grad_norm": 0.7252404093742371, + "learning_rate": 3.847370085060052e-05, + "loss": 0.4523, + "num_input_tokens_seen": 3375024, + "step": 10015 + }, + { + "epoch": 7.743431221020093, + "grad_norm": 0.755211591720581, + "learning_rate": 3.845949609796226e-05, + "loss": 0.409, + "num_input_tokens_seen": 3376784, + "step": 10020 + }, + { + "epoch": 7.747295208655332, + "grad_norm": 1.039905309677124, + "learning_rate": 3.844528522386457e-05, + "loss": 0.6209, + "num_input_tokens_seen": 3378448, + "step": 10025 + }, + { + "epoch": 7.751159196290572, + "grad_norm": 1.2129647731781006, + "learning_rate": 3.8431068234770636e-05, + "loss": 0.7392, + "num_input_tokens_seen": 3379952, + "step": 10030 + }, + { + "epoch": 7.755023183925811, + "grad_norm": 1.3702020645141602, + "learning_rate": 3.841684513714643e-05, + "loss": 0.4387, + "num_input_tokens_seen": 3381392, + "step": 10035 + }, + { + "epoch": 7.758887171561051, + "grad_norm": 0.8696306943893433, + "learning_rate": 3.840261593746071e-05, + "loss": 0.4587, + "num_input_tokens_seen": 3383184, + "step": 10040 + }, + { + "epoch": 7.7627511591962906, + "grad_norm": 0.7625377178192139, + "learning_rate": 3.8388380642184993e-05, + "loss": 0.4448, + "num_input_tokens_seen": 3384784, + "step": 10045 + }, + { + "epoch": 7.76661514683153, + "grad_norm": 1.1152318716049194, + "learning_rate": 3.8374139257793586e-05, + "loss": 0.4508, + "num_input_tokens_seen": 3386224, + "step": 10050 + }, + { + "epoch": 7.77047913446677, + "grad_norm": 1.0585014820098877, + "learning_rate": 3.8359891790763546e-05, + "loss": 0.44, + "num_input_tokens_seen": 3387984, + "step": 10055 + }, + { + "epoch": 7.774343122102009, + "grad_norm": 0.6969020962715149, + "learning_rate": 3.834563824757471e-05, + "loss": 0.3863, + "num_input_tokens_seen": 3389616, + "step": 10060 + }, + { + "epoch": 7.778207109737249, + "grad_norm": 1.0327528715133667, + "learning_rate": 3.833137863470968e-05, + "loss": 0.4706, + "num_input_tokens_seen": 3391088, + "step": 10065 + }, + { + "epoch": 7.782071097372488, + "grad_norm": 0.9309789538383484, + "learning_rate": 3.83171129586538e-05, + "loss": 0.7675, + "num_input_tokens_seen": 3393072, + "step": 10070 + }, + { + "epoch": 7.785935085007728, + "grad_norm": 0.7712658047676086, + "learning_rate": 3.8302841225895204e-05, + "loss": 0.4097, + "num_input_tokens_seen": 3394704, + "step": 10075 + }, + { + "epoch": 7.789799072642968, + "grad_norm": 0.9402986168861389, + "learning_rate": 3.8288563442924746e-05, + "loss": 0.5091, + "num_input_tokens_seen": 3396304, + "step": 10080 + }, + { + "epoch": 7.793663060278207, + "grad_norm": 0.7366388440132141, + "learning_rate": 3.827427961623604e-05, + "loss": 0.4017, + "num_input_tokens_seen": 3398160, + "step": 10085 + }, + { + "epoch": 7.797527047913446, + "grad_norm": 0.6348068714141846, + "learning_rate": 3.825998975232549e-05, + "loss": 0.3915, + "num_input_tokens_seen": 3399600, + "step": 10090 + }, + { + "epoch": 7.801391035548686, + "grad_norm": 0.7397517561912537, + "learning_rate": 3.824569385769218e-05, + "loss": 0.5149, + "num_input_tokens_seen": 3401168, + "step": 10095 + }, + { + "epoch": 7.805255023183926, + "grad_norm": 0.995282769203186, + "learning_rate": 3.8231391938837966e-05, + "loss": 0.5159, + "num_input_tokens_seen": 3402864, + "step": 10100 + }, + { + "epoch": 7.809119010819165, + "grad_norm": 0.7381584644317627, + "learning_rate": 3.821708400226747e-05, + "loss": 0.6087, + "num_input_tokens_seen": 3404528, + "step": 10105 + }, + { + "epoch": 7.812982998454405, + "grad_norm": 0.956479012966156, + "learning_rate": 3.820277005448802e-05, + "loss": 0.5356, + "num_input_tokens_seen": 3406448, + "step": 10110 + }, + { + "epoch": 7.816846986089645, + "grad_norm": 0.9837088584899902, + "learning_rate": 3.8188450102009674e-05, + "loss": 0.526, + "num_input_tokens_seen": 3408304, + "step": 10115 + }, + { + "epoch": 7.820710973724884, + "grad_norm": 1.3353694677352905, + "learning_rate": 3.817412415134525e-05, + "loss": 0.4473, + "num_input_tokens_seen": 3409904, + "step": 10120 + }, + { + "epoch": 7.824574961360124, + "grad_norm": 0.6678333282470703, + "learning_rate": 3.815979220901025e-05, + "loss": 0.3737, + "num_input_tokens_seen": 3411568, + "step": 10125 + }, + { + "epoch": 7.828438948995363, + "grad_norm": 0.8355128765106201, + "learning_rate": 3.814545428152295e-05, + "loss": 0.6227, + "num_input_tokens_seen": 3413168, + "step": 10130 + }, + { + "epoch": 7.832302936630603, + "grad_norm": 1.2737605571746826, + "learning_rate": 3.813111037540432e-05, + "loss": 0.3891, + "num_input_tokens_seen": 3414896, + "step": 10135 + }, + { + "epoch": 7.836166924265842, + "grad_norm": 0.5015549063682556, + "learning_rate": 3.811676049717805e-05, + "loss": 0.446, + "num_input_tokens_seen": 3416752, + "step": 10140 + }, + { + "epoch": 7.840030911901082, + "grad_norm": 0.8336794972419739, + "learning_rate": 3.810240465337055e-05, + "loss": 0.6437, + "num_input_tokens_seen": 3418480, + "step": 10145 + }, + { + "epoch": 7.843894899536322, + "grad_norm": 0.6949018239974976, + "learning_rate": 3.8088042850510946e-05, + "loss": 0.4246, + "num_input_tokens_seen": 3420112, + "step": 10150 + }, + { + "epoch": 7.847758887171561, + "grad_norm": 0.8404116034507751, + "learning_rate": 3.8073675095131076e-05, + "loss": 0.4396, + "num_input_tokens_seen": 3421776, + "step": 10155 + }, + { + "epoch": 7.8516228748068, + "grad_norm": 0.692557156085968, + "learning_rate": 3.805930139376548e-05, + "loss": 0.438, + "num_input_tokens_seen": 3423376, + "step": 10160 + }, + { + "epoch": 7.85548686244204, + "grad_norm": 1.0389255285263062, + "learning_rate": 3.804492175295139e-05, + "loss": 0.4082, + "num_input_tokens_seen": 3425168, + "step": 10165 + }, + { + "epoch": 7.85935085007728, + "grad_norm": 0.6130293607711792, + "learning_rate": 3.803053617922877e-05, + "loss": 0.4169, + "num_input_tokens_seen": 3426672, + "step": 10170 + }, + { + "epoch": 7.863214837712519, + "grad_norm": 1.6475811004638672, + "learning_rate": 3.801614467914026e-05, + "loss": 0.4889, + "num_input_tokens_seen": 3428208, + "step": 10175 + }, + { + "epoch": 7.867078825347759, + "grad_norm": 1.04109525680542, + "learning_rate": 3.80017472592312e-05, + "loss": 0.4538, + "num_input_tokens_seen": 3429776, + "step": 10180 + }, + { + "epoch": 7.870942812982999, + "grad_norm": 1.2426332235336304, + "learning_rate": 3.798734392604962e-05, + "loss": 0.5348, + "num_input_tokens_seen": 3431504, + "step": 10185 + }, + { + "epoch": 7.874806800618238, + "grad_norm": 0.9020674228668213, + "learning_rate": 3.7972934686146245e-05, + "loss": 0.3846, + "num_input_tokens_seen": 3433392, + "step": 10190 + }, + { + "epoch": 7.878670788253477, + "grad_norm": 1.2093346118927002, + "learning_rate": 3.795851954607448e-05, + "loss": 0.4575, + "num_input_tokens_seen": 3435376, + "step": 10195 + }, + { + "epoch": 7.882534775888717, + "grad_norm": 1.6056519746780396, + "learning_rate": 3.7944098512390426e-05, + "loss": 0.5169, + "num_input_tokens_seen": 3437040, + "step": 10200 + }, + { + "epoch": 7.886398763523957, + "grad_norm": 0.9834762811660767, + "learning_rate": 3.7929671591652843e-05, + "loss": 0.4627, + "num_input_tokens_seen": 3438576, + "step": 10205 + }, + { + "epoch": 7.890262751159196, + "grad_norm": 0.82396000623703, + "learning_rate": 3.791523879042319e-05, + "loss": 0.4748, + "num_input_tokens_seen": 3440208, + "step": 10210 + }, + { + "epoch": 7.894126738794435, + "grad_norm": 0.7057739496231079, + "learning_rate": 3.790080011526559e-05, + "loss": 0.5148, + "num_input_tokens_seen": 3441904, + "step": 10215 + }, + { + "epoch": 7.897990726429676, + "grad_norm": 0.6339409351348877, + "learning_rate": 3.788635557274684e-05, + "loss": 0.3936, + "num_input_tokens_seen": 3443504, + "step": 10220 + }, + { + "epoch": 7.901854714064915, + "grad_norm": 0.4751756489276886, + "learning_rate": 3.78719051694364e-05, + "loss": 0.4183, + "num_input_tokens_seen": 3445072, + "step": 10225 + }, + { + "epoch": 7.905718701700154, + "grad_norm": 0.8564066290855408, + "learning_rate": 3.78574489119064e-05, + "loss": 0.4675, + "num_input_tokens_seen": 3446704, + "step": 10230 + }, + { + "epoch": 7.909582689335394, + "grad_norm": 1.0623277425765991, + "learning_rate": 3.784298680673164e-05, + "loss": 0.4719, + "num_input_tokens_seen": 3448208, + "step": 10235 + }, + { + "epoch": 7.913446676970634, + "grad_norm": 1.4707165956497192, + "learning_rate": 3.782851886048956e-05, + "loss": 0.5912, + "num_input_tokens_seen": 3449712, + "step": 10240 + }, + { + "epoch": 7.917310664605873, + "grad_norm": 0.9841352105140686, + "learning_rate": 3.781404507976027e-05, + "loss": 0.4028, + "num_input_tokens_seen": 3451216, + "step": 10245 + }, + { + "epoch": 7.921174652241113, + "grad_norm": 1.1506658792495728, + "learning_rate": 3.779956547112655e-05, + "loss": 0.407, + "num_input_tokens_seen": 3452816, + "step": 10250 + }, + { + "epoch": 7.925038639876353, + "grad_norm": 0.6428325176239014, + "learning_rate": 3.77850800411738e-05, + "loss": 0.5788, + "num_input_tokens_seen": 3454576, + "step": 10255 + }, + { + "epoch": 7.928902627511592, + "grad_norm": 1.0539432764053345, + "learning_rate": 3.777058879649007e-05, + "loss": 0.6475, + "num_input_tokens_seen": 3456144, + "step": 10260 + }, + { + "epoch": 7.932766615146831, + "grad_norm": 0.673104465007782, + "learning_rate": 3.7756091743666086e-05, + "loss": 0.4461, + "num_input_tokens_seen": 3457616, + "step": 10265 + }, + { + "epoch": 7.936630602782071, + "grad_norm": 1.287684440612793, + "learning_rate": 3.7741588889295173e-05, + "loss": 0.3486, + "num_input_tokens_seen": 3459312, + "step": 10270 + }, + { + "epoch": 7.940494590417311, + "grad_norm": 0.6414971351623535, + "learning_rate": 3.7727080239973337e-05, + "loss": 0.6821, + "num_input_tokens_seen": 3461104, + "step": 10275 + }, + { + "epoch": 7.94435857805255, + "grad_norm": 0.8316476941108704, + "learning_rate": 3.7712565802299185e-05, + "loss": 0.4329, + "num_input_tokens_seen": 3462640, + "step": 10280 + }, + { + "epoch": 7.948222565687789, + "grad_norm": 1.2278087139129639, + "learning_rate": 3.769804558287397e-05, + "loss": 0.4506, + "num_input_tokens_seen": 3464144, + "step": 10285 + }, + { + "epoch": 7.95208655332303, + "grad_norm": 1.002350926399231, + "learning_rate": 3.768351958830159e-05, + "loss": 0.4485, + "num_input_tokens_seen": 3465552, + "step": 10290 + }, + { + "epoch": 7.955950540958269, + "grad_norm": 0.8261182904243469, + "learning_rate": 3.766898782518853e-05, + "loss": 0.4725, + "num_input_tokens_seen": 3467504, + "step": 10295 + }, + { + "epoch": 7.959814528593508, + "grad_norm": 0.6780874133110046, + "learning_rate": 3.765445030014394e-05, + "loss": 0.3526, + "num_input_tokens_seen": 3469232, + "step": 10300 + }, + { + "epoch": 7.9636785162287484, + "grad_norm": 0.8356141448020935, + "learning_rate": 3.7639907019779565e-05, + "loss": 0.3735, + "num_input_tokens_seen": 3470768, + "step": 10305 + }, + { + "epoch": 7.967542503863988, + "grad_norm": 0.8013710379600525, + "learning_rate": 3.762535799070978e-05, + "loss": 0.4184, + "num_input_tokens_seen": 3472752, + "step": 10310 + }, + { + "epoch": 7.971406491499227, + "grad_norm": 1.1448560953140259, + "learning_rate": 3.7610803219551574e-05, + "loss": 0.3611, + "num_input_tokens_seen": 3474384, + "step": 10315 + }, + { + "epoch": 7.975270479134466, + "grad_norm": 1.5993592739105225, + "learning_rate": 3.7596242712924544e-05, + "loss": 0.6732, + "num_input_tokens_seen": 3475984, + "step": 10320 + }, + { + "epoch": 7.979134466769707, + "grad_norm": 0.7928754091262817, + "learning_rate": 3.758167647745089e-05, + "loss": 0.9553, + "num_input_tokens_seen": 3477776, + "step": 10325 + }, + { + "epoch": 7.982998454404946, + "grad_norm": 0.6426587700843811, + "learning_rate": 3.756710451975543e-05, + "loss": 0.4967, + "num_input_tokens_seen": 3479536, + "step": 10330 + }, + { + "epoch": 7.986862442040185, + "grad_norm": 0.7731752991676331, + "learning_rate": 3.7552526846465565e-05, + "loss": 0.3873, + "num_input_tokens_seen": 3481360, + "step": 10335 + }, + { + "epoch": 7.990726429675425, + "grad_norm": 0.7780867218971252, + "learning_rate": 3.7537943464211314e-05, + "loss": 0.7628, + "num_input_tokens_seen": 3483024, + "step": 10340 + }, + { + "epoch": 7.994590417310665, + "grad_norm": 0.8878014087677002, + "learning_rate": 3.752335437962529e-05, + "loss": 0.5024, + "num_input_tokens_seen": 3484624, + "step": 10345 + }, + { + "epoch": 7.998454404945904, + "grad_norm": 0.9259576797485352, + "learning_rate": 3.7508759599342696e-05, + "loss": 0.5041, + "num_input_tokens_seen": 3486512, + "step": 10350 + }, + { + "epoch": 8.0, + "eval_loss": 0.4627378284931183, + "eval_runtime": 6.3324, + "eval_samples_per_second": 90.803, + "eval_steps_per_second": 22.74, + "num_input_tokens_seen": 3486928, + "step": 10352 + }, + { + "epoch": 8.002318392581143, + "grad_norm": 1.2512315511703491, + "learning_rate": 3.749415913000133e-05, + "loss": 0.5301, + "num_input_tokens_seen": 3488048, + "step": 10355 + }, + { + "epoch": 8.006182380216384, + "grad_norm": 0.6593831181526184, + "learning_rate": 3.7479552978241564e-05, + "loss": 0.5003, + "num_input_tokens_seen": 3489616, + "step": 10360 + }, + { + "epoch": 8.010046367851622, + "grad_norm": 1.5452454090118408, + "learning_rate": 3.746494115070636e-05, + "loss": 0.6158, + "num_input_tokens_seen": 3491248, + "step": 10365 + }, + { + "epoch": 8.013910355486862, + "grad_norm": 1.1132962703704834, + "learning_rate": 3.745032365404127e-05, + "loss": 0.4232, + "num_input_tokens_seen": 3493136, + "step": 10370 + }, + { + "epoch": 8.017774343122102, + "grad_norm": 1.0476287603378296, + "learning_rate": 3.7435700494894434e-05, + "loss": 0.4282, + "num_input_tokens_seen": 3494736, + "step": 10375 + }, + { + "epoch": 8.021638330757341, + "grad_norm": 1.0076993703842163, + "learning_rate": 3.742107167991653e-05, + "loss": 0.3431, + "num_input_tokens_seen": 3496304, + "step": 10380 + }, + { + "epoch": 8.025502318392581, + "grad_norm": 0.7108847498893738, + "learning_rate": 3.7406437215760836e-05, + "loss": 0.3774, + "num_input_tokens_seen": 3498064, + "step": 10385 + }, + { + "epoch": 8.029366306027821, + "grad_norm": 1.051451563835144, + "learning_rate": 3.73917971090832e-05, + "loss": 0.4708, + "num_input_tokens_seen": 3499664, + "step": 10390 + }, + { + "epoch": 8.03323029366306, + "grad_norm": 0.6702438592910767, + "learning_rate": 3.737715136654203e-05, + "loss": 0.3696, + "num_input_tokens_seen": 3501456, + "step": 10395 + }, + { + "epoch": 8.0370942812983, + "grad_norm": 0.8545743823051453, + "learning_rate": 3.7362499994798296e-05, + "loss": 0.461, + "num_input_tokens_seen": 3503152, + "step": 10400 + }, + { + "epoch": 8.04095826893354, + "grad_norm": 1.0116082429885864, + "learning_rate": 3.734784300051552e-05, + "loss": 0.4646, + "num_input_tokens_seen": 3504880, + "step": 10405 + }, + { + "epoch": 8.044822256568779, + "grad_norm": 1.0601493120193481, + "learning_rate": 3.7333180390359805e-05, + "loss": 0.4435, + "num_input_tokens_seen": 3506640, + "step": 10410 + }, + { + "epoch": 8.048686244204019, + "grad_norm": 1.3154196739196777, + "learning_rate": 3.731851217099979e-05, + "loss": 0.4426, + "num_input_tokens_seen": 3508368, + "step": 10415 + }, + { + "epoch": 8.052550231839259, + "grad_norm": 0.6886674165725708, + "learning_rate": 3.730383834910666e-05, + "loss": 0.3913, + "num_input_tokens_seen": 3510192, + "step": 10420 + }, + { + "epoch": 8.056414219474497, + "grad_norm": 1.4589358568191528, + "learning_rate": 3.728915893135417e-05, + "loss": 0.4125, + "num_input_tokens_seen": 3512080, + "step": 10425 + }, + { + "epoch": 8.060278207109738, + "grad_norm": 1.3954017162322998, + "learning_rate": 3.72744739244186e-05, + "loss": 0.4361, + "num_input_tokens_seen": 3513808, + "step": 10430 + }, + { + "epoch": 8.064142194744976, + "grad_norm": 0.90900719165802, + "learning_rate": 3.7259783334978775e-05, + "loss": 0.482, + "num_input_tokens_seen": 3515440, + "step": 10435 + }, + { + "epoch": 8.068006182380216, + "grad_norm": 1.2362951040267944, + "learning_rate": 3.724508716971607e-05, + "loss": 0.5514, + "num_input_tokens_seen": 3517296, + "step": 10440 + }, + { + "epoch": 8.071870170015456, + "grad_norm": 0.9714062809944153, + "learning_rate": 3.7230385435314397e-05, + "loss": 0.4018, + "num_input_tokens_seen": 3518960, + "step": 10445 + }, + { + "epoch": 8.075734157650695, + "grad_norm": 0.6810563206672668, + "learning_rate": 3.7215678138460176e-05, + "loss": 0.4806, + "num_input_tokens_seen": 3520528, + "step": 10450 + }, + { + "epoch": 8.079598145285935, + "grad_norm": 0.5605296492576599, + "learning_rate": 3.7200965285842384e-05, + "loss": 0.518, + "num_input_tokens_seen": 3522288, + "step": 10455 + }, + { + "epoch": 8.083462132921175, + "grad_norm": 1.282467007637024, + "learning_rate": 3.7186246884152505e-05, + "loss": 0.4133, + "num_input_tokens_seen": 3524016, + "step": 10460 + }, + { + "epoch": 8.087326120556414, + "grad_norm": 0.8330532312393188, + "learning_rate": 3.717152294008456e-05, + "loss": 0.4941, + "num_input_tokens_seen": 3525520, + "step": 10465 + }, + { + "epoch": 8.091190108191654, + "grad_norm": 0.7671037316322327, + "learning_rate": 3.7156793460335096e-05, + "loss": 0.4108, + "num_input_tokens_seen": 3527376, + "step": 10470 + }, + { + "epoch": 8.095054095826894, + "grad_norm": 1.5592666864395142, + "learning_rate": 3.714205845160316e-05, + "loss": 0.7326, + "num_input_tokens_seen": 3528784, + "step": 10475 + }, + { + "epoch": 8.098918083462133, + "grad_norm": 0.9646002650260925, + "learning_rate": 3.7127317920590324e-05, + "loss": 0.4514, + "num_input_tokens_seen": 3530640, + "step": 10480 + }, + { + "epoch": 8.102782071097373, + "grad_norm": 0.6456332206726074, + "learning_rate": 3.7112571874000676e-05, + "loss": 0.4367, + "num_input_tokens_seen": 3532272, + "step": 10485 + }, + { + "epoch": 8.106646058732611, + "grad_norm": 1.2716219425201416, + "learning_rate": 3.709782031854079e-05, + "loss": 0.4727, + "num_input_tokens_seen": 3533904, + "step": 10490 + }, + { + "epoch": 8.110510046367851, + "grad_norm": 1.0568599700927734, + "learning_rate": 3.708306326091978e-05, + "loss": 0.5856, + "num_input_tokens_seen": 3535408, + "step": 10495 + }, + { + "epoch": 8.114374034003092, + "grad_norm": 0.8858072757720947, + "learning_rate": 3.706830070784924e-05, + "loss": 0.5363, + "num_input_tokens_seen": 3536880, + "step": 10500 + }, + { + "epoch": 8.11823802163833, + "grad_norm": 0.8395446538925171, + "learning_rate": 3.705353266604326e-05, + "loss": 0.4047, + "num_input_tokens_seen": 3538576, + "step": 10505 + }, + { + "epoch": 8.12210200927357, + "grad_norm": 1.1070517301559448, + "learning_rate": 3.703875914221843e-05, + "loss": 0.5632, + "num_input_tokens_seen": 3540432, + "step": 10510 + }, + { + "epoch": 8.12596599690881, + "grad_norm": 0.7250924706459045, + "learning_rate": 3.702398014309385e-05, + "loss": 0.4116, + "num_input_tokens_seen": 3542032, + "step": 10515 + }, + { + "epoch": 8.129829984544049, + "grad_norm": 1.4386205673217773, + "learning_rate": 3.7009195675391096e-05, + "loss": 0.5424, + "num_input_tokens_seen": 3543856, + "step": 10520 + }, + { + "epoch": 8.13369397217929, + "grad_norm": 0.7331787347793579, + "learning_rate": 3.699440574583423e-05, + "loss": 0.5872, + "num_input_tokens_seen": 3545552, + "step": 10525 + }, + { + "epoch": 8.13755795981453, + "grad_norm": 0.7893253564834595, + "learning_rate": 3.6979610361149785e-05, + "loss": 0.3765, + "num_input_tokens_seen": 3547088, + "step": 10530 + }, + { + "epoch": 8.141421947449768, + "grad_norm": 0.6969060301780701, + "learning_rate": 3.6964809528066814e-05, + "loss": 0.3931, + "num_input_tokens_seen": 3548912, + "step": 10535 + }, + { + "epoch": 8.145285935085008, + "grad_norm": 0.7190797328948975, + "learning_rate": 3.6950003253316816e-05, + "loss": 0.5698, + "num_input_tokens_seen": 3550576, + "step": 10540 + }, + { + "epoch": 8.149149922720248, + "grad_norm": 0.9787706136703491, + "learning_rate": 3.6935191543633776e-05, + "loss": 0.5211, + "num_input_tokens_seen": 3552080, + "step": 10545 + }, + { + "epoch": 8.153013910355487, + "grad_norm": 0.6588833928108215, + "learning_rate": 3.6920374405754134e-05, + "loss": 0.4676, + "num_input_tokens_seen": 3553456, + "step": 10550 + }, + { + "epoch": 8.156877897990727, + "grad_norm": 0.7333282232284546, + "learning_rate": 3.690555184641683e-05, + "loss": 0.5369, + "num_input_tokens_seen": 3555056, + "step": 10555 + }, + { + "epoch": 8.160741885625965, + "grad_norm": 1.083021640777588, + "learning_rate": 3.6890723872363256e-05, + "loss": 0.842, + "num_input_tokens_seen": 3557072, + "step": 10560 + }, + { + "epoch": 8.164605873261205, + "grad_norm": 0.6893942952156067, + "learning_rate": 3.687589049033724e-05, + "loss": 0.3535, + "num_input_tokens_seen": 3558704, + "step": 10565 + }, + { + "epoch": 8.168469860896446, + "grad_norm": 0.9537882208824158, + "learning_rate": 3.686105170708511e-05, + "loss": 0.4787, + "num_input_tokens_seen": 3560368, + "step": 10570 + }, + { + "epoch": 8.172333848531684, + "grad_norm": 0.6597425937652588, + "learning_rate": 3.684620752935564e-05, + "loss": 0.4248, + "num_input_tokens_seen": 3562096, + "step": 10575 + }, + { + "epoch": 8.176197836166924, + "grad_norm": 1.5355877876281738, + "learning_rate": 3.683135796390003e-05, + "loss": 0.5005, + "num_input_tokens_seen": 3564016, + "step": 10580 + }, + { + "epoch": 8.180061823802165, + "grad_norm": 0.6876901984214783, + "learning_rate": 3.681650301747196e-05, + "loss": 0.3588, + "num_input_tokens_seen": 3565712, + "step": 10585 + }, + { + "epoch": 8.183925811437403, + "grad_norm": 0.8849683403968811, + "learning_rate": 3.680164269682756e-05, + "loss": 0.434, + "num_input_tokens_seen": 3567600, + "step": 10590 + }, + { + "epoch": 8.187789799072643, + "grad_norm": 0.9040293097496033, + "learning_rate": 3.678677700872539e-05, + "loss": 0.4857, + "num_input_tokens_seen": 3569008, + "step": 10595 + }, + { + "epoch": 8.191653786707883, + "grad_norm": 1.0567021369934082, + "learning_rate": 3.677190595992644e-05, + "loss": 0.6344, + "num_input_tokens_seen": 3570800, + "step": 10600 + }, + { + "epoch": 8.195517774343122, + "grad_norm": 0.7420886158943176, + "learning_rate": 3.675702955719416e-05, + "loss": 0.3885, + "num_input_tokens_seen": 3572304, + "step": 10605 + }, + { + "epoch": 8.199381761978362, + "grad_norm": 0.7501323223114014, + "learning_rate": 3.674214780729443e-05, + "loss": 0.5667, + "num_input_tokens_seen": 3574224, + "step": 10610 + }, + { + "epoch": 8.2032457496136, + "grad_norm": 1.1044316291809082, + "learning_rate": 3.6727260716995555e-05, + "loss": 0.7687, + "num_input_tokens_seen": 3576208, + "step": 10615 + }, + { + "epoch": 8.20710973724884, + "grad_norm": 0.8977000713348389, + "learning_rate": 3.6712368293068274e-05, + "loss": 0.4533, + "num_input_tokens_seen": 3577904, + "step": 10620 + }, + { + "epoch": 8.21097372488408, + "grad_norm": 0.8848065733909607, + "learning_rate": 3.669747054228575e-05, + "loss": 0.4571, + "num_input_tokens_seen": 3579440, + "step": 10625 + }, + { + "epoch": 8.21483771251932, + "grad_norm": 0.9888221621513367, + "learning_rate": 3.668256747142357e-05, + "loss": 0.4971, + "num_input_tokens_seen": 3581328, + "step": 10630 + }, + { + "epoch": 8.21870170015456, + "grad_norm": 0.759125828742981, + "learning_rate": 3.666765908725974e-05, + "loss": 0.6139, + "num_input_tokens_seen": 3582992, + "step": 10635 + }, + { + "epoch": 8.2225656877898, + "grad_norm": 0.7093187570571899, + "learning_rate": 3.6652745396574685e-05, + "loss": 0.4148, + "num_input_tokens_seen": 3584624, + "step": 10640 + }, + { + "epoch": 8.226429675425038, + "grad_norm": 1.3135408163070679, + "learning_rate": 3.6637826406151236e-05, + "loss": 0.4178, + "num_input_tokens_seen": 3586512, + "step": 10645 + }, + { + "epoch": 8.230293663060278, + "grad_norm": 0.7830641269683838, + "learning_rate": 3.662290212277464e-05, + "loss": 0.5491, + "num_input_tokens_seen": 3588272, + "step": 10650 + }, + { + "epoch": 8.234157650695519, + "grad_norm": 0.5679023861885071, + "learning_rate": 3.660797255323255e-05, + "loss": 0.3289, + "num_input_tokens_seen": 3589872, + "step": 10655 + }, + { + "epoch": 8.238021638330757, + "grad_norm": 0.8191899061203003, + "learning_rate": 3.659303770431503e-05, + "loss": 0.591, + "num_input_tokens_seen": 3591536, + "step": 10660 + }, + { + "epoch": 8.241885625965997, + "grad_norm": 0.7953160405158997, + "learning_rate": 3.657809758281454e-05, + "loss": 0.3846, + "num_input_tokens_seen": 3593392, + "step": 10665 + }, + { + "epoch": 8.245749613601237, + "grad_norm": 0.6029991507530212, + "learning_rate": 3.656315219552592e-05, + "loss": 0.3791, + "num_input_tokens_seen": 3595088, + "step": 10670 + }, + { + "epoch": 8.249613601236476, + "grad_norm": 0.9145137667655945, + "learning_rate": 3.654820154924643e-05, + "loss": 0.4843, + "num_input_tokens_seen": 3596880, + "step": 10675 + }, + { + "epoch": 8.253477588871716, + "grad_norm": 0.6728259325027466, + "learning_rate": 3.6533245650775726e-05, + "loss": 0.4003, + "num_input_tokens_seen": 3598416, + "step": 10680 + }, + { + "epoch": 8.257341576506954, + "grad_norm": 1.107701063156128, + "learning_rate": 3.651828450691582e-05, + "loss": 0.5124, + "num_input_tokens_seen": 3600048, + "step": 10685 + }, + { + "epoch": 8.261205564142195, + "grad_norm": 0.9781903028488159, + "learning_rate": 3.650331812447114e-05, + "loss": 0.4696, + "num_input_tokens_seen": 3601488, + "step": 10690 + }, + { + "epoch": 8.265069551777435, + "grad_norm": 0.7747291922569275, + "learning_rate": 3.64883465102485e-05, + "loss": 0.7234, + "num_input_tokens_seen": 3603152, + "step": 10695 + }, + { + "epoch": 8.268933539412673, + "grad_norm": 0.7854416966438293, + "learning_rate": 3.647336967105706e-05, + "loss": 0.4939, + "num_input_tokens_seen": 3604944, + "step": 10700 + }, + { + "epoch": 8.272797527047913, + "grad_norm": 0.769913375377655, + "learning_rate": 3.645838761370838e-05, + "loss": 0.4003, + "num_input_tokens_seen": 3606544, + "step": 10705 + }, + { + "epoch": 8.276661514683154, + "grad_norm": 0.8760992884635925, + "learning_rate": 3.6443400345016385e-05, + "loss": 0.376, + "num_input_tokens_seen": 3608080, + "step": 10710 + }, + { + "epoch": 8.280525502318392, + "grad_norm": 1.294608473777771, + "learning_rate": 3.6428407871797396e-05, + "loss": 0.8416, + "num_input_tokens_seen": 3609712, + "step": 10715 + }, + { + "epoch": 8.284389489953632, + "grad_norm": 0.7074591517448425, + "learning_rate": 3.6413410200870055e-05, + "loss": 0.5195, + "num_input_tokens_seen": 3611184, + "step": 10720 + }, + { + "epoch": 8.288253477588873, + "grad_norm": 0.7488788962364197, + "learning_rate": 3.639840733905541e-05, + "loss": 0.6312, + "num_input_tokens_seen": 3613008, + "step": 10725 + }, + { + "epoch": 8.292117465224111, + "grad_norm": 0.857903778553009, + "learning_rate": 3.638339929317683e-05, + "loss": 0.4213, + "num_input_tokens_seen": 3614896, + "step": 10730 + }, + { + "epoch": 8.295981452859351, + "grad_norm": 1.2156805992126465, + "learning_rate": 3.63683860700601e-05, + "loss": 0.5088, + "num_input_tokens_seen": 3616592, + "step": 10735 + }, + { + "epoch": 8.29984544049459, + "grad_norm": 0.752341628074646, + "learning_rate": 3.6353367676533285e-05, + "loss": 0.6543, + "num_input_tokens_seen": 3618128, + "step": 10740 + }, + { + "epoch": 8.30370942812983, + "grad_norm": 0.9795384407043457, + "learning_rate": 3.6338344119426866e-05, + "loss": 0.4604, + "num_input_tokens_seen": 3619824, + "step": 10745 + }, + { + "epoch": 8.30757341576507, + "grad_norm": 0.7563225030899048, + "learning_rate": 3.632331540557363e-05, + "loss": 0.5863, + "num_input_tokens_seen": 3621552, + "step": 10750 + }, + { + "epoch": 8.311437403400308, + "grad_norm": 0.7305439114570618, + "learning_rate": 3.6308281541808745e-05, + "loss": 0.3591, + "num_input_tokens_seen": 3623792, + "step": 10755 + }, + { + "epoch": 8.315301391035549, + "grad_norm": 0.7543095946311951, + "learning_rate": 3.629324253496969e-05, + "loss": 0.3754, + "num_input_tokens_seen": 3625200, + "step": 10760 + }, + { + "epoch": 8.319165378670789, + "grad_norm": 0.8908877968788147, + "learning_rate": 3.62781983918963e-05, + "loss": 0.3882, + "num_input_tokens_seen": 3627088, + "step": 10765 + }, + { + "epoch": 8.323029366306027, + "grad_norm": 1.038603663444519, + "learning_rate": 3.6263149119430727e-05, + "loss": 0.4252, + "num_input_tokens_seen": 3628688, + "step": 10770 + }, + { + "epoch": 8.326893353941268, + "grad_norm": 1.3282493352890015, + "learning_rate": 3.6248094724417494e-05, + "loss": 0.5949, + "num_input_tokens_seen": 3630320, + "step": 10775 + }, + { + "epoch": 8.330757341576508, + "grad_norm": 1.0539253950119019, + "learning_rate": 3.623303521370342e-05, + "loss": 0.5004, + "num_input_tokens_seen": 3631984, + "step": 10780 + }, + { + "epoch": 8.334621329211746, + "grad_norm": 1.046521782875061, + "learning_rate": 3.621797059413765e-05, + "loss": 0.3444, + "num_input_tokens_seen": 3633584, + "step": 10785 + }, + { + "epoch": 8.338485316846986, + "grad_norm": 0.6923836469650269, + "learning_rate": 3.6202900872571674e-05, + "loss": 0.4235, + "num_input_tokens_seen": 3635312, + "step": 10790 + }, + { + "epoch": 8.342349304482227, + "grad_norm": 1.5713469982147217, + "learning_rate": 3.6187826055859286e-05, + "loss": 0.5088, + "num_input_tokens_seen": 3636656, + "step": 10795 + }, + { + "epoch": 8.346213292117465, + "grad_norm": 0.7495591640472412, + "learning_rate": 3.6172746150856615e-05, + "loss": 0.4456, + "num_input_tokens_seen": 3638352, + "step": 10800 + }, + { + "epoch": 8.350077279752705, + "grad_norm": 0.6890503168106079, + "learning_rate": 3.6157661164422086e-05, + "loss": 0.4483, + "num_input_tokens_seen": 3640016, + "step": 10805 + }, + { + "epoch": 8.353941267387944, + "grad_norm": 0.8425655364990234, + "learning_rate": 3.6142571103416424e-05, + "loss": 0.42, + "num_input_tokens_seen": 3641680, + "step": 10810 + }, + { + "epoch": 8.357805255023184, + "grad_norm": 1.299270749092102, + "learning_rate": 3.612747597470271e-05, + "loss": 0.6642, + "num_input_tokens_seen": 3643600, + "step": 10815 + }, + { + "epoch": 8.361669242658424, + "grad_norm": 0.9831938147544861, + "learning_rate": 3.6112375785146276e-05, + "loss": 0.4427, + "num_input_tokens_seen": 3645264, + "step": 10820 + }, + { + "epoch": 8.365533230293662, + "grad_norm": 0.8371848464012146, + "learning_rate": 3.609727054161478e-05, + "loss": 0.4576, + "num_input_tokens_seen": 3647152, + "step": 10825 + }, + { + "epoch": 8.369397217928903, + "grad_norm": 0.854742169380188, + "learning_rate": 3.608216025097819e-05, + "loss": 0.4454, + "num_input_tokens_seen": 3649200, + "step": 10830 + }, + { + "epoch": 8.373261205564143, + "grad_norm": 1.156713604927063, + "learning_rate": 3.606704492010875e-05, + "loss": 0.4304, + "num_input_tokens_seen": 3650736, + "step": 10835 + }, + { + "epoch": 8.377125193199381, + "grad_norm": 0.936202347278595, + "learning_rate": 3.6051924555881e-05, + "loss": 0.4683, + "num_input_tokens_seen": 3652432, + "step": 10840 + }, + { + "epoch": 8.380989180834622, + "grad_norm": 1.1513667106628418, + "learning_rate": 3.6036799165171775e-05, + "loss": 0.5103, + "num_input_tokens_seen": 3654352, + "step": 10845 + }, + { + "epoch": 8.384853168469862, + "grad_norm": 0.6406664848327637, + "learning_rate": 3.602166875486019e-05, + "loss": 0.4231, + "num_input_tokens_seen": 3655952, + "step": 10850 + }, + { + "epoch": 8.3887171561051, + "grad_norm": 0.6962416172027588, + "learning_rate": 3.600653333182765e-05, + "loss": 0.3703, + "num_input_tokens_seen": 3657648, + "step": 10855 + }, + { + "epoch": 8.39258114374034, + "grad_norm": 1.021201491355896, + "learning_rate": 3.599139290295784e-05, + "loss": 0.5523, + "num_input_tokens_seen": 3659152, + "step": 10860 + }, + { + "epoch": 8.396445131375579, + "grad_norm": 1.0916900634765625, + "learning_rate": 3.597624747513671e-05, + "loss": 0.4927, + "num_input_tokens_seen": 3660816, + "step": 10865 + }, + { + "epoch": 8.400309119010819, + "grad_norm": 0.9369356632232666, + "learning_rate": 3.596109705525249e-05, + "loss": 0.459, + "num_input_tokens_seen": 3662608, + "step": 10870 + }, + { + "epoch": 8.40417310664606, + "grad_norm": 1.8277559280395508, + "learning_rate": 3.5945941650195694e-05, + "loss": 0.4539, + "num_input_tokens_seen": 3664272, + "step": 10875 + }, + { + "epoch": 8.408037094281298, + "grad_norm": 0.7330831289291382, + "learning_rate": 3.593078126685908e-05, + "loss": 0.4229, + "num_input_tokens_seen": 3666000, + "step": 10880 + }, + { + "epoch": 8.411901081916538, + "grad_norm": 0.9909090399742126, + "learning_rate": 3.591561591213768e-05, + "loss": 0.3891, + "num_input_tokens_seen": 3667664, + "step": 10885 + }, + { + "epoch": 8.415765069551778, + "grad_norm": 0.7132318615913391, + "learning_rate": 3.590044559292879e-05, + "loss": 0.4291, + "num_input_tokens_seen": 3669264, + "step": 10890 + }, + { + "epoch": 8.419629057187016, + "grad_norm": 1.3452038764953613, + "learning_rate": 3.588527031613197e-05, + "loss": 0.6305, + "num_input_tokens_seen": 3670736, + "step": 10895 + }, + { + "epoch": 8.423493044822257, + "grad_norm": 0.7029215097427368, + "learning_rate": 3.5870090088649025e-05, + "loss": 0.6247, + "num_input_tokens_seen": 3672400, + "step": 10900 + }, + { + "epoch": 8.427357032457497, + "grad_norm": 0.6425447463989258, + "learning_rate": 3.5854904917384e-05, + "loss": 0.4801, + "num_input_tokens_seen": 3673840, + "step": 10905 + }, + { + "epoch": 8.431221020092735, + "grad_norm": 0.7747843265533447, + "learning_rate": 3.5839714809243216e-05, + "loss": 0.4852, + "num_input_tokens_seen": 3675408, + "step": 10910 + }, + { + "epoch": 8.435085007727976, + "grad_norm": 1.0107040405273438, + "learning_rate": 3.582451977113521e-05, + "loss": 0.622, + "num_input_tokens_seen": 3677136, + "step": 10915 + }, + { + "epoch": 8.438948995363216, + "grad_norm": 0.5223643183708191, + "learning_rate": 3.58093198099708e-05, + "loss": 0.6814, + "num_input_tokens_seen": 3678960, + "step": 10920 + }, + { + "epoch": 8.442812982998454, + "grad_norm": 1.071202278137207, + "learning_rate": 3.5794114932663006e-05, + "loss": 0.4581, + "num_input_tokens_seen": 3680688, + "step": 10925 + }, + { + "epoch": 8.446676970633694, + "grad_norm": 0.9075097441673279, + "learning_rate": 3.5778905146127086e-05, + "loss": 0.4575, + "num_input_tokens_seen": 3682544, + "step": 10930 + }, + { + "epoch": 8.450540958268933, + "grad_norm": 0.826533317565918, + "learning_rate": 3.5763690457280566e-05, + "loss": 0.3787, + "num_input_tokens_seen": 3684624, + "step": 10935 + }, + { + "epoch": 8.454404945904173, + "grad_norm": 0.7104076147079468, + "learning_rate": 3.574847087304316e-05, + "loss": 0.5904, + "num_input_tokens_seen": 3686512, + "step": 10940 + }, + { + "epoch": 8.458268933539413, + "grad_norm": 1.0089470148086548, + "learning_rate": 3.5733246400336825e-05, + "loss": 0.4391, + "num_input_tokens_seen": 3688112, + "step": 10945 + }, + { + "epoch": 8.462132921174652, + "grad_norm": 0.8645634055137634, + "learning_rate": 3.571801704608575e-05, + "loss": 0.5288, + "num_input_tokens_seen": 3689776, + "step": 10950 + }, + { + "epoch": 8.465996908809892, + "grad_norm": 0.9598163366317749, + "learning_rate": 3.570278281721634e-05, + "loss": 0.5556, + "num_input_tokens_seen": 3691280, + "step": 10955 + }, + { + "epoch": 8.469860896445132, + "grad_norm": 1.1100084781646729, + "learning_rate": 3.568754372065721e-05, + "loss": 0.4306, + "num_input_tokens_seen": 3692816, + "step": 10960 + }, + { + "epoch": 8.47372488408037, + "grad_norm": 0.9017814993858337, + "learning_rate": 3.5672299763339185e-05, + "loss": 0.3606, + "num_input_tokens_seen": 3694512, + "step": 10965 + }, + { + "epoch": 8.47758887171561, + "grad_norm": 1.3964879512786865, + "learning_rate": 3.565705095219531e-05, + "loss": 0.6306, + "num_input_tokens_seen": 3696016, + "step": 10970 + }, + { + "epoch": 8.481452859350851, + "grad_norm": 0.6530844569206238, + "learning_rate": 3.564179729416085e-05, + "loss": 0.583, + "num_input_tokens_seen": 3697680, + "step": 10975 + }, + { + "epoch": 8.48531684698609, + "grad_norm": 0.9716774225234985, + "learning_rate": 3.562653879617324e-05, + "loss": 0.6145, + "num_input_tokens_seen": 3699472, + "step": 10980 + }, + { + "epoch": 8.48918083462133, + "grad_norm": 0.8583157658576965, + "learning_rate": 3.561127546517215e-05, + "loss": 0.488, + "num_input_tokens_seen": 3701136, + "step": 10985 + }, + { + "epoch": 8.493044822256568, + "grad_norm": 0.9134001731872559, + "learning_rate": 3.559600730809943e-05, + "loss": 0.3725, + "num_input_tokens_seen": 3702800, + "step": 10990 + }, + { + "epoch": 8.496908809891808, + "grad_norm": 1.3137269020080566, + "learning_rate": 3.558073433189913e-05, + "loss": 0.362, + "num_input_tokens_seen": 3704368, + "step": 10995 + }, + { + "epoch": 8.500772797527048, + "grad_norm": 0.6819579005241394, + "learning_rate": 3.556545654351749e-05, + "loss": 0.3726, + "num_input_tokens_seen": 3706128, + "step": 11000 + }, + { + "epoch": 8.504636785162287, + "grad_norm": 1.004225254058838, + "learning_rate": 3.555017394990294e-05, + "loss": 0.4454, + "num_input_tokens_seen": 3707600, + "step": 11005 + }, + { + "epoch": 8.508500772797527, + "grad_norm": 0.9635434150695801, + "learning_rate": 3.5534886558006094e-05, + "loss": 0.45, + "num_input_tokens_seen": 3709488, + "step": 11010 + }, + { + "epoch": 8.512364760432767, + "grad_norm": 0.8737886548042297, + "learning_rate": 3.5519594374779744e-05, + "loss": 0.4458, + "num_input_tokens_seen": 3710992, + "step": 11015 + }, + { + "epoch": 8.516228748068006, + "grad_norm": 0.7098567485809326, + "learning_rate": 3.5504297407178885e-05, + "loss": 0.3196, + "num_input_tokens_seen": 3712592, + "step": 11020 + }, + { + "epoch": 8.520092735703246, + "grad_norm": 0.716801643371582, + "learning_rate": 3.548899566216065e-05, + "loss": 0.4493, + "num_input_tokens_seen": 3714096, + "step": 11025 + }, + { + "epoch": 8.523956723338486, + "grad_norm": 1.144237756729126, + "learning_rate": 3.547368914668438e-05, + "loss": 0.5006, + "num_input_tokens_seen": 3715888, + "step": 11030 + }, + { + "epoch": 8.527820710973725, + "grad_norm": 0.8507484793663025, + "learning_rate": 3.545837786771155e-05, + "loss": 0.6326, + "num_input_tokens_seen": 3717552, + "step": 11035 + }, + { + "epoch": 8.531684698608965, + "grad_norm": 0.784989058971405, + "learning_rate": 3.544306183220584e-05, + "loss": 0.3598, + "num_input_tokens_seen": 3718992, + "step": 11040 + }, + { + "epoch": 8.535548686244205, + "grad_norm": 0.716141402721405, + "learning_rate": 3.5427741047133065e-05, + "loss": 0.3732, + "num_input_tokens_seen": 3720848, + "step": 11045 + }, + { + "epoch": 8.539412673879443, + "grad_norm": 0.9034358263015747, + "learning_rate": 3.541241551946122e-05, + "loss": 0.4758, + "num_input_tokens_seen": 3722704, + "step": 11050 + }, + { + "epoch": 8.543276661514684, + "grad_norm": 1.5580171346664429, + "learning_rate": 3.539708525616042e-05, + "loss": 0.4231, + "num_input_tokens_seen": 3724240, + "step": 11055 + }, + { + "epoch": 8.547140649149922, + "grad_norm": 1.0799485445022583, + "learning_rate": 3.5381750264203004e-05, + "loss": 0.5137, + "num_input_tokens_seen": 3725616, + "step": 11060 + }, + { + "epoch": 8.551004636785162, + "grad_norm": 0.8419619798660278, + "learning_rate": 3.536641055056338e-05, + "loss": 0.4574, + "num_input_tokens_seen": 3727312, + "step": 11065 + }, + { + "epoch": 8.554868624420402, + "grad_norm": 1.2236500978469849, + "learning_rate": 3.5351066122218155e-05, + "loss": 0.5414, + "num_input_tokens_seen": 3729136, + "step": 11070 + }, + { + "epoch": 8.55873261205564, + "grad_norm": 1.0258381366729736, + "learning_rate": 3.533571698614607e-05, + "loss": 0.4085, + "num_input_tokens_seen": 3730832, + "step": 11075 + }, + { + "epoch": 8.562596599690881, + "grad_norm": 1.1785855293273926, + "learning_rate": 3.5320363149328006e-05, + "loss": 0.4869, + "num_input_tokens_seen": 3732496, + "step": 11080 + }, + { + "epoch": 8.566460587326121, + "grad_norm": 1.6087218523025513, + "learning_rate": 3.5305004618746976e-05, + "loss": 0.4977, + "num_input_tokens_seen": 3734032, + "step": 11085 + }, + { + "epoch": 8.57032457496136, + "grad_norm": 0.991348147392273, + "learning_rate": 3.528964140138812e-05, + "loss": 0.4684, + "num_input_tokens_seen": 3735472, + "step": 11090 + }, + { + "epoch": 8.5741885625966, + "grad_norm": 0.8087354898452759, + "learning_rate": 3.527427350423874e-05, + "loss": 0.3664, + "num_input_tokens_seen": 3737040, + "step": 11095 + }, + { + "epoch": 8.578052550231838, + "grad_norm": 0.995976448059082, + "learning_rate": 3.525890093428824e-05, + "loss": 0.5033, + "num_input_tokens_seen": 3738928, + "step": 11100 + }, + { + "epoch": 8.581916537867079, + "grad_norm": 0.7722896933555603, + "learning_rate": 3.524352369852816e-05, + "loss": 0.7027, + "num_input_tokens_seen": 3740688, + "step": 11105 + }, + { + "epoch": 8.585780525502319, + "grad_norm": 1.2620346546173096, + "learning_rate": 3.522814180395215e-05, + "loss": 0.4743, + "num_input_tokens_seen": 3742480, + "step": 11110 + }, + { + "epoch": 8.589644513137557, + "grad_norm": 1.4497047662734985, + "learning_rate": 3.521275525755599e-05, + "loss": 0.4114, + "num_input_tokens_seen": 3744016, + "step": 11115 + }, + { + "epoch": 8.593508500772797, + "grad_norm": 1.3938676118850708, + "learning_rate": 3.5197364066337585e-05, + "loss": 0.5026, + "num_input_tokens_seen": 3745872, + "step": 11120 + }, + { + "epoch": 8.597372488408038, + "grad_norm": 0.6096464395523071, + "learning_rate": 3.518196823729693e-05, + "loss": 0.7342, + "num_input_tokens_seen": 3747632, + "step": 11125 + }, + { + "epoch": 8.601236476043276, + "grad_norm": 0.6492934823036194, + "learning_rate": 3.5166567777436146e-05, + "loss": 0.3874, + "num_input_tokens_seen": 3749104, + "step": 11130 + }, + { + "epoch": 8.605100463678516, + "grad_norm": 0.9591032266616821, + "learning_rate": 3.5151162693759455e-05, + "loss": 0.5035, + "num_input_tokens_seen": 3750640, + "step": 11135 + }, + { + "epoch": 8.608964451313756, + "grad_norm": 0.9860171675682068, + "learning_rate": 3.513575299327317e-05, + "loss": 0.4399, + "num_input_tokens_seen": 3752368, + "step": 11140 + }, + { + "epoch": 8.612828438948995, + "grad_norm": 1.2613753080368042, + "learning_rate": 3.5120338682985725e-05, + "loss": 0.4192, + "num_input_tokens_seen": 3754320, + "step": 11145 + }, + { + "epoch": 8.616692426584235, + "grad_norm": 0.9641014337539673, + "learning_rate": 3.510491976990764e-05, + "loss": 0.54, + "num_input_tokens_seen": 3756240, + "step": 11150 + }, + { + "epoch": 8.620556414219475, + "grad_norm": 0.9710080623626709, + "learning_rate": 3.508949626105152e-05, + "loss": 0.5662, + "num_input_tokens_seen": 3757968, + "step": 11155 + }, + { + "epoch": 8.624420401854714, + "grad_norm": 0.6599406003952026, + "learning_rate": 3.507406816343209e-05, + "loss": 0.4868, + "num_input_tokens_seen": 3759376, + "step": 11160 + }, + { + "epoch": 8.628284389489954, + "grad_norm": 0.5332229733467102, + "learning_rate": 3.505863548406613e-05, + "loss": 0.367, + "num_input_tokens_seen": 3760880, + "step": 11165 + }, + { + "epoch": 8.632148377125194, + "grad_norm": 0.9301839470863342, + "learning_rate": 3.50431982299725e-05, + "loss": 0.3632, + "num_input_tokens_seen": 3762544, + "step": 11170 + }, + { + "epoch": 8.636012364760433, + "grad_norm": 0.8046787977218628, + "learning_rate": 3.502775640817217e-05, + "loss": 0.4253, + "num_input_tokens_seen": 3764400, + "step": 11175 + }, + { + "epoch": 8.639876352395673, + "grad_norm": 1.433274507522583, + "learning_rate": 3.5012310025688176e-05, + "loss": 0.4622, + "num_input_tokens_seen": 3765936, + "step": 11180 + }, + { + "epoch": 8.643740340030911, + "grad_norm": 0.9687391519546509, + "learning_rate": 3.499685908954562e-05, + "loss": 0.5031, + "num_input_tokens_seen": 3767376, + "step": 11185 + }, + { + "epoch": 8.647604327666151, + "grad_norm": 0.9616634249687195, + "learning_rate": 3.498140360677168e-05, + "loss": 0.4782, + "num_input_tokens_seen": 3769264, + "step": 11190 + }, + { + "epoch": 8.651468315301392, + "grad_norm": 0.7425277829170227, + "learning_rate": 3.4965943584395604e-05, + "loss": 0.3491, + "num_input_tokens_seen": 3770832, + "step": 11195 + }, + { + "epoch": 8.65533230293663, + "grad_norm": 1.9730501174926758, + "learning_rate": 3.4950479029448706e-05, + "loss": 0.4585, + "num_input_tokens_seen": 3772432, + "step": 11200 + }, + { + "epoch": 8.65919629057187, + "grad_norm": 1.3684935569763184, + "learning_rate": 3.4935009948964345e-05, + "loss": 0.5888, + "num_input_tokens_seen": 3774160, + "step": 11205 + }, + { + "epoch": 8.66306027820711, + "grad_norm": 0.6056753993034363, + "learning_rate": 3.491953634997796e-05, + "loss": 0.4006, + "num_input_tokens_seen": 3775632, + "step": 11210 + }, + { + "epoch": 8.666924265842349, + "grad_norm": 0.9449382424354553, + "learning_rate": 3.4904058239527055e-05, + "loss": 0.3921, + "num_input_tokens_seen": 3777296, + "step": 11215 + }, + { + "epoch": 8.670788253477589, + "grad_norm": 0.9277834296226501, + "learning_rate": 3.4888575624651144e-05, + "loss": 0.4567, + "num_input_tokens_seen": 3778992, + "step": 11220 + }, + { + "epoch": 8.674652241112828, + "grad_norm": 0.7527439594268799, + "learning_rate": 3.487308851239181e-05, + "loss": 0.3793, + "num_input_tokens_seen": 3780592, + "step": 11225 + }, + { + "epoch": 8.678516228748068, + "grad_norm": 1.870705485343933, + "learning_rate": 3.4857596909792694e-05, + "loss": 0.4209, + "num_input_tokens_seen": 3782288, + "step": 11230 + }, + { + "epoch": 8.682380216383308, + "grad_norm": 1.0970112085342407, + "learning_rate": 3.484210082389947e-05, + "loss": 0.442, + "num_input_tokens_seen": 3783728, + "step": 11235 + }, + { + "epoch": 8.686244204018546, + "grad_norm": 1.1082730293273926, + "learning_rate": 3.482660026175985e-05, + "loss": 0.6603, + "num_input_tokens_seen": 3785520, + "step": 11240 + }, + { + "epoch": 8.690108191653787, + "grad_norm": 0.9617175459861755, + "learning_rate": 3.4811095230423586e-05, + "loss": 0.4039, + "num_input_tokens_seen": 3787184, + "step": 11245 + }, + { + "epoch": 8.693972179289027, + "grad_norm": 0.9294751286506653, + "learning_rate": 3.479558573694245e-05, + "loss": 0.4115, + "num_input_tokens_seen": 3788816, + "step": 11250 + }, + { + "epoch": 8.697836166924265, + "grad_norm": 0.8893986344337463, + "learning_rate": 3.478007178837026e-05, + "loss": 0.4418, + "num_input_tokens_seen": 3790704, + "step": 11255 + }, + { + "epoch": 8.701700154559505, + "grad_norm": 0.7246124744415283, + "learning_rate": 3.476455339176284e-05, + "loss": 0.3847, + "num_input_tokens_seen": 3792624, + "step": 11260 + }, + { + "epoch": 8.705564142194746, + "grad_norm": 1.1624765396118164, + "learning_rate": 3.474903055417807e-05, + "loss": 0.6857, + "num_input_tokens_seen": 3794160, + "step": 11265 + }, + { + "epoch": 8.709428129829984, + "grad_norm": 1.0984808206558228, + "learning_rate": 3.473350328267582e-05, + "loss": 0.6407, + "num_input_tokens_seen": 3795856, + "step": 11270 + }, + { + "epoch": 8.713292117465224, + "grad_norm": 0.838512122631073, + "learning_rate": 3.4717971584317984e-05, + "loss": 0.5477, + "num_input_tokens_seen": 3797712, + "step": 11275 + }, + { + "epoch": 8.717156105100464, + "grad_norm": 0.6516395211219788, + "learning_rate": 3.470243546616847e-05, + "loss": 0.3928, + "num_input_tokens_seen": 3799376, + "step": 11280 + }, + { + "epoch": 8.721020092735703, + "grad_norm": 1.0588940382003784, + "learning_rate": 3.468689493529321e-05, + "loss": 0.663, + "num_input_tokens_seen": 3801008, + "step": 11285 + }, + { + "epoch": 8.724884080370943, + "grad_norm": 1.0150171518325806, + "learning_rate": 3.4671349998760104e-05, + "loss": 0.4509, + "num_input_tokens_seen": 3802832, + "step": 11290 + }, + { + "epoch": 8.728748068006183, + "grad_norm": 0.7782866954803467, + "learning_rate": 3.465580066363911e-05, + "loss": 0.6731, + "num_input_tokens_seen": 3804656, + "step": 11295 + }, + { + "epoch": 8.732612055641422, + "grad_norm": 0.805216908454895, + "learning_rate": 3.4640246937002144e-05, + "loss": 0.4289, + "num_input_tokens_seen": 3806448, + "step": 11300 + }, + { + "epoch": 8.736476043276662, + "grad_norm": 0.8477769494056702, + "learning_rate": 3.4624688825923146e-05, + "loss": 0.4679, + "num_input_tokens_seen": 3808144, + "step": 11305 + }, + { + "epoch": 8.7403400309119, + "grad_norm": 1.3147850036621094, + "learning_rate": 3.4609126337478016e-05, + "loss": 0.4395, + "num_input_tokens_seen": 3809616, + "step": 11310 + }, + { + "epoch": 8.74420401854714, + "grad_norm": 0.7988678216934204, + "learning_rate": 3.459355947874469e-05, + "loss": 0.3576, + "num_input_tokens_seen": 3811120, + "step": 11315 + }, + { + "epoch": 8.74806800618238, + "grad_norm": 0.7791758179664612, + "learning_rate": 3.457798825680306e-05, + "loss": 0.4217, + "num_input_tokens_seen": 3812880, + "step": 11320 + }, + { + "epoch": 8.75193199381762, + "grad_norm": 0.9095813035964966, + "learning_rate": 3.456241267873501e-05, + "loss": 0.3845, + "num_input_tokens_seen": 3814672, + "step": 11325 + }, + { + "epoch": 8.75579598145286, + "grad_norm": 0.9713788628578186, + "learning_rate": 3.45468327516244e-05, + "loss": 0.51, + "num_input_tokens_seen": 3816496, + "step": 11330 + }, + { + "epoch": 8.7596599690881, + "grad_norm": 1.0047889947891235, + "learning_rate": 3.4531248482557086e-05, + "loss": 0.4751, + "num_input_tokens_seen": 3818288, + "step": 11335 + }, + { + "epoch": 8.763523956723338, + "grad_norm": 1.212613582611084, + "learning_rate": 3.4515659878620886e-05, + "loss": 0.4384, + "num_input_tokens_seen": 3820048, + "step": 11340 + }, + { + "epoch": 8.767387944358578, + "grad_norm": 0.7681643962860107, + "learning_rate": 3.4500066946905585e-05, + "loss": 0.3865, + "num_input_tokens_seen": 3821616, + "step": 11345 + }, + { + "epoch": 8.771251931993817, + "grad_norm": 0.6002762913703918, + "learning_rate": 3.4484469694502934e-05, + "loss": 0.4387, + "num_input_tokens_seen": 3823472, + "step": 11350 + }, + { + "epoch": 8.775115919629057, + "grad_norm": 1.008750319480896, + "learning_rate": 3.446886812850668e-05, + "loss": 0.3991, + "num_input_tokens_seen": 3825456, + "step": 11355 + }, + { + "epoch": 8.778979907264297, + "grad_norm": 0.9856827259063721, + "learning_rate": 3.4453262256012476e-05, + "loss": 0.4692, + "num_input_tokens_seen": 3827280, + "step": 11360 + }, + { + "epoch": 8.782843894899536, + "grad_norm": 1.3722552061080933, + "learning_rate": 3.4437652084118e-05, + "loss": 0.6129, + "num_input_tokens_seen": 3828880, + "step": 11365 + }, + { + "epoch": 8.786707882534776, + "grad_norm": 1.5796374082565308, + "learning_rate": 3.4422037619922826e-05, + "loss": 0.668, + "num_input_tokens_seen": 3830576, + "step": 11370 + }, + { + "epoch": 8.790571870170016, + "grad_norm": 1.0470051765441895, + "learning_rate": 3.440641887052852e-05, + "loss": 0.3926, + "num_input_tokens_seen": 3832112, + "step": 11375 + }, + { + "epoch": 8.794435857805254, + "grad_norm": 0.9240325689315796, + "learning_rate": 3.439079584303858e-05, + "loss": 0.4517, + "num_input_tokens_seen": 3833808, + "step": 11380 + }, + { + "epoch": 8.798299845440495, + "grad_norm": 1.3263416290283203, + "learning_rate": 3.437516854455846e-05, + "loss": 0.4432, + "num_input_tokens_seen": 3835472, + "step": 11385 + }, + { + "epoch": 8.802163833075735, + "grad_norm": 0.6707918047904968, + "learning_rate": 3.4359536982195527e-05, + "loss": 0.4081, + "num_input_tokens_seen": 3837104, + "step": 11390 + }, + { + "epoch": 8.806027820710973, + "grad_norm": 1.7243413925170898, + "learning_rate": 3.4343901163059125e-05, + "loss": 0.5067, + "num_input_tokens_seen": 3838832, + "step": 11395 + }, + { + "epoch": 8.809891808346213, + "grad_norm": 0.9192718267440796, + "learning_rate": 3.432826109426052e-05, + "loss": 0.3495, + "num_input_tokens_seen": 3840560, + "step": 11400 + }, + { + "epoch": 8.813755795981454, + "grad_norm": 0.8882884979248047, + "learning_rate": 3.4312616782912897e-05, + "loss": 0.4304, + "num_input_tokens_seen": 3842288, + "step": 11405 + }, + { + "epoch": 8.817619783616692, + "grad_norm": 1.1443355083465576, + "learning_rate": 3.42969682361314e-05, + "loss": 0.4767, + "num_input_tokens_seen": 3843984, + "step": 11410 + }, + { + "epoch": 8.821483771251932, + "grad_norm": 0.7157551050186157, + "learning_rate": 3.428131546103306e-05, + "loss": 0.4134, + "num_input_tokens_seen": 3845680, + "step": 11415 + }, + { + "epoch": 8.825347758887172, + "grad_norm": 0.9334703087806702, + "learning_rate": 3.4265658464736876e-05, + "loss": 0.4133, + "num_input_tokens_seen": 3847440, + "step": 11420 + }, + { + "epoch": 8.829211746522411, + "grad_norm": 1.669008731842041, + "learning_rate": 3.424999725436373e-05, + "loss": 0.3877, + "num_input_tokens_seen": 3848944, + "step": 11425 + }, + { + "epoch": 8.833075734157651, + "grad_norm": 1.1440805196762085, + "learning_rate": 3.423433183703643e-05, + "loss": 0.5059, + "num_input_tokens_seen": 3850800, + "step": 11430 + }, + { + "epoch": 8.83693972179289, + "grad_norm": 1.0591709613800049, + "learning_rate": 3.421866221987972e-05, + "loss": 0.6606, + "num_input_tokens_seen": 3852368, + "step": 11435 + }, + { + "epoch": 8.84080370942813, + "grad_norm": 0.8224958777427673, + "learning_rate": 3.420298841002021e-05, + "loss": 0.3968, + "num_input_tokens_seen": 3854192, + "step": 11440 + }, + { + "epoch": 8.84466769706337, + "grad_norm": 1.3519151210784912, + "learning_rate": 3.4187310414586474e-05, + "loss": 0.424, + "num_input_tokens_seen": 3856112, + "step": 11445 + }, + { + "epoch": 8.848531684698608, + "grad_norm": 0.47656068205833435, + "learning_rate": 3.417162824070892e-05, + "loss": 0.4106, + "num_input_tokens_seen": 3857744, + "step": 11450 + }, + { + "epoch": 8.852395672333849, + "grad_norm": 0.9924972057342529, + "learning_rate": 3.415594189551993e-05, + "loss": 0.5518, + "num_input_tokens_seen": 3859248, + "step": 11455 + }, + { + "epoch": 8.856259659969089, + "grad_norm": 0.7680327892303467, + "learning_rate": 3.414025138615372e-05, + "loss": 0.4995, + "num_input_tokens_seen": 3860816, + "step": 11460 + }, + { + "epoch": 8.860123647604327, + "grad_norm": 0.7283303141593933, + "learning_rate": 3.4124556719746455e-05, + "loss": 0.4573, + "num_input_tokens_seen": 3862800, + "step": 11465 + }, + { + "epoch": 8.863987635239567, + "grad_norm": 1.21336030960083, + "learning_rate": 3.410885790343614e-05, + "loss": 0.4402, + "num_input_tokens_seen": 3864496, + "step": 11470 + }, + { + "epoch": 8.867851622874806, + "grad_norm": 1.5134820938110352, + "learning_rate": 3.4093154944362706e-05, + "loss": 0.7169, + "num_input_tokens_seen": 3866064, + "step": 11475 + }, + { + "epoch": 8.871715610510046, + "grad_norm": 1.2454161643981934, + "learning_rate": 3.407744784966795e-05, + "loss": 0.4933, + "num_input_tokens_seen": 3867792, + "step": 11480 + }, + { + "epoch": 8.875579598145286, + "grad_norm": 0.9121777415275574, + "learning_rate": 3.406173662649554e-05, + "loss": 0.4056, + "num_input_tokens_seen": 3869456, + "step": 11485 + }, + { + "epoch": 8.879443585780525, + "grad_norm": 0.6111441254615784, + "learning_rate": 3.404602128199105e-05, + "loss": 0.3537, + "num_input_tokens_seen": 3870928, + "step": 11490 + }, + { + "epoch": 8.883307573415765, + "grad_norm": 0.9139411449432373, + "learning_rate": 3.40303018233019e-05, + "loss": 0.5503, + "num_input_tokens_seen": 3872624, + "step": 11495 + }, + { + "epoch": 8.887171561051005, + "grad_norm": 1.116279125213623, + "learning_rate": 3.40145782575774e-05, + "loss": 0.3963, + "num_input_tokens_seen": 3874032, + "step": 11500 + }, + { + "epoch": 8.891035548686244, + "grad_norm": 0.7426797151565552, + "learning_rate": 3.399885059196873e-05, + "loss": 0.4034, + "num_input_tokens_seen": 3875568, + "step": 11505 + }, + { + "epoch": 8.894899536321484, + "grad_norm": 0.8565546274185181, + "learning_rate": 3.3983118833628914e-05, + "loss": 0.3668, + "num_input_tokens_seen": 3877360, + "step": 11510 + }, + { + "epoch": 8.898763523956724, + "grad_norm": 1.2723388671875, + "learning_rate": 3.3967382989712856e-05, + "loss": 0.5897, + "num_input_tokens_seen": 3879248, + "step": 11515 + }, + { + "epoch": 8.902627511591962, + "grad_norm": 1.4320738315582275, + "learning_rate": 3.39516430673773e-05, + "loss": 0.5524, + "num_input_tokens_seen": 3880816, + "step": 11520 + }, + { + "epoch": 8.906491499227203, + "grad_norm": 0.8292056322097778, + "learning_rate": 3.3935899073780885e-05, + "loss": 0.3455, + "num_input_tokens_seen": 3882576, + "step": 11525 + }, + { + "epoch": 8.910355486862443, + "grad_norm": 0.7946489453315735, + "learning_rate": 3.392015101608405e-05, + "loss": 0.4363, + "num_input_tokens_seen": 3884208, + "step": 11530 + }, + { + "epoch": 8.914219474497681, + "grad_norm": 0.8079558610916138, + "learning_rate": 3.39043989014491e-05, + "loss": 0.4126, + "num_input_tokens_seen": 3886096, + "step": 11535 + }, + { + "epoch": 8.918083462132921, + "grad_norm": 0.6243640184402466, + "learning_rate": 3.3888642737040224e-05, + "loss": 0.426, + "num_input_tokens_seen": 3887760, + "step": 11540 + }, + { + "epoch": 8.921947449768162, + "grad_norm": 0.9272558689117432, + "learning_rate": 3.387288253002339e-05, + "loss": 0.4318, + "num_input_tokens_seen": 3889616, + "step": 11545 + }, + { + "epoch": 8.9258114374034, + "grad_norm": 0.7858620882034302, + "learning_rate": 3.385711828756644e-05, + "loss": 0.4569, + "num_input_tokens_seen": 3891504, + "step": 11550 + }, + { + "epoch": 8.92967542503864, + "grad_norm": 1.0407615900039673, + "learning_rate": 3.384135001683905e-05, + "loss": 0.433, + "num_input_tokens_seen": 3892912, + "step": 11555 + }, + { + "epoch": 8.933539412673879, + "grad_norm": 1.2179776430130005, + "learning_rate": 3.382557772501273e-05, + "loss": 0.4949, + "num_input_tokens_seen": 3894416, + "step": 11560 + }, + { + "epoch": 8.937403400309119, + "grad_norm": 0.8698446750640869, + "learning_rate": 3.38098014192608e-05, + "loss": 0.5016, + "num_input_tokens_seen": 3896016, + "step": 11565 + }, + { + "epoch": 8.94126738794436, + "grad_norm": 0.6859121918678284, + "learning_rate": 3.379402110675843e-05, + "loss": 0.6449, + "num_input_tokens_seen": 3897680, + "step": 11570 + }, + { + "epoch": 8.945131375579598, + "grad_norm": 0.8695012927055359, + "learning_rate": 3.377823679468259e-05, + "loss": 0.5728, + "num_input_tokens_seen": 3899632, + "step": 11575 + }, + { + "epoch": 8.948995363214838, + "grad_norm": 1.2380353212356567, + "learning_rate": 3.37624484902121e-05, + "loss": 0.4562, + "num_input_tokens_seen": 3901616, + "step": 11580 + }, + { + "epoch": 8.952859350850078, + "grad_norm": 0.7308654189109802, + "learning_rate": 3.3746656200527535e-05, + "loss": 0.5869, + "num_input_tokens_seen": 3903312, + "step": 11585 + }, + { + "epoch": 8.956723338485316, + "grad_norm": 0.9882901310920715, + "learning_rate": 3.3730859932811364e-05, + "loss": 0.3635, + "num_input_tokens_seen": 3905104, + "step": 11590 + }, + { + "epoch": 8.960587326120557, + "grad_norm": 1.1030919551849365, + "learning_rate": 3.371505969424781e-05, + "loss": 0.4543, + "num_input_tokens_seen": 3906896, + "step": 11595 + }, + { + "epoch": 8.964451313755795, + "grad_norm": 0.9177827835083008, + "learning_rate": 3.369925549202291e-05, + "loss": 0.422, + "num_input_tokens_seen": 3908560, + "step": 11600 + }, + { + "epoch": 8.968315301391035, + "grad_norm": 0.6573162078857422, + "learning_rate": 3.368344733332451e-05, + "loss": 0.3762, + "num_input_tokens_seen": 3910032, + "step": 11605 + }, + { + "epoch": 8.972179289026275, + "grad_norm": 0.722504198551178, + "learning_rate": 3.366763522534227e-05, + "loss": 0.3334, + "num_input_tokens_seen": 3911472, + "step": 11610 + }, + { + "epoch": 8.976043276661514, + "grad_norm": 1.219506025314331, + "learning_rate": 3.365181917526761e-05, + "loss": 0.4446, + "num_input_tokens_seen": 3913232, + "step": 11615 + }, + { + "epoch": 8.979907264296754, + "grad_norm": 1.389388084411621, + "learning_rate": 3.363599919029378e-05, + "loss": 0.6526, + "num_input_tokens_seen": 3915024, + "step": 11620 + }, + { + "epoch": 8.983771251931994, + "grad_norm": 1.109411358833313, + "learning_rate": 3.3620175277615806e-05, + "loss": 0.5547, + "num_input_tokens_seen": 3916944, + "step": 11625 + }, + { + "epoch": 8.987635239567233, + "grad_norm": 1.0708798170089722, + "learning_rate": 3.360434744443049e-05, + "loss": 0.4272, + "num_input_tokens_seen": 3918864, + "step": 11630 + }, + { + "epoch": 8.991499227202473, + "grad_norm": 1.2970728874206543, + "learning_rate": 3.358851569793642e-05, + "loss": 0.4861, + "num_input_tokens_seen": 3920688, + "step": 11635 + }, + { + "epoch": 8.995363214837713, + "grad_norm": 0.8009732961654663, + "learning_rate": 3.357268004533398e-05, + "loss": 0.4003, + "num_input_tokens_seen": 3922640, + "step": 11640 + }, + { + "epoch": 8.999227202472952, + "grad_norm": 1.05343496799469, + "learning_rate": 3.355684049382532e-05, + "loss": 0.3721, + "num_input_tokens_seen": 3924080, + "step": 11645 + }, + { + "epoch": 9.0, + "eval_loss": 0.46014276146888733, + "eval_runtime": 6.2488, + "eval_samples_per_second": 92.017, + "eval_steps_per_second": 23.044, + "num_input_tokens_seen": 3924192, + "step": 11646 + }, + { + "epoch": 9.003091190108192, + "grad_norm": 0.6342517137527466, + "learning_rate": 3.354099705061435e-05, + "loss": 0.4592, + "num_input_tokens_seen": 3925600, + "step": 11650 + }, + { + "epoch": 9.006955177743432, + "grad_norm": 1.366655945777893, + "learning_rate": 3.352514972290676e-05, + "loss": 0.5437, + "num_input_tokens_seen": 3927424, + "step": 11655 + }, + { + "epoch": 9.01081916537867, + "grad_norm": 1.2763205766677856, + "learning_rate": 3.3509298517910045e-05, + "loss": 0.9005, + "num_input_tokens_seen": 3929504, + "step": 11660 + }, + { + "epoch": 9.01468315301391, + "grad_norm": 1.2634462118148804, + "learning_rate": 3.3493443442833397e-05, + "loss": 0.4332, + "num_input_tokens_seen": 3931264, + "step": 11665 + }, + { + "epoch": 9.018547140649149, + "grad_norm": 0.606615424156189, + "learning_rate": 3.34775845048878e-05, + "loss": 0.4366, + "num_input_tokens_seen": 3932736, + "step": 11670 + }, + { + "epoch": 9.02241112828439, + "grad_norm": 0.6981626152992249, + "learning_rate": 3.3461721711286e-05, + "loss": 0.439, + "num_input_tokens_seen": 3934336, + "step": 11675 + }, + { + "epoch": 9.02627511591963, + "grad_norm": 1.3104767799377441, + "learning_rate": 3.344585506924249e-05, + "loss": 0.4784, + "num_input_tokens_seen": 3936128, + "step": 11680 + }, + { + "epoch": 9.030139103554868, + "grad_norm": 0.8850065469741821, + "learning_rate": 3.342998458597352e-05, + "loss": 0.3163, + "num_input_tokens_seen": 3937600, + "step": 11685 + }, + { + "epoch": 9.034003091190108, + "grad_norm": 1.1754002571105957, + "learning_rate": 3.3414110268697075e-05, + "loss": 0.5872, + "num_input_tokens_seen": 3939136, + "step": 11690 + }, + { + "epoch": 9.037867078825348, + "grad_norm": 0.7362832427024841, + "learning_rate": 3.3398232124632884e-05, + "loss": 0.4454, + "num_input_tokens_seen": 3940704, + "step": 11695 + }, + { + "epoch": 9.041731066460587, + "grad_norm": 1.0437588691711426, + "learning_rate": 3.3382350161002434e-05, + "loss": 0.4252, + "num_input_tokens_seen": 3942272, + "step": 11700 + }, + { + "epoch": 9.045595054095827, + "grad_norm": 0.7351408004760742, + "learning_rate": 3.336646438502893e-05, + "loss": 0.326, + "num_input_tokens_seen": 3943808, + "step": 11705 + }, + { + "epoch": 9.049459041731067, + "grad_norm": 0.8959833383560181, + "learning_rate": 3.3350574803937315e-05, + "loss": 0.43, + "num_input_tokens_seen": 3945376, + "step": 11710 + }, + { + "epoch": 9.053323029366306, + "grad_norm": 0.8262898325920105, + "learning_rate": 3.3334681424954274e-05, + "loss": 0.3518, + "num_input_tokens_seen": 3947072, + "step": 11715 + }, + { + "epoch": 9.057187017001546, + "grad_norm": 1.4424265623092651, + "learning_rate": 3.33187842553082e-05, + "loss": 0.943, + "num_input_tokens_seen": 3949152, + "step": 11720 + }, + { + "epoch": 9.061051004636786, + "grad_norm": 1.01751708984375, + "learning_rate": 3.330288330222923e-05, + "loss": 0.4976, + "num_input_tokens_seen": 3950720, + "step": 11725 + }, + { + "epoch": 9.064914992272024, + "grad_norm": 0.8158180713653564, + "learning_rate": 3.3286978572949214e-05, + "loss": 0.4389, + "num_input_tokens_seen": 3952320, + "step": 11730 + }, + { + "epoch": 9.068778979907265, + "grad_norm": 0.7000887393951416, + "learning_rate": 3.327107007470171e-05, + "loss": 0.6397, + "num_input_tokens_seen": 3954144, + "step": 11735 + }, + { + "epoch": 9.072642967542503, + "grad_norm": 0.7504770159721375, + "learning_rate": 3.3255157814722003e-05, + "loss": 0.3591, + "num_input_tokens_seen": 3956192, + "step": 11740 + }, + { + "epoch": 9.076506955177743, + "grad_norm": 1.2649695873260498, + "learning_rate": 3.3239241800247086e-05, + "loss": 0.3902, + "num_input_tokens_seen": 3957760, + "step": 11745 + }, + { + "epoch": 9.080370942812984, + "grad_norm": 1.003210425376892, + "learning_rate": 3.3223322038515656e-05, + "loss": 0.6797, + "num_input_tokens_seen": 3959648, + "step": 11750 + }, + { + "epoch": 9.084234930448222, + "grad_norm": 0.7819150686264038, + "learning_rate": 3.320739853676812e-05, + "loss": 0.4871, + "num_input_tokens_seen": 3961472, + "step": 11755 + }, + { + "epoch": 9.088098918083462, + "grad_norm": 1.9185489416122437, + "learning_rate": 3.319147130224656e-05, + "loss": 0.3658, + "num_input_tokens_seen": 3963040, + "step": 11760 + }, + { + "epoch": 9.091962905718702, + "grad_norm": 1.3213945627212524, + "learning_rate": 3.317554034219481e-05, + "loss": 0.7428, + "num_input_tokens_seen": 3964672, + "step": 11765 + }, + { + "epoch": 9.09582689335394, + "grad_norm": 0.7197156548500061, + "learning_rate": 3.315960566385835e-05, + "loss": 0.6393, + "num_input_tokens_seen": 3966496, + "step": 11770 + }, + { + "epoch": 9.099690880989181, + "grad_norm": 0.7921718955039978, + "learning_rate": 3.314366727448436e-05, + "loss": 0.5847, + "num_input_tokens_seen": 3968320, + "step": 11775 + }, + { + "epoch": 9.103554868624421, + "grad_norm": 0.6026407480239868, + "learning_rate": 3.312772518132173e-05, + "loss": 0.4924, + "num_input_tokens_seen": 3970240, + "step": 11780 + }, + { + "epoch": 9.10741885625966, + "grad_norm": 0.8319979906082153, + "learning_rate": 3.3111779391621014e-05, + "loss": 0.4109, + "num_input_tokens_seen": 3971808, + "step": 11785 + }, + { + "epoch": 9.1112828438949, + "grad_norm": 1.172366976737976, + "learning_rate": 3.3095829912634445e-05, + "loss": 0.6255, + "num_input_tokens_seen": 3973216, + "step": 11790 + }, + { + "epoch": 9.115146831530138, + "grad_norm": 0.6915415525436401, + "learning_rate": 3.307987675161595e-05, + "loss": 0.6285, + "num_input_tokens_seen": 3975008, + "step": 11795 + }, + { + "epoch": 9.119010819165378, + "grad_norm": 1.2987585067749023, + "learning_rate": 3.3063919915821115e-05, + "loss": 0.4563, + "num_input_tokens_seen": 3976640, + "step": 11800 + }, + { + "epoch": 9.122874806800619, + "grad_norm": 1.0090389251708984, + "learning_rate": 3.304795941250722e-05, + "loss": 0.4825, + "num_input_tokens_seen": 3978304, + "step": 11805 + }, + { + "epoch": 9.126738794435857, + "grad_norm": 1.4553642272949219, + "learning_rate": 3.3031995248933176e-05, + "loss": 0.4571, + "num_input_tokens_seen": 3979936, + "step": 11810 + }, + { + "epoch": 9.130602782071097, + "grad_norm": 0.7582365870475769, + "learning_rate": 3.30160274323596e-05, + "loss": 0.3759, + "num_input_tokens_seen": 3981664, + "step": 11815 + }, + { + "epoch": 9.134466769706338, + "grad_norm": 0.651155948638916, + "learning_rate": 3.3000055970048734e-05, + "loss": 0.348, + "num_input_tokens_seen": 3983424, + "step": 11820 + }, + { + "epoch": 9.138330757341576, + "grad_norm": 0.8478977084159851, + "learning_rate": 3.298408086926451e-05, + "loss": 0.5021, + "num_input_tokens_seen": 3985088, + "step": 11825 + }, + { + "epoch": 9.142194744976816, + "grad_norm": 0.9551302194595337, + "learning_rate": 3.296810213727249e-05, + "loss": 0.5425, + "num_input_tokens_seen": 3986752, + "step": 11830 + }, + { + "epoch": 9.146058732612056, + "grad_norm": 0.9851644039154053, + "learning_rate": 3.2952119781339895e-05, + "loss": 0.48, + "num_input_tokens_seen": 3988256, + "step": 11835 + }, + { + "epoch": 9.149922720247295, + "grad_norm": 1.6056536436080933, + "learning_rate": 3.29361338087356e-05, + "loss": 0.6172, + "num_input_tokens_seen": 3989888, + "step": 11840 + }, + { + "epoch": 9.153786707882535, + "grad_norm": 0.7076149582862854, + "learning_rate": 3.2920144226730124e-05, + "loss": 0.3736, + "num_input_tokens_seen": 3991488, + "step": 11845 + }, + { + "epoch": 9.157650695517773, + "grad_norm": 0.9709126353263855, + "learning_rate": 3.290415104259563e-05, + "loss": 0.5934, + "num_input_tokens_seen": 3993056, + "step": 11850 + }, + { + "epoch": 9.161514683153014, + "grad_norm": 0.7400974631309509, + "learning_rate": 3.288815426360589e-05, + "loss": 0.5535, + "num_input_tokens_seen": 3994560, + "step": 11855 + }, + { + "epoch": 9.165378670788254, + "grad_norm": 0.8294342160224915, + "learning_rate": 3.287215389703636e-05, + "loss": 0.5421, + "num_input_tokens_seen": 3996288, + "step": 11860 + }, + { + "epoch": 9.169242658423492, + "grad_norm": 0.6509759426116943, + "learning_rate": 3.285614995016409e-05, + "loss": 0.3562, + "num_input_tokens_seen": 3997824, + "step": 11865 + }, + { + "epoch": 9.173106646058732, + "grad_norm": 0.6968894600868225, + "learning_rate": 3.284014243026778e-05, + "loss": 0.338, + "num_input_tokens_seen": 3999424, + "step": 11870 + }, + { + "epoch": 9.176970633693973, + "grad_norm": 0.6156551241874695, + "learning_rate": 3.282413134462773e-05, + "loss": 0.4656, + "num_input_tokens_seen": 4001248, + "step": 11875 + }, + { + "epoch": 9.180834621329211, + "grad_norm": 1.189975619316101, + "learning_rate": 3.2808116700525886e-05, + "loss": 0.3859, + "num_input_tokens_seen": 4002688, + "step": 11880 + }, + { + "epoch": 9.184698608964451, + "grad_norm": 0.7941542267799377, + "learning_rate": 3.279209850524582e-05, + "loss": 0.4213, + "num_input_tokens_seen": 4004224, + "step": 11885 + }, + { + "epoch": 9.188562596599692, + "grad_norm": 1.2526997327804565, + "learning_rate": 3.2776076766072685e-05, + "loss": 0.7434, + "num_input_tokens_seen": 4005824, + "step": 11890 + }, + { + "epoch": 9.19242658423493, + "grad_norm": 1.1977574825286865, + "learning_rate": 3.2760051490293255e-05, + "loss": 0.4716, + "num_input_tokens_seen": 4007456, + "step": 11895 + }, + { + "epoch": 9.19629057187017, + "grad_norm": 1.392229676246643, + "learning_rate": 3.274402268519594e-05, + "loss": 0.4674, + "num_input_tokens_seen": 4009376, + "step": 11900 + }, + { + "epoch": 9.20015455950541, + "grad_norm": 0.9214085936546326, + "learning_rate": 3.272799035807074e-05, + "loss": 0.5715, + "num_input_tokens_seen": 4010976, + "step": 11905 + }, + { + "epoch": 9.204018547140649, + "grad_norm": 0.5892452001571655, + "learning_rate": 3.2711954516209236e-05, + "loss": 0.3826, + "num_input_tokens_seen": 4012576, + "step": 11910 + }, + { + "epoch": 9.207882534775889, + "grad_norm": 0.9146372079849243, + "learning_rate": 3.269591516690463e-05, + "loss": 0.3948, + "num_input_tokens_seen": 4014208, + "step": 11915 + }, + { + "epoch": 9.211746522411127, + "grad_norm": 1.750875473022461, + "learning_rate": 3.267987231745172e-05, + "loss": 0.5806, + "num_input_tokens_seen": 4015968, + "step": 11920 + }, + { + "epoch": 9.215610510046368, + "grad_norm": 0.9858949780464172, + "learning_rate": 3.2663825975146896e-05, + "loss": 0.4542, + "num_input_tokens_seen": 4017760, + "step": 11925 + }, + { + "epoch": 9.219474497681608, + "grad_norm": 0.6001882553100586, + "learning_rate": 3.264777614728811e-05, + "loss": 0.4768, + "num_input_tokens_seen": 4019328, + "step": 11930 + }, + { + "epoch": 9.223338485316846, + "grad_norm": 1.1043297052383423, + "learning_rate": 3.263172284117493e-05, + "loss": 0.4535, + "num_input_tokens_seen": 4021088, + "step": 11935 + }, + { + "epoch": 9.227202472952087, + "grad_norm": 1.8163152933120728, + "learning_rate": 3.261566606410851e-05, + "loss": 0.4194, + "num_input_tokens_seen": 4022816, + "step": 11940 + }, + { + "epoch": 9.231066460587327, + "grad_norm": 0.7318902611732483, + "learning_rate": 3.259960582339155e-05, + "loss": 0.3848, + "num_input_tokens_seen": 4024512, + "step": 11945 + }, + { + "epoch": 9.234930448222565, + "grad_norm": 0.9285306334495544, + "learning_rate": 3.258354212632834e-05, + "loss": 0.5344, + "num_input_tokens_seen": 4026080, + "step": 11950 + }, + { + "epoch": 9.238794435857805, + "grad_norm": 0.6282941102981567, + "learning_rate": 3.256747498022476e-05, + "loss": 0.497, + "num_input_tokens_seen": 4027776, + "step": 11955 + }, + { + "epoch": 9.242658423493046, + "grad_norm": 1.0854583978652954, + "learning_rate": 3.255140439238825e-05, + "loss": 0.5421, + "num_input_tokens_seen": 4029504, + "step": 11960 + }, + { + "epoch": 9.246522411128284, + "grad_norm": 1.2815793752670288, + "learning_rate": 3.2535330370127786e-05, + "loss": 0.418, + "num_input_tokens_seen": 4031264, + "step": 11965 + }, + { + "epoch": 9.250386398763524, + "grad_norm": 0.7317447066307068, + "learning_rate": 3.251925292075395e-05, + "loss": 0.3638, + "num_input_tokens_seen": 4032800, + "step": 11970 + }, + { + "epoch": 9.254250386398763, + "grad_norm": 1.1539829969406128, + "learning_rate": 3.2503172051578846e-05, + "loss": 0.4749, + "num_input_tokens_seen": 4034592, + "step": 11975 + }, + { + "epoch": 9.258114374034003, + "grad_norm": 0.88090980052948, + "learning_rate": 3.248708776991617e-05, + "loss": 0.3696, + "num_input_tokens_seen": 4036256, + "step": 11980 + }, + { + "epoch": 9.261978361669243, + "grad_norm": 1.218133568763733, + "learning_rate": 3.2471000083081126e-05, + "loss": 0.5294, + "num_input_tokens_seen": 4037760, + "step": 11985 + }, + { + "epoch": 9.265842349304481, + "grad_norm": 1.790350079536438, + "learning_rate": 3.2454908998390506e-05, + "loss": 0.7004, + "num_input_tokens_seen": 4039776, + "step": 11990 + }, + { + "epoch": 9.269706336939722, + "grad_norm": 0.9798881411552429, + "learning_rate": 3.243881452316263e-05, + "loss": 0.4498, + "num_input_tokens_seen": 4041376, + "step": 11995 + }, + { + "epoch": 9.273570324574962, + "grad_norm": 0.9088618159294128, + "learning_rate": 3.242271666471736e-05, + "loss": 0.4625, + "num_input_tokens_seen": 4043104, + "step": 12000 + }, + { + "epoch": 9.2774343122102, + "grad_norm": 1.0214378833770752, + "learning_rate": 3.2406615430376095e-05, + "loss": 0.3808, + "num_input_tokens_seen": 4044704, + "step": 12005 + }, + { + "epoch": 9.28129829984544, + "grad_norm": 0.5557891726493835, + "learning_rate": 3.2390510827461785e-05, + "loss": 0.5139, + "num_input_tokens_seen": 4046336, + "step": 12010 + }, + { + "epoch": 9.28516228748068, + "grad_norm": 0.9534186124801636, + "learning_rate": 3.237440286329888e-05, + "loss": 0.3902, + "num_input_tokens_seen": 4047904, + "step": 12015 + }, + { + "epoch": 9.28902627511592, + "grad_norm": 0.7536749839782715, + "learning_rate": 3.235829154521339e-05, + "loss": 0.3818, + "num_input_tokens_seen": 4049888, + "step": 12020 + }, + { + "epoch": 9.29289026275116, + "grad_norm": 0.7613241672515869, + "learning_rate": 3.234217688053284e-05, + "loss": 0.3602, + "num_input_tokens_seen": 4051744, + "step": 12025 + }, + { + "epoch": 9.2967542503864, + "grad_norm": 0.8175050020217896, + "learning_rate": 3.232605887658628e-05, + "loss": 0.3999, + "num_input_tokens_seen": 4053312, + "step": 12030 + }, + { + "epoch": 9.300618238021638, + "grad_norm": 0.9474180340766907, + "learning_rate": 3.2309937540704256e-05, + "loss": 0.368, + "num_input_tokens_seen": 4055040, + "step": 12035 + }, + { + "epoch": 9.304482225656878, + "grad_norm": 1.0375590324401855, + "learning_rate": 3.229381288021887e-05, + "loss": 0.6305, + "num_input_tokens_seen": 4056896, + "step": 12040 + }, + { + "epoch": 9.308346213292117, + "grad_norm": 0.7682391405105591, + "learning_rate": 3.2277684902463705e-05, + "loss": 0.4037, + "num_input_tokens_seen": 4058592, + "step": 12045 + }, + { + "epoch": 9.312210200927357, + "grad_norm": 0.7668389678001404, + "learning_rate": 3.226155361477386e-05, + "loss": 0.5675, + "num_input_tokens_seen": 4060128, + "step": 12050 + }, + { + "epoch": 9.316074188562597, + "grad_norm": 1.0516327619552612, + "learning_rate": 3.224541902448594e-05, + "loss": 0.6066, + "num_input_tokens_seen": 4061600, + "step": 12055 + }, + { + "epoch": 9.319938176197835, + "grad_norm": 1.0091170072555542, + "learning_rate": 3.2229281138938063e-05, + "loss": 0.5781, + "num_input_tokens_seen": 4063328, + "step": 12060 + }, + { + "epoch": 9.323802163833076, + "grad_norm": 1.6289970874786377, + "learning_rate": 3.221313996546983e-05, + "loss": 0.5075, + "num_input_tokens_seen": 4064992, + "step": 12065 + }, + { + "epoch": 9.327666151468316, + "grad_norm": 1.7977286577224731, + "learning_rate": 3.219699551142234e-05, + "loss": 0.5074, + "num_input_tokens_seen": 4066592, + "step": 12070 + }, + { + "epoch": 9.331530139103554, + "grad_norm": 1.0079339742660522, + "learning_rate": 3.2180847784138193e-05, + "loss": 0.4345, + "num_input_tokens_seen": 4068224, + "step": 12075 + }, + { + "epoch": 9.335394126738795, + "grad_norm": 0.532315194606781, + "learning_rate": 3.216469679096146e-05, + "loss": 0.3882, + "num_input_tokens_seen": 4070080, + "step": 12080 + }, + { + "epoch": 9.339258114374035, + "grad_norm": 0.6787154674530029, + "learning_rate": 3.214854253923772e-05, + "loss": 0.3961, + "num_input_tokens_seen": 4071712, + "step": 12085 + }, + { + "epoch": 9.343122102009273, + "grad_norm": 0.8873351812362671, + "learning_rate": 3.213238503631404e-05, + "loss": 0.4452, + "num_input_tokens_seen": 4073536, + "step": 12090 + }, + { + "epoch": 9.346986089644513, + "grad_norm": 1.1992220878601074, + "learning_rate": 3.2116224289538916e-05, + "loss": 0.4291, + "num_input_tokens_seen": 4075200, + "step": 12095 + }, + { + "epoch": 9.350850077279752, + "grad_norm": 0.9857003688812256, + "learning_rate": 3.210006030626237e-05, + "loss": 0.663, + "num_input_tokens_seen": 4076768, + "step": 12100 + }, + { + "epoch": 9.354714064914992, + "grad_norm": 0.6789494752883911, + "learning_rate": 3.2083893093835876e-05, + "loss": 0.3943, + "num_input_tokens_seen": 4078400, + "step": 12105 + }, + { + "epoch": 9.358578052550232, + "grad_norm": 0.9226491451263428, + "learning_rate": 3.2067722659612384e-05, + "loss": 0.4477, + "num_input_tokens_seen": 4080096, + "step": 12110 + }, + { + "epoch": 9.36244204018547, + "grad_norm": 0.863536536693573, + "learning_rate": 3.205154901094629e-05, + "loss": 0.3762, + "num_input_tokens_seen": 4081824, + "step": 12115 + }, + { + "epoch": 9.36630602782071, + "grad_norm": 0.7092015147209167, + "learning_rate": 3.203537215519349e-05, + "loss": 0.6676, + "num_input_tokens_seen": 4083392, + "step": 12120 + }, + { + "epoch": 9.370170015455951, + "grad_norm": 1.1675121784210205, + "learning_rate": 3.201919209971128e-05, + "loss": 0.594, + "num_input_tokens_seen": 4085248, + "step": 12125 + }, + { + "epoch": 9.37403400309119, + "grad_norm": 1.1627898216247559, + "learning_rate": 3.200300885185849e-05, + "loss": 0.73, + "num_input_tokens_seen": 4086880, + "step": 12130 + }, + { + "epoch": 9.37789799072643, + "grad_norm": 1.0247116088867188, + "learning_rate": 3.1986822418995314e-05, + "loss": 0.3857, + "num_input_tokens_seen": 4088768, + "step": 12135 + }, + { + "epoch": 9.38176197836167, + "grad_norm": 1.2392778396606445, + "learning_rate": 3.197063280848347e-05, + "loss": 0.4984, + "num_input_tokens_seen": 4090432, + "step": 12140 + }, + { + "epoch": 9.385625965996908, + "grad_norm": 0.9083396196365356, + "learning_rate": 3.195444002768608e-05, + "loss": 0.391, + "num_input_tokens_seen": 4091904, + "step": 12145 + }, + { + "epoch": 9.389489953632149, + "grad_norm": 0.9728336930274963, + "learning_rate": 3.193824408396772e-05, + "loss": 0.4308, + "num_input_tokens_seen": 4093536, + "step": 12150 + }, + { + "epoch": 9.393353941267389, + "grad_norm": 0.6852738857269287, + "learning_rate": 3.1922044984694386e-05, + "loss": 0.469, + "num_input_tokens_seen": 4095136, + "step": 12155 + }, + { + "epoch": 9.397217928902627, + "grad_norm": 0.5640857219696045, + "learning_rate": 3.190584273723355e-05, + "loss": 0.3842, + "num_input_tokens_seen": 4096864, + "step": 12160 + }, + { + "epoch": 9.401081916537867, + "grad_norm": 0.9432113170623779, + "learning_rate": 3.1889637348954076e-05, + "loss": 0.5018, + "num_input_tokens_seen": 4098592, + "step": 12165 + }, + { + "epoch": 9.404945904173106, + "grad_norm": 1.2781380414962769, + "learning_rate": 3.187342882722628e-05, + "loss": 0.4515, + "num_input_tokens_seen": 4100288, + "step": 12170 + }, + { + "epoch": 9.408809891808346, + "grad_norm": 1.173780918121338, + "learning_rate": 3.185721717942188e-05, + "loss": 0.3709, + "num_input_tokens_seen": 4102048, + "step": 12175 + }, + { + "epoch": 9.412673879443586, + "grad_norm": 1.265337347984314, + "learning_rate": 3.184100241291405e-05, + "loss": 0.5707, + "num_input_tokens_seen": 4103872, + "step": 12180 + }, + { + "epoch": 9.416537867078825, + "grad_norm": 1.0413291454315186, + "learning_rate": 3.1824784535077344e-05, + "loss": 0.3694, + "num_input_tokens_seen": 4105536, + "step": 12185 + }, + { + "epoch": 9.420401854714065, + "grad_norm": 0.9160683751106262, + "learning_rate": 3.180856355328776e-05, + "loss": 0.5168, + "num_input_tokens_seen": 4107360, + "step": 12190 + }, + { + "epoch": 9.424265842349305, + "grad_norm": 1.1587311029434204, + "learning_rate": 3.1792339474922704e-05, + "loss": 0.4328, + "num_input_tokens_seen": 4109152, + "step": 12195 + }, + { + "epoch": 9.428129829984544, + "grad_norm": 1.4957998991012573, + "learning_rate": 3.177611230736098e-05, + "loss": 0.4317, + "num_input_tokens_seen": 4110944, + "step": 12200 + }, + { + "epoch": 9.431993817619784, + "grad_norm": 1.043190836906433, + "learning_rate": 3.175988205798279e-05, + "loss": 0.4435, + "num_input_tokens_seen": 4112640, + "step": 12205 + }, + { + "epoch": 9.435857805255024, + "grad_norm": 0.9445250630378723, + "learning_rate": 3.174364873416976e-05, + "loss": 0.3961, + "num_input_tokens_seen": 4114624, + "step": 12210 + }, + { + "epoch": 9.439721792890262, + "grad_norm": 1.0917181968688965, + "learning_rate": 3.1727412343304896e-05, + "loss": 0.4286, + "num_input_tokens_seen": 4116160, + "step": 12215 + }, + { + "epoch": 9.443585780525503, + "grad_norm": 0.8319627642631531, + "learning_rate": 3.171117289277262e-05, + "loss": 0.3696, + "num_input_tokens_seen": 4117824, + "step": 12220 + }, + { + "epoch": 9.447449768160741, + "grad_norm": 0.7744445204734802, + "learning_rate": 3.169493038995871e-05, + "loss": 0.466, + "num_input_tokens_seen": 4119392, + "step": 12225 + }, + { + "epoch": 9.451313755795981, + "grad_norm": 0.9481706023216248, + "learning_rate": 3.167868484225037e-05, + "loss": 0.4393, + "num_input_tokens_seen": 4120992, + "step": 12230 + }, + { + "epoch": 9.455177743431221, + "grad_norm": 1.2216180562973022, + "learning_rate": 3.166243625703616e-05, + "loss": 0.4541, + "num_input_tokens_seen": 4122912, + "step": 12235 + }, + { + "epoch": 9.45904173106646, + "grad_norm": 0.7574498057365417, + "learning_rate": 3.1646184641706054e-05, + "loss": 0.5397, + "num_input_tokens_seen": 4125024, + "step": 12240 + }, + { + "epoch": 9.4629057187017, + "grad_norm": 0.9236118793487549, + "learning_rate": 3.162993000365135e-05, + "loss": 0.592, + "num_input_tokens_seen": 4126560, + "step": 12245 + }, + { + "epoch": 9.46676970633694, + "grad_norm": 0.6639173030853271, + "learning_rate": 3.1613672350264795e-05, + "loss": 0.3541, + "num_input_tokens_seen": 4128192, + "step": 12250 + }, + { + "epoch": 9.470633693972179, + "grad_norm": 0.923959493637085, + "learning_rate": 3.1597411688940435e-05, + "loss": 0.4424, + "num_input_tokens_seen": 4130208, + "step": 12255 + }, + { + "epoch": 9.474497681607419, + "grad_norm": 0.6455754637718201, + "learning_rate": 3.158114802707373e-05, + "loss": 0.3886, + "num_input_tokens_seen": 4131584, + "step": 12260 + }, + { + "epoch": 9.478361669242659, + "grad_norm": 1.0506973266601562, + "learning_rate": 3.1564881372061493e-05, + "loss": 0.5405, + "num_input_tokens_seen": 4133312, + "step": 12265 + }, + { + "epoch": 9.482225656877898, + "grad_norm": 0.601471483707428, + "learning_rate": 3.1548611731301895e-05, + "loss": 0.5222, + "num_input_tokens_seen": 4135008, + "step": 12270 + }, + { + "epoch": 9.486089644513138, + "grad_norm": 0.7911010980606079, + "learning_rate": 3.153233911219446e-05, + "loss": 0.4212, + "num_input_tokens_seen": 4136480, + "step": 12275 + }, + { + "epoch": 9.489953632148378, + "grad_norm": 0.8855540156364441, + "learning_rate": 3.151606352214007e-05, + "loss": 0.4329, + "num_input_tokens_seen": 4138016, + "step": 12280 + }, + { + "epoch": 9.493817619783616, + "grad_norm": 0.649513840675354, + "learning_rate": 3.149978496854098e-05, + "loss": 0.497, + "num_input_tokens_seen": 4139488, + "step": 12285 + }, + { + "epoch": 9.497681607418857, + "grad_norm": 1.0721580982208252, + "learning_rate": 3.1483503458800755e-05, + "loss": 0.4417, + "num_input_tokens_seen": 4141088, + "step": 12290 + }, + { + "epoch": 9.501545595054095, + "grad_norm": 1.1772985458374023, + "learning_rate": 3.146721900032431e-05, + "loss": 0.3871, + "num_input_tokens_seen": 4142720, + "step": 12295 + }, + { + "epoch": 9.505409582689335, + "grad_norm": 1.165857195854187, + "learning_rate": 3.1450931600517966e-05, + "loss": 0.4359, + "num_input_tokens_seen": 4144384, + "step": 12300 + }, + { + "epoch": 9.509273570324575, + "grad_norm": 1.4686825275421143, + "learning_rate": 3.143464126678928e-05, + "loss": 0.4634, + "num_input_tokens_seen": 4145984, + "step": 12305 + }, + { + "epoch": 9.513137557959814, + "grad_norm": 1.6765962839126587, + "learning_rate": 3.141834800654721e-05, + "loss": 0.5902, + "num_input_tokens_seen": 4147648, + "step": 12310 + }, + { + "epoch": 9.517001545595054, + "grad_norm": 1.0832332372665405, + "learning_rate": 3.140205182720203e-05, + "loss": 0.5108, + "num_input_tokens_seen": 4149568, + "step": 12315 + }, + { + "epoch": 9.520865533230294, + "grad_norm": 0.7668581604957581, + "learning_rate": 3.1385752736165336e-05, + "loss": 0.3948, + "num_input_tokens_seen": 4151200, + "step": 12320 + }, + { + "epoch": 9.524729520865533, + "grad_norm": 0.8930904269218445, + "learning_rate": 3.136945074085006e-05, + "loss": 0.5001, + "num_input_tokens_seen": 4152960, + "step": 12325 + }, + { + "epoch": 9.528593508500773, + "grad_norm": 1.0017601251602173, + "learning_rate": 3.135314584867044e-05, + "loss": 0.3795, + "num_input_tokens_seen": 4154592, + "step": 12330 + }, + { + "epoch": 9.532457496136013, + "grad_norm": 0.8287608027458191, + "learning_rate": 3.133683806704203e-05, + "loss": 0.4576, + "num_input_tokens_seen": 4156032, + "step": 12335 + }, + { + "epoch": 9.536321483771252, + "grad_norm": 1.1969608068466187, + "learning_rate": 3.132052740338174e-05, + "loss": 0.3642, + "num_input_tokens_seen": 4157728, + "step": 12340 + }, + { + "epoch": 9.540185471406492, + "grad_norm": 1.0296846628189087, + "learning_rate": 3.1304213865107715e-05, + "loss": 0.4504, + "num_input_tokens_seen": 4159424, + "step": 12345 + }, + { + "epoch": 9.54404945904173, + "grad_norm": 0.8406485319137573, + "learning_rate": 3.128789745963948e-05, + "loss": 0.366, + "num_input_tokens_seen": 4161120, + "step": 12350 + }, + { + "epoch": 9.54791344667697, + "grad_norm": 1.125619649887085, + "learning_rate": 3.127157819439782e-05, + "loss": 0.3968, + "num_input_tokens_seen": 4162816, + "step": 12355 + }, + { + "epoch": 9.55177743431221, + "grad_norm": 1.059698462486267, + "learning_rate": 3.125525607680484e-05, + "loss": 0.404, + "num_input_tokens_seen": 4164544, + "step": 12360 + }, + { + "epoch": 9.555641421947449, + "grad_norm": 1.946881651878357, + "learning_rate": 3.123893111428393e-05, + "loss": 0.4651, + "num_input_tokens_seen": 4166240, + "step": 12365 + }, + { + "epoch": 9.55950540958269, + "grad_norm": 1.235958456993103, + "learning_rate": 3.122260331425979e-05, + "loss": 0.5555, + "num_input_tokens_seen": 4167936, + "step": 12370 + }, + { + "epoch": 9.56336939721793, + "grad_norm": 0.9719915390014648, + "learning_rate": 3.12062726841584e-05, + "loss": 0.5467, + "num_input_tokens_seen": 4169472, + "step": 12375 + }, + { + "epoch": 9.567233384853168, + "grad_norm": 0.7769190073013306, + "learning_rate": 3.118993923140702e-05, + "loss": 0.6562, + "num_input_tokens_seen": 4171552, + "step": 12380 + }, + { + "epoch": 9.571097372488408, + "grad_norm": 1.0344538688659668, + "learning_rate": 3.117360296343421e-05, + "loss": 0.3944, + "num_input_tokens_seen": 4173376, + "step": 12385 + }, + { + "epoch": 9.574961360123648, + "grad_norm": 0.7368112802505493, + "learning_rate": 3.11572638876698e-05, + "loss": 0.4798, + "num_input_tokens_seen": 4175040, + "step": 12390 + }, + { + "epoch": 9.578825347758887, + "grad_norm": 0.9347113966941833, + "learning_rate": 3.1140922011544895e-05, + "loss": 0.6964, + "num_input_tokens_seen": 4176896, + "step": 12395 + }, + { + "epoch": 9.582689335394127, + "grad_norm": 0.9663181304931641, + "learning_rate": 3.1124577342491884e-05, + "loss": 0.5228, + "num_input_tokens_seen": 4178656, + "step": 12400 + }, + { + "epoch": 9.586553323029367, + "grad_norm": 1.2115358114242554, + "learning_rate": 3.110822988794442e-05, + "loss": 0.7108, + "num_input_tokens_seen": 4180128, + "step": 12405 + }, + { + "epoch": 9.590417310664606, + "grad_norm": 2.0211904048919678, + "learning_rate": 3.109187965533743e-05, + "loss": 0.5025, + "num_input_tokens_seen": 4181760, + "step": 12410 + }, + { + "epoch": 9.594281298299846, + "grad_norm": 1.7055644989013672, + "learning_rate": 3.107552665210708e-05, + "loss": 0.508, + "num_input_tokens_seen": 4183424, + "step": 12415 + }, + { + "epoch": 9.598145285935084, + "grad_norm": 0.8939148187637329, + "learning_rate": 3.1059170885690827e-05, + "loss": 0.3678, + "num_input_tokens_seen": 4185216, + "step": 12420 + }, + { + "epoch": 9.602009273570324, + "grad_norm": 1.5509419441223145, + "learning_rate": 3.104281236352737e-05, + "loss": 0.6059, + "num_input_tokens_seen": 4186816, + "step": 12425 + }, + { + "epoch": 9.605873261205565, + "grad_norm": 0.8781647682189941, + "learning_rate": 3.102645109305666e-05, + "loss": 0.383, + "num_input_tokens_seen": 4188192, + "step": 12430 + }, + { + "epoch": 9.609737248840803, + "grad_norm": 0.8158278465270996, + "learning_rate": 3.10100870817199e-05, + "loss": 0.4367, + "num_input_tokens_seen": 4189600, + "step": 12435 + }, + { + "epoch": 9.613601236476043, + "grad_norm": 0.7353460788726807, + "learning_rate": 3.099372033695954e-05, + "loss": 0.4882, + "num_input_tokens_seen": 4191200, + "step": 12440 + }, + { + "epoch": 9.617465224111283, + "grad_norm": 0.7942819595336914, + "learning_rate": 3.097735086621928e-05, + "loss": 0.4417, + "num_input_tokens_seen": 4192928, + "step": 12445 + }, + { + "epoch": 9.621329211746522, + "grad_norm": 1.3515405654907227, + "learning_rate": 3.096097867694405e-05, + "loss": 0.5585, + "num_input_tokens_seen": 4194624, + "step": 12450 + }, + { + "epoch": 9.625193199381762, + "grad_norm": 0.8807207345962524, + "learning_rate": 3.0944603776580016e-05, + "loss": 0.4738, + "num_input_tokens_seen": 4196160, + "step": 12455 + }, + { + "epoch": 9.629057187017002, + "grad_norm": 1.0857415199279785, + "learning_rate": 3.0928226172574585e-05, + "loss": 0.4105, + "num_input_tokens_seen": 4197952, + "step": 12460 + }, + { + "epoch": 9.63292117465224, + "grad_norm": 1.1562774181365967, + "learning_rate": 3.091184587237639e-05, + "loss": 0.4825, + "num_input_tokens_seen": 4199936, + "step": 12465 + }, + { + "epoch": 9.636785162287481, + "grad_norm": 1.0491265058517456, + "learning_rate": 3.0895462883435285e-05, + "loss": 0.5111, + "num_input_tokens_seen": 4201504, + "step": 12470 + }, + { + "epoch": 9.64064914992272, + "grad_norm": 1.887813687324524, + "learning_rate": 3.087907721320236e-05, + "loss": 0.376, + "num_input_tokens_seen": 4203136, + "step": 12475 + }, + { + "epoch": 9.64451313755796, + "grad_norm": 0.6552990078926086, + "learning_rate": 3.0862688869129895e-05, + "loss": 0.4201, + "num_input_tokens_seen": 4204832, + "step": 12480 + }, + { + "epoch": 9.6483771251932, + "grad_norm": 0.8338639140129089, + "learning_rate": 3.084629785867143e-05, + "loss": 0.5316, + "num_input_tokens_seen": 4206560, + "step": 12485 + }, + { + "epoch": 9.652241112828438, + "grad_norm": 1.0538944005966187, + "learning_rate": 3.0829904189281694e-05, + "loss": 0.4008, + "num_input_tokens_seen": 4208256, + "step": 12490 + }, + { + "epoch": 9.656105100463678, + "grad_norm": 0.6692137718200684, + "learning_rate": 3.081350786841661e-05, + "loss": 0.3807, + "num_input_tokens_seen": 4210016, + "step": 12495 + }, + { + "epoch": 9.659969088098919, + "grad_norm": 1.12278413772583, + "learning_rate": 3.079710890353334e-05, + "loss": 0.4118, + "num_input_tokens_seen": 4211712, + "step": 12500 + }, + { + "epoch": 9.663833075734157, + "grad_norm": 1.8917323350906372, + "learning_rate": 3.078070730209021e-05, + "loss": 0.4604, + "num_input_tokens_seen": 4213248, + "step": 12505 + }, + { + "epoch": 9.667697063369397, + "grad_norm": 0.7210338711738586, + "learning_rate": 3.0764303071546794e-05, + "loss": 0.3649, + "num_input_tokens_seen": 4214976, + "step": 12510 + }, + { + "epoch": 9.671561051004637, + "grad_norm": 0.7782012224197388, + "learning_rate": 3.074789621936381e-05, + "loss": 0.3814, + "num_input_tokens_seen": 4216448, + "step": 12515 + }, + { + "epoch": 9.675425038639876, + "grad_norm": 0.8555471301078796, + "learning_rate": 3.07314867530032e-05, + "loss": 0.5232, + "num_input_tokens_seen": 4217984, + "step": 12520 + }, + { + "epoch": 9.679289026275116, + "grad_norm": 0.6587982773780823, + "learning_rate": 3.07150746799281e-05, + "loss": 0.4565, + "num_input_tokens_seen": 4219424, + "step": 12525 + }, + { + "epoch": 9.683153013910356, + "grad_norm": 1.1501327753067017, + "learning_rate": 3.069866000760281e-05, + "loss": 0.4111, + "num_input_tokens_seen": 4221280, + "step": 12530 + }, + { + "epoch": 9.687017001545595, + "grad_norm": 0.8389038443565369, + "learning_rate": 3.0682242743492816e-05, + "loss": 0.3925, + "num_input_tokens_seen": 4222912, + "step": 12535 + }, + { + "epoch": 9.690880989180835, + "grad_norm": 0.7025433778762817, + "learning_rate": 3.066582289506479e-05, + "loss": 0.3387, + "num_input_tokens_seen": 4224480, + "step": 12540 + }, + { + "epoch": 9.694744976816073, + "grad_norm": 0.6481710076332092, + "learning_rate": 3.064940046978658e-05, + "loss": 0.3249, + "num_input_tokens_seen": 4226112, + "step": 12545 + }, + { + "epoch": 9.698608964451314, + "grad_norm": 0.9001286625862122, + "learning_rate": 3.0632975475127216e-05, + "loss": 0.5419, + "num_input_tokens_seen": 4227904, + "step": 12550 + }, + { + "epoch": 9.702472952086554, + "grad_norm": 0.7891894578933716, + "learning_rate": 3.061654791855686e-05, + "loss": 0.5255, + "num_input_tokens_seen": 4229888, + "step": 12555 + }, + { + "epoch": 9.706336939721792, + "grad_norm": 0.9386869668960571, + "learning_rate": 3.060011780754687e-05, + "loss": 0.3599, + "num_input_tokens_seen": 4231712, + "step": 12560 + }, + { + "epoch": 9.710200927357032, + "grad_norm": 0.7590145468711853, + "learning_rate": 3.058368514956977e-05, + "loss": 0.4647, + "num_input_tokens_seen": 4233760, + "step": 12565 + }, + { + "epoch": 9.714064914992273, + "grad_norm": 0.8687173128128052, + "learning_rate": 3.056724995209923e-05, + "loss": 0.4602, + "num_input_tokens_seen": 4235648, + "step": 12570 + }, + { + "epoch": 9.717928902627511, + "grad_norm": 1.0502352714538574, + "learning_rate": 3.055081222261006e-05, + "loss": 0.357, + "num_input_tokens_seen": 4237344, + "step": 12575 + }, + { + "epoch": 9.721792890262751, + "grad_norm": 1.0616201162338257, + "learning_rate": 3.0534371968578256e-05, + "loss": 0.3652, + "num_input_tokens_seen": 4239072, + "step": 12580 + }, + { + "epoch": 9.725656877897991, + "grad_norm": 0.5615456104278564, + "learning_rate": 3.0517929197480935e-05, + "loss": 0.4141, + "num_input_tokens_seen": 4240640, + "step": 12585 + }, + { + "epoch": 9.72952086553323, + "grad_norm": 1.1381762027740479, + "learning_rate": 3.050148391679637e-05, + "loss": 0.6299, + "num_input_tokens_seen": 4242336, + "step": 12590 + }, + { + "epoch": 9.73338485316847, + "grad_norm": 1.1241328716278076, + "learning_rate": 3.048503613400397e-05, + "loss": 0.4392, + "num_input_tokens_seen": 4243872, + "step": 12595 + }, + { + "epoch": 9.737248840803709, + "grad_norm": 0.6489281058311462, + "learning_rate": 3.0468585856584288e-05, + "loss": 0.4224, + "num_input_tokens_seen": 4245248, + "step": 12600 + }, + { + "epoch": 9.741112828438949, + "grad_norm": 0.9271618127822876, + "learning_rate": 3.045213309201901e-05, + "loss": 0.531, + "num_input_tokens_seen": 4247296, + "step": 12605 + }, + { + "epoch": 9.744976816074189, + "grad_norm": 0.7833739519119263, + "learning_rate": 3.043567784779095e-05, + "loss": 0.3573, + "num_input_tokens_seen": 4248672, + "step": 12610 + }, + { + "epoch": 9.748840803709427, + "grad_norm": 0.9824546575546265, + "learning_rate": 3.0419220131384053e-05, + "loss": 0.3717, + "num_input_tokens_seen": 4250176, + "step": 12615 + }, + { + "epoch": 9.752704791344668, + "grad_norm": 1.127068042755127, + "learning_rate": 3.040275995028338e-05, + "loss": 0.4863, + "num_input_tokens_seen": 4251712, + "step": 12620 + }, + { + "epoch": 9.756568778979908, + "grad_norm": 0.9797120690345764, + "learning_rate": 3.0386297311975126e-05, + "loss": 0.4378, + "num_input_tokens_seen": 4253568, + "step": 12625 + }, + { + "epoch": 9.760432766615146, + "grad_norm": 1.1885513067245483, + "learning_rate": 3.0369832223946603e-05, + "loss": 0.5077, + "num_input_tokens_seen": 4255296, + "step": 12630 + }, + { + "epoch": 9.764296754250386, + "grad_norm": 0.6952155232429504, + "learning_rate": 3.0353364693686233e-05, + "loss": 0.3708, + "num_input_tokens_seen": 4256832, + "step": 12635 + }, + { + "epoch": 9.768160741885627, + "grad_norm": 1.090328574180603, + "learning_rate": 3.033689472868352e-05, + "loss": 0.3864, + "num_input_tokens_seen": 4258432, + "step": 12640 + }, + { + "epoch": 9.772024729520865, + "grad_norm": 1.4451791048049927, + "learning_rate": 3.032042233642914e-05, + "loss": 0.5365, + "num_input_tokens_seen": 4260128, + "step": 12645 + }, + { + "epoch": 9.775888717156105, + "grad_norm": 1.024991512298584, + "learning_rate": 3.030394752441481e-05, + "loss": 0.5584, + "num_input_tokens_seen": 4262048, + "step": 12650 + }, + { + "epoch": 9.779752704791346, + "grad_norm": 1.5409144163131714, + "learning_rate": 3.0287470300133384e-05, + "loss": 0.605, + "num_input_tokens_seen": 4263680, + "step": 12655 + }, + { + "epoch": 9.783616692426584, + "grad_norm": 1.387296438217163, + "learning_rate": 3.0270990671078798e-05, + "loss": 0.4103, + "num_input_tokens_seen": 4265408, + "step": 12660 + }, + { + "epoch": 9.787480680061824, + "grad_norm": 0.9869155883789062, + "learning_rate": 3.0254508644746092e-05, + "loss": 0.3973, + "num_input_tokens_seen": 4267168, + "step": 12665 + }, + { + "epoch": 9.791344667697063, + "grad_norm": 0.9510881900787354, + "learning_rate": 3.023802422863139e-05, + "loss": 0.5376, + "num_input_tokens_seen": 4268672, + "step": 12670 + }, + { + "epoch": 9.795208655332303, + "grad_norm": 0.9065356850624084, + "learning_rate": 3.0221537430231893e-05, + "loss": 0.6063, + "num_input_tokens_seen": 4270496, + "step": 12675 + }, + { + "epoch": 9.799072642967543, + "grad_norm": 0.8389283418655396, + "learning_rate": 3.0205048257045898e-05, + "loss": 0.363, + "num_input_tokens_seen": 4272096, + "step": 12680 + }, + { + "epoch": 9.802936630602781, + "grad_norm": 1.0621027946472168, + "learning_rate": 3.0188556716572798e-05, + "loss": 0.4035, + "num_input_tokens_seen": 4273856, + "step": 12685 + }, + { + "epoch": 9.806800618238022, + "grad_norm": 0.8292392492294312, + "learning_rate": 3.017206281631302e-05, + "loss": 0.3964, + "num_input_tokens_seen": 4275488, + "step": 12690 + }, + { + "epoch": 9.810664605873262, + "grad_norm": 0.9377778172492981, + "learning_rate": 3.01555665637681e-05, + "loss": 0.533, + "num_input_tokens_seen": 4277152, + "step": 12695 + }, + { + "epoch": 9.8145285935085, + "grad_norm": 0.6959136128425598, + "learning_rate": 3.0139067966440633e-05, + "loss": 0.4509, + "num_input_tokens_seen": 4278912, + "step": 12700 + }, + { + "epoch": 9.81839258114374, + "grad_norm": 1.2766226530075073, + "learning_rate": 3.0122567031834275e-05, + "loss": 0.4931, + "num_input_tokens_seen": 4280896, + "step": 12705 + }, + { + "epoch": 9.82225656877898, + "grad_norm": 0.8495757579803467, + "learning_rate": 3.0106063767453756e-05, + "loss": 0.3358, + "num_input_tokens_seen": 4282560, + "step": 12710 + }, + { + "epoch": 9.826120556414219, + "grad_norm": 0.9005439281463623, + "learning_rate": 3.0089558180804857e-05, + "loss": 0.3711, + "num_input_tokens_seen": 4283968, + "step": 12715 + }, + { + "epoch": 9.82998454404946, + "grad_norm": 1.0053863525390625, + "learning_rate": 3.0073050279394416e-05, + "loss": 0.4567, + "num_input_tokens_seen": 4285664, + "step": 12720 + }, + { + "epoch": 9.833848531684698, + "grad_norm": 0.8427708148956299, + "learning_rate": 3.0056540070730323e-05, + "loss": 0.4706, + "num_input_tokens_seen": 4287584, + "step": 12725 + }, + { + "epoch": 9.837712519319938, + "grad_norm": 1.3879148960113525, + "learning_rate": 3.0040027562321525e-05, + "loss": 0.4223, + "num_input_tokens_seen": 4289472, + "step": 12730 + }, + { + "epoch": 9.841576506955178, + "grad_norm": 0.8163154125213623, + "learning_rate": 3.0023512761678017e-05, + "loss": 0.4383, + "num_input_tokens_seen": 4291328, + "step": 12735 + }, + { + "epoch": 9.845440494590417, + "grad_norm": 0.9294406771659851, + "learning_rate": 3.0006995676310813e-05, + "loss": 0.4993, + "num_input_tokens_seen": 4293088, + "step": 12740 + }, + { + "epoch": 9.849304482225657, + "grad_norm": 0.6189129948616028, + "learning_rate": 2.9990476313731986e-05, + "loss": 0.464, + "num_input_tokens_seen": 4294720, + "step": 12745 + }, + { + "epoch": 9.853168469860897, + "grad_norm": 1.3029412031173706, + "learning_rate": 2.997395468145465e-05, + "loss": 0.5343, + "num_input_tokens_seen": 4296416, + "step": 12750 + }, + { + "epoch": 9.857032457496135, + "grad_norm": 1.122997760772705, + "learning_rate": 2.995743078699294e-05, + "loss": 0.4245, + "num_input_tokens_seen": 4298048, + "step": 12755 + }, + { + "epoch": 9.860896445131376, + "grad_norm": 0.9818601012229919, + "learning_rate": 2.994090463786201e-05, + "loss": 0.3866, + "num_input_tokens_seen": 4299776, + "step": 12760 + }, + { + "epoch": 9.864760432766616, + "grad_norm": 1.0715118646621704, + "learning_rate": 2.9924376241578068e-05, + "loss": 0.4444, + "num_input_tokens_seen": 4301440, + "step": 12765 + }, + { + "epoch": 9.868624420401854, + "grad_norm": 0.8407461047172546, + "learning_rate": 2.990784560565832e-05, + "loss": 0.6067, + "num_input_tokens_seen": 4303200, + "step": 12770 + }, + { + "epoch": 9.872488408037094, + "grad_norm": 0.9402106404304504, + "learning_rate": 2.9891312737620996e-05, + "loss": 0.3988, + "num_input_tokens_seen": 4304768, + "step": 12775 + }, + { + "epoch": 9.876352395672335, + "grad_norm": 0.7825422286987305, + "learning_rate": 2.987477764498534e-05, + "loss": 0.3603, + "num_input_tokens_seen": 4306368, + "step": 12780 + }, + { + "epoch": 9.880216383307573, + "grad_norm": 0.5780319571495056, + "learning_rate": 2.985824033527163e-05, + "loss": 0.3744, + "num_input_tokens_seen": 4308000, + "step": 12785 + }, + { + "epoch": 9.884080370942813, + "grad_norm": 0.8074292540550232, + "learning_rate": 2.9841700816001115e-05, + "loss": 0.7029, + "num_input_tokens_seen": 4309632, + "step": 12790 + }, + { + "epoch": 9.887944358578052, + "grad_norm": 1.0768837928771973, + "learning_rate": 2.9825159094696076e-05, + "loss": 0.4571, + "num_input_tokens_seen": 4311328, + "step": 12795 + }, + { + "epoch": 9.891808346213292, + "grad_norm": 0.8343664407730103, + "learning_rate": 2.9808615178879778e-05, + "loss": 0.7622, + "num_input_tokens_seen": 4312960, + "step": 12800 + }, + { + "epoch": 9.895672333848532, + "grad_norm": 0.5081785917282104, + "learning_rate": 2.9792069076076502e-05, + "loss": 0.5373, + "num_input_tokens_seen": 4314624, + "step": 12805 + }, + { + "epoch": 9.89953632148377, + "grad_norm": 0.984761118888855, + "learning_rate": 2.9775520793811514e-05, + "loss": 0.3981, + "num_input_tokens_seen": 4316576, + "step": 12810 + }, + { + "epoch": 9.90340030911901, + "grad_norm": 1.1741276979446411, + "learning_rate": 2.975897033961107e-05, + "loss": 0.4059, + "num_input_tokens_seen": 4318144, + "step": 12815 + }, + { + "epoch": 9.907264296754251, + "grad_norm": 0.7531351447105408, + "learning_rate": 2.974241772100241e-05, + "loss": 0.524, + "num_input_tokens_seen": 4319712, + "step": 12820 + }, + { + "epoch": 9.91112828438949, + "grad_norm": 0.9343253374099731, + "learning_rate": 2.972586294551377e-05, + "loss": 0.3636, + "num_input_tokens_seen": 4321408, + "step": 12825 + }, + { + "epoch": 9.91499227202473, + "grad_norm": 0.8735472559928894, + "learning_rate": 2.970930602067436e-05, + "loss": 0.3849, + "num_input_tokens_seen": 4323296, + "step": 12830 + }, + { + "epoch": 9.91885625965997, + "grad_norm": 1.0266348123550415, + "learning_rate": 2.969274695401437e-05, + "loss": 0.415, + "num_input_tokens_seen": 4325120, + "step": 12835 + }, + { + "epoch": 9.922720247295208, + "grad_norm": 0.7063328623771667, + "learning_rate": 2.967618575306496e-05, + "loss": 0.4987, + "num_input_tokens_seen": 4326880, + "step": 12840 + }, + { + "epoch": 9.926584234930449, + "grad_norm": 0.8791564106941223, + "learning_rate": 2.9659622425358276e-05, + "loss": 0.3616, + "num_input_tokens_seen": 4328608, + "step": 12845 + }, + { + "epoch": 9.930448222565687, + "grad_norm": 1.2632652521133423, + "learning_rate": 2.9643056978427392e-05, + "loss": 0.747, + "num_input_tokens_seen": 4330176, + "step": 12850 + }, + { + "epoch": 9.934312210200927, + "grad_norm": 0.9486606121063232, + "learning_rate": 2.9626489419806396e-05, + "loss": 0.5465, + "num_input_tokens_seen": 4332064, + "step": 12855 + }, + { + "epoch": 9.938176197836167, + "grad_norm": 1.5945061445236206, + "learning_rate": 2.96099197570303e-05, + "loss": 0.4565, + "num_input_tokens_seen": 4333888, + "step": 12860 + }, + { + "epoch": 9.942040185471406, + "grad_norm": 1.1957365274429321, + "learning_rate": 2.9593347997635096e-05, + "loss": 0.371, + "num_input_tokens_seen": 4335680, + "step": 12865 + }, + { + "epoch": 9.945904173106646, + "grad_norm": 1.8634339570999146, + "learning_rate": 2.9576774149157715e-05, + "loss": 0.405, + "num_input_tokens_seen": 4337472, + "step": 12870 + }, + { + "epoch": 9.949768160741886, + "grad_norm": 2.4132306575775146, + "learning_rate": 2.9560198219136043e-05, + "loss": 0.7976, + "num_input_tokens_seen": 4339232, + "step": 12875 + }, + { + "epoch": 9.953632148377125, + "grad_norm": 1.3001327514648438, + "learning_rate": 2.9543620215108904e-05, + "loss": 0.5679, + "num_input_tokens_seen": 4341024, + "step": 12880 + }, + { + "epoch": 9.957496136012365, + "grad_norm": 0.6941986083984375, + "learning_rate": 2.952704014461608e-05, + "loss": 0.5284, + "num_input_tokens_seen": 4342976, + "step": 12885 + }, + { + "epoch": 9.961360123647605, + "grad_norm": 1.336464762687683, + "learning_rate": 2.9510458015198295e-05, + "loss": 0.4235, + "num_input_tokens_seen": 4344576, + "step": 12890 + }, + { + "epoch": 9.965224111282843, + "grad_norm": 0.9332967400550842, + "learning_rate": 2.949387383439719e-05, + "loss": 0.5804, + "num_input_tokens_seen": 4346272, + "step": 12895 + }, + { + "epoch": 9.969088098918084, + "grad_norm": 1.5983091592788696, + "learning_rate": 2.9477287609755343e-05, + "loss": 0.6105, + "num_input_tokens_seen": 4347872, + "step": 12900 + }, + { + "epoch": 9.972952086553324, + "grad_norm": 0.6831768751144409, + "learning_rate": 2.946069934881629e-05, + "loss": 0.4094, + "num_input_tokens_seen": 4349408, + "step": 12905 + }, + { + "epoch": 9.976816074188562, + "grad_norm": 1.5013999938964844, + "learning_rate": 2.9444109059124458e-05, + "loss": 0.6317, + "num_input_tokens_seen": 4351072, + "step": 12910 + }, + { + "epoch": 9.980680061823803, + "grad_norm": 1.1013160943984985, + "learning_rate": 2.9427516748225205e-05, + "loss": 0.4899, + "num_input_tokens_seen": 4352544, + "step": 12915 + }, + { + "epoch": 9.984544049459041, + "grad_norm": 1.036176085472107, + "learning_rate": 2.9410922423664823e-05, + "loss": 0.4576, + "num_input_tokens_seen": 4354112, + "step": 12920 + }, + { + "epoch": 9.988408037094281, + "grad_norm": 0.8796930313110352, + "learning_rate": 2.9394326092990504e-05, + "loss": 0.3961, + "num_input_tokens_seen": 4355904, + "step": 12925 + }, + { + "epoch": 9.992272024729521, + "grad_norm": 0.7679850459098816, + "learning_rate": 2.937772776375037e-05, + "loss": 0.4852, + "num_input_tokens_seen": 4357568, + "step": 12930 + }, + { + "epoch": 9.99613601236476, + "grad_norm": 0.9028803706169128, + "learning_rate": 2.936112744349342e-05, + "loss": 0.4365, + "num_input_tokens_seen": 4359264, + "step": 12935 + }, + { + "epoch": 10.0, + "grad_norm": 1.7845436334609985, + "learning_rate": 2.934452513976959e-05, + "loss": 0.3501, + "num_input_tokens_seen": 4360736, + "step": 12940 + }, + { + "epoch": 10.0, + "eval_loss": 0.45453083515167236, + "eval_runtime": 6.236, + "eval_samples_per_second": 92.207, + "eval_steps_per_second": 23.092, + "num_input_tokens_seen": 4360736, + "step": 12940 + }, + { + "epoch": 10.00386398763524, + "grad_norm": 1.1186085939407349, + "learning_rate": 2.9327920860129722e-05, + "loss": 0.5035, + "num_input_tokens_seen": 4362400, + "step": 12945 + }, + { + "epoch": 10.007727975270479, + "grad_norm": 1.0782623291015625, + "learning_rate": 2.9311314612125517e-05, + "loss": 0.5507, + "num_input_tokens_seen": 4363872, + "step": 12950 + }, + { + "epoch": 10.011591962905719, + "grad_norm": 1.0635303258895874, + "learning_rate": 2.9294706403309614e-05, + "loss": 0.4508, + "num_input_tokens_seen": 4365472, + "step": 12955 + }, + { + "epoch": 10.015455950540959, + "grad_norm": 1.3774062395095825, + "learning_rate": 2.9278096241235508e-05, + "loss": 0.3906, + "num_input_tokens_seen": 4366976, + "step": 12960 + }, + { + "epoch": 10.019319938176197, + "grad_norm": 1.0986824035644531, + "learning_rate": 2.9261484133457624e-05, + "loss": 0.4406, + "num_input_tokens_seen": 4368672, + "step": 12965 + }, + { + "epoch": 10.023183925811438, + "grad_norm": 0.8805785179138184, + "learning_rate": 2.9244870087531222e-05, + "loss": 0.4465, + "num_input_tokens_seen": 4370496, + "step": 12970 + }, + { + "epoch": 10.027047913446676, + "grad_norm": 1.1724432706832886, + "learning_rate": 2.9228254111012494e-05, + "loss": 0.4405, + "num_input_tokens_seen": 4372576, + "step": 12975 + }, + { + "epoch": 10.030911901081916, + "grad_norm": 0.5466436147689819, + "learning_rate": 2.9211636211458464e-05, + "loss": 0.4381, + "num_input_tokens_seen": 4374304, + "step": 12980 + }, + { + "epoch": 10.034775888717157, + "grad_norm": 0.5487546920776367, + "learning_rate": 2.9195016396427067e-05, + "loss": 0.3966, + "num_input_tokens_seen": 4375808, + "step": 12985 + }, + { + "epoch": 10.038639876352395, + "grad_norm": 1.0589704513549805, + "learning_rate": 2.9178394673477094e-05, + "loss": 0.3912, + "num_input_tokens_seen": 4377472, + "step": 12990 + }, + { + "epoch": 10.042503863987635, + "grad_norm": 1.1170918941497803, + "learning_rate": 2.9161771050168203e-05, + "loss": 0.4724, + "num_input_tokens_seen": 4379328, + "step": 12995 + }, + { + "epoch": 10.046367851622875, + "grad_norm": 0.8123857975006104, + "learning_rate": 2.9145145534060907e-05, + "loss": 0.3717, + "num_input_tokens_seen": 4380992, + "step": 13000 + }, + { + "epoch": 10.050231839258114, + "grad_norm": 0.8666255474090576, + "learning_rate": 2.91285181327166e-05, + "loss": 0.3598, + "num_input_tokens_seen": 4382656, + "step": 13005 + }, + { + "epoch": 10.054095826893354, + "grad_norm": 1.705977201461792, + "learning_rate": 2.9111888853697523e-05, + "loss": 0.4428, + "num_input_tokens_seen": 4384352, + "step": 13010 + }, + { + "epoch": 10.057959814528594, + "grad_norm": 0.8070244789123535, + "learning_rate": 2.909525770456677e-05, + "loss": 0.413, + "num_input_tokens_seen": 4385920, + "step": 13015 + }, + { + "epoch": 10.061823802163833, + "grad_norm": 0.8495039343833923, + "learning_rate": 2.9078624692888277e-05, + "loss": 0.4145, + "num_input_tokens_seen": 4387648, + "step": 13020 + }, + { + "epoch": 10.065687789799073, + "grad_norm": 0.5979478359222412, + "learning_rate": 2.906198982622686e-05, + "loss": 0.4308, + "num_input_tokens_seen": 4389120, + "step": 13025 + }, + { + "epoch": 10.069551777434313, + "grad_norm": 1.307705044746399, + "learning_rate": 2.9045353112148144e-05, + "loss": 0.4641, + "num_input_tokens_seen": 4390880, + "step": 13030 + }, + { + "epoch": 10.073415765069551, + "grad_norm": 1.6554192304611206, + "learning_rate": 2.9028714558218596e-05, + "loss": 0.5352, + "num_input_tokens_seen": 4392800, + "step": 13035 + }, + { + "epoch": 10.077279752704792, + "grad_norm": 0.557498037815094, + "learning_rate": 2.9012074172005542e-05, + "loss": 0.458, + "num_input_tokens_seen": 4394528, + "step": 13040 + }, + { + "epoch": 10.08114374034003, + "grad_norm": 0.6149708032608032, + "learning_rate": 2.8995431961077136e-05, + "loss": 0.4342, + "num_input_tokens_seen": 4396352, + "step": 13045 + }, + { + "epoch": 10.08500772797527, + "grad_norm": 0.7483985424041748, + "learning_rate": 2.8978787933002345e-05, + "loss": 0.4109, + "num_input_tokens_seen": 4397952, + "step": 13050 + }, + { + "epoch": 10.08887171561051, + "grad_norm": 0.6776561141014099, + "learning_rate": 2.896214209535097e-05, + "loss": 0.3622, + "num_input_tokens_seen": 4399552, + "step": 13055 + }, + { + "epoch": 10.092735703245749, + "grad_norm": 1.1511815786361694, + "learning_rate": 2.894549445569364e-05, + "loss": 0.4003, + "num_input_tokens_seen": 4401312, + "step": 13060 + }, + { + "epoch": 10.09659969088099, + "grad_norm": 1.385332465171814, + "learning_rate": 2.892884502160181e-05, + "loss": 0.5662, + "num_input_tokens_seen": 4402848, + "step": 13065 + }, + { + "epoch": 10.10046367851623, + "grad_norm": 0.7317171692848206, + "learning_rate": 2.8912193800647724e-05, + "loss": 0.4031, + "num_input_tokens_seen": 4404544, + "step": 13070 + }, + { + "epoch": 10.104327666151468, + "grad_norm": 0.9947272539138794, + "learning_rate": 2.889554080040448e-05, + "loss": 0.5002, + "num_input_tokens_seen": 4406208, + "step": 13075 + }, + { + "epoch": 10.108191653786708, + "grad_norm": 1.3453503847122192, + "learning_rate": 2.887888602844594e-05, + "loss": 0.658, + "num_input_tokens_seen": 4407680, + "step": 13080 + }, + { + "epoch": 10.112055641421948, + "grad_norm": 0.6511138677597046, + "learning_rate": 2.8862229492346814e-05, + "loss": 0.4353, + "num_input_tokens_seen": 4409440, + "step": 13085 + }, + { + "epoch": 10.115919629057187, + "grad_norm": 0.6030790209770203, + "learning_rate": 2.8845571199682574e-05, + "loss": 0.3606, + "num_input_tokens_seen": 4411040, + "step": 13090 + }, + { + "epoch": 10.119783616692427, + "grad_norm": 0.8376440405845642, + "learning_rate": 2.8828911158029535e-05, + "loss": 0.4439, + "num_input_tokens_seen": 4412736, + "step": 13095 + }, + { + "epoch": 10.123647604327665, + "grad_norm": 0.6951779127120972, + "learning_rate": 2.881224937496476e-05, + "loss": 0.4139, + "num_input_tokens_seen": 4414528, + "step": 13100 + }, + { + "epoch": 10.127511591962906, + "grad_norm": 1.4922066926956177, + "learning_rate": 2.8795585858066142e-05, + "loss": 0.7857, + "num_input_tokens_seen": 4416256, + "step": 13105 + }, + { + "epoch": 10.131375579598146, + "grad_norm": 0.9838178157806396, + "learning_rate": 2.877892061491235e-05, + "loss": 0.4053, + "num_input_tokens_seen": 4418016, + "step": 13110 + }, + { + "epoch": 10.135239567233384, + "grad_norm": 0.8162969946861267, + "learning_rate": 2.876225365308283e-05, + "loss": 0.5407, + "num_input_tokens_seen": 4419872, + "step": 13115 + }, + { + "epoch": 10.139103554868624, + "grad_norm": 1.0248290300369263, + "learning_rate": 2.8745584980157813e-05, + "loss": 0.4226, + "num_input_tokens_seen": 4421632, + "step": 13120 + }, + { + "epoch": 10.142967542503865, + "grad_norm": 1.0713471174240112, + "learning_rate": 2.8728914603718315e-05, + "loss": 0.4468, + "num_input_tokens_seen": 4423456, + "step": 13125 + }, + { + "epoch": 10.146831530139103, + "grad_norm": 1.3089042901992798, + "learning_rate": 2.8712242531346127e-05, + "loss": 0.5416, + "num_input_tokens_seen": 4425216, + "step": 13130 + }, + { + "epoch": 10.150695517774343, + "grad_norm": 1.160058617591858, + "learning_rate": 2.869556877062381e-05, + "loss": 0.4774, + "num_input_tokens_seen": 4427200, + "step": 13135 + }, + { + "epoch": 10.154559505409583, + "grad_norm": 1.5971025228500366, + "learning_rate": 2.867889332913467e-05, + "loss": 0.9225, + "num_input_tokens_seen": 4428768, + "step": 13140 + }, + { + "epoch": 10.158423493044822, + "grad_norm": 1.4801124334335327, + "learning_rate": 2.8662216214462822e-05, + "loss": 0.5265, + "num_input_tokens_seen": 4430592, + "step": 13145 + }, + { + "epoch": 10.162287480680062, + "grad_norm": 0.8829579949378967, + "learning_rate": 2.8645537434193104e-05, + "loss": 0.3977, + "num_input_tokens_seen": 4432256, + "step": 13150 + }, + { + "epoch": 10.166151468315302, + "grad_norm": 1.6147266626358032, + "learning_rate": 2.862885699591113e-05, + "loss": 0.5069, + "num_input_tokens_seen": 4433984, + "step": 13155 + }, + { + "epoch": 10.17001545595054, + "grad_norm": 0.6263956427574158, + "learning_rate": 2.861217490720326e-05, + "loss": 0.4306, + "num_input_tokens_seen": 4435392, + "step": 13160 + }, + { + "epoch": 10.173879443585781, + "grad_norm": 1.4779052734375, + "learning_rate": 2.8595491175656608e-05, + "loss": 0.4201, + "num_input_tokens_seen": 4436992, + "step": 13165 + }, + { + "epoch": 10.17774343122102, + "grad_norm": 0.6400034427642822, + "learning_rate": 2.8578805808859044e-05, + "loss": 0.492, + "num_input_tokens_seen": 4438912, + "step": 13170 + }, + { + "epoch": 10.18160741885626, + "grad_norm": 0.8500723838806152, + "learning_rate": 2.8562118814399158e-05, + "loss": 0.3225, + "num_input_tokens_seen": 4440800, + "step": 13175 + }, + { + "epoch": 10.1854714064915, + "grad_norm": 0.9238780736923218, + "learning_rate": 2.854543019986631e-05, + "loss": 0.5115, + "num_input_tokens_seen": 4442528, + "step": 13180 + }, + { + "epoch": 10.189335394126738, + "grad_norm": 1.5370628833770752, + "learning_rate": 2.852873997285057e-05, + "loss": 0.5166, + "num_input_tokens_seen": 4444256, + "step": 13185 + }, + { + "epoch": 10.193199381761978, + "grad_norm": 0.9089831113815308, + "learning_rate": 2.851204814094276e-05, + "loss": 0.3846, + "num_input_tokens_seen": 4445760, + "step": 13190 + }, + { + "epoch": 10.197063369397219, + "grad_norm": 0.971215546131134, + "learning_rate": 2.849535471173442e-05, + "loss": 0.4749, + "num_input_tokens_seen": 4447488, + "step": 13195 + }, + { + "epoch": 10.200927357032457, + "grad_norm": 0.5986497402191162, + "learning_rate": 2.8478659692817816e-05, + "loss": 0.3646, + "num_input_tokens_seen": 4449216, + "step": 13200 + }, + { + "epoch": 10.204791344667697, + "grad_norm": 0.6111299991607666, + "learning_rate": 2.8461963091785966e-05, + "loss": 0.3898, + "num_input_tokens_seen": 4450912, + "step": 13205 + }, + { + "epoch": 10.208655332302937, + "grad_norm": 0.7607316374778748, + "learning_rate": 2.8445264916232563e-05, + "loss": 0.3888, + "num_input_tokens_seen": 4452320, + "step": 13210 + }, + { + "epoch": 10.212519319938176, + "grad_norm": 0.6857308745384216, + "learning_rate": 2.8428565173752043e-05, + "loss": 0.3752, + "num_input_tokens_seen": 4454112, + "step": 13215 + }, + { + "epoch": 10.216383307573416, + "grad_norm": 1.0508909225463867, + "learning_rate": 2.841186387193954e-05, + "loss": 0.5067, + "num_input_tokens_seen": 4455552, + "step": 13220 + }, + { + "epoch": 10.220247295208654, + "grad_norm": 1.0017448663711548, + "learning_rate": 2.839516101839093e-05, + "loss": 0.4225, + "num_input_tokens_seen": 4456992, + "step": 13225 + }, + { + "epoch": 10.224111282843895, + "grad_norm": 0.7801126837730408, + "learning_rate": 2.8378456620702748e-05, + "loss": 0.3823, + "num_input_tokens_seen": 4458784, + "step": 13230 + }, + { + "epoch": 10.227975270479135, + "grad_norm": 0.49611252546310425, + "learning_rate": 2.8361750686472265e-05, + "loss": 0.3516, + "num_input_tokens_seen": 4460480, + "step": 13235 + }, + { + "epoch": 10.231839258114373, + "grad_norm": 0.8276845216751099, + "learning_rate": 2.8345043223297436e-05, + "loss": 0.4866, + "num_input_tokens_seen": 4462176, + "step": 13240 + }, + { + "epoch": 10.235703245749614, + "grad_norm": 0.8588369488716125, + "learning_rate": 2.8328334238776915e-05, + "loss": 0.3621, + "num_input_tokens_seen": 4464192, + "step": 13245 + }, + { + "epoch": 10.239567233384854, + "grad_norm": 0.9450176954269409, + "learning_rate": 2.831162374051005e-05, + "loss": 0.414, + "num_input_tokens_seen": 4465792, + "step": 13250 + }, + { + "epoch": 10.243431221020092, + "grad_norm": 0.8810413479804993, + "learning_rate": 2.829491173609688e-05, + "loss": 0.3777, + "num_input_tokens_seen": 4467552, + "step": 13255 + }, + { + "epoch": 10.247295208655332, + "grad_norm": 0.6290249228477478, + "learning_rate": 2.8278198233138115e-05, + "loss": 0.3469, + "num_input_tokens_seen": 4469216, + "step": 13260 + }, + { + "epoch": 10.251159196290573, + "grad_norm": 0.6323533058166504, + "learning_rate": 2.826148323923516e-05, + "loss": 0.4612, + "num_input_tokens_seen": 4471008, + "step": 13265 + }, + { + "epoch": 10.255023183925811, + "grad_norm": 1.3167589902877808, + "learning_rate": 2.82447667619901e-05, + "loss": 0.4698, + "num_input_tokens_seen": 4472576, + "step": 13270 + }, + { + "epoch": 10.258887171561051, + "grad_norm": 0.8348168730735779, + "learning_rate": 2.8228048809005687e-05, + "loss": 0.4365, + "num_input_tokens_seen": 4474112, + "step": 13275 + }, + { + "epoch": 10.262751159196291, + "grad_norm": 1.0631403923034668, + "learning_rate": 2.8211329387885333e-05, + "loss": 0.4536, + "num_input_tokens_seen": 4475936, + "step": 13280 + }, + { + "epoch": 10.26661514683153, + "grad_norm": 0.5905399918556213, + "learning_rate": 2.819460850623315e-05, + "loss": 0.3563, + "num_input_tokens_seen": 4478112, + "step": 13285 + }, + { + "epoch": 10.27047913446677, + "grad_norm": 1.0273041725158691, + "learning_rate": 2.8177886171653888e-05, + "loss": 0.4306, + "num_input_tokens_seen": 4479680, + "step": 13290 + }, + { + "epoch": 10.274343122102009, + "grad_norm": 0.697745680809021, + "learning_rate": 2.8161162391752955e-05, + "loss": 0.4637, + "num_input_tokens_seen": 4481472, + "step": 13295 + }, + { + "epoch": 10.278207109737249, + "grad_norm": 0.6733278632164001, + "learning_rate": 2.814443717413644e-05, + "loss": 0.4094, + "num_input_tokens_seen": 4482976, + "step": 13300 + }, + { + "epoch": 10.282071097372489, + "grad_norm": 0.71390700340271, + "learning_rate": 2.8127710526411067e-05, + "loss": 0.4172, + "num_input_tokens_seen": 4484640, + "step": 13305 + }, + { + "epoch": 10.285935085007727, + "grad_norm": 0.9141629934310913, + "learning_rate": 2.8110982456184213e-05, + "loss": 0.4301, + "num_input_tokens_seen": 4486560, + "step": 13310 + }, + { + "epoch": 10.289799072642968, + "grad_norm": 1.01697838306427, + "learning_rate": 2.8094252971063912e-05, + "loss": 0.668, + "num_input_tokens_seen": 4488288, + "step": 13315 + }, + { + "epoch": 10.293663060278208, + "grad_norm": 1.4185620546340942, + "learning_rate": 2.807752207865883e-05, + "loss": 0.3955, + "num_input_tokens_seen": 4489632, + "step": 13320 + }, + { + "epoch": 10.297527047913446, + "grad_norm": 0.7816479802131653, + "learning_rate": 2.806078978657827e-05, + "loss": 0.382, + "num_input_tokens_seen": 4491136, + "step": 13325 + }, + { + "epoch": 10.301391035548686, + "grad_norm": 0.6919955611228943, + "learning_rate": 2.804405610243218e-05, + "loss": 0.424, + "num_input_tokens_seen": 4493024, + "step": 13330 + }, + { + "epoch": 10.305255023183927, + "grad_norm": 1.0213953256607056, + "learning_rate": 2.8027321033831156e-05, + "loss": 0.5486, + "num_input_tokens_seen": 4494624, + "step": 13335 + }, + { + "epoch": 10.309119010819165, + "grad_norm": 0.9536899924278259, + "learning_rate": 2.801058458838639e-05, + "loss": 0.4184, + "num_input_tokens_seen": 4496224, + "step": 13340 + }, + { + "epoch": 10.312982998454405, + "grad_norm": 0.7635327577590942, + "learning_rate": 2.7993846773709725e-05, + "loss": 0.4116, + "num_input_tokens_seen": 4497760, + "step": 13345 + }, + { + "epoch": 10.316846986089644, + "grad_norm": 0.9556565284729004, + "learning_rate": 2.7977107597413614e-05, + "loss": 0.4082, + "num_input_tokens_seen": 4499552, + "step": 13350 + }, + { + "epoch": 10.320710973724884, + "grad_norm": 1.0821788311004639, + "learning_rate": 2.796036706711115e-05, + "loss": 0.5159, + "num_input_tokens_seen": 4500992, + "step": 13355 + }, + { + "epoch": 10.324574961360124, + "grad_norm": 0.9153280854225159, + "learning_rate": 2.7943625190416005e-05, + "loss": 0.5038, + "num_input_tokens_seen": 4502656, + "step": 13360 + }, + { + "epoch": 10.328438948995363, + "grad_norm": 0.676741361618042, + "learning_rate": 2.7926881974942488e-05, + "loss": 0.4427, + "num_input_tokens_seen": 4504416, + "step": 13365 + }, + { + "epoch": 10.332302936630603, + "grad_norm": 1.1709225177764893, + "learning_rate": 2.7910137428305534e-05, + "loss": 0.4886, + "num_input_tokens_seen": 4506016, + "step": 13370 + }, + { + "epoch": 10.336166924265843, + "grad_norm": 1.156480312347412, + "learning_rate": 2.7893391558120646e-05, + "loss": 0.3867, + "num_input_tokens_seen": 4507488, + "step": 13375 + }, + { + "epoch": 10.340030911901081, + "grad_norm": 1.4660266637802124, + "learning_rate": 2.7876644372003945e-05, + "loss": 0.5426, + "num_input_tokens_seen": 4509312, + "step": 13380 + }, + { + "epoch": 10.343894899536322, + "grad_norm": 0.9145311713218689, + "learning_rate": 2.7859895877572155e-05, + "loss": 0.4228, + "num_input_tokens_seen": 4511008, + "step": 13385 + }, + { + "epoch": 10.347758887171562, + "grad_norm": 0.9687382578849792, + "learning_rate": 2.78431460824426e-05, + "loss": 0.4835, + "num_input_tokens_seen": 4512800, + "step": 13390 + }, + { + "epoch": 10.3516228748068, + "grad_norm": 1.1383724212646484, + "learning_rate": 2.7826394994233178e-05, + "loss": 0.4165, + "num_input_tokens_seen": 4514240, + "step": 13395 + }, + { + "epoch": 10.35548686244204, + "grad_norm": 1.2185887098312378, + "learning_rate": 2.780964262056239e-05, + "loss": 0.4182, + "num_input_tokens_seen": 4515872, + "step": 13400 + }, + { + "epoch": 10.35935085007728, + "grad_norm": 1.007194995880127, + "learning_rate": 2.7792888969049304e-05, + "loss": 0.6323, + "num_input_tokens_seen": 4517696, + "step": 13405 + }, + { + "epoch": 10.363214837712519, + "grad_norm": 0.7198383808135986, + "learning_rate": 2.777613404731359e-05, + "loss": 0.5279, + "num_input_tokens_seen": 4519200, + "step": 13410 + }, + { + "epoch": 10.36707882534776, + "grad_norm": 1.2288141250610352, + "learning_rate": 2.7759377862975484e-05, + "loss": 0.4893, + "num_input_tokens_seen": 4521024, + "step": 13415 + }, + { + "epoch": 10.370942812982998, + "grad_norm": 1.1775104999542236, + "learning_rate": 2.7742620423655806e-05, + "loss": 0.4393, + "num_input_tokens_seen": 4522784, + "step": 13420 + }, + { + "epoch": 10.374806800618238, + "grad_norm": 0.8120720386505127, + "learning_rate": 2.772586173697593e-05, + "loss": 0.3414, + "num_input_tokens_seen": 4524320, + "step": 13425 + }, + { + "epoch": 10.378670788253478, + "grad_norm": 0.5481179356575012, + "learning_rate": 2.7709101810557813e-05, + "loss": 0.5836, + "num_input_tokens_seen": 4526144, + "step": 13430 + }, + { + "epoch": 10.382534775888717, + "grad_norm": 2.3338139057159424, + "learning_rate": 2.769234065202397e-05, + "loss": 0.5327, + "num_input_tokens_seen": 4527872, + "step": 13435 + }, + { + "epoch": 10.386398763523957, + "grad_norm": 1.0229066610336304, + "learning_rate": 2.7675578268997477e-05, + "loss": 0.6053, + "num_input_tokens_seen": 4529760, + "step": 13440 + }, + { + "epoch": 10.390262751159197, + "grad_norm": 1.0743952989578247, + "learning_rate": 2.7658814669101956e-05, + "loss": 0.4461, + "num_input_tokens_seen": 4531616, + "step": 13445 + }, + { + "epoch": 10.394126738794435, + "grad_norm": 0.7870469093322754, + "learning_rate": 2.7642049859961606e-05, + "loss": 0.7305, + "num_input_tokens_seen": 4533472, + "step": 13450 + }, + { + "epoch": 10.397990726429676, + "grad_norm": 1.2462728023529053, + "learning_rate": 2.7625283849201157e-05, + "loss": 0.5598, + "num_input_tokens_seen": 4535136, + "step": 13455 + }, + { + "epoch": 10.401854714064916, + "grad_norm": 0.4973558783531189, + "learning_rate": 2.760851664444589e-05, + "loss": 0.3709, + "num_input_tokens_seen": 4536768, + "step": 13460 + }, + { + "epoch": 10.405718701700154, + "grad_norm": 0.7680723667144775, + "learning_rate": 2.7591748253321632e-05, + "loss": 0.4109, + "num_input_tokens_seen": 4538400, + "step": 13465 + }, + { + "epoch": 10.409582689335394, + "grad_norm": 1.551784873008728, + "learning_rate": 2.7574978683454743e-05, + "loss": 0.457, + "num_input_tokens_seen": 4540128, + "step": 13470 + }, + { + "epoch": 10.413446676970633, + "grad_norm": 1.3327360153198242, + "learning_rate": 2.7558207942472136e-05, + "loss": 0.3952, + "num_input_tokens_seen": 4541632, + "step": 13475 + }, + { + "epoch": 10.417310664605873, + "grad_norm": 0.7097187042236328, + "learning_rate": 2.7541436038001234e-05, + "loss": 0.3986, + "num_input_tokens_seen": 4543232, + "step": 13480 + }, + { + "epoch": 10.421174652241113, + "grad_norm": 0.8665366768836975, + "learning_rate": 2.7524662977669992e-05, + "loss": 0.4413, + "num_input_tokens_seen": 4544800, + "step": 13485 + }, + { + "epoch": 10.425038639876352, + "grad_norm": 0.9876944422721863, + "learning_rate": 2.7507888769106914e-05, + "loss": 0.7038, + "num_input_tokens_seen": 4546336, + "step": 13490 + }, + { + "epoch": 10.428902627511592, + "grad_norm": 1.0070568323135376, + "learning_rate": 2.7491113419941013e-05, + "loss": 0.5526, + "num_input_tokens_seen": 4547840, + "step": 13495 + }, + { + "epoch": 10.432766615146832, + "grad_norm": 0.9897714257240295, + "learning_rate": 2.7474336937801798e-05, + "loss": 0.474, + "num_input_tokens_seen": 4549504, + "step": 13500 + }, + { + "epoch": 10.43663060278207, + "grad_norm": 1.4545201063156128, + "learning_rate": 2.7457559330319326e-05, + "loss": 0.5528, + "num_input_tokens_seen": 4551072, + "step": 13505 + }, + { + "epoch": 10.44049459041731, + "grad_norm": 2.0390894412994385, + "learning_rate": 2.744078060512416e-05, + "loss": 0.7323, + "num_input_tokens_seen": 4552800, + "step": 13510 + }, + { + "epoch": 10.444358578052551, + "grad_norm": 0.8815717101097107, + "learning_rate": 2.742400076984736e-05, + "loss": 0.4349, + "num_input_tokens_seen": 4554400, + "step": 13515 + }, + { + "epoch": 10.44822256568779, + "grad_norm": 0.6594089865684509, + "learning_rate": 2.7407219832120484e-05, + "loss": 0.4686, + "num_input_tokens_seen": 4556032, + "step": 13520 + }, + { + "epoch": 10.45208655332303, + "grad_norm": 0.8238839507102966, + "learning_rate": 2.7390437799575615e-05, + "loss": 0.4395, + "num_input_tokens_seen": 4557664, + "step": 13525 + }, + { + "epoch": 10.45595054095827, + "grad_norm": 0.9629359245300293, + "learning_rate": 2.7373654679845323e-05, + "loss": 0.4447, + "num_input_tokens_seen": 4559072, + "step": 13530 + }, + { + "epoch": 10.459814528593508, + "grad_norm": 0.9239744544029236, + "learning_rate": 2.7356870480562662e-05, + "loss": 0.3534, + "num_input_tokens_seen": 4560608, + "step": 13535 + }, + { + "epoch": 10.463678516228748, + "grad_norm": 1.1576189994812012, + "learning_rate": 2.73400852093612e-05, + "loss": 0.4249, + "num_input_tokens_seen": 4562240, + "step": 13540 + }, + { + "epoch": 10.467542503863987, + "grad_norm": 1.1995596885681152, + "learning_rate": 2.7323298873874958e-05, + "loss": 0.3864, + "num_input_tokens_seen": 4563840, + "step": 13545 + }, + { + "epoch": 10.471406491499227, + "grad_norm": 0.6728004813194275, + "learning_rate": 2.7306511481738483e-05, + "loss": 0.3971, + "num_input_tokens_seen": 4565440, + "step": 13550 + }, + { + "epoch": 10.475270479134467, + "grad_norm": 0.6959586143493652, + "learning_rate": 2.7289723040586773e-05, + "loss": 0.3828, + "num_input_tokens_seen": 4567040, + "step": 13555 + }, + { + "epoch": 10.479134466769706, + "grad_norm": 1.0571309328079224, + "learning_rate": 2.7272933558055312e-05, + "loss": 0.5181, + "num_input_tokens_seen": 4568832, + "step": 13560 + }, + { + "epoch": 10.482998454404946, + "grad_norm": 0.9863837361335754, + "learning_rate": 2.725614304178005e-05, + "loss": 0.3829, + "num_input_tokens_seen": 4570336, + "step": 13565 + }, + { + "epoch": 10.486862442040186, + "grad_norm": 1.2639412879943848, + "learning_rate": 2.723935149939743e-05, + "loss": 0.5533, + "num_input_tokens_seen": 4572128, + "step": 13570 + }, + { + "epoch": 10.490726429675425, + "grad_norm": 1.3061115741729736, + "learning_rate": 2.7222558938544328e-05, + "loss": 0.6414, + "num_input_tokens_seen": 4573728, + "step": 13575 + }, + { + "epoch": 10.494590417310665, + "grad_norm": 2.2105369567871094, + "learning_rate": 2.7205765366858122e-05, + "loss": 0.5607, + "num_input_tokens_seen": 4575488, + "step": 13580 + }, + { + "epoch": 10.498454404945905, + "grad_norm": 0.603509783744812, + "learning_rate": 2.7188970791976603e-05, + "loss": 0.3299, + "num_input_tokens_seen": 4577376, + "step": 13585 + }, + { + "epoch": 10.502318392581143, + "grad_norm": 0.8894162178039551, + "learning_rate": 2.7172175221538064e-05, + "loss": 0.5564, + "num_input_tokens_seen": 4579104, + "step": 13590 + }, + { + "epoch": 10.506182380216384, + "grad_norm": 1.3522170782089233, + "learning_rate": 2.715537866318123e-05, + "loss": 0.5489, + "num_input_tokens_seen": 4580864, + "step": 13595 + }, + { + "epoch": 10.510046367851622, + "grad_norm": 0.7887694239616394, + "learning_rate": 2.7138581124545274e-05, + "loss": 0.4058, + "num_input_tokens_seen": 4582624, + "step": 13600 + }, + { + "epoch": 10.513910355486862, + "grad_norm": 1.1258572340011597, + "learning_rate": 2.7121782613269807e-05, + "loss": 0.4493, + "num_input_tokens_seen": 4584352, + "step": 13605 + }, + { + "epoch": 10.517774343122102, + "grad_norm": 2.4209418296813965, + "learning_rate": 2.7104983136994903e-05, + "loss": 0.5731, + "num_input_tokens_seen": 4586144, + "step": 13610 + }, + { + "epoch": 10.521638330757341, + "grad_norm": 0.9690748453140259, + "learning_rate": 2.7088182703361065e-05, + "loss": 0.6004, + "num_input_tokens_seen": 4588032, + "step": 13615 + }, + { + "epoch": 10.525502318392581, + "grad_norm": 1.0836536884307861, + "learning_rate": 2.707138132000923e-05, + "loss": 0.6104, + "num_input_tokens_seen": 4589888, + "step": 13620 + }, + { + "epoch": 10.529366306027821, + "grad_norm": 0.9322319030761719, + "learning_rate": 2.7054578994580754e-05, + "loss": 0.5146, + "num_input_tokens_seen": 4591744, + "step": 13625 + }, + { + "epoch": 10.53323029366306, + "grad_norm": 1.5119764804840088, + "learning_rate": 2.7037775734717458e-05, + "loss": 0.5717, + "num_input_tokens_seen": 4593216, + "step": 13630 + }, + { + "epoch": 10.5370942812983, + "grad_norm": 1.193421721458435, + "learning_rate": 2.7020971548061554e-05, + "loss": 0.5102, + "num_input_tokens_seen": 4595040, + "step": 13635 + }, + { + "epoch": 10.54095826893354, + "grad_norm": 0.6732072830200195, + "learning_rate": 2.700416644225568e-05, + "loss": 0.3704, + "num_input_tokens_seen": 4596608, + "step": 13640 + }, + { + "epoch": 10.544822256568779, + "grad_norm": 0.9554773569107056, + "learning_rate": 2.6987360424942903e-05, + "loss": 0.4523, + "num_input_tokens_seen": 4598240, + "step": 13645 + }, + { + "epoch": 10.548686244204019, + "grad_norm": 1.1127783060073853, + "learning_rate": 2.6970553503766717e-05, + "loss": 0.6384, + "num_input_tokens_seen": 4599936, + "step": 13650 + }, + { + "epoch": 10.552550231839259, + "grad_norm": 1.0903276205062866, + "learning_rate": 2.695374568637099e-05, + "loss": 0.4653, + "num_input_tokens_seen": 4602016, + "step": 13655 + }, + { + "epoch": 10.556414219474497, + "grad_norm": 1.8492169380187988, + "learning_rate": 2.6936936980400018e-05, + "loss": 0.6, + "num_input_tokens_seen": 4604032, + "step": 13660 + }, + { + "epoch": 10.560278207109738, + "grad_norm": 1.0499318838119507, + "learning_rate": 2.692012739349851e-05, + "loss": 0.3668, + "num_input_tokens_seen": 4605536, + "step": 13665 + }, + { + "epoch": 10.564142194744976, + "grad_norm": 1.0446349382400513, + "learning_rate": 2.6903316933311568e-05, + "loss": 0.4188, + "num_input_tokens_seen": 4607136, + "step": 13670 + }, + { + "epoch": 10.568006182380216, + "grad_norm": 1.098772644996643, + "learning_rate": 2.688650560748468e-05, + "loss": 0.3376, + "num_input_tokens_seen": 4608768, + "step": 13675 + }, + { + "epoch": 10.571870170015456, + "grad_norm": 1.2966485023498535, + "learning_rate": 2.6869693423663754e-05, + "loss": 0.5556, + "num_input_tokens_seen": 4610432, + "step": 13680 + }, + { + "epoch": 10.575734157650695, + "grad_norm": 1.0167115926742554, + "learning_rate": 2.6852880389495057e-05, + "loss": 0.515, + "num_input_tokens_seen": 4612288, + "step": 13685 + }, + { + "epoch": 10.579598145285935, + "grad_norm": 1.2517338991165161, + "learning_rate": 2.6836066512625264e-05, + "loss": 0.6064, + "num_input_tokens_seen": 4613792, + "step": 13690 + }, + { + "epoch": 10.583462132921175, + "grad_norm": 0.833247721195221, + "learning_rate": 2.6819251800701416e-05, + "loss": 0.442, + "num_input_tokens_seen": 4615520, + "step": 13695 + }, + { + "epoch": 10.587326120556414, + "grad_norm": 0.7955629229545593, + "learning_rate": 2.6802436261370967e-05, + "loss": 0.6207, + "num_input_tokens_seen": 4617088, + "step": 13700 + }, + { + "epoch": 10.591190108191654, + "grad_norm": 1.1459589004516602, + "learning_rate": 2.67856199022817e-05, + "loss": 0.5232, + "num_input_tokens_seen": 4618656, + "step": 13705 + }, + { + "epoch": 10.595054095826894, + "grad_norm": 0.8802782893180847, + "learning_rate": 2.676880273108181e-05, + "loss": 0.3795, + "num_input_tokens_seen": 4620512, + "step": 13710 + }, + { + "epoch": 10.598918083462133, + "grad_norm": 0.7383247017860413, + "learning_rate": 2.675198475541985e-05, + "loss": 0.3569, + "num_input_tokens_seen": 4622336, + "step": 13715 + }, + { + "epoch": 10.602782071097373, + "grad_norm": 0.9065098762512207, + "learning_rate": 2.673516598294474e-05, + "loss": 0.3979, + "num_input_tokens_seen": 4624128, + "step": 13720 + }, + { + "epoch": 10.606646058732611, + "grad_norm": 1.4276349544525146, + "learning_rate": 2.6718346421305735e-05, + "loss": 0.4543, + "num_input_tokens_seen": 4625568, + "step": 13725 + }, + { + "epoch": 10.610510046367851, + "grad_norm": 0.7984321713447571, + "learning_rate": 2.6701526078152484e-05, + "loss": 0.3521, + "num_input_tokens_seen": 4627168, + "step": 13730 + }, + { + "epoch": 10.614374034003092, + "grad_norm": 1.12966787815094, + "learning_rate": 2.6684704961134994e-05, + "loss": 0.4971, + "num_input_tokens_seen": 4628800, + "step": 13735 + }, + { + "epoch": 10.61823802163833, + "grad_norm": 0.7801388502120972, + "learning_rate": 2.6667883077903595e-05, + "loss": 0.3619, + "num_input_tokens_seen": 4630304, + "step": 13740 + }, + { + "epoch": 10.62210200927357, + "grad_norm": 0.8294705152511597, + "learning_rate": 2.6651060436108977e-05, + "loss": 0.3791, + "num_input_tokens_seen": 4632032, + "step": 13745 + }, + { + "epoch": 10.62596599690881, + "grad_norm": 1.7021310329437256, + "learning_rate": 2.6634237043402193e-05, + "loss": 0.666, + "num_input_tokens_seen": 4633696, + "step": 13750 + }, + { + "epoch": 10.629829984544049, + "grad_norm": 0.6384702324867249, + "learning_rate": 2.6617412907434612e-05, + "loss": 0.4224, + "num_input_tokens_seen": 4635584, + "step": 13755 + }, + { + "epoch": 10.63369397217929, + "grad_norm": 1.1129627227783203, + "learning_rate": 2.6600588035857955e-05, + "loss": 0.4032, + "num_input_tokens_seen": 4637120, + "step": 13760 + }, + { + "epoch": 10.63755795981453, + "grad_norm": 0.8692330121994019, + "learning_rate": 2.6583762436324266e-05, + "loss": 0.5296, + "num_input_tokens_seen": 4638624, + "step": 13765 + }, + { + "epoch": 10.641421947449768, + "grad_norm": 0.9034613966941833, + "learning_rate": 2.6566936116485946e-05, + "loss": 0.7488, + "num_input_tokens_seen": 4640544, + "step": 13770 + }, + { + "epoch": 10.645285935085008, + "grad_norm": 1.5325511693954468, + "learning_rate": 2.65501090839957e-05, + "loss": 0.431, + "num_input_tokens_seen": 4642336, + "step": 13775 + }, + { + "epoch": 10.649149922720248, + "grad_norm": 1.0011711120605469, + "learning_rate": 2.653328134650655e-05, + "loss": 0.4361, + "num_input_tokens_seen": 4643808, + "step": 13780 + }, + { + "epoch": 10.653013910355487, + "grad_norm": 0.669127881526947, + "learning_rate": 2.651645291167186e-05, + "loss": 0.447, + "num_input_tokens_seen": 4645760, + "step": 13785 + }, + { + "epoch": 10.656877897990727, + "grad_norm": 1.1029834747314453, + "learning_rate": 2.649962378714531e-05, + "loss": 0.4229, + "num_input_tokens_seen": 4647520, + "step": 13790 + }, + { + "epoch": 10.660741885625965, + "grad_norm": 1.5698806047439575, + "learning_rate": 2.648279398058088e-05, + "loss": 0.5982, + "num_input_tokens_seen": 4649376, + "step": 13795 + }, + { + "epoch": 10.664605873261205, + "grad_norm": 0.7912302613258362, + "learning_rate": 2.6465963499632866e-05, + "loss": 0.4389, + "num_input_tokens_seen": 4650976, + "step": 13800 + }, + { + "epoch": 10.668469860896446, + "grad_norm": 1.022456407546997, + "learning_rate": 2.644913235195587e-05, + "loss": 0.3884, + "num_input_tokens_seen": 4652704, + "step": 13805 + }, + { + "epoch": 10.672333848531684, + "grad_norm": 1.1832802295684814, + "learning_rate": 2.643230054520481e-05, + "loss": 0.4106, + "num_input_tokens_seen": 4654112, + "step": 13810 + }, + { + "epoch": 10.676197836166924, + "grad_norm": 1.253833532333374, + "learning_rate": 2.6415468087034872e-05, + "loss": 0.4065, + "num_input_tokens_seen": 4656000, + "step": 13815 + }, + { + "epoch": 10.680061823802165, + "grad_norm": 0.8783530592918396, + "learning_rate": 2.6398634985101582e-05, + "loss": 0.4224, + "num_input_tokens_seen": 4657504, + "step": 13820 + }, + { + "epoch": 10.683925811437403, + "grad_norm": 1.6087638139724731, + "learning_rate": 2.638180124706072e-05, + "loss": 0.4484, + "num_input_tokens_seen": 4659040, + "step": 13825 + }, + { + "epoch": 10.687789799072643, + "grad_norm": 0.7079886794090271, + "learning_rate": 2.6364966880568377e-05, + "loss": 0.5411, + "num_input_tokens_seen": 4660672, + "step": 13830 + }, + { + "epoch": 10.691653786707883, + "grad_norm": 1.2148042917251587, + "learning_rate": 2.6348131893280927e-05, + "loss": 0.3709, + "num_input_tokens_seen": 4662464, + "step": 13835 + }, + { + "epoch": 10.695517774343122, + "grad_norm": 0.7578526735305786, + "learning_rate": 2.6331296292855013e-05, + "loss": 0.4056, + "num_input_tokens_seen": 4664224, + "step": 13840 + }, + { + "epoch": 10.699381761978362, + "grad_norm": 1.459817886352539, + "learning_rate": 2.6314460086947567e-05, + "loss": 0.4546, + "num_input_tokens_seen": 4665824, + "step": 13845 + }, + { + "epoch": 10.7032457496136, + "grad_norm": 1.3162646293640137, + "learning_rate": 2.6297623283215806e-05, + "loss": 0.3486, + "num_input_tokens_seen": 4667328, + "step": 13850 + }, + { + "epoch": 10.70710973724884, + "grad_norm": 0.9608094692230225, + "learning_rate": 2.628078588931721e-05, + "loss": 0.4437, + "num_input_tokens_seen": 4668960, + "step": 13855 + }, + { + "epoch": 10.71097372488408, + "grad_norm": 0.44345682859420776, + "learning_rate": 2.626394791290952e-05, + "loss": 0.5525, + "num_input_tokens_seen": 4670432, + "step": 13860 + }, + { + "epoch": 10.71483771251932, + "grad_norm": 0.961518406867981, + "learning_rate": 2.6247109361650742e-05, + "loss": 0.5842, + "num_input_tokens_seen": 4671968, + "step": 13865 + }, + { + "epoch": 10.71870170015456, + "grad_norm": 0.7571280002593994, + "learning_rate": 2.623027024319916e-05, + "loss": 0.4411, + "num_input_tokens_seen": 4673728, + "step": 13870 + }, + { + "epoch": 10.7225656877898, + "grad_norm": 0.8748910427093506, + "learning_rate": 2.621343056521331e-05, + "loss": 0.3583, + "num_input_tokens_seen": 4675392, + "step": 13875 + }, + { + "epoch": 10.726429675425038, + "grad_norm": 0.8936399817466736, + "learning_rate": 2.619659033535196e-05, + "loss": 0.5564, + "num_input_tokens_seen": 4677248, + "step": 13880 + }, + { + "epoch": 10.730293663060278, + "grad_norm": 1.5683189630508423, + "learning_rate": 2.617974956127417e-05, + "loss": 0.5853, + "num_input_tokens_seen": 4678848, + "step": 13885 + }, + { + "epoch": 10.734157650695519, + "grad_norm": 0.9442083239555359, + "learning_rate": 2.6162908250639212e-05, + "loss": 0.4662, + "num_input_tokens_seen": 4680608, + "step": 13890 + }, + { + "epoch": 10.738021638330757, + "grad_norm": 1.2800004482269287, + "learning_rate": 2.6146066411106618e-05, + "loss": 0.456, + "num_input_tokens_seen": 4682208, + "step": 13895 + }, + { + "epoch": 10.741885625965997, + "grad_norm": 0.8739837408065796, + "learning_rate": 2.6129224050336155e-05, + "loss": 0.4883, + "num_input_tokens_seen": 4683648, + "step": 13900 + }, + { + "epoch": 10.745749613601237, + "grad_norm": 1.4460583925247192, + "learning_rate": 2.6112381175987828e-05, + "loss": 0.5529, + "num_input_tokens_seen": 4685216, + "step": 13905 + }, + { + "epoch": 10.749613601236476, + "grad_norm": 0.9453259706497192, + "learning_rate": 2.6095537795721886e-05, + "loss": 0.4585, + "num_input_tokens_seen": 4686944, + "step": 13910 + }, + { + "epoch": 10.753477588871716, + "grad_norm": 0.9373969435691833, + "learning_rate": 2.6078693917198798e-05, + "loss": 0.6088, + "num_input_tokens_seen": 4688512, + "step": 13915 + }, + { + "epoch": 10.757341576506954, + "grad_norm": 0.7966509461402893, + "learning_rate": 2.6061849548079247e-05, + "loss": 0.5752, + "num_input_tokens_seen": 4690368, + "step": 13920 + }, + { + "epoch": 10.761205564142195, + "grad_norm": 1.1123753786087036, + "learning_rate": 2.604500469602416e-05, + "loss": 0.4296, + "num_input_tokens_seen": 4691904, + "step": 13925 + }, + { + "epoch": 10.765069551777435, + "grad_norm": 1.15681791305542, + "learning_rate": 2.602815936869469e-05, + "loss": 0.3978, + "num_input_tokens_seen": 4693312, + "step": 13930 + }, + { + "epoch": 10.768933539412673, + "grad_norm": 0.5617603063583374, + "learning_rate": 2.601131357375217e-05, + "loss": 0.4277, + "num_input_tokens_seen": 4695008, + "step": 13935 + }, + { + "epoch": 10.772797527047913, + "grad_norm": 0.7654773592948914, + "learning_rate": 2.599446731885819e-05, + "loss": 0.3867, + "num_input_tokens_seen": 4696544, + "step": 13940 + }, + { + "epoch": 10.776661514683154, + "grad_norm": 0.49295878410339355, + "learning_rate": 2.5977620611674514e-05, + "loss": 0.425, + "num_input_tokens_seen": 4698272, + "step": 13945 + }, + { + "epoch": 10.780525502318392, + "grad_norm": 1.4496320486068726, + "learning_rate": 2.5960773459863132e-05, + "loss": 0.5514, + "num_input_tokens_seen": 4700000, + "step": 13950 + }, + { + "epoch": 10.784389489953632, + "grad_norm": 1.0240797996520996, + "learning_rate": 2.5943925871086216e-05, + "loss": 0.6323, + "num_input_tokens_seen": 4702048, + "step": 13955 + }, + { + "epoch": 10.788253477588873, + "grad_norm": 0.6336647272109985, + "learning_rate": 2.5927077853006178e-05, + "loss": 0.3814, + "num_input_tokens_seen": 4703712, + "step": 13960 + }, + { + "epoch": 10.792117465224111, + "grad_norm": 1.0626847743988037, + "learning_rate": 2.5910229413285563e-05, + "loss": 0.3924, + "num_input_tokens_seen": 4705408, + "step": 13965 + }, + { + "epoch": 10.795981452859351, + "grad_norm": 1.5913808345794678, + "learning_rate": 2.5893380559587167e-05, + "loss": 0.6181, + "num_input_tokens_seen": 4706912, + "step": 13970 + }, + { + "epoch": 10.79984544049459, + "grad_norm": 1.5207215547561646, + "learning_rate": 2.5876531299573947e-05, + "loss": 0.392, + "num_input_tokens_seen": 4708288, + "step": 13975 + }, + { + "epoch": 10.80370942812983, + "grad_norm": 1.1613585948944092, + "learning_rate": 2.585968164090904e-05, + "loss": 0.5173, + "num_input_tokens_seen": 4710176, + "step": 13980 + }, + { + "epoch": 10.80757341576507, + "grad_norm": 0.7771515250205994, + "learning_rate": 2.5842831591255768e-05, + "loss": 0.3895, + "num_input_tokens_seen": 4711584, + "step": 13985 + }, + { + "epoch": 10.811437403400308, + "grad_norm": 1.4017021656036377, + "learning_rate": 2.5825981158277645e-05, + "loss": 0.5209, + "num_input_tokens_seen": 4713056, + "step": 13990 + }, + { + "epoch": 10.815301391035549, + "grad_norm": 1.188698410987854, + "learning_rate": 2.580913034963835e-05, + "loss": 0.4354, + "num_input_tokens_seen": 4714624, + "step": 13995 + }, + { + "epoch": 10.819165378670789, + "grad_norm": 0.8445923328399658, + "learning_rate": 2.5792279173001722e-05, + "loss": 0.3827, + "num_input_tokens_seen": 4716192, + "step": 14000 + }, + { + "epoch": 10.823029366306027, + "grad_norm": 1.1130300760269165, + "learning_rate": 2.5775427636031773e-05, + "loss": 0.3753, + "num_input_tokens_seen": 4718336, + "step": 14005 + }, + { + "epoch": 10.826893353941268, + "grad_norm": 0.7283535003662109, + "learning_rate": 2.57585757463927e-05, + "loss": 0.443, + "num_input_tokens_seen": 4720096, + "step": 14010 + }, + { + "epoch": 10.830757341576508, + "grad_norm": 0.7768594622612, + "learning_rate": 2.5741723511748837e-05, + "loss": 0.4027, + "num_input_tokens_seen": 4721632, + "step": 14015 + }, + { + "epoch": 10.834621329211746, + "grad_norm": 1.0702109336853027, + "learning_rate": 2.5724870939764674e-05, + "loss": 0.4051, + "num_input_tokens_seen": 4723232, + "step": 14020 + }, + { + "epoch": 10.838485316846986, + "grad_norm": 0.7445734143257141, + "learning_rate": 2.5708018038104862e-05, + "loss": 0.4431, + "num_input_tokens_seen": 4725056, + "step": 14025 + }, + { + "epoch": 10.842349304482227, + "grad_norm": 1.1739108562469482, + "learning_rate": 2.5691164814434214e-05, + "loss": 0.5134, + "num_input_tokens_seen": 4726848, + "step": 14030 + }, + { + "epoch": 10.846213292117465, + "grad_norm": 0.8745645880699158, + "learning_rate": 2.5674311276417672e-05, + "loss": 0.4531, + "num_input_tokens_seen": 4728640, + "step": 14035 + }, + { + "epoch": 10.850077279752705, + "grad_norm": 1.2058465480804443, + "learning_rate": 2.5657457431720315e-05, + "loss": 0.658, + "num_input_tokens_seen": 4730336, + "step": 14040 + }, + { + "epoch": 10.853941267387944, + "grad_norm": 1.0835639238357544, + "learning_rate": 2.5640603288007385e-05, + "loss": 0.4092, + "num_input_tokens_seen": 4731872, + "step": 14045 + }, + { + "epoch": 10.857805255023184, + "grad_norm": 0.9193301200866699, + "learning_rate": 2.5623748852944246e-05, + "loss": 0.5981, + "num_input_tokens_seen": 4733568, + "step": 14050 + }, + { + "epoch": 10.861669242658424, + "grad_norm": 0.8366310596466064, + "learning_rate": 2.5606894134196386e-05, + "loss": 0.3678, + "num_input_tokens_seen": 4735168, + "step": 14055 + }, + { + "epoch": 10.865533230293662, + "grad_norm": 0.7434107065200806, + "learning_rate": 2.5590039139429444e-05, + "loss": 0.4517, + "num_input_tokens_seen": 4737152, + "step": 14060 + }, + { + "epoch": 10.869397217928903, + "grad_norm": 1.0774776935577393, + "learning_rate": 2.5573183876309165e-05, + "loss": 0.4823, + "num_input_tokens_seen": 4738688, + "step": 14065 + }, + { + "epoch": 10.873261205564143, + "grad_norm": 0.8167977929115295, + "learning_rate": 2.555632835250143e-05, + "loss": 0.3683, + "num_input_tokens_seen": 4740544, + "step": 14070 + }, + { + "epoch": 10.877125193199381, + "grad_norm": 2.2043676376342773, + "learning_rate": 2.5539472575672226e-05, + "loss": 0.4043, + "num_input_tokens_seen": 4742080, + "step": 14075 + }, + { + "epoch": 10.880989180834622, + "grad_norm": 1.0803759098052979, + "learning_rate": 2.5522616553487664e-05, + "loss": 0.4955, + "num_input_tokens_seen": 4743648, + "step": 14080 + }, + { + "epoch": 10.884853168469862, + "grad_norm": 0.7642216682434082, + "learning_rate": 2.5505760293613962e-05, + "loss": 0.4464, + "num_input_tokens_seen": 4745216, + "step": 14085 + }, + { + "epoch": 10.8887171561051, + "grad_norm": 0.8230618238449097, + "learning_rate": 2.548890380371745e-05, + "loss": 0.5763, + "num_input_tokens_seen": 4746944, + "step": 14090 + }, + { + "epoch": 10.89258114374034, + "grad_norm": 1.1804075241088867, + "learning_rate": 2.5472047091464564e-05, + "loss": 0.419, + "num_input_tokens_seen": 4748640, + "step": 14095 + }, + { + "epoch": 10.896445131375579, + "grad_norm": 1.313843846321106, + "learning_rate": 2.5455190164521838e-05, + "loss": 0.5217, + "num_input_tokens_seen": 4750208, + "step": 14100 + }, + { + "epoch": 10.900309119010819, + "grad_norm": 1.3376606702804565, + "learning_rate": 2.5438333030555887e-05, + "loss": 0.5065, + "num_input_tokens_seen": 4751840, + "step": 14105 + }, + { + "epoch": 10.90417310664606, + "grad_norm": 1.0318970680236816, + "learning_rate": 2.5421475697233455e-05, + "loss": 0.3561, + "num_input_tokens_seen": 4753344, + "step": 14110 + }, + { + "epoch": 10.908037094281298, + "grad_norm": 1.0548306703567505, + "learning_rate": 2.540461817222135e-05, + "loss": 0.3917, + "num_input_tokens_seen": 4754720, + "step": 14115 + }, + { + "epoch": 10.911901081916538, + "grad_norm": 0.8197954893112183, + "learning_rate": 2.5387760463186484e-05, + "loss": 0.5872, + "num_input_tokens_seen": 4756224, + "step": 14120 + }, + { + "epoch": 10.915765069551778, + "grad_norm": 0.6927675008773804, + "learning_rate": 2.5370902577795817e-05, + "loss": 0.3562, + "num_input_tokens_seen": 4757856, + "step": 14125 + }, + { + "epoch": 10.919629057187016, + "grad_norm": 1.1543140411376953, + "learning_rate": 2.5354044523716458e-05, + "loss": 0.3513, + "num_input_tokens_seen": 4759456, + "step": 14130 + }, + { + "epoch": 10.923493044822257, + "grad_norm": 0.850391149520874, + "learning_rate": 2.5337186308615523e-05, + "loss": 0.4408, + "num_input_tokens_seen": 4761184, + "step": 14135 + }, + { + "epoch": 10.927357032457497, + "grad_norm": 0.9273867011070251, + "learning_rate": 2.532032794016023e-05, + "loss": 0.6023, + "num_input_tokens_seen": 4762912, + "step": 14140 + }, + { + "epoch": 10.931221020092735, + "grad_norm": 1.003811240196228, + "learning_rate": 2.5303469426017878e-05, + "loss": 0.3689, + "num_input_tokens_seen": 4764352, + "step": 14145 + }, + { + "epoch": 10.935085007727976, + "grad_norm": 0.682937502861023, + "learning_rate": 2.5286610773855813e-05, + "loss": 0.4833, + "num_input_tokens_seen": 4765856, + "step": 14150 + }, + { + "epoch": 10.938948995363216, + "grad_norm": 0.8388909697532654, + "learning_rate": 2.5269751991341455e-05, + "loss": 0.4635, + "num_input_tokens_seen": 4767712, + "step": 14155 + }, + { + "epoch": 10.942812982998454, + "grad_norm": 0.7751749157905579, + "learning_rate": 2.5252893086142266e-05, + "loss": 0.4133, + "num_input_tokens_seen": 4769312, + "step": 14160 + }, + { + "epoch": 10.946676970633694, + "grad_norm": 0.5969831347465515, + "learning_rate": 2.523603406592579e-05, + "loss": 0.353, + "num_input_tokens_seen": 4771072, + "step": 14165 + }, + { + "epoch": 10.950540958268933, + "grad_norm": 1.492628812789917, + "learning_rate": 2.5219174938359612e-05, + "loss": 0.6666, + "num_input_tokens_seen": 4772800, + "step": 14170 + }, + { + "epoch": 10.954404945904173, + "grad_norm": 0.8178871273994446, + "learning_rate": 2.5202315711111358e-05, + "loss": 0.4361, + "num_input_tokens_seen": 4774912, + "step": 14175 + }, + { + "epoch": 10.958268933539413, + "grad_norm": 0.9379414916038513, + "learning_rate": 2.5185456391848705e-05, + "loss": 0.4915, + "num_input_tokens_seen": 4776544, + "step": 14180 + }, + { + "epoch": 10.962132921174652, + "grad_norm": 0.8828489184379578, + "learning_rate": 2.5168596988239374e-05, + "loss": 0.4841, + "num_input_tokens_seen": 4778240, + "step": 14185 + }, + { + "epoch": 10.965996908809892, + "grad_norm": 0.8041975498199463, + "learning_rate": 2.5151737507951123e-05, + "loss": 0.484, + "num_input_tokens_seen": 4780064, + "step": 14190 + }, + { + "epoch": 10.969860896445132, + "grad_norm": 0.7901705503463745, + "learning_rate": 2.5134877958651747e-05, + "loss": 0.3816, + "num_input_tokens_seen": 4781600, + "step": 14195 + }, + { + "epoch": 10.97372488408037, + "grad_norm": 0.9097367525100708, + "learning_rate": 2.511801834800907e-05, + "loss": 0.3755, + "num_input_tokens_seen": 4783136, + "step": 14200 + }, + { + "epoch": 10.97758887171561, + "grad_norm": 0.7306483387947083, + "learning_rate": 2.5101158683690935e-05, + "loss": 0.3948, + "num_input_tokens_seen": 4784576, + "step": 14205 + }, + { + "epoch": 10.98145285935085, + "grad_norm": 2.4374136924743652, + "learning_rate": 2.5084298973365222e-05, + "loss": 0.4446, + "num_input_tokens_seen": 4786080, + "step": 14210 + }, + { + "epoch": 10.98531684698609, + "grad_norm": 1.0376441478729248, + "learning_rate": 2.506743922469984e-05, + "loss": 0.5071, + "num_input_tokens_seen": 4787520, + "step": 14215 + }, + { + "epoch": 10.98918083462133, + "grad_norm": 0.9695479273796082, + "learning_rate": 2.5050579445362693e-05, + "loss": 0.4158, + "num_input_tokens_seen": 4789024, + "step": 14220 + }, + { + "epoch": 10.993044822256568, + "grad_norm": 0.9933780431747437, + "learning_rate": 2.5033719643021707e-05, + "loss": 0.4236, + "num_input_tokens_seen": 4790944, + "step": 14225 + }, + { + "epoch": 10.996908809891808, + "grad_norm": 1.2388668060302734, + "learning_rate": 2.501685982534483e-05, + "loss": 0.603, + "num_input_tokens_seen": 4792288, + "step": 14230 + }, + { + "epoch": 11.0, + "eval_loss": 0.45220205187797546, + "eval_runtime": 6.2386, + "eval_samples_per_second": 92.168, + "eval_steps_per_second": 23.082, + "num_input_tokens_seen": 4793520, + "step": 14234 + }, + { + "epoch": 11.000772797527048, + "grad_norm": 1.067646861076355, + "learning_rate": 2.5e-05, + "loss": 0.5551, + "num_input_tokens_seen": 4793776, + "step": 14235 + }, + { + "epoch": 11.004636785162287, + "grad_norm": 1.0982306003570557, + "learning_rate": 2.498314017465518e-05, + "loss": 0.4226, + "num_input_tokens_seen": 4795248, + "step": 14240 + }, + { + "epoch": 11.008500772797527, + "grad_norm": 1.1091535091400146, + "learning_rate": 2.4966280356978296e-05, + "loss": 0.4328, + "num_input_tokens_seen": 4796944, + "step": 14245 + }, + { + "epoch": 11.012364760432767, + "grad_norm": 1.182661533355713, + "learning_rate": 2.4949420554637316e-05, + "loss": 0.433, + "num_input_tokens_seen": 4798512, + "step": 14250 + }, + { + "epoch": 11.016228748068006, + "grad_norm": 0.8334532380104065, + "learning_rate": 2.493256077530017e-05, + "loss": 0.4137, + "num_input_tokens_seen": 4800432, + "step": 14255 + }, + { + "epoch": 11.020092735703246, + "grad_norm": 1.0452592372894287, + "learning_rate": 2.4915701026634777e-05, + "loss": 0.3951, + "num_input_tokens_seen": 4802160, + "step": 14260 + }, + { + "epoch": 11.023956723338486, + "grad_norm": 1.040830373764038, + "learning_rate": 2.4898841316309067e-05, + "loss": 0.4173, + "num_input_tokens_seen": 4803856, + "step": 14265 + }, + { + "epoch": 11.027820710973725, + "grad_norm": 1.2464960813522339, + "learning_rate": 2.4881981651990937e-05, + "loss": 0.6715, + "num_input_tokens_seen": 4805424, + "step": 14270 + }, + { + "epoch": 11.031684698608965, + "grad_norm": 0.7762731313705444, + "learning_rate": 2.486512204134826e-05, + "loss": 0.4923, + "num_input_tokens_seen": 4807472, + "step": 14275 + }, + { + "epoch": 11.035548686244203, + "grad_norm": 0.7905091047286987, + "learning_rate": 2.484826249204888e-05, + "loss": 0.4312, + "num_input_tokens_seen": 4809424, + "step": 14280 + }, + { + "epoch": 11.039412673879443, + "grad_norm": 1.1494383811950684, + "learning_rate": 2.4831403011760635e-05, + "loss": 0.4528, + "num_input_tokens_seen": 4810864, + "step": 14285 + }, + { + "epoch": 11.043276661514684, + "grad_norm": 0.98436039686203, + "learning_rate": 2.4814543608151305e-05, + "loss": 0.466, + "num_input_tokens_seen": 4812368, + "step": 14290 + }, + { + "epoch": 11.047140649149922, + "grad_norm": 0.9470975995063782, + "learning_rate": 2.479768428888865e-05, + "loss": 0.4108, + "num_input_tokens_seen": 4814224, + "step": 14295 + }, + { + "epoch": 11.051004636785162, + "grad_norm": 1.0675991773605347, + "learning_rate": 2.4780825061640387e-05, + "loss": 0.4151, + "num_input_tokens_seen": 4816016, + "step": 14300 + }, + { + "epoch": 11.054868624420402, + "grad_norm": 0.8857765793800354, + "learning_rate": 2.476396593407421e-05, + "loss": 0.5085, + "num_input_tokens_seen": 4817616, + "step": 14305 + }, + { + "epoch": 11.05873261205564, + "grad_norm": 0.8531879186630249, + "learning_rate": 2.4747106913857737e-05, + "loss": 0.6078, + "num_input_tokens_seen": 4819408, + "step": 14310 + }, + { + "epoch": 11.062596599690881, + "grad_norm": 0.8894729018211365, + "learning_rate": 2.473024800865855e-05, + "loss": 0.3836, + "num_input_tokens_seen": 4821200, + "step": 14315 + }, + { + "epoch": 11.066460587326121, + "grad_norm": 1.346943974494934, + "learning_rate": 2.4713389226144193e-05, + "loss": 0.4355, + "num_input_tokens_seen": 4822736, + "step": 14320 + }, + { + "epoch": 11.07032457496136, + "grad_norm": 0.7447229027748108, + "learning_rate": 2.469653057398213e-05, + "loss": 0.4258, + "num_input_tokens_seen": 4824752, + "step": 14325 + }, + { + "epoch": 11.0741885625966, + "grad_norm": 1.079765796661377, + "learning_rate": 2.4679672059839774e-05, + "loss": 0.3857, + "num_input_tokens_seen": 4826224, + "step": 14330 + }, + { + "epoch": 11.07805255023184, + "grad_norm": 0.8862722516059875, + "learning_rate": 2.4662813691384486e-05, + "loss": 0.461, + "num_input_tokens_seen": 4827920, + "step": 14335 + }, + { + "epoch": 11.081916537867079, + "grad_norm": 1.6807994842529297, + "learning_rate": 2.464595547628354e-05, + "loss": 0.7272, + "num_input_tokens_seen": 4829616, + "step": 14340 + }, + { + "epoch": 11.085780525502319, + "grad_norm": 0.8663679361343384, + "learning_rate": 2.462909742220418e-05, + "loss": 0.4427, + "num_input_tokens_seen": 4831472, + "step": 14345 + }, + { + "epoch": 11.089644513137557, + "grad_norm": 1.0052882432937622, + "learning_rate": 2.461223953681352e-05, + "loss": 0.4117, + "num_input_tokens_seen": 4833072, + "step": 14350 + }, + { + "epoch": 11.093508500772797, + "grad_norm": 1.1670385599136353, + "learning_rate": 2.4595381827778655e-05, + "loss": 0.46, + "num_input_tokens_seen": 4834544, + "step": 14355 + }, + { + "epoch": 11.097372488408038, + "grad_norm": 2.3007593154907227, + "learning_rate": 2.4578524302766554e-05, + "loss": 0.676, + "num_input_tokens_seen": 4836080, + "step": 14360 + }, + { + "epoch": 11.101236476043276, + "grad_norm": 0.7840058207511902, + "learning_rate": 2.456166696944412e-05, + "loss": 0.388, + "num_input_tokens_seen": 4837744, + "step": 14365 + }, + { + "epoch": 11.105100463678516, + "grad_norm": 0.6871770620346069, + "learning_rate": 2.4544809835478175e-05, + "loss": 0.4571, + "num_input_tokens_seen": 4839216, + "step": 14370 + }, + { + "epoch": 11.108964451313756, + "grad_norm": 1.861790418624878, + "learning_rate": 2.4527952908535445e-05, + "loss": 0.4286, + "num_input_tokens_seen": 4840720, + "step": 14375 + }, + { + "epoch": 11.112828438948995, + "grad_norm": 1.0005258321762085, + "learning_rate": 2.4511096196282547e-05, + "loss": 0.3287, + "num_input_tokens_seen": 4842576, + "step": 14380 + }, + { + "epoch": 11.116692426584235, + "grad_norm": 0.5872525572776794, + "learning_rate": 2.4494239706386037e-05, + "loss": 0.7287, + "num_input_tokens_seen": 4844336, + "step": 14385 + }, + { + "epoch": 11.120556414219475, + "grad_norm": 1.0230430364608765, + "learning_rate": 2.4477383446512338e-05, + "loss": 0.4877, + "num_input_tokens_seen": 4846096, + "step": 14390 + }, + { + "epoch": 11.124420401854714, + "grad_norm": 0.9460806250572205, + "learning_rate": 2.446052742432778e-05, + "loss": 0.3928, + "num_input_tokens_seen": 4847824, + "step": 14395 + }, + { + "epoch": 11.128284389489954, + "grad_norm": 0.895838737487793, + "learning_rate": 2.4443671647498577e-05, + "loss": 0.4415, + "num_input_tokens_seen": 4849456, + "step": 14400 + }, + { + "epoch": 11.132148377125192, + "grad_norm": 0.9957274794578552, + "learning_rate": 2.442681612369084e-05, + "loss": 0.4146, + "num_input_tokens_seen": 4850928, + "step": 14405 + }, + { + "epoch": 11.136012364760433, + "grad_norm": 1.2116020917892456, + "learning_rate": 2.4409960860570566e-05, + "loss": 0.3908, + "num_input_tokens_seen": 4852688, + "step": 14410 + }, + { + "epoch": 11.139876352395673, + "grad_norm": 1.0037299394607544, + "learning_rate": 2.439310586580362e-05, + "loss": 0.4056, + "num_input_tokens_seen": 4854320, + "step": 14415 + }, + { + "epoch": 11.143740340030911, + "grad_norm": 0.9636319279670715, + "learning_rate": 2.4376251147055757e-05, + "loss": 0.3722, + "num_input_tokens_seen": 4855824, + "step": 14420 + }, + { + "epoch": 11.147604327666151, + "grad_norm": 1.1338610649108887, + "learning_rate": 2.4359396711992617e-05, + "loss": 0.4216, + "num_input_tokens_seen": 4857552, + "step": 14425 + }, + { + "epoch": 11.151468315301392, + "grad_norm": 0.8677776455879211, + "learning_rate": 2.4342542568279687e-05, + "loss": 0.3751, + "num_input_tokens_seen": 4859344, + "step": 14430 + }, + { + "epoch": 11.15533230293663, + "grad_norm": 0.8040023446083069, + "learning_rate": 2.432568872358233e-05, + "loss": 0.6796, + "num_input_tokens_seen": 4860944, + "step": 14435 + }, + { + "epoch": 11.15919629057187, + "grad_norm": 0.5877929925918579, + "learning_rate": 2.430883518556579e-05, + "loss": 0.405, + "num_input_tokens_seen": 4862576, + "step": 14440 + }, + { + "epoch": 11.16306027820711, + "grad_norm": 0.9531632661819458, + "learning_rate": 2.429198196189514e-05, + "loss": 0.3807, + "num_input_tokens_seen": 4864368, + "step": 14445 + }, + { + "epoch": 11.166924265842349, + "grad_norm": 0.7936416268348694, + "learning_rate": 2.4275129060235332e-05, + "loss": 0.4334, + "num_input_tokens_seen": 4866128, + "step": 14450 + }, + { + "epoch": 11.170788253477589, + "grad_norm": 1.215572476387024, + "learning_rate": 2.4258276488251172e-05, + "loss": 0.5419, + "num_input_tokens_seen": 4867984, + "step": 14455 + }, + { + "epoch": 11.17465224111283, + "grad_norm": 0.8434062004089355, + "learning_rate": 2.42414242536073e-05, + "loss": 0.5122, + "num_input_tokens_seen": 4869584, + "step": 14460 + }, + { + "epoch": 11.178516228748068, + "grad_norm": 0.851548433303833, + "learning_rate": 2.422457236396823e-05, + "loss": 0.4419, + "num_input_tokens_seen": 4871376, + "step": 14465 + }, + { + "epoch": 11.182380216383308, + "grad_norm": 0.7365286350250244, + "learning_rate": 2.4207720826998284e-05, + "loss": 0.424, + "num_input_tokens_seen": 4873136, + "step": 14470 + }, + { + "epoch": 11.186244204018546, + "grad_norm": 0.6400526762008667, + "learning_rate": 2.419086965036166e-05, + "loss": 0.5025, + "num_input_tokens_seen": 4874800, + "step": 14475 + }, + { + "epoch": 11.190108191653787, + "grad_norm": 1.1530317068099976, + "learning_rate": 2.417401884172236e-05, + "loss": 0.5793, + "num_input_tokens_seen": 4876528, + "step": 14480 + }, + { + "epoch": 11.193972179289027, + "grad_norm": 1.107296347618103, + "learning_rate": 2.4157168408744235e-05, + "loss": 0.3407, + "num_input_tokens_seen": 4878320, + "step": 14485 + }, + { + "epoch": 11.197836166924265, + "grad_norm": 1.8204528093338013, + "learning_rate": 2.414031835909097e-05, + "loss": 0.4558, + "num_input_tokens_seen": 4879920, + "step": 14490 + }, + { + "epoch": 11.201700154559505, + "grad_norm": 0.6165599822998047, + "learning_rate": 2.4123468700426065e-05, + "loss": 0.443, + "num_input_tokens_seen": 4881872, + "step": 14495 + }, + { + "epoch": 11.205564142194746, + "grad_norm": 0.9416526556015015, + "learning_rate": 2.4106619440412835e-05, + "loss": 0.418, + "num_input_tokens_seen": 4883440, + "step": 14500 + }, + { + "epoch": 11.209428129829984, + "grad_norm": 1.041098952293396, + "learning_rate": 2.4089770586714436e-05, + "loss": 0.3671, + "num_input_tokens_seen": 4885040, + "step": 14505 + }, + { + "epoch": 11.213292117465224, + "grad_norm": 1.1705303192138672, + "learning_rate": 2.407292214699383e-05, + "loss": 0.5241, + "num_input_tokens_seen": 4886896, + "step": 14510 + }, + { + "epoch": 11.217156105100464, + "grad_norm": 0.7866234183311462, + "learning_rate": 2.4056074128913787e-05, + "loss": 0.3609, + "num_input_tokens_seen": 4888528, + "step": 14515 + }, + { + "epoch": 11.221020092735703, + "grad_norm": 1.2179110050201416, + "learning_rate": 2.4039226540136874e-05, + "loss": 0.3733, + "num_input_tokens_seen": 4890064, + "step": 14520 + }, + { + "epoch": 11.224884080370943, + "grad_norm": 1.5968573093414307, + "learning_rate": 2.4022379388325495e-05, + "loss": 0.5871, + "num_input_tokens_seen": 4891888, + "step": 14525 + }, + { + "epoch": 11.228748068006182, + "grad_norm": 0.8139042258262634, + "learning_rate": 2.4005532681141822e-05, + "loss": 0.535, + "num_input_tokens_seen": 4893616, + "step": 14530 + }, + { + "epoch": 11.232612055641422, + "grad_norm": 1.2139962911605835, + "learning_rate": 2.3988686426247834e-05, + "loss": 0.4158, + "num_input_tokens_seen": 4895152, + "step": 14535 + }, + { + "epoch": 11.236476043276662, + "grad_norm": 0.8381314873695374, + "learning_rate": 2.3971840631305317e-05, + "loss": 0.3285, + "num_input_tokens_seen": 4896976, + "step": 14540 + }, + { + "epoch": 11.2403400309119, + "grad_norm": 1.0596565008163452, + "learning_rate": 2.395499530397584e-05, + "loss": 0.4915, + "num_input_tokens_seen": 4898704, + "step": 14545 + }, + { + "epoch": 11.24420401854714, + "grad_norm": 0.806423008441925, + "learning_rate": 2.393815045192076e-05, + "loss": 0.44, + "num_input_tokens_seen": 4900272, + "step": 14550 + }, + { + "epoch": 11.24806800618238, + "grad_norm": 0.8341187834739685, + "learning_rate": 2.3921306082801208e-05, + "loss": 0.5288, + "num_input_tokens_seen": 4901840, + "step": 14555 + }, + { + "epoch": 11.25193199381762, + "grad_norm": 0.9419524073600769, + "learning_rate": 2.3904462204278117e-05, + "loss": 0.5045, + "num_input_tokens_seen": 4903664, + "step": 14560 + }, + { + "epoch": 11.25579598145286, + "grad_norm": 0.873992919921875, + "learning_rate": 2.3887618824012175e-05, + "loss": 0.4083, + "num_input_tokens_seen": 4905168, + "step": 14565 + }, + { + "epoch": 11.2596599690881, + "grad_norm": 1.1411312818527222, + "learning_rate": 2.387077594966385e-05, + "loss": 0.4964, + "num_input_tokens_seen": 4906864, + "step": 14570 + }, + { + "epoch": 11.263523956723338, + "grad_norm": 0.826043963432312, + "learning_rate": 2.385393358889339e-05, + "loss": 0.3592, + "num_input_tokens_seen": 4908464, + "step": 14575 + }, + { + "epoch": 11.267387944358578, + "grad_norm": 1.659110188484192, + "learning_rate": 2.3837091749360787e-05, + "loss": 0.6761, + "num_input_tokens_seen": 4910192, + "step": 14580 + }, + { + "epoch": 11.271251931993818, + "grad_norm": 0.7931973934173584, + "learning_rate": 2.3820250438725834e-05, + "loss": 0.4108, + "num_input_tokens_seen": 4911760, + "step": 14585 + }, + { + "epoch": 11.275115919629057, + "grad_norm": 0.8046302199363708, + "learning_rate": 2.3803409664648042e-05, + "loss": 0.346, + "num_input_tokens_seen": 4913456, + "step": 14590 + }, + { + "epoch": 11.278979907264297, + "grad_norm": 1.0545457601547241, + "learning_rate": 2.3786569434786696e-05, + "loss": 0.4535, + "num_input_tokens_seen": 4915280, + "step": 14595 + }, + { + "epoch": 11.282843894899536, + "grad_norm": 0.9903993010520935, + "learning_rate": 2.3769729756800845e-05, + "loss": 0.3596, + "num_input_tokens_seen": 4916912, + "step": 14600 + }, + { + "epoch": 11.286707882534776, + "grad_norm": 1.1982008218765259, + "learning_rate": 2.375289063834926e-05, + "loss": 0.4287, + "num_input_tokens_seen": 4918544, + "step": 14605 + }, + { + "epoch": 11.290571870170016, + "grad_norm": 0.7237623929977417, + "learning_rate": 2.3736052087090494e-05, + "loss": 0.3363, + "num_input_tokens_seen": 4920208, + "step": 14610 + }, + { + "epoch": 11.294435857805254, + "grad_norm": 1.0899306535720825, + "learning_rate": 2.37192141106828e-05, + "loss": 0.4539, + "num_input_tokens_seen": 4922000, + "step": 14615 + }, + { + "epoch": 11.298299845440495, + "grad_norm": 1.3016822338104248, + "learning_rate": 2.3702376716784196e-05, + "loss": 0.4111, + "num_input_tokens_seen": 4923600, + "step": 14620 + }, + { + "epoch": 11.302163833075735, + "grad_norm": 0.5854988694190979, + "learning_rate": 2.368553991305244e-05, + "loss": 0.3245, + "num_input_tokens_seen": 4925264, + "step": 14625 + }, + { + "epoch": 11.306027820710973, + "grad_norm": 1.6358275413513184, + "learning_rate": 2.3668703707144993e-05, + "loss": 0.4516, + "num_input_tokens_seen": 4927024, + "step": 14630 + }, + { + "epoch": 11.309891808346213, + "grad_norm": 1.0328454971313477, + "learning_rate": 2.3651868106719082e-05, + "loss": 0.626, + "num_input_tokens_seen": 4928656, + "step": 14635 + }, + { + "epoch": 11.313755795981454, + "grad_norm": 1.0389366149902344, + "learning_rate": 2.363503311943163e-05, + "loss": 0.4569, + "num_input_tokens_seen": 4930192, + "step": 14640 + }, + { + "epoch": 11.317619783616692, + "grad_norm": 1.1743770837783813, + "learning_rate": 2.3618198752939284e-05, + "loss": 0.4404, + "num_input_tokens_seen": 4931792, + "step": 14645 + }, + { + "epoch": 11.321483771251932, + "grad_norm": 2.21855092048645, + "learning_rate": 2.3601365014898427e-05, + "loss": 0.5498, + "num_input_tokens_seen": 4933360, + "step": 14650 + }, + { + "epoch": 11.32534775888717, + "grad_norm": 0.7382971048355103, + "learning_rate": 2.358453191296513e-05, + "loss": 0.5544, + "num_input_tokens_seen": 4934960, + "step": 14655 + }, + { + "epoch": 11.329211746522411, + "grad_norm": 1.426228404045105, + "learning_rate": 2.3567699454795197e-05, + "loss": 0.4524, + "num_input_tokens_seen": 4936656, + "step": 14660 + }, + { + "epoch": 11.333075734157651, + "grad_norm": 1.022573471069336, + "learning_rate": 2.3550867648044127e-05, + "loss": 0.4472, + "num_input_tokens_seen": 4938224, + "step": 14665 + }, + { + "epoch": 11.33693972179289, + "grad_norm": 0.8359565734863281, + "learning_rate": 2.353403650036714e-05, + "loss": 0.3335, + "num_input_tokens_seen": 4939920, + "step": 14670 + }, + { + "epoch": 11.34080370942813, + "grad_norm": 2.1719582080841064, + "learning_rate": 2.351720601941913e-05, + "loss": 0.8517, + "num_input_tokens_seen": 4941296, + "step": 14675 + }, + { + "epoch": 11.34466769706337, + "grad_norm": 0.5982954502105713, + "learning_rate": 2.3500376212854694e-05, + "loss": 0.4773, + "num_input_tokens_seen": 4942992, + "step": 14680 + }, + { + "epoch": 11.348531684698608, + "grad_norm": 0.9643141627311707, + "learning_rate": 2.3483547088328143e-05, + "loss": 0.7015, + "num_input_tokens_seen": 4944848, + "step": 14685 + }, + { + "epoch": 11.352395672333849, + "grad_norm": 1.1119256019592285, + "learning_rate": 2.3466718653493464e-05, + "loss": 0.4349, + "num_input_tokens_seen": 4946416, + "step": 14690 + }, + { + "epoch": 11.356259659969089, + "grad_norm": 0.971078097820282, + "learning_rate": 2.3449890916004312e-05, + "loss": 0.4002, + "num_input_tokens_seen": 4948112, + "step": 14695 + }, + { + "epoch": 11.360123647604327, + "grad_norm": 0.9194567203521729, + "learning_rate": 2.343306388351405e-05, + "loss": 0.4724, + "num_input_tokens_seen": 4950000, + "step": 14700 + }, + { + "epoch": 11.363987635239567, + "grad_norm": 0.6490898132324219, + "learning_rate": 2.341623756367573e-05, + "loss": 0.4519, + "num_input_tokens_seen": 4951664, + "step": 14705 + }, + { + "epoch": 11.367851622874808, + "grad_norm": 1.347347617149353, + "learning_rate": 2.3399411964142054e-05, + "loss": 0.4834, + "num_input_tokens_seen": 4953584, + "step": 14710 + }, + { + "epoch": 11.371715610510046, + "grad_norm": 0.9837502837181091, + "learning_rate": 2.3382587092565393e-05, + "loss": 0.3963, + "num_input_tokens_seen": 4955216, + "step": 14715 + }, + { + "epoch": 11.375579598145286, + "grad_norm": 0.7703550457954407, + "learning_rate": 2.3365762956597813e-05, + "loss": 0.4636, + "num_input_tokens_seen": 4956848, + "step": 14720 + }, + { + "epoch": 11.379443585780525, + "grad_norm": 1.0406757593154907, + "learning_rate": 2.3348939563891032e-05, + "loss": 0.5537, + "num_input_tokens_seen": 4958512, + "step": 14725 + }, + { + "epoch": 11.383307573415765, + "grad_norm": 1.71559739112854, + "learning_rate": 2.3332116922096414e-05, + "loss": 0.5297, + "num_input_tokens_seen": 4960016, + "step": 14730 + }, + { + "epoch": 11.387171561051005, + "grad_norm": 0.9425277709960938, + "learning_rate": 2.331529503886502e-05, + "loss": 0.362, + "num_input_tokens_seen": 4961808, + "step": 14735 + }, + { + "epoch": 11.391035548686244, + "grad_norm": 0.8557973504066467, + "learning_rate": 2.3298473921847512e-05, + "loss": 0.6111, + "num_input_tokens_seen": 4963152, + "step": 14740 + }, + { + "epoch": 11.394899536321484, + "grad_norm": 1.0059163570404053, + "learning_rate": 2.3281653578694274e-05, + "loss": 0.4816, + "num_input_tokens_seen": 4964912, + "step": 14745 + }, + { + "epoch": 11.398763523956724, + "grad_norm": 0.7389404773712158, + "learning_rate": 2.326483401705527e-05, + "loss": 0.8417, + "num_input_tokens_seen": 4966416, + "step": 14750 + }, + { + "epoch": 11.402627511591962, + "grad_norm": 0.7172061204910278, + "learning_rate": 2.3248015244580153e-05, + "loss": 0.3704, + "num_input_tokens_seen": 4967856, + "step": 14755 + }, + { + "epoch": 11.406491499227203, + "grad_norm": 2.0736474990844727, + "learning_rate": 2.3231197268918192e-05, + "loss": 0.4236, + "num_input_tokens_seen": 4969584, + "step": 14760 + }, + { + "epoch": 11.410355486862443, + "grad_norm": 0.9410192370414734, + "learning_rate": 2.3214380097718306e-05, + "loss": 0.4157, + "num_input_tokens_seen": 4971024, + "step": 14765 + }, + { + "epoch": 11.414219474497681, + "grad_norm": 0.8883486986160278, + "learning_rate": 2.3197563738629046e-05, + "loss": 0.4284, + "num_input_tokens_seen": 4972336, + "step": 14770 + }, + { + "epoch": 11.418083462132921, + "grad_norm": 0.834237277507782, + "learning_rate": 2.3180748199298593e-05, + "loss": 0.3778, + "num_input_tokens_seen": 4974032, + "step": 14775 + }, + { + "epoch": 11.42194744976816, + "grad_norm": 1.1675045490264893, + "learning_rate": 2.3163933487374745e-05, + "loss": 0.4344, + "num_input_tokens_seen": 4975536, + "step": 14780 + }, + { + "epoch": 11.4258114374034, + "grad_norm": 0.6203500628471375, + "learning_rate": 2.3147119610504946e-05, + "loss": 0.3329, + "num_input_tokens_seen": 4977008, + "step": 14785 + }, + { + "epoch": 11.42967542503864, + "grad_norm": 0.9011710286140442, + "learning_rate": 2.313030657633625e-05, + "loss": 0.3606, + "num_input_tokens_seen": 4978384, + "step": 14790 + }, + { + "epoch": 11.433539412673879, + "grad_norm": 0.757209062576294, + "learning_rate": 2.3113494392515324e-05, + "loss": 0.4511, + "num_input_tokens_seen": 4979952, + "step": 14795 + }, + { + "epoch": 11.437403400309119, + "grad_norm": 0.7960240840911865, + "learning_rate": 2.3096683066688438e-05, + "loss": 0.3341, + "num_input_tokens_seen": 4981552, + "step": 14800 + }, + { + "epoch": 11.44126738794436, + "grad_norm": 1.134954571723938, + "learning_rate": 2.3079872606501495e-05, + "loss": 0.4602, + "num_input_tokens_seen": 4983152, + "step": 14805 + }, + { + "epoch": 11.445131375579598, + "grad_norm": 1.7408809661865234, + "learning_rate": 2.306306301959999e-05, + "loss": 0.4384, + "num_input_tokens_seen": 4984880, + "step": 14810 + }, + { + "epoch": 11.448995363214838, + "grad_norm": 1.5439568758010864, + "learning_rate": 2.3046254313629023e-05, + "loss": 0.3851, + "num_input_tokens_seen": 4986608, + "step": 14815 + }, + { + "epoch": 11.452859350850078, + "grad_norm": 1.3083860874176025, + "learning_rate": 2.3029446496233286e-05, + "loss": 0.4778, + "num_input_tokens_seen": 4988016, + "step": 14820 + }, + { + "epoch": 11.456723338485316, + "grad_norm": 0.7549315094947815, + "learning_rate": 2.3012639575057092e-05, + "loss": 0.5662, + "num_input_tokens_seen": 4989680, + "step": 14825 + }, + { + "epoch": 11.460587326120557, + "grad_norm": 0.9246808886528015, + "learning_rate": 2.2995833557744326e-05, + "loss": 0.4528, + "num_input_tokens_seen": 4991600, + "step": 14830 + }, + { + "epoch": 11.464451313755795, + "grad_norm": 1.6138570308685303, + "learning_rate": 2.2979028451938452e-05, + "loss": 0.46, + "num_input_tokens_seen": 4993456, + "step": 14835 + }, + { + "epoch": 11.468315301391035, + "grad_norm": 1.1445684432983398, + "learning_rate": 2.296222426528255e-05, + "loss": 0.4305, + "num_input_tokens_seen": 4995120, + "step": 14840 + }, + { + "epoch": 11.472179289026275, + "grad_norm": 1.4240186214447021, + "learning_rate": 2.2945421005419252e-05, + "loss": 0.4765, + "num_input_tokens_seen": 4996816, + "step": 14845 + }, + { + "epoch": 11.476043276661514, + "grad_norm": 1.1666303873062134, + "learning_rate": 2.292861867999078e-05, + "loss": 0.404, + "num_input_tokens_seen": 4998448, + "step": 14850 + }, + { + "epoch": 11.479907264296754, + "grad_norm": 1.1255382299423218, + "learning_rate": 2.2911817296638947e-05, + "loss": 0.3898, + "num_input_tokens_seen": 5000112, + "step": 14855 + }, + { + "epoch": 11.483771251931994, + "grad_norm": 1.0315372943878174, + "learning_rate": 2.28950168630051e-05, + "loss": 0.4177, + "num_input_tokens_seen": 5001584, + "step": 14860 + }, + { + "epoch": 11.487635239567233, + "grad_norm": 0.8964202404022217, + "learning_rate": 2.2878217386730196e-05, + "loss": 0.6014, + "num_input_tokens_seen": 5003408, + "step": 14865 + }, + { + "epoch": 11.491499227202473, + "grad_norm": 1.352885365486145, + "learning_rate": 2.286141887545473e-05, + "loss": 0.4543, + "num_input_tokens_seen": 5005360, + "step": 14870 + }, + { + "epoch": 11.495363214837713, + "grad_norm": 1.3331842422485352, + "learning_rate": 2.2844621336818774e-05, + "loss": 0.4037, + "num_input_tokens_seen": 5007024, + "step": 14875 + }, + { + "epoch": 11.499227202472952, + "grad_norm": 0.8418006896972656, + "learning_rate": 2.282782477846194e-05, + "loss": 0.3956, + "num_input_tokens_seen": 5008592, + "step": 14880 + }, + { + "epoch": 11.503091190108192, + "grad_norm": 0.9119228720664978, + "learning_rate": 2.2811029208023403e-05, + "loss": 0.4599, + "num_input_tokens_seen": 5010448, + "step": 14885 + }, + { + "epoch": 11.506955177743432, + "grad_norm": 0.6914101839065552, + "learning_rate": 2.279423463314189e-05, + "loss": 0.3316, + "num_input_tokens_seen": 5012080, + "step": 14890 + }, + { + "epoch": 11.51081916537867, + "grad_norm": 0.9551845192909241, + "learning_rate": 2.277744106145568e-05, + "loss": 0.4783, + "num_input_tokens_seen": 5013776, + "step": 14895 + }, + { + "epoch": 11.51468315301391, + "grad_norm": 0.9804009795188904, + "learning_rate": 2.276064850060258e-05, + "loss": 0.431, + "num_input_tokens_seen": 5015440, + "step": 14900 + }, + { + "epoch": 11.51854714064915, + "grad_norm": 1.07405686378479, + "learning_rate": 2.274385695821995e-05, + "loss": 0.3543, + "num_input_tokens_seen": 5017072, + "step": 14905 + }, + { + "epoch": 11.52241112828439, + "grad_norm": 0.5324490070343018, + "learning_rate": 2.2727066441944693e-05, + "loss": 0.4161, + "num_input_tokens_seen": 5018800, + "step": 14910 + }, + { + "epoch": 11.52627511591963, + "grad_norm": 0.9378987550735474, + "learning_rate": 2.2710276959413236e-05, + "loss": 0.4922, + "num_input_tokens_seen": 5020336, + "step": 14915 + }, + { + "epoch": 11.530139103554868, + "grad_norm": 1.2325400114059448, + "learning_rate": 2.269348851826152e-05, + "loss": 0.4152, + "num_input_tokens_seen": 5022192, + "step": 14920 + }, + { + "epoch": 11.534003091190108, + "grad_norm": 0.8577722907066345, + "learning_rate": 2.2676701126125044e-05, + "loss": 0.5543, + "num_input_tokens_seen": 5023600, + "step": 14925 + }, + { + "epoch": 11.537867078825348, + "grad_norm": 0.7252225279808044, + "learning_rate": 2.2659914790638813e-05, + "loss": 0.5055, + "num_input_tokens_seen": 5025168, + "step": 14930 + }, + { + "epoch": 11.541731066460587, + "grad_norm": 1.4641350507736206, + "learning_rate": 2.2643129519437344e-05, + "loss": 0.4365, + "num_input_tokens_seen": 5026864, + "step": 14935 + }, + { + "epoch": 11.545595054095827, + "grad_norm": 0.8375564217567444, + "learning_rate": 2.2626345320154676e-05, + "loss": 0.3123, + "num_input_tokens_seen": 5028496, + "step": 14940 + }, + { + "epoch": 11.549459041731067, + "grad_norm": 1.0921297073364258, + "learning_rate": 2.2609562200424384e-05, + "loss": 0.4343, + "num_input_tokens_seen": 5030640, + "step": 14945 + }, + { + "epoch": 11.553323029366306, + "grad_norm": 0.9175233840942383, + "learning_rate": 2.2592780167879518e-05, + "loss": 0.4431, + "num_input_tokens_seen": 5032400, + "step": 14950 + }, + { + "epoch": 11.557187017001546, + "grad_norm": 1.4318376779556274, + "learning_rate": 2.2575999230152644e-05, + "loss": 0.3621, + "num_input_tokens_seen": 5034096, + "step": 14955 + }, + { + "epoch": 11.561051004636784, + "grad_norm": 1.1270090341567993, + "learning_rate": 2.255921939487584e-05, + "loss": 0.3643, + "num_input_tokens_seen": 5036016, + "step": 14960 + }, + { + "epoch": 11.564914992272024, + "grad_norm": 1.0487334728240967, + "learning_rate": 2.2542440669680676e-05, + "loss": 0.5126, + "num_input_tokens_seen": 5037680, + "step": 14965 + }, + { + "epoch": 11.568778979907265, + "grad_norm": 1.2174023389816284, + "learning_rate": 2.2525663062198208e-05, + "loss": 0.5297, + "num_input_tokens_seen": 5039312, + "step": 14970 + }, + { + "epoch": 11.572642967542503, + "grad_norm": 1.3397058248519897, + "learning_rate": 2.2508886580059e-05, + "loss": 0.4655, + "num_input_tokens_seen": 5041072, + "step": 14975 + }, + { + "epoch": 11.576506955177743, + "grad_norm": 1.7723979949951172, + "learning_rate": 2.2492111230893085e-05, + "loss": 0.5189, + "num_input_tokens_seen": 5042704, + "step": 14980 + }, + { + "epoch": 11.580370942812984, + "grad_norm": 2.113835096359253, + "learning_rate": 2.247533702233001e-05, + "loss": 0.5154, + "num_input_tokens_seen": 5044208, + "step": 14985 + }, + { + "epoch": 11.584234930448222, + "grad_norm": 1.0993317365646362, + "learning_rate": 2.2458563961998775e-05, + "loss": 0.4722, + "num_input_tokens_seen": 5045808, + "step": 14990 + }, + { + "epoch": 11.588098918083462, + "grad_norm": 0.8029483556747437, + "learning_rate": 2.2441792057527873e-05, + "loss": 0.2997, + "num_input_tokens_seen": 5047440, + "step": 14995 + }, + { + "epoch": 11.591962905718702, + "grad_norm": 1.4351609945297241, + "learning_rate": 2.2425021316545262e-05, + "loss": 0.5036, + "num_input_tokens_seen": 5049232, + "step": 15000 + }, + { + "epoch": 11.59582689335394, + "grad_norm": 1.541358232498169, + "learning_rate": 2.2408251746678374e-05, + "loss": 0.3555, + "num_input_tokens_seen": 5050800, + "step": 15005 + }, + { + "epoch": 11.599690880989181, + "grad_norm": 1.0010231733322144, + "learning_rate": 2.239148335555412e-05, + "loss": 0.4849, + "num_input_tokens_seen": 5052144, + "step": 15010 + }, + { + "epoch": 11.603554868624421, + "grad_norm": 1.8421694040298462, + "learning_rate": 2.2374716150798856e-05, + "loss": 0.6419, + "num_input_tokens_seen": 5054096, + "step": 15015 + }, + { + "epoch": 11.60741885625966, + "grad_norm": 0.963641345500946, + "learning_rate": 2.2357950140038397e-05, + "loss": 0.3993, + "num_input_tokens_seen": 5055824, + "step": 15020 + }, + { + "epoch": 11.6112828438949, + "grad_norm": 0.6796716451644897, + "learning_rate": 2.2341185330898043e-05, + "loss": 0.5197, + "num_input_tokens_seen": 5057392, + "step": 15025 + }, + { + "epoch": 11.615146831530138, + "grad_norm": 1.0473697185516357, + "learning_rate": 2.232442173100253e-05, + "loss": 0.57, + "num_input_tokens_seen": 5059056, + "step": 15030 + }, + { + "epoch": 11.619010819165378, + "grad_norm": 0.8567425012588501, + "learning_rate": 2.2307659347976033e-05, + "loss": 0.3873, + "num_input_tokens_seen": 5060720, + "step": 15035 + }, + { + "epoch": 11.622874806800619, + "grad_norm": 0.6999382972717285, + "learning_rate": 2.229089818944219e-05, + "loss": 0.3924, + "num_input_tokens_seen": 5062384, + "step": 15040 + }, + { + "epoch": 11.626738794435857, + "grad_norm": 1.0622963905334473, + "learning_rate": 2.2274138263024074e-05, + "loss": 0.3751, + "num_input_tokens_seen": 5064016, + "step": 15045 + }, + { + "epoch": 11.630602782071097, + "grad_norm": 1.0290998220443726, + "learning_rate": 2.2257379576344203e-05, + "loss": 0.4453, + "num_input_tokens_seen": 5065616, + "step": 15050 + }, + { + "epoch": 11.634466769706338, + "grad_norm": 1.604946255683899, + "learning_rate": 2.2240622137024522e-05, + "loss": 0.5303, + "num_input_tokens_seen": 5067376, + "step": 15055 + }, + { + "epoch": 11.638330757341576, + "grad_norm": 1.123206377029419, + "learning_rate": 2.222386595268641e-05, + "loss": 0.4727, + "num_input_tokens_seen": 5069072, + "step": 15060 + }, + { + "epoch": 11.642194744976816, + "grad_norm": 1.2704401016235352, + "learning_rate": 2.22071110309507e-05, + "loss": 0.4859, + "num_input_tokens_seen": 5070864, + "step": 15065 + }, + { + "epoch": 11.646058732612056, + "grad_norm": 1.119348406791687, + "learning_rate": 2.219035737943762e-05, + "loss": 0.4986, + "num_input_tokens_seen": 5072592, + "step": 15070 + }, + { + "epoch": 11.649922720247295, + "grad_norm": 1.0561310052871704, + "learning_rate": 2.2173605005766825e-05, + "loss": 0.3872, + "num_input_tokens_seen": 5074256, + "step": 15075 + }, + { + "epoch": 11.653786707882535, + "grad_norm": 1.0518296957015991, + "learning_rate": 2.21568539175574e-05, + "loss": 0.3567, + "num_input_tokens_seen": 5075952, + "step": 15080 + }, + { + "epoch": 11.657650695517773, + "grad_norm": 1.308140754699707, + "learning_rate": 2.2140104122427848e-05, + "loss": 0.3713, + "num_input_tokens_seen": 5077744, + "step": 15085 + }, + { + "epoch": 11.661514683153014, + "grad_norm": 1.2635273933410645, + "learning_rate": 2.212335562799606e-05, + "loss": 0.4131, + "num_input_tokens_seen": 5079536, + "step": 15090 + }, + { + "epoch": 11.665378670788254, + "grad_norm": 1.3814412355422974, + "learning_rate": 2.2106608441879363e-05, + "loss": 0.5069, + "num_input_tokens_seen": 5081360, + "step": 15095 + }, + { + "epoch": 11.669242658423492, + "grad_norm": 1.5741840600967407, + "learning_rate": 2.2089862571694465e-05, + "loss": 0.4193, + "num_input_tokens_seen": 5083248, + "step": 15100 + }, + { + "epoch": 11.673106646058732, + "grad_norm": 1.0519064664840698, + "learning_rate": 2.207311802505751e-05, + "loss": 0.4159, + "num_input_tokens_seen": 5084976, + "step": 15105 + }, + { + "epoch": 11.676970633693973, + "grad_norm": 1.3646507263183594, + "learning_rate": 2.2056374809583998e-05, + "loss": 0.5767, + "num_input_tokens_seen": 5086544, + "step": 15110 + }, + { + "epoch": 11.680834621329211, + "grad_norm": 1.445197343826294, + "learning_rate": 2.203963293288886e-05, + "loss": 0.5618, + "num_input_tokens_seen": 5088112, + "step": 15115 + }, + { + "epoch": 11.684698608964451, + "grad_norm": 1.253570318222046, + "learning_rate": 2.202289240258639e-05, + "loss": 0.6865, + "num_input_tokens_seen": 5089968, + "step": 15120 + }, + { + "epoch": 11.688562596599692, + "grad_norm": 0.8401662707328796, + "learning_rate": 2.200615322629028e-05, + "loss": 0.5267, + "num_input_tokens_seen": 5091472, + "step": 15125 + }, + { + "epoch": 11.69242658423493, + "grad_norm": 1.1110023260116577, + "learning_rate": 2.198941541161362e-05, + "loss": 0.4052, + "num_input_tokens_seen": 5092944, + "step": 15130 + }, + { + "epoch": 11.69629057187017, + "grad_norm": 0.896393358707428, + "learning_rate": 2.1972678966168857e-05, + "loss": 0.4803, + "num_input_tokens_seen": 5094608, + "step": 15135 + }, + { + "epoch": 11.70015455950541, + "grad_norm": 1.0043491125106812, + "learning_rate": 2.195594389756782e-05, + "loss": 0.3995, + "num_input_tokens_seen": 5096304, + "step": 15140 + }, + { + "epoch": 11.704018547140649, + "grad_norm": 1.0947543382644653, + "learning_rate": 2.193921021342173e-05, + "loss": 0.6338, + "num_input_tokens_seen": 5097968, + "step": 15145 + }, + { + "epoch": 11.707882534775889, + "grad_norm": 2.122551679611206, + "learning_rate": 2.192247792134118e-05, + "loss": 0.5432, + "num_input_tokens_seen": 5099888, + "step": 15150 + }, + { + "epoch": 11.711746522411127, + "grad_norm": 1.1866618394851685, + "learning_rate": 2.1905747028936093e-05, + "loss": 0.469, + "num_input_tokens_seen": 5101584, + "step": 15155 + }, + { + "epoch": 11.715610510046368, + "grad_norm": 0.8925483226776123, + "learning_rate": 2.188901754381579e-05, + "loss": 0.3576, + "num_input_tokens_seen": 5103536, + "step": 15160 + }, + { + "epoch": 11.719474497681608, + "grad_norm": 1.157280683517456, + "learning_rate": 2.187228947358894e-05, + "loss": 0.6388, + "num_input_tokens_seen": 5105264, + "step": 15165 + }, + { + "epoch": 11.723338485316846, + "grad_norm": 2.1671907901763916, + "learning_rate": 2.185556282586357e-05, + "loss": 0.4679, + "num_input_tokens_seen": 5106960, + "step": 15170 + }, + { + "epoch": 11.727202472952087, + "grad_norm": 0.8286451101303101, + "learning_rate": 2.183883760824705e-05, + "loss": 0.3994, + "num_input_tokens_seen": 5108688, + "step": 15175 + }, + { + "epoch": 11.731066460587327, + "grad_norm": 0.9466137886047363, + "learning_rate": 2.1822113828346124e-05, + "loss": 0.5255, + "num_input_tokens_seen": 5110672, + "step": 15180 + }, + { + "epoch": 11.734930448222565, + "grad_norm": 1.143232822418213, + "learning_rate": 2.1805391493766854e-05, + "loss": 0.4854, + "num_input_tokens_seen": 5112688, + "step": 15185 + }, + { + "epoch": 11.738794435857805, + "grad_norm": 1.1668983697891235, + "learning_rate": 2.178867061211467e-05, + "loss": 0.3871, + "num_input_tokens_seen": 5114288, + "step": 15190 + }, + { + "epoch": 11.742658423493046, + "grad_norm": 1.2909685373306274, + "learning_rate": 2.177195119099432e-05, + "loss": 0.436, + "num_input_tokens_seen": 5116112, + "step": 15195 + }, + { + "epoch": 11.746522411128284, + "grad_norm": 1.3106898069381714, + "learning_rate": 2.1755233238009904e-05, + "loss": 0.3674, + "num_input_tokens_seen": 5117968, + "step": 15200 + }, + { + "epoch": 11.750386398763524, + "grad_norm": 1.0284228324890137, + "learning_rate": 2.1738516760764843e-05, + "loss": 0.4802, + "num_input_tokens_seen": 5119632, + "step": 15205 + }, + { + "epoch": 11.754250386398763, + "grad_norm": 0.8425537347793579, + "learning_rate": 2.172180176686189e-05, + "loss": 0.4922, + "num_input_tokens_seen": 5121360, + "step": 15210 + }, + { + "epoch": 11.758114374034003, + "grad_norm": 1.2954424619674683, + "learning_rate": 2.1705088263903127e-05, + "loss": 0.5551, + "num_input_tokens_seen": 5123120, + "step": 15215 + }, + { + "epoch": 11.761978361669243, + "grad_norm": 0.9608129858970642, + "learning_rate": 2.1688376259489958e-05, + "loss": 0.4327, + "num_input_tokens_seen": 5124976, + "step": 15220 + }, + { + "epoch": 11.765842349304481, + "grad_norm": 1.2911005020141602, + "learning_rate": 2.1671665761223087e-05, + "loss": 0.4712, + "num_input_tokens_seen": 5126640, + "step": 15225 + }, + { + "epoch": 11.769706336939722, + "grad_norm": 1.682766318321228, + "learning_rate": 2.1654956776702563e-05, + "loss": 0.5833, + "num_input_tokens_seen": 5128208, + "step": 15230 + }, + { + "epoch": 11.773570324574962, + "grad_norm": 1.396045207977295, + "learning_rate": 2.1638249313527737e-05, + "loss": 0.3321, + "num_input_tokens_seen": 5129616, + "step": 15235 + }, + { + "epoch": 11.7774343122102, + "grad_norm": 1.2590371370315552, + "learning_rate": 2.1621543379297258e-05, + "loss": 0.7721, + "num_input_tokens_seen": 5131184, + "step": 15240 + }, + { + "epoch": 11.78129829984544, + "grad_norm": 0.6970559358596802, + "learning_rate": 2.1604838981609075e-05, + "loss": 0.43, + "num_input_tokens_seen": 5132848, + "step": 15245 + }, + { + "epoch": 11.78516228748068, + "grad_norm": 1.173458218574524, + "learning_rate": 2.158813612806046e-05, + "loss": 0.4653, + "num_input_tokens_seen": 5134384, + "step": 15250 + }, + { + "epoch": 11.78902627511592, + "grad_norm": 1.2420564889907837, + "learning_rate": 2.1571434826247973e-05, + "loss": 0.446, + "num_input_tokens_seen": 5136144, + "step": 15255 + }, + { + "epoch": 11.79289026275116, + "grad_norm": 0.8537799119949341, + "learning_rate": 2.1554735083767447e-05, + "loss": 0.4502, + "num_input_tokens_seen": 5137872, + "step": 15260 + }, + { + "epoch": 11.7967542503864, + "grad_norm": 0.7899070978164673, + "learning_rate": 2.153803690821404e-05, + "loss": 0.4113, + "num_input_tokens_seen": 5139600, + "step": 15265 + }, + { + "epoch": 11.800618238021638, + "grad_norm": 1.6663321256637573, + "learning_rate": 2.152134030718218e-05, + "loss": 0.3871, + "num_input_tokens_seen": 5141264, + "step": 15270 + }, + { + "epoch": 11.804482225656878, + "grad_norm": 0.7626575231552124, + "learning_rate": 2.150464528826559e-05, + "loss": 0.3576, + "num_input_tokens_seen": 5142800, + "step": 15275 + }, + { + "epoch": 11.808346213292117, + "grad_norm": 0.8468099236488342, + "learning_rate": 2.1487951859057248e-05, + "loss": 0.5502, + "num_input_tokens_seen": 5144784, + "step": 15280 + }, + { + "epoch": 11.812210200927357, + "grad_norm": 1.0238662958145142, + "learning_rate": 2.147126002714944e-05, + "loss": 0.3752, + "num_input_tokens_seen": 5146448, + "step": 15285 + }, + { + "epoch": 11.816074188562597, + "grad_norm": 0.9614712595939636, + "learning_rate": 2.14545698001337e-05, + "loss": 0.4169, + "num_input_tokens_seen": 5148176, + "step": 15290 + }, + { + "epoch": 11.819938176197835, + "grad_norm": 1.1828287839889526, + "learning_rate": 2.1437881185600845e-05, + "loss": 0.6685, + "num_input_tokens_seen": 5150000, + "step": 15295 + }, + { + "epoch": 11.823802163833076, + "grad_norm": 0.9059768915176392, + "learning_rate": 2.1421194191140965e-05, + "loss": 0.4852, + "num_input_tokens_seen": 5152016, + "step": 15300 + }, + { + "epoch": 11.827666151468316, + "grad_norm": 2.2449874877929688, + "learning_rate": 2.1404508824343388e-05, + "loss": 0.5353, + "num_input_tokens_seen": 5153776, + "step": 15305 + }, + { + "epoch": 11.831530139103554, + "grad_norm": 0.8473239541053772, + "learning_rate": 2.1387825092796742e-05, + "loss": 0.3749, + "num_input_tokens_seen": 5155504, + "step": 15310 + }, + { + "epoch": 11.835394126738795, + "grad_norm": 1.0484373569488525, + "learning_rate": 2.137114300408888e-05, + "loss": 0.6096, + "num_input_tokens_seen": 5157168, + "step": 15315 + }, + { + "epoch": 11.839258114374035, + "grad_norm": 1.1728549003601074, + "learning_rate": 2.13544625658069e-05, + "loss": 0.7371, + "num_input_tokens_seen": 5158928, + "step": 15320 + }, + { + "epoch": 11.843122102009273, + "grad_norm": 1.4883272647857666, + "learning_rate": 2.1337783785537184e-05, + "loss": 0.5499, + "num_input_tokens_seen": 5160688, + "step": 15325 + }, + { + "epoch": 11.846986089644513, + "grad_norm": 0.8416780829429626, + "learning_rate": 2.1321106670865332e-05, + "loss": 0.3883, + "num_input_tokens_seen": 5162352, + "step": 15330 + }, + { + "epoch": 11.850850077279752, + "grad_norm": 1.2094547748565674, + "learning_rate": 2.1304431229376204e-05, + "loss": 0.3656, + "num_input_tokens_seen": 5164144, + "step": 15335 + }, + { + "epoch": 11.854714064914992, + "grad_norm": 1.0672295093536377, + "learning_rate": 2.1287757468653882e-05, + "loss": 0.4572, + "num_input_tokens_seen": 5166096, + "step": 15340 + }, + { + "epoch": 11.858578052550232, + "grad_norm": 0.9970989227294922, + "learning_rate": 2.1271085396281684e-05, + "loss": 0.38, + "num_input_tokens_seen": 5167888, + "step": 15345 + }, + { + "epoch": 11.86244204018547, + "grad_norm": 0.8690469861030579, + "learning_rate": 2.1254415019842193e-05, + "loss": 0.4441, + "num_input_tokens_seen": 5169744, + "step": 15350 + }, + { + "epoch": 11.86630602782071, + "grad_norm": 0.6535691618919373, + "learning_rate": 2.1237746346917174e-05, + "loss": 0.4258, + "num_input_tokens_seen": 5171472, + "step": 15355 + }, + { + "epoch": 11.870170015455951, + "grad_norm": 0.8732820153236389, + "learning_rate": 2.1221079385087654e-05, + "loss": 0.394, + "num_input_tokens_seen": 5173328, + "step": 15360 + }, + { + "epoch": 11.87403400309119, + "grad_norm": 1.711259365081787, + "learning_rate": 2.1204414141933863e-05, + "loss": 0.6797, + "num_input_tokens_seen": 5175184, + "step": 15365 + }, + { + "epoch": 11.87789799072643, + "grad_norm": 1.4603962898254395, + "learning_rate": 2.118775062503524e-05, + "loss": 0.5126, + "num_input_tokens_seen": 5176656, + "step": 15370 + }, + { + "epoch": 11.88176197836167, + "grad_norm": 1.045409083366394, + "learning_rate": 2.1171088841970477e-05, + "loss": 0.4776, + "num_input_tokens_seen": 5178224, + "step": 15375 + }, + { + "epoch": 11.885625965996908, + "grad_norm": 0.9440944194793701, + "learning_rate": 2.115442880031743e-05, + "loss": 0.446, + "num_input_tokens_seen": 5179920, + "step": 15380 + }, + { + "epoch": 11.889489953632149, + "grad_norm": 0.8613791465759277, + "learning_rate": 2.1137770507653192e-05, + "loss": 0.4369, + "num_input_tokens_seen": 5181616, + "step": 15385 + }, + { + "epoch": 11.893353941267389, + "grad_norm": 0.8521418571472168, + "learning_rate": 2.1121113971554057e-05, + "loss": 0.3729, + "num_input_tokens_seen": 5183504, + "step": 15390 + }, + { + "epoch": 11.897217928902627, + "grad_norm": 0.860295832157135, + "learning_rate": 2.1104459199595527e-05, + "loss": 0.38, + "num_input_tokens_seen": 5185200, + "step": 15395 + }, + { + "epoch": 11.901081916537867, + "grad_norm": 1.236000657081604, + "learning_rate": 2.1087806199352282e-05, + "loss": 0.5625, + "num_input_tokens_seen": 5187248, + "step": 15400 + }, + { + "epoch": 11.904945904173106, + "grad_norm": 0.8332390189170837, + "learning_rate": 2.10711549783982e-05, + "loss": 0.3459, + "num_input_tokens_seen": 5188848, + "step": 15405 + }, + { + "epoch": 11.908809891808346, + "grad_norm": 0.9973399043083191, + "learning_rate": 2.1054505544306367e-05, + "loss": 0.5185, + "num_input_tokens_seen": 5190608, + "step": 15410 + }, + { + "epoch": 11.912673879443586, + "grad_norm": 1.0821834802627563, + "learning_rate": 2.1037857904649043e-05, + "loss": 0.4907, + "num_input_tokens_seen": 5192400, + "step": 15415 + }, + { + "epoch": 11.916537867078825, + "grad_norm": 0.9177914261817932, + "learning_rate": 2.1021212066997664e-05, + "loss": 0.3642, + "num_input_tokens_seen": 5194224, + "step": 15420 + }, + { + "epoch": 11.920401854714065, + "grad_norm": 0.6770352125167847, + "learning_rate": 2.1004568038922863e-05, + "loss": 0.7743, + "num_input_tokens_seen": 5196048, + "step": 15425 + }, + { + "epoch": 11.924265842349305, + "grad_norm": 1.580132246017456, + "learning_rate": 2.0987925827994457e-05, + "loss": 0.6264, + "num_input_tokens_seen": 5198032, + "step": 15430 + }, + { + "epoch": 11.928129829984544, + "grad_norm": 1.4740744829177856, + "learning_rate": 2.0971285441781407e-05, + "loss": 0.5469, + "num_input_tokens_seen": 5199696, + "step": 15435 + }, + { + "epoch": 11.931993817619784, + "grad_norm": 1.5552549362182617, + "learning_rate": 2.0954646887851865e-05, + "loss": 0.4989, + "num_input_tokens_seen": 5201360, + "step": 15440 + }, + { + "epoch": 11.935857805255024, + "grad_norm": 0.7195632457733154, + "learning_rate": 2.093801017377315e-05, + "loss": 0.4239, + "num_input_tokens_seen": 5203024, + "step": 15445 + }, + { + "epoch": 11.939721792890262, + "grad_norm": 1.2153557538986206, + "learning_rate": 2.092137530711173e-05, + "loss": 0.4536, + "num_input_tokens_seen": 5204592, + "step": 15450 + }, + { + "epoch": 11.943585780525503, + "grad_norm": 0.790496289730072, + "learning_rate": 2.0904742295433245e-05, + "loss": 0.3229, + "num_input_tokens_seen": 5206192, + "step": 15455 + }, + { + "epoch": 11.947449768160741, + "grad_norm": 1.0509967803955078, + "learning_rate": 2.0888111146302493e-05, + "loss": 0.3508, + "num_input_tokens_seen": 5207728, + "step": 15460 + }, + { + "epoch": 11.951313755795981, + "grad_norm": 1.1808723211288452, + "learning_rate": 2.0871481867283404e-05, + "loss": 0.5277, + "num_input_tokens_seen": 5209328, + "step": 15465 + }, + { + "epoch": 11.955177743431221, + "grad_norm": 1.3248443603515625, + "learning_rate": 2.08548544659391e-05, + "loss": 0.4794, + "num_input_tokens_seen": 5210832, + "step": 15470 + }, + { + "epoch": 11.95904173106646, + "grad_norm": 0.6403617262840271, + "learning_rate": 2.0838228949831803e-05, + "loss": 0.6099, + "num_input_tokens_seen": 5212688, + "step": 15475 + }, + { + "epoch": 11.9629057187017, + "grad_norm": 0.9655976295471191, + "learning_rate": 2.0821605326522908e-05, + "loss": 0.356, + "num_input_tokens_seen": 5214480, + "step": 15480 + }, + { + "epoch": 11.96676970633694, + "grad_norm": 1.199709415435791, + "learning_rate": 2.0804983603572935e-05, + "loss": 0.3414, + "num_input_tokens_seen": 5216144, + "step": 15485 + }, + { + "epoch": 11.970633693972179, + "grad_norm": 1.207990288734436, + "learning_rate": 2.078836378854154e-05, + "loss": 0.4228, + "num_input_tokens_seen": 5217872, + "step": 15490 + }, + { + "epoch": 11.974497681607419, + "grad_norm": 1.4751747846603394, + "learning_rate": 2.0771745888987515e-05, + "loss": 0.5163, + "num_input_tokens_seen": 5219600, + "step": 15495 + }, + { + "epoch": 11.978361669242659, + "grad_norm": 1.2280291318893433, + "learning_rate": 2.0755129912468787e-05, + "loss": 0.4407, + "num_input_tokens_seen": 5221264, + "step": 15500 + }, + { + "epoch": 11.982225656877898, + "grad_norm": 1.0038012266159058, + "learning_rate": 2.0738515866542385e-05, + "loss": 0.4818, + "num_input_tokens_seen": 5222800, + "step": 15505 + }, + { + "epoch": 11.986089644513138, + "grad_norm": 0.9620409607887268, + "learning_rate": 2.072190375876449e-05, + "loss": 0.484, + "num_input_tokens_seen": 5224496, + "step": 15510 + }, + { + "epoch": 11.989953632148378, + "grad_norm": 1.0312037467956543, + "learning_rate": 2.0705293596690395e-05, + "loss": 0.3859, + "num_input_tokens_seen": 5226064, + "step": 15515 + }, + { + "epoch": 11.993817619783616, + "grad_norm": 0.9265300035476685, + "learning_rate": 2.068868538787449e-05, + "loss": 0.4787, + "num_input_tokens_seen": 5228016, + "step": 15520 + }, + { + "epoch": 11.997681607418857, + "grad_norm": 0.7725517749786377, + "learning_rate": 2.0672079139870287e-05, + "loss": 0.3599, + "num_input_tokens_seen": 5229776, + "step": 15525 + }, + { + "epoch": 12.0, + "eval_loss": 0.44986608624458313, + "eval_runtime": 6.2299, + "eval_samples_per_second": 92.297, + "eval_steps_per_second": 23.114, + "num_input_tokens_seen": 5230528, + "step": 15528 + }, + { + "epoch": 12.001545595054095, + "grad_norm": 1.546624779701233, + "learning_rate": 2.0655474860230413e-05, + "loss": 0.4648, + "num_input_tokens_seen": 5231168, + "step": 15530 + }, + { + "epoch": 12.005409582689335, + "grad_norm": 0.8333700299263, + "learning_rate": 2.0638872556506592e-05, + "loss": 0.3474, + "num_input_tokens_seen": 5232736, + "step": 15535 + }, + { + "epoch": 12.009273570324575, + "grad_norm": 1.2814242839813232, + "learning_rate": 2.0622272236249646e-05, + "loss": 0.4664, + "num_input_tokens_seen": 5234496, + "step": 15540 + }, + { + "epoch": 12.013137557959814, + "grad_norm": 1.025385856628418, + "learning_rate": 2.0605673907009495e-05, + "loss": 0.4735, + "num_input_tokens_seen": 5236192, + "step": 15545 + }, + { + "epoch": 12.017001545595054, + "grad_norm": 0.7969051599502563, + "learning_rate": 2.058907757633518e-05, + "loss": 0.3532, + "num_input_tokens_seen": 5237760, + "step": 15550 + }, + { + "epoch": 12.020865533230294, + "grad_norm": 1.0974007844924927, + "learning_rate": 2.05724832517748e-05, + "loss": 0.5838, + "num_input_tokens_seen": 5239488, + "step": 15555 + }, + { + "epoch": 12.024729520865533, + "grad_norm": 1.0895462036132812, + "learning_rate": 2.0555890940875548e-05, + "loss": 0.377, + "num_input_tokens_seen": 5241024, + "step": 15560 + }, + { + "epoch": 12.028593508500773, + "grad_norm": 0.8964334726333618, + "learning_rate": 2.0539300651183715e-05, + "loss": 0.5786, + "num_input_tokens_seen": 5242656, + "step": 15565 + }, + { + "epoch": 12.032457496136013, + "grad_norm": 1.0309745073318481, + "learning_rate": 2.0522712390244662e-05, + "loss": 0.4031, + "num_input_tokens_seen": 5244288, + "step": 15570 + }, + { + "epoch": 12.036321483771252, + "grad_norm": 0.631672203540802, + "learning_rate": 2.0506126165602816e-05, + "loss": 0.4314, + "num_input_tokens_seen": 5246048, + "step": 15575 + }, + { + "epoch": 12.040185471406492, + "grad_norm": 0.7306867837905884, + "learning_rate": 2.0489541984801717e-05, + "loss": 0.384, + "num_input_tokens_seen": 5247744, + "step": 15580 + }, + { + "epoch": 12.04404945904173, + "grad_norm": 0.8839312791824341, + "learning_rate": 2.0472959855383916e-05, + "loss": 0.6295, + "num_input_tokens_seen": 5249280, + "step": 15585 + }, + { + "epoch": 12.04791344667697, + "grad_norm": 0.7839388251304626, + "learning_rate": 2.04563797848911e-05, + "loss": 0.4265, + "num_input_tokens_seen": 5251200, + "step": 15590 + }, + { + "epoch": 12.05177743431221, + "grad_norm": 1.3206349611282349, + "learning_rate": 2.0439801780863963e-05, + "loss": 0.5197, + "num_input_tokens_seen": 5252768, + "step": 15595 + }, + { + "epoch": 12.055641421947449, + "grad_norm": 1.319883942604065, + "learning_rate": 2.042322585084229e-05, + "loss": 0.5512, + "num_input_tokens_seen": 5254336, + "step": 15600 + }, + { + "epoch": 12.05950540958269, + "grad_norm": 0.9970423579216003, + "learning_rate": 2.040665200236491e-05, + "loss": 0.3497, + "num_input_tokens_seen": 5256032, + "step": 15605 + }, + { + "epoch": 12.06336939721793, + "grad_norm": 1.115315318107605, + "learning_rate": 2.0390080242969702e-05, + "loss": 0.6284, + "num_input_tokens_seen": 5258048, + "step": 15610 + }, + { + "epoch": 12.067233384853168, + "grad_norm": 0.855114758014679, + "learning_rate": 2.037351058019361e-05, + "loss": 0.347, + "num_input_tokens_seen": 5259616, + "step": 15615 + }, + { + "epoch": 12.071097372488408, + "grad_norm": 0.7869085669517517, + "learning_rate": 2.0356943021572617e-05, + "loss": 0.5284, + "num_input_tokens_seen": 5261216, + "step": 15620 + }, + { + "epoch": 12.074961360123648, + "grad_norm": 0.6645036339759827, + "learning_rate": 2.0340377574641734e-05, + "loss": 0.4255, + "num_input_tokens_seen": 5262816, + "step": 15625 + }, + { + "epoch": 12.078825347758887, + "grad_norm": 1.103737711906433, + "learning_rate": 2.0323814246935036e-05, + "loss": 0.4254, + "num_input_tokens_seen": 5264576, + "step": 15630 + }, + { + "epoch": 12.082689335394127, + "grad_norm": 1.544823169708252, + "learning_rate": 2.030725304598563e-05, + "loss": 0.7157, + "num_input_tokens_seen": 5266112, + "step": 15635 + }, + { + "epoch": 12.086553323029367, + "grad_norm": 0.7050670385360718, + "learning_rate": 2.0290693979325646e-05, + "loss": 0.3565, + "num_input_tokens_seen": 5267744, + "step": 15640 + }, + { + "epoch": 12.090417310664606, + "grad_norm": 1.47642982006073, + "learning_rate": 2.0274137054486232e-05, + "loss": 0.5667, + "num_input_tokens_seen": 5269344, + "step": 15645 + }, + { + "epoch": 12.094281298299846, + "grad_norm": 1.1516900062561035, + "learning_rate": 2.02575822789976e-05, + "loss": 0.5193, + "num_input_tokens_seen": 5271072, + "step": 15650 + }, + { + "epoch": 12.098145285935084, + "grad_norm": 1.0378600358963013, + "learning_rate": 2.0241029660388943e-05, + "loss": 0.4219, + "num_input_tokens_seen": 5272800, + "step": 15655 + }, + { + "epoch": 12.102009273570324, + "grad_norm": 1.797910213470459, + "learning_rate": 2.0224479206188496e-05, + "loss": 0.4661, + "num_input_tokens_seen": 5274528, + "step": 15660 + }, + { + "epoch": 12.105873261205565, + "grad_norm": 0.8062668442726135, + "learning_rate": 2.0207930923923497e-05, + "loss": 0.4338, + "num_input_tokens_seen": 5276256, + "step": 15665 + }, + { + "epoch": 12.109737248840803, + "grad_norm": 0.985824465751648, + "learning_rate": 2.0191384821120225e-05, + "loss": 0.4522, + "num_input_tokens_seen": 5277728, + "step": 15670 + }, + { + "epoch": 12.113601236476043, + "grad_norm": 1.082461953163147, + "learning_rate": 2.0174840905303933e-05, + "loss": 0.4115, + "num_input_tokens_seen": 5279296, + "step": 15675 + }, + { + "epoch": 12.117465224111283, + "grad_norm": 0.9728394746780396, + "learning_rate": 2.0158299183998887e-05, + "loss": 0.4919, + "num_input_tokens_seen": 5280928, + "step": 15680 + }, + { + "epoch": 12.121329211746522, + "grad_norm": 0.8698716163635254, + "learning_rate": 2.0141759664728376e-05, + "loss": 0.5556, + "num_input_tokens_seen": 5282624, + "step": 15685 + }, + { + "epoch": 12.125193199381762, + "grad_norm": 1.229811668395996, + "learning_rate": 2.012522235501466e-05, + "loss": 0.5382, + "num_input_tokens_seen": 5284320, + "step": 15690 + }, + { + "epoch": 12.129057187017002, + "grad_norm": 1.2634782791137695, + "learning_rate": 2.010868726237901e-05, + "loss": 0.5022, + "num_input_tokens_seen": 5286176, + "step": 15695 + }, + { + "epoch": 12.13292117465224, + "grad_norm": 0.9290205836296082, + "learning_rate": 2.009215439434169e-05, + "loss": 0.369, + "num_input_tokens_seen": 5287840, + "step": 15700 + }, + { + "epoch": 12.136785162287481, + "grad_norm": 1.440690040588379, + "learning_rate": 2.007562375842193e-05, + "loss": 0.5589, + "num_input_tokens_seen": 5289504, + "step": 15705 + }, + { + "epoch": 12.14064914992272, + "grad_norm": 0.8424994945526123, + "learning_rate": 2.005909536213799e-05, + "loss": 0.3663, + "num_input_tokens_seen": 5291168, + "step": 15710 + }, + { + "epoch": 12.14451313755796, + "grad_norm": 1.337843894958496, + "learning_rate": 2.0042569213007064e-05, + "loss": 0.5677, + "num_input_tokens_seen": 5292864, + "step": 15715 + }, + { + "epoch": 12.1483771251932, + "grad_norm": 1.0286564826965332, + "learning_rate": 2.002604531854535e-05, + "loss": 0.393, + "num_input_tokens_seen": 5294464, + "step": 15720 + }, + { + "epoch": 12.152241112828438, + "grad_norm": 1.1068159341812134, + "learning_rate": 2.000952368626802e-05, + "loss": 0.4025, + "num_input_tokens_seen": 5296128, + "step": 15725 + }, + { + "epoch": 12.156105100463678, + "grad_norm": 1.412851095199585, + "learning_rate": 1.9993004323689193e-05, + "loss": 0.6298, + "num_input_tokens_seen": 5297952, + "step": 15730 + }, + { + "epoch": 12.159969088098919, + "grad_norm": 0.6234666109085083, + "learning_rate": 1.997648723832199e-05, + "loss": 0.4167, + "num_input_tokens_seen": 5299584, + "step": 15735 + }, + { + "epoch": 12.163833075734157, + "grad_norm": 1.3260730504989624, + "learning_rate": 1.995997243767848e-05, + "loss": 0.6361, + "num_input_tokens_seen": 5301056, + "step": 15740 + }, + { + "epoch": 12.167697063369397, + "grad_norm": 1.5018393993377686, + "learning_rate": 1.994345992926968e-05, + "loss": 0.469, + "num_input_tokens_seen": 5302624, + "step": 15745 + }, + { + "epoch": 12.171561051004637, + "grad_norm": 1.6814357042312622, + "learning_rate": 1.9926949720605587e-05, + "loss": 0.4066, + "num_input_tokens_seen": 5304544, + "step": 15750 + }, + { + "epoch": 12.175425038639876, + "grad_norm": 0.5688886046409607, + "learning_rate": 1.9910441819195146e-05, + "loss": 0.3983, + "num_input_tokens_seen": 5306368, + "step": 15755 + }, + { + "epoch": 12.179289026275116, + "grad_norm": 1.688575029373169, + "learning_rate": 1.989393623254625e-05, + "loss": 0.4724, + "num_input_tokens_seen": 5308224, + "step": 15760 + }, + { + "epoch": 12.183153013910356, + "grad_norm": 1.1937401294708252, + "learning_rate": 1.9877432968165728e-05, + "loss": 0.3936, + "num_input_tokens_seen": 5309760, + "step": 15765 + }, + { + "epoch": 12.187017001545595, + "grad_norm": 1.0540322065353394, + "learning_rate": 1.9860932033559377e-05, + "loss": 0.3335, + "num_input_tokens_seen": 5311488, + "step": 15770 + }, + { + "epoch": 12.190880989180835, + "grad_norm": 1.011772871017456, + "learning_rate": 1.984443343623191e-05, + "loss": 0.4111, + "num_input_tokens_seen": 5313312, + "step": 15775 + }, + { + "epoch": 12.194744976816073, + "grad_norm": 2.4582929611206055, + "learning_rate": 1.982793718368699e-05, + "loss": 0.4958, + "num_input_tokens_seen": 5315008, + "step": 15780 + }, + { + "epoch": 12.198608964451314, + "grad_norm": 0.66780686378479, + "learning_rate": 1.9811443283427205e-05, + "loss": 0.3656, + "num_input_tokens_seen": 5316800, + "step": 15785 + }, + { + "epoch": 12.202472952086554, + "grad_norm": 0.9346194267272949, + "learning_rate": 1.9794951742954098e-05, + "loss": 0.3766, + "num_input_tokens_seen": 5318464, + "step": 15790 + }, + { + "epoch": 12.206336939721792, + "grad_norm": 1.0115538835525513, + "learning_rate": 1.9778462569768113e-05, + "loss": 0.4393, + "num_input_tokens_seen": 5320128, + "step": 15795 + }, + { + "epoch": 12.210200927357032, + "grad_norm": 0.6496846079826355, + "learning_rate": 1.9761975771368615e-05, + "loss": 0.3835, + "num_input_tokens_seen": 5321728, + "step": 15800 + }, + { + "epoch": 12.214064914992273, + "grad_norm": 1.201934576034546, + "learning_rate": 1.974549135525391e-05, + "loss": 0.4304, + "num_input_tokens_seen": 5323648, + "step": 15805 + }, + { + "epoch": 12.217928902627511, + "grad_norm": 0.6783020496368408, + "learning_rate": 1.9729009328921205e-05, + "loss": 0.8058, + "num_input_tokens_seen": 5325568, + "step": 15810 + }, + { + "epoch": 12.221792890262751, + "grad_norm": 0.6668700575828552, + "learning_rate": 1.971252969986662e-05, + "loss": 0.3804, + "num_input_tokens_seen": 5327232, + "step": 15815 + }, + { + "epoch": 12.225656877897991, + "grad_norm": 0.9131088256835938, + "learning_rate": 1.9696052475585196e-05, + "loss": 0.5116, + "num_input_tokens_seen": 5328960, + "step": 15820 + }, + { + "epoch": 12.22952086553323, + "grad_norm": 1.0061237812042236, + "learning_rate": 1.9679577663570863e-05, + "loss": 0.3858, + "num_input_tokens_seen": 5330656, + "step": 15825 + }, + { + "epoch": 12.23338485316847, + "grad_norm": 1.23479163646698, + "learning_rate": 1.966310527131648e-05, + "loss": 0.3807, + "num_input_tokens_seen": 5332064, + "step": 15830 + }, + { + "epoch": 12.237248840803709, + "grad_norm": 1.3499755859375, + "learning_rate": 1.9646635306313777e-05, + "loss": 0.5916, + "num_input_tokens_seen": 5333760, + "step": 15835 + }, + { + "epoch": 12.241112828438949, + "grad_norm": 0.9142582416534424, + "learning_rate": 1.96301677760534e-05, + "loss": 0.4414, + "num_input_tokens_seen": 5335648, + "step": 15840 + }, + { + "epoch": 12.244976816074189, + "grad_norm": 0.8659746646881104, + "learning_rate": 1.9613702688024877e-05, + "loss": 0.4086, + "num_input_tokens_seen": 5337376, + "step": 15845 + }, + { + "epoch": 12.248840803709427, + "grad_norm": 0.8395435810089111, + "learning_rate": 1.9597240049716625e-05, + "loss": 0.4024, + "num_input_tokens_seen": 5339008, + "step": 15850 + }, + { + "epoch": 12.252704791344668, + "grad_norm": 1.056457281112671, + "learning_rate": 1.958077986861596e-05, + "loss": 0.4089, + "num_input_tokens_seen": 5340608, + "step": 15855 + }, + { + "epoch": 12.256568778979908, + "grad_norm": 0.9207427501678467, + "learning_rate": 1.9564322152209065e-05, + "loss": 0.4676, + "num_input_tokens_seen": 5342272, + "step": 15860 + }, + { + "epoch": 12.260432766615146, + "grad_norm": 1.1268154382705688, + "learning_rate": 1.9547866907980993e-05, + "loss": 0.4607, + "num_input_tokens_seen": 5343776, + "step": 15865 + }, + { + "epoch": 12.264296754250386, + "grad_norm": 1.1624088287353516, + "learning_rate": 1.9531414143415715e-05, + "loss": 0.452, + "num_input_tokens_seen": 5345440, + "step": 15870 + }, + { + "epoch": 12.268160741885627, + "grad_norm": 1.0132900476455688, + "learning_rate": 1.9514963865996034e-05, + "loss": 0.4003, + "num_input_tokens_seen": 5347200, + "step": 15875 + }, + { + "epoch": 12.272024729520865, + "grad_norm": 0.6232582330703735, + "learning_rate": 1.949851608320364e-05, + "loss": 0.3665, + "num_input_tokens_seen": 5348768, + "step": 15880 + }, + { + "epoch": 12.275888717156105, + "grad_norm": 1.2709803581237793, + "learning_rate": 1.948207080251907e-05, + "loss": 0.4421, + "num_input_tokens_seen": 5350464, + "step": 15885 + }, + { + "epoch": 12.279752704791346, + "grad_norm": 1.1489944458007812, + "learning_rate": 1.946562803142175e-05, + "loss": 0.4398, + "num_input_tokens_seen": 5352288, + "step": 15890 + }, + { + "epoch": 12.283616692426584, + "grad_norm": 0.7805213332176208, + "learning_rate": 1.944918777738995e-05, + "loss": 0.4389, + "num_input_tokens_seen": 5353920, + "step": 15895 + }, + { + "epoch": 12.287480680061824, + "grad_norm": 0.9197027087211609, + "learning_rate": 1.943275004790078e-05, + "loss": 0.5575, + "num_input_tokens_seen": 5355776, + "step": 15900 + }, + { + "epoch": 12.291344667697063, + "grad_norm": 1.0457139015197754, + "learning_rate": 1.9416314850430224e-05, + "loss": 0.3958, + "num_input_tokens_seen": 5357344, + "step": 15905 + }, + { + "epoch": 12.295208655332303, + "grad_norm": 2.3024723529815674, + "learning_rate": 1.9399882192453127e-05, + "loss": 0.4382, + "num_input_tokens_seen": 5358816, + "step": 15910 + }, + { + "epoch": 12.299072642967543, + "grad_norm": 0.6338358521461487, + "learning_rate": 1.938345208144315e-05, + "loss": 0.3524, + "num_input_tokens_seen": 5360192, + "step": 15915 + }, + { + "epoch": 12.302936630602781, + "grad_norm": 1.7076793909072876, + "learning_rate": 1.936702452487279e-05, + "loss": 0.5453, + "num_input_tokens_seen": 5361952, + "step": 15920 + }, + { + "epoch": 12.306800618238022, + "grad_norm": 1.432176113128662, + "learning_rate": 1.935059953021342e-05, + "loss": 0.7181, + "num_input_tokens_seen": 5363456, + "step": 15925 + }, + { + "epoch": 12.310664605873262, + "grad_norm": 0.5847358107566833, + "learning_rate": 1.9334177104935218e-05, + "loss": 0.6867, + "num_input_tokens_seen": 5365152, + "step": 15930 + }, + { + "epoch": 12.3145285935085, + "grad_norm": 0.8060809373855591, + "learning_rate": 1.931775725650719e-05, + "loss": 0.3756, + "num_input_tokens_seen": 5366688, + "step": 15935 + }, + { + "epoch": 12.31839258114374, + "grad_norm": 0.9635569453239441, + "learning_rate": 1.93013399923972e-05, + "loss": 0.3453, + "num_input_tokens_seen": 5368256, + "step": 15940 + }, + { + "epoch": 12.32225656877898, + "grad_norm": 1.1232637166976929, + "learning_rate": 1.9284925320071898e-05, + "loss": 0.535, + "num_input_tokens_seen": 5369952, + "step": 15945 + }, + { + "epoch": 12.326120556414219, + "grad_norm": 0.908298134803772, + "learning_rate": 1.92685132469968e-05, + "loss": 0.6338, + "num_input_tokens_seen": 5371680, + "step": 15950 + }, + { + "epoch": 12.32998454404946, + "grad_norm": 0.7989517450332642, + "learning_rate": 1.9252103780636192e-05, + "loss": 0.3823, + "num_input_tokens_seen": 5373184, + "step": 15955 + }, + { + "epoch": 12.333848531684698, + "grad_norm": 1.0732579231262207, + "learning_rate": 1.9235696928453212e-05, + "loss": 0.583, + "num_input_tokens_seen": 5375168, + "step": 15960 + }, + { + "epoch": 12.337712519319938, + "grad_norm": 1.014569640159607, + "learning_rate": 1.9219292697909794e-05, + "loss": 0.5202, + "num_input_tokens_seen": 5376864, + "step": 15965 + }, + { + "epoch": 12.341576506955178, + "grad_norm": 0.8016404509544373, + "learning_rate": 1.920289109646667e-05, + "loss": 0.4201, + "num_input_tokens_seen": 5378464, + "step": 15970 + }, + { + "epoch": 12.345440494590417, + "grad_norm": 1.2728102207183838, + "learning_rate": 1.9186492131583395e-05, + "loss": 0.4254, + "num_input_tokens_seen": 5380256, + "step": 15975 + }, + { + "epoch": 12.349304482225657, + "grad_norm": 1.1223340034484863, + "learning_rate": 1.9170095810718318e-05, + "loss": 0.417, + "num_input_tokens_seen": 5381888, + "step": 15980 + }, + { + "epoch": 12.353168469860897, + "grad_norm": 0.765171468257904, + "learning_rate": 1.9153702141328567e-05, + "loss": 0.442, + "num_input_tokens_seen": 5383392, + "step": 15985 + }, + { + "epoch": 12.357032457496135, + "grad_norm": 0.83039790391922, + "learning_rate": 1.9137311130870104e-05, + "loss": 0.5989, + "num_input_tokens_seen": 5385152, + "step": 15990 + }, + { + "epoch": 12.360896445131376, + "grad_norm": 0.982333242893219, + "learning_rate": 1.9120922786797648e-05, + "loss": 0.5011, + "num_input_tokens_seen": 5386720, + "step": 15995 + }, + { + "epoch": 12.364760432766616, + "grad_norm": 0.8910461664199829, + "learning_rate": 1.910453711656472e-05, + "loss": 0.3728, + "num_input_tokens_seen": 5388192, + "step": 16000 + }, + { + "epoch": 12.368624420401854, + "grad_norm": 1.1029726266860962, + "learning_rate": 1.9088154127623615e-05, + "loss": 0.4673, + "num_input_tokens_seen": 5390176, + "step": 16005 + }, + { + "epoch": 12.372488408037094, + "grad_norm": 1.349474310874939, + "learning_rate": 1.907177382742542e-05, + "loss": 0.294, + "num_input_tokens_seen": 5391616, + "step": 16010 + }, + { + "epoch": 12.376352395672335, + "grad_norm": 0.7299522757530212, + "learning_rate": 1.905539622341999e-05, + "loss": 0.4193, + "num_input_tokens_seen": 5393280, + "step": 16015 + }, + { + "epoch": 12.380216383307573, + "grad_norm": 0.9890732765197754, + "learning_rate": 1.9039021323055956e-05, + "loss": 0.4218, + "num_input_tokens_seen": 5395232, + "step": 16020 + }, + { + "epoch": 12.384080370942813, + "grad_norm": 1.097030520439148, + "learning_rate": 1.902264913378072e-05, + "loss": 0.4898, + "num_input_tokens_seen": 5397152, + "step": 16025 + }, + { + "epoch": 12.387944358578052, + "grad_norm": 0.6751796007156372, + "learning_rate": 1.9006279663040458e-05, + "loss": 0.3525, + "num_input_tokens_seen": 5398816, + "step": 16030 + }, + { + "epoch": 12.391808346213292, + "grad_norm": 1.2094149589538574, + "learning_rate": 1.8989912918280102e-05, + "loss": 0.5881, + "num_input_tokens_seen": 5400448, + "step": 16035 + }, + { + "epoch": 12.395672333848532, + "grad_norm": 1.1465582847595215, + "learning_rate": 1.897354890694335e-05, + "loss": 0.5333, + "num_input_tokens_seen": 5402368, + "step": 16040 + }, + { + "epoch": 12.39953632148377, + "grad_norm": 0.9157137274742126, + "learning_rate": 1.8957187636472635e-05, + "loss": 0.3336, + "num_input_tokens_seen": 5403808, + "step": 16045 + }, + { + "epoch": 12.40340030911901, + "grad_norm": 1.866590976715088, + "learning_rate": 1.894082911430918e-05, + "loss": 0.5852, + "num_input_tokens_seen": 5405504, + "step": 16050 + }, + { + "epoch": 12.407264296754251, + "grad_norm": 0.5444645285606384, + "learning_rate": 1.8924473347892922e-05, + "loss": 0.4615, + "num_input_tokens_seen": 5406912, + "step": 16055 + }, + { + "epoch": 12.41112828438949, + "grad_norm": 0.8817809224128723, + "learning_rate": 1.890812034466258e-05, + "loss": 0.4044, + "num_input_tokens_seen": 5408320, + "step": 16060 + }, + { + "epoch": 12.41499227202473, + "grad_norm": 2.026017665863037, + "learning_rate": 1.8891770112055576e-05, + "loss": 0.5181, + "num_input_tokens_seen": 5410240, + "step": 16065 + }, + { + "epoch": 12.41885625965997, + "grad_norm": 1.020612120628357, + "learning_rate": 1.8875422657508115e-05, + "loss": 0.5271, + "num_input_tokens_seen": 5412192, + "step": 16070 + }, + { + "epoch": 12.422720247295208, + "grad_norm": 0.901839017868042, + "learning_rate": 1.885907798845511e-05, + "loss": 0.4509, + "num_input_tokens_seen": 5414240, + "step": 16075 + }, + { + "epoch": 12.426584234930449, + "grad_norm": 1.0236060619354248, + "learning_rate": 1.8842736112330206e-05, + "loss": 0.3907, + "num_input_tokens_seen": 5416064, + "step": 16080 + }, + { + "epoch": 12.430448222565687, + "grad_norm": 0.6461854577064514, + "learning_rate": 1.8826397036565797e-05, + "loss": 0.3679, + "num_input_tokens_seen": 5417632, + "step": 16085 + }, + { + "epoch": 12.434312210200927, + "grad_norm": 0.8970606923103333, + "learning_rate": 1.8810060768592992e-05, + "loss": 0.6006, + "num_input_tokens_seen": 5419552, + "step": 16090 + }, + { + "epoch": 12.438176197836167, + "grad_norm": 1.230958104133606, + "learning_rate": 1.8793727315841608e-05, + "loss": 0.4462, + "num_input_tokens_seen": 5420992, + "step": 16095 + }, + { + "epoch": 12.442040185471406, + "grad_norm": 1.4211452007293701, + "learning_rate": 1.877739668574022e-05, + "loss": 0.6411, + "num_input_tokens_seen": 5422816, + "step": 16100 + }, + { + "epoch": 12.445904173106646, + "grad_norm": 1.3749794960021973, + "learning_rate": 1.876106888571607e-05, + "loss": 0.5682, + "num_input_tokens_seen": 5424384, + "step": 16105 + }, + { + "epoch": 12.449768160741886, + "grad_norm": 1.3641725778579712, + "learning_rate": 1.8744743923195166e-05, + "loss": 0.4569, + "num_input_tokens_seen": 5425984, + "step": 16110 + }, + { + "epoch": 12.453632148377125, + "grad_norm": 1.4511828422546387, + "learning_rate": 1.872842180560218e-05, + "loss": 0.4439, + "num_input_tokens_seen": 5427360, + "step": 16115 + }, + { + "epoch": 12.457496136012365, + "grad_norm": 1.2506440877914429, + "learning_rate": 1.8712102540360527e-05, + "loss": 0.5406, + "num_input_tokens_seen": 5428960, + "step": 16120 + }, + { + "epoch": 12.461360123647605, + "grad_norm": 0.7061681151390076, + "learning_rate": 1.869578613489229e-05, + "loss": 0.3597, + "num_input_tokens_seen": 5430528, + "step": 16125 + }, + { + "epoch": 12.465224111282843, + "grad_norm": 1.2814159393310547, + "learning_rate": 1.8679472596618268e-05, + "loss": 0.5971, + "num_input_tokens_seen": 5432384, + "step": 16130 + }, + { + "epoch": 12.469088098918084, + "grad_norm": 0.789394199848175, + "learning_rate": 1.8663161932957966e-05, + "loss": 0.5771, + "num_input_tokens_seen": 5434208, + "step": 16135 + }, + { + "epoch": 12.472952086553324, + "grad_norm": 1.3778287172317505, + "learning_rate": 1.8646854151329575e-05, + "loss": 0.4222, + "num_input_tokens_seen": 5435680, + "step": 16140 + }, + { + "epoch": 12.476816074188562, + "grad_norm": 0.7693764567375183, + "learning_rate": 1.863054925914995e-05, + "loss": 0.3709, + "num_input_tokens_seen": 5437408, + "step": 16145 + }, + { + "epoch": 12.480680061823803, + "grad_norm": 1.1847442388534546, + "learning_rate": 1.861424726383466e-05, + "loss": 0.4427, + "num_input_tokens_seen": 5439168, + "step": 16150 + }, + { + "epoch": 12.484544049459041, + "grad_norm": 0.7372468709945679, + "learning_rate": 1.8597948172797975e-05, + "loss": 0.3998, + "num_input_tokens_seen": 5440928, + "step": 16155 + }, + { + "epoch": 12.488408037094281, + "grad_norm": 1.1516306400299072, + "learning_rate": 1.85816519934528e-05, + "loss": 0.385, + "num_input_tokens_seen": 5442560, + "step": 16160 + }, + { + "epoch": 12.492272024729521, + "grad_norm": 1.0215682983398438, + "learning_rate": 1.8565358733210725e-05, + "loss": 0.3999, + "num_input_tokens_seen": 5444096, + "step": 16165 + }, + { + "epoch": 12.49613601236476, + "grad_norm": 0.9735298156738281, + "learning_rate": 1.8549068399482043e-05, + "loss": 0.418, + "num_input_tokens_seen": 5445504, + "step": 16170 + }, + { + "epoch": 12.5, + "grad_norm": 0.7597795128822327, + "learning_rate": 1.8532780999675686e-05, + "loss": 0.5612, + "num_input_tokens_seen": 5447168, + "step": 16175 + }, + { + "epoch": 12.50386398763524, + "grad_norm": 0.7377928495407104, + "learning_rate": 1.8516496541199257e-05, + "loss": 0.3642, + "num_input_tokens_seen": 5448832, + "step": 16180 + }, + { + "epoch": 12.507727975270479, + "grad_norm": 1.39464271068573, + "learning_rate": 1.8500215031459035e-05, + "loss": 0.554, + "num_input_tokens_seen": 5450528, + "step": 16185 + }, + { + "epoch": 12.511591962905719, + "grad_norm": 1.29013192653656, + "learning_rate": 1.8483936477859932e-05, + "loss": 0.3807, + "num_input_tokens_seen": 5452224, + "step": 16190 + }, + { + "epoch": 12.515455950540959, + "grad_norm": 0.9500560760498047, + "learning_rate": 1.846766088780555e-05, + "loss": 0.3572, + "num_input_tokens_seen": 5453920, + "step": 16195 + }, + { + "epoch": 12.519319938176197, + "grad_norm": 1.1173900365829468, + "learning_rate": 1.845138826869811e-05, + "loss": 0.4128, + "num_input_tokens_seen": 5455648, + "step": 16200 + }, + { + "epoch": 12.523183925811438, + "grad_norm": 0.8946996927261353, + "learning_rate": 1.8435118627938512e-05, + "loss": 0.388, + "num_input_tokens_seen": 5457568, + "step": 16205 + }, + { + "epoch": 12.527047913446676, + "grad_norm": 0.9647992849349976, + "learning_rate": 1.8418851972926275e-05, + "loss": 0.6033, + "num_input_tokens_seen": 5459264, + "step": 16210 + }, + { + "epoch": 12.530911901081916, + "grad_norm": 0.9623526334762573, + "learning_rate": 1.840258831105957e-05, + "loss": 0.369, + "num_input_tokens_seen": 5460832, + "step": 16215 + }, + { + "epoch": 12.534775888717157, + "grad_norm": 0.9751954674720764, + "learning_rate": 1.8386327649735217e-05, + "loss": 0.3952, + "num_input_tokens_seen": 5462432, + "step": 16220 + }, + { + "epoch": 12.538639876352395, + "grad_norm": 1.5048834085464478, + "learning_rate": 1.8370069996348658e-05, + "loss": 0.4658, + "num_input_tokens_seen": 5464128, + "step": 16225 + }, + { + "epoch": 12.542503863987635, + "grad_norm": 0.8859071135520935, + "learning_rate": 1.835381535829396e-05, + "loss": 0.3925, + "num_input_tokens_seen": 5465632, + "step": 16230 + }, + { + "epoch": 12.546367851622875, + "grad_norm": 1.0002102851867676, + "learning_rate": 1.833756374296384e-05, + "loss": 0.5787, + "num_input_tokens_seen": 5467520, + "step": 16235 + }, + { + "epoch": 12.550231839258114, + "grad_norm": 1.0413841009140015, + "learning_rate": 1.8321315157749635e-05, + "loss": 0.4151, + "num_input_tokens_seen": 5469184, + "step": 16240 + }, + { + "epoch": 12.554095826893354, + "grad_norm": 0.8793993592262268, + "learning_rate": 1.8305069610041298e-05, + "loss": 0.4218, + "num_input_tokens_seen": 5470752, + "step": 16245 + }, + { + "epoch": 12.557959814528594, + "grad_norm": 0.923982560634613, + "learning_rate": 1.828882710722739e-05, + "loss": 0.3879, + "num_input_tokens_seen": 5472608, + "step": 16250 + }, + { + "epoch": 12.561823802163833, + "grad_norm": 1.1206953525543213, + "learning_rate": 1.8272587656695106e-05, + "loss": 0.55, + "num_input_tokens_seen": 5474144, + "step": 16255 + }, + { + "epoch": 12.565687789799073, + "grad_norm": 0.8061923384666443, + "learning_rate": 1.8256351265830248e-05, + "loss": 0.4824, + "num_input_tokens_seen": 5476160, + "step": 16260 + }, + { + "epoch": 12.569551777434313, + "grad_norm": 1.5069202184677124, + "learning_rate": 1.8240117942017214e-05, + "loss": 0.3959, + "num_input_tokens_seen": 5477728, + "step": 16265 + }, + { + "epoch": 12.573415765069551, + "grad_norm": 0.5896226763725281, + "learning_rate": 1.8223887692639022e-05, + "loss": 0.3588, + "num_input_tokens_seen": 5479424, + "step": 16270 + }, + { + "epoch": 12.577279752704792, + "grad_norm": 0.8735615015029907, + "learning_rate": 1.82076605250773e-05, + "loss": 0.4317, + "num_input_tokens_seen": 5481184, + "step": 16275 + }, + { + "epoch": 12.58114374034003, + "grad_norm": 0.740938127040863, + "learning_rate": 1.819143644671224e-05, + "loss": 0.4027, + "num_input_tokens_seen": 5482688, + "step": 16280 + }, + { + "epoch": 12.58500772797527, + "grad_norm": 0.7219180464744568, + "learning_rate": 1.8175215464922655e-05, + "loss": 0.3605, + "num_input_tokens_seen": 5484672, + "step": 16285 + }, + { + "epoch": 12.58887171561051, + "grad_norm": 2.346268653869629, + "learning_rate": 1.815899758708596e-05, + "loss": 0.7683, + "num_input_tokens_seen": 5486592, + "step": 16290 + }, + { + "epoch": 12.592735703245749, + "grad_norm": 1.2244535684585571, + "learning_rate": 1.814278282057813e-05, + "loss": 0.4719, + "num_input_tokens_seen": 5488448, + "step": 16295 + }, + { + "epoch": 12.59659969088099, + "grad_norm": 1.8902347087860107, + "learning_rate": 1.8126571172773733e-05, + "loss": 0.419, + "num_input_tokens_seen": 5489920, + "step": 16300 + }, + { + "epoch": 12.60046367851623, + "grad_norm": 0.8561967611312866, + "learning_rate": 1.8110362651045933e-05, + "loss": 0.405, + "num_input_tokens_seen": 5491360, + "step": 16305 + }, + { + "epoch": 12.604327666151468, + "grad_norm": 1.745963454246521, + "learning_rate": 1.8094157262766452e-05, + "loss": 0.535, + "num_input_tokens_seen": 5493184, + "step": 16310 + }, + { + "epoch": 12.608191653786708, + "grad_norm": 1.8781979084014893, + "learning_rate": 1.8077955015305613e-05, + "loss": 0.4956, + "num_input_tokens_seen": 5495008, + "step": 16315 + }, + { + "epoch": 12.612055641421948, + "grad_norm": 0.6960299611091614, + "learning_rate": 1.8061755916032286e-05, + "loss": 0.4024, + "num_input_tokens_seen": 5497216, + "step": 16320 + }, + { + "epoch": 12.615919629057187, + "grad_norm": 1.3623735904693604, + "learning_rate": 1.8045559972313925e-05, + "loss": 0.3949, + "num_input_tokens_seen": 5498880, + "step": 16325 + }, + { + "epoch": 12.619783616692427, + "grad_norm": 0.9318702816963196, + "learning_rate": 1.8029367191516535e-05, + "loss": 0.3975, + "num_input_tokens_seen": 5500480, + "step": 16330 + }, + { + "epoch": 12.623647604327665, + "grad_norm": 0.6454766988754272, + "learning_rate": 1.8013177581004685e-05, + "loss": 0.4104, + "num_input_tokens_seen": 5502176, + "step": 16335 + }, + { + "epoch": 12.627511591962906, + "grad_norm": 0.8985052704811096, + "learning_rate": 1.7996991148141522e-05, + "loss": 0.4233, + "num_input_tokens_seen": 5503680, + "step": 16340 + }, + { + "epoch": 12.631375579598146, + "grad_norm": 1.0905131101608276, + "learning_rate": 1.7980807900288726e-05, + "loss": 0.6521, + "num_input_tokens_seen": 5505280, + "step": 16345 + }, + { + "epoch": 12.635239567233384, + "grad_norm": 1.252641201019287, + "learning_rate": 1.796462784480652e-05, + "loss": 0.4713, + "num_input_tokens_seen": 5507136, + "step": 16350 + }, + { + "epoch": 12.639103554868624, + "grad_norm": 1.0790085792541504, + "learning_rate": 1.7948450989053707e-05, + "loss": 0.4355, + "num_input_tokens_seen": 5508896, + "step": 16355 + }, + { + "epoch": 12.642967542503865, + "grad_norm": 0.7499663233757019, + "learning_rate": 1.793227734038762e-05, + "loss": 0.4695, + "num_input_tokens_seen": 5510656, + "step": 16360 + }, + { + "epoch": 12.646831530139103, + "grad_norm": 0.6889140009880066, + "learning_rate": 1.791610690616413e-05, + "loss": 0.4483, + "num_input_tokens_seen": 5512192, + "step": 16365 + }, + { + "epoch": 12.650695517774343, + "grad_norm": 0.6936729550361633, + "learning_rate": 1.7899939693737634e-05, + "loss": 0.4917, + "num_input_tokens_seen": 5513728, + "step": 16370 + }, + { + "epoch": 12.654559505409583, + "grad_norm": 0.7438124418258667, + "learning_rate": 1.7883775710461093e-05, + "loss": 0.4305, + "num_input_tokens_seen": 5515232, + "step": 16375 + }, + { + "epoch": 12.658423493044822, + "grad_norm": 1.5702036619186401, + "learning_rate": 1.7867614963685976e-05, + "loss": 0.6447, + "num_input_tokens_seen": 5516832, + "step": 16380 + }, + { + "epoch": 12.662287480680062, + "grad_norm": 0.9360661506652832, + "learning_rate": 1.7851457460762277e-05, + "loss": 0.5007, + "num_input_tokens_seen": 5518560, + "step": 16385 + }, + { + "epoch": 12.666151468315302, + "grad_norm": 0.9282054305076599, + "learning_rate": 1.7835303209038536e-05, + "loss": 0.4263, + "num_input_tokens_seen": 5520352, + "step": 16390 + }, + { + "epoch": 12.67001545595054, + "grad_norm": 1.0683971643447876, + "learning_rate": 1.7819152215861812e-05, + "loss": 0.5616, + "num_input_tokens_seen": 5522016, + "step": 16395 + }, + { + "epoch": 12.673879443585781, + "grad_norm": 0.9879785180091858, + "learning_rate": 1.7803004488577667e-05, + "loss": 0.7446, + "num_input_tokens_seen": 5523616, + "step": 16400 + }, + { + "epoch": 12.67774343122102, + "grad_norm": 0.6787902116775513, + "learning_rate": 1.7786860034530174e-05, + "loss": 0.4329, + "num_input_tokens_seen": 5525408, + "step": 16405 + }, + { + "epoch": 12.68160741885626, + "grad_norm": 0.8874455094337463, + "learning_rate": 1.7770718861061942e-05, + "loss": 0.3523, + "num_input_tokens_seen": 5526880, + "step": 16410 + }, + { + "epoch": 12.6854714064915, + "grad_norm": 0.8715839982032776, + "learning_rate": 1.7754580975514062e-05, + "loss": 0.4542, + "num_input_tokens_seen": 5528320, + "step": 16415 + }, + { + "epoch": 12.689335394126738, + "grad_norm": 0.8610461950302124, + "learning_rate": 1.7738446385226145e-05, + "loss": 0.4915, + "num_input_tokens_seen": 5529888, + "step": 16420 + }, + { + "epoch": 12.693199381761978, + "grad_norm": 0.7528938055038452, + "learning_rate": 1.7722315097536304e-05, + "loss": 0.3703, + "num_input_tokens_seen": 5531680, + "step": 16425 + }, + { + "epoch": 12.697063369397219, + "grad_norm": 1.218527913093567, + "learning_rate": 1.7706187119781132e-05, + "loss": 0.3371, + "num_input_tokens_seen": 5533344, + "step": 16430 + }, + { + "epoch": 12.700927357032457, + "grad_norm": 0.8768514394760132, + "learning_rate": 1.7690062459295746e-05, + "loss": 0.5369, + "num_input_tokens_seen": 5534912, + "step": 16435 + }, + { + "epoch": 12.704791344667697, + "grad_norm": 0.8257859349250793, + "learning_rate": 1.7673941123413726e-05, + "loss": 0.444, + "num_input_tokens_seen": 5536640, + "step": 16440 + }, + { + "epoch": 12.708655332302937, + "grad_norm": 1.0123454332351685, + "learning_rate": 1.7657823119467165e-05, + "loss": 0.4268, + "num_input_tokens_seen": 5538240, + "step": 16445 + }, + { + "epoch": 12.712519319938176, + "grad_norm": 1.1089893579483032, + "learning_rate": 1.7641708454786615e-05, + "loss": 0.4711, + "num_input_tokens_seen": 5539680, + "step": 16450 + }, + { + "epoch": 12.716383307573416, + "grad_norm": 1.2453898191452026, + "learning_rate": 1.7625597136701127e-05, + "loss": 0.4425, + "num_input_tokens_seen": 5541440, + "step": 16455 + }, + { + "epoch": 12.720247295208654, + "grad_norm": 1.0252487659454346, + "learning_rate": 1.760948917253823e-05, + "loss": 0.6423, + "num_input_tokens_seen": 5543008, + "step": 16460 + }, + { + "epoch": 12.724111282843895, + "grad_norm": 1.1080358028411865, + "learning_rate": 1.7593384569623914e-05, + "loss": 0.4354, + "num_input_tokens_seen": 5544672, + "step": 16465 + }, + { + "epoch": 12.727975270479135, + "grad_norm": 1.2973514795303345, + "learning_rate": 1.757728333528264e-05, + "loss": 0.4226, + "num_input_tokens_seen": 5546368, + "step": 16470 + }, + { + "epoch": 12.731839258114373, + "grad_norm": 0.8985741138458252, + "learning_rate": 1.756118547683737e-05, + "loss": 0.3497, + "num_input_tokens_seen": 5547904, + "step": 16475 + }, + { + "epoch": 12.735703245749614, + "grad_norm": 0.7800002098083496, + "learning_rate": 1.7545091001609496e-05, + "loss": 0.3586, + "num_input_tokens_seen": 5549536, + "step": 16480 + }, + { + "epoch": 12.739567233384854, + "grad_norm": 0.6624599695205688, + "learning_rate": 1.752899991691888e-05, + "loss": 0.4435, + "num_input_tokens_seen": 5551072, + "step": 16485 + }, + { + "epoch": 12.743431221020092, + "grad_norm": 0.7872043251991272, + "learning_rate": 1.7512912230083838e-05, + "loss": 0.5528, + "num_input_tokens_seen": 5552832, + "step": 16490 + }, + { + "epoch": 12.747295208655332, + "grad_norm": 0.7891296148300171, + "learning_rate": 1.7496827948421157e-05, + "loss": 0.4649, + "num_input_tokens_seen": 5554624, + "step": 16495 + }, + { + "epoch": 12.751159196290573, + "grad_norm": 0.8348652124404907, + "learning_rate": 1.7480747079246063e-05, + "loss": 0.3296, + "num_input_tokens_seen": 5556288, + "step": 16500 + }, + { + "epoch": 12.755023183925811, + "grad_norm": 0.5268945097923279, + "learning_rate": 1.746466962987222e-05, + "loss": 0.5805, + "num_input_tokens_seen": 5558080, + "step": 16505 + }, + { + "epoch": 12.758887171561051, + "grad_norm": 0.7995531558990479, + "learning_rate": 1.7448595607611753e-05, + "loss": 0.3927, + "num_input_tokens_seen": 5559872, + "step": 16510 + }, + { + "epoch": 12.762751159196291, + "grad_norm": 0.7800191640853882, + "learning_rate": 1.7432525019775236e-05, + "loss": 0.5107, + "num_input_tokens_seen": 5561536, + "step": 16515 + }, + { + "epoch": 12.76661514683153, + "grad_norm": 0.8637405633926392, + "learning_rate": 1.7416457873671663e-05, + "loss": 0.3123, + "num_input_tokens_seen": 5563200, + "step": 16520 + }, + { + "epoch": 12.77047913446677, + "grad_norm": 0.8381799459457397, + "learning_rate": 1.7400394176608457e-05, + "loss": 0.4689, + "num_input_tokens_seen": 5565088, + "step": 16525 + }, + { + "epoch": 12.774343122102009, + "grad_norm": 0.6472877860069275, + "learning_rate": 1.73843339358915e-05, + "loss": 0.3318, + "num_input_tokens_seen": 5566784, + "step": 16530 + }, + { + "epoch": 12.778207109737249, + "grad_norm": 0.8914239406585693, + "learning_rate": 1.7368277158825076e-05, + "loss": 0.4064, + "num_input_tokens_seen": 5568480, + "step": 16535 + }, + { + "epoch": 12.782071097372489, + "grad_norm": 1.0719541311264038, + "learning_rate": 1.7352223852711896e-05, + "loss": 0.3957, + "num_input_tokens_seen": 5569984, + "step": 16540 + }, + { + "epoch": 12.785935085007727, + "grad_norm": 1.0400763750076294, + "learning_rate": 1.733617402485312e-05, + "loss": 0.3962, + "num_input_tokens_seen": 5571776, + "step": 16545 + }, + { + "epoch": 12.789799072642968, + "grad_norm": 1.134968638420105, + "learning_rate": 1.7320127682548277e-05, + "loss": 0.5933, + "num_input_tokens_seen": 5573504, + "step": 16550 + }, + { + "epoch": 12.793663060278208, + "grad_norm": 1.1215271949768066, + "learning_rate": 1.730408483309537e-05, + "loss": 0.4355, + "num_input_tokens_seen": 5575296, + "step": 16555 + }, + { + "epoch": 12.797527047913446, + "grad_norm": 0.8145341873168945, + "learning_rate": 1.7288045483790766e-05, + "loss": 0.5323, + "num_input_tokens_seen": 5576928, + "step": 16560 + }, + { + "epoch": 12.801391035548686, + "grad_norm": 0.9270142912864685, + "learning_rate": 1.7272009641929267e-05, + "loss": 0.5981, + "num_input_tokens_seen": 5578816, + "step": 16565 + }, + { + "epoch": 12.805255023183927, + "grad_norm": 0.8438762426376343, + "learning_rate": 1.7255977314804063e-05, + "loss": 0.3176, + "num_input_tokens_seen": 5580384, + "step": 16570 + }, + { + "epoch": 12.809119010819165, + "grad_norm": 0.5557670593261719, + "learning_rate": 1.723994850970675e-05, + "loss": 0.4056, + "num_input_tokens_seen": 5581792, + "step": 16575 + }, + { + "epoch": 12.812982998454405, + "grad_norm": 0.8611416816711426, + "learning_rate": 1.722392323392733e-05, + "loss": 0.605, + "num_input_tokens_seen": 5583424, + "step": 16580 + }, + { + "epoch": 12.816846986089644, + "grad_norm": 1.5481605529785156, + "learning_rate": 1.7207901494754192e-05, + "loss": 0.5494, + "num_input_tokens_seen": 5585312, + "step": 16585 + }, + { + "epoch": 12.820710973724884, + "grad_norm": 1.1863884925842285, + "learning_rate": 1.719188329947411e-05, + "loss": 0.4041, + "num_input_tokens_seen": 5586816, + "step": 16590 + }, + { + "epoch": 12.824574961360124, + "grad_norm": 0.7720677256584167, + "learning_rate": 1.717586865537227e-05, + "loss": 0.3194, + "num_input_tokens_seen": 5588256, + "step": 16595 + }, + { + "epoch": 12.828438948995363, + "grad_norm": 0.5263153314590454, + "learning_rate": 1.715985756973223e-05, + "loss": 0.4333, + "num_input_tokens_seen": 5589856, + "step": 16600 + }, + { + "epoch": 12.832302936630603, + "grad_norm": 0.9906795024871826, + "learning_rate": 1.7143850049835915e-05, + "loss": 0.4261, + "num_input_tokens_seen": 5591616, + "step": 16605 + }, + { + "epoch": 12.836166924265843, + "grad_norm": 0.7812103033065796, + "learning_rate": 1.7127846102963646e-05, + "loss": 0.3519, + "num_input_tokens_seen": 5593216, + "step": 16610 + }, + { + "epoch": 12.840030911901081, + "grad_norm": 1.47635018825531, + "learning_rate": 1.7111845736394118e-05, + "loss": 0.4634, + "num_input_tokens_seen": 5594720, + "step": 16615 + }, + { + "epoch": 12.843894899536322, + "grad_norm": 1.0523537397384644, + "learning_rate": 1.7095848957404384e-05, + "loss": 0.3753, + "num_input_tokens_seen": 5596096, + "step": 16620 + }, + { + "epoch": 12.847758887171562, + "grad_norm": 1.1614161729812622, + "learning_rate": 1.707985577326988e-05, + "loss": 0.4266, + "num_input_tokens_seen": 5597824, + "step": 16625 + }, + { + "epoch": 12.8516228748068, + "grad_norm": 0.9659643769264221, + "learning_rate": 1.7063866191264398e-05, + "loss": 0.3964, + "num_input_tokens_seen": 5599552, + "step": 16630 + }, + { + "epoch": 12.85548686244204, + "grad_norm": 0.8516104221343994, + "learning_rate": 1.7047880218660107e-05, + "loss": 0.6104, + "num_input_tokens_seen": 5601376, + "step": 16635 + }, + { + "epoch": 12.85935085007728, + "grad_norm": 1.1296143531799316, + "learning_rate": 1.7031897862727513e-05, + "loss": 0.6784, + "num_input_tokens_seen": 5603168, + "step": 16640 + }, + { + "epoch": 12.863214837712519, + "grad_norm": 1.165533185005188, + "learning_rate": 1.7015919130735493e-05, + "loss": 0.3751, + "num_input_tokens_seen": 5605024, + "step": 16645 + }, + { + "epoch": 12.86707882534776, + "grad_norm": 0.6692014336585999, + "learning_rate": 1.6999944029951265e-05, + "loss": 0.3475, + "num_input_tokens_seen": 5606752, + "step": 16650 + }, + { + "epoch": 12.870942812982998, + "grad_norm": 1.121605634689331, + "learning_rate": 1.698397256764041e-05, + "loss": 0.5176, + "num_input_tokens_seen": 5608320, + "step": 16655 + }, + { + "epoch": 12.874806800618238, + "grad_norm": 1.1865904331207275, + "learning_rate": 1.6968004751066823e-05, + "loss": 0.4168, + "num_input_tokens_seen": 5610368, + "step": 16660 + }, + { + "epoch": 12.878670788253478, + "grad_norm": 0.8609924912452698, + "learning_rate": 1.695204058749279e-05, + "loss": 0.3862, + "num_input_tokens_seen": 5611968, + "step": 16665 + }, + { + "epoch": 12.882534775888717, + "grad_norm": 0.8229366540908813, + "learning_rate": 1.693608008417888e-05, + "loss": 0.3688, + "num_input_tokens_seen": 5613440, + "step": 16670 + }, + { + "epoch": 12.886398763523957, + "grad_norm": 1.2881734371185303, + "learning_rate": 1.6920123248384054e-05, + "loss": 0.4855, + "num_input_tokens_seen": 5615008, + "step": 16675 + }, + { + "epoch": 12.890262751159197, + "grad_norm": 1.1142557859420776, + "learning_rate": 1.690417008736556e-05, + "loss": 0.4367, + "num_input_tokens_seen": 5616736, + "step": 16680 + }, + { + "epoch": 12.894126738794435, + "grad_norm": 1.4608800411224365, + "learning_rate": 1.6888220608378992e-05, + "loss": 0.5587, + "num_input_tokens_seen": 5618432, + "step": 16685 + }, + { + "epoch": 12.897990726429676, + "grad_norm": 1.712098479270935, + "learning_rate": 1.6872274818678275e-05, + "loss": 0.417, + "num_input_tokens_seen": 5620192, + "step": 16690 + }, + { + "epoch": 12.901854714064916, + "grad_norm": 0.9417773485183716, + "learning_rate": 1.6856332725515643e-05, + "loss": 0.4296, + "num_input_tokens_seen": 5622240, + "step": 16695 + }, + { + "epoch": 12.905718701700154, + "grad_norm": 1.142723560333252, + "learning_rate": 1.684039433614166e-05, + "loss": 0.3812, + "num_input_tokens_seen": 5623584, + "step": 16700 + }, + { + "epoch": 12.909582689335394, + "grad_norm": 1.242540955543518, + "learning_rate": 1.68244596578052e-05, + "loss": 0.4235, + "num_input_tokens_seen": 5625504, + "step": 16705 + }, + { + "epoch": 12.913446676970633, + "grad_norm": 1.063730001449585, + "learning_rate": 1.680852869775344e-05, + "loss": 0.3535, + "num_input_tokens_seen": 5627136, + "step": 16710 + }, + { + "epoch": 12.917310664605873, + "grad_norm": 1.2511929273605347, + "learning_rate": 1.6792601463231892e-05, + "loss": 0.4709, + "num_input_tokens_seen": 5628864, + "step": 16715 + }, + { + "epoch": 12.921174652241113, + "grad_norm": 0.8866894841194153, + "learning_rate": 1.6776677961484346e-05, + "loss": 0.752, + "num_input_tokens_seen": 5630464, + "step": 16720 + }, + { + "epoch": 12.925038639876352, + "grad_norm": 0.8910337090492249, + "learning_rate": 1.676075819975292e-05, + "loss": 0.5772, + "num_input_tokens_seen": 5632000, + "step": 16725 + }, + { + "epoch": 12.928902627511592, + "grad_norm": 1.388218641281128, + "learning_rate": 1.6744842185278002e-05, + "loss": 0.5644, + "num_input_tokens_seen": 5633536, + "step": 16730 + }, + { + "epoch": 12.932766615146832, + "grad_norm": 0.8340286016464233, + "learning_rate": 1.672892992529829e-05, + "loss": 0.4718, + "num_input_tokens_seen": 5635360, + "step": 16735 + }, + { + "epoch": 12.93663060278207, + "grad_norm": 1.4171593189239502, + "learning_rate": 1.6713021427050795e-05, + "loss": 0.6341, + "num_input_tokens_seen": 5636992, + "step": 16740 + }, + { + "epoch": 12.94049459041731, + "grad_norm": 0.8731650710105896, + "learning_rate": 1.6697116697770773e-05, + "loss": 0.4213, + "num_input_tokens_seen": 5638368, + "step": 16745 + }, + { + "epoch": 12.944358578052551, + "grad_norm": 0.6550624370574951, + "learning_rate": 1.6681215744691804e-05, + "loss": 0.3762, + "num_input_tokens_seen": 5640064, + "step": 16750 + }, + { + "epoch": 12.94822256568779, + "grad_norm": 1.021328091621399, + "learning_rate": 1.666531857504573e-05, + "loss": 0.3221, + "num_input_tokens_seen": 5641632, + "step": 16755 + }, + { + "epoch": 12.95208655332303, + "grad_norm": 1.2402644157409668, + "learning_rate": 1.664942519606269e-05, + "loss": 0.4614, + "num_input_tokens_seen": 5643456, + "step": 16760 + }, + { + "epoch": 12.95595054095827, + "grad_norm": 1.0375025272369385, + "learning_rate": 1.6633535614971078e-05, + "loss": 0.371, + "num_input_tokens_seen": 5645344, + "step": 16765 + }, + { + "epoch": 12.959814528593508, + "grad_norm": 1.3642486333847046, + "learning_rate": 1.661764983899757e-05, + "loss": 0.4682, + "num_input_tokens_seen": 5646912, + "step": 16770 + }, + { + "epoch": 12.963678516228748, + "grad_norm": 0.7691318392753601, + "learning_rate": 1.6601767875367118e-05, + "loss": 0.5111, + "num_input_tokens_seen": 5648480, + "step": 16775 + }, + { + "epoch": 12.967542503863987, + "grad_norm": 1.1227343082427979, + "learning_rate": 1.6585889731302934e-05, + "loss": 0.6132, + "num_input_tokens_seen": 5650112, + "step": 16780 + }, + { + "epoch": 12.971406491499227, + "grad_norm": 1.1295578479766846, + "learning_rate": 1.6570015414026486e-05, + "loss": 0.4558, + "num_input_tokens_seen": 5652096, + "step": 16785 + }, + { + "epoch": 12.975270479134467, + "grad_norm": 1.8284275531768799, + "learning_rate": 1.6554144930757504e-05, + "loss": 0.4855, + "num_input_tokens_seen": 5653856, + "step": 16790 + }, + { + "epoch": 12.979134466769706, + "grad_norm": 0.9069955945014954, + "learning_rate": 1.6538278288714003e-05, + "loss": 0.3678, + "num_input_tokens_seen": 5655392, + "step": 16795 + }, + { + "epoch": 12.982998454404946, + "grad_norm": 1.4063166379928589, + "learning_rate": 1.652241549511221e-05, + "loss": 0.4987, + "num_input_tokens_seen": 5657312, + "step": 16800 + }, + { + "epoch": 12.986862442040186, + "grad_norm": 1.0737559795379639, + "learning_rate": 1.650655655716661e-05, + "loss": 0.4244, + "num_input_tokens_seen": 5658976, + "step": 16805 + }, + { + "epoch": 12.990726429675425, + "grad_norm": 1.0569552183151245, + "learning_rate": 1.649070148208996e-05, + "loss": 0.3988, + "num_input_tokens_seen": 5660896, + "step": 16810 + }, + { + "epoch": 12.994590417310665, + "grad_norm": 1.5029339790344238, + "learning_rate": 1.647485027709324e-05, + "loss": 0.4503, + "num_input_tokens_seen": 5662560, + "step": 16815 + }, + { + "epoch": 12.998454404945905, + "grad_norm": 1.0386077165603638, + "learning_rate": 1.6459002949385662e-05, + "loss": 0.5075, + "num_input_tokens_seen": 5664320, + "step": 16820 + }, + { + "epoch": 13.0, + "eval_loss": 0.44928625226020813, + "eval_runtime": 6.2426, + "eval_samples_per_second": 92.109, + "eval_steps_per_second": 23.067, + "num_input_tokens_seen": 5664848, + "step": 16822 + }, + { + "epoch": 13.002318392581143, + "grad_norm": 1.5401493310928345, + "learning_rate": 1.64431595061747e-05, + "loss": 0.3769, + "num_input_tokens_seen": 5665776, + "step": 16825 + }, + { + "epoch": 13.006182380216384, + "grad_norm": 1.0814201831817627, + "learning_rate": 1.6427319954666027e-05, + "loss": 0.498, + "num_input_tokens_seen": 5667440, + "step": 16830 + }, + { + "epoch": 13.010046367851622, + "grad_norm": 0.871538519859314, + "learning_rate": 1.6411484302063587e-05, + "loss": 0.3703, + "num_input_tokens_seen": 5669104, + "step": 16835 + }, + { + "epoch": 13.013910355486862, + "grad_norm": 1.0392868518829346, + "learning_rate": 1.6395652555569518e-05, + "loss": 0.6648, + "num_input_tokens_seen": 5670896, + "step": 16840 + }, + { + "epoch": 13.017774343122102, + "grad_norm": 0.7649109959602356, + "learning_rate": 1.6379824722384203e-05, + "loss": 0.4008, + "num_input_tokens_seen": 5672624, + "step": 16845 + }, + { + "epoch": 13.021638330757341, + "grad_norm": 1.2551175355911255, + "learning_rate": 1.6364000809706222e-05, + "loss": 0.4554, + "num_input_tokens_seen": 5674384, + "step": 16850 + }, + { + "epoch": 13.025502318392581, + "grad_norm": 0.9877638816833496, + "learning_rate": 1.634818082473239e-05, + "loss": 0.4096, + "num_input_tokens_seen": 5676176, + "step": 16855 + }, + { + "epoch": 13.029366306027821, + "grad_norm": 1.0232632160186768, + "learning_rate": 1.633236477465774e-05, + "loss": 0.4071, + "num_input_tokens_seen": 5677744, + "step": 16860 + }, + { + "epoch": 13.03323029366306, + "grad_norm": 1.9517185688018799, + "learning_rate": 1.63165526666755e-05, + "loss": 0.5572, + "num_input_tokens_seen": 5679824, + "step": 16865 + }, + { + "epoch": 13.0370942812983, + "grad_norm": 1.5648781061172485, + "learning_rate": 1.6300744507977095e-05, + "loss": 0.3703, + "num_input_tokens_seen": 5681360, + "step": 16870 + }, + { + "epoch": 13.04095826893354, + "grad_norm": 0.9035128355026245, + "learning_rate": 1.6284940305752195e-05, + "loss": 0.4144, + "num_input_tokens_seen": 5683024, + "step": 16875 + }, + { + "epoch": 13.044822256568779, + "grad_norm": 1.623870611190796, + "learning_rate": 1.6269140067188638e-05, + "loss": 0.387, + "num_input_tokens_seen": 5684816, + "step": 16880 + }, + { + "epoch": 13.048686244204019, + "grad_norm": 1.0243149995803833, + "learning_rate": 1.6253343799472467e-05, + "loss": 0.5861, + "num_input_tokens_seen": 5686480, + "step": 16885 + }, + { + "epoch": 13.052550231839259, + "grad_norm": 1.5724592208862305, + "learning_rate": 1.6237551509787912e-05, + "loss": 0.6697, + "num_input_tokens_seen": 5688080, + "step": 16890 + }, + { + "epoch": 13.056414219474497, + "grad_norm": 1.0776346921920776, + "learning_rate": 1.6221763205317415e-05, + "loss": 0.4273, + "num_input_tokens_seen": 5689712, + "step": 16895 + }, + { + "epoch": 13.060278207109738, + "grad_norm": 1.1777868270874023, + "learning_rate": 1.620597889324158e-05, + "loss": 0.5746, + "num_input_tokens_seen": 5691408, + "step": 16900 + }, + { + "epoch": 13.064142194744976, + "grad_norm": 2.082993507385254, + "learning_rate": 1.6190198580739206e-05, + "loss": 0.6529, + "num_input_tokens_seen": 5692944, + "step": 16905 + }, + { + "epoch": 13.068006182380216, + "grad_norm": 1.1658885478973389, + "learning_rate": 1.617442227498727e-05, + "loss": 0.3864, + "num_input_tokens_seen": 5694864, + "step": 16910 + }, + { + "epoch": 13.071870170015456, + "grad_norm": 0.6862993836402893, + "learning_rate": 1.615864998316095e-05, + "loss": 0.385, + "num_input_tokens_seen": 5696592, + "step": 16915 + }, + { + "epoch": 13.075734157650695, + "grad_norm": 1.06672203540802, + "learning_rate": 1.6142881712433566e-05, + "loss": 0.3859, + "num_input_tokens_seen": 5698416, + "step": 16920 + }, + { + "epoch": 13.079598145285935, + "grad_norm": 1.3805172443389893, + "learning_rate": 1.6127117469976617e-05, + "loss": 0.376, + "num_input_tokens_seen": 5700208, + "step": 16925 + }, + { + "epoch": 13.083462132921175, + "grad_norm": 1.6068094968795776, + "learning_rate": 1.6111357262959785e-05, + "loss": 0.4163, + "num_input_tokens_seen": 5701680, + "step": 16930 + }, + { + "epoch": 13.087326120556414, + "grad_norm": 0.8874915242195129, + "learning_rate": 1.60956010985509e-05, + "loss": 0.4125, + "num_input_tokens_seen": 5703376, + "step": 16935 + }, + { + "epoch": 13.091190108191654, + "grad_norm": 1.652901530265808, + "learning_rate": 1.607984898391596e-05, + "loss": 0.5768, + "num_input_tokens_seen": 5705296, + "step": 16940 + }, + { + "epoch": 13.095054095826894, + "grad_norm": 0.9350986480712891, + "learning_rate": 1.6064100926219128e-05, + "loss": 0.4194, + "num_input_tokens_seen": 5706800, + "step": 16945 + }, + { + "epoch": 13.098918083462133, + "grad_norm": 0.7440901398658752, + "learning_rate": 1.6048356932622696e-05, + "loss": 0.3818, + "num_input_tokens_seen": 5708304, + "step": 16950 + }, + { + "epoch": 13.102782071097373, + "grad_norm": 1.0128964185714722, + "learning_rate": 1.6032617010287154e-05, + "loss": 0.4046, + "num_input_tokens_seen": 5710128, + "step": 16955 + }, + { + "epoch": 13.106646058732611, + "grad_norm": 1.3175569772720337, + "learning_rate": 1.601688116637109e-05, + "loss": 0.3888, + "num_input_tokens_seen": 5711792, + "step": 16960 + }, + { + "epoch": 13.110510046367851, + "grad_norm": 0.954981803894043, + "learning_rate": 1.600114940803128e-05, + "loss": 0.495, + "num_input_tokens_seen": 5713488, + "step": 16965 + }, + { + "epoch": 13.114374034003092, + "grad_norm": 1.2357321977615356, + "learning_rate": 1.5985421742422608e-05, + "loss": 0.6838, + "num_input_tokens_seen": 5715152, + "step": 16970 + }, + { + "epoch": 13.11823802163833, + "grad_norm": 0.7278153896331787, + "learning_rate": 1.596969817669811e-05, + "loss": 0.6631, + "num_input_tokens_seen": 5716560, + "step": 16975 + }, + { + "epoch": 13.12210200927357, + "grad_norm": 0.9769477248191833, + "learning_rate": 1.5953978718008965e-05, + "loss": 0.4116, + "num_input_tokens_seen": 5718096, + "step": 16980 + }, + { + "epoch": 13.12596599690881, + "grad_norm": 0.9679589867591858, + "learning_rate": 1.5938263373504475e-05, + "loss": 0.393, + "num_input_tokens_seen": 5719728, + "step": 16985 + }, + { + "epoch": 13.129829984544049, + "grad_norm": 0.7865661382675171, + "learning_rate": 1.592255215033206e-05, + "loss": 0.61, + "num_input_tokens_seen": 5721200, + "step": 16990 + }, + { + "epoch": 13.13369397217929, + "grad_norm": 0.9562196731567383, + "learning_rate": 1.5906845055637293e-05, + "loss": 0.4152, + "num_input_tokens_seen": 5722960, + "step": 16995 + }, + { + "epoch": 13.13755795981453, + "grad_norm": 0.5692915320396423, + "learning_rate": 1.589114209656386e-05, + "loss": 0.4208, + "num_input_tokens_seen": 5724784, + "step": 17000 + }, + { + "epoch": 13.141421947449768, + "grad_norm": 0.9010636806488037, + "learning_rate": 1.587544328025355e-05, + "loss": 0.4387, + "num_input_tokens_seen": 5726352, + "step": 17005 + }, + { + "epoch": 13.145285935085008, + "grad_norm": 0.9508593082427979, + "learning_rate": 1.585974861384628e-05, + "loss": 0.4262, + "num_input_tokens_seen": 5727984, + "step": 17010 + }, + { + "epoch": 13.149149922720248, + "grad_norm": 0.6777143478393555, + "learning_rate": 1.5844058104480082e-05, + "loss": 0.4166, + "num_input_tokens_seen": 5729648, + "step": 17015 + }, + { + "epoch": 13.153013910355487, + "grad_norm": 1.0546754598617554, + "learning_rate": 1.5828371759291088e-05, + "loss": 0.4474, + "num_input_tokens_seen": 5731248, + "step": 17020 + }, + { + "epoch": 13.156877897990727, + "grad_norm": 1.1129637956619263, + "learning_rate": 1.5812689585413542e-05, + "loss": 0.6639, + "num_input_tokens_seen": 5732848, + "step": 17025 + }, + { + "epoch": 13.160741885625965, + "grad_norm": 0.9674705266952515, + "learning_rate": 1.5797011589979788e-05, + "loss": 0.3614, + "num_input_tokens_seen": 5734448, + "step": 17030 + }, + { + "epoch": 13.164605873261205, + "grad_norm": 1.0796257257461548, + "learning_rate": 1.5781337780120287e-05, + "loss": 0.3989, + "num_input_tokens_seen": 5736016, + "step": 17035 + }, + { + "epoch": 13.168469860896446, + "grad_norm": 1.0189164876937866, + "learning_rate": 1.5765668162963572e-05, + "loss": 0.5808, + "num_input_tokens_seen": 5737776, + "step": 17040 + }, + { + "epoch": 13.172333848531684, + "grad_norm": 0.7298932671546936, + "learning_rate": 1.5750002745636275e-05, + "loss": 0.3219, + "num_input_tokens_seen": 5739120, + "step": 17045 + }, + { + "epoch": 13.176197836166924, + "grad_norm": 0.8791738748550415, + "learning_rate": 1.573434153526313e-05, + "loss": 0.3699, + "num_input_tokens_seen": 5740816, + "step": 17050 + }, + { + "epoch": 13.180061823802165, + "grad_norm": 0.811692476272583, + "learning_rate": 1.5718684538966944e-05, + "loss": 0.3997, + "num_input_tokens_seen": 5742608, + "step": 17055 + }, + { + "epoch": 13.183925811437403, + "grad_norm": 2.114764451980591, + "learning_rate": 1.570303176386861e-05, + "loss": 0.5035, + "num_input_tokens_seen": 5744496, + "step": 17060 + }, + { + "epoch": 13.187789799072643, + "grad_norm": 0.8198548555374146, + "learning_rate": 1.568738321708711e-05, + "loss": 0.4991, + "num_input_tokens_seen": 5746096, + "step": 17065 + }, + { + "epoch": 13.191653786707883, + "grad_norm": 0.8648368120193481, + "learning_rate": 1.567173890573949e-05, + "loss": 0.3648, + "num_input_tokens_seen": 5747728, + "step": 17070 + }, + { + "epoch": 13.195517774343122, + "grad_norm": 1.1997311115264893, + "learning_rate": 1.5656098836940877e-05, + "loss": 0.4079, + "num_input_tokens_seen": 5749392, + "step": 17075 + }, + { + "epoch": 13.199381761978362, + "grad_norm": 1.0903103351593018, + "learning_rate": 1.5640463017804476e-05, + "loss": 0.4058, + "num_input_tokens_seen": 5750864, + "step": 17080 + }, + { + "epoch": 13.2032457496136, + "grad_norm": 0.5902786254882812, + "learning_rate": 1.562483145544155e-05, + "loss": 0.3173, + "num_input_tokens_seen": 5752560, + "step": 17085 + }, + { + "epoch": 13.20710973724884, + "grad_norm": 0.7905175089836121, + "learning_rate": 1.560920415696142e-05, + "loss": 0.4438, + "num_input_tokens_seen": 5754288, + "step": 17090 + }, + { + "epoch": 13.21097372488408, + "grad_norm": 0.7962562441825867, + "learning_rate": 1.559358112947148e-05, + "loss": 0.436, + "num_input_tokens_seen": 5756144, + "step": 17095 + }, + { + "epoch": 13.21483771251932, + "grad_norm": 0.7173594236373901, + "learning_rate": 1.5577962380077177e-05, + "loss": 0.4499, + "num_input_tokens_seen": 5757904, + "step": 17100 + }, + { + "epoch": 13.21870170015456, + "grad_norm": 1.2220085859298706, + "learning_rate": 1.556234791588201e-05, + "loss": 0.4988, + "num_input_tokens_seen": 5759504, + "step": 17105 + }, + { + "epoch": 13.2225656877898, + "grad_norm": 1.3290438652038574, + "learning_rate": 1.5546737743987526e-05, + "loss": 0.4567, + "num_input_tokens_seen": 5761008, + "step": 17110 + }, + { + "epoch": 13.226429675425038, + "grad_norm": 0.8041403293609619, + "learning_rate": 1.5531131871493327e-05, + "loss": 0.3262, + "num_input_tokens_seen": 5762640, + "step": 17115 + }, + { + "epoch": 13.230293663060278, + "grad_norm": 0.7751356959342957, + "learning_rate": 1.5515530305497065e-05, + "loss": 0.4118, + "num_input_tokens_seen": 5764208, + "step": 17120 + }, + { + "epoch": 13.234157650695519, + "grad_norm": 0.88230961561203, + "learning_rate": 1.5499933053094425e-05, + "loss": 0.3938, + "num_input_tokens_seen": 5765968, + "step": 17125 + }, + { + "epoch": 13.238021638330757, + "grad_norm": 1.0807344913482666, + "learning_rate": 1.5484340121379116e-05, + "loss": 0.4764, + "num_input_tokens_seen": 5767600, + "step": 17130 + }, + { + "epoch": 13.241885625965997, + "grad_norm": 2.0851528644561768, + "learning_rate": 1.5468751517442913e-05, + "loss": 0.4676, + "num_input_tokens_seen": 5769424, + "step": 17135 + }, + { + "epoch": 13.245749613601237, + "grad_norm": 0.8853353261947632, + "learning_rate": 1.5453167248375606e-05, + "loss": 0.4677, + "num_input_tokens_seen": 5771184, + "step": 17140 + }, + { + "epoch": 13.249613601236476, + "grad_norm": 0.9650543332099915, + "learning_rate": 1.5437587321264995e-05, + "loss": 0.4438, + "num_input_tokens_seen": 5772720, + "step": 17145 + }, + { + "epoch": 13.253477588871716, + "grad_norm": 1.0178592205047607, + "learning_rate": 1.542201174319695e-05, + "loss": 0.3747, + "num_input_tokens_seen": 5774352, + "step": 17150 + }, + { + "epoch": 13.257341576506954, + "grad_norm": 1.1455883979797363, + "learning_rate": 1.5406440521255312e-05, + "loss": 0.3643, + "num_input_tokens_seen": 5776080, + "step": 17155 + }, + { + "epoch": 13.261205564142195, + "grad_norm": 1.0836435556411743, + "learning_rate": 1.5390873662521983e-05, + "loss": 0.567, + "num_input_tokens_seen": 5777648, + "step": 17160 + }, + { + "epoch": 13.265069551777435, + "grad_norm": 0.7457519173622131, + "learning_rate": 1.5375311174076863e-05, + "loss": 0.6635, + "num_input_tokens_seen": 5779408, + "step": 17165 + }, + { + "epoch": 13.268933539412673, + "grad_norm": 0.7283689379692078, + "learning_rate": 1.5359753062997858e-05, + "loss": 0.3573, + "num_input_tokens_seen": 5781232, + "step": 17170 + }, + { + "epoch": 13.272797527047913, + "grad_norm": 1.331739902496338, + "learning_rate": 1.53441993363609e-05, + "loss": 0.4616, + "num_input_tokens_seen": 5782864, + "step": 17175 + }, + { + "epoch": 13.276661514683154, + "grad_norm": 0.8591625094413757, + "learning_rate": 1.5328650001239898e-05, + "loss": 0.4691, + "num_input_tokens_seen": 5784656, + "step": 17180 + }, + { + "epoch": 13.280525502318392, + "grad_norm": 0.6905813217163086, + "learning_rate": 1.5313105064706803e-05, + "loss": 0.3709, + "num_input_tokens_seen": 5786352, + "step": 17185 + }, + { + "epoch": 13.284389489953632, + "grad_norm": 1.0064094066619873, + "learning_rate": 1.5297564533831536e-05, + "loss": 0.4711, + "num_input_tokens_seen": 5787856, + "step": 17190 + }, + { + "epoch": 13.288253477588873, + "grad_norm": 1.1733109951019287, + "learning_rate": 1.528202841568202e-05, + "loss": 0.5611, + "num_input_tokens_seen": 5789328, + "step": 17195 + }, + { + "epoch": 13.292117465224111, + "grad_norm": 0.6991374492645264, + "learning_rate": 1.526649671732418e-05, + "loss": 0.6987, + "num_input_tokens_seen": 5790800, + "step": 17200 + }, + { + "epoch": 13.295981452859351, + "grad_norm": 0.9756714105606079, + "learning_rate": 1.5250969445821928e-05, + "loss": 0.4459, + "num_input_tokens_seen": 5792336, + "step": 17205 + }, + { + "epoch": 13.29984544049459, + "grad_norm": 1.0821627378463745, + "learning_rate": 1.523544660823716e-05, + "loss": 0.3631, + "num_input_tokens_seen": 5793744, + "step": 17210 + }, + { + "epoch": 13.30370942812983, + "grad_norm": 1.6091512441635132, + "learning_rate": 1.5219928211629747e-05, + "loss": 0.5984, + "num_input_tokens_seen": 5795632, + "step": 17215 + }, + { + "epoch": 13.30757341576507, + "grad_norm": 0.8915444016456604, + "learning_rate": 1.5204414263057559e-05, + "loss": 0.4467, + "num_input_tokens_seen": 5797168, + "step": 17220 + }, + { + "epoch": 13.311437403400308, + "grad_norm": 1.2766704559326172, + "learning_rate": 1.5188904769576423e-05, + "loss": 0.5479, + "num_input_tokens_seen": 5798992, + "step": 17225 + }, + { + "epoch": 13.315301391035549, + "grad_norm": 0.881824791431427, + "learning_rate": 1.5173399738240154e-05, + "loss": 0.4241, + "num_input_tokens_seen": 5800528, + "step": 17230 + }, + { + "epoch": 13.319165378670789, + "grad_norm": 1.1008801460266113, + "learning_rate": 1.5157899176100526e-05, + "loss": 0.3698, + "num_input_tokens_seen": 5802256, + "step": 17235 + }, + { + "epoch": 13.323029366306027, + "grad_norm": 0.9739639759063721, + "learning_rate": 1.5142403090207307e-05, + "loss": 0.4378, + "num_input_tokens_seen": 5803952, + "step": 17240 + }, + { + "epoch": 13.326893353941268, + "grad_norm": 0.8932632207870483, + "learning_rate": 1.5126911487608198e-05, + "loss": 0.4102, + "num_input_tokens_seen": 5805872, + "step": 17245 + }, + { + "epoch": 13.330757341576508, + "grad_norm": 1.224489688873291, + "learning_rate": 1.5111424375348866e-05, + "loss": 0.3994, + "num_input_tokens_seen": 5807472, + "step": 17250 + }, + { + "epoch": 13.334621329211746, + "grad_norm": 0.7758581042289734, + "learning_rate": 1.5095941760472947e-05, + "loss": 0.4046, + "num_input_tokens_seen": 5809360, + "step": 17255 + }, + { + "epoch": 13.338485316846986, + "grad_norm": 0.7742945551872253, + "learning_rate": 1.5080463650022036e-05, + "loss": 0.4954, + "num_input_tokens_seen": 5811024, + "step": 17260 + }, + { + "epoch": 13.342349304482227, + "grad_norm": 0.9021164178848267, + "learning_rate": 1.5064990051035654e-05, + "loss": 0.388, + "num_input_tokens_seen": 5812528, + "step": 17265 + }, + { + "epoch": 13.346213292117465, + "grad_norm": 1.0972179174423218, + "learning_rate": 1.5049520970551307e-05, + "loss": 0.4623, + "num_input_tokens_seen": 5814480, + "step": 17270 + }, + { + "epoch": 13.350077279752705, + "grad_norm": 0.6693075895309448, + "learning_rate": 1.5034056415604397e-05, + "loss": 0.3517, + "num_input_tokens_seen": 5816144, + "step": 17275 + }, + { + "epoch": 13.353941267387944, + "grad_norm": 0.6248348951339722, + "learning_rate": 1.5018596393228323e-05, + "loss": 0.4866, + "num_input_tokens_seen": 5817872, + "step": 17280 + }, + { + "epoch": 13.357805255023184, + "grad_norm": 0.7967627048492432, + "learning_rate": 1.5003140910454383e-05, + "loss": 0.3301, + "num_input_tokens_seen": 5819696, + "step": 17285 + }, + { + "epoch": 13.361669242658424, + "grad_norm": 0.966340959072113, + "learning_rate": 1.498768997431183e-05, + "loss": 0.4079, + "num_input_tokens_seen": 5821744, + "step": 17290 + }, + { + "epoch": 13.365533230293662, + "grad_norm": 0.8677964210510254, + "learning_rate": 1.4972243591827839e-05, + "loss": 0.3772, + "num_input_tokens_seen": 5823472, + "step": 17295 + }, + { + "epoch": 13.369397217928903, + "grad_norm": 0.8662424683570862, + "learning_rate": 1.495680177002751e-05, + "loss": 0.5201, + "num_input_tokens_seen": 5825360, + "step": 17300 + }, + { + "epoch": 13.373261205564143, + "grad_norm": 0.709933876991272, + "learning_rate": 1.4941364515933886e-05, + "loss": 0.3627, + "num_input_tokens_seen": 5826896, + "step": 17305 + }, + { + "epoch": 13.377125193199381, + "grad_norm": 1.1533540487289429, + "learning_rate": 1.4925931836567922e-05, + "loss": 0.4697, + "num_input_tokens_seen": 5828592, + "step": 17310 + }, + { + "epoch": 13.380989180834622, + "grad_norm": 1.4195582866668701, + "learning_rate": 1.4910503738948477e-05, + "loss": 0.3752, + "num_input_tokens_seen": 5830128, + "step": 17315 + }, + { + "epoch": 13.384853168469862, + "grad_norm": 0.6415035724639893, + "learning_rate": 1.4895080230092363e-05, + "loss": 0.4542, + "num_input_tokens_seen": 5831952, + "step": 17320 + }, + { + "epoch": 13.3887171561051, + "grad_norm": 3.039553642272949, + "learning_rate": 1.4879661317014279e-05, + "loss": 0.5766, + "num_input_tokens_seen": 5833648, + "step": 17325 + }, + { + "epoch": 13.39258114374034, + "grad_norm": 1.215824842453003, + "learning_rate": 1.4864247006726839e-05, + "loss": 0.489, + "num_input_tokens_seen": 5835408, + "step": 17330 + }, + { + "epoch": 13.396445131375579, + "grad_norm": 1.2153334617614746, + "learning_rate": 1.4848837306240554e-05, + "loss": 0.3695, + "num_input_tokens_seen": 5837328, + "step": 17335 + }, + { + "epoch": 13.400309119010819, + "grad_norm": 1.9285653829574585, + "learning_rate": 1.4833432222563858e-05, + "loss": 0.4576, + "num_input_tokens_seen": 5838992, + "step": 17340 + }, + { + "epoch": 13.40417310664606, + "grad_norm": 0.6855834722518921, + "learning_rate": 1.4818031762703078e-05, + "loss": 0.5377, + "num_input_tokens_seen": 5840592, + "step": 17345 + }, + { + "epoch": 13.408037094281298, + "grad_norm": 2.2100582122802734, + "learning_rate": 1.480263593366242e-05, + "loss": 0.5263, + "num_input_tokens_seen": 5842384, + "step": 17350 + }, + { + "epoch": 13.411901081916538, + "grad_norm": 1.794752836227417, + "learning_rate": 1.4787244742444012e-05, + "loss": 0.434, + "num_input_tokens_seen": 5843920, + "step": 17355 + }, + { + "epoch": 13.415765069551778, + "grad_norm": 0.7870399355888367, + "learning_rate": 1.4771858196047856e-05, + "loss": 0.3723, + "num_input_tokens_seen": 5845680, + "step": 17360 + }, + { + "epoch": 13.419629057187016, + "grad_norm": 1.9062353372573853, + "learning_rate": 1.475647630147185e-05, + "loss": 0.4726, + "num_input_tokens_seen": 5847248, + "step": 17365 + }, + { + "epoch": 13.423493044822257, + "grad_norm": 0.7593907713890076, + "learning_rate": 1.4741099065711761e-05, + "loss": 0.4539, + "num_input_tokens_seen": 5848816, + "step": 17370 + }, + { + "epoch": 13.427357032457497, + "grad_norm": 0.9664857983589172, + "learning_rate": 1.4725726495761267e-05, + "loss": 0.5772, + "num_input_tokens_seen": 5850448, + "step": 17375 + }, + { + "epoch": 13.431221020092735, + "grad_norm": 0.8207223415374756, + "learning_rate": 1.4710358598611886e-05, + "loss": 0.296, + "num_input_tokens_seen": 5852176, + "step": 17380 + }, + { + "epoch": 13.435085007727976, + "grad_norm": 0.9393683671951294, + "learning_rate": 1.4694995381253035e-05, + "loss": 0.3446, + "num_input_tokens_seen": 5854000, + "step": 17385 + }, + { + "epoch": 13.438948995363216, + "grad_norm": 1.0746829509735107, + "learning_rate": 1.4679636850672002e-05, + "loss": 0.5419, + "num_input_tokens_seen": 5855600, + "step": 17390 + }, + { + "epoch": 13.442812982998454, + "grad_norm": 0.77485191822052, + "learning_rate": 1.466428301385393e-05, + "loss": 0.4661, + "num_input_tokens_seen": 5857392, + "step": 17395 + }, + { + "epoch": 13.446676970633694, + "grad_norm": 1.2081841230392456, + "learning_rate": 1.4648933877781846e-05, + "loss": 0.612, + "num_input_tokens_seen": 5859184, + "step": 17400 + }, + { + "epoch": 13.450540958268933, + "grad_norm": 0.7136641144752502, + "learning_rate": 1.463358944943663e-05, + "loss": 0.4855, + "num_input_tokens_seen": 5860784, + "step": 17405 + }, + { + "epoch": 13.454404945904173, + "grad_norm": 1.2333787679672241, + "learning_rate": 1.4618249735797005e-05, + "loss": 0.5687, + "num_input_tokens_seen": 5862640, + "step": 17410 + }, + { + "epoch": 13.458268933539413, + "grad_norm": 2.370220899581909, + "learning_rate": 1.460291474383958e-05, + "loss": 0.4517, + "num_input_tokens_seen": 5864048, + "step": 17415 + }, + { + "epoch": 13.462132921174652, + "grad_norm": 1.1620062589645386, + "learning_rate": 1.4587584480538796e-05, + "loss": 0.3884, + "num_input_tokens_seen": 5865520, + "step": 17420 + }, + { + "epoch": 13.465996908809892, + "grad_norm": 0.8393563628196716, + "learning_rate": 1.4572258952866945e-05, + "loss": 0.6475, + "num_input_tokens_seen": 5867056, + "step": 17425 + }, + { + "epoch": 13.469860896445132, + "grad_norm": 1.067672848701477, + "learning_rate": 1.4556938167794166e-05, + "loss": 0.5588, + "num_input_tokens_seen": 5868880, + "step": 17430 + }, + { + "epoch": 13.47372488408037, + "grad_norm": 0.6863765716552734, + "learning_rate": 1.4541622132288445e-05, + "loss": 0.4415, + "num_input_tokens_seen": 5870352, + "step": 17435 + }, + { + "epoch": 13.47758887171561, + "grad_norm": 0.6271547079086304, + "learning_rate": 1.4526310853315625e-05, + "loss": 0.3884, + "num_input_tokens_seen": 5872080, + "step": 17440 + }, + { + "epoch": 13.481452859350851, + "grad_norm": 0.7001942992210388, + "learning_rate": 1.4511004337839352e-05, + "loss": 0.4757, + "num_input_tokens_seen": 5873968, + "step": 17445 + }, + { + "epoch": 13.48531684698609, + "grad_norm": 0.9950211048126221, + "learning_rate": 1.449570259282112e-05, + "loss": 0.4118, + "num_input_tokens_seen": 5875344, + "step": 17450 + }, + { + "epoch": 13.48918083462133, + "grad_norm": 1.748724102973938, + "learning_rate": 1.4480405625220261e-05, + "loss": 0.6636, + "num_input_tokens_seen": 5877104, + "step": 17455 + }, + { + "epoch": 13.493044822256568, + "grad_norm": 1.1648329496383667, + "learning_rate": 1.4465113441993918e-05, + "loss": 0.4866, + "num_input_tokens_seen": 5878800, + "step": 17460 + }, + { + "epoch": 13.496908809891808, + "grad_norm": 0.7372883558273315, + "learning_rate": 1.4449826050097065e-05, + "loss": 0.3975, + "num_input_tokens_seen": 5880912, + "step": 17465 + }, + { + "epoch": 13.500772797527048, + "grad_norm": 1.2158008813858032, + "learning_rate": 1.443454345648252e-05, + "loss": 0.4145, + "num_input_tokens_seen": 5882640, + "step": 17470 + }, + { + "epoch": 13.504636785162287, + "grad_norm": 0.7798665165901184, + "learning_rate": 1.4419265668100868e-05, + "loss": 0.4722, + "num_input_tokens_seen": 5884336, + "step": 17475 + }, + { + "epoch": 13.508500772797527, + "grad_norm": 0.9417523741722107, + "learning_rate": 1.440399269190057e-05, + "loss": 0.3994, + "num_input_tokens_seen": 5885936, + "step": 17480 + }, + { + "epoch": 13.512364760432767, + "grad_norm": 0.9762256741523743, + "learning_rate": 1.4388724534827852e-05, + "loss": 0.5486, + "num_input_tokens_seen": 5887376, + "step": 17485 + }, + { + "epoch": 13.516228748068006, + "grad_norm": 1.1350575685501099, + "learning_rate": 1.4373461203826767e-05, + "loss": 0.4696, + "num_input_tokens_seen": 5889168, + "step": 17490 + }, + { + "epoch": 13.520092735703246, + "grad_norm": 0.8478720784187317, + "learning_rate": 1.4358202705839163e-05, + "loss": 0.544, + "num_input_tokens_seen": 5890832, + "step": 17495 + }, + { + "epoch": 13.523956723338486, + "grad_norm": 0.8264853954315186, + "learning_rate": 1.4342949047804688e-05, + "loss": 0.505, + "num_input_tokens_seen": 5892400, + "step": 17500 + }, + { + "epoch": 13.527820710973725, + "grad_norm": 1.6753758192062378, + "learning_rate": 1.4327700236660824e-05, + "loss": 0.5656, + "num_input_tokens_seen": 5894160, + "step": 17505 + }, + { + "epoch": 13.531684698608965, + "grad_norm": 1.3359450101852417, + "learning_rate": 1.4312456279342801e-05, + "loss": 0.4221, + "num_input_tokens_seen": 5895760, + "step": 17510 + }, + { + "epoch": 13.535548686244205, + "grad_norm": 0.8970622420310974, + "learning_rate": 1.4297217182783661e-05, + "loss": 0.4951, + "num_input_tokens_seen": 5897392, + "step": 17515 + }, + { + "epoch": 13.539412673879443, + "grad_norm": 0.9369198083877563, + "learning_rate": 1.4281982953914252e-05, + "loss": 0.5207, + "num_input_tokens_seen": 5898992, + "step": 17520 + }, + { + "epoch": 13.543276661514684, + "grad_norm": 1.1263132095336914, + "learning_rate": 1.4266753599663179e-05, + "loss": 0.535, + "num_input_tokens_seen": 5900912, + "step": 17525 + }, + { + "epoch": 13.547140649149922, + "grad_norm": 1.049924612045288, + "learning_rate": 1.4251529126956852e-05, + "loss": 0.5254, + "num_input_tokens_seen": 5902640, + "step": 17530 + }, + { + "epoch": 13.551004636785162, + "grad_norm": 0.9030024409294128, + "learning_rate": 1.423630954271944e-05, + "loss": 0.4665, + "num_input_tokens_seen": 5904496, + "step": 17535 + }, + { + "epoch": 13.554868624420402, + "grad_norm": 2.5749711990356445, + "learning_rate": 1.4221094853872915e-05, + "loss": 0.469, + "num_input_tokens_seen": 5906224, + "step": 17540 + }, + { + "epoch": 13.55873261205564, + "grad_norm": 1.0888874530792236, + "learning_rate": 1.4205885067337005e-05, + "loss": 0.503, + "num_input_tokens_seen": 5907952, + "step": 17545 + }, + { + "epoch": 13.562596599690881, + "grad_norm": 1.5909305810928345, + "learning_rate": 1.4190680190029209e-05, + "loss": 0.4936, + "num_input_tokens_seen": 5909840, + "step": 17550 + }, + { + "epoch": 13.566460587326121, + "grad_norm": 1.10933518409729, + "learning_rate": 1.4175480228864788e-05, + "loss": 0.5054, + "num_input_tokens_seen": 5911600, + "step": 17555 + }, + { + "epoch": 13.57032457496136, + "grad_norm": 0.808907687664032, + "learning_rate": 1.4160285190756795e-05, + "loss": 0.3601, + "num_input_tokens_seen": 5913200, + "step": 17560 + }, + { + "epoch": 13.5741885625966, + "grad_norm": 0.9742183685302734, + "learning_rate": 1.4145095082616012e-05, + "loss": 0.6156, + "num_input_tokens_seen": 5915216, + "step": 17565 + }, + { + "epoch": 13.578052550231838, + "grad_norm": 0.8169615864753723, + "learning_rate": 1.4129909911350981e-05, + "loss": 0.4193, + "num_input_tokens_seen": 5916752, + "step": 17570 + }, + { + "epoch": 13.581916537867079, + "grad_norm": 0.7339168190956116, + "learning_rate": 1.4114729683868033e-05, + "loss": 0.4162, + "num_input_tokens_seen": 5918288, + "step": 17575 + }, + { + "epoch": 13.585780525502319, + "grad_norm": 1.670540452003479, + "learning_rate": 1.4099554407071214e-05, + "loss": 0.5202, + "num_input_tokens_seen": 5920016, + "step": 17580 + }, + { + "epoch": 13.589644513137557, + "grad_norm": 1.5331319570541382, + "learning_rate": 1.4084384087862331e-05, + "loss": 0.4318, + "num_input_tokens_seen": 5921744, + "step": 17585 + }, + { + "epoch": 13.593508500772797, + "grad_norm": 1.0340778827667236, + "learning_rate": 1.4069218733140938e-05, + "loss": 0.4264, + "num_input_tokens_seen": 5923568, + "step": 17590 + }, + { + "epoch": 13.597372488408038, + "grad_norm": 0.981850266456604, + "learning_rate": 1.405405834980431e-05, + "loss": 0.3921, + "num_input_tokens_seen": 5925360, + "step": 17595 + }, + { + "epoch": 13.601236476043276, + "grad_norm": 0.7784262895584106, + "learning_rate": 1.4038902944747514e-05, + "loss": 0.3033, + "num_input_tokens_seen": 5926992, + "step": 17600 + }, + { + "epoch": 13.605100463678516, + "grad_norm": 1.2133136987686157, + "learning_rate": 1.4023752524863293e-05, + "loss": 0.4917, + "num_input_tokens_seen": 5928688, + "step": 17605 + }, + { + "epoch": 13.608964451313756, + "grad_norm": 1.109610676765442, + "learning_rate": 1.4008607097042165e-05, + "loss": 0.3499, + "num_input_tokens_seen": 5930448, + "step": 17610 + }, + { + "epoch": 13.612828438948995, + "grad_norm": 0.8646442890167236, + "learning_rate": 1.3993466668172353e-05, + "loss": 0.4129, + "num_input_tokens_seen": 5932144, + "step": 17615 + }, + { + "epoch": 13.616692426584235, + "grad_norm": 1.4073898792266846, + "learning_rate": 1.3978331245139817e-05, + "loss": 0.6785, + "num_input_tokens_seen": 5933520, + "step": 17620 + }, + { + "epoch": 13.620556414219475, + "grad_norm": 2.5873148441314697, + "learning_rate": 1.3963200834828238e-05, + "loss": 0.4604, + "num_input_tokens_seen": 5935088, + "step": 17625 + }, + { + "epoch": 13.624420401854714, + "grad_norm": 1.2251821756362915, + "learning_rate": 1.3948075444119013e-05, + "loss": 0.4094, + "num_input_tokens_seen": 5936560, + "step": 17630 + }, + { + "epoch": 13.628284389489954, + "grad_norm": 0.5826811790466309, + "learning_rate": 1.3932955079891257e-05, + "loss": 0.4136, + "num_input_tokens_seen": 5938192, + "step": 17635 + }, + { + "epoch": 13.632148377125194, + "grad_norm": 1.0630645751953125, + "learning_rate": 1.3917839749021805e-05, + "loss": 0.496, + "num_input_tokens_seen": 5940048, + "step": 17640 + }, + { + "epoch": 13.636012364760433, + "grad_norm": 0.6567503213882446, + "learning_rate": 1.3902729458385216e-05, + "loss": 0.3407, + "num_input_tokens_seen": 5941744, + "step": 17645 + }, + { + "epoch": 13.639876352395673, + "grad_norm": 1.1714764833450317, + "learning_rate": 1.3887624214853729e-05, + "loss": 0.5145, + "num_input_tokens_seen": 5943568, + "step": 17650 + }, + { + "epoch": 13.643740340030911, + "grad_norm": 0.8236311674118042, + "learning_rate": 1.3872524025297298e-05, + "loss": 0.3286, + "num_input_tokens_seen": 5945328, + "step": 17655 + }, + { + "epoch": 13.647604327666151, + "grad_norm": 1.2523810863494873, + "learning_rate": 1.3857428896583579e-05, + "loss": 0.4214, + "num_input_tokens_seen": 5946704, + "step": 17660 + }, + { + "epoch": 13.651468315301392, + "grad_norm": 1.1793568134307861, + "learning_rate": 1.3842338835577928e-05, + "loss": 0.325, + "num_input_tokens_seen": 5948464, + "step": 17665 + }, + { + "epoch": 13.65533230293663, + "grad_norm": 1.1129891872406006, + "learning_rate": 1.3827253849143388e-05, + "loss": 0.3969, + "num_input_tokens_seen": 5950064, + "step": 17670 + }, + { + "epoch": 13.65919629057187, + "grad_norm": 0.5157621502876282, + "learning_rate": 1.3812173944140705e-05, + "loss": 0.4106, + "num_input_tokens_seen": 5951600, + "step": 17675 + }, + { + "epoch": 13.66306027820711, + "grad_norm": 0.958209753036499, + "learning_rate": 1.3797099127428325e-05, + "loss": 0.5207, + "num_input_tokens_seen": 5953072, + "step": 17680 + }, + { + "epoch": 13.666924265842349, + "grad_norm": 0.7412235736846924, + "learning_rate": 1.3782029405862354e-05, + "loss": 0.4977, + "num_input_tokens_seen": 5954704, + "step": 17685 + }, + { + "epoch": 13.670788253477589, + "grad_norm": 1.3102761507034302, + "learning_rate": 1.3766964786296587e-05, + "loss": 0.4308, + "num_input_tokens_seen": 5956464, + "step": 17690 + }, + { + "epoch": 13.674652241112828, + "grad_norm": 1.071866512298584, + "learning_rate": 1.3751905275582513e-05, + "loss": 0.4901, + "num_input_tokens_seen": 5957936, + "step": 17695 + }, + { + "epoch": 13.678516228748068, + "grad_norm": 0.6494676470756531, + "learning_rate": 1.373685088056928e-05, + "loss": 0.3772, + "num_input_tokens_seen": 5959536, + "step": 17700 + }, + { + "epoch": 13.682380216383308, + "grad_norm": 2.219801425933838, + "learning_rate": 1.3721801608103707e-05, + "loss": 0.4521, + "num_input_tokens_seen": 5961232, + "step": 17705 + }, + { + "epoch": 13.686244204018546, + "grad_norm": 0.8404219150543213, + "learning_rate": 1.3706757465030317e-05, + "loss": 0.4344, + "num_input_tokens_seen": 5963120, + "step": 17710 + }, + { + "epoch": 13.690108191653787, + "grad_norm": 0.7713928818702698, + "learning_rate": 1.3691718458191255e-05, + "loss": 0.4794, + "num_input_tokens_seen": 5964688, + "step": 17715 + }, + { + "epoch": 13.693972179289027, + "grad_norm": 0.7883641123771667, + "learning_rate": 1.3676684594426367e-05, + "loss": 0.4409, + "num_input_tokens_seen": 5966256, + "step": 17720 + }, + { + "epoch": 13.697836166924265, + "grad_norm": 0.6151412725448608, + "learning_rate": 1.366165588057314e-05, + "loss": 0.3476, + "num_input_tokens_seen": 5967728, + "step": 17725 + }, + { + "epoch": 13.701700154559505, + "grad_norm": 1.4078949689865112, + "learning_rate": 1.3646632323466724e-05, + "loss": 0.7056, + "num_input_tokens_seen": 5969456, + "step": 17730 + }, + { + "epoch": 13.705564142194746, + "grad_norm": 2.0726003646850586, + "learning_rate": 1.3631613929939918e-05, + "loss": 0.7315, + "num_input_tokens_seen": 5970928, + "step": 17735 + }, + { + "epoch": 13.709428129829984, + "grad_norm": 1.1175040006637573, + "learning_rate": 1.3616600706823168e-05, + "loss": 0.3896, + "num_input_tokens_seen": 5972816, + "step": 17740 + }, + { + "epoch": 13.713292117465224, + "grad_norm": 1.1235419511795044, + "learning_rate": 1.3601592660944601e-05, + "loss": 0.4363, + "num_input_tokens_seen": 5974384, + "step": 17745 + }, + { + "epoch": 13.717156105100464, + "grad_norm": 0.7609992027282715, + "learning_rate": 1.358658979912995e-05, + "loss": 0.4215, + "num_input_tokens_seen": 5975984, + "step": 17750 + }, + { + "epoch": 13.721020092735703, + "grad_norm": 1.0430344343185425, + "learning_rate": 1.3571592128202606e-05, + "loss": 0.3801, + "num_input_tokens_seen": 5977712, + "step": 17755 + }, + { + "epoch": 13.724884080370943, + "grad_norm": 0.98097825050354, + "learning_rate": 1.3556599654983613e-05, + "loss": 0.6798, + "num_input_tokens_seen": 5979408, + "step": 17760 + }, + { + "epoch": 13.728748068006183, + "grad_norm": 2.079603910446167, + "learning_rate": 1.3541612386291627e-05, + "loss": 0.5785, + "num_input_tokens_seen": 5981296, + "step": 17765 + }, + { + "epoch": 13.732612055641422, + "grad_norm": 1.2040987014770508, + "learning_rate": 1.3526630328942949e-05, + "loss": 0.512, + "num_input_tokens_seen": 5982992, + "step": 17770 + }, + { + "epoch": 13.736476043276662, + "grad_norm": 0.9182040691375732, + "learning_rate": 1.3511653489751502e-05, + "loss": 0.378, + "num_input_tokens_seen": 5984784, + "step": 17775 + }, + { + "epoch": 13.7403400309119, + "grad_norm": 0.8253678679466248, + "learning_rate": 1.3496681875528855e-05, + "loss": 0.4605, + "num_input_tokens_seen": 5986480, + "step": 17780 + }, + { + "epoch": 13.74420401854714, + "grad_norm": 0.87001633644104, + "learning_rate": 1.3481715493084185e-05, + "loss": 0.325, + "num_input_tokens_seen": 5987984, + "step": 17785 + }, + { + "epoch": 13.74806800618238, + "grad_norm": 1.62651526927948, + "learning_rate": 1.3466754349224287e-05, + "loss": 0.6157, + "num_input_tokens_seen": 5989776, + "step": 17790 + }, + { + "epoch": 13.75193199381762, + "grad_norm": 1.214057445526123, + "learning_rate": 1.3451798450753569e-05, + "loss": 0.3345, + "num_input_tokens_seen": 5991216, + "step": 17795 + }, + { + "epoch": 13.75579598145286, + "grad_norm": 1.4502732753753662, + "learning_rate": 1.343684780447409e-05, + "loss": 0.4661, + "num_input_tokens_seen": 5992784, + "step": 17800 + }, + { + "epoch": 13.7596599690881, + "grad_norm": 1.075416088104248, + "learning_rate": 1.3421902417185473e-05, + "loss": 0.3883, + "num_input_tokens_seen": 5994480, + "step": 17805 + }, + { + "epoch": 13.763523956723338, + "grad_norm": 0.9406943321228027, + "learning_rate": 1.3406962295684977e-05, + "loss": 0.36, + "num_input_tokens_seen": 5996336, + "step": 17810 + }, + { + "epoch": 13.767387944358578, + "grad_norm": 0.6393059492111206, + "learning_rate": 1.3392027446767449e-05, + "loss": 0.3612, + "num_input_tokens_seen": 5997936, + "step": 17815 + }, + { + "epoch": 13.771251931993817, + "grad_norm": 1.0397279262542725, + "learning_rate": 1.3377097877225363e-05, + "loss": 0.4444, + "num_input_tokens_seen": 5999920, + "step": 17820 + }, + { + "epoch": 13.775115919629057, + "grad_norm": 1.0192394256591797, + "learning_rate": 1.3362173593848774e-05, + "loss": 0.4055, + "num_input_tokens_seen": 6001872, + "step": 17825 + }, + { + "epoch": 13.778979907264297, + "grad_norm": 1.5186998844146729, + "learning_rate": 1.3347254603425327e-05, + "loss": 0.4065, + "num_input_tokens_seen": 6003376, + "step": 17830 + }, + { + "epoch": 13.782843894899536, + "grad_norm": 0.9261975288391113, + "learning_rate": 1.3332340912740263e-05, + "loss": 0.411, + "num_input_tokens_seen": 6004752, + "step": 17835 + }, + { + "epoch": 13.786707882534776, + "grad_norm": 0.9382060766220093, + "learning_rate": 1.3317432528576435e-05, + "loss": 0.377, + "num_input_tokens_seen": 6006480, + "step": 17840 + }, + { + "epoch": 13.790571870170016, + "grad_norm": 1.0788642168045044, + "learning_rate": 1.3302529457714258e-05, + "loss": 0.7231, + "num_input_tokens_seen": 6008208, + "step": 17845 + }, + { + "epoch": 13.794435857805254, + "grad_norm": 1.0893783569335938, + "learning_rate": 1.3287631706931727e-05, + "loss": 0.736, + "num_input_tokens_seen": 6010192, + "step": 17850 + }, + { + "epoch": 13.798299845440495, + "grad_norm": 1.4192616939544678, + "learning_rate": 1.3272739283004449e-05, + "loss": 0.3654, + "num_input_tokens_seen": 6011984, + "step": 17855 + }, + { + "epoch": 13.802163833075735, + "grad_norm": 0.63017737865448, + "learning_rate": 1.3257852192705578e-05, + "loss": 0.5232, + "num_input_tokens_seen": 6013584, + "step": 17860 + }, + { + "epoch": 13.806027820710973, + "grad_norm": 1.4672414064407349, + "learning_rate": 1.3242970442805846e-05, + "loss": 0.4433, + "num_input_tokens_seen": 6015248, + "step": 17865 + }, + { + "epoch": 13.809891808346213, + "grad_norm": 0.7767432332038879, + "learning_rate": 1.3228094040073574e-05, + "loss": 0.4162, + "num_input_tokens_seen": 6016848, + "step": 17870 + }, + { + "epoch": 13.813755795981454, + "grad_norm": 0.7056379914283752, + "learning_rate": 1.3213222991274616e-05, + "loss": 0.3856, + "num_input_tokens_seen": 6018576, + "step": 17875 + }, + { + "epoch": 13.817619783616692, + "grad_norm": 1.188812255859375, + "learning_rate": 1.3198357303172443e-05, + "loss": 0.505, + "num_input_tokens_seen": 6020496, + "step": 17880 + }, + { + "epoch": 13.821483771251932, + "grad_norm": 1.1888601779937744, + "learning_rate": 1.3183496982528031e-05, + "loss": 0.3531, + "num_input_tokens_seen": 6022160, + "step": 17885 + }, + { + "epoch": 13.825347758887172, + "grad_norm": 1.166250228881836, + "learning_rate": 1.3168642036099973e-05, + "loss": 0.4341, + "num_input_tokens_seen": 6023920, + "step": 17890 + }, + { + "epoch": 13.829211746522411, + "grad_norm": 0.9497909545898438, + "learning_rate": 1.3153792470644371e-05, + "loss": 0.5722, + "num_input_tokens_seen": 6025520, + "step": 17895 + }, + { + "epoch": 13.833075734157651, + "grad_norm": 1.2214404344558716, + "learning_rate": 1.3138948292914896e-05, + "loss": 0.3814, + "num_input_tokens_seen": 6027184, + "step": 17900 + }, + { + "epoch": 13.83693972179289, + "grad_norm": 1.0474265813827515, + "learning_rate": 1.3124109509662772e-05, + "loss": 0.4487, + "num_input_tokens_seen": 6028816, + "step": 17905 + }, + { + "epoch": 13.84080370942813, + "grad_norm": 0.7210708260536194, + "learning_rate": 1.3109276127636763e-05, + "loss": 0.3434, + "num_input_tokens_seen": 6030416, + "step": 17910 + }, + { + "epoch": 13.84466769706337, + "grad_norm": 1.0325512886047363, + "learning_rate": 1.3094448153583171e-05, + "loss": 0.3549, + "num_input_tokens_seen": 6032176, + "step": 17915 + }, + { + "epoch": 13.848531684698608, + "grad_norm": 1.3064517974853516, + "learning_rate": 1.3079625594245865e-05, + "loss": 0.5357, + "num_input_tokens_seen": 6034160, + "step": 17920 + }, + { + "epoch": 13.852395672333849, + "grad_norm": 1.0630073547363281, + "learning_rate": 1.3064808456366228e-05, + "loss": 0.3606, + "num_input_tokens_seen": 6036080, + "step": 17925 + }, + { + "epoch": 13.856259659969089, + "grad_norm": 0.8764272928237915, + "learning_rate": 1.3049996746683188e-05, + "loss": 0.346, + "num_input_tokens_seen": 6037840, + "step": 17930 + }, + { + "epoch": 13.860123647604327, + "grad_norm": 0.7328134775161743, + "learning_rate": 1.3035190471933193e-05, + "loss": 0.3511, + "num_input_tokens_seen": 6039472, + "step": 17935 + }, + { + "epoch": 13.863987635239567, + "grad_norm": 0.9408679604530334, + "learning_rate": 1.3020389638850223e-05, + "loss": 0.3788, + "num_input_tokens_seen": 6041328, + "step": 17940 + }, + { + "epoch": 13.867851622874806, + "grad_norm": 1.6568267345428467, + "learning_rate": 1.300559425416579e-05, + "loss": 0.5968, + "num_input_tokens_seen": 6042832, + "step": 17945 + }, + { + "epoch": 13.871715610510046, + "grad_norm": 0.65269535779953, + "learning_rate": 1.2990804324608913e-05, + "loss": 0.3876, + "num_input_tokens_seen": 6044432, + "step": 17950 + }, + { + "epoch": 13.875579598145286, + "grad_norm": 0.7437227368354797, + "learning_rate": 1.2976019856906146e-05, + "loss": 0.4015, + "num_input_tokens_seen": 6046064, + "step": 17955 + }, + { + "epoch": 13.879443585780525, + "grad_norm": 1.3418443202972412, + "learning_rate": 1.296124085778157e-05, + "loss": 0.5142, + "num_input_tokens_seen": 6047824, + "step": 17960 + }, + { + "epoch": 13.883307573415765, + "grad_norm": 0.9721065759658813, + "learning_rate": 1.294646733395675e-05, + "loss": 0.6899, + "num_input_tokens_seen": 6049488, + "step": 17965 + }, + { + "epoch": 13.887171561051005, + "grad_norm": 1.6689496040344238, + "learning_rate": 1.2931699292150767e-05, + "loss": 0.4502, + "num_input_tokens_seen": 6051312, + "step": 17970 + }, + { + "epoch": 13.891035548686244, + "grad_norm": 1.2289295196533203, + "learning_rate": 1.2916936739080226e-05, + "loss": 0.4882, + "num_input_tokens_seen": 6053136, + "step": 17975 + }, + { + "epoch": 13.894899536321484, + "grad_norm": 1.6984257698059082, + "learning_rate": 1.2902179681459215e-05, + "loss": 0.6253, + "num_input_tokens_seen": 6054640, + "step": 17980 + }, + { + "epoch": 13.898763523956724, + "grad_norm": 1.012587070465088, + "learning_rate": 1.2887428125999329e-05, + "loss": 0.3316, + "num_input_tokens_seen": 6056240, + "step": 17985 + }, + { + "epoch": 13.902627511591962, + "grad_norm": 1.108728289604187, + "learning_rate": 1.2872682079409678e-05, + "loss": 0.4992, + "num_input_tokens_seen": 6057904, + "step": 17990 + }, + { + "epoch": 13.906491499227203, + "grad_norm": 1.4040029048919678, + "learning_rate": 1.2857941548396846e-05, + "loss": 0.479, + "num_input_tokens_seen": 6059504, + "step": 17995 + }, + { + "epoch": 13.910355486862443, + "grad_norm": 0.6203539967536926, + "learning_rate": 1.2843206539664903e-05, + "loss": 0.7103, + "num_input_tokens_seen": 6061136, + "step": 18000 + }, + { + "epoch": 13.914219474497681, + "grad_norm": 0.8756282329559326, + "learning_rate": 1.2828477059915443e-05, + "loss": 0.4885, + "num_input_tokens_seen": 6062576, + "step": 18005 + }, + { + "epoch": 13.918083462132921, + "grad_norm": 1.132379412651062, + "learning_rate": 1.2813753115847504e-05, + "loss": 0.4132, + "num_input_tokens_seen": 6063920, + "step": 18010 + }, + { + "epoch": 13.921947449768162, + "grad_norm": 0.9973307847976685, + "learning_rate": 1.2799034714157632e-05, + "loss": 0.493, + "num_input_tokens_seen": 6065232, + "step": 18015 + }, + { + "epoch": 13.9258114374034, + "grad_norm": 0.7363815307617188, + "learning_rate": 1.2784321861539828e-05, + "loss": 0.3808, + "num_input_tokens_seen": 6067024, + "step": 18020 + }, + { + "epoch": 13.92967542503864, + "grad_norm": 0.7645331621170044, + "learning_rate": 1.2769614564685611e-05, + "loss": 0.4012, + "num_input_tokens_seen": 6068592, + "step": 18025 + }, + { + "epoch": 13.933539412673879, + "grad_norm": 2.0086605548858643, + "learning_rate": 1.2754912830283933e-05, + "loss": 0.4891, + "num_input_tokens_seen": 6070320, + "step": 18030 + }, + { + "epoch": 13.937403400309119, + "grad_norm": 0.9162660837173462, + "learning_rate": 1.2740216665021231e-05, + "loss": 0.4287, + "num_input_tokens_seen": 6071856, + "step": 18035 + }, + { + "epoch": 13.94126738794436, + "grad_norm": 1.1238036155700684, + "learning_rate": 1.2725526075581404e-05, + "loss": 0.479, + "num_input_tokens_seen": 6073648, + "step": 18040 + }, + { + "epoch": 13.945131375579598, + "grad_norm": 1.3576602935791016, + "learning_rate": 1.2710841068645834e-05, + "loss": 0.3773, + "num_input_tokens_seen": 6075312, + "step": 18045 + }, + { + "epoch": 13.948995363214838, + "grad_norm": 0.8166390657424927, + "learning_rate": 1.2696161650893346e-05, + "loss": 0.5654, + "num_input_tokens_seen": 6076976, + "step": 18050 + }, + { + "epoch": 13.952859350850078, + "grad_norm": 0.6953926086425781, + "learning_rate": 1.2681487829000214e-05, + "loss": 0.3913, + "num_input_tokens_seen": 6078800, + "step": 18055 + }, + { + "epoch": 13.956723338485316, + "grad_norm": 0.7167173624038696, + "learning_rate": 1.2666819609640196e-05, + "loss": 0.4418, + "num_input_tokens_seen": 6080336, + "step": 18060 + }, + { + "epoch": 13.960587326120557, + "grad_norm": 0.6684638261795044, + "learning_rate": 1.2652156999484482e-05, + "loss": 0.4072, + "num_input_tokens_seen": 6081968, + "step": 18065 + }, + { + "epoch": 13.964451313755795, + "grad_norm": 0.8237266540527344, + "learning_rate": 1.2637500005201713e-05, + "loss": 0.3732, + "num_input_tokens_seen": 6083792, + "step": 18070 + }, + { + "epoch": 13.968315301391035, + "grad_norm": 1.184685230255127, + "learning_rate": 1.2622848633457979e-05, + "loss": 0.4525, + "num_input_tokens_seen": 6085584, + "step": 18075 + }, + { + "epoch": 13.972179289026275, + "grad_norm": 0.8137785196304321, + "learning_rate": 1.26082028909168e-05, + "loss": 0.3441, + "num_input_tokens_seen": 6087216, + "step": 18080 + }, + { + "epoch": 13.976043276661514, + "grad_norm": 0.9840213656425476, + "learning_rate": 1.2593562784239166e-05, + "loss": 0.3633, + "num_input_tokens_seen": 6089200, + "step": 18085 + }, + { + "epoch": 13.979907264296754, + "grad_norm": 1.3678056001663208, + "learning_rate": 1.2578928320083472e-05, + "loss": 0.3964, + "num_input_tokens_seen": 6090864, + "step": 18090 + }, + { + "epoch": 13.983771251931994, + "grad_norm": 1.803592324256897, + "learning_rate": 1.256429950510557e-05, + "loss": 0.6359, + "num_input_tokens_seen": 6092912, + "step": 18095 + }, + { + "epoch": 13.987635239567233, + "grad_norm": 0.9281842112541199, + "learning_rate": 1.2549676345958727e-05, + "loss": 0.398, + "num_input_tokens_seen": 6094736, + "step": 18100 + }, + { + "epoch": 13.991499227202473, + "grad_norm": 1.0555342435836792, + "learning_rate": 1.2535058849293646e-05, + "loss": 0.4637, + "num_input_tokens_seen": 6096496, + "step": 18105 + }, + { + "epoch": 13.995363214837713, + "grad_norm": 1.0137866735458374, + "learning_rate": 1.252044702175845e-05, + "loss": 0.4426, + "num_input_tokens_seen": 6098320, + "step": 18110 + }, + { + "epoch": 13.999227202472952, + "grad_norm": 0.8358750939369202, + "learning_rate": 1.2505840869998686e-05, + "loss": 0.561, + "num_input_tokens_seen": 6100144, + "step": 18115 + }, + { + "epoch": 14.0, + "eval_loss": 0.44765037298202515, + "eval_runtime": 6.226, + "eval_samples_per_second": 92.355, + "eval_steps_per_second": 23.129, + "num_input_tokens_seen": 6100288, + "step": 18116 + }, + { + "epoch": 14.003091190108192, + "grad_norm": 0.8023648858070374, + "learning_rate": 1.2491240400657309e-05, + "loss": 0.4363, + "num_input_tokens_seen": 6101536, + "step": 18120 + }, + { + "epoch": 14.006955177743432, + "grad_norm": 1.6105399131774902, + "learning_rate": 1.2476645620374708e-05, + "loss": 0.6618, + "num_input_tokens_seen": 6103072, + "step": 18125 + }, + { + "epoch": 14.01081916537867, + "grad_norm": 0.8783650398254395, + "learning_rate": 1.2462056535788689e-05, + "loss": 0.5782, + "num_input_tokens_seen": 6104672, + "step": 18130 + }, + { + "epoch": 14.01468315301391, + "grad_norm": 1.6162853240966797, + "learning_rate": 1.2447473153534444e-05, + "loss": 0.4225, + "num_input_tokens_seen": 6106208, + "step": 18135 + }, + { + "epoch": 14.018547140649149, + "grad_norm": 1.078931450843811, + "learning_rate": 1.2432895480244583e-05, + "loss": 0.4907, + "num_input_tokens_seen": 6107680, + "step": 18140 + }, + { + "epoch": 14.02241112828439, + "grad_norm": 1.5977040529251099, + "learning_rate": 1.2418323522549122e-05, + "loss": 0.4866, + "num_input_tokens_seen": 6109184, + "step": 18145 + }, + { + "epoch": 14.02627511591963, + "grad_norm": 1.1871243715286255, + "learning_rate": 1.2403757287075468e-05, + "loss": 0.3936, + "num_input_tokens_seen": 6111008, + "step": 18150 + }, + { + "epoch": 14.030139103554868, + "grad_norm": 1.1880214214324951, + "learning_rate": 1.2389196780448425e-05, + "loss": 0.4965, + "num_input_tokens_seen": 6112800, + "step": 18155 + }, + { + "epoch": 14.034003091190108, + "grad_norm": 1.1713966131210327, + "learning_rate": 1.2374642009290213e-05, + "loss": 0.3966, + "num_input_tokens_seen": 6114432, + "step": 18160 + }, + { + "epoch": 14.037867078825348, + "grad_norm": 1.2738338708877563, + "learning_rate": 1.2360092980220434e-05, + "loss": 0.3999, + "num_input_tokens_seen": 6116128, + "step": 18165 + }, + { + "epoch": 14.041731066460587, + "grad_norm": 0.9651294350624084, + "learning_rate": 1.2345549699856065e-05, + "loss": 0.4148, + "num_input_tokens_seen": 6117824, + "step": 18170 + }, + { + "epoch": 14.045595054095827, + "grad_norm": 0.8436691761016846, + "learning_rate": 1.2331012174811477e-05, + "loss": 0.709, + "num_input_tokens_seen": 6119488, + "step": 18175 + }, + { + "epoch": 14.049459041731067, + "grad_norm": 0.9639518857002258, + "learning_rate": 1.2316480411698423e-05, + "loss": 0.3751, + "num_input_tokens_seen": 6121216, + "step": 18180 + }, + { + "epoch": 14.053323029366306, + "grad_norm": 1.0099014043807983, + "learning_rate": 1.2301954417126035e-05, + "loss": 0.3727, + "num_input_tokens_seen": 6123168, + "step": 18185 + }, + { + "epoch": 14.057187017001546, + "grad_norm": 1.256992220878601, + "learning_rate": 1.2287434197700817e-05, + "loss": 0.3628, + "num_input_tokens_seen": 6124960, + "step": 18190 + }, + { + "epoch": 14.061051004636786, + "grad_norm": 1.1592105627059937, + "learning_rate": 1.2272919760026666e-05, + "loss": 0.5209, + "num_input_tokens_seen": 6126816, + "step": 18195 + }, + { + "epoch": 14.064914992272024, + "grad_norm": 1.8165122270584106, + "learning_rate": 1.225841111070482e-05, + "loss": 0.4296, + "num_input_tokens_seen": 6128384, + "step": 18200 + }, + { + "epoch": 14.068778979907265, + "grad_norm": 0.9686217904090881, + "learning_rate": 1.2243908256333917e-05, + "loss": 0.3801, + "num_input_tokens_seen": 6129984, + "step": 18205 + }, + { + "epoch": 14.072642967542503, + "grad_norm": 0.8710289001464844, + "learning_rate": 1.222941120350993e-05, + "loss": 0.5836, + "num_input_tokens_seen": 6131520, + "step": 18210 + }, + { + "epoch": 14.076506955177743, + "grad_norm": 0.8693529963493347, + "learning_rate": 1.2214919958826206e-05, + "loss": 0.5678, + "num_input_tokens_seen": 6133632, + "step": 18215 + }, + { + "epoch": 14.080370942812984, + "grad_norm": 0.9735471606254578, + "learning_rate": 1.2200434528873456e-05, + "loss": 0.4584, + "num_input_tokens_seen": 6135328, + "step": 18220 + }, + { + "epoch": 14.084234930448222, + "grad_norm": 1.2008250951766968, + "learning_rate": 1.2185954920239725e-05, + "loss": 0.437, + "num_input_tokens_seen": 6136992, + "step": 18225 + }, + { + "epoch": 14.088098918083462, + "grad_norm": 1.3932315111160278, + "learning_rate": 1.2171481139510446e-05, + "loss": 0.4562, + "num_input_tokens_seen": 6138688, + "step": 18230 + }, + { + "epoch": 14.091962905718702, + "grad_norm": 1.8770512342453003, + "learning_rate": 1.2157013193268371e-05, + "loss": 0.3871, + "num_input_tokens_seen": 6140320, + "step": 18235 + }, + { + "epoch": 14.09582689335394, + "grad_norm": 0.7949201464653015, + "learning_rate": 1.2142551088093599e-05, + "loss": 0.4295, + "num_input_tokens_seen": 6141984, + "step": 18240 + }, + { + "epoch": 14.099690880989181, + "grad_norm": 0.6474002003669739, + "learning_rate": 1.2128094830563605e-05, + "loss": 0.4052, + "num_input_tokens_seen": 6143744, + "step": 18245 + }, + { + "epoch": 14.103554868624421, + "grad_norm": 0.7943708896636963, + "learning_rate": 1.2113644427253165e-05, + "loss": 0.3458, + "num_input_tokens_seen": 6145248, + "step": 18250 + }, + { + "epoch": 14.10741885625966, + "grad_norm": 1.0096027851104736, + "learning_rate": 1.2099199884734416e-05, + "loss": 0.3611, + "num_input_tokens_seen": 6146688, + "step": 18255 + }, + { + "epoch": 14.1112828438949, + "grad_norm": 0.8969514966011047, + "learning_rate": 1.2084761209576808e-05, + "loss": 0.5458, + "num_input_tokens_seen": 6148544, + "step": 18260 + }, + { + "epoch": 14.115146831530138, + "grad_norm": 1.2430285215377808, + "learning_rate": 1.2070328408347159e-05, + "loss": 0.5273, + "num_input_tokens_seen": 6150368, + "step": 18265 + }, + { + "epoch": 14.119010819165378, + "grad_norm": 1.1626566648483276, + "learning_rate": 1.205590148760958e-05, + "loss": 0.5732, + "num_input_tokens_seen": 6152160, + "step": 18270 + }, + { + "epoch": 14.122874806800619, + "grad_norm": 0.853899359703064, + "learning_rate": 1.2041480453925527e-05, + "loss": 0.3899, + "num_input_tokens_seen": 6153632, + "step": 18275 + }, + { + "epoch": 14.126738794435857, + "grad_norm": 0.8481065630912781, + "learning_rate": 1.2027065313853759e-05, + "loss": 0.4515, + "num_input_tokens_seen": 6155168, + "step": 18280 + }, + { + "epoch": 14.130602782071097, + "grad_norm": 0.8647884726524353, + "learning_rate": 1.2012656073950385e-05, + "loss": 0.3456, + "num_input_tokens_seen": 6156896, + "step": 18285 + }, + { + "epoch": 14.134466769706338, + "grad_norm": 0.7665649056434631, + "learning_rate": 1.1998252740768809e-05, + "loss": 0.4165, + "num_input_tokens_seen": 6158400, + "step": 18290 + }, + { + "epoch": 14.138330757341576, + "grad_norm": 1.525496244430542, + "learning_rate": 1.198385532085974e-05, + "loss": 0.3776, + "num_input_tokens_seen": 6160000, + "step": 18295 + }, + { + "epoch": 14.142194744976816, + "grad_norm": 1.750066876411438, + "learning_rate": 1.1969463820771231e-05, + "loss": 0.4191, + "num_input_tokens_seen": 6161920, + "step": 18300 + }, + { + "epoch": 14.146058732612056, + "grad_norm": 1.011940836906433, + "learning_rate": 1.1955078247048614e-05, + "loss": 0.3335, + "num_input_tokens_seen": 6163520, + "step": 18305 + }, + { + "epoch": 14.149922720247295, + "grad_norm": 1.5191904306411743, + "learning_rate": 1.1940698606234535e-05, + "loss": 0.4747, + "num_input_tokens_seen": 6165344, + "step": 18310 + }, + { + "epoch": 14.153786707882535, + "grad_norm": 0.7159601449966431, + "learning_rate": 1.1926324904868938e-05, + "loss": 0.4087, + "num_input_tokens_seen": 6167296, + "step": 18315 + }, + { + "epoch": 14.157650695517773, + "grad_norm": 0.7259432077407837, + "learning_rate": 1.1911957149489058e-05, + "loss": 0.3831, + "num_input_tokens_seen": 6168832, + "step": 18320 + }, + { + "epoch": 14.161514683153014, + "grad_norm": 1.2123980522155762, + "learning_rate": 1.1897595346629459e-05, + "loss": 0.5224, + "num_input_tokens_seen": 6170528, + "step": 18325 + }, + { + "epoch": 14.165378670788254, + "grad_norm": 0.7655565738677979, + "learning_rate": 1.1883239502821954e-05, + "loss": 0.3738, + "num_input_tokens_seen": 6172384, + "step": 18330 + }, + { + "epoch": 14.169242658423492, + "grad_norm": 0.7133057117462158, + "learning_rate": 1.1868889624595686e-05, + "loss": 0.4602, + "num_input_tokens_seen": 6174112, + "step": 18335 + }, + { + "epoch": 14.173106646058732, + "grad_norm": 1.0156805515289307, + "learning_rate": 1.1854545718477054e-05, + "loss": 0.3755, + "num_input_tokens_seen": 6175776, + "step": 18340 + }, + { + "epoch": 14.176970633693973, + "grad_norm": 1.1247799396514893, + "learning_rate": 1.1840207790989754e-05, + "loss": 0.4701, + "num_input_tokens_seen": 6177504, + "step": 18345 + }, + { + "epoch": 14.180834621329211, + "grad_norm": 1.2541275024414062, + "learning_rate": 1.1825875848654766e-05, + "loss": 0.3743, + "num_input_tokens_seen": 6179072, + "step": 18350 + }, + { + "epoch": 14.184698608964451, + "grad_norm": 0.8307157158851624, + "learning_rate": 1.1811549897990335e-05, + "loss": 0.4445, + "num_input_tokens_seen": 6180640, + "step": 18355 + }, + { + "epoch": 14.188562596599692, + "grad_norm": 1.2621872425079346, + "learning_rate": 1.1797229945511983e-05, + "loss": 0.4961, + "num_input_tokens_seen": 6182336, + "step": 18360 + }, + { + "epoch": 14.19242658423493, + "grad_norm": 1.3894596099853516, + "learning_rate": 1.1782915997732522e-05, + "loss": 0.5693, + "num_input_tokens_seen": 6184128, + "step": 18365 + }, + { + "epoch": 14.19629057187017, + "grad_norm": 0.8047972321510315, + "learning_rate": 1.1768608061162028e-05, + "loss": 0.6942, + "num_input_tokens_seen": 6185824, + "step": 18370 + }, + { + "epoch": 14.20015455950541, + "grad_norm": 1.4424245357513428, + "learning_rate": 1.1754306142307827e-05, + "loss": 0.6651, + "num_input_tokens_seen": 6187456, + "step": 18375 + }, + { + "epoch": 14.204018547140649, + "grad_norm": 1.0440351963043213, + "learning_rate": 1.1740010247674518e-05, + "loss": 0.4473, + "num_input_tokens_seen": 6188928, + "step": 18380 + }, + { + "epoch": 14.207882534775889, + "grad_norm": 1.205666422843933, + "learning_rate": 1.172572038376396e-05, + "loss": 0.3709, + "num_input_tokens_seen": 6190496, + "step": 18385 + }, + { + "epoch": 14.211746522411127, + "grad_norm": 0.980274498462677, + "learning_rate": 1.1711436557075268e-05, + "loss": 0.3747, + "num_input_tokens_seen": 6192096, + "step": 18390 + }, + { + "epoch": 14.215610510046368, + "grad_norm": 0.8518127799034119, + "learning_rate": 1.1697158774104802e-05, + "loss": 0.4307, + "num_input_tokens_seen": 6193856, + "step": 18395 + }, + { + "epoch": 14.219474497681608, + "grad_norm": 0.6072325706481934, + "learning_rate": 1.1682887041346195e-05, + "loss": 0.3242, + "num_input_tokens_seen": 6195712, + "step": 18400 + }, + { + "epoch": 14.223338485316846, + "grad_norm": 1.350637674331665, + "learning_rate": 1.1668621365290322e-05, + "loss": 0.4656, + "num_input_tokens_seen": 6197632, + "step": 18405 + }, + { + "epoch": 14.227202472952087, + "grad_norm": 1.4499738216400146, + "learning_rate": 1.165436175242529e-05, + "loss": 0.4427, + "num_input_tokens_seen": 6199264, + "step": 18410 + }, + { + "epoch": 14.231066460587327, + "grad_norm": 0.6877166032791138, + "learning_rate": 1.1640108209236458e-05, + "loss": 0.4757, + "num_input_tokens_seen": 6200992, + "step": 18415 + }, + { + "epoch": 14.234930448222565, + "grad_norm": 1.0496816635131836, + "learning_rate": 1.162586074220642e-05, + "loss": 0.3694, + "num_input_tokens_seen": 6202784, + "step": 18420 + }, + { + "epoch": 14.238794435857805, + "grad_norm": 1.5804002285003662, + "learning_rate": 1.1611619357815012e-05, + "loss": 0.5736, + "num_input_tokens_seen": 6204256, + "step": 18425 + }, + { + "epoch": 14.242658423493046, + "grad_norm": 0.9547789096832275, + "learning_rate": 1.1597384062539293e-05, + "loss": 0.4142, + "num_input_tokens_seen": 6205952, + "step": 18430 + }, + { + "epoch": 14.246522411128284, + "grad_norm": 0.8012205958366394, + "learning_rate": 1.1583154862853573e-05, + "loss": 0.4272, + "num_input_tokens_seen": 6207712, + "step": 18435 + }, + { + "epoch": 14.250386398763524, + "grad_norm": 1.2554547786712646, + "learning_rate": 1.1568931765229365e-05, + "loss": 0.6636, + "num_input_tokens_seen": 6209472, + "step": 18440 + }, + { + "epoch": 14.254250386398763, + "grad_norm": 1.6156198978424072, + "learning_rate": 1.1554714776135437e-05, + "loss": 0.8547, + "num_input_tokens_seen": 6211040, + "step": 18445 + }, + { + "epoch": 14.258114374034003, + "grad_norm": 1.1304223537445068, + "learning_rate": 1.1540503902037744e-05, + "loss": 0.6712, + "num_input_tokens_seen": 6212576, + "step": 18450 + }, + { + "epoch": 14.261978361669243, + "grad_norm": 1.1730715036392212, + "learning_rate": 1.1526299149399486e-05, + "loss": 0.5085, + "num_input_tokens_seen": 6214208, + "step": 18455 + }, + { + "epoch": 14.265842349304481, + "grad_norm": 1.051052451133728, + "learning_rate": 1.1512100524681064e-05, + "loss": 0.3778, + "num_input_tokens_seen": 6215648, + "step": 18460 + }, + { + "epoch": 14.269706336939722, + "grad_norm": 1.1226890087127686, + "learning_rate": 1.149790803434009e-05, + "loss": 0.339, + "num_input_tokens_seen": 6217440, + "step": 18465 + }, + { + "epoch": 14.273570324574962, + "grad_norm": 1.0202078819274902, + "learning_rate": 1.1483721684831414e-05, + "loss": 0.3652, + "num_input_tokens_seen": 6219136, + "step": 18470 + }, + { + "epoch": 14.2774343122102, + "grad_norm": 0.7292530536651611, + "learning_rate": 1.146954148260706e-05, + "loss": 0.3811, + "num_input_tokens_seen": 6220832, + "step": 18475 + }, + { + "epoch": 14.28129829984544, + "grad_norm": 0.6909418106079102, + "learning_rate": 1.145536743411626e-05, + "loss": 0.3877, + "num_input_tokens_seen": 6222304, + "step": 18480 + }, + { + "epoch": 14.28516228748068, + "grad_norm": 0.5071355700492859, + "learning_rate": 1.1441199545805479e-05, + "loss": 0.4672, + "num_input_tokens_seen": 6224128, + "step": 18485 + }, + { + "epoch": 14.28902627511592, + "grad_norm": 1.2900867462158203, + "learning_rate": 1.1427037824118342e-05, + "loss": 0.4722, + "num_input_tokens_seen": 6225760, + "step": 18490 + }, + { + "epoch": 14.29289026275116, + "grad_norm": 0.851842999458313, + "learning_rate": 1.141288227549569e-05, + "loss": 0.3823, + "num_input_tokens_seen": 6227520, + "step": 18495 + }, + { + "epoch": 14.2967542503864, + "grad_norm": 0.8902162909507751, + "learning_rate": 1.139873290637554e-05, + "loss": 0.4158, + "num_input_tokens_seen": 6229248, + "step": 18500 + }, + { + "epoch": 14.300618238021638, + "grad_norm": 1.461371898651123, + "learning_rate": 1.1384589723193126e-05, + "loss": 0.5275, + "num_input_tokens_seen": 6230848, + "step": 18505 + }, + { + "epoch": 14.304482225656878, + "grad_norm": 1.2108509540557861, + "learning_rate": 1.1370452732380845e-05, + "loss": 0.3718, + "num_input_tokens_seen": 6232352, + "step": 18510 + }, + { + "epoch": 14.308346213292117, + "grad_norm": 0.6516467332839966, + "learning_rate": 1.135632194036829e-05, + "loss": 0.6366, + "num_input_tokens_seen": 6233824, + "step": 18515 + }, + { + "epoch": 14.312210200927357, + "grad_norm": 1.081891655921936, + "learning_rate": 1.1342197353582213e-05, + "loss": 0.4125, + "num_input_tokens_seen": 6235680, + "step": 18520 + }, + { + "epoch": 14.316074188562597, + "grad_norm": 1.1642329692840576, + "learning_rate": 1.1328078978446583e-05, + "loss": 0.3538, + "num_input_tokens_seen": 6237472, + "step": 18525 + }, + { + "epoch": 14.319938176197835, + "grad_norm": 1.154725432395935, + "learning_rate": 1.131396682138251e-05, + "loss": 0.3837, + "num_input_tokens_seen": 6239136, + "step": 18530 + }, + { + "epoch": 14.323802163833076, + "grad_norm": 1.007784128189087, + "learning_rate": 1.129986088880829e-05, + "loss": 0.3902, + "num_input_tokens_seen": 6240928, + "step": 18535 + }, + { + "epoch": 14.327666151468316, + "grad_norm": 1.0060807466506958, + "learning_rate": 1.1285761187139373e-05, + "loss": 0.3969, + "num_input_tokens_seen": 6242656, + "step": 18540 + }, + { + "epoch": 14.331530139103554, + "grad_norm": 0.9418326616287231, + "learning_rate": 1.1271667722788412e-05, + "loss": 0.3688, + "num_input_tokens_seen": 6244224, + "step": 18545 + }, + { + "epoch": 14.335394126738795, + "grad_norm": 1.0219963788986206, + "learning_rate": 1.1257580502165186e-05, + "loss": 0.453, + "num_input_tokens_seen": 6245792, + "step": 18550 + }, + { + "epoch": 14.339258114374035, + "grad_norm": 1.3618683815002441, + "learning_rate": 1.1243499531676646e-05, + "loss": 0.5243, + "num_input_tokens_seen": 6247360, + "step": 18555 + }, + { + "epoch": 14.343122102009273, + "grad_norm": 1.1166194677352905, + "learning_rate": 1.1229424817726897e-05, + "loss": 0.621, + "num_input_tokens_seen": 6249088, + "step": 18560 + }, + { + "epoch": 14.346986089644513, + "grad_norm": 0.7266783714294434, + "learning_rate": 1.1215356366717216e-05, + "loss": 0.4996, + "num_input_tokens_seen": 6250752, + "step": 18565 + }, + { + "epoch": 14.350850077279752, + "grad_norm": 0.5907155871391296, + "learning_rate": 1.1201294185046015e-05, + "loss": 0.3813, + "num_input_tokens_seen": 6252832, + "step": 18570 + }, + { + "epoch": 14.354714064914992, + "grad_norm": 1.476656198501587, + "learning_rate": 1.1187238279108844e-05, + "loss": 0.73, + "num_input_tokens_seen": 6254336, + "step": 18575 + }, + { + "epoch": 14.358578052550232, + "grad_norm": 0.9165200591087341, + "learning_rate": 1.1173188655298436e-05, + "loss": 0.3981, + "num_input_tokens_seen": 6255936, + "step": 18580 + }, + { + "epoch": 14.36244204018547, + "grad_norm": 1.0351524353027344, + "learning_rate": 1.1159145320004632e-05, + "loss": 0.4078, + "num_input_tokens_seen": 6257536, + "step": 18585 + }, + { + "epoch": 14.36630602782071, + "grad_norm": 1.0478949546813965, + "learning_rate": 1.1145108279614427e-05, + "loss": 0.3768, + "num_input_tokens_seen": 6259104, + "step": 18590 + }, + { + "epoch": 14.370170015455951, + "grad_norm": 1.2703198194503784, + "learning_rate": 1.1131077540511952e-05, + "loss": 0.5167, + "num_input_tokens_seen": 6260608, + "step": 18595 + }, + { + "epoch": 14.37403400309119, + "grad_norm": 1.1016513109207153, + "learning_rate": 1.1117053109078457e-05, + "loss": 0.3803, + "num_input_tokens_seen": 6261984, + "step": 18600 + }, + { + "epoch": 14.37789799072643, + "grad_norm": 0.789962112903595, + "learning_rate": 1.110303499169236e-05, + "loss": 0.3166, + "num_input_tokens_seen": 6263680, + "step": 18605 + }, + { + "epoch": 14.38176197836167, + "grad_norm": 0.762914776802063, + "learning_rate": 1.1089023194729164e-05, + "loss": 0.4104, + "num_input_tokens_seen": 6265504, + "step": 18610 + }, + { + "epoch": 14.385625965996908, + "grad_norm": 1.3462907075881958, + "learning_rate": 1.107501772456154e-05, + "loss": 0.4137, + "num_input_tokens_seen": 6267360, + "step": 18615 + }, + { + "epoch": 14.389489953632149, + "grad_norm": 1.1974008083343506, + "learning_rate": 1.106101858755925e-05, + "loss": 0.3694, + "num_input_tokens_seen": 6269568, + "step": 18620 + }, + { + "epoch": 14.393353941267389, + "grad_norm": 0.9965327382087708, + "learning_rate": 1.104702579008918e-05, + "loss": 0.3686, + "num_input_tokens_seen": 6271264, + "step": 18625 + }, + { + "epoch": 14.397217928902627, + "grad_norm": 1.1057487726211548, + "learning_rate": 1.1033039338515341e-05, + "loss": 0.3701, + "num_input_tokens_seen": 6272800, + "step": 18630 + }, + { + "epoch": 14.401081916537867, + "grad_norm": 1.2023204565048218, + "learning_rate": 1.1019059239198859e-05, + "loss": 0.4819, + "num_input_tokens_seen": 6274368, + "step": 18635 + }, + { + "epoch": 14.404945904173106, + "grad_norm": 0.9131171107292175, + "learning_rate": 1.1005085498497952e-05, + "loss": 0.5433, + "num_input_tokens_seen": 6275968, + "step": 18640 + }, + { + "epoch": 14.408809891808346, + "grad_norm": 0.7960932850837708, + "learning_rate": 1.0991118122767974e-05, + "loss": 0.3775, + "num_input_tokens_seen": 6277632, + "step": 18645 + }, + { + "epoch": 14.412673879443586, + "grad_norm": 0.8816131353378296, + "learning_rate": 1.0977157118361378e-05, + "loss": 0.4667, + "num_input_tokens_seen": 6279136, + "step": 18650 + }, + { + "epoch": 14.416537867078825, + "grad_norm": 0.802797794342041, + "learning_rate": 1.0963202491627703e-05, + "loss": 0.4152, + "num_input_tokens_seen": 6280768, + "step": 18655 + }, + { + "epoch": 14.420401854714065, + "grad_norm": 0.8983383178710938, + "learning_rate": 1.09492542489136e-05, + "loss": 0.3469, + "num_input_tokens_seen": 6282752, + "step": 18660 + }, + { + "epoch": 14.424265842349305, + "grad_norm": 0.86562180519104, + "learning_rate": 1.093531239656281e-05, + "loss": 0.3762, + "num_input_tokens_seen": 6284384, + "step": 18665 + }, + { + "epoch": 14.428129829984544, + "grad_norm": 1.0333343744277954, + "learning_rate": 1.0921376940916173e-05, + "loss": 0.4097, + "num_input_tokens_seen": 6285824, + "step": 18670 + }, + { + "epoch": 14.431993817619784, + "grad_norm": 1.1567142009735107, + "learning_rate": 1.0907447888311606e-05, + "loss": 0.3827, + "num_input_tokens_seen": 6287616, + "step": 18675 + }, + { + "epoch": 14.435857805255024, + "grad_norm": 1.3570128679275513, + "learning_rate": 1.0893525245084138e-05, + "loss": 0.6503, + "num_input_tokens_seen": 6289248, + "step": 18680 + }, + { + "epoch": 14.439721792890262, + "grad_norm": 0.9938489198684692, + "learning_rate": 1.0879609017565879e-05, + "loss": 0.5701, + "num_input_tokens_seen": 6291040, + "step": 18685 + }, + { + "epoch": 14.443585780525503, + "grad_norm": 0.8156684041023254, + "learning_rate": 1.0865699212086e-05, + "loss": 0.6918, + "num_input_tokens_seen": 6292672, + "step": 18690 + }, + { + "epoch": 14.447449768160741, + "grad_norm": 1.3093206882476807, + "learning_rate": 1.0851795834970767e-05, + "loss": 0.5452, + "num_input_tokens_seen": 6294464, + "step": 18695 + }, + { + "epoch": 14.451313755795981, + "grad_norm": 0.9413927793502808, + "learning_rate": 1.0837898892543522e-05, + "loss": 0.3652, + "num_input_tokens_seen": 6296288, + "step": 18700 + }, + { + "epoch": 14.455177743431221, + "grad_norm": 1.0122294425964355, + "learning_rate": 1.0824008391124669e-05, + "loss": 0.4133, + "num_input_tokens_seen": 6298048, + "step": 18705 + }, + { + "epoch": 14.45904173106646, + "grad_norm": 0.694329559803009, + "learning_rate": 1.0810124337031691e-05, + "loss": 0.4232, + "num_input_tokens_seen": 6300000, + "step": 18710 + }, + { + "epoch": 14.4629057187017, + "grad_norm": 1.3572640419006348, + "learning_rate": 1.0796246736579152e-05, + "loss": 0.4436, + "num_input_tokens_seen": 6301760, + "step": 18715 + }, + { + "epoch": 14.46676970633694, + "grad_norm": 1.6352431774139404, + "learning_rate": 1.078237559607865e-05, + "loss": 0.5104, + "num_input_tokens_seen": 6303488, + "step": 18720 + }, + { + "epoch": 14.470633693972179, + "grad_norm": 1.605540156364441, + "learning_rate": 1.0768510921838885e-05, + "loss": 0.5389, + "num_input_tokens_seen": 6305280, + "step": 18725 + }, + { + "epoch": 14.474497681607419, + "grad_norm": 0.8017362356185913, + "learning_rate": 1.0754652720165578e-05, + "loss": 0.4521, + "num_input_tokens_seen": 6306848, + "step": 18730 + }, + { + "epoch": 14.478361669242659, + "grad_norm": 0.8698828816413879, + "learning_rate": 1.0740800997361528e-05, + "loss": 0.5681, + "num_input_tokens_seen": 6308576, + "step": 18735 + }, + { + "epoch": 14.482225656877898, + "grad_norm": 0.7461806535720825, + "learning_rate": 1.0726955759726579e-05, + "loss": 0.3436, + "num_input_tokens_seen": 6310144, + "step": 18740 + }, + { + "epoch": 14.486089644513138, + "grad_norm": 0.6005837917327881, + "learning_rate": 1.0713117013557618e-05, + "loss": 0.4513, + "num_input_tokens_seen": 6312096, + "step": 18745 + }, + { + "epoch": 14.489953632148378, + "grad_norm": 1.0171581506729126, + "learning_rate": 1.0699284765148613e-05, + "loss": 0.4, + "num_input_tokens_seen": 6313600, + "step": 18750 + }, + { + "epoch": 14.493817619783616, + "grad_norm": 0.7637044787406921, + "learning_rate": 1.0685459020790536e-05, + "loss": 0.4782, + "num_input_tokens_seen": 6315232, + "step": 18755 + }, + { + "epoch": 14.497681607418857, + "grad_norm": 1.9467551708221436, + "learning_rate": 1.0671639786771415e-05, + "loss": 0.5427, + "num_input_tokens_seen": 6316928, + "step": 18760 + }, + { + "epoch": 14.501545595054095, + "grad_norm": 1.4528374671936035, + "learning_rate": 1.0657827069376339e-05, + "loss": 0.3824, + "num_input_tokens_seen": 6318464, + "step": 18765 + }, + { + "epoch": 14.505409582689335, + "grad_norm": 0.6477615833282471, + "learning_rate": 1.0644020874887404e-05, + "loss": 0.5195, + "num_input_tokens_seen": 6320384, + "step": 18770 + }, + { + "epoch": 14.509273570324575, + "grad_norm": 1.02899968624115, + "learning_rate": 1.0630221209583747e-05, + "loss": 0.4103, + "num_input_tokens_seen": 6321920, + "step": 18775 + }, + { + "epoch": 14.513137557959814, + "grad_norm": 1.4422944784164429, + "learning_rate": 1.0616428079741534e-05, + "loss": 0.4722, + "num_input_tokens_seen": 6323488, + "step": 18780 + }, + { + "epoch": 14.517001545595054, + "grad_norm": 0.9301635026931763, + "learning_rate": 1.0602641491633977e-05, + "loss": 0.4501, + "num_input_tokens_seen": 6325184, + "step": 18785 + }, + { + "epoch": 14.520865533230294, + "grad_norm": 1.0041052103042603, + "learning_rate": 1.0588861451531293e-05, + "loss": 0.396, + "num_input_tokens_seen": 6326880, + "step": 18790 + }, + { + "epoch": 14.524729520865533, + "grad_norm": 1.134040355682373, + "learning_rate": 1.0575087965700728e-05, + "loss": 0.4247, + "num_input_tokens_seen": 6328672, + "step": 18795 + }, + { + "epoch": 14.528593508500773, + "grad_norm": 1.1266374588012695, + "learning_rate": 1.0561321040406532e-05, + "loss": 0.388, + "num_input_tokens_seen": 6330496, + "step": 18800 + }, + { + "epoch": 14.532457496136013, + "grad_norm": 2.3578543663024902, + "learning_rate": 1.0547560681910008e-05, + "loss": 0.5759, + "num_input_tokens_seen": 6332256, + "step": 18805 + }, + { + "epoch": 14.536321483771252, + "grad_norm": 0.702247142791748, + "learning_rate": 1.0533806896469436e-05, + "loss": 0.4216, + "num_input_tokens_seen": 6333984, + "step": 18810 + }, + { + "epoch": 14.540185471406492, + "grad_norm": 0.8511767983436584, + "learning_rate": 1.0520059690340115e-05, + "loss": 0.3616, + "num_input_tokens_seen": 6335616, + "step": 18815 + }, + { + "epoch": 14.54404945904173, + "grad_norm": 0.7143738865852356, + "learning_rate": 1.050631906977437e-05, + "loss": 0.5329, + "num_input_tokens_seen": 6337152, + "step": 18820 + }, + { + "epoch": 14.54791344667697, + "grad_norm": 0.9391282796859741, + "learning_rate": 1.0492585041021513e-05, + "loss": 0.4286, + "num_input_tokens_seen": 6338976, + "step": 18825 + }, + { + "epoch": 14.55177743431221, + "grad_norm": 1.054762840270996, + "learning_rate": 1.047885761032786e-05, + "loss": 0.3906, + "num_input_tokens_seen": 6340736, + "step": 18830 + }, + { + "epoch": 14.555641421947449, + "grad_norm": 1.0484645366668701, + "learning_rate": 1.0465136783936732e-05, + "loss": 0.5946, + "num_input_tokens_seen": 6342400, + "step": 18835 + }, + { + "epoch": 14.55950540958269, + "grad_norm": 0.9372894763946533, + "learning_rate": 1.045142256808843e-05, + "loss": 0.4965, + "num_input_tokens_seen": 6343936, + "step": 18840 + }, + { + "epoch": 14.56336939721793, + "grad_norm": 0.8474923968315125, + "learning_rate": 1.043771496902028e-05, + "loss": 0.3554, + "num_input_tokens_seen": 6345472, + "step": 18845 + }, + { + "epoch": 14.567233384853168, + "grad_norm": 0.977354884147644, + "learning_rate": 1.0424013992966564e-05, + "loss": 0.3662, + "num_input_tokens_seen": 6347232, + "step": 18850 + }, + { + "epoch": 14.571097372488408, + "grad_norm": 2.201728105545044, + "learning_rate": 1.0410319646158587e-05, + "loss": 0.4228, + "num_input_tokens_seen": 6348832, + "step": 18855 + }, + { + "epoch": 14.574961360123648, + "grad_norm": 2.487440824508667, + "learning_rate": 1.0396631934824605e-05, + "loss": 0.5312, + "num_input_tokens_seen": 6350528, + "step": 18860 + }, + { + "epoch": 14.578825347758887, + "grad_norm": 1.70631742477417, + "learning_rate": 1.0382950865189878e-05, + "loss": 0.4836, + "num_input_tokens_seen": 6352160, + "step": 18865 + }, + { + "epoch": 14.582689335394127, + "grad_norm": 1.138287901878357, + "learning_rate": 1.0369276443476636e-05, + "loss": 0.4229, + "num_input_tokens_seen": 6353824, + "step": 18870 + }, + { + "epoch": 14.586553323029367, + "grad_norm": 0.9266315698623657, + "learning_rate": 1.0355608675904086e-05, + "loss": 0.3519, + "num_input_tokens_seen": 6355328, + "step": 18875 + }, + { + "epoch": 14.590417310664606, + "grad_norm": 0.8323154449462891, + "learning_rate": 1.0341947568688404e-05, + "loss": 0.6961, + "num_input_tokens_seen": 6357216, + "step": 18880 + }, + { + "epoch": 14.594281298299846, + "grad_norm": 1.134502649307251, + "learning_rate": 1.0328293128042752e-05, + "loss": 0.5108, + "num_input_tokens_seen": 6358848, + "step": 18885 + }, + { + "epoch": 14.598145285935084, + "grad_norm": 1.2332698106765747, + "learning_rate": 1.0314645360177258e-05, + "loss": 0.4376, + "num_input_tokens_seen": 6360416, + "step": 18890 + }, + { + "epoch": 14.602009273570324, + "grad_norm": 0.9741493463516235, + "learning_rate": 1.0301004271299003e-05, + "loss": 0.3826, + "num_input_tokens_seen": 6362144, + "step": 18895 + }, + { + "epoch": 14.605873261205565, + "grad_norm": 0.9338679909706116, + "learning_rate": 1.0287369867612032e-05, + "loss": 0.7363, + "num_input_tokens_seen": 6364064, + "step": 18900 + }, + { + "epoch": 14.609737248840803, + "grad_norm": 0.8032610416412354, + "learning_rate": 1.027374215531736e-05, + "loss": 0.3608, + "num_input_tokens_seen": 6365664, + "step": 18905 + }, + { + "epoch": 14.613601236476043, + "grad_norm": 1.4822943210601807, + "learning_rate": 1.0260121140612944e-05, + "loss": 0.3519, + "num_input_tokens_seen": 6367232, + "step": 18910 + }, + { + "epoch": 14.617465224111283, + "grad_norm": 1.2872449159622192, + "learning_rate": 1.0246506829693697e-05, + "loss": 0.4192, + "num_input_tokens_seen": 6369024, + "step": 18915 + }, + { + "epoch": 14.621329211746522, + "grad_norm": 0.8082886934280396, + "learning_rate": 1.0232899228751502e-05, + "loss": 0.3563, + "num_input_tokens_seen": 6370816, + "step": 18920 + }, + { + "epoch": 14.625193199381762, + "grad_norm": 1.1702730655670166, + "learning_rate": 1.021929834397518e-05, + "loss": 0.4307, + "num_input_tokens_seen": 6372448, + "step": 18925 + }, + { + "epoch": 14.629057187017002, + "grad_norm": 0.8610081672668457, + "learning_rate": 1.0205704181550493e-05, + "loss": 0.3191, + "num_input_tokens_seen": 6373888, + "step": 18930 + }, + { + "epoch": 14.63292117465224, + "grad_norm": 0.8797902464866638, + "learning_rate": 1.0192116747660144e-05, + "loss": 0.4089, + "num_input_tokens_seen": 6375520, + "step": 18935 + }, + { + "epoch": 14.636785162287481, + "grad_norm": 1.2876731157302856, + "learning_rate": 1.0178536048483777e-05, + "loss": 0.5049, + "num_input_tokens_seen": 6377216, + "step": 18940 + }, + { + "epoch": 14.64064914992272, + "grad_norm": 0.76628577709198, + "learning_rate": 1.0164962090197977e-05, + "loss": 0.4204, + "num_input_tokens_seen": 6378752, + "step": 18945 + }, + { + "epoch": 14.64451313755796, + "grad_norm": 0.9510080218315125, + "learning_rate": 1.0151394878976256e-05, + "loss": 0.4176, + "num_input_tokens_seen": 6380384, + "step": 18950 + }, + { + "epoch": 14.6483771251932, + "grad_norm": 1.1815871000289917, + "learning_rate": 1.0137834420989076e-05, + "loss": 0.4498, + "num_input_tokens_seen": 6382112, + "step": 18955 + }, + { + "epoch": 14.652241112828438, + "grad_norm": 1.503778100013733, + "learning_rate": 1.0124280722403807e-05, + "loss": 0.5189, + "num_input_tokens_seen": 6383456, + "step": 18960 + }, + { + "epoch": 14.656105100463678, + "grad_norm": 0.9390483498573303, + "learning_rate": 1.0110733789384744e-05, + "loss": 0.5353, + "num_input_tokens_seen": 6385056, + "step": 18965 + }, + { + "epoch": 14.659969088098919, + "grad_norm": 0.7025409936904907, + "learning_rate": 1.009719362809313e-05, + "loss": 0.3587, + "num_input_tokens_seen": 6386752, + "step": 18970 + }, + { + "epoch": 14.663833075734157, + "grad_norm": 2.146989107131958, + "learning_rate": 1.0083660244687104e-05, + "loss": 0.4556, + "num_input_tokens_seen": 6388288, + "step": 18975 + }, + { + "epoch": 14.667697063369397, + "grad_norm": 1.5384396314620972, + "learning_rate": 1.0070133645321728e-05, + "loss": 0.8507, + "num_input_tokens_seen": 6389696, + "step": 18980 + }, + { + "epoch": 14.671561051004637, + "grad_norm": 1.3901667594909668, + "learning_rate": 1.0056613836148976e-05, + "loss": 0.5908, + "num_input_tokens_seen": 6391392, + "step": 18985 + }, + { + "epoch": 14.675425038639876, + "grad_norm": 1.3973126411437988, + "learning_rate": 1.004310082331775e-05, + "loss": 0.4558, + "num_input_tokens_seen": 6392992, + "step": 18990 + }, + { + "epoch": 14.679289026275116, + "grad_norm": 0.8832582831382751, + "learning_rate": 1.0029594612973842e-05, + "loss": 0.586, + "num_input_tokens_seen": 6394592, + "step": 18995 + }, + { + "epoch": 14.683153013910356, + "grad_norm": 0.69040447473526, + "learning_rate": 1.001609521125996e-05, + "loss": 0.374, + "num_input_tokens_seen": 6396256, + "step": 19000 + }, + { + "epoch": 14.687017001545595, + "grad_norm": 0.9956910610198975, + "learning_rate": 1.0002602624315702e-05, + "loss": 0.3607, + "num_input_tokens_seen": 6397856, + "step": 19005 + }, + { + "epoch": 14.690880989180835, + "grad_norm": 0.8383082151412964, + "learning_rate": 9.989116858277595e-06, + "loss": 0.4326, + "num_input_tokens_seen": 6399936, + "step": 19010 + }, + { + "epoch": 14.694744976816073, + "grad_norm": 1.3901172876358032, + "learning_rate": 9.975637919279038e-06, + "loss": 0.4212, + "num_input_tokens_seen": 6401536, + "step": 19015 + }, + { + "epoch": 14.698608964451314, + "grad_norm": 1.0677555799484253, + "learning_rate": 9.962165813450322e-06, + "loss": 0.6979, + "num_input_tokens_seen": 6403328, + "step": 19020 + }, + { + "epoch": 14.702472952086554, + "grad_norm": 1.7755262851715088, + "learning_rate": 9.948700546918663e-06, + "loss": 0.4723, + "num_input_tokens_seen": 6405024, + "step": 19025 + }, + { + "epoch": 14.706336939721792, + "grad_norm": 1.4040586948394775, + "learning_rate": 9.935242125808134e-06, + "loss": 0.5271, + "num_input_tokens_seen": 6406944, + "step": 19030 + }, + { + "epoch": 14.710200927357032, + "grad_norm": 1.1248540878295898, + "learning_rate": 9.921790556239704e-06, + "loss": 0.3823, + "num_input_tokens_seen": 6408736, + "step": 19035 + }, + { + "epoch": 14.714064914992273, + "grad_norm": 0.8560416102409363, + "learning_rate": 9.90834584433123e-06, + "loss": 0.5863, + "num_input_tokens_seen": 6410368, + "step": 19040 + }, + { + "epoch": 14.717928902627511, + "grad_norm": 1.2067691087722778, + "learning_rate": 9.894907996197436e-06, + "loss": 0.6819, + "num_input_tokens_seen": 6412000, + "step": 19045 + }, + { + "epoch": 14.721792890262751, + "grad_norm": 0.6416230201721191, + "learning_rate": 9.881477017949959e-06, + "loss": 0.4384, + "num_input_tokens_seen": 6413728, + "step": 19050 + }, + { + "epoch": 14.725656877897991, + "grad_norm": 1.2818965911865234, + "learning_rate": 9.868052915697263e-06, + "loss": 0.5601, + "num_input_tokens_seen": 6415232, + "step": 19055 + }, + { + "epoch": 14.72952086553323, + "grad_norm": 0.6619262099266052, + "learning_rate": 9.854635695544731e-06, + "loss": 0.5513, + "num_input_tokens_seen": 6416832, + "step": 19060 + }, + { + "epoch": 14.73338485316847, + "grad_norm": 1.2322999238967896, + "learning_rate": 9.84122536359459e-06, + "loss": 0.3995, + "num_input_tokens_seen": 6418688, + "step": 19065 + }, + { + "epoch": 14.737248840803709, + "grad_norm": 0.8166007399559021, + "learning_rate": 9.827821925945932e-06, + "loss": 0.4087, + "num_input_tokens_seen": 6420256, + "step": 19070 + }, + { + "epoch": 14.741112828438949, + "grad_norm": 1.0896974802017212, + "learning_rate": 9.814425388694728e-06, + "loss": 0.3865, + "num_input_tokens_seen": 6422080, + "step": 19075 + }, + { + "epoch": 14.744976816074189, + "grad_norm": 0.8469075560569763, + "learning_rate": 9.8010357579338e-06, + "loss": 0.42, + "num_input_tokens_seen": 6423744, + "step": 19080 + }, + { + "epoch": 14.748840803709427, + "grad_norm": 1.130494475364685, + "learning_rate": 9.787653039752819e-06, + "loss": 0.3732, + "num_input_tokens_seen": 6425440, + "step": 19085 + }, + { + "epoch": 14.752704791344668, + "grad_norm": 0.9096692800521851, + "learning_rate": 9.774277240238343e-06, + "loss": 0.4912, + "num_input_tokens_seen": 6427264, + "step": 19090 + }, + { + "epoch": 14.756568778979908, + "grad_norm": 0.9941928386688232, + "learning_rate": 9.76090836547377e-06, + "loss": 0.4086, + "num_input_tokens_seen": 6429248, + "step": 19095 + }, + { + "epoch": 14.760432766615146, + "grad_norm": 0.8798866868019104, + "learning_rate": 9.747546421539333e-06, + "loss": 0.4379, + "num_input_tokens_seen": 6431168, + "step": 19100 + }, + { + "epoch": 14.764296754250386, + "grad_norm": 1.2937068939208984, + "learning_rate": 9.734191414512132e-06, + "loss": 0.4013, + "num_input_tokens_seen": 6433024, + "step": 19105 + }, + { + "epoch": 14.768160741885627, + "grad_norm": 0.9211562275886536, + "learning_rate": 9.720843350466094e-06, + "loss": 0.4096, + "num_input_tokens_seen": 6434656, + "step": 19110 + }, + { + "epoch": 14.772024729520865, + "grad_norm": 0.8748378157615662, + "learning_rate": 9.707502235472005e-06, + "loss": 0.432, + "num_input_tokens_seen": 6436288, + "step": 19115 + }, + { + "epoch": 14.775888717156105, + "grad_norm": 0.9544143676757812, + "learning_rate": 9.694168075597474e-06, + "loss": 0.4693, + "num_input_tokens_seen": 6438240, + "step": 19120 + }, + { + "epoch": 14.779752704791346, + "grad_norm": 1.1725656986236572, + "learning_rate": 9.680840876906974e-06, + "loss": 0.6642, + "num_input_tokens_seen": 6439776, + "step": 19125 + }, + { + "epoch": 14.783616692426584, + "grad_norm": 0.6626648902893066, + "learning_rate": 9.667520645461777e-06, + "loss": 0.72, + "num_input_tokens_seen": 6441376, + "step": 19130 + }, + { + "epoch": 14.787480680061824, + "grad_norm": 1.3559625148773193, + "learning_rate": 9.654207387320022e-06, + "loss": 0.3881, + "num_input_tokens_seen": 6442816, + "step": 19135 + }, + { + "epoch": 14.791344667697063, + "grad_norm": 0.9956012964248657, + "learning_rate": 9.64090110853665e-06, + "loss": 0.4574, + "num_input_tokens_seen": 6444256, + "step": 19140 + }, + { + "epoch": 14.795208655332303, + "grad_norm": 0.8595713973045349, + "learning_rate": 9.627601815163436e-06, + "loss": 0.403, + "num_input_tokens_seen": 6445856, + "step": 19145 + }, + { + "epoch": 14.799072642967543, + "grad_norm": 1.1554110050201416, + "learning_rate": 9.614309513248976e-06, + "loss": 0.3861, + "num_input_tokens_seen": 6447584, + "step": 19150 + }, + { + "epoch": 14.802936630602781, + "grad_norm": 0.5216554403305054, + "learning_rate": 9.601024208838686e-06, + "loss": 0.3747, + "num_input_tokens_seen": 6449280, + "step": 19155 + }, + { + "epoch": 14.806800618238022, + "grad_norm": 1.2247309684753418, + "learning_rate": 9.587745907974812e-06, + "loss": 0.431, + "num_input_tokens_seen": 6450912, + "step": 19160 + }, + { + "epoch": 14.810664605873262, + "grad_norm": 1.0654209852218628, + "learning_rate": 9.574474616696391e-06, + "loss": 0.4394, + "num_input_tokens_seen": 6452928, + "step": 19165 + }, + { + "epoch": 14.8145285935085, + "grad_norm": 0.9155231714248657, + "learning_rate": 9.561210341039303e-06, + "loss": 0.377, + "num_input_tokens_seen": 6454624, + "step": 19170 + }, + { + "epoch": 14.81839258114374, + "grad_norm": 0.6782565116882324, + "learning_rate": 9.547953087036212e-06, + "loss": 0.3534, + "num_input_tokens_seen": 6456128, + "step": 19175 + }, + { + "epoch": 14.82225656877898, + "grad_norm": 1.3056684732437134, + "learning_rate": 9.534702860716596e-06, + "loss": 0.4038, + "num_input_tokens_seen": 6458080, + "step": 19180 + }, + { + "epoch": 14.826120556414219, + "grad_norm": 2.1828761100769043, + "learning_rate": 9.521459668106736e-06, + "loss": 0.5735, + "num_input_tokens_seen": 6459968, + "step": 19185 + }, + { + "epoch": 14.82998454404946, + "grad_norm": 1.2046387195587158, + "learning_rate": 9.508223515229709e-06, + "loss": 0.4219, + "num_input_tokens_seen": 6461536, + "step": 19190 + }, + { + "epoch": 14.833848531684698, + "grad_norm": 1.5026905536651611, + "learning_rate": 9.494994408105412e-06, + "loss": 0.4102, + "num_input_tokens_seen": 6463264, + "step": 19195 + }, + { + "epoch": 14.837712519319938, + "grad_norm": 1.2938494682312012, + "learning_rate": 9.481772352750513e-06, + "loss": 0.5886, + "num_input_tokens_seen": 6465056, + "step": 19200 + }, + { + "epoch": 14.841576506955178, + "grad_norm": 0.5056613683700562, + "learning_rate": 9.468557355178476e-06, + "loss": 0.3693, + "num_input_tokens_seen": 6466624, + "step": 19205 + }, + { + "epoch": 14.845440494590417, + "grad_norm": 1.2695354223251343, + "learning_rate": 9.455349421399575e-06, + "loss": 0.3859, + "num_input_tokens_seen": 6468160, + "step": 19210 + }, + { + "epoch": 14.849304482225657, + "grad_norm": 2.0812432765960693, + "learning_rate": 9.442148557420851e-06, + "loss": 0.4818, + "num_input_tokens_seen": 6469824, + "step": 19215 + }, + { + "epoch": 14.853168469860897, + "grad_norm": 0.6542742848396301, + "learning_rate": 9.428954769246134e-06, + "loss": 0.4309, + "num_input_tokens_seen": 6471392, + "step": 19220 + }, + { + "epoch": 14.857032457496135, + "grad_norm": 1.110998272895813, + "learning_rate": 9.415768062876043e-06, + "loss": 0.4438, + "num_input_tokens_seen": 6473280, + "step": 19225 + }, + { + "epoch": 14.860896445131376, + "grad_norm": 1.0936638116836548, + "learning_rate": 9.402588444307955e-06, + "loss": 0.4074, + "num_input_tokens_seen": 6475232, + "step": 19230 + }, + { + "epoch": 14.864760432766616, + "grad_norm": 0.6189517974853516, + "learning_rate": 9.389415919536062e-06, + "loss": 0.5177, + "num_input_tokens_seen": 6476672, + "step": 19235 + }, + { + "epoch": 14.868624420401854, + "grad_norm": 1.5511856079101562, + "learning_rate": 9.376250494551298e-06, + "loss": 0.351, + "num_input_tokens_seen": 6478496, + "step": 19240 + }, + { + "epoch": 14.872488408037094, + "grad_norm": 0.9187338948249817, + "learning_rate": 9.363092175341365e-06, + "loss": 0.3538, + "num_input_tokens_seen": 6480160, + "step": 19245 + }, + { + "epoch": 14.876352395672335, + "grad_norm": 1.0896908044815063, + "learning_rate": 9.349940967890767e-06, + "loss": 0.4619, + "num_input_tokens_seen": 6481632, + "step": 19250 + }, + { + "epoch": 14.880216383307573, + "grad_norm": 1.1256067752838135, + "learning_rate": 9.33679687818074e-06, + "loss": 0.5075, + "num_input_tokens_seen": 6483200, + "step": 19255 + }, + { + "epoch": 14.884080370942813, + "grad_norm": 0.8312526941299438, + "learning_rate": 9.323659912189295e-06, + "loss": 0.4977, + "num_input_tokens_seen": 6484736, + "step": 19260 + }, + { + "epoch": 14.887944358578052, + "grad_norm": 1.7460790872573853, + "learning_rate": 9.310530075891196e-06, + "loss": 0.4757, + "num_input_tokens_seen": 6486432, + "step": 19265 + }, + { + "epoch": 14.891808346213292, + "grad_norm": 1.1610931158065796, + "learning_rate": 9.29740737525799e-06, + "loss": 0.4204, + "num_input_tokens_seen": 6488064, + "step": 19270 + }, + { + "epoch": 14.895672333848532, + "grad_norm": 1.0567940473556519, + "learning_rate": 9.284291816257947e-06, + "loss": 0.523, + "num_input_tokens_seen": 6489696, + "step": 19275 + }, + { + "epoch": 14.89953632148377, + "grad_norm": 0.76185542345047, + "learning_rate": 9.271183404856104e-06, + "loss": 0.463, + "num_input_tokens_seen": 6491104, + "step": 19280 + }, + { + "epoch": 14.90340030911901, + "grad_norm": 0.9124199151992798, + "learning_rate": 9.258082147014236e-06, + "loss": 0.3958, + "num_input_tokens_seen": 6492864, + "step": 19285 + }, + { + "epoch": 14.907264296754251, + "grad_norm": 1.1489614248275757, + "learning_rate": 9.244988048690892e-06, + "loss": 0.4639, + "num_input_tokens_seen": 6494400, + "step": 19290 + }, + { + "epoch": 14.91112828438949, + "grad_norm": 0.7609090209007263, + "learning_rate": 9.231901115841335e-06, + "loss": 0.3885, + "num_input_tokens_seen": 6496160, + "step": 19295 + }, + { + "epoch": 14.91499227202473, + "grad_norm": 0.932395339012146, + "learning_rate": 9.218821354417574e-06, + "loss": 0.3764, + "num_input_tokens_seen": 6497792, + "step": 19300 + }, + { + "epoch": 14.91885625965997, + "grad_norm": 1.000910997390747, + "learning_rate": 9.205748770368378e-06, + "loss": 0.6343, + "num_input_tokens_seen": 6499552, + "step": 19305 + }, + { + "epoch": 14.922720247295208, + "grad_norm": 0.8024441599845886, + "learning_rate": 9.19268336963923e-06, + "loss": 0.3577, + "num_input_tokens_seen": 6501088, + "step": 19310 + }, + { + "epoch": 14.926584234930449, + "grad_norm": 0.8605881929397583, + "learning_rate": 9.179625158172354e-06, + "loss": 0.4827, + "num_input_tokens_seen": 6502912, + "step": 19315 + }, + { + "epoch": 14.930448222565687, + "grad_norm": 1.2684447765350342, + "learning_rate": 9.166574141906698e-06, + "loss": 0.4414, + "num_input_tokens_seen": 6504352, + "step": 19320 + }, + { + "epoch": 14.934312210200927, + "grad_norm": 0.9322152137756348, + "learning_rate": 9.153530326777937e-06, + "loss": 0.5737, + "num_input_tokens_seen": 6506304, + "step": 19325 + }, + { + "epoch": 14.938176197836167, + "grad_norm": 0.6869697570800781, + "learning_rate": 9.140493718718493e-06, + "loss": 0.4828, + "num_input_tokens_seen": 6507872, + "step": 19330 + }, + { + "epoch": 14.942040185471406, + "grad_norm": 1.375704288482666, + "learning_rate": 9.127464323657476e-06, + "loss": 0.4022, + "num_input_tokens_seen": 6509568, + "step": 19335 + }, + { + "epoch": 14.945904173106646, + "grad_norm": 0.7888519167900085, + "learning_rate": 9.114442147520749e-06, + "loss": 0.4532, + "num_input_tokens_seen": 6511456, + "step": 19340 + }, + { + "epoch": 14.949768160741886, + "grad_norm": 1.3007315397262573, + "learning_rate": 9.101427196230869e-06, + "loss": 0.4082, + "num_input_tokens_seen": 6513184, + "step": 19345 + }, + { + "epoch": 14.953632148377125, + "grad_norm": 1.3249828815460205, + "learning_rate": 9.088419475707113e-06, + "loss": 0.3954, + "num_input_tokens_seen": 6515040, + "step": 19350 + }, + { + "epoch": 14.957496136012365, + "grad_norm": 0.866541862487793, + "learning_rate": 9.07541899186547e-06, + "loss": 0.3643, + "num_input_tokens_seen": 6516704, + "step": 19355 + }, + { + "epoch": 14.961360123647605, + "grad_norm": 0.7323192954063416, + "learning_rate": 9.06242575061864e-06, + "loss": 0.3793, + "num_input_tokens_seen": 6518592, + "step": 19360 + }, + { + "epoch": 14.965224111282843, + "grad_norm": 0.8769561052322388, + "learning_rate": 9.049439757876013e-06, + "loss": 0.4975, + "num_input_tokens_seen": 6520224, + "step": 19365 + }, + { + "epoch": 14.969088098918084, + "grad_norm": 1.012906551361084, + "learning_rate": 9.03646101954371e-06, + "loss": 0.6043, + "num_input_tokens_seen": 6521856, + "step": 19370 + }, + { + "epoch": 14.972952086553324, + "grad_norm": 1.343111276626587, + "learning_rate": 9.023489541524546e-06, + "loss": 0.3793, + "num_input_tokens_seen": 6523488, + "step": 19375 + }, + { + "epoch": 14.976816074188562, + "grad_norm": 0.9257279634475708, + "learning_rate": 9.010525329718017e-06, + "loss": 0.4752, + "num_input_tokens_seen": 6524928, + "step": 19380 + }, + { + "epoch": 14.980680061823803, + "grad_norm": 1.0979074239730835, + "learning_rate": 8.997568390020328e-06, + "loss": 0.4227, + "num_input_tokens_seen": 6526304, + "step": 19385 + }, + { + "epoch": 14.984544049459041, + "grad_norm": 0.6850817203521729, + "learning_rate": 8.984618728324368e-06, + "loss": 0.6766, + "num_input_tokens_seen": 6527936, + "step": 19390 + }, + { + "epoch": 14.988408037094281, + "grad_norm": 2.0053086280822754, + "learning_rate": 8.971676350519723e-06, + "loss": 0.3653, + "num_input_tokens_seen": 6529376, + "step": 19395 + }, + { + "epoch": 14.992272024729521, + "grad_norm": 1.2202056646347046, + "learning_rate": 8.958741262492654e-06, + "loss": 0.4151, + "num_input_tokens_seen": 6531136, + "step": 19400 + }, + { + "epoch": 14.99613601236476, + "grad_norm": 1.8601274490356445, + "learning_rate": 8.945813470126127e-06, + "loss": 0.5159, + "num_input_tokens_seen": 6532896, + "step": 19405 + }, + { + "epoch": 15.0, + "grad_norm": 0.8929230570793152, + "learning_rate": 8.932892979299787e-06, + "loss": 0.3745, + "num_input_tokens_seen": 6534240, + "step": 19410 + }, + { + "epoch": 15.0, + "eval_loss": 0.4465736746788025, + "eval_runtime": 6.2506, + "eval_samples_per_second": 91.991, + "eval_steps_per_second": 23.038, + "num_input_tokens_seen": 6534240, + "step": 19410 + }, + { + "epoch": 15.00386398763524, + "grad_norm": 0.8009699583053589, + "learning_rate": 8.919979795889943e-06, + "loss": 0.5169, + "num_input_tokens_seen": 6535904, + "step": 19415 + }, + { + "epoch": 15.007727975270479, + "grad_norm": 1.0678600072860718, + "learning_rate": 8.907073925769585e-06, + "loss": 0.4273, + "num_input_tokens_seen": 6537568, + "step": 19420 + }, + { + "epoch": 15.011591962905719, + "grad_norm": 0.8184014558792114, + "learning_rate": 8.894175374808386e-06, + "loss": 0.4982, + "num_input_tokens_seen": 6539264, + "step": 19425 + }, + { + "epoch": 15.015455950540959, + "grad_norm": 1.1535882949829102, + "learning_rate": 8.881284148872678e-06, + "loss": 0.5563, + "num_input_tokens_seen": 6541376, + "step": 19430 + }, + { + "epoch": 15.019319938176197, + "grad_norm": 0.8726192712783813, + "learning_rate": 8.868400253825462e-06, + "loss": 0.5002, + "num_input_tokens_seen": 6542976, + "step": 19435 + }, + { + "epoch": 15.023183925811438, + "grad_norm": 0.9828437566757202, + "learning_rate": 8.855523695526427e-06, + "loss": 0.4079, + "num_input_tokens_seen": 6544448, + "step": 19440 + }, + { + "epoch": 15.027047913446676, + "grad_norm": 1.0725635290145874, + "learning_rate": 8.842654479831895e-06, + "loss": 0.5329, + "num_input_tokens_seen": 6546240, + "step": 19445 + }, + { + "epoch": 15.030911901081916, + "grad_norm": 0.8974360227584839, + "learning_rate": 8.829792612594873e-06, + "loss": 0.3984, + "num_input_tokens_seen": 6547936, + "step": 19450 + }, + { + "epoch": 15.034775888717157, + "grad_norm": 0.8009597659111023, + "learning_rate": 8.816938099665011e-06, + "loss": 0.3178, + "num_input_tokens_seen": 6549216, + "step": 19455 + }, + { + "epoch": 15.038639876352395, + "grad_norm": 1.4524519443511963, + "learning_rate": 8.804090946888618e-06, + "loss": 0.433, + "num_input_tokens_seen": 6550816, + "step": 19460 + }, + { + "epoch": 15.042503863987635, + "grad_norm": 0.8508633971214294, + "learning_rate": 8.791251160108657e-06, + "loss": 0.4184, + "num_input_tokens_seen": 6552544, + "step": 19465 + }, + { + "epoch": 15.046367851622875, + "grad_norm": 0.807330846786499, + "learning_rate": 8.778418745164733e-06, + "loss": 0.6296, + "num_input_tokens_seen": 6554272, + "step": 19470 + }, + { + "epoch": 15.050231839258114, + "grad_norm": 1.3249893188476562, + "learning_rate": 8.765593707893114e-06, + "loss": 0.5668, + "num_input_tokens_seen": 6555968, + "step": 19475 + }, + { + "epoch": 15.054095826893354, + "grad_norm": 1.1027915477752686, + "learning_rate": 8.752776054126704e-06, + "loss": 0.4912, + "num_input_tokens_seen": 6557952, + "step": 19480 + }, + { + "epoch": 15.057959814528594, + "grad_norm": 1.061011791229248, + "learning_rate": 8.739965789695034e-06, + "loss": 0.403, + "num_input_tokens_seen": 6559648, + "step": 19485 + }, + { + "epoch": 15.061823802163833, + "grad_norm": 0.7746157050132751, + "learning_rate": 8.727162920424311e-06, + "loss": 0.4582, + "num_input_tokens_seen": 6561344, + "step": 19490 + }, + { + "epoch": 15.065687789799073, + "grad_norm": 0.6940308809280396, + "learning_rate": 8.714367452137348e-06, + "loss": 0.4018, + "num_input_tokens_seen": 6562880, + "step": 19495 + }, + { + "epoch": 15.069551777434313, + "grad_norm": 1.3450284004211426, + "learning_rate": 8.701579390653595e-06, + "loss": 0.3944, + "num_input_tokens_seen": 6564800, + "step": 19500 + }, + { + "epoch": 15.073415765069551, + "grad_norm": 1.206872582435608, + "learning_rate": 8.688798741789136e-06, + "loss": 0.3593, + "num_input_tokens_seen": 6566624, + "step": 19505 + }, + { + "epoch": 15.077279752704792, + "grad_norm": 0.8517448902130127, + "learning_rate": 8.6760255113567e-06, + "loss": 0.4381, + "num_input_tokens_seen": 6568672, + "step": 19510 + }, + { + "epoch": 15.08114374034003, + "grad_norm": 1.2247217893600464, + "learning_rate": 8.663259705165625e-06, + "loss": 0.4273, + "num_input_tokens_seen": 6570432, + "step": 19515 + }, + { + "epoch": 15.08500772797527, + "grad_norm": 0.9805135726928711, + "learning_rate": 8.65050132902187e-06, + "loss": 0.5863, + "num_input_tokens_seen": 6571936, + "step": 19520 + }, + { + "epoch": 15.08887171561051, + "grad_norm": 1.5709044933319092, + "learning_rate": 8.637750388728016e-06, + "loss": 0.6274, + "num_input_tokens_seen": 6573632, + "step": 19525 + }, + { + "epoch": 15.092735703245749, + "grad_norm": 0.9769726991653442, + "learning_rate": 8.625006890083284e-06, + "loss": 0.378, + "num_input_tokens_seen": 6575616, + "step": 19530 + }, + { + "epoch": 15.09659969088099, + "grad_norm": 1.712844729423523, + "learning_rate": 8.612270838883484e-06, + "loss": 0.4417, + "num_input_tokens_seen": 6577440, + "step": 19535 + }, + { + "epoch": 15.10046367851623, + "grad_norm": 1.6903003454208374, + "learning_rate": 8.59954224092104e-06, + "loss": 0.4779, + "num_input_tokens_seen": 6578976, + "step": 19540 + }, + { + "epoch": 15.104327666151468, + "grad_norm": 1.1218183040618896, + "learning_rate": 8.586821101985013e-06, + "loss": 0.3827, + "num_input_tokens_seen": 6580768, + "step": 19545 + }, + { + "epoch": 15.108191653786708, + "grad_norm": 0.9960443377494812, + "learning_rate": 8.574107427861042e-06, + "loss": 0.6317, + "num_input_tokens_seen": 6582432, + "step": 19550 + }, + { + "epoch": 15.112055641421948, + "grad_norm": 1.2856096029281616, + "learning_rate": 8.561401224331384e-06, + "loss": 0.583, + "num_input_tokens_seen": 6584064, + "step": 19555 + }, + { + "epoch": 15.115919629057187, + "grad_norm": 1.3420871496200562, + "learning_rate": 8.548702497174896e-06, + "loss": 0.4011, + "num_input_tokens_seen": 6585824, + "step": 19560 + }, + { + "epoch": 15.119783616692427, + "grad_norm": 1.180583119392395, + "learning_rate": 8.536011252167029e-06, + "loss": 0.4153, + "num_input_tokens_seen": 6587584, + "step": 19565 + }, + { + "epoch": 15.123647604327665, + "grad_norm": 1.2594581842422485, + "learning_rate": 8.523327495079847e-06, + "loss": 0.6341, + "num_input_tokens_seen": 6589088, + "step": 19570 + }, + { + "epoch": 15.127511591962906, + "grad_norm": 0.866003155708313, + "learning_rate": 8.51065123168199e-06, + "loss": 0.3673, + "num_input_tokens_seen": 6590816, + "step": 19575 + }, + { + "epoch": 15.131375579598146, + "grad_norm": 0.8052422404289246, + "learning_rate": 8.497982467738713e-06, + "loss": 0.4398, + "num_input_tokens_seen": 6592384, + "step": 19580 + }, + { + "epoch": 15.135239567233384, + "grad_norm": 0.9237096309661865, + "learning_rate": 8.485321209011835e-06, + "loss": 0.5375, + "num_input_tokens_seen": 6593856, + "step": 19585 + }, + { + "epoch": 15.139103554868624, + "grad_norm": 1.0710930824279785, + "learning_rate": 8.472667461259773e-06, + "loss": 0.4717, + "num_input_tokens_seen": 6595264, + "step": 19590 + }, + { + "epoch": 15.142967542503865, + "grad_norm": 1.013279914855957, + "learning_rate": 8.46002123023753e-06, + "loss": 0.425, + "num_input_tokens_seen": 6596896, + "step": 19595 + }, + { + "epoch": 15.146831530139103, + "grad_norm": 0.9662644267082214, + "learning_rate": 8.447382521696683e-06, + "loss": 0.362, + "num_input_tokens_seen": 6598592, + "step": 19600 + }, + { + "epoch": 15.150695517774343, + "grad_norm": 0.9982304573059082, + "learning_rate": 8.434751341385388e-06, + "loss": 0.5516, + "num_input_tokens_seen": 6600288, + "step": 19605 + }, + { + "epoch": 15.154559505409583, + "grad_norm": 0.9968289732933044, + "learning_rate": 8.42212769504839e-06, + "loss": 0.3613, + "num_input_tokens_seen": 6601952, + "step": 19610 + }, + { + "epoch": 15.158423493044822, + "grad_norm": 0.8184341192245483, + "learning_rate": 8.409511588427002e-06, + "loss": 0.5111, + "num_input_tokens_seen": 6603680, + "step": 19615 + }, + { + "epoch": 15.162287480680062, + "grad_norm": 1.2145545482635498, + "learning_rate": 8.396903027259103e-06, + "loss": 0.3615, + "num_input_tokens_seen": 6605408, + "step": 19620 + }, + { + "epoch": 15.166151468315302, + "grad_norm": 0.9953843355178833, + "learning_rate": 8.38430201727914e-06, + "loss": 0.5492, + "num_input_tokens_seen": 6606976, + "step": 19625 + }, + { + "epoch": 15.17001545595054, + "grad_norm": 1.150465726852417, + "learning_rate": 8.371708564218123e-06, + "loss": 0.4602, + "num_input_tokens_seen": 6608384, + "step": 19630 + }, + { + "epoch": 15.173879443585781, + "grad_norm": 1.2695693969726562, + "learning_rate": 8.359122673803638e-06, + "loss": 0.3751, + "num_input_tokens_seen": 6610240, + "step": 19635 + }, + { + "epoch": 15.17774343122102, + "grad_norm": 1.5632212162017822, + "learning_rate": 8.346544351759807e-06, + "loss": 0.5264, + "num_input_tokens_seen": 6611936, + "step": 19640 + }, + { + "epoch": 15.18160741885626, + "grad_norm": 0.64694744348526, + "learning_rate": 8.333973603807341e-06, + "loss": 0.477, + "num_input_tokens_seen": 6613504, + "step": 19645 + }, + { + "epoch": 15.1854714064915, + "grad_norm": 0.7492008209228516, + "learning_rate": 8.321410435663496e-06, + "loss": 0.3203, + "num_input_tokens_seen": 6615040, + "step": 19650 + }, + { + "epoch": 15.189335394126738, + "grad_norm": 0.855402410030365, + "learning_rate": 8.30885485304207e-06, + "loss": 0.4598, + "num_input_tokens_seen": 6616928, + "step": 19655 + }, + { + "epoch": 15.193199381761978, + "grad_norm": 0.9963394403457642, + "learning_rate": 8.296306861653415e-06, + "loss": 0.3405, + "num_input_tokens_seen": 6618784, + "step": 19660 + }, + { + "epoch": 15.197063369397219, + "grad_norm": 1.6153042316436768, + "learning_rate": 8.283766467204438e-06, + "loss": 0.4277, + "num_input_tokens_seen": 6620480, + "step": 19665 + }, + { + "epoch": 15.200927357032457, + "grad_norm": 0.8377222418785095, + "learning_rate": 8.271233675398576e-06, + "loss": 0.3908, + "num_input_tokens_seen": 6622048, + "step": 19670 + }, + { + "epoch": 15.204791344667697, + "grad_norm": 0.700201153755188, + "learning_rate": 8.258708491935819e-06, + "loss": 0.4357, + "num_input_tokens_seen": 6623584, + "step": 19675 + }, + { + "epoch": 15.208655332302937, + "grad_norm": 0.815274178981781, + "learning_rate": 8.246190922512704e-06, + "loss": 0.3529, + "num_input_tokens_seen": 6625376, + "step": 19680 + }, + { + "epoch": 15.212519319938176, + "grad_norm": 0.5899463295936584, + "learning_rate": 8.233680972822286e-06, + "loss": 0.3453, + "num_input_tokens_seen": 6627104, + "step": 19685 + }, + { + "epoch": 15.216383307573416, + "grad_norm": 1.1364473104476929, + "learning_rate": 8.221178648554178e-06, + "loss": 0.3927, + "num_input_tokens_seen": 6628640, + "step": 19690 + }, + { + "epoch": 15.220247295208654, + "grad_norm": 1.097684621810913, + "learning_rate": 8.208683955394506e-06, + "loss": 0.4298, + "num_input_tokens_seen": 6630304, + "step": 19695 + }, + { + "epoch": 15.224111282843895, + "grad_norm": 1.2952423095703125, + "learning_rate": 8.196196899025929e-06, + "loss": 0.5914, + "num_input_tokens_seen": 6632000, + "step": 19700 + }, + { + "epoch": 15.227975270479135, + "grad_norm": 0.694442629814148, + "learning_rate": 8.18371748512764e-06, + "loss": 0.4063, + "num_input_tokens_seen": 6634080, + "step": 19705 + }, + { + "epoch": 15.231839258114373, + "grad_norm": 0.9703161120414734, + "learning_rate": 8.171245719375337e-06, + "loss": 0.3484, + "num_input_tokens_seen": 6635776, + "step": 19710 + }, + { + "epoch": 15.235703245749614, + "grad_norm": 0.6412186622619629, + "learning_rate": 8.15878160744127e-06, + "loss": 0.3906, + "num_input_tokens_seen": 6637280, + "step": 19715 + }, + { + "epoch": 15.239567233384854, + "grad_norm": 0.7708855867385864, + "learning_rate": 8.146325154994189e-06, + "loss": 0.3553, + "num_input_tokens_seen": 6639040, + "step": 19720 + }, + { + "epoch": 15.243431221020092, + "grad_norm": 0.8750407099723816, + "learning_rate": 8.133876367699353e-06, + "loss": 0.3984, + "num_input_tokens_seen": 6640576, + "step": 19725 + }, + { + "epoch": 15.247295208655332, + "grad_norm": 1.9510886669158936, + "learning_rate": 8.12143525121856e-06, + "loss": 0.3993, + "num_input_tokens_seen": 6641952, + "step": 19730 + }, + { + "epoch": 15.251159196290573, + "grad_norm": 1.274250864982605, + "learning_rate": 8.109001811210093e-06, + "loss": 0.4977, + "num_input_tokens_seen": 6643552, + "step": 19735 + }, + { + "epoch": 15.255023183925811, + "grad_norm": 0.6898272037506104, + "learning_rate": 8.096576053328761e-06, + "loss": 0.4305, + "num_input_tokens_seen": 6645184, + "step": 19740 + }, + { + "epoch": 15.258887171561051, + "grad_norm": 0.6274954676628113, + "learning_rate": 8.084157983225862e-06, + "loss": 0.3497, + "num_input_tokens_seen": 6646816, + "step": 19745 + }, + { + "epoch": 15.262751159196291, + "grad_norm": 1.2222141027450562, + "learning_rate": 8.071747606549226e-06, + "loss": 0.4037, + "num_input_tokens_seen": 6648544, + "step": 19750 + }, + { + "epoch": 15.26661514683153, + "grad_norm": 0.9979909658432007, + "learning_rate": 8.059344928943157e-06, + "loss": 0.3629, + "num_input_tokens_seen": 6650304, + "step": 19755 + }, + { + "epoch": 15.27047913446677, + "grad_norm": 1.5646278858184814, + "learning_rate": 8.04694995604847e-06, + "loss": 0.4389, + "num_input_tokens_seen": 6652128, + "step": 19760 + }, + { + "epoch": 15.274343122102009, + "grad_norm": 0.8219774961471558, + "learning_rate": 8.03456269350246e-06, + "loss": 0.4943, + "num_input_tokens_seen": 6653728, + "step": 19765 + }, + { + "epoch": 15.278207109737249, + "grad_norm": 1.205041527748108, + "learning_rate": 8.02218314693895e-06, + "loss": 0.5007, + "num_input_tokens_seen": 6655424, + "step": 19770 + }, + { + "epoch": 15.282071097372489, + "grad_norm": 2.119938373565674, + "learning_rate": 8.009811321988217e-06, + "loss": 0.4742, + "num_input_tokens_seen": 6657152, + "step": 19775 + }, + { + "epoch": 15.285935085007727, + "grad_norm": 0.8497427105903625, + "learning_rate": 7.99744722427704e-06, + "loss": 0.4729, + "num_input_tokens_seen": 6659104, + "step": 19780 + }, + { + "epoch": 15.289799072642968, + "grad_norm": 0.8575440645217896, + "learning_rate": 7.985090859428695e-06, + "loss": 0.4709, + "num_input_tokens_seen": 6661056, + "step": 19785 + }, + { + "epoch": 15.293663060278208, + "grad_norm": 0.6178226470947266, + "learning_rate": 7.97274223306293e-06, + "loss": 0.5806, + "num_input_tokens_seen": 6662656, + "step": 19790 + }, + { + "epoch": 15.297527047913446, + "grad_norm": 0.8677821755409241, + "learning_rate": 7.960401350795965e-06, + "loss": 0.4081, + "num_input_tokens_seen": 6664672, + "step": 19795 + }, + { + "epoch": 15.301391035548686, + "grad_norm": 1.162745714187622, + "learning_rate": 7.948068218240514e-06, + "loss": 0.5781, + "num_input_tokens_seen": 6666304, + "step": 19800 + }, + { + "epoch": 15.305255023183927, + "grad_norm": 1.2114852666854858, + "learning_rate": 7.93574284100575e-06, + "loss": 0.4351, + "num_input_tokens_seen": 6667968, + "step": 19805 + }, + { + "epoch": 15.309119010819165, + "grad_norm": 0.9507246017456055, + "learning_rate": 7.923425224697342e-06, + "loss": 0.3826, + "num_input_tokens_seen": 6669600, + "step": 19810 + }, + { + "epoch": 15.312982998454405, + "grad_norm": 0.7619415521621704, + "learning_rate": 7.911115374917402e-06, + "loss": 0.3604, + "num_input_tokens_seen": 6671168, + "step": 19815 + }, + { + "epoch": 15.316846986089644, + "grad_norm": 1.3316044807434082, + "learning_rate": 7.89881329726454e-06, + "loss": 0.3952, + "num_input_tokens_seen": 6672832, + "step": 19820 + }, + { + "epoch": 15.320710973724884, + "grad_norm": 1.2301119565963745, + "learning_rate": 7.886518997333805e-06, + "loss": 0.4214, + "num_input_tokens_seen": 6674272, + "step": 19825 + }, + { + "epoch": 15.324574961360124, + "grad_norm": 1.1376993656158447, + "learning_rate": 7.874232480716718e-06, + "loss": 0.4543, + "num_input_tokens_seen": 6676096, + "step": 19830 + }, + { + "epoch": 15.328438948995363, + "grad_norm": 0.7251328229904175, + "learning_rate": 7.861953753001262e-06, + "loss": 0.4128, + "num_input_tokens_seen": 6677536, + "step": 19835 + }, + { + "epoch": 15.332302936630603, + "grad_norm": 0.8988460302352905, + "learning_rate": 7.849682819771872e-06, + "loss": 0.4693, + "num_input_tokens_seen": 6679232, + "step": 19840 + }, + { + "epoch": 15.336166924265843, + "grad_norm": 0.7492660880088806, + "learning_rate": 7.83741968660944e-06, + "loss": 0.5004, + "num_input_tokens_seen": 6680832, + "step": 19845 + }, + { + "epoch": 15.340030911901081, + "grad_norm": 1.6654706001281738, + "learning_rate": 7.825164359091323e-06, + "loss": 0.6795, + "num_input_tokens_seen": 6682432, + "step": 19850 + }, + { + "epoch": 15.343894899536322, + "grad_norm": 2.3393611907958984, + "learning_rate": 7.812916842791304e-06, + "loss": 0.5862, + "num_input_tokens_seen": 6684192, + "step": 19855 + }, + { + "epoch": 15.347758887171562, + "grad_norm": 0.6433477997779846, + "learning_rate": 7.800677143279645e-06, + "loss": 0.4661, + "num_input_tokens_seen": 6685920, + "step": 19860 + }, + { + "epoch": 15.3516228748068, + "grad_norm": 0.8546140193939209, + "learning_rate": 7.78844526612302e-06, + "loss": 0.4315, + "num_input_tokens_seen": 6687680, + "step": 19865 + }, + { + "epoch": 15.35548686244204, + "grad_norm": 2.14125394821167, + "learning_rate": 7.776221216884566e-06, + "loss": 0.3896, + "num_input_tokens_seen": 6689408, + "step": 19870 + }, + { + "epoch": 15.35935085007728, + "grad_norm": 1.7254513502120972, + "learning_rate": 7.764005001123851e-06, + "loss": 0.5919, + "num_input_tokens_seen": 6691168, + "step": 19875 + }, + { + "epoch": 15.363214837712519, + "grad_norm": 1.1580407619476318, + "learning_rate": 7.751796624396876e-06, + "loss": 0.3717, + "num_input_tokens_seen": 6693152, + "step": 19880 + }, + { + "epoch": 15.36707882534776, + "grad_norm": 1.5190110206604004, + "learning_rate": 7.7395960922561e-06, + "loss": 0.4664, + "num_input_tokens_seen": 6694912, + "step": 19885 + }, + { + "epoch": 15.370942812982998, + "grad_norm": 1.413554072380066, + "learning_rate": 7.72740341025038e-06, + "loss": 0.4697, + "num_input_tokens_seen": 6696544, + "step": 19890 + }, + { + "epoch": 15.374806800618238, + "grad_norm": 2.120906114578247, + "learning_rate": 7.71521858392504e-06, + "loss": 0.4639, + "num_input_tokens_seen": 6698304, + "step": 19895 + }, + { + "epoch": 15.378670788253478, + "grad_norm": 0.9825869202613831, + "learning_rate": 7.703041618821805e-06, + "loss": 0.4183, + "num_input_tokens_seen": 6699744, + "step": 19900 + }, + { + "epoch": 15.382534775888717, + "grad_norm": 0.967065155506134, + "learning_rate": 7.690872520478825e-06, + "loss": 0.3451, + "num_input_tokens_seen": 6701344, + "step": 19905 + }, + { + "epoch": 15.386398763523957, + "grad_norm": 1.3602639436721802, + "learning_rate": 7.678711294430685e-06, + "loss": 0.66, + "num_input_tokens_seen": 6702944, + "step": 19910 + }, + { + "epoch": 15.390262751159197, + "grad_norm": 1.150402545928955, + "learning_rate": 7.666557946208375e-06, + "loss": 0.5346, + "num_input_tokens_seen": 6704608, + "step": 19915 + }, + { + "epoch": 15.394126738794435, + "grad_norm": 2.132601261138916, + "learning_rate": 7.654412481339324e-06, + "loss": 0.9155, + "num_input_tokens_seen": 6706432, + "step": 19920 + }, + { + "epoch": 15.397990726429676, + "grad_norm": 0.9678389430046082, + "learning_rate": 7.642274905347353e-06, + "loss": 0.3751, + "num_input_tokens_seen": 6707936, + "step": 19925 + }, + { + "epoch": 15.401854714064916, + "grad_norm": 0.8871216773986816, + "learning_rate": 7.630145223752699e-06, + "loss": 0.5347, + "num_input_tokens_seen": 6709408, + "step": 19930 + }, + { + "epoch": 15.405718701700154, + "grad_norm": 2.372087240219116, + "learning_rate": 7.618023442072031e-06, + "loss": 0.4549, + "num_input_tokens_seen": 6711072, + "step": 19935 + }, + { + "epoch": 15.409582689335394, + "grad_norm": 1.0116444826126099, + "learning_rate": 7.6059095658183975e-06, + "loss": 0.3935, + "num_input_tokens_seen": 6712800, + "step": 19940 + }, + { + "epoch": 15.413446676970633, + "grad_norm": 0.9258891940116882, + "learning_rate": 7.593803600501262e-06, + "loss": 0.4352, + "num_input_tokens_seen": 6714688, + "step": 19945 + }, + { + "epoch": 15.417310664605873, + "grad_norm": 0.8772342801094055, + "learning_rate": 7.581705551626489e-06, + "loss": 0.3798, + "num_input_tokens_seen": 6716448, + "step": 19950 + }, + { + "epoch": 15.421174652241113, + "grad_norm": 1.7110413312911987, + "learning_rate": 7.569615424696341e-06, + "loss": 0.7811, + "num_input_tokens_seen": 6717984, + "step": 19955 + }, + { + "epoch": 15.425038639876352, + "grad_norm": 1.3989678621292114, + "learning_rate": 7.55753322520949e-06, + "loss": 0.4297, + "num_input_tokens_seen": 6719712, + "step": 19960 + }, + { + "epoch": 15.428902627511592, + "grad_norm": 1.0085521936416626, + "learning_rate": 7.545458958660987e-06, + "loss": 0.4664, + "num_input_tokens_seen": 6721152, + "step": 19965 + }, + { + "epoch": 15.432766615146832, + "grad_norm": 1.200664758682251, + "learning_rate": 7.533392630542272e-06, + "loss": 0.4449, + "num_input_tokens_seen": 6722816, + "step": 19970 + }, + { + "epoch": 15.43663060278207, + "grad_norm": 1.0289428234100342, + "learning_rate": 7.521334246341202e-06, + "loss": 0.5452, + "num_input_tokens_seen": 6725056, + "step": 19975 + }, + { + "epoch": 15.44049459041731, + "grad_norm": 1.0661991834640503, + "learning_rate": 7.509283811541992e-06, + "loss": 0.3639, + "num_input_tokens_seen": 6726688, + "step": 19980 + }, + { + "epoch": 15.444358578052551, + "grad_norm": 1.2554715871810913, + "learning_rate": 7.497241331625252e-06, + "loss": 0.6762, + "num_input_tokens_seen": 6728384, + "step": 19985 + }, + { + "epoch": 15.44822256568779, + "grad_norm": 1.199162244796753, + "learning_rate": 7.4852068120679656e-06, + "loss": 0.3796, + "num_input_tokens_seen": 6730144, + "step": 19990 + }, + { + "epoch": 15.45208655332303, + "grad_norm": 0.8159550428390503, + "learning_rate": 7.473180258343521e-06, + "loss": 0.3922, + "num_input_tokens_seen": 6731840, + "step": 19995 + }, + { + "epoch": 15.45595054095827, + "grad_norm": 0.822553277015686, + "learning_rate": 7.46116167592166e-06, + "loss": 0.3352, + "num_input_tokens_seen": 6733312, + "step": 20000 + }, + { + "epoch": 15.459814528593508, + "grad_norm": 1.0011452436447144, + "learning_rate": 7.449151070268504e-06, + "loss": 0.4315, + "num_input_tokens_seen": 6734944, + "step": 20005 + }, + { + "epoch": 15.463678516228748, + "grad_norm": 0.7589259147644043, + "learning_rate": 7.43714844684654e-06, + "loss": 0.408, + "num_input_tokens_seen": 6736768, + "step": 20010 + }, + { + "epoch": 15.467542503863987, + "grad_norm": 0.8666070103645325, + "learning_rate": 7.425153811114652e-06, + "loss": 0.4954, + "num_input_tokens_seen": 6738496, + "step": 20015 + }, + { + "epoch": 15.471406491499227, + "grad_norm": 0.7783313989639282, + "learning_rate": 7.413167168528062e-06, + "loss": 0.4248, + "num_input_tokens_seen": 6740448, + "step": 20020 + }, + { + "epoch": 15.475270479134467, + "grad_norm": 1.0165104866027832, + "learning_rate": 7.4011885245383604e-06, + "loss": 0.4328, + "num_input_tokens_seen": 6742304, + "step": 20025 + }, + { + "epoch": 15.479134466769706, + "grad_norm": 1.9519741535186768, + "learning_rate": 7.389217884593519e-06, + "loss": 0.7713, + "num_input_tokens_seen": 6744000, + "step": 20030 + }, + { + "epoch": 15.482998454404946, + "grad_norm": 0.7920148372650146, + "learning_rate": 7.377255254137852e-06, + "loss": 0.4002, + "num_input_tokens_seen": 6745632, + "step": 20035 + }, + { + "epoch": 15.486862442040186, + "grad_norm": 1.0320329666137695, + "learning_rate": 7.3653006386120326e-06, + "loss": 0.3764, + "num_input_tokens_seen": 6747264, + "step": 20040 + }, + { + "epoch": 15.490726429675425, + "grad_norm": 1.0915716886520386, + "learning_rate": 7.353354043453092e-06, + "loss": 0.5065, + "num_input_tokens_seen": 6748736, + "step": 20045 + }, + { + "epoch": 15.494590417310665, + "grad_norm": 0.7949996590614319, + "learning_rate": 7.341415474094407e-06, + "loss": 0.3863, + "num_input_tokens_seen": 6750144, + "step": 20050 + }, + { + "epoch": 15.498454404945905, + "grad_norm": 1.3323826789855957, + "learning_rate": 7.329484935965728e-06, + "loss": 0.473, + "num_input_tokens_seen": 6751872, + "step": 20055 + }, + { + "epoch": 15.502318392581143, + "grad_norm": 2.0922129154205322, + "learning_rate": 7.317562434493114e-06, + "loss": 0.4903, + "num_input_tokens_seen": 6753760, + "step": 20060 + }, + { + "epoch": 15.506182380216384, + "grad_norm": 0.9453170299530029, + "learning_rate": 7.305647975099009e-06, + "loss": 0.4486, + "num_input_tokens_seen": 6755264, + "step": 20065 + }, + { + "epoch": 15.510046367851622, + "grad_norm": 0.6229506134986877, + "learning_rate": 7.293741563202172e-06, + "loss": 0.4268, + "num_input_tokens_seen": 6757152, + "step": 20070 + }, + { + "epoch": 15.513910355486862, + "grad_norm": 1.4595153331756592, + "learning_rate": 7.281843204217711e-06, + "loss": 0.4862, + "num_input_tokens_seen": 6758656, + "step": 20075 + }, + { + "epoch": 15.517774343122102, + "grad_norm": 1.0275111198425293, + "learning_rate": 7.2699529035570705e-06, + "loss": 0.5188, + "num_input_tokens_seen": 6760448, + "step": 20080 + }, + { + "epoch": 15.521638330757341, + "grad_norm": 0.8728389739990234, + "learning_rate": 7.258070666628031e-06, + "loss": 0.6128, + "num_input_tokens_seen": 6762208, + "step": 20085 + }, + { + "epoch": 15.525502318392581, + "grad_norm": 1.9572978019714355, + "learning_rate": 7.246196498834695e-06, + "loss": 0.5273, + "num_input_tokens_seen": 6763744, + "step": 20090 + }, + { + "epoch": 15.529366306027821, + "grad_norm": 0.9206822514533997, + "learning_rate": 7.234330405577516e-06, + "loss": 0.57, + "num_input_tokens_seen": 6765312, + "step": 20095 + }, + { + "epoch": 15.53323029366306, + "grad_norm": 0.8617801070213318, + "learning_rate": 7.2224723922532735e-06, + "loss": 0.4288, + "num_input_tokens_seen": 6766912, + "step": 20100 + }, + { + "epoch": 15.5370942812983, + "grad_norm": 1.0216401815414429, + "learning_rate": 7.210622464255049e-06, + "loss": 0.366, + "num_input_tokens_seen": 6768576, + "step": 20105 + }, + { + "epoch": 15.54095826893354, + "grad_norm": 0.8396607041358948, + "learning_rate": 7.198780626972265e-06, + "loss": 0.4202, + "num_input_tokens_seen": 6770400, + "step": 20110 + }, + { + "epoch": 15.544822256568779, + "grad_norm": 1.127238154411316, + "learning_rate": 7.18694688579066e-06, + "loss": 0.3524, + "num_input_tokens_seen": 6772000, + "step": 20115 + }, + { + "epoch": 15.548686244204019, + "grad_norm": 1.652673363685608, + "learning_rate": 7.17512124609229e-06, + "loss": 0.544, + "num_input_tokens_seen": 6773952, + "step": 20120 + }, + { + "epoch": 15.552550231839259, + "grad_norm": 1.3230972290039062, + "learning_rate": 7.163303713255515e-06, + "loss": 0.4129, + "num_input_tokens_seen": 6775712, + "step": 20125 + }, + { + "epoch": 15.556414219474497, + "grad_norm": 0.7698720693588257, + "learning_rate": 7.1514942926550335e-06, + "loss": 0.4931, + "num_input_tokens_seen": 6777056, + "step": 20130 + }, + { + "epoch": 15.560278207109738, + "grad_norm": 0.8341095447540283, + "learning_rate": 7.139692989661845e-06, + "loss": 0.424, + "num_input_tokens_seen": 6778592, + "step": 20135 + }, + { + "epoch": 15.564142194744976, + "grad_norm": 1.0750056505203247, + "learning_rate": 7.127899809643248e-06, + "loss": 0.3975, + "num_input_tokens_seen": 6780512, + "step": 20140 + }, + { + "epoch": 15.568006182380216, + "grad_norm": 0.7884015440940857, + "learning_rate": 7.1161147579628465e-06, + "loss": 0.4151, + "num_input_tokens_seen": 6782144, + "step": 20145 + }, + { + "epoch": 15.571870170015456, + "grad_norm": 1.4386061429977417, + "learning_rate": 7.10433783998056e-06, + "loss": 0.5448, + "num_input_tokens_seen": 6783744, + "step": 20150 + }, + { + "epoch": 15.575734157650695, + "grad_norm": 0.7986371517181396, + "learning_rate": 7.092569061052592e-06, + "loss": 0.3908, + "num_input_tokens_seen": 6785600, + "step": 20155 + }, + { + "epoch": 15.579598145285935, + "grad_norm": 0.8629196286201477, + "learning_rate": 7.080808426531455e-06, + "loss": 0.3796, + "num_input_tokens_seen": 6787264, + "step": 20160 + }, + { + "epoch": 15.583462132921175, + "grad_norm": 0.90826815366745, + "learning_rate": 7.069055941765962e-06, + "loss": 0.3989, + "num_input_tokens_seen": 6788704, + "step": 20165 + }, + { + "epoch": 15.587326120556414, + "grad_norm": 1.1250300407409668, + "learning_rate": 7.0573116121012056e-06, + "loss": 0.43, + "num_input_tokens_seen": 6790272, + "step": 20170 + }, + { + "epoch": 15.591190108191654, + "grad_norm": 0.8687810897827148, + "learning_rate": 7.0455754428785904e-06, + "loss": 0.6152, + "num_input_tokens_seen": 6792160, + "step": 20175 + }, + { + "epoch": 15.595054095826894, + "grad_norm": 1.3824326992034912, + "learning_rate": 7.033847439435789e-06, + "loss": 0.6846, + "num_input_tokens_seen": 6793888, + "step": 20180 + }, + { + "epoch": 15.598918083462133, + "grad_norm": 0.7930173873901367, + "learning_rate": 7.0221276071067685e-06, + "loss": 0.4964, + "num_input_tokens_seen": 6795616, + "step": 20185 + }, + { + "epoch": 15.602782071097373, + "grad_norm": 1.0547645092010498, + "learning_rate": 7.010415951221777e-06, + "loss": 0.3983, + "num_input_tokens_seen": 6797120, + "step": 20190 + }, + { + "epoch": 15.606646058732611, + "grad_norm": 0.919299304485321, + "learning_rate": 6.998712477107336e-06, + "loss": 0.7372, + "num_input_tokens_seen": 6798720, + "step": 20195 + }, + { + "epoch": 15.610510046367851, + "grad_norm": 1.7476730346679688, + "learning_rate": 6.9870171900862755e-06, + "loss": 0.7029, + "num_input_tokens_seen": 6800576, + "step": 20200 + }, + { + "epoch": 15.614374034003092, + "grad_norm": 0.9196231365203857, + "learning_rate": 6.975330095477673e-06, + "loss": 0.4193, + "num_input_tokens_seen": 6802496, + "step": 20205 + }, + { + "epoch": 15.61823802163833, + "grad_norm": 1.4470895528793335, + "learning_rate": 6.96365119859688e-06, + "loss": 0.4503, + "num_input_tokens_seen": 6804320, + "step": 20210 + }, + { + "epoch": 15.62210200927357, + "grad_norm": 1.0913792848587036, + "learning_rate": 6.951980504755545e-06, + "loss": 0.4086, + "num_input_tokens_seen": 6805952, + "step": 20215 + }, + { + "epoch": 15.62596599690881, + "grad_norm": 0.8974303603172302, + "learning_rate": 6.940318019261563e-06, + "loss": 0.3173, + "num_input_tokens_seen": 6807520, + "step": 20220 + }, + { + "epoch": 15.629829984544049, + "grad_norm": 1.1113022565841675, + "learning_rate": 6.928663747419098e-06, + "loss": 0.3374, + "num_input_tokens_seen": 6809088, + "step": 20225 + }, + { + "epoch": 15.63369397217929, + "grad_norm": 1.1957283020019531, + "learning_rate": 6.91701769452858e-06, + "loss": 0.3804, + "num_input_tokens_seen": 6810592, + "step": 20230 + }, + { + "epoch": 15.63755795981453, + "grad_norm": 0.6390475630760193, + "learning_rate": 6.905379865886718e-06, + "loss": 0.5538, + "num_input_tokens_seen": 6812160, + "step": 20235 + }, + { + "epoch": 15.641421947449768, + "grad_norm": 0.7514482140541077, + "learning_rate": 6.8937502667864555e-06, + "loss": 0.3703, + "num_input_tokens_seen": 6813856, + "step": 20240 + }, + { + "epoch": 15.645285935085008, + "grad_norm": 0.9797745943069458, + "learning_rate": 6.8821289025170075e-06, + "loss": 0.4188, + "num_input_tokens_seen": 6815456, + "step": 20245 + }, + { + "epoch": 15.649149922720248, + "grad_norm": 1.190314769744873, + "learning_rate": 6.8705157783638286e-06, + "loss": 0.5225, + "num_input_tokens_seen": 6817408, + "step": 20250 + }, + { + "epoch": 15.653013910355487, + "grad_norm": 0.6899247169494629, + "learning_rate": 6.858910899608656e-06, + "loss": 0.3636, + "num_input_tokens_seen": 6819008, + "step": 20255 + }, + { + "epoch": 15.656877897990727, + "grad_norm": 1.0316195487976074, + "learning_rate": 6.847314271529448e-06, + "loss": 0.4551, + "num_input_tokens_seen": 6820896, + "step": 20260 + }, + { + "epoch": 15.660741885625965, + "grad_norm": 0.7891581058502197, + "learning_rate": 6.835725899400417e-06, + "loss": 0.3849, + "num_input_tokens_seen": 6822432, + "step": 20265 + }, + { + "epoch": 15.664605873261205, + "grad_norm": 1.2226697206497192, + "learning_rate": 6.824145788492031e-06, + "loss": 0.4466, + "num_input_tokens_seen": 6824192, + "step": 20270 + }, + { + "epoch": 15.668469860896446, + "grad_norm": 0.995210587978363, + "learning_rate": 6.812573944070996e-06, + "loss": 0.6376, + "num_input_tokens_seen": 6825792, + "step": 20275 + }, + { + "epoch": 15.672333848531684, + "grad_norm": 1.2985414266586304, + "learning_rate": 6.801010371400249e-06, + "loss": 0.3657, + "num_input_tokens_seen": 6827776, + "step": 20280 + }, + { + "epoch": 15.676197836166924, + "grad_norm": 0.7942848205566406, + "learning_rate": 6.789455075738973e-06, + "loss": 0.4174, + "num_input_tokens_seen": 6829632, + "step": 20285 + }, + { + "epoch": 15.680061823802165, + "grad_norm": 0.7884257435798645, + "learning_rate": 6.777908062342583e-06, + "loss": 0.3164, + "num_input_tokens_seen": 6831296, + "step": 20290 + }, + { + "epoch": 15.683925811437403, + "grad_norm": 0.7939369678497314, + "learning_rate": 6.766369336462742e-06, + "loss": 0.3397, + "num_input_tokens_seen": 6833024, + "step": 20295 + }, + { + "epoch": 15.687789799072643, + "grad_norm": 1.241320252418518, + "learning_rate": 6.7548389033473135e-06, + "loss": 0.4879, + "num_input_tokens_seen": 6834624, + "step": 20300 + }, + { + "epoch": 15.691653786707883, + "grad_norm": 0.8322783708572388, + "learning_rate": 6.743316768240426e-06, + "loss": 0.4211, + "num_input_tokens_seen": 6836256, + "step": 20305 + }, + { + "epoch": 15.695517774343122, + "grad_norm": 0.6409168839454651, + "learning_rate": 6.731802936382408e-06, + "loss": 0.4017, + "num_input_tokens_seen": 6837728, + "step": 20310 + }, + { + "epoch": 15.699381761978362, + "grad_norm": 0.864135205745697, + "learning_rate": 6.7202974130098185e-06, + "loss": 0.3545, + "num_input_tokens_seen": 6839392, + "step": 20315 + }, + { + "epoch": 15.7032457496136, + "grad_norm": 1.1942708492279053, + "learning_rate": 6.708800203355436e-06, + "loss": 0.5059, + "num_input_tokens_seen": 6841024, + "step": 20320 + }, + { + "epoch": 15.70710973724884, + "grad_norm": 0.8857835531234741, + "learning_rate": 6.697311312648266e-06, + "loss": 0.3679, + "num_input_tokens_seen": 6842816, + "step": 20325 + }, + { + "epoch": 15.71097372488408, + "grad_norm": 0.907315194606781, + "learning_rate": 6.685830746113511e-06, + "loss": 0.4679, + "num_input_tokens_seen": 6844576, + "step": 20330 + }, + { + "epoch": 15.71483771251932, + "grad_norm": 0.49144911766052246, + "learning_rate": 6.674358508972614e-06, + "loss": 0.3318, + "num_input_tokens_seen": 6846208, + "step": 20335 + }, + { + "epoch": 15.71870170015456, + "grad_norm": 1.0744843482971191, + "learning_rate": 6.662894606443224e-06, + "loss": 0.3629, + "num_input_tokens_seen": 6848160, + "step": 20340 + }, + { + "epoch": 15.7225656877898, + "grad_norm": 1.4852099418640137, + "learning_rate": 6.65143904373918e-06, + "loss": 0.4358, + "num_input_tokens_seen": 6850048, + "step": 20345 + }, + { + "epoch": 15.726429675425038, + "grad_norm": 1.4977492094039917, + "learning_rate": 6.6399918260705466e-06, + "loss": 0.7723, + "num_input_tokens_seen": 6851744, + "step": 20350 + }, + { + "epoch": 15.730293663060278, + "grad_norm": 1.7677677869796753, + "learning_rate": 6.628552958643583e-06, + "loss": 0.466, + "num_input_tokens_seen": 6853280, + "step": 20355 + }, + { + "epoch": 15.734157650695519, + "grad_norm": 0.983131468296051, + "learning_rate": 6.617122446660756e-06, + "loss": 0.4033, + "num_input_tokens_seen": 6854848, + "step": 20360 + }, + { + "epoch": 15.738021638330757, + "grad_norm": 1.3267947435379028, + "learning_rate": 6.605700295320724e-06, + "loss": 0.4524, + "num_input_tokens_seen": 6856512, + "step": 20365 + }, + { + "epoch": 15.741885625965997, + "grad_norm": 0.8856111168861389, + "learning_rate": 6.594286509818359e-06, + "loss": 0.6652, + "num_input_tokens_seen": 6858336, + "step": 20370 + }, + { + "epoch": 15.745749613601237, + "grad_norm": 0.6350502967834473, + "learning_rate": 6.582881095344723e-06, + "loss": 0.3487, + "num_input_tokens_seen": 6859872, + "step": 20375 + }, + { + "epoch": 15.749613601236476, + "grad_norm": 1.5601087808609009, + "learning_rate": 6.57148405708706e-06, + "loss": 0.4378, + "num_input_tokens_seen": 6861632, + "step": 20380 + }, + { + "epoch": 15.753477588871716, + "grad_norm": 0.7390826940536499, + "learning_rate": 6.560095400228811e-06, + "loss": 0.3863, + "num_input_tokens_seen": 6863456, + "step": 20385 + }, + { + "epoch": 15.757341576506954, + "grad_norm": 1.16038978099823, + "learning_rate": 6.548715129949607e-06, + "loss": 0.5374, + "num_input_tokens_seen": 6864832, + "step": 20390 + }, + { + "epoch": 15.761205564142195, + "grad_norm": 0.7726519703865051, + "learning_rate": 6.537343251425263e-06, + "loss": 0.3735, + "num_input_tokens_seen": 6866496, + "step": 20395 + }, + { + "epoch": 15.765069551777435, + "grad_norm": 0.9513989686965942, + "learning_rate": 6.525979769827769e-06, + "loss": 0.3579, + "num_input_tokens_seen": 6868160, + "step": 20400 + }, + { + "epoch": 15.768933539412673, + "grad_norm": 1.3128849267959595, + "learning_rate": 6.514624690325319e-06, + "loss": 0.3717, + "num_input_tokens_seen": 6869824, + "step": 20405 + }, + { + "epoch": 15.772797527047913, + "grad_norm": 1.2133084535598755, + "learning_rate": 6.503278018082257e-06, + "loss": 0.6837, + "num_input_tokens_seen": 6871456, + "step": 20410 + }, + { + "epoch": 15.776661514683154, + "grad_norm": 1.161434531211853, + "learning_rate": 6.491939758259133e-06, + "loss": 0.4139, + "num_input_tokens_seen": 6873216, + "step": 20415 + }, + { + "epoch": 15.780525502318392, + "grad_norm": 0.9228918552398682, + "learning_rate": 6.480609916012647e-06, + "loss": 0.4468, + "num_input_tokens_seen": 6874912, + "step": 20420 + }, + { + "epoch": 15.784389489953632, + "grad_norm": 0.8053871989250183, + "learning_rate": 6.469288496495682e-06, + "loss": 0.3718, + "num_input_tokens_seen": 6876480, + "step": 20425 + }, + { + "epoch": 15.788253477588873, + "grad_norm": 1.303210973739624, + "learning_rate": 6.45797550485728e-06, + "loss": 0.7283, + "num_input_tokens_seen": 6878016, + "step": 20430 + }, + { + "epoch": 15.792117465224111, + "grad_norm": 1.193707823753357, + "learning_rate": 6.446670946242659e-06, + "loss": 0.4301, + "num_input_tokens_seen": 6879584, + "step": 20435 + }, + { + "epoch": 15.795981452859351, + "grad_norm": 1.041031002998352, + "learning_rate": 6.435374825793208e-06, + "loss": 0.3956, + "num_input_tokens_seen": 6881440, + "step": 20440 + }, + { + "epoch": 15.79984544049459, + "grad_norm": 0.8441275358200073, + "learning_rate": 6.424087148646468e-06, + "loss": 0.6115, + "num_input_tokens_seen": 6883104, + "step": 20445 + }, + { + "epoch": 15.80370942812983, + "grad_norm": 0.8499625325202942, + "learning_rate": 6.412807919936128e-06, + "loss": 0.3355, + "num_input_tokens_seen": 6884768, + "step": 20450 + }, + { + "epoch": 15.80757341576507, + "grad_norm": 1.1978169679641724, + "learning_rate": 6.401537144792072e-06, + "loss": 0.4774, + "num_input_tokens_seen": 6886432, + "step": 20455 + }, + { + "epoch": 15.811437403400308, + "grad_norm": 1.1674638986587524, + "learning_rate": 6.390274828340303e-06, + "loss": 0.5118, + "num_input_tokens_seen": 6888256, + "step": 20460 + }, + { + "epoch": 15.815301391035549, + "grad_norm": 1.0722906589508057, + "learning_rate": 6.37902097570299e-06, + "loss": 0.4681, + "num_input_tokens_seen": 6889952, + "step": 20465 + }, + { + "epoch": 15.819165378670789, + "grad_norm": 0.6896441578865051, + "learning_rate": 6.367775591998448e-06, + "loss": 0.4134, + "num_input_tokens_seen": 6891680, + "step": 20470 + }, + { + "epoch": 15.823029366306027, + "grad_norm": 0.8366831541061401, + "learning_rate": 6.3565386823411565e-06, + "loss": 0.4756, + "num_input_tokens_seen": 6893280, + "step": 20475 + }, + { + "epoch": 15.826893353941268, + "grad_norm": 0.7284594774246216, + "learning_rate": 6.345310251841727e-06, + "loss": 0.4384, + "num_input_tokens_seen": 6895136, + "step": 20480 + }, + { + "epoch": 15.830757341576508, + "grad_norm": 2.63962721824646, + "learning_rate": 6.33409030560691e-06, + "loss": 0.5339, + "num_input_tokens_seen": 6896640, + "step": 20485 + }, + { + "epoch": 15.834621329211746, + "grad_norm": 0.7187374234199524, + "learning_rate": 6.3228788487396025e-06, + "loss": 0.4218, + "num_input_tokens_seen": 6898112, + "step": 20490 + }, + { + "epoch": 15.838485316846986, + "grad_norm": 1.2576639652252197, + "learning_rate": 6.311675886338852e-06, + "loss": 0.4489, + "num_input_tokens_seen": 6899456, + "step": 20495 + }, + { + "epoch": 15.842349304482227, + "grad_norm": 0.965420126914978, + "learning_rate": 6.3004814234998326e-06, + "loss": 0.4213, + "num_input_tokens_seen": 6901088, + "step": 20500 + }, + { + "epoch": 15.846213292117465, + "grad_norm": 0.7931709289550781, + "learning_rate": 6.2892954653138384e-06, + "loss": 0.4503, + "num_input_tokens_seen": 6903136, + "step": 20505 + }, + { + "epoch": 15.850077279752705, + "grad_norm": 1.4070634841918945, + "learning_rate": 6.278118016868328e-06, + "loss": 0.4658, + "num_input_tokens_seen": 6904864, + "step": 20510 + }, + { + "epoch": 15.853941267387944, + "grad_norm": 0.7536591291427612, + "learning_rate": 6.266949083246867e-06, + "loss": 0.4129, + "num_input_tokens_seen": 6906720, + "step": 20515 + }, + { + "epoch": 15.857805255023184, + "grad_norm": 1.2389588356018066, + "learning_rate": 6.255788669529147e-06, + "loss": 0.5066, + "num_input_tokens_seen": 6908640, + "step": 20520 + }, + { + "epoch": 15.861669242658424, + "grad_norm": 0.7274506688117981, + "learning_rate": 6.2446367807909995e-06, + "loss": 0.3311, + "num_input_tokens_seen": 6910080, + "step": 20525 + }, + { + "epoch": 15.865533230293662, + "grad_norm": 0.9432801604270935, + "learning_rate": 6.233493422104356e-06, + "loss": 0.4728, + "num_input_tokens_seen": 6911648, + "step": 20530 + }, + { + "epoch": 15.869397217928903, + "grad_norm": 0.9632031917572021, + "learning_rate": 6.2223585985372974e-06, + "loss": 0.4491, + "num_input_tokens_seen": 6913376, + "step": 20535 + }, + { + "epoch": 15.873261205564143, + "grad_norm": 1.4408072233200073, + "learning_rate": 6.211232315153998e-06, + "loss": 0.5119, + "num_input_tokens_seen": 6915008, + "step": 20540 + }, + { + "epoch": 15.877125193199381, + "grad_norm": 0.6356920003890991, + "learning_rate": 6.2001145770147705e-06, + "loss": 0.4051, + "num_input_tokens_seen": 6916672, + "step": 20545 + }, + { + "epoch": 15.880989180834622, + "grad_norm": 1.032992959022522, + "learning_rate": 6.18900538917602e-06, + "loss": 0.3057, + "num_input_tokens_seen": 6918272, + "step": 20550 + }, + { + "epoch": 15.884853168469862, + "grad_norm": 0.7147422432899475, + "learning_rate": 6.177904756690276e-06, + "loss": 0.4155, + "num_input_tokens_seen": 6919840, + "step": 20555 + }, + { + "epoch": 15.8887171561051, + "grad_norm": 0.974606990814209, + "learning_rate": 6.166812684606165e-06, + "loss": 0.3639, + "num_input_tokens_seen": 6921888, + "step": 20560 + }, + { + "epoch": 15.89258114374034, + "grad_norm": 1.447662591934204, + "learning_rate": 6.155729177968436e-06, + "loss": 0.4473, + "num_input_tokens_seen": 6923744, + "step": 20565 + }, + { + "epoch": 15.896445131375579, + "grad_norm": 0.9365066885948181, + "learning_rate": 6.144654241817924e-06, + "loss": 0.513, + "num_input_tokens_seen": 6925280, + "step": 20570 + }, + { + "epoch": 15.900309119010819, + "grad_norm": 0.5748670697212219, + "learning_rate": 6.133587881191591e-06, + "loss": 0.3633, + "num_input_tokens_seen": 6926752, + "step": 20575 + }, + { + "epoch": 15.90417310664606, + "grad_norm": 1.2599077224731445, + "learning_rate": 6.122530101122464e-06, + "loss": 0.6656, + "num_input_tokens_seen": 6928800, + "step": 20580 + }, + { + "epoch": 15.908037094281298, + "grad_norm": 0.7263852953910828, + "learning_rate": 6.111480906639713e-06, + "loss": 0.6119, + "num_input_tokens_seen": 6930528, + "step": 20585 + }, + { + "epoch": 15.911901081916538, + "grad_norm": 0.7954482436180115, + "learning_rate": 6.100440302768562e-06, + "loss": 0.3689, + "num_input_tokens_seen": 6932064, + "step": 20590 + }, + { + "epoch": 15.915765069551778, + "grad_norm": 1.0504976511001587, + "learning_rate": 6.089408294530344e-06, + "loss": 0.4137, + "num_input_tokens_seen": 6933824, + "step": 20595 + }, + { + "epoch": 15.919629057187016, + "grad_norm": 1.5479108095169067, + "learning_rate": 6.078384886942487e-06, + "loss": 0.4184, + "num_input_tokens_seen": 6935584, + "step": 20600 + }, + { + "epoch": 15.923493044822257, + "grad_norm": 1.2542948722839355, + "learning_rate": 6.067370085018495e-06, + "loss": 0.3576, + "num_input_tokens_seen": 6937216, + "step": 20605 + }, + { + "epoch": 15.927357032457497, + "grad_norm": 1.5865787267684937, + "learning_rate": 6.056363893767975e-06, + "loss": 0.5892, + "num_input_tokens_seen": 6938816, + "step": 20610 + }, + { + "epoch": 15.931221020092735, + "grad_norm": 0.9517288208007812, + "learning_rate": 6.0453663181965995e-06, + "loss": 0.4152, + "num_input_tokens_seen": 6940448, + "step": 20615 + }, + { + "epoch": 15.935085007727976, + "grad_norm": 1.0416942834854126, + "learning_rate": 6.034377363306146e-06, + "loss": 0.3738, + "num_input_tokens_seen": 6942144, + "step": 20620 + }, + { + "epoch": 15.938948995363216, + "grad_norm": 0.8787986636161804, + "learning_rate": 6.0233970340944465e-06, + "loss": 0.5558, + "num_input_tokens_seen": 6943808, + "step": 20625 + }, + { + "epoch": 15.942812982998454, + "grad_norm": 1.13632071018219, + "learning_rate": 6.012425335555422e-06, + "loss": 0.4399, + "num_input_tokens_seen": 6945312, + "step": 20630 + }, + { + "epoch": 15.946676970633694, + "grad_norm": 1.3964589834213257, + "learning_rate": 6.0014622726790676e-06, + "loss": 0.3786, + "num_input_tokens_seen": 6946656, + "step": 20635 + }, + { + "epoch": 15.950540958268933, + "grad_norm": 0.8443779349327087, + "learning_rate": 5.990507850451443e-06, + "loss": 0.3127, + "num_input_tokens_seen": 6948256, + "step": 20640 + }, + { + "epoch": 15.954404945904173, + "grad_norm": 0.8286401033401489, + "learning_rate": 5.9795620738547e-06, + "loss": 0.4949, + "num_input_tokens_seen": 6950080, + "step": 20645 + }, + { + "epoch": 15.958268933539413, + "grad_norm": 1.0733803510665894, + "learning_rate": 5.9686249478670245e-06, + "loss": 0.3436, + "num_input_tokens_seen": 6951680, + "step": 20650 + }, + { + "epoch": 15.962132921174652, + "grad_norm": 0.8796066045761108, + "learning_rate": 5.957696477462704e-06, + "loss": 0.3342, + "num_input_tokens_seen": 6953568, + "step": 20655 + }, + { + "epoch": 15.965996908809892, + "grad_norm": 1.3512946367263794, + "learning_rate": 5.9467766676120666e-06, + "loss": 0.4756, + "num_input_tokens_seen": 6955328, + "step": 20660 + }, + { + "epoch": 15.969860896445132, + "grad_norm": 0.8262040615081787, + "learning_rate": 5.935865523281509e-06, + "loss": 0.4264, + "num_input_tokens_seen": 6956864, + "step": 20665 + }, + { + "epoch": 15.97372488408037, + "grad_norm": 0.8352743983268738, + "learning_rate": 5.924963049433477e-06, + "loss": 0.4204, + "num_input_tokens_seen": 6958752, + "step": 20670 + }, + { + "epoch": 15.97758887171561, + "grad_norm": 0.9608845710754395, + "learning_rate": 5.914069251026489e-06, + "loss": 0.4064, + "num_input_tokens_seen": 6960288, + "step": 20675 + }, + { + "epoch": 15.98145285935085, + "grad_norm": 1.8183549642562866, + "learning_rate": 5.9031841330151e-06, + "loss": 0.5304, + "num_input_tokens_seen": 6961696, + "step": 20680 + }, + { + "epoch": 15.98531684698609, + "grad_norm": 1.3601635694503784, + "learning_rate": 5.892307700349939e-06, + "loss": 0.5048, + "num_input_tokens_seen": 6963488, + "step": 20685 + }, + { + "epoch": 15.98918083462133, + "grad_norm": 0.8878825306892395, + "learning_rate": 5.881439957977661e-06, + "loss": 0.4806, + "num_input_tokens_seen": 6965248, + "step": 20690 + }, + { + "epoch": 15.993044822256568, + "grad_norm": 1.261020541191101, + "learning_rate": 5.870580910840995e-06, + "loss": 0.321, + "num_input_tokens_seen": 6967072, + "step": 20695 + }, + { + "epoch": 15.996908809891808, + "grad_norm": 1.6172759532928467, + "learning_rate": 5.85973056387869e-06, + "loss": 0.5419, + "num_input_tokens_seen": 6968672, + "step": 20700 + }, + { + "epoch": 16.0, + "eval_loss": 0.4456382393836975, + "eval_runtime": 6.2512, + "eval_samples_per_second": 91.982, + "eval_steps_per_second": 23.035, + "num_input_tokens_seen": 6969936, + "step": 20704 + }, + { + "epoch": 16.00077279752705, + "grad_norm": 1.9482969045639038, + "learning_rate": 5.848888922025553e-06, + "loss": 0.4909, + "num_input_tokens_seen": 6970192, + "step": 20705 + }, + { + "epoch": 16.004636785162287, + "grad_norm": 0.9816694855690002, + "learning_rate": 5.838055990212424e-06, + "loss": 0.3505, + "num_input_tokens_seen": 6971568, + "step": 20710 + }, + { + "epoch": 16.00850077279753, + "grad_norm": 0.9042841792106628, + "learning_rate": 5.8272317733661815e-06, + "loss": 0.4925, + "num_input_tokens_seen": 6973168, + "step": 20715 + }, + { + "epoch": 16.012364760432767, + "grad_norm": 1.0442454814910889, + "learning_rate": 5.816416276409756e-06, + "loss": 0.5692, + "num_input_tokens_seen": 6974896, + "step": 20720 + }, + { + "epoch": 16.016228748068006, + "grad_norm": 1.0432196855545044, + "learning_rate": 5.805609504262094e-06, + "loss": 0.3933, + "num_input_tokens_seen": 6976368, + "step": 20725 + }, + { + "epoch": 16.020092735703244, + "grad_norm": 0.8930319547653198, + "learning_rate": 5.794811461838173e-06, + "loss": 0.3599, + "num_input_tokens_seen": 6977808, + "step": 20730 + }, + { + "epoch": 16.023956723338486, + "grad_norm": 1.2242423295974731, + "learning_rate": 5.7840221540490234e-06, + "loss": 0.4481, + "num_input_tokens_seen": 6979472, + "step": 20735 + }, + { + "epoch": 16.027820710973725, + "grad_norm": 0.7464310526847839, + "learning_rate": 5.773241585801676e-06, + "loss": 0.3853, + "num_input_tokens_seen": 6981296, + "step": 20740 + }, + { + "epoch": 16.031684698608963, + "grad_norm": 0.994107723236084, + "learning_rate": 5.762469761999201e-06, + "loss": 0.4089, + "num_input_tokens_seen": 6982672, + "step": 20745 + }, + { + "epoch": 16.035548686244205, + "grad_norm": 2.7634878158569336, + "learning_rate": 5.751706687540679e-06, + "loss": 0.3703, + "num_input_tokens_seen": 6984240, + "step": 20750 + }, + { + "epoch": 16.039412673879443, + "grad_norm": 1.4560456275939941, + "learning_rate": 5.740952367321237e-06, + "loss": 0.3491, + "num_input_tokens_seen": 6985744, + "step": 20755 + }, + { + "epoch": 16.043276661514682, + "grad_norm": 0.6837807893753052, + "learning_rate": 5.7302068062319965e-06, + "loss": 0.4317, + "num_input_tokens_seen": 6987440, + "step": 20760 + }, + { + "epoch": 16.047140649149924, + "grad_norm": 1.3936007022857666, + "learning_rate": 5.719470009160102e-06, + "loss": 0.4434, + "num_input_tokens_seen": 6989104, + "step": 20765 + }, + { + "epoch": 16.051004636785162, + "grad_norm": 0.8334749341011047, + "learning_rate": 5.708741980988708e-06, + "loss": 0.5097, + "num_input_tokens_seen": 6990704, + "step": 20770 + }, + { + "epoch": 16.0548686244204, + "grad_norm": 0.7262420654296875, + "learning_rate": 5.698022726596996e-06, + "loss": 0.7069, + "num_input_tokens_seen": 6992432, + "step": 20775 + }, + { + "epoch": 16.058732612055643, + "grad_norm": 0.7518740892410278, + "learning_rate": 5.687312250860147e-06, + "loss": 0.3552, + "num_input_tokens_seen": 6993904, + "step": 20780 + }, + { + "epoch": 16.06259659969088, + "grad_norm": 1.3252332210540771, + "learning_rate": 5.676610558649337e-06, + "loss": 0.4092, + "num_input_tokens_seen": 6995632, + "step": 20785 + }, + { + "epoch": 16.06646058732612, + "grad_norm": 1.245721697807312, + "learning_rate": 5.665917654831773e-06, + "loss": 0.5183, + "num_input_tokens_seen": 6997392, + "step": 20790 + }, + { + "epoch": 16.07032457496136, + "grad_norm": 0.7158064842224121, + "learning_rate": 5.655233544270649e-06, + "loss": 0.563, + "num_input_tokens_seen": 6999088, + "step": 20795 + }, + { + "epoch": 16.0741885625966, + "grad_norm": 0.892929196357727, + "learning_rate": 5.644558231825162e-06, + "loss": 0.4967, + "num_input_tokens_seen": 7001040, + "step": 20800 + }, + { + "epoch": 16.07805255023184, + "grad_norm": 0.8718937635421753, + "learning_rate": 5.633891722350504e-06, + "loss": 0.4367, + "num_input_tokens_seen": 7002960, + "step": 20805 + }, + { + "epoch": 16.08191653786708, + "grad_norm": 1.1867109537124634, + "learning_rate": 5.623234020697868e-06, + "loss": 0.3682, + "num_input_tokens_seen": 7004848, + "step": 20810 + }, + { + "epoch": 16.08578052550232, + "grad_norm": 1.287893533706665, + "learning_rate": 5.612585131714437e-06, + "loss": 0.4471, + "num_input_tokens_seen": 7006448, + "step": 20815 + }, + { + "epoch": 16.089644513137557, + "grad_norm": 0.8129810094833374, + "learning_rate": 5.601945060243397e-06, + "loss": 0.4137, + "num_input_tokens_seen": 7008144, + "step": 20820 + }, + { + "epoch": 16.0935085007728, + "grad_norm": 0.7422745823860168, + "learning_rate": 5.591313811123919e-06, + "loss": 0.4607, + "num_input_tokens_seen": 7009904, + "step": 20825 + }, + { + "epoch": 16.097372488408038, + "grad_norm": 0.6965657472610474, + "learning_rate": 5.580691389191153e-06, + "loss": 0.3277, + "num_input_tokens_seen": 7011696, + "step": 20830 + }, + { + "epoch": 16.101236476043276, + "grad_norm": 0.6295106410980225, + "learning_rate": 5.570077799276241e-06, + "loss": 0.3861, + "num_input_tokens_seen": 7013424, + "step": 20835 + }, + { + "epoch": 16.105100463678518, + "grad_norm": 1.9079949855804443, + "learning_rate": 5.559473046206309e-06, + "loss": 0.4404, + "num_input_tokens_seen": 7015152, + "step": 20840 + }, + { + "epoch": 16.108964451313756, + "grad_norm": 1.6223899126052856, + "learning_rate": 5.548877134804459e-06, + "loss": 0.4701, + "num_input_tokens_seen": 7016784, + "step": 20845 + }, + { + "epoch": 16.112828438948995, + "grad_norm": 1.7788853645324707, + "learning_rate": 5.538290069889768e-06, + "loss": 0.4051, + "num_input_tokens_seen": 7018352, + "step": 20850 + }, + { + "epoch": 16.116692426584233, + "grad_norm": 0.9425213932991028, + "learning_rate": 5.527711856277307e-06, + "loss": 0.3971, + "num_input_tokens_seen": 7020176, + "step": 20855 + }, + { + "epoch": 16.120556414219475, + "grad_norm": 1.4694525003433228, + "learning_rate": 5.5171424987781165e-06, + "loss": 0.5032, + "num_input_tokens_seen": 7021968, + "step": 20860 + }, + { + "epoch": 16.124420401854714, + "grad_norm": 1.0900253057479858, + "learning_rate": 5.506582002199193e-06, + "loss": 0.3865, + "num_input_tokens_seen": 7023632, + "step": 20865 + }, + { + "epoch": 16.128284389489952, + "grad_norm": 1.2048457860946655, + "learning_rate": 5.496030371343519e-06, + "loss": 0.4017, + "num_input_tokens_seen": 7025424, + "step": 20870 + }, + { + "epoch": 16.132148377125194, + "grad_norm": 0.9871649146080017, + "learning_rate": 5.485487611010034e-06, + "loss": 0.3728, + "num_input_tokens_seen": 7026864, + "step": 20875 + }, + { + "epoch": 16.136012364760433, + "grad_norm": 1.491629958152771, + "learning_rate": 5.474953725993653e-06, + "loss": 0.6733, + "num_input_tokens_seen": 7028368, + "step": 20880 + }, + { + "epoch": 16.13987635239567, + "grad_norm": 1.1607404947280884, + "learning_rate": 5.46442872108524e-06, + "loss": 0.3561, + "num_input_tokens_seen": 7030000, + "step": 20885 + }, + { + "epoch": 16.143740340030913, + "grad_norm": 0.9300575852394104, + "learning_rate": 5.453912601071648e-06, + "loss": 0.4658, + "num_input_tokens_seen": 7031728, + "step": 20890 + }, + { + "epoch": 16.14760432766615, + "grad_norm": 1.0308364629745483, + "learning_rate": 5.443405370735655e-06, + "loss": 0.389, + "num_input_tokens_seen": 7033264, + "step": 20895 + }, + { + "epoch": 16.15146831530139, + "grad_norm": 2.3216395378112793, + "learning_rate": 5.432907034856024e-06, + "loss": 0.4903, + "num_input_tokens_seen": 7034992, + "step": 20900 + }, + { + "epoch": 16.155332302936632, + "grad_norm": 1.0752851963043213, + "learning_rate": 5.4224175982074575e-06, + "loss": 0.4247, + "num_input_tokens_seen": 7037040, + "step": 20905 + }, + { + "epoch": 16.15919629057187, + "grad_norm": 1.0780686140060425, + "learning_rate": 5.411937065560613e-06, + "loss": 0.4302, + "num_input_tokens_seen": 7038992, + "step": 20910 + }, + { + "epoch": 16.16306027820711, + "grad_norm": 0.8073505163192749, + "learning_rate": 5.401465441682099e-06, + "loss": 0.3726, + "num_input_tokens_seen": 7040880, + "step": 20915 + }, + { + "epoch": 16.16692426584235, + "grad_norm": 1.0599944591522217, + "learning_rate": 5.391002731334466e-06, + "loss": 0.417, + "num_input_tokens_seen": 7042640, + "step": 20920 + }, + { + "epoch": 16.17078825347759, + "grad_norm": 2.3060526847839355, + "learning_rate": 5.380548939276231e-06, + "loss": 0.6172, + "num_input_tokens_seen": 7044528, + "step": 20925 + }, + { + "epoch": 16.174652241112828, + "grad_norm": 0.8815882205963135, + "learning_rate": 5.370104070261836e-06, + "loss": 0.3863, + "num_input_tokens_seen": 7046256, + "step": 20930 + }, + { + "epoch": 16.17851622874807, + "grad_norm": 1.044198751449585, + "learning_rate": 5.359668129041662e-06, + "loss": 0.4433, + "num_input_tokens_seen": 7048016, + "step": 20935 + }, + { + "epoch": 16.182380216383308, + "grad_norm": 1.329807162284851, + "learning_rate": 5.34924112036205e-06, + "loss": 0.4765, + "num_input_tokens_seen": 7049776, + "step": 20940 + }, + { + "epoch": 16.186244204018546, + "grad_norm": 1.047234296798706, + "learning_rate": 5.338823048965261e-06, + "loss": 0.5574, + "num_input_tokens_seen": 7051600, + "step": 20945 + }, + { + "epoch": 16.19010819165379, + "grad_norm": 1.6829049587249756, + "learning_rate": 5.3284139195894924e-06, + "loss": 0.4792, + "num_input_tokens_seen": 7053680, + "step": 20950 + }, + { + "epoch": 16.193972179289027, + "grad_norm": 1.91993248462677, + "learning_rate": 5.318013736968877e-06, + "loss": 0.7749, + "num_input_tokens_seen": 7055440, + "step": 20955 + }, + { + "epoch": 16.197836166924265, + "grad_norm": 1.1349197626113892, + "learning_rate": 5.307622505833493e-06, + "loss": 0.3592, + "num_input_tokens_seen": 7057008, + "step": 20960 + }, + { + "epoch": 16.201700154559504, + "grad_norm": 0.8509416580200195, + "learning_rate": 5.297240230909326e-06, + "loss": 0.6008, + "num_input_tokens_seen": 7058768, + "step": 20965 + }, + { + "epoch": 16.205564142194746, + "grad_norm": 0.8482810854911804, + "learning_rate": 5.2868669169182955e-06, + "loss": 0.4465, + "num_input_tokens_seen": 7060464, + "step": 20970 + }, + { + "epoch": 16.209428129829984, + "grad_norm": 0.7648070454597473, + "learning_rate": 5.2765025685782425e-06, + "loss": 0.415, + "num_input_tokens_seen": 7062000, + "step": 20975 + }, + { + "epoch": 16.213292117465222, + "grad_norm": 1.146429419517517, + "learning_rate": 5.266147190602949e-06, + "loss": 0.4477, + "num_input_tokens_seen": 7063568, + "step": 20980 + }, + { + "epoch": 16.217156105100464, + "grad_norm": 0.9882820844650269, + "learning_rate": 5.255800787702095e-06, + "loss": 0.4964, + "num_input_tokens_seen": 7065520, + "step": 20985 + }, + { + "epoch": 16.221020092735703, + "grad_norm": 1.2167514562606812, + "learning_rate": 5.245463364581277e-06, + "loss": 0.501, + "num_input_tokens_seen": 7067216, + "step": 20990 + }, + { + "epoch": 16.22488408037094, + "grad_norm": 1.1155014038085938, + "learning_rate": 5.235134925942034e-06, + "loss": 0.4011, + "num_input_tokens_seen": 7068784, + "step": 20995 + }, + { + "epoch": 16.228748068006183, + "grad_norm": 1.0774484872817993, + "learning_rate": 5.2248154764817925e-06, + "loss": 0.3766, + "num_input_tokens_seen": 7070096, + "step": 21000 + }, + { + "epoch": 16.23261205564142, + "grad_norm": 1.2173144817352295, + "learning_rate": 5.214505020893903e-06, + "loss": 0.6554, + "num_input_tokens_seen": 7072016, + "step": 21005 + }, + { + "epoch": 16.23647604327666, + "grad_norm": 0.620244562625885, + "learning_rate": 5.204203563867619e-06, + "loss": 0.3316, + "num_input_tokens_seen": 7073584, + "step": 21010 + }, + { + "epoch": 16.240340030911902, + "grad_norm": 1.7188172340393066, + "learning_rate": 5.193911110088101e-06, + "loss": 0.4588, + "num_input_tokens_seen": 7075600, + "step": 21015 + }, + { + "epoch": 16.24420401854714, + "grad_norm": 1.326581358909607, + "learning_rate": 5.183627664236429e-06, + "loss": 0.5216, + "num_input_tokens_seen": 7077264, + "step": 21020 + }, + { + "epoch": 16.24806800618238, + "grad_norm": 1.8985950946807861, + "learning_rate": 5.173353230989567e-06, + "loss": 0.66, + "num_input_tokens_seen": 7079312, + "step": 21025 + }, + { + "epoch": 16.25193199381762, + "grad_norm": 0.9802998304367065, + "learning_rate": 5.163087815020398e-06, + "loss": 0.3292, + "num_input_tokens_seen": 7081072, + "step": 21030 + }, + { + "epoch": 16.25579598145286, + "grad_norm": 1.0888473987579346, + "learning_rate": 5.152831420997689e-06, + "loss": 0.4214, + "num_input_tokens_seen": 7082896, + "step": 21035 + }, + { + "epoch": 16.259659969088098, + "grad_norm": 0.8955757021903992, + "learning_rate": 5.1425840535861106e-06, + "loss": 0.4466, + "num_input_tokens_seen": 7084560, + "step": 21040 + }, + { + "epoch": 16.26352395672334, + "grad_norm": 0.8911628723144531, + "learning_rate": 5.132345717446227e-06, + "loss": 0.4093, + "num_input_tokens_seen": 7085936, + "step": 21045 + }, + { + "epoch": 16.26738794435858, + "grad_norm": 0.8940807580947876, + "learning_rate": 5.12211641723449e-06, + "loss": 0.523, + "num_input_tokens_seen": 7087536, + "step": 21050 + }, + { + "epoch": 16.271251931993817, + "grad_norm": 1.226826786994934, + "learning_rate": 5.111896157603246e-06, + "loss": 0.4654, + "num_input_tokens_seen": 7089360, + "step": 21055 + }, + { + "epoch": 16.27511591962906, + "grad_norm": 0.751864492893219, + "learning_rate": 5.101684943200735e-06, + "loss": 0.3581, + "num_input_tokens_seen": 7091312, + "step": 21060 + }, + { + "epoch": 16.278979907264297, + "grad_norm": 1.386484980583191, + "learning_rate": 5.091482778671086e-06, + "loss": 0.4595, + "num_input_tokens_seen": 7092880, + "step": 21065 + }, + { + "epoch": 16.282843894899536, + "grad_norm": 1.5253369808197021, + "learning_rate": 5.081289668654296e-06, + "loss": 0.4944, + "num_input_tokens_seen": 7094768, + "step": 21070 + }, + { + "epoch": 16.286707882534778, + "grad_norm": 0.8404417634010315, + "learning_rate": 5.071105617786251e-06, + "loss": 0.4356, + "num_input_tokens_seen": 7096176, + "step": 21075 + }, + { + "epoch": 16.290571870170016, + "grad_norm": 0.7593590021133423, + "learning_rate": 5.060930630698724e-06, + "loss": 0.331, + "num_input_tokens_seen": 7097552, + "step": 21080 + }, + { + "epoch": 16.294435857805254, + "grad_norm": 0.6707010269165039, + "learning_rate": 5.050764712019354e-06, + "loss": 0.6286, + "num_input_tokens_seen": 7099312, + "step": 21085 + }, + { + "epoch": 16.298299845440496, + "grad_norm": 0.713027834892273, + "learning_rate": 5.040607866371658e-06, + "loss": 0.3532, + "num_input_tokens_seen": 7100912, + "step": 21090 + }, + { + "epoch": 16.302163833075735, + "grad_norm": 0.7900995016098022, + "learning_rate": 5.030460098375037e-06, + "loss": 0.3584, + "num_input_tokens_seen": 7102320, + "step": 21095 + }, + { + "epoch": 16.306027820710973, + "grad_norm": 1.1353850364685059, + "learning_rate": 5.0203214126447625e-06, + "loss": 0.3617, + "num_input_tokens_seen": 7104144, + "step": 21100 + }, + { + "epoch": 16.30989180834621, + "grad_norm": 1.274515986442566, + "learning_rate": 5.010191813791962e-06, + "loss": 0.3741, + "num_input_tokens_seen": 7105904, + "step": 21105 + }, + { + "epoch": 16.313755795981454, + "grad_norm": 1.4876877069473267, + "learning_rate": 5.00007130642364e-06, + "loss": 0.4342, + "num_input_tokens_seen": 7107760, + "step": 21110 + }, + { + "epoch": 16.317619783616692, + "grad_norm": 0.8889727592468262, + "learning_rate": 4.989959895142663e-06, + "loss": 0.5823, + "num_input_tokens_seen": 7109136, + "step": 21115 + }, + { + "epoch": 16.32148377125193, + "grad_norm": 0.9930894374847412, + "learning_rate": 4.979857584547762e-06, + "loss": 0.6356, + "num_input_tokens_seen": 7110832, + "step": 21120 + }, + { + "epoch": 16.325347758887172, + "grad_norm": 1.3517951965332031, + "learning_rate": 4.969764379233518e-06, + "loss": 0.3834, + "num_input_tokens_seen": 7112784, + "step": 21125 + }, + { + "epoch": 16.32921174652241, + "grad_norm": 0.8644434809684753, + "learning_rate": 4.959680283790399e-06, + "loss": 0.385, + "num_input_tokens_seen": 7114448, + "step": 21130 + }, + { + "epoch": 16.33307573415765, + "grad_norm": 0.9017316102981567, + "learning_rate": 4.9496053028046965e-06, + "loss": 0.4848, + "num_input_tokens_seen": 7116304, + "step": 21135 + }, + { + "epoch": 16.33693972179289, + "grad_norm": 1.4070026874542236, + "learning_rate": 4.939539440858587e-06, + "loss": 0.4707, + "num_input_tokens_seen": 7117968, + "step": 21140 + }, + { + "epoch": 16.34080370942813, + "grad_norm": 0.9856851696968079, + "learning_rate": 4.929482702530078e-06, + "loss": 0.4967, + "num_input_tokens_seen": 7119984, + "step": 21145 + }, + { + "epoch": 16.344667697063368, + "grad_norm": 1.0887386798858643, + "learning_rate": 4.919435092393032e-06, + "loss": 0.4619, + "num_input_tokens_seen": 7121712, + "step": 21150 + }, + { + "epoch": 16.34853168469861, + "grad_norm": 0.8341226577758789, + "learning_rate": 4.909396615017164e-06, + "loss": 0.3938, + "num_input_tokens_seen": 7123344, + "step": 21155 + }, + { + "epoch": 16.35239567233385, + "grad_norm": 1.0242809057235718, + "learning_rate": 4.899367274968028e-06, + "loss": 0.3662, + "num_input_tokens_seen": 7125136, + "step": 21160 + }, + { + "epoch": 16.356259659969087, + "grad_norm": 1.2410309314727783, + "learning_rate": 4.889347076807038e-06, + "loss": 0.7163, + "num_input_tokens_seen": 7126896, + "step": 21165 + }, + { + "epoch": 16.36012364760433, + "grad_norm": 1.055590271949768, + "learning_rate": 4.879336025091435e-06, + "loss": 0.5302, + "num_input_tokens_seen": 7128784, + "step": 21170 + }, + { + "epoch": 16.363987635239567, + "grad_norm": 0.9632639288902283, + "learning_rate": 4.869334124374303e-06, + "loss": 0.4134, + "num_input_tokens_seen": 7130320, + "step": 21175 + }, + { + "epoch": 16.367851622874806, + "grad_norm": 0.8615253567695618, + "learning_rate": 4.859341379204571e-06, + "loss": 0.4884, + "num_input_tokens_seen": 7131856, + "step": 21180 + }, + { + "epoch": 16.371715610510048, + "grad_norm": 1.7437676191329956, + "learning_rate": 4.849357794126999e-06, + "loss": 0.7373, + "num_input_tokens_seen": 7133360, + "step": 21185 + }, + { + "epoch": 16.375579598145286, + "grad_norm": 0.7939372062683105, + "learning_rate": 4.8393833736821795e-06, + "loss": 0.4523, + "num_input_tokens_seen": 7134928, + "step": 21190 + }, + { + "epoch": 16.379443585780525, + "grad_norm": 0.7917546629905701, + "learning_rate": 4.8294181224065345e-06, + "loss": 0.4401, + "num_input_tokens_seen": 7136400, + "step": 21195 + }, + { + "epoch": 16.383307573415767, + "grad_norm": 0.842266321182251, + "learning_rate": 4.8194620448323294e-06, + "loss": 0.3156, + "num_input_tokens_seen": 7137936, + "step": 21200 + }, + { + "epoch": 16.387171561051005, + "grad_norm": 2.442293882369995, + "learning_rate": 4.809515145487642e-06, + "loss": 0.6559, + "num_input_tokens_seen": 7139632, + "step": 21205 + }, + { + "epoch": 16.391035548686244, + "grad_norm": 0.7992222309112549, + "learning_rate": 4.799577428896385e-06, + "loss": 0.4694, + "num_input_tokens_seen": 7141520, + "step": 21210 + }, + { + "epoch": 16.394899536321482, + "grad_norm": 0.9714305400848389, + "learning_rate": 4.789648899578278e-06, + "loss": 0.3793, + "num_input_tokens_seen": 7143216, + "step": 21215 + }, + { + "epoch": 16.398763523956724, + "grad_norm": 0.8824301958084106, + "learning_rate": 4.7797295620488954e-06, + "loss": 0.4402, + "num_input_tokens_seen": 7144624, + "step": 21220 + }, + { + "epoch": 16.402627511591962, + "grad_norm": 0.946717381477356, + "learning_rate": 4.7698194208196045e-06, + "loss": 0.4003, + "num_input_tokens_seen": 7146608, + "step": 21225 + }, + { + "epoch": 16.4064914992272, + "grad_norm": 1.1121662855148315, + "learning_rate": 4.759918480397585e-06, + "loss": 0.3602, + "num_input_tokens_seen": 7147984, + "step": 21230 + }, + { + "epoch": 16.410355486862443, + "grad_norm": 1.0465922355651855, + "learning_rate": 4.750026745285863e-06, + "loss": 0.4634, + "num_input_tokens_seen": 7149584, + "step": 21235 + }, + { + "epoch": 16.41421947449768, + "grad_norm": 1.068784475326538, + "learning_rate": 4.740144219983247e-06, + "loss": 0.4074, + "num_input_tokens_seen": 7151280, + "step": 21240 + }, + { + "epoch": 16.41808346213292, + "grad_norm": 0.9258303046226501, + "learning_rate": 4.7302709089843744e-06, + "loss": 0.5798, + "num_input_tokens_seen": 7152912, + "step": 21245 + }, + { + "epoch": 16.42194744976816, + "grad_norm": 0.8284452557563782, + "learning_rate": 4.720406816779679e-06, + "loss": 0.3611, + "num_input_tokens_seen": 7154256, + "step": 21250 + }, + { + "epoch": 16.4258114374034, + "grad_norm": 1.1494745016098022, + "learning_rate": 4.71055194785541e-06, + "loss": 0.3545, + "num_input_tokens_seen": 7155888, + "step": 21255 + }, + { + "epoch": 16.42967542503864, + "grad_norm": 0.8026381134986877, + "learning_rate": 4.700706306693628e-06, + "loss": 0.3991, + "num_input_tokens_seen": 7157424, + "step": 21260 + }, + { + "epoch": 16.43353941267388, + "grad_norm": 0.9038469791412354, + "learning_rate": 4.69086989777218e-06, + "loss": 0.3662, + "num_input_tokens_seen": 7159152, + "step": 21265 + }, + { + "epoch": 16.43740340030912, + "grad_norm": 1.6145529747009277, + "learning_rate": 4.681042725564735e-06, + "loss": 0.5589, + "num_input_tokens_seen": 7160976, + "step": 21270 + }, + { + "epoch": 16.441267387944357, + "grad_norm": 0.7096735239028931, + "learning_rate": 4.671224794540746e-06, + "loss": 0.4519, + "num_input_tokens_seen": 7163120, + "step": 21275 + }, + { + "epoch": 16.4451313755796, + "grad_norm": 0.7602511048316956, + "learning_rate": 4.661416109165462e-06, + "loss": 0.5052, + "num_input_tokens_seen": 7164848, + "step": 21280 + }, + { + "epoch": 16.448995363214838, + "grad_norm": 0.9985172748565674, + "learning_rate": 4.651616673899936e-06, + "loss": 0.3517, + "num_input_tokens_seen": 7166576, + "step": 21285 + }, + { + "epoch": 16.452859350850076, + "grad_norm": 1.1432689428329468, + "learning_rate": 4.641826493201007e-06, + "loss": 0.3675, + "num_input_tokens_seen": 7168112, + "step": 21290 + }, + { + "epoch": 16.456723338485318, + "grad_norm": 1.1651647090911865, + "learning_rate": 4.632045571521304e-06, + "loss": 0.6265, + "num_input_tokens_seen": 7169680, + "step": 21295 + }, + { + "epoch": 16.460587326120557, + "grad_norm": 0.710870087146759, + "learning_rate": 4.6222739133092605e-06, + "loss": 0.3541, + "num_input_tokens_seen": 7171312, + "step": 21300 + }, + { + "epoch": 16.464451313755795, + "grad_norm": 0.6575993299484253, + "learning_rate": 4.6125115230090724e-06, + "loss": 0.4194, + "num_input_tokens_seen": 7172912, + "step": 21305 + }, + { + "epoch": 16.468315301391037, + "grad_norm": 1.106665015220642, + "learning_rate": 4.602758405060745e-06, + "loss": 0.6613, + "num_input_tokens_seen": 7174608, + "step": 21310 + }, + { + "epoch": 16.472179289026275, + "grad_norm": 0.5727087259292603, + "learning_rate": 4.59301456390005e-06, + "loss": 0.4658, + "num_input_tokens_seen": 7176176, + "step": 21315 + }, + { + "epoch": 16.476043276661514, + "grad_norm": 1.0357130765914917, + "learning_rate": 4.583280003958546e-06, + "loss": 0.4083, + "num_input_tokens_seen": 7177872, + "step": 21320 + }, + { + "epoch": 16.479907264296756, + "grad_norm": 1.5137066841125488, + "learning_rate": 4.573554729663562e-06, + "loss": 0.477, + "num_input_tokens_seen": 7179600, + "step": 21325 + }, + { + "epoch": 16.483771251931994, + "grad_norm": 1.3938734531402588, + "learning_rate": 4.563838745438215e-06, + "loss": 0.5426, + "num_input_tokens_seen": 7181232, + "step": 21330 + }, + { + "epoch": 16.487635239567233, + "grad_norm": 1.357791543006897, + "learning_rate": 4.554132055701396e-06, + "loss": 0.4913, + "num_input_tokens_seen": 7182896, + "step": 21335 + }, + { + "epoch": 16.491499227202475, + "grad_norm": 0.9916489124298096, + "learning_rate": 4.544434664867761e-06, + "loss": 0.5445, + "num_input_tokens_seen": 7184496, + "step": 21340 + }, + { + "epoch": 16.495363214837713, + "grad_norm": 1.8149871826171875, + "learning_rate": 4.534746577347748e-06, + "loss": 0.4619, + "num_input_tokens_seen": 7186032, + "step": 21345 + }, + { + "epoch": 16.49922720247295, + "grad_norm": 1.0521920919418335, + "learning_rate": 4.525067797547553e-06, + "loss": 0.3453, + "num_input_tokens_seen": 7187760, + "step": 21350 + }, + { + "epoch": 16.50309119010819, + "grad_norm": 0.5833854675292969, + "learning_rate": 4.515398329869144e-06, + "loss": 0.457, + "num_input_tokens_seen": 7189520, + "step": 21355 + }, + { + "epoch": 16.506955177743432, + "grad_norm": 0.9878664612770081, + "learning_rate": 4.505738178710253e-06, + "loss": 0.4128, + "num_input_tokens_seen": 7191120, + "step": 21360 + }, + { + "epoch": 16.51081916537867, + "grad_norm": 1.0625340938568115, + "learning_rate": 4.496087348464365e-06, + "loss": 0.3457, + "num_input_tokens_seen": 7192816, + "step": 21365 + }, + { + "epoch": 16.51468315301391, + "grad_norm": 1.2318980693817139, + "learning_rate": 4.486445843520751e-06, + "loss": 0.3896, + "num_input_tokens_seen": 7194480, + "step": 21370 + }, + { + "epoch": 16.51854714064915, + "grad_norm": 2.1041624546051025, + "learning_rate": 4.4768136682644124e-06, + "loss": 0.5689, + "num_input_tokens_seen": 7196336, + "step": 21375 + }, + { + "epoch": 16.52241112828439, + "grad_norm": 0.583465039730072, + "learning_rate": 4.467190827076134e-06, + "loss": 0.4212, + "num_input_tokens_seen": 7197840, + "step": 21380 + }, + { + "epoch": 16.526275115919628, + "grad_norm": 0.898486316204071, + "learning_rate": 4.457577324332432e-06, + "loss": 0.6061, + "num_input_tokens_seen": 7199376, + "step": 21385 + }, + { + "epoch": 16.53013910355487, + "grad_norm": 0.6913543343544006, + "learning_rate": 4.447973164405586e-06, + "loss": 0.5318, + "num_input_tokens_seen": 7200976, + "step": 21390 + }, + { + "epoch": 16.534003091190108, + "grad_norm": 1.5172858238220215, + "learning_rate": 4.438378351663627e-06, + "loss": 0.604, + "num_input_tokens_seen": 7202576, + "step": 21395 + }, + { + "epoch": 16.537867078825347, + "grad_norm": 0.876370906829834, + "learning_rate": 4.428792890470332e-06, + "loss": 0.4291, + "num_input_tokens_seen": 7204112, + "step": 21400 + }, + { + "epoch": 16.54173106646059, + "grad_norm": 0.7177854180335999, + "learning_rate": 4.419216785185221e-06, + "loss": 0.3412, + "num_input_tokens_seen": 7205648, + "step": 21405 + }, + { + "epoch": 16.545595054095827, + "grad_norm": 0.9952713847160339, + "learning_rate": 4.4096500401635734e-06, + "loss": 0.4762, + "num_input_tokens_seen": 7207216, + "step": 21410 + }, + { + "epoch": 16.549459041731065, + "grad_norm": 0.7078539729118347, + "learning_rate": 4.400092659756397e-06, + "loss": 0.4597, + "num_input_tokens_seen": 7208880, + "step": 21415 + }, + { + "epoch": 16.553323029366307, + "grad_norm": 1.1234594583511353, + "learning_rate": 4.390544648310449e-06, + "loss": 0.5982, + "num_input_tokens_seen": 7210480, + "step": 21420 + }, + { + "epoch": 16.557187017001546, + "grad_norm": 1.2430055141448975, + "learning_rate": 4.38100601016822e-06, + "loss": 0.4044, + "num_input_tokens_seen": 7212304, + "step": 21425 + }, + { + "epoch": 16.561051004636784, + "grad_norm": 1.622780203819275, + "learning_rate": 4.371476749667941e-06, + "loss": 0.6427, + "num_input_tokens_seen": 7213712, + "step": 21430 + }, + { + "epoch": 16.564914992272026, + "grad_norm": 1.1926122903823853, + "learning_rate": 4.361956871143577e-06, + "loss": 0.4588, + "num_input_tokens_seen": 7215408, + "step": 21435 + }, + { + "epoch": 16.568778979907265, + "grad_norm": 1.5380171537399292, + "learning_rate": 4.352446378924818e-06, + "loss": 0.5192, + "num_input_tokens_seen": 7216912, + "step": 21440 + }, + { + "epoch": 16.572642967542503, + "grad_norm": 0.8744388222694397, + "learning_rate": 4.342945277337104e-06, + "loss": 0.5764, + "num_input_tokens_seen": 7218832, + "step": 21445 + }, + { + "epoch": 16.576506955177745, + "grad_norm": 0.7078390717506409, + "learning_rate": 4.333453570701587e-06, + "loss": 0.3674, + "num_input_tokens_seen": 7220304, + "step": 21450 + }, + { + "epoch": 16.580370942812984, + "grad_norm": 1.5411580801010132, + "learning_rate": 4.32397126333515e-06, + "loss": 0.426, + "num_input_tokens_seen": 7221840, + "step": 21455 + }, + { + "epoch": 16.584234930448222, + "grad_norm": 1.2212163209915161, + "learning_rate": 4.314498359550412e-06, + "loss": 0.3709, + "num_input_tokens_seen": 7223408, + "step": 21460 + }, + { + "epoch": 16.58809891808346, + "grad_norm": 0.874761164188385, + "learning_rate": 4.3050348636556994e-06, + "loss": 0.521, + "num_input_tokens_seen": 7225520, + "step": 21465 + }, + { + "epoch": 16.591962905718702, + "grad_norm": 1.2480387687683105, + "learning_rate": 4.295580779955066e-06, + "loss": 0.3768, + "num_input_tokens_seen": 7227504, + "step": 21470 + }, + { + "epoch": 16.59582689335394, + "grad_norm": 0.6545411348342896, + "learning_rate": 4.286136112748285e-06, + "loss": 0.3169, + "num_input_tokens_seen": 7229232, + "step": 21475 + }, + { + "epoch": 16.59969088098918, + "grad_norm": 1.5878156423568726, + "learning_rate": 4.276700866330854e-06, + "loss": 0.6232, + "num_input_tokens_seen": 7230768, + "step": 21480 + }, + { + "epoch": 16.60355486862442, + "grad_norm": 1.1139289140701294, + "learning_rate": 4.267275044993979e-06, + "loss": 0.4723, + "num_input_tokens_seen": 7232656, + "step": 21485 + }, + { + "epoch": 16.60741885625966, + "grad_norm": 1.6868337392807007, + "learning_rate": 4.257858653024577e-06, + "loss": 0.7142, + "num_input_tokens_seen": 7234384, + "step": 21490 + }, + { + "epoch": 16.611282843894898, + "grad_norm": 0.8314191102981567, + "learning_rate": 4.248451694705271e-06, + "loss": 0.3285, + "num_input_tokens_seen": 7235984, + "step": 21495 + }, + { + "epoch": 16.61514683153014, + "grad_norm": 0.9028626084327698, + "learning_rate": 4.239054174314417e-06, + "loss": 0.4199, + "num_input_tokens_seen": 7237680, + "step": 21500 + }, + { + "epoch": 16.61901081916538, + "grad_norm": 1.172930359840393, + "learning_rate": 4.229666096126056e-06, + "loss": 0.4433, + "num_input_tokens_seen": 7239568, + "step": 21505 + }, + { + "epoch": 16.622874806800617, + "grad_norm": 0.6822320818901062, + "learning_rate": 4.220287464409939e-06, + "loss": 0.63, + "num_input_tokens_seen": 7241264, + "step": 21510 + }, + { + "epoch": 16.62673879443586, + "grad_norm": 0.704092800617218, + "learning_rate": 4.210918283431534e-06, + "loss": 0.4059, + "num_input_tokens_seen": 7243056, + "step": 21515 + }, + { + "epoch": 16.630602782071097, + "grad_norm": 1.5867959260940552, + "learning_rate": 4.201558557451993e-06, + "loss": 0.3874, + "num_input_tokens_seen": 7244592, + "step": 21520 + }, + { + "epoch": 16.634466769706336, + "grad_norm": 1.8626105785369873, + "learning_rate": 4.192208290728178e-06, + "loss": 0.3857, + "num_input_tokens_seen": 7246256, + "step": 21525 + }, + { + "epoch": 16.638330757341578, + "grad_norm": 2.1486525535583496, + "learning_rate": 4.182867487512645e-06, + "loss": 0.4619, + "num_input_tokens_seen": 7248016, + "step": 21530 + }, + { + "epoch": 16.642194744976816, + "grad_norm": 0.9108997583389282, + "learning_rate": 4.173536152053642e-06, + "loss": 0.4227, + "num_input_tokens_seen": 7249680, + "step": 21535 + }, + { + "epoch": 16.646058732612055, + "grad_norm": 0.7319694757461548, + "learning_rate": 4.164214288595128e-06, + "loss": 0.3829, + "num_input_tokens_seen": 7251248, + "step": 21540 + }, + { + "epoch": 16.649922720247297, + "grad_norm": 0.8625435829162598, + "learning_rate": 4.154901901376729e-06, + "loss": 0.4136, + "num_input_tokens_seen": 7253072, + "step": 21545 + }, + { + "epoch": 16.653786707882535, + "grad_norm": 0.9442502856254578, + "learning_rate": 4.14559899463379e-06, + "loss": 0.5242, + "num_input_tokens_seen": 7254640, + "step": 21550 + }, + { + "epoch": 16.657650695517773, + "grad_norm": 1.062147617340088, + "learning_rate": 4.136305572597318e-06, + "loss": 0.4257, + "num_input_tokens_seen": 7256464, + "step": 21555 + }, + { + "epoch": 16.661514683153015, + "grad_norm": 1.1697014570236206, + "learning_rate": 4.127021639494022e-06, + "loss": 0.4179, + "num_input_tokens_seen": 7258192, + "step": 21560 + }, + { + "epoch": 16.665378670788254, + "grad_norm": 0.7938528060913086, + "learning_rate": 4.117747199546285e-06, + "loss": 0.3848, + "num_input_tokens_seen": 7259728, + "step": 21565 + }, + { + "epoch": 16.669242658423492, + "grad_norm": 0.9116014242172241, + "learning_rate": 4.108482256972182e-06, + "loss": 0.3998, + "num_input_tokens_seen": 7261680, + "step": 21570 + }, + { + "epoch": 16.673106646058734, + "grad_norm": 0.8527852892875671, + "learning_rate": 4.099226815985458e-06, + "loss": 0.6086, + "num_input_tokens_seen": 7263440, + "step": 21575 + }, + { + "epoch": 16.676970633693973, + "grad_norm": 0.8077791333198547, + "learning_rate": 4.089980880795543e-06, + "loss": 0.3584, + "num_input_tokens_seen": 7264976, + "step": 21580 + }, + { + "epoch": 16.68083462132921, + "grad_norm": 1.2751842737197876, + "learning_rate": 4.08074445560756e-06, + "loss": 0.4328, + "num_input_tokens_seen": 7266608, + "step": 21585 + }, + { + "epoch": 16.684698608964453, + "grad_norm": 0.642117977142334, + "learning_rate": 4.071517544622278e-06, + "loss": 0.3407, + "num_input_tokens_seen": 7268304, + "step": 21590 + }, + { + "epoch": 16.68856259659969, + "grad_norm": 0.8153823018074036, + "learning_rate": 4.0623001520361494e-06, + "loss": 0.5094, + "num_input_tokens_seen": 7270128, + "step": 21595 + }, + { + "epoch": 16.69242658423493, + "grad_norm": 1.2434595823287964, + "learning_rate": 4.053092282041307e-06, + "loss": 0.5128, + "num_input_tokens_seen": 7271888, + "step": 21600 + }, + { + "epoch": 16.69629057187017, + "grad_norm": 1.4376749992370605, + "learning_rate": 4.043893938825538e-06, + "loss": 0.4104, + "num_input_tokens_seen": 7273456, + "step": 21605 + }, + { + "epoch": 16.70015455950541, + "grad_norm": 1.9159314632415771, + "learning_rate": 4.034705126572299e-06, + "loss": 0.3653, + "num_input_tokens_seen": 7275120, + "step": 21610 + }, + { + "epoch": 16.70401854714065, + "grad_norm": 1.6465415954589844, + "learning_rate": 4.025525849460729e-06, + "loss": 0.6028, + "num_input_tokens_seen": 7277040, + "step": 21615 + }, + { + "epoch": 16.707882534775887, + "grad_norm": 2.100062608718872, + "learning_rate": 4.016356111665617e-06, + "loss": 0.4567, + "num_input_tokens_seen": 7278768, + "step": 21620 + }, + { + "epoch": 16.71174652241113, + "grad_norm": 1.714015245437622, + "learning_rate": 4.007195917357412e-06, + "loss": 0.5255, + "num_input_tokens_seen": 7280176, + "step": 21625 + }, + { + "epoch": 16.715610510046368, + "grad_norm": 0.858583927154541, + "learning_rate": 3.998045270702227e-06, + "loss": 0.4495, + "num_input_tokens_seen": 7281744, + "step": 21630 + }, + { + "epoch": 16.719474497681606, + "grad_norm": 0.8245298862457275, + "learning_rate": 3.988904175861827e-06, + "loss": 0.3232, + "num_input_tokens_seen": 7283248, + "step": 21635 + }, + { + "epoch": 16.723338485316848, + "grad_norm": 1.1458126306533813, + "learning_rate": 3.979772636993636e-06, + "loss": 0.5352, + "num_input_tokens_seen": 7285008, + "step": 21640 + }, + { + "epoch": 16.727202472952087, + "grad_norm": 0.6978813409805298, + "learning_rate": 3.970650658250732e-06, + "loss": 0.3537, + "num_input_tokens_seen": 7286800, + "step": 21645 + }, + { + "epoch": 16.731066460587325, + "grad_norm": 1.3331215381622314, + "learning_rate": 3.961538243781854e-06, + "loss": 0.3784, + "num_input_tokens_seen": 7288592, + "step": 21650 + }, + { + "epoch": 16.734930448222567, + "grad_norm": 0.7797898054122925, + "learning_rate": 3.9524353977313715e-06, + "loss": 0.5653, + "num_input_tokens_seen": 7290288, + "step": 21655 + }, + { + "epoch": 16.738794435857805, + "grad_norm": 0.8978723883628845, + "learning_rate": 3.943342124239324e-06, + "loss": 0.3941, + "num_input_tokens_seen": 7291760, + "step": 21660 + }, + { + "epoch": 16.742658423493044, + "grad_norm": 1.2752972841262817, + "learning_rate": 3.934258427441381e-06, + "loss": 0.6095, + "num_input_tokens_seen": 7293392, + "step": 21665 + }, + { + "epoch": 16.746522411128286, + "grad_norm": 0.9432182908058167, + "learning_rate": 3.925184311468865e-06, + "loss": 0.4557, + "num_input_tokens_seen": 7294832, + "step": 21670 + }, + { + "epoch": 16.750386398763524, + "grad_norm": 0.937378466129303, + "learning_rate": 3.916119780448735e-06, + "loss": 0.6541, + "num_input_tokens_seen": 7296368, + "step": 21675 + }, + { + "epoch": 16.754250386398763, + "grad_norm": 2.1365854740142822, + "learning_rate": 3.907064838503591e-06, + "loss": 0.519, + "num_input_tokens_seen": 7297968, + "step": 21680 + }, + { + "epoch": 16.758114374034005, + "grad_norm": 0.9801201820373535, + "learning_rate": 3.898019489751684e-06, + "loss": 0.4595, + "num_input_tokens_seen": 7299696, + "step": 21685 + }, + { + "epoch": 16.761978361669243, + "grad_norm": 1.0859501361846924, + "learning_rate": 3.8889837383068864e-06, + "loss": 0.4471, + "num_input_tokens_seen": 7301552, + "step": 21690 + }, + { + "epoch": 16.76584234930448, + "grad_norm": 1.4675029516220093, + "learning_rate": 3.879957588278707e-06, + "loss": 0.4844, + "num_input_tokens_seen": 7303376, + "step": 21695 + }, + { + "epoch": 16.769706336939723, + "grad_norm": 1.7275809049606323, + "learning_rate": 3.870941043772308e-06, + "loss": 0.6296, + "num_input_tokens_seen": 7304976, + "step": 21700 + }, + { + "epoch": 16.773570324574962, + "grad_norm": 0.9393656849861145, + "learning_rate": 3.8619341088884595e-06, + "loss": 0.3773, + "num_input_tokens_seen": 7306608, + "step": 21705 + }, + { + "epoch": 16.7774343122102, + "grad_norm": 0.5222443342208862, + "learning_rate": 3.852936787723568e-06, + "loss": 0.3507, + "num_input_tokens_seen": 7308304, + "step": 21710 + }, + { + "epoch": 16.78129829984544, + "grad_norm": 1.2508381605148315, + "learning_rate": 3.843949084369663e-06, + "loss": 0.4443, + "num_input_tokens_seen": 7309936, + "step": 21715 + }, + { + "epoch": 16.78516228748068, + "grad_norm": 0.8647729754447937, + "learning_rate": 3.83497100291442e-06, + "loss": 0.4244, + "num_input_tokens_seen": 7311472, + "step": 21720 + }, + { + "epoch": 16.78902627511592, + "grad_norm": 0.5969973206520081, + "learning_rate": 3.826002547441118e-06, + "loss": 0.3465, + "num_input_tokens_seen": 7313168, + "step": 21725 + }, + { + "epoch": 16.792890262751158, + "grad_norm": 0.757500946521759, + "learning_rate": 3.817043722028663e-06, + "loss": 0.3603, + "num_input_tokens_seen": 7314640, + "step": 21730 + }, + { + "epoch": 16.7967542503864, + "grad_norm": 1.8316437005996704, + "learning_rate": 3.808094530751577e-06, + "loss": 0.5129, + "num_input_tokens_seen": 7316208, + "step": 21735 + }, + { + "epoch": 16.800618238021638, + "grad_norm": 0.8329024314880371, + "learning_rate": 3.7991549776800197e-06, + "loss": 0.3827, + "num_input_tokens_seen": 7317872, + "step": 21740 + }, + { + "epoch": 16.804482225656876, + "grad_norm": 1.0698034763336182, + "learning_rate": 3.7902250668797435e-06, + "loss": 0.4954, + "num_input_tokens_seen": 7319728, + "step": 21745 + }, + { + "epoch": 16.80834621329212, + "grad_norm": 1.2088043689727783, + "learning_rate": 3.7813048024121196e-06, + "loss": 0.3814, + "num_input_tokens_seen": 7321456, + "step": 21750 + }, + { + "epoch": 16.812210200927357, + "grad_norm": 1.3958659172058105, + "learning_rate": 3.7723941883341526e-06, + "loss": 0.3854, + "num_input_tokens_seen": 7323056, + "step": 21755 + }, + { + "epoch": 16.816074188562595, + "grad_norm": 0.8333188891410828, + "learning_rate": 3.7634932286984363e-06, + "loss": 0.3988, + "num_input_tokens_seen": 7324560, + "step": 21760 + }, + { + "epoch": 16.819938176197837, + "grad_norm": 1.207977533340454, + "learning_rate": 3.7546019275531806e-06, + "loss": 0.4389, + "num_input_tokens_seen": 7326096, + "step": 21765 + }, + { + "epoch": 16.823802163833076, + "grad_norm": 0.8682445287704468, + "learning_rate": 3.7457202889422004e-06, + "loss": 0.4214, + "num_input_tokens_seen": 7327664, + "step": 21770 + }, + { + "epoch": 16.827666151468314, + "grad_norm": 1.728540062904358, + "learning_rate": 3.736848316904923e-06, + "loss": 0.3944, + "num_input_tokens_seen": 7329456, + "step": 21775 + }, + { + "epoch": 16.831530139103556, + "grad_norm": 1.4719874858856201, + "learning_rate": 3.727986015476362e-06, + "loss": 0.4854, + "num_input_tokens_seen": 7331024, + "step": 21780 + }, + { + "epoch": 16.835394126738795, + "grad_norm": 1.0506550073623657, + "learning_rate": 3.7191333886871543e-06, + "loss": 0.4676, + "num_input_tokens_seen": 7332752, + "step": 21785 + }, + { + "epoch": 16.839258114374033, + "grad_norm": 0.8378403186798096, + "learning_rate": 3.710290440563535e-06, + "loss": 0.477, + "num_input_tokens_seen": 7334576, + "step": 21790 + }, + { + "epoch": 16.843122102009275, + "grad_norm": 0.7384207248687744, + "learning_rate": 3.7014571751273207e-06, + "loss": 0.4574, + "num_input_tokens_seen": 7336464, + "step": 21795 + }, + { + "epoch": 16.846986089644513, + "grad_norm": 1.084450125694275, + "learning_rate": 3.692633596395936e-06, + "loss": 0.4376, + "num_input_tokens_seen": 7338000, + "step": 21800 + }, + { + "epoch": 16.850850077279752, + "grad_norm": 1.016628384590149, + "learning_rate": 3.6838197083823965e-06, + "loss": 0.5168, + "num_input_tokens_seen": 7339888, + "step": 21805 + }, + { + "epoch": 16.854714064914994, + "grad_norm": 0.9261325597763062, + "learning_rate": 3.675015515095312e-06, + "loss": 0.4795, + "num_input_tokens_seen": 7341520, + "step": 21810 + }, + { + "epoch": 16.858578052550232, + "grad_norm": 0.899547815322876, + "learning_rate": 3.6662210205388766e-06, + "loss": 0.3884, + "num_input_tokens_seen": 7343216, + "step": 21815 + }, + { + "epoch": 16.86244204018547, + "grad_norm": 1.0074747800827026, + "learning_rate": 3.657436228712882e-06, + "loss": 0.3993, + "num_input_tokens_seen": 7344848, + "step": 21820 + }, + { + "epoch": 16.866306027820713, + "grad_norm": 0.8787218928337097, + "learning_rate": 3.648661143612711e-06, + "loss": 0.4632, + "num_input_tokens_seen": 7346736, + "step": 21825 + }, + { + "epoch": 16.87017001545595, + "grad_norm": 1.6872434616088867, + "learning_rate": 3.6398957692293205e-06, + "loss": 0.4331, + "num_input_tokens_seen": 7348464, + "step": 21830 + }, + { + "epoch": 16.87403400309119, + "grad_norm": 0.847767174243927, + "learning_rate": 3.631140109549258e-06, + "loss": 0.5605, + "num_input_tokens_seen": 7350320, + "step": 21835 + }, + { + "epoch": 16.87789799072643, + "grad_norm": 1.062722086906433, + "learning_rate": 3.622394168554644e-06, + "loss": 0.4589, + "num_input_tokens_seen": 7352176, + "step": 21840 + }, + { + "epoch": 16.88176197836167, + "grad_norm": 0.6494854688644409, + "learning_rate": 3.613657950223187e-06, + "loss": 0.552, + "num_input_tokens_seen": 7354000, + "step": 21845 + }, + { + "epoch": 16.88562596599691, + "grad_norm": 1.3545957803726196, + "learning_rate": 3.6049314585281686e-06, + "loss": 0.5441, + "num_input_tokens_seen": 7355760, + "step": 21850 + }, + { + "epoch": 16.889489953632147, + "grad_norm": 0.8064526319503784, + "learning_rate": 3.5962146974384575e-06, + "loss": 0.3709, + "num_input_tokens_seen": 7357328, + "step": 21855 + }, + { + "epoch": 16.89335394126739, + "grad_norm": 0.4999433159828186, + "learning_rate": 3.5875076709184773e-06, + "loss": 0.4206, + "num_input_tokens_seen": 7358864, + "step": 21860 + }, + { + "epoch": 16.897217928902627, + "grad_norm": 0.8278981447219849, + "learning_rate": 3.578810382928249e-06, + "loss": 0.4577, + "num_input_tokens_seen": 7360784, + "step": 21865 + }, + { + "epoch": 16.901081916537866, + "grad_norm": 0.850002646446228, + "learning_rate": 3.570122837423348e-06, + "loss": 0.4539, + "num_input_tokens_seen": 7362256, + "step": 21870 + }, + { + "epoch": 16.904945904173108, + "grad_norm": 1.397514820098877, + "learning_rate": 3.5614450383549157e-06, + "loss": 0.3921, + "num_input_tokens_seen": 7363760, + "step": 21875 + }, + { + "epoch": 16.908809891808346, + "grad_norm": 0.8131007552146912, + "learning_rate": 3.5527769896696706e-06, + "loss": 0.3843, + "num_input_tokens_seen": 7365648, + "step": 21880 + }, + { + "epoch": 16.912673879443584, + "grad_norm": 0.8388241529464722, + "learning_rate": 3.5441186953098894e-06, + "loss": 0.3339, + "num_input_tokens_seen": 7367088, + "step": 21885 + }, + { + "epoch": 16.916537867078826, + "grad_norm": 1.1901265382766724, + "learning_rate": 3.535470159213425e-06, + "loss": 0.3607, + "num_input_tokens_seen": 7368880, + "step": 21890 + }, + { + "epoch": 16.920401854714065, + "grad_norm": 1.4353764057159424, + "learning_rate": 3.5268313853136754e-06, + "loss": 0.5696, + "num_input_tokens_seen": 7370384, + "step": 21895 + }, + { + "epoch": 16.924265842349303, + "grad_norm": 1.256813406944275, + "learning_rate": 3.5182023775396062e-06, + "loss": 0.5408, + "num_input_tokens_seen": 7372528, + "step": 21900 + }, + { + "epoch": 16.928129829984545, + "grad_norm": 0.9378314018249512, + "learning_rate": 3.50958313981575e-06, + "loss": 0.3916, + "num_input_tokens_seen": 7374064, + "step": 21905 + }, + { + "epoch": 16.931993817619784, + "grad_norm": 0.8498517274856567, + "learning_rate": 3.500973676062183e-06, + "loss": 0.4072, + "num_input_tokens_seen": 7375952, + "step": 21910 + }, + { + "epoch": 16.935857805255022, + "grad_norm": 0.8149335980415344, + "learning_rate": 3.492373990194542e-06, + "loss": 0.3863, + "num_input_tokens_seen": 7377488, + "step": 21915 + }, + { + "epoch": 16.939721792890264, + "grad_norm": 0.6875500679016113, + "learning_rate": 3.483784086124009e-06, + "loss": 0.5382, + "num_input_tokens_seen": 7379312, + "step": 21920 + }, + { + "epoch": 16.943585780525503, + "grad_norm": 0.8451194167137146, + "learning_rate": 3.4752039677573316e-06, + "loss": 0.3092, + "num_input_tokens_seen": 7380848, + "step": 21925 + }, + { + "epoch": 16.94744976816074, + "grad_norm": 0.6990697979927063, + "learning_rate": 3.4666336389967996e-06, + "loss": 0.4254, + "num_input_tokens_seen": 7382608, + "step": 21930 + }, + { + "epoch": 16.951313755795983, + "grad_norm": 1.1683424711227417, + "learning_rate": 3.458073103740245e-06, + "loss": 0.4655, + "num_input_tokens_seen": 7384336, + "step": 21935 + }, + { + "epoch": 16.95517774343122, + "grad_norm": 1.664170265197754, + "learning_rate": 3.44952236588105e-06, + "loss": 0.4614, + "num_input_tokens_seen": 7385872, + "step": 21940 + }, + { + "epoch": 16.95904173106646, + "grad_norm": 1.5069433450698853, + "learning_rate": 3.440981429308146e-06, + "loss": 0.4774, + "num_input_tokens_seen": 7387600, + "step": 21945 + }, + { + "epoch": 16.962905718701702, + "grad_norm": 1.2796605825424194, + "learning_rate": 3.4324502979060006e-06, + "loss": 0.3611, + "num_input_tokens_seen": 7389232, + "step": 21950 + }, + { + "epoch": 16.96676970633694, + "grad_norm": 1.2773208618164062, + "learning_rate": 3.423928975554616e-06, + "loss": 0.431, + "num_input_tokens_seen": 7391056, + "step": 21955 + }, + { + "epoch": 16.97063369397218, + "grad_norm": 0.8339182734489441, + "learning_rate": 3.415417466129556e-06, + "loss": 0.3782, + "num_input_tokens_seen": 7392816, + "step": 21960 + }, + { + "epoch": 16.974497681607417, + "grad_norm": 0.8961596488952637, + "learning_rate": 3.4069157735018953e-06, + "loss": 0.5793, + "num_input_tokens_seen": 7394480, + "step": 21965 + }, + { + "epoch": 16.97836166924266, + "grad_norm": 0.8305552005767822, + "learning_rate": 3.3984239015382557e-06, + "loss": 0.3443, + "num_input_tokens_seen": 7396080, + "step": 21970 + }, + { + "epoch": 16.982225656877898, + "grad_norm": 0.6872624158859253, + "learning_rate": 3.3899418541007947e-06, + "loss": 0.3426, + "num_input_tokens_seen": 7397680, + "step": 21975 + }, + { + "epoch": 16.986089644513136, + "grad_norm": 1.2816295623779297, + "learning_rate": 3.381469635047191e-06, + "loss": 0.5527, + "num_input_tokens_seen": 7399344, + "step": 21980 + }, + { + "epoch": 16.989953632148378, + "grad_norm": 0.956911563873291, + "learning_rate": 3.3730072482306697e-06, + "loss": 0.3812, + "num_input_tokens_seen": 7401008, + "step": 21985 + }, + { + "epoch": 16.993817619783616, + "grad_norm": 2.3640620708465576, + "learning_rate": 3.3645546974999636e-06, + "loss": 0.4692, + "num_input_tokens_seen": 7402576, + "step": 21990 + }, + { + "epoch": 16.997681607418855, + "grad_norm": 1.9483131170272827, + "learning_rate": 3.356111986699359e-06, + "loss": 0.4025, + "num_input_tokens_seen": 7404144, + "step": 21995 + }, + { + "epoch": 17.0, + "eval_loss": 0.4446313679218292, + "eval_runtime": 6.2565, + "eval_samples_per_second": 91.904, + "eval_steps_per_second": 23.016, + "num_input_tokens_seen": 7405056, + "step": 21998 + }, + { + "epoch": 17.001545595054097, + "grad_norm": 1.9948616027832031, + "learning_rate": 3.3476791196686426e-06, + "loss": 0.4551, + "num_input_tokens_seen": 7405952, + "step": 22000 + }, + { + "epoch": 17.005409582689335, + "grad_norm": 1.792299747467041, + "learning_rate": 3.3392561002431323e-06, + "loss": 0.6065, + "num_input_tokens_seen": 7407776, + "step": 22005 + }, + { + "epoch": 17.009273570324574, + "grad_norm": 0.723055362701416, + "learning_rate": 3.3308429322536692e-06, + "loss": 0.3917, + "num_input_tokens_seen": 7409344, + "step": 22010 + }, + { + "epoch": 17.013137557959816, + "grad_norm": 1.1659749746322632, + "learning_rate": 3.3224396195266127e-06, + "loss": 0.3789, + "num_input_tokens_seen": 7411104, + "step": 22015 + }, + { + "epoch": 17.017001545595054, + "grad_norm": 0.6490089893341064, + "learning_rate": 3.31404616588383e-06, + "loss": 0.3685, + "num_input_tokens_seen": 7412832, + "step": 22020 + }, + { + "epoch": 17.020865533230292, + "grad_norm": 0.8831002116203308, + "learning_rate": 3.3056625751427317e-06, + "loss": 0.4673, + "num_input_tokens_seen": 7414656, + "step": 22025 + }, + { + "epoch": 17.024729520865534, + "grad_norm": 1.2506321668624878, + "learning_rate": 3.297288851116212e-06, + "loss": 0.4093, + "num_input_tokens_seen": 7416384, + "step": 22030 + }, + { + "epoch": 17.028593508500773, + "grad_norm": 1.8936840295791626, + "learning_rate": 3.2889249976126995e-06, + "loss": 0.4337, + "num_input_tokens_seen": 7417824, + "step": 22035 + }, + { + "epoch": 17.03245749613601, + "grad_norm": 0.7695091962814331, + "learning_rate": 3.280571018436121e-06, + "loss": 0.3305, + "num_input_tokens_seen": 7419488, + "step": 22040 + }, + { + "epoch": 17.036321483771253, + "grad_norm": 0.5791177153587341, + "learning_rate": 3.272226917385915e-06, + "loss": 0.5728, + "num_input_tokens_seen": 7421152, + "step": 22045 + }, + { + "epoch": 17.04018547140649, + "grad_norm": 0.665290117263794, + "learning_rate": 3.263892698257029e-06, + "loss": 0.3149, + "num_input_tokens_seen": 7422880, + "step": 22050 + }, + { + "epoch": 17.04404945904173, + "grad_norm": 0.902586817741394, + "learning_rate": 3.2555683648399118e-06, + "loss": 0.4749, + "num_input_tokens_seen": 7424800, + "step": 22055 + }, + { + "epoch": 17.047913446676972, + "grad_norm": 0.8566022515296936, + "learning_rate": 3.2472539209205316e-06, + "loss": 0.4519, + "num_input_tokens_seen": 7426688, + "step": 22060 + }, + { + "epoch": 17.05177743431221, + "grad_norm": 1.0793064832687378, + "learning_rate": 3.238949370280331e-06, + "loss": 0.5104, + "num_input_tokens_seen": 7428416, + "step": 22065 + }, + { + "epoch": 17.05564142194745, + "grad_norm": 0.8128682374954224, + "learning_rate": 3.230654716696288e-06, + "loss": 0.6547, + "num_input_tokens_seen": 7430400, + "step": 22070 + }, + { + "epoch": 17.05950540958269, + "grad_norm": 0.9095790386199951, + "learning_rate": 3.2223699639408493e-06, + "loss": 0.4803, + "num_input_tokens_seen": 7432512, + "step": 22075 + }, + { + "epoch": 17.06336939721793, + "grad_norm": 0.8616628050804138, + "learning_rate": 3.2140951157819703e-06, + "loss": 0.3583, + "num_input_tokens_seen": 7434208, + "step": 22080 + }, + { + "epoch": 17.067233384853168, + "grad_norm": 1.448685884475708, + "learning_rate": 3.2058301759831073e-06, + "loss": 0.5266, + "num_input_tokens_seen": 7436064, + "step": 22085 + }, + { + "epoch": 17.071097372488406, + "grad_norm": 1.8891137838363647, + "learning_rate": 3.197575148303192e-06, + "loss": 0.5971, + "num_input_tokens_seen": 7437824, + "step": 22090 + }, + { + "epoch": 17.07496136012365, + "grad_norm": 1.0005844831466675, + "learning_rate": 3.1893300364966766e-06, + "loss": 0.3834, + "num_input_tokens_seen": 7439744, + "step": 22095 + }, + { + "epoch": 17.078825347758887, + "grad_norm": 0.8007699251174927, + "learning_rate": 3.181094844313473e-06, + "loss": 0.3655, + "num_input_tokens_seen": 7441664, + "step": 22100 + }, + { + "epoch": 17.082689335394125, + "grad_norm": 0.7616332173347473, + "learning_rate": 3.1728695754990074e-06, + "loss": 0.3707, + "num_input_tokens_seen": 7443392, + "step": 22105 + }, + { + "epoch": 17.086553323029367, + "grad_norm": 0.6500002145767212, + "learning_rate": 3.164654233794176e-06, + "loss": 0.4475, + "num_input_tokens_seen": 7445024, + "step": 22110 + }, + { + "epoch": 17.090417310664606, + "grad_norm": 2.183126926422119, + "learning_rate": 3.1564488229353677e-06, + "loss": 0.7693, + "num_input_tokens_seen": 7446656, + "step": 22115 + }, + { + "epoch": 17.094281298299844, + "grad_norm": 0.9525463581085205, + "learning_rate": 3.1482533466544477e-06, + "loss": 0.4568, + "num_input_tokens_seen": 7448448, + "step": 22120 + }, + { + "epoch": 17.098145285935086, + "grad_norm": 1.0888959169387817, + "learning_rate": 3.140067808678773e-06, + "loss": 0.5003, + "num_input_tokens_seen": 7449952, + "step": 22125 + }, + { + "epoch": 17.102009273570324, + "grad_norm": 0.5360842347145081, + "learning_rate": 3.131892212731169e-06, + "loss": 0.3663, + "num_input_tokens_seen": 7451456, + "step": 22130 + }, + { + "epoch": 17.105873261205563, + "grad_norm": 1.113724946975708, + "learning_rate": 3.1237265625299524e-06, + "loss": 0.5194, + "num_input_tokens_seen": 7453280, + "step": 22135 + }, + { + "epoch": 17.109737248840805, + "grad_norm": 0.9033219218254089, + "learning_rate": 3.1155708617889023e-06, + "loss": 0.4269, + "num_input_tokens_seen": 7455008, + "step": 22140 + }, + { + "epoch": 17.113601236476043, + "grad_norm": 1.0086547136306763, + "learning_rate": 3.107425114217291e-06, + "loss": 0.649, + "num_input_tokens_seen": 7456704, + "step": 22145 + }, + { + "epoch": 17.11746522411128, + "grad_norm": 2.1252033710479736, + "learning_rate": 3.0992893235198466e-06, + "loss": 0.4307, + "num_input_tokens_seen": 7458272, + "step": 22150 + }, + { + "epoch": 17.121329211746524, + "grad_norm": 1.4525922536849976, + "learning_rate": 3.091163493396776e-06, + "loss": 0.4857, + "num_input_tokens_seen": 7460192, + "step": 22155 + }, + { + "epoch": 17.125193199381762, + "grad_norm": 0.8552473187446594, + "learning_rate": 3.0830476275437533e-06, + "loss": 0.4987, + "num_input_tokens_seen": 7461920, + "step": 22160 + }, + { + "epoch": 17.129057187017, + "grad_norm": 1.0825731754302979, + "learning_rate": 3.0749417296519228e-06, + "loss": 0.3794, + "num_input_tokens_seen": 7463616, + "step": 22165 + }, + { + "epoch": 17.132921174652243, + "grad_norm": 1.7922110557556152, + "learning_rate": 3.066845803407903e-06, + "loss": 0.494, + "num_input_tokens_seen": 7464992, + "step": 22170 + }, + { + "epoch": 17.13678516228748, + "grad_norm": 1.47572660446167, + "learning_rate": 3.0587598524937617e-06, + "loss": 0.4251, + "num_input_tokens_seen": 7466816, + "step": 22175 + }, + { + "epoch": 17.14064914992272, + "grad_norm": 0.9089978933334351, + "learning_rate": 3.050683880587038e-06, + "loss": 0.5746, + "num_input_tokens_seen": 7468704, + "step": 22180 + }, + { + "epoch": 17.14451313755796, + "grad_norm": 1.3337457180023193, + "learning_rate": 3.0426178913607383e-06, + "loss": 0.4491, + "num_input_tokens_seen": 7470304, + "step": 22185 + }, + { + "epoch": 17.1483771251932, + "grad_norm": 2.1071183681488037, + "learning_rate": 3.03456188848332e-06, + "loss": 0.5584, + "num_input_tokens_seen": 7471968, + "step": 22190 + }, + { + "epoch": 17.152241112828438, + "grad_norm": 1.0740458965301514, + "learning_rate": 3.026515875618702e-06, + "loss": 0.3332, + "num_input_tokens_seen": 7473440, + "step": 22195 + }, + { + "epoch": 17.15610510046368, + "grad_norm": 0.8611363172531128, + "learning_rate": 3.0184798564262513e-06, + "loss": 0.3941, + "num_input_tokens_seen": 7475104, + "step": 22200 + }, + { + "epoch": 17.15996908809892, + "grad_norm": 1.1323130130767822, + "learning_rate": 3.0104538345608085e-06, + "loss": 0.4307, + "num_input_tokens_seen": 7476800, + "step": 22205 + }, + { + "epoch": 17.163833075734157, + "grad_norm": 1.4218814373016357, + "learning_rate": 3.002437813672651e-06, + "loss": 0.4867, + "num_input_tokens_seen": 7478432, + "step": 22210 + }, + { + "epoch": 17.167697063369395, + "grad_norm": 0.6948588490486145, + "learning_rate": 2.9944317974075153e-06, + "loss": 0.3376, + "num_input_tokens_seen": 7479904, + "step": 22215 + }, + { + "epoch": 17.171561051004637, + "grad_norm": 1.5886380672454834, + "learning_rate": 2.9864357894065805e-06, + "loss": 0.397, + "num_input_tokens_seen": 7481728, + "step": 22220 + }, + { + "epoch": 17.175425038639876, + "grad_norm": 1.3705387115478516, + "learning_rate": 2.978449793306487e-06, + "loss": 0.3651, + "num_input_tokens_seen": 7483360, + "step": 22225 + }, + { + "epoch": 17.179289026275114, + "grad_norm": 0.7768755555152893, + "learning_rate": 2.9704738127393078e-06, + "loss": 0.4369, + "num_input_tokens_seen": 7484768, + "step": 22230 + }, + { + "epoch": 17.183153013910356, + "grad_norm": 0.901326596736908, + "learning_rate": 2.962507851332566e-06, + "loss": 0.3383, + "num_input_tokens_seen": 7486240, + "step": 22235 + }, + { + "epoch": 17.187017001545595, + "grad_norm": 0.9307828545570374, + "learning_rate": 2.954551912709233e-06, + "loss": 0.4126, + "num_input_tokens_seen": 7487872, + "step": 22240 + }, + { + "epoch": 17.190880989180833, + "grad_norm": 0.9677280187606812, + "learning_rate": 2.9466060004877174e-06, + "loss": 0.4488, + "num_input_tokens_seen": 7489376, + "step": 22245 + }, + { + "epoch": 17.194744976816075, + "grad_norm": 1.259951114654541, + "learning_rate": 2.938670118281864e-06, + "loss": 0.491, + "num_input_tokens_seen": 7490976, + "step": 22250 + }, + { + "epoch": 17.198608964451314, + "grad_norm": 0.921419620513916, + "learning_rate": 2.9307442697009606e-06, + "loss": 0.6038, + "num_input_tokens_seen": 7492704, + "step": 22255 + }, + { + "epoch": 17.202472952086552, + "grad_norm": 1.5192558765411377, + "learning_rate": 2.922828458349727e-06, + "loss": 0.4767, + "num_input_tokens_seen": 7494272, + "step": 22260 + }, + { + "epoch": 17.206336939721794, + "grad_norm": 1.077980637550354, + "learning_rate": 2.914922687828331e-06, + "loss": 0.4033, + "num_input_tokens_seen": 7496320, + "step": 22265 + }, + { + "epoch": 17.210200927357032, + "grad_norm": 0.8895778059959412, + "learning_rate": 2.9070269617323537e-06, + "loss": 0.3323, + "num_input_tokens_seen": 7498016, + "step": 22270 + }, + { + "epoch": 17.21406491499227, + "grad_norm": 1.473997712135315, + "learning_rate": 2.8991412836528285e-06, + "loss": 0.4339, + "num_input_tokens_seen": 7499488, + "step": 22275 + }, + { + "epoch": 17.217928902627513, + "grad_norm": 1.2562055587768555, + "learning_rate": 2.8912656571762036e-06, + "loss": 0.3727, + "num_input_tokens_seen": 7501056, + "step": 22280 + }, + { + "epoch": 17.22179289026275, + "grad_norm": 0.9808709621429443, + "learning_rate": 2.883400085884361e-06, + "loss": 0.423, + "num_input_tokens_seen": 7502912, + "step": 22285 + }, + { + "epoch": 17.22565687789799, + "grad_norm": 1.5322372913360596, + "learning_rate": 2.8755445733546134e-06, + "loss": 0.4876, + "num_input_tokens_seen": 7504960, + "step": 22290 + }, + { + "epoch": 17.22952086553323, + "grad_norm": 0.6630092859268188, + "learning_rate": 2.8676991231596894e-06, + "loss": 0.4507, + "num_input_tokens_seen": 7506656, + "step": 22295 + }, + { + "epoch": 17.23338485316847, + "grad_norm": 1.143005609512329, + "learning_rate": 2.859863738867746e-06, + "loss": 0.3436, + "num_input_tokens_seen": 7508224, + "step": 22300 + }, + { + "epoch": 17.23724884080371, + "grad_norm": 1.0305103063583374, + "learning_rate": 2.8520384240423665e-06, + "loss": 0.3866, + "num_input_tokens_seen": 7509856, + "step": 22305 + }, + { + "epoch": 17.24111282843895, + "grad_norm": 0.9697922468185425, + "learning_rate": 2.8442231822425532e-06, + "loss": 0.3883, + "num_input_tokens_seen": 7511616, + "step": 22310 + }, + { + "epoch": 17.24497681607419, + "grad_norm": 0.90516197681427, + "learning_rate": 2.836418017022724e-06, + "loss": 0.3755, + "num_input_tokens_seen": 7513472, + "step": 22315 + }, + { + "epoch": 17.248840803709427, + "grad_norm": 0.8560720086097717, + "learning_rate": 2.8286229319327147e-06, + "loss": 0.4315, + "num_input_tokens_seen": 7515296, + "step": 22320 + }, + { + "epoch": 17.25270479134467, + "grad_norm": 1.0126131772994995, + "learning_rate": 2.8208379305177725e-06, + "loss": 0.4438, + "num_input_tokens_seen": 7517056, + "step": 22325 + }, + { + "epoch": 17.256568778979908, + "grad_norm": 0.709793746471405, + "learning_rate": 2.813063016318565e-06, + "loss": 0.4117, + "num_input_tokens_seen": 7518912, + "step": 22330 + }, + { + "epoch": 17.260432766615146, + "grad_norm": 0.8325903415679932, + "learning_rate": 2.805298192871167e-06, + "loss": 0.5222, + "num_input_tokens_seen": 7520576, + "step": 22335 + }, + { + "epoch": 17.264296754250385, + "grad_norm": 1.0759330987930298, + "learning_rate": 2.7975434637070698e-06, + "loss": 0.3067, + "num_input_tokens_seen": 7522208, + "step": 22340 + }, + { + "epoch": 17.268160741885627, + "grad_norm": 0.8292084336280823, + "learning_rate": 2.789798832353174e-06, + "loss": 0.3747, + "num_input_tokens_seen": 7523680, + "step": 22345 + }, + { + "epoch": 17.272024729520865, + "grad_norm": 0.7178846001625061, + "learning_rate": 2.7820643023317827e-06, + "loss": 0.3052, + "num_input_tokens_seen": 7525440, + "step": 22350 + }, + { + "epoch": 17.275888717156104, + "grad_norm": 0.9662672281265259, + "learning_rate": 2.7743398771606034e-06, + "loss": 0.3894, + "num_input_tokens_seen": 7527200, + "step": 22355 + }, + { + "epoch": 17.279752704791346, + "grad_norm": 0.9295123815536499, + "learning_rate": 2.7666255603527535e-06, + "loss": 0.5471, + "num_input_tokens_seen": 7529184, + "step": 22360 + }, + { + "epoch": 17.283616692426584, + "grad_norm": 1.0103076696395874, + "learning_rate": 2.7589213554167466e-06, + "loss": 0.4891, + "num_input_tokens_seen": 7530720, + "step": 22365 + }, + { + "epoch": 17.287480680061822, + "grad_norm": 1.211356520652771, + "learning_rate": 2.7512272658565012e-06, + "loss": 0.548, + "num_input_tokens_seen": 7532512, + "step": 22370 + }, + { + "epoch": 17.291344667697064, + "grad_norm": 0.976014256477356, + "learning_rate": 2.7435432951713443e-06, + "loss": 0.5406, + "num_input_tokens_seen": 7534048, + "step": 22375 + }, + { + "epoch": 17.295208655332303, + "grad_norm": 1.0391805171966553, + "learning_rate": 2.7358694468559766e-06, + "loss": 0.4, + "num_input_tokens_seen": 7535840, + "step": 22380 + }, + { + "epoch": 17.29907264296754, + "grad_norm": 0.6628127098083496, + "learning_rate": 2.728205724400526e-06, + "loss": 0.3963, + "num_input_tokens_seen": 7537472, + "step": 22385 + }, + { + "epoch": 17.302936630602783, + "grad_norm": 1.913769245147705, + "learning_rate": 2.7205521312904937e-06, + "loss": 0.6381, + "num_input_tokens_seen": 7539232, + "step": 22390 + }, + { + "epoch": 17.30680061823802, + "grad_norm": 0.7442816495895386, + "learning_rate": 2.712908671006775e-06, + "loss": 0.3153, + "num_input_tokens_seen": 7540928, + "step": 22395 + }, + { + "epoch": 17.31066460587326, + "grad_norm": 0.7615408897399902, + "learning_rate": 2.7052753470256683e-06, + "loss": 0.4004, + "num_input_tokens_seen": 7542656, + "step": 22400 + }, + { + "epoch": 17.314528593508502, + "grad_norm": 0.8900472521781921, + "learning_rate": 2.697652162818845e-06, + "loss": 0.3659, + "num_input_tokens_seen": 7544384, + "step": 22405 + }, + { + "epoch": 17.31839258114374, + "grad_norm": 0.9182357788085938, + "learning_rate": 2.6900391218533882e-06, + "loss": 0.3887, + "num_input_tokens_seen": 7545920, + "step": 22410 + }, + { + "epoch": 17.32225656877898, + "grad_norm": 0.712069034576416, + "learning_rate": 2.6824362275917475e-06, + "loss": 0.3726, + "num_input_tokens_seen": 7547360, + "step": 22415 + }, + { + "epoch": 17.32612055641422, + "grad_norm": 0.9142125248908997, + "learning_rate": 2.6748434834917595e-06, + "loss": 0.3791, + "num_input_tokens_seen": 7549248, + "step": 22420 + }, + { + "epoch": 17.32998454404946, + "grad_norm": 0.7837327122688293, + "learning_rate": 2.6672608930066596e-06, + "loss": 0.5678, + "num_input_tokens_seen": 7550688, + "step": 22425 + }, + { + "epoch": 17.333848531684698, + "grad_norm": 0.7001990079879761, + "learning_rate": 2.6596884595850523e-06, + "loss": 0.4049, + "num_input_tokens_seen": 7552320, + "step": 22430 + }, + { + "epoch": 17.33771251931994, + "grad_norm": 1.1479661464691162, + "learning_rate": 2.6521261866709224e-06, + "loss": 0.4399, + "num_input_tokens_seen": 7553984, + "step": 22435 + }, + { + "epoch": 17.341576506955178, + "grad_norm": 0.9920941591262817, + "learning_rate": 2.644574077703635e-06, + "loss": 0.3274, + "num_input_tokens_seen": 7555776, + "step": 22440 + }, + { + "epoch": 17.345440494590417, + "grad_norm": 0.7455354928970337, + "learning_rate": 2.637032136117945e-06, + "loss": 0.4236, + "num_input_tokens_seen": 7557536, + "step": 22445 + }, + { + "epoch": 17.34930448222566, + "grad_norm": 0.8461039066314697, + "learning_rate": 2.6295003653439648e-06, + "loss": 0.3927, + "num_input_tokens_seen": 7559136, + "step": 22450 + }, + { + "epoch": 17.353168469860897, + "grad_norm": 0.9485459923744202, + "learning_rate": 2.62197876880719e-06, + "loss": 0.5099, + "num_input_tokens_seen": 7560928, + "step": 22455 + }, + { + "epoch": 17.357032457496135, + "grad_norm": 0.890741229057312, + "learning_rate": 2.6144673499284842e-06, + "loss": 0.526, + "num_input_tokens_seen": 7562688, + "step": 22460 + }, + { + "epoch": 17.360896445131374, + "grad_norm": 0.9337940216064453, + "learning_rate": 2.606966112124093e-06, + "loss": 0.4137, + "num_input_tokens_seen": 7564480, + "step": 22465 + }, + { + "epoch": 17.364760432766616, + "grad_norm": 0.7147164344787598, + "learning_rate": 2.59947505880562e-06, + "loss": 0.4317, + "num_input_tokens_seen": 7565984, + "step": 22470 + }, + { + "epoch": 17.368624420401854, + "grad_norm": 1.0156794786453247, + "learning_rate": 2.5919941933800373e-06, + "loss": 0.3944, + "num_input_tokens_seen": 7567424, + "step": 22475 + }, + { + "epoch": 17.372488408037093, + "grad_norm": 1.0187954902648926, + "learning_rate": 2.5845235192496984e-06, + "loss": 0.6053, + "num_input_tokens_seen": 7569344, + "step": 22480 + }, + { + "epoch": 17.376352395672335, + "grad_norm": 1.413957953453064, + "learning_rate": 2.5770630398123026e-06, + "loss": 0.4123, + "num_input_tokens_seen": 7571008, + "step": 22485 + }, + { + "epoch": 17.380216383307573, + "grad_norm": 0.8767400979995728, + "learning_rate": 2.569612758460921e-06, + "loss": 0.3299, + "num_input_tokens_seen": 7572448, + "step": 22490 + }, + { + "epoch": 17.38408037094281, + "grad_norm": 1.0291730165481567, + "learning_rate": 2.5621726785839877e-06, + "loss": 0.4557, + "num_input_tokens_seen": 7574368, + "step": 22495 + }, + { + "epoch": 17.387944358578054, + "grad_norm": 1.1408222913742065, + "learning_rate": 2.554742803565291e-06, + "loss": 0.4441, + "num_input_tokens_seen": 7575904, + "step": 22500 + }, + { + "epoch": 17.391808346213292, + "grad_norm": 0.7750499844551086, + "learning_rate": 2.547323136783991e-06, + "loss": 0.4595, + "num_input_tokens_seen": 7577472, + "step": 22505 + }, + { + "epoch": 17.39567233384853, + "grad_norm": 2.017164945602417, + "learning_rate": 2.539913681614589e-06, + "loss": 0.5259, + "num_input_tokens_seen": 7579104, + "step": 22510 + }, + { + "epoch": 17.399536321483772, + "grad_norm": 0.9570824503898621, + "learning_rate": 2.532514441426956e-06, + "loss": 0.407, + "num_input_tokens_seen": 7580640, + "step": 22515 + }, + { + "epoch": 17.40340030911901, + "grad_norm": 0.8711598515510559, + "learning_rate": 2.525125419586308e-06, + "loss": 0.4199, + "num_input_tokens_seen": 7582368, + "step": 22520 + }, + { + "epoch": 17.40726429675425, + "grad_norm": 1.1918078660964966, + "learning_rate": 2.517746619453215e-06, + "loss": 0.4287, + "num_input_tokens_seen": 7584096, + "step": 22525 + }, + { + "epoch": 17.41112828438949, + "grad_norm": 1.2134252786636353, + "learning_rate": 2.510378044383602e-06, + "loss": 0.393, + "num_input_tokens_seen": 7585440, + "step": 22530 + }, + { + "epoch": 17.41499227202473, + "grad_norm": 0.8260073065757751, + "learning_rate": 2.503019697728737e-06, + "loss": 0.3767, + "num_input_tokens_seen": 7587104, + "step": 22535 + }, + { + "epoch": 17.418856259659968, + "grad_norm": 0.8323020339012146, + "learning_rate": 2.4956715828352377e-06, + "loss": 0.4003, + "num_input_tokens_seen": 7588800, + "step": 22540 + }, + { + "epoch": 17.42272024729521, + "grad_norm": 1.280670166015625, + "learning_rate": 2.4883337030450786e-06, + "loss": 0.4234, + "num_input_tokens_seen": 7590592, + "step": 22545 + }, + { + "epoch": 17.42658423493045, + "grad_norm": 1.0937315225601196, + "learning_rate": 2.4810060616955707e-06, + "loss": 0.3431, + "num_input_tokens_seen": 7592576, + "step": 22550 + }, + { + "epoch": 17.430448222565687, + "grad_norm": 1.5807888507843018, + "learning_rate": 2.4736886621193693e-06, + "loss": 0.4058, + "num_input_tokens_seen": 7594112, + "step": 22555 + }, + { + "epoch": 17.43431221020093, + "grad_norm": 1.0005507469177246, + "learning_rate": 2.466381507644469e-06, + "loss": 0.3525, + "num_input_tokens_seen": 7595616, + "step": 22560 + }, + { + "epoch": 17.438176197836167, + "grad_norm": 0.9799661636352539, + "learning_rate": 2.4590846015942053e-06, + "loss": 0.401, + "num_input_tokens_seen": 7597440, + "step": 22565 + }, + { + "epoch": 17.442040185471406, + "grad_norm": 0.9501713514328003, + "learning_rate": 2.451797947287257e-06, + "loss": 0.4991, + "num_input_tokens_seen": 7599360, + "step": 22570 + }, + { + "epoch": 17.445904173106648, + "grad_norm": 1.3514901399612427, + "learning_rate": 2.444521548037637e-06, + "loss": 0.3425, + "num_input_tokens_seen": 7601344, + "step": 22575 + }, + { + "epoch": 17.449768160741886, + "grad_norm": 0.813324511051178, + "learning_rate": 2.437255407154693e-06, + "loss": 0.4541, + "num_input_tokens_seen": 7603200, + "step": 22580 + }, + { + "epoch": 17.453632148377125, + "grad_norm": 0.7942838072776794, + "learning_rate": 2.429999527943119e-06, + "loss": 0.5529, + "num_input_tokens_seen": 7604704, + "step": 22585 + }, + { + "epoch": 17.457496136012363, + "grad_norm": 0.6962345242500305, + "learning_rate": 2.422753913702924e-06, + "loss": 0.5098, + "num_input_tokens_seen": 7606432, + "step": 22590 + }, + { + "epoch": 17.461360123647605, + "grad_norm": 1.039262056350708, + "learning_rate": 2.4155185677294607e-06, + "loss": 0.5846, + "num_input_tokens_seen": 7607936, + "step": 22595 + }, + { + "epoch": 17.465224111282843, + "grad_norm": 1.7748233079910278, + "learning_rate": 2.408293493313407e-06, + "loss": 0.9494, + "num_input_tokens_seen": 7610144, + "step": 22600 + }, + { + "epoch": 17.469088098918082, + "grad_norm": 1.4095641374588013, + "learning_rate": 2.4010786937407687e-06, + "loss": 0.4452, + "num_input_tokens_seen": 7611584, + "step": 22605 + }, + { + "epoch": 17.472952086553324, + "grad_norm": 0.9302821755409241, + "learning_rate": 2.393874172292873e-06, + "loss": 0.381, + "num_input_tokens_seen": 7613312, + "step": 22610 + }, + { + "epoch": 17.476816074188562, + "grad_norm": 0.8315171003341675, + "learning_rate": 2.386679932246394e-06, + "loss": 0.5249, + "num_input_tokens_seen": 7615200, + "step": 22615 + }, + { + "epoch": 17.4806800618238, + "grad_norm": 0.8503612279891968, + "learning_rate": 2.3794959768733e-06, + "loss": 0.7055, + "num_input_tokens_seen": 7616960, + "step": 22620 + }, + { + "epoch": 17.484544049459043, + "grad_norm": 1.6682122945785522, + "learning_rate": 2.3723223094409108e-06, + "loss": 0.6571, + "num_input_tokens_seen": 7618528, + "step": 22625 + }, + { + "epoch": 17.48840803709428, + "grad_norm": 2.1178295612335205, + "learning_rate": 2.3651589332118474e-06, + "loss": 0.4335, + "num_input_tokens_seen": 7620064, + "step": 22630 + }, + { + "epoch": 17.49227202472952, + "grad_norm": 0.9222102165222168, + "learning_rate": 2.358005851444056e-06, + "loss": 0.5576, + "num_input_tokens_seen": 7621792, + "step": 22635 + }, + { + "epoch": 17.49613601236476, + "grad_norm": 1.1246178150177002, + "learning_rate": 2.3508630673908017e-06, + "loss": 0.3952, + "num_input_tokens_seen": 7623328, + "step": 22640 + }, + { + "epoch": 17.5, + "grad_norm": 1.3120086193084717, + "learning_rate": 2.3437305843006604e-06, + "loss": 0.5851, + "num_input_tokens_seen": 7624960, + "step": 22645 + }, + { + "epoch": 17.50386398763524, + "grad_norm": 1.2270498275756836, + "learning_rate": 2.336608405417534e-06, + "loss": 0.5151, + "num_input_tokens_seen": 7626720, + "step": 22650 + }, + { + "epoch": 17.50772797527048, + "grad_norm": 0.7868618965148926, + "learning_rate": 2.3294965339806324e-06, + "loss": 0.3943, + "num_input_tokens_seen": 7628256, + "step": 22655 + }, + { + "epoch": 17.51159196290572, + "grad_norm": 1.0298553705215454, + "learning_rate": 2.3223949732244704e-06, + "loss": 0.3644, + "num_input_tokens_seen": 7629856, + "step": 22660 + }, + { + "epoch": 17.515455950540957, + "grad_norm": 0.8250898718833923, + "learning_rate": 2.3153037263788925e-06, + "loss": 0.5056, + "num_input_tokens_seen": 7631424, + "step": 22665 + }, + { + "epoch": 17.5193199381762, + "grad_norm": 1.15542471408844, + "learning_rate": 2.3082227966690297e-06, + "loss": 0.354, + "num_input_tokens_seen": 7633344, + "step": 22670 + }, + { + "epoch": 17.523183925811438, + "grad_norm": 1.1067044734954834, + "learning_rate": 2.3011521873153364e-06, + "loss": 0.347, + "num_input_tokens_seen": 7635168, + "step": 22675 + }, + { + "epoch": 17.527047913446676, + "grad_norm": 1.3777285814285278, + "learning_rate": 2.29409190153356e-06, + "loss": 0.4078, + "num_input_tokens_seen": 7637088, + "step": 22680 + }, + { + "epoch": 17.530911901081918, + "grad_norm": 1.1046420335769653, + "learning_rate": 2.287041942534773e-06, + "loss": 0.3768, + "num_input_tokens_seen": 7638976, + "step": 22685 + }, + { + "epoch": 17.534775888717157, + "grad_norm": 0.9666231870651245, + "learning_rate": 2.280002313525334e-06, + "loss": 0.4026, + "num_input_tokens_seen": 7640768, + "step": 22690 + }, + { + "epoch": 17.538639876352395, + "grad_norm": 0.7697038650512695, + "learning_rate": 2.2729730177069086e-06, + "loss": 0.4634, + "num_input_tokens_seen": 7642592, + "step": 22695 + }, + { + "epoch": 17.542503863987637, + "grad_norm": 0.9582953453063965, + "learning_rate": 2.2659540582764593e-06, + "loss": 0.3245, + "num_input_tokens_seen": 7644192, + "step": 22700 + }, + { + "epoch": 17.546367851622875, + "grad_norm": 0.8207132816314697, + "learning_rate": 2.2589454384262494e-06, + "loss": 0.3905, + "num_input_tokens_seen": 7645920, + "step": 22705 + }, + { + "epoch": 17.550231839258114, + "grad_norm": 2.130613327026367, + "learning_rate": 2.2519471613438482e-06, + "loss": 0.4221, + "num_input_tokens_seen": 7647520, + "step": 22710 + }, + { + "epoch": 17.554095826893352, + "grad_norm": 0.8935186266899109, + "learning_rate": 2.244959230212107e-06, + "loss": 0.3883, + "num_input_tokens_seen": 7649184, + "step": 22715 + }, + { + "epoch": 17.557959814528594, + "grad_norm": 0.9960846900939941, + "learning_rate": 2.2379816482091866e-06, + "loss": 0.5107, + "num_input_tokens_seen": 7650880, + "step": 22720 + }, + { + "epoch": 17.561823802163833, + "grad_norm": 0.9487952589988708, + "learning_rate": 2.2310144185085314e-06, + "loss": 0.4152, + "num_input_tokens_seen": 7652288, + "step": 22725 + }, + { + "epoch": 17.56568778979907, + "grad_norm": 1.2349590063095093, + "learning_rate": 2.2240575442788735e-06, + "loss": 0.4154, + "num_input_tokens_seen": 7653856, + "step": 22730 + }, + { + "epoch": 17.569551777434313, + "grad_norm": 0.6662828326225281, + "learning_rate": 2.217111028684246e-06, + "loss": 0.3331, + "num_input_tokens_seen": 7655904, + "step": 22735 + }, + { + "epoch": 17.57341576506955, + "grad_norm": 0.7730731964111328, + "learning_rate": 2.2101748748839633e-06, + "loss": 0.5462, + "num_input_tokens_seen": 7657568, + "step": 22740 + }, + { + "epoch": 17.57727975270479, + "grad_norm": 1.8750691413879395, + "learning_rate": 2.203249086032627e-06, + "loss": 0.4827, + "num_input_tokens_seen": 7659104, + "step": 22745 + }, + { + "epoch": 17.581143740340032, + "grad_norm": 1.440234661102295, + "learning_rate": 2.196333665280134e-06, + "loss": 0.5579, + "num_input_tokens_seen": 7660672, + "step": 22750 + }, + { + "epoch": 17.58500772797527, + "grad_norm": 1.1359909772872925, + "learning_rate": 2.189428615771652e-06, + "loss": 0.4344, + "num_input_tokens_seen": 7662272, + "step": 22755 + }, + { + "epoch": 17.58887171561051, + "grad_norm": 1.3199260234832764, + "learning_rate": 2.182533940647649e-06, + "loss": 0.4475, + "num_input_tokens_seen": 7664032, + "step": 22760 + }, + { + "epoch": 17.59273570324575, + "grad_norm": 0.9394795298576355, + "learning_rate": 2.175649643043856e-06, + "loss": 0.3512, + "num_input_tokens_seen": 7665568, + "step": 22765 + }, + { + "epoch": 17.59659969088099, + "grad_norm": 0.6670156121253967, + "learning_rate": 2.168775726091296e-06, + "loss": 0.4999, + "num_input_tokens_seen": 7667264, + "step": 22770 + }, + { + "epoch": 17.600463678516228, + "grad_norm": 1.0848815441131592, + "learning_rate": 2.1619121929162654e-06, + "loss": 0.4683, + "num_input_tokens_seen": 7669120, + "step": 22775 + }, + { + "epoch": 17.60432766615147, + "grad_norm": 1.0929890871047974, + "learning_rate": 2.155059046640337e-06, + "loss": 0.4337, + "num_input_tokens_seen": 7670560, + "step": 22780 + }, + { + "epoch": 17.608191653786708, + "grad_norm": 1.1082762479782104, + "learning_rate": 2.1482162903803725e-06, + "loss": 0.4853, + "num_input_tokens_seen": 7672160, + "step": 22785 + }, + { + "epoch": 17.612055641421946, + "grad_norm": 0.8665377497673035, + "learning_rate": 2.1413839272484887e-06, + "loss": 0.528, + "num_input_tokens_seen": 7673984, + "step": 22790 + }, + { + "epoch": 17.61591962905719, + "grad_norm": 1.0054361820220947, + "learning_rate": 2.1345619603520937e-06, + "loss": 0.3904, + "num_input_tokens_seen": 7675776, + "step": 22795 + }, + { + "epoch": 17.619783616692427, + "grad_norm": 1.1008789539337158, + "learning_rate": 2.1277503927938535e-06, + "loss": 0.383, + "num_input_tokens_seen": 7677376, + "step": 22800 + }, + { + "epoch": 17.623647604327665, + "grad_norm": 0.9189634919166565, + "learning_rate": 2.1209492276717148e-06, + "loss": 0.3979, + "num_input_tokens_seen": 7679072, + "step": 22805 + }, + { + "epoch": 17.627511591962907, + "grad_norm": 1.4805761575698853, + "learning_rate": 2.1141584680788805e-06, + "loss": 0.403, + "num_input_tokens_seen": 7680736, + "step": 22810 + }, + { + "epoch": 17.631375579598146, + "grad_norm": 1.2116371393203735, + "learning_rate": 2.107378117103831e-06, + "loss": 0.4182, + "num_input_tokens_seen": 7682272, + "step": 22815 + }, + { + "epoch": 17.635239567233384, + "grad_norm": 0.7231366038322449, + "learning_rate": 2.1006081778303157e-06, + "loss": 0.3261, + "num_input_tokens_seen": 7683840, + "step": 22820 + }, + { + "epoch": 17.639103554868626, + "grad_norm": 0.6776334643363953, + "learning_rate": 2.093848653337335e-06, + "loss": 0.3502, + "num_input_tokens_seen": 7685600, + "step": 22825 + }, + { + "epoch": 17.642967542503865, + "grad_norm": 1.2742061614990234, + "learning_rate": 2.087099546699173e-06, + "loss": 0.4588, + "num_input_tokens_seen": 7687360, + "step": 22830 + }, + { + "epoch": 17.646831530139103, + "grad_norm": 0.7711167931556702, + "learning_rate": 2.080360860985356e-06, + "loss": 0.3732, + "num_input_tokens_seen": 7689056, + "step": 22835 + }, + { + "epoch": 17.65069551777434, + "grad_norm": 0.8957486152648926, + "learning_rate": 2.0736325992606804e-06, + "loss": 0.3153, + "num_input_tokens_seen": 7690816, + "step": 22840 + }, + { + "epoch": 17.654559505409583, + "grad_norm": 1.0427614450454712, + "learning_rate": 2.0669147645851984e-06, + "loss": 0.5386, + "num_input_tokens_seen": 7692608, + "step": 22845 + }, + { + "epoch": 17.658423493044822, + "grad_norm": 1.8085880279541016, + "learning_rate": 2.060207360014224e-06, + "loss": 0.781, + "num_input_tokens_seen": 7694656, + "step": 22850 + }, + { + "epoch": 17.66228748068006, + "grad_norm": 1.1713560819625854, + "learning_rate": 2.0535103885983177e-06, + "loss": 0.4513, + "num_input_tokens_seen": 7696416, + "step": 22855 + }, + { + "epoch": 17.666151468315302, + "grad_norm": 2.121859550476074, + "learning_rate": 2.0468238533833117e-06, + "loss": 0.5832, + "num_input_tokens_seen": 7698208, + "step": 22860 + }, + { + "epoch": 17.67001545595054, + "grad_norm": 0.8760755658149719, + "learning_rate": 2.040147757410274e-06, + "loss": 0.4726, + "num_input_tokens_seen": 7699808, + "step": 22865 + }, + { + "epoch": 17.67387944358578, + "grad_norm": 1.5158188343048096, + "learning_rate": 2.033482103715542e-06, + "loss": 0.4164, + "num_input_tokens_seen": 7701344, + "step": 22870 + }, + { + "epoch": 17.67774343122102, + "grad_norm": 1.010646939277649, + "learning_rate": 2.026826895330691e-06, + "loss": 0.3541, + "num_input_tokens_seen": 7702976, + "step": 22875 + }, + { + "epoch": 17.68160741885626, + "grad_norm": 0.7085983157157898, + "learning_rate": 2.020182135282547e-06, + "loss": 0.4285, + "num_input_tokens_seen": 7704672, + "step": 22880 + }, + { + "epoch": 17.685471406491498, + "grad_norm": 0.9403572082519531, + "learning_rate": 2.0135478265931902e-06, + "loss": 0.4379, + "num_input_tokens_seen": 7706304, + "step": 22885 + }, + { + "epoch": 17.68933539412674, + "grad_norm": 0.9688157439231873, + "learning_rate": 2.0069239722799392e-06, + "loss": 0.5092, + "num_input_tokens_seen": 7708096, + "step": 22890 + }, + { + "epoch": 17.69319938176198, + "grad_norm": 1.176788568496704, + "learning_rate": 2.0003105753553685e-06, + "loss": 0.5785, + "num_input_tokens_seen": 7709760, + "step": 22895 + }, + { + "epoch": 17.697063369397217, + "grad_norm": 0.7487061023712158, + "learning_rate": 1.9937076388272857e-06, + "loss": 0.3358, + "num_input_tokens_seen": 7711136, + "step": 22900 + }, + { + "epoch": 17.70092735703246, + "grad_norm": 1.141180396080017, + "learning_rate": 1.987115165698747e-06, + "loss": 0.5732, + "num_input_tokens_seen": 7713216, + "step": 22905 + }, + { + "epoch": 17.704791344667697, + "grad_norm": 1.0417168140411377, + "learning_rate": 1.9805331589680538e-06, + "loss": 0.6175, + "num_input_tokens_seen": 7715136, + "step": 22910 + }, + { + "epoch": 17.708655332302936, + "grad_norm": 0.6313415169715881, + "learning_rate": 1.9739616216287365e-06, + "loss": 0.7462, + "num_input_tokens_seen": 7716704, + "step": 22915 + }, + { + "epoch": 17.712519319938178, + "grad_norm": 1.8612898588180542, + "learning_rate": 1.9674005566695714e-06, + "loss": 0.6313, + "num_input_tokens_seen": 7718272, + "step": 22920 + }, + { + "epoch": 17.716383307573416, + "grad_norm": 1.1475234031677246, + "learning_rate": 1.9608499670745686e-06, + "loss": 0.5575, + "num_input_tokens_seen": 7720128, + "step": 22925 + }, + { + "epoch": 17.720247295208654, + "grad_norm": 1.4526785612106323, + "learning_rate": 1.9543098558229776e-06, + "loss": 0.5611, + "num_input_tokens_seen": 7721888, + "step": 22930 + }, + { + "epoch": 17.724111282843896, + "grad_norm": 0.8574931025505066, + "learning_rate": 1.9477802258892812e-06, + "loss": 0.3945, + "num_input_tokens_seen": 7723520, + "step": 22935 + }, + { + "epoch": 17.727975270479135, + "grad_norm": 0.9281511306762695, + "learning_rate": 1.9412610802431923e-06, + "loss": 0.4026, + "num_input_tokens_seen": 7725152, + "step": 22940 + }, + { + "epoch": 17.731839258114373, + "grad_norm": 1.2941579818725586, + "learning_rate": 1.9347524218496505e-06, + "loss": 0.4819, + "num_input_tokens_seen": 7726688, + "step": 22945 + }, + { + "epoch": 17.735703245749615, + "grad_norm": 1.0604618787765503, + "learning_rate": 1.928254253668846e-06, + "loss": 0.4864, + "num_input_tokens_seen": 7728512, + "step": 22950 + }, + { + "epoch": 17.739567233384854, + "grad_norm": 1.5218149423599243, + "learning_rate": 1.9217665786561783e-06, + "loss": 0.4467, + "num_input_tokens_seen": 7730400, + "step": 22955 + }, + { + "epoch": 17.743431221020092, + "grad_norm": 0.7504799962043762, + "learning_rate": 1.9152893997622766e-06, + "loss": 0.3802, + "num_input_tokens_seen": 7732192, + "step": 22960 + }, + { + "epoch": 17.74729520865533, + "grad_norm": 0.917056143283844, + "learning_rate": 1.9088227199330095e-06, + "loss": 0.7999, + "num_input_tokens_seen": 7733600, + "step": 22965 + }, + { + "epoch": 17.751159196290573, + "grad_norm": 1.1930760145187378, + "learning_rate": 1.9023665421094572e-06, + "loss": 0.5247, + "num_input_tokens_seen": 7734880, + "step": 22970 + }, + { + "epoch": 17.75502318392581, + "grad_norm": 1.1038923263549805, + "learning_rate": 1.8959208692279267e-06, + "loss": 0.3544, + "num_input_tokens_seen": 7736800, + "step": 22975 + }, + { + "epoch": 17.75888717156105, + "grad_norm": 0.7738339900970459, + "learning_rate": 1.889485704219951e-06, + "loss": 0.3696, + "num_input_tokens_seen": 7738720, + "step": 22980 + }, + { + "epoch": 17.76275115919629, + "grad_norm": 0.8728271722793579, + "learning_rate": 1.8830610500122748e-06, + "loss": 0.4434, + "num_input_tokens_seen": 7740288, + "step": 22985 + }, + { + "epoch": 17.76661514683153, + "grad_norm": 0.7554761171340942, + "learning_rate": 1.87664690952688e-06, + "loss": 0.3374, + "num_input_tokens_seen": 7742208, + "step": 22990 + }, + { + "epoch": 17.77047913446677, + "grad_norm": 1.050930142402649, + "learning_rate": 1.870243285680945e-06, + "loss": 0.3626, + "num_input_tokens_seen": 7743616, + "step": 22995 + }, + { + "epoch": 17.77434312210201, + "grad_norm": 1.915151834487915, + "learning_rate": 1.8638501813868892e-06, + "loss": 0.4532, + "num_input_tokens_seen": 7745216, + "step": 23000 + }, + { + "epoch": 17.77820710973725, + "grad_norm": 0.7026137709617615, + "learning_rate": 1.8574675995523261e-06, + "loss": 0.5702, + "num_input_tokens_seen": 7746880, + "step": 23005 + }, + { + "epoch": 17.782071097372487, + "grad_norm": 1.224769949913025, + "learning_rate": 1.8510955430800948e-06, + "loss": 0.6398, + "num_input_tokens_seen": 7748608, + "step": 23010 + }, + { + "epoch": 17.78593508500773, + "grad_norm": 1.8967565298080444, + "learning_rate": 1.8447340148682435e-06, + "loss": 0.6209, + "num_input_tokens_seen": 7750112, + "step": 23015 + }, + { + "epoch": 17.789799072642968, + "grad_norm": 0.7184792160987854, + "learning_rate": 1.8383830178100358e-06, + "loss": 0.4968, + "num_input_tokens_seen": 7751776, + "step": 23020 + }, + { + "epoch": 17.793663060278206, + "grad_norm": 0.8215920329093933, + "learning_rate": 1.8320425547939335e-06, + "loss": 0.4666, + "num_input_tokens_seen": 7753536, + "step": 23025 + }, + { + "epoch": 17.797527047913448, + "grad_norm": 0.8872660398483276, + "learning_rate": 1.8257126287036269e-06, + "loss": 0.3432, + "num_input_tokens_seen": 7755104, + "step": 23030 + }, + { + "epoch": 17.801391035548686, + "grad_norm": 0.7039785981178284, + "learning_rate": 1.819393242418005e-06, + "loss": 0.5807, + "num_input_tokens_seen": 7756640, + "step": 23035 + }, + { + "epoch": 17.805255023183925, + "grad_norm": 0.8161751627922058, + "learning_rate": 1.81308439881116e-06, + "loss": 0.4972, + "num_input_tokens_seen": 7758112, + "step": 23040 + }, + { + "epoch": 17.809119010819167, + "grad_norm": 0.9454885721206665, + "learning_rate": 1.8067861007523918e-06, + "loss": 0.6282, + "num_input_tokens_seen": 7759712, + "step": 23045 + }, + { + "epoch": 17.812982998454405, + "grad_norm": 1.0814603567123413, + "learning_rate": 1.8004983511062057e-06, + "loss": 0.5616, + "num_input_tokens_seen": 7761152, + "step": 23050 + }, + { + "epoch": 17.816846986089644, + "grad_norm": 1.8934857845306396, + "learning_rate": 1.7942211527323034e-06, + "loss": 0.4434, + "num_input_tokens_seen": 7763232, + "step": 23055 + }, + { + "epoch": 17.820710973724886, + "grad_norm": 0.8747102618217468, + "learning_rate": 1.7879545084855898e-06, + "loss": 0.4265, + "num_input_tokens_seen": 7764832, + "step": 23060 + }, + { + "epoch": 17.824574961360124, + "grad_norm": 0.8706793785095215, + "learning_rate": 1.7816984212161797e-06, + "loss": 0.3636, + "num_input_tokens_seen": 7766560, + "step": 23065 + }, + { + "epoch": 17.828438948995363, + "grad_norm": 0.9222778081893921, + "learning_rate": 1.7754528937693777e-06, + "loss": 0.4571, + "num_input_tokens_seen": 7768096, + "step": 23070 + }, + { + "epoch": 17.832302936630605, + "grad_norm": 1.1221449375152588, + "learning_rate": 1.7692179289856892e-06, + "loss": 0.5618, + "num_input_tokens_seen": 7769984, + "step": 23075 + }, + { + "epoch": 17.836166924265843, + "grad_norm": 1.391809105873108, + "learning_rate": 1.7629935297008071e-06, + "loss": 0.4239, + "num_input_tokens_seen": 7771520, + "step": 23080 + }, + { + "epoch": 17.84003091190108, + "grad_norm": 1.2929881811141968, + "learning_rate": 1.756779698745631e-06, + "loss": 0.4038, + "num_input_tokens_seen": 7773312, + "step": 23085 + }, + { + "epoch": 17.84389489953632, + "grad_norm": 1.0340296030044556, + "learning_rate": 1.750576438946247e-06, + "loss": 0.5011, + "num_input_tokens_seen": 7774944, + "step": 23090 + }, + { + "epoch": 17.847758887171562, + "grad_norm": 1.321460485458374, + "learning_rate": 1.7443837531239264e-06, + "loss": 0.7081, + "num_input_tokens_seen": 7776800, + "step": 23095 + }, + { + "epoch": 17.8516228748068, + "grad_norm": 0.8127958178520203, + "learning_rate": 1.7382016440951554e-06, + "loss": 0.4276, + "num_input_tokens_seen": 7778368, + "step": 23100 + }, + { + "epoch": 17.85548686244204, + "grad_norm": 0.6260814666748047, + "learning_rate": 1.732030114671579e-06, + "loss": 0.4253, + "num_input_tokens_seen": 7780032, + "step": 23105 + }, + { + "epoch": 17.85935085007728, + "grad_norm": 0.8214403986930847, + "learning_rate": 1.7258691676600575e-06, + "loss": 0.4486, + "num_input_tokens_seen": 7781696, + "step": 23110 + }, + { + "epoch": 17.86321483771252, + "grad_norm": 1.809912085533142, + "learning_rate": 1.7197188058626217e-06, + "loss": 0.5551, + "num_input_tokens_seen": 7783392, + "step": 23115 + }, + { + "epoch": 17.867078825347757, + "grad_norm": 1.4956732988357544, + "learning_rate": 1.7135790320764955e-06, + "loss": 0.657, + "num_input_tokens_seen": 7784992, + "step": 23120 + }, + { + "epoch": 17.870942812983, + "grad_norm": 1.1603388786315918, + "learning_rate": 1.707449849094081e-06, + "loss": 0.5251, + "num_input_tokens_seen": 7786464, + "step": 23125 + }, + { + "epoch": 17.874806800618238, + "grad_norm": 0.7976587414741516, + "learning_rate": 1.7013312597029623e-06, + "loss": 0.3917, + "num_input_tokens_seen": 7787936, + "step": 23130 + }, + { + "epoch": 17.878670788253476, + "grad_norm": 1.187113642692566, + "learning_rate": 1.6952232666859247e-06, + "loss": 0.3956, + "num_input_tokens_seen": 7789728, + "step": 23135 + }, + { + "epoch": 17.88253477588872, + "grad_norm": 1.1969292163848877, + "learning_rate": 1.6891258728209097e-06, + "loss": 0.4034, + "num_input_tokens_seen": 7791584, + "step": 23140 + }, + { + "epoch": 17.886398763523957, + "grad_norm": 1.4409983158111572, + "learning_rate": 1.6830390808810465e-06, + "loss": 0.4153, + "num_input_tokens_seen": 7793472, + "step": 23145 + }, + { + "epoch": 17.890262751159195, + "grad_norm": 1.0594583749771118, + "learning_rate": 1.6769628936346566e-06, + "loss": 0.4487, + "num_input_tokens_seen": 7795200, + "step": 23150 + }, + { + "epoch": 17.894126738794437, + "grad_norm": 1.4681893587112427, + "learning_rate": 1.6708973138452155e-06, + "loss": 0.4164, + "num_input_tokens_seen": 7796896, + "step": 23155 + }, + { + "epoch": 17.897990726429676, + "grad_norm": 0.6983473896980286, + "learning_rate": 1.664842344271389e-06, + "loss": 0.5698, + "num_input_tokens_seen": 7798432, + "step": 23160 + }, + { + "epoch": 17.901854714064914, + "grad_norm": 0.7906505465507507, + "learning_rate": 1.6587979876670102e-06, + "loss": 0.4154, + "num_input_tokens_seen": 7800128, + "step": 23165 + }, + { + "epoch": 17.905718701700156, + "grad_norm": 0.5562921762466431, + "learning_rate": 1.6527642467810967e-06, + "loss": 0.574, + "num_input_tokens_seen": 7801728, + "step": 23170 + }, + { + "epoch": 17.909582689335394, + "grad_norm": 2.341447114944458, + "learning_rate": 1.6467411243578229e-06, + "loss": 0.4614, + "num_input_tokens_seen": 7803328, + "step": 23175 + }, + { + "epoch": 17.913446676970633, + "grad_norm": 1.2992461919784546, + "learning_rate": 1.6407286231365449e-06, + "loss": 0.5054, + "num_input_tokens_seen": 7805024, + "step": 23180 + }, + { + "epoch": 17.917310664605875, + "grad_norm": 0.903904139995575, + "learning_rate": 1.6347267458517752e-06, + "loss": 0.3995, + "num_input_tokens_seen": 7806688, + "step": 23185 + }, + { + "epoch": 17.921174652241113, + "grad_norm": 0.9579862952232361, + "learning_rate": 1.6287354952332162e-06, + "loss": 0.385, + "num_input_tokens_seen": 7808576, + "step": 23190 + }, + { + "epoch": 17.92503863987635, + "grad_norm": 0.8048845529556274, + "learning_rate": 1.6227548740057191e-06, + "loss": 0.4107, + "num_input_tokens_seen": 7810144, + "step": 23195 + }, + { + "epoch": 17.92890262751159, + "grad_norm": 1.048541784286499, + "learning_rate": 1.6167848848893024e-06, + "loss": 0.4247, + "num_input_tokens_seen": 7811712, + "step": 23200 + }, + { + "epoch": 17.932766615146832, + "grad_norm": 0.7403237223625183, + "learning_rate": 1.610825530599161e-06, + "loss": 0.4397, + "num_input_tokens_seen": 7813472, + "step": 23205 + }, + { + "epoch": 17.93663060278207, + "grad_norm": 1.03505277633667, + "learning_rate": 1.6048768138456406e-06, + "loss": 0.3939, + "num_input_tokens_seen": 7815136, + "step": 23210 + }, + { + "epoch": 17.94049459041731, + "grad_norm": 0.6871843934059143, + "learning_rate": 1.5989387373342518e-06, + "loss": 0.5319, + "num_input_tokens_seen": 7816864, + "step": 23215 + }, + { + "epoch": 17.94435857805255, + "grad_norm": 2.555375576019287, + "learning_rate": 1.593011303765668e-06, + "loss": 0.6395, + "num_input_tokens_seen": 7818592, + "step": 23220 + }, + { + "epoch": 17.94822256568779, + "grad_norm": 0.9060570597648621, + "learning_rate": 1.5870945158357214e-06, + "loss": 0.4107, + "num_input_tokens_seen": 7820192, + "step": 23225 + }, + { + "epoch": 17.952086553323028, + "grad_norm": 0.8628582954406738, + "learning_rate": 1.581188376235404e-06, + "loss": 0.4698, + "num_input_tokens_seen": 7821952, + "step": 23230 + }, + { + "epoch": 17.95595054095827, + "grad_norm": 1.1965086460113525, + "learning_rate": 1.5752928876508615e-06, + "loss": 0.3251, + "num_input_tokens_seen": 7823488, + "step": 23235 + }, + { + "epoch": 17.95981452859351, + "grad_norm": 1.0371081829071045, + "learning_rate": 1.569408052763402e-06, + "loss": 0.3824, + "num_input_tokens_seen": 7825376, + "step": 23240 + }, + { + "epoch": 17.963678516228747, + "grad_norm": 0.990441083908081, + "learning_rate": 1.5635338742494787e-06, + "loss": 0.3029, + "num_input_tokens_seen": 7826944, + "step": 23245 + }, + { + "epoch": 17.96754250386399, + "grad_norm": 1.0769695043563843, + "learning_rate": 1.5576703547807075e-06, + "loss": 0.5537, + "num_input_tokens_seen": 7828608, + "step": 23250 + }, + { + "epoch": 17.971406491499227, + "grad_norm": 0.8875908851623535, + "learning_rate": 1.5518174970238496e-06, + "loss": 0.3848, + "num_input_tokens_seen": 7830304, + "step": 23255 + }, + { + "epoch": 17.975270479134466, + "grad_norm": 1.289713978767395, + "learning_rate": 1.5459753036408175e-06, + "loss": 0.4662, + "num_input_tokens_seen": 7831936, + "step": 23260 + }, + { + "epoch": 17.979134466769708, + "grad_norm": 0.8762754201889038, + "learning_rate": 1.5401437772886745e-06, + "loss": 0.3508, + "num_input_tokens_seen": 7834048, + "step": 23265 + }, + { + "epoch": 17.982998454404946, + "grad_norm": 0.8054763674736023, + "learning_rate": 1.534322920619638e-06, + "loss": 0.3537, + "num_input_tokens_seen": 7835616, + "step": 23270 + }, + { + "epoch": 17.986862442040184, + "grad_norm": 0.6060965657234192, + "learning_rate": 1.5285127362810708e-06, + "loss": 0.3752, + "num_input_tokens_seen": 7837280, + "step": 23275 + }, + { + "epoch": 17.990726429675426, + "grad_norm": 1.0349284410476685, + "learning_rate": 1.5227132269154787e-06, + "loss": 0.7375, + "num_input_tokens_seen": 7839040, + "step": 23280 + }, + { + "epoch": 17.994590417310665, + "grad_norm": 1.1452683210372925, + "learning_rate": 1.5169243951605071e-06, + "loss": 0.5014, + "num_input_tokens_seen": 7840672, + "step": 23285 + }, + { + "epoch": 17.998454404945903, + "grad_norm": 0.6614452600479126, + "learning_rate": 1.5111462436489587e-06, + "loss": 0.3401, + "num_input_tokens_seen": 7842208, + "step": 23290 + }, + { + "epoch": 18.0, + "eval_loss": 0.4447212219238281, + "eval_runtime": 6.2464, + "eval_samples_per_second": 92.052, + "eval_steps_per_second": 23.053, + "num_input_tokens_seen": 7842624, + "step": 23292 + }, + { + "epoch": 18.002318392581145, + "grad_norm": 1.046319603919983, + "learning_rate": 1.5053787750087645e-06, + "loss": 0.5099, + "num_input_tokens_seen": 7843552, + "step": 23295 + }, + { + "epoch": 18.006182380216384, + "grad_norm": 1.018530011177063, + "learning_rate": 1.4996219918630068e-06, + "loss": 0.5311, + "num_input_tokens_seen": 7845184, + "step": 23300 + }, + { + "epoch": 18.010046367851622, + "grad_norm": 0.9071861505508423, + "learning_rate": 1.4938758968299022e-06, + "loss": 0.5992, + "num_input_tokens_seen": 7846848, + "step": 23305 + }, + { + "epoch": 18.013910355486864, + "grad_norm": 0.7536181211471558, + "learning_rate": 1.4881404925228187e-06, + "loss": 0.4011, + "num_input_tokens_seen": 7848544, + "step": 23310 + }, + { + "epoch": 18.017774343122102, + "grad_norm": 0.7473558783531189, + "learning_rate": 1.4824157815502448e-06, + "loss": 0.2976, + "num_input_tokens_seen": 7850176, + "step": 23315 + }, + { + "epoch": 18.02163833075734, + "grad_norm": 1.103052020072937, + "learning_rate": 1.4767017665158145e-06, + "loss": 0.4757, + "num_input_tokens_seen": 7851808, + "step": 23320 + }, + { + "epoch": 18.025502318392583, + "grad_norm": 0.8527570366859436, + "learning_rate": 1.4709984500182987e-06, + "loss": 0.6638, + "num_input_tokens_seen": 7853536, + "step": 23325 + }, + { + "epoch": 18.02936630602782, + "grad_norm": 0.6516656875610352, + "learning_rate": 1.4653058346515953e-06, + "loss": 0.4167, + "num_input_tokens_seen": 7855136, + "step": 23330 + }, + { + "epoch": 18.03323029366306, + "grad_norm": 0.7242406606674194, + "learning_rate": 1.4596239230047381e-06, + "loss": 0.3952, + "num_input_tokens_seen": 7856864, + "step": 23335 + }, + { + "epoch": 18.037094281298298, + "grad_norm": 1.5943115949630737, + "learning_rate": 1.453952717661905e-06, + "loss": 0.5679, + "num_input_tokens_seen": 7858464, + "step": 23340 + }, + { + "epoch": 18.04095826893354, + "grad_norm": 0.5928425788879395, + "learning_rate": 1.4482922212023797e-06, + "loss": 0.3481, + "num_input_tokens_seen": 7860256, + "step": 23345 + }, + { + "epoch": 18.04482225656878, + "grad_norm": 0.8551400899887085, + "learning_rate": 1.4426424362006057e-06, + "loss": 0.5168, + "num_input_tokens_seen": 7861952, + "step": 23350 + }, + { + "epoch": 18.048686244204017, + "grad_norm": 1.1145281791687012, + "learning_rate": 1.4370033652261277e-06, + "loss": 0.4413, + "num_input_tokens_seen": 7863616, + "step": 23355 + }, + { + "epoch": 18.05255023183926, + "grad_norm": 0.9601885676383972, + "learning_rate": 1.4313750108436359e-06, + "loss": 0.3563, + "num_input_tokens_seen": 7865216, + "step": 23360 + }, + { + "epoch": 18.056414219474497, + "grad_norm": 1.6645326614379883, + "learning_rate": 1.4257573756129321e-06, + "loss": 0.6268, + "num_input_tokens_seen": 7866848, + "step": 23365 + }, + { + "epoch": 18.060278207109736, + "grad_norm": 0.7394644021987915, + "learning_rate": 1.4201504620889538e-06, + "loss": 0.3878, + "num_input_tokens_seen": 7868640, + "step": 23370 + }, + { + "epoch": 18.064142194744978, + "grad_norm": 0.9675918221473694, + "learning_rate": 1.4145542728217637e-06, + "loss": 0.5709, + "num_input_tokens_seen": 7870496, + "step": 23375 + }, + { + "epoch": 18.068006182380216, + "grad_norm": 0.8582371473312378, + "learning_rate": 1.4089688103565368e-06, + "loss": 0.3465, + "num_input_tokens_seen": 7872096, + "step": 23380 + }, + { + "epoch": 18.071870170015455, + "grad_norm": 0.8879966139793396, + "learning_rate": 1.4033940772335719e-06, + "loss": 0.3852, + "num_input_tokens_seen": 7873600, + "step": 23385 + }, + { + "epoch": 18.075734157650697, + "grad_norm": 1.3991281986236572, + "learning_rate": 1.397830075988299e-06, + "loss": 0.4681, + "num_input_tokens_seen": 7875616, + "step": 23390 + }, + { + "epoch": 18.079598145285935, + "grad_norm": 0.8433775901794434, + "learning_rate": 1.392276809151255e-06, + "loss": 0.4233, + "num_input_tokens_seen": 7877376, + "step": 23395 + }, + { + "epoch": 18.083462132921174, + "grad_norm": 0.9845796227455139, + "learning_rate": 1.3867342792481003e-06, + "loss": 0.4207, + "num_input_tokens_seen": 7879008, + "step": 23400 + }, + { + "epoch": 18.087326120556416, + "grad_norm": 0.627405047416687, + "learning_rate": 1.3812024887996045e-06, + "loss": 0.4802, + "num_input_tokens_seen": 7880768, + "step": 23405 + }, + { + "epoch": 18.091190108191654, + "grad_norm": 1.5126547813415527, + "learning_rate": 1.3756814403216688e-06, + "loss": 0.5037, + "num_input_tokens_seen": 7882528, + "step": 23410 + }, + { + "epoch": 18.095054095826892, + "grad_norm": 0.6157695055007935, + "learning_rate": 1.3701711363252962e-06, + "loss": 0.3401, + "num_input_tokens_seen": 7884320, + "step": 23415 + }, + { + "epoch": 18.098918083462134, + "grad_norm": 2.1682567596435547, + "learning_rate": 1.3646715793166037e-06, + "loss": 0.5923, + "num_input_tokens_seen": 7885728, + "step": 23420 + }, + { + "epoch": 18.102782071097373, + "grad_norm": 0.6888889074325562, + "learning_rate": 1.3591827717968186e-06, + "loss": 0.3785, + "num_input_tokens_seen": 7887456, + "step": 23425 + }, + { + "epoch": 18.10664605873261, + "grad_norm": 0.7219893932342529, + "learning_rate": 1.3537047162622912e-06, + "loss": 0.3605, + "num_input_tokens_seen": 7889184, + "step": 23430 + }, + { + "epoch": 18.110510046367853, + "grad_norm": 1.0837304592132568, + "learning_rate": 1.348237415204473e-06, + "loss": 0.4589, + "num_input_tokens_seen": 7890752, + "step": 23435 + }, + { + "epoch": 18.11437403400309, + "grad_norm": 1.2403433322906494, + "learning_rate": 1.3427808711099165e-06, + "loss": 0.8711, + "num_input_tokens_seen": 7892448, + "step": 23440 + }, + { + "epoch": 18.11823802163833, + "grad_norm": 2.194737195968628, + "learning_rate": 1.3373350864603034e-06, + "loss": 0.5613, + "num_input_tokens_seen": 7893920, + "step": 23445 + }, + { + "epoch": 18.122102009273572, + "grad_norm": 1.28925359249115, + "learning_rate": 1.3319000637324025e-06, + "loss": 0.4945, + "num_input_tokens_seen": 7895776, + "step": 23450 + }, + { + "epoch": 18.12596599690881, + "grad_norm": 1.0317364931106567, + "learning_rate": 1.3264758053980974e-06, + "loss": 0.4361, + "num_input_tokens_seen": 7897504, + "step": 23455 + }, + { + "epoch": 18.12982998454405, + "grad_norm": 1.4959992170333862, + "learning_rate": 1.3210623139243678e-06, + "loss": 0.469, + "num_input_tokens_seen": 7899040, + "step": 23460 + }, + { + "epoch": 18.133693972179287, + "grad_norm": 1.0198211669921875, + "learning_rate": 1.315659591773305e-06, + "loss": 0.5716, + "num_input_tokens_seen": 7900480, + "step": 23465 + }, + { + "epoch": 18.13755795981453, + "grad_norm": 1.2747077941894531, + "learning_rate": 1.3102676414020993e-06, + "loss": 0.575, + "num_input_tokens_seen": 7901888, + "step": 23470 + }, + { + "epoch": 18.141421947449768, + "grad_norm": 1.2196166515350342, + "learning_rate": 1.304886465263047e-06, + "loss": 0.4349, + "num_input_tokens_seen": 7903744, + "step": 23475 + }, + { + "epoch": 18.145285935085006, + "grad_norm": 1.1154414415359497, + "learning_rate": 1.2995160658035272e-06, + "loss": 0.3598, + "num_input_tokens_seen": 7905440, + "step": 23480 + }, + { + "epoch": 18.149149922720248, + "grad_norm": 0.8685609698295593, + "learning_rate": 1.2941564454660438e-06, + "loss": 0.3399, + "num_input_tokens_seen": 7906880, + "step": 23485 + }, + { + "epoch": 18.153013910355487, + "grad_norm": 1.535690426826477, + "learning_rate": 1.2888076066881778e-06, + "loss": 0.4597, + "num_input_tokens_seen": 7908512, + "step": 23490 + }, + { + "epoch": 18.156877897990725, + "grad_norm": 0.861924409866333, + "learning_rate": 1.2834695519026109e-06, + "loss": 0.3845, + "num_input_tokens_seen": 7910080, + "step": 23495 + }, + { + "epoch": 18.160741885625967, + "grad_norm": 0.686421275138855, + "learning_rate": 1.2781422835371259e-06, + "loss": 0.4262, + "num_input_tokens_seen": 7911712, + "step": 23500 + }, + { + "epoch": 18.164605873261205, + "grad_norm": 1.1435225009918213, + "learning_rate": 1.2728258040145907e-06, + "loss": 0.459, + "num_input_tokens_seen": 7913248, + "step": 23505 + }, + { + "epoch": 18.168469860896444, + "grad_norm": 1.09714937210083, + "learning_rate": 1.2675201157529792e-06, + "loss": 0.4538, + "num_input_tokens_seen": 7914976, + "step": 23510 + }, + { + "epoch": 18.172333848531686, + "grad_norm": 1.4341585636138916, + "learning_rate": 1.2622252211653473e-06, + "loss": 0.5873, + "num_input_tokens_seen": 7916672, + "step": 23515 + }, + { + "epoch": 18.176197836166924, + "grad_norm": 1.4640921354293823, + "learning_rate": 1.2569411226598438e-06, + "loss": 0.5083, + "num_input_tokens_seen": 7918336, + "step": 23520 + }, + { + "epoch": 18.180061823802163, + "grad_norm": 1.2753840684890747, + "learning_rate": 1.2516678226397127e-06, + "loss": 0.4112, + "num_input_tokens_seen": 7920128, + "step": 23525 + }, + { + "epoch": 18.183925811437405, + "grad_norm": 1.4099947214126587, + "learning_rate": 1.2464053235032775e-06, + "loss": 0.4055, + "num_input_tokens_seen": 7921824, + "step": 23530 + }, + { + "epoch": 18.187789799072643, + "grad_norm": 1.8579721450805664, + "learning_rate": 1.2411536276439567e-06, + "loss": 0.4653, + "num_input_tokens_seen": 7923488, + "step": 23535 + }, + { + "epoch": 18.19165378670788, + "grad_norm": 0.8972510695457458, + "learning_rate": 1.2359127374502482e-06, + "loss": 0.3603, + "num_input_tokens_seen": 7925152, + "step": 23540 + }, + { + "epoch": 18.195517774343124, + "grad_norm": 1.6834410429000854, + "learning_rate": 1.2306826553057454e-06, + "loss": 0.4244, + "num_input_tokens_seen": 7926880, + "step": 23545 + }, + { + "epoch": 18.199381761978362, + "grad_norm": 1.0509382486343384, + "learning_rate": 1.2254633835891205e-06, + "loss": 0.3454, + "num_input_tokens_seen": 7928704, + "step": 23550 + }, + { + "epoch": 18.2032457496136, + "grad_norm": 1.7658499479293823, + "learning_rate": 1.2202549246741302e-06, + "loss": 0.42, + "num_input_tokens_seen": 7930752, + "step": 23555 + }, + { + "epoch": 18.207109737248842, + "grad_norm": 1.1289076805114746, + "learning_rate": 1.21505728092961e-06, + "loss": 0.4464, + "num_input_tokens_seen": 7932544, + "step": 23560 + }, + { + "epoch": 18.21097372488408, + "grad_norm": 1.1763187646865845, + "learning_rate": 1.2098704547194834e-06, + "loss": 0.4013, + "num_input_tokens_seen": 7934176, + "step": 23565 + }, + { + "epoch": 18.21483771251932, + "grad_norm": 0.9077515006065369, + "learning_rate": 1.2046944484027462e-06, + "loss": 0.3808, + "num_input_tokens_seen": 7935968, + "step": 23570 + }, + { + "epoch": 18.21870170015456, + "grad_norm": 0.8464998602867126, + "learning_rate": 1.1995292643334794e-06, + "loss": 0.5159, + "num_input_tokens_seen": 7937664, + "step": 23575 + }, + { + "epoch": 18.2225656877898, + "grad_norm": 0.6509355306625366, + "learning_rate": 1.1943749048608343e-06, + "loss": 0.4111, + "num_input_tokens_seen": 7939264, + "step": 23580 + }, + { + "epoch": 18.226429675425038, + "grad_norm": 1.0727828741073608, + "learning_rate": 1.18923137232905e-06, + "loss": 0.3846, + "num_input_tokens_seen": 7941120, + "step": 23585 + }, + { + "epoch": 18.230293663060277, + "grad_norm": 0.662339985370636, + "learning_rate": 1.1840986690774353e-06, + "loss": 0.4387, + "num_input_tokens_seen": 7943040, + "step": 23590 + }, + { + "epoch": 18.23415765069552, + "grad_norm": 0.855204701423645, + "learning_rate": 1.1789767974403759e-06, + "loss": 0.4183, + "num_input_tokens_seen": 7944864, + "step": 23595 + }, + { + "epoch": 18.238021638330757, + "grad_norm": 2.0758769512176514, + "learning_rate": 1.173865759747328e-06, + "loss": 0.4024, + "num_input_tokens_seen": 7946560, + "step": 23600 + }, + { + "epoch": 18.241885625965995, + "grad_norm": 1.3251495361328125, + "learning_rate": 1.1687655583228207e-06, + "loss": 0.5151, + "num_input_tokens_seen": 7948320, + "step": 23605 + }, + { + "epoch": 18.245749613601237, + "grad_norm": 0.8413941860198975, + "learning_rate": 1.1636761954864573e-06, + "loss": 0.3708, + "num_input_tokens_seen": 7950080, + "step": 23610 + }, + { + "epoch": 18.249613601236476, + "grad_norm": 0.8677348494529724, + "learning_rate": 1.158597673552908e-06, + "loss": 0.4216, + "num_input_tokens_seen": 7951584, + "step": 23615 + }, + { + "epoch": 18.253477588871714, + "grad_norm": 0.716961681842804, + "learning_rate": 1.15352999483192e-06, + "loss": 0.3514, + "num_input_tokens_seen": 7953056, + "step": 23620 + }, + { + "epoch": 18.257341576506956, + "grad_norm": 1.052541971206665, + "learning_rate": 1.1484731616282967e-06, + "loss": 0.5077, + "num_input_tokens_seen": 7954720, + "step": 23625 + }, + { + "epoch": 18.261205564142195, + "grad_norm": 0.8514098525047302, + "learning_rate": 1.1434271762419235e-06, + "loss": 0.4388, + "num_input_tokens_seen": 7956448, + "step": 23630 + }, + { + "epoch": 18.265069551777433, + "grad_norm": 1.0098835229873657, + "learning_rate": 1.138392040967745e-06, + "loss": 0.4501, + "num_input_tokens_seen": 7958144, + "step": 23635 + }, + { + "epoch": 18.268933539412675, + "grad_norm": 0.6743622422218323, + "learning_rate": 1.1333677580957657e-06, + "loss": 0.4318, + "num_input_tokens_seen": 7959680, + "step": 23640 + }, + { + "epoch": 18.272797527047913, + "grad_norm": 0.9019069671630859, + "learning_rate": 1.1283543299110632e-06, + "loss": 0.5323, + "num_input_tokens_seen": 7961152, + "step": 23645 + }, + { + "epoch": 18.276661514683152, + "grad_norm": 0.7401171326637268, + "learning_rate": 1.1233517586937664e-06, + "loss": 0.3254, + "num_input_tokens_seen": 7963072, + "step": 23650 + }, + { + "epoch": 18.280525502318394, + "grad_norm": 1.2176625728607178, + "learning_rate": 1.1183600467190885e-06, + "loss": 0.4123, + "num_input_tokens_seen": 7964928, + "step": 23655 + }, + { + "epoch": 18.284389489953632, + "grad_norm": 1.079466462135315, + "learning_rate": 1.1133791962572805e-06, + "loss": 0.3245, + "num_input_tokens_seen": 7966624, + "step": 23660 + }, + { + "epoch": 18.28825347758887, + "grad_norm": 0.9282268285751343, + "learning_rate": 1.1084092095736659e-06, + "loss": 0.3971, + "num_input_tokens_seen": 7968032, + "step": 23665 + }, + { + "epoch": 18.292117465224113, + "grad_norm": 1.0733413696289062, + "learning_rate": 1.103450088928623e-06, + "loss": 0.5823, + "num_input_tokens_seen": 7969664, + "step": 23670 + }, + { + "epoch": 18.29598145285935, + "grad_norm": 1.7676650285720825, + "learning_rate": 1.0985018365775922e-06, + "loss": 0.6555, + "num_input_tokens_seen": 7971456, + "step": 23675 + }, + { + "epoch": 18.29984544049459, + "grad_norm": 1.0531691312789917, + "learning_rate": 1.093564454771065e-06, + "loss": 0.4053, + "num_input_tokens_seen": 7973312, + "step": 23680 + }, + { + "epoch": 18.30370942812983, + "grad_norm": 1.0006672143936157, + "learning_rate": 1.088637945754592e-06, + "loss": 0.3418, + "num_input_tokens_seen": 7974976, + "step": 23685 + }, + { + "epoch": 18.30757341576507, + "grad_norm": 0.9149067997932434, + "learning_rate": 1.0837223117687839e-06, + "loss": 0.4025, + "num_input_tokens_seen": 7976512, + "step": 23690 + }, + { + "epoch": 18.31143740340031, + "grad_norm": 1.624746322631836, + "learning_rate": 1.0788175550492969e-06, + "loss": 0.6113, + "num_input_tokens_seen": 7977952, + "step": 23695 + }, + { + "epoch": 18.315301391035547, + "grad_norm": 0.9131195545196533, + "learning_rate": 1.0739236778268435e-06, + "loss": 0.5132, + "num_input_tokens_seen": 7979680, + "step": 23700 + }, + { + "epoch": 18.31916537867079, + "grad_norm": 0.9557772278785706, + "learning_rate": 1.0690406823271909e-06, + "loss": 0.4531, + "num_input_tokens_seen": 7981504, + "step": 23705 + }, + { + "epoch": 18.323029366306027, + "grad_norm": 1.0316540002822876, + "learning_rate": 1.0641685707711486e-06, + "loss": 0.5796, + "num_input_tokens_seen": 7983136, + "step": 23710 + }, + { + "epoch": 18.326893353941266, + "grad_norm": 1.8220103979110718, + "learning_rate": 1.059307345374591e-06, + "loss": 0.5254, + "num_input_tokens_seen": 7984736, + "step": 23715 + }, + { + "epoch": 18.330757341576508, + "grad_norm": 0.9613310098648071, + "learning_rate": 1.0544570083484223e-06, + "loss": 0.4591, + "num_input_tokens_seen": 7986336, + "step": 23720 + }, + { + "epoch": 18.334621329211746, + "grad_norm": 0.8159534931182861, + "learning_rate": 1.0496175618986166e-06, + "loss": 0.4174, + "num_input_tokens_seen": 7988224, + "step": 23725 + }, + { + "epoch": 18.338485316846985, + "grad_norm": 1.8652052879333496, + "learning_rate": 1.0447890082261742e-06, + "loss": 0.5732, + "num_input_tokens_seen": 7989920, + "step": 23730 + }, + { + "epoch": 18.342349304482227, + "grad_norm": 1.0244779586791992, + "learning_rate": 1.0399713495271551e-06, + "loss": 0.3651, + "num_input_tokens_seen": 7991456, + "step": 23735 + }, + { + "epoch": 18.346213292117465, + "grad_norm": 0.7476295232772827, + "learning_rate": 1.035164587992657e-06, + "loss": 0.4171, + "num_input_tokens_seen": 7993024, + "step": 23740 + }, + { + "epoch": 18.350077279752703, + "grad_norm": 0.6203731298446655, + "learning_rate": 1.0303687258088223e-06, + "loss": 0.4829, + "num_input_tokens_seen": 7994656, + "step": 23745 + }, + { + "epoch": 18.353941267387945, + "grad_norm": 0.8393150568008423, + "learning_rate": 1.0255837651568373e-06, + "loss": 0.3752, + "num_input_tokens_seen": 7996448, + "step": 23750 + }, + { + "epoch": 18.357805255023184, + "grad_norm": 1.2570436000823975, + "learning_rate": 1.0208097082129332e-06, + "loss": 0.3928, + "num_input_tokens_seen": 7998272, + "step": 23755 + }, + { + "epoch": 18.361669242658422, + "grad_norm": 0.9559906125068665, + "learning_rate": 1.0160465571483812e-06, + "loss": 0.3627, + "num_input_tokens_seen": 8000192, + "step": 23760 + }, + { + "epoch": 18.365533230293664, + "grad_norm": 0.5921657681465149, + "learning_rate": 1.0112943141294907e-06, + "loss": 0.4775, + "num_input_tokens_seen": 8001952, + "step": 23765 + }, + { + "epoch": 18.369397217928903, + "grad_norm": 0.9006633162498474, + "learning_rate": 1.00655298131761e-06, + "loss": 0.3373, + "num_input_tokens_seen": 8003584, + "step": 23770 + }, + { + "epoch": 18.37326120556414, + "grad_norm": 1.0212990045547485, + "learning_rate": 1.001822560869123e-06, + "loss": 0.406, + "num_input_tokens_seen": 8005280, + "step": 23775 + }, + { + "epoch": 18.377125193199383, + "grad_norm": 0.9663121104240417, + "learning_rate": 9.971030549354554e-07, + "loss": 0.4008, + "num_input_tokens_seen": 8007104, + "step": 23780 + }, + { + "epoch": 18.38098918083462, + "grad_norm": 0.9937095046043396, + "learning_rate": 9.923944656630628e-07, + "loss": 0.3871, + "num_input_tokens_seen": 8008672, + "step": 23785 + }, + { + "epoch": 18.38485316846986, + "grad_norm": 1.3066445589065552, + "learning_rate": 9.876967951934435e-07, + "loss": 0.6489, + "num_input_tokens_seen": 8010240, + "step": 23790 + }, + { + "epoch": 18.388717156105102, + "grad_norm": 0.7807162404060364, + "learning_rate": 9.83010045663127e-07, + "loss": 0.659, + "num_input_tokens_seen": 8011840, + "step": 23795 + }, + { + "epoch": 18.39258114374034, + "grad_norm": 0.7051476836204529, + "learning_rate": 9.783342192036777e-07, + "loss": 0.3317, + "num_input_tokens_seen": 8013536, + "step": 23800 + }, + { + "epoch": 18.39644513137558, + "grad_norm": 0.8243148326873779, + "learning_rate": 9.736693179416834e-07, + "loss": 0.5688, + "num_input_tokens_seen": 8015040, + "step": 23805 + }, + { + "epoch": 18.40030911901082, + "grad_norm": 0.8464577198028564, + "learning_rate": 9.690153439987692e-07, + "loss": 0.6422, + "num_input_tokens_seen": 8016608, + "step": 23810 + }, + { + "epoch": 18.40417310664606, + "grad_norm": 0.9502264857292175, + "learning_rate": 9.643722994915949e-07, + "loss": 0.872, + "num_input_tokens_seen": 8018304, + "step": 23815 + }, + { + "epoch": 18.408037094281298, + "grad_norm": 0.8824434876441956, + "learning_rate": 9.597401865318405e-07, + "loss": 0.5236, + "num_input_tokens_seen": 8020064, + "step": 23820 + }, + { + "epoch": 18.41190108191654, + "grad_norm": 0.8446050882339478, + "learning_rate": 9.55119007226221e-07, + "loss": 0.497, + "num_input_tokens_seen": 8021728, + "step": 23825 + }, + { + "epoch": 18.415765069551778, + "grad_norm": 0.7718901634216309, + "learning_rate": 9.505087636764748e-07, + "loss": 0.3451, + "num_input_tokens_seen": 8023360, + "step": 23830 + }, + { + "epoch": 18.419629057187016, + "grad_norm": 0.871103048324585, + "learning_rate": 9.459094579793715e-07, + "loss": 0.3722, + "num_input_tokens_seen": 8025120, + "step": 23835 + }, + { + "epoch": 18.423493044822255, + "grad_norm": 0.7830568552017212, + "learning_rate": 9.413210922267019e-07, + "loss": 0.347, + "num_input_tokens_seen": 8027040, + "step": 23840 + }, + { + "epoch": 18.427357032457497, + "grad_norm": 0.8436391949653625, + "learning_rate": 9.367436685052828e-07, + "loss": 0.3832, + "num_input_tokens_seen": 8028928, + "step": 23845 + }, + { + "epoch": 18.431221020092735, + "grad_norm": 1.498425841331482, + "learning_rate": 9.321771888969488e-07, + "loss": 0.4482, + "num_input_tokens_seen": 8030560, + "step": 23850 + }, + { + "epoch": 18.435085007727974, + "grad_norm": 1.0552829504013062, + "learning_rate": 9.276216554785666e-07, + "loss": 0.5743, + "num_input_tokens_seen": 8032448, + "step": 23855 + }, + { + "epoch": 18.438948995363216, + "grad_norm": 2.1057491302490234, + "learning_rate": 9.230770703220204e-07, + "loss": 0.547, + "num_input_tokens_seen": 8034240, + "step": 23860 + }, + { + "epoch": 18.442812982998454, + "grad_norm": 0.9302696585655212, + "learning_rate": 9.185434354942124e-07, + "loss": 0.6101, + "num_input_tokens_seen": 8035936, + "step": 23865 + }, + { + "epoch": 18.446676970633693, + "grad_norm": 1.150943398475647, + "learning_rate": 9.140207530570683e-07, + "loss": 0.4982, + "num_input_tokens_seen": 8037920, + "step": 23870 + }, + { + "epoch": 18.450540958268935, + "grad_norm": 0.7538676857948303, + "learning_rate": 9.095090250675315e-07, + "loss": 0.3701, + "num_input_tokens_seen": 8039744, + "step": 23875 + }, + { + "epoch": 18.454404945904173, + "grad_norm": 0.6941826939582825, + "learning_rate": 9.050082535775634e-07, + "loss": 0.3438, + "num_input_tokens_seen": 8041312, + "step": 23880 + }, + { + "epoch": 18.45826893353941, + "grad_norm": 1.2320266962051392, + "learning_rate": 9.005184406341405e-07, + "loss": 0.4675, + "num_input_tokens_seen": 8042880, + "step": 23885 + }, + { + "epoch": 18.462132921174653, + "grad_norm": 1.1923420429229736, + "learning_rate": 8.960395882792544e-07, + "loss": 0.4818, + "num_input_tokens_seen": 8044832, + "step": 23890 + }, + { + "epoch": 18.465996908809892, + "grad_norm": 1.2371371984481812, + "learning_rate": 8.9157169854992e-07, + "loss": 0.4807, + "num_input_tokens_seen": 8046688, + "step": 23895 + }, + { + "epoch": 18.46986089644513, + "grad_norm": 1.2072370052337646, + "learning_rate": 8.871147734781538e-07, + "loss": 0.5158, + "num_input_tokens_seen": 8048384, + "step": 23900 + }, + { + "epoch": 18.473724884080372, + "grad_norm": 1.0245811939239502, + "learning_rate": 8.826688150909979e-07, + "loss": 0.471, + "num_input_tokens_seen": 8049984, + "step": 23905 + }, + { + "epoch": 18.47758887171561, + "grad_norm": 1.0173358917236328, + "learning_rate": 8.782338254104932e-07, + "loss": 0.3554, + "num_input_tokens_seen": 8051584, + "step": 23910 + }, + { + "epoch": 18.48145285935085, + "grad_norm": 1.1196449995040894, + "learning_rate": 8.738098064537098e-07, + "loss": 0.3815, + "num_input_tokens_seen": 8053344, + "step": 23915 + }, + { + "epoch": 18.48531684698609, + "grad_norm": 1.1000112295150757, + "learning_rate": 8.693967602327102e-07, + "loss": 0.3967, + "num_input_tokens_seen": 8055072, + "step": 23920 + }, + { + "epoch": 18.48918083462133, + "grad_norm": 1.0380680561065674, + "learning_rate": 8.649946887545751e-07, + "loss": 0.3701, + "num_input_tokens_seen": 8056896, + "step": 23925 + }, + { + "epoch": 18.493044822256568, + "grad_norm": 1.6158372163772583, + "learning_rate": 8.606035940213974e-07, + "loss": 0.4193, + "num_input_tokens_seen": 8058208, + "step": 23930 + }, + { + "epoch": 18.49690880989181, + "grad_norm": 0.8458508253097534, + "learning_rate": 8.562234780302686e-07, + "loss": 0.3889, + "num_input_tokens_seen": 8059904, + "step": 23935 + }, + { + "epoch": 18.50077279752705, + "grad_norm": 0.5191845893859863, + "learning_rate": 8.51854342773295e-07, + "loss": 0.3936, + "num_input_tokens_seen": 8061920, + "step": 23940 + }, + { + "epoch": 18.504636785162287, + "grad_norm": 0.9308202266693115, + "learning_rate": 8.474961902375816e-07, + "loss": 0.4436, + "num_input_tokens_seen": 8063520, + "step": 23945 + }, + { + "epoch": 18.508500772797525, + "grad_norm": 0.792233407497406, + "learning_rate": 8.431490224052457e-07, + "loss": 0.4518, + "num_input_tokens_seen": 8064960, + "step": 23950 + }, + { + "epoch": 18.512364760432767, + "grad_norm": 1.1354738473892212, + "learning_rate": 8.388128412534029e-07, + "loss": 0.5454, + "num_input_tokens_seen": 8066560, + "step": 23955 + }, + { + "epoch": 18.516228748068006, + "grad_norm": 0.8447077870368958, + "learning_rate": 8.344876487541759e-07, + "loss": 0.4387, + "num_input_tokens_seen": 8068192, + "step": 23960 + }, + { + "epoch": 18.520092735703244, + "grad_norm": 1.1906555891036987, + "learning_rate": 8.30173446874688e-07, + "loss": 0.4453, + "num_input_tokens_seen": 8069984, + "step": 23965 + }, + { + "epoch": 18.523956723338486, + "grad_norm": 1.161672592163086, + "learning_rate": 8.258702375770644e-07, + "loss": 0.5107, + "num_input_tokens_seen": 8071648, + "step": 23970 + }, + { + "epoch": 18.527820710973725, + "grad_norm": 2.2608797550201416, + "learning_rate": 8.215780228184311e-07, + "loss": 0.5026, + "num_input_tokens_seen": 8073440, + "step": 23975 + }, + { + "epoch": 18.531684698608963, + "grad_norm": 1.3362122774124146, + "learning_rate": 8.172968045509127e-07, + "loss": 0.5088, + "num_input_tokens_seen": 8075200, + "step": 23980 + }, + { + "epoch": 18.535548686244205, + "grad_norm": 1.461244821548462, + "learning_rate": 8.130265847216295e-07, + "loss": 0.4079, + "num_input_tokens_seen": 8076928, + "step": 23985 + }, + { + "epoch": 18.539412673879443, + "grad_norm": 1.1092498302459717, + "learning_rate": 8.087673652727057e-07, + "loss": 0.4828, + "num_input_tokens_seen": 8078848, + "step": 23990 + }, + { + "epoch": 18.543276661514682, + "grad_norm": 1.2492246627807617, + "learning_rate": 8.045191481412584e-07, + "loss": 0.361, + "num_input_tokens_seen": 8080544, + "step": 23995 + }, + { + "epoch": 18.547140649149924, + "grad_norm": 0.8591610789299011, + "learning_rate": 8.00281935259406e-07, + "loss": 0.4226, + "num_input_tokens_seen": 8082144, + "step": 24000 + }, + { + "epoch": 18.551004636785162, + "grad_norm": 0.9022918343544006, + "learning_rate": 7.960557285542569e-07, + "loss": 0.4345, + "num_input_tokens_seen": 8084000, + "step": 24005 + }, + { + "epoch": 18.5548686244204, + "grad_norm": 0.7921047210693359, + "learning_rate": 7.918405299479126e-07, + "loss": 0.5675, + "num_input_tokens_seen": 8085856, + "step": 24010 + }, + { + "epoch": 18.558732612055643, + "grad_norm": 0.8335136771202087, + "learning_rate": 7.876363413574728e-07, + "loss": 0.3702, + "num_input_tokens_seen": 8087680, + "step": 24015 + }, + { + "epoch": 18.56259659969088, + "grad_norm": 1.337491750717163, + "learning_rate": 7.834431646950275e-07, + "loss": 0.5195, + "num_input_tokens_seen": 8089760, + "step": 24020 + }, + { + "epoch": 18.56646058732612, + "grad_norm": 1.3640937805175781, + "learning_rate": 7.792610018676538e-07, + "loss": 0.4042, + "num_input_tokens_seen": 8091328, + "step": 24025 + }, + { + "epoch": 18.57032457496136, + "grad_norm": 1.8697383403778076, + "learning_rate": 7.750898547774305e-07, + "loss": 0.4217, + "num_input_tokens_seen": 8093280, + "step": 24030 + }, + { + "epoch": 18.5741885625966, + "grad_norm": 0.8403565883636475, + "learning_rate": 7.709297253214231e-07, + "loss": 0.4693, + "num_input_tokens_seen": 8095040, + "step": 24035 + }, + { + "epoch": 18.57805255023184, + "grad_norm": 1.0010437965393066, + "learning_rate": 7.667806153916768e-07, + "loss": 0.3899, + "num_input_tokens_seen": 8096640, + "step": 24040 + }, + { + "epoch": 18.58191653786708, + "grad_norm": 1.2245954275131226, + "learning_rate": 7.626425268752318e-07, + "loss": 0.4144, + "num_input_tokens_seen": 8098528, + "step": 24045 + }, + { + "epoch": 18.58578052550232, + "grad_norm": 0.7652437686920166, + "learning_rate": 7.585154616541191e-07, + "loss": 0.3657, + "num_input_tokens_seen": 8100352, + "step": 24050 + }, + { + "epoch": 18.589644513137557, + "grad_norm": 1.067984700202942, + "learning_rate": 7.543994216053535e-07, + "loss": 0.3971, + "num_input_tokens_seen": 8101888, + "step": 24055 + }, + { + "epoch": 18.5935085007728, + "grad_norm": 1.3772095441818237, + "learning_rate": 7.502944086009267e-07, + "loss": 0.4485, + "num_input_tokens_seen": 8103456, + "step": 24060 + }, + { + "epoch": 18.597372488408038, + "grad_norm": 0.8983032703399658, + "learning_rate": 7.462004245078313e-07, + "loss": 0.314, + "num_input_tokens_seen": 8105184, + "step": 24065 + }, + { + "epoch": 18.601236476043276, + "grad_norm": 0.7636988162994385, + "learning_rate": 7.421174711880307e-07, + "loss": 0.3691, + "num_input_tokens_seen": 8106976, + "step": 24070 + }, + { + "epoch": 18.605100463678518, + "grad_norm": 0.9477993845939636, + "learning_rate": 7.380455504984812e-07, + "loss": 0.4499, + "num_input_tokens_seen": 8108480, + "step": 24075 + }, + { + "epoch": 18.608964451313756, + "grad_norm": 1.0352174043655396, + "learning_rate": 7.339846642911152e-07, + "loss": 0.5474, + "num_input_tokens_seen": 8110080, + "step": 24080 + }, + { + "epoch": 18.612828438948995, + "grad_norm": 0.8890045285224915, + "learning_rate": 7.299348144128471e-07, + "loss": 0.6049, + "num_input_tokens_seen": 8111904, + "step": 24085 + }, + { + "epoch": 18.616692426584233, + "grad_norm": 0.7946910262107849, + "learning_rate": 7.258960027055756e-07, + "loss": 0.4169, + "num_input_tokens_seen": 8113568, + "step": 24090 + }, + { + "epoch": 18.620556414219475, + "grad_norm": 1.2336289882659912, + "learning_rate": 7.218682310061675e-07, + "loss": 0.4673, + "num_input_tokens_seen": 8115232, + "step": 24095 + }, + { + "epoch": 18.624420401854714, + "grad_norm": 0.9571043252944946, + "learning_rate": 7.178515011464882e-07, + "loss": 0.451, + "num_input_tokens_seen": 8116768, + "step": 24100 + }, + { + "epoch": 18.628284389489952, + "grad_norm": 0.8007246255874634, + "learning_rate": 7.138458149533678e-07, + "loss": 0.3895, + "num_input_tokens_seen": 8118336, + "step": 24105 + }, + { + "epoch": 18.632148377125194, + "grad_norm": 0.8718985319137573, + "learning_rate": 7.098511742486103e-07, + "loss": 0.4228, + "num_input_tokens_seen": 8120032, + "step": 24110 + }, + { + "epoch": 18.636012364760433, + "grad_norm": 0.8599821329116821, + "learning_rate": 7.058675808490095e-07, + "loss": 0.3872, + "num_input_tokens_seen": 8121600, + "step": 24115 + }, + { + "epoch": 18.63987635239567, + "grad_norm": 1.0739319324493408, + "learning_rate": 7.018950365663246e-07, + "loss": 0.4208, + "num_input_tokens_seen": 8123456, + "step": 24120 + }, + { + "epoch": 18.643740340030913, + "grad_norm": 0.6927552223205566, + "learning_rate": 6.979335432072937e-07, + "loss": 0.4437, + "num_input_tokens_seen": 8125056, + "step": 24125 + }, + { + "epoch": 18.64760432766615, + "grad_norm": 0.7644144892692566, + "learning_rate": 6.939831025736226e-07, + "loss": 0.3678, + "num_input_tokens_seen": 8126784, + "step": 24130 + }, + { + "epoch": 18.65146831530139, + "grad_norm": 0.7974783778190613, + "learning_rate": 6.900437164620022e-07, + "loss": 0.6138, + "num_input_tokens_seen": 8128384, + "step": 24135 + }, + { + "epoch": 18.655332302936632, + "grad_norm": 1.323075771331787, + "learning_rate": 6.861153866640879e-07, + "loss": 0.5226, + "num_input_tokens_seen": 8129984, + "step": 24140 + }, + { + "epoch": 18.65919629057187, + "grad_norm": 1.9687765836715698, + "learning_rate": 6.821981149665064e-07, + "loss": 0.5569, + "num_input_tokens_seen": 8131744, + "step": 24145 + }, + { + "epoch": 18.66306027820711, + "grad_norm": 1.4983477592468262, + "learning_rate": 6.782919031508517e-07, + "loss": 0.3103, + "num_input_tokens_seen": 8133248, + "step": 24150 + }, + { + "epoch": 18.66692426584235, + "grad_norm": 1.1465758085250854, + "learning_rate": 6.743967529936974e-07, + "loss": 0.3789, + "num_input_tokens_seen": 8134976, + "step": 24155 + }, + { + "epoch": 18.67078825347759, + "grad_norm": 1.3298848867416382, + "learning_rate": 6.705126662665817e-07, + "loss": 0.4541, + "num_input_tokens_seen": 8136800, + "step": 24160 + }, + { + "epoch": 18.674652241112828, + "grad_norm": 1.3600656986236572, + "learning_rate": 6.666396447360084e-07, + "loss": 0.3773, + "num_input_tokens_seen": 8138560, + "step": 24165 + }, + { + "epoch": 18.67851622874807, + "grad_norm": 2.1564550399780273, + "learning_rate": 6.627776901634519e-07, + "loss": 0.4548, + "num_input_tokens_seen": 8140000, + "step": 24170 + }, + { + "epoch": 18.682380216383308, + "grad_norm": 1.1901170015335083, + "learning_rate": 6.589268043053514e-07, + "loss": 0.4789, + "num_input_tokens_seen": 8141568, + "step": 24175 + }, + { + "epoch": 18.686244204018546, + "grad_norm": 0.7812175750732422, + "learning_rate": 6.550869889131144e-07, + "loss": 0.3699, + "num_input_tokens_seen": 8143296, + "step": 24180 + }, + { + "epoch": 18.69010819165379, + "grad_norm": 0.7271068096160889, + "learning_rate": 6.512582457331107e-07, + "loss": 0.6679, + "num_input_tokens_seen": 8144928, + "step": 24185 + }, + { + "epoch": 18.693972179289027, + "grad_norm": 1.2868528366088867, + "learning_rate": 6.474405765066721e-07, + "loss": 0.4015, + "num_input_tokens_seen": 8146592, + "step": 24190 + }, + { + "epoch": 18.697836166924265, + "grad_norm": 0.7219987511634827, + "learning_rate": 6.436339829701044e-07, + "loss": 0.3362, + "num_input_tokens_seen": 8148160, + "step": 24195 + }, + { + "epoch": 18.701700154559504, + "grad_norm": 1.4295532703399658, + "learning_rate": 6.398384668546669e-07, + "loss": 0.6057, + "num_input_tokens_seen": 8149664, + "step": 24200 + }, + { + "epoch": 18.705564142194746, + "grad_norm": 0.844257652759552, + "learning_rate": 6.360540298865764e-07, + "loss": 0.4325, + "num_input_tokens_seen": 8151264, + "step": 24205 + }, + { + "epoch": 18.709428129829984, + "grad_norm": 1.1140153408050537, + "learning_rate": 6.322806737870279e-07, + "loss": 0.5656, + "num_input_tokens_seen": 8153056, + "step": 24210 + }, + { + "epoch": 18.713292117465222, + "grad_norm": 1.2776380777359009, + "learning_rate": 6.285184002721628e-07, + "loss": 0.4625, + "num_input_tokens_seen": 8154656, + "step": 24215 + }, + { + "epoch": 18.717156105100464, + "grad_norm": 0.890159547328949, + "learning_rate": 6.247672110530816e-07, + "loss": 0.3965, + "num_input_tokens_seen": 8156352, + "step": 24220 + }, + { + "epoch": 18.721020092735703, + "grad_norm": 1.3899339437484741, + "learning_rate": 6.210271078358503e-07, + "loss": 0.459, + "num_input_tokens_seen": 8157856, + "step": 24225 + }, + { + "epoch": 18.72488408037094, + "grad_norm": 1.5619587898254395, + "learning_rate": 6.172980923214889e-07, + "loss": 0.5695, + "num_input_tokens_seen": 8159488, + "step": 24230 + }, + { + "epoch": 18.728748068006183, + "grad_norm": 1.4335854053497314, + "learning_rate": 6.13580166205982e-07, + "loss": 0.5258, + "num_input_tokens_seen": 8160864, + "step": 24235 + }, + { + "epoch": 18.73261205564142, + "grad_norm": 1.0585994720458984, + "learning_rate": 6.098733311802552e-07, + "loss": 0.4228, + "num_input_tokens_seen": 8162336, + "step": 24240 + }, + { + "epoch": 18.73647604327666, + "grad_norm": 1.2252436876296997, + "learning_rate": 6.061775889302068e-07, + "loss": 0.4123, + "num_input_tokens_seen": 8164032, + "step": 24245 + }, + { + "epoch": 18.740340030911902, + "grad_norm": 0.7875584363937378, + "learning_rate": 6.024929411366787e-07, + "loss": 0.3803, + "num_input_tokens_seen": 8165696, + "step": 24250 + }, + { + "epoch": 18.74420401854714, + "grad_norm": 1.4066344499588013, + "learning_rate": 5.988193894754746e-07, + "loss": 0.4319, + "num_input_tokens_seen": 8167360, + "step": 24255 + }, + { + "epoch": 18.74806800618238, + "grad_norm": 1.1273267269134521, + "learning_rate": 5.951569356173414e-07, + "loss": 0.4208, + "num_input_tokens_seen": 8169696, + "step": 24260 + }, + { + "epoch": 18.75193199381762, + "grad_norm": 1.7862389087677002, + "learning_rate": 5.915055812279913e-07, + "loss": 0.5111, + "num_input_tokens_seen": 8171168, + "step": 24265 + }, + { + "epoch": 18.75579598145286, + "grad_norm": 0.9757835865020752, + "learning_rate": 5.878653279680762e-07, + "loss": 0.4731, + "num_input_tokens_seen": 8172640, + "step": 24270 + }, + { + "epoch": 18.759659969088098, + "grad_norm": 1.132973551750183, + "learning_rate": 5.842361774932109e-07, + "loss": 0.4046, + "num_input_tokens_seen": 8174144, + "step": 24275 + }, + { + "epoch": 18.76352395672334, + "grad_norm": 0.6047128438949585, + "learning_rate": 5.806181314539527e-07, + "loss": 0.3944, + "num_input_tokens_seen": 8175648, + "step": 24280 + }, + { + "epoch": 18.76738794435858, + "grad_norm": 0.7090243101119995, + "learning_rate": 5.770111914958104e-07, + "loss": 0.366, + "num_input_tokens_seen": 8177184, + "step": 24285 + }, + { + "epoch": 18.771251931993817, + "grad_norm": 1.3638601303100586, + "learning_rate": 5.734153592592412e-07, + "loss": 0.4243, + "num_input_tokens_seen": 8178848, + "step": 24290 + }, + { + "epoch": 18.77511591962906, + "grad_norm": 1.0864406824111938, + "learning_rate": 5.698306363796535e-07, + "loss": 0.395, + "num_input_tokens_seen": 8180544, + "step": 24295 + }, + { + "epoch": 18.778979907264297, + "grad_norm": 0.7808727622032166, + "learning_rate": 5.66257024487396e-07, + "loss": 0.4394, + "num_input_tokens_seen": 8182464, + "step": 24300 + }, + { + "epoch": 18.782843894899536, + "grad_norm": 0.7468141317367554, + "learning_rate": 5.626945252077714e-07, + "loss": 0.3438, + "num_input_tokens_seen": 8184384, + "step": 24305 + }, + { + "epoch": 18.786707882534778, + "grad_norm": 0.7304196357727051, + "learning_rate": 5.591431401610253e-07, + "loss": 0.4037, + "num_input_tokens_seen": 8186112, + "step": 24310 + }, + { + "epoch": 18.790571870170016, + "grad_norm": 1.2045289278030396, + "learning_rate": 5.556028709623545e-07, + "loss": 0.4644, + "num_input_tokens_seen": 8187936, + "step": 24315 + }, + { + "epoch": 18.794435857805254, + "grad_norm": 1.0108559131622314, + "learning_rate": 5.520737192218877e-07, + "loss": 0.3938, + "num_input_tokens_seen": 8189696, + "step": 24320 + }, + { + "epoch": 18.798299845440496, + "grad_norm": 1.0643342733383179, + "learning_rate": 5.48555686544705e-07, + "loss": 0.5161, + "num_input_tokens_seen": 8191456, + "step": 24325 + }, + { + "epoch": 18.802163833075735, + "grad_norm": 0.9663770794868469, + "learning_rate": 5.450487745308319e-07, + "loss": 0.3855, + "num_input_tokens_seen": 8193120, + "step": 24330 + }, + { + "epoch": 18.806027820710973, + "grad_norm": 1.0769788026809692, + "learning_rate": 5.415529847752287e-07, + "loss": 0.3537, + "num_input_tokens_seen": 8194496, + "step": 24335 + }, + { + "epoch": 18.80989180834621, + "grad_norm": 0.9536316990852356, + "learning_rate": 5.380683188678042e-07, + "loss": 0.3789, + "num_input_tokens_seen": 8195936, + "step": 24340 + }, + { + "epoch": 18.813755795981454, + "grad_norm": 0.7724034786224365, + "learning_rate": 5.345947783934075e-07, + "loss": 0.353, + "num_input_tokens_seen": 8197472, + "step": 24345 + }, + { + "epoch": 18.817619783616692, + "grad_norm": 1.1435004472732544, + "learning_rate": 5.311323649318189e-07, + "loss": 0.4163, + "num_input_tokens_seen": 8199328, + "step": 24350 + }, + { + "epoch": 18.82148377125193, + "grad_norm": 0.9673146605491638, + "learning_rate": 5.276810800577736e-07, + "loss": 0.4148, + "num_input_tokens_seen": 8201216, + "step": 24355 + }, + { + "epoch": 18.825347758887172, + "grad_norm": 1.1323678493499756, + "learning_rate": 5.242409253409297e-07, + "loss": 0.4469, + "num_input_tokens_seen": 8202944, + "step": 24360 + }, + { + "epoch": 18.82921174652241, + "grad_norm": 1.1263647079467773, + "learning_rate": 5.208119023458941e-07, + "loss": 0.3404, + "num_input_tokens_seen": 8204864, + "step": 24365 + }, + { + "epoch": 18.83307573415765, + "grad_norm": 0.9124608039855957, + "learning_rate": 5.173940126322052e-07, + "loss": 0.4295, + "num_input_tokens_seen": 8206624, + "step": 24370 + }, + { + "epoch": 18.83693972179289, + "grad_norm": 0.7978571057319641, + "learning_rate": 5.139872577543364e-07, + "loss": 0.36, + "num_input_tokens_seen": 8208448, + "step": 24375 + }, + { + "epoch": 18.84080370942813, + "grad_norm": 1.010238766670227, + "learning_rate": 5.105916392617066e-07, + "loss": 0.9224, + "num_input_tokens_seen": 8210464, + "step": 24380 + }, + { + "epoch": 18.844667697063368, + "grad_norm": 0.7887149453163147, + "learning_rate": 5.07207158698661e-07, + "loss": 0.4168, + "num_input_tokens_seen": 8212256, + "step": 24385 + }, + { + "epoch": 18.84853168469861, + "grad_norm": 1.229722499847412, + "learning_rate": 5.038338176044794e-07, + "loss": 0.5298, + "num_input_tokens_seen": 8213824, + "step": 24390 + }, + { + "epoch": 18.85239567233385, + "grad_norm": 1.7064296007156372, + "learning_rate": 5.004716175133817e-07, + "loss": 0.524, + "num_input_tokens_seen": 8215616, + "step": 24395 + }, + { + "epoch": 18.856259659969087, + "grad_norm": 1.0192147493362427, + "learning_rate": 4.971205599545115e-07, + "loss": 0.3523, + "num_input_tokens_seen": 8217376, + "step": 24400 + }, + { + "epoch": 18.86012364760433, + "grad_norm": 0.9670393466949463, + "learning_rate": 4.937806464519551e-07, + "loss": 0.3889, + "num_input_tokens_seen": 8219008, + "step": 24405 + }, + { + "epoch": 18.863987635239567, + "grad_norm": 1.07809317111969, + "learning_rate": 4.904518785247225e-07, + "loss": 0.6339, + "num_input_tokens_seen": 8220672, + "step": 24410 + }, + { + "epoch": 18.867851622874806, + "grad_norm": 1.288251280784607, + "learning_rate": 4.871342576867555e-07, + "loss": 0.4093, + "num_input_tokens_seen": 8222720, + "step": 24415 + }, + { + "epoch": 18.871715610510048, + "grad_norm": 1.0717532634735107, + "learning_rate": 4.83827785446933e-07, + "loss": 0.4195, + "num_input_tokens_seen": 8224352, + "step": 24420 + }, + { + "epoch": 18.875579598145286, + "grad_norm": 0.9707883596420288, + "learning_rate": 4.805324633090525e-07, + "loss": 0.3634, + "num_input_tokens_seen": 8225984, + "step": 24425 + }, + { + "epoch": 18.879443585780525, + "grad_norm": 0.9237554669380188, + "learning_rate": 4.77248292771848e-07, + "loss": 0.5187, + "num_input_tokens_seen": 8227840, + "step": 24430 + }, + { + "epoch": 18.883307573415767, + "grad_norm": 1.2400919198989868, + "learning_rate": 4.7397527532898333e-07, + "loss": 0.4042, + "num_input_tokens_seen": 8229728, + "step": 24435 + }, + { + "epoch": 18.887171561051005, + "grad_norm": 1.167190670967102, + "learning_rate": 4.7071341246904545e-07, + "loss": 0.3994, + "num_input_tokens_seen": 8231392, + "step": 24440 + }, + { + "epoch": 18.891035548686244, + "grad_norm": 0.8822414875030518, + "learning_rate": 4.674627056755448e-07, + "loss": 0.6316, + "num_input_tokens_seen": 8233280, + "step": 24445 + }, + { + "epoch": 18.894899536321482, + "grad_norm": 0.7227250337600708, + "learning_rate": 4.642231564269267e-07, + "loss": 0.4045, + "num_input_tokens_seen": 8235072, + "step": 24450 + }, + { + "epoch": 18.898763523956724, + "grad_norm": 0.6966329216957092, + "learning_rate": 4.609947661965569e-07, + "loss": 0.5222, + "num_input_tokens_seen": 8236640, + "step": 24455 + }, + { + "epoch": 18.902627511591962, + "grad_norm": 1.0096718072891235, + "learning_rate": 4.577775364527248e-07, + "loss": 0.3699, + "num_input_tokens_seen": 8238784, + "step": 24460 + }, + { + "epoch": 18.9064914992272, + "grad_norm": 1.924736738204956, + "learning_rate": 4.545714686586461e-07, + "loss": 0.6459, + "num_input_tokens_seen": 8240352, + "step": 24465 + }, + { + "epoch": 18.910355486862443, + "grad_norm": 1.2561705112457275, + "learning_rate": 4.513765642724599e-07, + "loss": 0.5288, + "num_input_tokens_seen": 8242176, + "step": 24470 + }, + { + "epoch": 18.91421947449768, + "grad_norm": 1.0536388158798218, + "learning_rate": 4.4819282474722893e-07, + "loss": 0.4355, + "num_input_tokens_seen": 8243616, + "step": 24475 + }, + { + "epoch": 18.91808346213292, + "grad_norm": 1.424281358718872, + "learning_rate": 4.4502025153093376e-07, + "loss": 0.4508, + "num_input_tokens_seen": 8245184, + "step": 24480 + }, + { + "epoch": 18.92194744976816, + "grad_norm": 0.6728430986404419, + "learning_rate": 4.4185884606648686e-07, + "loss": 0.4661, + "num_input_tokens_seen": 8246752, + "step": 24485 + }, + { + "epoch": 18.9258114374034, + "grad_norm": 1.4224647283554077, + "learning_rate": 4.387086097917076e-07, + "loss": 0.4262, + "num_input_tokens_seen": 8248288, + "step": 24490 + }, + { + "epoch": 18.92967542503864, + "grad_norm": 0.867988109588623, + "learning_rate": 4.3556954413934424e-07, + "loss": 0.5338, + "num_input_tokens_seen": 8249728, + "step": 24495 + }, + { + "epoch": 18.93353941267388, + "grad_norm": 0.7371512651443481, + "learning_rate": 4.3244165053706323e-07, + "loss": 0.4488, + "num_input_tokens_seen": 8251488, + "step": 24500 + }, + { + "epoch": 18.93740340030912, + "grad_norm": 0.872118353843689, + "learning_rate": 4.293249304074487e-07, + "loss": 0.3836, + "num_input_tokens_seen": 8253152, + "step": 24505 + }, + { + "epoch": 18.941267387944357, + "grad_norm": 1.2345342636108398, + "learning_rate": 4.2621938516800296e-07, + "loss": 0.5821, + "num_input_tokens_seen": 8254880, + "step": 24510 + }, + { + "epoch": 18.9451313755796, + "grad_norm": 1.6425693035125732, + "learning_rate": 4.231250162311462e-07, + "loss": 0.3647, + "num_input_tokens_seen": 8256576, + "step": 24515 + }, + { + "epoch": 18.948995363214838, + "grad_norm": 1.0188947916030884, + "learning_rate": 4.200418250042193e-07, + "loss": 0.6205, + "num_input_tokens_seen": 8258336, + "step": 24520 + }, + { + "epoch": 18.952859350850076, + "grad_norm": 1.0158017873764038, + "learning_rate": 4.1696981288947556e-07, + "loss": 0.3978, + "num_input_tokens_seen": 8260160, + "step": 24525 + }, + { + "epoch": 18.956723338485318, + "grad_norm": 0.7218586802482605, + "learning_rate": 4.1390898128408076e-07, + "loss": 0.4019, + "num_input_tokens_seen": 8261568, + "step": 24530 + }, + { + "epoch": 18.960587326120557, + "grad_norm": 1.0714592933654785, + "learning_rate": 4.10859331580124e-07, + "loss": 0.4243, + "num_input_tokens_seen": 8263424, + "step": 24535 + }, + { + "epoch": 18.964451313755795, + "grad_norm": 1.4332916736602783, + "learning_rate": 4.078208651645987e-07, + "loss": 0.4444, + "num_input_tokens_seen": 8265120, + "step": 24540 + }, + { + "epoch": 18.968315301391037, + "grad_norm": 0.9581173658370972, + "learning_rate": 4.0479358341942164e-07, + "loss": 0.4062, + "num_input_tokens_seen": 8266624, + "step": 24545 + }, + { + "epoch": 18.972179289026275, + "grad_norm": 1.157893180847168, + "learning_rate": 4.0177748772141646e-07, + "loss": 0.362, + "num_input_tokens_seen": 8268288, + "step": 24550 + }, + { + "epoch": 18.976043276661514, + "grad_norm": 0.7987955808639526, + "learning_rate": 3.9877257944232474e-07, + "loss": 0.3791, + "num_input_tokens_seen": 8270048, + "step": 24555 + }, + { + "epoch": 18.979907264296756, + "grad_norm": 1.1480220556259155, + "learning_rate": 3.957788599487949e-07, + "loss": 0.4451, + "num_input_tokens_seen": 8271616, + "step": 24560 + }, + { + "epoch": 18.983771251931994, + "grad_norm": 1.2845534086227417, + "learning_rate": 3.9279633060238797e-07, + "loss": 0.5601, + "num_input_tokens_seen": 8273472, + "step": 24565 + }, + { + "epoch": 18.987635239567233, + "grad_norm": 1.1263058185577393, + "learning_rate": 3.8982499275957704e-07, + "loss": 0.6198, + "num_input_tokens_seen": 8275008, + "step": 24570 + }, + { + "epoch": 18.991499227202475, + "grad_norm": 2.053903818130493, + "learning_rate": 3.8686484777174513e-07, + "loss": 0.4398, + "num_input_tokens_seen": 8276512, + "step": 24575 + }, + { + "epoch": 18.995363214837713, + "grad_norm": 1.775848627090454, + "learning_rate": 3.8391589698517915e-07, + "loss": 0.4482, + "num_input_tokens_seen": 8278176, + "step": 24580 + }, + { + "epoch": 18.99922720247295, + "grad_norm": 1.4401966333389282, + "learning_rate": 3.809781417410868e-07, + "loss": 0.4637, + "num_input_tokens_seen": 8279808, + "step": 24585 + }, + { + "epoch": 19.0, + "eval_loss": 0.4461950659751892, + "eval_runtime": 6.2443, + "eval_samples_per_second": 92.084, + "eval_steps_per_second": 23.061, + "num_input_tokens_seen": 8279952, + "step": 24586 + }, + { + "epoch": 19.00309119010819, + "grad_norm": 1.0926011800765991, + "learning_rate": 3.7805158337557155e-07, + "loss": 0.5344, + "num_input_tokens_seen": 8281424, + "step": 24590 + }, + { + "epoch": 19.006955177743432, + "grad_norm": 1.726877212524414, + "learning_rate": 3.7513622321964927e-07, + "loss": 0.3767, + "num_input_tokens_seen": 8283120, + "step": 24595 + }, + { + "epoch": 19.01081916537867, + "grad_norm": 0.8868471384048462, + "learning_rate": 3.7223206259924813e-07, + "loss": 0.3547, + "num_input_tokens_seen": 8284848, + "step": 24600 + }, + { + "epoch": 19.01468315301391, + "grad_norm": 0.7703593373298645, + "learning_rate": 3.69339102835195e-07, + "loss": 0.4555, + "num_input_tokens_seen": 8286576, + "step": 24605 + }, + { + "epoch": 19.01854714064915, + "grad_norm": 1.201552152633667, + "learning_rate": 3.664573452432235e-07, + "loss": 0.4214, + "num_input_tokens_seen": 8288080, + "step": 24610 + }, + { + "epoch": 19.02241112828439, + "grad_norm": 1.5116703510284424, + "learning_rate": 3.635867911339741e-07, + "loss": 0.4805, + "num_input_tokens_seen": 8289936, + "step": 24615 + }, + { + "epoch": 19.026275115919628, + "grad_norm": 1.236424446105957, + "learning_rate": 3.607274418129969e-07, + "loss": 0.6505, + "num_input_tokens_seen": 8291376, + "step": 24620 + }, + { + "epoch": 19.03013910355487, + "grad_norm": 0.9921685457229614, + "learning_rate": 3.5787929858073777e-07, + "loss": 0.4318, + "num_input_tokens_seen": 8293168, + "step": 24625 + }, + { + "epoch": 19.034003091190108, + "grad_norm": 0.8478972911834717, + "learning_rate": 3.5504236273254943e-07, + "loss": 0.4026, + "num_input_tokens_seen": 8294832, + "step": 24630 + }, + { + "epoch": 19.037867078825347, + "grad_norm": 1.6846437454223633, + "learning_rate": 3.5221663555868587e-07, + "loss": 0.4684, + "num_input_tokens_seen": 8296624, + "step": 24635 + }, + { + "epoch": 19.04173106646059, + "grad_norm": 1.1322494745254517, + "learning_rate": 3.4940211834430804e-07, + "loss": 0.4608, + "num_input_tokens_seen": 8298512, + "step": 24640 + }, + { + "epoch": 19.045595054095827, + "grad_norm": 1.0427048206329346, + "learning_rate": 3.4659881236947246e-07, + "loss": 0.4861, + "num_input_tokens_seen": 8300112, + "step": 24645 + }, + { + "epoch": 19.049459041731065, + "grad_norm": 0.6782751679420471, + "learning_rate": 3.4380671890913985e-07, + "loss": 0.4152, + "num_input_tokens_seen": 8301872, + "step": 24650 + }, + { + "epoch": 19.053323029366307, + "grad_norm": 1.0502495765686035, + "learning_rate": 3.410258392331722e-07, + "loss": 0.3847, + "num_input_tokens_seen": 8303632, + "step": 24655 + }, + { + "epoch": 19.057187017001546, + "grad_norm": 1.2498869895935059, + "learning_rate": 3.3825617460633006e-07, + "loss": 0.4412, + "num_input_tokens_seen": 8305488, + "step": 24660 + }, + { + "epoch": 19.061051004636784, + "grad_norm": 1.7911944389343262, + "learning_rate": 3.3549772628827524e-07, + "loss": 0.4968, + "num_input_tokens_seen": 8307184, + "step": 24665 + }, + { + "epoch": 19.064914992272026, + "grad_norm": 1.0564531087875366, + "learning_rate": 3.327504955335625e-07, + "loss": 0.5806, + "num_input_tokens_seen": 8308816, + "step": 24670 + }, + { + "epoch": 19.068778979907265, + "grad_norm": 1.5498889684677124, + "learning_rate": 3.30014483591648e-07, + "loss": 0.7481, + "num_input_tokens_seen": 8310640, + "step": 24675 + }, + { + "epoch": 19.072642967542503, + "grad_norm": 0.8468605279922485, + "learning_rate": 3.2728969170689183e-07, + "loss": 0.4375, + "num_input_tokens_seen": 8312208, + "step": 24680 + }, + { + "epoch": 19.076506955177745, + "grad_norm": 0.5781393647193909, + "learning_rate": 3.2457612111854165e-07, + "loss": 0.3744, + "num_input_tokens_seen": 8313744, + "step": 24685 + }, + { + "epoch": 19.080370942812984, + "grad_norm": 1.137801170349121, + "learning_rate": 3.218737730607491e-07, + "loss": 0.3691, + "num_input_tokens_seen": 8315440, + "step": 24690 + }, + { + "epoch": 19.084234930448222, + "grad_norm": 0.7037649750709534, + "learning_rate": 3.191826487625532e-07, + "loss": 0.6245, + "num_input_tokens_seen": 8317200, + "step": 24695 + }, + { + "epoch": 19.08809891808346, + "grad_norm": 0.6138149499893188, + "learning_rate": 3.1650274944790004e-07, + "loss": 0.3481, + "num_input_tokens_seen": 8318704, + "step": 24700 + }, + { + "epoch": 19.091962905718702, + "grad_norm": 0.6091812252998352, + "learning_rate": 3.1383407633561734e-07, + "loss": 0.3801, + "num_input_tokens_seen": 8320304, + "step": 24705 + }, + { + "epoch": 19.09582689335394, + "grad_norm": 1.0127975940704346, + "learning_rate": 3.1117663063943705e-07, + "loss": 0.4514, + "num_input_tokens_seen": 8321616, + "step": 24710 + }, + { + "epoch": 19.09969088098918, + "grad_norm": 1.002914547920227, + "learning_rate": 3.0853041356798116e-07, + "loss": 0.5577, + "num_input_tokens_seen": 8323344, + "step": 24715 + }, + { + "epoch": 19.10355486862442, + "grad_norm": 0.8001795411109924, + "learning_rate": 3.058954263247621e-07, + "loss": 0.3641, + "num_input_tokens_seen": 8325104, + "step": 24720 + }, + { + "epoch": 19.10741885625966, + "grad_norm": 1.6157336235046387, + "learning_rate": 3.0327167010819333e-07, + "loss": 0.4778, + "num_input_tokens_seen": 8326928, + "step": 24725 + }, + { + "epoch": 19.111282843894898, + "grad_norm": 0.984545111656189, + "learning_rate": 3.006591461115704e-07, + "loss": 0.4224, + "num_input_tokens_seen": 8328848, + "step": 24730 + }, + { + "epoch": 19.11514683153014, + "grad_norm": 0.6239376068115234, + "learning_rate": 2.9805785552308727e-07, + "loss": 0.3609, + "num_input_tokens_seen": 8330288, + "step": 24735 + }, + { + "epoch": 19.11901081916538, + "grad_norm": 0.6660906076431274, + "learning_rate": 2.954677995258254e-07, + "loss": 0.4791, + "num_input_tokens_seen": 8331792, + "step": 24740 + }, + { + "epoch": 19.122874806800617, + "grad_norm": 1.5154433250427246, + "learning_rate": 2.9288897929775905e-07, + "loss": 0.5538, + "num_input_tokens_seen": 8333616, + "step": 24745 + }, + { + "epoch": 19.12673879443586, + "grad_norm": 1.1993200778961182, + "learning_rate": 2.9032139601174734e-07, + "loss": 0.5134, + "num_input_tokens_seen": 8335280, + "step": 24750 + }, + { + "epoch": 19.130602782071097, + "grad_norm": 1.4672974348068237, + "learning_rate": 2.8776505083554504e-07, + "loss": 0.3791, + "num_input_tokens_seen": 8336752, + "step": 24755 + }, + { + "epoch": 19.134466769706336, + "grad_norm": 1.0254573822021484, + "learning_rate": 2.852199449317944e-07, + "loss": 0.4347, + "num_input_tokens_seen": 8338288, + "step": 24760 + }, + { + "epoch": 19.138330757341578, + "grad_norm": 1.161313772201538, + "learning_rate": 2.8268607945802493e-07, + "loss": 0.3888, + "num_input_tokens_seen": 8340176, + "step": 24765 + }, + { + "epoch": 19.142194744976816, + "grad_norm": 0.7210527658462524, + "learning_rate": 2.801634555666538e-07, + "loss": 0.4404, + "num_input_tokens_seen": 8341616, + "step": 24770 + }, + { + "epoch": 19.146058732612055, + "grad_norm": 1.3198491334915161, + "learning_rate": 2.7765207440498266e-07, + "loss": 0.4172, + "num_input_tokens_seen": 8343408, + "step": 24775 + }, + { + "epoch": 19.149922720247297, + "grad_norm": 1.106820821762085, + "learning_rate": 2.751519371152034e-07, + "loss": 0.4195, + "num_input_tokens_seen": 8345104, + "step": 24780 + }, + { + "epoch": 19.153786707882535, + "grad_norm": 1.5408446788787842, + "learning_rate": 2.726630448343953e-07, + "loss": 0.4461, + "num_input_tokens_seen": 8346736, + "step": 24785 + }, + { + "epoch": 19.157650695517773, + "grad_norm": 1.0785117149353027, + "learning_rate": 2.7018539869451963e-07, + "loss": 0.4406, + "num_input_tokens_seen": 8348304, + "step": 24790 + }, + { + "epoch": 19.161514683153015, + "grad_norm": 1.273820400238037, + "learning_rate": 2.6771899982242774e-07, + "loss": 0.4654, + "num_input_tokens_seen": 8349872, + "step": 24795 + }, + { + "epoch": 19.165378670788254, + "grad_norm": 0.5795221924781799, + "learning_rate": 2.6526384933984737e-07, + "loss": 0.3819, + "num_input_tokens_seen": 8351568, + "step": 24800 + }, + { + "epoch": 19.169242658423492, + "grad_norm": 0.9224506616592407, + "learning_rate": 2.6281994836340195e-07, + "loss": 0.5031, + "num_input_tokens_seen": 8353296, + "step": 24805 + }, + { + "epoch": 19.173106646058734, + "grad_norm": 0.844683825969696, + "learning_rate": 2.603872980045885e-07, + "loss": 0.3461, + "num_input_tokens_seen": 8355248, + "step": 24810 + }, + { + "epoch": 19.176970633693973, + "grad_norm": 1.038627028465271, + "learning_rate": 2.5796589936979423e-07, + "loss": 0.3764, + "num_input_tokens_seen": 8356848, + "step": 24815 + }, + { + "epoch": 19.18083462132921, + "grad_norm": 0.9644440412521362, + "learning_rate": 2.5555575356027703e-07, + "loss": 0.3569, + "num_input_tokens_seen": 8358736, + "step": 24820 + }, + { + "epoch": 19.18469860896445, + "grad_norm": 0.8718637228012085, + "learning_rate": 2.531568616721963e-07, + "loss": 0.3349, + "num_input_tokens_seen": 8360496, + "step": 24825 + }, + { + "epoch": 19.18856259659969, + "grad_norm": 0.7750914096832275, + "learning_rate": 2.5076922479657647e-07, + "loss": 0.4136, + "num_input_tokens_seen": 8362032, + "step": 24830 + }, + { + "epoch": 19.19242658423493, + "grad_norm": 0.82159823179245, + "learning_rate": 2.483928440193295e-07, + "loss": 0.694, + "num_input_tokens_seen": 8363536, + "step": 24835 + }, + { + "epoch": 19.19629057187017, + "grad_norm": 0.7446980476379395, + "learning_rate": 2.460277204212519e-07, + "loss": 0.5105, + "num_input_tokens_seen": 8365136, + "step": 24840 + }, + { + "epoch": 19.20015455950541, + "grad_norm": 0.9215542078018188, + "learning_rate": 2.43673855078011e-07, + "loss": 0.5209, + "num_input_tokens_seen": 8366928, + "step": 24845 + }, + { + "epoch": 19.20401854714065, + "grad_norm": 1.1248698234558105, + "learning_rate": 2.413312490601588e-07, + "loss": 0.5336, + "num_input_tokens_seen": 8368944, + "step": 24850 + }, + { + "epoch": 19.207882534775887, + "grad_norm": 0.8785467743873596, + "learning_rate": 2.3899990343312916e-07, + "loss": 0.4757, + "num_input_tokens_seen": 8370480, + "step": 24855 + }, + { + "epoch": 19.21174652241113, + "grad_norm": 0.7612584233283997, + "learning_rate": 2.3667981925723226e-07, + "loss": 0.3979, + "num_input_tokens_seen": 8372080, + "step": 24860 + }, + { + "epoch": 19.215610510046368, + "grad_norm": 0.8381969928741455, + "learning_rate": 2.3437099758765734e-07, + "loss": 0.4274, + "num_input_tokens_seen": 8373968, + "step": 24865 + }, + { + "epoch": 19.219474497681606, + "grad_norm": 1.08933687210083, + "learning_rate": 2.3207343947446447e-07, + "loss": 0.3993, + "num_input_tokens_seen": 8375344, + "step": 24870 + }, + { + "epoch": 19.223338485316848, + "grad_norm": 1.4356443881988525, + "learning_rate": 2.2978714596260108e-07, + "loss": 0.4176, + "num_input_tokens_seen": 8377040, + "step": 24875 + }, + { + "epoch": 19.227202472952087, + "grad_norm": 0.7805351614952087, + "learning_rate": 2.275121180918882e-07, + "loss": 0.424, + "num_input_tokens_seen": 8378800, + "step": 24880 + }, + { + "epoch": 19.231066460587325, + "grad_norm": 1.50419282913208, + "learning_rate": 2.2524835689702316e-07, + "loss": 0.4446, + "num_input_tokens_seen": 8380560, + "step": 24885 + }, + { + "epoch": 19.234930448222567, + "grad_norm": 0.9862515926361084, + "learning_rate": 2.229958634075713e-07, + "loss": 0.4194, + "num_input_tokens_seen": 8382096, + "step": 24890 + }, + { + "epoch": 19.238794435857805, + "grad_norm": 0.8811295628547668, + "learning_rate": 2.207546386479853e-07, + "loss": 0.4505, + "num_input_tokens_seen": 8384016, + "step": 24895 + }, + { + "epoch": 19.242658423493044, + "grad_norm": 1.0748602151870728, + "learning_rate": 2.1852468363758594e-07, + "loss": 0.4803, + "num_input_tokens_seen": 8386000, + "step": 24900 + }, + { + "epoch": 19.246522411128286, + "grad_norm": 0.9280661344528198, + "learning_rate": 2.1630599939057306e-07, + "loss": 0.6431, + "num_input_tokens_seen": 8387728, + "step": 24905 + }, + { + "epoch": 19.250386398763524, + "grad_norm": 0.8291170001029968, + "learning_rate": 2.140985869160145e-07, + "loss": 0.4076, + "num_input_tokens_seen": 8389456, + "step": 24910 + }, + { + "epoch": 19.254250386398763, + "grad_norm": 1.0909970998764038, + "learning_rate": 2.1190244721785435e-07, + "loss": 0.5268, + "num_input_tokens_seen": 8391088, + "step": 24915 + }, + { + "epoch": 19.258114374034005, + "grad_norm": 0.8869785666465759, + "learning_rate": 2.0971758129491314e-07, + "loss": 0.4544, + "num_input_tokens_seen": 8392784, + "step": 24920 + }, + { + "epoch": 19.261978361669243, + "grad_norm": 0.8990395665168762, + "learning_rate": 2.0754399014087933e-07, + "loss": 0.3726, + "num_input_tokens_seen": 8394512, + "step": 24925 + }, + { + "epoch": 19.26584234930448, + "grad_norm": 0.6706898212432861, + "learning_rate": 2.0538167474431214e-07, + "loss": 0.3624, + "num_input_tokens_seen": 8396400, + "step": 24930 + }, + { + "epoch": 19.269706336939723, + "grad_norm": 1.0923047065734863, + "learning_rate": 2.0323063608865267e-07, + "loss": 0.4928, + "num_input_tokens_seen": 8398320, + "step": 24935 + }, + { + "epoch": 19.273570324574962, + "grad_norm": 0.9544326066970825, + "learning_rate": 2.0109087515219894e-07, + "loss": 0.4937, + "num_input_tokens_seen": 8399728, + "step": 24940 + }, + { + "epoch": 19.2774343122102, + "grad_norm": 0.9457153081893921, + "learning_rate": 1.9896239290813078e-07, + "loss": 0.5363, + "num_input_tokens_seen": 8401584, + "step": 24945 + }, + { + "epoch": 19.28129829984544, + "grad_norm": 0.9113226532936096, + "learning_rate": 1.9684519032449333e-07, + "loss": 0.3615, + "num_input_tokens_seen": 8403344, + "step": 24950 + }, + { + "epoch": 19.28516228748068, + "grad_norm": 1.2570593357086182, + "learning_rate": 1.947392683642052e-07, + "loss": 0.5165, + "num_input_tokens_seen": 8404912, + "step": 24955 + }, + { + "epoch": 19.28902627511592, + "grad_norm": 1.0338269472122192, + "learning_rate": 1.9264462798505023e-07, + "loss": 0.4414, + "num_input_tokens_seen": 8406736, + "step": 24960 + }, + { + "epoch": 19.292890262751158, + "grad_norm": 1.1648426055908203, + "learning_rate": 1.905612701396803e-07, + "loss": 0.4552, + "num_input_tokens_seen": 8408560, + "step": 24965 + }, + { + "epoch": 19.2967542503864, + "grad_norm": 2.343404531478882, + "learning_rate": 1.884891957756263e-07, + "loss": 0.5652, + "num_input_tokens_seen": 8410192, + "step": 24970 + }, + { + "epoch": 19.300618238021638, + "grad_norm": 0.9342493414878845, + "learning_rate": 1.864284058352761e-07, + "loss": 0.3761, + "num_input_tokens_seen": 8411824, + "step": 24975 + }, + { + "epoch": 19.304482225656876, + "grad_norm": 0.8417518138885498, + "learning_rate": 1.8437890125589109e-07, + "loss": 0.3893, + "num_input_tokens_seen": 8413584, + "step": 24980 + }, + { + "epoch": 19.30834621329212, + "grad_norm": 2.4846031665802, + "learning_rate": 1.8234068296959506e-07, + "loss": 0.7046, + "num_input_tokens_seen": 8415088, + "step": 24985 + }, + { + "epoch": 19.312210200927357, + "grad_norm": 1.2373486757278442, + "learning_rate": 1.8031375190338261e-07, + "loss": 0.4265, + "num_input_tokens_seen": 8416912, + "step": 24990 + }, + { + "epoch": 19.316074188562595, + "grad_norm": 0.886104941368103, + "learning_rate": 1.782981089791136e-07, + "loss": 0.4514, + "num_input_tokens_seen": 8418384, + "step": 24995 + }, + { + "epoch": 19.319938176197837, + "grad_norm": 1.393311858177185, + "learning_rate": 1.7629375511351852e-07, + "loss": 0.4552, + "num_input_tokens_seen": 8420112, + "step": 25000 + }, + { + "epoch": 19.323802163833076, + "grad_norm": 1.6821430921554565, + "learning_rate": 1.7430069121818492e-07, + "loss": 0.3544, + "num_input_tokens_seen": 8421680, + "step": 25005 + }, + { + "epoch": 19.327666151468314, + "grad_norm": 1.4660944938659668, + "learning_rate": 1.7231891819957657e-07, + "loss": 0.4154, + "num_input_tokens_seen": 8423376, + "step": 25010 + }, + { + "epoch": 19.331530139103556, + "grad_norm": 1.184749960899353, + "learning_rate": 1.703484369590086e-07, + "loss": 0.3883, + "num_input_tokens_seen": 8424816, + "step": 25015 + }, + { + "epoch": 19.335394126738795, + "grad_norm": 1.4526047706604004, + "learning_rate": 1.6838924839266966e-07, + "loss": 0.4847, + "num_input_tokens_seen": 8426640, + "step": 25020 + }, + { + "epoch": 19.339258114374033, + "grad_norm": 1.355412244796753, + "learning_rate": 1.664413533916137e-07, + "loss": 0.4769, + "num_input_tokens_seen": 8428464, + "step": 25025 + }, + { + "epoch": 19.343122102009275, + "grad_norm": 1.0578185319900513, + "learning_rate": 1.645047528417487e-07, + "loss": 0.307, + "num_input_tokens_seen": 8430032, + "step": 25030 + }, + { + "epoch": 19.346986089644513, + "grad_norm": 1.6071953773498535, + "learning_rate": 1.62579447623859e-07, + "loss": 0.4919, + "num_input_tokens_seen": 8431632, + "step": 25035 + }, + { + "epoch": 19.350850077279752, + "grad_norm": 1.1118407249450684, + "learning_rate": 1.606654386135803e-07, + "loss": 0.3638, + "num_input_tokens_seen": 8433584, + "step": 25040 + }, + { + "epoch": 19.354714064914994, + "grad_norm": 0.6581734418869019, + "learning_rate": 1.5876272668141902e-07, + "loss": 0.3291, + "num_input_tokens_seen": 8435152, + "step": 25045 + }, + { + "epoch": 19.358578052550232, + "grad_norm": 1.2362300157546997, + "learning_rate": 1.568713126927357e-07, + "loss": 0.5148, + "num_input_tokens_seen": 8436848, + "step": 25050 + }, + { + "epoch": 19.36244204018547, + "grad_norm": 1.2031587362289429, + "learning_rate": 1.549911975077617e-07, + "loss": 0.4066, + "num_input_tokens_seen": 8438704, + "step": 25055 + }, + { + "epoch": 19.366306027820713, + "grad_norm": 0.7847995758056641, + "learning_rate": 1.5312238198157968e-07, + "loss": 0.4281, + "num_input_tokens_seen": 8440368, + "step": 25060 + }, + { + "epoch": 19.37017001545595, + "grad_norm": 1.0284206867218018, + "learning_rate": 1.5126486696414032e-07, + "loss": 0.4067, + "num_input_tokens_seen": 8442160, + "step": 25065 + }, + { + "epoch": 19.37403400309119, + "grad_norm": 0.762018084526062, + "learning_rate": 1.4941865330025394e-07, + "loss": 0.4194, + "num_input_tokens_seen": 8443920, + "step": 25070 + }, + { + "epoch": 19.377897990726428, + "grad_norm": 2.818887948989868, + "learning_rate": 1.475837418295878e-07, + "loss": 0.4347, + "num_input_tokens_seen": 8445328, + "step": 25075 + }, + { + "epoch": 19.38176197836167, + "grad_norm": 0.7357758283615112, + "learning_rate": 1.457601333866715e-07, + "loss": 0.496, + "num_input_tokens_seen": 8446768, + "step": 25080 + }, + { + "epoch": 19.38562596599691, + "grad_norm": 0.9111052751541138, + "learning_rate": 1.4394782880089443e-07, + "loss": 0.3709, + "num_input_tokens_seen": 8448240, + "step": 25085 + }, + { + "epoch": 19.389489953632147, + "grad_norm": 1.0819905996322632, + "learning_rate": 1.4214682889649998e-07, + "loss": 0.4135, + "num_input_tokens_seen": 8449936, + "step": 25090 + }, + { + "epoch": 19.39335394126739, + "grad_norm": 0.6567110419273376, + "learning_rate": 1.403571344925969e-07, + "loss": 0.3946, + "num_input_tokens_seen": 8451664, + "step": 25095 + }, + { + "epoch": 19.397217928902627, + "grad_norm": 1.4687540531158447, + "learning_rate": 1.3857874640314516e-07, + "loss": 0.4288, + "num_input_tokens_seen": 8453232, + "step": 25100 + }, + { + "epoch": 19.401081916537866, + "grad_norm": 2.0141193866729736, + "learning_rate": 1.3681166543697e-07, + "loss": 0.4396, + "num_input_tokens_seen": 8455120, + "step": 25105 + }, + { + "epoch": 19.404945904173108, + "grad_norm": 1.3435983657836914, + "learning_rate": 1.3505589239775073e-07, + "loss": 0.5157, + "num_input_tokens_seen": 8456656, + "step": 25110 + }, + { + "epoch": 19.408809891808346, + "grad_norm": 0.7928199768066406, + "learning_rate": 1.3331142808401808e-07, + "loss": 0.6977, + "num_input_tokens_seen": 8458192, + "step": 25115 + }, + { + "epoch": 19.412673879443584, + "grad_norm": 0.9053761959075928, + "learning_rate": 1.315782732891735e-07, + "loss": 0.4514, + "num_input_tokens_seen": 8459984, + "step": 25120 + }, + { + "epoch": 19.416537867078826, + "grad_norm": 2.267094135284424, + "learning_rate": 1.2985642880145864e-07, + "loss": 0.3581, + "num_input_tokens_seen": 8461424, + "step": 25125 + }, + { + "epoch": 19.420401854714065, + "grad_norm": 1.117997407913208, + "learning_rate": 1.2814589540398048e-07, + "loss": 0.3975, + "num_input_tokens_seen": 8463024, + "step": 25130 + }, + { + "epoch": 19.424265842349303, + "grad_norm": 1.1091703176498413, + "learning_rate": 1.2644667387470276e-07, + "loss": 0.3898, + "num_input_tokens_seen": 8464720, + "step": 25135 + }, + { + "epoch": 19.428129829984545, + "grad_norm": 1.372036337852478, + "learning_rate": 1.247587649864379e-07, + "loss": 0.7687, + "num_input_tokens_seen": 8466192, + "step": 25140 + }, + { + "epoch": 19.431993817619784, + "grad_norm": 0.8137943744659424, + "learning_rate": 1.230821695068607e-07, + "loss": 0.562, + "num_input_tokens_seen": 8467888, + "step": 25145 + }, + { + "epoch": 19.435857805255022, + "grad_norm": 1.0319921970367432, + "learning_rate": 1.214168881984945e-07, + "loss": 0.565, + "num_input_tokens_seen": 8469680, + "step": 25150 + }, + { + "epoch": 19.439721792890264, + "grad_norm": 1.433727502822876, + "learning_rate": 1.1976292181871684e-07, + "loss": 0.3662, + "num_input_tokens_seen": 8471408, + "step": 25155 + }, + { + "epoch": 19.443585780525503, + "grad_norm": 1.349067211151123, + "learning_rate": 1.1812027111976764e-07, + "loss": 0.5288, + "num_input_tokens_seen": 8473072, + "step": 25160 + }, + { + "epoch": 19.44744976816074, + "grad_norm": 1.9450969696044922, + "learning_rate": 1.1648893684872986e-07, + "loss": 0.4789, + "num_input_tokens_seen": 8474832, + "step": 25165 + }, + { + "epoch": 19.451313755795983, + "grad_norm": 1.1526639461517334, + "learning_rate": 1.1486891974754332e-07, + "loss": 0.3657, + "num_input_tokens_seen": 8476528, + "step": 25170 + }, + { + "epoch": 19.45517774343122, + "grad_norm": 1.166718602180481, + "learning_rate": 1.1326022055300478e-07, + "loss": 0.4136, + "num_input_tokens_seen": 8478224, + "step": 25175 + }, + { + "epoch": 19.45904173106646, + "grad_norm": 0.9190477728843689, + "learning_rate": 1.1166283999675953e-07, + "loss": 0.4305, + "num_input_tokens_seen": 8479760, + "step": 25180 + }, + { + "epoch": 19.462905718701702, + "grad_norm": 1.6057536602020264, + "learning_rate": 1.100767788053042e-07, + "loss": 0.4258, + "num_input_tokens_seen": 8481584, + "step": 25185 + }, + { + "epoch": 19.46676970633694, + "grad_norm": 1.036025881767273, + "learning_rate": 1.0850203769998957e-07, + "loss": 0.5385, + "num_input_tokens_seen": 8483440, + "step": 25190 + }, + { + "epoch": 19.47063369397218, + "grad_norm": 1.1612486839294434, + "learning_rate": 1.0693861739701771e-07, + "loss": 0.3794, + "num_input_tokens_seen": 8485200, + "step": 25195 + }, + { + "epoch": 19.474497681607417, + "grad_norm": 0.8292748928070068, + "learning_rate": 1.0538651860744208e-07, + "loss": 0.3755, + "num_input_tokens_seen": 8486928, + "step": 25200 + }, + { + "epoch": 19.47836166924266, + "grad_norm": 0.938823401927948, + "learning_rate": 1.0384574203716469e-07, + "loss": 0.3554, + "num_input_tokens_seen": 8488592, + "step": 25205 + }, + { + "epoch": 19.482225656877898, + "grad_norm": 0.8428736925125122, + "learning_rate": 1.0231628838694163e-07, + "loss": 0.4432, + "num_input_tokens_seen": 8490064, + "step": 25210 + }, + { + "epoch": 19.486089644513136, + "grad_norm": 0.984057605266571, + "learning_rate": 1.0079815835237761e-07, + "loss": 0.4153, + "num_input_tokens_seen": 8491632, + "step": 25215 + }, + { + "epoch": 19.489953632148378, + "grad_norm": 0.9082183837890625, + "learning_rate": 9.929135262392586e-08, + "loss": 0.7196, + "num_input_tokens_seen": 8493360, + "step": 25220 + }, + { + "epoch": 19.493817619783616, + "grad_norm": 1.5522273778915405, + "learning_rate": 9.779587188689099e-08, + "loss": 0.3707, + "num_input_tokens_seen": 8495088, + "step": 25225 + }, + { + "epoch": 19.497681607418855, + "grad_norm": 1.0485467910766602, + "learning_rate": 9.631171682142893e-08, + "loss": 0.4924, + "num_input_tokens_seen": 8496592, + "step": 25230 + }, + { + "epoch": 19.501545595054097, + "grad_norm": 0.7266064286231995, + "learning_rate": 9.483888810253582e-08, + "loss": 0.365, + "num_input_tokens_seen": 8498288, + "step": 25235 + }, + { + "epoch": 19.505409582689335, + "grad_norm": 0.8823100924491882, + "learning_rate": 9.337738640007032e-08, + "loss": 0.4979, + "num_input_tokens_seen": 8500016, + "step": 25240 + }, + { + "epoch": 19.509273570324574, + "grad_norm": 0.6729595065116882, + "learning_rate": 9.192721237873125e-08, + "loss": 0.3902, + "num_input_tokens_seen": 8501872, + "step": 25245 + }, + { + "epoch": 19.513137557959816, + "grad_norm": 0.7755356431007385, + "learning_rate": 9.048836669806326e-08, + "loss": 0.3695, + "num_input_tokens_seen": 8503344, + "step": 25250 + }, + { + "epoch": 19.517001545595054, + "grad_norm": 0.6973748803138733, + "learning_rate": 8.906085001246233e-08, + "loss": 0.5319, + "num_input_tokens_seen": 8505104, + "step": 25255 + }, + { + "epoch": 19.520865533230292, + "grad_norm": 1.2506455183029175, + "learning_rate": 8.764466297117302e-08, + "loss": 0.4431, + "num_input_tokens_seen": 8506768, + "step": 25260 + }, + { + "epoch": 19.524729520865534, + "grad_norm": 1.3554425239562988, + "learning_rate": 8.623980621828842e-08, + "loss": 0.6339, + "num_input_tokens_seen": 8508368, + "step": 25265 + }, + { + "epoch": 19.528593508500773, + "grad_norm": 1.3120923042297363, + "learning_rate": 8.484628039273912e-08, + "loss": 0.427, + "num_input_tokens_seen": 8510096, + "step": 25270 + }, + { + "epoch": 19.53245749613601, + "grad_norm": 0.9212894439697266, + "learning_rate": 8.34640861283098e-08, + "loss": 0.4375, + "num_input_tokens_seen": 8511600, + "step": 25275 + }, + { + "epoch": 19.536321483771253, + "grad_norm": 1.2109931707382202, + "learning_rate": 8.209322405363929e-08, + "loss": 0.362, + "num_input_tokens_seen": 8513072, + "step": 25280 + }, + { + "epoch": 19.54018547140649, + "grad_norm": 0.7041727304458618, + "learning_rate": 8.073369479219551e-08, + "loss": 0.4037, + "num_input_tokens_seen": 8514960, + "step": 25285 + }, + { + "epoch": 19.54404945904173, + "grad_norm": 0.8034092783927917, + "learning_rate": 7.938549896230329e-08, + "loss": 0.4023, + "num_input_tokens_seen": 8516784, + "step": 25290 + }, + { + "epoch": 19.547913446676972, + "grad_norm": 0.9466056227684021, + "learning_rate": 7.804863717712774e-08, + "loss": 0.3816, + "num_input_tokens_seen": 8518256, + "step": 25295 + }, + { + "epoch": 19.55177743431221, + "grad_norm": 0.6956223845481873, + "learning_rate": 7.672311004468802e-08, + "loss": 0.4206, + "num_input_tokens_seen": 8519952, + "step": 25300 + }, + { + "epoch": 19.55564142194745, + "grad_norm": 1.0627851486206055, + "learning_rate": 7.540891816783246e-08, + "loss": 0.4609, + "num_input_tokens_seen": 8521680, + "step": 25305 + }, + { + "epoch": 19.55950540958269, + "grad_norm": 1.0208990573883057, + "learning_rate": 7.410606214427185e-08, + "loss": 0.411, + "num_input_tokens_seen": 8523280, + "step": 25310 + }, + { + "epoch": 19.56336939721793, + "grad_norm": 1.3339687585830688, + "learning_rate": 7.281454256654885e-08, + "loss": 0.3241, + "num_input_tokens_seen": 8524656, + "step": 25315 + }, + { + "epoch": 19.567233384853168, + "grad_norm": 0.5912158489227295, + "learning_rate": 7.153436002205472e-08, + "loss": 0.3593, + "num_input_tokens_seen": 8526384, + "step": 25320 + }, + { + "epoch": 19.57109737248841, + "grad_norm": 1.344706654548645, + "learning_rate": 7.02655150930237e-08, + "loss": 0.3394, + "num_input_tokens_seen": 8527984, + "step": 25325 + }, + { + "epoch": 19.57496136012365, + "grad_norm": 0.995964527130127, + "learning_rate": 6.900800835653587e-08, + "loss": 0.3329, + "num_input_tokens_seen": 8529552, + "step": 25330 + }, + { + "epoch": 19.578825347758887, + "grad_norm": 1.385327935218811, + "learning_rate": 6.77618403845115e-08, + "loss": 0.3933, + "num_input_tokens_seen": 8531440, + "step": 25335 + }, + { + "epoch": 19.582689335394125, + "grad_norm": 0.9393791556358337, + "learning_rate": 6.652701174371389e-08, + "loss": 0.5348, + "num_input_tokens_seen": 8533104, + "step": 25340 + }, + { + "epoch": 19.586553323029367, + "grad_norm": 1.5668020248413086, + "learning_rate": 6.530352299575215e-08, + "loss": 0.5721, + "num_input_tokens_seen": 8534896, + "step": 25345 + }, + { + "epoch": 19.590417310664606, + "grad_norm": 0.917822539806366, + "learning_rate": 6.409137469707837e-08, + "loss": 0.4204, + "num_input_tokens_seen": 8536464, + "step": 25350 + }, + { + "epoch": 19.594281298299844, + "grad_norm": 2.15273118019104, + "learning_rate": 6.289056739898213e-08, + "loss": 0.6522, + "num_input_tokens_seen": 8538032, + "step": 25355 + }, + { + "epoch": 19.598145285935086, + "grad_norm": 0.919039785861969, + "learning_rate": 6.170110164759879e-08, + "loss": 0.4902, + "num_input_tokens_seen": 8540016, + "step": 25360 + }, + { + "epoch": 19.602009273570324, + "grad_norm": 0.7377104163169861, + "learning_rate": 6.052297798390116e-08, + "loss": 0.4824, + "num_input_tokens_seen": 8541968, + "step": 25365 + }, + { + "epoch": 19.605873261205563, + "grad_norm": 0.9431612491607666, + "learning_rate": 5.9356196943713415e-08, + "loss": 0.5451, + "num_input_tokens_seen": 8543696, + "step": 25370 + }, + { + "epoch": 19.609737248840805, + "grad_norm": 1.0083136558532715, + "learning_rate": 5.8200759057688845e-08, + "loss": 0.331, + "num_input_tokens_seen": 8545456, + "step": 25375 + }, + { + "epoch": 19.613601236476043, + "grad_norm": 0.8971596956253052, + "learning_rate": 5.705666485132932e-08, + "loss": 0.4092, + "num_input_tokens_seen": 8546960, + "step": 25380 + }, + { + "epoch": 19.61746522411128, + "grad_norm": 1.0684689283370972, + "learning_rate": 5.5923914844976944e-08, + "loss": 0.389, + "num_input_tokens_seen": 8548368, + "step": 25385 + }, + { + "epoch": 19.621329211746524, + "grad_norm": 1.2111191749572754, + "learning_rate": 5.4802509553811274e-08, + "loss": 0.4914, + "num_input_tokens_seen": 8550128, + "step": 25390 + }, + { + "epoch": 19.625193199381762, + "grad_norm": 0.6074286699295044, + "learning_rate": 5.3692449487857675e-08, + "loss": 0.5197, + "num_input_tokens_seen": 8551600, + "step": 25395 + }, + { + "epoch": 19.629057187017, + "grad_norm": 1.361335277557373, + "learning_rate": 5.259373515197341e-08, + "loss": 0.4063, + "num_input_tokens_seen": 8553232, + "step": 25400 + }, + { + "epoch": 19.632921174652243, + "grad_norm": 0.8489187359809875, + "learning_rate": 5.150636704586431e-08, + "loss": 0.4002, + "num_input_tokens_seen": 8554768, + "step": 25405 + }, + { + "epoch": 19.63678516228748, + "grad_norm": 1.2614710330963135, + "learning_rate": 5.043034566406812e-08, + "loss": 0.5945, + "num_input_tokens_seen": 8556176, + "step": 25410 + }, + { + "epoch": 19.64064914992272, + "grad_norm": 0.9739209413528442, + "learning_rate": 4.936567149596838e-08, + "loss": 0.4335, + "num_input_tokens_seen": 8557872, + "step": 25415 + }, + { + "epoch": 19.64451313755796, + "grad_norm": 0.8989166617393494, + "learning_rate": 4.8312345025786075e-08, + "loss": 0.4408, + "num_input_tokens_seen": 8559760, + "step": 25420 + }, + { + "epoch": 19.6483771251932, + "grad_norm": 0.5679698586463928, + "learning_rate": 4.7270366732576896e-08, + "loss": 0.3892, + "num_input_tokens_seen": 8561296, + "step": 25425 + }, + { + "epoch": 19.652241112828438, + "grad_norm": 1.2224053144454956, + "learning_rate": 4.6239737090242316e-08, + "loss": 0.8623, + "num_input_tokens_seen": 8563376, + "step": 25430 + }, + { + "epoch": 19.65610510046368, + "grad_norm": 1.0001189708709717, + "learning_rate": 4.5220456567515725e-08, + "loss": 0.55, + "num_input_tokens_seen": 8565040, + "step": 25435 + }, + { + "epoch": 19.65996908809892, + "grad_norm": 1.025390863418579, + "learning_rate": 4.421252562797629e-08, + "loss": 0.4622, + "num_input_tokens_seen": 8566608, + "step": 25440 + }, + { + "epoch": 19.663833075734157, + "grad_norm": 0.8368094563484192, + "learning_rate": 4.321594473003232e-08, + "loss": 0.3754, + "num_input_tokens_seen": 8568400, + "step": 25445 + }, + { + "epoch": 19.667697063369395, + "grad_norm": 1.0197736024856567, + "learning_rate": 4.22307143269407e-08, + "loss": 0.564, + "num_input_tokens_seen": 8570160, + "step": 25450 + }, + { + "epoch": 19.671561051004637, + "grad_norm": 0.879740297794342, + "learning_rate": 4.125683486678189e-08, + "loss": 0.4374, + "num_input_tokens_seen": 8571824, + "step": 25455 + }, + { + "epoch": 19.675425038639876, + "grad_norm": 0.9456315040588379, + "learning_rate": 4.0294306792490466e-08, + "loss": 0.3965, + "num_input_tokens_seen": 8573520, + "step": 25460 + }, + { + "epoch": 19.679289026275114, + "grad_norm": 0.846095621585846, + "learning_rate": 3.934313054182459e-08, + "loss": 0.3932, + "num_input_tokens_seen": 8575184, + "step": 25465 + }, + { + "epoch": 19.683153013910356, + "grad_norm": 1.3747841119766235, + "learning_rate": 3.840330654738544e-08, + "loss": 0.4097, + "num_input_tokens_seen": 8576976, + "step": 25470 + }, + { + "epoch": 19.687017001545595, + "grad_norm": 0.7584800720214844, + "learning_rate": 3.747483523661166e-08, + "loss": 0.4515, + "num_input_tokens_seen": 8578704, + "step": 25475 + }, + { + "epoch": 19.690880989180833, + "grad_norm": 1.0685194730758667, + "learning_rate": 3.655771703177935e-08, + "loss": 0.5981, + "num_input_tokens_seen": 8580272, + "step": 25480 + }, + { + "epoch": 19.694744976816075, + "grad_norm": 0.8110545873641968, + "learning_rate": 3.565195234999652e-08, + "loss": 0.3607, + "num_input_tokens_seen": 8581872, + "step": 25485 + }, + { + "epoch": 19.698608964451314, + "grad_norm": 0.6002579927444458, + "learning_rate": 3.475754160321143e-08, + "loss": 0.583, + "num_input_tokens_seen": 8583248, + "step": 25490 + }, + { + "epoch": 19.702472952086552, + "grad_norm": 1.621241807937622, + "learning_rate": 3.3874485198207015e-08, + "loss": 0.5216, + "num_input_tokens_seen": 8584688, + "step": 25495 + }, + { + "epoch": 19.706336939721794, + "grad_norm": 0.7492280602455139, + "learning_rate": 3.3002783536603685e-08, + "loss": 0.4673, + "num_input_tokens_seen": 8586256, + "step": 25500 + }, + { + "epoch": 19.710200927357032, + "grad_norm": 2.135986089706421, + "learning_rate": 3.214243701485653e-08, + "loss": 0.4213, + "num_input_tokens_seen": 8587920, + "step": 25505 + }, + { + "epoch": 19.71406491499227, + "grad_norm": 1.1724828481674194, + "learning_rate": 3.129344602425255e-08, + "loss": 0.4262, + "num_input_tokens_seen": 8589520, + "step": 25510 + }, + { + "epoch": 19.717928902627513, + "grad_norm": 1.0737409591674805, + "learning_rate": 3.045581095092453e-08, + "loss": 0.3381, + "num_input_tokens_seen": 8591216, + "step": 25515 + }, + { + "epoch": 19.72179289026275, + "grad_norm": 0.6066201329231262, + "learning_rate": 2.9629532175828867e-08, + "loss": 0.3205, + "num_input_tokens_seen": 8593104, + "step": 25520 + }, + { + "epoch": 19.72565687789799, + "grad_norm": 1.3816529512405396, + "learning_rate": 2.881461007476216e-08, + "loss": 0.4164, + "num_input_tokens_seen": 8594704, + "step": 25525 + }, + { + "epoch": 19.72952086553323, + "grad_norm": 0.8431971073150635, + "learning_rate": 2.8011045018361272e-08, + "loss": 0.5573, + "num_input_tokens_seen": 8596336, + "step": 25530 + }, + { + "epoch": 19.73338485316847, + "grad_norm": 0.9219988584518433, + "learning_rate": 2.7218837372086636e-08, + "loss": 0.4151, + "num_input_tokens_seen": 8597840, + "step": 25535 + }, + { + "epoch": 19.73724884080371, + "grad_norm": 1.1115741729736328, + "learning_rate": 2.6437987496238935e-08, + "loss": 0.3644, + "num_input_tokens_seen": 8599536, + "step": 25540 + }, + { + "epoch": 19.74111282843895, + "grad_norm": 0.8025049567222595, + "learning_rate": 2.566849574595631e-08, + "loss": 0.4367, + "num_input_tokens_seen": 8601296, + "step": 25545 + }, + { + "epoch": 19.74497681607419, + "grad_norm": 1.9370099306106567, + "learning_rate": 2.4910362471208815e-08, + "loss": 0.5413, + "num_input_tokens_seen": 8602960, + "step": 25550 + }, + { + "epoch": 19.748840803709427, + "grad_norm": 1.3661025762557983, + "learning_rate": 2.4163588016795636e-08, + "loss": 0.3322, + "num_input_tokens_seen": 8604368, + "step": 25555 + }, + { + "epoch": 19.75270479134467, + "grad_norm": 0.6200522184371948, + "learning_rate": 2.3428172722358977e-08, + "loss": 0.4798, + "num_input_tokens_seen": 8605904, + "step": 25560 + }, + { + "epoch": 19.756568778979908, + "grad_norm": 1.1999223232269287, + "learning_rate": 2.270411692237018e-08, + "loss": 0.4875, + "num_input_tokens_seen": 8607632, + "step": 25565 + }, + { + "epoch": 19.760432766615146, + "grad_norm": 1.577952265739441, + "learning_rate": 2.1991420946129714e-08, + "loss": 0.4446, + "num_input_tokens_seen": 8609200, + "step": 25570 + }, + { + "epoch": 19.764296754250385, + "grad_norm": 1.0240226984024048, + "learning_rate": 2.12900851177783e-08, + "loss": 0.4243, + "num_input_tokens_seen": 8610960, + "step": 25575 + }, + { + "epoch": 19.768160741885627, + "grad_norm": 0.8185793161392212, + "learning_rate": 2.0600109756288565e-08, + "loss": 0.387, + "num_input_tokens_seen": 8612848, + "step": 25580 + }, + { + "epoch": 19.772024729520865, + "grad_norm": 1.2016416788101196, + "learning_rate": 1.992149517546227e-08, + "loss": 0.4638, + "num_input_tokens_seen": 8614352, + "step": 25585 + }, + { + "epoch": 19.775888717156104, + "grad_norm": 1.1392863988876343, + "learning_rate": 1.925424168394141e-08, + "loss": 0.3641, + "num_input_tokens_seen": 8616272, + "step": 25590 + }, + { + "epoch": 19.779752704791346, + "grad_norm": 1.083642601966858, + "learning_rate": 1.8598349585197128e-08, + "loss": 0.4065, + "num_input_tokens_seen": 8618032, + "step": 25595 + }, + { + "epoch": 19.783616692426584, + "grad_norm": 1.0263673067092896, + "learning_rate": 1.7953819177529697e-08, + "loss": 0.3933, + "num_input_tokens_seen": 8619504, + "step": 25600 + }, + { + "epoch": 19.787480680061822, + "grad_norm": 1.4446533918380737, + "learning_rate": 1.732065075407685e-08, + "loss": 0.4531, + "num_input_tokens_seen": 8620976, + "step": 25605 + }, + { + "epoch": 19.791344667697064, + "grad_norm": 1.2676607370376587, + "learning_rate": 1.6698844602808238e-08, + "loss": 0.6506, + "num_input_tokens_seen": 8622448, + "step": 25610 + }, + { + "epoch": 19.795208655332303, + "grad_norm": 1.276167631149292, + "learning_rate": 1.6088401006522647e-08, + "loss": 0.4041, + "num_input_tokens_seen": 8623856, + "step": 25615 + }, + { + "epoch": 19.79907264296754, + "grad_norm": 1.010301113128662, + "learning_rate": 1.548932024285632e-08, + "loss": 0.4735, + "num_input_tokens_seen": 8625456, + "step": 25620 + }, + { + "epoch": 19.802936630602783, + "grad_norm": 1.32160222530365, + "learning_rate": 1.4901602584271868e-08, + "loss": 0.5244, + "num_input_tokens_seen": 8627408, + "step": 25625 + }, + { + "epoch": 19.80680061823802, + "grad_norm": 1.5332496166229248, + "learning_rate": 1.4325248298069361e-08, + "loss": 0.357, + "num_input_tokens_seen": 8628944, + "step": 25630 + }, + { + "epoch": 19.81066460587326, + "grad_norm": 0.9434252381324768, + "learning_rate": 1.3760257646378005e-08, + "loss": 0.3306, + "num_input_tokens_seen": 8630672, + "step": 25635 + }, + { + "epoch": 19.814528593508502, + "grad_norm": 1.5018599033355713, + "learning_rate": 1.3206630886158921e-08, + "loss": 0.502, + "num_input_tokens_seen": 8632336, + "step": 25640 + }, + { + "epoch": 19.81839258114374, + "grad_norm": 0.7255854606628418, + "learning_rate": 1.2664368269202365e-08, + "loss": 0.3848, + "num_input_tokens_seen": 8633840, + "step": 25645 + }, + { + "epoch": 19.82225656877898, + "grad_norm": 0.7335339784622192, + "learning_rate": 1.2133470042136052e-08, + "loss": 0.342, + "num_input_tokens_seen": 8635792, + "step": 25650 + }, + { + "epoch": 19.82612055641422, + "grad_norm": 0.9972112774848938, + "learning_rate": 1.161393644641129e-08, + "loss": 0.4559, + "num_input_tokens_seen": 8637904, + "step": 25655 + }, + { + "epoch": 19.82998454404946, + "grad_norm": 0.7319034337997437, + "learning_rate": 1.1105767718319614e-08, + "loss": 0.524, + "num_input_tokens_seen": 8639696, + "step": 25660 + }, + { + "epoch": 19.833848531684698, + "grad_norm": 1.0056685209274292, + "learning_rate": 1.0608964088978934e-08, + "loss": 0.4749, + "num_input_tokens_seen": 8641360, + "step": 25665 + }, + { + "epoch": 19.83771251931994, + "grad_norm": 1.3115276098251343, + "learning_rate": 1.012352578433351e-08, + "loss": 0.52, + "num_input_tokens_seen": 8643216, + "step": 25670 + }, + { + "epoch": 19.841576506955178, + "grad_norm": 1.4125704765319824, + "learning_rate": 9.649453025170618e-09, + "loss": 0.5659, + "num_input_tokens_seen": 8644816, + "step": 25675 + }, + { + "epoch": 19.845440494590417, + "grad_norm": 0.9551747441291809, + "learning_rate": 9.186746027095571e-09, + "loss": 0.6191, + "num_input_tokens_seen": 8646352, + "step": 25680 + }, + { + "epoch": 19.84930448222566, + "grad_norm": 1.0744891166687012, + "learning_rate": 8.73540500055392e-09, + "loss": 0.4028, + "num_input_tokens_seen": 8648048, + "step": 25685 + }, + { + "epoch": 19.853168469860897, + "grad_norm": 1.3722339868545532, + "learning_rate": 8.295430150814798e-09, + "loss": 0.423, + "num_input_tokens_seen": 8649584, + "step": 25690 + }, + { + "epoch": 19.857032457496135, + "grad_norm": 1.2845990657806396, + "learning_rate": 7.866821677984804e-09, + "loss": 0.4027, + "num_input_tokens_seen": 8651152, + "step": 25695 + }, + { + "epoch": 19.860896445131374, + "grad_norm": 1.108228325843811, + "learning_rate": 7.449579776996895e-09, + "loss": 0.4079, + "num_input_tokens_seen": 8652560, + "step": 25700 + }, + { + "epoch": 19.864760432766616, + "grad_norm": 0.8734350800514221, + "learning_rate": 7.043704637613169e-09, + "loss": 0.5352, + "num_input_tokens_seen": 8654448, + "step": 25705 + }, + { + "epoch": 19.868624420401854, + "grad_norm": 0.7551730871200562, + "learning_rate": 6.6491964444304054e-09, + "loss": 0.3745, + "num_input_tokens_seen": 8656144, + "step": 25710 + }, + { + "epoch": 19.872488408037093, + "grad_norm": 0.913265585899353, + "learning_rate": 6.266055376871749e-09, + "loss": 0.3659, + "num_input_tokens_seen": 8657776, + "step": 25715 + }, + { + "epoch": 19.876352395672335, + "grad_norm": 0.8702437877655029, + "learning_rate": 5.894281609195029e-09, + "loss": 0.4095, + "num_input_tokens_seen": 8659472, + "step": 25720 + }, + { + "epoch": 19.880216383307573, + "grad_norm": 1.2557283639907837, + "learning_rate": 5.533875310478886e-09, + "loss": 0.514, + "num_input_tokens_seen": 8661136, + "step": 25725 + }, + { + "epoch": 19.88408037094281, + "grad_norm": 1.261478304862976, + "learning_rate": 5.184836644644975e-09, + "loss": 0.5502, + "num_input_tokens_seen": 8662800, + "step": 25730 + }, + { + "epoch": 19.887944358578054, + "grad_norm": 1.0660401582717896, + "learning_rate": 4.847165770435758e-09, + "loss": 0.4169, + "num_input_tokens_seen": 8664592, + "step": 25735 + }, + { + "epoch": 19.891808346213292, + "grad_norm": 0.7870724201202393, + "learning_rate": 4.52086284142561e-09, + "loss": 0.5899, + "num_input_tokens_seen": 8666480, + "step": 25740 + }, + { + "epoch": 19.89567233384853, + "grad_norm": 1.9972574710845947, + "learning_rate": 4.205928006018045e-09, + "loss": 0.415, + "num_input_tokens_seen": 8668112, + "step": 25745 + }, + { + "epoch": 19.899536321483772, + "grad_norm": 1.0323288440704346, + "learning_rate": 3.9023614074484845e-09, + "loss": 0.533, + "num_input_tokens_seen": 8669776, + "step": 25750 + }, + { + "epoch": 19.90340030911901, + "grad_norm": 1.2883342504501343, + "learning_rate": 3.6101631837814896e-09, + "loss": 0.5479, + "num_input_tokens_seen": 8671696, + "step": 25755 + }, + { + "epoch": 19.90726429675425, + "grad_norm": 1.7419813871383667, + "learning_rate": 3.32933346790798e-09, + "loss": 0.5905, + "num_input_tokens_seen": 8673616, + "step": 25760 + }, + { + "epoch": 19.91112828438949, + "grad_norm": 1.0620404481887817, + "learning_rate": 3.059872387553564e-09, + "loss": 0.4874, + "num_input_tokens_seen": 8675312, + "step": 25765 + }, + { + "epoch": 19.91499227202473, + "grad_norm": 1.0067814588546753, + "learning_rate": 2.8017800652702097e-09, + "loss": 0.3755, + "num_input_tokens_seen": 8676976, + "step": 25770 + }, + { + "epoch": 19.918856259659968, + "grad_norm": 1.5790009498596191, + "learning_rate": 2.5550566184390224e-09, + "loss": 0.4936, + "num_input_tokens_seen": 8678960, + "step": 25775 + }, + { + "epoch": 19.92272024729521, + "grad_norm": 1.198170781135559, + "learning_rate": 2.3197021592730185e-09, + "loss": 0.5284, + "num_input_tokens_seen": 8680720, + "step": 25780 + }, + { + "epoch": 19.92658423493045, + "grad_norm": 1.007878303527832, + "learning_rate": 2.095716794811575e-09, + "loss": 0.4439, + "num_input_tokens_seen": 8682352, + "step": 25785 + }, + { + "epoch": 19.930448222565687, + "grad_norm": 1.3929650783538818, + "learning_rate": 1.883100626925982e-09, + "loss": 0.6204, + "num_input_tokens_seen": 8683952, + "step": 25790 + }, + { + "epoch": 19.93431221020093, + "grad_norm": 1.1908001899719238, + "learning_rate": 1.6818537523111134e-09, + "loss": 0.325, + "num_input_tokens_seen": 8685392, + "step": 25795 + }, + { + "epoch": 19.938176197836167, + "grad_norm": 0.7341814637184143, + "learning_rate": 1.491976262499306e-09, + "loss": 0.3567, + "num_input_tokens_seen": 8687088, + "step": 25800 + }, + { + "epoch": 19.942040185471406, + "grad_norm": 1.4253329038619995, + "learning_rate": 1.3134682438492585e-09, + "loss": 0.347, + "num_input_tokens_seen": 8688752, + "step": 25805 + }, + { + "epoch": 19.945904173106648, + "grad_norm": 0.954338788986206, + "learning_rate": 1.1463297775432535e-09, + "loss": 0.5814, + "num_input_tokens_seen": 8690384, + "step": 25810 + }, + { + "epoch": 19.949768160741886, + "grad_norm": 0.7942813634872437, + "learning_rate": 9.905609395982617e-10, + "loss": 0.4264, + "num_input_tokens_seen": 8692208, + "step": 25815 + }, + { + "epoch": 19.953632148377125, + "grad_norm": 0.9907118082046509, + "learning_rate": 8.4616180086039e-10, + "loss": 0.4662, + "num_input_tokens_seen": 8693808, + "step": 25820 + }, + { + "epoch": 19.957496136012363, + "grad_norm": 1.0477628707885742, + "learning_rate": 7.1313242699933e-10, + "loss": 0.4275, + "num_input_tokens_seen": 8695536, + "step": 25825 + }, + { + "epoch": 19.961360123647605, + "grad_norm": 0.9093778729438782, + "learning_rate": 5.914728785250123e-10, + "loss": 0.4044, + "num_input_tokens_seen": 8697392, + "step": 25830 + }, + { + "epoch": 19.965224111282843, + "grad_norm": 1.131814956665039, + "learning_rate": 4.811832107598502e-10, + "loss": 0.3563, + "num_input_tokens_seen": 8699216, + "step": 25835 + }, + { + "epoch": 19.969088098918082, + "grad_norm": 0.9060852527618408, + "learning_rate": 3.8226347387204654e-10, + "loss": 0.5002, + "num_input_tokens_seen": 8701168, + "step": 25840 + }, + { + "epoch": 19.972952086553324, + "grad_norm": 0.8021042943000793, + "learning_rate": 2.9471371284783834e-10, + "loss": 0.3505, + "num_input_tokens_seen": 8702800, + "step": 25845 + }, + { + "epoch": 19.976816074188562, + "grad_norm": 0.9027987122535706, + "learning_rate": 2.185339675025988e-10, + "loss": 0.4798, + "num_input_tokens_seen": 8704400, + "step": 25850 + }, + { + "epoch": 19.9806800618238, + "grad_norm": 0.9617196321487427, + "learning_rate": 1.5372427248638853e-10, + "loss": 0.5805, + "num_input_tokens_seen": 8706096, + "step": 25855 + }, + { + "epoch": 19.984544049459043, + "grad_norm": 1.0981700420379639, + "learning_rate": 1.0028465727562885e-10, + "loss": 0.4725, + "num_input_tokens_seen": 8707984, + "step": 25860 + }, + { + "epoch": 19.98840803709428, + "grad_norm": 0.7804635763168335, + "learning_rate": 5.821514617587731e-11, + "loss": 0.3305, + "num_input_tokens_seen": 8709552, + "step": 25865 + }, + { + "epoch": 19.99227202472952, + "grad_norm": 0.7190715074539185, + "learning_rate": 2.751575831627662e-11, + "loss": 0.3717, + "num_input_tokens_seen": 8711472, + "step": 25870 + }, + { + "epoch": 19.99613601236476, + "grad_norm": 0.7890521287918091, + "learning_rate": 8.186507660656873e-12, + "loss": 0.3542, + "num_input_tokens_seen": 8713296, + "step": 25875 + }, + { + "epoch": 20.0, + "grad_norm": 1.1835862398147583, + "learning_rate": 2.2740300198442753e-13, + "loss": 0.5089, + "num_input_tokens_seen": 8714656, + "step": 25880 + }, + { + "epoch": 20.0, + "eval_loss": 0.4453426003456116, + "eval_runtime": 6.2376, + "eval_samples_per_second": 92.183, + "eval_steps_per_second": 23.086, + "num_input_tokens_seen": 8714656, + "step": 25880 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 8714656, + "step": 25880, + "total_flos": 3.924273011608781e+17, + "train_loss": 0.6342448019585248, + "train_runtime": 3869.8044, + "train_samples_per_second": 26.74, + "train_steps_per_second": 6.688 + } + ], + "logging_steps": 5, + "max_steps": 25880, + "num_input_tokens_seen": 8714656, + "num_train_epochs": 20, + "save_steps": 1294, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.924273011608781e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}