{ "best_global_step": 5058, "best_metric": 0.18196314573287964, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_rte_1754652145/checkpoint-5058", "epoch": 10.0, "eval_steps": 281, "global_step": 5610, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008912655971479501, "grad_norm": 2.640630006790161, "learning_rate": 3.5650623885918005e-07, "loss": 11.4646, "num_input_tokens_seen": 3168, "step": 5 }, { "epoch": 0.017825311942959002, "grad_norm": 2.218021869659424, "learning_rate": 8.021390374331552e-07, "loss": 11.5893, "num_input_tokens_seen": 6272, "step": 10 }, { "epoch": 0.026737967914438502, "grad_norm": 2.5081377029418945, "learning_rate": 1.2477718360071302e-06, "loss": 11.4013, "num_input_tokens_seen": 10144, "step": 15 }, { "epoch": 0.035650623885918005, "grad_norm": 2.2221429347991943, "learning_rate": 1.6934046345811053e-06, "loss": 11.4758, "num_input_tokens_seen": 13536, "step": 20 }, { "epoch": 0.044563279857397504, "grad_norm": 2.0649468898773193, "learning_rate": 2.1390374331550802e-06, "loss": 11.3651, "num_input_tokens_seen": 16128, "step": 25 }, { "epoch": 0.053475935828877004, "grad_norm": 2.8866872787475586, "learning_rate": 2.5846702317290554e-06, "loss": 11.4745, "num_input_tokens_seen": 18784, "step": 30 }, { "epoch": 0.062388591800356503, "grad_norm": 2.091982841491699, "learning_rate": 3.0303030303030305e-06, "loss": 11.5827, "num_input_tokens_seen": 22336, "step": 35 }, { "epoch": 0.07130124777183601, "grad_norm": 2.3348405361175537, "learning_rate": 3.4759358288770056e-06, "loss": 11.5288, "num_input_tokens_seen": 25408, "step": 40 }, { "epoch": 0.08021390374331551, "grad_norm": 2.6505680084228516, "learning_rate": 3.92156862745098e-06, "loss": 11.3178, "num_input_tokens_seen": 27968, "step": 45 }, { "epoch": 0.08912655971479501, "grad_norm": 2.241699457168579, "learning_rate": 4.3672014260249555e-06, "loss": 11.2239, "num_input_tokens_seen": 30752, "step": 50 }, { "epoch": 0.09803921568627451, "grad_norm": 2.149437427520752, "learning_rate": 4.812834224598931e-06, "loss": 11.3085, "num_input_tokens_seen": 33376, "step": 55 }, { "epoch": 0.10695187165775401, "grad_norm": 2.2778542041778564, "learning_rate": 5.258467023172906e-06, "loss": 11.3491, "num_input_tokens_seen": 37280, "step": 60 }, { "epoch": 0.11586452762923351, "grad_norm": 2.1370596885681152, "learning_rate": 5.704099821746881e-06, "loss": 11.0892, "num_input_tokens_seen": 40640, "step": 65 }, { "epoch": 0.12477718360071301, "grad_norm": 2.1671693325042725, "learning_rate": 6.149732620320856e-06, "loss": 11.4158, "num_input_tokens_seen": 44128, "step": 70 }, { "epoch": 0.13368983957219252, "grad_norm": 2.1441879272460938, "learning_rate": 6.59536541889483e-06, "loss": 11.0242, "num_input_tokens_seen": 47648, "step": 75 }, { "epoch": 0.14260249554367202, "grad_norm": 2.2412052154541016, "learning_rate": 7.040998217468805e-06, "loss": 10.8869, "num_input_tokens_seen": 50816, "step": 80 }, { "epoch": 0.15151515151515152, "grad_norm": 2.3039534091949463, "learning_rate": 7.4866310160427806e-06, "loss": 11.031, "num_input_tokens_seen": 53728, "step": 85 }, { "epoch": 0.16042780748663102, "grad_norm": 2.3388712406158447, "learning_rate": 7.932263814616755e-06, "loss": 10.9959, "num_input_tokens_seen": 57056, "step": 90 }, { "epoch": 0.16934046345811052, "grad_norm": 2.324082851409912, "learning_rate": 8.377896613190733e-06, "loss": 10.8078, "num_input_tokens_seen": 59808, "step": 95 }, { "epoch": 0.17825311942959002, "grad_norm": 2.343338966369629, "learning_rate": 8.823529411764707e-06, "loss": 10.6152, "num_input_tokens_seen": 62848, "step": 100 }, { "epoch": 0.18716577540106952, "grad_norm": 2.170870542526245, "learning_rate": 9.269162210338681e-06, "loss": 10.7252, "num_input_tokens_seen": 65856, "step": 105 }, { "epoch": 0.19607843137254902, "grad_norm": 2.1757500171661377, "learning_rate": 9.714795008912657e-06, "loss": 10.702, "num_input_tokens_seen": 68672, "step": 110 }, { "epoch": 0.20499108734402852, "grad_norm": 2.319809675216675, "learning_rate": 1.0160427807486631e-05, "loss": 10.7596, "num_input_tokens_seen": 71840, "step": 115 }, { "epoch": 0.21390374331550802, "grad_norm": 2.85723876953125, "learning_rate": 1.0606060606060607e-05, "loss": 10.6329, "num_input_tokens_seen": 74624, "step": 120 }, { "epoch": 0.22281639928698752, "grad_norm": 2.3634092807769775, "learning_rate": 1.1051693404634582e-05, "loss": 10.694, "num_input_tokens_seen": 78080, "step": 125 }, { "epoch": 0.23172905525846701, "grad_norm": 2.2238471508026123, "learning_rate": 1.1497326203208558e-05, "loss": 10.4616, "num_input_tokens_seen": 81408, "step": 130 }, { "epoch": 0.24064171122994651, "grad_norm": 2.2605199813842773, "learning_rate": 1.1942959001782532e-05, "loss": 10.2704, "num_input_tokens_seen": 84192, "step": 135 }, { "epoch": 0.24955436720142601, "grad_norm": 2.334446668624878, "learning_rate": 1.2388591800356506e-05, "loss": 10.1217, "num_input_tokens_seen": 87264, "step": 140 }, { "epoch": 0.25846702317290554, "grad_norm": 2.1008996963500977, "learning_rate": 1.2834224598930484e-05, "loss": 9.9505, "num_input_tokens_seen": 90336, "step": 145 }, { "epoch": 0.26737967914438504, "grad_norm": 2.1396262645721436, "learning_rate": 1.3279857397504458e-05, "loss": 9.9953, "num_input_tokens_seen": 93760, "step": 150 }, { "epoch": 0.27629233511586454, "grad_norm": 1.9306892156600952, "learning_rate": 1.3725490196078432e-05, "loss": 10.0273, "num_input_tokens_seen": 97120, "step": 155 }, { "epoch": 0.28520499108734404, "grad_norm": 2.2339835166931152, "learning_rate": 1.4171122994652408e-05, "loss": 9.8194, "num_input_tokens_seen": 100160, "step": 160 }, { "epoch": 0.29411764705882354, "grad_norm": 2.1370038986206055, "learning_rate": 1.4616755793226383e-05, "loss": 9.7234, "num_input_tokens_seen": 103136, "step": 165 }, { "epoch": 0.30303030303030304, "grad_norm": 2.2204971313476562, "learning_rate": 1.5062388591800359e-05, "loss": 9.4737, "num_input_tokens_seen": 105696, "step": 170 }, { "epoch": 0.31194295900178254, "grad_norm": 2.0649607181549072, "learning_rate": 1.5508021390374333e-05, "loss": 9.299, "num_input_tokens_seen": 108800, "step": 175 }, { "epoch": 0.32085561497326204, "grad_norm": 2.166388511657715, "learning_rate": 1.5953654188948307e-05, "loss": 9.3115, "num_input_tokens_seen": 111808, "step": 180 }, { "epoch": 0.32976827094474154, "grad_norm": 2.0328972339630127, "learning_rate": 1.639928698752228e-05, "loss": 9.3707, "num_input_tokens_seen": 114944, "step": 185 }, { "epoch": 0.33868092691622104, "grad_norm": 2.443514347076416, "learning_rate": 1.684491978609626e-05, "loss": 8.9663, "num_input_tokens_seen": 118112, "step": 190 }, { "epoch": 0.34759358288770054, "grad_norm": 2.0616464614868164, "learning_rate": 1.7290552584670233e-05, "loss": 8.9474, "num_input_tokens_seen": 120896, "step": 195 }, { "epoch": 0.35650623885918004, "grad_norm": 2.2355945110321045, "learning_rate": 1.7736185383244208e-05, "loss": 8.6637, "num_input_tokens_seen": 123904, "step": 200 }, { "epoch": 0.36541889483065954, "grad_norm": 2.044498920440674, "learning_rate": 1.8181818181818182e-05, "loss": 8.6211, "num_input_tokens_seen": 127008, "step": 205 }, { "epoch": 0.37433155080213903, "grad_norm": 2.1903281211853027, "learning_rate": 1.862745098039216e-05, "loss": 8.4521, "num_input_tokens_seen": 129984, "step": 210 }, { "epoch": 0.38324420677361853, "grad_norm": 2.253875255584717, "learning_rate": 1.9073083778966134e-05, "loss": 8.4635, "num_input_tokens_seen": 133152, "step": 215 }, { "epoch": 0.39215686274509803, "grad_norm": 2.23766827583313, "learning_rate": 1.951871657754011e-05, "loss": 8.4012, "num_input_tokens_seen": 136096, "step": 220 }, { "epoch": 0.40106951871657753, "grad_norm": 2.4483225345611572, "learning_rate": 1.9964349376114083e-05, "loss": 8.019, "num_input_tokens_seen": 139136, "step": 225 }, { "epoch": 0.40998217468805703, "grad_norm": 2.141366958618164, "learning_rate": 2.0409982174688057e-05, "loss": 8.2362, "num_input_tokens_seen": 142080, "step": 230 }, { "epoch": 0.41889483065953653, "grad_norm": 2.049794912338257, "learning_rate": 2.0855614973262035e-05, "loss": 8.3716, "num_input_tokens_seen": 145824, "step": 235 }, { "epoch": 0.42780748663101603, "grad_norm": 2.0718395709991455, "learning_rate": 2.130124777183601e-05, "loss": 7.742, "num_input_tokens_seen": 149280, "step": 240 }, { "epoch": 0.43672014260249553, "grad_norm": 2.133650064468384, "learning_rate": 2.1746880570409983e-05, "loss": 7.7851, "num_input_tokens_seen": 152544, "step": 245 }, { "epoch": 0.44563279857397503, "grad_norm": 2.0652763843536377, "learning_rate": 2.2192513368983957e-05, "loss": 7.4258, "num_input_tokens_seen": 156416, "step": 250 }, { "epoch": 0.45454545454545453, "grad_norm": 1.8414599895477295, "learning_rate": 2.2638146167557932e-05, "loss": 7.1734, "num_input_tokens_seen": 159712, "step": 255 }, { "epoch": 0.46345811051693403, "grad_norm": 2.0587077140808105, "learning_rate": 2.308377896613191e-05, "loss": 6.8801, "num_input_tokens_seen": 162400, "step": 260 }, { "epoch": 0.47237076648841353, "grad_norm": 1.8652368783950806, "learning_rate": 2.3529411764705884e-05, "loss": 7.0346, "num_input_tokens_seen": 166048, "step": 265 }, { "epoch": 0.48128342245989303, "grad_norm": 1.6939105987548828, "learning_rate": 2.3975044563279858e-05, "loss": 6.5944, "num_input_tokens_seen": 168576, "step": 270 }, { "epoch": 0.49019607843137253, "grad_norm": 1.9076436758041382, "learning_rate": 2.4420677361853832e-05, "loss": 6.7204, "num_input_tokens_seen": 172320, "step": 275 }, { "epoch": 0.49910873440285203, "grad_norm": 1.65463387966156, "learning_rate": 2.4866310160427807e-05, "loss": 6.7786, "num_input_tokens_seen": 175424, "step": 280 }, { "epoch": 0.5008912655971479, "eval_loss": 6.320615768432617, "eval_runtime": 4.2449, "eval_samples_per_second": 58.659, "eval_steps_per_second": 14.841, "num_input_tokens_seen": 176032, "step": 281 }, { "epoch": 0.5080213903743316, "grad_norm": 2.3921778202056885, "learning_rate": 2.5311942959001784e-05, "loss": 6.4536, "num_input_tokens_seen": 178016, "step": 285 }, { "epoch": 0.5169340463458111, "grad_norm": 1.5767650604248047, "learning_rate": 2.575757575757576e-05, "loss": 6.7214, "num_input_tokens_seen": 181888, "step": 290 }, { "epoch": 0.5258467023172906, "grad_norm": 1.6409612894058228, "learning_rate": 2.6203208556149733e-05, "loss": 6.0779, "num_input_tokens_seen": 184960, "step": 295 }, { "epoch": 0.5347593582887701, "grad_norm": 1.5643103122711182, "learning_rate": 2.6648841354723707e-05, "loss": 5.8182, "num_input_tokens_seen": 187488, "step": 300 }, { "epoch": 0.5436720142602496, "grad_norm": 1.7608228921890259, "learning_rate": 2.7094474153297685e-05, "loss": 6.2207, "num_input_tokens_seen": 191232, "step": 305 }, { "epoch": 0.5525846702317291, "grad_norm": 1.5273125171661377, "learning_rate": 2.754010695187166e-05, "loss": 5.8064, "num_input_tokens_seen": 194272, "step": 310 }, { "epoch": 0.5614973262032086, "grad_norm": 1.3673619031906128, "learning_rate": 2.7985739750445633e-05, "loss": 5.7312, "num_input_tokens_seen": 197184, "step": 315 }, { "epoch": 0.5704099821746881, "grad_norm": 1.3092046976089478, "learning_rate": 2.8431372549019608e-05, "loss": 5.36, "num_input_tokens_seen": 199840, "step": 320 }, { "epoch": 0.5793226381461676, "grad_norm": 1.5241113901138306, "learning_rate": 2.8877005347593582e-05, "loss": 5.6509, "num_input_tokens_seen": 203008, "step": 325 }, { "epoch": 0.5882352941176471, "grad_norm": 1.2224637269973755, "learning_rate": 2.932263814616756e-05, "loss": 5.3917, "num_input_tokens_seen": 206400, "step": 330 }, { "epoch": 0.5971479500891266, "grad_norm": 1.1933878660202026, "learning_rate": 2.9768270944741534e-05, "loss": 5.2637, "num_input_tokens_seen": 209440, "step": 335 }, { "epoch": 0.6060606060606061, "grad_norm": 1.1900209188461304, "learning_rate": 3.0213903743315508e-05, "loss": 5.4659, "num_input_tokens_seen": 212736, "step": 340 }, { "epoch": 0.6149732620320856, "grad_norm": 1.3414652347564697, "learning_rate": 3.065953654188948e-05, "loss": 5.324, "num_input_tokens_seen": 216096, "step": 345 }, { "epoch": 0.6238859180035651, "grad_norm": 1.1607022285461426, "learning_rate": 3.110516934046346e-05, "loss": 5.2878, "num_input_tokens_seen": 219200, "step": 350 }, { "epoch": 0.6327985739750446, "grad_norm": 1.153671383857727, "learning_rate": 3.155080213903743e-05, "loss": 4.9444, "num_input_tokens_seen": 221952, "step": 355 }, { "epoch": 0.6417112299465241, "grad_norm": 1.139689326286316, "learning_rate": 3.199643493761141e-05, "loss": 4.891, "num_input_tokens_seen": 225376, "step": 360 }, { "epoch": 0.6506238859180036, "grad_norm": 1.0437010526657104, "learning_rate": 3.2442067736185386e-05, "loss": 4.9337, "num_input_tokens_seen": 228736, "step": 365 }, { "epoch": 0.6595365418894831, "grad_norm": 1.2458043098449707, "learning_rate": 3.288770053475936e-05, "loss": 4.7023, "num_input_tokens_seen": 231648, "step": 370 }, { "epoch": 0.6684491978609626, "grad_norm": 1.0675745010375977, "learning_rate": 3.3333333333333335e-05, "loss": 4.594, "num_input_tokens_seen": 234976, "step": 375 }, { "epoch": 0.6773618538324421, "grad_norm": 1.0720183849334717, "learning_rate": 3.3778966131907306e-05, "loss": 4.82, "num_input_tokens_seen": 238368, "step": 380 }, { "epoch": 0.6862745098039216, "grad_norm": 1.044710636138916, "learning_rate": 3.4224598930481284e-05, "loss": 4.5563, "num_input_tokens_seen": 241440, "step": 385 }, { "epoch": 0.6951871657754011, "grad_norm": 1.0943641662597656, "learning_rate": 3.467023172905526e-05, "loss": 4.5969, "num_input_tokens_seen": 244448, "step": 390 }, { "epoch": 0.7040998217468806, "grad_norm": 1.082396149635315, "learning_rate": 3.511586452762923e-05, "loss": 4.3737, "num_input_tokens_seen": 246880, "step": 395 }, { "epoch": 0.7130124777183601, "grad_norm": 1.1410984992980957, "learning_rate": 3.556149732620321e-05, "loss": 4.3754, "num_input_tokens_seen": 250240, "step": 400 }, { "epoch": 0.7219251336898396, "grad_norm": 1.1234968900680542, "learning_rate": 3.600713012477718e-05, "loss": 4.3313, "num_input_tokens_seen": 253184, "step": 405 }, { "epoch": 0.7308377896613191, "grad_norm": 1.2889167070388794, "learning_rate": 3.645276292335116e-05, "loss": 4.1676, "num_input_tokens_seen": 255968, "step": 410 }, { "epoch": 0.7397504456327986, "grad_norm": 0.9909088611602783, "learning_rate": 3.6898395721925136e-05, "loss": 4.1332, "num_input_tokens_seen": 258688, "step": 415 }, { "epoch": 0.7486631016042781, "grad_norm": 1.12320077419281, "learning_rate": 3.734402852049911e-05, "loss": 4.1551, "num_input_tokens_seen": 262240, "step": 420 }, { "epoch": 0.7575757575757576, "grad_norm": 1.1998422145843506, "learning_rate": 3.7789661319073085e-05, "loss": 4.1066, "num_input_tokens_seen": 265952, "step": 425 }, { "epoch": 0.7664884135472371, "grad_norm": 1.6095830202102661, "learning_rate": 3.8235294117647055e-05, "loss": 4.3427, "num_input_tokens_seen": 269312, "step": 430 }, { "epoch": 0.7754010695187166, "grad_norm": 1.1973387002944946, "learning_rate": 3.868092691622103e-05, "loss": 4.0544, "num_input_tokens_seen": 272128, "step": 435 }, { "epoch": 0.7843137254901961, "grad_norm": 1.13062584400177, "learning_rate": 3.912655971479501e-05, "loss": 4.0524, "num_input_tokens_seen": 275552, "step": 440 }, { "epoch": 0.7932263814616756, "grad_norm": 1.095451831817627, "learning_rate": 3.957219251336899e-05, "loss": 3.9436, "num_input_tokens_seen": 278720, "step": 445 }, { "epoch": 0.8021390374331551, "grad_norm": 0.9978923201560974, "learning_rate": 4.0017825311942966e-05, "loss": 3.6121, "num_input_tokens_seen": 281536, "step": 450 }, { "epoch": 0.8110516934046346, "grad_norm": 1.036067008972168, "learning_rate": 4.046345811051694e-05, "loss": 3.8184, "num_input_tokens_seen": 284672, "step": 455 }, { "epoch": 0.8199643493761141, "grad_norm": 0.8888896107673645, "learning_rate": 4.0909090909090915e-05, "loss": 3.7184, "num_input_tokens_seen": 288416, "step": 460 }, { "epoch": 0.8288770053475936, "grad_norm": 0.8882661461830139, "learning_rate": 4.1354723707664886e-05, "loss": 3.6762, "num_input_tokens_seen": 291232, "step": 465 }, { "epoch": 0.8377896613190731, "grad_norm": 1.3067046403884888, "learning_rate": 4.180035650623886e-05, "loss": 3.7256, "num_input_tokens_seen": 294784, "step": 470 }, { "epoch": 0.8467023172905526, "grad_norm": 1.1890095472335815, "learning_rate": 4.224598930481284e-05, "loss": 3.4105, "num_input_tokens_seen": 297632, "step": 475 }, { "epoch": 0.8556149732620321, "grad_norm": 0.9891613125801086, "learning_rate": 4.269162210338681e-05, "loss": 3.2745, "num_input_tokens_seen": 300416, "step": 480 }, { "epoch": 0.8645276292335116, "grad_norm": 0.9931787848472595, "learning_rate": 4.313725490196079e-05, "loss": 3.1763, "num_input_tokens_seen": 303232, "step": 485 }, { "epoch": 0.8734402852049911, "grad_norm": 0.8934875130653381, "learning_rate": 4.358288770053476e-05, "loss": 3.2828, "num_input_tokens_seen": 306144, "step": 490 }, { "epoch": 0.8823529411764706, "grad_norm": 1.265254259109497, "learning_rate": 4.402852049910874e-05, "loss": 3.2048, "num_input_tokens_seen": 308576, "step": 495 }, { "epoch": 0.8912655971479501, "grad_norm": 1.0396374464035034, "learning_rate": 4.4474153297682716e-05, "loss": 3.197, "num_input_tokens_seen": 312000, "step": 500 }, { "epoch": 0.9001782531194296, "grad_norm": 0.8916023373603821, "learning_rate": 4.491978609625669e-05, "loss": 2.9296, "num_input_tokens_seen": 314848, "step": 505 }, { "epoch": 0.9090909090909091, "grad_norm": 1.1076226234436035, "learning_rate": 4.5365418894830664e-05, "loss": 3.0006, "num_input_tokens_seen": 318112, "step": 510 }, { "epoch": 0.9180035650623886, "grad_norm": 1.0348403453826904, "learning_rate": 4.5811051693404635e-05, "loss": 3.2128, "num_input_tokens_seen": 321152, "step": 515 }, { "epoch": 0.9269162210338681, "grad_norm": 0.9368388056755066, "learning_rate": 4.625668449197861e-05, "loss": 2.6109, "num_input_tokens_seen": 323552, "step": 520 }, { "epoch": 0.9358288770053476, "grad_norm": 0.9401017427444458, "learning_rate": 4.670231729055259e-05, "loss": 2.6761, "num_input_tokens_seen": 326112, "step": 525 }, { "epoch": 0.9447415329768271, "grad_norm": 1.0641679763793945, "learning_rate": 4.714795008912656e-05, "loss": 2.7169, "num_input_tokens_seen": 328800, "step": 530 }, { "epoch": 0.9536541889483066, "grad_norm": 1.1021815538406372, "learning_rate": 4.759358288770054e-05, "loss": 3.1103, "num_input_tokens_seen": 332512, "step": 535 }, { "epoch": 0.9625668449197861, "grad_norm": 0.8338248133659363, "learning_rate": 4.803921568627452e-05, "loss": 2.3949, "num_input_tokens_seen": 335360, "step": 540 }, { "epoch": 0.9714795008912656, "grad_norm": 1.311125636100769, "learning_rate": 4.848484848484849e-05, "loss": 2.9292, "num_input_tokens_seen": 339488, "step": 545 }, { "epoch": 0.9803921568627451, "grad_norm": 0.993326723575592, "learning_rate": 4.8930481283422465e-05, "loss": 2.2154, "num_input_tokens_seen": 342176, "step": 550 }, { "epoch": 0.9893048128342246, "grad_norm": 1.0523838996887207, "learning_rate": 4.9376114081996436e-05, "loss": 2.6187, "num_input_tokens_seen": 345568, "step": 555 }, { "epoch": 0.9982174688057041, "grad_norm": 1.2461936473846436, "learning_rate": 4.9821746880570414e-05, "loss": 2.0606, "num_input_tokens_seen": 348000, "step": 560 }, { "epoch": 1.0017825311942958, "eval_loss": 2.2780375480651855, "eval_runtime": 4.2492, "eval_samples_per_second": 58.599, "eval_steps_per_second": 14.826, "num_input_tokens_seen": 349200, "step": 562 }, { "epoch": 1.0071301247771836, "grad_norm": 0.8942297697067261, "learning_rate": 4.99999564446608e-05, "loss": 2.598, "num_input_tokens_seen": 350960, "step": 565 }, { "epoch": 1.0160427807486632, "grad_norm": 0.9582070708274841, "learning_rate": 4.9999690273693036e-05, "loss": 2.0767, "num_input_tokens_seen": 354288, "step": 570 }, { "epoch": 1.0249554367201426, "grad_norm": 1.0559678077697754, "learning_rate": 4.999918213174131e-05, "loss": 2.1588, "num_input_tokens_seen": 357648, "step": 575 }, { "epoch": 1.0338680926916222, "grad_norm": 1.2316597700119019, "learning_rate": 4.9998432023723915e-05, "loss": 2.0186, "num_input_tokens_seen": 360496, "step": 580 }, { "epoch": 1.0427807486631016, "grad_norm": 1.1366970539093018, "learning_rate": 4.9997439956901106e-05, "loss": 2.0455, "num_input_tokens_seen": 363376, "step": 585 }, { "epoch": 1.0516934046345812, "grad_norm": 1.041366696357727, "learning_rate": 4.999620594087507e-05, "loss": 1.995, "num_input_tokens_seen": 366320, "step": 590 }, { "epoch": 1.0606060606060606, "grad_norm": 0.9262757301330566, "learning_rate": 4.999472998758978e-05, "loss": 1.912, "num_input_tokens_seen": 369488, "step": 595 }, { "epoch": 1.0695187165775402, "grad_norm": 1.3618220090866089, "learning_rate": 4.999301211133095e-05, "loss": 1.8174, "num_input_tokens_seen": 372656, "step": 600 }, { "epoch": 1.0784313725490196, "grad_norm": 0.9017401337623596, "learning_rate": 4.999105232872582e-05, "loss": 1.7304, "num_input_tokens_seen": 376048, "step": 605 }, { "epoch": 1.0873440285204992, "grad_norm": 1.131372332572937, "learning_rate": 4.998885065874305e-05, "loss": 2.0501, "num_input_tokens_seen": 379472, "step": 610 }, { "epoch": 1.0962566844919786, "grad_norm": 0.743751585483551, "learning_rate": 4.9986407122692504e-05, "loss": 1.6725, "num_input_tokens_seen": 382288, "step": 615 }, { "epoch": 1.1051693404634582, "grad_norm": 1.2746849060058594, "learning_rate": 4.998372174422507e-05, "loss": 1.5424, "num_input_tokens_seen": 385392, "step": 620 }, { "epoch": 1.1140819964349375, "grad_norm": 1.250909686088562, "learning_rate": 4.998079454933244e-05, "loss": 1.9679, "num_input_tokens_seen": 389200, "step": 625 }, { "epoch": 1.1229946524064172, "grad_norm": 0.8632287979125977, "learning_rate": 4.99776255663468e-05, "loss": 1.2718, "num_input_tokens_seen": 391664, "step": 630 }, { "epoch": 1.1319073083778965, "grad_norm": 0.773535966873169, "learning_rate": 4.997421482594059e-05, "loss": 1.3693, "num_input_tokens_seen": 394416, "step": 635 }, { "epoch": 1.1408199643493762, "grad_norm": 1.104138731956482, "learning_rate": 4.997056236112625e-05, "loss": 1.9817, "num_input_tokens_seen": 399248, "step": 640 }, { "epoch": 1.1497326203208555, "grad_norm": 0.7540408372879028, "learning_rate": 4.9966668207255826e-05, "loss": 1.2948, "num_input_tokens_seen": 402032, "step": 645 }, { "epoch": 1.1586452762923352, "grad_norm": 0.9450183510780334, "learning_rate": 4.996253240202069e-05, "loss": 1.2707, "num_input_tokens_seen": 405296, "step": 650 }, { "epoch": 1.1675579322638145, "grad_norm": 1.1226730346679688, "learning_rate": 4.9958154985451114e-05, "loss": 1.2088, "num_input_tokens_seen": 408400, "step": 655 }, { "epoch": 1.1764705882352942, "grad_norm": 0.9736111760139465, "learning_rate": 4.995353599991595e-05, "loss": 1.4309, "num_input_tokens_seen": 412016, "step": 660 }, { "epoch": 1.1853832442067735, "grad_norm": 0.9693507552146912, "learning_rate": 4.994867549012215e-05, "loss": 1.2743, "num_input_tokens_seen": 415504, "step": 665 }, { "epoch": 1.1942959001782532, "grad_norm": 1.0443888902664185, "learning_rate": 4.99435735031144e-05, "loss": 1.1155, "num_input_tokens_seen": 418448, "step": 670 }, { "epoch": 1.2032085561497325, "grad_norm": 1.0174163579940796, "learning_rate": 4.993823008827465e-05, "loss": 1.092, "num_input_tokens_seen": 421168, "step": 675 }, { "epoch": 1.2121212121212122, "grad_norm": 0.7569769620895386, "learning_rate": 4.9932645297321555e-05, "loss": 0.9307, "num_input_tokens_seen": 423632, "step": 680 }, { "epoch": 1.2210338680926915, "grad_norm": 0.7273694276809692, "learning_rate": 4.9926819184310103e-05, "loss": 0.9791, "num_input_tokens_seen": 426640, "step": 685 }, { "epoch": 1.2299465240641712, "grad_norm": 0.952115535736084, "learning_rate": 4.9920751805631e-05, "loss": 1.1522, "num_input_tokens_seen": 430032, "step": 690 }, { "epoch": 1.2388591800356505, "grad_norm": 1.1709868907928467, "learning_rate": 4.991444322001014e-05, "loss": 1.0973, "num_input_tokens_seen": 433008, "step": 695 }, { "epoch": 1.2477718360071302, "grad_norm": 0.6561676263809204, "learning_rate": 4.99078934885081e-05, "loss": 1.0868, "num_input_tokens_seen": 436400, "step": 700 }, { "epoch": 1.2566844919786098, "grad_norm": 0.8287897109985352, "learning_rate": 4.990110267451944e-05, "loss": 0.8352, "num_input_tokens_seen": 439248, "step": 705 }, { "epoch": 1.2655971479500892, "grad_norm": 0.9313675165176392, "learning_rate": 4.989407084377218e-05, "loss": 0.8707, "num_input_tokens_seen": 442416, "step": 710 }, { "epoch": 1.2745098039215685, "grad_norm": 0.9105520844459534, "learning_rate": 4.988679806432712e-05, "loss": 0.9153, "num_input_tokens_seen": 445616, "step": 715 }, { "epoch": 1.2834224598930482, "grad_norm": 0.7386419773101807, "learning_rate": 4.9879284406577195e-05, "loss": 0.7514, "num_input_tokens_seen": 448528, "step": 720 }, { "epoch": 1.2923351158645278, "grad_norm": 0.8464149236679077, "learning_rate": 4.98715299432468e-05, "loss": 0.897, "num_input_tokens_seen": 451664, "step": 725 }, { "epoch": 1.3012477718360071, "grad_norm": 0.7016708254814148, "learning_rate": 4.986353474939106e-05, "loss": 0.9608, "num_input_tokens_seen": 455120, "step": 730 }, { "epoch": 1.3101604278074865, "grad_norm": 0.7350292801856995, "learning_rate": 4.9855298902395134e-05, "loss": 0.8485, "num_input_tokens_seen": 458352, "step": 735 }, { "epoch": 1.3190730837789661, "grad_norm": 0.657071053981781, "learning_rate": 4.9846822481973455e-05, "loss": 0.9055, "num_input_tokens_seen": 461488, "step": 740 }, { "epoch": 1.3279857397504458, "grad_norm": 0.7406115531921387, "learning_rate": 4.9838105570168946e-05, "loss": 0.9068, "num_input_tokens_seen": 464848, "step": 745 }, { "epoch": 1.3368983957219251, "grad_norm": 0.9874480962753296, "learning_rate": 4.982914825135224e-05, "loss": 1.0902, "num_input_tokens_seen": 468944, "step": 750 }, { "epoch": 1.3458110516934045, "grad_norm": 0.7415845990180969, "learning_rate": 4.981995061222087e-05, "loss": 0.6795, "num_input_tokens_seen": 471312, "step": 755 }, { "epoch": 1.3547237076648841, "grad_norm": 0.6649575233459473, "learning_rate": 4.98105127417984e-05, "loss": 0.6273, "num_input_tokens_seen": 474128, "step": 760 }, { "epoch": 1.3636363636363638, "grad_norm": 1.0872315168380737, "learning_rate": 4.9800834731433596e-05, "loss": 0.5981, "num_input_tokens_seen": 476592, "step": 765 }, { "epoch": 1.3725490196078431, "grad_norm": 0.7500861287117004, "learning_rate": 4.9790916674799526e-05, "loss": 1.014, "num_input_tokens_seen": 480240, "step": 770 }, { "epoch": 1.3814616755793225, "grad_norm": 1.2134431600570679, "learning_rate": 4.9780758667892656e-05, "loss": 0.681, "num_input_tokens_seen": 483472, "step": 775 }, { "epoch": 1.3903743315508021, "grad_norm": 0.8633726835250854, "learning_rate": 4.977036080903193e-05, "loss": 0.6929, "num_input_tokens_seen": 486768, "step": 780 }, { "epoch": 1.3992869875222818, "grad_norm": 0.903477668762207, "learning_rate": 4.975972319885779e-05, "loss": 0.5834, "num_input_tokens_seen": 489392, "step": 785 }, { "epoch": 1.4081996434937611, "grad_norm": 0.7039727568626404, "learning_rate": 4.974884594033123e-05, "loss": 0.7406, "num_input_tokens_seen": 492560, "step": 790 }, { "epoch": 1.4171122994652405, "grad_norm": 0.9972723126411438, "learning_rate": 4.9737729138732805e-05, "loss": 0.5558, "num_input_tokens_seen": 495344, "step": 795 }, { "epoch": 1.4260249554367201, "grad_norm": 1.2662111520767212, "learning_rate": 4.972637290166158e-05, "loss": 0.6374, "num_input_tokens_seen": 498128, "step": 800 }, { "epoch": 1.4349376114081998, "grad_norm": 1.4038677215576172, "learning_rate": 4.97147773390341e-05, "loss": 0.8173, "num_input_tokens_seen": 501488, "step": 805 }, { "epoch": 1.4438502673796791, "grad_norm": 0.730514407157898, "learning_rate": 4.9702942563083356e-05, "loss": 0.5782, "num_input_tokens_seen": 504272, "step": 810 }, { "epoch": 1.4527629233511585, "grad_norm": 0.5917222499847412, "learning_rate": 4.969086868835765e-05, "loss": 0.4533, "num_input_tokens_seen": 506672, "step": 815 }, { "epoch": 1.4616755793226381, "grad_norm": 0.49027279019355774, "learning_rate": 4.967855583171954e-05, "loss": 0.4866, "num_input_tokens_seen": 509232, "step": 820 }, { "epoch": 1.4705882352941178, "grad_norm": 1.1144423484802246, "learning_rate": 4.9666004112344656e-05, "loss": 0.7116, "num_input_tokens_seen": 512528, "step": 825 }, { "epoch": 1.4795008912655971, "grad_norm": 0.6267158389091492, "learning_rate": 4.965321365172057e-05, "loss": 0.576, "num_input_tokens_seen": 514896, "step": 830 }, { "epoch": 1.4884135472370765, "grad_norm": 0.8494957089424133, "learning_rate": 4.9640184573645646e-05, "loss": 0.6064, "num_input_tokens_seen": 518384, "step": 835 }, { "epoch": 1.4973262032085561, "grad_norm": 1.1032313108444214, "learning_rate": 4.962691700422778e-05, "loss": 0.8595, "num_input_tokens_seen": 522448, "step": 840 }, { "epoch": 1.5026737967914439, "eval_loss": 0.5879648327827454, "eval_runtime": 4.2487, "eval_samples_per_second": 58.606, "eval_steps_per_second": 14.828, "num_input_tokens_seen": 524208, "step": 843 }, { "epoch": 1.5062388591800357, "grad_norm": 0.7947481274604797, "learning_rate": 4.9613411071883267e-05, "loss": 0.4532, "num_input_tokens_seen": 525264, "step": 845 }, { "epoch": 1.5151515151515151, "grad_norm": 0.6550034284591675, "learning_rate": 4.959966690733544e-05, "loss": 0.7043, "num_input_tokens_seen": 528528, "step": 850 }, { "epoch": 1.5240641711229945, "grad_norm": 1.126085877418518, "learning_rate": 4.958568464361353e-05, "loss": 0.6396, "num_input_tokens_seen": 531536, "step": 855 }, { "epoch": 1.5329768270944741, "grad_norm": 0.6209072470664978, "learning_rate": 4.9571464416051294e-05, "loss": 0.5435, "num_input_tokens_seen": 534704, "step": 860 }, { "epoch": 1.5418894830659537, "grad_norm": 0.5790075063705444, "learning_rate": 4.955700636228573e-05, "loss": 0.359, "num_input_tokens_seen": 537264, "step": 865 }, { "epoch": 1.5508021390374331, "grad_norm": 0.9781410694122314, "learning_rate": 4.954231062225576e-05, "loss": 0.6823, "num_input_tokens_seen": 541328, "step": 870 }, { "epoch": 1.5597147950089125, "grad_norm": 0.7598072290420532, "learning_rate": 4.9527377338200855e-05, "loss": 0.4973, "num_input_tokens_seen": 544496, "step": 875 }, { "epoch": 1.5686274509803921, "grad_norm": 0.8549111485481262, "learning_rate": 4.951220665465964e-05, "loss": 0.6291, "num_input_tokens_seen": 547696, "step": 880 }, { "epoch": 1.5775401069518717, "grad_norm": 0.7234603762626648, "learning_rate": 4.949679871846857e-05, "loss": 0.4632, "num_input_tokens_seen": 550416, "step": 885 }, { "epoch": 1.5864527629233511, "grad_norm": 0.5888731479644775, "learning_rate": 4.948115367876043e-05, "loss": 0.5336, "num_input_tokens_seen": 553968, "step": 890 }, { "epoch": 1.5953654188948305, "grad_norm": 0.8173357844352722, "learning_rate": 4.94652716869629e-05, "loss": 0.3634, "num_input_tokens_seen": 556656, "step": 895 }, { "epoch": 1.6042780748663101, "grad_norm": 0.5093280673027039, "learning_rate": 4.944915289679716e-05, "loss": 0.3877, "num_input_tokens_seen": 559536, "step": 900 }, { "epoch": 1.6131907308377897, "grad_norm": 0.9982839226722717, "learning_rate": 4.94327974642763e-05, "loss": 0.5395, "num_input_tokens_seen": 562704, "step": 905 }, { "epoch": 1.6221033868092691, "grad_norm": 1.0210356712341309, "learning_rate": 4.94162055477039e-05, "loss": 0.5995, "num_input_tokens_seen": 566352, "step": 910 }, { "epoch": 1.6310160427807485, "grad_norm": 1.2152962684631348, "learning_rate": 4.939937730767243e-05, "loss": 0.5234, "num_input_tokens_seen": 569584, "step": 915 }, { "epoch": 1.6399286987522281, "grad_norm": 0.8112650513648987, "learning_rate": 4.9382312907061755e-05, "loss": 0.3781, "num_input_tokens_seen": 571824, "step": 920 }, { "epoch": 1.6488413547237077, "grad_norm": 0.8025038838386536, "learning_rate": 4.9365012511037514e-05, "loss": 0.5397, "num_input_tokens_seen": 575248, "step": 925 }, { "epoch": 1.6577540106951871, "grad_norm": 1.2283076047897339, "learning_rate": 4.934747628704952e-05, "loss": 0.4426, "num_input_tokens_seen": 578032, "step": 930 }, { "epoch": 1.6666666666666665, "grad_norm": 0.8238184452056885, "learning_rate": 4.932970440483018e-05, "loss": 0.4614, "num_input_tokens_seen": 581744, "step": 935 }, { "epoch": 1.6755793226381461, "grad_norm": 0.7958811521530151, "learning_rate": 4.931169703639282e-05, "loss": 0.4136, "num_input_tokens_seen": 584880, "step": 940 }, { "epoch": 1.6844919786096257, "grad_norm": 1.2087262868881226, "learning_rate": 4.929345435603003e-05, "loss": 0.4801, "num_input_tokens_seen": 587856, "step": 945 }, { "epoch": 1.6934046345811051, "grad_norm": 0.7868252992630005, "learning_rate": 4.9274976540311956e-05, "loss": 0.5347, "num_input_tokens_seen": 590928, "step": 950 }, { "epoch": 1.7023172905525845, "grad_norm": 0.9967821836471558, "learning_rate": 4.9256263768084635e-05, "loss": 0.37, "num_input_tokens_seen": 594096, "step": 955 }, { "epoch": 1.7112299465240641, "grad_norm": 0.8641761541366577, "learning_rate": 4.923731622046823e-05, "loss": 0.3977, "num_input_tokens_seen": 597136, "step": 960 }, { "epoch": 1.7201426024955437, "grad_norm": 0.6801542639732361, "learning_rate": 4.9218134080855273e-05, "loss": 0.5575, "num_input_tokens_seen": 600912, "step": 965 }, { "epoch": 1.7290552584670231, "grad_norm": 0.9356634616851807, "learning_rate": 4.919871753490891e-05, "loss": 0.5977, "num_input_tokens_seen": 604240, "step": 970 }, { "epoch": 1.7379679144385025, "grad_norm": 0.898560106754303, "learning_rate": 4.917906677056111e-05, "loss": 0.4074, "num_input_tokens_seen": 607248, "step": 975 }, { "epoch": 1.7468805704099821, "grad_norm": 0.7507029175758362, "learning_rate": 4.9159181978010814e-05, "loss": 0.4812, "num_input_tokens_seen": 610736, "step": 980 }, { "epoch": 1.7557932263814617, "grad_norm": 0.9444867372512817, "learning_rate": 4.9139063349722113e-05, "loss": 0.4682, "num_input_tokens_seen": 614128, "step": 985 }, { "epoch": 1.7647058823529411, "grad_norm": 0.9318161010742188, "learning_rate": 4.911871108042241e-05, "loss": 0.4571, "num_input_tokens_seen": 617232, "step": 990 }, { "epoch": 1.7736185383244205, "grad_norm": 0.7218228578567505, "learning_rate": 4.909812536710048e-05, "loss": 0.5007, "num_input_tokens_seen": 620880, "step": 995 }, { "epoch": 1.7825311942959001, "grad_norm": 0.7028499841690063, "learning_rate": 4.9077306409004585e-05, "loss": 0.6674, "num_input_tokens_seen": 624368, "step": 1000 }, { "epoch": 1.7914438502673797, "grad_norm": 0.5062604546546936, "learning_rate": 4.9056254407640604e-05, "loss": 0.3413, "num_input_tokens_seen": 627152, "step": 1005 }, { "epoch": 1.8003565062388591, "grad_norm": 0.49366044998168945, "learning_rate": 4.903496956676998e-05, "loss": 0.3736, "num_input_tokens_seen": 629680, "step": 1010 }, { "epoch": 1.8092691622103387, "grad_norm": 0.6387802958488464, "learning_rate": 4.901345209240784e-05, "loss": 0.3377, "num_input_tokens_seen": 632848, "step": 1015 }, { "epoch": 1.8181818181818183, "grad_norm": 0.8644296526908875, "learning_rate": 4.8991702192820924e-05, "loss": 0.4588, "num_input_tokens_seen": 635920, "step": 1020 }, { "epoch": 1.8270944741532977, "grad_norm": 0.4941517114639282, "learning_rate": 4.896972007852563e-05, "loss": 0.3705, "num_input_tokens_seen": 639056, "step": 1025 }, { "epoch": 1.8360071301247771, "grad_norm": 0.5460651516914368, "learning_rate": 4.894750596228594e-05, "loss": 0.3389, "num_input_tokens_seen": 642192, "step": 1030 }, { "epoch": 1.8449197860962567, "grad_norm": 0.7782461643218994, "learning_rate": 4.8925060059111394e-05, "loss": 0.4158, "num_input_tokens_seen": 645488, "step": 1035 }, { "epoch": 1.8538324420677363, "grad_norm": 0.5338404178619385, "learning_rate": 4.890238258625496e-05, "loss": 0.3644, "num_input_tokens_seen": 648336, "step": 1040 }, { "epoch": 1.8627450980392157, "grad_norm": 0.8528239727020264, "learning_rate": 4.887947376321099e-05, "loss": 0.3682, "num_input_tokens_seen": 651696, "step": 1045 }, { "epoch": 1.8716577540106951, "grad_norm": 0.4754684865474701, "learning_rate": 4.885633381171304e-05, "loss": 0.3467, "num_input_tokens_seen": 654640, "step": 1050 }, { "epoch": 1.8805704099821747, "grad_norm": 0.9799590110778809, "learning_rate": 4.883296295573176e-05, "loss": 0.511, "num_input_tokens_seen": 658128, "step": 1055 }, { "epoch": 1.8894830659536543, "grad_norm": 0.6689459085464478, "learning_rate": 4.880936142147271e-05, "loss": 0.3246, "num_input_tokens_seen": 660848, "step": 1060 }, { "epoch": 1.8983957219251337, "grad_norm": 0.7261871099472046, "learning_rate": 4.878552943737418e-05, "loss": 0.2685, "num_input_tokens_seen": 663120, "step": 1065 }, { "epoch": 1.9073083778966131, "grad_norm": 0.7026433944702148, "learning_rate": 4.876146723410498e-05, "loss": 0.3756, "num_input_tokens_seen": 666288, "step": 1070 }, { "epoch": 1.9162210338680927, "grad_norm": 1.4159960746765137, "learning_rate": 4.873717504456219e-05, "loss": 0.3687, "num_input_tokens_seen": 669360, "step": 1075 }, { "epoch": 1.9251336898395723, "grad_norm": 0.7870906591415405, "learning_rate": 4.8712653103868916e-05, "loss": 0.2532, "num_input_tokens_seen": 671344, "step": 1080 }, { "epoch": 1.9340463458110517, "grad_norm": 0.8793025612831116, "learning_rate": 4.868790164937204e-05, "loss": 0.3925, "num_input_tokens_seen": 674672, "step": 1085 }, { "epoch": 1.9429590017825311, "grad_norm": 0.40374019742012024, "learning_rate": 4.8662920920639866e-05, "loss": 0.3251, "num_input_tokens_seen": 677968, "step": 1090 }, { "epoch": 1.9518716577540107, "grad_norm": 0.5041529536247253, "learning_rate": 4.8637711159459855e-05, "loss": 0.3022, "num_input_tokens_seen": 680560, "step": 1095 }, { "epoch": 1.9607843137254903, "grad_norm": 1.0466898679733276, "learning_rate": 4.8612272609836263e-05, "loss": 0.3464, "num_input_tokens_seen": 683824, "step": 1100 }, { "epoch": 1.9696969696969697, "grad_norm": 0.8734254240989685, "learning_rate": 4.858660551798778e-05, "loss": 0.4663, "num_input_tokens_seen": 687216, "step": 1105 }, { "epoch": 1.9786096256684491, "grad_norm": 0.589005172252655, "learning_rate": 4.856071013234513e-05, "loss": 0.3396, "num_input_tokens_seen": 690128, "step": 1110 }, { "epoch": 1.9875222816399287, "grad_norm": 0.570462167263031, "learning_rate": 4.85345867035487e-05, "loss": 0.3839, "num_input_tokens_seen": 693232, "step": 1115 }, { "epoch": 1.9964349376114083, "grad_norm": 0.9086877107620239, "learning_rate": 4.8508235484446095e-05, "loss": 0.4327, "num_input_tokens_seen": 696880, "step": 1120 }, { "epoch": 2.0035650623885917, "eval_loss": 0.37957677245140076, "eval_runtime": 4.2451, "eval_samples_per_second": 58.656, "eval_steps_per_second": 14.841, "num_input_tokens_seen": 699264, "step": 1124 }, { "epoch": 2.0053475935828877, "grad_norm": 0.9719306826591492, "learning_rate": 4.8481656730089695e-05, "loss": 0.4008, "num_input_tokens_seen": 700096, "step": 1125 }, { "epoch": 2.014260249554367, "grad_norm": 0.9481471180915833, "learning_rate": 4.8454850697734174e-05, "loss": 0.4113, "num_input_tokens_seen": 703360, "step": 1130 }, { "epoch": 2.0231729055258465, "grad_norm": 0.7257654666900635, "learning_rate": 4.842781764683403e-05, "loss": 0.3966, "num_input_tokens_seen": 706624, "step": 1135 }, { "epoch": 2.0320855614973263, "grad_norm": 0.8015730977058411, "learning_rate": 4.8400557839041064e-05, "loss": 0.3069, "num_input_tokens_seen": 709472, "step": 1140 }, { "epoch": 2.0409982174688057, "grad_norm": 0.43969354033470154, "learning_rate": 4.837307153820184e-05, "loss": 0.337, "num_input_tokens_seen": 713152, "step": 1145 }, { "epoch": 2.049910873440285, "grad_norm": 0.934760570526123, "learning_rate": 4.8345359010355155e-05, "loss": 0.3539, "num_input_tokens_seen": 716480, "step": 1150 }, { "epoch": 2.0588235294117645, "grad_norm": 0.4905712306499481, "learning_rate": 4.831742052372943e-05, "loss": 0.3069, "num_input_tokens_seen": 719104, "step": 1155 }, { "epoch": 2.0677361853832443, "grad_norm": 0.6868427395820618, "learning_rate": 4.828925634874014e-05, "loss": 0.3006, "num_input_tokens_seen": 722016, "step": 1160 }, { "epoch": 2.0766488413547237, "grad_norm": 0.6591427326202393, "learning_rate": 4.8260866757987177e-05, "loss": 0.2809, "num_input_tokens_seen": 725184, "step": 1165 }, { "epoch": 2.085561497326203, "grad_norm": 1.2832831144332886, "learning_rate": 4.823225202625226e-05, "loss": 0.3441, "num_input_tokens_seen": 728352, "step": 1170 }, { "epoch": 2.0944741532976825, "grad_norm": 0.7174959182739258, "learning_rate": 4.820341243049618e-05, "loss": 0.4048, "num_input_tokens_seen": 731712, "step": 1175 }, { "epoch": 2.1033868092691623, "grad_norm": 0.6431313157081604, "learning_rate": 4.8174348249856236e-05, "loss": 0.3201, "num_input_tokens_seen": 734880, "step": 1180 }, { "epoch": 2.1122994652406417, "grad_norm": 0.658487856388092, "learning_rate": 4.814505976564343e-05, "loss": 0.3509, "num_input_tokens_seen": 737728, "step": 1185 }, { "epoch": 2.121212121212121, "grad_norm": 0.7958409786224365, "learning_rate": 4.8115547261339824e-05, "loss": 0.3429, "num_input_tokens_seen": 741376, "step": 1190 }, { "epoch": 2.1301247771836005, "grad_norm": 0.6729584336280823, "learning_rate": 4.808581102259573e-05, "loss": 0.2909, "num_input_tokens_seen": 744256, "step": 1195 }, { "epoch": 2.1390374331550803, "grad_norm": 0.740015983581543, "learning_rate": 4.8055851337227006e-05, "loss": 0.2479, "num_input_tokens_seen": 746944, "step": 1200 }, { "epoch": 2.1479500891265597, "grad_norm": 0.5458919405937195, "learning_rate": 4.802566849521222e-05, "loss": 0.2943, "num_input_tokens_seen": 750272, "step": 1205 }, { "epoch": 2.156862745098039, "grad_norm": 0.508515477180481, "learning_rate": 4.799526278868987e-05, "loss": 0.2486, "num_input_tokens_seen": 753024, "step": 1210 }, { "epoch": 2.165775401069519, "grad_norm": 0.8448687791824341, "learning_rate": 4.796463451195554e-05, "loss": 0.388, "num_input_tokens_seen": 756576, "step": 1215 }, { "epoch": 2.1746880570409983, "grad_norm": 0.5762525200843811, "learning_rate": 4.7933783961459094e-05, "loss": 0.3068, "num_input_tokens_seen": 759680, "step": 1220 }, { "epoch": 2.1836007130124777, "grad_norm": 0.6639679670333862, "learning_rate": 4.790271143580174e-05, "loss": 0.331, "num_input_tokens_seen": 762880, "step": 1225 }, { "epoch": 2.192513368983957, "grad_norm": 0.5362179279327393, "learning_rate": 4.7871417235733196e-05, "loss": 0.2964, "num_input_tokens_seen": 765920, "step": 1230 }, { "epoch": 2.2014260249554365, "grad_norm": 0.5786792039871216, "learning_rate": 4.783990166414875e-05, "loss": 0.4138, "num_input_tokens_seen": 769728, "step": 1235 }, { "epoch": 2.2103386809269163, "grad_norm": 0.47215279936790466, "learning_rate": 4.780816502608632e-05, "loss": 0.3199, "num_input_tokens_seen": 772832, "step": 1240 }, { "epoch": 2.2192513368983957, "grad_norm": 0.4350599944591522, "learning_rate": 4.777620762872355e-05, "loss": 0.3148, "num_input_tokens_seen": 776352, "step": 1245 }, { "epoch": 2.228163992869875, "grad_norm": 0.6416548490524292, "learning_rate": 4.774402978137479e-05, "loss": 0.3055, "num_input_tokens_seen": 779456, "step": 1250 }, { "epoch": 2.237076648841355, "grad_norm": 0.2961161434650421, "learning_rate": 4.7711631795488096e-05, "loss": 0.2604, "num_input_tokens_seen": 782112, "step": 1255 }, { "epoch": 2.2459893048128343, "grad_norm": 0.5333968997001648, "learning_rate": 4.767901398464227e-05, "loss": 0.346, "num_input_tokens_seen": 784864, "step": 1260 }, { "epoch": 2.2549019607843137, "grad_norm": 0.7181191444396973, "learning_rate": 4.7646176664543763e-05, "loss": 0.2688, "num_input_tokens_seen": 787936, "step": 1265 }, { "epoch": 2.263814616755793, "grad_norm": 1.1632299423217773, "learning_rate": 4.761312015302367e-05, "loss": 0.2973, "num_input_tokens_seen": 790976, "step": 1270 }, { "epoch": 2.2727272727272725, "grad_norm": 1.0037575960159302, "learning_rate": 4.757984477003462e-05, "loss": 0.3304, "num_input_tokens_seen": 794016, "step": 1275 }, { "epoch": 2.2816399286987523, "grad_norm": 0.6830529570579529, "learning_rate": 4.7546350837647666e-05, "loss": 0.2141, "num_input_tokens_seen": 796864, "step": 1280 }, { "epoch": 2.2905525846702317, "grad_norm": 0.7043412327766418, "learning_rate": 4.7512638680049245e-05, "loss": 0.3195, "num_input_tokens_seen": 800096, "step": 1285 }, { "epoch": 2.299465240641711, "grad_norm": 0.6342535018920898, "learning_rate": 4.7478708623537956e-05, "loss": 0.2506, "num_input_tokens_seen": 803392, "step": 1290 }, { "epoch": 2.308377896613191, "grad_norm": 1.047386646270752, "learning_rate": 4.7444560996521415e-05, "loss": 0.3365, "num_input_tokens_seen": 806400, "step": 1295 }, { "epoch": 2.3172905525846703, "grad_norm": 1.372889518737793, "learning_rate": 4.741019612951312e-05, "loss": 0.4817, "num_input_tokens_seen": 809568, "step": 1300 }, { "epoch": 2.3262032085561497, "grad_norm": 0.4855256974697113, "learning_rate": 4.737561435512923e-05, "loss": 0.2226, "num_input_tokens_seen": 812768, "step": 1305 }, { "epoch": 2.335115864527629, "grad_norm": 0.5740591287612915, "learning_rate": 4.734081600808531e-05, "loss": 0.2448, "num_input_tokens_seen": 815968, "step": 1310 }, { "epoch": 2.344028520499109, "grad_norm": 0.5068109631538391, "learning_rate": 4.7305801425193165e-05, "loss": 0.2175, "num_input_tokens_seen": 818976, "step": 1315 }, { "epoch": 2.3529411764705883, "grad_norm": 0.9766526818275452, "learning_rate": 4.727057094535749e-05, "loss": 0.2615, "num_input_tokens_seen": 821760, "step": 1320 }, { "epoch": 2.3618538324420677, "grad_norm": 0.5878629684448242, "learning_rate": 4.72351249095727e-05, "loss": 0.3121, "num_input_tokens_seen": 824288, "step": 1325 }, { "epoch": 2.370766488413547, "grad_norm": 0.8109356760978699, "learning_rate": 4.7199463660919514e-05, "loss": 0.3045, "num_input_tokens_seen": 827424, "step": 1330 }, { "epoch": 2.379679144385027, "grad_norm": 0.6713225245475769, "learning_rate": 4.7163587544561705e-05, "loss": 0.2503, "num_input_tokens_seen": 830176, "step": 1335 }, { "epoch": 2.3885918003565063, "grad_norm": 0.7476429343223572, "learning_rate": 4.7127496907742734e-05, "loss": 0.357, "num_input_tokens_seen": 833664, "step": 1340 }, { "epoch": 2.3975044563279857, "grad_norm": 1.1430628299713135, "learning_rate": 4.709119209978242e-05, "loss": 0.3525, "num_input_tokens_seen": 836736, "step": 1345 }, { "epoch": 2.406417112299465, "grad_norm": 0.5232317447662354, "learning_rate": 4.7054673472073506e-05, "loss": 0.3624, "num_input_tokens_seen": 840160, "step": 1350 }, { "epoch": 2.415329768270945, "grad_norm": 0.9793670773506165, "learning_rate": 4.7017941378078314e-05, "loss": 0.3082, "num_input_tokens_seen": 843168, "step": 1355 }, { "epoch": 2.4242424242424243, "grad_norm": 0.6311604380607605, "learning_rate": 4.698099617332528e-05, "loss": 0.2339, "num_input_tokens_seen": 845952, "step": 1360 }, { "epoch": 2.4331550802139037, "grad_norm": 0.9364222288131714, "learning_rate": 4.694383821540555e-05, "loss": 0.2302, "num_input_tokens_seen": 848448, "step": 1365 }, { "epoch": 2.442067736185383, "grad_norm": 1.2326656579971313, "learning_rate": 4.690646786396945e-05, "loss": 0.2639, "num_input_tokens_seen": 851552, "step": 1370 }, { "epoch": 2.450980392156863, "grad_norm": 0.7579092979431152, "learning_rate": 4.686888548072312e-05, "loss": 0.3276, "num_input_tokens_seen": 854752, "step": 1375 }, { "epoch": 2.4598930481283423, "grad_norm": 0.9993529915809631, "learning_rate": 4.683109142942492e-05, "loss": 0.2741, "num_input_tokens_seen": 857600, "step": 1380 }, { "epoch": 2.4688057040998217, "grad_norm": 0.5094732642173767, "learning_rate": 4.679308607588192e-05, "loss": 0.4073, "num_input_tokens_seen": 861248, "step": 1385 }, { "epoch": 2.477718360071301, "grad_norm": 0.6214059591293335, "learning_rate": 4.6754869787946386e-05, "loss": 0.3205, "num_input_tokens_seen": 865056, "step": 1390 }, { "epoch": 2.486631016042781, "grad_norm": 0.432815283536911, "learning_rate": 4.6716442935512214e-05, "loss": 0.2478, "num_input_tokens_seen": 867936, "step": 1395 }, { "epoch": 2.4955436720142603, "grad_norm": 0.5354329347610474, "learning_rate": 4.6677805890511354e-05, "loss": 0.2816, "num_input_tokens_seen": 871136, "step": 1400 }, { "epoch": 2.5044563279857397, "grad_norm": 0.5837387442588806, "learning_rate": 4.663895902691018e-05, "loss": 0.239, "num_input_tokens_seen": 873600, "step": 1405 }, { "epoch": 2.5044563279857397, "eval_loss": 0.286673367023468, "eval_runtime": 4.2516, "eval_samples_per_second": 58.566, "eval_steps_per_second": 14.818, "num_input_tokens_seen": 873600, "step": 1405 }, { "epoch": 2.5133689839572195, "grad_norm": 0.48573535680770874, "learning_rate": 4.659990272070591e-05, "loss": 0.31, "num_input_tokens_seen": 877152, "step": 1410 }, { "epoch": 2.522281639928699, "grad_norm": 0.5476496815681458, "learning_rate": 4.656063734992294e-05, "loss": 0.2718, "num_input_tokens_seen": 880096, "step": 1415 }, { "epoch": 2.5311942959001783, "grad_norm": 0.5417474508285522, "learning_rate": 4.6521163294609196e-05, "loss": 0.2433, "num_input_tokens_seen": 882944, "step": 1420 }, { "epoch": 2.5401069518716577, "grad_norm": 0.7648299932479858, "learning_rate": 4.6481480936832444e-05, "loss": 0.3607, "num_input_tokens_seen": 886848, "step": 1425 }, { "epoch": 2.549019607843137, "grad_norm": 0.6219758987426758, "learning_rate": 4.644159066067662e-05, "loss": 0.2771, "num_input_tokens_seen": 890272, "step": 1430 }, { "epoch": 2.557932263814617, "grad_norm": 0.6586949825286865, "learning_rate": 4.640149285223806e-05, "loss": 0.2683, "num_input_tokens_seen": 893600, "step": 1435 }, { "epoch": 2.5668449197860963, "grad_norm": 1.156497836112976, "learning_rate": 4.636118789962184e-05, "loss": 0.2513, "num_input_tokens_seen": 896448, "step": 1440 }, { "epoch": 2.5757575757575757, "grad_norm": 0.6117565631866455, "learning_rate": 4.632067619293795e-05, "loss": 0.2491, "num_input_tokens_seen": 899424, "step": 1445 }, { "epoch": 2.5846702317290555, "grad_norm": 0.6213181614875793, "learning_rate": 4.6279958124297554e-05, "loss": 0.2476, "num_input_tokens_seen": 902624, "step": 1450 }, { "epoch": 2.593582887700535, "grad_norm": 0.8394727110862732, "learning_rate": 4.623903408780916e-05, "loss": 0.2327, "num_input_tokens_seen": 905568, "step": 1455 }, { "epoch": 2.6024955436720143, "grad_norm": 0.65825355052948, "learning_rate": 4.619790447957488e-05, "loss": 0.321, "num_input_tokens_seen": 908960, "step": 1460 }, { "epoch": 2.6114081996434937, "grad_norm": 0.7782941460609436, "learning_rate": 4.615656969768649e-05, "loss": 0.2843, "num_input_tokens_seen": 912640, "step": 1465 }, { "epoch": 2.620320855614973, "grad_norm": 0.8492444157600403, "learning_rate": 4.611503014222168e-05, "loss": 0.2464, "num_input_tokens_seen": 915328, "step": 1470 }, { "epoch": 2.629233511586453, "grad_norm": 1.3704971075057983, "learning_rate": 4.6073286215240105e-05, "loss": 0.2942, "num_input_tokens_seen": 918656, "step": 1475 }, { "epoch": 2.6381461675579323, "grad_norm": 0.8433835506439209, "learning_rate": 4.6031338320779534e-05, "loss": 0.2215, "num_input_tokens_seen": 921344, "step": 1480 }, { "epoch": 2.6470588235294117, "grad_norm": 0.5805216431617737, "learning_rate": 4.598918686485193e-05, "loss": 0.2321, "num_input_tokens_seen": 924192, "step": 1485 }, { "epoch": 2.6559714795008915, "grad_norm": 0.4831686317920685, "learning_rate": 4.594683225543952e-05, "loss": 0.2957, "num_input_tokens_seen": 927424, "step": 1490 }, { "epoch": 2.664884135472371, "grad_norm": 0.7766821980476379, "learning_rate": 4.590427490249084e-05, "loss": 0.2587, "num_input_tokens_seen": 930080, "step": 1495 }, { "epoch": 2.6737967914438503, "grad_norm": 0.4486106038093567, "learning_rate": 4.5861515217916785e-05, "loss": 0.202, "num_input_tokens_seen": 932768, "step": 1500 }, { "epoch": 2.6827094474153297, "grad_norm": 0.43728289008140564, "learning_rate": 4.581855361558659e-05, "loss": 0.2685, "num_input_tokens_seen": 935904, "step": 1505 }, { "epoch": 2.691622103386809, "grad_norm": 0.5914068222045898, "learning_rate": 4.577539051132386e-05, "loss": 0.2218, "num_input_tokens_seen": 938784, "step": 1510 }, { "epoch": 2.700534759358289, "grad_norm": 0.4907556176185608, "learning_rate": 4.573202632290252e-05, "loss": 0.2022, "num_input_tokens_seen": 941280, "step": 1515 }, { "epoch": 2.7094474153297683, "grad_norm": 0.7610965967178345, "learning_rate": 4.568846147004279e-05, "loss": 0.2046, "num_input_tokens_seen": 944672, "step": 1520 }, { "epoch": 2.7183600713012477, "grad_norm": 0.7069556713104248, "learning_rate": 4.5644696374407105e-05, "loss": 0.2896, "num_input_tokens_seen": 948032, "step": 1525 }, { "epoch": 2.7272727272727275, "grad_norm": 0.7775002121925354, "learning_rate": 4.560073145959602e-05, "loss": 0.322, "num_input_tokens_seen": 952000, "step": 1530 }, { "epoch": 2.736185383244207, "grad_norm": 0.5535850524902344, "learning_rate": 4.555656715114419e-05, "loss": 0.278, "num_input_tokens_seen": 955456, "step": 1535 }, { "epoch": 2.7450980392156863, "grad_norm": 0.6513121724128723, "learning_rate": 4.551220387651615e-05, "loss": 0.2629, "num_input_tokens_seen": 959232, "step": 1540 }, { "epoch": 2.7540106951871657, "grad_norm": 0.5215713977813721, "learning_rate": 4.546764206510221e-05, "loss": 0.2042, "num_input_tokens_seen": 962304, "step": 1545 }, { "epoch": 2.762923351158645, "grad_norm": 0.5402376651763916, "learning_rate": 4.542288214821433e-05, "loss": 0.213, "num_input_tokens_seen": 965344, "step": 1550 }, { "epoch": 2.771836007130125, "grad_norm": 1.1007705926895142, "learning_rate": 4.5377924559081946e-05, "loss": 0.1996, "num_input_tokens_seen": 968032, "step": 1555 }, { "epoch": 2.7807486631016043, "grad_norm": 0.5571001172065735, "learning_rate": 4.533276973284771e-05, "loss": 0.2281, "num_input_tokens_seen": 970624, "step": 1560 }, { "epoch": 2.7896613190730837, "grad_norm": 0.7429901361465454, "learning_rate": 4.528741810656336e-05, "loss": 0.2868, "num_input_tokens_seen": 973760, "step": 1565 }, { "epoch": 2.7985739750445635, "grad_norm": 0.3642044961452484, "learning_rate": 4.5241870119185426e-05, "loss": 0.2662, "num_input_tokens_seen": 976480, "step": 1570 }, { "epoch": 2.807486631016043, "grad_norm": 0.5374373197555542, "learning_rate": 4.519612621157103e-05, "loss": 0.241, "num_input_tokens_seen": 979328, "step": 1575 }, { "epoch": 2.8163992869875223, "grad_norm": 0.9241515398025513, "learning_rate": 4.515018682647359e-05, "loss": 0.2839, "num_input_tokens_seen": 982624, "step": 1580 }, { "epoch": 2.8253119429590017, "grad_norm": 0.6853222846984863, "learning_rate": 4.510405240853854e-05, "loss": 0.2158, "num_input_tokens_seen": 985664, "step": 1585 }, { "epoch": 2.834224598930481, "grad_norm": 0.5483903884887695, "learning_rate": 4.505772340429905e-05, "loss": 0.2571, "num_input_tokens_seen": 989024, "step": 1590 }, { "epoch": 2.843137254901961, "grad_norm": 0.4872891902923584, "learning_rate": 4.501120026217164e-05, "loss": 0.2331, "num_input_tokens_seen": 992160, "step": 1595 }, { "epoch": 2.8520499108734403, "grad_norm": 0.5892439484596252, "learning_rate": 4.496448343245192e-05, "loss": 0.2645, "num_input_tokens_seen": 995328, "step": 1600 }, { "epoch": 2.8609625668449197, "grad_norm": 0.6122104525566101, "learning_rate": 4.4917573367310184e-05, "loss": 0.3106, "num_input_tokens_seen": 999136, "step": 1605 }, { "epoch": 2.8698752228163995, "grad_norm": 0.657755970954895, "learning_rate": 4.4870470520787035e-05, "loss": 0.2123, "num_input_tokens_seen": 1001920, "step": 1610 }, { "epoch": 2.878787878787879, "grad_norm": 0.6398863196372986, "learning_rate": 4.482317534878901e-05, "loss": 0.385, "num_input_tokens_seen": 1005632, "step": 1615 }, { "epoch": 2.8877005347593583, "grad_norm": 0.9357530474662781, "learning_rate": 4.477568830908415e-05, "loss": 0.2565, "num_input_tokens_seen": 1009408, "step": 1620 }, { "epoch": 2.8966131907308377, "grad_norm": 0.767514705657959, "learning_rate": 4.4728009861297586e-05, "loss": 0.2551, "num_input_tokens_seen": 1012448, "step": 1625 }, { "epoch": 2.905525846702317, "grad_norm": 0.5800440311431885, "learning_rate": 4.468014046690707e-05, "loss": 0.2587, "num_input_tokens_seen": 1015616, "step": 1630 }, { "epoch": 2.914438502673797, "grad_norm": 0.487104207277298, "learning_rate": 4.463208058923851e-05, "loss": 0.2677, "num_input_tokens_seen": 1018944, "step": 1635 }, { "epoch": 2.9233511586452763, "grad_norm": 0.799360454082489, "learning_rate": 4.458383069346152e-05, "loss": 0.2031, "num_input_tokens_seen": 1021696, "step": 1640 }, { "epoch": 2.9322638146167557, "grad_norm": 0.5832977890968323, "learning_rate": 4.453539124658486e-05, "loss": 0.2505, "num_input_tokens_seen": 1024832, "step": 1645 }, { "epoch": 2.9411764705882355, "grad_norm": 0.7471289038658142, "learning_rate": 4.4486762717451975e-05, "loss": 0.2521, "num_input_tokens_seen": 1027712, "step": 1650 }, { "epoch": 2.950089126559715, "grad_norm": 0.5479772090911865, "learning_rate": 4.443794557673641e-05, "loss": 0.2542, "num_input_tokens_seen": 1031040, "step": 1655 }, { "epoch": 2.9590017825311943, "grad_norm": 0.5916025042533875, "learning_rate": 4.43889402969373e-05, "loss": 0.1892, "num_input_tokens_seen": 1033440, "step": 1660 }, { "epoch": 2.9679144385026737, "grad_norm": 0.7155612111091614, "learning_rate": 4.4339747352374726e-05, "loss": 0.2661, "num_input_tokens_seen": 1036864, "step": 1665 }, { "epoch": 2.976827094474153, "grad_norm": 0.4465028941631317, "learning_rate": 4.4290367219185206e-05, "loss": 0.2583, "num_input_tokens_seen": 1039808, "step": 1670 }, { "epoch": 2.985739750445633, "grad_norm": 0.5775701999664307, "learning_rate": 4.424080037531705e-05, "loss": 0.2162, "num_input_tokens_seen": 1043200, "step": 1675 }, { "epoch": 2.9946524064171123, "grad_norm": 0.49966952204704285, "learning_rate": 4.4191047300525704e-05, "loss": 0.1902, "num_input_tokens_seen": 1045504, "step": 1680 }, { "epoch": 3.0035650623885917, "grad_norm": 0.5228843092918396, "learning_rate": 4.414110847636916e-05, "loss": 0.196, "num_input_tokens_seen": 1047768, "step": 1685 }, { "epoch": 3.0053475935828877, "eval_loss": 0.2455865740776062, "eval_runtime": 4.252, "eval_samples_per_second": 58.561, "eval_steps_per_second": 14.817, "num_input_tokens_seen": 1048184, "step": 1686 }, { "epoch": 3.0124777183600715, "grad_norm": 0.3864419162273407, "learning_rate": 4.409098438620326e-05, "loss": 0.1859, "num_input_tokens_seen": 1050456, "step": 1690 }, { "epoch": 3.021390374331551, "grad_norm": 0.7427952885627747, "learning_rate": 4.404067551517703e-05, "loss": 0.2342, "num_input_tokens_seen": 1053592, "step": 1695 }, { "epoch": 3.0303030303030303, "grad_norm": 0.8005133867263794, "learning_rate": 4.399018235022799e-05, "loss": 0.2547, "num_input_tokens_seen": 1056664, "step": 1700 }, { "epoch": 3.0392156862745097, "grad_norm": 0.42377611994743347, "learning_rate": 4.393950538007743e-05, "loss": 0.2227, "num_input_tokens_seen": 1059384, "step": 1705 }, { "epoch": 3.0481283422459895, "grad_norm": 0.4982529878616333, "learning_rate": 4.3888645095225675e-05, "loss": 0.1863, "num_input_tokens_seen": 1062168, "step": 1710 }, { "epoch": 3.057040998217469, "grad_norm": 0.9931812882423401, "learning_rate": 4.383760198794734e-05, "loss": 0.2083, "num_input_tokens_seen": 1064952, "step": 1715 }, { "epoch": 3.0659536541889483, "grad_norm": 0.6572649478912354, "learning_rate": 4.37863765522866e-05, "loss": 0.1863, "num_input_tokens_seen": 1067416, "step": 1720 }, { "epoch": 3.0748663101604277, "grad_norm": 0.6921285390853882, "learning_rate": 4.3734969284052345e-05, "loss": 0.2354, "num_input_tokens_seen": 1070552, "step": 1725 }, { "epoch": 3.0837789661319075, "grad_norm": 0.7747342586517334, "learning_rate": 4.368338068081343e-05, "loss": 0.3332, "num_input_tokens_seen": 1074136, "step": 1730 }, { "epoch": 3.092691622103387, "grad_norm": 1.056235432624817, "learning_rate": 4.3631611241893874e-05, "loss": 0.2396, "num_input_tokens_seen": 1077848, "step": 1735 }, { "epoch": 3.1016042780748663, "grad_norm": 0.7865013480186462, "learning_rate": 4.3579661468367924e-05, "loss": 0.2057, "num_input_tokens_seen": 1080664, "step": 1740 }, { "epoch": 3.1105169340463457, "grad_norm": 0.6681080460548401, "learning_rate": 4.352753186305536e-05, "loss": 0.2823, "num_input_tokens_seen": 1083992, "step": 1745 }, { "epoch": 3.1194295900178255, "grad_norm": 0.4991186559200287, "learning_rate": 4.347522293051648e-05, "loss": 0.2609, "num_input_tokens_seen": 1087800, "step": 1750 }, { "epoch": 3.128342245989305, "grad_norm": 0.5108634829521179, "learning_rate": 4.3422735177047324e-05, "loss": 0.2318, "num_input_tokens_seen": 1090776, "step": 1755 }, { "epoch": 3.1372549019607843, "grad_norm": 1.343435525894165, "learning_rate": 4.337006911067473e-05, "loss": 0.2593, "num_input_tokens_seen": 1093624, "step": 1760 }, { "epoch": 3.1461675579322637, "grad_norm": 0.7029876708984375, "learning_rate": 4.331722524115139e-05, "loss": 0.1993, "num_input_tokens_seen": 1096472, "step": 1765 }, { "epoch": 3.1550802139037435, "grad_norm": 0.5673936605453491, "learning_rate": 4.3264204079950975e-05, "loss": 0.2703, "num_input_tokens_seen": 1099736, "step": 1770 }, { "epoch": 3.163992869875223, "grad_norm": 0.49642717838287354, "learning_rate": 4.321100614026315e-05, "loss": 0.3485, "num_input_tokens_seen": 1103384, "step": 1775 }, { "epoch": 3.1729055258467023, "grad_norm": 0.7280632257461548, "learning_rate": 4.31576319369886e-05, "loss": 0.2451, "num_input_tokens_seen": 1106520, "step": 1780 }, { "epoch": 3.1818181818181817, "grad_norm": 0.642463207244873, "learning_rate": 4.310408198673406e-05, "loss": 0.2062, "num_input_tokens_seen": 1109208, "step": 1785 }, { "epoch": 3.1907308377896615, "grad_norm": 0.7189128994941711, "learning_rate": 4.305035680780732e-05, "loss": 0.2478, "num_input_tokens_seen": 1112536, "step": 1790 }, { "epoch": 3.199643493761141, "grad_norm": 1.2781462669372559, "learning_rate": 4.299645692021221e-05, "loss": 0.2381, "num_input_tokens_seen": 1115992, "step": 1795 }, { "epoch": 3.2085561497326203, "grad_norm": 0.598044753074646, "learning_rate": 4.294238284564354e-05, "loss": 0.2208, "num_input_tokens_seen": 1119192, "step": 1800 }, { "epoch": 3.2174688057040997, "grad_norm": 0.6014571189880371, "learning_rate": 4.2888135107482067e-05, "loss": 0.2393, "num_input_tokens_seen": 1122552, "step": 1805 }, { "epoch": 3.2263814616755795, "grad_norm": 0.8126239776611328, "learning_rate": 4.283371423078945e-05, "loss": 0.2321, "num_input_tokens_seen": 1126072, "step": 1810 }, { "epoch": 3.235294117647059, "grad_norm": 0.6001937985420227, "learning_rate": 4.277912074230312e-05, "loss": 0.1901, "num_input_tokens_seen": 1128792, "step": 1815 }, { "epoch": 3.2442067736185383, "grad_norm": 0.6077953577041626, "learning_rate": 4.272435517043125e-05, "loss": 0.2166, "num_input_tokens_seen": 1132152, "step": 1820 }, { "epoch": 3.2531194295900177, "grad_norm": 0.38485997915267944, "learning_rate": 4.2669418045247576e-05, "loss": 0.2028, "num_input_tokens_seen": 1135064, "step": 1825 }, { "epoch": 3.2620320855614975, "grad_norm": 0.5066972970962524, "learning_rate": 4.2614309898486297e-05, "loss": 0.247, "num_input_tokens_seen": 1137976, "step": 1830 }, { "epoch": 3.270944741532977, "grad_norm": 0.5907444357872009, "learning_rate": 4.25590312635369e-05, "loss": 0.1952, "num_input_tokens_seen": 1141080, "step": 1835 }, { "epoch": 3.2798573975044563, "grad_norm": 0.6255643963813782, "learning_rate": 4.250358267543907e-05, "loss": 0.2124, "num_input_tokens_seen": 1144376, "step": 1840 }, { "epoch": 3.2887700534759357, "grad_norm": 0.9536407589912415, "learning_rate": 4.244796467087741e-05, "loss": 0.23, "num_input_tokens_seen": 1147224, "step": 1845 }, { "epoch": 3.2976827094474155, "grad_norm": 0.7920709252357483, "learning_rate": 4.2392177788176335e-05, "loss": 0.2005, "num_input_tokens_seen": 1150360, "step": 1850 }, { "epoch": 3.306595365418895, "grad_norm": 0.4633888602256775, "learning_rate": 4.2336222567294804e-05, "loss": 0.1962, "num_input_tokens_seen": 1153688, "step": 1855 }, { "epoch": 3.3155080213903743, "grad_norm": 0.384843111038208, "learning_rate": 4.228009954982112e-05, "loss": 0.2039, "num_input_tokens_seen": 1157016, "step": 1860 }, { "epoch": 3.3244206773618536, "grad_norm": 0.4141569435596466, "learning_rate": 4.22238092789677e-05, "loss": 0.2075, "num_input_tokens_seen": 1159768, "step": 1865 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5076260566711426, "learning_rate": 4.2167352299565746e-05, "loss": 0.198, "num_input_tokens_seen": 1162520, "step": 1870 }, { "epoch": 3.342245989304813, "grad_norm": 0.6106960773468018, "learning_rate": 4.21107291580601e-05, "loss": 0.1931, "num_input_tokens_seen": 1165336, "step": 1875 }, { "epoch": 3.3511586452762923, "grad_norm": 0.49231547117233276, "learning_rate": 4.205394040250382e-05, "loss": 0.2574, "num_input_tokens_seen": 1168632, "step": 1880 }, { "epoch": 3.3600713012477716, "grad_norm": 0.5341747403144836, "learning_rate": 4.199698658255298e-05, "loss": 0.2002, "num_input_tokens_seen": 1171352, "step": 1885 }, { "epoch": 3.3689839572192515, "grad_norm": 0.5527672171592712, "learning_rate": 4.193986824946125e-05, "loss": 0.2148, "num_input_tokens_seen": 1174360, "step": 1890 }, { "epoch": 3.377896613190731, "grad_norm": 0.5493122935295105, "learning_rate": 4.188258595607468e-05, "loss": 0.2173, "num_input_tokens_seen": 1177368, "step": 1895 }, { "epoch": 3.3868092691622103, "grad_norm": 0.6076507568359375, "learning_rate": 4.182514025682625e-05, "loss": 0.2365, "num_input_tokens_seen": 1180824, "step": 1900 }, { "epoch": 3.3957219251336896, "grad_norm": 0.38345441222190857, "learning_rate": 4.176753170773052e-05, "loss": 0.237, "num_input_tokens_seen": 1183544, "step": 1905 }, { "epoch": 3.4046345811051695, "grad_norm": 0.8067929744720459, "learning_rate": 4.170976086637832e-05, "loss": 0.1945, "num_input_tokens_seen": 1185848, "step": 1910 }, { "epoch": 3.413547237076649, "grad_norm": 0.5404775142669678, "learning_rate": 4.1651828291931264e-05, "loss": 0.1856, "num_input_tokens_seen": 1189176, "step": 1915 }, { "epoch": 3.4224598930481283, "grad_norm": 0.6067723631858826, "learning_rate": 4.159373454511636e-05, "loss": 0.2464, "num_input_tokens_seen": 1192984, "step": 1920 }, { "epoch": 3.431372549019608, "grad_norm": 0.6056991815567017, "learning_rate": 4.1535480188220636e-05, "loss": 0.2909, "num_input_tokens_seen": 1196888, "step": 1925 }, { "epoch": 3.4402852049910875, "grad_norm": 0.7518835067749023, "learning_rate": 4.1477065785085634e-05, "loss": 0.2496, "num_input_tokens_seen": 1200792, "step": 1930 }, { "epoch": 3.449197860962567, "grad_norm": 0.41140249371528625, "learning_rate": 4.141849190110199e-05, "loss": 0.2267, "num_input_tokens_seen": 1203832, "step": 1935 }, { "epoch": 3.4581105169340463, "grad_norm": 0.44746679067611694, "learning_rate": 4.1359759103203935e-05, "loss": 0.215, "num_input_tokens_seen": 1207160, "step": 1940 }, { "epoch": 3.4670231729055256, "grad_norm": 0.7266998291015625, "learning_rate": 4.130086795986383e-05, "loss": 0.2169, "num_input_tokens_seen": 1210616, "step": 1945 }, { "epoch": 3.4759358288770055, "grad_norm": 0.5968104600906372, "learning_rate": 4.124181904108664e-05, "loss": 0.1875, "num_input_tokens_seen": 1213528, "step": 1950 }, { "epoch": 3.484848484848485, "grad_norm": 0.5463330149650574, "learning_rate": 4.1182612918404466e-05, "loss": 0.1969, "num_input_tokens_seen": 1216568, "step": 1955 }, { "epoch": 3.4937611408199643, "grad_norm": 0.6442824006080627, "learning_rate": 4.1123250164870955e-05, "loss": 0.3184, "num_input_tokens_seen": 1219896, "step": 1960 }, { "epoch": 3.502673796791444, "grad_norm": 0.701900064945221, "learning_rate": 4.1063731355055763e-05, "loss": 0.2079, "num_input_tokens_seen": 1222904, "step": 1965 }, { "epoch": 3.5062388591800357, "eval_loss": 0.22395405173301697, "eval_runtime": 4.2462, "eval_samples_per_second": 58.641, "eval_steps_per_second": 14.837, "num_input_tokens_seen": 1223864, "step": 1967 }, { "epoch": 3.5115864527629235, "grad_norm": 0.39802566170692444, "learning_rate": 4.100405706503904e-05, "loss": 0.158, "num_input_tokens_seen": 1225496, "step": 1970 }, { "epoch": 3.520499108734403, "grad_norm": 0.7380387783050537, "learning_rate": 4.094422787240581e-05, "loss": 0.1725, "num_input_tokens_seen": 1228280, "step": 1975 }, { "epoch": 3.5294117647058822, "grad_norm": 0.6759628653526306, "learning_rate": 4.088424435624038e-05, "loss": 0.2052, "num_input_tokens_seen": 1231288, "step": 1980 }, { "epoch": 3.5383244206773616, "grad_norm": 1.158799409866333, "learning_rate": 4.082410709712077e-05, "loss": 0.2018, "num_input_tokens_seen": 1234456, "step": 1985 }, { "epoch": 3.5472370766488415, "grad_norm": 0.7307495474815369, "learning_rate": 4.0763816677113064e-05, "loss": 0.2669, "num_input_tokens_seen": 1237912, "step": 1990 }, { "epoch": 3.556149732620321, "grad_norm": 0.9738561511039734, "learning_rate": 4.070337367976578e-05, "loss": 0.2444, "num_input_tokens_seen": 1240984, "step": 1995 }, { "epoch": 3.5650623885918002, "grad_norm": 0.5394619703292847, "learning_rate": 4.064277869010421e-05, "loss": 0.2265, "num_input_tokens_seen": 1244280, "step": 2000 }, { "epoch": 3.57397504456328, "grad_norm": 0.7028752565383911, "learning_rate": 4.058203229462482e-05, "loss": 0.2192, "num_input_tokens_seen": 1246904, "step": 2005 }, { "epoch": 3.5828877005347595, "grad_norm": 1.353464126586914, "learning_rate": 4.052113508128948e-05, "loss": 0.2313, "num_input_tokens_seen": 1249880, "step": 2010 }, { "epoch": 3.591800356506239, "grad_norm": 0.8846970796585083, "learning_rate": 4.0460087639519836e-05, "loss": 0.1889, "num_input_tokens_seen": 1252408, "step": 2015 }, { "epoch": 3.6007130124777182, "grad_norm": 1.0351589918136597, "learning_rate": 4.039889056019159e-05, "loss": 0.2567, "num_input_tokens_seen": 1255800, "step": 2020 }, { "epoch": 3.6096256684491976, "grad_norm": 0.6438773274421692, "learning_rate": 4.03375444356288e-05, "loss": 0.2018, "num_input_tokens_seen": 1259160, "step": 2025 }, { "epoch": 3.6185383244206775, "grad_norm": 0.8322818279266357, "learning_rate": 4.0276049859598084e-05, "loss": 0.2269, "num_input_tokens_seen": 1262488, "step": 2030 }, { "epoch": 3.627450980392157, "grad_norm": 0.5302309393882751, "learning_rate": 4.021440742730295e-05, "loss": 0.2032, "num_input_tokens_seen": 1265368, "step": 2035 }, { "epoch": 3.6363636363636362, "grad_norm": 0.8041933178901672, "learning_rate": 4.015261773537799e-05, "loss": 0.2316, "num_input_tokens_seen": 1269112, "step": 2040 }, { "epoch": 3.645276292335116, "grad_norm": 0.5872630476951599, "learning_rate": 4.009068138188311e-05, "loss": 0.2389, "num_input_tokens_seen": 1272408, "step": 2045 }, { "epoch": 3.6541889483065955, "grad_norm": 0.5462104678153992, "learning_rate": 4.002859896629776e-05, "loss": 0.1955, "num_input_tokens_seen": 1275640, "step": 2050 }, { "epoch": 3.663101604278075, "grad_norm": 0.7330032587051392, "learning_rate": 3.99663710895151e-05, "loss": 0.2116, "num_input_tokens_seen": 1278616, "step": 2055 }, { "epoch": 3.6720142602495542, "grad_norm": 0.5604473352432251, "learning_rate": 3.990399835383623e-05, "loss": 0.2285, "num_input_tokens_seen": 1281624, "step": 2060 }, { "epoch": 3.6809269162210336, "grad_norm": 0.49228572845458984, "learning_rate": 3.984148136296431e-05, "loss": 0.2026, "num_input_tokens_seen": 1284216, "step": 2065 }, { "epoch": 3.6898395721925135, "grad_norm": 0.8332962393760681, "learning_rate": 3.977882072199874e-05, "loss": 0.2028, "num_input_tokens_seen": 1286808, "step": 2070 }, { "epoch": 3.698752228163993, "grad_norm": 0.6717101335525513, "learning_rate": 3.971601703742932e-05, "loss": 0.2117, "num_input_tokens_seen": 1289944, "step": 2075 }, { "epoch": 3.7076648841354722, "grad_norm": 0.6963510513305664, "learning_rate": 3.965307091713037e-05, "loss": 0.1899, "num_input_tokens_seen": 1292856, "step": 2080 }, { "epoch": 3.716577540106952, "grad_norm": 0.771668553352356, "learning_rate": 3.95899829703548e-05, "loss": 0.2491, "num_input_tokens_seen": 1296792, "step": 2085 }, { "epoch": 3.7254901960784315, "grad_norm": 0.9969800710678101, "learning_rate": 3.9526753807728295e-05, "loss": 0.2512, "num_input_tokens_seen": 1299800, "step": 2090 }, { "epoch": 3.734402852049911, "grad_norm": 0.5737549066543579, "learning_rate": 3.946338404124334e-05, "loss": 0.1831, "num_input_tokens_seen": 1302648, "step": 2095 }, { "epoch": 3.7433155080213902, "grad_norm": 0.5544306039810181, "learning_rate": 3.939987428425331e-05, "loss": 0.1678, "num_input_tokens_seen": 1305016, "step": 2100 }, { "epoch": 3.7522281639928696, "grad_norm": 0.4125676155090332, "learning_rate": 3.933622515146658e-05, "loss": 0.1715, "num_input_tokens_seen": 1308024, "step": 2105 }, { "epoch": 3.7611408199643495, "grad_norm": 0.6266154646873474, "learning_rate": 3.9272437258940494e-05, "loss": 0.2112, "num_input_tokens_seen": 1310552, "step": 2110 }, { "epoch": 3.770053475935829, "grad_norm": 0.44769471883773804, "learning_rate": 3.9208511224075484e-05, "loss": 0.2325, "num_input_tokens_seen": 1313656, "step": 2115 }, { "epoch": 3.7789661319073082, "grad_norm": 0.5761722922325134, "learning_rate": 3.914444766560902e-05, "loss": 0.2712, "num_input_tokens_seen": 1316728, "step": 2120 }, { "epoch": 3.787878787878788, "grad_norm": 0.556746780872345, "learning_rate": 3.908024720360968e-05, "loss": 0.2286, "num_input_tokens_seen": 1320344, "step": 2125 }, { "epoch": 3.7967914438502675, "grad_norm": 0.45677894353866577, "learning_rate": 3.9015910459471126e-05, "loss": 0.196, "num_input_tokens_seen": 1323416, "step": 2130 }, { "epoch": 3.805704099821747, "grad_norm": 0.6750150322914124, "learning_rate": 3.8951438055906084e-05, "loss": 0.1779, "num_input_tokens_seen": 1326360, "step": 2135 }, { "epoch": 3.8146167557932262, "grad_norm": 0.9360057711601257, "learning_rate": 3.888683061694032e-05, "loss": 0.2523, "num_input_tokens_seen": 1329944, "step": 2140 }, { "epoch": 3.8235294117647056, "grad_norm": 0.4923909604549408, "learning_rate": 3.882208876790661e-05, "loss": 0.1995, "num_input_tokens_seen": 1333080, "step": 2145 }, { "epoch": 3.8324420677361855, "grad_norm": 0.6493288278579712, "learning_rate": 3.8757213135438655e-05, "loss": 0.1972, "num_input_tokens_seen": 1336504, "step": 2150 }, { "epoch": 3.841354723707665, "grad_norm": 0.5835461616516113, "learning_rate": 3.869220434746509e-05, "loss": 0.2229, "num_input_tokens_seen": 1339704, "step": 2155 }, { "epoch": 3.8502673796791442, "grad_norm": 0.6278809309005737, "learning_rate": 3.862706303320329e-05, "loss": 0.2137, "num_input_tokens_seen": 1343032, "step": 2160 }, { "epoch": 3.859180035650624, "grad_norm": 0.7989611625671387, "learning_rate": 3.856178982315342e-05, "loss": 0.2522, "num_input_tokens_seen": 1346104, "step": 2165 }, { "epoch": 3.8680926916221035, "grad_norm": 0.4888596534729004, "learning_rate": 3.849638534909219e-05, "loss": 0.1977, "num_input_tokens_seen": 1348984, "step": 2170 }, { "epoch": 3.877005347593583, "grad_norm": 0.590801477432251, "learning_rate": 3.843085024406686e-05, "loss": 0.2031, "num_input_tokens_seen": 1351480, "step": 2175 }, { "epoch": 3.8859180035650622, "grad_norm": 0.6255959868431091, "learning_rate": 3.836518514238903e-05, "loss": 0.2707, "num_input_tokens_seen": 1355448, "step": 2180 }, { "epoch": 3.8948306595365416, "grad_norm": 0.5446547269821167, "learning_rate": 3.8299390679628555e-05, "loss": 0.1831, "num_input_tokens_seen": 1358392, "step": 2185 }, { "epoch": 3.9037433155080214, "grad_norm": 0.5819702744483948, "learning_rate": 3.8233467492607354e-05, "loss": 0.2039, "num_input_tokens_seen": 1361368, "step": 2190 }, { "epoch": 3.912655971479501, "grad_norm": 0.5366934537887573, "learning_rate": 3.816741621939327e-05, "loss": 0.1955, "num_input_tokens_seen": 1364536, "step": 2195 }, { "epoch": 3.9215686274509802, "grad_norm": 1.1435610055923462, "learning_rate": 3.81012374992939e-05, "loss": 0.2049, "num_input_tokens_seen": 1367800, "step": 2200 }, { "epoch": 3.93048128342246, "grad_norm": 0.5551317930221558, "learning_rate": 3.803493197285036e-05, "loss": 0.2268, "num_input_tokens_seen": 1371224, "step": 2205 }, { "epoch": 3.9393939393939394, "grad_norm": 1.10652756690979, "learning_rate": 3.7968500281831146e-05, "loss": 0.1848, "num_input_tokens_seen": 1373944, "step": 2210 }, { "epoch": 3.948306595365419, "grad_norm": 0.9579757452011108, "learning_rate": 3.79019430692259e-05, "loss": 0.2114, "num_input_tokens_seen": 1377240, "step": 2215 }, { "epoch": 3.9572192513368982, "grad_norm": 0.42045828700065613, "learning_rate": 3.783526097923915e-05, "loss": 0.2034, "num_input_tokens_seen": 1380248, "step": 2220 }, { "epoch": 3.966131907308378, "grad_norm": 0.6384634375572205, "learning_rate": 3.7768454657284154e-05, "loss": 0.1566, "num_input_tokens_seen": 1382712, "step": 2225 }, { "epoch": 3.9750445632798574, "grad_norm": 0.9116731882095337, "learning_rate": 3.770152474997657e-05, "loss": 0.2102, "num_input_tokens_seen": 1385976, "step": 2230 }, { "epoch": 3.983957219251337, "grad_norm": 0.6810240149497986, "learning_rate": 3.763447190512824e-05, "loss": 0.2052, "num_input_tokens_seen": 1389624, "step": 2235 }, { "epoch": 3.9928698752228167, "grad_norm": 0.3541090488433838, "learning_rate": 3.7567296771740925e-05, "loss": 0.244, "num_input_tokens_seen": 1392728, "step": 2240 }, { "epoch": 4.001782531194296, "grad_norm": 1.0409997701644897, "learning_rate": 3.7500000000000003e-05, "loss": 0.2358, "num_input_tokens_seen": 1395704, "step": 2245 }, { "epoch": 4.007130124777183, "eval_loss": 0.21653257310390472, "eval_runtime": 4.2509, "eval_samples_per_second": 58.576, "eval_steps_per_second": 14.82, "num_input_tokens_seen": 1397624, "step": 2248 }, { "epoch": 4.010695187165775, "grad_norm": 0.5523825287818909, "learning_rate": 3.743258224126819e-05, "loss": 0.1735, "num_input_tokens_seen": 1398584, "step": 2250 }, { "epoch": 4.019607843137255, "grad_norm": 0.7276411652565002, "learning_rate": 3.736504414807922e-05, "loss": 0.1992, "num_input_tokens_seen": 1401784, "step": 2255 }, { "epoch": 4.028520499108734, "grad_norm": 0.36699721217155457, "learning_rate": 3.729738637413156e-05, "loss": 0.1728, "num_input_tokens_seen": 1404312, "step": 2260 }, { "epoch": 4.037433155080214, "grad_norm": 0.7663154006004333, "learning_rate": 3.722960957428203e-05, "loss": 0.1866, "num_input_tokens_seen": 1407352, "step": 2265 }, { "epoch": 4.046345811051693, "grad_norm": 0.4959503412246704, "learning_rate": 3.716171440453952e-05, "loss": 0.1823, "num_input_tokens_seen": 1410648, "step": 2270 }, { "epoch": 4.055258467023173, "grad_norm": 0.6325064897537231, "learning_rate": 3.709370152205863e-05, "loss": 0.1698, "num_input_tokens_seen": 1413816, "step": 2275 }, { "epoch": 4.064171122994653, "grad_norm": 0.4548736810684204, "learning_rate": 3.7025571585133254e-05, "loss": 0.1626, "num_input_tokens_seen": 1416024, "step": 2280 }, { "epoch": 4.073083778966132, "grad_norm": 0.3842249810695648, "learning_rate": 3.69573252531903e-05, "loss": 0.1929, "num_input_tokens_seen": 1419128, "step": 2285 }, { "epoch": 4.081996434937611, "grad_norm": 0.6341343522071838, "learning_rate": 3.6888963186783224e-05, "loss": 0.1625, "num_input_tokens_seen": 1421720, "step": 2290 }, { "epoch": 4.090909090909091, "grad_norm": 0.5091090798377991, "learning_rate": 3.682048604758567e-05, "loss": 0.1771, "num_input_tokens_seen": 1424632, "step": 2295 }, { "epoch": 4.09982174688057, "grad_norm": 0.24424993991851807, "learning_rate": 3.67518944983851e-05, "loss": 0.1739, "num_input_tokens_seen": 1427480, "step": 2300 }, { "epoch": 4.10873440285205, "grad_norm": 0.589100182056427, "learning_rate": 3.668318920307632e-05, "loss": 0.2092, "num_input_tokens_seen": 1430296, "step": 2305 }, { "epoch": 4.117647058823529, "grad_norm": 0.41250258684158325, "learning_rate": 3.6614370826655074e-05, "loss": 0.1714, "num_input_tokens_seen": 1432920, "step": 2310 }, { "epoch": 4.126559714795009, "grad_norm": 0.7590497136116028, "learning_rate": 3.654544003521164e-05, "loss": 0.2039, "num_input_tokens_seen": 1435544, "step": 2315 }, { "epoch": 4.135472370766489, "grad_norm": 0.8127907514572144, "learning_rate": 3.647639749592433e-05, "loss": 0.1583, "num_input_tokens_seen": 1438040, "step": 2320 }, { "epoch": 4.144385026737968, "grad_norm": 0.6445732712745667, "learning_rate": 3.640724387705308e-05, "loss": 0.2149, "num_input_tokens_seen": 1441528, "step": 2325 }, { "epoch": 4.153297682709447, "grad_norm": 0.44771522283554077, "learning_rate": 3.633797984793294e-05, "loss": 0.1543, "num_input_tokens_seen": 1444920, "step": 2330 }, { "epoch": 4.162210338680927, "grad_norm": 0.47167617082595825, "learning_rate": 3.626860607896764e-05, "loss": 0.2014, "num_input_tokens_seen": 1447896, "step": 2335 }, { "epoch": 4.171122994652406, "grad_norm": 0.49547502398490906, "learning_rate": 3.6199123241623046e-05, "loss": 0.2085, "num_input_tokens_seen": 1451256, "step": 2340 }, { "epoch": 4.180035650623886, "grad_norm": 0.5464377403259277, "learning_rate": 3.6129532008420715e-05, "loss": 0.1821, "num_input_tokens_seen": 1454136, "step": 2345 }, { "epoch": 4.188948306595365, "grad_norm": 0.44719406962394714, "learning_rate": 3.605983305293137e-05, "loss": 0.1703, "num_input_tokens_seen": 1456504, "step": 2350 }, { "epoch": 4.197860962566845, "grad_norm": 0.905034065246582, "learning_rate": 3.599002704976835e-05, "loss": 0.1734, "num_input_tokens_seen": 1459768, "step": 2355 }, { "epoch": 4.206773618538325, "grad_norm": 0.3426745533943176, "learning_rate": 3.592011467458113e-05, "loss": 0.1501, "num_input_tokens_seen": 1462392, "step": 2360 }, { "epoch": 4.215686274509804, "grad_norm": 1.105431318283081, "learning_rate": 3.585009660404873e-05, "loss": 0.2289, "num_input_tokens_seen": 1466040, "step": 2365 }, { "epoch": 4.224598930481283, "grad_norm": 0.6577187776565552, "learning_rate": 3.577997351587322e-05, "loss": 0.2166, "num_input_tokens_seen": 1469208, "step": 2370 }, { "epoch": 4.233511586452763, "grad_norm": 0.5719982981681824, "learning_rate": 3.5709746088773085e-05, "loss": 0.222, "num_input_tokens_seen": 1472536, "step": 2375 }, { "epoch": 4.242424242424242, "grad_norm": 0.4010562598705292, "learning_rate": 3.563941500247676e-05, "loss": 0.1836, "num_input_tokens_seen": 1475608, "step": 2380 }, { "epoch": 4.251336898395722, "grad_norm": 0.6845771074295044, "learning_rate": 3.5568980937715945e-05, "loss": 0.1762, "num_input_tokens_seen": 1479256, "step": 2385 }, { "epoch": 4.260249554367201, "grad_norm": 0.5753139853477478, "learning_rate": 3.54984445762191e-05, "loss": 0.2054, "num_input_tokens_seen": 1483064, "step": 2390 }, { "epoch": 4.269162210338681, "grad_norm": 0.586729109287262, "learning_rate": 3.5427806600704785e-05, "loss": 0.1733, "num_input_tokens_seen": 1485880, "step": 2395 }, { "epoch": 4.278074866310161, "grad_norm": 0.5614349842071533, "learning_rate": 3.535706769487509e-05, "loss": 0.1777, "num_input_tokens_seen": 1489208, "step": 2400 }, { "epoch": 4.28698752228164, "grad_norm": 0.6715386509895325, "learning_rate": 3.5286228543409004e-05, "loss": 0.1883, "num_input_tokens_seen": 1492216, "step": 2405 }, { "epoch": 4.295900178253119, "grad_norm": 0.5051096677780151, "learning_rate": 3.5215289831955786e-05, "loss": 0.2037, "num_input_tokens_seen": 1495960, "step": 2410 }, { "epoch": 4.304812834224599, "grad_norm": 0.8140228390693665, "learning_rate": 3.514425224712835e-05, "loss": 0.1892, "num_input_tokens_seen": 1498584, "step": 2415 }, { "epoch": 4.313725490196078, "grad_norm": 0.45702996850013733, "learning_rate": 3.507311647649657e-05, "loss": 0.179, "num_input_tokens_seen": 1501880, "step": 2420 }, { "epoch": 4.322638146167558, "grad_norm": 0.6330050230026245, "learning_rate": 3.5001883208580665e-05, "loss": 0.1901, "num_input_tokens_seen": 1505112, "step": 2425 }, { "epoch": 4.331550802139038, "grad_norm": 0.5689657330513, "learning_rate": 3.493055313284456e-05, "loss": 0.2295, "num_input_tokens_seen": 1507768, "step": 2430 }, { "epoch": 4.340463458110517, "grad_norm": 0.9648520946502686, "learning_rate": 3.485912693968913e-05, "loss": 0.2049, "num_input_tokens_seen": 1511224, "step": 2435 }, { "epoch": 4.349376114081997, "grad_norm": 0.4425726532936096, "learning_rate": 3.478760532044561e-05, "loss": 0.2032, "num_input_tokens_seen": 1514456, "step": 2440 }, { "epoch": 4.358288770053476, "grad_norm": 0.5605233311653137, "learning_rate": 3.471598896736881e-05, "loss": 0.207, "num_input_tokens_seen": 1517400, "step": 2445 }, { "epoch": 4.367201426024955, "grad_norm": 0.5907042622566223, "learning_rate": 3.464427857363052e-05, "loss": 0.2018, "num_input_tokens_seen": 1520664, "step": 2450 }, { "epoch": 4.376114081996435, "grad_norm": 0.8678156137466431, "learning_rate": 3.457247483331272e-05, "loss": 0.2408, "num_input_tokens_seen": 1523960, "step": 2455 }, { "epoch": 4.385026737967914, "grad_norm": 0.4271613359451294, "learning_rate": 3.4500578441400876e-05, "loss": 0.1568, "num_input_tokens_seen": 1526616, "step": 2460 }, { "epoch": 4.393939393939394, "grad_norm": 1.1846132278442383, "learning_rate": 3.4428590093777244e-05, "loss": 0.3417, "num_input_tokens_seen": 1530808, "step": 2465 }, { "epoch": 4.402852049910873, "grad_norm": 0.49708229303359985, "learning_rate": 3.43565104872141e-05, "loss": 0.1599, "num_input_tokens_seen": 1533336, "step": 2470 }, { "epoch": 4.411764705882353, "grad_norm": 0.35631561279296875, "learning_rate": 3.428434031936704e-05, "loss": 0.1646, "num_input_tokens_seen": 1535864, "step": 2475 }, { "epoch": 4.420677361853833, "grad_norm": 0.6264846324920654, "learning_rate": 3.421208028876815e-05, "loss": 0.2114, "num_input_tokens_seen": 1539192, "step": 2480 }, { "epoch": 4.429590017825312, "grad_norm": 0.3950527310371399, "learning_rate": 3.413973109481935e-05, "loss": 0.227, "num_input_tokens_seen": 1542712, "step": 2485 }, { "epoch": 4.438502673796791, "grad_norm": 0.7369870543479919, "learning_rate": 3.406729343778552e-05, "loss": 0.1871, "num_input_tokens_seen": 1545272, "step": 2490 }, { "epoch": 4.447415329768271, "grad_norm": 0.549528956413269, "learning_rate": 3.3994768018787815e-05, "loss": 0.3024, "num_input_tokens_seen": 1549464, "step": 2495 }, { "epoch": 4.45632798573975, "grad_norm": 0.5840650796890259, "learning_rate": 3.392215553979679e-05, "loss": 0.2244, "num_input_tokens_seen": 1552280, "step": 2500 }, { "epoch": 4.46524064171123, "grad_norm": 0.399300754070282, "learning_rate": 3.38494567036257e-05, "loss": 0.2032, "num_input_tokens_seen": 1555448, "step": 2505 }, { "epoch": 4.47415329768271, "grad_norm": 0.47554269433021545, "learning_rate": 3.3776672213923587e-05, "loss": 0.2211, "num_input_tokens_seen": 1559480, "step": 2510 }, { "epoch": 4.483065953654189, "grad_norm": 0.3855815827846527, "learning_rate": 3.370380277516858e-05, "loss": 0.1718, "num_input_tokens_seen": 1562872, "step": 2515 }, { "epoch": 4.491978609625669, "grad_norm": 0.5743004679679871, "learning_rate": 3.3630849092661e-05, "loss": 0.183, "num_input_tokens_seen": 1565752, "step": 2520 }, { "epoch": 4.500891265597148, "grad_norm": 0.527409553527832, "learning_rate": 3.355781187251657e-05, "loss": 0.1778, "num_input_tokens_seen": 1568600, "step": 2525 }, { "epoch": 4.508021390374331, "eval_loss": 0.2118549942970276, "eval_runtime": 4.2596, "eval_samples_per_second": 58.457, "eval_steps_per_second": 14.79, "num_input_tokens_seen": 1570936, "step": 2529 }, { "epoch": 4.509803921568627, "grad_norm": 0.39879217743873596, "learning_rate": 3.3484691821659584e-05, "loss": 0.1747, "num_input_tokens_seen": 1571512, "step": 2530 }, { "epoch": 4.518716577540107, "grad_norm": 0.5035882592201233, "learning_rate": 3.3411489647816016e-05, "loss": 0.1871, "num_input_tokens_seen": 1574232, "step": 2535 }, { "epoch": 4.527629233511586, "grad_norm": 1.1074864864349365, "learning_rate": 3.3338206059506736e-05, "loss": 0.2403, "num_input_tokens_seen": 1577816, "step": 2540 }, { "epoch": 4.536541889483066, "grad_norm": 0.8603164553642273, "learning_rate": 3.326484176604061e-05, "loss": 0.2662, "num_input_tokens_seen": 1581368, "step": 2545 }, { "epoch": 4.545454545454545, "grad_norm": 0.43185243010520935, "learning_rate": 3.3191397477507655e-05, "loss": 0.1828, "num_input_tokens_seen": 1583800, "step": 2550 }, { "epoch": 4.554367201426025, "grad_norm": 0.397795170545578, "learning_rate": 3.3117873904772123e-05, "loss": 0.206, "num_input_tokens_seen": 1587384, "step": 2555 }, { "epoch": 4.563279857397505, "grad_norm": 0.7756383419036865, "learning_rate": 3.30442717594657e-05, "loss": 0.1919, "num_input_tokens_seen": 1590328, "step": 2560 }, { "epoch": 4.572192513368984, "grad_norm": 0.7332653999328613, "learning_rate": 3.297059175398056e-05, "loss": 0.2376, "num_input_tokens_seen": 1594136, "step": 2565 }, { "epoch": 4.581105169340463, "grad_norm": 0.541881799697876, "learning_rate": 3.289683460146244e-05, "loss": 0.1923, "num_input_tokens_seen": 1597656, "step": 2570 }, { "epoch": 4.590017825311943, "grad_norm": 0.48139122128486633, "learning_rate": 3.282300101580386e-05, "loss": 0.198, "num_input_tokens_seen": 1600536, "step": 2575 }, { "epoch": 4.598930481283422, "grad_norm": 0.7859025001525879, "learning_rate": 3.274909171163706e-05, "loss": 0.1965, "num_input_tokens_seen": 1603832, "step": 2580 }, { "epoch": 4.607843137254902, "grad_norm": 0.8468954563140869, "learning_rate": 3.2675107404327194e-05, "loss": 0.1882, "num_input_tokens_seen": 1607480, "step": 2585 }, { "epoch": 4.616755793226382, "grad_norm": 0.6784586310386658, "learning_rate": 3.2601048809965355e-05, "loss": 0.187, "num_input_tokens_seen": 1610296, "step": 2590 }, { "epoch": 4.625668449197861, "grad_norm": 0.4848667085170746, "learning_rate": 3.2526916645361666e-05, "loss": 0.1797, "num_input_tokens_seen": 1613336, "step": 2595 }, { "epoch": 4.634581105169341, "grad_norm": 0.4509483575820923, "learning_rate": 3.2452711628038324e-05, "loss": 0.159, "num_input_tokens_seen": 1616152, "step": 2600 }, { "epoch": 4.64349376114082, "grad_norm": 0.9891667366027832, "learning_rate": 3.2378434476222666e-05, "loss": 0.2153, "num_input_tokens_seen": 1620024, "step": 2605 }, { "epoch": 4.652406417112299, "grad_norm": 0.45274657011032104, "learning_rate": 3.2304085908840244e-05, "loss": 0.1975, "num_input_tokens_seen": 1623544, "step": 2610 }, { "epoch": 4.661319073083779, "grad_norm": 0.5668216943740845, "learning_rate": 3.222966664550777e-05, "loss": 0.1748, "num_input_tokens_seen": 1626296, "step": 2615 }, { "epoch": 4.670231729055258, "grad_norm": 0.6975745558738708, "learning_rate": 3.2155177406526304e-05, "loss": 0.1868, "num_input_tokens_seen": 1629336, "step": 2620 }, { "epoch": 4.6791443850267385, "grad_norm": 0.7208099961280823, "learning_rate": 3.208061891287414e-05, "loss": 0.214, "num_input_tokens_seen": 1632888, "step": 2625 }, { "epoch": 4.688057040998218, "grad_norm": 0.41192349791526794, "learning_rate": 3.200599188619989e-05, "loss": 0.1753, "num_input_tokens_seen": 1635768, "step": 2630 }, { "epoch": 4.696969696969697, "grad_norm": 1.2426398992538452, "learning_rate": 3.1931297048815534e-05, "loss": 0.2339, "num_input_tokens_seen": 1639256, "step": 2635 }, { "epoch": 4.705882352941177, "grad_norm": 0.4843774735927582, "learning_rate": 3.185653512368933e-05, "loss": 0.2591, "num_input_tokens_seen": 1643128, "step": 2640 }, { "epoch": 4.714795008912656, "grad_norm": 0.6016537547111511, "learning_rate": 3.178170683443893e-05, "loss": 0.1748, "num_input_tokens_seen": 1646424, "step": 2645 }, { "epoch": 4.723707664884135, "grad_norm": 0.5028678178787231, "learning_rate": 3.1706812905324276e-05, "loss": 0.1844, "num_input_tokens_seen": 1649240, "step": 2650 }, { "epoch": 4.732620320855615, "grad_norm": 0.694146454334259, "learning_rate": 3.1631854061240684e-05, "loss": 0.1668, "num_input_tokens_seen": 1652184, "step": 2655 }, { "epoch": 4.741532976827094, "grad_norm": 0.6105802655220032, "learning_rate": 3.155683102771173e-05, "loss": 0.2189, "num_input_tokens_seen": 1655480, "step": 2660 }, { "epoch": 4.750445632798574, "grad_norm": 0.8289818167686462, "learning_rate": 3.1481744530882305e-05, "loss": 0.2437, "num_input_tokens_seen": 1659352, "step": 2665 }, { "epoch": 4.759358288770054, "grad_norm": 0.5131431221961975, "learning_rate": 3.1406595297511566e-05, "loss": 0.1756, "num_input_tokens_seen": 1661976, "step": 2670 }, { "epoch": 4.768270944741533, "grad_norm": 0.6698647737503052, "learning_rate": 3.133138405496587e-05, "loss": 0.1713, "num_input_tokens_seen": 1664504, "step": 2675 }, { "epoch": 4.777183600713013, "grad_norm": 0.5975663065910339, "learning_rate": 3.125611153121178e-05, "loss": 0.1763, "num_input_tokens_seen": 1667288, "step": 2680 }, { "epoch": 4.786096256684492, "grad_norm": 0.5346847772598267, "learning_rate": 3.118077845480897e-05, "loss": 0.1686, "num_input_tokens_seen": 1670360, "step": 2685 }, { "epoch": 4.795008912655971, "grad_norm": 0.5491595268249512, "learning_rate": 3.110538555490324e-05, "loss": 0.1884, "num_input_tokens_seen": 1673624, "step": 2690 }, { "epoch": 4.803921568627451, "grad_norm": 0.35313117504119873, "learning_rate": 3.1029933561219375e-05, "loss": 0.1675, "num_input_tokens_seen": 1676440, "step": 2695 }, { "epoch": 4.81283422459893, "grad_norm": 0.5857532024383545, "learning_rate": 3.095442320405418e-05, "loss": 0.1637, "num_input_tokens_seen": 1679448, "step": 2700 }, { "epoch": 4.8217468805704105, "grad_norm": 0.6775690913200378, "learning_rate": 3.0878855214269293e-05, "loss": 0.1642, "num_input_tokens_seen": 1682520, "step": 2705 }, { "epoch": 4.83065953654189, "grad_norm": 0.5732465386390686, "learning_rate": 3.0803230323284225e-05, "loss": 0.1834, "num_input_tokens_seen": 1685656, "step": 2710 }, { "epoch": 4.839572192513369, "grad_norm": 1.1239274740219116, "learning_rate": 3.0727549263069224e-05, "loss": 0.2211, "num_input_tokens_seen": 1688856, "step": 2715 }, { "epoch": 4.848484848484849, "grad_norm": 0.8710312247276306, "learning_rate": 3.065181276613817e-05, "loss": 0.1483, "num_input_tokens_seen": 1691768, "step": 2720 }, { "epoch": 4.857397504456328, "grad_norm": 0.28014299273490906, "learning_rate": 3.057602156554155e-05, "loss": 0.1538, "num_input_tokens_seen": 1694488, "step": 2725 }, { "epoch": 4.866310160427807, "grad_norm": 0.5496522784233093, "learning_rate": 3.0500176394859293e-05, "loss": 0.2051, "num_input_tokens_seen": 1697752, "step": 2730 }, { "epoch": 4.875222816399287, "grad_norm": 0.673943817615509, "learning_rate": 3.042427798819373e-05, "loss": 0.1897, "num_input_tokens_seen": 1700408, "step": 2735 }, { "epoch": 4.884135472370766, "grad_norm": 0.7624504566192627, "learning_rate": 3.0348327080162435e-05, "loss": 0.1842, "num_input_tokens_seen": 1703512, "step": 2740 }, { "epoch": 4.893048128342246, "grad_norm": 0.5836613774299622, "learning_rate": 3.0272324405891172e-05, "loss": 0.1811, "num_input_tokens_seen": 1707032, "step": 2745 }, { "epoch": 4.901960784313726, "grad_norm": 0.6330267190933228, "learning_rate": 3.0196270701006706e-05, "loss": 0.1925, "num_input_tokens_seen": 1710328, "step": 2750 }, { "epoch": 4.910873440285205, "grad_norm": 0.764445960521698, "learning_rate": 3.012016670162977e-05, "loss": 0.1888, "num_input_tokens_seen": 1712632, "step": 2755 }, { "epoch": 4.919786096256685, "grad_norm": 0.3074583113193512, "learning_rate": 3.0044013144367866e-05, "loss": 0.2241, "num_input_tokens_seen": 1716344, "step": 2760 }, { "epoch": 4.928698752228164, "grad_norm": 0.4822777509689331, "learning_rate": 2.996781076630816e-05, "loss": 0.1661, "num_input_tokens_seen": 1718712, "step": 2765 }, { "epoch": 4.937611408199643, "grad_norm": 0.56252521276474, "learning_rate": 2.9891560305010392e-05, "loss": 0.1863, "num_input_tokens_seen": 1722328, "step": 2770 }, { "epoch": 4.946524064171123, "grad_norm": 0.5701931118965149, "learning_rate": 2.9815262498499657e-05, "loss": 0.2022, "num_input_tokens_seen": 1725464, "step": 2775 }, { "epoch": 4.955436720142602, "grad_norm": 0.6118953227996826, "learning_rate": 2.9738918085259314e-05, "loss": 0.1703, "num_input_tokens_seen": 1728472, "step": 2780 }, { "epoch": 4.9643493761140824, "grad_norm": 0.43155810236930847, "learning_rate": 2.9662527804223827e-05, "loss": 0.1658, "num_input_tokens_seen": 1731160, "step": 2785 }, { "epoch": 4.973262032085562, "grad_norm": 0.622303307056427, "learning_rate": 2.9586092394771637e-05, "loss": 0.2174, "num_input_tokens_seen": 1734264, "step": 2790 }, { "epoch": 4.982174688057041, "grad_norm": 0.592126727104187, "learning_rate": 2.950961259671793e-05, "loss": 0.1573, "num_input_tokens_seen": 1737144, "step": 2795 }, { "epoch": 4.991087344028521, "grad_norm": 0.4473949372768402, "learning_rate": 2.943308915030757e-05, "loss": 0.1619, "num_input_tokens_seen": 1740664, "step": 2800 }, { "epoch": 5.0, "grad_norm": 1.4496628046035767, "learning_rate": 2.935652279620788e-05, "loss": 0.194, "num_input_tokens_seen": 1743216, "step": 2805 }, { "epoch": 5.008912655971479, "grad_norm": 0.5206677913665771, "learning_rate": 2.9279914275501473e-05, "loss": 0.2055, "num_input_tokens_seen": 1746384, "step": 2810 }, { "epoch": 5.008912655971479, "eval_loss": 0.19685669243335724, "eval_runtime": 4.2355, "eval_samples_per_second": 58.788, "eval_steps_per_second": 14.874, "num_input_tokens_seen": 1746384, "step": 2810 }, { "epoch": 5.017825311942959, "grad_norm": 0.46784770488739014, "learning_rate": 2.9203264329679115e-05, "loss": 0.1835, "num_input_tokens_seen": 1749680, "step": 2815 }, { "epoch": 5.026737967914438, "grad_norm": 0.9836930632591248, "learning_rate": 2.9126573700632504e-05, "loss": 0.1855, "num_input_tokens_seen": 1753104, "step": 2820 }, { "epoch": 5.035650623885918, "grad_norm": 0.48144713044166565, "learning_rate": 2.9049843130647112e-05, "loss": 0.1857, "num_input_tokens_seen": 1756112, "step": 2825 }, { "epoch": 5.044563279857398, "grad_norm": 0.49128931760787964, "learning_rate": 2.8973073362394998e-05, "loss": 0.1802, "num_input_tokens_seen": 1759344, "step": 2830 }, { "epoch": 5.053475935828877, "grad_norm": 0.4599247872829437, "learning_rate": 2.8896265138927638e-05, "loss": 0.1939, "num_input_tokens_seen": 1762288, "step": 2835 }, { "epoch": 5.062388591800357, "grad_norm": 0.4987725615501404, "learning_rate": 2.881941920366868e-05, "loss": 0.1583, "num_input_tokens_seen": 1765072, "step": 2840 }, { "epoch": 5.071301247771836, "grad_norm": 0.4939536452293396, "learning_rate": 2.8742536300406804e-05, "loss": 0.2022, "num_input_tokens_seen": 1767952, "step": 2845 }, { "epoch": 5.080213903743315, "grad_norm": 0.2937607765197754, "learning_rate": 2.8665617173288516e-05, "loss": 0.1696, "num_input_tokens_seen": 1770896, "step": 2850 }, { "epoch": 5.089126559714795, "grad_norm": 0.6866093277931213, "learning_rate": 2.8588662566810893e-05, "loss": 0.1683, "num_input_tokens_seen": 1773840, "step": 2855 }, { "epoch": 5.098039215686274, "grad_norm": 0.5026021003723145, "learning_rate": 2.851167322581445e-05, "loss": 0.1924, "num_input_tokens_seen": 1776720, "step": 2860 }, { "epoch": 5.106951871657754, "grad_norm": 0.5058155059814453, "learning_rate": 2.8434649895475877e-05, "loss": 0.1572, "num_input_tokens_seen": 1779088, "step": 2865 }, { "epoch": 5.115864527629234, "grad_norm": 0.47404804825782776, "learning_rate": 2.8357593321300856e-05, "loss": 0.1753, "num_input_tokens_seen": 1781776, "step": 2870 }, { "epoch": 5.124777183600713, "grad_norm": 0.5163501501083374, "learning_rate": 2.828050424911683e-05, "loss": 0.1685, "num_input_tokens_seen": 1784720, "step": 2875 }, { "epoch": 5.133689839572193, "grad_norm": 0.6680046319961548, "learning_rate": 2.8203383425065787e-05, "loss": 0.1854, "num_input_tokens_seen": 1787856, "step": 2880 }, { "epoch": 5.142602495543672, "grad_norm": 0.47441810369491577, "learning_rate": 2.812623159559704e-05, "loss": 0.1793, "num_input_tokens_seen": 1791088, "step": 2885 }, { "epoch": 5.151515151515151, "grad_norm": 0.4247751533985138, "learning_rate": 2.8049049507460003e-05, "loss": 0.2227, "num_input_tokens_seen": 1795056, "step": 2890 }, { "epoch": 5.160427807486631, "grad_norm": 0.4086715281009674, "learning_rate": 2.7971837907696973e-05, "loss": 0.2894, "num_input_tokens_seen": 1798928, "step": 2895 }, { "epoch": 5.16934046345811, "grad_norm": 0.48060083389282227, "learning_rate": 2.7894597543635863e-05, "loss": 0.1778, "num_input_tokens_seen": 1802384, "step": 2900 }, { "epoch": 5.17825311942959, "grad_norm": 0.5457305312156677, "learning_rate": 2.781732916288303e-05, "loss": 0.1873, "num_input_tokens_seen": 1805616, "step": 2905 }, { "epoch": 5.18716577540107, "grad_norm": 0.7138332724571228, "learning_rate": 2.774003351331597e-05, "loss": 0.1532, "num_input_tokens_seen": 1809008, "step": 2910 }, { "epoch": 5.196078431372549, "grad_norm": 0.5133665204048157, "learning_rate": 2.7662711343076135e-05, "loss": 0.1604, "num_input_tokens_seen": 1812784, "step": 2915 }, { "epoch": 5.204991087344029, "grad_norm": 0.48487603664398193, "learning_rate": 2.7585363400561658e-05, "loss": 0.155, "num_input_tokens_seen": 1815248, "step": 2920 }, { "epoch": 5.213903743315508, "grad_norm": 0.5267552137374878, "learning_rate": 2.7507990434420126e-05, "loss": 0.186, "num_input_tokens_seen": 1818032, "step": 2925 }, { "epoch": 5.222816399286987, "grad_norm": 0.45045390725135803, "learning_rate": 2.7430593193541325e-05, "loss": 0.1804, "num_input_tokens_seen": 1821232, "step": 2930 }, { "epoch": 5.231729055258467, "grad_norm": 0.5850667953491211, "learning_rate": 2.7353172427049995e-05, "loss": 0.2057, "num_input_tokens_seen": 1824784, "step": 2935 }, { "epoch": 5.240641711229946, "grad_norm": 0.4316384792327881, "learning_rate": 2.7275728884298596e-05, "loss": 0.1754, "num_input_tokens_seen": 1827088, "step": 2940 }, { "epoch": 5.249554367201426, "grad_norm": 0.350407212972641, "learning_rate": 2.719826331486e-05, "loss": 0.1627, "num_input_tokens_seen": 1829328, "step": 2945 }, { "epoch": 5.258467023172906, "grad_norm": 0.6626913547515869, "learning_rate": 2.7120776468520314e-05, "loss": 0.2147, "num_input_tokens_seen": 1833136, "step": 2950 }, { "epoch": 5.267379679144385, "grad_norm": 0.711764931678772, "learning_rate": 2.7043269095271573e-05, "loss": 0.185, "num_input_tokens_seen": 1835632, "step": 2955 }, { "epoch": 5.276292335115865, "grad_norm": 0.5972061157226562, "learning_rate": 2.6965741945304467e-05, "loss": 0.199, "num_input_tokens_seen": 1838992, "step": 2960 }, { "epoch": 5.285204991087344, "grad_norm": 0.9157897233963013, "learning_rate": 2.6888195769001146e-05, "loss": 0.1782, "num_input_tokens_seen": 1841840, "step": 2965 }, { "epoch": 5.294117647058823, "grad_norm": 0.4935537874698639, "learning_rate": 2.681063131692787e-05, "loss": 0.1843, "num_input_tokens_seen": 1844560, "step": 2970 }, { "epoch": 5.303030303030303, "grad_norm": 0.5020252466201782, "learning_rate": 2.673304933982783e-05, "loss": 0.1891, "num_input_tokens_seen": 1848624, "step": 2975 }, { "epoch": 5.311942959001782, "grad_norm": 0.5348985195159912, "learning_rate": 2.6655450588613806e-05, "loss": 0.1925, "num_input_tokens_seen": 1851952, "step": 2980 }, { "epoch": 5.320855614973262, "grad_norm": 0.42828452587127686, "learning_rate": 2.657783581436097e-05, "loss": 0.2381, "num_input_tokens_seen": 1855696, "step": 2985 }, { "epoch": 5.329768270944742, "grad_norm": 0.6298767328262329, "learning_rate": 2.6500205768299535e-05, "loss": 0.193, "num_input_tokens_seen": 1859408, "step": 2990 }, { "epoch": 5.338680926916221, "grad_norm": 0.6732975244522095, "learning_rate": 2.642256120180758e-05, "loss": 0.1508, "num_input_tokens_seen": 1861936, "step": 2995 }, { "epoch": 5.347593582887701, "grad_norm": 0.6173202991485596, "learning_rate": 2.6344902866403687e-05, "loss": 0.1724, "num_input_tokens_seen": 1864624, "step": 3000 }, { "epoch": 5.35650623885918, "grad_norm": 0.4392896890640259, "learning_rate": 2.6267231513739726e-05, "loss": 0.2092, "num_input_tokens_seen": 1867600, "step": 3005 }, { "epoch": 5.365418894830659, "grad_norm": 0.621001660823822, "learning_rate": 2.6189547895593562e-05, "loss": 0.1982, "num_input_tokens_seen": 1870672, "step": 3010 }, { "epoch": 5.374331550802139, "grad_norm": 0.5161955952644348, "learning_rate": 2.611185276386176e-05, "loss": 0.1923, "num_input_tokens_seen": 1874160, "step": 3015 }, { "epoch": 5.383244206773618, "grad_norm": 0.5126301050186157, "learning_rate": 2.6034146870552346e-05, "loss": 0.1906, "num_input_tokens_seen": 1877616, "step": 3020 }, { "epoch": 5.392156862745098, "grad_norm": 0.6807987093925476, "learning_rate": 2.595643096777748e-05, "loss": 0.1862, "num_input_tokens_seen": 1880432, "step": 3025 }, { "epoch": 5.401069518716578, "grad_norm": 0.6361598372459412, "learning_rate": 2.5878705807746245e-05, "loss": 0.2137, "num_input_tokens_seen": 1884528, "step": 3030 }, { "epoch": 5.409982174688057, "grad_norm": 0.6302884221076965, "learning_rate": 2.580097214275727e-05, "loss": 0.1688, "num_input_tokens_seen": 1887152, "step": 3035 }, { "epoch": 5.418894830659537, "grad_norm": 0.5410829186439514, "learning_rate": 2.5723230725191554e-05, "loss": 0.1772, "num_input_tokens_seen": 1890032, "step": 3040 }, { "epoch": 5.427807486631016, "grad_norm": 0.5092021822929382, "learning_rate": 2.5645482307505108e-05, "loss": 0.1677, "num_input_tokens_seen": 1892304, "step": 3045 }, { "epoch": 5.436720142602495, "grad_norm": 0.7809433937072754, "learning_rate": 2.55677276422217e-05, "loss": 0.1875, "num_input_tokens_seen": 1895728, "step": 3050 }, { "epoch": 5.445632798573975, "grad_norm": 0.43497583270072937, "learning_rate": 2.548996748192556e-05, "loss": 0.167, "num_input_tokens_seen": 1898384, "step": 3055 }, { "epoch": 5.454545454545454, "grad_norm": 0.36343979835510254, "learning_rate": 2.541220257925412e-05, "loss": 0.1719, "num_input_tokens_seen": 1901104, "step": 3060 }, { "epoch": 5.463458110516934, "grad_norm": 0.6379041075706482, "learning_rate": 2.5334433686890702e-05, "loss": 0.1879, "num_input_tokens_seen": 1904976, "step": 3065 }, { "epoch": 5.472370766488414, "grad_norm": 0.501068651676178, "learning_rate": 2.5256661557557247e-05, "loss": 0.1898, "num_input_tokens_seen": 1908688, "step": 3070 }, { "epoch": 5.481283422459893, "grad_norm": 0.4064844250679016, "learning_rate": 2.517888694400704e-05, "loss": 0.1471, "num_input_tokens_seen": 1911792, "step": 3075 }, { "epoch": 5.490196078431373, "grad_norm": 0.7375326156616211, "learning_rate": 2.5101110599017374e-05, "loss": 0.223, "num_input_tokens_seen": 1915248, "step": 3080 }, { "epoch": 5.499108734402852, "grad_norm": 0.7120162844657898, "learning_rate": 2.502333327538235e-05, "loss": 0.1666, "num_input_tokens_seen": 1918544, "step": 3085 }, { "epoch": 5.508021390374331, "grad_norm": 0.4658108353614807, "learning_rate": 2.4945555725905502e-05, "loss": 0.2039, "num_input_tokens_seen": 1922032, "step": 3090 }, { "epoch": 5.509803921568627, "eval_loss": 0.19006255269050598, "eval_runtime": 4.2606, "eval_samples_per_second": 58.442, "eval_steps_per_second": 14.787, "num_input_tokens_seen": 1922384, "step": 3091 }, { "epoch": 5.516934046345811, "grad_norm": 0.6522291898727417, "learning_rate": 2.4867778703392554e-05, "loss": 0.1586, "num_input_tokens_seen": 1924400, "step": 3095 }, { "epoch": 5.52584670231729, "grad_norm": 0.5256299376487732, "learning_rate": 2.479000296064417e-05, "loss": 0.2169, "num_input_tokens_seen": 1927376, "step": 3100 }, { "epoch": 5.53475935828877, "grad_norm": 0.5868116021156311, "learning_rate": 2.4712229250448567e-05, "loss": 0.1768, "num_input_tokens_seen": 1930352, "step": 3105 }, { "epoch": 5.54367201426025, "grad_norm": 0.6082111597061157, "learning_rate": 2.4634458325574323e-05, "loss": 0.2153, "num_input_tokens_seen": 1933680, "step": 3110 }, { "epoch": 5.552584670231729, "grad_norm": 0.5021962523460388, "learning_rate": 2.4556690938763062e-05, "loss": 0.1667, "num_input_tokens_seen": 1937488, "step": 3115 }, { "epoch": 5.561497326203209, "grad_norm": 0.5544887781143188, "learning_rate": 2.4478927842722154e-05, "loss": 0.1854, "num_input_tokens_seen": 1940368, "step": 3120 }, { "epoch": 5.570409982174688, "grad_norm": 0.6153222322463989, "learning_rate": 2.4401169790117427e-05, "loss": 0.1775, "num_input_tokens_seen": 1943728, "step": 3125 }, { "epoch": 5.579322638146167, "grad_norm": 0.7217985987663269, "learning_rate": 2.4323417533565916e-05, "loss": 0.1929, "num_input_tokens_seen": 1946832, "step": 3130 }, { "epoch": 5.588235294117647, "grad_norm": 0.5232107639312744, "learning_rate": 2.424567182562854e-05, "loss": 0.205, "num_input_tokens_seen": 1949904, "step": 3135 }, { "epoch": 5.597147950089127, "grad_norm": 0.5853015184402466, "learning_rate": 2.4167933418802837e-05, "loss": 0.1431, "num_input_tokens_seen": 1952432, "step": 3140 }, { "epoch": 5.606060606060606, "grad_norm": 0.7414368391036987, "learning_rate": 2.4090203065515695e-05, "loss": 0.1622, "num_input_tokens_seen": 1955216, "step": 3145 }, { "epoch": 5.614973262032086, "grad_norm": 0.4388047456741333, "learning_rate": 2.4012481518116022e-05, "loss": 0.1707, "num_input_tokens_seen": 1958096, "step": 3150 }, { "epoch": 5.623885918003565, "grad_norm": 0.5946722626686096, "learning_rate": 2.3934769528867513e-05, "loss": 0.198, "num_input_tokens_seen": 1961456, "step": 3155 }, { "epoch": 5.632798573975045, "grad_norm": 0.4028293192386627, "learning_rate": 2.385706784994135e-05, "loss": 0.162, "num_input_tokens_seen": 1964272, "step": 3160 }, { "epoch": 5.641711229946524, "grad_norm": 0.4915693700313568, "learning_rate": 2.3779377233408923e-05, "loss": 0.192, "num_input_tokens_seen": 1967120, "step": 3165 }, { "epoch": 5.650623885918003, "grad_norm": 0.4452253580093384, "learning_rate": 2.3701698431234528e-05, "loss": 0.1601, "num_input_tokens_seen": 1969872, "step": 3170 }, { "epoch": 5.659536541889483, "grad_norm": 0.5284585356712341, "learning_rate": 2.362403219526815e-05, "loss": 0.1605, "num_input_tokens_seen": 1972944, "step": 3175 }, { "epoch": 5.668449197860962, "grad_norm": 0.48784369230270386, "learning_rate": 2.3546379277238107e-05, "loss": 0.1533, "num_input_tokens_seen": 1975888, "step": 3180 }, { "epoch": 5.677361853832442, "grad_norm": 0.5844167470932007, "learning_rate": 2.3468740428743833e-05, "loss": 0.1903, "num_input_tokens_seen": 1979088, "step": 3185 }, { "epoch": 5.686274509803922, "grad_norm": 0.6798781752586365, "learning_rate": 2.339111640124859e-05, "loss": 0.171, "num_input_tokens_seen": 1981520, "step": 3190 }, { "epoch": 5.695187165775401, "grad_norm": 0.8696448802947998, "learning_rate": 2.3313507946072172e-05, "loss": 0.1648, "num_input_tokens_seen": 1984880, "step": 3195 }, { "epoch": 5.704099821746881, "grad_norm": 0.4180395007133484, "learning_rate": 2.323591581438365e-05, "loss": 0.1617, "num_input_tokens_seen": 1987440, "step": 3200 }, { "epoch": 5.71301247771836, "grad_norm": 0.6146518588066101, "learning_rate": 2.3158340757194116e-05, "loss": 0.1963, "num_input_tokens_seen": 1990640, "step": 3205 }, { "epoch": 5.721925133689839, "grad_norm": 0.8348390460014343, "learning_rate": 2.3080783525349388e-05, "loss": 0.1653, "num_input_tokens_seen": 1993808, "step": 3210 }, { "epoch": 5.730837789661319, "grad_norm": 0.7081406712532043, "learning_rate": 2.3003244869522743e-05, "loss": 0.1779, "num_input_tokens_seen": 1996688, "step": 3215 }, { "epoch": 5.739750445632799, "grad_norm": 0.5054243206977844, "learning_rate": 2.2925725540207688e-05, "loss": 0.1565, "num_input_tokens_seen": 1999696, "step": 3220 }, { "epoch": 5.748663101604278, "grad_norm": 0.5454304814338684, "learning_rate": 2.2848226287710645e-05, "loss": 0.1536, "num_input_tokens_seen": 2002032, "step": 3225 }, { "epoch": 5.757575757575758, "grad_norm": 0.6999877095222473, "learning_rate": 2.277074786214372e-05, "loss": 0.1683, "num_input_tokens_seen": 2005584, "step": 3230 }, { "epoch": 5.766488413547237, "grad_norm": 0.765386164188385, "learning_rate": 2.2693291013417453e-05, "loss": 0.1592, "num_input_tokens_seen": 2008176, "step": 3235 }, { "epoch": 5.775401069518717, "grad_norm": 0.7968612909317017, "learning_rate": 2.2615856491233513e-05, "loss": 0.3207, "num_input_tokens_seen": 2011376, "step": 3240 }, { "epoch": 5.784313725490196, "grad_norm": 0.3482127785682678, "learning_rate": 2.2538445045077488e-05, "loss": 0.1455, "num_input_tokens_seen": 2014224, "step": 3245 }, { "epoch": 5.793226381461675, "grad_norm": 0.5806959271430969, "learning_rate": 2.246105742421162e-05, "loss": 0.1741, "num_input_tokens_seen": 2016912, "step": 3250 }, { "epoch": 5.802139037433155, "grad_norm": 0.7654284834861755, "learning_rate": 2.2383694377667543e-05, "loss": 0.1575, "num_input_tokens_seen": 2020048, "step": 3255 }, { "epoch": 5.811051693404634, "grad_norm": 0.642106831073761, "learning_rate": 2.2306356654239012e-05, "loss": 0.1756, "num_input_tokens_seen": 2023216, "step": 3260 }, { "epoch": 5.819964349376114, "grad_norm": 0.43349790573120117, "learning_rate": 2.222904500247473e-05, "loss": 0.1924, "num_input_tokens_seen": 2026928, "step": 3265 }, { "epoch": 5.828877005347594, "grad_norm": 0.4377082884311676, "learning_rate": 2.2151760170671004e-05, "loss": 0.1696, "num_input_tokens_seen": 2029584, "step": 3270 }, { "epoch": 5.837789661319073, "grad_norm": 0.40771257877349854, "learning_rate": 2.207450290686458e-05, "loss": 0.1603, "num_input_tokens_seen": 2032720, "step": 3275 }, { "epoch": 5.846702317290553, "grad_norm": 0.5143370628356934, "learning_rate": 2.1997273958825375e-05, "loss": 0.1845, "num_input_tokens_seen": 2036176, "step": 3280 }, { "epoch": 5.855614973262032, "grad_norm": 0.5394704341888428, "learning_rate": 2.1920074074049225e-05, "loss": 0.1801, "num_input_tokens_seen": 2039632, "step": 3285 }, { "epoch": 5.864527629233511, "grad_norm": 0.6020737290382385, "learning_rate": 2.1842903999750665e-05, "loss": 0.1862, "num_input_tokens_seen": 2043184, "step": 3290 }, { "epoch": 5.873440285204991, "grad_norm": 0.7539795637130737, "learning_rate": 2.1765764482855715e-05, "loss": 0.1628, "num_input_tokens_seen": 2046416, "step": 3295 }, { "epoch": 5.882352941176471, "grad_norm": 0.6914777755737305, "learning_rate": 2.1688656269994612e-05, "loss": 0.1768, "num_input_tokens_seen": 2049008, "step": 3300 }, { "epoch": 5.89126559714795, "grad_norm": 1.2212262153625488, "learning_rate": 2.1611580107494597e-05, "loss": 0.1982, "num_input_tokens_seen": 2052656, "step": 3305 }, { "epoch": 5.90017825311943, "grad_norm": 0.5432605743408203, "learning_rate": 2.153453674137272e-05, "loss": 0.1885, "num_input_tokens_seen": 2055888, "step": 3310 }, { "epoch": 5.909090909090909, "grad_norm": 0.5268386006355286, "learning_rate": 2.1457526917328588e-05, "loss": 0.1492, "num_input_tokens_seen": 2059056, "step": 3315 }, { "epoch": 5.918003565062389, "grad_norm": 0.8248959183692932, "learning_rate": 2.1380551380737128e-05, "loss": 0.1755, "num_input_tokens_seen": 2062096, "step": 3320 }, { "epoch": 5.926916221033868, "grad_norm": 0.5520910024642944, "learning_rate": 2.130361087664145e-05, "loss": 0.1899, "num_input_tokens_seen": 2065168, "step": 3325 }, { "epoch": 5.935828877005347, "grad_norm": 0.5292351841926575, "learning_rate": 2.122670614974555e-05, "loss": 0.1983, "num_input_tokens_seen": 2067856, "step": 3330 }, { "epoch": 5.944741532976827, "grad_norm": 0.8153255581855774, "learning_rate": 2.1149837944407136e-05, "loss": 0.1517, "num_input_tokens_seen": 2071056, "step": 3335 }, { "epoch": 5.953654188948306, "grad_norm": 0.7868825197219849, "learning_rate": 2.107300700463045e-05, "loss": 0.193, "num_input_tokens_seen": 2074192, "step": 3340 }, { "epoch": 5.962566844919786, "grad_norm": 0.39180079102516174, "learning_rate": 2.0996214074059034e-05, "loss": 0.166, "num_input_tokens_seen": 2077040, "step": 3345 }, { "epoch": 5.971479500891266, "grad_norm": 0.5239204168319702, "learning_rate": 2.0919459895968517e-05, "loss": 0.1395, "num_input_tokens_seen": 2079312, "step": 3350 }, { "epoch": 5.980392156862745, "grad_norm": 0.4734959304332733, "learning_rate": 2.084274521325948e-05, "loss": 0.1701, "num_input_tokens_seen": 2082864, "step": 3355 }, { "epoch": 5.989304812834225, "grad_norm": 0.6230949759483337, "learning_rate": 2.0766070768450206e-05, "loss": 0.1928, "num_input_tokens_seen": 2085872, "step": 3360 }, { "epoch": 5.998217468805704, "grad_norm": 0.6036242246627808, "learning_rate": 2.0689437303669508e-05, "loss": 0.1673, "num_input_tokens_seen": 2088272, "step": 3365 }, { "epoch": 6.007130124777183, "grad_norm": 0.6001238822937012, "learning_rate": 2.0612845560649603e-05, "loss": 0.1752, "num_input_tokens_seen": 2091232, "step": 3370 }, { "epoch": 6.010695187165775, "eval_loss": 0.19044770300388336, "eval_runtime": 4.266, "eval_samples_per_second": 58.369, "eval_steps_per_second": 14.768, "num_input_tokens_seen": 2092320, "step": 3372 }, { "epoch": 6.016042780748663, "grad_norm": 0.9030793309211731, "learning_rate": 2.0536296280718825e-05, "loss": 0.1664, "num_input_tokens_seen": 2093952, "step": 3375 }, { "epoch": 6.024955436720143, "grad_norm": 0.6371573209762573, "learning_rate": 2.0459790204794545e-05, "loss": 0.1941, "num_input_tokens_seen": 2097728, "step": 3380 }, { "epoch": 6.033868092691622, "grad_norm": 0.4168316125869751, "learning_rate": 2.0383328073375955e-05, "loss": 0.2223, "num_input_tokens_seen": 2100736, "step": 3385 }, { "epoch": 6.042780748663102, "grad_norm": 0.8262919187545776, "learning_rate": 2.0306910626536926e-05, "loss": 0.1762, "num_input_tokens_seen": 2104032, "step": 3390 }, { "epoch": 6.051693404634581, "grad_norm": 0.482316255569458, "learning_rate": 2.0230538603918787e-05, "loss": 0.1594, "num_input_tokens_seen": 2107264, "step": 3395 }, { "epoch": 6.0606060606060606, "grad_norm": 1.0964471101760864, "learning_rate": 2.015421274472325e-05, "loss": 0.1881, "num_input_tokens_seen": 2110336, "step": 3400 }, { "epoch": 6.06951871657754, "grad_norm": 0.49298667907714844, "learning_rate": 2.0077933787705204e-05, "loss": 0.151, "num_input_tokens_seen": 2113248, "step": 3405 }, { "epoch": 6.078431372549019, "grad_norm": 0.6304886341094971, "learning_rate": 2.000170247116554e-05, "loss": 0.1657, "num_input_tokens_seen": 2116032, "step": 3410 }, { "epoch": 6.087344028520499, "grad_norm": 0.4530024230480194, "learning_rate": 1.9925519532944104e-05, "loss": 0.1692, "num_input_tokens_seen": 2118848, "step": 3415 }, { "epoch": 6.096256684491979, "grad_norm": 0.5926321744918823, "learning_rate": 1.9849385710412424e-05, "loss": 0.3085, "num_input_tokens_seen": 2122208, "step": 3420 }, { "epoch": 6.105169340463458, "grad_norm": 0.5866901874542236, "learning_rate": 1.977330174046667e-05, "loss": 0.1675, "num_input_tokens_seen": 2125248, "step": 3425 }, { "epoch": 6.114081996434938, "grad_norm": 0.35337719321250916, "learning_rate": 1.9697268359520506e-05, "loss": 0.2589, "num_input_tokens_seen": 2129248, "step": 3430 }, { "epoch": 6.122994652406417, "grad_norm": 0.4666219651699066, "learning_rate": 1.9621286303497915e-05, "loss": 0.1709, "num_input_tokens_seen": 2131904, "step": 3435 }, { "epoch": 6.1319073083778965, "grad_norm": 0.6858420372009277, "learning_rate": 1.954535630782612e-05, "loss": 0.183, "num_input_tokens_seen": 2135552, "step": 3440 }, { "epoch": 6.140819964349376, "grad_norm": 0.41474148631095886, "learning_rate": 1.9469479107428463e-05, "loss": 0.1723, "num_input_tokens_seen": 2138688, "step": 3445 }, { "epoch": 6.149732620320855, "grad_norm": 0.60605388879776, "learning_rate": 1.9393655436717283e-05, "loss": 0.1506, "num_input_tokens_seen": 2141248, "step": 3450 }, { "epoch": 6.158645276292335, "grad_norm": 0.9076442122459412, "learning_rate": 1.9317886029586778e-05, "loss": 0.2039, "num_input_tokens_seen": 2144768, "step": 3455 }, { "epoch": 6.167557932263815, "grad_norm": 0.9373259544372559, "learning_rate": 1.9242171619405986e-05, "loss": 0.1797, "num_input_tokens_seen": 2147552, "step": 3460 }, { "epoch": 6.176470588235294, "grad_norm": 0.6851420998573303, "learning_rate": 1.916651293901157e-05, "loss": 0.1825, "num_input_tokens_seen": 2151040, "step": 3465 }, { "epoch": 6.185383244206774, "grad_norm": 0.6892784833908081, "learning_rate": 1.909091072070083e-05, "loss": 0.171, "num_input_tokens_seen": 2155040, "step": 3470 }, { "epoch": 6.194295900178253, "grad_norm": 0.6285828948020935, "learning_rate": 1.9015365696224564e-05, "loss": 0.158, "num_input_tokens_seen": 2157824, "step": 3475 }, { "epoch": 6.2032085561497325, "grad_norm": 0.5884494781494141, "learning_rate": 1.893987859677997e-05, "loss": 0.181, "num_input_tokens_seen": 2160672, "step": 3480 }, { "epoch": 6.212121212121212, "grad_norm": 0.7425735592842102, "learning_rate": 1.886445015300362e-05, "loss": 0.1473, "num_input_tokens_seen": 2163552, "step": 3485 }, { "epoch": 6.221033868092691, "grad_norm": 0.39105650782585144, "learning_rate": 1.8789081094964347e-05, "loss": 0.1441, "num_input_tokens_seen": 2167456, "step": 3490 }, { "epoch": 6.229946524064171, "grad_norm": 0.30422699451446533, "learning_rate": 1.8713772152156205e-05, "loss": 0.1294, "num_input_tokens_seen": 2170560, "step": 3495 }, { "epoch": 6.238859180035651, "grad_norm": 0.7964766621589661, "learning_rate": 1.863852405349135e-05, "loss": 0.1838, "num_input_tokens_seen": 2173152, "step": 3500 }, { "epoch": 6.24777183600713, "grad_norm": 0.6463519334793091, "learning_rate": 1.856333752729311e-05, "loss": 0.1637, "num_input_tokens_seen": 2175808, "step": 3505 }, { "epoch": 6.25668449197861, "grad_norm": 0.8007080554962158, "learning_rate": 1.848821330128878e-05, "loss": 0.1717, "num_input_tokens_seen": 2178304, "step": 3510 }, { "epoch": 6.265597147950089, "grad_norm": 1.0539445877075195, "learning_rate": 1.8413152102602687e-05, "loss": 0.1892, "num_input_tokens_seen": 2181312, "step": 3515 }, { "epoch": 6.2745098039215685, "grad_norm": 0.6273789405822754, "learning_rate": 1.8338154657749128e-05, "loss": 0.1699, "num_input_tokens_seen": 2184128, "step": 3520 }, { "epoch": 6.283422459893048, "grad_norm": 0.5192899703979492, "learning_rate": 1.826322169262531e-05, "loss": 0.1772, "num_input_tokens_seen": 2187584, "step": 3525 }, { "epoch": 6.292335115864527, "grad_norm": 0.6465858221054077, "learning_rate": 1.818835393250434e-05, "loss": 0.1814, "num_input_tokens_seen": 2191168, "step": 3530 }, { "epoch": 6.301247771836007, "grad_norm": 0.5996541380882263, "learning_rate": 1.8113552102028236e-05, "loss": 0.1888, "num_input_tokens_seen": 2194880, "step": 3535 }, { "epoch": 6.310160427807487, "grad_norm": 0.3005512058734894, "learning_rate": 1.803881692520087e-05, "loss": 0.1483, "num_input_tokens_seen": 2197184, "step": 3540 }, { "epoch": 6.319073083778966, "grad_norm": 0.4426136016845703, "learning_rate": 1.796414912538095e-05, "loss": 0.162, "num_input_tokens_seen": 2200160, "step": 3545 }, { "epoch": 6.327985739750446, "grad_norm": 0.7000912427902222, "learning_rate": 1.7889549425275093e-05, "loss": 0.1686, "num_input_tokens_seen": 2203776, "step": 3550 }, { "epoch": 6.336898395721925, "grad_norm": 0.5500680804252625, "learning_rate": 1.7815018546930754e-05, "loss": 0.1716, "num_input_tokens_seen": 2207104, "step": 3555 }, { "epoch": 6.3458110516934045, "grad_norm": 0.5378794074058533, "learning_rate": 1.7740557211729258e-05, "loss": 0.1653, "num_input_tokens_seen": 2210400, "step": 3560 }, { "epoch": 6.354723707664884, "grad_norm": 0.20100829005241394, "learning_rate": 1.7666166140378852e-05, "loss": 0.1604, "num_input_tokens_seen": 2213728, "step": 3565 }, { "epoch": 6.363636363636363, "grad_norm": 0.33214375376701355, "learning_rate": 1.7591846052907673e-05, "loss": 0.1524, "num_input_tokens_seen": 2216416, "step": 3570 }, { "epoch": 6.372549019607844, "grad_norm": 1.197052240371704, "learning_rate": 1.7517597668656823e-05, "loss": 0.1849, "num_input_tokens_seen": 2219328, "step": 3575 }, { "epoch": 6.381461675579323, "grad_norm": 0.704537034034729, "learning_rate": 1.7443421706273395e-05, "loss": 0.1927, "num_input_tokens_seen": 2222496, "step": 3580 }, { "epoch": 6.390374331550802, "grad_norm": 0.6272372007369995, "learning_rate": 1.7369318883703506e-05, "loss": 0.1855, "num_input_tokens_seen": 2225504, "step": 3585 }, { "epoch": 6.399286987522282, "grad_norm": 0.8482812643051147, "learning_rate": 1.7295289918185348e-05, "loss": 0.1753, "num_input_tokens_seen": 2229312, "step": 3590 }, { "epoch": 6.408199643493761, "grad_norm": 0.5499706864356995, "learning_rate": 1.722133552624227e-05, "loss": 0.1939, "num_input_tokens_seen": 2232544, "step": 3595 }, { "epoch": 6.4171122994652405, "grad_norm": 0.48051542043685913, "learning_rate": 1.714745642367583e-05, "loss": 0.1707, "num_input_tokens_seen": 2235808, "step": 3600 }, { "epoch": 6.42602495543672, "grad_norm": 1.0482089519500732, "learning_rate": 1.707365332555883e-05, "loss": 0.183, "num_input_tokens_seen": 2239040, "step": 3605 }, { "epoch": 6.434937611408199, "grad_norm": 0.5002045631408691, "learning_rate": 1.699992694622847e-05, "loss": 0.1476, "num_input_tokens_seen": 2241728, "step": 3610 }, { "epoch": 6.443850267379679, "grad_norm": 0.5338446497917175, "learning_rate": 1.6926277999279372e-05, "loss": 0.1712, "num_input_tokens_seen": 2244928, "step": 3615 }, { "epoch": 6.452762923351159, "grad_norm": 0.5092248320579529, "learning_rate": 1.6852707197556677e-05, "loss": 0.1569, "num_input_tokens_seen": 2247936, "step": 3620 }, { "epoch": 6.461675579322638, "grad_norm": 0.4300782382488251, "learning_rate": 1.67792152531492e-05, "loss": 0.1658, "num_input_tokens_seen": 2250560, "step": 3625 }, { "epoch": 6.470588235294118, "grad_norm": 0.3229581415653229, "learning_rate": 1.6705802877382464e-05, "loss": 0.1451, "num_input_tokens_seen": 2253248, "step": 3630 }, { "epoch": 6.479500891265597, "grad_norm": 0.5048878788948059, "learning_rate": 1.6632470780811866e-05, "loss": 0.1803, "num_input_tokens_seen": 2256320, "step": 3635 }, { "epoch": 6.4884135472370765, "grad_norm": 0.7852115631103516, "learning_rate": 1.6559219673215784e-05, "loss": 0.1825, "num_input_tokens_seen": 2259168, "step": 3640 }, { "epoch": 6.497326203208556, "grad_norm": 0.3399798572063446, "learning_rate": 1.6486050263588702e-05, "loss": 0.1856, "num_input_tokens_seen": 2262240, "step": 3645 }, { "epoch": 6.506238859180035, "grad_norm": 0.5445297360420227, "learning_rate": 1.641296326013436e-05, "loss": 0.2109, "num_input_tokens_seen": 2265600, "step": 3650 }, { "epoch": 6.5115864527629235, "eval_loss": 0.1881975382566452, "eval_runtime": 4.2584, "eval_samples_per_second": 58.472, "eval_steps_per_second": 14.794, "num_input_tokens_seen": 2267520, "step": 3653 }, { "epoch": 6.515151515151516, "grad_norm": 0.33709490299224854, "learning_rate": 1.633995937025889e-05, "loss": 0.1652, "num_input_tokens_seen": 2268768, "step": 3655 }, { "epoch": 6.524064171122995, "grad_norm": 0.4406679570674896, "learning_rate": 1.6267039300563965e-05, "loss": 0.2093, "num_input_tokens_seen": 2272256, "step": 3660 }, { "epoch": 6.532976827094474, "grad_norm": 0.6629878878593445, "learning_rate": 1.619420375683996e-05, "loss": 0.1718, "num_input_tokens_seen": 2275968, "step": 3665 }, { "epoch": 6.541889483065954, "grad_norm": 0.665874183177948, "learning_rate": 1.6121453444059153e-05, "loss": 0.1913, "num_input_tokens_seen": 2278784, "step": 3670 }, { "epoch": 6.550802139037433, "grad_norm": 0.5533963441848755, "learning_rate": 1.6048789066368858e-05, "loss": 0.1798, "num_input_tokens_seen": 2281472, "step": 3675 }, { "epoch": 6.5597147950089125, "grad_norm": 0.40691274404525757, "learning_rate": 1.5976211327084606e-05, "loss": 0.1737, "num_input_tokens_seen": 2284608, "step": 3680 }, { "epoch": 6.568627450980392, "grad_norm": 0.7153930068016052, "learning_rate": 1.59037209286834e-05, "loss": 0.1607, "num_input_tokens_seen": 2287296, "step": 3685 }, { "epoch": 6.577540106951871, "grad_norm": 0.4068545401096344, "learning_rate": 1.583131857279685e-05, "loss": 0.1584, "num_input_tokens_seen": 2290176, "step": 3690 }, { "epoch": 6.586452762923351, "grad_norm": 0.5864424109458923, "learning_rate": 1.57590049602044e-05, "loss": 0.175, "num_input_tokens_seen": 2292960, "step": 3695 }, { "epoch": 6.595365418894831, "grad_norm": 0.729058027267456, "learning_rate": 1.5686780790826574e-05, "loss": 0.1749, "num_input_tokens_seen": 2296192, "step": 3700 }, { "epoch": 6.60427807486631, "grad_norm": 0.7947399616241455, "learning_rate": 1.561464676371816e-05, "loss": 0.1895, "num_input_tokens_seen": 2300224, "step": 3705 }, { "epoch": 6.61319073083779, "grad_norm": 0.5141013860702515, "learning_rate": 1.5542603577061464e-05, "loss": 0.1672, "num_input_tokens_seen": 2303040, "step": 3710 }, { "epoch": 6.622103386809269, "grad_norm": 0.7291932702064514, "learning_rate": 1.5470651928159564e-05, "loss": 0.1447, "num_input_tokens_seen": 2305600, "step": 3715 }, { "epoch": 6.6310160427807485, "grad_norm": 0.48628827929496765, "learning_rate": 1.539879251342954e-05, "loss": 0.1646, "num_input_tokens_seen": 2308736, "step": 3720 }, { "epoch": 6.639928698752228, "grad_norm": 0.6047589778900146, "learning_rate": 1.5327026028395724e-05, "loss": 0.1547, "num_input_tokens_seen": 2311840, "step": 3725 }, { "epoch": 6.648841354723707, "grad_norm": 0.5494013428688049, "learning_rate": 1.5255353167683017e-05, "loss": 0.1728, "num_input_tokens_seen": 2315808, "step": 3730 }, { "epoch": 6.657754010695188, "grad_norm": 0.6367866396903992, "learning_rate": 1.5183774625010119e-05, "loss": 0.1566, "num_input_tokens_seen": 2319072, "step": 3735 }, { "epoch": 6.666666666666667, "grad_norm": 0.6009120345115662, "learning_rate": 1.5112291093182818e-05, "loss": 0.187, "num_input_tokens_seen": 2323104, "step": 3740 }, { "epoch": 6.675579322638146, "grad_norm": 0.5307632088661194, "learning_rate": 1.5040903264087328e-05, "loss": 0.174, "num_input_tokens_seen": 2325984, "step": 3745 }, { "epoch": 6.684491978609626, "grad_norm": 0.4566698372364044, "learning_rate": 1.4969611828683517e-05, "loss": 0.1415, "num_input_tokens_seen": 2329152, "step": 3750 }, { "epoch": 6.693404634581105, "grad_norm": 0.5744293928146362, "learning_rate": 1.4898417476998289e-05, "loss": 0.2178, "num_input_tokens_seen": 2332768, "step": 3755 }, { "epoch": 6.7023172905525845, "grad_norm": 0.4906589984893799, "learning_rate": 1.4827320898118884e-05, "loss": 0.1595, "num_input_tokens_seen": 2335680, "step": 3760 }, { "epoch": 6.711229946524064, "grad_norm": 0.643140435218811, "learning_rate": 1.4756322780186193e-05, "loss": 0.1865, "num_input_tokens_seen": 2338656, "step": 3765 }, { "epoch": 6.720142602495543, "grad_norm": 0.6035706996917725, "learning_rate": 1.4685423810388094e-05, "loss": 0.1639, "num_input_tokens_seen": 2342016, "step": 3770 }, { "epoch": 6.729055258467023, "grad_norm": 0.35557159781455994, "learning_rate": 1.4614624674952842e-05, "loss": 0.1617, "num_input_tokens_seen": 2345120, "step": 3775 }, { "epoch": 6.737967914438503, "grad_norm": 0.589004397392273, "learning_rate": 1.4543926059142379e-05, "loss": 0.1699, "num_input_tokens_seen": 2348512, "step": 3780 }, { "epoch": 6.746880570409982, "grad_norm": 0.4238247573375702, "learning_rate": 1.4473328647245726e-05, "loss": 0.1614, "num_input_tokens_seen": 2350688, "step": 3785 }, { "epoch": 6.755793226381462, "grad_norm": 0.6005486845970154, "learning_rate": 1.4402833122572368e-05, "loss": 0.1801, "num_input_tokens_seen": 2353504, "step": 3790 }, { "epoch": 6.764705882352941, "grad_norm": 0.6389063000679016, "learning_rate": 1.4332440167445613e-05, "loss": 0.1597, "num_input_tokens_seen": 2356672, "step": 3795 }, { "epoch": 6.7736185383244205, "grad_norm": 0.4916219115257263, "learning_rate": 1.4262150463195981e-05, "loss": 0.1759, "num_input_tokens_seen": 2360288, "step": 3800 }, { "epoch": 6.7825311942959, "grad_norm": 0.6930426359176636, "learning_rate": 1.4191964690154702e-05, "loss": 0.1552, "num_input_tokens_seen": 2362944, "step": 3805 }, { "epoch": 6.791443850267379, "grad_norm": 0.5594033598899841, "learning_rate": 1.412188352764699e-05, "loss": 0.1858, "num_input_tokens_seen": 2366080, "step": 3810 }, { "epoch": 6.80035650623886, "grad_norm": 0.6492391228675842, "learning_rate": 1.4051907653985552e-05, "loss": 0.1954, "num_input_tokens_seen": 2369632, "step": 3815 }, { "epoch": 6.809269162210339, "grad_norm": 0.7449959516525269, "learning_rate": 1.3982037746464043e-05, "loss": 0.1986, "num_input_tokens_seen": 2373504, "step": 3820 }, { "epoch": 6.818181818181818, "grad_norm": 0.6552306413650513, "learning_rate": 1.3912274481350433e-05, "loss": 0.1672, "num_input_tokens_seen": 2376480, "step": 3825 }, { "epoch": 6.827094474153298, "grad_norm": 0.5298140048980713, "learning_rate": 1.3842618533880531e-05, "loss": 0.1679, "num_input_tokens_seen": 2379488, "step": 3830 }, { "epoch": 6.836007130124777, "grad_norm": 0.6472254395484924, "learning_rate": 1.3773070578251424e-05, "loss": 0.179, "num_input_tokens_seen": 2382496, "step": 3835 }, { "epoch": 6.8449197860962565, "grad_norm": 0.5164865851402283, "learning_rate": 1.3703631287614935e-05, "loss": 0.1802, "num_input_tokens_seen": 2386304, "step": 3840 }, { "epoch": 6.853832442067736, "grad_norm": 0.4910835325717926, "learning_rate": 1.363430133407112e-05, "loss": 0.1772, "num_input_tokens_seen": 2389504, "step": 3845 }, { "epoch": 6.862745098039216, "grad_norm": 0.5745038986206055, "learning_rate": 1.3565081388661782e-05, "loss": 0.1634, "num_input_tokens_seen": 2392320, "step": 3850 }, { "epoch": 6.871657754010696, "grad_norm": 0.5505916476249695, "learning_rate": 1.3495972121363968e-05, "loss": 0.1739, "num_input_tokens_seen": 2395648, "step": 3855 }, { "epoch": 6.880570409982175, "grad_norm": 0.6166315674781799, "learning_rate": 1.3426974201083439e-05, "loss": 0.1693, "num_input_tokens_seen": 2398080, "step": 3860 }, { "epoch": 6.889483065953654, "grad_norm": 1.0031318664550781, "learning_rate": 1.3358088295648274e-05, "loss": 0.175, "num_input_tokens_seen": 2400448, "step": 3865 }, { "epoch": 6.898395721925134, "grad_norm": 0.43097200989723206, "learning_rate": 1.328931507180233e-05, "loss": 0.1634, "num_input_tokens_seen": 2403424, "step": 3870 }, { "epoch": 6.907308377896613, "grad_norm": 0.4086379110813141, "learning_rate": 1.3220655195198847e-05, "loss": 0.1469, "num_input_tokens_seen": 2405984, "step": 3875 }, { "epoch": 6.9162210338680925, "grad_norm": 0.40902405977249146, "learning_rate": 1.3152109330393985e-05, "loss": 0.1677, "num_input_tokens_seen": 2409472, "step": 3880 }, { "epoch": 6.925133689839572, "grad_norm": 0.629298985004425, "learning_rate": 1.3083678140840366e-05, "loss": 0.1898, "num_input_tokens_seen": 2412384, "step": 3885 }, { "epoch": 6.934046345811051, "grad_norm": 0.4956974387168884, "learning_rate": 1.3015362288880678e-05, "loss": 0.1628, "num_input_tokens_seen": 2415328, "step": 3890 }, { "epoch": 6.942959001782532, "grad_norm": 0.31115609407424927, "learning_rate": 1.2947162435741278e-05, "loss": 0.1869, "num_input_tokens_seen": 2418848, "step": 3895 }, { "epoch": 6.951871657754011, "grad_norm": 0.5426957011222839, "learning_rate": 1.2879079241525783e-05, "loss": 0.1615, "num_input_tokens_seen": 2421824, "step": 3900 }, { "epoch": 6.96078431372549, "grad_norm": 0.6043846011161804, "learning_rate": 1.2811113365208627e-05, "loss": 0.189, "num_input_tokens_seen": 2424224, "step": 3905 }, { "epoch": 6.96969696969697, "grad_norm": 0.48290809988975525, "learning_rate": 1.2743265464628786e-05, "loss": 0.1779, "num_input_tokens_seen": 2427616, "step": 3910 }, { "epoch": 6.978609625668449, "grad_norm": 0.5067238211631775, "learning_rate": 1.2675536196483306e-05, "loss": 0.1568, "num_input_tokens_seen": 2430368, "step": 3915 }, { "epoch": 6.9875222816399285, "grad_norm": 0.43254604935646057, "learning_rate": 1.260792621632102e-05, "loss": 0.1876, "num_input_tokens_seen": 2433376, "step": 3920 }, { "epoch": 6.996434937611408, "grad_norm": 0.8352137804031372, "learning_rate": 1.2540436178536186e-05, "loss": 0.186, "num_input_tokens_seen": 2436608, "step": 3925 }, { "epoch": 7.005347593582887, "grad_norm": 0.8926360011100769, "learning_rate": 1.2473066736362124e-05, "loss": 0.1554, "num_input_tokens_seen": 2439064, "step": 3930 }, { "epoch": 7.0124777183600715, "eval_loss": 0.18532642722129822, "eval_runtime": 4.2481, "eval_samples_per_second": 58.614, "eval_steps_per_second": 14.83, "num_input_tokens_seen": 2441688, "step": 3934 }, { "epoch": 7.0142602495543676, "grad_norm": 0.40735986828804016, "learning_rate": 1.2405818541864905e-05, "loss": 0.1639, "num_input_tokens_seen": 2442328, "step": 3935 }, { "epoch": 7.023172905525847, "grad_norm": 0.8125144243240356, "learning_rate": 1.2338692245937077e-05, "loss": 0.1518, "num_input_tokens_seen": 2445272, "step": 3940 }, { "epoch": 7.032085561497326, "grad_norm": 0.352469801902771, "learning_rate": 1.2271688498291335e-05, "loss": 0.1499, "num_input_tokens_seen": 2448216, "step": 3945 }, { "epoch": 7.040998217468806, "grad_norm": 0.5842772722244263, "learning_rate": 1.2204807947454203e-05, "loss": 0.173, "num_input_tokens_seen": 2451704, "step": 3950 }, { "epoch": 7.049910873440285, "grad_norm": 0.8481732606887817, "learning_rate": 1.2138051240759826e-05, "loss": 0.1489, "num_input_tokens_seen": 2454392, "step": 3955 }, { "epoch": 7.0588235294117645, "grad_norm": 0.6517293453216553, "learning_rate": 1.2071419024343633e-05, "loss": 0.1674, "num_input_tokens_seen": 2457112, "step": 3960 }, { "epoch": 7.067736185383244, "grad_norm": 0.5270460844039917, "learning_rate": 1.2004911943136143e-05, "loss": 0.1551, "num_input_tokens_seen": 2460312, "step": 3965 }, { "epoch": 7.076648841354723, "grad_norm": 0.5227533578872681, "learning_rate": 1.1938530640856696e-05, "loss": 0.1572, "num_input_tokens_seen": 2463224, "step": 3970 }, { "epoch": 7.0855614973262036, "grad_norm": 0.29230085015296936, "learning_rate": 1.1872275760007198e-05, "loss": 0.1661, "num_input_tokens_seen": 2466008, "step": 3975 }, { "epoch": 7.094474153297683, "grad_norm": 0.5345339179039001, "learning_rate": 1.1806147941865938e-05, "loss": 0.1784, "num_input_tokens_seen": 2469176, "step": 3980 }, { "epoch": 7.103386809269162, "grad_norm": 0.4222520589828491, "learning_rate": 1.1740147826481385e-05, "loss": 0.1405, "num_input_tokens_seen": 2472408, "step": 3985 }, { "epoch": 7.112299465240642, "grad_norm": 0.5282605290412903, "learning_rate": 1.1674276052665973e-05, "loss": 0.1902, "num_input_tokens_seen": 2475608, "step": 3990 }, { "epoch": 7.121212121212121, "grad_norm": 0.4751206636428833, "learning_rate": 1.1608533257989901e-05, "loss": 0.1489, "num_input_tokens_seen": 2478680, "step": 3995 }, { "epoch": 7.1301247771836005, "grad_norm": 0.3280528783798218, "learning_rate": 1.1542920078775018e-05, "loss": 0.1666, "num_input_tokens_seen": 2481592, "step": 4000 }, { "epoch": 7.13903743315508, "grad_norm": 0.9430297017097473, "learning_rate": 1.14774371500886e-05, "loss": 0.2094, "num_input_tokens_seen": 2485176, "step": 4005 }, { "epoch": 7.14795008912656, "grad_norm": 0.27522483468055725, "learning_rate": 1.141208510573725e-05, "loss": 0.1596, "num_input_tokens_seen": 2488152, "step": 4010 }, { "epoch": 7.1568627450980395, "grad_norm": 0.5842289328575134, "learning_rate": 1.1346864578260758e-05, "loss": 0.1904, "num_input_tokens_seen": 2491320, "step": 4015 }, { "epoch": 7.165775401069519, "grad_norm": 0.38907817006111145, "learning_rate": 1.1281776198925939e-05, "loss": 0.1459, "num_input_tokens_seen": 2493944, "step": 4020 }, { "epoch": 7.174688057040998, "grad_norm": 0.31314197182655334, "learning_rate": 1.121682059772056e-05, "loss": 0.1407, "num_input_tokens_seen": 2496664, "step": 4025 }, { "epoch": 7.183600713012478, "grad_norm": 0.5018792748451233, "learning_rate": 1.1151998403347244e-05, "loss": 0.2596, "num_input_tokens_seen": 2500216, "step": 4030 }, { "epoch": 7.192513368983957, "grad_norm": 0.4724593162536621, "learning_rate": 1.1087310243217386e-05, "loss": 0.1538, "num_input_tokens_seen": 2503544, "step": 4035 }, { "epoch": 7.2014260249554365, "grad_norm": 0.647865891456604, "learning_rate": 1.1022756743445028e-05, "loss": 0.1738, "num_input_tokens_seen": 2507160, "step": 4040 }, { "epoch": 7.210338680926916, "grad_norm": 0.48006606101989746, "learning_rate": 1.0958338528840893e-05, "loss": 0.1834, "num_input_tokens_seen": 2510232, "step": 4045 }, { "epoch": 7.219251336898395, "grad_norm": 0.4462122917175293, "learning_rate": 1.0894056222906226e-05, "loss": 0.1348, "num_input_tokens_seen": 2513144, "step": 4050 }, { "epoch": 7.2281639928698755, "grad_norm": 0.48262760043144226, "learning_rate": 1.0829910447826868e-05, "loss": 0.1547, "num_input_tokens_seen": 2516504, "step": 4055 }, { "epoch": 7.237076648841355, "grad_norm": 0.5589674711227417, "learning_rate": 1.0765901824467167e-05, "loss": 0.1723, "num_input_tokens_seen": 2518648, "step": 4060 }, { "epoch": 7.245989304812834, "grad_norm": 0.4827505946159363, "learning_rate": 1.0702030972363963e-05, "loss": 0.1625, "num_input_tokens_seen": 2521880, "step": 4065 }, { "epoch": 7.254901960784314, "grad_norm": 0.5129882097244263, "learning_rate": 1.063829850972065e-05, "loss": 0.1871, "num_input_tokens_seen": 2525336, "step": 4070 }, { "epoch": 7.263814616755793, "grad_norm": 0.5441546440124512, "learning_rate": 1.0574705053401127e-05, "loss": 0.1591, "num_input_tokens_seen": 2528184, "step": 4075 }, { "epoch": 7.2727272727272725, "grad_norm": 0.42811569571495056, "learning_rate": 1.0511251218923868e-05, "loss": 0.1592, "num_input_tokens_seen": 2530904, "step": 4080 }, { "epoch": 7.281639928698752, "grad_norm": 0.43192997574806213, "learning_rate": 1.0447937620455964e-05, "loss": 0.178, "num_input_tokens_seen": 2533656, "step": 4085 }, { "epoch": 7.290552584670232, "grad_norm": 0.7238538265228271, "learning_rate": 1.0384764870807149e-05, "loss": 0.1817, "num_input_tokens_seen": 2535928, "step": 4090 }, { "epoch": 7.2994652406417115, "grad_norm": 0.4946947991847992, "learning_rate": 1.0321733581423884e-05, "loss": 0.1685, "num_input_tokens_seen": 2539352, "step": 4095 }, { "epoch": 7.308377896613191, "grad_norm": 0.5055748224258423, "learning_rate": 1.025884436238346e-05, "loss": 0.1722, "num_input_tokens_seen": 2542456, "step": 4100 }, { "epoch": 7.31729055258467, "grad_norm": 0.9246964454650879, "learning_rate": 1.0196097822388075e-05, "loss": 0.1772, "num_input_tokens_seen": 2545816, "step": 4105 }, { "epoch": 7.32620320855615, "grad_norm": 0.8303518891334534, "learning_rate": 1.013349456875892e-05, "loss": 0.1608, "num_input_tokens_seen": 2548824, "step": 4110 }, { "epoch": 7.335115864527629, "grad_norm": 0.5074154734611511, "learning_rate": 1.0071035207430352e-05, "loss": 0.1655, "num_input_tokens_seen": 2552152, "step": 4115 }, { "epoch": 7.3440285204991085, "grad_norm": 0.4153769910335541, "learning_rate": 1.0008720342943966e-05, "loss": 0.1643, "num_input_tokens_seen": 2555768, "step": 4120 }, { "epoch": 7.352941176470588, "grad_norm": 0.3799455165863037, "learning_rate": 9.94655057844281e-06, "loss": 0.1602, "num_input_tokens_seen": 2558328, "step": 4125 }, { "epoch": 7.361853832442068, "grad_norm": 0.6474289298057556, "learning_rate": 9.884526515665508e-06, "loss": 0.17, "num_input_tokens_seen": 2561368, "step": 4130 }, { "epoch": 7.3707664884135475, "grad_norm": 0.7523593902587891, "learning_rate": 9.822648754940431e-06, "loss": 0.156, "num_input_tokens_seen": 2564056, "step": 4135 }, { "epoch": 7.379679144385027, "grad_norm": 0.5380316972732544, "learning_rate": 9.760917895179894e-06, "loss": 0.1746, "num_input_tokens_seen": 2566744, "step": 4140 }, { "epoch": 7.388591800356506, "grad_norm": 1.0373018980026245, "learning_rate": 9.699334533874386e-06, "loss": 0.1959, "num_input_tokens_seen": 2569656, "step": 4145 }, { "epoch": 7.397504456327986, "grad_norm": 0.6027229428291321, "learning_rate": 9.637899267086758e-06, "loss": 0.1752, "num_input_tokens_seen": 2573112, "step": 4150 }, { "epoch": 7.406417112299465, "grad_norm": 0.5722499489784241, "learning_rate": 9.576612689446444e-06, "loss": 0.1712, "num_input_tokens_seen": 2576952, "step": 4155 }, { "epoch": 7.4153297682709445, "grad_norm": 0.5797430276870728, "learning_rate": 9.515475394143742e-06, "loss": 0.1445, "num_input_tokens_seen": 2579896, "step": 4160 }, { "epoch": 7.424242424242424, "grad_norm": 0.4454365670681, "learning_rate": 9.45448797292403e-06, "loss": 0.2141, "num_input_tokens_seen": 2583544, "step": 4165 }, { "epoch": 7.433155080213904, "grad_norm": 0.3823348879814148, "learning_rate": 9.393651016082083e-06, "loss": 0.154, "num_input_tokens_seen": 2586200, "step": 4170 }, { "epoch": 7.4420677361853835, "grad_norm": 0.44054359197616577, "learning_rate": 9.332965112456337e-06, "loss": 0.1803, "num_input_tokens_seen": 2589496, "step": 4175 }, { "epoch": 7.450980392156863, "grad_norm": 0.4444521963596344, "learning_rate": 9.272430849423174e-06, "loss": 0.1813, "num_input_tokens_seen": 2591928, "step": 4180 }, { "epoch": 7.459893048128342, "grad_norm": 0.6432741284370422, "learning_rate": 9.21204881289125e-06, "loss": 0.1793, "num_input_tokens_seen": 2595064, "step": 4185 }, { "epoch": 7.468805704099822, "grad_norm": 0.5586231350898743, "learning_rate": 9.151819587295845e-06, "loss": 0.162, "num_input_tokens_seen": 2597944, "step": 4190 }, { "epoch": 7.477718360071301, "grad_norm": 0.4838408827781677, "learning_rate": 9.09174375559319e-06, "loss": 0.1969, "num_input_tokens_seen": 2601656, "step": 4195 }, { "epoch": 7.4866310160427805, "grad_norm": 0.4085644483566284, "learning_rate": 9.031821899254796e-06, "loss": 0.1497, "num_input_tokens_seen": 2604472, "step": 4200 }, { "epoch": 7.49554367201426, "grad_norm": 0.3888384699821472, "learning_rate": 8.972054598261892e-06, "loss": 0.1631, "num_input_tokens_seen": 2607992, "step": 4205 }, { "epoch": 7.50445632798574, "grad_norm": 0.7054049372673035, "learning_rate": 8.912442431099724e-06, "loss": 0.1672, "num_input_tokens_seen": 2611800, "step": 4210 }, { "epoch": 7.5133689839572195, "grad_norm": 0.5162657499313354, "learning_rate": 8.852985974752045e-06, "loss": 0.1665, "num_input_tokens_seen": 2614936, "step": 4215 }, { "epoch": 7.5133689839572195, "eval_loss": 0.18536153435707092, "eval_runtime": 4.2538, "eval_samples_per_second": 58.536, "eval_steps_per_second": 14.81, "num_input_tokens_seen": 2614936, "step": 4215 }, { "epoch": 7.522281639928699, "grad_norm": 0.357683002948761, "learning_rate": 8.793685804695482e-06, "loss": 0.2229, "num_input_tokens_seen": 2618744, "step": 4220 }, { "epoch": 7.531194295900178, "grad_norm": 0.4619935154914856, "learning_rate": 8.734542494893955e-06, "loss": 0.1613, "num_input_tokens_seen": 2621496, "step": 4225 }, { "epoch": 7.540106951871658, "grad_norm": 0.5771064758300781, "learning_rate": 8.675556617793143e-06, "loss": 0.1607, "num_input_tokens_seen": 2624568, "step": 4230 }, { "epoch": 7.549019607843137, "grad_norm": 0.5340394377708435, "learning_rate": 8.616728744314956e-06, "loss": 0.1969, "num_input_tokens_seen": 2627832, "step": 4235 }, { "epoch": 7.5579322638146165, "grad_norm": 0.5918867588043213, "learning_rate": 8.558059443851998e-06, "loss": 0.1702, "num_input_tokens_seen": 2631160, "step": 4240 }, { "epoch": 7.566844919786096, "grad_norm": 0.4290253520011902, "learning_rate": 8.499549284262017e-06, "loss": 0.158, "num_input_tokens_seen": 2634488, "step": 4245 }, { "epoch": 7.575757575757576, "grad_norm": 0.6583709120750427, "learning_rate": 8.441198831862485e-06, "loss": 0.1691, "num_input_tokens_seen": 2637240, "step": 4250 }, { "epoch": 7.5846702317290555, "grad_norm": 0.6762195825576782, "learning_rate": 8.383008651425035e-06, "loss": 0.1565, "num_input_tokens_seen": 2639992, "step": 4255 }, { "epoch": 7.593582887700535, "grad_norm": 0.29171764850616455, "learning_rate": 8.32497930617006e-06, "loss": 0.1893, "num_input_tokens_seen": 2643832, "step": 4260 }, { "epoch": 7.602495543672014, "grad_norm": 0.4991152286529541, "learning_rate": 8.267111357761243e-06, "loss": 0.1343, "num_input_tokens_seen": 2646712, "step": 4265 }, { "epoch": 7.611408199643494, "grad_norm": 0.6517699360847473, "learning_rate": 8.209405366300088e-06, "loss": 0.1455, "num_input_tokens_seen": 2650072, "step": 4270 }, { "epoch": 7.620320855614973, "grad_norm": 1.1518526077270508, "learning_rate": 8.151861890320528e-06, "loss": 0.1928, "num_input_tokens_seen": 2653656, "step": 4275 }, { "epoch": 7.6292335115864525, "grad_norm": 0.7069615721702576, "learning_rate": 8.094481486783534e-06, "loss": 0.2059, "num_input_tokens_seen": 2657464, "step": 4280 }, { "epoch": 7.638146167557933, "grad_norm": 0.3675689697265625, "learning_rate": 8.0372647110717e-06, "loss": 0.1825, "num_input_tokens_seen": 2660568, "step": 4285 }, { "epoch": 7.647058823529412, "grad_norm": 0.5671415328979492, "learning_rate": 7.98021211698385e-06, "loss": 0.1507, "num_input_tokens_seen": 2663448, "step": 4290 }, { "epoch": 7.6559714795008915, "grad_norm": 0.5237590074539185, "learning_rate": 7.923324256729738e-06, "loss": 0.1794, "num_input_tokens_seen": 2666136, "step": 4295 }, { "epoch": 7.664884135472371, "grad_norm": 0.6967838406562805, "learning_rate": 7.866601680924633e-06, "loss": 0.183, "num_input_tokens_seen": 2669048, "step": 4300 }, { "epoch": 7.67379679144385, "grad_norm": 0.48244914412498474, "learning_rate": 7.810044938584038e-06, "loss": 0.1663, "num_input_tokens_seen": 2671800, "step": 4305 }, { "epoch": 7.68270944741533, "grad_norm": 0.5121620893478394, "learning_rate": 7.75365457711837e-06, "loss": 0.1757, "num_input_tokens_seen": 2675448, "step": 4310 }, { "epoch": 7.691622103386809, "grad_norm": 0.5723910331726074, "learning_rate": 7.697431142327632e-06, "loss": 0.1654, "num_input_tokens_seen": 2678392, "step": 4315 }, { "epoch": 7.7005347593582885, "grad_norm": 0.4338489770889282, "learning_rate": 7.641375178396151e-06, "loss": 0.1645, "num_input_tokens_seen": 2681112, "step": 4320 }, { "epoch": 7.709447415329768, "grad_norm": 0.5260465145111084, "learning_rate": 7.585487227887328e-06, "loss": 0.1636, "num_input_tokens_seen": 2684856, "step": 4325 }, { "epoch": 7.718360071301248, "grad_norm": 0.37905287742614746, "learning_rate": 7.529767831738366e-06, "loss": 0.1682, "num_input_tokens_seen": 2687576, "step": 4330 }, { "epoch": 7.7272727272727275, "grad_norm": 0.5463063716888428, "learning_rate": 7.474217529255018e-06, "loss": 0.1472, "num_input_tokens_seen": 2690328, "step": 4335 }, { "epoch": 7.736185383244207, "grad_norm": 0.640016496181488, "learning_rate": 7.4188368581064124e-06, "loss": 0.17, "num_input_tokens_seen": 2694168, "step": 4340 }, { "epoch": 7.745098039215686, "grad_norm": 0.42445164918899536, "learning_rate": 7.3636263543197945e-06, "loss": 0.1617, "num_input_tokens_seen": 2697208, "step": 4345 }, { "epoch": 7.754010695187166, "grad_norm": 1.0092363357543945, "learning_rate": 7.30858655227539e-06, "loss": 0.182, "num_input_tokens_seen": 2700376, "step": 4350 }, { "epoch": 7.762923351158645, "grad_norm": 0.2814575433731079, "learning_rate": 7.253717984701208e-06, "loss": 0.1667, "num_input_tokens_seen": 2703256, "step": 4355 }, { "epoch": 7.7718360071301245, "grad_norm": 0.5186646580696106, "learning_rate": 7.199021182667873e-06, "loss": 0.1594, "num_input_tokens_seen": 2705752, "step": 4360 }, { "epoch": 7.780748663101605, "grad_norm": 0.4522174000740051, "learning_rate": 7.1444966755834954e-06, "loss": 0.1373, "num_input_tokens_seen": 2708888, "step": 4365 }, { "epoch": 7.789661319073084, "grad_norm": 0.4952068328857422, "learning_rate": 7.0901449911885685e-06, "loss": 0.159, "num_input_tokens_seen": 2711576, "step": 4370 }, { "epoch": 7.7985739750445635, "grad_norm": 0.47718411684036255, "learning_rate": 7.035966655550838e-06, "loss": 0.1856, "num_input_tokens_seen": 2715000, "step": 4375 }, { "epoch": 7.807486631016043, "grad_norm": 0.5538311004638672, "learning_rate": 6.98196219306019e-06, "loss": 0.1708, "num_input_tokens_seen": 2717880, "step": 4380 }, { "epoch": 7.816399286987522, "grad_norm": 0.40867936611175537, "learning_rate": 6.928132126423636e-06, "loss": 0.1424, "num_input_tokens_seen": 2721240, "step": 4385 }, { "epoch": 7.825311942959002, "grad_norm": 0.579886257648468, "learning_rate": 6.8744769766601854e-06, "loss": 0.1844, "num_input_tokens_seen": 2724696, "step": 4390 }, { "epoch": 7.834224598930481, "grad_norm": 0.4526924788951874, "learning_rate": 6.820997263095849e-06, "loss": 0.1754, "num_input_tokens_seen": 2727960, "step": 4395 }, { "epoch": 7.8431372549019605, "grad_norm": 0.5530297756195068, "learning_rate": 6.767693503358608e-06, "loss": 0.1816, "num_input_tokens_seen": 2731000, "step": 4400 }, { "epoch": 7.85204991087344, "grad_norm": 0.3621399700641632, "learning_rate": 6.7145662133733715e-06, "loss": 0.1751, "num_input_tokens_seen": 2734264, "step": 4405 }, { "epoch": 7.86096256684492, "grad_norm": 0.5544110536575317, "learning_rate": 6.6616159073570135e-06, "loss": 0.1635, "num_input_tokens_seen": 2736664, "step": 4410 }, { "epoch": 7.8698752228163995, "grad_norm": 0.504298985004425, "learning_rate": 6.6088430978133914e-06, "loss": 0.1685, "num_input_tokens_seen": 2739672, "step": 4415 }, { "epoch": 7.878787878787879, "grad_norm": 0.45025068521499634, "learning_rate": 6.556248295528389e-06, "loss": 0.1576, "num_input_tokens_seen": 2742552, "step": 4420 }, { "epoch": 7.887700534759358, "grad_norm": 0.9994719624519348, "learning_rate": 6.5038320095649395e-06, "loss": 0.1938, "num_input_tokens_seen": 2745880, "step": 4425 }, { "epoch": 7.896613190730838, "grad_norm": 0.5288066267967224, "learning_rate": 6.451594747258155e-06, "loss": 0.1818, "num_input_tokens_seen": 2749912, "step": 4430 }, { "epoch": 7.905525846702317, "grad_norm": 0.5786968469619751, "learning_rate": 6.399537014210355e-06, "loss": 0.1757, "num_input_tokens_seen": 2753368, "step": 4435 }, { "epoch": 7.9144385026737964, "grad_norm": 0.3910267651081085, "learning_rate": 6.3476593142862275e-06, "loss": 0.1794, "num_input_tokens_seen": 2756568, "step": 4440 }, { "epoch": 7.923351158645277, "grad_norm": 1.0030827522277832, "learning_rate": 6.29596214960792e-06, "loss": 0.1752, "num_input_tokens_seen": 2759704, "step": 4445 }, { "epoch": 7.932263814616756, "grad_norm": 0.41212958097457886, "learning_rate": 6.244446020550182e-06, "loss": 0.1709, "num_input_tokens_seen": 2762584, "step": 4450 }, { "epoch": 7.9411764705882355, "grad_norm": 0.5541166067123413, "learning_rate": 6.193111425735515e-06, "loss": 0.1763, "num_input_tokens_seen": 2765752, "step": 4455 }, { "epoch": 7.950089126559715, "grad_norm": 0.6690767407417297, "learning_rate": 6.141958862029384e-06, "loss": 0.1624, "num_input_tokens_seen": 2768696, "step": 4460 }, { "epoch": 7.959001782531194, "grad_norm": 0.5791964530944824, "learning_rate": 6.090988824535374e-06, "loss": 0.1844, "num_input_tokens_seen": 2772120, "step": 4465 }, { "epoch": 7.967914438502674, "grad_norm": 0.40184465050697327, "learning_rate": 6.040201806590387e-06, "loss": 0.1918, "num_input_tokens_seen": 2775384, "step": 4470 }, { "epoch": 7.976827094474153, "grad_norm": 0.4650464951992035, "learning_rate": 5.989598299759919e-06, "loss": 0.1778, "num_input_tokens_seen": 2778520, "step": 4475 }, { "epoch": 7.9857397504456324, "grad_norm": 0.5422367453575134, "learning_rate": 5.939178793833233e-06, "loss": 0.1734, "num_input_tokens_seen": 2780888, "step": 4480 }, { "epoch": 7.994652406417112, "grad_norm": 0.5420627593994141, "learning_rate": 5.888943776818684e-06, "loss": 0.1781, "num_input_tokens_seen": 2784312, "step": 4485 }, { "epoch": 8.003565062388592, "grad_norm": 0.465055912733078, "learning_rate": 5.83889373493896e-06, "loss": 0.1861, "num_input_tokens_seen": 2787056, "step": 4490 }, { "epoch": 8.01247771836007, "grad_norm": 0.8877488970756531, "learning_rate": 5.789029152626374e-06, "loss": 0.1686, "num_input_tokens_seen": 2790288, "step": 4495 }, { "epoch": 8.014260249554367, "eval_loss": 0.18306031823158264, "eval_runtime": 4.2492, "eval_samples_per_second": 58.599, "eval_steps_per_second": 14.826, "num_input_tokens_seen": 2790832, "step": 4496 }, { "epoch": 8.02139037433155, "grad_norm": 0.3791468143463135, "learning_rate": 5.73935051251818e-06, "loss": 0.1626, "num_input_tokens_seen": 2793136, "step": 4500 }, { "epoch": 8.030303030303031, "grad_norm": 0.6450890302658081, "learning_rate": 5.689858295451914e-06, "loss": 0.1684, "num_input_tokens_seen": 2796464, "step": 4505 }, { "epoch": 8.03921568627451, "grad_norm": 0.36496949195861816, "learning_rate": 5.640552980460742e-06, "loss": 0.1524, "num_input_tokens_seen": 2799344, "step": 4510 }, { "epoch": 8.04812834224599, "grad_norm": 0.5503035187721252, "learning_rate": 5.591435044768783e-06, "loss": 0.1529, "num_input_tokens_seen": 2801648, "step": 4515 }, { "epoch": 8.057040998217468, "grad_norm": 0.4298340678215027, "learning_rate": 5.542504963786552e-06, "loss": 0.1769, "num_input_tokens_seen": 2804976, "step": 4520 }, { "epoch": 8.065953654188949, "grad_norm": 0.44245445728302, "learning_rate": 5.493763211106293e-06, "loss": 0.1543, "num_input_tokens_seen": 2807472, "step": 4525 }, { "epoch": 8.074866310160427, "grad_norm": 0.27881208062171936, "learning_rate": 5.4452102584974545e-06, "loss": 0.1436, "num_input_tokens_seen": 2810768, "step": 4530 }, { "epoch": 8.083778966131907, "grad_norm": 0.9025391340255737, "learning_rate": 5.396846575902095e-06, "loss": 0.1822, "num_input_tokens_seen": 2814480, "step": 4535 }, { "epoch": 8.092691622103386, "grad_norm": 0.33398008346557617, "learning_rate": 5.348672631430318e-06, "loss": 0.1551, "num_input_tokens_seen": 2817968, "step": 4540 }, { "epoch": 8.101604278074866, "grad_norm": 0.45554453134536743, "learning_rate": 5.300688891355765e-06, "loss": 0.1626, "num_input_tokens_seen": 2820784, "step": 4545 }, { "epoch": 8.110516934046347, "grad_norm": 0.38997194170951843, "learning_rate": 5.252895820111112e-06, "loss": 0.1377, "num_input_tokens_seen": 2823824, "step": 4550 }, { "epoch": 8.119429590017825, "grad_norm": 0.5823608040809631, "learning_rate": 5.205293880283552e-06, "loss": 0.1602, "num_input_tokens_seen": 2826832, "step": 4555 }, { "epoch": 8.128342245989305, "grad_norm": 0.6442610025405884, "learning_rate": 5.157883532610305e-06, "loss": 0.189, "num_input_tokens_seen": 2830256, "step": 4560 }, { "epoch": 8.137254901960784, "grad_norm": 0.6161116361618042, "learning_rate": 5.110665235974219e-06, "loss": 0.181, "num_input_tokens_seen": 2832848, "step": 4565 }, { "epoch": 8.146167557932264, "grad_norm": 0.5139124989509583, "learning_rate": 5.06363944739924e-06, "loss": 0.1593, "num_input_tokens_seen": 2835664, "step": 4570 }, { "epoch": 8.155080213903743, "grad_norm": 0.4244152903556824, "learning_rate": 5.0168066220460715e-06, "loss": 0.1533, "num_input_tokens_seen": 2838864, "step": 4575 }, { "epoch": 8.163992869875223, "grad_norm": 0.8236415386199951, "learning_rate": 4.97016721320773e-06, "loss": 0.1638, "num_input_tokens_seen": 2841840, "step": 4580 }, { "epoch": 8.172905525846703, "grad_norm": 0.6396406292915344, "learning_rate": 4.9237216723051485e-06, "loss": 0.1693, "num_input_tokens_seen": 2844976, "step": 4585 }, { "epoch": 8.181818181818182, "grad_norm": 0.41378054022789, "learning_rate": 4.877470448882815e-06, "loss": 0.1585, "num_input_tokens_seen": 2847856, "step": 4590 }, { "epoch": 8.190730837789662, "grad_norm": 0.5032555460929871, "learning_rate": 4.831413990604447e-06, "loss": 0.1465, "num_input_tokens_seen": 2850192, "step": 4595 }, { "epoch": 8.19964349376114, "grad_norm": 0.4285055994987488, "learning_rate": 4.7855527432486336e-06, "loss": 0.1517, "num_input_tokens_seen": 2853008, "step": 4600 }, { "epoch": 8.20855614973262, "grad_norm": 0.5328398942947388, "learning_rate": 4.739887150704508e-06, "loss": 0.2001, "num_input_tokens_seen": 2856464, "step": 4605 }, { "epoch": 8.2174688057041, "grad_norm": 0.45751845836639404, "learning_rate": 4.694417654967492e-06, "loss": 0.1507, "num_input_tokens_seen": 2858864, "step": 4610 }, { "epoch": 8.22638146167558, "grad_norm": 0.44036829471588135, "learning_rate": 4.649144696134972e-06, "loss": 0.1711, "num_input_tokens_seen": 2861488, "step": 4615 }, { "epoch": 8.235294117647058, "grad_norm": 0.4446769654750824, "learning_rate": 4.6040687124020794e-06, "loss": 0.168, "num_input_tokens_seen": 2865136, "step": 4620 }, { "epoch": 8.244206773618538, "grad_norm": 0.6855089068412781, "learning_rate": 4.5591901400574285e-06, "loss": 0.1646, "num_input_tokens_seen": 2867984, "step": 4625 }, { "epoch": 8.253119429590019, "grad_norm": 0.6599955558776855, "learning_rate": 4.514509413478888e-06, "loss": 0.1795, "num_input_tokens_seen": 2871088, "step": 4630 }, { "epoch": 8.262032085561497, "grad_norm": 0.42294609546661377, "learning_rate": 4.470026965129384e-06, "loss": 0.1433, "num_input_tokens_seen": 2874352, "step": 4635 }, { "epoch": 8.270944741532977, "grad_norm": 0.4342804551124573, "learning_rate": 4.425743225552731e-06, "loss": 0.1762, "num_input_tokens_seen": 2877840, "step": 4640 }, { "epoch": 8.279857397504456, "grad_norm": 0.5680054426193237, "learning_rate": 4.381658623369445e-06, "loss": 0.1532, "num_input_tokens_seen": 2881456, "step": 4645 }, { "epoch": 8.288770053475936, "grad_norm": 0.5137624740600586, "learning_rate": 4.337773585272581e-06, "loss": 0.1694, "num_input_tokens_seen": 2884400, "step": 4650 }, { "epoch": 8.297682709447415, "grad_norm": 0.3794878125190735, "learning_rate": 4.294088536023652e-06, "loss": 0.1475, "num_input_tokens_seen": 2887536, "step": 4655 }, { "epoch": 8.306595365418895, "grad_norm": 0.6075329184532166, "learning_rate": 4.250603898448455e-06, "loss": 0.1811, "num_input_tokens_seen": 2890352, "step": 4660 }, { "epoch": 8.315508021390375, "grad_norm": 0.45767733454704285, "learning_rate": 4.2073200934330315e-06, "loss": 0.1871, "num_input_tokens_seen": 2893520, "step": 4665 }, { "epoch": 8.324420677361854, "grad_norm": 0.46819356083869934, "learning_rate": 4.164237539919577e-06, "loss": 0.1842, "num_input_tokens_seen": 2896048, "step": 4670 }, { "epoch": 8.333333333333334, "grad_norm": 0.5235320329666138, "learning_rate": 4.121356654902364e-06, "loss": 0.164, "num_input_tokens_seen": 2899472, "step": 4675 }, { "epoch": 8.342245989304812, "grad_norm": 0.8180021047592163, "learning_rate": 4.078677853423724e-06, "loss": 0.1573, "num_input_tokens_seen": 2902832, "step": 4680 }, { "epoch": 8.351158645276293, "grad_norm": 0.9956904649734497, "learning_rate": 4.036201548570049e-06, "loss": 0.2367, "num_input_tokens_seen": 2906576, "step": 4685 }, { "epoch": 8.360071301247771, "grad_norm": 0.6165153980255127, "learning_rate": 3.993928151467766e-06, "loss": 0.1987, "num_input_tokens_seen": 2909840, "step": 4690 }, { "epoch": 8.368983957219251, "grad_norm": 0.48898622393608093, "learning_rate": 3.951858071279352e-06, "loss": 0.1454, "num_input_tokens_seen": 2912752, "step": 4695 }, { "epoch": 8.37789661319073, "grad_norm": 0.48024001717567444, "learning_rate": 3.909991715199412e-06, "loss": 0.1633, "num_input_tokens_seen": 2915024, "step": 4700 }, { "epoch": 8.38680926916221, "grad_norm": 0.4968958795070648, "learning_rate": 3.8683294884506945e-06, "loss": 0.1655, "num_input_tokens_seen": 2918480, "step": 4705 }, { "epoch": 8.39572192513369, "grad_norm": 0.5491753220558167, "learning_rate": 3.826871794280193e-06, "loss": 0.1729, "num_input_tokens_seen": 2921712, "step": 4710 }, { "epoch": 8.404634581105169, "grad_norm": 0.5808373093605042, "learning_rate": 3.7856190339552513e-06, "loss": 0.1851, "num_input_tokens_seen": 2925040, "step": 4715 }, { "epoch": 8.41354723707665, "grad_norm": 0.9629413485527039, "learning_rate": 3.7445716067596503e-06, "loss": 0.1578, "num_input_tokens_seen": 2928112, "step": 4720 }, { "epoch": 8.422459893048128, "grad_norm": 0.8614413142204285, "learning_rate": 3.7037299099897586e-06, "loss": 0.1865, "num_input_tokens_seen": 2932368, "step": 4725 }, { "epoch": 8.431372549019608, "grad_norm": 0.5639718770980835, "learning_rate": 3.663094338950704e-06, "loss": 0.1738, "num_input_tokens_seen": 2935088, "step": 4730 }, { "epoch": 8.440285204991087, "grad_norm": 0.5123082995414734, "learning_rate": 3.6226652869525285e-06, "loss": 0.1471, "num_input_tokens_seen": 2937840, "step": 4735 }, { "epoch": 8.449197860962567, "grad_norm": 0.5894414186477661, "learning_rate": 3.5824431453063662e-06, "loss": 0.1638, "num_input_tokens_seen": 2941008, "step": 4740 }, { "epoch": 8.458110516934047, "grad_norm": 0.34330514073371887, "learning_rate": 3.5424283033207024e-06, "loss": 0.1672, "num_input_tokens_seen": 2944464, "step": 4745 }, { "epoch": 8.467023172905526, "grad_norm": 0.37955033779144287, "learning_rate": 3.5026211482975497e-06, "loss": 0.1584, "num_input_tokens_seen": 2947376, "step": 4750 }, { "epoch": 8.475935828877006, "grad_norm": 0.9495477080345154, "learning_rate": 3.463022065528748e-06, "loss": 0.1767, "num_input_tokens_seen": 2950480, "step": 4755 }, { "epoch": 8.484848484848484, "grad_norm": 0.3263673782348633, "learning_rate": 3.4236314382922103e-06, "loss": 0.1429, "num_input_tokens_seen": 2953392, "step": 4760 }, { "epoch": 8.493761140819965, "grad_norm": 0.5537719130516052, "learning_rate": 3.3844496478482064e-06, "loss": 0.1588, "num_input_tokens_seen": 2956272, "step": 4765 }, { "epoch": 8.502673796791443, "grad_norm": 0.30169588327407837, "learning_rate": 3.345477073435685e-06, "loss": 0.167, "num_input_tokens_seen": 2959056, "step": 4770 }, { "epoch": 8.511586452762923, "grad_norm": 0.5430099964141846, "learning_rate": 3.3067140922686174e-06, "loss": 0.1655, "num_input_tokens_seen": 2962480, "step": 4775 }, { "epoch": 8.515151515151516, "eval_loss": 0.1827203780412674, "eval_runtime": 4.2534, "eval_samples_per_second": 58.541, "eval_steps_per_second": 14.812, "num_input_tokens_seen": 2963888, "step": 4777 }, { "epoch": 8.520499108734402, "grad_norm": 0.44720202684402466, "learning_rate": 3.268161079532317e-06, "loss": 0.1494, "num_input_tokens_seen": 2965360, "step": 4780 }, { "epoch": 8.529411764705882, "grad_norm": 0.3062620162963867, "learning_rate": 3.22981840837982e-06, "loss": 0.1712, "num_input_tokens_seen": 2968464, "step": 4785 }, { "epoch": 8.538324420677363, "grad_norm": 0.9861251711845398, "learning_rate": 3.1916864499282856e-06, "loss": 0.1779, "num_input_tokens_seen": 2972144, "step": 4790 }, { "epoch": 8.547237076648841, "grad_norm": 0.43644168972969055, "learning_rate": 3.1537655732553768e-06, "loss": 0.1509, "num_input_tokens_seen": 2974384, "step": 4795 }, { "epoch": 8.556149732620321, "grad_norm": 0.5110581517219543, "learning_rate": 3.1160561453957183e-06, "loss": 0.1578, "num_input_tokens_seen": 2977104, "step": 4800 }, { "epoch": 8.5650623885918, "grad_norm": 0.5604438781738281, "learning_rate": 3.078558531337336e-06, "loss": 0.1694, "num_input_tokens_seen": 2980464, "step": 4805 }, { "epoch": 8.57397504456328, "grad_norm": 0.5687141418457031, "learning_rate": 3.0412730940181015e-06, "loss": 0.1643, "num_input_tokens_seen": 2983248, "step": 4810 }, { "epoch": 8.582887700534759, "grad_norm": 0.9281808137893677, "learning_rate": 3.0042001943222376e-06, "loss": 0.165, "num_input_tokens_seen": 2986256, "step": 4815 }, { "epoch": 8.591800356506239, "grad_norm": 0.6919686794281006, "learning_rate": 2.967340191076834e-06, "loss": 0.1902, "num_input_tokens_seen": 2990256, "step": 4820 }, { "epoch": 8.60071301247772, "grad_norm": 0.7080613374710083, "learning_rate": 2.930693441048371e-06, "loss": 0.149, "num_input_tokens_seen": 2992592, "step": 4825 }, { "epoch": 8.609625668449198, "grad_norm": 0.5117068886756897, "learning_rate": 2.8942602989392386e-06, "loss": 0.174, "num_input_tokens_seen": 2995888, "step": 4830 }, { "epoch": 8.618538324420678, "grad_norm": 0.37796565890312195, "learning_rate": 2.858041117384341e-06, "loss": 0.148, "num_input_tokens_seen": 2999280, "step": 4835 }, { "epoch": 8.627450980392156, "grad_norm": 0.6607238054275513, "learning_rate": 2.8220362469476624e-06, "loss": 0.1541, "num_input_tokens_seen": 3002864, "step": 4840 }, { "epoch": 8.636363636363637, "grad_norm": 0.4288221001625061, "learning_rate": 2.7862460361188614e-06, "loss": 0.1521, "num_input_tokens_seen": 3004944, "step": 4845 }, { "epoch": 8.645276292335115, "grad_norm": 0.49076348543167114, "learning_rate": 2.750670831309957e-06, "loss": 0.1682, "num_input_tokens_seen": 3008464, "step": 4850 }, { "epoch": 8.654188948306595, "grad_norm": 0.615407407283783, "learning_rate": 2.7153109768518925e-06, "loss": 0.171, "num_input_tokens_seen": 3012240, "step": 4855 }, { "epoch": 8.663101604278076, "grad_norm": 0.5121405124664307, "learning_rate": 2.680166814991256e-06, "loss": 0.1606, "num_input_tokens_seen": 3015056, "step": 4860 }, { "epoch": 8.672014260249554, "grad_norm": 0.7262160778045654, "learning_rate": 2.645238685886961e-06, "loss": 0.2009, "num_input_tokens_seen": 3018160, "step": 4865 }, { "epoch": 8.680926916221035, "grad_norm": 0.5012710690498352, "learning_rate": 2.6105269276069573e-06, "loss": 0.1641, "num_input_tokens_seen": 3021392, "step": 4870 }, { "epoch": 8.689839572192513, "grad_norm": 0.681621789932251, "learning_rate": 2.5760318761249263e-06, "loss": 0.1751, "num_input_tokens_seen": 3024240, "step": 4875 }, { "epoch": 8.698752228163993, "grad_norm": 0.4795394539833069, "learning_rate": 2.541753865317076e-06, "loss": 0.171, "num_input_tokens_seen": 3026800, "step": 4880 }, { "epoch": 8.707664884135472, "grad_norm": 0.4269944429397583, "learning_rate": 2.507693226958871e-06, "loss": 0.1673, "num_input_tokens_seen": 3029968, "step": 4885 }, { "epoch": 8.716577540106952, "grad_norm": 0.6113168597221375, "learning_rate": 2.473850290721838e-06, "loss": 0.1568, "num_input_tokens_seen": 3032656, "step": 4890 }, { "epoch": 8.72549019607843, "grad_norm": 0.5832796692848206, "learning_rate": 2.4402253841703914e-06, "loss": 0.1645, "num_input_tokens_seen": 3035376, "step": 4895 }, { "epoch": 8.73440285204991, "grad_norm": 0.4533407986164093, "learning_rate": 2.4068188327586257e-06, "loss": 0.1798, "num_input_tokens_seen": 3038512, "step": 4900 }, { "epoch": 8.743315508021391, "grad_norm": 0.6923168897628784, "learning_rate": 2.373630959827186e-06, "loss": 0.161, "num_input_tokens_seen": 3041744, "step": 4905 }, { "epoch": 8.75222816399287, "grad_norm": 0.5411429405212402, "learning_rate": 2.3406620866001485e-06, "loss": 0.1696, "num_input_tokens_seen": 3045232, "step": 4910 }, { "epoch": 8.76114081996435, "grad_norm": 0.40592697262763977, "learning_rate": 2.3079125321818996e-06, "loss": 0.1636, "num_input_tokens_seen": 3047728, "step": 4915 }, { "epoch": 8.770053475935828, "grad_norm": 0.7785168886184692, "learning_rate": 2.275382613554031e-06, "loss": 0.1534, "num_input_tokens_seen": 3050864, "step": 4920 }, { "epoch": 8.778966131907309, "grad_norm": 0.46840912103652954, "learning_rate": 2.2430726455723113e-06, "loss": 0.1651, "num_input_tokens_seen": 3053680, "step": 4925 }, { "epoch": 8.787878787878787, "grad_norm": 0.5858107209205627, "learning_rate": 2.210982940963596e-06, "loss": 0.1632, "num_input_tokens_seen": 3057136, "step": 4930 }, { "epoch": 8.796791443850267, "grad_norm": 0.8381409049034119, "learning_rate": 2.1791138103228275e-06, "loss": 0.1736, "num_input_tokens_seen": 3060144, "step": 4935 }, { "epoch": 8.805704099821746, "grad_norm": 0.4155525863170624, "learning_rate": 2.1474655621100347e-06, "loss": 0.1759, "num_input_tokens_seen": 3063024, "step": 4940 }, { "epoch": 8.814616755793226, "grad_norm": 0.7829816937446594, "learning_rate": 2.116038502647319e-06, "loss": 0.1736, "num_input_tokens_seen": 3066320, "step": 4945 }, { "epoch": 8.823529411764707, "grad_norm": 0.44637227058410645, "learning_rate": 2.084832936115902e-06, "loss": 0.1513, "num_input_tokens_seen": 3069296, "step": 4950 }, { "epoch": 8.832442067736185, "grad_norm": 0.49461662769317627, "learning_rate": 2.0538491645531982e-06, "loss": 0.1745, "num_input_tokens_seen": 3071888, "step": 4955 }, { "epoch": 8.841354723707665, "grad_norm": 0.5589842200279236, "learning_rate": 2.0230874878498648e-06, "loss": 0.2835, "num_input_tokens_seen": 3075984, "step": 4960 }, { "epoch": 8.850267379679144, "grad_norm": 0.544204592704773, "learning_rate": 1.9925482037469188e-06, "loss": 0.1654, "num_input_tokens_seen": 3079152, "step": 4965 }, { "epoch": 8.859180035650624, "grad_norm": 0.5478450059890747, "learning_rate": 1.9622316078328566e-06, "loss": 0.1682, "num_input_tokens_seen": 3082544, "step": 4970 }, { "epoch": 8.868092691622103, "grad_norm": 0.5605227947235107, "learning_rate": 1.9321379935407697e-06, "loss": 0.145, "num_input_tokens_seen": 3085680, "step": 4975 }, { "epoch": 8.877005347593583, "grad_norm": 0.5030500292778015, "learning_rate": 1.9022676521455117e-06, "loss": 0.1795, "num_input_tokens_seen": 3089392, "step": 4980 }, { "epoch": 8.885918003565063, "grad_norm": 0.6063732504844666, "learning_rate": 1.8726208727609219e-06, "loss": 0.1604, "num_input_tokens_seen": 3092656, "step": 4985 }, { "epoch": 8.894830659536542, "grad_norm": 0.6032387018203735, "learning_rate": 1.8431979423369604e-06, "loss": 0.1646, "num_input_tokens_seen": 3095600, "step": 4990 }, { "epoch": 8.903743315508022, "grad_norm": 0.4930381774902344, "learning_rate": 1.8139991456569694e-06, "loss": 0.1622, "num_input_tokens_seen": 3098320, "step": 4995 }, { "epoch": 8.9126559714795, "grad_norm": 0.8425898551940918, "learning_rate": 1.7850247653349223e-06, "loss": 0.1554, "num_input_tokens_seen": 3101520, "step": 5000 }, { "epoch": 8.92156862745098, "grad_norm": 0.6207576394081116, "learning_rate": 1.7562750818126556e-06, "loss": 0.1733, "num_input_tokens_seen": 3104816, "step": 5005 }, { "epoch": 8.93048128342246, "grad_norm": 0.5085470676422119, "learning_rate": 1.727750373357187e-06, "loss": 0.1686, "num_input_tokens_seen": 3108176, "step": 5010 }, { "epoch": 8.93939393939394, "grad_norm": 0.4193607568740845, "learning_rate": 1.699450916058018e-06, "loss": 0.1473, "num_input_tokens_seen": 3111248, "step": 5015 }, { "epoch": 8.94830659536542, "grad_norm": 0.3501569330692291, "learning_rate": 1.6713769838244325e-06, "loss": 0.154, "num_input_tokens_seen": 3114224, "step": 5020 }, { "epoch": 8.957219251336898, "grad_norm": 0.40926966071128845, "learning_rate": 1.6435288483828748e-06, "loss": 0.1529, "num_input_tokens_seen": 3117232, "step": 5025 }, { "epoch": 8.966131907308379, "grad_norm": 0.3181830644607544, "learning_rate": 1.615906779274326e-06, "loss": 0.2044, "num_input_tokens_seen": 3120240, "step": 5030 }, { "epoch": 8.975044563279857, "grad_norm": 0.9511982798576355, "learning_rate": 1.588511043851662e-06, "loss": 0.2427, "num_input_tokens_seen": 3123792, "step": 5035 }, { "epoch": 8.983957219251337, "grad_norm": 0.3971862494945526, "learning_rate": 1.5613419072770864e-06, "loss": 0.1803, "num_input_tokens_seen": 3127184, "step": 5040 }, { "epoch": 8.992869875222816, "grad_norm": 0.527430534362793, "learning_rate": 1.534399632519573e-06, "loss": 0.1621, "num_input_tokens_seen": 3130480, "step": 5045 }, { "epoch": 9.001782531194296, "grad_norm": 0.4454513490200043, "learning_rate": 1.5076844803522922e-06, "loss": 0.1472, "num_input_tokens_seen": 3132712, "step": 5050 }, { "epoch": 9.010695187165775, "grad_norm": 0.8424109816551208, "learning_rate": 1.4811967093501189e-06, "loss": 0.1594, "num_input_tokens_seen": 3135400, "step": 5055 }, { "epoch": 9.016042780748663, "eval_loss": 0.18196314573287964, "eval_runtime": 4.2599, "eval_samples_per_second": 58.452, "eval_steps_per_second": 14.789, "num_input_tokens_seen": 3137352, "step": 5058 }, { "epoch": 9.019607843137255, "grad_norm": 0.8189364075660706, "learning_rate": 1.4549365758871142e-06, "loss": 0.1552, "num_input_tokens_seen": 3138248, "step": 5060 }, { "epoch": 9.028520499108735, "grad_norm": 0.40512701869010925, "learning_rate": 1.4289043341340375e-06, "loss": 0.1724, "num_input_tokens_seen": 3141480, "step": 5065 }, { "epoch": 9.037433155080214, "grad_norm": 0.5652516484260559, "learning_rate": 1.4031002360558849e-06, "loss": 0.1694, "num_input_tokens_seen": 3144904, "step": 5070 }, { "epoch": 9.046345811051694, "grad_norm": 0.5365282893180847, "learning_rate": 1.377524531409491e-06, "loss": 0.1725, "num_input_tokens_seen": 3148968, "step": 5075 }, { "epoch": 9.055258467023172, "grad_norm": 0.3831281065940857, "learning_rate": 1.3521774677410476e-06, "loss": 0.1522, "num_input_tokens_seen": 3151912, "step": 5080 }, { "epoch": 9.064171122994653, "grad_norm": 0.4094650149345398, "learning_rate": 1.3270592903837503e-06, "loss": 0.1649, "num_input_tokens_seen": 3155080, "step": 5085 }, { "epoch": 9.073083778966131, "grad_norm": 0.7728195786476135, "learning_rate": 1.3021702424554221e-06, "loss": 0.1512, "num_input_tokens_seen": 3157768, "step": 5090 }, { "epoch": 9.081996434937611, "grad_norm": 0.6765234470367432, "learning_rate": 1.2775105648561352e-06, "loss": 0.1841, "num_input_tokens_seen": 3161224, "step": 5095 }, { "epoch": 9.090909090909092, "grad_norm": 0.5181841254234314, "learning_rate": 1.2530804962659098e-06, "loss": 0.1716, "num_input_tokens_seen": 3163944, "step": 5100 }, { "epoch": 9.09982174688057, "grad_norm": 0.8874284625053406, "learning_rate": 1.2288802731423883e-06, "loss": 0.176, "num_input_tokens_seen": 3166728, "step": 5105 }, { "epoch": 9.10873440285205, "grad_norm": 0.6627284288406372, "learning_rate": 1.2049101297185422e-06, "loss": 0.1661, "num_input_tokens_seen": 3170120, "step": 5110 }, { "epoch": 9.117647058823529, "grad_norm": 0.7040612101554871, "learning_rate": 1.1811702980004058e-06, "loss": 0.1486, "num_input_tokens_seen": 3173000, "step": 5115 }, { "epoch": 9.12655971479501, "grad_norm": 0.6169217228889465, "learning_rate": 1.1576610077648513e-06, "loss": 0.1868, "num_input_tokens_seen": 3176520, "step": 5120 }, { "epoch": 9.135472370766488, "grad_norm": 0.464032381772995, "learning_rate": 1.134382486557342e-06, "loss": 0.1539, "num_input_tokens_seen": 3179496, "step": 5125 }, { "epoch": 9.144385026737968, "grad_norm": 0.679073691368103, "learning_rate": 1.1113349596897331e-06, "loss": 0.1429, "num_input_tokens_seen": 3182248, "step": 5130 }, { "epoch": 9.153297682709447, "grad_norm": 0.32752713561058044, "learning_rate": 1.0885186502381017e-06, "loss": 0.154, "num_input_tokens_seen": 3184840, "step": 5135 }, { "epoch": 9.162210338680927, "grad_norm": 0.6518117189407349, "learning_rate": 1.0659337790405704e-06, "loss": 0.1727, "num_input_tokens_seen": 3187720, "step": 5140 }, { "epoch": 9.171122994652407, "grad_norm": 0.6068860292434692, "learning_rate": 1.0435805646951958e-06, "loss": 0.1512, "num_input_tokens_seen": 3190536, "step": 5145 }, { "epoch": 9.180035650623886, "grad_norm": 0.42867806553840637, "learning_rate": 1.0214592235578274e-06, "loss": 0.162, "num_input_tokens_seen": 3193608, "step": 5150 }, { "epoch": 9.188948306595366, "grad_norm": 0.49051374197006226, "learning_rate": 9.995699697400247e-07, "loss": 0.181, "num_input_tokens_seen": 3196936, "step": 5155 }, { "epoch": 9.197860962566844, "grad_norm": 0.5725313425064087, "learning_rate": 9.77913015106982e-07, "loss": 0.1708, "num_input_tokens_seen": 3200040, "step": 5160 }, { "epoch": 9.206773618538325, "grad_norm": 0.9723972082138062, "learning_rate": 9.564885692754793e-07, "loss": 0.1814, "num_input_tokens_seen": 3203240, "step": 5165 }, { "epoch": 9.215686274509803, "grad_norm": 0.506613552570343, "learning_rate": 9.352968396118628e-07, "loss": 0.1726, "num_input_tokens_seen": 3206376, "step": 5170 }, { "epoch": 9.224598930481283, "grad_norm": 0.6921798586845398, "learning_rate": 9.143380312300137e-07, "loss": 0.1543, "num_input_tokens_seen": 3209480, "step": 5175 }, { "epoch": 9.233511586452764, "grad_norm": 0.5370962023735046, "learning_rate": 8.936123469893892e-07, "loss": 0.2448, "num_input_tokens_seen": 3213448, "step": 5180 }, { "epoch": 9.242424242424242, "grad_norm": 0.6006255745887756, "learning_rate": 8.731199874930374e-07, "loss": 0.1604, "num_input_tokens_seen": 3216776, "step": 5185 }, { "epoch": 9.251336898395722, "grad_norm": 0.5161803960800171, "learning_rate": 8.528611510856766e-07, "loss": 0.1543, "num_input_tokens_seen": 3219752, "step": 5190 }, { "epoch": 9.260249554367201, "grad_norm": 0.5216704607009888, "learning_rate": 8.328360338517583e-07, "loss": 0.1659, "num_input_tokens_seen": 3223048, "step": 5195 }, { "epoch": 9.269162210338681, "grad_norm": 0.43477028608322144, "learning_rate": 8.130448296135768e-07, "loss": 0.1847, "num_input_tokens_seen": 3226984, "step": 5200 }, { "epoch": 9.27807486631016, "grad_norm": 0.5066149234771729, "learning_rate": 7.934877299293875e-07, "loss": 0.1806, "num_input_tokens_seen": 3230088, "step": 5205 }, { "epoch": 9.28698752228164, "grad_norm": 0.9408987760543823, "learning_rate": 7.741649240915666e-07, "loss": 0.1692, "num_input_tokens_seen": 3232840, "step": 5210 }, { "epoch": 9.29590017825312, "grad_norm": 0.41510528326034546, "learning_rate": 7.550765991247654e-07, "loss": 0.144, "num_input_tokens_seen": 3235944, "step": 5215 }, { "epoch": 9.304812834224599, "grad_norm": 0.5157932043075562, "learning_rate": 7.362229397840981e-07, "loss": 0.1744, "num_input_tokens_seen": 3238728, "step": 5220 }, { "epoch": 9.313725490196079, "grad_norm": 0.44517961144447327, "learning_rate": 7.17604128553373e-07, "loss": 0.1478, "num_input_tokens_seen": 3241256, "step": 5225 }, { "epoch": 9.322638146167558, "grad_norm": 0.6294628977775574, "learning_rate": 6.992203456432977e-07, "loss": 0.1887, "num_input_tokens_seen": 3244680, "step": 5230 }, { "epoch": 9.331550802139038, "grad_norm": 0.3271355628967285, "learning_rate": 6.810717689897633e-07, "loss": 0.1474, "num_input_tokens_seen": 3247560, "step": 5235 }, { "epoch": 9.340463458110516, "grad_norm": 0.5900879502296448, "learning_rate": 6.631585742521068e-07, "loss": 0.1654, "num_input_tokens_seen": 3251176, "step": 5240 }, { "epoch": 9.349376114081997, "grad_norm": 1.2029948234558105, "learning_rate": 6.454809348114044e-07, "loss": 0.1985, "num_input_tokens_seen": 3254152, "step": 5245 }, { "epoch": 9.358288770053475, "grad_norm": 0.7293168902397156, "learning_rate": 6.280390217688114e-07, "loss": 0.1636, "num_input_tokens_seen": 3256744, "step": 5250 }, { "epoch": 9.367201426024955, "grad_norm": 0.28766605257987976, "learning_rate": 6.108330039438892e-07, "loss": 0.1729, "num_input_tokens_seen": 3259400, "step": 5255 }, { "epoch": 9.376114081996436, "grad_norm": 0.7399141788482666, "learning_rate": 5.938630478729917e-07, "loss": 0.1547, "num_input_tokens_seen": 3262728, "step": 5260 }, { "epoch": 9.385026737967914, "grad_norm": 0.45791682600975037, "learning_rate": 5.771293178076286e-07, "loss": 0.1693, "num_input_tokens_seen": 3266376, "step": 5265 }, { "epoch": 9.393939393939394, "grad_norm": 0.6668148636817932, "learning_rate": 5.606319757128914e-07, "loss": 0.169, "num_input_tokens_seen": 3268808, "step": 5270 }, { "epoch": 9.402852049910873, "grad_norm": 0.580091655254364, "learning_rate": 5.443711812658792e-07, "loss": 0.174, "num_input_tokens_seen": 3272008, "step": 5275 }, { "epoch": 9.411764705882353, "grad_norm": 0.47462576627731323, "learning_rate": 5.283470918541616e-07, "loss": 0.1395, "num_input_tokens_seen": 3274920, "step": 5280 }, { "epoch": 9.420677361853832, "grad_norm": 0.4406573474407196, "learning_rate": 5.125598625742523e-07, "loss": 0.1781, "num_input_tokens_seen": 3278376, "step": 5285 }, { "epoch": 9.429590017825312, "grad_norm": 0.4939647614955902, "learning_rate": 4.970096462300927e-07, "loss": 0.1745, "num_input_tokens_seen": 3281704, "step": 5290 }, { "epoch": 9.43850267379679, "grad_norm": 0.3747076988220215, "learning_rate": 4.816965933315987e-07, "loss": 0.1692, "num_input_tokens_seen": 3285256, "step": 5295 }, { "epoch": 9.44741532976827, "grad_norm": 0.5448613166809082, "learning_rate": 4.6662085209318305e-07, "loss": 0.1651, "num_input_tokens_seen": 3288616, "step": 5300 }, { "epoch": 9.456327985739751, "grad_norm": 0.5583840608596802, "learning_rate": 4.517825684323324e-07, "loss": 0.1549, "num_input_tokens_seen": 3291752, "step": 5305 }, { "epoch": 9.46524064171123, "grad_norm": 0.4584488272666931, "learning_rate": 4.3718188596819086e-07, "loss": 0.1519, "num_input_tokens_seen": 3294344, "step": 5310 }, { "epoch": 9.47415329768271, "grad_norm": 0.6175810694694519, "learning_rate": 4.228189460201676e-07, "loss": 0.1706, "num_input_tokens_seen": 3297512, "step": 5315 }, { "epoch": 9.483065953654188, "grad_norm": 0.5118115544319153, "learning_rate": 4.086938876065732e-07, "loss": 0.1538, "num_input_tokens_seen": 3300296, "step": 5320 }, { "epoch": 9.491978609625669, "grad_norm": 0.5376412868499756, "learning_rate": 3.948068474432715e-07, "loss": 0.274, "num_input_tokens_seen": 3304360, "step": 5325 }, { "epoch": 9.500891265597147, "grad_norm": 0.5221200585365295, "learning_rate": 3.8115795994236313e-07, "loss": 0.1658, "num_input_tokens_seen": 3307304, "step": 5330 }, { "epoch": 9.509803921568627, "grad_norm": 0.4227612316608429, "learning_rate": 3.6774735721087085e-07, "loss": 0.1618, "num_input_tokens_seen": 3310536, "step": 5335 }, { "epoch": 9.516934046345812, "eval_loss": 0.183439701795578, "eval_runtime": 4.2535, "eval_samples_per_second": 58.539, "eval_steps_per_second": 14.811, "num_input_tokens_seen": 3312648, "step": 5339 }, { "epoch": 9.518716577540108, "grad_norm": 0.601445734500885, "learning_rate": 3.5457516904947587e-07, "loss": 0.1771, "num_input_tokens_seen": 3313672, "step": 5340 }, { "epoch": 9.527629233511586, "grad_norm": 0.5191211700439453, "learning_rate": 3.416415229512443e-07, "loss": 0.1688, "num_input_tokens_seen": 3317224, "step": 5345 }, { "epoch": 9.536541889483066, "grad_norm": 0.6869432330131531, "learning_rate": 3.2894654410041417e-07, "loss": 0.1661, "num_input_tokens_seen": 3319848, "step": 5350 }, { "epoch": 9.545454545454545, "grad_norm": 0.905884325504303, "learning_rate": 3.1649035537117123e-07, "loss": 0.1521, "num_input_tokens_seen": 3322664, "step": 5355 }, { "epoch": 9.554367201426025, "grad_norm": 0.5753766894340515, "learning_rate": 3.042730773264557e-07, "loss": 0.1512, "num_input_tokens_seen": 3325928, "step": 5360 }, { "epoch": 9.563279857397504, "grad_norm": 0.5148957967758179, "learning_rate": 2.9229482821680197e-07, "loss": 0.1496, "num_input_tokens_seen": 3328680, "step": 5365 }, { "epoch": 9.572192513368984, "grad_norm": 0.47426876425743103, "learning_rate": 2.8055572397919784e-07, "loss": 0.152, "num_input_tokens_seen": 3331976, "step": 5370 }, { "epoch": 9.581105169340464, "grad_norm": 0.5953306555747986, "learning_rate": 2.690558782359576e-07, "loss": 0.1609, "num_input_tokens_seen": 3334888, "step": 5375 }, { "epoch": 9.590017825311943, "grad_norm": 0.49842748045921326, "learning_rate": 2.5779540229361745e-07, "loss": 0.1822, "num_input_tokens_seen": 3337960, "step": 5380 }, { "epoch": 9.598930481283423, "grad_norm": 0.6325761079788208, "learning_rate": 2.467744051418641e-07, "loss": 0.155, "num_input_tokens_seen": 3340936, "step": 5385 }, { "epoch": 9.607843137254902, "grad_norm": 0.8439469933509827, "learning_rate": 2.3599299345248292e-07, "loss": 0.1561, "num_input_tokens_seen": 3343784, "step": 5390 }, { "epoch": 9.616755793226382, "grad_norm": 0.7139554619789124, "learning_rate": 2.2545127157831413e-07, "loss": 0.1669, "num_input_tokens_seen": 3347016, "step": 5395 }, { "epoch": 9.62566844919786, "grad_norm": 0.3963601291179657, "learning_rate": 2.1514934155226208e-07, "loss": 0.1412, "num_input_tokens_seen": 3349800, "step": 5400 }, { "epoch": 9.63458110516934, "grad_norm": 0.5459052324295044, "learning_rate": 2.0508730308627933e-07, "loss": 0.1527, "num_input_tokens_seen": 3353640, "step": 5405 }, { "epoch": 9.643493761140821, "grad_norm": 0.7221339344978333, "learning_rate": 1.9526525357043136e-07, "loss": 0.1708, "num_input_tokens_seen": 3356904, "step": 5410 }, { "epoch": 9.6524064171123, "grad_norm": 0.39834100008010864, "learning_rate": 1.8568328807193337e-07, "loss": 0.1623, "num_input_tokens_seen": 3360232, "step": 5415 }, { "epoch": 9.66131907308378, "grad_norm": 0.3296028673648834, "learning_rate": 1.7634149933423993e-07, "loss": 0.1723, "num_input_tokens_seen": 3362824, "step": 5420 }, { "epoch": 9.670231729055258, "grad_norm": 0.6187313199043274, "learning_rate": 1.6723997777614574e-07, "loss": 0.2013, "num_input_tokens_seen": 3366152, "step": 5425 }, { "epoch": 9.679144385026738, "grad_norm": 0.4088561236858368, "learning_rate": 1.5837881149090294e-07, "loss": 0.1668, "num_input_tokens_seen": 3369192, "step": 5430 }, { "epoch": 9.688057040998217, "grad_norm": 0.6721343994140625, "learning_rate": 1.497580862453829e-07, "loss": 0.1767, "num_input_tokens_seen": 3372776, "step": 5435 }, { "epoch": 9.696969696969697, "grad_norm": 0.6333170533180237, "learning_rate": 1.4137788547923246e-07, "loss": 0.1829, "num_input_tokens_seen": 3376232, "step": 5440 }, { "epoch": 9.705882352941176, "grad_norm": 0.6064999103546143, "learning_rate": 1.3323829030407465e-07, "loss": 0.1916, "num_input_tokens_seen": 3379912, "step": 5445 }, { "epoch": 9.714795008912656, "grad_norm": 0.5454294085502625, "learning_rate": 1.2533937950272023e-07, "loss": 0.1639, "num_input_tokens_seen": 3382824, "step": 5450 }, { "epoch": 9.723707664884136, "grad_norm": 0.4902726411819458, "learning_rate": 1.176812295283991e-07, "loss": 0.1577, "num_input_tokens_seen": 3385640, "step": 5455 }, { "epoch": 9.732620320855615, "grad_norm": 0.4689973294734955, "learning_rate": 1.1026391450404128e-07, "loss": 0.1652, "num_input_tokens_seen": 3389672, "step": 5460 }, { "epoch": 9.741532976827095, "grad_norm": 0.6127117276191711, "learning_rate": 1.0308750622153307e-07, "loss": 0.1815, "num_input_tokens_seen": 3393096, "step": 5465 }, { "epoch": 9.750445632798574, "grad_norm": 0.40860888361930847, "learning_rate": 9.615207414103434e-08, "loss": 0.149, "num_input_tokens_seen": 3396136, "step": 5470 }, { "epoch": 9.759358288770054, "grad_norm": 0.5143342018127441, "learning_rate": 8.945768539031785e-08, "loss": 0.1785, "num_input_tokens_seen": 3399304, "step": 5475 }, { "epoch": 9.768270944741532, "grad_norm": 0.599516749382019, "learning_rate": 8.30044047640921e-08, "loss": 0.1617, "num_input_tokens_seen": 3402216, "step": 5480 }, { "epoch": 9.777183600713013, "grad_norm": 0.37185174226760864, "learning_rate": 7.679229472340176e-08, "loss": 0.1554, "num_input_tokens_seen": 3405096, "step": 5485 }, { "epoch": 9.786096256684491, "grad_norm": 0.4413319528102875, "learning_rate": 7.082141539500597e-08, "loss": 0.1639, "num_input_tokens_seen": 3407912, "step": 5490 }, { "epoch": 9.795008912655971, "grad_norm": 0.7090705633163452, "learning_rate": 6.509182457080376e-08, "loss": 0.1679, "num_input_tokens_seen": 3410856, "step": 5495 }, { "epoch": 9.803921568627452, "grad_norm": 0.5437349677085876, "learning_rate": 5.9603577707267875e-08, "loss": 0.1559, "num_input_tokens_seen": 3413928, "step": 5500 }, { "epoch": 9.81283422459893, "grad_norm": 0.5729760527610779, "learning_rate": 5.435672792491742e-08, "loss": 0.1623, "num_input_tokens_seen": 3417416, "step": 5505 }, { "epoch": 9.82174688057041, "grad_norm": 0.38444051146507263, "learning_rate": 4.935132600780157e-08, "loss": 0.1769, "num_input_tokens_seen": 3420136, "step": 5510 }, { "epoch": 9.830659536541889, "grad_norm": 0.4345572292804718, "learning_rate": 4.4587420402997235e-08, "loss": 0.1537, "num_input_tokens_seen": 3423272, "step": 5515 }, { "epoch": 9.83957219251337, "grad_norm": 0.44134852290153503, "learning_rate": 4.006505722015386e-08, "loss": 0.1499, "num_input_tokens_seen": 3426472, "step": 5520 }, { "epoch": 9.848484848484848, "grad_norm": 0.6951932907104492, "learning_rate": 3.578428023103819e-08, "loss": 0.1725, "num_input_tokens_seen": 3429992, "step": 5525 }, { "epoch": 9.857397504456328, "grad_norm": 0.47553181648254395, "learning_rate": 3.1745130869123566e-08, "loss": 0.1554, "num_input_tokens_seen": 3432456, "step": 5530 }, { "epoch": 9.866310160427808, "grad_norm": 0.5962952375411987, "learning_rate": 2.794764822916518e-08, "loss": 0.1618, "num_input_tokens_seen": 3434888, "step": 5535 }, { "epoch": 9.875222816399287, "grad_norm": 0.4873346984386444, "learning_rate": 2.4391869066844874e-08, "loss": 0.1773, "num_input_tokens_seen": 3437832, "step": 5540 }, { "epoch": 9.884135472370767, "grad_norm": 0.65750652551651, "learning_rate": 2.1077827798404726e-08, "loss": 0.1697, "num_input_tokens_seen": 3440872, "step": 5545 }, { "epoch": 9.893048128342246, "grad_norm": 0.4054161012172699, "learning_rate": 1.8005556500313993e-08, "loss": 0.1495, "num_input_tokens_seen": 3443784, "step": 5550 }, { "epoch": 9.901960784313726, "grad_norm": 0.605219841003418, "learning_rate": 1.51750849089638e-08, "loss": 0.1643, "num_input_tokens_seen": 3447592, "step": 5555 }, { "epoch": 9.910873440285204, "grad_norm": 0.3572712540626526, "learning_rate": 1.2586440420372936e-08, "loss": 0.1714, "num_input_tokens_seen": 3451048, "step": 5560 }, { "epoch": 9.919786096256685, "grad_norm": 0.5080024600028992, "learning_rate": 1.023964808992417e-08, "loss": 0.1497, "num_input_tokens_seen": 3453928, "step": 5565 }, { "epoch": 9.928698752228165, "grad_norm": 0.5494665503501892, "learning_rate": 8.134730632125554e-09, "loss": 0.1739, "num_input_tokens_seen": 3456968, "step": 5570 }, { "epoch": 9.937611408199643, "grad_norm": 0.5445519089698792, "learning_rate": 6.271708420385603e-09, "loss": 0.1683, "num_input_tokens_seen": 3460616, "step": 5575 }, { "epoch": 9.946524064171124, "grad_norm": 0.4502975046634674, "learning_rate": 4.650599486827334e-09, "loss": 0.1625, "num_input_tokens_seen": 3463592, "step": 5580 }, { "epoch": 9.955436720142602, "grad_norm": 0.713843047618866, "learning_rate": 3.2714195220912013e-09, "loss": 0.1604, "num_input_tokens_seen": 3466888, "step": 5585 }, { "epoch": 9.964349376114082, "grad_norm": 0.457069456577301, "learning_rate": 2.134181875204644e-09, "loss": 0.1602, "num_input_tokens_seen": 3470408, "step": 5590 }, { "epoch": 9.973262032085561, "grad_norm": 0.5743651390075684, "learning_rate": 1.2388975534460834e-09, "loss": 0.1584, "num_input_tokens_seen": 3473608, "step": 5595 }, { "epoch": 9.982174688057041, "grad_norm": 0.41813942790031433, "learning_rate": 5.855752222366783e-10, "loss": 0.163, "num_input_tokens_seen": 3476616, "step": 5600 }, { "epoch": 9.99108734402852, "grad_norm": 0.4171542227268219, "learning_rate": 1.7422120505705686e-10, "loss": 0.1549, "num_input_tokens_seen": 3479624, "step": 5605 }, { "epoch": 10.0, "grad_norm": 1.7215794324874878, "learning_rate": 4.839483383478616e-12, "loss": 0.1694, "num_input_tokens_seen": 3481336, "step": 5610 }, { "epoch": 10.0, "num_input_tokens_seen": 3481336, "step": 5610, "total_flos": 1.5676298662753075e+17, "train_loss": 0.9318533902924754, "train_runtime": 970.4341, "train_samples_per_second": 23.093, "train_steps_per_second": 5.781 } ], "logging_steps": 5, "max_steps": 5610, "num_input_tokens_seen": 3481336, "num_train_epochs": 10, "save_steps": 281, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5676298662753075e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }