diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6784 +1,8067 @@ { - "best_global_step": 416560, - "best_metric": 1.8425856828689575, - "best_model_checkpoint": "/media/user/Expansion1/opus-mt-zhtw-en/checkpoint-416560", - "epoch": 10.0, + "best_global_step": 499760, + "best_metric": 0.9877662062644958, + "best_model_checkpoint": "/media/user/Expansion1/opus-mt-zhtw-en-google-translate3/checkpoint-499760", + "epoch": 5.0, "eval_steps": 500, - "global_step": 416560, + "global_step": 499760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.012003072786633378, - "grad_norm": 7.134702682495117, - "learning_rate": 4.99401046667947e-05, - "loss": 2.6596, - "num_input_tokens_seen": 268152, + "epoch": 0.005002401152553225, + "grad_norm": 11.430471420288086, + "learning_rate": 4.995007603649752e-05, + "loss": 2.0932, + "num_input_tokens_seen": 276616, "step": 500 }, { - "epoch": 0.024006145573266757, - "grad_norm": 8.367463111877441, - "learning_rate": 4.988008930286153e-05, - "loss": 2.6058, - "num_input_tokens_seen": 532848, + "epoch": 0.01000480230510645, + "grad_norm": 11.411553382873535, + "learning_rate": 4.990005202497199e-05, + "loss": 2.0509, + "num_input_tokens_seen": 545376, "step": 1000 }, { - "epoch": 0.03600921835990013, - "grad_norm": 9.669520378112793, - "learning_rate": 4.982007393892837e-05, - "loss": 2.6011, - "num_input_tokens_seen": 802128, + "epoch": 0.015007203457659676, + "grad_norm": 7.671140193939209, + "learning_rate": 4.985002801344646e-05, + "loss": 2.0225, + "num_input_tokens_seen": 812256, "step": 1500 }, { - "epoch": 0.04801229114653351, - "grad_norm": 7.237158298492432, - "learning_rate": 4.97600585749952e-05, - "loss": 2.5826, - "num_input_tokens_seen": 1073544, + "epoch": 0.0200096046102129, + "grad_norm": 10.581878662109375, + "learning_rate": 4.9800004001920924e-05, + "loss": 1.9906, + "num_input_tokens_seen": 1089008, "step": 2000 }, { - "epoch": 0.06001536393316689, - "grad_norm": 6.998780727386475, - "learning_rate": 4.9700043211062036e-05, - "loss": 2.5691, - "num_input_tokens_seen": 1345480, + "epoch": 0.02501200576276613, + "grad_norm": 9.597127914428711, + "learning_rate": 4.974997999039539e-05, + "loss": 1.9507, + "num_input_tokens_seen": 1366424, "step": 2500 }, { - "epoch": 0.07201843671980027, - "grad_norm": 7.587129592895508, - "learning_rate": 4.9640027847128865e-05, - "loss": 2.5483, - "num_input_tokens_seen": 1612520, + "epoch": 0.030014406915319352, + "grad_norm": 8.959872245788574, + "learning_rate": 4.9699955978869863e-05, + "loss": 1.9346, + "num_input_tokens_seen": 1638376, "step": 3000 }, { - "epoch": 0.08402150950643365, - "grad_norm": 8.61057186126709, - "learning_rate": 4.95800124831957e-05, - "loss": 2.4904, - "num_input_tokens_seen": 1879712, + "epoch": 0.03501680806787258, + "grad_norm": 8.698769569396973, + "learning_rate": 4.9649931967344327e-05, + "loss": 1.9318, + "num_input_tokens_seen": 1906088, "step": 3500 }, { - "epoch": 0.09602458229306703, - "grad_norm": 8.535271644592285, - "learning_rate": 4.9519997119262536e-05, - "loss": 2.5237, - "num_input_tokens_seen": 2149360, + "epoch": 0.0400192092204258, + "grad_norm": 9.283397674560547, + "learning_rate": 4.9599907955818796e-05, + "loss": 1.9044, + "num_input_tokens_seen": 2180336, "step": 4000 }, { - "epoch": 0.1080276550797004, - "grad_norm": 8.541385650634766, - "learning_rate": 4.9459981755329364e-05, - "loss": 2.4751, - "num_input_tokens_seen": 2416512, + "epoch": 0.04502161037297903, + "grad_norm": 8.18691635131836, + "learning_rate": 4.954988394429326e-05, + "loss": 1.8793, + "num_input_tokens_seen": 2446104, "step": 4500 }, { - "epoch": 0.12003072786633379, - "grad_norm": 10.306108474731445, - "learning_rate": 4.93999663913962e-05, - "loss": 2.4531, - "num_input_tokens_seen": 2684624, + "epoch": 0.05002401152553226, + "grad_norm": 9.12602424621582, + "learning_rate": 4.949985993276773e-05, + "loss": 1.8827, + "num_input_tokens_seen": 2707352, "step": 5000 }, { - "epoch": 0.13203380065296716, - "grad_norm": 8.887262344360352, - "learning_rate": 4.9339951027463035e-05, - "loss": 2.4388, - "num_input_tokens_seen": 2955168, + "epoch": 0.05502641267808548, + "grad_norm": 7.385958194732666, + "learning_rate": 4.94498359212422e-05, + "loss": 1.8553, + "num_input_tokens_seen": 2980496, "step": 5500 }, { - "epoch": 0.14403687343960053, - "grad_norm": 8.917656898498535, - "learning_rate": 4.9279935663529864e-05, - "loss": 2.4688, - "num_input_tokens_seen": 3226208, + "epoch": 0.060028813830638704, + "grad_norm": 12.771719932556152, + "learning_rate": 4.939981190971666e-05, + "loss": 1.8257, + "num_input_tokens_seen": 3261104, "step": 6000 }, { - "epoch": 0.1560399462262339, - "grad_norm": 9.032322883605957, - "learning_rate": 4.92199202995967e-05, - "loss": 2.4204, - "num_input_tokens_seen": 3498344, + "epoch": 0.06503121498319193, + "grad_norm": 7.724299430847168, + "learning_rate": 4.934978789819113e-05, + "loss": 1.82, + "num_input_tokens_seen": 3530600, "step": 6500 }, { - "epoch": 0.1680430190128673, - "grad_norm": 7.882916450500488, - "learning_rate": 4.915990493566353e-05, - "loss": 2.4319, - "num_input_tokens_seen": 3763464, + "epoch": 0.07003361613574516, + "grad_norm": 10.434374809265137, + "learning_rate": 4.92997638866656e-05, + "loss": 1.8089, + "num_input_tokens_seen": 3798856, "step": 7000 }, { - "epoch": 0.18004609179950068, - "grad_norm": 7.744749069213867, - "learning_rate": 4.909988957173037e-05, - "loss": 2.4149, - "num_input_tokens_seen": 4034192, + "epoch": 0.07503601728829838, + "grad_norm": 6.624173164367676, + "learning_rate": 4.924973987514007e-05, + "loss": 1.8068, + "num_input_tokens_seen": 4068760, "step": 7500 }, { - "epoch": 0.19204916458613405, - "grad_norm": 6.833868980407715, - "learning_rate": 4.90398742077972e-05, - "loss": 2.4027, - "num_input_tokens_seen": 4305696, + "epoch": 0.0800384184408516, + "grad_norm": 7.316147327423096, + "learning_rate": 4.9199715863614536e-05, + "loss": 1.7867, + "num_input_tokens_seen": 4337376, "step": 8000 }, { - "epoch": 0.20405223737276743, - "grad_norm": 7.211818218231201, - "learning_rate": 4.897985884386403e-05, - "loss": 2.4139, - "num_input_tokens_seen": 4573608, + "epoch": 0.08504081959340483, + "grad_norm": 7.759427547454834, + "learning_rate": 4.9149691852089006e-05, + "loss": 1.7814, + "num_input_tokens_seen": 4610160, "step": 8500 }, { - "epoch": 0.2160553101594008, - "grad_norm": 7.917566776275635, - "learning_rate": 4.891984347993086e-05, - "loss": 2.3822, - "num_input_tokens_seen": 4846152, + "epoch": 0.09004322074595807, + "grad_norm": 7.609263896942139, + "learning_rate": 4.9099667840563476e-05, + "loss": 1.7981, + "num_input_tokens_seen": 4885232, "step": 9000 }, { - "epoch": 0.22805838294603417, - "grad_norm": 7.854650974273682, - "learning_rate": 4.88598281159977e-05, - "loss": 2.3968, - "num_input_tokens_seen": 5119088, + "epoch": 0.09504562189851129, + "grad_norm": 6.2129669189453125, + "learning_rate": 4.904964382903794e-05, + "loss": 1.7689, + "num_input_tokens_seen": 5155216, "step": 9500 }, { - "epoch": 0.24006145573266757, - "grad_norm": 7.690464019775391, - "learning_rate": 4.8799812752064534e-05, - "loss": 2.4074, - "num_input_tokens_seen": 5389744, + "epoch": 0.10004802305106451, + "grad_norm": 6.697121620178223, + "learning_rate": 4.899961981751241e-05, + "loss": 1.7388, + "num_input_tokens_seen": 5428160, "step": 10000 }, { - "epoch": 0.25206452851930095, - "grad_norm": 8.977276802062988, - "learning_rate": 4.873979738813136e-05, - "loss": 2.4113, - "num_input_tokens_seen": 5660712, + "epoch": 0.10505042420361774, + "grad_norm": 10.624402046203613, + "learning_rate": 4.894959580598688e-05, + "loss": 1.7443, + "num_input_tokens_seen": 5692136, "step": 10500 }, { - "epoch": 0.2640676013059343, - "grad_norm": 7.349117279052734, - "learning_rate": 4.86797820241982e-05, - "loss": 2.3492, - "num_input_tokens_seen": 5928440, + "epoch": 0.11005282535617096, + "grad_norm": 8.177070617675781, + "learning_rate": 4.889957179446135e-05, + "loss": 1.7421, + "num_input_tokens_seen": 5965544, "step": 11000 }, { - "epoch": 0.2760706740925677, - "grad_norm": 7.154156684875488, - "learning_rate": 4.8619766660265034e-05, - "loss": 2.3652, - "num_input_tokens_seen": 6202048, + "epoch": 0.11505522650872418, + "grad_norm": 9.939962387084961, + "learning_rate": 4.884954778293581e-05, + "loss": 1.7128, + "num_input_tokens_seen": 6230272, "step": 11500 }, { - "epoch": 0.28807374687920106, - "grad_norm": 7.58486270904541, - "learning_rate": 4.855975129633186e-05, - "loss": 2.3358, - "num_input_tokens_seen": 6471464, + "epoch": 0.12005762766127741, + "grad_norm": 8.003973007202148, + "learning_rate": 4.8799523771410275e-05, + "loss": 1.7197, + "num_input_tokens_seen": 6505160, "step": 12000 }, { - "epoch": 0.30007681966583444, - "grad_norm": 7.290213108062744, - "learning_rate": 4.84997359323987e-05, - "loss": 2.3522, - "num_input_tokens_seen": 6738992, + "epoch": 0.12506002881383063, + "grad_norm": 6.709536075592041, + "learning_rate": 4.8749499759884745e-05, + "loss": 1.7021, + "num_input_tokens_seen": 6782752, "step": 12500 }, { - "epoch": 0.3120798924524678, - "grad_norm": 7.593165397644043, - "learning_rate": 4.8439720568465527e-05, - "loss": 2.3534, - "num_input_tokens_seen": 7013448, + "epoch": 0.13006242996638387, + "grad_norm": 7.609007835388184, + "learning_rate": 4.8699475748359215e-05, + "loss": 1.7083, + "num_input_tokens_seen": 7059144, "step": 13000 }, { - "epoch": 0.3240829652391012, - "grad_norm": 7.277749538421631, - "learning_rate": 4.837970520453236e-05, - "loss": 2.3659, - "num_input_tokens_seen": 7286320, + "epoch": 0.13506483111893708, + "grad_norm": 7.746858596801758, + "learning_rate": 4.8649451736833685e-05, + "loss": 1.709, + "num_input_tokens_seen": 7327608, "step": 13500 }, { - "epoch": 0.3360860380257346, - "grad_norm": 7.899440765380859, - "learning_rate": 4.83196898405992e-05, - "loss": 2.3338, - "num_input_tokens_seen": 7558232, + "epoch": 0.14006723227149032, + "grad_norm": 9.576034545898438, + "learning_rate": 4.859942772530815e-05, + "loss": 1.7204, + "num_input_tokens_seen": 7598784, "step": 14000 }, { - "epoch": 0.348089110812368, - "grad_norm": 6.855794906616211, - "learning_rate": 4.8259674476666026e-05, - "loss": 2.3419, - "num_input_tokens_seen": 7828120, + "epoch": 0.14506963342404355, + "grad_norm": 7.995354175567627, + "learning_rate": 4.854940371378262e-05, + "loss": 1.6987, + "num_input_tokens_seen": 7866144, "step": 14500 }, { - "epoch": 0.36009218359900136, - "grad_norm": 7.486812591552734, - "learning_rate": 4.819965911273286e-05, - "loss": 2.3384, - "num_input_tokens_seen": 8098392, + "epoch": 0.15007203457659676, + "grad_norm": 7.990232944488525, + "learning_rate": 4.849937970225709e-05, + "loss": 1.695, + "num_input_tokens_seen": 8132240, "step": 15000 }, { - "epoch": 0.37209525638563473, - "grad_norm": 8.39892292022705, - "learning_rate": 4.81396437487997e-05, - "loss": 2.3113, - "num_input_tokens_seen": 8363424, + "epoch": 0.15507443572915, + "grad_norm": 6.7087507247924805, + "learning_rate": 4.844935569073155e-05, + "loss": 1.6766, + "num_input_tokens_seen": 8400496, "step": 15500 }, { - "epoch": 0.3840983291722681, - "grad_norm": 6.449926376342773, - "learning_rate": 4.8079628384866526e-05, - "loss": 2.3452, - "num_input_tokens_seen": 8633528, + "epoch": 0.1600768368817032, + "grad_norm": 6.8279900550842285, + "learning_rate": 4.839933167920602e-05, + "loss": 1.6662, + "num_input_tokens_seen": 8666120, "step": 16000 }, { - "epoch": 0.3961014019589015, - "grad_norm": 6.3692193031311035, - "learning_rate": 4.801961302093336e-05, - "loss": 2.309, - "num_input_tokens_seen": 8901464, + "epoch": 0.16507923803425645, + "grad_norm": 8.427155494689941, + "learning_rate": 4.834930766768049e-05, + "loss": 1.6781, + "num_input_tokens_seen": 8935352, "step": 16500 }, { - "epoch": 0.40810447474553485, - "grad_norm": 7.785754203796387, - "learning_rate": 4.79595976570002e-05, - "loss": 2.2909, - "num_input_tokens_seen": 9166304, + "epoch": 0.17008163918680966, + "grad_norm": 8.360432624816895, + "learning_rate": 4.829928365615496e-05, + "loss": 1.6682, + "num_input_tokens_seen": 9209648, "step": 17000 }, { - "epoch": 0.4201075475321682, - "grad_norm": 7.993614673614502, - "learning_rate": 4.789958229306703e-05, - "loss": 2.3106, - "num_input_tokens_seen": 9435944, + "epoch": 0.1750840403393629, + "grad_norm": 7.38226842880249, + "learning_rate": 4.8249259644629424e-05, + "loss": 1.6475, + "num_input_tokens_seen": 9475336, "step": 17500 }, { - "epoch": 0.4321106203188016, - "grad_norm": 8.270709991455078, - "learning_rate": 4.783956692913386e-05, - "loss": 2.3095, - "num_input_tokens_seen": 9707656, + "epoch": 0.18008644149191613, + "grad_norm": 8.226577758789062, + "learning_rate": 4.819923563310389e-05, + "loss": 1.6668, + "num_input_tokens_seen": 9752088, "step": 18000 }, { - "epoch": 0.44411369310543497, - "grad_norm": 7.394924640655518, - "learning_rate": 4.777955156520069e-05, - "loss": 2.284, - "num_input_tokens_seen": 9974984, + "epoch": 0.18508884264446934, + "grad_norm": 9.053323745727539, + "learning_rate": 4.8149211621578364e-05, + "loss": 1.6615, + "num_input_tokens_seen": 10021568, "step": 18500 }, { - "epoch": 0.45611676589206834, - "grad_norm": 8.036295890808105, - "learning_rate": 4.771953620126753e-05, - "loss": 2.2999, - "num_input_tokens_seen": 10245424, + "epoch": 0.19009124379702258, + "grad_norm": 10.050972938537598, + "learning_rate": 4.809918761005283e-05, + "loss": 1.6453, + "num_input_tokens_seen": 10294200, "step": 19000 }, { - "epoch": 0.46811983867870177, - "grad_norm": 7.7529425621032715, - "learning_rate": 4.765952083733436e-05, - "loss": 2.2811, - "num_input_tokens_seen": 10518840, + "epoch": 0.1950936449495758, + "grad_norm": 12.779481887817383, + "learning_rate": 4.80491635985273e-05, + "loss": 1.6305, + "num_input_tokens_seen": 10563664, "step": 19500 }, { - "epoch": 0.48012291146533514, - "grad_norm": 8.465028762817383, - "learning_rate": 4.759950547340119e-05, - "loss": 2.2928, - "num_input_tokens_seen": 10789136, + "epoch": 0.20009604610212903, + "grad_norm": 9.113444328308105, + "learning_rate": 4.799913958700176e-05, + "loss": 1.6374, + "num_input_tokens_seen": 10833928, "step": 20000 }, { - "epoch": 0.4921259842519685, - "grad_norm": 6.370537757873535, - "learning_rate": 4.7539490109468025e-05, - "loss": 2.2911, - "num_input_tokens_seen": 11059592, + "epoch": 0.20509844725468224, + "grad_norm": 9.230701446533203, + "learning_rate": 4.794911557547624e-05, + "loss": 1.633, + "num_input_tokens_seen": 11098416, "step": 20500 }, { - "epoch": 0.5041290570386019, - "grad_norm": 8.359167098999023, - "learning_rate": 4.747947474553486e-05, - "loss": 2.2926, - "num_input_tokens_seen": 11331240, + "epoch": 0.21010084840723547, + "grad_norm": 7.994055271148682, + "learning_rate": 4.78990915639507e-05, + "loss": 1.6267, + "num_input_tokens_seen": 11368280, "step": 21000 }, { - "epoch": 0.5161321298252353, - "grad_norm": 7.884887218475342, - "learning_rate": 4.7419459381601695e-05, - "loss": 2.2798, - "num_input_tokens_seen": 11601616, + "epoch": 0.2151032495597887, + "grad_norm": 8.182880401611328, + "learning_rate": 4.784906755242516e-05, + "loss": 1.6199, + "num_input_tokens_seen": 11634848, "step": 21500 }, { - "epoch": 0.5281352026118686, - "grad_norm": 5.859597206115723, - "learning_rate": 4.7359444017668524e-05, - "loss": 2.2839, - "num_input_tokens_seen": 11870056, + "epoch": 0.22010565071234192, + "grad_norm": 8.053507804870605, + "learning_rate": 4.779904354089963e-05, + "loss": 1.6318, + "num_input_tokens_seen": 11912456, "step": 22000 }, { - "epoch": 0.540138275398502, - "grad_norm": 7.169973850250244, - "learning_rate": 4.729942865373535e-05, - "loss": 2.2692, - "num_input_tokens_seen": 12141520, + "epoch": 0.22510805186489516, + "grad_norm": 8.162029266357422, + "learning_rate": 4.77490195293741e-05, + "loss": 1.6156, + "num_input_tokens_seen": 12177848, "step": 22500 }, { - "epoch": 0.5521413481851354, - "grad_norm": 8.827645301818848, - "learning_rate": 4.7239413289802195e-05, - "loss": 2.2501, - "num_input_tokens_seen": 12415688, + "epoch": 0.23011045301744837, + "grad_norm": 7.447554588317871, + "learning_rate": 4.769899551784857e-05, + "loss": 1.6181, + "num_input_tokens_seen": 12448808, "step": 23000 }, { - "epoch": 0.5641444209717688, - "grad_norm": 7.155609607696533, - "learning_rate": 4.7179397925869024e-05, - "loss": 2.3006, - "num_input_tokens_seen": 12689272, + "epoch": 0.2351128541700016, + "grad_norm": 8.473464012145996, + "learning_rate": 4.7648971506323036e-05, + "loss": 1.6544, + "num_input_tokens_seen": 12724056, "step": 23500 }, { - "epoch": 0.5761474937584021, - "grad_norm": 7.983636379241943, - "learning_rate": 4.711938256193586e-05, - "loss": 2.2681, - "num_input_tokens_seen": 12958424, + "epoch": 0.24011525532255482, + "grad_norm": 8.162822723388672, + "learning_rate": 4.7598947494797506e-05, + "loss": 1.6097, + "num_input_tokens_seen": 12992248, "step": 24000 }, { - "epoch": 0.5881505665450355, - "grad_norm": 6.6479973793029785, - "learning_rate": 4.705936719800269e-05, - "loss": 2.2833, - "num_input_tokens_seen": 13228744, + "epoch": 0.24511765647510805, + "grad_norm": 8.350125312805176, + "learning_rate": 4.7548923483271976e-05, + "loss": 1.5977, + "num_input_tokens_seen": 13267848, "step": 24500 }, { - "epoch": 0.6001536393316689, - "grad_norm": 8.416579246520996, - "learning_rate": 4.699935183406952e-05, - "loss": 2.2577, - "num_input_tokens_seen": 13497616, + "epoch": 0.25012005762766126, + "grad_norm": 6.6851091384887695, + "learning_rate": 4.749889947174644e-05, + "loss": 1.6008, + "num_input_tokens_seen": 13544424, "step": 25000 }, { - "epoch": 0.6121567121183022, - "grad_norm": 6.238585472106934, - "learning_rate": 4.693933647013636e-05, - "loss": 2.2318, - "num_input_tokens_seen": 13763760, + "epoch": 0.2551224587802145, + "grad_norm": 8.89284610748291, + "learning_rate": 4.744887546022091e-05, + "loss": 1.6099, + "num_input_tokens_seen": 13818272, "step": 25500 }, { - "epoch": 0.6241597849049356, - "grad_norm": 6.087265491485596, - "learning_rate": 4.687932110620319e-05, - "loss": 2.2341, - "num_input_tokens_seen": 14032680, + "epoch": 0.26012485993276774, + "grad_norm": 8.554915428161621, + "learning_rate": 4.739885144869537e-05, + "loss": 1.6121, + "num_input_tokens_seen": 14088432, "step": 26000 }, { - "epoch": 0.636162857691569, - "grad_norm": 7.391331195831299, - "learning_rate": 4.681930574227002e-05, - "loss": 2.2651, - "num_input_tokens_seen": 14303864, + "epoch": 0.265127261085321, + "grad_norm": 7.6910529136657715, + "learning_rate": 4.734882743716985e-05, + "loss": 1.6073, + "num_input_tokens_seen": 14362896, "step": 26500 }, { - "epoch": 0.6481659304782024, - "grad_norm": 10.410062789916992, - "learning_rate": 4.675929037833686e-05, - "loss": 2.261, - "num_input_tokens_seen": 14574368, + "epoch": 0.27012966223787416, + "grad_norm": 6.856411457061768, + "learning_rate": 4.729880342564431e-05, + "loss": 1.604, + "num_input_tokens_seen": 14629232, "step": 27000 }, { - "epoch": 0.6601690032648359, - "grad_norm": 6.998879909515381, - "learning_rate": 4.669927501440369e-05, - "loss": 2.2389, - "num_input_tokens_seen": 14842952, + "epoch": 0.2751320633904274, + "grad_norm": 8.767107009887695, + "learning_rate": 4.7248779414118775e-05, + "loss": 1.587, + "num_input_tokens_seen": 14904336, "step": 27500 }, { - "epoch": 0.6721720760514692, - "grad_norm": 6.261806488037109, - "learning_rate": 4.663925965047052e-05, - "loss": 2.2474, - "num_input_tokens_seen": 15111672, + "epoch": 0.28013446454298063, + "grad_norm": 6.474203586578369, + "learning_rate": 4.7198755402593245e-05, + "loss": 1.5936, + "num_input_tokens_seen": 15171200, "step": 28000 }, { - "epoch": 0.6841751488381026, - "grad_norm": 6.822585105895996, - "learning_rate": 4.657924428653736e-05, - "loss": 2.235, - "num_input_tokens_seen": 15381512, + "epoch": 0.28513686569553387, + "grad_norm": 10.540902137756348, + "learning_rate": 4.7148731391067715e-05, + "loss": 1.5861, + "num_input_tokens_seen": 15438392, "step": 28500 }, { - "epoch": 0.696178221624736, - "grad_norm": 7.5159502029418945, - "learning_rate": 4.6519228922604193e-05, - "loss": 2.2444, - "num_input_tokens_seen": 15649496, + "epoch": 0.2901392668480871, + "grad_norm": 9.134960174560547, + "learning_rate": 4.7098707379542185e-05, + "loss": 1.5717, + "num_input_tokens_seen": 15707768, "step": 29000 }, { - "epoch": 0.7081812944113693, - "grad_norm": 6.662530899047852, - "learning_rate": 4.645921355867102e-05, - "loss": 2.2405, - "num_input_tokens_seen": 15918392, + "epoch": 0.2951416680006403, + "grad_norm": 6.669952869415283, + "learning_rate": 4.704868336801665e-05, + "loss": 1.5989, + "num_input_tokens_seen": 15980896, "step": 29500 }, { - "epoch": 0.7201843671980027, - "grad_norm": 6.133045196533203, - "learning_rate": 4.639919819473785e-05, - "loss": 2.2511, - "num_input_tokens_seen": 16191400, + "epoch": 0.3001440691531935, + "grad_norm": 6.744385242462158, + "learning_rate": 4.699865935649112e-05, + "loss": 1.5858, + "num_input_tokens_seen": 16249024, "step": 30000 }, { - "epoch": 0.7321874399846361, - "grad_norm": 6.733817100524902, - "learning_rate": 4.6339182830804686e-05, - "loss": 2.2523, - "num_input_tokens_seen": 16462096, + "epoch": 0.30514647030574676, + "grad_norm": 8.088431358337402, + "learning_rate": 4.694863534496559e-05, + "loss": 1.5612, + "num_input_tokens_seen": 16516448, "step": 30500 }, { - "epoch": 0.7441905127712695, - "grad_norm": 8.158308029174805, - "learning_rate": 4.627916746687152e-05, - "loss": 2.2235, - "num_input_tokens_seen": 16731424, + "epoch": 0.3101488714583, + "grad_norm": 8.115087509155273, + "learning_rate": 4.689861133344005e-05, + "loss": 1.5995, + "num_input_tokens_seen": 16788416, "step": 31000 }, { - "epoch": 0.7561935855579028, - "grad_norm": 7.353145122528076, - "learning_rate": 4.621915210293836e-05, - "loss": 2.2615, - "num_input_tokens_seen": 17003608, + "epoch": 0.3151512726108532, + "grad_norm": 8.037360191345215, + "learning_rate": 4.684858732191452e-05, + "loss": 1.5787, + "num_input_tokens_seen": 17057432, "step": 31500 }, { - "epoch": 0.7681966583445362, - "grad_norm": 7.659285068511963, - "learning_rate": 4.6159136739005186e-05, - "loss": 2.2055, - "num_input_tokens_seen": 17273656, + "epoch": 0.3201536737634064, + "grad_norm": 7.91753625869751, + "learning_rate": 4.6798563310388984e-05, + "loss": 1.5642, + "num_input_tokens_seen": 17324440, "step": 32000 }, { - "epoch": 0.7801997311311696, - "grad_norm": 6.596842288970947, - "learning_rate": 4.609912137507202e-05, - "loss": 2.2501, - "num_input_tokens_seen": 17545824, + "epoch": 0.32515607491595966, + "grad_norm": 8.645842552185059, + "learning_rate": 4.674853929886346e-05, + "loss": 1.5685, + "num_input_tokens_seen": 17597472, "step": 32500 }, { - "epoch": 0.792202803917803, - "grad_norm": 7.472955703735352, - "learning_rate": 4.603910601113886e-05, - "loss": 2.2122, - "num_input_tokens_seen": 17813656, + "epoch": 0.3301584760685129, + "grad_norm": 7.731500148773193, + "learning_rate": 4.6698515287337924e-05, + "loss": 1.568, + "num_input_tokens_seen": 17862696, "step": 33000 }, { - "epoch": 0.8042058767044363, - "grad_norm": 9.334726333618164, - "learning_rate": 4.5979090647205685e-05, - "loss": 2.2193, - "num_input_tokens_seen": 18089016, + "epoch": 0.33516087722106613, + "grad_norm": 8.492512702941895, + "learning_rate": 4.6648491275812394e-05, + "loss": 1.5639, + "num_input_tokens_seen": 18132104, "step": 33500 }, { - "epoch": 0.8162089494910697, - "grad_norm": 6.854348182678223, - "learning_rate": 4.591907528327252e-05, - "loss": 2.2171, - "num_input_tokens_seen": 18352400, + "epoch": 0.3401632783736193, + "grad_norm": 6.294936656951904, + "learning_rate": 4.659846726428686e-05, + "loss": 1.5651, + "num_input_tokens_seen": 18398680, "step": 34000 }, { - "epoch": 0.8282120222777031, - "grad_norm": 6.85307502746582, - "learning_rate": 4.5859059919339356e-05, - "loss": 2.1773, - "num_input_tokens_seen": 18616896, + "epoch": 0.34516567952617255, + "grad_norm": 6.749844551086426, + "learning_rate": 4.654844325276133e-05, + "loss": 1.5501, + "num_input_tokens_seen": 18664584, "step": 34500 }, { - "epoch": 0.8402150950643364, - "grad_norm": 7.919052600860596, - "learning_rate": 4.5799044555406185e-05, - "loss": 2.2275, - "num_input_tokens_seen": 18886576, + "epoch": 0.3501680806787258, + "grad_norm": 8.394216537475586, + "learning_rate": 4.64984192412358e-05, + "loss": 1.5655, + "num_input_tokens_seen": 18935000, "step": 35000 }, { - "epoch": 0.8522181678509698, - "grad_norm": 8.456831932067871, - "learning_rate": 4.573902919147302e-05, - "loss": 2.2176, - "num_input_tokens_seen": 19155624, + "epoch": 0.35517048183127903, + "grad_norm": 7.8091607093811035, + "learning_rate": 4.644839522971026e-05, + "loss": 1.5545, + "num_input_tokens_seen": 19215376, "step": 35500 }, { - "epoch": 0.8642212406376032, - "grad_norm": 6.772243499755859, - "learning_rate": 4.567901382753985e-05, - "loss": 2.2008, - "num_input_tokens_seen": 19429016, + "epoch": 0.36017288298383227, + "grad_norm": 8.398904800415039, + "learning_rate": 4.639837121818473e-05, + "loss": 1.5482, + "num_input_tokens_seen": 19493104, "step": 36000 }, { - "epoch": 0.8762243134242366, - "grad_norm": 8.724201202392578, - "learning_rate": 4.561899846360669e-05, - "loss": 2.2038, - "num_input_tokens_seen": 19698392, + "epoch": 0.36517528413638545, + "grad_norm": 20.516576766967773, + "learning_rate": 4.63483472066592e-05, + "loss": 1.546, + "num_input_tokens_seen": 19761160, "step": 36500 }, { - "epoch": 0.8882273862108699, - "grad_norm": 7.608213424682617, - "learning_rate": 4.555898309967352e-05, - "loss": 2.2355, - "num_input_tokens_seen": 19975200, + "epoch": 0.3701776852889387, + "grad_norm": 6.954863548278809, + "learning_rate": 4.629832319513366e-05, + "loss": 1.5581, + "num_input_tokens_seen": 20034376, "step": 37000 }, { - "epoch": 0.9002304589975033, - "grad_norm": 6.957444190979004, - "learning_rate": 4.549896773574035e-05, - "loss": 2.1945, - "num_input_tokens_seen": 20243576, + "epoch": 0.3751800864414919, + "grad_norm": 7.736358642578125, + "learning_rate": 4.624829918360813e-05, + "loss": 1.5477, + "num_input_tokens_seen": 20303800, "step": 37500 }, { - "epoch": 0.9122335317841367, - "grad_norm": 7.173886775970459, - "learning_rate": 4.5438952371807184e-05, - "loss": 2.2062, - "num_input_tokens_seen": 20514328, + "epoch": 0.38018248759404516, + "grad_norm": 9.256717681884766, + "learning_rate": 4.61982751720826e-05, + "loss": 1.5379, + "num_input_tokens_seen": 20577472, "step": 38000 }, { - "epoch": 0.9242366045707702, - "grad_norm": 6.2140116691589355, - "learning_rate": 4.537893700787402e-05, - "loss": 2.2135, - "num_input_tokens_seen": 20784032, + "epoch": 0.38518488874659834, + "grad_norm": 7.987185478210449, + "learning_rate": 4.614825116055707e-05, + "loss": 1.5411, + "num_input_tokens_seen": 20844712, "step": 38500 }, { - "epoch": 0.9362396773574035, - "grad_norm": 7.315997123718262, - "learning_rate": 4.5318921643940855e-05, - "loss": 2.1922, - "num_input_tokens_seen": 21051360, + "epoch": 0.3901872898991516, + "grad_norm": 6.768152713775635, + "learning_rate": 4.6098227149031536e-05, + "loss": 1.5317, + "num_input_tokens_seen": 21112208, "step": 39000 }, { - "epoch": 0.9482427501440369, - "grad_norm": 6.743567943572998, - "learning_rate": 4.5258906280007684e-05, - "loss": 2.2017, - "num_input_tokens_seen": 21323776, + "epoch": 0.3951896910517048, + "grad_norm": 8.385108947753906, + "learning_rate": 4.6048203137506006e-05, + "loss": 1.522, + "num_input_tokens_seen": 21378704, "step": 39500 }, { - "epoch": 0.9602458229306703, - "grad_norm": 6.185552597045898, - "learning_rate": 4.519889091607451e-05, - "loss": 2.2165, - "num_input_tokens_seen": 21592696, + "epoch": 0.40019209220425805, + "grad_norm": 6.166186809539795, + "learning_rate": 4.5998179125980476e-05, + "loss": 1.5167, + "num_input_tokens_seen": 21648840, "step": 40000 }, { - "epoch": 0.9722488957173037, - "grad_norm": 7.2669453620910645, - "learning_rate": 4.5138875552141355e-05, - "loss": 2.1858, - "num_input_tokens_seen": 21863480, + "epoch": 0.4051944933568113, + "grad_norm": 6.178369045257568, + "learning_rate": 4.594815511445494e-05, + "loss": 1.5161, + "num_input_tokens_seen": 21917272, "step": 40500 }, { - "epoch": 0.984251968503937, - "grad_norm": 6.04233980178833, - "learning_rate": 4.5078860188208183e-05, - "loss": 2.1923, - "num_input_tokens_seen": 22134088, + "epoch": 0.4101968945093645, + "grad_norm": 8.063447952270508, + "learning_rate": 4.589813110292941e-05, + "loss": 1.5335, + "num_input_tokens_seen": 22185960, "step": 41000 }, { - "epoch": 0.9962550412905704, - "grad_norm": 7.742071151733398, - "learning_rate": 4.501884482427501e-05, - "loss": 2.1687, - "num_input_tokens_seen": 22399016, + "epoch": 0.4151992956619177, + "grad_norm": 9.296152114868164, + "learning_rate": 4.584810709140387e-05, + "loss": 1.5212, + "num_input_tokens_seen": 22451608, "step": 41500 }, { - "epoch": 1.0, - "eval_loss": 2.0160465240478516, - "eval_runtime": 77.6201, - "eval_samples_per_second": 1073.318, - "eval_steps_per_second": 134.166, - "num_input_tokens_seen": 22482354, - "step": 41656 - }, - { - "epoch": 1.0082581140772038, - "grad_norm": 6.644104480743408, - "learning_rate": 4.495882946034185e-05, - "loss": 2.0583, - "num_input_tokens_seen": 22664650, + "epoch": 0.42020169681447095, + "grad_norm": 7.5216965675354, + "learning_rate": 4.579808307987834e-05, + "loss": 1.5275, + "num_input_tokens_seen": 22720912, "step": 42000 }, { - "epoch": 1.0202611868638372, - "grad_norm": 8.196748733520508, - "learning_rate": 4.489881409640868e-05, - "loss": 2.03, - "num_input_tokens_seen": 22933130, + "epoch": 0.4252040979670242, + "grad_norm": 7.390750408172607, + "learning_rate": 4.574805906835281e-05, + "loss": 1.5063, + "num_input_tokens_seen": 22985424, "step": 42500 }, { - "epoch": 1.0322642596504705, - "grad_norm": 7.018587589263916, - "learning_rate": 4.483879873247552e-05, - "loss": 2.036, - "num_input_tokens_seen": 23205202, + "epoch": 0.4302064991195774, + "grad_norm": 7.005786895751953, + "learning_rate": 4.569803505682728e-05, + "loss": 1.5166, + "num_input_tokens_seen": 23258384, "step": 43000 }, { - "epoch": 1.044267332437104, - "grad_norm": 8.443140983581543, - "learning_rate": 4.477878336854235e-05, - "loss": 2.0316, - "num_input_tokens_seen": 23472874, + "epoch": 0.4352089002721306, + "grad_norm": 8.231675148010254, + "learning_rate": 4.5648011045301745e-05, + "loss": 1.5198, + "num_input_tokens_seen": 23531648, "step": 43500 }, { - "epoch": 1.0562704052237373, - "grad_norm": 6.988731384277344, - "learning_rate": 4.471876800460918e-05, - "loss": 2.0008, - "num_input_tokens_seen": 23740306, + "epoch": 0.44021130142468384, + "grad_norm": 8.228809356689453, + "learning_rate": 4.5597987033776215e-05, + "loss": 1.4984, + "num_input_tokens_seen": 23798648, "step": 44000 }, { - "epoch": 1.0682734780103706, - "grad_norm": 6.918605327606201, - "learning_rate": 4.465875264067602e-05, - "loss": 2.0431, - "num_input_tokens_seen": 24010810, + "epoch": 0.4452137025772371, + "grad_norm": 7.101296901702881, + "learning_rate": 4.5547963022250685e-05, + "loss": 1.5261, + "num_input_tokens_seen": 24061032, "step": 44500 }, { - "epoch": 1.080276550797004, - "grad_norm": 7.313216686248779, - "learning_rate": 4.459873727674285e-05, - "loss": 2.0309, - "num_input_tokens_seen": 24280754, + "epoch": 0.4502161037297903, + "grad_norm": 7.954161167144775, + "learning_rate": 4.549793901072515e-05, + "loss": 1.5011, + "num_input_tokens_seen": 24329816, "step": 45000 }, { - "epoch": 1.0922796235836374, - "grad_norm": 6.166088581085205, - "learning_rate": 4.453872191280968e-05, - "loss": 2.0121, - "num_input_tokens_seen": 24546290, + "epoch": 0.4552185048823435, + "grad_norm": 6.646183490753174, + "learning_rate": 4.544791499919962e-05, + "loss": 1.5281, + "num_input_tokens_seen": 24600296, "step": 45500 }, { - "epoch": 1.1042826963702708, - "grad_norm": 6.181678771972656, - "learning_rate": 4.447870654887651e-05, - "loss": 2.0484, - "num_input_tokens_seen": 24815810, + "epoch": 0.46022090603489674, + "grad_norm": 6.38918924331665, + "learning_rate": 4.539789098767409e-05, + "loss": 1.5084, + "num_input_tokens_seen": 24864680, "step": 46000 }, { - "epoch": 1.1162857691569041, - "grad_norm": 6.6109843254089355, - "learning_rate": 4.4418691184943346e-05, - "loss": 2.0329, - "num_input_tokens_seen": 25085306, + "epoch": 0.46522330718745, + "grad_norm": 9.28714656829834, + "learning_rate": 4.534786697614855e-05, + "loss": 1.5132, + "num_input_tokens_seen": 25138944, "step": 46500 }, { - "epoch": 1.1282888419435375, - "grad_norm": 8.683442115783691, - "learning_rate": 4.435867582101018e-05, - "loss": 2.0286, - "num_input_tokens_seen": 25361298, + "epoch": 0.4702257083400032, + "grad_norm": 7.2790937423706055, + "learning_rate": 4.529784296462302e-05, + "loss": 1.4851, + "num_input_tokens_seen": 25405872, "step": 47000 }, { - "epoch": 1.1402919147301709, - "grad_norm": 6.830644130706787, - "learning_rate": 4.429866045707701e-05, - "loss": 2.0372, - "num_input_tokens_seen": 25630434, + "epoch": 0.47522810949255645, + "grad_norm": 6.309217929840088, + "learning_rate": 4.5247818953097484e-05, + "loss": 1.4966, + "num_input_tokens_seen": 25672064, "step": 47500 }, { - "epoch": 1.1522949875168043, - "grad_norm": 7.53049898147583, - "learning_rate": 4.4238645093143846e-05, - "loss": 2.0353, - "num_input_tokens_seen": 25908290, + "epoch": 0.48023051064510963, + "grad_norm": 5.927547931671143, + "learning_rate": 4.519779494157196e-05, + "loss": 1.4871, + "num_input_tokens_seen": 25944256, "step": 48000 }, { - "epoch": 1.1642980603034376, - "grad_norm": 6.1298441886901855, - "learning_rate": 4.417862972921068e-05, - "loss": 2.0572, - "num_input_tokens_seen": 26180626, + "epoch": 0.48523291179766287, + "grad_norm": 7.525789260864258, + "learning_rate": 4.5147770930046424e-05, + "loss": 1.5077, + "num_input_tokens_seen": 26218880, "step": 48500 }, { - "epoch": 1.176301133090071, - "grad_norm": 6.606161117553711, - "learning_rate": 4.411861436527751e-05, - "loss": 2.0561, - "num_input_tokens_seen": 26455234, + "epoch": 0.4902353129502161, + "grad_norm": 7.0813517570495605, + "learning_rate": 4.5097746918520894e-05, + "loss": 1.5036, + "num_input_tokens_seen": 26492264, "step": 49000 }, { - "epoch": 1.1883042058767044, - "grad_norm": 6.336781024932861, - "learning_rate": 4.4058599001344346e-05, - "loss": 2.0382, - "num_input_tokens_seen": 26728138, + "epoch": 0.49523771410276934, + "grad_norm": 10.278409004211426, + "learning_rate": 4.504772290699536e-05, + "loss": 1.4888, + "num_input_tokens_seen": 26761928, "step": 49500 }, { - "epoch": 1.2003072786633378, - "grad_norm": 6.390580654144287, - "learning_rate": 4.399858363741118e-05, - "loss": 2.0478, - "num_input_tokens_seen": 27002778, + "epoch": 0.5002401152553225, + "grad_norm": 10.73009967803955, + "learning_rate": 4.499769889546983e-05, + "loss": 1.4941, + "num_input_tokens_seen": 27033072, "step": 50000 }, { - "epoch": 1.2123103514499711, - "grad_norm": 7.768282413482666, - "learning_rate": 4.3938568273478016e-05, - "loss": 2.0553, - "num_input_tokens_seen": 27273858, + "epoch": 0.5052425164078758, + "grad_norm": 8.047172546386719, + "learning_rate": 4.49476748839443e-05, + "loss": 1.4882, + "num_input_tokens_seen": 27297448, "step": 50500 }, { - "epoch": 1.2243134242366045, - "grad_norm": 5.878102779388428, - "learning_rate": 4.3878552909544845e-05, - "loss": 2.0442, - "num_input_tokens_seen": 27540218, + "epoch": 0.510244917560429, + "grad_norm": 7.059236526489258, + "learning_rate": 4.489765087241876e-05, + "loss": 1.5093, + "num_input_tokens_seen": 27572696, "step": 51000 }, { - "epoch": 1.2363164970232379, - "grad_norm": 8.041106224060059, - "learning_rate": 4.3818537545611674e-05, - "loss": 2.0397, - "num_input_tokens_seen": 27812802, + "epoch": 0.5152473187129822, + "grad_norm": 6.407097816467285, + "learning_rate": 4.484762686089323e-05, + "loss": 1.4841, + "num_input_tokens_seen": 27850272, "step": 51500 }, { - "epoch": 1.2483195698098712, - "grad_norm": 7.381927967071533, - "learning_rate": 4.3758522181678516e-05, - "loss": 2.0352, - "num_input_tokens_seen": 28083754, + "epoch": 0.5202497198655355, + "grad_norm": 7.751036167144775, + "learning_rate": 4.47976028493677e-05, + "loss": 1.4797, + "num_input_tokens_seen": 28125984, "step": 52000 }, { - "epoch": 1.2603226425965048, - "grad_norm": 7.340395450592041, - "learning_rate": 4.3698506817745345e-05, - "loss": 2.0271, - "num_input_tokens_seen": 28350122, + "epoch": 0.5252521210180887, + "grad_norm": 7.869476795196533, + "learning_rate": 4.474757883784217e-05, + "loss": 1.4826, + "num_input_tokens_seen": 28400064, "step": 52500 }, { - "epoch": 1.272325715383138, - "grad_norm": 6.426082134246826, - "learning_rate": 4.363849145381218e-05, - "loss": 2.0479, - "num_input_tokens_seen": 28616874, + "epoch": 0.530254522170642, + "grad_norm": 6.702480316162109, + "learning_rate": 4.4697554826316633e-05, + "loss": 1.4888, + "num_input_tokens_seen": 28668520, "step": 53000 }, { - "epoch": 1.2843287881697716, - "grad_norm": 5.428038120269775, - "learning_rate": 4.357847608987901e-05, - "loss": 2.066, - "num_input_tokens_seen": 28891394, + "epoch": 0.5352569233231952, + "grad_norm": 5.916720867156982, + "learning_rate": 4.46475308147911e-05, + "loss": 1.4984, + "num_input_tokens_seen": 28938448, "step": 53500 }, { - "epoch": 1.2963318609564047, - "grad_norm": 5.9977312088012695, - "learning_rate": 4.3518460725945844e-05, - "loss": 2.0455, - "num_input_tokens_seen": 29159002, + "epoch": 0.5402593244757483, + "grad_norm": 8.540026664733887, + "learning_rate": 4.459750680326557e-05, + "loss": 1.5038, + "num_input_tokens_seen": 29207464, "step": 54000 }, { - "epoch": 1.3083349337430383, - "grad_norm": 6.24228572845459, - "learning_rate": 4.345844536201268e-05, - "loss": 2.0339, - "num_input_tokens_seen": 29425642, + "epoch": 0.5452617256283016, + "grad_norm": 6.425217151641846, + "learning_rate": 4.4547482791740036e-05, + "loss": 1.4771, + "num_input_tokens_seen": 29476896, "step": 54500 }, { - "epoch": 1.3203380065296715, - "grad_norm": 6.873836994171143, - "learning_rate": 4.339842999807951e-05, - "loss": 2.0159, - "num_input_tokens_seen": 29696602, + "epoch": 0.5502641267808548, + "grad_norm": 5.669241428375244, + "learning_rate": 4.4497458780214506e-05, + "loss": 1.4618, + "num_input_tokens_seen": 29744952, "step": 55000 }, { - "epoch": 1.332341079316305, - "grad_norm": 6.552412986755371, - "learning_rate": 4.3338414634146344e-05, - "loss": 2.0486, - "num_input_tokens_seen": 29971010, + "epoch": 0.555266527933408, + "grad_norm": 6.841111660003662, + "learning_rate": 4.444743476868897e-05, + "loss": 1.4899, + "num_input_tokens_seen": 30012448, "step": 55500 }, { - "epoch": 1.3443441521029382, - "grad_norm": 5.9392924308776855, - "learning_rate": 4.327839927021318e-05, - "loss": 2.0484, - "num_input_tokens_seen": 30236370, + "epoch": 0.5602689290859613, + "grad_norm": 6.233266353607178, + "learning_rate": 4.439741075716344e-05, + "loss": 1.5034, + "num_input_tokens_seen": 30282976, "step": 56000 }, { - "epoch": 1.3563472248895718, - "grad_norm": 6.460512161254883, - "learning_rate": 4.321838390628001e-05, - "loss": 2.0382, - "num_input_tokens_seen": 30505698, + "epoch": 0.5652713302385145, + "grad_norm": 6.895115375518799, + "learning_rate": 4.434738674563791e-05, + "loss": 1.493, + "num_input_tokens_seen": 30555656, "step": 56500 }, { - "epoch": 1.368350297676205, - "grad_norm": 6.535401344299316, - "learning_rate": 4.3158368542346844e-05, - "loss": 2.0414, - "num_input_tokens_seen": 30775874, + "epoch": 0.5702737313910677, + "grad_norm": 6.7635674476623535, + "learning_rate": 4.429736273411237e-05, + "loss": 1.4729, + "num_input_tokens_seen": 30825952, "step": 57000 }, { - "epoch": 1.3803533704628386, - "grad_norm": 7.04288911819458, - "learning_rate": 4.309835317841367e-05, - "loss": 2.0225, - "num_input_tokens_seen": 31045450, + "epoch": 0.575276132543621, + "grad_norm": 8.052691459655762, + "learning_rate": 4.424733872258684e-05, + "loss": 1.4775, + "num_input_tokens_seen": 31095152, "step": 57500 }, { - "epoch": 1.392356443249472, - "grad_norm": 6.343100070953369, - "learning_rate": 4.3038337814480514e-05, - "loss": 2.0592, - "num_input_tokens_seen": 31315474, + "epoch": 0.5802785336961742, + "grad_norm": 7.0079545974731445, + "learning_rate": 4.419731471106131e-05, + "loss": 1.4726, + "num_input_tokens_seen": 31368672, "step": 58000 }, { - "epoch": 1.4043595160361053, - "grad_norm": 7.222833633422852, - "learning_rate": 4.297832245054734e-05, - "loss": 2.0554, - "num_input_tokens_seen": 31583426, + "epoch": 0.5852809348487273, + "grad_norm": 5.913256645202637, + "learning_rate": 4.414729069953578e-05, + "loss": 1.4758, + "num_input_tokens_seen": 31642464, "step": 58500 }, { - "epoch": 1.4163625888227387, - "grad_norm": 6.587092876434326, - "learning_rate": 4.291830708661417e-05, - "loss": 2.0666, - "num_input_tokens_seen": 31857034, + "epoch": 0.5902833360012806, + "grad_norm": 9.248602867126465, + "learning_rate": 4.4097266688010246e-05, + "loss": 1.4534, + "num_input_tokens_seen": 31913336, "step": 59000 }, { - "epoch": 1.428365661609372, - "grad_norm": 7.614266395568848, - "learning_rate": 4.285829172268101e-05, - "loss": 2.0361, - "num_input_tokens_seen": 32123650, + "epoch": 0.5952857371538338, + "grad_norm": 6.6079936027526855, + "learning_rate": 4.4047242676484716e-05, + "loss": 1.4581, + "num_input_tokens_seen": 32186344, "step": 59500 }, { - "epoch": 1.4403687343960054, - "grad_norm": 6.495140552520752, - "learning_rate": 4.279827635874784e-05, - "loss": 2.0595, - "num_input_tokens_seen": 32390954, + "epoch": 0.600288138306387, + "grad_norm": 6.47177791595459, + "learning_rate": 4.3997218664959185e-05, + "loss": 1.4505, + "num_input_tokens_seen": 32460808, "step": 60000 }, { - "epoch": 1.4523718071826388, - "grad_norm": 6.1124773025512695, - "learning_rate": 4.273826099481468e-05, - "loss": 2.0525, - "num_input_tokens_seen": 32660954, + "epoch": 0.6052905394589403, + "grad_norm": 6.106129169464111, + "learning_rate": 4.394719465343365e-05, + "loss": 1.4701, + "num_input_tokens_seen": 32728200, "step": 60500 }, { - "epoch": 1.4643748799692722, - "grad_norm": 6.0760345458984375, - "learning_rate": 4.267824563088151e-05, - "loss": 2.0268, - "num_input_tokens_seen": 32927826, + "epoch": 0.6102929406114935, + "grad_norm": 8.406516075134277, + "learning_rate": 4.389717064190812e-05, + "loss": 1.4591, + "num_input_tokens_seen": 33005896, "step": 61000 }, { - "epoch": 1.4763779527559056, - "grad_norm": 5.886369228363037, - "learning_rate": 4.261823026694834e-05, - "loss": 2.0277, - "num_input_tokens_seen": 33199402, + "epoch": 0.6152953417640468, + "grad_norm": 9.166064262390137, + "learning_rate": 4.384714663038258e-05, + "loss": 1.4875, + "num_input_tokens_seen": 33269392, "step": 61500 }, { - "epoch": 1.488381025542539, - "grad_norm": 6.692379474639893, - "learning_rate": 4.255821490301518e-05, - "loss": 2.0475, - "num_input_tokens_seen": 33471074, + "epoch": 0.6202977429166, + "grad_norm": 7.39436149597168, + "learning_rate": 4.379712261885706e-05, + "loss": 1.4563, + "num_input_tokens_seen": 33541424, "step": 62000 }, { - "epoch": 1.5003840983291723, - "grad_norm": 6.163520812988281, - "learning_rate": 4.2498199539082006e-05, - "loss": 2.0445, - "num_input_tokens_seen": 33744794, + "epoch": 0.6253001440691532, + "grad_norm": 5.612057685852051, + "learning_rate": 4.374709860733152e-05, + "loss": 1.4761, + "num_input_tokens_seen": 33806192, "step": 62500 }, { - "epoch": 1.5123871711158057, - "grad_norm": 6.49104642868042, - "learning_rate": 4.2438184175148835e-05, - "loss": 2.0546, - "num_input_tokens_seen": 34012698, + "epoch": 0.6303025452217064, + "grad_norm": 7.853974342346191, + "learning_rate": 4.3697074595805985e-05, + "loss": 1.4617, + "num_input_tokens_seen": 34076920, "step": 63000 }, { - "epoch": 1.524390243902439, - "grad_norm": 5.753612041473389, - "learning_rate": 4.237816881121567e-05, - "loss": 2.0216, - "num_input_tokens_seen": 34279938, + "epoch": 0.6353049463742596, + "grad_norm": 8.041677474975586, + "learning_rate": 4.3647050584280455e-05, + "loss": 1.442, + "num_input_tokens_seen": 34340464, "step": 63500 }, { - "epoch": 1.5363933166890724, - "grad_norm": 7.659876823425293, - "learning_rate": 4.2318153447282506e-05, - "loss": 2.061, - "num_input_tokens_seen": 34549090, + "epoch": 0.6403073475268128, + "grad_norm": 7.188261985778809, + "learning_rate": 4.3597026572754925e-05, + "loss": 1.4746, + "num_input_tokens_seen": 34608464, "step": 64000 }, { - "epoch": 1.5483963894757058, - "grad_norm": 6.293288230895996, - "learning_rate": 4.225813808334934e-05, - "loss": 2.0452, - "num_input_tokens_seen": 34822178, + "epoch": 0.6453097486793661, + "grad_norm": 7.935949802398682, + "learning_rate": 4.3547002561229395e-05, + "loss": 1.4655, + "num_input_tokens_seen": 34878936, "step": 64500 }, { - "epoch": 1.5603994622623392, - "grad_norm": 6.05708122253418, - "learning_rate": 4.219812271941617e-05, - "loss": 2.0383, - "num_input_tokens_seen": 35091458, + "epoch": 0.6503121498319193, + "grad_norm": 6.211294651031494, + "learning_rate": 4.349697854970386e-05, + "loss": 1.4481, + "num_input_tokens_seen": 35148000, "step": 65000 }, { - "epoch": 1.5724025350489725, - "grad_norm": 6.689181804656982, - "learning_rate": 4.2138107355483006e-05, - "loss": 2.0337, - "num_input_tokens_seen": 35361794, + "epoch": 0.6553145509844726, + "grad_norm": 8.786713600158691, + "learning_rate": 4.344695453817833e-05, + "loss": 1.4541, + "num_input_tokens_seen": 35425104, "step": 65500 }, { - "epoch": 1.584405607835606, - "grad_norm": 5.993900775909424, - "learning_rate": 4.207809199154984e-05, - "loss": 2.0345, - "num_input_tokens_seen": 35625594, + "epoch": 0.6603169521370258, + "grad_norm": 7.866344928741455, + "learning_rate": 4.33969305266528e-05, + "loss": 1.4537, + "num_input_tokens_seen": 35693952, "step": 66000 }, { - "epoch": 1.5964086806222393, - "grad_norm": 8.1398286819458, - "learning_rate": 4.201807662761667e-05, - "loss": 2.0596, - "num_input_tokens_seen": 35892314, + "epoch": 0.665319353289579, + "grad_norm": 7.549289703369141, + "learning_rate": 4.334690651512726e-05, + "loss": 1.4569, + "num_input_tokens_seen": 35966608, "step": 66500 }, { - "epoch": 1.6084117534088727, - "grad_norm": 6.438960552215576, - "learning_rate": 4.1958061263683505e-05, - "loss": 2.0235, - "num_input_tokens_seen": 36158834, + "epoch": 0.6703217544421323, + "grad_norm": 6.588021278381348, + "learning_rate": 4.329688250360173e-05, + "loss": 1.4477, + "num_input_tokens_seen": 36238976, "step": 67000 }, { - "epoch": 1.620414826195506, - "grad_norm": 6.247795581817627, - "learning_rate": 4.189804589975034e-05, - "loss": 2.0396, - "num_input_tokens_seen": 36428610, + "epoch": 0.6753241555946854, + "grad_norm": 7.3665852546691895, + "learning_rate": 4.32468584920762e-05, + "loss": 1.4524, + "num_input_tokens_seen": 36504184, "step": 67500 }, { - "epoch": 1.6324178989821394, - "grad_norm": 7.116842746734619, - "learning_rate": 4.183803053581717e-05, - "loss": 2.0474, - "num_input_tokens_seen": 36697826, + "epoch": 0.6803265567472386, + "grad_norm": 8.99618911743164, + "learning_rate": 4.319683448055067e-05, + "loss": 1.4559, + "num_input_tokens_seen": 36767640, "step": 68000 }, { - "epoch": 1.6444209717687728, - "grad_norm": 7.737421989440918, - "learning_rate": 4.1778015171884005e-05, - "loss": 2.0426, - "num_input_tokens_seen": 36972058, + "epoch": 0.6853289578997919, + "grad_norm": 8.354264259338379, + "learning_rate": 4.3146810469025134e-05, + "loss": 1.4552, + "num_input_tokens_seen": 37033640, "step": 68500 }, { - "epoch": 1.6564240445554061, - "grad_norm": 6.0263824462890625, - "learning_rate": 4.1717999807950833e-05, - "loss": 2.0206, - "num_input_tokens_seen": 37241602, + "epoch": 0.6903313590523451, + "grad_norm": 6.712357521057129, + "learning_rate": 4.30967864574996e-05, + "loss": 1.4428, + "num_input_tokens_seen": 37301576, "step": 69000 }, { - "epoch": 1.6684271173420395, - "grad_norm": 7.804500102996826, - "learning_rate": 4.1657984444017676e-05, - "loss": 2.0592, - "num_input_tokens_seen": 37514258, + "epoch": 0.6953337602048983, + "grad_norm": 6.98289680480957, + "learning_rate": 4.3046762445974074e-05, + "loss": 1.4389, + "num_input_tokens_seen": 37575040, "step": 69500 }, { - "epoch": 1.680430190128673, - "grad_norm": 7.370055198669434, - "learning_rate": 4.1597969080084504e-05, - "loss": 2.036, - "num_input_tokens_seen": 37788834, + "epoch": 0.7003361613574516, + "grad_norm": 7.0615410804748535, + "learning_rate": 4.299673843444854e-05, + "loss": 1.4323, + "num_input_tokens_seen": 37844304, "step": 70000 }, { - "epoch": 1.6924332629153063, - "grad_norm": 7.968544006347656, - "learning_rate": 4.153795371615133e-05, - "loss": 2.0351, - "num_input_tokens_seen": 38055162, + "epoch": 0.7053385625100048, + "grad_norm": 7.445618152618408, + "learning_rate": 4.294671442292301e-05, + "loss": 1.437, + "num_input_tokens_seen": 38118688, "step": 70500 }, { - "epoch": 1.7044363357019396, - "grad_norm": 7.359837055206299, - "learning_rate": 4.147793835221817e-05, - "loss": 2.0314, - "num_input_tokens_seen": 38329538, + "epoch": 0.7103409636625581, + "grad_norm": 5.989320278167725, + "learning_rate": 4.289669041139747e-05, + "loss": 1.4525, + "num_input_tokens_seen": 38387808, "step": 71000 }, { - "epoch": 1.716439408488573, - "grad_norm": 6.660539627075195, - "learning_rate": 4.1417922988285004e-05, - "loss": 2.0556, - "num_input_tokens_seen": 38602594, + "epoch": 0.7153433648151113, + "grad_norm": 6.6483283042907715, + "learning_rate": 4.284666639987194e-05, + "loss": 1.4389, + "num_input_tokens_seen": 38648568, "step": 71500 }, { - "epoch": 1.7284424812752066, - "grad_norm": 5.716453552246094, - "learning_rate": 4.135790762435184e-05, - "loss": 2.042, - "num_input_tokens_seen": 38877274, + "epoch": 0.7203457659676645, + "grad_norm": 7.410543918609619, + "learning_rate": 4.279664238834641e-05, + "loss": 1.4205, + "num_input_tokens_seen": 38924584, "step": 72000 }, { - "epoch": 1.7404455540618398, - "grad_norm": 5.669586658477783, - "learning_rate": 4.129789226041867e-05, - "loss": 2.0546, - "num_input_tokens_seen": 39143802, + "epoch": 0.7253481671202177, + "grad_norm": 6.462342739105225, + "learning_rate": 4.274661837682087e-05, + "loss": 1.4286, + "num_input_tokens_seen": 39189080, "step": 72500 }, { - "epoch": 1.7524486268484734, - "grad_norm": 6.265084266662598, - "learning_rate": 4.12378768964855e-05, - "loss": 2.0271, - "num_input_tokens_seen": 39410442, + "epoch": 0.7303505682727709, + "grad_norm": 6.404929161071777, + "learning_rate": 4.269659436529534e-05, + "loss": 1.4475, + "num_input_tokens_seen": 39456616, "step": 73000 }, { - "epoch": 1.7644516996351065, - "grad_norm": 5.818909645080566, - "learning_rate": 4.117786153255234e-05, - "loss": 2.0449, - "num_input_tokens_seen": 39677642, + "epoch": 0.7353529694253241, + "grad_norm": 7.46643590927124, + "learning_rate": 4.264657035376981e-05, + "loss": 1.4374, + "num_input_tokens_seen": 39729608, "step": 73500 }, { - "epoch": 1.77645477242174, - "grad_norm": 6.75243616104126, - "learning_rate": 4.111784616861917e-05, - "loss": 2.0462, - "num_input_tokens_seen": 39945162, + "epoch": 0.7403553705778774, + "grad_norm": 6.626592636108398, + "learning_rate": 4.259654634224428e-05, + "loss": 1.4194, + "num_input_tokens_seen": 39998368, "step": 74000 }, { - "epoch": 1.7884578452083733, - "grad_norm": 7.021036624908447, - "learning_rate": 4.1057830804686e-05, - "loss": 2.0157, - "num_input_tokens_seen": 40213378, + "epoch": 0.7453577717304306, + "grad_norm": 6.95764684677124, + "learning_rate": 4.2546522330718746e-05, + "loss": 1.4276, + "num_input_tokens_seen": 40271760, "step": 74500 }, { - "epoch": 1.8004609179950068, - "grad_norm": 6.168594837188721, - "learning_rate": 4.099781544075283e-05, - "loss": 2.0338, - "num_input_tokens_seen": 40480770, + "epoch": 0.7503601728829838, + "grad_norm": 5.589132785797119, + "learning_rate": 4.249649831919321e-05, + "loss": 1.4304, + "num_input_tokens_seen": 40539392, "step": 75000 }, { - "epoch": 1.81246399078164, - "grad_norm": 6.919841766357422, - "learning_rate": 4.093780007681967e-05, - "loss": 2.0252, - "num_input_tokens_seen": 40754610, + "epoch": 0.7553625740355371, + "grad_norm": 6.854423999786377, + "learning_rate": 4.2446474307667686e-05, + "loss": 1.4259, + "num_input_tokens_seen": 40805536, "step": 75500 }, { - "epoch": 1.8244670635682736, - "grad_norm": 6.469540119171143, - "learning_rate": 4.08777847128865e-05, - "loss": 2.0642, - "num_input_tokens_seen": 41020098, + "epoch": 0.7603649751880903, + "grad_norm": 7.378007888793945, + "learning_rate": 4.239645029614215e-05, + "loss": 1.4145, + "num_input_tokens_seen": 41074104, "step": 76000 }, { - "epoch": 1.8364701363549067, - "grad_norm": 7.299063205718994, - "learning_rate": 4.081776934895333e-05, - "loss": 2.0182, - "num_input_tokens_seen": 41288074, + "epoch": 0.7653673763406436, + "grad_norm": 6.971609592437744, + "learning_rate": 4.234642628461662e-05, + "loss": 1.421, + "num_input_tokens_seen": 41344576, "step": 76500 }, { - "epoch": 1.8484732091415403, - "grad_norm": 7.042043209075928, - "learning_rate": 4.075775398502017e-05, - "loss": 2.0147, - "num_input_tokens_seen": 41557010, + "epoch": 0.7703697774931967, + "grad_norm": 7.222036838531494, + "learning_rate": 4.229640227309108e-05, + "loss": 1.4314, + "num_input_tokens_seen": 41615376, "step": 77000 }, { - "epoch": 1.8604762819281735, - "grad_norm": 7.198431015014648, - "learning_rate": 4.0697738621087e-05, - "loss": 2.0275, - "num_input_tokens_seen": 41828674, + "epoch": 0.7753721786457499, + "grad_norm": 6.561624526977539, + "learning_rate": 4.224637826156556e-05, + "loss": 1.409, + "num_input_tokens_seen": 41886272, "step": 77500 }, { - "epoch": 1.872479354714807, - "grad_norm": 5.926443099975586, - "learning_rate": 4.063772325715383e-05, - "loss": 2.0271, - "num_input_tokens_seen": 42092282, + "epoch": 0.7803745797983032, + "grad_norm": 6.4644646644592285, + "learning_rate": 4.219635425004002e-05, + "loss": 1.4101, + "num_input_tokens_seen": 42156112, "step": 78000 }, { - "epoch": 1.8844824275014402, - "grad_norm": 6.787843227386475, - "learning_rate": 4.0577707893220667e-05, - "loss": 2.0453, - "num_input_tokens_seen": 42364482, + "epoch": 0.7853769809508564, + "grad_norm": 6.069692611694336, + "learning_rate": 4.2146330238514485e-05, + "loss": 1.4213, + "num_input_tokens_seen": 42424448, "step": 78500 }, { - "epoch": 1.8964855002880738, - "grad_norm": 5.574174404144287, - "learning_rate": 4.05176925292875e-05, - "loss": 2.0163, - "num_input_tokens_seen": 42637586, + "epoch": 0.7903793821034096, + "grad_norm": 6.701622486114502, + "learning_rate": 4.2096306226988955e-05, + "loss": 1.4204, + "num_input_tokens_seen": 42685864, "step": 79000 }, { - "epoch": 1.908488573074707, - "grad_norm": 6.725111484527588, - "learning_rate": 4.045767716535434e-05, - "loss": 2.0229, - "num_input_tokens_seen": 42908706, + "epoch": 0.7953817832559629, + "grad_norm": 8.732488632202148, + "learning_rate": 4.2046282215463425e-05, + "loss": 1.4225, + "num_input_tokens_seen": 42959264, "step": 79500 }, { - "epoch": 1.9204916458613406, - "grad_norm": 7.450390815734863, - "learning_rate": 4.0397661801421166e-05, - "loss": 2.0371, - "num_input_tokens_seen": 43183010, + "epoch": 0.8003841844085161, + "grad_norm": 7.264562129974365, + "learning_rate": 4.1996258203937895e-05, + "loss": 1.4044, + "num_input_tokens_seen": 43231192, "step": 80000 }, { - "epoch": 1.9324947186479737, - "grad_norm": 6.539551734924316, - "learning_rate": 4.0337646437487995e-05, - "loss": 2.008, - "num_input_tokens_seen": 43455834, + "epoch": 0.8053865855610693, + "grad_norm": 7.394875526428223, + "learning_rate": 4.194623419241236e-05, + "loss": 1.4211, + "num_input_tokens_seen": 43500168, "step": 80500 }, { - "epoch": 1.9444977914346073, - "grad_norm": 7.092611312866211, - "learning_rate": 4.027763107355483e-05, - "loss": 2.0392, - "num_input_tokens_seen": 43727626, + "epoch": 0.8103889867136226, + "grad_norm": 6.593264102935791, + "learning_rate": 4.189621018088683e-05, + "loss": 1.4179, + "num_input_tokens_seen": 43774064, "step": 81000 }, { - "epoch": 1.9565008642212405, - "grad_norm": 6.905966281890869, - "learning_rate": 4.0217615709621666e-05, - "loss": 2.0161, - "num_input_tokens_seen": 44001058, + "epoch": 0.8153913878661757, + "grad_norm": 7.966070175170898, + "learning_rate": 4.18461861693613e-05, + "loss": 1.4439, + "num_input_tokens_seen": 44046680, "step": 81500 }, { - "epoch": 1.968503937007874, - "grad_norm": 7.3359503746032715, - "learning_rate": 4.01576003456885e-05, - "loss": 2.0181, - "num_input_tokens_seen": 44272266, + "epoch": 0.820393789018729, + "grad_norm": 10.988821029663086, + "learning_rate": 4.179616215783576e-05, + "loss": 1.43, + "num_input_tokens_seen": 44320912, "step": 82000 }, { - "epoch": 1.9805070097945074, - "grad_norm": 6.227381706237793, - "learning_rate": 4.009758498175533e-05, - "loss": 2.0304, - "num_input_tokens_seen": 44543626, + "epoch": 0.8253961901712822, + "grad_norm": 6.874449729919434, + "learning_rate": 4.174613814631023e-05, + "loss": 1.4199, + "num_input_tokens_seen": 44593488, "step": 82500 }, { - "epoch": 1.9925100825811408, - "grad_norm": 6.048249244689941, - "learning_rate": 4.0037569617822165e-05, - "loss": 2.0294, - "num_input_tokens_seen": 44812362, + "epoch": 0.8303985913238354, + "grad_norm": 7.1776838302612305, + "learning_rate": 4.1696114134784694e-05, + "loss": 1.4131, + "num_input_tokens_seen": 44862096, "step": 83000 }, { - "epoch": 2.0, - "eval_loss": 1.9504808187484741, - "eval_runtime": 77.8836, - "eval_samples_per_second": 1069.686, - "eval_steps_per_second": 133.712, - "num_input_tokens_seen": 44979297, - "step": 83312 - }, - { - "epoch": 2.004513155367774, - "grad_norm": 6.9635515213012695, - "learning_rate": 3.9977554253889e-05, - "loss": 1.9596, - "num_input_tokens_seen": 45082185, + "epoch": 0.8354009924763887, + "grad_norm": 7.381138801574707, + "learning_rate": 4.164609012325917e-05, + "loss": 1.4059, + "num_input_tokens_seen": 45130632, "step": 83500 }, { - "epoch": 2.0165162281544076, - "grad_norm": 7.484299182891846, - "learning_rate": 3.991753888995583e-05, - "loss": 1.8417, - "num_input_tokens_seen": 45349033, + "epoch": 0.8404033936289419, + "grad_norm": 8.17155933380127, + "learning_rate": 4.1596066111733634e-05, + "loss": 1.4425, + "num_input_tokens_seen": 45400856, "step": 84000 }, { - "epoch": 2.0285193009410407, - "grad_norm": 6.063692092895508, - "learning_rate": 3.9857523526022665e-05, - "loss": 1.8613, - "num_input_tokens_seen": 45618753, + "epoch": 0.8454057947814951, + "grad_norm": 6.636998176574707, + "learning_rate": 4.1546042100208104e-05, + "loss": 1.3979, + "num_input_tokens_seen": 45671560, "step": 84500 }, { - "epoch": 2.0405223737276743, - "grad_norm": 6.687953472137451, - "learning_rate": 3.97975081620895e-05, - "loss": 1.8567, - "num_input_tokens_seen": 45886905, + "epoch": 0.8504081959340484, + "grad_norm": 5.552203178405762, + "learning_rate": 4.149601808868257e-05, + "loss": 1.4255, + "num_input_tokens_seen": 45945496, "step": 85000 }, { - "epoch": 2.0525254465143075, - "grad_norm": 6.7604827880859375, - "learning_rate": 3.973749279815633e-05, - "loss": 1.8526, - "num_input_tokens_seen": 46156441, + "epoch": 0.8554105970866016, + "grad_norm": 7.160405158996582, + "learning_rate": 4.144599407715704e-05, + "loss": 1.4122, + "num_input_tokens_seen": 46213464, "step": 85500 }, { - "epoch": 2.064528519300941, - "grad_norm": 8.532577514648438, - "learning_rate": 3.9677477434223164e-05, - "loss": 1.8464, - "num_input_tokens_seen": 46420697, + "epoch": 0.8604129982391548, + "grad_norm": 7.1668381690979, + "learning_rate": 4.139597006563151e-05, + "loss": 1.407, + "num_input_tokens_seen": 46482056, "step": 86000 }, { - "epoch": 2.076531592087574, - "grad_norm": 6.2575178146362305, - "learning_rate": 3.961746207028999e-05, - "loss": 1.8936, - "num_input_tokens_seen": 46689585, + "epoch": 0.865415399391708, + "grad_norm": 6.595818996429443, + "learning_rate": 4.134594605410597e-05, + "loss": 1.4052, + "num_input_tokens_seen": 46754792, "step": 86500 }, { - "epoch": 2.088534664874208, - "grad_norm": 6.570549488067627, - "learning_rate": 3.955744670635683e-05, - "loss": 1.8696, - "num_input_tokens_seen": 46962257, + "epoch": 0.8704178005442612, + "grad_norm": 7.962093830108643, + "learning_rate": 4.129592204258044e-05, + "loss": 1.3779, + "num_input_tokens_seen": 47023888, "step": 87000 }, { - "epoch": 2.100537737660841, - "grad_norm": 6.526065349578857, - "learning_rate": 3.9497431342423664e-05, - "loss": 1.8814, - "num_input_tokens_seen": 47229817, + "epoch": 0.8754202016968144, + "grad_norm": 5.4436421394348145, + "learning_rate": 4.124589803105491e-05, + "loss": 1.395, + "num_input_tokens_seen": 47294240, "step": 87500 }, { - "epoch": 2.1125408104474745, - "grad_norm": 7.117393970489502, - "learning_rate": 3.943741597849049e-05, - "loss": 1.8791, - "num_input_tokens_seen": 47497241, + "epoch": 0.8804226028493677, + "grad_norm": 9.327848434448242, + "learning_rate": 4.119587401952937e-05, + "loss": 1.4015, + "num_input_tokens_seen": 47564360, "step": 88000 }, { - "epoch": 2.124543883234108, - "grad_norm": 6.1792683601379395, - "learning_rate": 3.937740061455733e-05, - "loss": 1.8561, - "num_input_tokens_seen": 47766673, + "epoch": 0.8854250040019209, + "grad_norm": 5.366121768951416, + "learning_rate": 4.114585000800384e-05, + "loss": 1.4136, + "num_input_tokens_seen": 47846352, "step": 88500 }, { - "epoch": 2.1365469560207413, - "grad_norm": 5.680914402008057, - "learning_rate": 3.9317385250624164e-05, - "loss": 1.8764, - "num_input_tokens_seen": 48040913, + "epoch": 0.8904274051544742, + "grad_norm": 5.672398090362549, + "learning_rate": 4.109582599647831e-05, + "loss": 1.395, + "num_input_tokens_seen": 48117352, "step": 89000 }, { - "epoch": 2.148550028807375, - "grad_norm": 5.456541538238525, - "learning_rate": 3.925736988669099e-05, - "loss": 1.9065, - "num_input_tokens_seen": 48315145, + "epoch": 0.8954298063070274, + "grad_norm": 7.147487163543701, + "learning_rate": 4.104580198495278e-05, + "loss": 1.3946, + "num_input_tokens_seen": 48390832, "step": 89500 }, { - "epoch": 2.160553101594008, - "grad_norm": 6.988635063171387, - "learning_rate": 3.919735452275783e-05, - "loss": 1.8893, - "num_input_tokens_seen": 48591993, + "epoch": 0.9004322074595806, + "grad_norm": 9.567891120910645, + "learning_rate": 4.0995777973427246e-05, + "loss": 1.4111, + "num_input_tokens_seen": 48667984, "step": 90000 }, { - "epoch": 2.1725561743806416, - "grad_norm": 6.184472560882568, - "learning_rate": 3.9137339158824656e-05, - "loss": 1.8732, - "num_input_tokens_seen": 48863073, + "epoch": 0.9054346086121339, + "grad_norm": 7.761517524719238, + "learning_rate": 4.0945753961901716e-05, + "loss": 1.3972, + "num_input_tokens_seen": 48941240, "step": 90500 }, { - "epoch": 2.184559247167275, - "grad_norm": 6.220519065856934, - "learning_rate": 3.90773237948915e-05, - "loss": 1.8882, - "num_input_tokens_seen": 49134417, + "epoch": 0.910437009764687, + "grad_norm": 8.4068603515625, + "learning_rate": 4.089572995037618e-05, + "loss": 1.3894, + "num_input_tokens_seen": 49212696, "step": 91000 }, { - "epoch": 2.1965623199539084, - "grad_norm": 5.634116172790527, - "learning_rate": 3.901730843095833e-05, - "loss": 1.8948, - "num_input_tokens_seen": 49407905, + "epoch": 0.9154394109172402, + "grad_norm": 5.621284008026123, + "learning_rate": 4.084570593885065e-05, + "loss": 1.3823, + "num_input_tokens_seen": 49481760, "step": 91500 }, { - "epoch": 2.2085653927405415, - "grad_norm": 6.272744655609131, - "learning_rate": 3.8957293067025156e-05, - "loss": 1.9103, - "num_input_tokens_seen": 49679241, + "epoch": 0.9204418120697935, + "grad_norm": 8.205471992492676, + "learning_rate": 4.079568192732512e-05, + "loss": 1.3886, + "num_input_tokens_seen": 49753376, "step": 92000 }, { - "epoch": 2.220568465527175, - "grad_norm": 6.8459930419921875, - "learning_rate": 3.889727770309199e-05, - "loss": 1.8562, - "num_input_tokens_seen": 49952161, + "epoch": 0.9254442132223467, + "grad_norm": 8.143417358398438, + "learning_rate": 4.074565791579958e-05, + "loss": 1.384, + "num_input_tokens_seen": 50025128, "step": 92500 }, { - "epoch": 2.2325715383138083, - "grad_norm": 6.239691734313965, - "learning_rate": 3.883726233915883e-05, - "loss": 1.9002, - "num_input_tokens_seen": 50223441, + "epoch": 0.9304466143749, + "grad_norm": 7.172451496124268, + "learning_rate": 4.069563390427405e-05, + "loss": 1.4011, + "num_input_tokens_seen": 50301448, "step": 93000 }, { - "epoch": 2.244574611100442, - "grad_norm": 7.5250630378723145, - "learning_rate": 3.877724697522566e-05, - "loss": 1.8795, - "num_input_tokens_seen": 50494073, + "epoch": 0.9354490155274532, + "grad_norm": 7.71168851852417, + "learning_rate": 4.064560989274852e-05, + "loss": 1.3702, + "num_input_tokens_seen": 50569616, "step": 93500 }, { - "epoch": 2.256577683887075, - "grad_norm": 6.001899719238281, - "learning_rate": 3.871723161129249e-05, - "loss": 1.9055, - "num_input_tokens_seen": 50765569, + "epoch": 0.9404514166800064, + "grad_norm": 7.981653213500977, + "learning_rate": 4.059558588122299e-05, + "loss": 1.3897, + "num_input_tokens_seen": 50842808, "step": 94000 }, { - "epoch": 2.2685807566737086, - "grad_norm": 6.942378520965576, - "learning_rate": 3.8657216247359327e-05, - "loss": 1.9011, - "num_input_tokens_seen": 51035257, + "epoch": 0.9454538178325597, + "grad_norm": 6.760748386383057, + "learning_rate": 4.0545561869697455e-05, + "loss": 1.3878, + "num_input_tokens_seen": 51121760, "step": 94500 }, { - "epoch": 2.2805838294603418, - "grad_norm": 6.070522785186768, - "learning_rate": 3.859720088342616e-05, - "loss": 1.906, - "num_input_tokens_seen": 51304633, + "epoch": 0.9504562189851129, + "grad_norm": 7.034352779388428, + "learning_rate": 4.0495537858171925e-05, + "loss": 1.4073, + "num_input_tokens_seen": 51392648, "step": 95000 }, { - "epoch": 2.2925869022469754, - "grad_norm": 6.005266189575195, - "learning_rate": 3.853718551949299e-05, - "loss": 1.8874, - "num_input_tokens_seen": 51567921, + "epoch": 0.955458620137666, + "grad_norm": 6.021711349487305, + "learning_rate": 4.0445513846646395e-05, + "loss": 1.4106, + "num_input_tokens_seen": 51657888, "step": 95500 }, { - "epoch": 2.3045899750336085, - "grad_norm": 6.569184303283691, - "learning_rate": 3.8477170155559826e-05, - "loss": 1.8886, - "num_input_tokens_seen": 51836521, + "epoch": 0.9604610212902193, + "grad_norm": 7.470587253570557, + "learning_rate": 4.039548983512086e-05, + "loss": 1.3982, + "num_input_tokens_seen": 51934352, "step": 96000 }, { - "epoch": 2.316593047820242, - "grad_norm": 6.006720066070557, - "learning_rate": 3.841715479162666e-05, - "loss": 1.9048, - "num_input_tokens_seen": 52109561, + "epoch": 0.9654634224427725, + "grad_norm": 6.424021244049072, + "learning_rate": 4.034546582359533e-05, + "loss": 1.3788, + "num_input_tokens_seen": 52209064, "step": 96500 }, { - "epoch": 2.3285961206068753, - "grad_norm": 6.458248138427734, - "learning_rate": 3.835713942769349e-05, - "loss": 1.8977, - "num_input_tokens_seen": 52378169, + "epoch": 0.9704658235953257, + "grad_norm": 7.6357197761535645, + "learning_rate": 4.02954418120698e-05, + "loss": 1.3714, + "num_input_tokens_seen": 52474304, "step": 97000 }, { - "epoch": 2.340599193393509, - "grad_norm": 6.633649826049805, - "learning_rate": 3.8297124063760326e-05, - "loss": 1.8693, - "num_input_tokens_seen": 52642025, + "epoch": 0.975468224747879, + "grad_norm": 8.156658172607422, + "learning_rate": 4.024541780054426e-05, + "loss": 1.3992, + "num_input_tokens_seen": 52747832, "step": 97500 }, { - "epoch": 2.352602266180142, - "grad_norm": 6.779860019683838, - "learning_rate": 3.8237108699827154e-05, - "loss": 1.9123, - "num_input_tokens_seen": 52915873, + "epoch": 0.9804706259004322, + "grad_norm": 6.052001953125, + "learning_rate": 4.019539378901873e-05, + "loss": 1.3758, + "num_input_tokens_seen": 53024352, "step": 98000 }, { - "epoch": 2.3646053389667756, - "grad_norm": 6.39419412612915, - "learning_rate": 3.817709333589399e-05, - "loss": 1.8969, - "num_input_tokens_seen": 53187001, + "epoch": 0.9854730270529855, + "grad_norm": 6.635683059692383, + "learning_rate": 4.0145369777493194e-05, + "loss": 1.3868, + "num_input_tokens_seen": 53296880, "step": 98500 }, { - "epoch": 2.3766084117534088, - "grad_norm": 7.407958030700684, - "learning_rate": 3.8117077971960825e-05, - "loss": 1.9046, - "num_input_tokens_seen": 53460329, + "epoch": 0.9904754282055387, + "grad_norm": 6.532413482666016, + "learning_rate": 4.009534576596767e-05, + "loss": 1.3582, + "num_input_tokens_seen": 53571800, "step": 99000 }, { - "epoch": 2.3886114845400424, - "grad_norm": 6.006407260894775, - "learning_rate": 3.8057062608027654e-05, - "loss": 1.9105, - "num_input_tokens_seen": 53727321, + "epoch": 0.9954778293580919, + "grad_norm": 6.029451370239258, + "learning_rate": 4.0045321754442134e-05, + "loss": 1.3772, + "num_input_tokens_seen": 53843520, "step": 99500 }, { - "epoch": 2.4006145573266755, - "grad_norm": 5.723968029022217, - "learning_rate": 3.799704724409449e-05, - "loss": 1.8741, - "num_input_tokens_seen": 54000233, + "epoch": 1.0, + "eval_loss": 1.2156304121017456, + "eval_runtime": 188.3426, + "eval_samples_per_second": 1061.39, + "eval_steps_per_second": 132.678, + "num_input_tokens_seen": 54090088, + "step": 99952 + }, + { + "epoch": 1.000480230510645, + "grad_norm": 5.276439189910889, + "learning_rate": 3.9995297742916604e-05, + "loss": 1.2803, + "num_input_tokens_seen": 54117480, "step": 100000 }, { - "epoch": 2.412617630113309, - "grad_norm": 7.125787258148193, - "learning_rate": 3.7937031880161325e-05, - "loss": 1.8838, - "num_input_tokens_seen": 54265753, + "epoch": 1.0054826316631984, + "grad_norm": 6.911218643188477, + "learning_rate": 3.994527373139107e-05, + "loss": 1.2695, + "num_input_tokens_seen": 54389512, "step": 100500 }, { - "epoch": 2.4246207028999422, - "grad_norm": 6.293359279632568, - "learning_rate": 3.787701651622816e-05, - "loss": 1.9044, - "num_input_tokens_seen": 54536841, + "epoch": 1.0104850328157515, + "grad_norm": 9.460619926452637, + "learning_rate": 3.989524971986554e-05, + "loss": 1.2787, + "num_input_tokens_seen": 54665304, "step": 101000 }, { - "epoch": 2.436623775686576, - "grad_norm": 6.983723163604736, - "learning_rate": 3.781700115229499e-05, - "loss": 1.8807, - "num_input_tokens_seen": 54804817, + "epoch": 1.0154874339683049, + "grad_norm": 7.135129928588867, + "learning_rate": 3.984522570834001e-05, + "loss": 1.2616, + "num_input_tokens_seen": 54935144, "step": 101500 }, { - "epoch": 2.448626848473209, - "grad_norm": 5.855993270874023, - "learning_rate": 3.775698578836182e-05, - "loss": 1.9018, - "num_input_tokens_seen": 55077585, + "epoch": 1.020489835120858, + "grad_norm": 7.705801010131836, + "learning_rate": 3.979520169681447e-05, + "loss": 1.2673, + "num_input_tokens_seen": 55209312, "step": 102000 }, { - "epoch": 2.4606299212598426, - "grad_norm": 7.468148708343506, - "learning_rate": 3.769697042442866e-05, - "loss": 1.8996, - "num_input_tokens_seen": 55346345, + "epoch": 1.0254922362734111, + "grad_norm": 7.493370532989502, + "learning_rate": 3.974517768528894e-05, + "loss": 1.2623, + "num_input_tokens_seen": 55478440, "step": 102500 }, { - "epoch": 2.4726329940464757, - "grad_norm": 6.516229152679443, - "learning_rate": 3.763695506049549e-05, - "loss": 1.8747, - "num_input_tokens_seen": 55613985, + "epoch": 1.0304946374259645, + "grad_norm": 6.460716724395752, + "learning_rate": 3.969515367376341e-05, + "loss": 1.2881, + "num_input_tokens_seen": 55752896, "step": 103000 }, { - "epoch": 2.4846360668331093, - "grad_norm": 7.534191131591797, - "learning_rate": 3.7576939696562324e-05, - "loss": 1.8857, - "num_input_tokens_seen": 55884209, + "epoch": 1.0354970385785176, + "grad_norm": 7.391408443450928, + "learning_rate": 3.964512966223788e-05, + "loss": 1.2692, + "num_input_tokens_seen": 56028536, "step": 103500 }, { - "epoch": 2.4966391396197425, - "grad_norm": 6.425889492034912, - "learning_rate": 3.751692433262915e-05, - "loss": 1.8862, - "num_input_tokens_seen": 56156169, + "epoch": 1.040499439731071, + "grad_norm": 8.04489803314209, + "learning_rate": 3.9595105650712343e-05, + "loss": 1.2582, + "num_input_tokens_seen": 56297360, "step": 104000 }, { - "epoch": 2.508642212406376, - "grad_norm": 8.255390167236328, - "learning_rate": 3.745690896869599e-05, - "loss": 1.8951, - "num_input_tokens_seen": 56429409, + "epoch": 1.045501840883624, + "grad_norm": 6.487476348876953, + "learning_rate": 3.9545081639186807e-05, + "loss": 1.2791, + "num_input_tokens_seen": 56565576, "step": 104500 }, { - "epoch": 2.5206452851930097, - "grad_norm": 5.765437602996826, - "learning_rate": 3.7396893604762824e-05, - "loss": 1.9169, - "num_input_tokens_seen": 56695049, + "epoch": 1.0505042420361774, + "grad_norm": 7.118215084075928, + "learning_rate": 3.949505762766128e-05, + "loss": 1.2822, + "num_input_tokens_seen": 56841624, "step": 105000 }, { - "epoch": 2.532648357979643, - "grad_norm": 6.959896564483643, - "learning_rate": 3.733687824082965e-05, - "loss": 1.883, - "num_input_tokens_seen": 56958361, + "epoch": 1.0555066431887306, + "grad_norm": 6.419320583343506, + "learning_rate": 3.9445033616135746e-05, + "loss": 1.2533, + "num_input_tokens_seen": 57114504, "step": 105500 }, { - "epoch": 2.544651430766276, - "grad_norm": 7.03372049331665, - "learning_rate": 3.727686287689649e-05, - "loss": 1.8921, - "num_input_tokens_seen": 57228465, + "epoch": 1.060509044341284, + "grad_norm": 6.287978649139404, + "learning_rate": 3.9395009604610216e-05, + "loss": 1.2735, + "num_input_tokens_seen": 57384112, "step": 106000 }, { - "epoch": 2.5566545035529096, - "grad_norm": 6.022884368896484, - "learning_rate": 3.721684751296332e-05, - "loss": 1.9006, - "num_input_tokens_seen": 57497745, + "epoch": 1.065511445493837, + "grad_norm": 6.397841930389404, + "learning_rate": 3.934498559308468e-05, + "loss": 1.2715, + "num_input_tokens_seen": 57658904, "step": 106500 }, { - "epoch": 2.568657576339543, - "grad_norm": 5.677338600158691, - "learning_rate": 3.715683214903015e-05, - "loss": 1.8953, - "num_input_tokens_seen": 57764713, + "epoch": 1.0705138466463904, + "grad_norm": 6.377140998840332, + "learning_rate": 3.929496158155915e-05, + "loss": 1.2764, + "num_input_tokens_seen": 57930592, "step": 107000 }, { - "epoch": 2.5806606491261763, - "grad_norm": 5.7091522216796875, - "learning_rate": 3.709681678509699e-05, - "loss": 1.9122, - "num_input_tokens_seen": 58029033, + "epoch": 1.0755162477989435, + "grad_norm": 7.9464850425720215, + "learning_rate": 3.924493757003362e-05, + "loss": 1.2818, + "num_input_tokens_seen": 58206432, "step": 107500 }, { - "epoch": 2.5926637219128095, - "grad_norm": 6.313847541809082, - "learning_rate": 3.7036801421163816e-05, - "loss": 1.9101, - "num_input_tokens_seen": 58297121, + "epoch": 1.0805186489514966, + "grad_norm": 5.806307792663574, + "learning_rate": 3.919491355850808e-05, + "loss": 1.2724, + "num_input_tokens_seen": 58476704, "step": 108000 }, { - "epoch": 2.604666794699443, - "grad_norm": 6.706091403961182, - "learning_rate": 3.697678605723065e-05, - "loss": 1.9103, - "num_input_tokens_seen": 58565145, + "epoch": 1.08552105010405, + "grad_norm": 7.807882308959961, + "learning_rate": 3.914488954698255e-05, + "loss": 1.2901, + "num_input_tokens_seen": 58749936, "step": 108500 }, { - "epoch": 2.6166698674860767, - "grad_norm": 8.052926063537598, - "learning_rate": 3.691677069329749e-05, - "loss": 1.8918, - "num_input_tokens_seen": 58830209, + "epoch": 1.090523451256603, + "grad_norm": 6.01190185546875, + "learning_rate": 3.909486553545702e-05, + "loss": 1.2691, + "num_input_tokens_seen": 59014736, "step": 109000 }, { - "epoch": 2.62867294027271, - "grad_norm": 6.1878767013549805, - "learning_rate": 3.6856755329364316e-05, - "loss": 1.9117, - "num_input_tokens_seen": 59098017, + "epoch": 1.0955258524091565, + "grad_norm": 5.44499397277832, + "learning_rate": 3.904484152393149e-05, + "loss": 1.2589, + "num_input_tokens_seen": 59280912, "step": 109500 }, { - "epoch": 2.640676013059343, - "grad_norm": 6.439157009124756, - "learning_rate": 3.679673996543115e-05, - "loss": 1.9122, - "num_input_tokens_seen": 59368273, + "epoch": 1.1005282535617096, + "grad_norm": 7.433501243591309, + "learning_rate": 3.8994817512405956e-05, + "loss": 1.2686, + "num_input_tokens_seen": 59549544, "step": 110000 }, { - "epoch": 2.6526790858459766, - "grad_norm": 6.068935871124268, - "learning_rate": 3.673672460149799e-05, - "loss": 1.907, - "num_input_tokens_seen": 59637305, + "epoch": 1.105530654714263, + "grad_norm": 6.828175067901611, + "learning_rate": 3.8944793500880425e-05, + "loss": 1.2747, + "num_input_tokens_seen": 59820728, "step": 110500 }, { - "epoch": 2.66468215863261, - "grad_norm": 7.218774795532227, - "learning_rate": 3.6676709237564815e-05, - "loss": 1.9001, - "num_input_tokens_seen": 59907977, + "epoch": 1.110533055866816, + "grad_norm": 7.450278282165527, + "learning_rate": 3.8894769489354895e-05, + "loss": 1.281, + "num_input_tokens_seen": 60091056, "step": 111000 }, { - "epoch": 2.6766852314192433, - "grad_norm": 5.926177024841309, - "learning_rate": 3.661669387363165e-05, - "loss": 1.9143, - "num_input_tokens_seen": 60183665, + "epoch": 1.1155354570193694, + "grad_norm": 8.60688591003418, + "learning_rate": 3.884474547782936e-05, + "loss": 1.2745, + "num_input_tokens_seen": 60361616, "step": 111500 }, { - "epoch": 2.6886883042058765, - "grad_norm": 6.300632476806641, - "learning_rate": 3.6556678509698486e-05, - "loss": 1.9221, - "num_input_tokens_seen": 60453249, + "epoch": 1.1205378581719225, + "grad_norm": 6.874872207641602, + "learning_rate": 3.879472146630383e-05, + "loss": 1.2663, + "num_input_tokens_seen": 60628632, "step": 112000 }, { - "epoch": 2.70069137699251, - "grad_norm": 7.937029838562012, - "learning_rate": 3.649666314576532e-05, - "loss": 1.9043, - "num_input_tokens_seen": 60724969, + "epoch": 1.1255402593244757, + "grad_norm": 6.488132953643799, + "learning_rate": 3.874469745477829e-05, + "loss": 1.2733, + "num_input_tokens_seen": 60896840, "step": 112500 }, { - "epoch": 2.7126944497791436, - "grad_norm": 6.49869441986084, - "learning_rate": 3.643664778183215e-05, - "loss": 1.9162, - "num_input_tokens_seen": 60990609, + "epoch": 1.130542660477029, + "grad_norm": 10.988000869750977, + "learning_rate": 3.869467344325277e-05, + "loss": 1.2686, + "num_input_tokens_seen": 61170424, "step": 113000 }, { - "epoch": 2.724697522565777, - "grad_norm": 5.954387664794922, - "learning_rate": 3.637663241789898e-05, - "loss": 1.914, - "num_input_tokens_seen": 61264441, + "epoch": 1.1355450616295821, + "grad_norm": 6.758646011352539, + "learning_rate": 3.864464943172723e-05, + "loss": 1.2807, + "num_input_tokens_seen": 61437984, "step": 113500 }, { - "epoch": 2.73670059535241, - "grad_norm": 6.271395683288574, - "learning_rate": 3.6316617053965815e-05, - "loss": 1.9074, - "num_input_tokens_seen": 61534889, + "epoch": 1.1405474627821355, + "grad_norm": 6.983493804931641, + "learning_rate": 3.8594625420201695e-05, + "loss": 1.2807, + "num_input_tokens_seen": 61710280, "step": 114000 }, { - "epoch": 2.7487036681390435, - "grad_norm": 6.180550575256348, - "learning_rate": 3.625660169003265e-05, - "loss": 1.903, - "num_input_tokens_seen": 61806721, + "epoch": 1.1455498639346886, + "grad_norm": 6.096587181091309, + "learning_rate": 3.8544601408676165e-05, + "loss": 1.2711, + "num_input_tokens_seen": 61980960, "step": 114500 }, { - "epoch": 2.760706740925677, - "grad_norm": 5.373987197875977, - "learning_rate": 3.6196586326099485e-05, - "loss": 1.8946, - "num_input_tokens_seen": 62071993, + "epoch": 1.150552265087242, + "grad_norm": 6.4102373123168945, + "learning_rate": 3.8494577397150635e-05, + "loss": 1.2986, + "num_input_tokens_seen": 62254680, "step": 115000 }, { - "epoch": 2.7727098137123103, - "grad_norm": 6.226639747619629, - "learning_rate": 3.6136570962166314e-05, - "loss": 1.9192, - "num_input_tokens_seen": 62337537, + "epoch": 1.155554666239795, + "grad_norm": 7.1004638671875, + "learning_rate": 3.8444553385625105e-05, + "loss": 1.2837, + "num_input_tokens_seen": 62526128, "step": 115500 }, { - "epoch": 2.784712886498944, - "grad_norm": 6.833970546722412, - "learning_rate": 3.607655559823315e-05, - "loss": 1.8931, - "num_input_tokens_seen": 62608193, + "epoch": 1.1605570673923484, + "grad_norm": 6.682748794555664, + "learning_rate": 3.839452937409957e-05, + "loss": 1.2557, + "num_input_tokens_seen": 62793600, "step": 116000 }, { - "epoch": 2.796715959285577, - "grad_norm": 6.820917129516602, - "learning_rate": 3.6016540234299985e-05, - "loss": 1.9033, - "num_input_tokens_seen": 62878537, + "epoch": 1.1655594685449016, + "grad_norm": 5.091439247131348, + "learning_rate": 3.834450536257404e-05, + "loss": 1.2786, + "num_input_tokens_seen": 63060392, "step": 116500 }, { - "epoch": 2.8087190320722106, - "grad_norm": 7.808833122253418, - "learning_rate": 3.5956524870366814e-05, - "loss": 1.9109, - "num_input_tokens_seen": 63147977, + "epoch": 1.1705618696974547, + "grad_norm": 6.379209041595459, + "learning_rate": 3.829448135104851e-05, + "loss": 1.2803, + "num_input_tokens_seen": 63324312, "step": 117000 }, { - "epoch": 2.820722104858844, - "grad_norm": 7.095612525939941, - "learning_rate": 3.589650950643365e-05, - "loss": 1.9215, - "num_input_tokens_seen": 63418673, + "epoch": 1.175564270850008, + "grad_norm": 6.799802780151367, + "learning_rate": 3.824445733952297e-05, + "loss": 1.2776, + "num_input_tokens_seen": 63596232, "step": 117500 }, { - "epoch": 2.8327251776454774, - "grad_norm": 6.833127975463867, - "learning_rate": 3.5836494142500485e-05, - "loss": 1.9063, - "num_input_tokens_seen": 63688897, + "epoch": 1.1805666720025612, + "grad_norm": 5.58148193359375, + "learning_rate": 3.819443332799744e-05, + "loss": 1.2875, + "num_input_tokens_seen": 63863096, "step": 118000 }, { - "epoch": 2.8447282504321105, - "grad_norm": 6.481841087341309, - "learning_rate": 3.577647877856731e-05, - "loss": 1.9237, - "num_input_tokens_seen": 63959729, + "epoch": 1.1855690731551145, + "grad_norm": 6.822576999664307, + "learning_rate": 3.814440931647191e-05, + "loss": 1.2895, + "num_input_tokens_seen": 64138648, "step": 118500 }, { - "epoch": 2.856731323218744, - "grad_norm": 7.2938714027404785, - "learning_rate": 3.571646341463415e-05, - "loss": 1.9094, - "num_input_tokens_seen": 64226761, + "epoch": 1.1905714743076676, + "grad_norm": 8.899248123168945, + "learning_rate": 3.809438530494638e-05, + "loss": 1.2788, + "num_input_tokens_seen": 64408856, "step": 119000 }, { - "epoch": 2.8687343960053773, - "grad_norm": 6.996784210205078, - "learning_rate": 3.565644805070098e-05, - "loss": 1.9191, - "num_input_tokens_seen": 64495177, + "epoch": 1.195573875460221, + "grad_norm": 7.763192653656006, + "learning_rate": 3.8044361293420844e-05, + "loss": 1.2702, + "num_input_tokens_seen": 64679712, "step": 119500 }, { - "epoch": 2.880737468792011, - "grad_norm": 6.03670597076416, - "learning_rate": 3.559643268676782e-05, - "loss": 1.9108, - "num_input_tokens_seen": 64764497, + "epoch": 1.200576276612774, + "grad_norm": 8.2811861038208, + "learning_rate": 3.799433728189531e-05, + "loss": 1.2566, + "num_input_tokens_seen": 64945912, "step": 120000 }, { - "epoch": 2.892740541578644, - "grad_norm": 6.549102306365967, - "learning_rate": 3.553641732283465e-05, - "loss": 1.8846, - "num_input_tokens_seen": 65037881, + "epoch": 1.2055786777653275, + "grad_norm": 5.707862854003906, + "learning_rate": 3.794431327036978e-05, + "loss": 1.2667, + "num_input_tokens_seen": 65217232, "step": 120500 }, { - "epoch": 2.9047436143652776, - "grad_norm": 6.3701019287109375, - "learning_rate": 3.547640195890148e-05, - "loss": 1.9278, - "num_input_tokens_seen": 65309617, + "epoch": 1.2105810789178806, + "grad_norm": 6.428073406219482, + "learning_rate": 3.789428925884425e-05, + "loss": 1.2817, + "num_input_tokens_seen": 65479136, "step": 121000 }, { - "epoch": 2.9167466871519108, - "grad_norm": 6.802663326263428, - "learning_rate": 3.541638659496831e-05, - "loss": 1.8919, - "num_input_tokens_seen": 65581369, + "epoch": 1.2155834800704337, + "grad_norm": 6.519428730010986, + "learning_rate": 3.784426524731872e-05, + "loss": 1.2719, + "num_input_tokens_seen": 65749968, "step": 121500 }, { - "epoch": 2.9287497599385444, - "grad_norm": 6.365786075592041, - "learning_rate": 3.535637123103515e-05, - "loss": 1.9218, - "num_input_tokens_seen": 65850729, + "epoch": 1.220585881222987, + "grad_norm": 5.953312873840332, + "learning_rate": 3.779424123579318e-05, + "loss": 1.2701, + "num_input_tokens_seen": 66017680, "step": 122000 }, { - "epoch": 2.9407528327251775, - "grad_norm": 6.900768280029297, - "learning_rate": 3.5296355867101983e-05, - "loss": 1.9175, - "num_input_tokens_seen": 66121313, + "epoch": 1.2255882823755402, + "grad_norm": 6.453891277313232, + "learning_rate": 3.774421722426765e-05, + "loss": 1.2668, + "num_input_tokens_seen": 66288296, "step": 122500 }, { - "epoch": 2.952755905511811, - "grad_norm": 5.855416297912598, - "learning_rate": 3.523634050316881e-05, - "loss": 1.9022, - "num_input_tokens_seen": 66391609, + "epoch": 1.2305906835280935, + "grad_norm": 6.9297709465026855, + "learning_rate": 3.769419321274212e-05, + "loss": 1.2754, + "num_input_tokens_seen": 66555160, "step": 123000 }, { - "epoch": 2.9647589782984443, - "grad_norm": 9.250798225402832, - "learning_rate": 3.517632513923564e-05, - "loss": 1.888, - "num_input_tokens_seen": 66661065, + "epoch": 1.2355930846806467, + "grad_norm": 5.545460224151611, + "learning_rate": 3.764416920121658e-05, + "loss": 1.2636, + "num_input_tokens_seen": 66823296, "step": 123500 }, { - "epoch": 2.976762051085078, - "grad_norm": 8.351166725158691, - "learning_rate": 3.511630977530248e-05, - "loss": 1.9013, - "num_input_tokens_seen": 66929929, + "epoch": 1.2405954858332, + "grad_norm": 7.921981334686279, + "learning_rate": 3.759414518969105e-05, + "loss": 1.2671, + "num_input_tokens_seen": 67090360, "step": 124000 }, { - "epoch": 2.988765123871711, - "grad_norm": 5.7278151512146, - "learning_rate": 3.505629441136931e-05, - "loss": 1.8953, - "num_input_tokens_seen": 67200953, + "epoch": 1.2455978869857531, + "grad_norm": 7.033051490783691, + "learning_rate": 3.754412117816552e-05, + "loss": 1.2826, + "num_input_tokens_seen": 67363312, "step": 124500 }, { - "epoch": 3.0, - "eval_loss": 1.9114675521850586, - "eval_runtime": 77.826, - "eval_samples_per_second": 1070.477, - "eval_steps_per_second": 133.811, - "num_input_tokens_seen": 67451077, - "step": 124968 - }, - { - "epoch": 3.0007681966583446, - "grad_norm": 6.81101131439209, - "learning_rate": 3.499627904743615e-05, - "loss": 1.9038, - "num_input_tokens_seen": 67467141, + "epoch": 1.2506002881383065, + "grad_norm": 5.355251789093018, + "learning_rate": 3.749409716663999e-05, + "loss": 1.2712, + "num_input_tokens_seen": 67642752, "step": 125000 }, { - "epoch": 3.0127712694449778, - "grad_norm": 5.955549240112305, - "learning_rate": 3.4936263683502976e-05, - "loss": 1.7292, - "num_input_tokens_seen": 67736013, + "epoch": 1.2556026892908596, + "grad_norm": 7.9365081787109375, + "learning_rate": 3.7444073155114456e-05, + "loss": 1.2763, + "num_input_tokens_seen": 67905152, "step": 125500 }, { - "epoch": 3.0247743422316113, - "grad_norm": 7.834664344787598, - "learning_rate": 3.487624831956981e-05, - "loss": 1.7548, - "num_input_tokens_seen": 68006613, + "epoch": 1.2606050904434127, + "grad_norm": 6.157983779907227, + "learning_rate": 3.739404914358892e-05, + "loss": 1.2615, + "num_input_tokens_seen": 68168720, "step": 126000 }, { - "epoch": 3.0367774150182445, - "grad_norm": 6.410035133361816, - "learning_rate": 3.481623295563665e-05, - "loss": 1.756, - "num_input_tokens_seen": 68279781, + "epoch": 1.265607491595966, + "grad_norm": 5.456648349761963, + "learning_rate": 3.7344025132063396e-05, + "loss": 1.2642, + "num_input_tokens_seen": 68433784, "step": 126500 }, { - "epoch": 3.048780487804878, - "grad_norm": 6.483912944793701, - "learning_rate": 3.4756217591703475e-05, - "loss": 1.7415, - "num_input_tokens_seen": 68547861, + "epoch": 1.2706098927485192, + "grad_norm": 7.156668663024902, + "learning_rate": 3.729400112053786e-05, + "loss": 1.2866, + "num_input_tokens_seen": 68705336, "step": 127000 }, { - "epoch": 3.0607835605915112, - "grad_norm": 7.319478511810303, - "learning_rate": 3.469620222777031e-05, - "loss": 1.7516, - "num_input_tokens_seen": 68819797, + "epoch": 1.2756122939010726, + "grad_norm": 6.959549903869629, + "learning_rate": 3.724397710901233e-05, + "loss": 1.2733, + "num_input_tokens_seen": 68968320, "step": 127500 }, { - "epoch": 3.072786633378145, - "grad_norm": 7.587656497955322, - "learning_rate": 3.4636186863837146e-05, - "loss": 1.7531, - "num_input_tokens_seen": 69089125, + "epoch": 1.2806146950536257, + "grad_norm": 6.225592613220215, + "learning_rate": 3.719395309748679e-05, + "loss": 1.2684, + "num_input_tokens_seen": 69244592, "step": 128000 }, { - "epoch": 3.084789706164778, - "grad_norm": 6.104249000549316, - "learning_rate": 3.4576171499903975e-05, - "loss": 1.7496, - "num_input_tokens_seen": 69358693, + "epoch": 1.285617096206179, + "grad_norm": 7.163039684295654, + "learning_rate": 3.714392908596127e-05, + "loss": 1.2823, + "num_input_tokens_seen": 69518384, "step": 128500 }, { - "epoch": 3.0967927789514116, - "grad_norm": 6.5330376625061035, - "learning_rate": 3.451615613597081e-05, - "loss": 1.7678, - "num_input_tokens_seen": 69632237, + "epoch": 1.2906194973587322, + "grad_norm": 5.474428176879883, + "learning_rate": 3.709390507443573e-05, + "loss": 1.2684, + "num_input_tokens_seen": 69789560, "step": 129000 }, { - "epoch": 3.1087958517380447, - "grad_norm": 8.878678321838379, - "learning_rate": 3.4456140772037646e-05, - "loss": 1.7334, - "num_input_tokens_seen": 69897565, + "epoch": 1.2956218985112855, + "grad_norm": 6.292562961578369, + "learning_rate": 3.7043881062910195e-05, + "loss": 1.2697, + "num_input_tokens_seen": 70058688, "step": 129500 }, { - "epoch": 3.1207989245246783, - "grad_norm": 6.290347099304199, - "learning_rate": 3.4396125408104475e-05, - "loss": 1.759, - "num_input_tokens_seen": 70170405, + "epoch": 1.3006242996638386, + "grad_norm": 5.789345741271973, + "learning_rate": 3.6993857051384665e-05, + "loss": 1.2652, + "num_input_tokens_seen": 70330632, "step": 130000 }, { - "epoch": 3.1328019973113115, - "grad_norm": 6.117245674133301, - "learning_rate": 3.433611004417131e-05, - "loss": 1.7779, - "num_input_tokens_seen": 70440037, + "epoch": 1.3056267008163918, + "grad_norm": 7.9446821212768555, + "learning_rate": 3.6943833039859135e-05, + "loss": 1.2928, + "num_input_tokens_seen": 70594088, "step": 130500 }, { - "epoch": 3.144805070097945, - "grad_norm": 7.964632511138916, - "learning_rate": 3.427609468023814e-05, - "loss": 1.7782, - "num_input_tokens_seen": 70707957, + "epoch": 1.310629101968945, + "grad_norm": 9.38175106048584, + "learning_rate": 3.6893809028333605e-05, + "loss": 1.2636, + "num_input_tokens_seen": 70858664, "step": 131000 }, { - "epoch": 3.1568081428845787, - "grad_norm": 6.208136081695557, - "learning_rate": 3.4216079316304974e-05, - "loss": 1.7851, - "num_input_tokens_seen": 70976669, + "epoch": 1.3156315031214985, + "grad_norm": 7.6178812980651855, + "learning_rate": 3.684378501680807e-05, + "loss": 1.2767, + "num_input_tokens_seen": 71132936, "step": 131500 }, { - "epoch": 3.168811215671212, - "grad_norm": 6.764760494232178, - "learning_rate": 3.415606395237181e-05, - "loss": 1.7509, - "num_input_tokens_seen": 71244357, + "epoch": 1.3206339042740516, + "grad_norm": 7.7378435134887695, + "learning_rate": 3.679376100528254e-05, + "loss": 1.2752, + "num_input_tokens_seen": 71398296, "step": 132000 }, { - "epoch": 3.180814288457845, - "grad_norm": 6.424854278564453, - "learning_rate": 3.409604858843864e-05, - "loss": 1.7774, - "num_input_tokens_seen": 71514781, + "epoch": 1.3256363054266047, + "grad_norm": 7.162954807281494, + "learning_rate": 3.674373699375701e-05, + "loss": 1.2534, + "num_input_tokens_seen": 71673192, "step": 132500 }, { - "epoch": 3.1928173612444786, - "grad_norm": 6.203136920928955, - "learning_rate": 3.4036033224505474e-05, - "loss": 1.7766, - "num_input_tokens_seen": 71781541, + "epoch": 1.330638706579158, + "grad_norm": 6.323235511779785, + "learning_rate": 3.669371298223147e-05, + "loss": 1.265, + "num_input_tokens_seen": 71949960, "step": 133000 }, { - "epoch": 3.204820434031112, - "grad_norm": 6.723325252532959, - "learning_rate": 3.397601786057231e-05, - "loss": 1.7693, - "num_input_tokens_seen": 72049253, + "epoch": 1.3356411077317112, + "grad_norm": 6.324786186218262, + "learning_rate": 3.664368897070594e-05, + "loss": 1.2661, + "num_input_tokens_seen": 72217256, "step": 133500 }, { - "epoch": 3.2168235068177453, - "grad_norm": 6.210843086242676, - "learning_rate": 3.3916002496639145e-05, - "loss": 1.7804, - "num_input_tokens_seen": 72320325, + "epoch": 1.3406435088842645, + "grad_norm": 7.326359748840332, + "learning_rate": 3.6593664959180404e-05, + "loss": 1.244, + "num_input_tokens_seen": 72480696, "step": 134000 }, { - "epoch": 3.228826579604379, - "grad_norm": 6.450284957885742, - "learning_rate": 3.3855987132705973e-05, - "loss": 1.7684, - "num_input_tokens_seen": 72594053, + "epoch": 1.3456459100368177, + "grad_norm": 7.224339962005615, + "learning_rate": 3.654364094765488e-05, + "loss": 1.2677, + "num_input_tokens_seen": 72747888, "step": 134500 }, { - "epoch": 3.240829652391012, - "grad_norm": 6.720444202423096, - "learning_rate": 3.37959717687728e-05, - "loss": 1.7684, - "num_input_tokens_seen": 72864533, + "epoch": 1.3506483111893708, + "grad_norm": 6.521255970001221, + "learning_rate": 3.6493616936129344e-05, + "loss": 1.26, + "num_input_tokens_seen": 73016624, "step": 135000 }, { - "epoch": 3.2528327251776457, - "grad_norm": 6.4733662605285645, - "learning_rate": 3.3735956404839644e-05, - "loss": 1.779, - "num_input_tokens_seen": 73134469, + "epoch": 1.3556507123419241, + "grad_norm": 7.485130786895752, + "learning_rate": 3.644359292460381e-05, + "loss": 1.2703, + "num_input_tokens_seen": 73291936, "step": 135500 }, { - "epoch": 3.264835797964279, - "grad_norm": 7.630155563354492, - "learning_rate": 3.367594104090647e-05, - "loss": 1.7961, - "num_input_tokens_seen": 73402789, + "epoch": 1.3606531134944775, + "grad_norm": 7.1711626052856445, + "learning_rate": 3.639356891307828e-05, + "loss": 1.2798, + "num_input_tokens_seen": 73557640, "step": 136000 }, { - "epoch": 3.2768388707509124, - "grad_norm": 6.543163299560547, - "learning_rate": 3.361592567697331e-05, - "loss": 1.8108, - "num_input_tokens_seen": 73672013, + "epoch": 1.3656555146470306, + "grad_norm": 6.981902599334717, + "learning_rate": 3.634354490155275e-05, + "loss": 1.2485, + "num_input_tokens_seen": 73827312, "step": 136500 }, { - "epoch": 3.2888419435375456, - "grad_norm": 6.558376312255859, - "learning_rate": 3.355591031304014e-05, - "loss": 1.7724, - "num_input_tokens_seen": 73940061, + "epoch": 1.3706579157995837, + "grad_norm": 6.2199320793151855, + "learning_rate": 3.629352089002722e-05, + "loss": 1.2587, + "num_input_tokens_seen": 74098048, "step": 137000 }, { - "epoch": 3.300845016324179, - "grad_norm": 8.48396110534668, - "learning_rate": 3.349589494910697e-05, - "loss": 1.7906, - "num_input_tokens_seen": 74212261, + "epoch": 1.375660316952137, + "grad_norm": 6.726940155029297, + "learning_rate": 3.624349687850168e-05, + "loss": 1.2753, + "num_input_tokens_seen": 74365640, "step": 137500 }, { - "epoch": 3.3128480891108123, - "grad_norm": 6.456749439239502, - "learning_rate": 3.343587958517381e-05, - "loss": 1.7757, - "num_input_tokens_seen": 74485205, + "epoch": 1.3806627181046902, + "grad_norm": 5.759517669677734, + "learning_rate": 3.619347286697615e-05, + "loss": 1.2549, + "num_input_tokens_seen": 74632424, "step": 138000 }, { - "epoch": 3.324851161897446, - "grad_norm": 6.636175155639648, - "learning_rate": 3.337586422124064e-05, - "loss": 1.7805, - "num_input_tokens_seen": 74756509, + "epoch": 1.3856651192572436, + "grad_norm": 6.594145774841309, + "learning_rate": 3.614344885545062e-05, + "loss": 1.2601, + "num_input_tokens_seen": 74906848, "step": 138500 }, { - "epoch": 3.336854234684079, - "grad_norm": 6.118922710418701, - "learning_rate": 3.331584885730747e-05, - "loss": 1.7748, - "num_input_tokens_seen": 75028117, + "epoch": 1.3906675204097967, + "grad_norm": 6.375233173370361, + "learning_rate": 3.609342484392508e-05, + "loss": 1.2682, + "num_input_tokens_seen": 75176912, "step": 139000 }, { - "epoch": 3.3488573074707126, - "grad_norm": 7.567290306091309, - "learning_rate": 3.325583349337431e-05, - "loss": 1.7828, - "num_input_tokens_seen": 75299501, + "epoch": 1.3956699215623498, + "grad_norm": 6.785195827484131, + "learning_rate": 3.604340083239955e-05, + "loss": 1.2602, + "num_input_tokens_seen": 75447008, "step": 139500 }, { - "epoch": 3.360860380257346, - "grad_norm": 6.197093963623047, - "learning_rate": 3.3195818129441136e-05, - "loss": 1.7797, - "num_input_tokens_seen": 75570085, + "epoch": 1.4006723227149032, + "grad_norm": 6.8394694328308105, + "learning_rate": 3.599337682087402e-05, + "loss": 1.2432, + "num_input_tokens_seen": 75720552, "step": 140000 }, { - "epoch": 3.3728634530439794, - "grad_norm": 6.406919956207275, - "learning_rate": 3.313580276550797e-05, - "loss": 1.8031, - "num_input_tokens_seen": 75839621, + "epoch": 1.4056747238674565, + "grad_norm": 6.43784236907959, + "learning_rate": 3.594335280934849e-05, + "loss": 1.2576, + "num_input_tokens_seen": 75991000, "step": 140500 }, { - "epoch": 3.3848665258306125, - "grad_norm": 6.410861492156982, - "learning_rate": 3.30757874015748e-05, - "loss": 1.798, - "num_input_tokens_seen": 76108429, + "epoch": 1.4106771250200096, + "grad_norm": 6.5971479415893555, + "learning_rate": 3.5893328797822956e-05, + "loss": 1.2625, + "num_input_tokens_seen": 76262432, "step": 141000 }, { - "epoch": 3.396869598617246, - "grad_norm": 5.295762062072754, - "learning_rate": 3.301577203764164e-05, - "loss": 1.7929, - "num_input_tokens_seen": 76377581, + "epoch": 1.4156795261725628, + "grad_norm": 5.662843227386475, + "learning_rate": 3.5843304786297426e-05, + "loss": 1.2822, + "num_input_tokens_seen": 76537768, "step": 141500 }, { - "epoch": 3.4088726714038793, - "grad_norm": 8.557171821594238, - "learning_rate": 3.295575667370847e-05, - "loss": 1.7796, - "num_input_tokens_seen": 76645429, + "epoch": 1.4206819273251161, + "grad_norm": 5.13416862487793, + "learning_rate": 3.579328077477189e-05, + "loss": 1.2639, + "num_input_tokens_seen": 76807000, "step": 142000 }, { - "epoch": 3.420875744190513, - "grad_norm": 6.90181827545166, - "learning_rate": 3.28957413097753e-05, - "loss": 1.7632, - "num_input_tokens_seen": 76912941, + "epoch": 1.4256843284776692, + "grad_norm": 6.224869728088379, + "learning_rate": 3.574325676324636e-05, + "loss": 1.2633, + "num_input_tokens_seen": 77077208, "step": 142500 }, { - "epoch": 3.432878816977146, - "grad_norm": 6.235441207885742, - "learning_rate": 3.2835725945842136e-05, - "loss": 1.8087, - "num_input_tokens_seen": 77183101, + "epoch": 1.4306867296302226, + "grad_norm": 5.980476379394531, + "learning_rate": 3.569323275172083e-05, + "loss": 1.254, + "num_input_tokens_seen": 77348800, "step": 143000 }, { - "epoch": 3.4448818897637796, - "grad_norm": 6.225989818572998, - "learning_rate": 3.277571058190897e-05, - "loss": 1.7984, - "num_input_tokens_seen": 77450549, + "epoch": 1.4356891307827757, + "grad_norm": 5.705311298370361, + "learning_rate": 3.564320874019529e-05, + "loss": 1.2641, + "num_input_tokens_seen": 77623192, "step": 143500 }, { - "epoch": 3.4568849625504128, - "grad_norm": 5.960590839385986, - "learning_rate": 3.2715695217975806e-05, - "loss": 1.8059, - "num_input_tokens_seen": 77724549, + "epoch": 1.4406915319353288, + "grad_norm": 5.703660488128662, + "learning_rate": 3.559318472866976e-05, + "loss": 1.2509, + "num_input_tokens_seen": 77892080, "step": 144000 }, { - "epoch": 3.4688880353370464, - "grad_norm": 7.326996803283691, - "learning_rate": 3.2655679854042635e-05, - "loss": 1.8012, - "num_input_tokens_seen": 77994069, + "epoch": 1.4456939330878822, + "grad_norm": 6.834238052368164, + "learning_rate": 3.554316071714423e-05, + "loss": 1.2455, + "num_input_tokens_seen": 78161320, "step": 144500 }, { - "epoch": 3.4808911081236795, - "grad_norm": 6.052545547485352, - "learning_rate": 3.259566449010947e-05, - "loss": 1.7808, - "num_input_tokens_seen": 78263765, + "epoch": 1.4506963342404355, + "grad_norm": 5.70477294921875, + "learning_rate": 3.54931367056187e-05, + "loss": 1.2534, + "num_input_tokens_seen": 78426328, "step": 145000 }, { - "epoch": 3.492894180910313, - "grad_norm": 6.24308443069458, - "learning_rate": 3.2535649126176306e-05, - "loss": 1.8033, - "num_input_tokens_seen": 78532349, + "epoch": 1.4556987353929887, + "grad_norm": 7.84694766998291, + "learning_rate": 3.5443112694093165e-05, + "loss": 1.2696, + "num_input_tokens_seen": 78700184, "step": 145500 }, { - "epoch": 3.5048972536969463, - "grad_norm": 6.785669326782227, - "learning_rate": 3.2475633762243135e-05, - "loss": 1.7843, - "num_input_tokens_seen": 78801845, + "epoch": 1.4607011365455418, + "grad_norm": 6.4869914054870605, + "learning_rate": 3.5393088682567635e-05, + "loss": 1.2548, + "num_input_tokens_seen": 78968056, "step": 146000 }, { - "epoch": 3.51690032648358, - "grad_norm": 7.609140396118164, - "learning_rate": 3.241561839830997e-05, - "loss": 1.803, - "num_input_tokens_seen": 79073997, + "epoch": 1.4657035376980951, + "grad_norm": 7.2102251052856445, + "learning_rate": 3.5343064671042105e-05, + "loss": 1.2643, + "num_input_tokens_seen": 79236952, "step": 146500 }, { - "epoch": 3.528903399270213, - "grad_norm": 6.937623023986816, - "learning_rate": 3.2355603034376806e-05, - "loss": 1.8055, - "num_input_tokens_seen": 79339981, + "epoch": 1.4707059388506483, + "grad_norm": 8.560874938964844, + "learning_rate": 3.529304065951657e-05, + "loss": 1.2674, + "num_input_tokens_seen": 79508800, "step": 147000 }, { - "epoch": 3.5409064720568466, - "grad_norm": 7.0438408851623535, - "learning_rate": 3.2295587670443634e-05, - "loss": 1.8007, - "num_input_tokens_seen": 79611909, + "epoch": 1.4757083400032016, + "grad_norm": 6.250794410705566, + "learning_rate": 3.524301664799104e-05, + "loss": 1.2529, + "num_input_tokens_seen": 79781928, "step": 147500 }, { - "epoch": 3.55290954484348, - "grad_norm": 6.579649925231934, - "learning_rate": 3.223557230651047e-05, - "loss": 1.7895, - "num_input_tokens_seen": 79885773, + "epoch": 1.4807107411557547, + "grad_norm": 5.825743198394775, + "learning_rate": 3.519299263646551e-05, + "loss": 1.25, + "num_input_tokens_seen": 80044592, "step": 148000 }, { - "epoch": 3.5649126176301134, - "grad_norm": 6.212180137634277, - "learning_rate": 3.21755569425773e-05, - "loss": 1.802, - "num_input_tokens_seen": 80150693, + "epoch": 1.4857131423083079, + "grad_norm": 8.07386589050293, + "learning_rate": 3.514296862493997e-05, + "loss": 1.2546, + "num_input_tokens_seen": 80312648, "step": 148500 }, { - "epoch": 3.5769156904167465, - "grad_norm": 8.239595413208008, - "learning_rate": 3.2115541578644134e-05, - "loss": 1.7967, - "num_input_tokens_seen": 80417117, + "epoch": 1.4907155434608612, + "grad_norm": 6.903604984283447, + "learning_rate": 3.509294461341444e-05, + "loss": 1.2465, + "num_input_tokens_seen": 80580360, "step": 149000 }, { - "epoch": 3.58891876320338, - "grad_norm": 6.735275745391846, - "learning_rate": 3.205552621471097e-05, - "loss": 1.7946, - "num_input_tokens_seen": 80688349, + "epoch": 1.4957179446134146, + "grad_norm": 7.45670223236084, + "learning_rate": 3.5042920601888904e-05, + "loss": 1.2612, + "num_input_tokens_seen": 80854928, "step": 149500 }, { - "epoch": 3.6009218359900137, - "grad_norm": 7.175784111022949, - "learning_rate": 3.19955108507778e-05, - "loss": 1.8067, - "num_input_tokens_seen": 80963069, + "epoch": 1.5007203457659677, + "grad_norm": 7.703638553619385, + "learning_rate": 3.4992896590363374e-05, + "loss": 1.2494, + "num_input_tokens_seen": 81122392, "step": 150000 }, { - "epoch": 3.612924908776647, - "grad_norm": 5.902931213378906, - "learning_rate": 3.1935495486844634e-05, - "loss": 1.8386, - "num_input_tokens_seen": 81239093, + "epoch": 1.5057227469185208, + "grad_norm": 7.255218982696533, + "learning_rate": 3.4942872578837844e-05, + "loss": 1.2549, + "num_input_tokens_seen": 81393152, "step": 150500 }, { - "epoch": 3.62492798156328, - "grad_norm": 6.380657196044922, - "learning_rate": 3.187548012291147e-05, - "loss": 1.8163, - "num_input_tokens_seen": 81510173, + "epoch": 1.5107251480710742, + "grad_norm": 6.001245498657227, + "learning_rate": 3.4892848567312314e-05, + "loss": 1.2565, + "num_input_tokens_seen": 81663000, "step": 151000 }, { - "epoch": 3.6369310543499136, - "grad_norm": 6.682636737823486, - "learning_rate": 3.18154647589783e-05, - "loss": 1.7902, - "num_input_tokens_seen": 81777981, + "epoch": 1.5157275492236273, + "grad_norm": 8.100776672363281, + "learning_rate": 3.484282455578678e-05, + "loss": 1.2662, + "num_input_tokens_seen": 81935248, "step": 151500 }, { - "epoch": 3.648934127136547, - "grad_norm": 7.864612579345703, - "learning_rate": 3.175544939504513e-05, - "loss": 1.8114, - "num_input_tokens_seen": 82044037, + "epoch": 1.5207299503761806, + "grad_norm": 7.566408157348633, + "learning_rate": 3.479280054426125e-05, + "loss": 1.2521, + "num_input_tokens_seen": 82198824, "step": 152000 }, { - "epoch": 3.6609371999231803, - "grad_norm": 5.9174113273620605, - "learning_rate": 3.169543403111196e-05, - "loss": 1.8008, - "num_input_tokens_seen": 82318397, + "epoch": 1.5257323515287338, + "grad_norm": 6.650607109069824, + "learning_rate": 3.474277653273572e-05, + "loss": 1.2516, + "num_input_tokens_seen": 82465776, "step": 152500 }, { - "epoch": 3.6729402727098135, - "grad_norm": 6.14339017868042, - "learning_rate": 3.1635418667178804e-05, - "loss": 1.8091, - "num_input_tokens_seen": 82585997, + "epoch": 1.530734752681287, + "grad_norm": 6.24419641494751, + "learning_rate": 3.469275252121018e-05, + "loss": 1.2504, + "num_input_tokens_seen": 82741480, "step": 153000 }, { - "epoch": 3.684943345496447, - "grad_norm": 7.240347385406494, - "learning_rate": 3.157540330324563e-05, - "loss": 1.7979, - "num_input_tokens_seen": 82855109, + "epoch": 1.5357371538338402, + "grad_norm": 5.1919403076171875, + "learning_rate": 3.464272850968465e-05, + "loss": 1.2403, + "num_input_tokens_seen": 83010176, "step": 153500 }, { - "epoch": 3.6969464182830807, - "grad_norm": 6.051020622253418, - "learning_rate": 3.151538793931246e-05, - "loss": 1.7734, - "num_input_tokens_seen": 83123149, + "epoch": 1.5407395549863936, + "grad_norm": 7.3934407234191895, + "learning_rate": 3.459270449815912e-05, + "loss": 1.2455, + "num_input_tokens_seen": 83279712, "step": 154000 }, { - "epoch": 3.708949491069714, - "grad_norm": 6.831425666809082, - "learning_rate": 3.14553725753793e-05, - "loss": 1.818, - "num_input_tokens_seen": 83392741, + "epoch": 1.5457419561389467, + "grad_norm": 5.885237693786621, + "learning_rate": 3.454268048663359e-05, + "loss": 1.2667, + "num_input_tokens_seen": 83546624, "step": 154500 }, { - "epoch": 3.720952563856347, - "grad_norm": 5.485208988189697, - "learning_rate": 3.139535721144613e-05, - "loss": 1.7968, - "num_input_tokens_seen": 83662757, + "epoch": 1.5507443572914998, + "grad_norm": 6.22340726852417, + "learning_rate": 3.449265647510805e-05, + "loss": 1.2341, + "num_input_tokens_seen": 83814832, "step": 155000 }, { - "epoch": 3.7329556366429806, - "grad_norm": 6.413397312164307, - "learning_rate": 3.133534184751297e-05, - "loss": 1.8095, - "num_input_tokens_seen": 83933149, + "epoch": 1.5557467584440532, + "grad_norm": 7.060276508331299, + "learning_rate": 3.4442632463582516e-05, + "loss": 1.2419, + "num_input_tokens_seen": 84086544, "step": 155500 }, { - "epoch": 3.744958709429614, - "grad_norm": 7.249451160430908, - "learning_rate": 3.1275326483579796e-05, - "loss": 1.792, - "num_input_tokens_seen": 84201901, + "epoch": 1.5607491595966063, + "grad_norm": 6.495555400848389, + "learning_rate": 3.439260845205699e-05, + "loss": 1.2552, + "num_input_tokens_seen": 84352576, "step": 156000 }, { - "epoch": 3.7569617822162473, - "grad_norm": 6.752410411834717, - "learning_rate": 3.121531111964663e-05, - "loss": 1.7926, - "num_input_tokens_seen": 84473397, + "epoch": 1.5657515607491597, + "grad_norm": 7.454058647155762, + "learning_rate": 3.4342584440531456e-05, + "loss": 1.2469, + "num_input_tokens_seen": 84619200, "step": 156500 }, { - "epoch": 3.7689648550028805, - "grad_norm": 5.887633800506592, - "learning_rate": 3.115529575571347e-05, - "loss": 1.7983, - "num_input_tokens_seen": 84747525, + "epoch": 1.5707539619017128, + "grad_norm": 6.108017444610596, + "learning_rate": 3.4292560429005926e-05, + "loss": 1.2358, + "num_input_tokens_seen": 84891184, "step": 157000 }, { - "epoch": 3.780967927789514, - "grad_norm": 6.161904335021973, - "learning_rate": 3.1095280391780296e-05, - "loss": 1.8085, - "num_input_tokens_seen": 85015197, + "epoch": 1.575756363054266, + "grad_norm": 9.97182559967041, + "learning_rate": 3.424253641748039e-05, + "loss": 1.2482, + "num_input_tokens_seen": 85164008, "step": 157500 }, { - "epoch": 3.7929710005761477, - "grad_norm": 6.496557235717773, - "learning_rate": 3.103526502784713e-05, - "loss": 1.8253, - "num_input_tokens_seen": 85286845, + "epoch": 1.5807587642068193, + "grad_norm": 7.4442877769470215, + "learning_rate": 3.419251240595486e-05, + "loss": 1.2444, + "num_input_tokens_seen": 85431664, "step": 158000 }, { - "epoch": 3.804974073362781, - "grad_norm": 6.270902156829834, - "learning_rate": 3.097524966391396e-05, - "loss": 1.7986, - "num_input_tokens_seen": 85560461, + "epoch": 1.5857611653593726, + "grad_norm": 5.728388786315918, + "learning_rate": 3.414248839442933e-05, + "loss": 1.2496, + "num_input_tokens_seen": 85706112, "step": 158500 }, { - "epoch": 3.8169771461494144, - "grad_norm": 6.417201042175293, - "learning_rate": 3.0915234299980796e-05, - "loss": 1.8142, - "num_input_tokens_seen": 85834453, + "epoch": 1.5907635665119257, + "grad_norm": 5.5090861320495605, + "learning_rate": 3.409246438290379e-05, + "loss": 1.2468, + "num_input_tokens_seen": 85979600, "step": 159000 }, { - "epoch": 3.8289802189360476, - "grad_norm": 7.274606227874756, - "learning_rate": 3.085521893604763e-05, - "loss": 1.8126, - "num_input_tokens_seen": 86104741, + "epoch": 1.5957659676644789, + "grad_norm": 7.548877716064453, + "learning_rate": 3.404244037137826e-05, + "loss": 1.2483, + "num_input_tokens_seen": 86249384, "step": 159500 }, { - "epoch": 3.840983291722681, - "grad_norm": 6.281017303466797, - "learning_rate": 3.079520357211446e-05, - "loss": 1.8032, - "num_input_tokens_seen": 86372133, + "epoch": 1.6007683688170322, + "grad_norm": 6.700185775756836, + "learning_rate": 3.399241635985273e-05, + "loss": 1.2493, + "num_input_tokens_seen": 86511120, "step": 160000 }, { - "epoch": 3.8529863645093143, - "grad_norm": 6.500155448913574, - "learning_rate": 3.0735188208181295e-05, - "loss": 1.7892, - "num_input_tokens_seen": 86646853, + "epoch": 1.6057707699695856, + "grad_norm": 6.892191410064697, + "learning_rate": 3.39423923483272e-05, + "loss": 1.2528, + "num_input_tokens_seen": 86784872, "step": 160500 }, { - "epoch": 3.864989437295948, - "grad_norm": 8.005160331726074, - "learning_rate": 3.067517284424813e-05, - "loss": 1.7898, - "num_input_tokens_seen": 86915853, + "epoch": 1.6107731711221387, + "grad_norm": 5.970468521118164, + "learning_rate": 3.3892368336801665e-05, + "loss": 1.2309, + "num_input_tokens_seen": 87058000, "step": 161000 }, { - "epoch": 3.876992510082581, - "grad_norm": 7.663057327270508, - "learning_rate": 3.061515748031496e-05, - "loss": 1.8058, - "num_input_tokens_seen": 87184757, + "epoch": 1.6157755722746918, + "grad_norm": 6.773517608642578, + "learning_rate": 3.384234432527613e-05, + "loss": 1.2449, + "num_input_tokens_seen": 87331744, "step": 161500 }, { - "epoch": 3.8889955828692147, - "grad_norm": 6.693754196166992, - "learning_rate": 3.0555142116381795e-05, - "loss": 1.8058, - "num_input_tokens_seen": 87452701, + "epoch": 1.620777973427245, + "grad_norm": 6.08986234664917, + "learning_rate": 3.3792320313750605e-05, + "loss": 1.2518, + "num_input_tokens_seen": 87604792, "step": 162000 }, { - "epoch": 3.900998655655848, - "grad_norm": 6.509284019470215, - "learning_rate": 3.0495126752448627e-05, - "loss": 1.8045, - "num_input_tokens_seen": 87719533, + "epoch": 1.6257803745797983, + "grad_norm": 6.549533843994141, + "learning_rate": 3.374229630222507e-05, + "loss": 1.242, + "num_input_tokens_seen": 87871576, "step": 162500 }, { - "epoch": 3.9130017284424814, - "grad_norm": 6.752614498138428, - "learning_rate": 3.0435111388515462e-05, - "loss": 1.8113, - "num_input_tokens_seen": 87993005, + "epoch": 1.6307827757323516, + "grad_norm": 5.974827289581299, + "learning_rate": 3.369227229069954e-05, + "loss": 1.2347, + "num_input_tokens_seen": 88137032, "step": 163000 }, { - "epoch": 3.9250048012291145, - "grad_norm": 6.36919641494751, - "learning_rate": 3.0375096024582294e-05, - "loss": 1.8117, - "num_input_tokens_seen": 88262661, + "epoch": 1.6357851768849048, + "grad_norm": 6.639895915985107, + "learning_rate": 3.3642248279174e-05, + "loss": 1.2241, + "num_input_tokens_seen": 88403456, "step": 163500 }, { - "epoch": 3.937007874015748, - "grad_norm": 7.482510566711426, - "learning_rate": 3.0315080660649126e-05, - "loss": 1.7781, - "num_input_tokens_seen": 88532245, + "epoch": 1.640787578037458, + "grad_norm": 8.600828170776367, + "learning_rate": 3.359222426764848e-05, + "loss": 1.2496, + "num_input_tokens_seen": 88676440, "step": 164000 }, { - "epoch": 3.9490109468023813, - "grad_norm": 6.747243881225586, - "learning_rate": 3.0255065296715962e-05, - "loss": 1.8117, - "num_input_tokens_seen": 88805269, + "epoch": 1.6457899791900112, + "grad_norm": 6.850368976593018, + "learning_rate": 3.354220025612294e-05, + "loss": 1.232, + "num_input_tokens_seen": 88947256, "step": 164500 }, { - "epoch": 3.961014019589015, - "grad_norm": 6.453344821929932, - "learning_rate": 3.0195049932782794e-05, - "loss": 1.8238, - "num_input_tokens_seen": 89075181, + "epoch": 1.6507923803425646, + "grad_norm": 7.311619281768799, + "learning_rate": 3.3492176244597405e-05, + "loss": 1.2497, + "num_input_tokens_seen": 89215568, "step": 165000 }, { - "epoch": 3.973017092375648, - "grad_norm": 7.095656394958496, - "learning_rate": 3.013503456884963e-05, - "loss": 1.7925, - "num_input_tokens_seen": 89346501, + "epoch": 1.6557947814951177, + "grad_norm": 6.298097610473633, + "learning_rate": 3.3442152233071875e-05, + "loss": 1.2482, + "num_input_tokens_seen": 89487600, "step": 165500 }, { - "epoch": 3.9850201651622816, - "grad_norm": 6.070130348205566, - "learning_rate": 3.007501920491646e-05, - "loss": 1.8073, - "num_input_tokens_seen": 89620909, + "epoch": 1.6607971826476708, + "grad_norm": 6.948307514190674, + "learning_rate": 3.3392128221546345e-05, + "loss": 1.2274, + "num_input_tokens_seen": 89760496, "step": 166000 }, { - "epoch": 3.997023237948915, - "grad_norm": 5.398484230041504, - "learning_rate": 3.001500384098329e-05, - "loss": 1.7876, - "num_input_tokens_seen": 89888237, + "epoch": 1.665799583800224, + "grad_norm": 6.305945873260498, + "learning_rate": 3.3342104210020814e-05, + "loss": 1.2462, + "num_input_tokens_seen": 90033976, "step": 166500 }, { - "epoch": 4.0, - "eval_loss": 1.891203761100769, - "eval_runtime": 78.0045, - "eval_samples_per_second": 1068.028, - "eval_steps_per_second": 133.505, - "num_input_tokens_seen": 89955187, - "step": 166624 - }, - { - "epoch": 4.009026310735548, - "grad_norm": 7.0371413230896, - "learning_rate": 2.995498847705013e-05, - "loss": 1.6909, - "num_input_tokens_seen": 90156571, + "epoch": 1.6708019849527773, + "grad_norm": 6.428753852844238, + "learning_rate": 3.329208019849528e-05, + "loss": 1.2159, + "num_input_tokens_seen": 90309992, "step": 167000 }, { - "epoch": 4.0210293835221815, - "grad_norm": 6.302753925323486, - "learning_rate": 2.9894973113116958e-05, - "loss": 1.6656, - "num_input_tokens_seen": 90426763, + "epoch": 1.6758043861053307, + "grad_norm": 6.597380638122559, + "learning_rate": 3.324205618696975e-05, + "loss": 1.224, + "num_input_tokens_seen": 90577224, "step": 167500 }, { - "epoch": 4.033032456308815, - "grad_norm": 6.481957912445068, - "learning_rate": 2.9834957749183797e-05, - "loss": 1.6585, - "num_input_tokens_seen": 90697363, + "epoch": 1.6808067872578838, + "grad_norm": 6.567870140075684, + "learning_rate": 3.319203217544422e-05, + "loss": 1.2214, + "num_input_tokens_seen": 90853608, "step": 168000 }, { - "epoch": 4.045035529095449, - "grad_norm": 6.236865997314453, - "learning_rate": 2.9774942385250625e-05, - "loss": 1.689, - "num_input_tokens_seen": 90967563, + "epoch": 1.685809188410437, + "grad_norm": 6.079522609710693, + "learning_rate": 3.314200816391868e-05, + "loss": 1.2465, + "num_input_tokens_seen": 91122360, "step": 168500 }, { - "epoch": 4.057038601882081, - "grad_norm": 5.834167003631592, - "learning_rate": 2.9714927021317457e-05, - "loss": 1.6674, - "num_input_tokens_seen": 91238587, + "epoch": 1.6908115895629903, + "grad_norm": 5.641016006469727, + "learning_rate": 3.309198415239315e-05, + "loss": 1.2456, + "num_input_tokens_seen": 91395744, "step": 169000 }, { - "epoch": 4.069041674668715, - "grad_norm": 6.219510555267334, - "learning_rate": 2.9654911657384293e-05, - "loss": 1.6561, - "num_input_tokens_seen": 91507331, + "epoch": 1.6958139907155436, + "grad_norm": 6.620981216430664, + "learning_rate": 3.304196014086762e-05, + "loss": 1.2232, + "num_input_tokens_seen": 91664688, "step": 169500 }, { - "epoch": 4.081044747455349, - "grad_norm": 5.575843811035156, - "learning_rate": 2.9594896293451125e-05, - "loss": 1.6631, - "num_input_tokens_seen": 91774243, + "epoch": 1.7008163918680967, + "grad_norm": 5.642283916473389, + "learning_rate": 3.299193612934209e-05, + "loss": 1.2548, + "num_input_tokens_seen": 91937056, "step": 170000 }, { - "epoch": 4.093047820241982, - "grad_norm": 5.577406406402588, - "learning_rate": 2.953488092951796e-05, - "loss": 1.6654, - "num_input_tokens_seen": 92043115, + "epoch": 1.7058187930206499, + "grad_norm": 8.295174598693848, + "learning_rate": 3.2941912117816554e-05, + "loss": 1.2394, + "num_input_tokens_seen": 92211072, "step": 170500 }, { - "epoch": 4.105050893028615, - "grad_norm": 7.792404651641846, - "learning_rate": 2.9474865565584792e-05, - "loss": 1.6835, - "num_input_tokens_seen": 92311755, + "epoch": 1.710821194173203, + "grad_norm": 6.525012493133545, + "learning_rate": 3.289188810629102e-05, + "loss": 1.2371, + "num_input_tokens_seen": 92479560, "step": 171000 }, { - "epoch": 4.1170539658152485, - "grad_norm": 6.548659801483154, - "learning_rate": 2.941485020165162e-05, - "loss": 1.6663, - "num_input_tokens_seen": 92578651, + "epoch": 1.7158235953257563, + "grad_norm": 5.822702884674072, + "learning_rate": 3.284186409476549e-05, + "loss": 1.2512, + "num_input_tokens_seen": 92751592, "step": 171500 }, { - "epoch": 4.129057038601882, - "grad_norm": 6.135234832763672, - "learning_rate": 2.935483483771846e-05, - "loss": 1.6749, - "num_input_tokens_seen": 92849315, + "epoch": 1.7208259964783097, + "grad_norm": 7.12557315826416, + "learning_rate": 3.279184008323996e-05, + "loss": 1.2252, + "num_input_tokens_seen": 93027888, "step": 172000 }, { - "epoch": 4.141060111388516, - "grad_norm": 5.482029438018799, - "learning_rate": 2.929481947378529e-05, - "loss": 1.6754, - "num_input_tokens_seen": 93121611, + "epoch": 1.7258283976308628, + "grad_norm": 6.948513984680176, + "learning_rate": 3.2741816071714427e-05, + "loss": 1.2273, + "num_input_tokens_seen": 93305496, "step": 172500 }, { - "epoch": 4.153063184175148, - "grad_norm": 5.751239776611328, - "learning_rate": 2.923480410985212e-05, - "loss": 1.6747, - "num_input_tokens_seen": 93389683, + "epoch": 1.730830798783416, + "grad_norm": 6.272098064422607, + "learning_rate": 3.269179206018889e-05, + "loss": 1.2412, + "num_input_tokens_seen": 93571272, "step": 173000 }, { - "epoch": 4.165066256961782, - "grad_norm": 6.670873165130615, - "learning_rate": 2.9174788745918956e-05, - "loss": 1.688, - "num_input_tokens_seen": 93659019, + "epoch": 1.7358331999359693, + "grad_norm": 6.048664569854736, + "learning_rate": 3.264176804866336e-05, + "loss": 1.2192, + "num_input_tokens_seen": 93844768, "step": 173500 }, { - "epoch": 4.177069329748416, - "grad_norm": 6.536869049072266, - "learning_rate": 2.9114773381985788e-05, - "loss": 1.672, - "num_input_tokens_seen": 93929267, + "epoch": 1.7408356010885226, + "grad_norm": 7.0680999755859375, + "learning_rate": 3.259174403713783e-05, + "loss": 1.2389, + "num_input_tokens_seen": 94117208, "step": 174000 }, { - "epoch": 4.189072402535049, - "grad_norm": 6.931149959564209, - "learning_rate": 2.9054758018052624e-05, - "loss": 1.6946, - "num_input_tokens_seen": 94203539, + "epoch": 1.7458380022410758, + "grad_norm": 4.265655517578125, + "learning_rate": 3.254172002561229e-05, + "loss": 1.2285, + "num_input_tokens_seen": 94388984, "step": 174500 }, { - "epoch": 4.201075475321682, - "grad_norm": 7.10436487197876, - "learning_rate": 2.8994742654119456e-05, - "loss": 1.6659, - "num_input_tokens_seen": 94472539, + "epoch": 1.750840403393629, + "grad_norm": 5.9715118408203125, + "learning_rate": 3.249169601408676e-05, + "loss": 1.2427, + "num_input_tokens_seen": 94655112, "step": 175000 }, { - "epoch": 4.2130785481083155, - "grad_norm": 7.407200813293457, - "learning_rate": 2.8934727290186288e-05, - "loss": 1.6871, - "num_input_tokens_seen": 94747635, + "epoch": 1.755842804546182, + "grad_norm": 6.257503509521484, + "learning_rate": 3.244167200256123e-05, + "loss": 1.2361, + "num_input_tokens_seen": 94924496, "step": 175500 }, { - "epoch": 4.225081620894949, - "grad_norm": 5.649023056030273, - "learning_rate": 2.8874711926253123e-05, - "loss": 1.68, - "num_input_tokens_seen": 95016051, + "epoch": 1.7608452056987354, + "grad_norm": 8.316187858581543, + "learning_rate": 3.23916479910357e-05, + "loss": 1.2283, + "num_input_tokens_seen": 95192608, "step": 176000 }, { - "epoch": 4.237084693681583, - "grad_norm": 7.02678918838501, - "learning_rate": 2.8814696562319955e-05, - "loss": 1.6964, - "num_input_tokens_seen": 95284651, + "epoch": 1.7658476068512887, + "grad_norm": 6.69648551940918, + "learning_rate": 3.2341623979510166e-05, + "loss": 1.2364, + "num_input_tokens_seen": 95459872, "step": 176500 }, { - "epoch": 4.249087766468216, - "grad_norm": 6.069730758666992, - "learning_rate": 2.875468119838679e-05, - "loss": 1.6738, - "num_input_tokens_seen": 95556347, + "epoch": 1.7708500080038418, + "grad_norm": 7.617880821228027, + "learning_rate": 3.229159996798463e-05, + "loss": 1.2265, + "num_input_tokens_seen": 95729504, "step": 177000 }, { - "epoch": 4.261090839254849, - "grad_norm": 7.7946038246154785, - "learning_rate": 2.8694665834453623e-05, - "loss": 1.6832, - "num_input_tokens_seen": 95827955, + "epoch": 1.775852409156395, + "grad_norm": 7.258569240570068, + "learning_rate": 3.2241575956459106e-05, + "loss": 1.235, + "num_input_tokens_seen": 95996688, "step": 177500 }, { - "epoch": 4.273093912041483, - "grad_norm": 6.413337230682373, - "learning_rate": 2.863465047052045e-05, - "loss": 1.6818, - "num_input_tokens_seen": 96093755, + "epoch": 1.7808548103089483, + "grad_norm": 5.590980052947998, + "learning_rate": 3.219155194493357e-05, + "loss": 1.2335, + "num_input_tokens_seen": 96263440, "step": 178000 }, { - "epoch": 4.285096984828116, - "grad_norm": 5.625922679901123, - "learning_rate": 2.8574635106587287e-05, - "loss": 1.69, - "num_input_tokens_seen": 96363467, + "epoch": 1.7858572114615017, + "grad_norm": 5.80760383605957, + "learning_rate": 3.214152793340804e-05, + "loss": 1.2179, + "num_input_tokens_seen": 96539352, "step": 178500 }, { - "epoch": 4.29710005761475, - "grad_norm": 6.475124835968018, - "learning_rate": 2.851461974265412e-05, - "loss": 1.7061, - "num_input_tokens_seen": 96636819, + "epoch": 1.7908596126140548, + "grad_norm": 5.532135486602783, + "learning_rate": 3.20915039218825e-05, + "loss": 1.1994, + "num_input_tokens_seen": 96805416, "step": 179000 }, { - "epoch": 4.3091031304013825, - "grad_norm": 7.147923469543457, - "learning_rate": 2.8454604378720954e-05, - "loss": 1.6722, - "num_input_tokens_seen": 96905563, + "epoch": 1.795862013766608, + "grad_norm": 5.589640140533447, + "learning_rate": 3.204147991035697e-05, + "loss": 1.2264, + "num_input_tokens_seen": 97073464, "step": 179500 }, { - "epoch": 4.321106203188016, - "grad_norm": 5.891010761260986, - "learning_rate": 2.8394589014787787e-05, - "loss": 1.688, - "num_input_tokens_seen": 97172259, + "epoch": 1.800864414919161, + "grad_norm": 6.577908039093018, + "learning_rate": 3.199145589883144e-05, + "loss": 1.23, + "num_input_tokens_seen": 97347808, "step": 180000 }, { - "epoch": 4.33310927597465, - "grad_norm": 7.492738246917725, - "learning_rate": 2.833457365085462e-05, - "loss": 1.727, - "num_input_tokens_seen": 97438723, + "epoch": 1.8058668160717144, + "grad_norm": 6.8848724365234375, + "learning_rate": 3.1941431887305905e-05, + "loss": 1.2167, + "num_input_tokens_seen": 97615696, "step": 180500 }, { - "epoch": 4.345112348761283, - "grad_norm": 6.35986852645874, - "learning_rate": 2.8274558286921454e-05, - "loss": 1.6977, - "num_input_tokens_seen": 97710067, + "epoch": 1.8108692172242677, + "grad_norm": 6.463140964508057, + "learning_rate": 3.1891407875780375e-05, + "loss": 1.2189, + "num_input_tokens_seen": 97886312, "step": 181000 }, { - "epoch": 4.357115421547916, - "grad_norm": 5.948391437530518, - "learning_rate": 2.8214542922988286e-05, - "loss": 1.7085, - "num_input_tokens_seen": 97975859, + "epoch": 1.8158716183768209, + "grad_norm": 8.028167724609375, + "learning_rate": 3.1841383864254845e-05, + "loss": 1.2384, + "num_input_tokens_seen": 98151768, "step": 181500 }, { - "epoch": 4.36911849433455, - "grad_norm": 6.791003704071045, - "learning_rate": 2.815452755905512e-05, - "loss": 1.7103, - "num_input_tokens_seen": 98248123, + "epoch": 1.820874019529374, + "grad_norm": 7.106721878051758, + "learning_rate": 3.1791359852729315e-05, + "loss": 1.2264, + "num_input_tokens_seen": 98419192, "step": 182000 }, { - "epoch": 4.381121567121183, - "grad_norm": 5.743526458740234, - "learning_rate": 2.8094512195121954e-05, - "loss": 1.705, - "num_input_tokens_seen": 98518747, + "epoch": 1.8258764206819273, + "grad_norm": 5.771492004394531, + "learning_rate": 3.174133584120378e-05, + "loss": 1.2421, + "num_input_tokens_seen": 98695368, "step": 182500 }, { - "epoch": 4.393124639907817, - "grad_norm": 5.9909515380859375, - "learning_rate": 2.8034496831188782e-05, - "loss": 1.6854, - "num_input_tokens_seen": 98793323, + "epoch": 1.8308788218344807, + "grad_norm": 5.563631534576416, + "learning_rate": 3.169131182967825e-05, + "loss": 1.2325, + "num_input_tokens_seen": 98974312, "step": 183000 }, { - "epoch": 4.4051277126944495, - "grad_norm": 6.874711513519287, - "learning_rate": 2.797448146725562e-05, - "loss": 1.7003, - "num_input_tokens_seen": 99060403, + "epoch": 1.8358812229870338, + "grad_norm": 7.004051208496094, + "learning_rate": 3.164128781815272e-05, + "loss": 1.2107, + "num_input_tokens_seen": 99244240, "step": 183500 }, { - "epoch": 4.417130785481083, - "grad_norm": 6.583930969238281, - "learning_rate": 2.791446610332245e-05, - "loss": 1.7095, - "num_input_tokens_seen": 99335163, + "epoch": 1.840883624139587, + "grad_norm": 6.410153865814209, + "learning_rate": 3.159126380662718e-05, + "loss": 1.2251, + "num_input_tokens_seen": 99515024, "step": 184000 }, { - "epoch": 4.429133858267717, - "grad_norm": 5.787863731384277, - "learning_rate": 2.785445073938929e-05, - "loss": 1.6922, - "num_input_tokens_seen": 99607835, + "epoch": 1.84588602529214, + "grad_norm": 4.9155354499816895, + "learning_rate": 3.154123979510165e-05, + "loss": 1.2192, + "num_input_tokens_seen": 99785496, "step": 184500 }, { - "epoch": 4.44113693105435, - "grad_norm": 6.155502796173096, - "learning_rate": 2.7794435375456117e-05, - "loss": 1.6756, - "num_input_tokens_seen": 99874595, + "epoch": 1.8508884264446934, + "grad_norm": 7.882739067077637, + "learning_rate": 3.1491215783576114e-05, + "loss": 1.2405, + "num_input_tokens_seen": 100061088, "step": 185000 }, { - "epoch": 4.453140003840983, - "grad_norm": 6.477571964263916, - "learning_rate": 2.773442001152295e-05, - "loss": 1.6751, - "num_input_tokens_seen": 100138467, + "epoch": 1.8558908275972468, + "grad_norm": 5.929235935211182, + "learning_rate": 3.144119177205059e-05, + "loss": 1.2171, + "num_input_tokens_seen": 100330216, "step": 185500 }, { - "epoch": 4.465143076627617, - "grad_norm": 7.841823577880859, - "learning_rate": 2.7674404647589785e-05, - "loss": 1.7214, - "num_input_tokens_seen": 100409267, + "epoch": 1.8608932287498, + "grad_norm": 5.840740203857422, + "learning_rate": 3.1391167760525054e-05, + "loss": 1.2093, + "num_input_tokens_seen": 100606080, "step": 186000 }, { - "epoch": 4.47714614941425, - "grad_norm": 5.944327354431152, - "learning_rate": 2.7614389283656617e-05, - "loss": 1.7185, - "num_input_tokens_seen": 100676555, + "epoch": 1.865895629902353, + "grad_norm": 6.414222717285156, + "learning_rate": 3.134114374899952e-05, + "loss": 1.2154, + "num_input_tokens_seen": 100872752, "step": 186500 }, { - "epoch": 4.489149222200884, - "grad_norm": 6.225147247314453, - "learning_rate": 2.7554373919723452e-05, - "loss": 1.6905, - "num_input_tokens_seen": 100944955, + "epoch": 1.8708980310549064, + "grad_norm": 7.030595779418945, + "learning_rate": 3.129111973747399e-05, + "loss": 1.2209, + "num_input_tokens_seen": 101144088, "step": 187000 }, { - "epoch": 4.5011522949875165, - "grad_norm": 7.948217868804932, - "learning_rate": 2.7494358555790285e-05, - "loss": 1.6936, - "num_input_tokens_seen": 101214867, + "epoch": 1.8759004322074597, + "grad_norm": 5.81058406829834, + "learning_rate": 3.124109572594846e-05, + "loss": 1.2068, + "num_input_tokens_seen": 101414120, "step": 187500 }, { - "epoch": 4.51315536777415, - "grad_norm": 5.985561847686768, - "learning_rate": 2.7434343191857113e-05, - "loss": 1.7104, - "num_input_tokens_seen": 101481771, + "epoch": 1.8809028333600128, + "grad_norm": 5.672033309936523, + "learning_rate": 3.119107171442293e-05, + "loss": 1.2305, + "num_input_tokens_seen": 101679104, "step": 188000 }, { - "epoch": 4.525158440560784, - "grad_norm": 5.921054840087891, - "learning_rate": 2.7374327827923952e-05, - "loss": 1.7161, - "num_input_tokens_seen": 101751955, + "epoch": 1.885905234512566, + "grad_norm": 6.247150421142578, + "learning_rate": 3.114104770289739e-05, + "loss": 1.2281, + "num_input_tokens_seen": 101950272, "step": 188500 }, { - "epoch": 4.537161513347417, - "grad_norm": 5.415379524230957, - "learning_rate": 2.731431246399078e-05, - "loss": 1.712, - "num_input_tokens_seen": 102025611, + "epoch": 1.890907635665119, + "grad_norm": 6.070692539215088, + "learning_rate": 3.109102369137186e-05, + "loss": 1.2277, + "num_input_tokens_seen": 102222744, "step": 189000 }, { - "epoch": 4.54916458613405, - "grad_norm": 6.2882399559021, - "learning_rate": 2.725429710005762e-05, - "loss": 1.7076, - "num_input_tokens_seen": 102294651, + "epoch": 1.8959100368176725, + "grad_norm": 7.217655181884766, + "learning_rate": 3.104099967984633e-05, + "loss": 1.2227, + "num_input_tokens_seen": 102498072, "step": 189500 }, { - "epoch": 4.5611676589206835, - "grad_norm": 7.873308181762695, - "learning_rate": 2.7194281736124448e-05, - "loss": 1.7068, - "num_input_tokens_seen": 102564635, + "epoch": 1.9009124379702258, + "grad_norm": 6.292141914367676, + "learning_rate": 3.099097566832079e-05, + "loss": 1.226, + "num_input_tokens_seen": 102771184, "step": 190000 }, { - "epoch": 4.573170731707317, - "grad_norm": 5.54704475402832, - "learning_rate": 2.713426637219128e-05, - "loss": 1.7512, - "num_input_tokens_seen": 102836755, + "epoch": 1.905914839122779, + "grad_norm": 6.4393534660339355, + "learning_rate": 3.094095165679526e-05, + "loss": 1.2238, + "num_input_tokens_seen": 103041800, "step": 190500 }, { - "epoch": 4.585173804493951, - "grad_norm": 6.452486991882324, - "learning_rate": 2.7074251008258116e-05, - "loss": 1.7097, - "num_input_tokens_seen": 103107315, + "epoch": 1.910917240275332, + "grad_norm": 6.367134094238281, + "learning_rate": 3.0890927645269726e-05, + "loss": 1.2277, + "num_input_tokens_seen": 103315416, "step": 191000 }, { - "epoch": 4.597176877280583, - "grad_norm": 6.1506147384643555, - "learning_rate": 2.7014235644324948e-05, - "loss": 1.7008, - "num_input_tokens_seen": 103371971, + "epoch": 1.9159196414278854, + "grad_norm": 5.803537368774414, + "learning_rate": 3.08409036337442e-05, + "loss": 1.2101, + "num_input_tokens_seen": 103584680, "step": 191500 }, { - "epoch": 4.609179950067217, - "grad_norm": 6.299232482910156, - "learning_rate": 2.6954220280391783e-05, - "loss": 1.7125, - "num_input_tokens_seen": 103643947, + "epoch": 1.9209220425804387, + "grad_norm": 5.529000282287598, + "learning_rate": 3.0790879622218666e-05, + "loss": 1.2252, + "num_input_tokens_seen": 103854296, "step": 192000 }, { - "epoch": 4.621183022853851, - "grad_norm": 7.294856548309326, - "learning_rate": 2.6894204916458615e-05, - "loss": 1.7124, - "num_input_tokens_seen": 103917643, + "epoch": 1.9259244437329919, + "grad_norm": 6.204425811767578, + "learning_rate": 3.0740855610693136e-05, + "loss": 1.2234, + "num_input_tokens_seen": 104127296, "step": 192500 }, { - "epoch": 4.633186095640484, - "grad_norm": 6.6945648193359375, - "learning_rate": 2.6834189552525447e-05, - "loss": 1.7233, - "num_input_tokens_seen": 104188979, + "epoch": 1.930926844885545, + "grad_norm": 6.076712131500244, + "learning_rate": 3.06908315991676e-05, + "loss": 1.2245, + "num_input_tokens_seen": 104402160, "step": 193000 }, { - "epoch": 4.645189168427118, - "grad_norm": 6.664907455444336, - "learning_rate": 2.6774174188592283e-05, - "loss": 1.7144, - "num_input_tokens_seen": 104459259, + "epoch": 1.9359292460380981, + "grad_norm": 5.718363285064697, + "learning_rate": 3.064080758764207e-05, + "loss": 1.2288, + "num_input_tokens_seen": 104669408, "step": 193500 }, { - "epoch": 4.6571922412137505, - "grad_norm": 7.543137550354004, - "learning_rate": 2.6714158824659115e-05, - "loss": 1.7165, - "num_input_tokens_seen": 104725387, + "epoch": 1.9409316471906515, + "grad_norm": 5.174673080444336, + "learning_rate": 3.059078357611654e-05, + "loss": 1.2276, + "num_input_tokens_seen": 104944256, "step": 194000 }, { - "epoch": 4.669195314000384, - "grad_norm": 8.370292663574219, - "learning_rate": 2.6654143460725944e-05, - "loss": 1.7103, - "num_input_tokens_seen": 104997579, + "epoch": 1.9459340483432048, + "grad_norm": 6.684966564178467, + "learning_rate": 3.0540759564591e-05, + "loss": 1.2341, + "num_input_tokens_seen": 105217600, "step": 194500 }, { - "epoch": 4.681198386787018, - "grad_norm": 6.999393939971924, - "learning_rate": 2.6594128096792783e-05, - "loss": 1.7066, - "num_input_tokens_seen": 105261307, + "epoch": 1.950936449495758, + "grad_norm": 6.3069562911987305, + "learning_rate": 3.0490735553065475e-05, + "loss": 1.211, + "num_input_tokens_seen": 105491832, "step": 195000 }, { - "epoch": 4.69320145957365, - "grad_norm": 6.65623664855957, - "learning_rate": 2.653411273285961e-05, - "loss": 1.7047, - "num_input_tokens_seen": 105530627, + "epoch": 1.955938850648311, + "grad_norm": 8.71688461303711, + "learning_rate": 3.044071154153994e-05, + "loss": 1.231, + "num_input_tokens_seen": 105764832, "step": 195500 }, { - "epoch": 4.705204532360284, - "grad_norm": 6.124739170074463, - "learning_rate": 2.6474097368926447e-05, - "loss": 1.7132, - "num_input_tokens_seen": 105800235, + "epoch": 1.9609412518008644, + "grad_norm": 8.65140438079834, + "learning_rate": 3.0390687530014405e-05, + "loss": 1.2142, + "num_input_tokens_seen": 106034576, "step": 196000 }, { - "epoch": 4.717207605146918, - "grad_norm": 7.289402008056641, - "learning_rate": 2.641408200499328e-05, - "loss": 1.7301, - "num_input_tokens_seen": 106071707, + "epoch": 1.9659436529534178, + "grad_norm": 5.5850725173950195, + "learning_rate": 3.0340663518488875e-05, + "loss": 1.2254, + "num_input_tokens_seen": 106305056, "step": 196500 }, { - "epoch": 4.729210677933551, - "grad_norm": 6.411198616027832, - "learning_rate": 2.635406664106011e-05, - "loss": 1.7038, - "num_input_tokens_seen": 106338003, + "epoch": 1.970946054105971, + "grad_norm": 6.236534118652344, + "learning_rate": 3.029063950696334e-05, + "loss": 1.2121, + "num_input_tokens_seen": 106574696, "step": 197000 }, { - "epoch": 4.741213750720185, - "grad_norm": 6.389967918395996, - "learning_rate": 2.6294051277126946e-05, - "loss": 1.7394, - "num_input_tokens_seen": 106606651, + "epoch": 1.975948455258524, + "grad_norm": 6.221134185791016, + "learning_rate": 3.024061549543781e-05, + "loss": 1.2031, + "num_input_tokens_seen": 106850384, "step": 197500 }, { - "epoch": 4.7532168235068175, - "grad_norm": 7.213348388671875, - "learning_rate": 2.6234035913193778e-05, - "loss": 1.7239, - "num_input_tokens_seen": 106879107, + "epoch": 1.9809508564110772, + "grad_norm": 7.421369552612305, + "learning_rate": 3.0190591483912278e-05, + "loss": 1.2214, + "num_input_tokens_seen": 107126816, "step": 198000 }, { - "epoch": 4.765219896293451, - "grad_norm": 6.657505989074707, - "learning_rate": 2.6174020549260614e-05, - "loss": 1.6909, - "num_input_tokens_seen": 107148467, + "epoch": 1.9859532575636305, + "grad_norm": 6.951572418212891, + "learning_rate": 3.0140567472386748e-05, + "loss": 1.2052, + "num_input_tokens_seen": 107392688, "step": 198500 }, { - "epoch": 4.777222969080085, - "grad_norm": 5.557823657989502, - "learning_rate": 2.6114005185327446e-05, - "loss": 1.7301, - "num_input_tokens_seen": 107417635, + "epoch": 1.9909556587161839, + "grad_norm": 7.400991439819336, + "learning_rate": 3.0090543460861215e-05, + "loss": 1.2091, + "num_input_tokens_seen": 107668928, "step": 199000 }, { - "epoch": 4.789226041866717, - "grad_norm": 7.233010292053223, - "learning_rate": 2.6053989821394275e-05, - "loss": 1.7095, - "num_input_tokens_seen": 107683299, + "epoch": 1.995958059868737, + "grad_norm": 6.747934818267822, + "learning_rate": 3.004051944933568e-05, + "loss": 1.2001, + "num_input_tokens_seen": 107939312, "step": 199500 }, { - "epoch": 4.801229114653351, - "grad_norm": 6.778656005859375, - "learning_rate": 2.5993974457461113e-05, - "loss": 1.7276, - "num_input_tokens_seen": 107954603, + "epoch": 2.0, + "eval_loss": 1.1146966218948364, + "eval_runtime": 186.1963, + "eval_samples_per_second": 1073.625, + "eval_steps_per_second": 134.208, + "num_input_tokens_seen": 108157960, + "step": 199904 + }, + { + "epoch": 2.00096046102129, + "grad_norm": 5.809133052825928, + "learning_rate": 2.999049543781015e-05, + "loss": 1.21, + "num_input_tokens_seen": 108209480, "step": 200000 }, { - "epoch": 4.813232187439985, - "grad_norm": 6.341772079467773, - "learning_rate": 2.5933959093527942e-05, - "loss": 1.716, - "num_input_tokens_seen": 108225939, + "epoch": 2.0059628621738432, + "grad_norm": 7.035338401794434, + "learning_rate": 2.9940471426284618e-05, + "loss": 1.0955, + "num_input_tokens_seen": 108480360, "step": 200500 }, { - "epoch": 4.825235260226618, - "grad_norm": 6.43362283706665, - "learning_rate": 2.587394372959478e-05, - "loss": 1.7191, - "num_input_tokens_seen": 108494595, + "epoch": 2.010965263326397, + "grad_norm": 8.504626274108887, + "learning_rate": 2.9890447414759088e-05, + "loss": 1.1085, + "num_input_tokens_seen": 108745072, "step": 201000 }, { - "epoch": 4.837238333013252, - "grad_norm": 6.792752265930176, - "learning_rate": 2.581392836566161e-05, - "loss": 1.7075, - "num_input_tokens_seen": 108765811, + "epoch": 2.01596766447895, + "grad_norm": 7.347136497497559, + "learning_rate": 2.9840423403233554e-05, + "loss": 1.0983, + "num_input_tokens_seen": 109010096, "step": 201500 }, { - "epoch": 4.8492414057998845, - "grad_norm": 7.833460807800293, - "learning_rate": 2.575391300172844e-05, - "loss": 1.7263, - "num_input_tokens_seen": 109039187, + "epoch": 2.020970065631503, + "grad_norm": 5.3891215324401855, + "learning_rate": 2.9790399391708024e-05, + "loss": 1.0941, + "num_input_tokens_seen": 109280096, "step": 202000 }, { - "epoch": 4.861244478586518, - "grad_norm": 7.140945911407471, - "learning_rate": 2.5693897637795277e-05, - "loss": 1.6968, - "num_input_tokens_seen": 109311867, + "epoch": 2.025972466784056, + "grad_norm": 5.807075023651123, + "learning_rate": 2.974037538018249e-05, + "loss": 1.1066, + "num_input_tokens_seen": 109550512, "step": 202500 }, { - "epoch": 4.873247551373152, - "grad_norm": 6.381129741668701, - "learning_rate": 2.563388227386211e-05, - "loss": 1.7062, - "num_input_tokens_seen": 109582883, + "epoch": 2.0309748679366098, + "grad_norm": 7.344318866729736, + "learning_rate": 2.9690351368656954e-05, + "loss": 1.1149, + "num_input_tokens_seen": 109827584, "step": 203000 }, { - "epoch": 4.885250624159785, - "grad_norm": 6.6073102951049805, - "learning_rate": 2.5573866909928945e-05, - "loss": 1.736, - "num_input_tokens_seen": 109859179, + "epoch": 2.035977269089163, + "grad_norm": 4.556800842285156, + "learning_rate": 2.9640327357131427e-05, + "loss": 1.1137, + "num_input_tokens_seen": 110094320, "step": 203500 }, { - "epoch": 4.897253696946418, - "grad_norm": 7.899672508239746, - "learning_rate": 2.5513851545995777e-05, - "loss": 1.7204, - "num_input_tokens_seen": 110123699, + "epoch": 2.040979670241716, + "grad_norm": 6.238656044006348, + "learning_rate": 2.959030334560589e-05, + "loss": 1.1174, + "num_input_tokens_seen": 110369888, "step": 204000 }, { - "epoch": 4.909256769733052, - "grad_norm": 10.730802536010742, - "learning_rate": 2.5453836182062605e-05, - "loss": 1.7118, - "num_input_tokens_seen": 110396507, + "epoch": 2.045982071394269, + "grad_norm": 6.009298801422119, + "learning_rate": 2.954027933408036e-05, + "loss": 1.0978, + "num_input_tokens_seen": 110638112, "step": 204500 }, { - "epoch": 4.921259842519685, - "grad_norm": 5.577351093292236, - "learning_rate": 2.5393820818129444e-05, - "loss": 1.7174, - "num_input_tokens_seen": 110665603, + "epoch": 2.0509844725468223, + "grad_norm": 5.883254051208496, + "learning_rate": 2.9490255322554827e-05, + "loss": 1.1127, + "num_input_tokens_seen": 110907728, "step": 205000 }, { - "epoch": 4.933262915306319, - "grad_norm": 6.486494541168213, - "learning_rate": 2.5333805454196273e-05, - "loss": 1.7118, - "num_input_tokens_seen": 110934739, + "epoch": 2.055986873699376, + "grad_norm": 5.4123125076293945, + "learning_rate": 2.9440231311029297e-05, + "loss": 1.116, + "num_input_tokens_seen": 111174608, "step": 205500 }, { - "epoch": 4.9452659880929515, - "grad_norm": 7.368922233581543, - "learning_rate": 2.5273790090263112e-05, - "loss": 1.7205, - "num_input_tokens_seen": 111202651, + "epoch": 2.060989274851929, + "grad_norm": 6.456712245941162, + "learning_rate": 2.9390207299503763e-05, + "loss": 1.1306, + "num_input_tokens_seen": 111443896, "step": 206000 }, { - "epoch": 4.957269060879585, - "grad_norm": 6.480933666229248, - "learning_rate": 2.521377472632994e-05, - "loss": 1.6832, - "num_input_tokens_seen": 111472451, + "epoch": 2.065991676004482, + "grad_norm": 7.134698390960693, + "learning_rate": 2.934018328797823e-05, + "loss": 1.1149, + "num_input_tokens_seen": 111722672, "step": 206500 }, { - "epoch": 4.969272133666219, - "grad_norm": 6.915102481842041, - "learning_rate": 2.5153759362396772e-05, - "loss": 1.7264, - "num_input_tokens_seen": 111740891, + "epoch": 2.070994077157035, + "grad_norm": 5.317368984222412, + "learning_rate": 2.92901592764527e-05, + "loss": 1.093, + "num_input_tokens_seen": 111987688, "step": 207000 }, { - "epoch": 4.981275206452852, - "grad_norm": 6.025552272796631, - "learning_rate": 2.5093743998463608e-05, - "loss": 1.7216, - "num_input_tokens_seen": 112007507, + "epoch": 2.0759964783095888, + "grad_norm": 5.929445743560791, + "learning_rate": 2.9240135264927166e-05, + "loss": 1.124, + "num_input_tokens_seen": 112256088, "step": 207500 }, { - "epoch": 4.993278279239485, - "grad_norm": 5.880407333374023, - "learning_rate": 2.503372863453044e-05, - "loss": 1.7266, - "num_input_tokens_seen": 112280659, + "epoch": 2.080998879462142, + "grad_norm": 6.658150672912598, + "learning_rate": 2.9190111253401636e-05, + "loss": 1.1068, + "num_input_tokens_seen": 112530064, "step": 208000 }, { - "epoch": 5.0, - "eval_loss": 1.8686867952346802, - "eval_runtime": 78.0425, - "eval_samples_per_second": 1067.508, - "eval_steps_per_second": 133.44, - "num_input_tokens_seen": 112427069, - "step": 208280 - }, - { - "epoch": 5.005281352026119, - "grad_norm": 6.4857401847839355, - "learning_rate": 2.4973713270597272e-05, - "loss": 1.6509, - "num_input_tokens_seen": 112543253, + "epoch": 2.086001280614695, + "grad_norm": 7.434782028198242, + "learning_rate": 2.9140087241876103e-05, + "loss": 1.11, + "num_input_tokens_seen": 112807296, "step": 208500 }, { - "epoch": 5.017284424812752, - "grad_norm": 5.99040412902832, - "learning_rate": 2.4913697906664108e-05, - "loss": 1.5846, - "num_input_tokens_seen": 112814885, + "epoch": 2.091003681767248, + "grad_norm": 6.564949035644531, + "learning_rate": 2.9090063230350566e-05, + "loss": 1.1181, + "num_input_tokens_seen": 113075032, "step": 209000 }, { - "epoch": 5.029287497599386, - "grad_norm": 6.501333236694336, - "learning_rate": 2.485368254273094e-05, - "loss": 1.5863, - "num_input_tokens_seen": 113088717, + "epoch": 2.0960060829198017, + "grad_norm": 11.387114524841309, + "learning_rate": 2.904003921882504e-05, + "loss": 1.1097, + "num_input_tokens_seen": 113344824, "step": 209500 }, { - "epoch": 5.0412905703860185, - "grad_norm": 6.118101596832275, - "learning_rate": 2.4793667178797775e-05, - "loss": 1.5923, - "num_input_tokens_seen": 113356549, + "epoch": 2.101008484072355, + "grad_norm": 5.74482536315918, + "learning_rate": 2.8990015207299502e-05, + "loss": 1.1018, + "num_input_tokens_seen": 113618576, "step": 210000 }, { - "epoch": 5.053293643172652, - "grad_norm": 6.171685695648193, - "learning_rate": 2.4733651814864607e-05, - "loss": 1.5733, - "num_input_tokens_seen": 113626133, + "epoch": 2.106010885224908, + "grad_norm": 5.009258270263672, + "learning_rate": 2.8939991195773976e-05, + "loss": 1.1039, + "num_input_tokens_seen": 113889800, "step": 210500 }, { - "epoch": 5.065296715959286, - "grad_norm": 6.699287414550781, - "learning_rate": 2.467363645093144e-05, - "loss": 1.5719, - "num_input_tokens_seen": 113898637, + "epoch": 2.111013286377461, + "grad_norm": 7.421350955963135, + "learning_rate": 2.888996718424844e-05, + "loss": 1.1173, + "num_input_tokens_seen": 114168104, "step": 211000 }, { - "epoch": 5.077299788745919, - "grad_norm": 5.89196252822876, - "learning_rate": 2.4613621086998275e-05, - "loss": 1.5843, - "num_input_tokens_seen": 114167645, + "epoch": 2.1160156875300142, + "grad_norm": 6.955892086029053, + "learning_rate": 2.8839943172722912e-05, + "loss": 1.1217, + "num_input_tokens_seen": 114430440, "step": 211500 }, { - "epoch": 5.089302861532552, - "grad_norm": 6.0742268562316895, - "learning_rate": 2.4553605723065107e-05, - "loss": 1.5857, - "num_input_tokens_seen": 114437781, + "epoch": 2.121018088682568, + "grad_norm": 7.287781715393066, + "learning_rate": 2.8789919161197375e-05, + "loss": 1.1063, + "num_input_tokens_seen": 114706600, "step": 212000 }, { - "epoch": 5.1013059343191856, - "grad_norm": 7.158196926116943, - "learning_rate": 2.449359035913194e-05, - "loss": 1.6026, - "num_input_tokens_seen": 114709205, + "epoch": 2.126020489835121, + "grad_norm": 7.426519870758057, + "learning_rate": 2.8739895149671842e-05, + "loss": 1.1181, + "num_input_tokens_seen": 114975360, "step": 212500 }, { - "epoch": 5.113309007105819, - "grad_norm": 6.8287272453308105, - "learning_rate": 2.443357499519877e-05, - "loss": 1.5872, - "num_input_tokens_seen": 114981885, + "epoch": 2.131022890987674, + "grad_norm": 6.112298965454102, + "learning_rate": 2.8689871138146312e-05, + "loss": 1.1, + "num_input_tokens_seen": 115252480, "step": 213000 }, { - "epoch": 5.125312079892453, - "grad_norm": 5.438999652862549, - "learning_rate": 2.4373559631265606e-05, - "loss": 1.5834, - "num_input_tokens_seen": 115250653, + "epoch": 2.136025292140227, + "grad_norm": 8.356368064880371, + "learning_rate": 2.863984712662078e-05, + "loss": 1.1382, + "num_input_tokens_seen": 115527840, "step": 213500 }, { - "epoch": 5.1373151526790855, - "grad_norm": 6.723355770111084, - "learning_rate": 2.431354426733244e-05, - "loss": 1.591, - "num_input_tokens_seen": 115517069, + "epoch": 2.1410276932927808, + "grad_norm": 5.211204528808594, + "learning_rate": 2.858982311509525e-05, + "loss": 1.1241, + "num_input_tokens_seen": 115795240, "step": 214000 }, { - "epoch": 5.149318225465719, - "grad_norm": 7.157223224639893, - "learning_rate": 2.4253528903399274e-05, - "loss": 1.6004, - "num_input_tokens_seen": 115788813, + "epoch": 2.146030094445334, + "grad_norm": 7.513902187347412, + "learning_rate": 2.8539799103569715e-05, + "loss": 1.0984, + "num_input_tokens_seen": 116071656, "step": 214500 }, { - "epoch": 5.161321298252353, - "grad_norm": 5.415886878967285, - "learning_rate": 2.4193513539466103e-05, - "loss": 1.6154, - "num_input_tokens_seen": 116060005, + "epoch": 2.151032495597887, + "grad_norm": 5.553924560546875, + "learning_rate": 2.8489775092044185e-05, + "loss": 1.1138, + "num_input_tokens_seen": 116337944, "step": 215000 }, { - "epoch": 5.173324371038986, - "grad_norm": 6.716604232788086, - "learning_rate": 2.4133498175532938e-05, - "loss": 1.6065, - "num_input_tokens_seen": 116328429, + "epoch": 2.15603489675044, + "grad_norm": 5.7051920890808105, + "learning_rate": 2.843975108051865e-05, + "loss": 1.1193, + "num_input_tokens_seen": 116608560, "step": 215500 }, { - "epoch": 5.185327443825619, - "grad_norm": 6.5148725509643555, - "learning_rate": 2.407348281159977e-05, - "loss": 1.6034, - "num_input_tokens_seen": 116598381, + "epoch": 2.1610372979029933, + "grad_norm": 6.199916362762451, + "learning_rate": 2.8389727068993115e-05, + "loss": 1.114, + "num_input_tokens_seen": 116876312, "step": 216000 }, { - "epoch": 5.1973305166122525, - "grad_norm": 7.070413589477539, - "learning_rate": 2.4013467447666606e-05, - "loss": 1.6083, - "num_input_tokens_seen": 116870909, + "epoch": 2.166039699055547, + "grad_norm": 6.054383754730225, + "learning_rate": 2.8339703057467588e-05, + "loss": 1.0965, + "num_input_tokens_seen": 117146704, "step": 216500 }, { - "epoch": 5.209333589398886, - "grad_norm": 6.680785655975342, - "learning_rate": 2.3953452083733438e-05, - "loss": 1.6147, - "num_input_tokens_seen": 117138325, + "epoch": 2.1710421002081, + "grad_norm": 6.129938125610352, + "learning_rate": 2.828967904594205e-05, + "loss": 1.1191, + "num_input_tokens_seen": 117407640, "step": 217000 }, { - "epoch": 5.22133666218552, - "grad_norm": 6.164599895477295, - "learning_rate": 2.389343671980027e-05, - "loss": 1.6126, - "num_input_tokens_seen": 117415437, + "epoch": 2.176044501360653, + "grad_norm": 8.660636901855469, + "learning_rate": 2.8239655034416524e-05, + "loss": 1.1217, + "num_input_tokens_seen": 117675248, "step": 217500 }, { - "epoch": 5.233339734972153, - "grad_norm": 6.58738899230957, - "learning_rate": 2.3833421355867102e-05, - "loss": 1.5992, - "num_input_tokens_seen": 117684565, + "epoch": 2.181046902513206, + "grad_norm": 5.140537261962891, + "learning_rate": 2.8189631022890988e-05, + "loss": 1.106, + "num_input_tokens_seen": 117946856, "step": 218000 }, { - "epoch": 5.245342807758786, - "grad_norm": 6.455745697021484, - "learning_rate": 2.3773405991933937e-05, - "loss": 1.6155, - "num_input_tokens_seen": 117957301, + "epoch": 2.18604930366576, + "grad_norm": 8.983773231506348, + "learning_rate": 2.8139607011365454e-05, + "loss": 1.1055, + "num_input_tokens_seen": 118218576, "step": 218500 }, { - "epoch": 5.25734588054542, - "grad_norm": 6.0809736251831055, - "learning_rate": 2.371339062800077e-05, - "loss": 1.6081, - "num_input_tokens_seen": 118225149, + "epoch": 2.191051704818313, + "grad_norm": 5.122745513916016, + "learning_rate": 2.8089582999839924e-05, + "loss": 1.1022, + "num_input_tokens_seen": 118493280, "step": 219000 }, { - "epoch": 5.269348953332053, - "grad_norm": 5.793436527252197, - "learning_rate": 2.3653375264067605e-05, - "loss": 1.6335, - "num_input_tokens_seen": 118505301, + "epoch": 2.196054105970866, + "grad_norm": 5.861432075500488, + "learning_rate": 2.803955898831439e-05, + "loss": 1.0946, + "num_input_tokens_seen": 118764672, "step": 219500 }, { - "epoch": 5.281352026118686, - "grad_norm": 6.701602935791016, - "learning_rate": 2.3593359900134433e-05, - "loss": 1.6188, - "num_input_tokens_seen": 118778533, + "epoch": 2.201056507123419, + "grad_norm": 5.456287384033203, + "learning_rate": 2.798953497678886e-05, + "loss": 1.1136, + "num_input_tokens_seen": 119033632, "step": 220000 }, { - "epoch": 5.2933550989053195, - "grad_norm": 7.20522403717041, - "learning_rate": 2.353334453620127e-05, - "loss": 1.6238, - "num_input_tokens_seen": 119048181, + "epoch": 2.2060589082759723, + "grad_norm": 6.379229545593262, + "learning_rate": 2.7939510965263327e-05, + "loss": 1.1123, + "num_input_tokens_seen": 119302136, "step": 220500 }, { - "epoch": 5.305358171691953, - "grad_norm": 6.754197597503662, - "learning_rate": 2.34733291722681e-05, - "loss": 1.623, - "num_input_tokens_seen": 119314821, + "epoch": 2.211061309428526, + "grad_norm": 7.430028438568115, + "learning_rate": 2.7889486953737797e-05, + "loss": 1.1079, + "num_input_tokens_seen": 119571112, "step": 221000 }, { - "epoch": 5.317361244478587, - "grad_norm": 6.627007961273193, - "learning_rate": 2.3413313808334936e-05, - "loss": 1.6226, - "num_input_tokens_seen": 119581565, + "epoch": 2.216063710581079, + "grad_norm": 6.985309600830078, + "learning_rate": 2.7839462942212264e-05, + "loss": 1.142, + "num_input_tokens_seen": 119840376, "step": 221500 }, { - "epoch": 5.32936431726522, - "grad_norm": 7.077493190765381, - "learning_rate": 2.3353298444401765e-05, - "loss": 1.6056, - "num_input_tokens_seen": 119843469, + "epoch": 2.221066111733632, + "grad_norm": 5.228456974029541, + "learning_rate": 2.778943893068673e-05, + "loss": 1.111, + "num_input_tokens_seen": 120108496, "step": 222000 }, { - "epoch": 5.341367390051853, - "grad_norm": 7.528831958770752, - "learning_rate": 2.32932830804686e-05, - "loss": 1.6183, - "num_input_tokens_seen": 120110901, + "epoch": 2.2260685128861852, + "grad_norm": 7.293130874633789, + "learning_rate": 2.77394149191612e-05, + "loss": 1.1067, + "num_input_tokens_seen": 120381296, "step": 222500 }, { - "epoch": 5.353370462838487, - "grad_norm": 7.370020389556885, - "learning_rate": 2.3233267716535433e-05, - "loss": 1.6183, - "num_input_tokens_seen": 120378213, + "epoch": 2.231070914038739, + "grad_norm": 7.219442367553711, + "learning_rate": 2.7689390907635667e-05, + "loss": 1.1331, + "num_input_tokens_seen": 120646192, "step": 223000 }, { - "epoch": 5.36537353562512, - "grad_norm": 5.429224491119385, - "learning_rate": 2.3173252352602268e-05, - "loss": 1.6254, - "num_input_tokens_seen": 120646941, + "epoch": 2.236073315191292, + "grad_norm": 6.636627197265625, + "learning_rate": 2.7639366896110137e-05, + "loss": 1.1266, + "num_input_tokens_seen": 120912232, "step": 223500 }, { - "epoch": 5.377376608411754, - "grad_norm": 5.89351224899292, - "learning_rate": 2.31132369886691e-05, - "loss": 1.6223, - "num_input_tokens_seen": 120918613, + "epoch": 2.241075716343845, + "grad_norm": 6.974771976470947, + "learning_rate": 2.75893428845846e-05, + "loss": 1.1432, + "num_input_tokens_seen": 121178320, "step": 224000 }, { - "epoch": 5.3893796811983865, - "grad_norm": 6.414028167724609, - "learning_rate": 2.3053221624735932e-05, - "loss": 1.61, - "num_input_tokens_seen": 121188733, + "epoch": 2.246078117496398, + "grad_norm": 6.00003719329834, + "learning_rate": 2.7539318873059073e-05, + "loss": 1.1228, + "num_input_tokens_seen": 121450400, "step": 224500 }, { - "epoch": 5.40138275398502, - "grad_norm": 6.461724281311035, - "learning_rate": 2.2993206260802768e-05, - "loss": 1.6256, - "num_input_tokens_seen": 121458733, + "epoch": 2.2510805186489513, + "grad_norm": 6.582889556884766, + "learning_rate": 2.7489294861533536e-05, + "loss": 1.1236, + "num_input_tokens_seen": 121722344, "step": 225000 }, { - "epoch": 5.413385826771654, - "grad_norm": 6.628051280975342, - "learning_rate": 2.29331908968696e-05, - "loss": 1.6276, - "num_input_tokens_seen": 121727933, + "epoch": 2.256082919801505, + "grad_norm": 4.923620700836182, + "learning_rate": 2.7439270850008003e-05, + "loss": 1.1133, + "num_input_tokens_seen": 121986208, "step": 225500 }, { - "epoch": 5.425388899558287, - "grad_norm": 6.447346210479736, - "learning_rate": 2.2873175532936432e-05, - "loss": 1.6452, - "num_input_tokens_seen": 121994869, + "epoch": 2.261085320954058, + "grad_norm": 6.316877365112305, + "learning_rate": 2.7389246838482473e-05, + "loss": 1.1163, + "num_input_tokens_seen": 122249640, "step": 226000 }, { - "epoch": 5.43739197234492, - "grad_norm": 6.347208023071289, - "learning_rate": 2.2813160169003267e-05, - "loss": 1.6316, - "num_input_tokens_seen": 122261757, + "epoch": 2.266087722106611, + "grad_norm": 6.2502241134643555, + "learning_rate": 2.733922282695694e-05, + "loss": 1.103, + "num_input_tokens_seen": 122518696, "step": 226500 }, { - "epoch": 5.449395045131554, - "grad_norm": 5.433672904968262, - "learning_rate": 2.27531448050701e-05, - "loss": 1.6379, - "num_input_tokens_seen": 122532525, + "epoch": 2.2710901232591643, + "grad_norm": 8.201516151428223, + "learning_rate": 2.728919881543141e-05, + "loss": 1.1212, + "num_input_tokens_seen": 122783616, "step": 227000 }, { - "epoch": 5.461398117918187, - "grad_norm": 6.2445149421691895, - "learning_rate": 2.269312944113693e-05, - "loss": 1.6309, - "num_input_tokens_seen": 122801541, + "epoch": 2.276092524411718, + "grad_norm": 5.959327220916748, + "learning_rate": 2.7239174803905876e-05, + "loss": 1.1096, + "num_input_tokens_seen": 123053552, "step": 227500 }, { - "epoch": 5.473401190704821, - "grad_norm": 6.356237411499023, - "learning_rate": 2.2633114077203767e-05, - "loss": 1.6196, - "num_input_tokens_seen": 123071813, + "epoch": 2.281094925564271, + "grad_norm": 9.138140678405762, + "learning_rate": 2.7189150792380342e-05, + "loss": 1.1051, + "num_input_tokens_seen": 123325184, "step": 228000 }, { - "epoch": 5.4854042634914535, - "grad_norm": 7.516770362854004, - "learning_rate": 2.25730987132706e-05, - "loss": 1.6528, - "num_input_tokens_seen": 123345381, + "epoch": 2.286097326716824, + "grad_norm": 6.214888572692871, + "learning_rate": 2.7139126780854812e-05, + "loss": 1.1141, + "num_input_tokens_seen": 123600592, "step": 228500 }, { - "epoch": 5.497407336278087, - "grad_norm": 7.101819038391113, - "learning_rate": 2.2513083349337434e-05, - "loss": 1.6276, - "num_input_tokens_seen": 123616757, + "epoch": 2.291099727869377, + "grad_norm": 6.67230224609375, + "learning_rate": 2.708910276932928e-05, + "loss": 1.1149, + "num_input_tokens_seen": 123876144, "step": 229000 }, { - "epoch": 5.509410409064721, - "grad_norm": 7.030257225036621, - "learning_rate": 2.2453067985404263e-05, - "loss": 1.6173, - "num_input_tokens_seen": 123886909, + "epoch": 2.2961021290219303, + "grad_norm": 7.004880905151367, + "learning_rate": 2.703907875780375e-05, + "loss": 1.1136, + "num_input_tokens_seen": 124149576, "step": 229500 }, { - "epoch": 5.521413481851354, - "grad_norm": 6.59322452545166, - "learning_rate": 2.23930526214711e-05, - "loss": 1.6368, - "num_input_tokens_seen": 124155885, + "epoch": 2.301104530174484, + "grad_norm": 5.232549667358398, + "learning_rate": 2.6989054746278215e-05, + "loss": 1.136, + "num_input_tokens_seen": 124420360, "step": 230000 }, { - "epoch": 5.533416554637987, - "grad_norm": 6.921408653259277, - "learning_rate": 2.233303725753793e-05, - "loss": 1.6308, - "num_input_tokens_seen": 124423213, + "epoch": 2.306106931327037, + "grad_norm": 6.569345951080322, + "learning_rate": 2.6939030734752685e-05, + "loss": 1.1136, + "num_input_tokens_seen": 124694320, "step": 230500 }, { - "epoch": 5.545419627424621, - "grad_norm": 7.577388286590576, - "learning_rate": 2.2273021893604766e-05, - "loss": 1.6382, - "num_input_tokens_seen": 124696565, + "epoch": 2.31110933247959, + "grad_norm": 4.602709770202637, + "learning_rate": 2.688900672322715e-05, + "loss": 1.131, + "num_input_tokens_seen": 124973608, "step": 231000 }, { - "epoch": 5.557422700211254, - "grad_norm": 7.2776103019714355, - "learning_rate": 2.2213006529671598e-05, - "loss": 1.6389, - "num_input_tokens_seen": 124969853, + "epoch": 2.3161117336321433, + "grad_norm": 7.659350872039795, + "learning_rate": 2.6838982711701615e-05, + "loss": 1.1237, + "num_input_tokens_seen": 125247480, "step": 231500 }, { - "epoch": 5.569425772997888, - "grad_norm": 6.935811519622803, - "learning_rate": 2.215299116573843e-05, - "loss": 1.6449, - "num_input_tokens_seen": 125239677, + "epoch": 2.321114134784697, + "grad_norm": 5.581116676330566, + "learning_rate": 2.6788958700176088e-05, + "loss": 1.1373, + "num_input_tokens_seen": 125514944, "step": 232000 }, { - "epoch": 5.5814288457845205, - "grad_norm": 6.270691394805908, - "learning_rate": 2.2092975801805262e-05, - "loss": 1.6225, - "num_input_tokens_seen": 125507853, + "epoch": 2.32611653593725, + "grad_norm": 6.8799238204956055, + "learning_rate": 2.673893468865055e-05, + "loss": 1.1185, + "num_input_tokens_seen": 125786544, "step": 232500 }, { - "epoch": 5.593431918571154, - "grad_norm": 6.819216728210449, - "learning_rate": 2.2032960437872098e-05, - "loss": 1.6138, - "num_input_tokens_seen": 125774141, + "epoch": 2.331118937089803, + "grad_norm": 6.535116195678711, + "learning_rate": 2.6688910677125025e-05, + "loss": 1.1154, + "num_input_tokens_seen": 126061016, "step": 233000 }, { - "epoch": 5.605434991357788, - "grad_norm": 6.6686625480651855, - "learning_rate": 2.197294507393893e-05, - "loss": 1.6328, - "num_input_tokens_seen": 126041397, + "epoch": 2.3361213382423562, + "grad_norm": 9.319666862487793, + "learning_rate": 2.6638886665599488e-05, + "loss": 1.1293, + "num_input_tokens_seen": 126326104, "step": 233500 }, { - "epoch": 5.617438064144421, - "grad_norm": 6.477898597717285, - "learning_rate": 2.1912929710005762e-05, - "loss": 1.6194, - "num_input_tokens_seen": 126309429, + "epoch": 2.3411237393949094, + "grad_norm": 6.085050582885742, + "learning_rate": 2.6588862654073958e-05, + "loss": 1.1266, + "num_input_tokens_seen": 126600472, "step": 234000 }, { - "epoch": 5.629441136931054, - "grad_norm": 6.112322807312012, - "learning_rate": 2.1852914346072594e-05, - "loss": 1.6331, - "num_input_tokens_seen": 126583101, + "epoch": 2.346126140547463, + "grad_norm": 7.938391208648682, + "learning_rate": 2.6538838642548424e-05, + "loss": 1.1222, + "num_input_tokens_seen": 126867552, "step": 234500 }, { - "epoch": 5.641444209717688, - "grad_norm": 6.582950592041016, - "learning_rate": 2.179289898213943e-05, - "loss": 1.6269, - "num_input_tokens_seen": 126852693, + "epoch": 2.351128541700016, + "grad_norm": 6.2780537605285645, + "learning_rate": 2.648881463102289e-05, + "loss": 1.0976, + "num_input_tokens_seen": 127135696, "step": 235000 }, { - "epoch": 5.653447282504321, - "grad_norm": 6.389900207519531, - "learning_rate": 2.173288361820626e-05, - "loss": 1.6351, - "num_input_tokens_seen": 127120077, + "epoch": 2.356130942852569, + "grad_norm": 6.0472731590271, + "learning_rate": 2.643879061949736e-05, + "loss": 1.1141, + "num_input_tokens_seen": 127406128, "step": 235500 }, { - "epoch": 5.665450355290955, - "grad_norm": 6.27639627456665, - "learning_rate": 2.1672868254273097e-05, - "loss": 1.6418, - "num_input_tokens_seen": 127391733, + "epoch": 2.3611333440051223, + "grad_norm": 6.907486438751221, + "learning_rate": 2.6388766607971827e-05, + "loss": 1.1149, + "num_input_tokens_seen": 127678464, "step": 236000 }, { - "epoch": 5.6774534280775875, - "grad_norm": 6.2564215660095215, - "learning_rate": 2.1612852890339926e-05, - "loss": 1.6525, - "num_input_tokens_seen": 127665997, + "epoch": 2.366135745157676, + "grad_norm": 6.429139137268066, + "learning_rate": 2.6338742596446297e-05, + "loss": 1.1154, + "num_input_tokens_seen": 127947800, "step": 236500 }, { - "epoch": 5.689456500864221, - "grad_norm": 7.210122108459473, - "learning_rate": 2.155283752640676e-05, - "loss": 1.6416, - "num_input_tokens_seen": 127929549, + "epoch": 2.371138146310229, + "grad_norm": 5.432641506195068, + "learning_rate": 2.6288718584920764e-05, + "loss": 1.1246, + "num_input_tokens_seen": 128221960, "step": 237000 }, { - "epoch": 5.701459573650855, - "grad_norm": 7.050640106201172, - "learning_rate": 2.1492822162473593e-05, - "loss": 1.6433, - "num_input_tokens_seen": 128198805, + "epoch": 2.376140547462782, + "grad_norm": 6.486244201660156, + "learning_rate": 2.6238694573395227e-05, + "loss": 1.1295, + "num_input_tokens_seen": 128492192, "step": 237500 }, { - "epoch": 5.713462646437488, - "grad_norm": 7.553079605102539, - "learning_rate": 2.143280679854043e-05, - "loss": 1.6612, - "num_input_tokens_seen": 128472453, + "epoch": 2.3811429486153353, + "grad_norm": 6.889167308807373, + "learning_rate": 2.61886705618697e-05, + "loss": 1.1073, + "num_input_tokens_seen": 128761776, "step": 238000 }, { - "epoch": 5.725465719224122, - "grad_norm": 6.868207931518555, - "learning_rate": 2.137279143460726e-05, - "loss": 1.6364, - "num_input_tokens_seen": 128740869, + "epoch": 2.3861453497678884, + "grad_norm": 5.81854248046875, + "learning_rate": 2.6138646550344163e-05, + "loss": 1.0988, + "num_input_tokens_seen": 129033976, "step": 238500 }, { - "epoch": 5.7374687920107545, - "grad_norm": 6.591185569763184, - "learning_rate": 2.1312776070674093e-05, - "loss": 1.6368, - "num_input_tokens_seen": 129011285, + "epoch": 2.391147750920442, + "grad_norm": 6.5693864822387695, + "learning_rate": 2.6088622538818637e-05, + "loss": 1.1225, + "num_input_tokens_seen": 129304680, "step": 239000 }, { - "epoch": 5.749471864797388, - "grad_norm": 6.777072429656982, - "learning_rate": 2.1252760706740925e-05, - "loss": 1.6283, - "num_input_tokens_seen": 129281229, + "epoch": 2.396150152072995, + "grad_norm": 7.3249969482421875, + "learning_rate": 2.60385985272931e-05, + "loss": 1.1134, + "num_input_tokens_seen": 129577608, "step": 239500 }, { - "epoch": 5.761474937584022, - "grad_norm": 6.940945148468018, - "learning_rate": 2.119274534280776e-05, - "loss": 1.6416, - "num_input_tokens_seen": 129549037, + "epoch": 2.401152553225548, + "grad_norm": 6.37844181060791, + "learning_rate": 2.5988574515767573e-05, + "loss": 1.116, + "num_input_tokens_seen": 129853352, "step": 240000 }, { - "epoch": 5.7734780103706544, - "grad_norm": 7.2894287109375, - "learning_rate": 2.1132729978874592e-05, - "loss": 1.6283, - "num_input_tokens_seen": 129818693, + "epoch": 2.4061549543781013, + "grad_norm": 6.640512943267822, + "learning_rate": 2.5938550504242036e-05, + "loss": 1.1158, + "num_input_tokens_seen": 130123560, "step": 240500 }, { - "epoch": 5.785481083157288, - "grad_norm": 9.061758995056152, - "learning_rate": 2.1072714614941428e-05, - "loss": 1.6381, - "num_input_tokens_seen": 130090357, + "epoch": 2.411157355530655, + "grad_norm": 4.907979965209961, + "learning_rate": 2.5888526492716503e-05, + "loss": 1.1116, + "num_input_tokens_seen": 130395312, "step": 241000 }, { - "epoch": 5.797484155943922, - "grad_norm": 5.998315811157227, - "learning_rate": 2.101269925100826e-05, - "loss": 1.6452, - "num_input_tokens_seen": 130357429, + "epoch": 2.416159756683208, + "grad_norm": 5.592065811157227, + "learning_rate": 2.5838502481190973e-05, + "loss": 1.125, + "num_input_tokens_seen": 130667624, "step": 241500 }, { - "epoch": 5.809487228730555, - "grad_norm": 7.557010173797607, - "learning_rate": 2.0952683887075092e-05, - "loss": 1.6417, - "num_input_tokens_seen": 130628629, + "epoch": 2.421162157835761, + "grad_norm": 6.227156639099121, + "learning_rate": 2.578847846966544e-05, + "loss": 1.1101, + "num_input_tokens_seen": 130936288, "step": 242000 }, { - "epoch": 5.821490301517189, - "grad_norm": 6.6113600730896, - "learning_rate": 2.0892668523141924e-05, - "loss": 1.655, - "num_input_tokens_seen": 130896949, + "epoch": 2.4261645589883143, + "grad_norm": 6.889796733856201, + "learning_rate": 2.573845445813991e-05, + "loss": 1.1098, + "num_input_tokens_seen": 131202792, "step": 242500 }, { - "epoch": 5.8334933743038215, - "grad_norm": 6.220270156860352, - "learning_rate": 2.083265315920876e-05, - "loss": 1.663, - "num_input_tokens_seen": 131169493, + "epoch": 2.4311669601408674, + "grad_norm": 6.005047798156738, + "learning_rate": 2.5688430446614376e-05, + "loss": 1.1227, + "num_input_tokens_seen": 131474608, "step": 243000 }, { - "epoch": 5.845496447090455, - "grad_norm": 6.193788528442383, - "learning_rate": 2.077263779527559e-05, - "loss": 1.6268, - "num_input_tokens_seen": 131434509, + "epoch": 2.436169361293421, + "grad_norm": 6.773987293243408, + "learning_rate": 2.5638406435088846e-05, + "loss": 1.1167, + "num_input_tokens_seen": 131739400, "step": 243500 }, { - "epoch": 5.857499519877089, - "grad_norm": 7.121766090393066, - "learning_rate": 2.0712622431342424e-05, - "loss": 1.6339, - "num_input_tokens_seen": 131701237, + "epoch": 2.441171762445974, + "grad_norm": 8.459389686584473, + "learning_rate": 2.5588382423563312e-05, + "loss": 1.1275, + "num_input_tokens_seen": 132006752, "step": 244000 }, { - "epoch": 5.869502592663721, - "grad_norm": 6.5016865730285645, - "learning_rate": 2.065260706740926e-05, - "loss": 1.6357, - "num_input_tokens_seen": 131970293, + "epoch": 2.4461741635985272, + "grad_norm": 6.094442367553711, + "learning_rate": 2.5538358412037776e-05, + "loss": 1.1035, + "num_input_tokens_seen": 132280008, "step": 244500 }, { - "epoch": 5.881505665450355, - "grad_norm": 6.262420177459717, - "learning_rate": 2.059259170347609e-05, - "loss": 1.6349, - "num_input_tokens_seen": 132236197, + "epoch": 2.4511765647510804, + "grad_norm": 9.516000747680664, + "learning_rate": 2.548833440051225e-05, + "loss": 1.1194, + "num_input_tokens_seen": 132548616, "step": 245000 }, { - "epoch": 5.893508738236989, - "grad_norm": 7.660516262054443, - "learning_rate": 2.0532576339542926e-05, - "loss": 1.6556, - "num_input_tokens_seen": 132507661, + "epoch": 2.456178965903634, + "grad_norm": 9.94356918334961, + "learning_rate": 2.5438310388986712e-05, + "loss": 1.1136, + "num_input_tokens_seen": 132827096, "step": 245500 }, { - "epoch": 5.905511811023622, - "grad_norm": 6.039972305297852, - "learning_rate": 2.0472560975609755e-05, - "loss": 1.6455, - "num_input_tokens_seen": 132773693, + "epoch": 2.461181367056187, + "grad_norm": 7.003009796142578, + "learning_rate": 2.5388286377461185e-05, + "loss": 1.1257, + "num_input_tokens_seen": 133094168, "step": 246000 }, { - "epoch": 5.917514883810256, - "grad_norm": 6.091533184051514, - "learning_rate": 2.041254561167659e-05, - "loss": 1.6327, - "num_input_tokens_seen": 133047341, + "epoch": 2.46618376820874, + "grad_norm": 6.280598163604736, + "learning_rate": 2.533826236593565e-05, + "loss": 1.1241, + "num_input_tokens_seen": 133362264, "step": 246500 }, { - "epoch": 5.9295179565968885, - "grad_norm": 6.513091087341309, - "learning_rate": 2.0352530247743423e-05, - "loss": 1.6353, - "num_input_tokens_seen": 133316333, + "epoch": 2.4711861693612933, + "grad_norm": 7.221234321594238, + "learning_rate": 2.5288238354410115e-05, + "loss": 1.1095, + "num_input_tokens_seen": 133628496, "step": 247000 }, { - "epoch": 5.941521029383522, - "grad_norm": 6.623924255371094, - "learning_rate": 2.0292514883810258e-05, - "loss": 1.648, - "num_input_tokens_seen": 133582829, + "epoch": 2.4761885705138464, + "grad_norm": 6.677853584289551, + "learning_rate": 2.5238214342884585e-05, + "loss": 1.1148, + "num_input_tokens_seen": 133902784, "step": 247500 }, { - "epoch": 5.953524102170156, - "grad_norm": 6.540838241577148, - "learning_rate": 2.023249951987709e-05, - "loss": 1.6329, - "num_input_tokens_seen": 133854621, + "epoch": 2.4811909716664, + "grad_norm": 6.834347248077393, + "learning_rate": 2.518819033135905e-05, + "loss": 1.1128, + "num_input_tokens_seen": 134174984, "step": 248000 }, { - "epoch": 5.965527174956789, - "grad_norm": 6.405239105224609, - "learning_rate": 2.0172484155943922e-05, - "loss": 1.6542, - "num_input_tokens_seen": 134127477, + "epoch": 2.486193372818953, + "grad_norm": 5.890481948852539, + "learning_rate": 2.513816631983352e-05, + "loss": 1.1181, + "num_input_tokens_seen": 134442008, "step": 248500 }, { - "epoch": 5.977530247743422, - "grad_norm": 6.597908973693848, - "learning_rate": 2.0112468792010754e-05, - "loss": 1.645, - "num_input_tokens_seen": 134395245, + "epoch": 2.4911957739715063, + "grad_norm": 5.24491548538208, + "learning_rate": 2.5088142308307988e-05, + "loss": 1.112, + "num_input_tokens_seen": 134721144, "step": 249000 }, { - "epoch": 5.989533320530056, - "grad_norm": 6.698087692260742, - "learning_rate": 2.005245342807759e-05, - "loss": 1.6689, - "num_input_tokens_seen": 134666845, + "epoch": 2.4961981751240594, + "grad_norm": 6.424367904663086, + "learning_rate": 2.5038118296782458e-05, + "loss": 1.117, + "num_input_tokens_seen": 134996320, "step": 249500 }, { - "epoch": 6.0, - "eval_loss": 1.8628705739974976, - "eval_runtime": 78.3436, - "eval_samples_per_second": 1063.405, - "eval_steps_per_second": 132.927, - "num_input_tokens_seen": 134906493, - "step": 249936 - }, - { - "epoch": 6.001536393316689, - "grad_norm": 6.373103618621826, - "learning_rate": 1.9992438064144422e-05, - "loss": 1.6229, - "num_input_tokens_seen": 134941157, + "epoch": 2.501200576276613, + "grad_norm": 5.759153366088867, + "learning_rate": 2.4988094285256925e-05, + "loss": 1.1146, + "num_input_tokens_seen": 135261680, "step": 250000 }, { - "epoch": 6.013539466103323, - "grad_norm": 5.352808952331543, - "learning_rate": 1.9932422700211257e-05, - "loss": 1.5335, - "num_input_tokens_seen": 135216053, + "epoch": 2.506202977429166, + "grad_norm": 6.164818286895752, + "learning_rate": 2.493807027373139e-05, + "loss": 1.1171, + "num_input_tokens_seen": 135530080, "step": 250500 }, { - "epoch": 6.0255425388899555, - "grad_norm": 5.97913932800293, - "learning_rate": 1.9872407336278086e-05, - "loss": 1.4933, - "num_input_tokens_seen": 135482765, + "epoch": 2.511205378581719, + "grad_norm": 5.746749401092529, + "learning_rate": 2.488804626220586e-05, + "loss": 1.1156, + "num_input_tokens_seen": 135793120, "step": 251000 }, { - "epoch": 6.037545611676589, - "grad_norm": 9.400609016418457, - "learning_rate": 1.981239197234492e-05, - "loss": 1.5391, - "num_input_tokens_seen": 135753517, + "epoch": 2.5162077797342723, + "grad_norm": 8.123281478881836, + "learning_rate": 2.4838022250680328e-05, + "loss": 1.1139, + "num_input_tokens_seen": 136063864, "step": 251500 }, { - "epoch": 6.049548684463223, - "grad_norm": 6.692719459533691, - "learning_rate": 1.9752376608411754e-05, - "loss": 1.5196, - "num_input_tokens_seen": 136023021, + "epoch": 2.5212101808868255, + "grad_norm": 5.1486287117004395, + "learning_rate": 2.4787998239154794e-05, + "loss": 1.1146, + "num_input_tokens_seen": 136332816, "step": 252000 }, { - "epoch": 6.061551757249856, - "grad_norm": 6.5408806800842285, - "learning_rate": 1.969236124447859e-05, - "loss": 1.5123, - "num_input_tokens_seen": 136297021, + "epoch": 2.526212582039379, + "grad_norm": 5.926784038543701, + "learning_rate": 2.4737974227629264e-05, + "loss": 1.1072, + "num_input_tokens_seen": 136604928, "step": 252500 }, { - "epoch": 6.073554830036489, - "grad_norm": 5.2762556076049805, - "learning_rate": 1.963234588054542e-05, - "loss": 1.5365, - "num_input_tokens_seen": 136565205, + "epoch": 2.531214983191932, + "grad_norm": 5.782299041748047, + "learning_rate": 2.468795021610373e-05, + "loss": 1.1114, + "num_input_tokens_seen": 136875976, "step": 253000 }, { - "epoch": 6.085557902823123, - "grad_norm": 5.7840728759765625, - "learning_rate": 1.9572330516612253e-05, - "loss": 1.5516, - "num_input_tokens_seen": 136836149, + "epoch": 2.5362173843444853, + "grad_norm": 6.699214935302734, + "learning_rate": 2.4637926204578197e-05, + "loss": 1.1129, + "num_input_tokens_seen": 137150704, "step": 253500 }, { - "epoch": 6.097560975609756, - "grad_norm": 6.252090930938721, - "learning_rate": 1.9512315152679085e-05, - "loss": 1.562, - "num_input_tokens_seen": 137114245, + "epoch": 2.5412197854970384, + "grad_norm": 6.502534866333008, + "learning_rate": 2.4587902193052667e-05, + "loss": 1.0949, + "num_input_tokens_seen": 137430640, "step": 254000 }, { - "epoch": 6.10956404839639, - "grad_norm": 7.506107330322266, - "learning_rate": 1.945229978874592e-05, - "loss": 1.5351, - "num_input_tokens_seen": 137388133, + "epoch": 2.546222186649592, + "grad_norm": 6.316598892211914, + "learning_rate": 2.4537878181527134e-05, + "loss": 1.1061, + "num_input_tokens_seen": 137704496, "step": 254500 }, { - "epoch": 6.1215671211830225, - "grad_norm": 6.7411651611328125, - "learning_rate": 1.9392284424812753e-05, - "loss": 1.5401, - "num_input_tokens_seen": 137660421, + "epoch": 2.551224587802145, + "grad_norm": 6.855249881744385, + "learning_rate": 2.44878541700016e-05, + "loss": 1.1147, + "num_input_tokens_seen": 137973064, "step": 255000 }, { - "epoch": 6.133570193969656, - "grad_norm": 6.217577934265137, - "learning_rate": 1.9332269060879585e-05, - "loss": 1.5279, - "num_input_tokens_seen": 137935861, + "epoch": 2.5562269889546982, + "grad_norm": 6.485804080963135, + "learning_rate": 2.443783015847607e-05, + "loss": 1.1081, + "num_input_tokens_seen": 138245472, "step": 255500 }, { - "epoch": 6.14557326675629, - "grad_norm": 8.530903816223145, - "learning_rate": 1.9272253696946417e-05, - "loss": 1.529, - "num_input_tokens_seen": 138207373, + "epoch": 2.5612293901072514, + "grad_norm": 5.901826858520508, + "learning_rate": 2.4387806146950537e-05, + "loss": 1.1316, + "num_input_tokens_seen": 138521368, "step": 256000 }, { - "epoch": 6.157576339542923, - "grad_norm": 6.678954124450684, - "learning_rate": 1.9212238333013252e-05, - "loss": 1.5373, - "num_input_tokens_seen": 138480229, + "epoch": 2.5662317912598045, + "grad_norm": 10.232002258300781, + "learning_rate": 2.4337782135425007e-05, + "loss": 1.1237, + "num_input_tokens_seen": 138788592, "step": 256500 }, { - "epoch": 6.169579412329556, - "grad_norm": 5.037346363067627, - "learning_rate": 1.9152222969080084e-05, - "loss": 1.5401, - "num_input_tokens_seen": 138751957, + "epoch": 2.571234192412358, + "grad_norm": 6.8045148849487305, + "learning_rate": 2.4287758123899473e-05, + "loss": 1.1205, + "num_input_tokens_seen": 139062416, "step": 257000 }, { - "epoch": 6.18158248511619, - "grad_norm": 7.43416690826416, - "learning_rate": 1.909220760514692e-05, - "loss": 1.5507, - "num_input_tokens_seen": 139024925, + "epoch": 2.576236593564911, + "grad_norm": 6.035918712615967, + "learning_rate": 2.4237734112373943e-05, + "loss": 1.1063, + "num_input_tokens_seen": 139341400, "step": 257500 }, { - "epoch": 6.193585557902823, - "grad_norm": 7.292538642883301, - "learning_rate": 1.9032192241213752e-05, - "loss": 1.5567, - "num_input_tokens_seen": 139295821, + "epoch": 2.5812389947174643, + "grad_norm": 6.652617454528809, + "learning_rate": 2.418771010084841e-05, + "loss": 1.1018, + "num_input_tokens_seen": 139618008, "step": 258000 }, { - "epoch": 6.205588630689457, - "grad_norm": 5.848760604858398, - "learning_rate": 1.8972176877280584e-05, - "loss": 1.5433, - "num_input_tokens_seen": 139564493, + "epoch": 2.5862413958700174, + "grad_norm": 5.396528720855713, + "learning_rate": 2.4137686089322876e-05, + "loss": 1.1036, + "num_input_tokens_seen": 139893680, "step": 258500 }, { - "epoch": 6.2175917034760895, - "grad_norm": 6.4433722496032715, - "learning_rate": 1.891216151334742e-05, - "loss": 1.5404, - "num_input_tokens_seen": 139834997, + "epoch": 2.591243797022571, + "grad_norm": 7.620987415313721, + "learning_rate": 2.4087662077797343e-05, + "loss": 1.1127, + "num_input_tokens_seen": 140161304, "step": 259000 }, { - "epoch": 6.229594776262723, - "grad_norm": 6.394519805908203, - "learning_rate": 1.885214614941425e-05, - "loss": 1.541, - "num_input_tokens_seen": 140105909, + "epoch": 2.596246198175124, + "grad_norm": 6.9869279861450195, + "learning_rate": 2.4037638066271813e-05, + "loss": 1.115, + "num_input_tokens_seen": 140437008, "step": 259500 }, { - "epoch": 6.241597849049357, - "grad_norm": 6.207643508911133, - "learning_rate": 1.8792130785481084e-05, - "loss": 1.5418, - "num_input_tokens_seen": 140371037, + "epoch": 2.6012485993276773, + "grad_norm": 6.20002555847168, + "learning_rate": 2.398761405474628e-05, + "loss": 1.1184, + "num_input_tokens_seen": 140703800, "step": 260000 }, { - "epoch": 6.25360092183599, - "grad_norm": 5.81545352935791, - "learning_rate": 1.8732115421547916e-05, - "loss": 1.5406, - "num_input_tokens_seen": 140636965, + "epoch": 2.6062510004802304, + "grad_norm": 5.8140974044799805, + "learning_rate": 2.393759004322075e-05, + "loss": 1.1048, + "num_input_tokens_seen": 140976992, "step": 260500 }, { - "epoch": 6.265603994622623, - "grad_norm": 5.554136753082275, - "learning_rate": 1.867210005761475e-05, - "loss": 1.5349, - "num_input_tokens_seen": 140907253, + "epoch": 2.6112534016327835, + "grad_norm": 6.32145357131958, + "learning_rate": 2.3887566031695216e-05, + "loss": 1.1023, + "num_input_tokens_seen": 141245264, "step": 261000 }, { - "epoch": 6.277607067409257, - "grad_norm": 7.309840679168701, - "learning_rate": 1.8612084693681583e-05, - "loss": 1.5547, - "num_input_tokens_seen": 141174837, + "epoch": 2.616255802785337, + "grad_norm": 6.455646991729736, + "learning_rate": 2.3837542020169682e-05, + "loss": 1.107, + "num_input_tokens_seen": 141517384, "step": 261500 }, { - "epoch": 6.28961014019589, - "grad_norm": 6.695976734161377, - "learning_rate": 1.855206932974842e-05, - "loss": 1.5438, - "num_input_tokens_seen": 141439621, + "epoch": 2.62125820393789, + "grad_norm": 6.573545455932617, + "learning_rate": 2.378751800864415e-05, + "loss": 1.1047, + "num_input_tokens_seen": 141791208, "step": 262000 }, { - "epoch": 6.301613212982524, - "grad_norm": 6.354114055633545, - "learning_rate": 1.849205396581525e-05, - "loss": 1.5467, - "num_input_tokens_seen": 141705029, + "epoch": 2.6262606050904433, + "grad_norm": 9.841447830200195, + "learning_rate": 2.373749399711862e-05, + "loss": 1.0933, + "num_input_tokens_seen": 142065992, "step": 262500 }, { - "epoch": 6.313616285769157, - "grad_norm": 7.218320369720459, - "learning_rate": 1.8432038601882083e-05, - "loss": 1.5577, - "num_input_tokens_seen": 141973197, + "epoch": 2.631263006242997, + "grad_norm": 6.491105556488037, + "learning_rate": 2.3687469985593085e-05, + "loss": 1.1132, + "num_input_tokens_seen": 142333808, "step": 263000 }, { - "epoch": 6.32561935855579, - "grad_norm": 5.543737888336182, - "learning_rate": 1.8372023237948915e-05, - "loss": 1.5657, - "num_input_tokens_seen": 142245349, + "epoch": 2.63626540739555, + "grad_norm": 5.914114952087402, + "learning_rate": 2.3637445974067555e-05, + "loss": 1.1106, + "num_input_tokens_seen": 142602960, "step": 263500 }, { - "epoch": 6.337622431342424, - "grad_norm": 7.173905849456787, - "learning_rate": 1.831200787401575e-05, - "loss": 1.5485, - "num_input_tokens_seen": 142517133, + "epoch": 2.641267808548103, + "grad_norm": 5.9673261642456055, + "learning_rate": 2.3587421962542022e-05, + "loss": 1.1387, + "num_input_tokens_seen": 142870032, "step": 264000 }, { - "epoch": 6.349625504129057, - "grad_norm": 6.514544486999512, - "learning_rate": 1.8251992510082582e-05, - "loss": 1.5384, - "num_input_tokens_seen": 142784205, + "epoch": 2.6462702097006563, + "grad_norm": 6.37895393371582, + "learning_rate": 2.353739795101649e-05, + "loss": 1.1133, + "num_input_tokens_seen": 143142496, "step": 264500 }, { - "epoch": 6.36162857691569, - "grad_norm": 6.325170516967773, - "learning_rate": 1.8191977146149418e-05, - "loss": 1.5452, - "num_input_tokens_seen": 143052109, + "epoch": 2.6512726108532094, + "grad_norm": 6.1890692710876465, + "learning_rate": 2.3487373939490955e-05, + "loss": 1.1088, + "num_input_tokens_seen": 143420168, "step": 265000 }, { - "epoch": 6.3736316497023235, - "grad_norm": 6.427154064178467, - "learning_rate": 1.8131961782216247e-05, - "loss": 1.5551, - "num_input_tokens_seen": 143319565, + "epoch": 2.6562750120057625, + "grad_norm": 5.707185745239258, + "learning_rate": 2.3437349927965425e-05, + "loss": 1.1127, + "num_input_tokens_seen": 143692648, "step": 265500 }, { - "epoch": 6.385634722488957, - "grad_norm": 5.220027923583984, - "learning_rate": 1.8071946418283082e-05, - "loss": 1.5604, - "num_input_tokens_seen": 143584637, + "epoch": 2.661277413158316, + "grad_norm": 6.048717975616455, + "learning_rate": 2.338732591643989e-05, + "loss": 1.1068, + "num_input_tokens_seen": 143964856, "step": 266000 }, { - "epoch": 6.397637795275591, - "grad_norm": 6.929934024810791, - "learning_rate": 1.8011931054349914e-05, - "loss": 1.5637, - "num_input_tokens_seen": 143853173, + "epoch": 2.6662798143108692, + "grad_norm": 5.904679775238037, + "learning_rate": 2.333730190491436e-05, + "loss": 1.1039, + "num_input_tokens_seen": 144228744, "step": 266500 }, { - "epoch": 6.409640868062224, - "grad_norm": 5.886574745178223, - "learning_rate": 1.795191569041675e-05, - "loss": 1.5397, - "num_input_tokens_seen": 144123341, + "epoch": 2.6712822154634224, + "grad_norm": 6.36087703704834, + "learning_rate": 2.3287277893388828e-05, + "loss": 1.1197, + "num_input_tokens_seen": 144501872, "step": 267000 }, { - "epoch": 6.421643940848857, - "grad_norm": 5.858104705810547, - "learning_rate": 1.7891900326483578e-05, - "loss": 1.5378, - "num_input_tokens_seen": 144387925, + "epoch": 2.676284616615976, + "grad_norm": 5.9171576499938965, + "learning_rate": 2.3237253881863298e-05, + "loss": 1.1192, + "num_input_tokens_seen": 144768432, "step": 267500 }, { - "epoch": 6.433647013635491, - "grad_norm": 6.699682235717773, - "learning_rate": 1.7831884962550414e-05, - "loss": 1.5633, - "num_input_tokens_seen": 144659077, + "epoch": 2.681287017768529, + "grad_norm": 6.9919514656066895, + "learning_rate": 2.318722987033776e-05, + "loss": 1.1093, + "num_input_tokens_seen": 145035880, "step": 268000 }, { - "epoch": 6.445650086422124, - "grad_norm": 6.149874687194824, - "learning_rate": 1.7771869598617246e-05, - "loss": 1.5578, - "num_input_tokens_seen": 144930877, + "epoch": 2.686289418921082, + "grad_norm": 5.2417826652526855, + "learning_rate": 2.313720585881223e-05, + "loss": 1.1293, + "num_input_tokens_seen": 145311552, "step": 268500 }, { - "epoch": 6.457653159208758, - "grad_norm": 6.18446159362793, - "learning_rate": 1.771185423468408e-05, - "loss": 1.562, - "num_input_tokens_seen": 145195781, + "epoch": 2.6912918200736353, + "grad_norm": 5.52398681640625, + "learning_rate": 2.3087181847286697e-05, + "loss": 1.107, + "num_input_tokens_seen": 145584192, "step": 269000 }, { - "epoch": 6.4696562319953905, - "grad_norm": 6.286821365356445, - "learning_rate": 1.7651838870750913e-05, - "loss": 1.5605, - "num_input_tokens_seen": 145461845, + "epoch": 2.6962942212261884, + "grad_norm": 6.279477119445801, + "learning_rate": 2.3037157835761167e-05, + "loss": 1.0918, + "num_input_tokens_seen": 145854640, "step": 269500 }, { - "epoch": 6.481659304782024, - "grad_norm": 7.96785831451416, - "learning_rate": 1.7591823506817745e-05, - "loss": 1.5575, - "num_input_tokens_seen": 145733749, + "epoch": 2.7012966223787416, + "grad_norm": 8.50329303741455, + "learning_rate": 2.2987133824235634e-05, + "loss": 1.1232, + "num_input_tokens_seen": 146125568, "step": 270000 }, { - "epoch": 6.493662377568658, - "grad_norm": 5.531857967376709, - "learning_rate": 1.7531808142884577e-05, - "loss": 1.5576, - "num_input_tokens_seen": 146006637, + "epoch": 2.706299023531295, + "grad_norm": 7.494457721710205, + "learning_rate": 2.2937109812710104e-05, + "loss": 1.1178, + "num_input_tokens_seen": 146388376, "step": 270500 }, { - "epoch": 6.505665450355291, - "grad_norm": 6.168308734893799, - "learning_rate": 1.7471792778951413e-05, - "loss": 1.5626, - "num_input_tokens_seen": 146280565, + "epoch": 2.7113014246838483, + "grad_norm": 5.595491886138916, + "learning_rate": 2.2887085801184567e-05, + "loss": 1.1144, + "num_input_tokens_seen": 146655392, "step": 271000 }, { - "epoch": 6.517668523141924, - "grad_norm": 6.521546840667725, - "learning_rate": 1.7411777415018245e-05, - "loss": 1.5599, - "num_input_tokens_seen": 146550405, + "epoch": 2.7163038258364014, + "grad_norm": 7.584702968597412, + "learning_rate": 2.2837061789659037e-05, + "loss": 1.107, + "num_input_tokens_seen": 146920408, "step": 271500 }, { - "epoch": 6.529671595928558, - "grad_norm": 6.414017677307129, - "learning_rate": 1.735176205108508e-05, - "loss": 1.5614, - "num_input_tokens_seen": 146824541, + "epoch": 2.721306226988955, + "grad_norm": 5.952847957611084, + "learning_rate": 2.2787037778133504e-05, + "loss": 1.0989, + "num_input_tokens_seen": 147193680, "step": 272000 }, { - "epoch": 6.541674668715191, - "grad_norm": 6.40237283706665, - "learning_rate": 1.729174668715191e-05, - "loss": 1.5625, - "num_input_tokens_seen": 147091629, + "epoch": 2.726308628141508, + "grad_norm": 5.385768413543701, + "learning_rate": 2.2737013766607973e-05, + "loss": 1.1169, + "num_input_tokens_seen": 147465336, "step": 272500 }, { - "epoch": 6.553677741501825, - "grad_norm": 5.577338218688965, - "learning_rate": 1.7231731323218744e-05, - "loss": 1.5639, - "num_input_tokens_seen": 147362277, + "epoch": 2.731311029294061, + "grad_norm": 7.199370861053467, + "learning_rate": 2.268698975508244e-05, + "loss": 1.119, + "num_input_tokens_seen": 147733784, "step": 273000 }, { - "epoch": 6.5656808142884575, - "grad_norm": 7.223119735717773, - "learning_rate": 1.7171715959285577e-05, - "loss": 1.574, - "num_input_tokens_seen": 147637589, + "epoch": 2.7363134304466143, + "grad_norm": 6.557952880859375, + "learning_rate": 2.263696574355691e-05, + "loss": 1.0966, + "num_input_tokens_seen": 147998984, "step": 273500 }, { - "epoch": 6.577683887075091, - "grad_norm": 7.704106330871582, - "learning_rate": 1.7111700595352412e-05, - "loss": 1.5626, - "num_input_tokens_seen": 147912637, + "epoch": 2.7413158315991675, + "grad_norm": 6.291484355926514, + "learning_rate": 2.2586941732031373e-05, + "loss": 1.1039, + "num_input_tokens_seen": 148268168, "step": 274000 }, { - "epoch": 6.589686959861725, - "grad_norm": 7.144855499267578, - "learning_rate": 1.7051685231419244e-05, - "loss": 1.5629, - "num_input_tokens_seen": 148183741, + "epoch": 2.7463182327517206, + "grad_norm": 5.747891426086426, + "learning_rate": 2.2536917720505843e-05, + "loss": 1.1006, + "num_input_tokens_seen": 148546456, "step": 274500 }, { - "epoch": 6.601690032648358, - "grad_norm": 6.267205715179443, - "learning_rate": 1.6991669867486076e-05, - "loss": 1.5524, - "num_input_tokens_seen": 148452317, + "epoch": 2.751320633904274, + "grad_norm": 5.766910552978516, + "learning_rate": 2.248689370898031e-05, + "loss": 1.1216, + "num_input_tokens_seen": 148815712, "step": 275000 }, { - "epoch": 6.613693105434991, - "grad_norm": 6.738354206085205, - "learning_rate": 1.693165450355291e-05, - "loss": 1.5605, - "num_input_tokens_seen": 148720421, + "epoch": 2.7563230350568273, + "grad_norm": 6.185927391052246, + "learning_rate": 2.243686969745478e-05, + "loss": 1.1007, + "num_input_tokens_seen": 149086416, "step": 275500 }, { - "epoch": 6.625696178221625, - "grad_norm": 5.9649882316589355, - "learning_rate": 1.6871639139619744e-05, - "loss": 1.5679, - "num_input_tokens_seen": 148990253, + "epoch": 2.7613254362093804, + "grad_norm": 7.301943778991699, + "learning_rate": 2.2386845685929246e-05, + "loss": 1.1182, + "num_input_tokens_seen": 149351152, "step": 276000 }, { - "epoch": 6.637699251008258, - "grad_norm": 7.056519031524658, - "learning_rate": 1.6811623775686576e-05, - "loss": 1.5607, - "num_input_tokens_seen": 149258861, + "epoch": 2.766327837361934, + "grad_norm": 4.440983295440674, + "learning_rate": 2.2336821674403716e-05, + "loss": 1.1125, + "num_input_tokens_seen": 149617360, "step": 276500 }, { - "epoch": 6.649702323794892, - "grad_norm": 7.6884260177612305, - "learning_rate": 1.6751608411753408e-05, - "loss": 1.562, - "num_input_tokens_seen": 149529549, + "epoch": 2.771330238514487, + "grad_norm": 6.778481483459473, + "learning_rate": 2.2286797662878183e-05, + "loss": 1.1108, + "num_input_tokens_seen": 149884296, "step": 277000 }, { - "epoch": 6.6617053965815245, - "grad_norm": 5.666858196258545, - "learning_rate": 1.6691593047820243e-05, - "loss": 1.5637, - "num_input_tokens_seen": 149797285, + "epoch": 2.7763326396670402, + "grad_norm": 6.671989440917969, + "learning_rate": 2.223677365135265e-05, + "loss": 1.0942, + "num_input_tokens_seen": 150155088, "step": 277500 }, { - "epoch": 6.673708469368158, - "grad_norm": 7.696553707122803, - "learning_rate": 1.6631577683887075e-05, - "loss": 1.5827, - "num_input_tokens_seen": 150067189, + "epoch": 2.7813350408195934, + "grad_norm": 6.532144069671631, + "learning_rate": 2.218674963982712e-05, + "loss": 1.1102, + "num_input_tokens_seen": 150424504, "step": 278000 }, { - "epoch": 6.685711542154792, - "grad_norm": 6.615331172943115, - "learning_rate": 1.657156231995391e-05, - "loss": 1.5713, - "num_input_tokens_seen": 150333445, + "epoch": 2.7863374419721465, + "grad_norm": 7.665340423583984, + "learning_rate": 2.2136725628301586e-05, + "loss": 1.0995, + "num_input_tokens_seen": 150697200, "step": 278500 }, { - "epoch": 6.697714614941425, - "grad_norm": 5.72809362411499, - "learning_rate": 1.6511546956020743e-05, - "loss": 1.5743, - "num_input_tokens_seen": 150604397, + "epoch": 2.7913398431246996, + "grad_norm": 8.809953689575195, + "learning_rate": 2.2086701616776052e-05, + "loss": 1.0934, + "num_input_tokens_seen": 150963832, "step": 279000 }, { - "epoch": 6.709717687728059, - "grad_norm": 7.209686279296875, - "learning_rate": 1.6451531592087575e-05, - "loss": 1.5471, - "num_input_tokens_seen": 150873709, + "epoch": 2.796342244277253, + "grad_norm": 6.865957260131836, + "learning_rate": 2.2036677605250522e-05, + "loss": 1.0941, + "num_input_tokens_seen": 151236880, "step": 279500 }, { - "epoch": 6.721720760514692, - "grad_norm": 6.294345855712891, - "learning_rate": 1.6391516228154407e-05, - "loss": 1.5625, - "num_input_tokens_seen": 151141645, + "epoch": 2.8013446454298063, + "grad_norm": 8.230210304260254, + "learning_rate": 2.198665359372499e-05, + "loss": 1.105, + "num_input_tokens_seen": 151510320, "step": 280000 }, { - "epoch": 6.733723833301325, - "grad_norm": 6.413891315460205, - "learning_rate": 1.6331500864221242e-05, - "loss": 1.5677, - "num_input_tokens_seen": 151415205, + "epoch": 2.8063470465823595, + "grad_norm": 5.514502048492432, + "learning_rate": 2.1936629582199455e-05, + "loss": 1.1129, + "num_input_tokens_seen": 151773160, "step": 280500 }, { - "epoch": 6.745726906087959, - "grad_norm": 7.2737836837768555, - "learning_rate": 1.6271485500288075e-05, - "loss": 1.569, - "num_input_tokens_seen": 151688301, + "epoch": 2.811349447734913, + "grad_norm": 6.41658353805542, + "learning_rate": 2.1886605570673925e-05, + "loss": 1.1234, + "num_input_tokens_seen": 152043968, "step": 281000 }, { - "epoch": 6.7577299788745915, - "grad_norm": 5.877536296844482, - "learning_rate": 1.621147013635491e-05, - "loss": 1.5541, - "num_input_tokens_seen": 151959293, + "epoch": 2.816351848887466, + "grad_norm": 10.474380493164062, + "learning_rate": 2.183658155914839e-05, + "loss": 1.1109, + "num_input_tokens_seen": 152312528, "step": 281500 }, { - "epoch": 6.769733051661225, - "grad_norm": 7.197984218597412, - "learning_rate": 1.615145477242174e-05, - "loss": 1.5748, - "num_input_tokens_seen": 152232509, + "epoch": 2.8213542500400193, + "grad_norm": 6.710339069366455, + "learning_rate": 2.178655754762286e-05, + "loss": 1.1007, + "num_input_tokens_seen": 152583128, "step": 282000 }, { - "epoch": 6.781736124447859, - "grad_norm": 7.301248550415039, - "learning_rate": 1.6091439408488574e-05, - "loss": 1.5727, - "num_input_tokens_seen": 152502573, + "epoch": 2.8263566511925724, + "grad_norm": 6.992675304412842, + "learning_rate": 2.1736533536097328e-05, + "loss": 1.0936, + "num_input_tokens_seen": 152859696, "step": 282500 }, { - "epoch": 6.793739197234492, - "grad_norm": 6.984141826629639, - "learning_rate": 1.6031424044555406e-05, - "loss": 1.5724, - "num_input_tokens_seen": 152772765, + "epoch": 2.8313590523451255, + "grad_norm": 5.590021133422852, + "learning_rate": 2.1686509524571795e-05, + "loss": 1.1047, + "num_input_tokens_seen": 153129472, "step": 283000 }, { - "epoch": 6.805742270021126, - "grad_norm": 5.9413604736328125, - "learning_rate": 1.597140868062224e-05, - "loss": 1.5901, - "num_input_tokens_seen": 153043109, + "epoch": 2.8363614534976787, + "grad_norm": 5.853962421417236, + "learning_rate": 2.1636485513046265e-05, + "loss": 1.116, + "num_input_tokens_seen": 153397560, "step": 283500 }, { - "epoch": 6.817745342807759, - "grad_norm": 5.484622001647949, - "learning_rate": 1.5911393316689074e-05, - "loss": 1.5698, - "num_input_tokens_seen": 153309957, + "epoch": 2.8413638546502322, + "grad_norm": 5.7029242515563965, + "learning_rate": 2.158646150152073e-05, + "loss": 1.1153, + "num_input_tokens_seen": 153671176, "step": 284000 }, { - "epoch": 6.829748415594392, - "grad_norm": 5.288211345672607, - "learning_rate": 1.5851377952755906e-05, - "loss": 1.56, - "num_input_tokens_seen": 153581109, + "epoch": 2.8463662558027853, + "grad_norm": 6.952505111694336, + "learning_rate": 2.1536437489995198e-05, + "loss": 1.1153, + "num_input_tokens_seen": 153937560, "step": 284500 }, { - "epoch": 6.841751488381026, - "grad_norm": 6.08894157409668, - "learning_rate": 1.5791362588822738e-05, - "loss": 1.5451, - "num_input_tokens_seen": 153850805, + "epoch": 2.8513686569553385, + "grad_norm": 5.1992902755737305, + "learning_rate": 2.1486413478469668e-05, + "loss": 1.0892, + "num_input_tokens_seen": 154210728, "step": 285000 }, { - "epoch": 6.8537545611676585, - "grad_norm": 7.608197212219238, - "learning_rate": 1.5731347224889573e-05, - "loss": 1.5716, - "num_input_tokens_seen": 154122253, + "epoch": 2.856371058107892, + "grad_norm": 5.967268943786621, + "learning_rate": 2.1436389466944134e-05, + "loss": 1.0971, + "num_input_tokens_seen": 154484848, "step": 285500 }, { - "epoch": 6.865757633954292, - "grad_norm": 6.002559661865234, - "learning_rate": 1.5671331860956405e-05, - "loss": 1.5723, - "num_input_tokens_seen": 154392053, + "epoch": 2.861373459260445, + "grad_norm": 7.573243618011475, + "learning_rate": 2.1386365455418604e-05, + "loss": 1.1015, + "num_input_tokens_seen": 154756800, "step": 286000 }, { - "epoch": 6.877760706740926, - "grad_norm": 7.694547176361084, - "learning_rate": 1.561131649702324e-05, - "loss": 1.5746, - "num_input_tokens_seen": 154660181, + "epoch": 2.8663758604129983, + "grad_norm": 6.0880584716796875, + "learning_rate": 2.133634144389307e-05, + "loss": 1.0941, + "num_input_tokens_seen": 155027568, "step": 286500 }, { - "epoch": 6.889763779527559, - "grad_norm": 6.127479076385498, - "learning_rate": 1.555130113309007e-05, - "loss": 1.585, - "num_input_tokens_seen": 154931237, + "epoch": 2.8713782615655514, + "grad_norm": 8.533178329467773, + "learning_rate": 2.1286317432367537e-05, + "loss": 1.0951, + "num_input_tokens_seen": 155296992, "step": 287000 }, { - "epoch": 6.901766852314193, - "grad_norm": 7.878530025482178, - "learning_rate": 1.5491285769156905e-05, - "loss": 1.551, - "num_input_tokens_seen": 155205397, + "epoch": 2.8763806627181046, + "grad_norm": 6.032339096069336, + "learning_rate": 2.1236293420842004e-05, + "loss": 1.1088, + "num_input_tokens_seen": 155565816, "step": 287500 }, { - "epoch": 6.9137699251008256, - "grad_norm": 6.450705528259277, - "learning_rate": 1.5431270405223737e-05, - "loss": 1.5852, - "num_input_tokens_seen": 155474189, + "epoch": 2.8813830638706577, + "grad_norm": 7.005359649658203, + "learning_rate": 2.1186269409316474e-05, + "loss": 1.0966, + "num_input_tokens_seen": 155835736, "step": 288000 }, { - "epoch": 6.925772997887459, - "grad_norm": 6.806494235992432, - "learning_rate": 1.5371255041290573e-05, - "loss": 1.5679, - "num_input_tokens_seen": 155745005, + "epoch": 2.8863854650232112, + "grad_norm": 6.709108829498291, + "learning_rate": 2.113624539779094e-05, + "loss": 1.0999, + "num_input_tokens_seen": 156105320, "step": 288500 }, { - "epoch": 6.937776070674093, - "grad_norm": 5.971404075622559, - "learning_rate": 1.5311239677357405e-05, - "loss": 1.5501, - "num_input_tokens_seen": 156010093, + "epoch": 2.8913878661757644, + "grad_norm": 6.140367031097412, + "learning_rate": 2.108622138626541e-05, + "loss": 1.0919, + "num_input_tokens_seen": 156373336, "step": 289000 }, { - "epoch": 6.9497791434607255, - "grad_norm": 8.19913101196289, - "learning_rate": 1.5251224313424237e-05, - "loss": 1.5746, - "num_input_tokens_seen": 156281093, + "epoch": 2.8963902673283175, + "grad_norm": 6.799286365509033, + "learning_rate": 2.1036197374739877e-05, + "loss": 1.0977, + "num_input_tokens_seen": 156645480, "step": 289500 }, { - "epoch": 6.961782216247359, - "grad_norm": 6.947600841522217, - "learning_rate": 1.519120894949107e-05, - "loss": 1.5779, - "num_input_tokens_seen": 156550349, + "epoch": 2.901392668480871, + "grad_norm": 7.2591023445129395, + "learning_rate": 2.0986173363214343e-05, + "loss": 1.1045, + "num_input_tokens_seen": 156919152, "step": 290000 }, { - "epoch": 6.973785289033993, - "grad_norm": 5.614461421966553, - "learning_rate": 1.5131193585557904e-05, - "loss": 1.579, - "num_input_tokens_seen": 156819397, + "epoch": 2.906395069633424, + "grad_norm": 6.572688102722168, + "learning_rate": 2.093614935168881e-05, + "loss": 1.0954, + "num_input_tokens_seen": 157189944, "step": 290500 }, { - "epoch": 6.985788361820626, - "grad_norm": 5.388101577758789, - "learning_rate": 1.5071178221624738e-05, - "loss": 1.5704, - "num_input_tokens_seen": 157096093, + "epoch": 2.9113974707859773, + "grad_norm": 8.598788261413574, + "learning_rate": 2.088612534016328e-05, + "loss": 1.1201, + "num_input_tokens_seen": 157462520, "step": 291000 }, { - "epoch": 6.99779143460726, - "grad_norm": 5.86797571182251, - "learning_rate": 1.5011162857691568e-05, - "loss": 1.583, - "num_input_tokens_seen": 157366509, + "epoch": 2.9163998719385305, + "grad_norm": 7.680613994598389, + "learning_rate": 2.0836101328637746e-05, + "loss": 1.0966, + "num_input_tokens_seen": 157734256, "step": 291500 }, { - "epoch": 7.0, - "eval_loss": 1.8540226221084595, - "eval_runtime": 78.1514, - "eval_samples_per_second": 1066.02, - "eval_steps_per_second": 133.254, - "num_input_tokens_seen": 157415170, - "step": 291592 - }, - { - "epoch": 7.0097945073938925, - "grad_norm": 6.504717826843262, - "learning_rate": 1.4951147493758402e-05, - "loss": 1.4969, - "num_input_tokens_seen": 157635162, + "epoch": 2.9214022730910836, + "grad_norm": 7.925107479095459, + "learning_rate": 2.0786077317112216e-05, + "loss": 1.0918, + "num_input_tokens_seen": 158006528, "step": 292000 }, { - "epoch": 7.021797580180526, - "grad_norm": 7.135445594787598, - "learning_rate": 1.4891132129825236e-05, - "loss": 1.4769, - "num_input_tokens_seen": 157901970, + "epoch": 2.9264046742436367, + "grad_norm": 5.118693828582764, + "learning_rate": 2.0736053305586683e-05, + "loss": 1.0932, + "num_input_tokens_seen": 158275552, "step": 292500 }, { - "epoch": 7.03380065296716, - "grad_norm": 7.115129470825195, - "learning_rate": 1.483111676589207e-05, - "loss": 1.4714, - "num_input_tokens_seen": 158169178, + "epoch": 2.9314070753961903, + "grad_norm": 4.836045265197754, + "learning_rate": 2.0686029294061153e-05, + "loss": 1.0973, + "num_input_tokens_seen": 158547424, "step": 293000 }, { - "epoch": 7.045803725753793, - "grad_norm": 6.037563800811768, - "learning_rate": 1.4771101401958903e-05, - "loss": 1.4745, - "num_input_tokens_seen": 158442418, + "epoch": 2.9364094765487434, + "grad_norm": 5.422683238983154, + "learning_rate": 2.0636005282535616e-05, + "loss": 1.1073, + "num_input_tokens_seen": 158818568, "step": 293500 }, { - "epoch": 7.057806798540426, - "grad_norm": 5.890059471130371, - "learning_rate": 1.4711086038025734e-05, - "loss": 1.4806, - "num_input_tokens_seen": 158714770, + "epoch": 2.9414118777012965, + "grad_norm": 6.608382225036621, + "learning_rate": 2.0585981271010086e-05, + "loss": 1.0836, + "num_input_tokens_seen": 159088240, "step": 294000 }, { - "epoch": 7.06980987132706, - "grad_norm": 5.541204452514648, - "learning_rate": 1.4651070674092567e-05, - "loss": 1.4521, - "num_input_tokens_seen": 158985066, + "epoch": 2.94641427885385, + "grad_norm": 5.50745153427124, + "learning_rate": 2.0535957259484552e-05, + "loss": 1.1019, + "num_input_tokens_seen": 159356336, "step": 294500 }, { - "epoch": 7.081812944113693, - "grad_norm": 5.360498428344727, - "learning_rate": 1.4591055310159401e-05, - "loss": 1.4741, - "num_input_tokens_seen": 159252010, + "epoch": 2.9514166800064032, + "grad_norm": 5.7239251136779785, + "learning_rate": 2.0485933247959022e-05, + "loss": 1.0824, + "num_input_tokens_seen": 159630264, "step": 295000 }, { - "epoch": 7.093816016900327, - "grad_norm": 6.16367769241333, - "learning_rate": 1.4531039946226235e-05, - "loss": 1.4571, - "num_input_tokens_seen": 159520466, + "epoch": 2.9564190811589564, + "grad_norm": 6.992796421051025, + "learning_rate": 2.043590923643349e-05, + "loss": 1.1014, + "num_input_tokens_seen": 159892336, "step": 295500 }, { - "epoch": 7.1058190896869595, - "grad_norm": 5.670065402984619, - "learning_rate": 1.4471024582293069e-05, - "loss": 1.4682, - "num_input_tokens_seen": 159792554, + "epoch": 2.9614214823115095, + "grad_norm": 6.0249433517456055, + "learning_rate": 2.038588522490796e-05, + "loss": 1.0831, + "num_input_tokens_seen": 160159512, "step": 296000 }, { - "epoch": 7.117822162473593, - "grad_norm": 7.196038722991943, - "learning_rate": 1.44110092183599e-05, - "loss": 1.4918, - "num_input_tokens_seen": 160063042, + "epoch": 2.9664238834640626, + "grad_norm": 5.429805755615234, + "learning_rate": 2.0335861213382422e-05, + "loss": 1.0991, + "num_input_tokens_seen": 160423896, "step": 296500 }, { - "epoch": 7.129825235260227, - "grad_norm": 5.897183895111084, - "learning_rate": 1.4350993854426733e-05, - "loss": 1.4715, - "num_input_tokens_seen": 160334234, + "epoch": 2.9714262846166157, + "grad_norm": 7.0117034912109375, + "learning_rate": 2.0285837201856892e-05, + "loss": 1.0682, + "num_input_tokens_seen": 160699992, "step": 297000 }, { - "epoch": 7.14182830804686, - "grad_norm": 7.0402069091796875, - "learning_rate": 1.4290978490493567e-05, - "loss": 1.4658, - "num_input_tokens_seen": 160597754, + "epoch": 2.9764286857691693, + "grad_norm": 4.545111179351807, + "learning_rate": 2.023581319033136e-05, + "loss": 1.0976, + "num_input_tokens_seen": 160970504, "step": 297500 }, { - "epoch": 7.153831380833493, - "grad_norm": 6.796384334564209, - "learning_rate": 1.42309631265604e-05, - "loss": 1.4913, - "num_input_tokens_seen": 160864226, + "epoch": 2.9814310869217224, + "grad_norm": 7.641571998596191, + "learning_rate": 2.018578917880583e-05, + "loss": 1.1149, + "num_input_tokens_seen": 161244848, "step": 298000 }, { - "epoch": 7.165834453620127, - "grad_norm": 6.599667549133301, - "learning_rate": 1.4170947762627233e-05, - "loss": 1.4762, - "num_input_tokens_seen": 161133522, + "epoch": 2.9864334880742756, + "grad_norm": 5.6191205978393555, + "learning_rate": 2.0135765167280295e-05, + "loss": 1.0841, + "num_input_tokens_seen": 161521312, "step": 298500 }, { - "epoch": 7.17783752640676, - "grad_norm": 5.711678504943848, - "learning_rate": 1.4110932398694066e-05, - "loss": 1.4707, - "num_input_tokens_seen": 161399354, + "epoch": 2.991435889226829, + "grad_norm": 7.104705810546875, + "learning_rate": 2.0085741155754765e-05, + "loss": 1.1083, + "num_input_tokens_seen": 161787136, "step": 299000 }, { - "epoch": 7.189840599193394, - "grad_norm": 5.771945476531982, - "learning_rate": 1.40509170347609e-05, - "loss": 1.4822, - "num_input_tokens_seen": 161670186, + "epoch": 2.9964382903793823, + "grad_norm": 7.319199085235596, + "learning_rate": 2.0035717144229228e-05, + "loss": 1.0933, + "num_input_tokens_seen": 162058632, "step": 299500 }, { - "epoch": 7.2018436719800265, - "grad_norm": 7.326181888580322, - "learning_rate": 1.3990901670827732e-05, - "loss": 1.4799, - "num_input_tokens_seen": 161940154, + "epoch": 3.0, + "eval_loss": 1.0591504573822021, + "eval_runtime": 193.2048, + "eval_samples_per_second": 1034.679, + "eval_steps_per_second": 129.339, + "num_input_tokens_seen": 162248288, + "step": 299856 + }, + { + "epoch": 3.0014406915319354, + "grad_norm": 6.5569539070129395, + "learning_rate": 1.9985693132703698e-05, + "loss": 1.0801, + "num_input_tokens_seen": 162329952, "step": 300000 }, { - "epoch": 7.21384674476666, - "grad_norm": 6.072440147399902, - "learning_rate": 1.3930886306894566e-05, - "loss": 1.484, - "num_input_tokens_seen": 162207258, + "epoch": 3.0064430926844885, + "grad_norm": 4.675987720489502, + "learning_rate": 1.9935669121178165e-05, + "loss": 0.9946, + "num_input_tokens_seen": 162606936, "step": 300500 }, { - "epoch": 7.225849817553294, - "grad_norm": 6.544903755187988, - "learning_rate": 1.3870870942961398e-05, - "loss": 1.4801, - "num_input_tokens_seen": 162471442, + "epoch": 3.0114454938370416, + "grad_norm": 9.786909103393555, + "learning_rate": 1.9885645109652635e-05, + "loss": 1.0173, + "num_input_tokens_seen": 162883112, "step": 301000 }, { - "epoch": 7.237852890339927, - "grad_norm": 6.239735126495361, - "learning_rate": 1.3810855579028232e-05, - "loss": 1.4911, - "num_input_tokens_seen": 162742186, + "epoch": 3.016447894989595, + "grad_norm": 7.118892669677734, + "learning_rate": 1.98356210981271e-05, + "loss": 0.9956, + "num_input_tokens_seen": 163158976, "step": 301500 }, { - "epoch": 7.24985596312656, - "grad_norm": 6.918200969696045, - "learning_rate": 1.3750840215095065e-05, - "loss": 1.4885, - "num_input_tokens_seen": 163016658, + "epoch": 3.0214502961421483, + "grad_norm": 4.8414411544799805, + "learning_rate": 1.978559708660157e-05, + "loss": 0.9909, + "num_input_tokens_seen": 163431480, "step": 302000 }, { - "epoch": 7.261859035913194, - "grad_norm": 6.806629657745361, - "learning_rate": 1.36908248511619e-05, - "loss": 1.4836, - "num_input_tokens_seen": 163289242, + "epoch": 3.0264526972947015, + "grad_norm": 6.550401210784912, + "learning_rate": 1.9735573075076038e-05, + "loss": 1.0088, + "num_input_tokens_seen": 163705656, "step": 302500 }, { - "epoch": 7.273862108699827, - "grad_norm": 6.6947736740112305, - "learning_rate": 1.3630809487228733e-05, - "loss": 1.5068, - "num_input_tokens_seen": 163559754, + "epoch": 3.0314550984472546, + "grad_norm": 6.2179694175720215, + "learning_rate": 1.9685549063550504e-05, + "loss": 1.0046, + "num_input_tokens_seen": 163978520, "step": 303000 }, { - "epoch": 7.285865181486461, - "grad_norm": 6.7742018699646, - "learning_rate": 1.3570794123295563e-05, - "loss": 1.4956, - "num_input_tokens_seen": 163827674, + "epoch": 3.0364574995998077, + "grad_norm": 5.6524224281311035, + "learning_rate": 1.963552505202497e-05, + "loss": 0.9962, + "num_input_tokens_seen": 164246424, "step": 303500 }, { - "epoch": 7.297868254273094, - "grad_norm": 9.125452995300293, - "learning_rate": 1.3510778759362397e-05, - "loss": 1.4908, - "num_input_tokens_seen": 164095554, + "epoch": 3.0414599007523613, + "grad_norm": 6.216259479522705, + "learning_rate": 1.958550104049944e-05, + "loss": 1.0032, + "num_input_tokens_seen": 164520760, "step": 304000 }, { - "epoch": 7.309871327059727, - "grad_norm": 6.615844249725342, - "learning_rate": 1.3450763395429231e-05, - "loss": 1.4761, - "num_input_tokens_seen": 164361210, + "epoch": 3.0464623019049144, + "grad_norm": 5.4327311515808105, + "learning_rate": 1.9535477028973907e-05, + "loss": 0.9958, + "num_input_tokens_seen": 164791976, "step": 304500 }, { - "epoch": 7.321874399846361, - "grad_norm": 6.755634307861328, - "learning_rate": 1.3390748031496065e-05, - "loss": 1.4859, - "num_input_tokens_seen": 164634202, + "epoch": 3.0514647030574675, + "grad_norm": 6.64623498916626, + "learning_rate": 1.9485453017448377e-05, + "loss": 1.0027, + "num_input_tokens_seen": 165064512, "step": 305000 }, { - "epoch": 7.333877472632994, - "grad_norm": 6.546878337860107, - "learning_rate": 1.3330732667562898e-05, - "loss": 1.4854, - "num_input_tokens_seen": 164901770, + "epoch": 3.0564671042100207, + "grad_norm": 5.067431449890137, + "learning_rate": 1.9435429005922844e-05, + "loss": 1.0095, + "num_input_tokens_seen": 165335384, "step": 305500 }, { - "epoch": 7.345880545419627, - "grad_norm": 5.669297218322754, - "learning_rate": 1.3270717303629729e-05, - "loss": 1.4931, - "num_input_tokens_seen": 165172866, + "epoch": 3.0614695053625742, + "grad_norm": 5.332586765289307, + "learning_rate": 1.938540499439731e-05, + "loss": 1.0001, + "num_input_tokens_seen": 165600752, "step": 306000 }, { - "epoch": 7.357883618206261, - "grad_norm": 6.612642765045166, - "learning_rate": 1.3210701939696563e-05, - "loss": 1.4983, - "num_input_tokens_seen": 165441466, + "epoch": 3.0664719065151274, + "grad_norm": 6.432159900665283, + "learning_rate": 1.933538098287178e-05, + "loss": 0.9977, + "num_input_tokens_seen": 165870160, "step": 306500 }, { - "epoch": 7.369886690992894, - "grad_norm": 6.783679008483887, - "learning_rate": 1.3150686575763396e-05, - "loss": 1.4916, - "num_input_tokens_seen": 165710690, + "epoch": 3.0714743076676805, + "grad_norm": 6.297356605529785, + "learning_rate": 1.9285356971346247e-05, + "loss": 0.9981, + "num_input_tokens_seen": 166137752, "step": 307000 }, { - "epoch": 7.381889763779528, - "grad_norm": 5.914409160614014, - "learning_rate": 1.309067121183023e-05, - "loss": 1.4891, - "num_input_tokens_seen": 165980522, + "epoch": 3.0764767088202336, + "grad_norm": 6.82805871963501, + "learning_rate": 1.9235332959820717e-05, + "loss": 1.008, + "num_input_tokens_seen": 166409344, "step": 307500 }, { - "epoch": 7.393892836566161, - "grad_norm": 7.80264949798584, - "learning_rate": 1.3030655847897064e-05, - "loss": 1.5073, - "num_input_tokens_seen": 166247338, + "epoch": 3.0814791099727867, + "grad_norm": 5.371485710144043, + "learning_rate": 1.9185308948295183e-05, + "loss": 1.0077, + "num_input_tokens_seen": 166679600, "step": 308000 }, { - "epoch": 7.405895909352794, - "grad_norm": 6.48394775390625, - "learning_rate": 1.2970640483963894e-05, - "loss": 1.5093, - "num_input_tokens_seen": 166511898, + "epoch": 3.0864815111253403, + "grad_norm": 5.552392482757568, + "learning_rate": 1.913528493676965e-05, + "loss": 1.0154, + "num_input_tokens_seen": 166951960, "step": 308500 }, { - "epoch": 7.417898982139428, - "grad_norm": 6.138946056365967, - "learning_rate": 1.2910625120030728e-05, - "loss": 1.496, - "num_input_tokens_seen": 166784922, + "epoch": 3.0914839122778934, + "grad_norm": 5.485569953918457, + "learning_rate": 1.908526092524412e-05, + "loss": 1.0183, + "num_input_tokens_seen": 167229768, "step": 309000 }, { - "epoch": 7.429902054926061, - "grad_norm": 6.478691101074219, - "learning_rate": 1.2850609756097562e-05, - "loss": 1.5086, - "num_input_tokens_seen": 167052770, + "epoch": 3.0964863134304466, + "grad_norm": 7.161227226257324, + "learning_rate": 1.9035236913718586e-05, + "loss": 0.9953, + "num_input_tokens_seen": 167500816, "step": 309500 }, { - "epoch": 7.441905127712695, - "grad_norm": 6.674640655517578, - "learning_rate": 1.2790594392164396e-05, - "loss": 1.4949, - "num_input_tokens_seen": 167316786, + "epoch": 3.1014887145829997, + "grad_norm": 6.685337543487549, + "learning_rate": 1.8985212902193053e-05, + "loss": 1.0126, + "num_input_tokens_seen": 167761944, "step": 310000 }, { - "epoch": 7.453908200499328, - "grad_norm": 6.283424377441406, - "learning_rate": 1.2730579028231226e-05, - "loss": 1.5092, - "num_input_tokens_seen": 167588522, + "epoch": 3.1064911157355533, + "grad_norm": 7.007294178009033, + "learning_rate": 1.8935188890667523e-05, + "loss": 1.0168, + "num_input_tokens_seen": 168031200, "step": 310500 }, { - "epoch": 7.465911273285961, - "grad_norm": 7.709399223327637, - "learning_rate": 1.267056366429806e-05, - "loss": 1.4984, - "num_input_tokens_seen": 167854274, + "epoch": 3.1114935168881064, + "grad_norm": 4.6598615646362305, + "learning_rate": 1.888516487914199e-05, + "loss": 0.9972, + "num_input_tokens_seen": 168306704, "step": 311000 }, { - "epoch": 7.477914346072595, - "grad_norm": 7.502775192260742, - "learning_rate": 1.2610548300364893e-05, - "loss": 1.5158, - "num_input_tokens_seen": 168125602, + "epoch": 3.1164959180406595, + "grad_norm": 6.378694534301758, + "learning_rate": 1.883514086761646e-05, + "loss": 1.0057, + "num_input_tokens_seen": 168578560, "step": 311500 }, { - "epoch": 7.489917418859228, - "grad_norm": 8.468284606933594, - "learning_rate": 1.2550532936431727e-05, - "loss": 1.5024, - "num_input_tokens_seen": 168396914, + "epoch": 3.1214983191932126, + "grad_norm": 4.70497465133667, + "learning_rate": 1.8785116856090926e-05, + "loss": 0.9805, + "num_input_tokens_seen": 168857808, "step": 312000 }, { - "epoch": 7.501920491645861, - "grad_norm": 6.24788761138916, - "learning_rate": 1.249051757249856e-05, - "loss": 1.4941, - "num_input_tokens_seen": 168662218, + "epoch": 3.1265007203457658, + "grad_norm": 6.10917329788208, + "learning_rate": 1.8735092844565392e-05, + "loss": 1.0013, + "num_input_tokens_seen": 169130360, "step": 312500 }, { - "epoch": 7.513923564432495, - "grad_norm": 5.687014579772949, - "learning_rate": 1.2430502208565393e-05, - "loss": 1.4903, - "num_input_tokens_seen": 168931594, + "epoch": 3.1315031214983193, + "grad_norm": 5.804021835327148, + "learning_rate": 1.868506883303986e-05, + "loss": 1.0111, + "num_input_tokens_seen": 169403088, "step": 313000 }, { - "epoch": 7.525926637219128, - "grad_norm": 6.3434739112854, - "learning_rate": 1.2370486844632225e-05, - "loss": 1.504, - "num_input_tokens_seen": 169203474, + "epoch": 3.1365055226508725, + "grad_norm": 6.582799911499023, + "learning_rate": 1.863504482151433e-05, + "loss": 1.0181, + "num_input_tokens_seen": 169678360, "step": 313500 }, { - "epoch": 7.537929710005762, - "grad_norm": 7.105179786682129, - "learning_rate": 1.2310471480699059e-05, - "loss": 1.5048, - "num_input_tokens_seen": 169479274, + "epoch": 3.1415079238034256, + "grad_norm": 6.149540424346924, + "learning_rate": 1.8585020809988795e-05, + "loss": 1.0122, + "num_input_tokens_seen": 169944568, "step": 314000 }, { - "epoch": 7.5499327827923945, - "grad_norm": 6.754655361175537, - "learning_rate": 1.2250456116765893e-05, - "loss": 1.5021, - "num_input_tokens_seen": 169751738, + "epoch": 3.1465103249559787, + "grad_norm": 8.258193969726562, + "learning_rate": 1.8534996798463265e-05, + "loss": 0.9965, + "num_input_tokens_seen": 170213776, "step": 314500 }, { - "epoch": 7.561935855579028, - "grad_norm": 6.408166885375977, - "learning_rate": 1.2190440752832726e-05, - "loss": 1.4947, - "num_input_tokens_seen": 170022378, + "epoch": 3.1515127261085323, + "grad_norm": 5.790067195892334, + "learning_rate": 1.8484972786937732e-05, + "loss": 1.0083, + "num_input_tokens_seen": 170487696, "step": 315000 }, { - "epoch": 7.573938928365662, - "grad_norm": 7.029279708862305, - "learning_rate": 1.2130425388899558e-05, - "loss": 1.5004, - "num_input_tokens_seen": 170289026, + "epoch": 3.1565151272610854, + "grad_norm": 6.756849765777588, + "learning_rate": 1.84349487754122e-05, + "loss": 1.0112, + "num_input_tokens_seen": 170763864, "step": 315500 }, { - "epoch": 7.585942001152295, - "grad_norm": 6.276601791381836, - "learning_rate": 1.2070410024966392e-05, - "loss": 1.5208, - "num_input_tokens_seen": 170561498, + "epoch": 3.1615175284136385, + "grad_norm": 8.113907814025879, + "learning_rate": 1.8384924763886665e-05, + "loss": 1.0113, + "num_input_tokens_seen": 171034920, "step": 316000 }, { - "epoch": 7.597945073938928, - "grad_norm": 8.284005165100098, - "learning_rate": 1.2010394661033226e-05, - "loss": 1.5155, - "num_input_tokens_seen": 170833114, + "epoch": 3.1665199295661917, + "grad_norm": 6.584122180938721, + "learning_rate": 1.8334900752361135e-05, + "loss": 1.0071, + "num_input_tokens_seen": 171319328, "step": 316500 }, { - "epoch": 7.609948146725562, - "grad_norm": 6.610816955566406, - "learning_rate": 1.195037929710006e-05, - "loss": 1.4943, - "num_input_tokens_seen": 171105362, + "epoch": 3.1715223307187452, + "grad_norm": 7.648674964904785, + "learning_rate": 1.82848767408356e-05, + "loss": 1.0092, + "num_input_tokens_seen": 171587048, "step": 317000 }, { - "epoch": 7.621951219512195, - "grad_norm": 6.658823490142822, - "learning_rate": 1.1890363933166892e-05, - "loss": 1.5132, - "num_input_tokens_seen": 171375522, + "epoch": 3.1765247318712984, + "grad_norm": 6.150393486022949, + "learning_rate": 1.823485272931007e-05, + "loss": 1.0213, + "num_input_tokens_seen": 171854768, "step": 317500 }, { - "epoch": 7.633954292298829, - "grad_norm": 6.3469767570495605, - "learning_rate": 1.1830348569233726e-05, - "loss": 1.517, - "num_input_tokens_seen": 171648866, + "epoch": 3.1815271330238515, + "grad_norm": 6.078028202056885, + "learning_rate": 1.8184828717784538e-05, + "loss": 1.0129, + "num_input_tokens_seen": 172122080, "step": 318000 }, { - "epoch": 7.6459573650854615, - "grad_norm": 6.271878719329834, - "learning_rate": 1.1770333205300558e-05, - "loss": 1.5105, - "num_input_tokens_seen": 171919554, + "epoch": 3.1865295341764046, + "grad_norm": 6.360128402709961, + "learning_rate": 1.8134804706259008e-05, + "loss": 0.9966, + "num_input_tokens_seen": 172386096, "step": 318500 }, { - "epoch": 7.657960437872095, - "grad_norm": 6.963322162628174, - "learning_rate": 1.1710317841367391e-05, - "loss": 1.4968, - "num_input_tokens_seen": 172191338, + "epoch": 3.1915319353289577, + "grad_norm": 7.205709934234619, + "learning_rate": 1.808478069473347e-05, + "loss": 1.019, + "num_input_tokens_seen": 172656264, "step": 319000 }, { - "epoch": 7.669963510658729, - "grad_norm": 6.459115028381348, - "learning_rate": 1.1650302477434223e-05, - "loss": 1.5238, - "num_input_tokens_seen": 172461762, + "epoch": 3.1965343364815113, + "grad_norm": 6.01072883605957, + "learning_rate": 1.803475668320794e-05, + "loss": 1.0104, + "num_input_tokens_seen": 172928696, "step": 319500 }, { - "epoch": 7.681966583445362, - "grad_norm": 7.1065897941589355, - "learning_rate": 1.1590287113501057e-05, - "loss": 1.4975, - "num_input_tokens_seen": 172731498, + "epoch": 3.2015367376340644, + "grad_norm": 5.552466869354248, + "learning_rate": 1.7984732671682407e-05, + "loss": 1.0045, + "num_input_tokens_seen": 173193664, "step": 320000 }, { - "epoch": 7.693969656231995, - "grad_norm": 6.904537677764893, - "learning_rate": 1.1530271749567891e-05, - "loss": 1.5007, - "num_input_tokens_seen": 173002666, + "epoch": 3.2065391387866176, + "grad_norm": 6.396902561187744, + "learning_rate": 1.7934708660156877e-05, + "loss": 1.007, + "num_input_tokens_seen": 173460776, "step": 320500 }, { - "epoch": 7.705972729018629, - "grad_norm": 6.658944606781006, - "learning_rate": 1.1470256385634723e-05, - "loss": 1.522, - "num_input_tokens_seen": 173276026, + "epoch": 3.2115415399391707, + "grad_norm": 5.7529778480529785, + "learning_rate": 1.7884684648631344e-05, + "loss": 1.0185, + "num_input_tokens_seen": 173728072, "step": 321000 }, { - "epoch": 7.717975801805262, - "grad_norm": 7.3184814453125, - "learning_rate": 1.1410241021701557e-05, - "loss": 1.5182, - "num_input_tokens_seen": 173549130, + "epoch": 3.2165439410917243, + "grad_norm": 8.143234252929688, + "learning_rate": 1.7834660637105814e-05, + "loss": 1.0306, + "num_input_tokens_seen": 173997832, "step": 321500 }, { - "epoch": 7.729978874591896, - "grad_norm": 7.4477081298828125, - "learning_rate": 1.1350225657768389e-05, - "loss": 1.5198, - "num_input_tokens_seen": 173817618, + "epoch": 3.2215463422442774, + "grad_norm": 5.61393928527832, + "learning_rate": 1.7784636625580277e-05, + "loss": 1.0206, + "num_input_tokens_seen": 174268112, "step": 322000 }, { - "epoch": 7.7419819473785285, - "grad_norm": 7.1104021072387695, - "learning_rate": 1.1290210293835223e-05, - "loss": 1.501, - "num_input_tokens_seen": 174085890, + "epoch": 3.2265487433968305, + "grad_norm": 5.928481578826904, + "learning_rate": 1.7734612614054747e-05, + "loss": 0.9988, + "num_input_tokens_seen": 174533880, "step": 322500 }, { - "epoch": 7.753985020165162, - "grad_norm": 6.134059906005859, - "learning_rate": 1.1230194929902055e-05, - "loss": 1.5012, - "num_input_tokens_seen": 174355138, + "epoch": 3.2315511445493836, + "grad_norm": 5.389233589172363, + "learning_rate": 1.7684588602529213e-05, + "loss": 1.02, + "num_input_tokens_seen": 174799568, "step": 323000 }, { - "epoch": 7.765988092951796, - "grad_norm": 5.851232051849365, - "learning_rate": 1.1170179565968888e-05, - "loss": 1.4949, - "num_input_tokens_seen": 174622970, + "epoch": 3.2365535457019368, + "grad_norm": 7.281908988952637, + "learning_rate": 1.7634564591003683e-05, + "loss": 0.9986, + "num_input_tokens_seen": 175066168, "step": 323500 }, { - "epoch": 7.777991165738429, - "grad_norm": 6.635881423950195, - "learning_rate": 1.1110164202035722e-05, - "loss": 1.4811, - "num_input_tokens_seen": 174892194, + "epoch": 3.2415559468544903, + "grad_norm": 6.380090713500977, + "learning_rate": 1.758454057947815e-05, + "loss": 1.0057, + "num_input_tokens_seen": 175336944, "step": 324000 }, { - "epoch": 7.789994238525063, - "grad_norm": 7.079593181610107, - "learning_rate": 1.1050148838102554e-05, - "loss": 1.4957, - "num_input_tokens_seen": 175163402, + "epoch": 3.2465583480070435, + "grad_norm": 6.550302982330322, + "learning_rate": 1.753451656795262e-05, + "loss": 1.0171, + "num_input_tokens_seen": 175611424, "step": 324500 }, { - "epoch": 7.801997311311696, - "grad_norm": 6.069626808166504, - "learning_rate": 1.0990133474169388e-05, - "loss": 1.4939, - "num_input_tokens_seen": 175435458, + "epoch": 3.2515607491595966, + "grad_norm": 5.882409572601318, + "learning_rate": 1.7484492556427083e-05, + "loss": 1.0112, + "num_input_tokens_seen": 175879768, "step": 325000 }, { - "epoch": 7.814000384098329, - "grad_norm": 7.611231327056885, - "learning_rate": 1.093011811023622e-05, - "loss": 1.496, - "num_input_tokens_seen": 175705498, + "epoch": 3.2565631503121497, + "grad_norm": 6.047407627105713, + "learning_rate": 1.7434468544901553e-05, + "loss": 1.0332, + "num_input_tokens_seen": 176157824, "step": 325500 }, { - "epoch": 7.826003456884963, - "grad_norm": 6.542228698730469, - "learning_rate": 1.0870102746303054e-05, - "loss": 1.523, - "num_input_tokens_seen": 175972954, + "epoch": 3.2615655514647033, + "grad_norm": 6.9863691329956055, + "learning_rate": 1.738444453337602e-05, + "loss": 1.0008, + "num_input_tokens_seen": 176428696, "step": 326000 }, { - "epoch": 7.8380065296715955, - "grad_norm": 5.749320030212402, - "learning_rate": 1.0810087382369888e-05, - "loss": 1.5091, - "num_input_tokens_seen": 176242650, + "epoch": 3.2665679526172564, + "grad_norm": 6.332062721252441, + "learning_rate": 1.733442052185049e-05, + "loss": 1.0005, + "num_input_tokens_seen": 176707880, "step": 326500 }, { - "epoch": 7.850009602458229, - "grad_norm": 6.390203475952148, - "learning_rate": 1.075007201843672e-05, - "loss": 1.514, - "num_input_tokens_seen": 176517386, + "epoch": 3.2715703537698095, + "grad_norm": 5.405006408691406, + "learning_rate": 1.7284396510324956e-05, + "loss": 1.0249, + "num_input_tokens_seen": 176978528, "step": 327000 }, { - "epoch": 7.862012675244863, - "grad_norm": 5.876443386077881, - "learning_rate": 1.0690056654503553e-05, - "loss": 1.5176, - "num_input_tokens_seen": 176792834, + "epoch": 3.2765727549223627, + "grad_norm": 6.076756477355957, + "learning_rate": 1.7234372498799426e-05, + "loss": 1.0156, + "num_input_tokens_seen": 177248232, "step": 327500 }, { - "epoch": 7.874015748031496, - "grad_norm": 7.726831436157227, - "learning_rate": 1.0630041290570386e-05, - "loss": 1.5105, - "num_input_tokens_seen": 177062114, + "epoch": 3.281575156074916, + "grad_norm": 7.379303932189941, + "learning_rate": 1.7184348487273893e-05, + "loss": 1.0039, + "num_input_tokens_seen": 177519848, "step": 328000 }, { - "epoch": 7.88601882081813, - "grad_norm": 5.911496639251709, - "learning_rate": 1.057002592663722e-05, - "loss": 1.4902, - "num_input_tokens_seen": 177332370, + "epoch": 3.2865775572274694, + "grad_norm": 6.788669109344482, + "learning_rate": 1.713432447574836e-05, + "loss": 1.0092, + "num_input_tokens_seen": 177789288, "step": 328500 }, { - "epoch": 7.898021893604763, - "grad_norm": 7.214710712432861, - "learning_rate": 1.0510010562704051e-05, - "loss": 1.5151, - "num_input_tokens_seen": 177603106, + "epoch": 3.2915799583800225, + "grad_norm": 6.320953369140625, + "learning_rate": 1.7084300464222826e-05, + "loss": 1.0143, + "num_input_tokens_seen": 178056680, "step": 329000 }, { - "epoch": 7.910024966391396, - "grad_norm": 6.843635082244873, - "learning_rate": 1.0449995198770885e-05, - "loss": 1.5103, - "num_input_tokens_seen": 177876938, + "epoch": 3.2965823595325756, + "grad_norm": 6.267603397369385, + "learning_rate": 1.7034276452697296e-05, + "loss": 1.0023, + "num_input_tokens_seen": 178329192, "step": 329500 }, { - "epoch": 7.92202803917803, - "grad_norm": 6.6717634201049805, - "learning_rate": 1.0389979834837719e-05, - "loss": 1.4944, - "num_input_tokens_seen": 178142994, + "epoch": 3.3015847606851287, + "grad_norm": 5.770685195922852, + "learning_rate": 1.6984252441171762e-05, + "loss": 1.0279, + "num_input_tokens_seen": 178597800, "step": 330000 }, { - "epoch": 7.9340311119646625, - "grad_norm": 6.576992511749268, - "learning_rate": 1.0329964470904551e-05, - "loss": 1.5135, - "num_input_tokens_seen": 178416442, + "epoch": 3.3065871618376823, + "grad_norm": 6.4189863204956055, + "learning_rate": 1.6934228429646232e-05, + "loss": 1.0052, + "num_input_tokens_seen": 178869992, "step": 330500 }, { - "epoch": 7.946034184751296, - "grad_norm": 7.449835300445557, - "learning_rate": 1.0269949106971385e-05, - "loss": 1.5064, - "num_input_tokens_seen": 178681698, + "epoch": 3.3115895629902354, + "grad_norm": 5.872836589813232, + "learning_rate": 1.68842044181207e-05, + "loss": 0.9991, + "num_input_tokens_seen": 179139968, "step": 331000 }, { - "epoch": 7.95803725753793, - "grad_norm": 6.646006107330322, - "learning_rate": 1.0209933743038219e-05, - "loss": 1.5077, - "num_input_tokens_seen": 178948914, + "epoch": 3.3165919641427886, + "grad_norm": 4.180021286010742, + "learning_rate": 1.6834180406595165e-05, + "loss": 1.0087, + "num_input_tokens_seen": 179411592, "step": 331500 }, { - "epoch": 7.970040330324563, - "grad_norm": 6.2497639656066895, - "learning_rate": 1.0149918379105052e-05, - "loss": 1.5179, - "num_input_tokens_seen": 179220706, + "epoch": 3.3215943652953417, + "grad_norm": 5.875650405883789, + "learning_rate": 1.6784156395069635e-05, + "loss": 1.0076, + "num_input_tokens_seen": 179672616, "step": 332000 }, { - "epoch": 7.982043403111197, - "grad_norm": 6.532312870025635, - "learning_rate": 1.0089903015171884e-05, - "loss": 1.5048, - "num_input_tokens_seen": 179492346, + "epoch": 3.326596766447895, + "grad_norm": 5.798732280731201, + "learning_rate": 1.67341323835441e-05, + "loss": 1.0121, + "num_input_tokens_seen": 179943344, "step": 332500 }, { - "epoch": 7.99404647589783, - "grad_norm": 6.301452159881592, - "learning_rate": 1.0029887651238718e-05, - "loss": 1.5274, - "num_input_tokens_seen": 179762666, + "epoch": 3.3315991676004484, + "grad_norm": 5.229135513305664, + "learning_rate": 1.6684108372018568e-05, + "loss": 1.018, + "num_input_tokens_seen": 180212504, "step": 333000 }, { - "epoch": 8.0, - "eval_loss": 1.8503847122192383, - "eval_runtime": 77.834, - "eval_samples_per_second": 1070.368, - "eval_steps_per_second": 133.798, - "num_input_tokens_seen": 179896969, - "step": 333248 - }, - { - "epoch": 8.006049548684464, - "grad_norm": 7.001438140869141, - "learning_rate": 9.969872287305552e-06, - "loss": 1.4615, - "num_input_tokens_seen": 180032801, + "epoch": 3.3366015687530015, + "grad_norm": 6.477422714233398, + "learning_rate": 1.6634084360493038e-05, + "loss": 0.9992, + "num_input_tokens_seen": 180482184, "step": 333500 }, { - "epoch": 8.018052621471096, - "grad_norm": 6.001068115234375, - "learning_rate": 9.909856923372384e-06, - "loss": 1.4235, - "num_input_tokens_seen": 180305033, + "epoch": 3.3416039699055546, + "grad_norm": 6.4892497062683105, + "learning_rate": 1.6584060348967505e-05, + "loss": 1.0223, + "num_input_tokens_seen": 180754256, "step": 334000 }, { - "epoch": 8.03005569425773, - "grad_norm": 6.109913349151611, - "learning_rate": 9.849841559439218e-06, - "loss": 1.411, - "num_input_tokens_seen": 180574281, + "epoch": 3.3466063710581078, + "grad_norm": 5.80246639251709, + "learning_rate": 1.653403633744197e-05, + "loss": 1.0102, + "num_input_tokens_seen": 181030712, "step": 334500 }, { - "epoch": 8.042058767044363, - "grad_norm": 6.268576145172119, - "learning_rate": 9.78982619550605e-06, - "loss": 1.4271, - "num_input_tokens_seen": 180840177, + "epoch": 3.3516087722106613, + "grad_norm": 5.75023078918457, + "learning_rate": 1.648401232591644e-05, + "loss": 1.0111, + "num_input_tokens_seen": 181295800, "step": 335000 }, { - "epoch": 8.054061839830997, - "grad_norm": 5.492459297180176, - "learning_rate": 9.729810831572884e-06, - "loss": 1.4228, - "num_input_tokens_seen": 181104809, + "epoch": 3.3566111733632145, + "grad_norm": 5.059189796447754, + "learning_rate": 1.6433988314390908e-05, + "loss": 1.01, + "num_input_tokens_seen": 181561000, "step": 335500 }, { - "epoch": 8.06606491261763, - "grad_norm": 5.865973472595215, - "learning_rate": 9.669795467639717e-06, - "loss": 1.4511, - "num_input_tokens_seen": 181372673, + "epoch": 3.3616135745157676, + "grad_norm": 5.798236846923828, + "learning_rate": 1.6383964302865378e-05, + "loss": 1.0201, + "num_input_tokens_seen": 181836528, "step": 336000 }, { - "epoch": 8.078067985404264, - "grad_norm": 6.025683403015137, - "learning_rate": 9.60978010370655e-06, - "loss": 1.4291, - "num_input_tokens_seen": 181647809, + "epoch": 3.3666159756683207, + "grad_norm": 6.402642250061035, + "learning_rate": 1.6333940291339844e-05, + "loss": 1.0191, + "num_input_tokens_seen": 182106640, "step": 336500 }, { - "epoch": 8.090071058190897, - "grad_norm": 7.060949802398682, - "learning_rate": 9.549764739773383e-06, - "loss": 1.4252, - "num_input_tokens_seen": 181918481, + "epoch": 3.371618376820874, + "grad_norm": 6.876767635345459, + "learning_rate": 1.6283916279814314e-05, + "loss": 1.0171, + "num_input_tokens_seen": 182385832, "step": 337000 }, { - "epoch": 8.102074130977531, - "grad_norm": 5.3572678565979, - "learning_rate": 9.489749375840215e-06, - "loss": 1.4121, - "num_input_tokens_seen": 182184233, + "epoch": 3.3766207779734274, + "grad_norm": 8.243651390075684, + "learning_rate": 1.623389226828878e-05, + "loss": 1.0136, + "num_input_tokens_seen": 182652112, "step": 337500 }, { - "epoch": 8.114077203764163, - "grad_norm": 6.005801200866699, - "learning_rate": 9.429734011907049e-06, - "loss": 1.4247, - "num_input_tokens_seen": 182451169, + "epoch": 3.3816231791259805, + "grad_norm": 6.981409072875977, + "learning_rate": 1.6183868256763247e-05, + "loss": 1.0027, + "num_input_tokens_seen": 182920216, "step": 338000 }, { - "epoch": 8.126080276550796, - "grad_norm": 7.279883861541748, - "learning_rate": 9.369718647973883e-06, - "loss": 1.4276, - "num_input_tokens_seen": 182723713, + "epoch": 3.3866255802785337, + "grad_norm": 5.377172470092773, + "learning_rate": 1.6133844245237714e-05, + "loss": 1.0129, + "num_input_tokens_seen": 183185600, "step": 338500 }, { - "epoch": 8.13808334933743, - "grad_norm": 7.752192974090576, - "learning_rate": 9.309703284040715e-06, - "loss": 1.4297, - "num_input_tokens_seen": 182994585, + "epoch": 3.391627981431087, + "grad_norm": 7.256112575531006, + "learning_rate": 1.6083820233712184e-05, + "loss": 1.0061, + "num_input_tokens_seen": 183453440, "step": 339000 }, { - "epoch": 8.150086422124064, - "grad_norm": 6.684974193572998, - "learning_rate": 9.249687920107549e-06, - "loss": 1.4394, - "num_input_tokens_seen": 183269657, + "epoch": 3.3966303825836404, + "grad_norm": 5.170373916625977, + "learning_rate": 1.603379622218665e-05, + "loss": 1.0127, + "num_input_tokens_seen": 183722560, "step": 339500 }, { - "epoch": 8.162089494910697, - "grad_norm": 5.737115859985352, - "learning_rate": 9.18967255617438e-06, - "loss": 1.4421, - "num_input_tokens_seen": 183545377, + "epoch": 3.4016327837361935, + "grad_norm": 5.4537248611450195, + "learning_rate": 1.598377221066112e-05, + "loss": 1.0084, + "num_input_tokens_seen": 183991520, "step": 340000 }, { - "epoch": 8.17409256769733, - "grad_norm": 6.990055561065674, - "learning_rate": 9.129657192241214e-06, - "loss": 1.4172, - "num_input_tokens_seen": 183814473, + "epoch": 3.4066351848887466, + "grad_norm": 6.006635665893555, + "learning_rate": 1.5933748199135587e-05, + "loss": 1.0017, + "num_input_tokens_seen": 184263976, "step": 340500 }, { - "epoch": 8.186095640483964, - "grad_norm": 6.314241409301758, - "learning_rate": 9.069641828308046e-06, - "loss": 1.4388, - "num_input_tokens_seen": 184079777, + "epoch": 3.4116375860412997, + "grad_norm": 5.732070446014404, + "learning_rate": 1.5883724187610053e-05, + "loss": 1.0124, + "num_input_tokens_seen": 184528864, "step": 341000 }, { - "epoch": 8.198098713270598, - "grad_norm": 6.492166519165039, - "learning_rate": 9.00962646437488e-06, - "loss": 1.4524, - "num_input_tokens_seen": 184353497, + "epoch": 3.416639987193853, + "grad_norm": 7.936917304992676, + "learning_rate": 1.583370017608452e-05, + "loss": 1.0183, + "num_input_tokens_seen": 184798528, "step": 341500 }, { - "epoch": 8.21010178605723, - "grad_norm": 7.049396991729736, - "learning_rate": 8.949611100441714e-06, - "loss": 1.4292, - "num_input_tokens_seen": 184624497, + "epoch": 3.4216423883464064, + "grad_norm": 7.584635257720947, + "learning_rate": 1.578367616455899e-05, + "loss": 1.0236, + "num_input_tokens_seen": 185063968, "step": 342000 }, { - "epoch": 8.222104858843863, - "grad_norm": 6.09207010269165, - "learning_rate": 8.889595736508546e-06, - "loss": 1.4313, - "num_input_tokens_seen": 184896585, + "epoch": 3.4266447894989596, + "grad_norm": 6.110123634338379, + "learning_rate": 1.5733652153033456e-05, + "loss": 1.0113, + "num_input_tokens_seen": 185328760, "step": 342500 }, { - "epoch": 8.234107931630497, - "grad_norm": 6.875469207763672, - "learning_rate": 8.82958037257538e-06, - "loss": 1.4347, - "num_input_tokens_seen": 185164825, + "epoch": 3.4316471906515127, + "grad_norm": 6.163844585418701, + "learning_rate": 1.5683628141507926e-05, + "loss": 1.0143, + "num_input_tokens_seen": 185601312, "step": 343000 }, { - "epoch": 8.24611100441713, - "grad_norm": 6.660105228424072, - "learning_rate": 8.769565008642212e-06, - "loss": 1.4453, - "num_input_tokens_seen": 185429625, + "epoch": 3.436649591804066, + "grad_norm": 6.428155899047852, + "learning_rate": 1.5633604129982393e-05, + "loss": 1.0231, + "num_input_tokens_seen": 185868856, "step": 343500 }, { - "epoch": 8.258114077203764, - "grad_norm": 6.262195587158203, - "learning_rate": 8.709549644709046e-06, - "loss": 1.4463, - "num_input_tokens_seen": 185696369, + "epoch": 3.4416519929566194, + "grad_norm": 4.938517093658447, + "learning_rate": 1.5583580118456863e-05, + "loss": 1.029, + "num_input_tokens_seen": 186141200, "step": 344000 }, { - "epoch": 8.270117149990398, - "grad_norm": 6.421583652496338, - "learning_rate": 8.649534280775878e-06, - "loss": 1.4453, - "num_input_tokens_seen": 185969713, + "epoch": 3.4466543941091725, + "grad_norm": 6.4214019775390625, + "learning_rate": 1.5533556106931326e-05, + "loss": 1.0117, + "num_input_tokens_seen": 186415152, "step": 344500 }, { - "epoch": 8.282120222777031, - "grad_norm": 6.954902648925781, - "learning_rate": 8.589518916842711e-06, - "loss": 1.45, - "num_input_tokens_seen": 186237433, + "epoch": 3.4516567952617256, + "grad_norm": 5.303710460662842, + "learning_rate": 1.5483532095405796e-05, + "loss": 1.0094, + "num_input_tokens_seen": 186690176, "step": 345000 }, { - "epoch": 8.294123295563665, - "grad_norm": 6.543055057525635, - "learning_rate": 8.529503552909545e-06, - "loss": 1.4348, - "num_input_tokens_seen": 186509353, + "epoch": 3.4566591964142788, + "grad_norm": 4.86320161819458, + "learning_rate": 1.5433508083880262e-05, + "loss": 1.0027, + "num_input_tokens_seen": 186957432, "step": 345500 }, { - "epoch": 8.306126368350297, - "grad_norm": 6.88683557510376, - "learning_rate": 8.469488188976377e-06, - "loss": 1.4396, - "num_input_tokens_seen": 186777761, + "epoch": 3.461661597566832, + "grad_norm": 4.321279048919678, + "learning_rate": 1.5383484072354732e-05, + "loss": 1.0123, + "num_input_tokens_seen": 187226552, "step": 346000 }, { - "epoch": 8.31812944113693, - "grad_norm": 6.030125617980957, - "learning_rate": 8.409472825043211e-06, - "loss": 1.4488, - "num_input_tokens_seen": 187047665, + "epoch": 3.4666639987193855, + "grad_norm": 5.8327860832214355, + "learning_rate": 1.53334600608292e-05, + "loss": 1.0057, + "num_input_tokens_seen": 187497776, "step": 346500 }, { - "epoch": 8.330132513923564, - "grad_norm": 6.535872936248779, - "learning_rate": 8.349457461110045e-06, - "loss": 1.4448, - "num_input_tokens_seen": 187317673, + "epoch": 3.4716663998719386, + "grad_norm": 9.191901206970215, + "learning_rate": 1.528343604930367e-05, + "loss": 1.0008, + "num_input_tokens_seen": 187768112, "step": 347000 }, { - "epoch": 8.342135586710198, - "grad_norm": 6.740753650665283, - "learning_rate": 8.289442097176877e-06, - "loss": 1.4612, - "num_input_tokens_seen": 187586313, + "epoch": 3.4766688010244917, + "grad_norm": 6.153154373168945, + "learning_rate": 1.5233412037778134e-05, + "loss": 1.0, + "num_input_tokens_seen": 188036912, "step": 347500 }, { - "epoch": 8.354138659496831, - "grad_norm": 7.823575973510742, - "learning_rate": 8.22942673324371e-06, - "loss": 1.4281, - "num_input_tokens_seen": 187854433, + "epoch": 3.481671202177045, + "grad_norm": 5.405030250549316, + "learning_rate": 1.5183388026252602e-05, + "loss": 1.0119, + "num_input_tokens_seen": 188312952, "step": 348000 }, { - "epoch": 8.366141732283465, - "grad_norm": 6.176061630249023, - "learning_rate": 8.169411369310544e-06, - "loss": 1.4356, - "num_input_tokens_seen": 188119713, + "epoch": 3.4866736033295984, + "grad_norm": 6.342990398406982, + "learning_rate": 1.513336401472707e-05, + "loss": 1.0101, + "num_input_tokens_seen": 188585776, "step": 348500 }, { - "epoch": 8.378144805070098, - "grad_norm": 6.373409748077393, - "learning_rate": 8.109396005377378e-06, - "loss": 1.4513, - "num_input_tokens_seen": 188392345, + "epoch": 3.4916760044821515, + "grad_norm": 6.2145867347717285, + "learning_rate": 1.5083340003201538e-05, + "loss": 1.0018, + "num_input_tokens_seen": 188856160, "step": 349000 }, { - "epoch": 8.390147877856732, - "grad_norm": 5.931683540344238, - "learning_rate": 8.04938064144421e-06, - "loss": 1.4382, - "num_input_tokens_seen": 188664321, + "epoch": 3.4966784056347047, + "grad_norm": 5.479875564575195, + "learning_rate": 1.5033315991676007e-05, + "loss": 1.0092, + "num_input_tokens_seen": 189125536, "step": 349500 }, { - "epoch": 8.402150950643364, - "grad_norm": 6.967380046844482, - "learning_rate": 7.989365277511044e-06, - "loss": 1.4348, - "num_input_tokens_seen": 188934193, + "epoch": 3.501680806787258, + "grad_norm": 6.38485050201416, + "learning_rate": 1.4983291980150473e-05, + "loss": 1.0074, + "num_input_tokens_seen": 189397856, "step": 350000 }, { - "epoch": 8.414154023429997, - "grad_norm": 5.487102508544922, - "learning_rate": 7.929349913577876e-06, - "loss": 1.4521, - "num_input_tokens_seen": 189211009, + "epoch": 3.506683207939811, + "grad_norm": 5.203739166259766, + "learning_rate": 1.493326796862494e-05, + "loss": 0.9897, + "num_input_tokens_seen": 189664624, "step": 350500 }, { - "epoch": 8.426157096216631, - "grad_norm": 6.150306701660156, - "learning_rate": 7.86933454964471e-06, - "loss": 1.4179, - "num_input_tokens_seen": 189479497, + "epoch": 3.5116856090923645, + "grad_norm": 6.554189682006836, + "learning_rate": 1.4883243957099408e-05, + "loss": 1.0222, + "num_input_tokens_seen": 189931544, "step": 351000 }, { - "epoch": 8.438160169003265, - "grad_norm": 5.820715427398682, - "learning_rate": 7.809319185711544e-06, - "loss": 1.4402, - "num_input_tokens_seen": 189747145, + "epoch": 3.5166880102449176, + "grad_norm": 6.045382022857666, + "learning_rate": 1.4833219945573876e-05, + "loss": 1.0108, + "num_input_tokens_seen": 190201904, "step": 351500 }, { - "epoch": 8.450163241789898, - "grad_norm": 7.049651622772217, - "learning_rate": 7.749303821778376e-06, - "loss": 1.4499, - "num_input_tokens_seen": 190016673, + "epoch": 3.5216904113974707, + "grad_norm": 5.883347988128662, + "learning_rate": 1.4783195934048344e-05, + "loss": 1.0204, + "num_input_tokens_seen": 190469632, "step": 352000 }, { - "epoch": 8.462166314576532, - "grad_norm": 6.405355930328369, - "learning_rate": 7.68928845784521e-06, - "loss": 1.4468, - "num_input_tokens_seen": 190285505, + "epoch": 3.526692812550024, + "grad_norm": 5.156943321228027, + "learning_rate": 1.4733171922522813e-05, + "loss": 0.9963, + "num_input_tokens_seen": 190737512, "step": 352500 }, { - "epoch": 8.474169387363165, - "grad_norm": 6.422888278961182, - "learning_rate": 7.6292730939120415e-06, - "loss": 1.4466, - "num_input_tokens_seen": 190556825, + "epoch": 3.5316952137025774, + "grad_norm": 5.740571975708008, + "learning_rate": 1.4683147910997281e-05, + "loss": 1.0157, + "num_input_tokens_seen": 191003792, "step": 353000 }, { - "epoch": 8.486172460149799, - "grad_norm": 6.329121112823486, - "learning_rate": 7.569257729978875e-06, - "loss": 1.4406, - "num_input_tokens_seen": 190825761, + "epoch": 3.5366976148551306, + "grad_norm": 5.744316101074219, + "learning_rate": 1.463312389947175e-05, + "loss": 1.0182, + "num_input_tokens_seen": 191276360, "step": 353500 }, { - "epoch": 8.498175532936433, - "grad_norm": 6.55047607421875, - "learning_rate": 7.509242366045709e-06, - "loss": 1.4323, - "num_input_tokens_seen": 191094561, + "epoch": 3.5417000160076837, + "grad_norm": 6.743235111236572, + "learning_rate": 1.4583099887946214e-05, + "loss": 1.0212, + "num_input_tokens_seen": 191547000, "step": 354000 }, { - "epoch": 8.510178605723064, - "grad_norm": 7.034322738647461, - "learning_rate": 7.449227002112541e-06, - "loss": 1.4421, - "num_input_tokens_seen": 191365401, + "epoch": 3.546702417160237, + "grad_norm": 6.034450531005859, + "learning_rate": 1.4533075876420682e-05, + "loss": 1.0159, + "num_input_tokens_seen": 191816024, "step": 354500 }, { - "epoch": 8.522181678509698, - "grad_norm": 6.0111799240112305, - "learning_rate": 7.389211638179375e-06, - "loss": 1.4374, - "num_input_tokens_seen": 191637209, + "epoch": 3.55170481831279, + "grad_norm": 6.9873833656311035, + "learning_rate": 1.448305186489515e-05, + "loss": 0.9996, + "num_input_tokens_seen": 192078648, "step": 355000 }, { - "epoch": 8.534184751296332, - "grad_norm": 8.281228065490723, - "learning_rate": 7.329196274246207e-06, - "loss": 1.4293, - "num_input_tokens_seen": 191908033, + "epoch": 3.5567072194653435, + "grad_norm": 4.8513078689575195, + "learning_rate": 1.4433027853369619e-05, + "loss": 1.0022, + "num_input_tokens_seen": 192346960, "step": 355500 }, { - "epoch": 8.546187824082965, - "grad_norm": 6.296477794647217, - "learning_rate": 7.269180910313041e-06, - "loss": 1.4368, - "num_input_tokens_seen": 192180897, + "epoch": 3.5617096206178966, + "grad_norm": 6.602761268615723, + "learning_rate": 1.4383003841844087e-05, + "loss": 1.0182, + "num_input_tokens_seen": 192617800, "step": 356000 }, { - "epoch": 8.558190896869599, - "grad_norm": 6.709954261779785, - "learning_rate": 7.209165546379873e-06, - "loss": 1.4453, - "num_input_tokens_seen": 192451505, + "epoch": 3.5667120217704498, + "grad_norm": 5.9454851150512695, + "learning_rate": 1.4332979830318555e-05, + "loss": 1.0032, + "num_input_tokens_seen": 192887048, "step": 356500 }, { - "epoch": 8.570193969656232, - "grad_norm": 6.437607765197754, - "learning_rate": 7.1491501824467065e-06, - "loss": 1.4534, - "num_input_tokens_seen": 192722785, + "epoch": 3.571714422923003, + "grad_norm": 4.868193626403809, + "learning_rate": 1.428295581879302e-05, + "loss": 0.9844, + "num_input_tokens_seen": 193153600, "step": 357000 }, { - "epoch": 8.582197042442866, - "grad_norm": 7.061402320861816, - "learning_rate": 7.08913481851354e-06, - "loss": 1.4428, - "num_input_tokens_seen": 192989065, + "epoch": 3.5767168240755565, + "grad_norm": 5.1398749351501465, + "learning_rate": 1.4232931807267488e-05, + "loss": 1.0119, + "num_input_tokens_seen": 193428088, "step": 357500 }, { - "epoch": 8.5942001152295, - "grad_norm": 7.956037521362305, - "learning_rate": 7.029119454580372e-06, - "loss": 1.4449, - "num_input_tokens_seen": 193258953, + "epoch": 3.5817192252281096, + "grad_norm": 5.984772682189941, + "learning_rate": 1.4182907795741957e-05, + "loss": 1.0014, + "num_input_tokens_seen": 193702344, "step": 358000 }, { - "epoch": 8.606203188016131, - "grad_norm": 6.570489406585693, - "learning_rate": 6.969104090647206e-06, - "loss": 1.4308, - "num_input_tokens_seen": 193529041, + "epoch": 3.5867216263806627, + "grad_norm": 5.537957191467285, + "learning_rate": 1.4132883784216425e-05, + "loss": 1.0158, + "num_input_tokens_seen": 193976088, "step": 358500 }, { - "epoch": 8.618206260802765, - "grad_norm": 5.77236270904541, - "learning_rate": 6.909088726714038e-06, - "loss": 1.436, - "num_input_tokens_seen": 193798089, + "epoch": 3.591724027533216, + "grad_norm": 5.605039119720459, + "learning_rate": 1.4082859772690893e-05, + "loss": 1.0175, + "num_input_tokens_seen": 194241888, "step": 359000 }, { - "epoch": 8.630209333589399, - "grad_norm": 6.698272228240967, - "learning_rate": 6.849073362780872e-06, - "loss": 1.4554, - "num_input_tokens_seen": 194063657, + "epoch": 3.596726428685769, + "grad_norm": 8.43393611907959, + "learning_rate": 1.4032835761165361e-05, + "loss": 1.0001, + "num_input_tokens_seen": 194514168, "step": 359500 }, { - "epoch": 8.642212406376032, - "grad_norm": 6.7158122062683105, - "learning_rate": 6.789057998847706e-06, - "loss": 1.4455, - "num_input_tokens_seen": 194334121, + "epoch": 3.6017288298383225, + "grad_norm": 6.197558403015137, + "learning_rate": 1.3982811749639826e-05, + "loss": 1.0006, + "num_input_tokens_seen": 194788128, "step": 360000 }, { - "epoch": 8.654215479162666, - "grad_norm": 5.916762351989746, - "learning_rate": 6.729042634914539e-06, - "loss": 1.4487, - "num_input_tokens_seen": 194604249, + "epoch": 3.6067312309908757, + "grad_norm": 6.9046430587768555, + "learning_rate": 1.3932787738114294e-05, + "loss": 1.0163, + "num_input_tokens_seen": 195057368, "step": 360500 }, { - "epoch": 8.6662185519493, - "grad_norm": 5.992862701416016, - "learning_rate": 6.6690272709813716e-06, - "loss": 1.4528, - "num_input_tokens_seen": 194876889, + "epoch": 3.611733632143429, + "grad_norm": 6.350090026855469, + "learning_rate": 1.3882763726588763e-05, + "loss": 0.9951, + "num_input_tokens_seen": 195326896, "step": 361000 }, { - "epoch": 8.678221624735933, - "grad_norm": 6.150909900665283, - "learning_rate": 6.6090119070482045e-06, - "loss": 1.4573, - "num_input_tokens_seen": 195148057, + "epoch": 3.616736033295982, + "grad_norm": 7.581150531768799, + "learning_rate": 1.3832739715063231e-05, + "loss": 1.0004, + "num_input_tokens_seen": 195599832, "step": 361500 }, { - "epoch": 8.690224697522567, - "grad_norm": 7.3372321128845215, - "learning_rate": 6.548996543115038e-06, - "loss": 1.437, - "num_input_tokens_seen": 195413777, + "epoch": 3.6217384344485355, + "grad_norm": 5.0561017990112305, + "learning_rate": 1.3782715703537699e-05, + "loss": 1.0211, + "num_input_tokens_seen": 195878880, "step": 362000 }, { - "epoch": 8.702227770309198, - "grad_norm": 9.222699165344238, - "learning_rate": 6.48898117918187e-06, - "loss": 1.4483, - "num_input_tokens_seen": 195683329, + "epoch": 3.6267408356010886, + "grad_norm": 6.046396732330322, + "learning_rate": 1.3732691692012167e-05, + "loss": 1.0113, + "num_input_tokens_seen": 196149072, "step": 362500 }, { - "epoch": 8.714230843095832, - "grad_norm": 7.286736011505127, - "learning_rate": 6.428965815248704e-06, - "loss": 1.4513, - "num_input_tokens_seen": 195954249, + "epoch": 3.6317432367536417, + "grad_norm": 6.726164817810059, + "learning_rate": 1.3682667680486636e-05, + "loss": 1.0031, + "num_input_tokens_seen": 196417928, "step": 363000 }, { - "epoch": 8.726233915882466, - "grad_norm": 7.142899990081787, - "learning_rate": 6.368950451315538e-06, - "loss": 1.4621, - "num_input_tokens_seen": 196226689, + "epoch": 3.636745637906195, + "grad_norm": 10.790836334228516, + "learning_rate": 1.36326436689611e-05, + "loss": 1.0076, + "num_input_tokens_seen": 196688920, "step": 363500 }, { - "epoch": 8.7382369886691, - "grad_norm": 6.659136772155762, - "learning_rate": 6.30893508738237e-06, - "loss": 1.4399, - "num_input_tokens_seen": 196496457, + "epoch": 3.641748039058748, + "grad_norm": 6.029910087585449, + "learning_rate": 1.3582619657435569e-05, + "loss": 1.0027, + "num_input_tokens_seen": 196965448, "step": 364000 }, { - "epoch": 8.750240061455733, - "grad_norm": 6.504032611846924, - "learning_rate": 6.248919723449204e-06, - "loss": 1.4395, - "num_input_tokens_seen": 196765513, + "epoch": 3.6467504402113016, + "grad_norm": 4.5111308097839355, + "learning_rate": 1.3532595645910037e-05, + "loss": 0.9984, + "num_input_tokens_seen": 197241664, "step": 364500 }, { - "epoch": 8.762243134242366, - "grad_norm": 6.01947546005249, - "learning_rate": 6.188904359516037e-06, - "loss": 1.4448, - "num_input_tokens_seen": 197032273, + "epoch": 3.6517528413638547, + "grad_norm": 7.830237865447998, + "learning_rate": 1.3482571634384505e-05, + "loss": 1.0109, + "num_input_tokens_seen": 197508048, "step": 365000 }, { - "epoch": 8.774246207029, - "grad_norm": 6.728052139282227, - "learning_rate": 6.1288889955828695e-06, - "loss": 1.4638, - "num_input_tokens_seen": 197306721, + "epoch": 3.656755242516408, + "grad_norm": 6.430135726928711, + "learning_rate": 1.3432547622858973e-05, + "loss": 1.0041, + "num_input_tokens_seen": 197782528, "step": 365500 }, { - "epoch": 8.786249279815634, - "grad_norm": 6.051580429077148, - "learning_rate": 6.0688736316497024e-06, - "loss": 1.4515, - "num_input_tokens_seen": 197575177, + "epoch": 3.661757643668961, + "grad_norm": 8.756442070007324, + "learning_rate": 1.3382523611333442e-05, + "loss": 0.997, + "num_input_tokens_seen": 198054664, "step": 366000 }, { - "epoch": 8.798252352602265, - "grad_norm": 6.528171062469482, - "learning_rate": 6.008858267716535e-06, - "loss": 1.4477, - "num_input_tokens_seen": 197848049, + "epoch": 3.6667600448215145, + "grad_norm": 6.538286209106445, + "learning_rate": 1.3332499599807907e-05, + "loss": 1.0085, + "num_input_tokens_seen": 198320680, "step": 366500 }, { - "epoch": 8.810255425388899, - "grad_norm": 6.882514476776123, - "learning_rate": 5.948842903783368e-06, - "loss": 1.4374, - "num_input_tokens_seen": 198114185, + "epoch": 3.6717624459740676, + "grad_norm": 7.97443151473999, + "learning_rate": 1.3282475588282375e-05, + "loss": 1.0125, + "num_input_tokens_seen": 198590752, "step": 367000 }, { - "epoch": 8.822258498175533, - "grad_norm": 6.4761810302734375, - "learning_rate": 5.888827539850202e-06, - "loss": 1.4602, - "num_input_tokens_seen": 198384521, + "epoch": 3.6767648471266208, + "grad_norm": 5.407761573791504, + "learning_rate": 1.3232451576756843e-05, + "loss": 1.029, + "num_input_tokens_seen": 198861544, "step": 367500 }, { - "epoch": 8.834261570962166, - "grad_norm": 6.429546356201172, - "learning_rate": 5.828812175917035e-06, - "loss": 1.4419, - "num_input_tokens_seen": 198653209, + "epoch": 3.681767248279174, + "grad_norm": 6.2920355796813965, + "learning_rate": 1.3182427565231311e-05, + "loss": 1.0131, + "num_input_tokens_seen": 199129704, "step": 368000 }, { - "epoch": 8.8462646437488, - "grad_norm": 7.116391658782959, - "learning_rate": 5.768796811983868e-06, - "loss": 1.4406, - "num_input_tokens_seen": 198920169, + "epoch": 3.686769649431727, + "grad_norm": 8.73907470703125, + "learning_rate": 1.313240355370578e-05, + "loss": 1.0051, + "num_input_tokens_seen": 199397264, "step": 368500 }, { - "epoch": 8.858267716535433, - "grad_norm": 6.254596710205078, - "learning_rate": 5.708781448050702e-06, - "loss": 1.434, - "num_input_tokens_seen": 199187353, + "epoch": 3.6917720505842806, + "grad_norm": 6.030662536621094, + "learning_rate": 1.3082379542180248e-05, + "loss": 1.008, + "num_input_tokens_seen": 199664200, "step": 369000 }, { - "epoch": 8.870270789322067, - "grad_norm": 6.303454399108887, - "learning_rate": 5.6487660841175345e-06, - "loss": 1.4541, - "num_input_tokens_seen": 199456113, + "epoch": 3.6967744517368337, + "grad_norm": 6.953051567077637, + "learning_rate": 1.3032355530654716e-05, + "loss": 1.0075, + "num_input_tokens_seen": 199933048, "step": 369500 }, { - "epoch": 8.8822738621087, - "grad_norm": 6.005709648132324, - "learning_rate": 5.5887507201843674e-06, - "loss": 1.4583, - "num_input_tokens_seen": 199725417, + "epoch": 3.701776852889387, + "grad_norm": 6.4026618003845215, + "learning_rate": 1.2982331519129181e-05, + "loss": 0.9982, + "num_input_tokens_seen": 200203568, "step": 370000 }, { - "epoch": 8.894276934895334, - "grad_norm": 6.535337924957275, - "learning_rate": 5.5287353562512e-06, - "loss": 1.4488, - "num_input_tokens_seen": 199993145, + "epoch": 3.70677925404194, + "grad_norm": 8.120702743530273, + "learning_rate": 1.2932307507603649e-05, + "loss": 1.0093, + "num_input_tokens_seen": 200475136, "step": 370500 }, { - "epoch": 8.906280007681966, - "grad_norm": 6.578272819519043, - "learning_rate": 5.468719992318034e-06, - "loss": 1.4659, - "num_input_tokens_seen": 200265713, + "epoch": 3.7117816551944935, + "grad_norm": 4.819450855255127, + "learning_rate": 1.2882283496078117e-05, + "loss": 0.9915, + "num_input_tokens_seen": 200751456, "step": 371000 }, { - "epoch": 8.9182830804686, - "grad_norm": 6.937986850738525, - "learning_rate": 5.408704628384867e-06, - "loss": 1.451, - "num_input_tokens_seen": 200535577, + "epoch": 3.7167840563470467, + "grad_norm": 6.006054878234863, + "learning_rate": 1.2832259484552586e-05, + "loss": 0.9993, + "num_input_tokens_seen": 201022872, "step": 371500 }, { - "epoch": 8.930286153255233, - "grad_norm": 7.4389119148254395, - "learning_rate": 5.3486892644517e-06, - "loss": 1.4433, - "num_input_tokens_seen": 200802385, + "epoch": 3.7217864574996, + "grad_norm": 5.173740386962891, + "learning_rate": 1.2782235473027054e-05, + "loss": 1.0043, + "num_input_tokens_seen": 201293168, "step": 372000 }, { - "epoch": 8.942289226041867, - "grad_norm": 6.193032741546631, - "learning_rate": 5.288673900518533e-06, - "loss": 1.4827, - "num_input_tokens_seen": 201075529, + "epoch": 3.726788858652153, + "grad_norm": 4.628252983093262, + "learning_rate": 1.2732211461501522e-05, + "loss": 1.0047, + "num_input_tokens_seen": 201566544, "step": 372500 }, { - "epoch": 8.9542922988285, - "grad_norm": 6.252266883850098, - "learning_rate": 5.228658536585366e-06, - "loss": 1.4519, - "num_input_tokens_seen": 201342049, + "epoch": 3.731791259804706, + "grad_norm": 5.305530548095703, + "learning_rate": 1.2682187449975989e-05, + "loss": 1.0084, + "num_input_tokens_seen": 201835288, "step": 373000 }, { - "epoch": 8.966295371615134, - "grad_norm": 6.403021812438965, - "learning_rate": 5.168643172652199e-06, - "loss": 1.4469, - "num_input_tokens_seen": 201616073, + "epoch": 3.7367936609572596, + "grad_norm": 4.96281623840332, + "learning_rate": 1.2632163438450457e-05, + "loss": 1.009, + "num_input_tokens_seen": 202106240, "step": 373500 }, { - "epoch": 8.978298444401767, - "grad_norm": 6.8911566734313965, - "learning_rate": 5.1086278087190325e-06, - "loss": 1.4602, - "num_input_tokens_seen": 201888657, + "epoch": 3.7417960621098127, + "grad_norm": 5.782063007354736, + "learning_rate": 1.2582139426924925e-05, + "loss": 0.9932, + "num_input_tokens_seen": 202380080, "step": 374000 }, { - "epoch": 8.9903015171884, - "grad_norm": 5.930507183074951, - "learning_rate": 5.048612444785865e-06, - "loss": 1.4662, - "num_input_tokens_seen": 202158201, + "epoch": 3.746798463262366, + "grad_norm": 5.399883270263672, + "learning_rate": 1.2532115415399392e-05, + "loss": 1.0138, + "num_input_tokens_seen": 202647520, "step": 374500 }, { - "epoch": 9.0, - "eval_loss": 1.8432235717773438, - "eval_runtime": 78.1793, - "eval_samples_per_second": 1065.64, - "eval_steps_per_second": 133.207, - "num_input_tokens_seen": 202377935, - "step": 374904 - }, - { - "epoch": 9.002304589975033, - "grad_norm": 7.157642841339111, - "learning_rate": 4.988597080852698e-06, - "loss": 1.4353, - "num_input_tokens_seen": 202429351, + "epoch": 3.751800864414919, + "grad_norm": 5.419944763183594, + "learning_rate": 1.248209140387386e-05, + "loss": 0.9949, + "num_input_tokens_seen": 202913024, "step": 375000 }, { - "epoch": 9.014307662761667, - "grad_norm": 5.997623443603516, - "learning_rate": 4.928581716919531e-06, - "loss": 1.3797, - "num_input_tokens_seen": 202693951, + "epoch": 3.7568032655674726, + "grad_norm": 5.332630157470703, + "learning_rate": 1.2432067392348328e-05, + "loss": 1.0018, + "num_input_tokens_seen": 203175136, "step": 375500 }, { - "epoch": 9.0263107355483, - "grad_norm": 7.1180009841918945, - "learning_rate": 4.868566352986364e-06, - "loss": 1.39, - "num_input_tokens_seen": 202968343, + "epoch": 3.7618056667200257, + "grad_norm": 5.563337802886963, + "learning_rate": 1.2382043380822796e-05, + "loss": 1.0257, + "num_input_tokens_seen": 203444624, "step": 376000 }, { - "epoch": 9.038313808334934, - "grad_norm": 5.762534141540527, - "learning_rate": 4.808550989053198e-06, - "loss": 1.3889, - "num_input_tokens_seen": 203243079, + "epoch": 3.766808067872579, + "grad_norm": 7.322454452514648, + "learning_rate": 1.2332019369297265e-05, + "loss": 1.0097, + "num_input_tokens_seen": 203721152, "step": 376500 }, { - "epoch": 9.050316881121567, - "grad_norm": 6.5277419090271, - "learning_rate": 4.748535625120031e-06, - "loss": 1.403, - "num_input_tokens_seen": 203510423, + "epoch": 3.771810469025132, + "grad_norm": 5.674718379974365, + "learning_rate": 1.2281995357771731e-05, + "loss": 0.9849, + "num_input_tokens_seen": 203993696, "step": 377000 }, { - "epoch": 9.0623199539082, - "grad_norm": 6.3327507972717285, - "learning_rate": 4.6885202611868646e-06, - "loss": 1.3926, - "num_input_tokens_seen": 203777679, + "epoch": 3.776812870177685, + "grad_norm": 6.736847877502441, + "learning_rate": 1.22319713462462e-05, + "loss": 0.9913, + "num_input_tokens_seen": 204263944, "step": 377500 }, { - "epoch": 9.074323026694834, - "grad_norm": 7.27388334274292, - "learning_rate": 4.6285048972536975e-06, - "loss": 1.3664, - "num_input_tokens_seen": 204040847, + "epoch": 3.7818152713302386, + "grad_norm": 6.920697212219238, + "learning_rate": 1.2181947334720668e-05, + "loss": 0.9984, + "num_input_tokens_seen": 204534952, "step": 378000 }, { - "epoch": 9.086326099481468, - "grad_norm": 6.772297382354736, - "learning_rate": 4.56848953332053e-06, - "loss": 1.3858, - "num_input_tokens_seen": 204311303, + "epoch": 3.7868176724827918, + "grad_norm": 6.516974449157715, + "learning_rate": 1.2131923323195134e-05, + "loss": 1.01, + "num_input_tokens_seen": 204808160, "step": 378500 }, { - "epoch": 9.0983291722681, - "grad_norm": 5.18277645111084, - "learning_rate": 4.508474169387363e-06, - "loss": 1.4, - "num_input_tokens_seen": 204579519, + "epoch": 3.791820073635345, + "grad_norm": 5.656439781188965, + "learning_rate": 1.2081899311669602e-05, + "loss": 1.0013, + "num_input_tokens_seen": 205082712, "step": 379000 }, { - "epoch": 9.110332245054733, - "grad_norm": 6.660023212432861, - "learning_rate": 4.448458805454196e-06, - "loss": 1.3946, - "num_input_tokens_seen": 204845015, + "epoch": 3.796822474787898, + "grad_norm": 4.886724472045898, + "learning_rate": 1.203187530014407e-05, + "loss": 1.011, + "num_input_tokens_seen": 205354224, "step": 379500 }, { - "epoch": 9.122335317841367, - "grad_norm": 6.6264543533325195, - "learning_rate": 4.38844344152103e-06, - "loss": 1.3831, - "num_input_tokens_seen": 205113751, + "epoch": 3.8018248759404516, + "grad_norm": 8.127188682556152, + "learning_rate": 1.1981851288618537e-05, + "loss": 1.0004, + "num_input_tokens_seen": 205626120, "step": 380000 }, { - "epoch": 9.134338390628, - "grad_norm": 6.711910724639893, - "learning_rate": 4.328428077587863e-06, - "loss": 1.3947, - "num_input_tokens_seen": 205381519, + "epoch": 3.8068272770930047, + "grad_norm": 7.59630823135376, + "learning_rate": 1.1931827277093005e-05, + "loss": 1.0078, + "num_input_tokens_seen": 205895872, "step": 380500 }, { - "epoch": 9.146341463414634, - "grad_norm": 6.29475212097168, - "learning_rate": 4.268412713654696e-06, - "loss": 1.3925, - "num_input_tokens_seen": 205651319, + "epoch": 3.811829678245558, + "grad_norm": 5.615649700164795, + "learning_rate": 1.1881803265567474e-05, + "loss": 0.9887, + "num_input_tokens_seen": 206161088, "step": 381000 }, { - "epoch": 9.158344536201268, - "grad_norm": 5.5769124031066895, - "learning_rate": 4.208397349721529e-06, - "loss": 1.3957, - "num_input_tokens_seen": 205920751, + "epoch": 3.816832079398111, + "grad_norm": 5.560026168823242, + "learning_rate": 1.183177925404194e-05, + "loss": 0.9986, + "num_input_tokens_seen": 206434032, "step": 381500 }, { - "epoch": 9.170347608987901, - "grad_norm": 5.848575115203857, - "learning_rate": 4.148381985788362e-06, - "loss": 1.3953, - "num_input_tokens_seen": 206190351, + "epoch": 3.821834480550664, + "grad_norm": 4.2973480224609375, + "learning_rate": 1.1781755242516409e-05, + "loss": 0.9843, + "num_input_tokens_seen": 206711936, "step": 382000 }, { - "epoch": 9.182350681774535, - "grad_norm": 6.245150089263916, - "learning_rate": 4.088366621855195e-06, - "loss": 1.3589, - "num_input_tokens_seen": 206457319, + "epoch": 3.8268368817032177, + "grad_norm": 4.312121391296387, + "learning_rate": 1.1731731230990877e-05, + "loss": 0.995, + "num_input_tokens_seen": 206984224, "step": 382500 }, { - "epoch": 9.194353754561167, - "grad_norm": 6.232107639312744, - "learning_rate": 4.028351257922028e-06, - "loss": 1.4069, - "num_input_tokens_seen": 206727119, + "epoch": 3.831839282855771, + "grad_norm": 5.747461795806885, + "learning_rate": 1.1681707219465345e-05, + "loss": 1.0087, + "num_input_tokens_seen": 207254056, "step": 383000 }, { - "epoch": 9.2063568273478, - "grad_norm": 6.545021057128906, - "learning_rate": 3.968335893988861e-06, - "loss": 1.4141, - "num_input_tokens_seen": 206997471, + "epoch": 3.836841684008324, + "grad_norm": 5.281491279602051, + "learning_rate": 1.1631683207939812e-05, + "loss": 1.0027, + "num_input_tokens_seen": 207532144, "step": 383500 }, { - "epoch": 9.218359900134434, - "grad_norm": 7.558315277099609, - "learning_rate": 3.908320530055694e-06, - "loss": 1.395, - "num_input_tokens_seen": 207270519, + "epoch": 3.8418440851608775, + "grad_norm": 6.745446681976318, + "learning_rate": 1.158165919641428e-05, + "loss": 1.0079, + "num_input_tokens_seen": 207801576, "step": 384000 }, { - "epoch": 9.230362972921068, - "grad_norm": 6.40931510925293, - "learning_rate": 3.848305166122527e-06, - "loss": 1.3949, - "num_input_tokens_seen": 207538255, + "epoch": 3.8468464863134306, + "grad_norm": 6.459105968475342, + "learning_rate": 1.1531635184888748e-05, + "loss": 1.0094, + "num_input_tokens_seen": 208065512, "step": 384500 }, { - "epoch": 9.242366045707701, - "grad_norm": 6.304757118225098, - "learning_rate": 3.7882898021893604e-06, - "loss": 1.4099, - "num_input_tokens_seen": 207806191, + "epoch": 3.8518488874659838, + "grad_norm": 5.599144458770752, + "learning_rate": 1.1481611173363215e-05, + "loss": 0.9959, + "num_input_tokens_seen": 208339576, "step": 385000 }, { - "epoch": 9.254369118494335, - "grad_norm": 6.455826759338379, - "learning_rate": 3.7282744382561933e-06, - "loss": 1.3895, - "num_input_tokens_seen": 208076391, + "epoch": 3.856851288618537, + "grad_norm": 7.585973739624023, + "learning_rate": 1.1431587161837683e-05, + "loss": 1.0037, + "num_input_tokens_seen": 208617584, "step": 385500 }, { - "epoch": 9.266372191280968, - "grad_norm": 6.353752613067627, - "learning_rate": 3.668259074323027e-06, - "loss": 1.4217, - "num_input_tokens_seen": 208348879, + "epoch": 3.86185368977109, + "grad_norm": 6.892619609832764, + "learning_rate": 1.1381563150312151e-05, + "loss": 0.9997, + "num_input_tokens_seen": 208880256, "step": 386000 }, { - "epoch": 9.278375264067602, - "grad_norm": 6.472243309020996, - "learning_rate": 3.60824371038986e-06, - "loss": 1.3807, - "num_input_tokens_seen": 208617295, + "epoch": 3.866856090923643, + "grad_norm": 7.016179084777832, + "learning_rate": 1.1331539138786618e-05, + "loss": 1.0165, + "num_input_tokens_seen": 209152592, "step": 386500 }, { - "epoch": 9.290378336854234, - "grad_norm": 5.947717666625977, - "learning_rate": 3.5482283464566934e-06, - "loss": 1.408, - "num_input_tokens_seen": 208886695, + "epoch": 3.8718584920761967, + "grad_norm": 7.874701976776123, + "learning_rate": 1.1281515127261086e-05, + "loss": 0.9907, + "num_input_tokens_seen": 209423248, "step": 387000 }, { - "epoch": 9.302381409640867, - "grad_norm": 6.497421741485596, - "learning_rate": 3.4882129825235263e-06, - "loss": 1.3996, - "num_input_tokens_seen": 209159135, + "epoch": 3.87686089322875, + "grad_norm": 5.5261030197143555, + "learning_rate": 1.1231491115735554e-05, + "loss": 1.0029, + "num_input_tokens_seen": 209692384, "step": 387500 }, { - "epoch": 9.314384482427501, - "grad_norm": 6.405847072601318, - "learning_rate": 3.428197618590359e-06, - "loss": 1.3721, - "num_input_tokens_seen": 209430351, + "epoch": 3.881863294381303, + "grad_norm": 8.382930755615234, + "learning_rate": 1.118146710421002e-05, + "loss": 0.9865, + "num_input_tokens_seen": 209961656, "step": 388000 }, { - "epoch": 9.326387555214135, - "grad_norm": 6.963627815246582, - "learning_rate": 3.368182254657192e-06, - "loss": 1.389, - "num_input_tokens_seen": 209696783, + "epoch": 3.8868656955338565, + "grad_norm": 4.629281044006348, + "learning_rate": 1.1131443092684489e-05, + "loss": 0.995, + "num_input_tokens_seen": 210227928, "step": 388500 }, { - "epoch": 9.338390628000768, - "grad_norm": 6.220207214355469, - "learning_rate": 3.308166890724026e-06, - "loss": 1.3948, - "num_input_tokens_seen": 209970047, + "epoch": 3.8918680966864097, + "grad_norm": 4.65085506439209, + "learning_rate": 1.1081419081158957e-05, + "loss": 0.9815, + "num_input_tokens_seen": 210494672, "step": 389000 }, { - "epoch": 9.350393700787402, - "grad_norm": 6.8149495124816895, - "learning_rate": 3.248151526790859e-06, - "loss": 1.3974, - "num_input_tokens_seen": 210238103, + "epoch": 3.8968704978389628, + "grad_norm": 5.350659370422363, + "learning_rate": 1.1031395069633424e-05, + "loss": 0.9988, + "num_input_tokens_seen": 210757368, "step": 389500 }, { - "epoch": 9.362396773574035, - "grad_norm": 6.586900234222412, - "learning_rate": 3.1881361628576917e-06, - "loss": 1.3851, - "num_input_tokens_seen": 210508279, + "epoch": 3.901872898991516, + "grad_norm": 6.074803829193115, + "learning_rate": 1.0981371058107892e-05, + "loss": 0.9996, + "num_input_tokens_seen": 211028544, "step": 390000 }, { - "epoch": 9.374399846360669, - "grad_norm": 5.532411098480225, - "learning_rate": 3.1281207989245246e-06, - "loss": 1.4088, - "num_input_tokens_seen": 210779743, + "epoch": 3.906875300144069, + "grad_norm": 5.179644584655762, + "learning_rate": 1.093134704658236e-05, + "loss": 0.99, + "num_input_tokens_seen": 211297312, "step": 390500 }, { - "epoch": 9.3864029191473, - "grad_norm": 6.0678510665893555, - "learning_rate": 3.068105434991358e-06, - "loss": 1.4024, - "num_input_tokens_seen": 211045503, + "epoch": 3.911877701296622, + "grad_norm": 6.687560081481934, + "learning_rate": 1.0881323035056828e-05, + "loss": 1.0047, + "num_input_tokens_seen": 211563536, "step": 391000 }, { - "epoch": 9.398405991933934, - "grad_norm": 7.183020114898682, - "learning_rate": 3.0080900710581913e-06, - "loss": 1.4063, - "num_input_tokens_seen": 211314991, + "epoch": 3.9168801024491757, + "grad_norm": 5.4230570793151855, + "learning_rate": 1.0831299023531295e-05, + "loss": 1.0096, + "num_input_tokens_seen": 211837976, "step": 391500 }, { - "epoch": 9.410409064720568, - "grad_norm": 6.26782751083374, - "learning_rate": 2.9480747071250242e-06, - "loss": 1.4126, - "num_input_tokens_seen": 211585143, + "epoch": 3.921882503601729, + "grad_norm": 5.730657577514648, + "learning_rate": 1.0781275012005763e-05, + "loss": 1.0044, + "num_input_tokens_seen": 212105824, "step": 392000 }, { - "epoch": 9.422412137507202, - "grad_norm": 6.958585739135742, - "learning_rate": 2.888059343191857e-06, - "loss": 1.4001, - "num_input_tokens_seen": 211850743, + "epoch": 3.926884904754282, + "grad_norm": 6.501159191131592, + "learning_rate": 1.0731251000480231e-05, + "loss": 0.9933, + "num_input_tokens_seen": 212377128, "step": 392500 }, { - "epoch": 9.434415210293835, - "grad_norm": 5.788858413696289, - "learning_rate": 2.8280439792586905e-06, - "loss": 1.4133, - "num_input_tokens_seen": 212128335, + "epoch": 3.9318873059068356, + "grad_norm": 6.957518100738525, + "learning_rate": 1.0681226988954698e-05, + "loss": 0.9987, + "num_input_tokens_seen": 212653848, "step": 393000 }, { - "epoch": 9.446418283080469, - "grad_norm": 6.290924549102783, - "learning_rate": 2.7680286153255234e-06, - "loss": 1.4022, - "num_input_tokens_seen": 212399719, + "epoch": 3.9368897070593887, + "grad_norm": 6.272824287414551, + "learning_rate": 1.0631202977429166e-05, + "loss": 0.9901, + "num_input_tokens_seen": 212924520, "step": 393500 }, { - "epoch": 9.458421355867102, - "grad_norm": 5.750514984130859, - "learning_rate": 2.7080132513923563e-06, - "loss": 1.3992, - "num_input_tokens_seen": 212673503, + "epoch": 3.941892108211942, + "grad_norm": 7.048558712005615, + "learning_rate": 1.0581178965903634e-05, + "loss": 0.9943, + "num_input_tokens_seen": 213198256, "step": 394000 }, { - "epoch": 9.470424428653736, - "grad_norm": 6.040570259094238, - "learning_rate": 2.6479978874591896e-06, - "loss": 1.3963, - "num_input_tokens_seen": 212939887, + "epoch": 3.946894509364495, + "grad_norm": 6.269680976867676, + "learning_rate": 1.0531154954378101e-05, + "loss": 0.9846, + "num_input_tokens_seen": 213471024, "step": 394500 }, { - "epoch": 9.48242750144037, - "grad_norm": 6.502077102661133, - "learning_rate": 2.587982523526023e-06, - "loss": 1.3934, - "num_input_tokens_seen": 213209959, + "epoch": 3.951896910517048, + "grad_norm": 5.69096565246582, + "learning_rate": 1.048113094285257e-05, + "loss": 0.9993, + "num_input_tokens_seen": 213740224, "step": 395000 }, { - "epoch": 9.494430574227001, - "grad_norm": 8.491873741149902, - "learning_rate": 2.527967159592856e-06, - "loss": 1.3911, - "num_input_tokens_seen": 213481599, + "epoch": 3.956899311669601, + "grad_norm": 6.711835861206055, + "learning_rate": 1.0431106931327038e-05, + "loss": 0.9994, + "num_input_tokens_seen": 214004440, "step": 395500 }, { - "epoch": 9.506433647013635, - "grad_norm": 7.506811618804932, - "learning_rate": 2.4679517956596892e-06, - "loss": 1.4139, - "num_input_tokens_seen": 213754591, + "epoch": 3.9619017128221548, + "grad_norm": 9.549476623535156, + "learning_rate": 1.0381082919801504e-05, + "loss": 0.9853, + "num_input_tokens_seen": 214275152, "step": 396000 }, { - "epoch": 9.518436719800269, - "grad_norm": 7.821685314178467, - "learning_rate": 2.407936431726522e-06, - "loss": 1.4036, - "num_input_tokens_seen": 214027127, + "epoch": 3.966904113974708, + "grad_norm": 6.297016620635986, + "learning_rate": 1.0331058908275972e-05, + "loss": 0.9931, + "num_input_tokens_seen": 214549672, "step": 396500 }, { - "epoch": 9.530439792586902, - "grad_norm": 6.207556247711182, - "learning_rate": 2.347921067793355e-06, - "loss": 1.4121, - "num_input_tokens_seen": 214299919, + "epoch": 3.971906515127261, + "grad_norm": 5.232683181762695, + "learning_rate": 1.028103489675044e-05, + "loss": 0.9943, + "num_input_tokens_seen": 214823176, "step": 397000 }, { - "epoch": 9.542442865373536, - "grad_norm": 6.1572065353393555, - "learning_rate": 2.2879057038601884e-06, - "loss": 1.3913, - "num_input_tokens_seen": 214571943, + "epoch": 3.9769089162798146, + "grad_norm": 6.6180219650268555, + "learning_rate": 1.0231010885224907e-05, + "loss": 0.9965, + "num_input_tokens_seen": 215092304, "step": 397500 }, { - "epoch": 9.55444593816017, - "grad_norm": 7.173264026641846, - "learning_rate": 2.2278903399270213e-06, - "loss": 1.3868, - "num_input_tokens_seen": 214837503, + "epoch": 3.9819113174323677, + "grad_norm": 6.272809982299805, + "learning_rate": 1.0180986873699375e-05, + "loss": 0.9773, + "num_input_tokens_seen": 215364504, "step": 398000 }, { - "epoch": 9.566449010946803, - "grad_norm": 5.7680463790893555, - "learning_rate": 2.1678749759938542e-06, - "loss": 1.3877, - "num_input_tokens_seen": 215109519, + "epoch": 3.986913718584921, + "grad_norm": 6.778554916381836, + "learning_rate": 1.0130962862173844e-05, + "loss": 1.0018, + "num_input_tokens_seen": 215636456, "step": 398500 }, { - "epoch": 9.578452083733435, - "grad_norm": 5.968242645263672, - "learning_rate": 2.1078596120606876e-06, - "loss": 1.3923, - "num_input_tokens_seen": 215378551, + "epoch": 3.991916119737474, + "grad_norm": 5.8704071044921875, + "learning_rate": 1.0080938850648312e-05, + "loss": 0.9982, + "num_input_tokens_seen": 215902264, "step": 399000 }, { - "epoch": 9.590455156520068, - "grad_norm": 5.630865573883057, - "learning_rate": 2.047844248127521e-06, - "loss": 1.4111, - "num_input_tokens_seen": 215645975, + "epoch": 3.996918520890027, + "grad_norm": 6.430477142333984, + "learning_rate": 1.0030914839122778e-05, + "loss": 0.9897, + "num_input_tokens_seen": 216171688, "step": 399500 }, { - "epoch": 9.602458229306702, - "grad_norm": 6.464187145233154, - "learning_rate": 1.987828884194354e-06, - "loss": 1.3897, - "num_input_tokens_seen": 215916775, + "epoch": 4.0, + "eval_loss": 1.0106587409973145, + "eval_runtime": 189.1748, + "eval_samples_per_second": 1056.721, + "eval_steps_per_second": 132.095, + "num_input_tokens_seen": 216341560, + "step": 399808 + }, + { + "epoch": 4.00192092204258, + "grad_norm": 6.6500749588012695, + "learning_rate": 9.980890827597247e-06, + "loss": 0.9608, + "num_input_tokens_seen": 216448456, "step": 400000 }, { - "epoch": 9.614461302093336, - "grad_norm": 7.595790386199951, - "learning_rate": 1.927813520261187e-06, - "loss": 1.3831, - "num_input_tokens_seen": 216185999, + "epoch": 4.006923323195133, + "grad_norm": 5.360565662384033, + "learning_rate": 9.930866816071715e-06, + "loss": 0.9449, + "num_input_tokens_seen": 216721080, "step": 400500 }, { - "epoch": 9.62646437487997, - "grad_norm": 5.683437347412109, - "learning_rate": 1.86779815632802e-06, - "loss": 1.398, - "num_input_tokens_seen": 216457375, + "epoch": 4.0119257243476865, + "grad_norm": 9.556127548217773, + "learning_rate": 9.880842804546183e-06, + "loss": 0.9209, + "num_input_tokens_seen": 216986544, "step": 401000 }, { - "epoch": 9.638467447666603, - "grad_norm": 6.96507453918457, - "learning_rate": 1.807782792394853e-06, - "loss": 1.3921, - "num_input_tokens_seen": 216726543, + "epoch": 4.0169281255002405, + "grad_norm": 5.1922502517700195, + "learning_rate": 9.830818793020651e-06, + "loss": 0.9175, + "num_input_tokens_seen": 217262208, "step": 401500 }, { - "epoch": 9.650470520453236, - "grad_norm": 5.61997652053833, - "learning_rate": 1.7477674284616863e-06, - "loss": 1.4043, - "num_input_tokens_seen": 216998639, + "epoch": 4.021930526652794, + "grad_norm": 6.372469902038574, + "learning_rate": 9.780794781495118e-06, + "loss": 0.9079, + "num_input_tokens_seen": 217532376, "step": 402000 }, { - "epoch": 9.66247359323987, - "grad_norm": 6.20845890045166, - "learning_rate": 1.6877520645285195e-06, - "loss": 1.3875, - "num_input_tokens_seen": 217262495, + "epoch": 4.026932927805347, + "grad_norm": 5.694700717926025, + "learning_rate": 9.730770769969586e-06, + "loss": 0.9205, + "num_input_tokens_seen": 217800456, "step": 402500 }, { - "epoch": 9.674476666026504, - "grad_norm": 7.56028938293457, - "learning_rate": 1.6277367005953524e-06, - "loss": 1.3973, - "num_input_tokens_seen": 217529887, + "epoch": 4.0319353289579, + "grad_norm": 7.39500617980957, + "learning_rate": 9.680746758444054e-06, + "loss": 0.9262, + "num_input_tokens_seen": 218065808, "step": 403000 }, { - "epoch": 9.686479738813135, - "grad_norm": 6.721691608428955, - "learning_rate": 1.5677213366621857e-06, - "loss": 1.4062, - "num_input_tokens_seen": 217798863, + "epoch": 4.036937730110453, + "grad_norm": 5.675652980804443, + "learning_rate": 9.630722746918523e-06, + "loss": 0.9128, + "num_input_tokens_seen": 218330520, "step": 403500 }, { - "epoch": 9.698482811599769, - "grad_norm": 6.301656246185303, - "learning_rate": 1.5077059727290186e-06, - "loss": 1.4002, - "num_input_tokens_seen": 218067199, + "epoch": 4.041940131263006, + "grad_norm": 4.7625203132629395, + "learning_rate": 9.58069873539299e-06, + "loss": 0.9438, + "num_input_tokens_seen": 218603792, "step": 404000 }, { - "epoch": 9.710485884386403, - "grad_norm": 6.369684219360352, - "learning_rate": 1.4476906087958518e-06, - "loss": 1.3843, - "num_input_tokens_seen": 218336447, + "epoch": 4.046942532415559, + "grad_norm": 5.834499835968018, + "learning_rate": 9.530674723867457e-06, + "loss": 0.9191, + "num_input_tokens_seen": 218870800, "step": 404500 }, { - "epoch": 9.722488957173036, - "grad_norm": 7.407693862915039, - "learning_rate": 1.387675244862685e-06, - "loss": 1.406, - "num_input_tokens_seen": 218606543, + "epoch": 4.051944933568112, + "grad_norm": 7.351385116577148, + "learning_rate": 9.480650712341926e-06, + "loss": 0.9257, + "num_input_tokens_seen": 219142384, "step": 405000 }, { - "epoch": 9.73449202995967, - "grad_norm": 7.671813011169434, - "learning_rate": 1.327659880929518e-06, - "loss": 1.4085, - "num_input_tokens_seen": 218879271, + "epoch": 4.0569473347206655, + "grad_norm": 7.003774166107178, + "learning_rate": 9.430626700816392e-06, + "loss": 0.935, + "num_input_tokens_seen": 219404752, "step": 405500 }, { - "epoch": 9.746495102746303, - "grad_norm": 7.748348236083984, - "learning_rate": 1.2676445169963512e-06, - "loss": 1.415, - "num_input_tokens_seen": 219151943, + "epoch": 4.0619497358732195, + "grad_norm": 6.0088019371032715, + "learning_rate": 9.38060268929086e-06, + "loss": 0.9307, + "num_input_tokens_seen": 219675200, "step": 406000 }, { - "epoch": 9.758498175532937, - "grad_norm": 6.55049467086792, - "learning_rate": 1.2076291530631843e-06, - "loss": 1.3914, - "num_input_tokens_seen": 219422079, + "epoch": 4.066952137025773, + "grad_norm": 6.115697383880615, + "learning_rate": 9.330578677765329e-06, + "loss": 0.9281, + "num_input_tokens_seen": 219950728, "step": 406500 }, { - "epoch": 9.77050124831957, - "grad_norm": 6.072712421417236, - "learning_rate": 1.1476137891300174e-06, - "loss": 1.4189, - "num_input_tokens_seen": 219696047, + "epoch": 4.071954538178326, + "grad_norm": 5.1921868324279785, + "learning_rate": 9.280554666239795e-06, + "loss": 0.9391, + "num_input_tokens_seen": 220221608, "step": 407000 }, { - "epoch": 9.782504321106202, - "grad_norm": 7.149533748626709, - "learning_rate": 1.0875984251968503e-06, - "loss": 1.4054, - "num_input_tokens_seen": 219963863, + "epoch": 4.076956939330879, + "grad_norm": 6.665195465087891, + "learning_rate": 9.230530654714263e-06, + "loss": 0.9296, + "num_input_tokens_seen": 220497552, "step": 407500 }, { - "epoch": 9.794507393892836, - "grad_norm": 6.547571182250977, - "learning_rate": 1.0275830612636835e-06, - "loss": 1.4015, - "num_input_tokens_seen": 220228039, + "epoch": 4.081959340483432, + "grad_norm": 6.402169704437256, + "learning_rate": 9.180506643188732e-06, + "loss": 0.9199, + "num_input_tokens_seen": 220764032, "step": 408000 }, { - "epoch": 9.80651046667947, - "grad_norm": 7.683659553527832, - "learning_rate": 9.675676973305168e-07, - "loss": 1.3941, - "num_input_tokens_seen": 220494191, + "epoch": 4.086961741635985, + "grad_norm": 5.718327045440674, + "learning_rate": 9.1304826316632e-06, + "loss": 0.927, + "num_input_tokens_seen": 221033992, "step": 408500 }, { - "epoch": 9.818513539466103, - "grad_norm": 6.3097968101501465, - "learning_rate": 9.075523333973497e-07, - "loss": 1.4145, - "num_input_tokens_seen": 220773871, + "epoch": 4.091964142788538, + "grad_norm": 6.588100433349609, + "learning_rate": 9.080458620137667e-06, + "loss": 0.9374, + "num_input_tokens_seen": 221308640, "step": 409000 }, { - "epoch": 9.830516612252737, - "grad_norm": 7.043195724487305, - "learning_rate": 8.475369694641828e-07, - "loss": 1.3883, - "num_input_tokens_seen": 221042759, + "epoch": 4.096966543941091, + "grad_norm": 5.30639123916626, + "learning_rate": 9.030434608612135e-06, + "loss": 0.9251, + "num_input_tokens_seen": 221581024, "step": 409500 }, { - "epoch": 9.84251968503937, - "grad_norm": 6.251890659332275, - "learning_rate": 7.875216055310161e-07, - "loss": 1.4021, - "num_input_tokens_seen": 221314167, + "epoch": 4.1019689450936445, + "grad_norm": 4.8298821449279785, + "learning_rate": 8.980410597086603e-06, + "loss": 0.9137, + "num_input_tokens_seen": 221842152, "step": 410000 }, { - "epoch": 9.854522757826004, - "grad_norm": 6.094542026519775, - "learning_rate": 7.275062415978491e-07, - "loss": 1.4124, - "num_input_tokens_seen": 221584919, + "epoch": 4.1069713462461985, + "grad_norm": 5.2428483963012695, + "learning_rate": 8.93038658556107e-06, + "loss": 0.9333, + "num_input_tokens_seen": 222117312, "step": 410500 }, { - "epoch": 9.866525830612638, - "grad_norm": 5.8496575355529785, - "learning_rate": 6.674908776646822e-07, - "loss": 1.3981, - "num_input_tokens_seen": 221858023, + "epoch": 4.111973747398752, + "grad_norm": 7.0200114250183105, + "learning_rate": 8.880362574035538e-06, + "loss": 0.914, + "num_input_tokens_seen": 222386728, "step": 411000 }, { - "epoch": 9.878528903399271, - "grad_norm": 5.964223861694336, - "learning_rate": 6.074755137315152e-07, - "loss": 1.4071, - "num_input_tokens_seen": 222127271, + "epoch": 4.116976148551305, + "grad_norm": 5.761682510375977, + "learning_rate": 8.830338562510006e-06, + "loss": 0.9236, + "num_input_tokens_seen": 222656968, "step": 411500 }, { - "epoch": 9.890531976185903, - "grad_norm": 8.775946617126465, - "learning_rate": 5.474601497983484e-07, - "loss": 1.421, - "num_input_tokens_seen": 222398287, + "epoch": 4.121978549703858, + "grad_norm": 5.959192752838135, + "learning_rate": 8.780314550984473e-06, + "loss": 0.9383, + "num_input_tokens_seen": 222931472, "step": 412000 }, { - "epoch": 9.902535048972537, - "grad_norm": 6.631856441497803, - "learning_rate": 4.874447858651815e-07, - "loss": 1.3993, - "num_input_tokens_seen": 222670727, + "epoch": 4.126980950856411, + "grad_norm": 7.374218940734863, + "learning_rate": 8.73029053945894e-06, + "loss": 0.918, + "num_input_tokens_seen": 223197176, "step": 412500 }, { - "epoch": 9.91453812175917, - "grad_norm": 6.73166036605835, - "learning_rate": 4.274294219320146e-07, - "loss": 1.4149, - "num_input_tokens_seen": 222944559, + "epoch": 4.131983352008964, + "grad_norm": 5.401734352111816, + "learning_rate": 8.680266527933409e-06, + "loss": 0.9134, + "num_input_tokens_seen": 223473808, "step": 413000 }, { - "epoch": 9.926541194545804, - "grad_norm": 5.887240409851074, - "learning_rate": 3.6741405799884775e-07, - "loss": 1.3984, - "num_input_tokens_seen": 223216095, + "epoch": 4.136985753161517, + "grad_norm": 5.810543537139893, + "learning_rate": 8.630242516407876e-06, + "loss": 0.9219, + "num_input_tokens_seen": 223749560, "step": 413500 }, { - "epoch": 9.938544267332437, - "grad_norm": 5.728778839111328, - "learning_rate": 3.0739869406568083e-07, - "loss": 1.3933, - "num_input_tokens_seen": 223485895, + "epoch": 4.14198815431407, + "grad_norm": 6.911441326141357, + "learning_rate": 8.580218504882344e-06, + "loss": 0.9371, + "num_input_tokens_seen": 224026048, "step": 414000 }, { - "epoch": 9.950547340119071, - "grad_norm": 5.5044636726379395, - "learning_rate": 2.4738333013251396e-07, - "loss": 1.3924, - "num_input_tokens_seen": 223750791, + "epoch": 4.146990555466624, + "grad_norm": 5.81462287902832, + "learning_rate": 8.530194493356812e-06, + "loss": 0.922, + "num_input_tokens_seen": 224296456, "step": 414500 }, { - "epoch": 9.962550412905705, - "grad_norm": 6.243234157562256, - "learning_rate": 1.8736796619934703e-07, - "loss": 1.3945, - "num_input_tokens_seen": 224018287, + "epoch": 4.1519929566191776, + "grad_norm": 6.728647232055664, + "learning_rate": 8.480170481831279e-06, + "loss": 0.9261, + "num_input_tokens_seen": 224566384, "step": 415000 }, { - "epoch": 9.974553485692336, - "grad_norm": 7.025062561035156, - "learning_rate": 1.2735260226618013e-07, - "loss": 1.3998, - "num_input_tokens_seen": 224292607, + "epoch": 4.156995357771731, + "grad_norm": 6.1371564865112305, + "learning_rate": 8.430146470305747e-06, + "loss": 0.9286, + "num_input_tokens_seen": 224841952, "step": 415500 }, { - "epoch": 9.98655655847897, - "grad_norm": 6.2767486572265625, - "learning_rate": 6.733723833301326e-08, - "loss": 1.396, - "num_input_tokens_seen": 224563407, + "epoch": 4.161997758924284, + "grad_norm": 7.179012775421143, + "learning_rate": 8.380122458780215e-06, + "loss": 0.9196, + "num_input_tokens_seen": 225112416, "step": 416000 }, { - "epoch": 9.998559631265604, - "grad_norm": 6.174375057220459, - "learning_rate": 7.321874399846361e-09, - "loss": 1.3969, - "num_input_tokens_seen": 224833207, + "epoch": 4.167000160076837, + "grad_norm": 6.620611667633057, + "learning_rate": 8.330098447254682e-06, + "loss": 0.9358, + "num_input_tokens_seen": 225389344, "step": 416500 }, { - "epoch": 10.0, - "eval_loss": 1.8425856828689575, - "eval_runtime": 78.6166, - "eval_samples_per_second": 1059.712, - "eval_steps_per_second": 132.466, - "num_input_tokens_seen": 224864981, - "step": 416560 - }, - { - "epoch": 10.0, - "num_input_tokens_seen": 224864981, - "step": 416560, - "total_flos": 5.955121238645146e+16, - "train_loss": 1.7257680629754402, - "train_runtime": 16392.488, - "train_samples_per_second": 203.29, - "train_steps_per_second": 25.412, - "train_tokens_per_second": 13715.035 + "epoch": 4.17200256122939, + "grad_norm": 6.2398905754089355, + "learning_rate": 8.28007443572915e-06, + "loss": 0.9178, + "num_input_tokens_seen": 225663376, + "step": 417000 + }, + { + "epoch": 4.177004962381943, + "grad_norm": 6.071476936340332, + "learning_rate": 8.230050424203618e-06, + "loss": 0.9319, + "num_input_tokens_seen": 225931480, + "step": 417500 + }, + { + "epoch": 4.182007363534496, + "grad_norm": 5.602684497833252, + "learning_rate": 8.180026412678086e-06, + "loss": 0.9326, + "num_input_tokens_seen": 226198008, + "step": 418000 + }, + { + "epoch": 4.187009764687049, + "grad_norm": 6.832788944244385, + "learning_rate": 8.130002401152553e-06, + "loss": 0.929, + "num_input_tokens_seen": 226466280, + "step": 418500 + }, + { + "epoch": 4.1920121658396035, + "grad_norm": 6.3188862800598145, + "learning_rate": 8.079978389627021e-06, + "loss": 0.94, + "num_input_tokens_seen": 226741632, + "step": 419000 + }, + { + "epoch": 4.197014566992157, + "grad_norm": 5.0520548820495605, + "learning_rate": 8.02995437810149e-06, + "loss": 0.9327, + "num_input_tokens_seen": 227006288, + "step": 419500 + }, + { + "epoch": 4.20201696814471, + "grad_norm": 6.5376410484313965, + "learning_rate": 7.979930366575956e-06, + "loss": 0.9042, + "num_input_tokens_seen": 227281352, + "step": 420000 + }, + { + "epoch": 4.207019369297263, + "grad_norm": 5.164760112762451, + "learning_rate": 7.929906355050424e-06, + "loss": 0.9231, + "num_input_tokens_seen": 227542968, + "step": 420500 + }, + { + "epoch": 4.212021770449816, + "grad_norm": 9.208584785461426, + "learning_rate": 7.879882343524892e-06, + "loss": 0.9333, + "num_input_tokens_seen": 227813704, + "step": 421000 + }, + { + "epoch": 4.217024171602369, + "grad_norm": 5.241026878356934, + "learning_rate": 7.829858331999359e-06, + "loss": 0.9278, + "num_input_tokens_seen": 228083120, + "step": 421500 + }, + { + "epoch": 4.222026572754922, + "grad_norm": 6.501145839691162, + "learning_rate": 7.779834320473827e-06, + "loss": 0.9219, + "num_input_tokens_seen": 228352616, + "step": 422000 + }, + { + "epoch": 4.227028973907475, + "grad_norm": 4.962836742401123, + "learning_rate": 7.729810308948296e-06, + "loss": 0.9176, + "num_input_tokens_seen": 228615376, + "step": 422500 + }, + { + "epoch": 4.2320313750600285, + "grad_norm": 5.714748859405518, + "learning_rate": 7.679786297422762e-06, + "loss": 0.9459, + "num_input_tokens_seen": 228890544, + "step": 423000 + }, + { + "epoch": 4.2370337762125825, + "grad_norm": 5.394798755645752, + "learning_rate": 7.62976228589723e-06, + "loss": 0.9305, + "num_input_tokens_seen": 229158352, + "step": 423500 + }, + { + "epoch": 4.242036177365136, + "grad_norm": 7.450530529022217, + "learning_rate": 7.579738274371699e-06, + "loss": 0.9342, + "num_input_tokens_seen": 229430896, + "step": 424000 + }, + { + "epoch": 4.247038578517689, + "grad_norm": 6.761574745178223, + "learning_rate": 7.529714262846166e-06, + "loss": 0.9324, + "num_input_tokens_seen": 229692632, + "step": 424500 + }, + { + "epoch": 4.252040979670242, + "grad_norm": 6.781697750091553, + "learning_rate": 7.479690251320634e-06, + "loss": 0.9299, + "num_input_tokens_seen": 229963336, + "step": 425000 + }, + { + "epoch": 4.257043380822795, + "grad_norm": 6.029842376708984, + "learning_rate": 7.4296662397951024e-06, + "loss": 0.9411, + "num_input_tokens_seen": 230235128, + "step": 425500 + }, + { + "epoch": 4.262045781975348, + "grad_norm": 6.796103477478027, + "learning_rate": 7.379642228269571e-06, + "loss": 0.9119, + "num_input_tokens_seen": 230508152, + "step": 426000 + }, + { + "epoch": 4.267048183127901, + "grad_norm": 5.397275447845459, + "learning_rate": 7.329618216744037e-06, + "loss": 0.9287, + "num_input_tokens_seen": 230778056, + "step": 426500 + }, + { + "epoch": 4.272050584280454, + "grad_norm": 6.785423755645752, + "learning_rate": 7.2795942052185055e-06, + "loss": 0.9249, + "num_input_tokens_seen": 231050672, + "step": 427000 + }, + { + "epoch": 4.2770529854330075, + "grad_norm": 6.21229362487793, + "learning_rate": 7.229570193692974e-06, + "loss": 0.9293, + "num_input_tokens_seen": 231322192, + "step": 427500 + }, + { + "epoch": 4.2820553865855615, + "grad_norm": 5.904947757720947, + "learning_rate": 7.17954618216744e-06, + "loss": 0.9361, + "num_input_tokens_seen": 231593960, + "step": 428000 + }, + { + "epoch": 4.287057787738115, + "grad_norm": 4.485384464263916, + "learning_rate": 7.1295221706419085e-06, + "loss": 0.9359, + "num_input_tokens_seen": 231857176, + "step": 428500 + }, + { + "epoch": 4.292060188890668, + "grad_norm": 5.395241737365723, + "learning_rate": 7.079498159116377e-06, + "loss": 0.9382, + "num_input_tokens_seen": 232135880, + "step": 429000 + }, + { + "epoch": 4.297062590043221, + "grad_norm": 6.106403827667236, + "learning_rate": 7.029474147590843e-06, + "loss": 0.9266, + "num_input_tokens_seen": 232409736, + "step": 429500 + }, + { + "epoch": 4.302064991195774, + "grad_norm": 5.79823637008667, + "learning_rate": 6.9794501360653115e-06, + "loss": 0.9083, + "num_input_tokens_seen": 232681632, + "step": 430000 + }, + { + "epoch": 4.307067392348327, + "grad_norm": 5.688789367675781, + "learning_rate": 6.92942612453978e-06, + "loss": 0.9145, + "num_input_tokens_seen": 232953312, + "step": 430500 + }, + { + "epoch": 4.31206979350088, + "grad_norm": 5.097667694091797, + "learning_rate": 6.879402113014246e-06, + "loss": 0.9459, + "num_input_tokens_seen": 233217560, + "step": 431000 + }, + { + "epoch": 4.317072194653433, + "grad_norm": 5.180954456329346, + "learning_rate": 6.829378101488715e-06, + "loss": 0.9302, + "num_input_tokens_seen": 233491872, + "step": 431500 + }, + { + "epoch": 4.3220745958059865, + "grad_norm": 5.185079574584961, + "learning_rate": 6.779354089963183e-06, + "loss": 0.9304, + "num_input_tokens_seen": 233757296, + "step": 432000 + }, + { + "epoch": 4.3270769969585405, + "grad_norm": 5.8646464347839355, + "learning_rate": 6.729330078437649e-06, + "loss": 0.917, + "num_input_tokens_seen": 234033848, + "step": 432500 + }, + { + "epoch": 4.332079398111094, + "grad_norm": 4.718979358673096, + "learning_rate": 6.679306066912118e-06, + "loss": 0.9355, + "num_input_tokens_seen": 234301952, + "step": 433000 + }, + { + "epoch": 4.337081799263647, + "grad_norm": 5.194594383239746, + "learning_rate": 6.629282055386586e-06, + "loss": 0.9178, + "num_input_tokens_seen": 234570432, + "step": 433500 + }, + { + "epoch": 4.3420842004162, + "grad_norm": 6.157474994659424, + "learning_rate": 6.579258043861053e-06, + "loss": 0.9306, + "num_input_tokens_seen": 234848568, + "step": 434000 + }, + { + "epoch": 4.347086601568753, + "grad_norm": 5.017276763916016, + "learning_rate": 6.5292340323355215e-06, + "loss": 0.9266, + "num_input_tokens_seen": 235120480, + "step": 434500 + }, + { + "epoch": 4.352089002721306, + "grad_norm": 6.485071659088135, + "learning_rate": 6.479210020809989e-06, + "loss": 0.9245, + "num_input_tokens_seen": 235385120, + "step": 435000 + }, + { + "epoch": 4.357091403873859, + "grad_norm": 6.405189514160156, + "learning_rate": 6.429186009284457e-06, + "loss": 0.9148, + "num_input_tokens_seen": 235653424, + "step": 435500 + }, + { + "epoch": 4.362093805026412, + "grad_norm": 7.216737747192383, + "learning_rate": 6.3791619977589245e-06, + "loss": 0.9171, + "num_input_tokens_seen": 235927792, + "step": 436000 + }, + { + "epoch": 4.3670962061789655, + "grad_norm": 5.484450340270996, + "learning_rate": 6.329137986233393e-06, + "loss": 0.9253, + "num_input_tokens_seen": 236198512, + "step": 436500 + }, + { + "epoch": 4.37209860733152, + "grad_norm": 5.4462971687316895, + "learning_rate": 6.279113974707861e-06, + "loss": 0.9052, + "num_input_tokens_seen": 236470920, + "step": 437000 + }, + { + "epoch": 4.377101008484073, + "grad_norm": 6.061979293823242, + "learning_rate": 6.229089963182328e-06, + "loss": 0.9344, + "num_input_tokens_seen": 236743128, + "step": 437500 + }, + { + "epoch": 4.382103409636626, + "grad_norm": 7.572735786437988, + "learning_rate": 6.179065951656796e-06, + "loss": 0.9262, + "num_input_tokens_seen": 237019048, + "step": 438000 + }, + { + "epoch": 4.387105810789179, + "grad_norm": 5.743752956390381, + "learning_rate": 6.129041940131263e-06, + "loss": 0.9256, + "num_input_tokens_seen": 237287696, + "step": 438500 + }, + { + "epoch": 4.392108211941732, + "grad_norm": 7.014731407165527, + "learning_rate": 6.0790179286057314e-06, + "loss": 0.9242, + "num_input_tokens_seen": 237552080, + "step": 439000 + }, + { + "epoch": 4.397110613094285, + "grad_norm": 7.045165061950684, + "learning_rate": 6.028993917080199e-06, + "loss": 0.9256, + "num_input_tokens_seen": 237822224, + "step": 439500 + }, + { + "epoch": 4.402113014246838, + "grad_norm": 9.347033500671387, + "learning_rate": 5.978969905554666e-06, + "loss": 0.9339, + "num_input_tokens_seen": 238090088, + "step": 440000 + }, + { + "epoch": 4.4071154153993914, + "grad_norm": 5.694771766662598, + "learning_rate": 5.9289458940291345e-06, + "loss": 0.9289, + "num_input_tokens_seen": 238362352, + "step": 440500 + }, + { + "epoch": 4.412117816551945, + "grad_norm": 6.164538383483887, + "learning_rate": 5.878921882503602e-06, + "loss": 0.9315, + "num_input_tokens_seen": 238629408, + "step": 441000 + }, + { + "epoch": 4.417120217704499, + "grad_norm": 6.145501136779785, + "learning_rate": 5.82889787097807e-06, + "loss": 0.9265, + "num_input_tokens_seen": 238897488, + "step": 441500 + }, + { + "epoch": 4.422122618857052, + "grad_norm": 8.055036544799805, + "learning_rate": 5.7788738594525375e-06, + "loss": 0.9459, + "num_input_tokens_seen": 239173024, + "step": 442000 + }, + { + "epoch": 4.427125020009605, + "grad_norm": 6.102746963500977, + "learning_rate": 5.728849847927005e-06, + "loss": 0.9413, + "num_input_tokens_seen": 239441080, + "step": 442500 + }, + { + "epoch": 4.432127421162158, + "grad_norm": 5.538390159606934, + "learning_rate": 5.678825836401473e-06, + "loss": 0.9356, + "num_input_tokens_seen": 239711624, + "step": 443000 + }, + { + "epoch": 4.437129822314711, + "grad_norm": 6.128843307495117, + "learning_rate": 5.6288018248759405e-06, + "loss": 0.9201, + "num_input_tokens_seen": 239985480, + "step": 443500 + }, + { + "epoch": 4.442132223467264, + "grad_norm": 6.007194995880127, + "learning_rate": 5.578777813350408e-06, + "loss": 0.9176, + "num_input_tokens_seen": 240250312, + "step": 444000 + }, + { + "epoch": 4.447134624619817, + "grad_norm": 8.6255521774292, + "learning_rate": 5.528753801824876e-06, + "loss": 0.9216, + "num_input_tokens_seen": 240515680, + "step": 444500 + }, + { + "epoch": 4.4521370257723705, + "grad_norm": 6.955540180206299, + "learning_rate": 5.478729790299344e-06, + "loss": 0.9196, + "num_input_tokens_seen": 240785288, + "step": 445000 + }, + { + "epoch": 4.457139426924924, + "grad_norm": 7.786896228790283, + "learning_rate": 5.428705778773812e-06, + "loss": 0.9337, + "num_input_tokens_seen": 241060592, + "step": 445500 + }, + { + "epoch": 4.462141828077478, + "grad_norm": 5.2199482917785645, + "learning_rate": 5.378681767248279e-06, + "loss": 0.9356, + "num_input_tokens_seen": 241326064, + "step": 446000 + }, + { + "epoch": 4.467144229230031, + "grad_norm": 4.975681781768799, + "learning_rate": 5.328657755722747e-06, + "loss": 0.911, + "num_input_tokens_seen": 241596704, + "step": 446500 + }, + { + "epoch": 4.472146630382584, + "grad_norm": 4.91240119934082, + "learning_rate": 5.278633744197215e-06, + "loss": 0.9276, + "num_input_tokens_seen": 241870056, + "step": 447000 + }, + { + "epoch": 4.477149031535137, + "grad_norm": 7.170393466949463, + "learning_rate": 5.228609732671682e-06, + "loss": 0.9212, + "num_input_tokens_seen": 242140224, + "step": 447500 + }, + { + "epoch": 4.48215143268769, + "grad_norm": 5.982038497924805, + "learning_rate": 5.1785857211461505e-06, + "loss": 0.9112, + "num_input_tokens_seen": 242405736, + "step": 448000 + }, + { + "epoch": 4.487153833840243, + "grad_norm": 6.960501670837402, + "learning_rate": 5.128561709620618e-06, + "loss": 0.9284, + "num_input_tokens_seen": 242672696, + "step": 448500 + }, + { + "epoch": 4.492156234992796, + "grad_norm": 5.97189474105835, + "learning_rate": 5.078537698095086e-06, + "loss": 0.9304, + "num_input_tokens_seen": 242944136, + "step": 449000 + }, + { + "epoch": 4.4971586361453495, + "grad_norm": 5.7404704093933105, + "learning_rate": 5.028513686569554e-06, + "loss": 0.9166, + "num_input_tokens_seen": 243217160, + "step": 449500 + }, + { + "epoch": 4.502161037297903, + "grad_norm": 14.095989227294922, + "learning_rate": 4.978489675044022e-06, + "loss": 0.9364, + "num_input_tokens_seen": 243492064, + "step": 450000 + }, + { + "epoch": 4.507163438450457, + "grad_norm": 6.327740669250488, + "learning_rate": 4.928465663518489e-06, + "loss": 0.9334, + "num_input_tokens_seen": 243765824, + "step": 450500 + }, + { + "epoch": 4.51216583960301, + "grad_norm": 5.019825458526611, + "learning_rate": 4.878441651992957e-06, + "loss": 0.9183, + "num_input_tokens_seen": 244045424, + "step": 451000 + }, + { + "epoch": 4.517168240755563, + "grad_norm": 5.382749557495117, + "learning_rate": 4.828417640467425e-06, + "loss": 0.9183, + "num_input_tokens_seen": 244317544, + "step": 451500 + }, + { + "epoch": 4.522170641908116, + "grad_norm": 6.0456461906433105, + "learning_rate": 4.778393628941892e-06, + "loss": 0.9176, + "num_input_tokens_seen": 244587752, + "step": 452000 + }, + { + "epoch": 4.527173043060669, + "grad_norm": 6.2013983726501465, + "learning_rate": 4.7283696174163604e-06, + "loss": 0.9253, + "num_input_tokens_seen": 244852400, + "step": 452500 + }, + { + "epoch": 4.532175444213222, + "grad_norm": 5.575494766235352, + "learning_rate": 4.678345605890828e-06, + "loss": 0.9315, + "num_input_tokens_seen": 245130392, + "step": 453000 + }, + { + "epoch": 4.537177845365775, + "grad_norm": 6.855820178985596, + "learning_rate": 4.628321594365295e-06, + "loss": 0.9255, + "num_input_tokens_seen": 245393352, + "step": 453500 + }, + { + "epoch": 4.5421802465183285, + "grad_norm": 5.4364728927612305, + "learning_rate": 4.5782975828397635e-06, + "loss": 0.9327, + "num_input_tokens_seen": 245667248, + "step": 454000 + }, + { + "epoch": 4.547182647670882, + "grad_norm": 7.509527206420898, + "learning_rate": 4.528273571314231e-06, + "loss": 0.9207, + "num_input_tokens_seen": 245940192, + "step": 454500 + }, + { + "epoch": 4.552185048823436, + "grad_norm": 5.191705226898193, + "learning_rate": 4.478249559788699e-06, + "loss": 0.9231, + "num_input_tokens_seen": 246211320, + "step": 455000 + }, + { + "epoch": 4.557187449975989, + "grad_norm": 6.908538341522217, + "learning_rate": 4.4282255482631665e-06, + "loss": 0.9256, + "num_input_tokens_seen": 246488984, + "step": 455500 + }, + { + "epoch": 4.562189851128542, + "grad_norm": 6.262028694152832, + "learning_rate": 4.378201536737634e-06, + "loss": 0.9219, + "num_input_tokens_seen": 246764632, + "step": 456000 + }, + { + "epoch": 4.567192252281095, + "grad_norm": 6.5729475021362305, + "learning_rate": 4.328177525212102e-06, + "loss": 0.9244, + "num_input_tokens_seen": 247035024, + "step": 456500 + }, + { + "epoch": 4.572194653433648, + "grad_norm": 7.030519008636475, + "learning_rate": 4.2781535136865695e-06, + "loss": 0.9096, + "num_input_tokens_seen": 247301664, + "step": 457000 + }, + { + "epoch": 4.577197054586201, + "grad_norm": 5.72337532043457, + "learning_rate": 4.228129502161037e-06, + "loss": 0.9243, + "num_input_tokens_seen": 247574992, + "step": 457500 + }, + { + "epoch": 4.582199455738754, + "grad_norm": 5.769835948944092, + "learning_rate": 4.178105490635505e-06, + "loss": 0.9378, + "num_input_tokens_seen": 247843024, + "step": 458000 + }, + { + "epoch": 4.5872018568913075, + "grad_norm": 4.633671760559082, + "learning_rate": 4.128081479109973e-06, + "loss": 0.9249, + "num_input_tokens_seen": 248112056, + "step": 458500 + }, + { + "epoch": 4.592204258043861, + "grad_norm": 6.9910569190979, + "learning_rate": 4.078057467584441e-06, + "loss": 0.9164, + "num_input_tokens_seen": 248380816, + "step": 459000 + }, + { + "epoch": 4.597206659196415, + "grad_norm": 5.471499919891357, + "learning_rate": 4.028033456058908e-06, + "loss": 0.9241, + "num_input_tokens_seen": 248657168, + "step": 459500 + }, + { + "epoch": 4.602209060348968, + "grad_norm": 5.17936897277832, + "learning_rate": 3.978009444533376e-06, + "loss": 0.9308, + "num_input_tokens_seen": 248924720, + "step": 460000 + }, + { + "epoch": 4.607211461501521, + "grad_norm": 7.616632461547852, + "learning_rate": 3.927985433007844e-06, + "loss": 0.9316, + "num_input_tokens_seen": 249194600, + "step": 460500 + }, + { + "epoch": 4.612213862654074, + "grad_norm": 7.818989276885986, + "learning_rate": 3.877961421482311e-06, + "loss": 0.9142, + "num_input_tokens_seen": 249462512, + "step": 461000 + }, + { + "epoch": 4.617216263806627, + "grad_norm": 6.754061698913574, + "learning_rate": 3.8279374099567795e-06, + "loss": 0.9213, + "num_input_tokens_seen": 249729824, + "step": 461500 + }, + { + "epoch": 4.62221866495918, + "grad_norm": 5.925983905792236, + "learning_rate": 3.7779133984312473e-06, + "loss": 0.9171, + "num_input_tokens_seen": 250002496, + "step": 462000 + }, + { + "epoch": 4.6272210661117334, + "grad_norm": 5.226542949676514, + "learning_rate": 3.7278893869057147e-06, + "loss": 0.9118, + "num_input_tokens_seen": 250268376, + "step": 462500 + }, + { + "epoch": 4.632223467264287, + "grad_norm": 5.781167984008789, + "learning_rate": 3.677865375380183e-06, + "loss": 0.9271, + "num_input_tokens_seen": 250530912, + "step": 463000 + }, + { + "epoch": 4.63722586841684, + "grad_norm": 4.7409210205078125, + "learning_rate": 3.6278413638546503e-06, + "loss": 0.9203, + "num_input_tokens_seen": 250803368, + "step": 463500 + }, + { + "epoch": 4.642228269569394, + "grad_norm": 4.875260353088379, + "learning_rate": 3.5778173523291177e-06, + "loss": 0.9235, + "num_input_tokens_seen": 251074960, + "step": 464000 + }, + { + "epoch": 4.647230670721947, + "grad_norm": 6.038626194000244, + "learning_rate": 3.527793340803586e-06, + "loss": 0.9274, + "num_input_tokens_seen": 251347688, + "step": 464500 + }, + { + "epoch": 4.6522330718745, + "grad_norm": 5.727837562561035, + "learning_rate": 3.477769329278054e-06, + "loss": 0.9173, + "num_input_tokens_seen": 251623264, + "step": 465000 + }, + { + "epoch": 4.657235473027053, + "grad_norm": 8.719578742980957, + "learning_rate": 3.427745317752521e-06, + "loss": 0.9149, + "num_input_tokens_seen": 251894136, + "step": 465500 + }, + { + "epoch": 4.662237874179606, + "grad_norm": 5.212377071380615, + "learning_rate": 3.3777213062269894e-06, + "loss": 0.9151, + "num_input_tokens_seen": 252168632, + "step": 466000 + }, + { + "epoch": 4.667240275332159, + "grad_norm": 5.5170440673828125, + "learning_rate": 3.327697294701457e-06, + "loss": 0.9177, + "num_input_tokens_seen": 252446776, + "step": 466500 + }, + { + "epoch": 4.6722426764847125, + "grad_norm": 5.481988906860352, + "learning_rate": 3.277673283175925e-06, + "loss": 0.922, + "num_input_tokens_seen": 252711248, + "step": 467000 + }, + { + "epoch": 4.677245077637266, + "grad_norm": 5.051156520843506, + "learning_rate": 3.2276492716503925e-06, + "loss": 0.9133, + "num_input_tokens_seen": 252977920, + "step": 467500 + }, + { + "epoch": 4.682247478789819, + "grad_norm": 5.823482036590576, + "learning_rate": 3.17762526012486e-06, + "loss": 0.9274, + "num_input_tokens_seen": 253243784, + "step": 468000 + }, + { + "epoch": 4.687249879942373, + "grad_norm": 5.723121166229248, + "learning_rate": 3.127601248599328e-06, + "loss": 0.9144, + "num_input_tokens_seen": 253513552, + "step": 468500 + }, + { + "epoch": 4.692252281094926, + "grad_norm": 6.516372203826904, + "learning_rate": 3.0775772370737955e-06, + "loss": 0.9162, + "num_input_tokens_seen": 253780008, + "step": 469000 + }, + { + "epoch": 4.697254682247479, + "grad_norm": 5.488427639007568, + "learning_rate": 3.0275532255482633e-06, + "loss": 0.9181, + "num_input_tokens_seen": 254050256, + "step": 469500 + }, + { + "epoch": 4.702257083400032, + "grad_norm": 6.543509006500244, + "learning_rate": 2.977529214022731e-06, + "loss": 0.9251, + "num_input_tokens_seen": 254325728, + "step": 470000 + }, + { + "epoch": 4.707259484552585, + "grad_norm": 6.277120590209961, + "learning_rate": 2.9275052024971985e-06, + "loss": 0.9332, + "num_input_tokens_seen": 254596720, + "step": 470500 + }, + { + "epoch": 4.712261885705138, + "grad_norm": 5.882318496704102, + "learning_rate": 2.8774811909716664e-06, + "loss": 0.9091, + "num_input_tokens_seen": 254877768, + "step": 471000 + }, + { + "epoch": 4.7172642868576915, + "grad_norm": 6.018533706665039, + "learning_rate": 2.827457179446134e-06, + "loss": 0.9112, + "num_input_tokens_seen": 255146616, + "step": 471500 + }, + { + "epoch": 4.722266688010245, + "grad_norm": 7.787155628204346, + "learning_rate": 2.777433167920602e-06, + "loss": 0.9204, + "num_input_tokens_seen": 255411528, + "step": 472000 + }, + { + "epoch": 4.727269089162798, + "grad_norm": 4.750833034515381, + "learning_rate": 2.72740915639507e-06, + "loss": 0.9229, + "num_input_tokens_seen": 255673056, + "step": 472500 + }, + { + "epoch": 4.732271490315352, + "grad_norm": 5.9136457443237305, + "learning_rate": 2.6773851448695376e-06, + "loss": 0.9291, + "num_input_tokens_seen": 255947496, + "step": 473000 + }, + { + "epoch": 4.737273891467905, + "grad_norm": 5.880224704742432, + "learning_rate": 2.6273611333440055e-06, + "loss": 0.9202, + "num_input_tokens_seen": 256214416, + "step": 473500 + }, + { + "epoch": 4.742276292620458, + "grad_norm": 4.128984451293945, + "learning_rate": 2.577337121818473e-06, + "loss": 0.9225, + "num_input_tokens_seen": 256493072, + "step": 474000 + }, + { + "epoch": 4.747278693773011, + "grad_norm": 4.8430914878845215, + "learning_rate": 2.5273131102929407e-06, + "loss": 0.9142, + "num_input_tokens_seen": 256764208, + "step": 474500 + }, + { + "epoch": 4.752281094925564, + "grad_norm": 4.830491542816162, + "learning_rate": 2.4772890987674085e-06, + "loss": 0.9118, + "num_input_tokens_seen": 257032808, + "step": 475000 + }, + { + "epoch": 4.757283496078117, + "grad_norm": 4.94685697555542, + "learning_rate": 2.4272650872418763e-06, + "loss": 0.9182, + "num_input_tokens_seen": 257296856, + "step": 475500 + }, + { + "epoch": 4.7622858972306705, + "grad_norm": 5.098095417022705, + "learning_rate": 2.3772410757163437e-06, + "loss": 0.9135, + "num_input_tokens_seen": 257564344, + "step": 476000 + }, + { + "epoch": 4.767288298383224, + "grad_norm": 6.16255521774292, + "learning_rate": 2.3272170641908115e-06, + "loss": 0.9081, + "num_input_tokens_seen": 257834360, + "step": 476500 + }, + { + "epoch": 4.772290699535777, + "grad_norm": 5.006162643432617, + "learning_rate": 2.2771930526652793e-06, + "loss": 0.924, + "num_input_tokens_seen": 258105976, + "step": 477000 + }, + { + "epoch": 4.777293100688331, + "grad_norm": 5.462359428405762, + "learning_rate": 2.227169041139747e-06, + "loss": 0.8951, + "num_input_tokens_seen": 258374328, + "step": 477500 + }, + { + "epoch": 4.782295501840884, + "grad_norm": 6.263942241668701, + "learning_rate": 2.177145029614215e-06, + "loss": 0.9173, + "num_input_tokens_seen": 258646752, + "step": 478000 + }, + { + "epoch": 4.787297902993437, + "grad_norm": 6.507811546325684, + "learning_rate": 2.127121018088683e-06, + "loss": 0.9137, + "num_input_tokens_seen": 258920128, + "step": 478500 + }, + { + "epoch": 4.79230030414599, + "grad_norm": 5.116788864135742, + "learning_rate": 2.0770970065631506e-06, + "loss": 0.9134, + "num_input_tokens_seen": 259189648, + "step": 479000 + }, + { + "epoch": 4.797302705298543, + "grad_norm": 5.995227336883545, + "learning_rate": 2.0270729950376184e-06, + "loss": 0.9169, + "num_input_tokens_seen": 259457416, + "step": 479500 + }, + { + "epoch": 4.802305106451096, + "grad_norm": 4.341572284698486, + "learning_rate": 1.977048983512086e-06, + "loss": 0.9121, + "num_input_tokens_seen": 259733960, + "step": 480000 + }, + { + "epoch": 4.8073075076036496, + "grad_norm": 6.331231117248535, + "learning_rate": 1.9270249719865537e-06, + "loss": 0.9096, + "num_input_tokens_seen": 260007160, + "step": 480500 + }, + { + "epoch": 4.812309908756203, + "grad_norm": 7.295907497406006, + "learning_rate": 1.8770009604610215e-06, + "loss": 0.9227, + "num_input_tokens_seen": 260281344, + "step": 481000 + }, + { + "epoch": 4.817312309908756, + "grad_norm": 5.934523105621338, + "learning_rate": 1.8269769489354893e-06, + "loss": 0.9225, + "num_input_tokens_seen": 260555592, + "step": 481500 + }, + { + "epoch": 4.82231471106131, + "grad_norm": 6.325069904327393, + "learning_rate": 1.7769529374099567e-06, + "loss": 0.9061, + "num_input_tokens_seen": 260826576, + "step": 482000 + }, + { + "epoch": 4.827317112213863, + "grad_norm": 4.8332977294921875, + "learning_rate": 1.7269289258844245e-06, + "loss": 0.9298, + "num_input_tokens_seen": 261100264, + "step": 482500 + }, + { + "epoch": 4.832319513366416, + "grad_norm": 6.445847988128662, + "learning_rate": 1.6769049143588923e-06, + "loss": 0.9189, + "num_input_tokens_seen": 261372184, + "step": 483000 + }, + { + "epoch": 4.837321914518969, + "grad_norm": 6.000613212585449, + "learning_rate": 1.6268809028333602e-06, + "loss": 0.9195, + "num_input_tokens_seen": 261635736, + "step": 483500 + }, + { + "epoch": 4.842324315671522, + "grad_norm": 5.839612007141113, + "learning_rate": 1.5768568913078278e-06, + "loss": 0.9109, + "num_input_tokens_seen": 261908384, + "step": 484000 + }, + { + "epoch": 4.8473267168240755, + "grad_norm": 5.0340471267700195, + "learning_rate": 1.5268328797822956e-06, + "loss": 0.914, + "num_input_tokens_seen": 262183640, + "step": 484500 + }, + { + "epoch": 4.852329117976629, + "grad_norm": 9.325509071350098, + "learning_rate": 1.4768088682567634e-06, + "loss": 0.9097, + "num_input_tokens_seen": 262457280, + "step": 485000 + }, + { + "epoch": 4.857331519129182, + "grad_norm": 4.7551703453063965, + "learning_rate": 1.426784856731231e-06, + "loss": 0.916, + "num_input_tokens_seen": 262736336, + "step": 485500 + }, + { + "epoch": 4.862333920281735, + "grad_norm": 5.392652988433838, + "learning_rate": 1.3767608452056988e-06, + "loss": 0.9184, + "num_input_tokens_seen": 263002616, + "step": 486000 + }, + { + "epoch": 4.867336321434289, + "grad_norm": 6.265237808227539, + "learning_rate": 1.3267368336801664e-06, + "loss": 0.9233, + "num_input_tokens_seen": 263268536, + "step": 486500 + }, + { + "epoch": 4.872338722586842, + "grad_norm": 9.598567008972168, + "learning_rate": 1.2767128221546343e-06, + "loss": 0.9266, + "num_input_tokens_seen": 263537000, + "step": 487000 + }, + { + "epoch": 4.877341123739395, + "grad_norm": 4.646302700042725, + "learning_rate": 1.226688810629102e-06, + "loss": 0.9115, + "num_input_tokens_seen": 263804248, + "step": 487500 + }, + { + "epoch": 4.882343524891948, + "grad_norm": 5.325549125671387, + "learning_rate": 1.1766647991035699e-06, + "loss": 0.9065, + "num_input_tokens_seen": 264071728, + "step": 488000 + }, + { + "epoch": 4.887345926044501, + "grad_norm": 5.929357528686523, + "learning_rate": 1.1266407875780375e-06, + "loss": 0.9124, + "num_input_tokens_seen": 264342480, + "step": 488500 + }, + { + "epoch": 4.8923483271970545, + "grad_norm": 7.279246807098389, + "learning_rate": 1.0766167760525053e-06, + "loss": 0.9112, + "num_input_tokens_seen": 264615696, + "step": 489000 + }, + { + "epoch": 4.897350728349608, + "grad_norm": 7.106380939483643, + "learning_rate": 1.026592764526973e-06, + "loss": 0.9099, + "num_input_tokens_seen": 264886296, + "step": 489500 + }, + { + "epoch": 4.902353129502161, + "grad_norm": 6.210377216339111, + "learning_rate": 9.765687530014407e-07, + "loss": 0.9227, + "num_input_tokens_seen": 265156384, + "step": 490000 + }, + { + "epoch": 4.907355530654714, + "grad_norm": 6.38276481628418, + "learning_rate": 9.265447414759085e-07, + "loss": 0.9205, + "num_input_tokens_seen": 265426248, + "step": 490500 + }, + { + "epoch": 4.912357931807268, + "grad_norm": 5.883709907531738, + "learning_rate": 8.765207299503763e-07, + "loss": 0.9309, + "num_input_tokens_seen": 265695040, + "step": 491000 + }, + { + "epoch": 4.917360332959821, + "grad_norm": 5.776634693145752, + "learning_rate": 8.26496718424844e-07, + "loss": 0.9042, + "num_input_tokens_seen": 265968648, + "step": 491500 + }, + { + "epoch": 4.922362734112374, + "grad_norm": 6.002242088317871, + "learning_rate": 7.764727068993117e-07, + "loss": 0.9151, + "num_input_tokens_seen": 266237776, + "step": 492000 + }, + { + "epoch": 4.927365135264927, + "grad_norm": 6.250047206878662, + "learning_rate": 7.264486953737794e-07, + "loss": 0.9021, + "num_input_tokens_seen": 266510616, + "step": 492500 + }, + { + "epoch": 4.93236753641748, + "grad_norm": 7.225757598876953, + "learning_rate": 6.764246838482472e-07, + "loss": 0.9197, + "num_input_tokens_seen": 266780560, + "step": 493000 + }, + { + "epoch": 4.9373699375700335, + "grad_norm": 6.095335006713867, + "learning_rate": 6.26400672322715e-07, + "loss": 0.916, + "num_input_tokens_seen": 267050712, + "step": 493500 + }, + { + "epoch": 4.942372338722587, + "grad_norm": 7.186228275299072, + "learning_rate": 5.763766607971827e-07, + "loss": 0.9027, + "num_input_tokens_seen": 267324136, + "step": 494000 + }, + { + "epoch": 4.94737473987514, + "grad_norm": 6.71329402923584, + "learning_rate": 5.263526492716505e-07, + "loss": 0.9205, + "num_input_tokens_seen": 267592264, + "step": 494500 + }, + { + "epoch": 4.952377141027693, + "grad_norm": 6.1677045822143555, + "learning_rate": 4.763286377461182e-07, + "loss": 0.9235, + "num_input_tokens_seen": 267861184, + "step": 495000 + }, + { + "epoch": 4.957379542180247, + "grad_norm": 5.632607460021973, + "learning_rate": 4.263046262205859e-07, + "loss": 0.9131, + "num_input_tokens_seen": 268133248, + "step": 495500 + }, + { + "epoch": 4.9623819433328, + "grad_norm": 7.572421073913574, + "learning_rate": 3.7628061469505367e-07, + "loss": 0.9228, + "num_input_tokens_seen": 268408144, + "step": 496000 + }, + { + "epoch": 4.967384344485353, + "grad_norm": 5.349244594573975, + "learning_rate": 3.262566031695214e-07, + "loss": 0.9038, + "num_input_tokens_seen": 268677424, + "step": 496500 + }, + { + "epoch": 4.972386745637906, + "grad_norm": 5.9130754470825195, + "learning_rate": 2.7623259164398915e-07, + "loss": 0.9116, + "num_input_tokens_seen": 268944976, + "step": 497000 + }, + { + "epoch": 4.977389146790459, + "grad_norm": 6.351227283477783, + "learning_rate": 2.2620858011845684e-07, + "loss": 0.9281, + "num_input_tokens_seen": 269220512, + "step": 497500 + }, + { + "epoch": 4.9823915479430125, + "grad_norm": 4.36036491394043, + "learning_rate": 1.761845685929246e-07, + "loss": 0.9158, + "num_input_tokens_seen": 269491032, + "step": 498000 + }, + { + "epoch": 4.987393949095566, + "grad_norm": 6.718998432159424, + "learning_rate": 1.2616055706739237e-07, + "loss": 0.923, + "num_input_tokens_seen": 269761928, + "step": 498500 + }, + { + "epoch": 4.992396350248119, + "grad_norm": 6.188157558441162, + "learning_rate": 7.61365455418601e-08, + "loss": 0.9097, + "num_input_tokens_seen": 270030752, + "step": 499000 + }, + { + "epoch": 4.997398751400672, + "grad_norm": 7.353320598602295, + "learning_rate": 2.6112534016327838e-08, + "loss": 0.9016, + "num_input_tokens_seen": 270298112, + "step": 499500 + }, + { + "epoch": 5.0, + "eval_loss": 0.9877662062644958, + "eval_runtime": 193.3937, + "eval_samples_per_second": 1033.668, + "eval_steps_per_second": 129.213, + "num_input_tokens_seen": 270444104, + "step": 499760 + }, + { + "epoch": 5.0, + "num_input_tokens_seen": 270444104, + "step": 499760, + "total_flos": 7.16219760157655e+16, + "train_loss": 0.8578685343122414, + "train_runtime": 15819.1918, + "train_samples_per_second": 252.736, + "train_steps_per_second": 31.592, + "train_tokens_per_second": 17096.35 } ], "logging_steps": 500, - "max_steps": 416560, - "num_input_tokens_seen": 224864981, - "num_train_epochs": 10, + "max_steps": 499760, + "num_input_tokens_seen": 270444104, + "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -6792,7 +8075,7 @@ "attributes": {} } }, - "total_flos": 5.955121238645146e+16, + "total_flos": 7.16219760157655e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null